Documentation ¶
Index ¶
- Constants
- Variables
- func CleanArticle(text string) string
- func Normalize(w string) string
- func NormalizeArticle(text string) string
- func ToNgramDictionary(dictionary *Dictionary) map[string]bool
- type Counter
- type Dictionary
- type DocumentGender
- type FrequencyMap
- type FrequencyTable
- func (*FrequencyTable) Descriptor() ([]byte, []int) deprecated
- func (x *FrequencyTable) GetWords() []*WordCount
- func (*FrequencyTable) ProtoMessage()
- func (x *FrequencyTable) ProtoReflect() protoreflect.Message
- func (x *FrequencyTable) Reset()
- func (x *FrequencyTable) Sort()
- func (x *FrequencyTable) String() string
- type Gender
- type NgramTokenizer
- type Tokenizer
- type WordCount
- type WordTokenizer
- type XMLTokenizer
Constants ¶
const (
	NumToken        = "_num_"
	PercentToken    = "_percent_"
	DateToken       = "_date_"
	MathToken       = "_math_"
	HieroglyphToken = "_hieroglyph_" //nolint:gosec // This is a reference to egyptian hieroglyphs.
)
Tokens used to replace longer sequences that we treat as semantically identical for analysis.
const (
// Months are all the months of the year.
Months = "(january|february|march|april|may|june|july|august|september|october|november|december)"
)
Variables ¶
var (
	// CommentRegex matches commented-out text. Such text is not shown on pages
	// and is generally either off-first-links or low quality.
	//
	// Obviously not perfect and can match non-comments in rare cases.
	CommentRegex = regexp.MustCompile(`(?s)<!--.*?-->`)

	IgnoredTagsRegex = regexp.MustCompile(fmt.Sprintf(`(?i)</?(%s).*?>`, strings.Join(ignoredTags(), "|")))

	TimelineRegex = regexp.MustCompile(`(?is)<timeline.*?</timeline[\w\s]*>`)
	GalleryRegex = regexp.MustCompile(`(?is)<gallery.*?</gallery[\w\s]*>`)
	GraphRegex = regexp.MustCompile(`(?is)<graph.*?</graph[\w\s]*>`)
	ImageMapRegex = regexp.MustCompile(`(?is)<imagemap.*?</imagemap[\w\s]*>`)
	MathRegex = regexp.MustCompile(`(?is)<math.*?</math[\w\s]*>`)
	CodeRegex = regexp.MustCompile(`(?is)<code.*?</code[\w\s]*>`)
	CiteRegex = regexp.MustCompile(`(?is)<cite.*?</cite[\w\s]*>`)
	ChemRegex = regexp.MustCompile(`(?is)<chem.*?</chem[\w\s]*>`)
	PoemRegex = regexp.MustCompile(`(?is)<poem.*?</poem[\w\s]*>`)
	HieroglyphRegex = regexp.MustCompile(`(?is)<hiero.*?</hiero[\w\s]*>`)
	MapframeRegex = regexp.MustCompile(`(?is)<mapframe.*?</mapframe[\w\s]*>`)
	DelRegex = regexp.MustCompile(`(?is)<del.*?</del[\w\s]*>`)
	SyntaxHighlightRegex = regexp.MustCompile(`(?is)<syntaxhighlight.*?</syntaxhighlight[\w\s]*>`)
	PreRegex = regexp.MustCompile(`(?is)<pre.*?</pre[\w\s]*>`)
	TableRegex = regexp.MustCompile(`(?is)<table.*?</table[\w\s]*>`)
	TableRegex2 = regexp.MustCompile(`(?s)({\||{{).*?\n\|}`)
	BrRegex = regexp.MustCompile(`(?i)<(p|br|hr).*?>`)
	AlteredQuote = regexp.MustCompile(`\[([a-zA-Z])]`)
	RemoveLinks = regexp.MustCompile(`\[\[(:?Category:|List of)[^]]+]]`)
	WikipediaLinks = regexp.MustCompile(`\[\[([^[\]]+\|)?([^[|]+?)]]`)
	RefRegex = regexp.MustCompile(`(?s)<ref.*?(>.*?</ref>| ?/>)`)
)
Regular expressions for cleaning Wikipedia articles of XML tags and formatting.
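For illustration, a minimal sketch of applying two of these expressions by hand. The import path is a placeholder (the module path is not shown on this page), the wikitext is invented, and CleanArticle below bundles this kind of cleanup.

package main

import (
	"fmt"

	nlp "example.com/project/pkg/nlp" // placeholder import path
)

func main() {
	raw := `Water is a compound.<ref>Chemistry 101</ref> <!-- TODO: expand -->`

	// Strip references and editor comments using the exported expressions.
	text := nlp.RefRegex.ReplaceAllString(raw, "")
	text = nlp.CommentRegex.ReplaceAllString(text, "")

	fmt.Println(text) // the <ref> and the HTML comment are gone
}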
var (
	WordRegex = regexp.MustCompile(`[\w']+`)
	NumberRegex = regexp.MustCompile(`\b\d+(,\d{3})*(\.\d+)?\b`)
	PercentRegex = regexp.MustCompile(NumToken + "%")
	DateRegex = regexp.MustCompile(fmt.Sprintf(`(?i)\b(%s (%s,? )?%s|%s %s,? %s)\b`,
		NumToken, Months, NumToken,
		Months, NumToken, NumToken,
	))
)
Regular expressions for detecting semantically-similar sequences.
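A sketch of how these expressions compose: PercentRegex and DateRegex are written in terms of NumToken, so numbers have to be replaced first. This only illustrates the exported regexes; it is not necessarily how Normalize or NormalizeArticle combine them, and the import path is a placeholder.

package main

import (
	"fmt"

	nlp "example.com/project/pkg/nlp" // placeholder import path
)

func main() {
	s := "Turnout rose 12% to 4,200 on january 5, 2021."

	// Numbers first, since the percent and date patterns match on NumToken.
	s = nlp.NumberRegex.ReplaceAllString(s, nlp.NumToken)
	s = nlp.PercentRegex.ReplaceAllString(s, nlp.PercentToken)
	s = nlp.DateRegex.ReplaceAllString(s, nlp.DateToken)

	fmt.Println(s) // Turnout rose _percent_ to _num_ on _date_.
}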
var ErrUnknownGender = errors.New("unable to parse gender")
var ErrUnsupportedProtoExtension = errors.New("unsupported proto extension")
var File_pkg_nlp_nlp_proto protoreflect.FileDescriptor
var XMLTagRegex = regexp.MustCompile(`<[a-z][a-z0-9]+`)
XMLTagRegex tries to find XML tags which are still present in the corpus. Useful for finding problematic tags that we want to avoid.
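For example, a quick scan over already-cleaned text (placeholder import path, as in the sketches above):

package main

import (
	"fmt"

	nlp "example.com/project/pkg/nlp" // placeholder import path
)

func main() {
	cleaned := "Some text with a stray <blockquote> left behind."

	// FindAllString surfaces any tags that survived cleaning so their
	// patterns can be added to the cleaning regexes above.
	fmt.Println(nlp.XMLTagRegex.FindAllString(cleaned, -1)) // [<blockquote]
}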
Functions ¶
func CleanArticle ¶
func CleanArticle(text string) string
CleanArticle removes all parts of Wikipedia we never want to analyze.
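A hedged usage sketch; the wikitext is invented, the import path is a placeholder, and the exact output depends on the cleaning rules above.

package main

import (
	"fmt"

	nlp "example.com/project/pkg/nlp" // placeholder import path
)

func main() {
	raw := `'''Ada Lovelace''' was a mathematician.<ref>Biography</ref>
<gallery>File:Ada.jpg</gallery>`

	// Strip refs, galleries, comments, and other never-analyzed markup in one call.
	fmt.Println(nlp.CleanArticle(raw))
}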
func NormalizeArticle ¶
func NormalizeArticle(text string) string
func ToNgramDictionary ¶
func ToNgramDictionary(dictionary *Dictionary) map[string]bool
ToNgramDictionary constructs a dictionary of all n-grams including prefixes.
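A small sketch of building the n-gram set from an in-memory Dictionary (placeholder import path; the sample words are invented):

package main

import (
	"fmt"

	nlp "example.com/project/pkg/nlp" // placeholder import path
)

func main() {
	dict := &nlp.Dictionary{Words: []string{"new", "york", "new york"}}

	// The resulting set contains every n-gram plus its prefixes,
	// suitable for the NgramTokenizer.Dictionary field below.
	ngrams := nlp.ToNgramDictionary(dict)

	fmt.Println(ngrams["new york"])
}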
Types ¶
type Dictionary ¶
type Dictionary struct {
	// Words is a list of recognized words, in the order they appear in a frequency table.
	Words []string `protobuf:"bytes,1,rep,name=words,proto3" json:"words,omitempty"`
	// contains filtered or unexported fields
}
Dictionary is a set of known words.
func ReadDictionary ¶
func ReadDictionary(path string) (*Dictionary, error)
ReadDictionary reads a Dictionary proto from a file. Returns an empty dictionary if path is the empty string.
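A usage sketch; the file path is hypothetical and the import path is a placeholder.

package main

import (
	"log"

	nlp "example.com/project/pkg/nlp" // placeholder import path
)

func main() {
	// A hypothetical path; passing "" would return an empty dictionary instead.
	dict, err := nlp.ReadDictionary("data/dictionary.pb")
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("loaded %d words", len(dict.GetWords()))
}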
func (*Dictionary) Descriptor
deprecated
func (*Dictionary) Descriptor() ([]byte, []int)
Deprecated: Use Dictionary.ProtoReflect.Descriptor instead.
func (*Dictionary) GetWords ¶
func (x *Dictionary) GetWords() []string
func (*Dictionary) ProtoMessage ¶
func (*Dictionary) ProtoMessage()
func (*Dictionary) ProtoReflect ¶
func (x *Dictionary) ProtoReflect() protoreflect.Message
func (*Dictionary) Reset ¶
func (x *Dictionary) Reset()
func (*Dictionary) String ¶
func (x *Dictionary) String() string
type DocumentGender ¶ added in v0.2.0
func ReadDocumentGender ¶ added in v0.2.0
func ReadDocumentGender(line string) (DocumentGender, error)
func ReadDocumentGenders ¶ added in v0.2.0
func ReadDocumentGenders(path string) ([]DocumentGender, error)
func (DocumentGender) String ¶ added in v0.2.0
func (dg DocumentGender) String() string
type FrequencyMap ¶
type FrequencyMap struct {
	Words map[string]uint32 `` /* 152-byte string literal not displayed */
	// contains filtered or unexported fields
}
FrequencyMap is a set of known words and their frequencies.
func (*FrequencyMap) Descriptor
deprecated
func (*FrequencyMap) Descriptor() ([]byte, []int)
Deprecated: Use FrequencyMap.ProtoReflect.Descriptor instead.
func (*FrequencyMap) GetWords ¶
func (x *FrequencyMap) GetWords() map[string]uint32
func (*FrequencyMap) ProtoMessage ¶
func (*FrequencyMap) ProtoMessage()
func (*FrequencyMap) ProtoReflect ¶
func (x *FrequencyMap) ProtoReflect() protoreflect.Message
func (*FrequencyMap) Reset ¶
func (x *FrequencyMap) Reset()
func (*FrequencyMap) String ¶
func (x *FrequencyMap) String() string
type FrequencyTable ¶
type FrequencyTable struct {
	Words []*WordCount `protobuf:"bytes,1,rep,name=words,proto3" json:"words,omitempty"`
	// contains filtered or unexported fields
}
FrequencyTable is a set of known words and their frequencies, stored as a repeated list of WordCount entries.
func ToFrequencyTable ¶
func ToFrequencyTable(m *FrequencyMap) *FrequencyTable
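A sketch converting a FrequencyMap into a FrequencyTable and ordering it with Sort; the counts are invented and the import path is a placeholder.

package main

import (
	"fmt"

	nlp "example.com/project/pkg/nlp" // placeholder import path
)

func main() {
	m := &nlp.FrequencyMap{Words: map[string]uint32{"the": 120, "of": 80, "cat": 3}}

	// Convert the map form into the repeated-field form, then order it with Sort.
	table := nlp.ToFrequencyTable(m)
	table.Sort()

	for _, wc := range table.GetWords() {
		fmt.Println(wc.Word, wc.Count)
	}
}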
func (*FrequencyTable) Descriptor
deprecated
func (*FrequencyTable) Descriptor() ([]byte, []int)
Deprecated: Use FrequencyTable.ProtoReflect.Descriptor instead.
func (*FrequencyTable) GetWords ¶
func (x *FrequencyTable) GetWords() []*WordCount
func (*FrequencyTable) ProtoMessage ¶
func (*FrequencyTable) ProtoMessage()
func (*FrequencyTable) ProtoReflect ¶
func (x *FrequencyTable) ProtoReflect() protoreflect.Message
func (*FrequencyTable) Reset ¶
func (x *FrequencyTable) Reset()
func (*FrequencyTable) Sort ¶
func (x *FrequencyTable) Sort()
func (*FrequencyTable) String ¶
func (x *FrequencyTable) String() string
type NgramTokenizer ¶
type NgramTokenizer struct {
	Underlying WordTokenizer
	Dictionary map[string]bool
}
func (NgramTokenizer) Tokenize ¶
func (t NgramTokenizer) Tokenize(s string) []string
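A sketch wiring the pieces together. This page does not spell out Tokenize's exact behavior, so no output is claimed, but the field types above suggest pairing a WordTokenizer with a set built by ToNgramDictionary. The import path and sample dictionary are placeholders.

package main

import (
	"fmt"

	nlp "example.com/project/pkg/nlp" // placeholder import path
)

func main() {
	dict := &nlp.Dictionary{Words: []string{"new york", "city"}}

	t := nlp.NgramTokenizer{
		Underlying: nlp.WordTokenizer{},
		Dictionary: nlp.ToNgramDictionary(dict),
	}

	fmt.Println(t.Tokenize("I moved to New York City"))
}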
type WordCount ¶
type WordCount struct {
	Word  string `protobuf:"bytes,1,opt,name=word,proto3" json:"word,omitempty"`
	Count uint32 `protobuf:"varint,2,opt,name=count,proto3" json:"count,omitempty"`
	// contains filtered or unexported fields
}
func (*WordCount) Descriptor
deprecated
func (*WordCount) Descriptor() ([]byte, []int)
Deprecated: Use WordCount.ProtoReflect.Descriptor instead.
func (*WordCount) ProtoMessage ¶
func (*WordCount) ProtoMessage()
func (*WordCount) ProtoReflect ¶
func (x *WordCount) ProtoReflect() protoreflect.Message
type WordTokenizer ¶
type WordTokenizer struct{}
func (WordTokenizer) Tokenize ¶
func (t WordTokenizer) Tokenize(s string) []string
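A minimal sketch (placeholder import path). Judging from WordRegex above, tokens are presumably runs of word characters and apostrophes, but that is an inference rather than documented behavior.

package main

import (
	"fmt"

	nlp "example.com/project/pkg/nlp" // placeholder import path
)

func main() {
	var t nlp.WordTokenizer

	// Split a sentence into word-level tokens.
	fmt.Println(t.Tokenize("It's a small, small world"))
}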
type XMLTokenizer ¶
type XMLTokenizer struct{}
func (XMLTokenizer) Tokenize ¶
func (x XMLTokenizer) Tokenize(s string) []string