Documentation ¶
Index ¶
- Constants
- Variables
- func CleanArticle(text string) string
- func Normalize(w string) string
- func NormalizeArticle(text string) string
- func ToNgramDictionary(dictionary *Dictionary) map[string]bool
- type Counter
- type Dictionary
- type DocumentGender
- type FrequencyMap
- type FrequencyTable
- func (*FrequencyTable) Descriptor() ([]byte, []int) deprecated
- func (x *FrequencyTable) GetWords() []*WordCount
- func (*FrequencyTable) ProtoMessage()
- func (x *FrequencyTable) ProtoReflect() protoreflect.Message
- func (x *FrequencyTable) Reset()
- func (x *FrequencyTable) Sort()
- func (x *FrequencyTable) String() string
- type Gender
- type NgramTokenizer
- type Tokenizer
- type WordCount
- type WordTokenizer
- type XMLTokenizer
Constants ¶
const (
	NumToken        = "_num_"
	PercentToken    = "_percent_"
	DateToken       = "_date_"
	MathToken       = "_math_"
	HieroglyphToken = "_hieroglyph_" //nolint:gosec // This is a reference to egyptian hieroglyphs.
)
Tokens used to replace longer sequences that we treat as semantically identical for analysis.
const (
// Months are all the months of the year.
Months = "(january|february|march|april|may|june|july|august|september|october|november|december)"
)
Variables ¶
var (
	// CommentRegex matches commented-out text. Such text is not shown on pages
	// and is generally either off-first-links or low quality.
	//
	// Obviously not perfect and can match non-comments in rare cases.
	CommentRegex = regexp.MustCompile(`(?s)<!--.*?-->`)

	IgnoredTagsRegex = regexp.MustCompile(fmt.Sprintf(`(?i)</?(%s).*?>`, strings.Join(ignoredTags(), "|")))

	TimelineRegex = regexp.MustCompile(`(?is)<timeline.*?</timeline[\w\s]*>`)
	GalleryRegex = regexp.MustCompile(`(?is)<gallery.*?</gallery[\w\s]*>`)
	GraphRegex = regexp.MustCompile(`(?is)<graph.*?</graph[\w\s]*>`)
	ImageMapRegex = regexp.MustCompile(`(?is)<imagemap.*?</imagemap[\w\s]*>`)
	MathRegex = regexp.MustCompile(`(?is)<math.*?</math[\w\s]*>`)
	CodeRegex = regexp.MustCompile(`(?is)<code.*?</code[\w\s]*>`)
	CiteRegex = regexp.MustCompile(`(?is)<cite.*?</cite[\w\s]*>`)
	ChemRegex = regexp.MustCompile(`(?is)<chem.*?</chem[\w\s]*>`)
	PoemRegex = regexp.MustCompile(`(?is)<poem.*?</poem[\w\s]*>`)
	HieroglyphRegex = regexp.MustCompile(`(?is)<hiero.*?</hiero[\w\s]*>`)
	MapframeRegex = regexp.MustCompile(`(?is)<mapframe.*?</mapframe[\w\s]*>`)
	DelRegex = regexp.MustCompile(`(?is)<del.*?</del[\w\s]*>`)
	SyntaxHighlightRegex = regexp.MustCompile(`(?is)<syntaxhighlight.*?</syntaxhighlight[\w\s]*>`)
	PreRegex = regexp.MustCompile(`(?is)<pre.*?</pre[\w\s]*>`)
	TableRegex = regexp.MustCompile(`(?is)<table.*?</table[\w\s]*>`)
	TableRegex2 = regexp.MustCompile(`(?s)({\||{{).*?\n\|}`)
	BrRegex = regexp.MustCompile(`(?i)<(p|br|hr).*?>`)
	AlteredQuote = regexp.MustCompile(`\[([a-zA-Z])]`)
	RemoveLinks = regexp.MustCompile(`\[\[(:?Category:|List of)[^]]+]]`)
	WikipediaLinks = regexp.MustCompile(`\[\[([^[\]]+\|)?([^[|]+?)]]`)
	RefRegex = regexp.MustCompile(`(?s)<ref.*?(>.*?</ref>| ?/>)`)
)
Regular expressions for cleaning Wikipedia articles of XML tags and formatting.
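For illustration, a minimal sketch of applying two of these expressions by hand. The import path is a placeholder (the module path is not shown on this page), the wikitext is invented, and CleanArticle below bundles this kind of cleanup.

package main

import (
	"fmt"

	nlp "example.com/project/pkg/nlp" // placeholder import path
)

func main() {
	raw := `Water is a compound.<ref>Chemistry 101</ref> <!-- TODO: expand -->`

	// Strip references and editor comments using the exported expressions.
	text := nlp.RefRegex.ReplaceAllString(raw, "")
	text = nlp.CommentRegex.ReplaceAllString(text, "")

	fmt.Println(text) // the <ref> and the HTML comment are gone
}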
var (
	WordRegex = regexp.MustCompile(`[\w']+`)
	NumberRegex = regexp.MustCompile(`\b\d+(,\d{3})*(\.\d+)?\b`)
	PercentRegex = regexp.MustCompile(NumToken + "%")
	DateRegex = regexp.MustCompile(fmt.Sprintf(`(?i)\b(%s (%s,? )?%s|%s %s,? %s)\b`,
		NumToken, Months, NumToken,
		Months, NumToken, NumToken,
	))
)
Regular expressions for detecting semantically-similar sequences.
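A sketch of how these expressions compose: PercentRegex and DateRegex are written in terms of NumToken, so numbers have to be replaced first. This only illustrates the exported regexes; it is not necessarily how Normalize or NormalizeArticle combine them, and the import path is a placeholder.

package main

import (
	"fmt"

	nlp "example.com/project/pkg/nlp" // placeholder import path
)

func main() {
	s := "Turnout rose 12% to 4,200 on january 5, 2021."

	// Numbers first, since the percent and date patterns match on NumToken.
	s = nlp.NumberRegex.ReplaceAllString(s, nlp.NumToken)
	s = nlp.PercentRegex.ReplaceAllString(s, nlp.PercentToken)
	s = nlp.DateRegex.ReplaceAllString(s, nlp.DateToken)

	fmt.Println(s) // Turnout rose _percent_ to _num_ on _date_.
}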
var ErrUnknownGender = errors.New("unable to parse gender")
var ErrUnsupportedProtoExtension = errors.New("unsupported proto extension")
var File_pkg_nlp_nlp_proto protoreflect.FileDescriptor
var XMLTagRegex = regexp.MustCompile(`<[a-z][a-z0-9]+`)
XMLTagRegex tries to find XML tags which are still present in the corpus. Useful for finding problematic tags that we want to avoid.
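For example, a quick scan over already-cleaned text (placeholder import path, as in the sketches above):

package main

import (
	"fmt"

	nlp "example.com/project/pkg/nlp" // placeholder import path
)

func main() {
	cleaned := "Some text with a stray <blockquote> left behind."

	// FindAllString surfaces any tags that survived cleaning so their
	// patterns can be added to the cleaning regexes above.
	fmt.Println(nlp.XMLTagRegex.FindAllString(cleaned, -1)) // [<blockquote]
}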
Functions ¶
func CleanArticle ¶
func CleanArticle(text string) string
CleanArticle removes all parts of Wikipedia we never want to analyze.
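A hedged usage sketch; the wikitext is invented, the import path is a placeholder, and the exact output depends on the cleaning rules above.

package main

import (
	"fmt"

	nlp "example.com/project/pkg/nlp" // placeholder import path
)

func main() {
	raw := `'''Ada Lovelace''' was a mathematician.<ref>Biography</ref>
<gallery>File:Ada.jpg</gallery>`

	// Strip refs, galleries, comments, and other never-analyzed markup in one call.
	fmt.Println(nlp.CleanArticle(raw))
}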
func NormalizeArticle ¶
func NormalizeArticle(text string) string
func ToNgramDictionary ¶
func ToNgramDictionary(dictionary *Dictionary) map[string]bool
ToNgramDictionary constructs a dictionary of all n-grams including prefixes.
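A small sketch of building the n-gram set from an in-memory Dictionary (placeholder import path; the sample words are invented):

package main

import (
	"fmt"

	nlp "example.com/project/pkg/nlp" // placeholder import path
)

func main() {
	dict := &nlp.Dictionary{Words: []string{"new", "york", "new york"}}

	// The resulting set contains every n-gram plus its prefixes,
	// suitable for the NgramTokenizer.Dictionary field below.
	ngrams := nlp.ToNgramDictionary(dict)

	fmt.Println(ngrams["new york"])
}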
Types ¶
type Dictionary ¶
type Dictionary struct {
	// Words is a list of recognized words, in the order they appear in a frequency table.
	Words []string `protobuf:"bytes,1,rep,name=words,proto3" json:"words,omitempty"`
	// contains filtered or unexported fields
}
Dictionary is a set of known words.
func ReadDictionary ¶
func ReadDictionary(path string) (*Dictionary, error)
ReadDictionary reads a Dictionary proto from a file. Returns an empty dictionary if path is the empty string.
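A usage sketch; the file path is hypothetical and the import path is a placeholder.

package main

import (
	"log"

	nlp "example.com/project/pkg/nlp" // placeholder import path
)

func main() {
	// A hypothetical path; passing "" would return an empty dictionary instead.
	dict, err := nlp.ReadDictionary("data/dictionary.pb")
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("loaded %d words", len(dict.GetWords()))
}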
func (*Dictionary) Descriptor
deprecated
func (*Dictionary) Descriptor() ([]byte, []int)
Deprecated: Use Dictionary.ProtoReflect.Descriptor instead.
func (*Dictionary) GetWords ¶
func (x *Dictionary) GetWords() []string
func (*Dictionary) ProtoMessage ¶
func (*Dictionary) ProtoMessage()
func (*Dictionary) ProtoReflect ¶
func (x *Dictionary) ProtoReflect() protoreflect.Message
func (*Dictionary) Reset ¶
func (x *Dictionary) Reset()
func (*Dictionary) String ¶
func (x *Dictionary) String() string
type DocumentGender ¶ added in v0.2.0
func ReadDocumentGender ¶ added in v0.2.0
func ReadDocumentGender(line string) (DocumentGender, error)
func ReadDocumentGenders ¶ added in v0.2.0
func ReadDocumentGenders(path string) ([]DocumentGender, error)
func (DocumentGender) String ¶ added in v0.2.0
func (dg DocumentGender) String() string
type FrequencyMap ¶
type FrequencyMap struct {
	Words map[string]uint32 `` /* 152-byte string literal not displayed */
	// contains filtered or unexported fields
}
FrequencyMap is a set of known words and their frequencies.
func (*FrequencyMap) Descriptor
deprecated
func (*FrequencyMap) Descriptor() ([]byte, []int)
Deprecated: Use FrequencyMap.ProtoReflect.Descriptor instead.
func (*FrequencyMap) GetWords ¶
func (x *FrequencyMap) GetWords() map[string]uint32
func (*FrequencyMap) ProtoMessage ¶
func (*FrequencyMap) ProtoMessage()
func (*FrequencyMap) ProtoReflect ¶
func (x *FrequencyMap) ProtoReflect() protoreflect.Message
func (*FrequencyMap) Reset ¶
func (x *FrequencyMap) Reset()
func (*FrequencyMap) String ¶
func (x *FrequencyMap) String() string
type FrequencyTable ¶
type FrequencyTable struct {
	Words []*WordCount `protobuf:"bytes,1,rep,name=words,proto3" json:"words,omitempty"`
	// contains filtered or unexported fields
}
FrequencyTable is a set of known words and their frequencies, stored as a repeated list of WordCount entries.
func ToFrequencyTable ¶
func ToFrequencyTable(m *FrequencyMap) *FrequencyTable
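A sketch converting a FrequencyMap into a FrequencyTable and ordering it with Sort; the counts are invented and the import path is a placeholder.

package main

import (
	"fmt"

	nlp "example.com/project/pkg/nlp" // placeholder import path
)

func main() {
	m := &nlp.FrequencyMap{Words: map[string]uint32{"the": 120, "of": 80, "cat": 3}}

	// Convert the map form into the repeated-field form, then order it with Sort.
	table := nlp.ToFrequencyTable(m)
	table.Sort()

	for _, wc := range table.GetWords() {
		fmt.Println(wc.Word, wc.Count)
	}
}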
func (*FrequencyTable) Descriptor
deprecated
func (*FrequencyTable) Descriptor() ([]byte, []int)
Deprecated: Use FrequencyTable.ProtoReflect.Descriptor instead.
func (*FrequencyTable) GetWords ¶
func (x *FrequencyTable) GetWords() []*WordCount
func (*FrequencyTable) ProtoMessage ¶
func (*FrequencyTable) ProtoMessage()
func (*FrequencyTable) ProtoReflect ¶
func (x *FrequencyTable) ProtoReflect() protoreflect.Message
func (*FrequencyTable) Reset ¶
func (x *FrequencyTable) Reset()
func (*FrequencyTable) Sort ¶
func (x *FrequencyTable) Sort()
func (*FrequencyTable) String ¶
func (x *FrequencyTable) String() string
type NgramTokenizer ¶
type NgramTokenizer struct {
	Underlying WordTokenizer
	Dictionary map[string]bool
}
func (NgramTokenizer) Tokenize ¶
func (t NgramTokenizer) Tokenize(s string) []string
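A sketch wiring the pieces together. This page does not spell out Tokenize's exact behavior, so no output is claimed, but the field types above suggest pairing a WordTokenizer with a set built by ToNgramDictionary. The import path and sample dictionary are placeholders.

package main

import (
	"fmt"

	nlp "example.com/project/pkg/nlp" // placeholder import path
)

func main() {
	dict := &nlp.Dictionary{Words: []string{"new york", "city"}}

	t := nlp.NgramTokenizer{
		Underlying: nlp.WordTokenizer{},
		Dictionary: nlp.ToNgramDictionary(dict),
	}

	fmt.Println(t.Tokenize("I moved to New York City"))
}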
type WordCount ¶
type WordCount struct {
	Word  string `protobuf:"bytes,1,opt,name=word,proto3" json:"word,omitempty"`
	Count uint32 `protobuf:"varint,2,opt,name=count,proto3" json:"count,omitempty"`
	// contains filtered or unexported fields
}
func (*WordCount) Descriptor
deprecated
func (*WordCount) Descriptor() ([]byte, []int)
Deprecated: Use WordCount.ProtoReflect.Descriptor instead.
func (*WordCount) ProtoMessage ¶
func (*WordCount) ProtoMessage()
func (*WordCount) ProtoReflect ¶
func (x *WordCount) ProtoReflect() protoreflect.Message
type WordTokenizer ¶
type WordTokenizer struct{}
func (WordTokenizer) Tokenize ¶
func (t WordTokenizer) Tokenize(s string) []string
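A minimal sketch (placeholder import path). Judging from WordRegex above, tokens are presumably runs of word characters and apostrophes, but that is an inference rather than documented behavior.

package main

import (
	"fmt"

	nlp "example.com/project/pkg/nlp" // placeholder import path
)

func main() {
	var t nlp.WordTokenizer

	// Split a sentence into word-level tokens.
	fmt.Println(t.Tokenize("It's a small, small world"))
}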
type XMLTokenizer ¶
type XMLTokenizer struct{}
func (XMLTokenizer) Tokenize ¶
func (x XMLTokenizer) Tokenize(s string) []string