Documentation ¶
Index ¶
- Constants
- Variables
- func DisambiguateTags(page, category string) string
- func PersonInfoboxes() []string
- func ReadPages(pages chan<- *Page) func([]byte) error
- func WriteFrequencyTable(out string, t FrequencyTable) error
- type Categories
- func (x *Categories) Add(parent uint32)
- func (*Categories) Descriptor() ([]byte, []int) (deprecated)
- func (x *Categories) GetCategories() []uint32
- func (*Categories) ProtoMessage()
- func (x *Categories) ProtoReflect() protoreflect.Message
- func (x *Categories) Reset()
- func (x *Categories) String() string
- type Categorizer
- type Document
- type Frequency
- type FrequencyMap
- type FrequencyTable
- type InfoboxChecker
- type Namespace
- type Page
- func (*Page) Descriptor() ([]byte, []int) (deprecated)
- func (x *Page) GetID() uint32
- func (x *Page) GetText() string
- func (x *Page) GetTitle() string
- func (x *Page) ID() uint32
- func (*Page) ProtoMessage()
- func (x *Page) ProtoReflect() protoreflect.Message
- func (x *Page) Reset()
- func (x *Page) String() string
- type PageCategories
- func (x *PageCategories) Add(child, parent uint32)
- func (*PageCategories) Descriptor() ([]byte, []int) (deprecated)
- func (x *PageCategories) GetPages() map[uint32]*Categories
- func (*PageCategories) ProtoMessage()
- func (x *PageCategories) ProtoReflect() protoreflect.Message
- func (x *PageCategories) Reset()
- func (x *PageCategories) String() string
- type TitleIndex
- type WordSet
- type WordSets
- type XMLDocument
- type XMLPage
- type XMLRedirect
- type XMLRevision
Constants ¶
View Source
const ( NamespaceArticle Namespace = iota NamespaceTalk NamespaceUser NamespaceUserTalk NamespaceWikipedia NamespaceWikipediaTalk NamespaceFile NamespaceFileTalk NamespaceMediaWiki NamespaceMediaWikiTalk NamespaceTemplate NamespaceTemplateTalk NamespaceHelp NamespaceHelpTalk NamespaceCategory NamespaceCategoryTalk NamespacePortal = 100 NamespacePortalTalk = 101 NamespaceDraft = 118 NamespaceDraftTalk = 119 NamespaceTimedText = 710 NamespaceTimedTextTalk = 711 NamespaceModule = 828 NamespaceModuleTalk = 829 )
Variables ¶
View Source
var File_pkg_documents_documents_proto protoreflect.FileDescriptor
Functions ¶
func DisambiguateTags ¶
func PersonInfoboxes ¶ added in v0.2.0
func PersonInfoboxes() []string
func WriteFrequencyTable ¶
func WriteFrequencyTable(out string, t FrequencyTable) error
Types ¶
type Categories ¶
type Categories struct { Categories []uint32 `protobuf:"varint,1,rep,packed,name=categories,proto3" json:"categories,omitempty"` // contains filtered or unexported fields }
func (*Categories) Add ¶
func (x *Categories) Add(parent uint32)
func (*Categories) Descriptor
deprecated
func (*Categories) Descriptor() ([]byte, []int)
Deprecated: Use Categories.ProtoReflect.Descriptor instead.
func (*Categories) GetCategories ¶
func (x *Categories) GetCategories() []uint32
func (*Categories) ProtoMessage ¶
func (*Categories) ProtoMessage()
func (*Categories) ProtoReflect ¶
func (x *Categories) ProtoReflect() protoreflect.Message
func (*Categories) Reset ¶
func (x *Categories) Reset()
func (*Categories) String ¶
func (x *Categories) String() string
type Categorizer ¶
type Categorizer struct {
TitleIndex *TitleIndex
}
func (*Categorizer) Categorize ¶
func (c *Categorizer) Categorize(page *Page) *Categories
type Document ¶
type Document struct { Pages []*Page `protobuf:"bytes,1,rep,name=pages,proto3" json:"pages,omitempty"` // contains filtered or unexported fields }
func (*Document) Descriptor
deprecated
func (*Document) ProtoMessage ¶
func (*Document) ProtoMessage()
func (*Document) ProtoReflect ¶
func (x *Document) ProtoReflect() protoreflect.Message
type FrequencyMap ¶
func (*FrequencyMap) Collect ¶
func (f *FrequencyMap) Collect(words <-chan string)
Collect reads the words in a channel into a frequency table.
func (*FrequencyMap) CollectMaps ¶
func (*FrequencyMap) Filter ¶
func (f *FrequencyMap) Filter(minCount int)
Filter drops all words which have been seen fewer than minCount times.
type FrequencyTable ¶
type FrequencyTable struct {
Frequencies []Frequency
}
func ReadFrequencyTables ¶
func ReadFrequencyTables(paths ...string) (*FrequencyTable, error)
func ToFrequencyTable ¶
func ToFrequencyTable(wordCounts map[string]int) FrequencyTable
func (*FrequencyTable) ToNgramDictionary ¶
func (t *FrequencyTable) ToNgramDictionary() map[string]bool
type InfoboxChecker ¶ added in v0.2.0
type InfoboxChecker struct {
// contains filtered or unexported fields
}
func NewInfoboxChecker ¶ added in v0.2.0
func NewInfoboxChecker(want []string) (*InfoboxChecker, error)
func (*InfoboxChecker) Matches ¶ added in v0.2.0
func (r *InfoboxChecker) Matches(rawText string) bool
type Page ¶
type Page struct { Id uint32 `protobuf:"varint,1,opt,name=id,proto3" json:"id,omitempty"` Title string `protobuf:"bytes,2,opt,name=title,proto3" json:"title,omitempty"` Text string `protobuf:"bytes,3,opt,name=text,proto3" json:"text,omitempty"` // contains filtered or unexported fields }
func (*Page) Descriptor
deprecated
func (*Page) ProtoMessage ¶
func (*Page) ProtoMessage()
func (*Page) ProtoReflect ¶
func (x *Page) ProtoReflect() protoreflect.Message
type PageCategories ¶
type PageCategories struct { Pages map[uint32]*Categories `` /* 152-byte string literal not displayed */ // contains filtered or unexported fields }
func (*PageCategories) Add ¶
func (x *PageCategories) Add(child, parent uint32)
func (*PageCategories) Descriptor
deprecated
func (*PageCategories) Descriptor() ([]byte, []int)
Deprecated: Use PageCategories.ProtoReflect.Descriptor instead.
func (*PageCategories) GetPages ¶
func (x *PageCategories) GetPages() map[uint32]*Categories
func (*PageCategories) ProtoMessage ¶
func (*PageCategories) ProtoMessage()
func (*PageCategories) ProtoReflect ¶
func (x *PageCategories) ProtoReflect() protoreflect.Message
func (*PageCategories) Reset ¶
func (x *PageCategories) Reset()
func (*PageCategories) String ¶
func (x *PageCategories) String() string
type TitleIndex ¶
type TitleIndex struct { Titles map[string]uint32 `` /* 154-byte string literal not displayed */ // contains filtered or unexported fields }
func (*TitleIndex) Descriptor
deprecated
func (*TitleIndex) Descriptor() ([]byte, []int)
Deprecated: Use TitleIndex.ProtoReflect.Descriptor instead.
func (*TitleIndex) GetTitles ¶
func (x *TitleIndex) GetTitles() map[string]uint32
func (*TitleIndex) ProtoMessage ¶
func (*TitleIndex) ProtoMessage()
func (*TitleIndex) ProtoReflect ¶
func (x *TitleIndex) ProtoReflect() protoreflect.Message
func (*TitleIndex) Reset ¶
func (x *TitleIndex) Reset()
func (*TitleIndex) String ¶
func (x *TitleIndex) String() string
type XMLDocument ¶
type XMLDocument struct {
Pages []XMLPage `xml:"page"`
}
XMLDocument exists solely for extraction from pages-articles-multistream. Each individual XML document from the compressed file slices may contain one or more Pages.
func (*XMLDocument) ToProto ¶
func (d *XMLDocument) ToProto() *Document
type XMLPage ¶
type XMLPage struct { Title string `xml:"title"` // NS is the Wikipedia Namespace the page is categorized into. NS Namespace `xml:"ns"` ID uint32 `xml:"id"` Redirect XMLRedirect `yaml:",omitempty" xml:"redirect"` Revision XMLRevision `xml:"revision"` }
type XMLRedirect ¶
type XMLRedirect struct {
Title string `yaml:",omitempty" xml:"title,attr"`
}
type XMLRevision ¶
type XMLRevision struct {
Text string `xml:"text"`
}
Source Files ¶
Click to show internal directories.
Click to hide internal directories.