documents

package
v0.3.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Aug 1, 2024 License: Apache-2.0 Imports: 14 Imported by: 0

Documentation

Index

Constants

View Source
// Namespace identifiers. The first sixteen constants are assigned the
// consecutive values 0 through 15 by iota; the remaining ones carry explicit
// numbers. Each subject namespace is immediately followed by its "Talk"
// counterpart, whose value is one greater.
//
// The numbers appear to follow MediaWiki's built-in namespace numbering
// (0 = main/article, 100 = Portal, 828 = Module, ...) — confirm against the
// MediaWiki namespace reference before relying on that.
const (
	// Values 0-15: typed as Namespace, generated by iota.
	NamespaceArticle Namespace = iota
	NamespaceTalk
	NamespaceUser
	NamespaceUserTalk
	NamespaceWikipedia
	NamespaceWikipediaTalk
	NamespaceFile
	NamespaceFileTalk
	NamespaceMediaWiki
	NamespaceMediaWikiTalk
	NamespaceTemplate
	NamespaceTemplateTalk
	NamespaceHelp
	NamespaceHelpTalk
	NamespaceCategory
	NamespaceCategoryTalk
	// NOTE(review): the constants below give an explicit value but no type, so
	// under Go's constant-declaration rules they are untyped integer constants
	// rather than Namespace values (unlike the iota-derived ones above). They
	// still convert implicitly when assigned to or compared with a Namespace.
	// If typed constants are intended, write e.g. `Namespace = 100` — but
	// check first that no caller relies on the untyped form.
	NamespacePortal        = 100
	NamespacePortalTalk    = 101
	NamespaceDraft         = 118
	NamespaceDraftTalk     = 119
	NamespaceTimedText     = 710
	NamespaceTimedTextTalk = 711
	NamespaceModule        = 828
	NamespaceModuleTalk    = 829
)

Variables

View Source
var File_pkg_documents_documents_proto protoreflect.FileDescriptor

Functions

func DisambiguateTags

func DisambiguateTags(page, category string) string

func PersonInfoboxes added in v0.2.0

func PersonInfoboxes() []string

func ReadPages

func ReadPages(pages chan<- *Page) func([]byte) error

func WriteFrequencyTable

func WriteFrequencyTable(out string, t FrequencyTable) error

Types

type Categories

// Categories is a protobuf message wrapping a list of uint32 values.
// Presumably each value is the page ID of a category (see
// PageCategories.Add(child, parent)) — confirm against the .proto file.
type Categories struct {
	// Categories is protobuf field 1: a repeated, packed varint list.
	Categories []uint32 `protobuf:"varint,1,rep,packed,name=categories,proto3" json:"categories,omitempty"`
	// contains filtered or unexported fields
}

func (*Categories) Add

func (x *Categories) Add(parent uint32)

func (*Categories) Descriptor deprecated

func (*Categories) Descriptor() ([]byte, []int)

Deprecated: Use Categories.ProtoReflect.Descriptor instead.

func (*Categories) GetCategories

func (x *Categories) GetCategories() []uint32

func (*Categories) ProtoMessage

func (*Categories) ProtoMessage()

func (*Categories) ProtoReflect

func (x *Categories) ProtoReflect() protoreflect.Message

func (*Categories) Reset

func (x *Categories) Reset()

func (*Categories) String

func (x *Categories) String() string

type Categorizer

// Categorizer holds the TitleIndex used by its Categorize method, which takes
// a Page and returns its Categories.
// NOTE(review): presumably TitleIndex resolves category titles found in the
// page to IDs — the implementation is not visible here; confirm.
type Categorizer struct {
	// TitleIndex is the title-to-ID index available to Categorize.
	TitleIndex *TitleIndex
}

func (*Categorizer) Categorize

func (c *Categorizer) Categorize(page *Page) *Categories

type Document

// Document is a protobuf message containing a list of Pages. It is the type
// returned by XMLDocument.ToProto.
type Document struct {
	// Pages is protobuf field 1: the repeated Page messages of this document.
	Pages []*Page `protobuf:"bytes,1,rep,name=pages,proto3" json:"pages,omitempty"`
	// contains filtered or unexported fields
}

func (*Document) Descriptor deprecated

func (*Document) Descriptor() ([]byte, []int)

Deprecated: Use Document.ProtoReflect.Descriptor instead.

func (*Document) GetPages

func (x *Document) GetPages() []*Page

func (*Document) ProtoMessage

func (*Document) ProtoMessage()

func (*Document) ProtoReflect

func (x *Document) ProtoReflect() protoreflect.Message

func (*Document) Reset

func (x *Document) Reset()

func (*Document) String

func (x *Document) String() string

type Frequency

// Frequency pairs a word with an integer count. It is the element type of
// FrequencyTable.Frequencies.
type Frequency struct {
	// Word is the word being counted.
	Word string
	// Count is the count recorded for Word (presumably its number of
	// occurrences — confirm against the code that populates it).
	Count int
}

type FrequencyMap

// FrequencyMap holds integer counts keyed by string. Its Collect method reads
// words from a channel, and Filter drops words seen fewer than a minimum
// number of times.
type FrequencyMap struct {
	// Counts maps a word to its count (presumably the number of times the
	// word has been seen — confirm against Collect's implementation).
	Counts map[string]int
}

func (*FrequencyMap) Collect

func (f *FrequencyMap) Collect(words <-chan string)

Collect reads the words from a channel into the frequency map.

func (*FrequencyMap) CollectMaps

func (f *FrequencyMap) CollectMaps(
	wordCountChannel <-chan map[string]int,
	countFilter,
	sizeThreshold int,
) *sync.WaitGroup

func (*FrequencyMap) Filter

func (f *FrequencyMap) Filter(minCount int)

Filter drops all words which have been seen fewer than minCount times.

type FrequencyTable

// FrequencyTable is a list of Frequency (word, count) entries. It can be
// built from a map[string]int with ToFrequencyTable, read from files with
// ReadFrequencyTables, and written with WriteFrequencyTable.
type FrequencyTable struct {
	// Frequencies is the list of entries.
	// NOTE(review): any ordering guarantee is not visible here — confirm.
	Frequencies []Frequency
}

func ReadFrequencyTables

func ReadFrequencyTables(paths ...string) (*FrequencyTable, error)

func ToFrequencyTable

func ToFrequencyTable(wordCounts map[string]int) FrequencyTable

func (*FrequencyTable) ToNgramDictionary

func (t *FrequencyTable) ToNgramDictionary() map[string]bool

type InfoboxChecker added in v0.2.0

// InfoboxChecker has no exported fields; construct one with
// NewInfoboxChecker(want). Its Matches method takes a page's raw text and
// returns a bool.
// NOTE(review): presumably Matches reports whether the text contains one of
// the wanted infoboxes — the implementation is not visible here; confirm.
type InfoboxChecker struct {
	// contains filtered or unexported fields
}

func NewInfoboxChecker added in v0.2.0

func NewInfoboxChecker(want []string) (*InfoboxChecker, error)

func (*InfoboxChecker) Matches added in v0.2.0

func (r *InfoboxChecker) Matches(rawText string) bool

type Namespace

// Namespace is the 16-bit signed integer type of the Namespace* constants and
// of XMLPage.NS, the Wikipedia namespace a page is categorized into.
type Namespace int16

type Page

// Page is a protobuf message describing a single page: a numeric ID, a title,
// and a text body. It is the type returned by XMLPage.ToProto.
type Page struct {
	// Id is protobuf field 1: the page's numeric identifier.
	Id uint32 `protobuf:"varint,1,opt,name=id,proto3" json:"id,omitempty"`
	// Title is protobuf field 2: the page title.
	Title string `protobuf:"bytes,2,opt,name=title,proto3" json:"title,omitempty"`
	// Text is protobuf field 3: the page text (presumably the raw wikitext of
	// the page's revision — confirm against XMLPage.ToProto).
	Text string `protobuf:"bytes,3,opt,name=text,proto3" json:"text,omitempty"`
	// contains filtered or unexported fields
}

func (*Page) Descriptor deprecated

func (*Page) Descriptor() ([]byte, []int)

Deprecated: Use Page.ProtoReflect.Descriptor instead.

func (*Page) GetID

func (x *Page) GetID() uint32

func (*Page) GetText

func (x *Page) GetText() string

func (*Page) GetTitle

func (x *Page) GetTitle() string

func (*Page) ID

func (x *Page) ID() uint32

func (*Page) ProtoMessage

func (*Page) ProtoMessage()

func (*Page) ProtoReflect

func (x *Page) ProtoReflect() protoreflect.Message

func (*Page) Reset

func (x *Page) Reset()

func (*Page) String

func (x *Page) String() string

type PageCategories

// PageCategories is a protobuf message mapping a uint32 key to a Categories
// message.
// NOTE(review): given Add(child, parent uint32), the key is presumably a page
// ID and the value the set of its parent category IDs — confirm.
type PageCategories struct {
	// Pages maps a uint32 key to that key's Categories message.
	Pages map[uint32]*Categories `` /* 152-byte string literal not displayed */
	// contains filtered or unexported fields
}

func (*PageCategories) Add

func (x *PageCategories) Add(child, parent uint32)

func (*PageCategories) Descriptor deprecated

func (*PageCategories) Descriptor() ([]byte, []int)

Deprecated: Use PageCategories.ProtoReflect.Descriptor instead.

func (*PageCategories) GetPages

func (x *PageCategories) GetPages() map[uint32]*Categories

func (*PageCategories) ProtoMessage

func (*PageCategories) ProtoMessage()

func (*PageCategories) ProtoReflect

func (x *PageCategories) ProtoReflect() protoreflect.Message

func (*PageCategories) Reset

func (x *PageCategories) Reset()

func (*PageCategories) String

func (x *PageCategories) String() string

type TitleIndex

// TitleIndex is a protobuf message mapping a string to a uint32. It is used
// by Categorizer.
// NOTE(review): presumably the key is a page title and the value that page's
// ID — confirm against the .proto file.
type TitleIndex struct {
	// Titles maps a string key (presumably a title) to a uint32 value.
	Titles map[string]uint32 `` /* 154-byte string literal not displayed */
	// contains filtered or unexported fields
}

func (*TitleIndex) Descriptor deprecated

func (*TitleIndex) Descriptor() ([]byte, []int)

Deprecated: Use TitleIndex.ProtoReflect.Descriptor instead.

func (*TitleIndex) GetTitles

func (x *TitleIndex) GetTitles() map[string]uint32

func (*TitleIndex) ProtoMessage

func (*TitleIndex) ProtoMessage()

func (*TitleIndex) ProtoReflect

func (x *TitleIndex) ProtoReflect() protoreflect.Message

func (*TitleIndex) Reset

func (x *TitleIndex) Reset()

func (*TitleIndex) String

func (x *TitleIndex) String() string

type WordSet

// WordSet pairs an article ID with the sorted list of that document's top
// words, each stored as a uint16.
// NOTE(review): the uint16 values are presumably indices into a word
// dictionary — confirm against the code that builds them.
type WordSet struct {
	// ID is the article ID
	ID int
	// Words is the sorted list of top words in the document.
	Words []uint16
}

type WordSets

// WordSets groups the WordSet entries of several documents together with an
// input-file string.
type WordSets struct {
	// InFile is serialized to JSON as "in_file" and omitted when empty.
	// Presumably the path of the input file the sets were computed from —
	// confirm.
	InFile string `json:"in_file,omitempty"`
	// Documents holds one WordSet per document.
	Documents []WordSet
}

type XMLDocument

// XMLDocument is the XML decoding target for a document containing <page>
// elements. ToProto converts it to a Document.
type XMLDocument struct {
	// Pages collects every <page> child element.
	Pages []XMLPage `xml:"page"`
}

XMLDocument exists solely for extracting pages from the pages-articles-multistream dump. Individual XML documents from the compressed file slices may contain one or more Pages.

func (*XMLDocument) ToProto

func (d *XMLDocument) ToProto() *Document

type XMLPage

// XMLPage is the XML decoding target for a single <page> element. ToProto
// converts it to a Page.
type XMLPage struct {
	// Title is read from the <title> child element.
	Title string `xml:"title"`
	// NS is the Wikipedia Namespace the page is categorized into.
	NS Namespace `xml:"ns"`
	// ID is read from the <id> child element.
	ID uint32 `xml:"id"`
	// Redirect is read from the <redirect> child element; it is omitted from
	// YAML output when empty.
	Redirect XMLRedirect `yaml:",omitempty" xml:"redirect"`
	// Revision is read from the <revision> child element.
	Revision XMLRevision `xml:"revision"`
}

func (*XMLPage) ToProto

func (p *XMLPage) ToProto() *Page

type XMLRedirect

// XMLRedirect is the XML decoding target for a page's <redirect> element.
type XMLRedirect struct {
	// Title is read from the element's "title" attribute and is omitted from
	// YAML output when empty. Presumably the title of the redirect target —
	// confirm.
	Title string `yaml:",omitempty" xml:"title,attr"`
}

type XMLRevision

// XMLRevision is the XML decoding target for a page's <revision> element.
type XMLRevision struct {
	// Text is read from the <text> child element.
	Text string `xml:"text"`
}

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL