Documentation ¶
Overview ¶
Package for vocabulary analysis of a monolingual Chinese text corpus
This includes: reading the corpus documents from disk; tokenization of the corpus into multi-character arrays; computation of term and bigram frequencies; compilation of an index for later full text search; and computation of term occurrence and usage in the corpus.
Index ¶
- func GetDocFrequencies(libLoader library.LibraryLoader, dictTokenizer tokenizer.Tokenizer, ...) (*index.DocumentFrequency, error)
- func Subtract(headwords, subtract []dicttypes.Word) []dicttypes.Word
- func WriteCorpus(collections []corpus.CollectionEntry, outputConfig generator.HTMLOutPutConfig, ...) (*index.IndexState, error)
- func WriteCorpusAll(libLoader library.LibraryLoader, dictTokenizer tokenizer.Tokenizer, ...) (*index.IndexState, error)
- func WriteCorpusCol(collectionFile string, libLoader library.LibraryLoader, ...) error
- func WriteHwFiles(dep HWFileDependencies) error
- func WriteLibraryFile(lib library.Library, corpora []library.CorpusData, outputFile string, ...)
- type CollectionAResults
- func (results *CollectionAResults) AddResults(more *CollectionAResults)
- func (results *CollectionAResults) GetHeadwords(wdict map[string]*dicttypes.Word) []dicttypes.Word
- func (results *CollectionAResults) GetLexicalWordFreq(sortedWords []index.SortedWordItem, wdict map[string]*dicttypes.Word) []wFResult
- func (results *CollectionAResults) GetWordFreq(sortedWords []index.SortedWordItem, wdict map[string]*dicttypes.Word) []wFResult
- type DictEntry
- type Glossary
- type HWFileDependencies
- type HeadwordWriter
- type VocabAnalysis
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func GetDocFrequencies ¶
func GetDocFrequencies(libLoader library.LibraryLoader, dictTokenizer tokenizer.Tokenizer, dict *dictionary.Dictionary) (*index.DocumentFrequency, error)
GetDocFrequencies computes word document frequencies for the corpus.
func WriteCorpus ¶
func WriteCorpus(collections []corpus.CollectionEntry, outputConfig generator.HTMLOutPutConfig, libLoader library.LibraryLoader, dictTokenizer tokenizer.Tokenizer, indexConfig index.IndexConfig, dict *dictionary.Dictionary, c config.AppConfig, corpusConfig corpus.CorpusConfig) (*index.IndexState, error)
WriteCorpus writes all the collections in the given corpus. collections: the set of collections to write to HTML. baseDir: the base directory to use to write the files.
func WriteCorpusAll ¶
func WriteCorpusAll(libLoader library.LibraryLoader, dictTokenizer tokenizer.Tokenizer, outputConfig generator.HTMLOutPutConfig, indexConfig index.IndexConfig, dict *dictionary.Dictionary, c config.AppConfig) (*index.IndexState, error)
WriteCorpusAll writes all the collections in the default corpus (collections.csv file).
func WriteCorpusCol ¶
func WriteCorpusCol(collectionFile string, libLoader library.LibraryLoader, dictTokenizer tokenizer.Tokenizer, outputConfig generator.HTMLOutPutConfig, corpusConfig corpus.CorpusConfig, dict *dictionary.Dictionary, c config.AppConfig) error
WriteCorpusCol writes a corpus document collection to HTML, including all the entries contained in the collection. collectionFile: the name of the collection file.
func WriteHwFiles ¶
func WriteHwFiles(dep HWFileDependencies) error
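WriteHwFiles writes dictionary headword entries. The signature quoted below does not match the current declaration (which takes a single HWFileDependencies value) and appears to be left over from an earlier version: func WriteHwFiles(loader library.LibraryLoader,
dictTokenizer tokenizer.Tokenizer, outputConfig generator.HTMLOutPutConfig, indexState index.IndexState, wdict map[string]dicttypes.Word, vocabAnalysis VocabAnalysis, hww HeadwordWriter) error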
func WriteLibraryFile ¶
func WriteLibraryFile(lib library.Library, corpora []library.CorpusData, outputFile string, outputConfig generator.HTMLOutPutConfig)
WriteLibraryFile writes an HTML file describing the corpora in the library.
This is for both public use and the translation portal (requiring login).
Types ¶
type CollectionAResults ¶
type CollectionAResults struct { Vocab map[string]int Bigrams map[string]int Usage map[string]string BigramFrequencies ngram.BigramFreqMap Collocations ngram.CollocationMap WC, CCount int UnknownChars map[string]int WFDocMap index.TermFreqDocMap BigramDocMap index.TermFreqDocMap DocFreq index.DocumentFrequency BigramDF index.DocumentFrequency DocLengthArray []index.DocLength }
A struct to hold the analysis results for the collection
func NewCollectionAResults ¶
func NewCollectionAResults() CollectionAResults
Constructor for empty CollectionAResults
func ParseText ¶
func ParseText(text string, colTitle string, document *corpus.CorpusEntry, dictTokenizer tokenizer.Tokenizer, corpusConfig corpus.CorpusConfig, dict *dictionary.Dictionary) (list.List, *CollectionAResults)
ParseText tokenizes a Chinese text corpus document into terms. Parameters:
text: the string to parse; colTitle: optional parameter used for tracing collocation usage; document: optional parameter used for tracing collocation usage.
Returns:
tokens: the tokens for the parsed text; results: vocabulary analysis results.
func (*CollectionAResults) AddResults ¶
func (results *CollectionAResults) AddResults(more *CollectionAResults)
Add more results to this set of results
func (*CollectionAResults) GetHeadwords ¶
Returns the subset of words that are lexical (content) words
func (*CollectionAResults) GetLexicalWordFreq ¶
func (results *CollectionAResults) GetLexicalWordFreq(sortedWords []index.SortedWordItem, wdict map[string]*dicttypes.Word) []wFResult
Returns the subset of words that are lexical (content) words
func (*CollectionAResults) GetWordFreq ¶
func (results *CollectionAResults) GetWordFreq(sortedWords []index.SortedWordItem, wdict map[string]*dicttypes.Word) []wFResult
Returns the subset of words that are lexical (content) words
type DictEntry ¶
type DictEntry struct { Title string Headword dicttypes.Word RelevantDocs []index.RetrievalResult ContainsByDomain []dicttypes.Word Contains []dicttypes.Word Collocations []ngram.BigramFreq UsageArr []wordUsage DateUpdated string }
DictEntry holds content used for writing a dictionary entry to HTML
type HWFileDependencies ¶ added in v0.0.41
type HWFileDependencies struct { Loader library.LibraryLoader DictTokenizer tokenizer.Tokenizer OutputConfig generator.HTMLOutPutConfig IndexState index.IndexState Dict *dictionary.Dictionary VocabAnalysis VocabAnalysis Hww HeadwordWriter BibNotesClient bibnotes.BibNotesClient }
type HeadwordWriter ¶ added in v0.0.31
HeadwordWriter manages files for writing headwords to HTML.
type VocabAnalysis ¶
type VocabAnalysis struct { UsageMap map[string]*[]wordUsage WFTotal map[*index.CorpusWord]index.CorpusWordFreq WCTotal map[string]int Collocations ngram.CollocationMap }
VocabAnalysis bundles up vocabulary analysis
func GetWordFrequencies ¶
func GetWordFrequencies(libLoader library.LibraryLoader, dictTokenizer tokenizer.Tokenizer, dict *dictionary.Dictionary) (*VocabAnalysis, error)
GetWordFrequencies computes word frequencies, collocations, and usage for the corpus.