analysis

package
v0.0.55
Published: Aug 16, 2022 License: Apache-2.0 Imports: 22 Imported by: 0

Documentation

Overview

Package analysis provides vocabulary analysis of a monolingual Chinese text corpus.

This includes:

- reading the corpus documents from disk
- tokenization of the corpus into multi-character arrays
- computation of term and bigram frequencies
- compilation of an index for later full text search
- computation of term occurrence and usage in the corpus

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func GetDocFrequencies

func GetDocFrequencies(libLoader library.LibraryLoader,
	dictTokenizer tokenizer.Tokenizer,
	dict *dictionary.Dictionary) (*index.DocumentFrequency, error)

GetDocFrequencies computes term document frequencies for the corpus
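
A minimal sketch of calling GetDocFrequencies is shown below. The helper name docFreq is hypothetical, imports are elided, and the loader, tokenizer, and dictionary are assumed to be initialized elsewhere (their constructors live in other packages):

// docFreq computes per-term document frequencies for the corpus.
func docFreq(loader library.LibraryLoader, tok tokenizer.Tokenizer,
	dict *dictionary.Dictionary) (*index.DocumentFrequency, error) {
	df, err := analysis.GetDocFrequencies(loader, tok, dict)
	if err != nil {
		return nil, fmt.Errorf("computing document frequencies: %v", err)
	}
	return df, nil
}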

func Subtract

func Subtract(headwords, subtract []dicttypes.Word) []dicttypes.Word

Subtract subtracts the items in the second list from the first
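
For example, Subtract can be used to drop an exclusion list from a set of headwords. A brief sketch (the helper name and word lists are hypothetical; imports are elided):

// filterHeadwords removes every word in exclude from headwords.
func filterHeadwords(headwords, exclude []dicttypes.Word) []dicttypes.Word {
	return analysis.Subtract(headwords, exclude)
}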

func WriteCorpus

func WriteCorpus(collections []corpus.CollectionEntry,
	outputConfig generator.HTMLOutPutConfig,
	libLoader library.LibraryLoader, dictTokenizer tokenizer.Tokenizer,
	indexConfig index.IndexConfig, dict *dictionary.Dictionary,
	c config.AppConfig, corpusConfig corpus.CorpusConfig) (*index.IndexState, error)

WriteCorpus writes all the collections in the given corpus to HTML. The collections parameter is the set of collections to write.

func WriteCorpusAll

func WriteCorpusAll(libLoader library.LibraryLoader,
	dictTokenizer tokenizer.Tokenizer, outputConfig generator.HTMLOutPutConfig,
	indexConfig index.IndexConfig, dict *dictionary.Dictionary,
	c config.AppConfig) (*index.IndexState, error)

WriteCorpusAll writes all the collections in the default corpus (the collections.csv file)
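
A sketch of generating the whole default corpus is given below; the helper name generateAll is hypothetical, imports are elided, and all arguments are assumed to be loaded elsewhere. WriteCorpus and WriteCorpusCol follow the same pattern for a specific set of collections or a single collection file:

// generateAll writes HTML for every collection in collections.csv and
// returns the resulting index state.
func generateAll(loader library.LibraryLoader, tok tokenizer.Tokenizer,
	outputConfig generator.HTMLOutPutConfig, indexConfig index.IndexConfig,
	dict *dictionary.Dictionary, appConfig config.AppConfig) (*index.IndexState, error) {
	indexState, err := analysis.WriteCorpusAll(loader, tok, outputConfig,
		indexConfig, dict, appConfig)
	if err != nil {
		return nil, fmt.Errorf("writing corpus: %v", err)
	}
	return indexState, nil
}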

func WriteCorpusCol

func WriteCorpusCol(collectionFile string, libLoader library.LibraryLoader,
	dictTokenizer tokenizer.Tokenizer, outputConfig generator.HTMLOutPutConfig,
	corpusConfig corpus.CorpusConfig, dict *dictionary.Dictionary,
	c config.AppConfig) error

WriteCorpusCol writes a corpus document collection to HTML, including all the entries contained in the collection. The collectionFile parameter is the name of the collection file.

func WriteHwFiles

func WriteHwFiles(dep HWFileDependencies) error

WriteHwFiles writes the dictionary headword entries to HTML files, using the dependencies supplied in dep.

func WriteLibraryFile

func WriteLibraryFile(lib library.Library, corpora []library.CorpusData,
	outputFile string, outputConfig generator.HTMLOutPutConfig)

WriteLibraryFile writes an HTML file describing the corpora in the library.

This is used both for the public site and for the translation portal (which requires a login).
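
A brief sketch of calling WriteLibraryFile; the helper name and output file name are hypothetical, and imports are elided:

// writeLibraryPage writes an HTML page listing the corpora in the library.
func writeLibraryPage(lib library.Library, corpora []library.CorpusData,
	outputConfig generator.HTMLOutPutConfig) {
	// "library.html" is a hypothetical output file name
	analysis.WriteLibraryFile(lib, corpora, "library.html", outputConfig)
}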

Types

type CollectionAResults

type CollectionAResults struct {
	Vocab             map[string]int
	Bigrams           map[string]int
	Usage             map[string]string
	BigramFrequencies ngram.BigramFreqMap
	Collocations      ngram.CollocationMap
	WC, CCount        int
	UnknownChars      map[string]int
	WFDocMap          index.TermFreqDocMap
	BigramDocMap      index.TermFreqDocMap
	DocFreq           index.DocumentFrequency
	BigramDF          index.DocumentFrequency
	DocLengthArray    []index.DocLength
}

CollectionAResults holds the analysis results for a collection

func NewCollectionAResults

func NewCollectionAResults() CollectionAResults

NewCollectionAResults creates an empty CollectionAResults

func ParseText

func ParseText(text string, colTitle string, document *corpus.CorpusEntry, dictTokenizer tokenizer.Tokenizer, corpusConfig corpus.CorpusConfig, dict *dictionary.Dictionary) (list.List, *CollectionAResults)

ParseText tokenizes a Chinese text corpus document into terms. Parameters:

text: the string to parse
colTitle: optional parameter used for tracing collocation usage
document: optional parameter used for tracing collocation usage

Returns:

tokens: the tokens for the parsed text
results: vocabulary analysis results
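
A sketch of tokenizing a single document with ParseText; the helper name and collection title are hypothetical, imports are elided, and the remaining arguments are assumed to be initialized elsewhere:

// parseDoc tokenizes one corpus document and returns its analysis results.
func parseDoc(text string, entry *corpus.CorpusEntry, tok tokenizer.Tokenizer,
	corpusConfig corpus.CorpusConfig, dict *dictionary.Dictionary) *analysis.CollectionAResults {
	// "Sample Collection" is a hypothetical collection title, used only for
	// tracing collocation usage.
	tokens, results := analysis.ParseText(text, "Sample Collection", entry,
		tok, corpusConfig, dict)
	log.Printf("parsed %d tokens", tokens.Len())
	return results
}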

func (*CollectionAResults) AddResults

func (results *CollectionAResults) AddResults(more *CollectionAResults)

AddResults adds more results to this set of results
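
The usual pattern is to accumulate per-document results into one set of results for the whole collection, roughly as in this sketch (the helper name is hypothetical):

// accumulate merges per-document analysis results into a single
// CollectionAResults value for the collection.
func accumulate(perDoc []*analysis.CollectionAResults) analysis.CollectionAResults {
	total := analysis.NewCollectionAResults()
	for _, r := range perDoc {
		total.AddResults(r)
	}
	return total
}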

func (*CollectionAResults) GetHeadwords

func (results *CollectionAResults) GetHeadwords(wdict map[string]*dicttypes.Word) []dicttypes.Word

GetHeadwords returns the subset of words that are lexical (content) words

func (*CollectionAResults) GetLexicalWordFreq

func (results *CollectionAResults) GetLexicalWordFreq(sortedWords []index.SortedWordItem,
	wdict map[string]*dicttypes.Word) []wFResult

GetLexicalWordFreq returns word frequencies for the subset of words that are lexical (content) words

func (*CollectionAResults) GetWordFreq

func (results *CollectionAResults) GetWordFreq(sortedWords []index.SortedWordItem,
	wdict map[string]*dicttypes.Word) []wFResult

GetWordFreq returns word frequencies for the given sorted words

type DictEntry

type DictEntry struct {
	Title            string
	Headword         dicttypes.Word
	RelevantDocs     []index.RetrievalResult
	ContainsByDomain []dicttypes.Word
	Contains         []dicttypes.Word
	Collocations     []ngram.BigramFreq
	UsageArr         []wordUsage
	DateUpdated      string
}

DictEntry holds content used for writing a dictionary entry to HTML

type Glossary

type Glossary struct {
	Domain string
	Words  dicttypes.Words
}

Glossary holds the words in a glossary for a particular subject domain

func MakeGlossary

func MakeGlossary(domain string, headwords []dicttypes.Word) Glossary

MakeGlossary makes a glossary by filtering the headwords by the domain label and sorting by Chinese pinyin.
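
A sketch combining GetHeadwords and MakeGlossary to build a glossary for one subject domain; the helper name and the domain label are hypothetical, and imports are elided:

// domainGlossary builds a glossary of lexical words for a single domain.
func domainGlossary(results *analysis.CollectionAResults,
	wdict map[string]*dicttypes.Word) analysis.Glossary {
	headwords := results.GetHeadwords(wdict)
	// "Buddhism" is a hypothetical domain label
	return analysis.MakeGlossary("Buddhism", headwords)
}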

type HWFileDependencies added in v0.0.41

type HWFileDependencies struct {
	Loader         library.LibraryLoader
	DictTokenizer  tokenizer.Tokenizer
	OutputConfig   generator.HTMLOutPutConfig
	IndexState     index.IndexState
	Dict           *dictionary.Dictionary
	VocabAnalysis  VocabAnalysis
	Hww            HeadwordWriter
	BibNotesClient bibnotes.BibNotesClient
}
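
A sketch of assembling HWFileDependencies and passing it to WriteHwFiles; the helper name is hypothetical, imports are elided, and every field is assumed to be initialized elsewhere:

// writeHeadwords writes the dictionary headword entries to HTML.
func writeHeadwords(loader library.LibraryLoader, tok tokenizer.Tokenizer,
	outputConfig generator.HTMLOutPutConfig, indexState index.IndexState,
	dict *dictionary.Dictionary, va analysis.VocabAnalysis,
	hww analysis.HeadwordWriter, bnClient bibnotes.BibNotesClient) error {
	dep := analysis.HWFileDependencies{
		Loader:         loader,
		DictTokenizer:  tok,
		OutputConfig:   outputConfig,
		IndexState:     indexState,
		Dict:           dict,
		VocabAnalysis:  va,
		Hww:            hww,
		BibNotesClient: bnClient,
	}
	return analysis.WriteHwFiles(dep)
}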

type HeadwordWriter added in v0.0.31

type HeadwordWriter interface {
	NewWriter(hwId int) io.Writer
	CloseWriter(hwId int)
}

HeadwordWriter manages files for writing headwords to HTML
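
A minimal sketch of one possible HeadwordWriter implementation that writes each headword to its own file; the type name, directory layout, and file naming are assumptions, and imports are elided:

// fileHeadwordWriter writes each headword entry to <outDir>/<hwId>.html.
type fileHeadwordWriter struct {
	outDir string
	files  map[int]*os.File
}

// newFileHeadwordWriter is a hypothetical constructor for the sketch.
func newFileHeadwordWriter(outDir string) *fileHeadwordWriter {
	return &fileHeadwordWriter{outDir: outDir, files: map[int]*os.File{}}
}

// NewWriter opens an output file for the given headword id.
func (w *fileHeadwordWriter) NewWriter(hwId int) io.Writer {
	f, err := os.Create(filepath.Join(w.outDir, fmt.Sprintf("%d.html", hwId)))
	if err != nil {
		log.Printf("could not create file for headword %d: %v", hwId, err)
		return io.Discard
	}
	w.files[hwId] = f
	return f
}

// CloseWriter closes the file for the given headword id.
func (w *fileHeadwordWriter) CloseWriter(hwId int) {
	if f, ok := w.files[hwId]; ok {
		f.Close()
		delete(w.files, hwId)
	}
}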

type VocabAnalysis

type VocabAnalysis struct {
	UsageMap     map[string]*[]wordUsage
	WFTotal      map[*index.CorpusWord]index.CorpusWordFreq
	WCTotal      map[string]int
	Collocations ngram.CollocationMap
}

VocabAnalysis bundles up the results of vocabulary analysis

func GetWordFrequencies

func GetWordFrequencies(libLoader library.LibraryLoader,
	dictTokenizer tokenizer.Tokenizer,
	dict *dictionary.Dictionary) (*VocabAnalysis, error)

GetWordFrequencies computes word frequencies, collocations, and usage for the corpus
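
A sketch of running the full vocabulary analysis; the helper name is hypothetical, imports are elided, and the arguments are assumed to be initialized elsewhere:

// analyzeVocabulary computes word frequencies, collocations, and usage
// for the whole corpus.
func analyzeVocabulary(loader library.LibraryLoader, tok tokenizer.Tokenizer,
	dict *dictionary.Dictionary) (*analysis.VocabAnalysis, error) {
	va, err := analysis.GetWordFrequencies(loader, tok, dict)
	if err != nil {
		return nil, fmt.Errorf("vocabulary analysis failed: %v", err)
	}
	log.Printf("found %d distinct terms", len(va.WCTotal))
	return va, nil
}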
