Documentation
¶
Overview ¶
Library for document retrieval
Index ¶
- Constants
- func BuildDocTitleIndex(libLoader library.LibraryLoader, w io.Writer) error
- func FilterByDomain(words []SortedWordItem, domain string, wdict map[string]*dicttypes.Word) []dicttypes.Word
- func GetHeadwordArray(keywords Keywords, wdict map[string]dicttypes.Word) []dicttypes.Word
- func UpdateDictIndex(ctx context.Context, client FsClient, dict *dictionary.Dictionary, ...) error
- func UpdateDocTitleIndex(ctx context.Context, libLoader library.LibraryLoader, client FsClient, ...) error
- func WriteDocLengthToFile(dlArray []DocLength, f io.Writer)
- func WriteWFCorpus(wfStore WordFreqStore, sortedWords, sortedUnknownWords []SortedWordItem, ...) error
- type ByFrequencyDoc
- type CorpusWord
- type CorpusWordFreq
- type DocLength
- type DocumentFrequency
- type FsClient
- type IndexConfig
- type IndexState
- type IndexStore
- type Keyword
- type Keywords
- type RetrievalResult
- type SortedWF
- type SortedWordItem
- type TermFreqDocMap
- func (wfDocMap TermFreqDocMap) AddWF(vocab map[string]int, corpusFile, glossFile string, wc int)
- func (wfDocMap TermFreqDocMap) Merge(wfDocMap2 TermFreqDocMap)
- func (wfDocMap TermFreqDocMap) Put(record TermFreqDocRecord)
- func (termFreqDocMap TermFreqDocMap) WriteToFile(df DocumentFrequency, fileName string, indexConfig IndexConfig)
- type TermFreqDocRecord
- type WFDocEntry
- type WFEntry
- type WordFreqStore
Constants ¶
const BF_DOC_FILE = "bigram_freq_doc.txt"
Bigram frequencies for each file
const BigramDocFreqFile = "bigram_doc_freq.txt"
const DocFreqFile = "doc_freq.txt"
File name for document index
const DocLengthFile = "doc_length.tsv"
File name for document lengths
const KeywordIndexFile = "keyword_index.json"
File name for keyword index
const NgramCorpusFile = "ngram_frequencies.txt"
ngram frequencies for corpus
const UnknownCharsFile = "unknown.txt"
Unknown characters file
const WfCorpusFile = "word_frequencies.txt"
Word frequencies for corpus
const WfDocFile = "word_freq_doc.txt"
Word frequencies for each document
Variables ¶
This section is empty.
Functions ¶
func BuildDocTitleIndex ¶ added in v0.0.22
func BuildDocTitleIndex(libLoader library.LibraryLoader, w io.Writer) error
Builds a flat index of document titles from the hierarchical corpus. This is suitable for loading into the database, or for loading directly by the web app when running without a database.
func FilterByDomain ¶
func FilterByDomain(words []SortedWordItem, domain string, wdict map[string]*dicttypes.Word) []dicttypes.Word
Filters a slice of sorted words by domain label. A word matches if any one of its word senses matches the label.
func GetHeadwordArray ¶
Gets the dictionary definition of a slice of strings Parameters
terms: The Chinese (simplified or traditional) text of the words
Return
hws: an array of word senses
func UpdateDictIndex ¶ added in v0.0.55
func UpdateDictIndex(ctx context.Context, client FsClient, dict *dictionary.Dictionary, corpus string, generation int, domain string) error
UpdateDictIndex writes a list of dictionary words with substring arrays
func UpdateDocTitleIndex ¶ added in v0.0.54
func UpdateDocTitleIndex(ctx context.Context, libLoader library.LibraryLoader, client FsClient, corpus string, generation int) error
UpdateDocTitleIndex writes a list of document titles from the hierarchical corpus with substring arrays
func WriteDocLengthToFile ¶
Append document analysis to a plain text file in the index directory
func WriteWFCorpus ¶
func WriteWFCorpus(wfStore WordFreqStore, sortedWords, sortedUnknownWords []SortedWordItem, bFreq []ngram.BigramFreq, wc int, indexConfig IndexConfig) error
Write corpus analysis to plain text files in the index directory
Types ¶
type ByFrequencyDoc ¶
type ByFrequencyDoc []WFDocEntry
func (ByFrequencyDoc) Len ¶
func (items ByFrequencyDoc) Len() int
func (ByFrequencyDoc) Less ¶
func (items ByFrequencyDoc) Less(i, j int) bool
func (ByFrequencyDoc) Swap ¶
func (items ByFrequencyDoc) Swap(i, j int)
type CorpusWordFreq ¶
A word frequency with corpus entry label
type DocumentFrequency ¶
Map from term to number of documents referencing the term
func NewDocumentFrequency ¶
func NewDocumentFrequency() DocumentFrequency
Initializes a DocumentFrequency struct
func ReadDocumentFrequency ¶
func ReadDocumentFrequency(r io.Reader) (*DocumentFrequency, error)
ReadDocumentFrequency reads a document frequency object from a CSV file
func (*DocumentFrequency) AddDocFreq ¶
func (df *DocumentFrequency) AddDocFreq(otherDF DocumentFrequency)
Merges the given document frequency into the map and increments the counts. Param:
otherDF - the document frequency to merge in
func (*DocumentFrequency) AddVocabulary ¶
func (df *DocumentFrequency) AddVocabulary(vocab map[string]int)
Adds the given vocabulary to the map and increments the document count Param:
vocab - word frequencies are ignored, only the presence of the term is important
func (*DocumentFrequency) IDF ¶
func (df *DocumentFrequency) IDF(term string) (val float64, ok bool)
Computes the inverse document frequency for the given term Param:
term: the term to find the idf for
func (*DocumentFrequency) Write ¶
func (df *DocumentFrequency) Write(w io.Writer)
Write writes the document frequency
type FsClient ¶ added in v0.0.54
type FsClient interface {
Collection(path string) *firestore.CollectionRef
}
FsClient defines Firestore interfaces needed
type IndexConfig ¶
type IndexConfig struct {
IndexDir string
}
IndexConfig encapsulates parameters for index configuration
type IndexState ¶
type IndexState struct { KeywordIndexReady bool // contains filtered or unexported fields }
A word frequency entry record
func BuildIndex ¶
func BuildIndex(indexConfig IndexConfig, indexStore IndexStore) (*IndexState, error)
Reads word frequencies data from files into memory and builds the keyword index
type IndexStore ¶
Storage for the keyword index
type Keywords ¶
type Keywords []Keyword
func SortByWeight ¶
func SortByWeight(vocab map[string]int, completeDF DocumentFrequency) Keywords
Orders the keywords with the given frequencies in a document by tf-idf weight. Param:
vocab - word frequencies for a particular document
type RetrievalResult ¶
type RetrievalResult struct {
HTMLFile, Title, ColTitle string
Count int
HasEngTrans, HasParallel bool
}
A document-specific word frequency entry record
func FindDocsForKeyword ¶
func FindDocsForKeyword(keyword dicttypes.Word, corpusEntryMap map[string]corpus.CorpusEntry, indexState IndexState, bibnotesClient bibnotes.BibNotesClient) []RetrievalResult
Retrieves documents with title for a single keyword
type SortedWF ¶
type SortedWF struct {
// contains filtered or unexported fields
}
Sorted list of word frequencies
type SortedWordItem ¶
An entry in a sorted word array
func SortedFreq ¶
func SortedFreq(wf map[string]int) []SortedWordItem
Sorts Word structs based on frequency
type TermFreqDocMap ¶
type TermFreqDocMap map[string]TermFreqDocRecord
Remembers the word frequency for each term for each document in the corpus
func (TermFreqDocMap) AddWF ¶
func (wfDocMap TermFreqDocMap) AddWF(vocab map[string]int, corpusFile, glossFile string, wc int)
Adds a map of word frequencies for a given document to the map
func (TermFreqDocMap) Merge ¶
func (wfDocMap TermFreqDocMap) Merge(wfDocMap2 TermFreqDocMap)
Merge two TermFreqDocMap structs together
func (TermFreqDocMap) Put ¶
func (wfDocMap TermFreqDocMap) Put(record TermFreqDocRecord)
Adds a record to the map
func (TermFreqDocMap) WriteToFile ¶
func (termFreqDocMap TermFreqDocMap) WriteToFile(df DocumentFrequency, fileName string, indexConfig IndexConfig)
Append document analysis to a plain text file in the index directory
type TermFreqDocRecord ¶
type TermFreqDocRecord struct { Word string Freq, DocLength int CollectionFile string GlossFile string }
Remembers the word frequency for each term for each document in the corpus
type WFDocEntry ¶
A document-specific word frequency entry record
func FindForKeyword ¶
func FindForKeyword(keyword string, wfdoc map[string][]WFDocEntry) []WFDocEntry
Retrieves raw results for a single keyword
func (WFDocEntry) String ¶
func (item WFDocEntry) String() string