find

package
v0.0.125 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jul 31, 2022 License: Apache-2.0 Imports: 15 Imported by: 2

Documentation

Overview

Functions for finding documents by full text search

Functions for parsing a search query

Index

Constants

This section is empty.

Variables

View Source
var WEIGHT = []float64{0.080, 2.327, 3.040} // [BM25 words, BM25 bigrams, bit vector]

WEIGHT holds the weights obtained from logistic regression for, in order, BM25 words, BM25 bigrams, and the bit vector.

Functions

func Bigrams added in v0.0.106

func Bigrams(terms []string) []string

Bigrams constructs a slice of bigrams from pairs of terms

func LoadColMap added in v0.0.112

func LoadColMap(r io.Reader) (map[string]string, error)

LoadColMap loads the titles of the collections in the corpus. In the returned map the key is the gloss_file and the value is the title.

func LoadDocInfo added in v0.0.60

func LoadDocInfo(r io.Reader) (map[string]DocInfo, map[string]DocInfo)

LoadDocInfo loads title info for all documents.

Types

type BM25Score added in v0.0.105

type BM25Score struct {
	Document      string
	Collection    string
	Score         float64
	BitVector     float64
	ContainsTerms string
}

type Collection

type Collection struct {
	GlossFile, Title string
}

type DictQueryParser

type DictQueryParser struct {
	Tokenizer *tokenizer.DictTokenizer[*dicttypes.Word]
}

func (DictQueryParser) ParseQuery

func (parser DictQueryParser) ParseQuery(query string) []TextSegment

ParseQuery parses the query text into text segments using dictionary lookups.

type DocFinder added in v0.0.17

type DocFinder interface {
	FindDocuments(ctx context.Context, dictSearcher dictionary.ReverseIndex,
		parser QueryParser, query string, advanced bool) (*QueryResults, error)
	FindDocumentsInCol(ctx context.Context, dictSearcher dictionary.ReverseIndex,
		parser QueryParser, query, col_gloss_file string) (*QueryResults, error)
}

DocFinder finds documents.

func NewDocFinder added in v0.0.17

func NewDocFinder(tfDocFinder TermFreqDocFinder, titleFinder TitleFinder) DocFinder

NewDocFinder creates and initializes an implementation of the DocFinder interface

type DocInfo added in v0.0.60

type DocInfo struct {
	CorpusFile, GlossFile, Title, TitleCN, TitleEN, CollectionFile, CollectionTitle string
}

type DocTitleRecord added in v0.0.123

type DocTitleRecord struct {
	RawFile         string `firestore:"plain_text_file"`
	GlossFile       string `firestore:"gloss_file"`
	DocTitle        string `firestore:"title"`
	DocTitleZh      string `firestore:"title_zh"`
	DocTitleEn      string `firestore:"title_en"`
	ColGlossFile    string `firestore:"col_gloss_file"`
	ColTitle        string `firestore:"col_title"`
	ColPlusDocTitle string `firestore:"col_plus_doc_title"`
}

DocTitleRecord holds expanded document title information. Its Firestore fields are "plain_text_file", "gloss_file", "title", "title_zh", "title_en", "col_gloss_file", "col_title", and "col_plus_doc_title".

type Document

type Document struct {
	GlossFile, Title, CollectionFile, CollectionTitle, ContainsWords string
	ContainsBigrams                                                  string
	SimTitle, SimWords, SimBigram, SimBitVector, Similarity          float64
	ContainsTerms                                                    []string
	MatchDetails                                                     fulltext.MatchingText
	TitleCNMatch                                                     bool
}

func (Document) String

func (doc Document) String() string

For printing out retrieved document metadata

type QueryParser

type QueryParser interface {
	ParseQuery(query string) []TextSegment
}

QueryParser parses input queries into a slice of text segments.

func NewQueryParser added in v0.0.101

func NewQueryParser(dict map[string]*dicttypes.Word) QueryParser

NewQueryParser creates a QueryParser.

type QueryResults

type QueryResults struct {
	Query, CollectionFile        string
	NumCollections, NumDocuments int
	Collections                  []Collection
	Documents                    []Document
	Terms                        []TextSegment
	SimilarTerms                 []TextSegment
}

type TermFreqDocFinder added in v0.0.105

type TermFreqDocFinder interface {
	FindDocsTermFreq(ctx context.Context, terms []string) ([]BM25Score, error)
	FindDocsBigramFreq(ctx context.Context, bigrams []string) ([]BM25Score, error)
	FindDocsTermCo(ctx context.Context, terms []string, col string) ([]BM25Score, error)
	FindDocsBigramCo(ctx context.Context, bigrams []string, col string) ([]BM25Score, error)
}

TermFreqDocFinder finds documents by term frequency and bigram frequency, returning BM25 scores.

type TextSegment

type TextSegment struct {
	QueryText string
	DictEntry dicttypes.Word
	Senses    []dicttypes.WordSense
}

A TextSegment contains the QueryText searched for and possibly a matching dictionary entry. There will only be matching dictionary entries for Chinese words in the dictionary. Non-Chinese text, punctuation, and unknown Chinese words will have an empty (zero-value) DictEntry, since the field is not a pointer, and matching values will be included in the Senses field.

type TitleFinder added in v0.0.105

type TitleFinder interface {
	CountCollections(ctx context.Context, query string) (int, error)
	FindCollections(ctx context.Context, query string) []Collection
	FindDocsByTitle(ctx context.Context, query string) ([]Document, error)
	FindDocsByTitleInCol(ctx context.Context, query, col_gloss_file string) ([]Document, error)
	ColMap() map[string]string
	DocMap() map[string]DocInfo
}

func NewFileTitleFinder added in v0.0.107

func NewFileTitleFinder(colMap map[string]string, dInfoCN, docMap map[string]DocInfo) TitleFinder

NewFileTitleFinder initializes a TitleFinder implementation. Params:

dInfoCN: key to the map is the Chinese part of the title

func NewMysqlTitleFinder added in v0.0.105

func NewMysqlTitleFinder(ctx context.Context, database *sql.DB, colMap map[string]string, docMap map[string]DocInfo) (TitleFinder, error)

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL