Documentation ¶
Overview ¶
Package stats provides implementations of statistic sources.
Index ¶
- func ElasticsearchAnalysedField(field string) func(*ElasticsearchStatisticsSource)
- func ElasticsearchAnalyser(analyser string) func(*ElasticsearchStatisticsSource)
- func ElasticsearchDocumentType(documentType string) func(*ElasticsearchStatisticsSource)
- func ElasticsearchHosts(hosts ...string) func(*ElasticsearchStatisticsSource)
- func ElasticsearchIndex(index string) func(*ElasticsearchStatisticsSource)
- func ElasticsearchParameters(params map[string]float64) func(*ElasticsearchStatisticsSource)
- func ElasticsearchScroll(scroll bool) func(*ElasticsearchStatisticsSource)
- func ElasticsearchSearchOptions(options SearchOptions) func(*ElasticsearchStatisticsSource)
- func EntrezAPIKey(key string) func(source *EntrezStatisticsSource)
- func EntrezDb(db string) func(source *EntrezStatisticsSource)
- func EntrezEmail(email string) func(source *EntrezStatisticsSource)
- func EntrezLimiter(limit time.Duration) func(source *EntrezStatisticsSource)
- func EntrezOptions(options SearchOptions) func(source *EntrezStatisticsSource)
- func EntrezRank(rank bool) func(source *EntrezStatisticsSource)
- func EntrezTool(tool string) func(source *EntrezStatisticsSource)
- func GetDocumentIDs(query pipeline.Query, ss StatisticsSource) ([]uint32, error)
- func LanguageModelWeights(weights []float64) func(*LanguageModel)
- type ElasticsearchStatisticsSource
- func (es *ElasticsearchStatisticsSource) Analyse(text, analyser string) (tokens []string, err error)
- func (es *ElasticsearchStatisticsSource) CollectionSize() (float64, error)
- func (es *ElasticsearchStatisticsSource) DocumentFrequency(term string, field string) (float64, error)
- func (es *ElasticsearchStatisticsSource) Execute(query gpipeline.Query, options SearchOptions) (trecresults.ResultList, error)
- func (es *ElasticsearchStatisticsSource) ExecuteFast(query gpipeline.Query, options SearchOptions) ([]uint32, error)
- func (es *ElasticsearchStatisticsSource) InverseDocumentFrequency(term, field string) (float64, error)
- func (es *ElasticsearchStatisticsSource) Parameters() map[string]float64
- func (es *ElasticsearchStatisticsSource) RetrievalSize(query cqr.CommonQueryRepresentation) (float64, error)
- func (es *ElasticsearchStatisticsSource) SearchOptions() SearchOptions
- func (es *ElasticsearchStatisticsSource) TermFrequency(term, field, document string) (float64, error)
- func (es *ElasticsearchStatisticsSource) TermVector(document string) (TermVector, error)
- func (es *ElasticsearchStatisticsSource) TotalTermFrequency(term, field string) (float64, error)
- func (es *ElasticsearchStatisticsSource) VocabularySize(field string) (float64, error)
- type EntrezStatisticsSource
- func (e EntrezStatisticsSource) CollectionSize() (float64, error)
- func (e EntrezStatisticsSource) Count(term, field string) float64
- func (e EntrezStatisticsSource) DocumentFrequency(term, field string) (float64, error)
- func (e EntrezStatisticsSource) Execute(query pipeline.Query, options SearchOptions) (trecresults.ResultList, error)
- func (e EntrezStatisticsSource) Fetch(pmids []int, options ...func(p *entrez.Parameters)) ([]guru.MedlineDocument, error)
- func (e EntrezStatisticsSource) InverseDocumentFrequency(term, field string) (float64, error)
- func (e EntrezStatisticsSource) Link(pmids []int, linkname string) ([]int, error)
- func (e EntrezStatisticsSource) MarshalEasyJSON(w *jwriter.Writer)
- func (e EntrezStatisticsSource) MarshalJSON() ([]byte, error)
- func (e EntrezStatisticsSource) Parameters() map[string]float64
- func (e EntrezStatisticsSource) RetrievalSize(query cqr.CommonQueryRepresentation) (float64, error)
- func (e EntrezStatisticsSource) Search(query string, options ...func(p *entrez.Parameters)) ([]int, error)
- func (e EntrezStatisticsSource) SearchOptions() SearchOptions
- func (e EntrezStatisticsSource) SearchSize(n int) func(p *entrez.Parameters)
- func (e EntrezStatisticsSource) SearchStart(n int) func(p *entrez.Parameters)
- func (e EntrezStatisticsSource) SetDB(db string) EntrezStatisticsSource
- func (e EntrezStatisticsSource) Summary(ids []string, value interface{}, options ...func(p *entrez.Parameters)) error
- func (e EntrezStatisticsSource) TermFrequency(term, field, document string) (float64, error)
- func (e EntrezStatisticsSource) TermVector(document string) (TermVector, error)
- func (e EntrezStatisticsSource) TotalTermFrequency(term, _ string) (float64, error)
- func (e EntrezStatisticsSource) Translation(term string) ([]string, error)
- func (e *EntrezStatisticsSource) UnmarshalEasyJSON(l *jlexer.Lexer)
- func (e *EntrezStatisticsSource) UnmarshalJSON(data []byte) error
- func (e EntrezStatisticsSource) VocabularySize(field string) (float64, error)
- type LanguageModel
- type Search
- type SearchOptions
- type StatisticsSource
- type TermProbability
- type TermVector
- type TermVectorTerm
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func ElasticsearchAnalysedField ¶
func ElasticsearchAnalysedField(field string) func(*ElasticsearchStatisticsSource)
ElasticsearchAnalysedField sets the analysed field for the statistic source.
func ElasticsearchAnalyser ¶
func ElasticsearchAnalyser(analyser string) func(*ElasticsearchStatisticsSource)
ElasticsearchAnalyser sets the analyser for the statistic source.
func ElasticsearchDocumentType ¶
func ElasticsearchDocumentType(documentType string) func(*ElasticsearchStatisticsSource)
ElasticsearchDocumentType sets the document type for the Elasticsearch client.
func ElasticsearchHosts ¶
func ElasticsearchHosts(hosts ...string) func(*ElasticsearchStatisticsSource)
ElasticsearchHosts sets the hosts for the Elasticsearch client.
func ElasticsearchIndex ¶
func ElasticsearchIndex(index string) func(*ElasticsearchStatisticsSource)
ElasticsearchIndex sets the index for the Elasticsearch client.
func ElasticsearchParameters ¶
func ElasticsearchParameters(params map[string]float64) func(*ElasticsearchStatisticsSource)
ElasticsearchParameters sets the parameters for the statistic source.
func ElasticsearchScroll ¶
func ElasticsearchScroll(scroll bool) func(*ElasticsearchStatisticsSource)
ElasticsearchScroll sets the scroll for the statistic source.
func ElasticsearchSearchOptions ¶
func ElasticsearchSearchOptions(options SearchOptions) func(*ElasticsearchStatisticsSource)
ElasticsearchSearchOptions sets the execute options for the statistic source.
func EntrezAPIKey ¶
func EntrezAPIKey(key string) func(source *EntrezStatisticsSource)
EntrezAPIKey sets the API key for entrez.
func EntrezDb ¶
func EntrezDb(db string) func(source *EntrezStatisticsSource)
EntrezDb sets the database to search.
func EntrezEmail ¶
func EntrezEmail(email string) func(source *EntrezStatisticsSource)
EntrezEmail sets the email for entrez.
func EntrezLimiter ¶
func EntrezLimiter(limit time.Duration) func(source *EntrezStatisticsSource)
EntrezLimiter sets the request rate limit for the entrez statistics source.
func EntrezOptions ¶
func EntrezOptions(options SearchOptions) func(source *EntrezStatisticsSource)
EntrezOptions sets any additional options for the entrez statistics source.
func EntrezRank ¶
func EntrezRank(rank bool) func(source *EntrezStatisticsSource)
func EntrezTool ¶
func EntrezTool(tool string) func(source *EntrezStatisticsSource)
EntrezTool sets the tool name for entrez.
func GetDocumentIDs ¶
func GetDocumentIDs(query pipeline.Query, ss StatisticsSource) ([]uint32, error)
GetDocumentIDs retrieves the document IDs for a query as fast as possible. Using Elasticsearch this will create a very fast concurrent scroll service. This method does not guarantee order.
func LanguageModelWeights ¶
func LanguageModelWeights(weights []float64) func(*LanguageModel)
LanguageModelWeights configures a language model to use the specified weights.
Types ¶
type ElasticsearchStatisticsSource ¶
type ElasticsearchStatisticsSource struct { Scroll bool Analyser string AnalyseField string // contains filtered or unexported fields }
ElasticsearchStatisticsSource is a way of gathering statistics for a collection using Elasticsearch.
func NewElasticsearchStatisticsSource ¶
func NewElasticsearchStatisticsSource(options ...func(*ElasticsearchStatisticsSource)) (*ElasticsearchStatisticsSource, error)
NewElasticsearchStatisticsSource creates a new ElasticsearchStatisticsSource using functional options.
func (*ElasticsearchStatisticsSource) Analyse ¶
func (es *ElasticsearchStatisticsSource) Analyse(text, analyser string) (tokens []string, err error)
Analyse is a specific Elasticsearch method used in the analyse transformation.
func (*ElasticsearchStatisticsSource) CollectionSize ¶
func (es *ElasticsearchStatisticsSource) CollectionSize() (float64, error)
func (*ElasticsearchStatisticsSource) DocumentFrequency ¶
func (es *ElasticsearchStatisticsSource) DocumentFrequency(term string, field string) (float64, error)
DocumentFrequency is the document frequency (the number of documents containing the current term).
func (*ElasticsearchStatisticsSource) Execute ¶
func (es *ElasticsearchStatisticsSource) Execute(query gpipeline.Query, options SearchOptions) (trecresults.ResultList, error)
Execute runs the query on Elasticsearch and returns results in trec format.
func (*ElasticsearchStatisticsSource) ExecuteFast ¶
func (es *ElasticsearchStatisticsSource) ExecuteFast(query gpipeline.Query, options SearchOptions) ([]uint32, error)
ExecuteFast executes an Elasticsearch query and retrieves only the document ids in the fastest possible way. Do not use this for ranked results as the concurrency of this method does not guarantee order.
func (*ElasticsearchStatisticsSource) InverseDocumentFrequency ¶
func (es *ElasticsearchStatisticsSource) InverseDocumentFrequency(term, field string) (float64, error)
InverseDocumentFrequency is the ratio of documents in the collection to the number of documents the term appears in, logarithmically smoothed.
func (*ElasticsearchStatisticsSource) Parameters ¶
func (es *ElasticsearchStatisticsSource) Parameters() map[string]float64
Parameters gets the immutable parameters for the statistics source.
func (*ElasticsearchStatisticsSource) RetrievalSize ¶
func (es *ElasticsearchStatisticsSource) RetrievalSize(query cqr.CommonQueryRepresentation) (float64, error)
RetrievalSize is the minimum number of documents that contain at least one of the query terms.
func (*ElasticsearchStatisticsSource) SearchOptions ¶
func (es *ElasticsearchStatisticsSource) SearchOptions() SearchOptions
SearchOptions gets the immutable execute options for the statistics source.
func (*ElasticsearchStatisticsSource) TermFrequency ¶
func (es *ElasticsearchStatisticsSource) TermFrequency(term, field, document string) (float64, error)
TermFrequency is the term frequency in the field.
func (*ElasticsearchStatisticsSource) TermVector ¶
func (es *ElasticsearchStatisticsSource) TermVector(document string) (TermVector, error)
TermVector retrieves the term vector for a document.
func (*ElasticsearchStatisticsSource) TotalTermFrequency ¶
func (es *ElasticsearchStatisticsSource) TotalTermFrequency(term, field string) (float64, error)
TotalTermFrequency is a sum of total term frequencies (the sum of total term frequencies of each term in this field).
func (*ElasticsearchStatisticsSource) VocabularySize ¶
func (es *ElasticsearchStatisticsSource) VocabularySize(field string) (float64, error)
VocabularySize is the total number of terms in the vocabulary.
type EntrezStatisticsSource ¶
type EntrezStatisticsSource struct { Limit int // The size of PubMed. N float64 // contains filtered or unexported fields }
func NewEntrezStatisticsSource ¶
func NewEntrezStatisticsSource(options ...func(source *EntrezStatisticsSource)) (EntrezStatisticsSource, error)
NewEntrezStatisticsSource creates a new entrez statistics source for searching pubmed. When an API key is specified, the entrez request Limit is raised to 10 per second instead of the default 3.
func (EntrezStatisticsSource) CollectionSize ¶
func (e EntrezStatisticsSource) CollectionSize() (float64, error)
func (EntrezStatisticsSource) Count ¶
func (e EntrezStatisticsSource) Count(term, field string) float64
func (EntrezStatisticsSource) DocumentFrequency ¶
func (e EntrezStatisticsSource) DocumentFrequency(term, field string) (float64, error)
func (EntrezStatisticsSource) Execute ¶
func (e EntrezStatisticsSource) Execute(query pipeline.Query, options SearchOptions) (trecresults.ResultList, error)
func (EntrezStatisticsSource) Fetch ¶
func (e EntrezStatisticsSource) Fetch(pmids []int, options ...func(p *entrez.Parameters)) ([]guru.MedlineDocument, error)
Fetch uses the entrez eutils to fetch the pubmed Article given a set of pubmed identifiers.
func (EntrezStatisticsSource) InverseDocumentFrequency ¶
func (e EntrezStatisticsSource) InverseDocumentFrequency(term, field string) (float64, error)
func (EntrezStatisticsSource) Link ¶
func (e EntrezStatisticsSource) Link(pmids []int, linkname string) ([]int, error)
func (EntrezStatisticsSource) MarshalEasyJSON ¶
func (e EntrezStatisticsSource) MarshalEasyJSON(w *jwriter.Writer)
MarshalEasyJSON supports easyjson.Marshaler interface
func (EntrezStatisticsSource) MarshalJSON ¶
func (e EntrezStatisticsSource) MarshalJSON() ([]byte, error)
MarshalJSON supports json.Marshaler interface
func (EntrezStatisticsSource) Parameters ¶
func (e EntrezStatisticsSource) Parameters() map[string]float64
func (EntrezStatisticsSource) RetrievalSize ¶
func (e EntrezStatisticsSource) RetrievalSize(query cqr.CommonQueryRepresentation) (float64, error)
func (EntrezStatisticsSource) Search ¶
func (e EntrezStatisticsSource) Search(query string, options ...func(p *entrez.Parameters)) ([]int, error)
Search uses the entrez eutils to get the pmids for a given query.
func (EntrezStatisticsSource) SearchOptions ¶
func (e EntrezStatisticsSource) SearchOptions() SearchOptions
func (EntrezStatisticsSource) SearchSize ¶
func (e EntrezStatisticsSource) SearchSize(n int) func(p *entrez.Parameters)
func (EntrezStatisticsSource) SearchStart ¶
func (e EntrezStatisticsSource) SearchStart(n int) func(p *entrez.Parameters)
func (EntrezStatisticsSource) SetDB ¶
func (e EntrezStatisticsSource) SetDB(db string) EntrezStatisticsSource
func (EntrezStatisticsSource) Summary ¶
func (e EntrezStatisticsSource) Summary(ids []string, value interface{}, options ...func(p *entrez.Parameters)) error
Summary uses the entrez eutils to obtain summary documents for the ids.
func (EntrezStatisticsSource) TermFrequency ¶
func (e EntrezStatisticsSource) TermFrequency(term, field, document string) (float64, error)
func (EntrezStatisticsSource) TermVector ¶
func (e EntrezStatisticsSource) TermVector(document string) (TermVector, error)
func (EntrezStatisticsSource) TotalTermFrequency ¶
func (e EntrezStatisticsSource) TotalTermFrequency(term, _ string) (float64, error)
func (EntrezStatisticsSource) Translation ¶
func (e EntrezStatisticsSource) Translation(term string) ([]string, error)
func (*EntrezStatisticsSource) UnmarshalEasyJSON ¶
func (e *EntrezStatisticsSource) UnmarshalEasyJSON(l *jlexer.Lexer)
UnmarshalEasyJSON supports easyjson.Unmarshaler interface
func (*EntrezStatisticsSource) UnmarshalJSON ¶
func (e *EntrezStatisticsSource) UnmarshalJSON(data []byte) error
UnmarshalJSON supports json.Unmarshaler interface
func (EntrezStatisticsSource) VocabularySize ¶
func (e EntrezStatisticsSource) VocabularySize(field string) (float64, error)
type LanguageModel ¶
type LanguageModel struct { DocIds []string Scores []float64 Weights []float64 TermCount map[string]float64 DocLen float64 StatisticsSource StatisticsSource VocabularySize float64 TotalTermFrequency map[string]float64 }
LanguageModel is used for query likelihood statistics.
func NewLanguageModel ¶
func NewLanguageModel(source StatisticsSource, docIds []string, scores []float64, field string, options ...func(model *LanguageModel)) (*LanguageModel, error)
NewLanguageModel creates a new language model from a statistics source using the specified documents and scores for those documents. Optionally, the language model can use weights that can be configured through the functional arguments.
func (*LanguageModel) CollectionTermProbability ¶
func (lm *LanguageModel) CollectionTermProbability(term string) float64
CollectionTermProbability is the term probability for the background language model.
func (*LanguageModel) DocumentTermProbability ¶
func (lm *LanguageModel) DocumentTermProbability(term string) float64
DocumentTermProbability is the term probability for the document language model.
func (*LanguageModel) KLDivergence ¶
func (lm *LanguageModel) KLDivergence(lambda float64, probability TermProbability) (float64, error)
KLDivergence computes the KLDivergence between the background collection and the document language model.
type Search ¶
type Search struct {
Count int `xml:"Count"`
}
func (Search) MarshalEasyJSON ¶
MarshalEasyJSON supports easyjson.Marshaler interface
func (Search) MarshalJSON ¶
MarshalJSON supports json.Marshaler interface
func (*Search) UnmarshalEasyJSON ¶
UnmarshalEasyJSON supports easyjson.Unmarshaler interface
func (*Search) UnmarshalJSON ¶
UnmarshalJSON supports json.Unmarshaler interface
type SearchOptions ¶
SearchOptions are options that the statistics source will use for retrieval.
type StatisticsSource ¶
type StatisticsSource interface { SearchOptions() SearchOptions Parameters() map[string]float64 TermFrequency(term, field, document string) (float64, error) TermVector(document string) (TermVector, error) DocumentFrequency(term, field string) (float64, error) TotalTermFrequency(term, field string) (float64, error) InverseDocumentFrequency(term, field string) (float64, error) RetrievalSize(query cqr.CommonQueryRepresentation) (float64, error) VocabularySize(field string) (float64, error) Execute(query pipeline.Query, options SearchOptions) (trecresults.ResultList, error) CollectionSize() (float64, error) }
StatisticsSource represents the way statistics are calculated for a collection.
type TermProbability ¶
type TermProbability func(model LanguageModel, term string) float64
TermProbability returns a term probability for a term in a language model.
func DirichletTermProbability ¶
func DirichletTermProbability(mu float64) TermProbability
DirichletTermProbability computes the Dirichlet distribution for a term in a language model.
func JelinekMercerTermProbability ¶
func JelinekMercerTermProbability(lambda float64) TermProbability
JelinekMercerTermProbability computes the Jelinek-Mercer probability for a term in a language model.
type TermVector ¶
type TermVector []TermVectorTerm
TermVector is a standard format for returning term vectors from statistic sources.
func (TermVector) ToPipelineQuery ¶
func (tv TermVector) ToPipelineQuery(topic, name string) pipeline.Query
ToPipelineQuery creates a pipeline query from a term vector. This can be used to perform analysis on documents (since the term vector is a representation of a document).