types

package

Version: v0.1.1-0...-d12b490 | Latest | Published: Nov 29, 2015 | License: Apache-2.0 | Imports: 4 | Imported by: 4

Details

Valid go.mod file

The Go module system was introduced in Go 1.11 and is the official dependency management solution for Go.
Redistributable license

Redistributable licenses place minimal restrictions on how software can be used, modified, and redistributed.
Tagged version

Modules with tagged versions give importers more predictable builds.
Stable version

When a project reaches major version v1 it is considered stable.
Learn more about best practices

Repository

github.com/henrylee2cn/wukong

Links

Open Source Insights

Documentation ¶

Index ¶

Constants
type BM25Parameters
type DocInfo
type DocInfosShard
type DocumentIndex
type DocumentIndexData
type EngineInitOptions
- func (options *EngineInitOptions) Init()
type IndexedDocument
type IndexerInitOptions
type InvertedIndexShard
type KeywordIndex
type KeywordIndices
type RankByBM25
- func (rule RankByBM25) Score(doc IndexedDocument, fields interface{}) []float32
type RankOptions
type ScoredDocument
type ScoredDocuments
- func (docs ScoredDocuments) Len() int
- func (docs ScoredDocuments) Less(i, j int) bool
- func (docs ScoredDocuments) Swap(i, j int)
type ScoringCriteria
type SearchRequest
type SearchResponse
type TokenData

Constants ¶

View Source

// These constants define the kind of data stored in the inverted index table.
const (
	// DocIdsIndex stores only the docId of each document.
	DocIdsIndex = 0

	// FrequenciesIndex stores keyword term frequencies, which are used to
	// compute BM25.
	FrequenciesIndex = 1

	// LocationsIndex stores the exact byte positions at which a keyword occurs
	// in a document (there may be several).
	// If you want keyword proximity data, you must use an index of type
	// LocationsIndex.
	LocationsIndex = 2
)

这些常数定义了反向索引表存储的数据类型

Variables ¶

This section is empty.

Functions ¶

This section is empty.

Types ¶

type BM25Parameters ¶

// BM25Parameters holds the K1 and B parameters of Okapi BM25.
// See http://en.wikipedia.org/wiki/Okapi_BM25 for their meaning; the default
// values are set in engine_init_options.go.
type BM25Parameters struct {
	K1 float32
	B  float32
}

见 http://en.wikipedia.org/wiki/Okapi_BM25 。默认值见 engine_init_options.go。

type DocInfo ¶

// DocInfo is the per-document record stored in DocInfosShard.DocInfos.
type DocInfo struct {
	// NOTE(review): presumably the scoring fields supplied through
	// DocumentIndexData.Fields — confirm against the indexer.
	Fields interface{}
	// NOTE(review): presumably the document's token length (compare
	// DocumentIndex.TokenLength) — confirm.
	TokenLengths float32
}

type DocInfosShard ¶

// DocInfosShard holds document information, keyed by document id.
type DocInfosShard struct {
	// DocInfos maps a document id to its DocInfo.
	DocInfos     map[uint64]*DocInfo
	NumDocuments uint64 // this is actually an approximation of the total number of documents
	// Embedded read/write lock.
	// NOTE(review): presumably guards the fields above — confirm against callers.
	sync.RWMutex
}

文档信息[id]info

type DocumentIndex ¶

// DocumentIndex carries the index data for one document: its id, its keyword
// length and the keyword entries to add to the index.
type DocumentIndex struct {
	// DocId of the text.
	DocId uint64

	// Keyword length of the text.
	TokenLength float32

	// Index keys to be added.
	Keywords []KeywordIndex
}

type DocumentIndexData ¶

// DocumentIndexData is the caller-supplied description of a document to be
// indexed: its text or pre-computed tokens, its labels and its scoring fields.
type DocumentIndexData struct {
	// Full text of the document (must be UTF-8), used to generate the
	// keywords to be indexed.
	Content string

	// Keywords of the document.
	// When Content is non-empty, keywords are obtained by segmenting Content
	// in preference to this field.
	// Tokens exists so that callers can bypass wukong's built-in segmenter and
	// perform segmentation and preprocessing outside the engine.
	Tokens []TokenData

	// Document labels (must be UTF-8), for example category attributes of the
	// document; these labels do not appear in the document text.
	Labels []string

	// Scoring fields of the document; a struct of any type is accepted.
	Fields interface{}
}

type EngineInitOptions ¶

// EngineInitOptions collects the settings used to initialize the engine:
// segmenter configuration, sharding and threading, default indexer and rank
// options, and persistent storage.
type EngineInitOptions struct {
	// Whether to disable the segmenter.
	// The segmenter is used by default; when it is not used, the
	// SegmenterDictionaries and StopTokenFile settings are skipped at startup.
	// Set this option to true if you do not need segmentation inside the engine.
	// Note: without the segmenter, the Content of DocumentIndexData is ignored
	// when IndexDocument is called.
	NotUsingSegmenter bool

	// Dictionary files separated by (half-width) commas; for details see the
	// comment on sego.Segmenter.LoadDictionary.
	SegmenterDictionaries string

	// Stop-word file.
	StopTokenFile string

	// Number of segmenter threads.
	NumSegmenterThreads int

	// Number of shards for the indexer / ranker / persistent database.
	// Documents being searched or ranked are distributed evenly across the shards.
	// Each shard corresponds to a pair of database files (an inverted-index
	// database and a document-fields database).
	NumShards int

	// Channel buffer length of the indexer.
	IndexerBufferLength int

	// Number of threads allocated to each shard of the indexer.
	NumIndexerThreadsPerShard int

	// Channel buffer length of the ranker.
	RankerBufferLength int

	// Number of threads allocated to each shard of the ranker.
	NumRankerThreadsPerShard int

	// Initialization options for the indexer.
	IndexerInitOptions *IndexerInitOptions

	// Default search (rank) options.
	DefaultRankOptions *RankOptions

	// Whether to use a persistent database, and the directory in which the
	// database files are saved.
	UsePersistentStorage    bool
	PersistentStorageFolder string
}

func (*EngineInitOptions) Init ¶

func (options *EngineInitOptions) Init()

初始化EngineInitOptions，当用户未设定某个选项的值时用默认值取代

type IndexedDocument ¶

// IndexedDocument is a result returned by the indexer.
type IndexedDocument struct {
	DocId uint64

	// BM25 score; valid only when the index type is FrequenciesIndex or
	// LocationsIndex.
	BM25 float32

	// Proximity distance of the keywords within the document; the meaning of
	// the proximity distance is described in the comment on computeTokenProximity.
	// Valid only when the index type is LocationsIndex.
	TokenProximity int32

	// Keyword positions obtained from the proximity computation; same length
	// as, and in one-to-one correspondence with, the tokens passed to Lookup.
	// Valid only when the index type is LocationsIndex.
	TokenSnippetLocations []int

	// Exact positions of the keywords in the text.
	// Valid only when the index type is LocationsIndex.
	TokenLocations [][]int
}

索引器返回结果

type IndexerInitOptions ¶

// IndexerInitOptions holds the options used to initialize the indexer.
type IndexerInitOptions struct {
	// Type of the index table; one of the DocIdsIndex, FrequenciesIndex or
	// LocationsIndex constants.
	IndexType int

	// BM25 parameters.
	BM25Parameters *BM25Parameters
}

初始化索引器选项

type InvertedIndexShard ¶

// InvertedIndexShard is an inverted index table, mapping each keyword to its
// row of the inverted index.
type InvertedIndexShard struct {
	// InvertedIndex maps a keyword to its KeywordIndices row.
	InvertedIndex    map[string]*KeywordIndices
	TotalTokenLength float32 // total number of keywords
	// Embedded read/write lock.
	// NOTE(review): presumably guards the fields above — confirm against callers.
	sync.RWMutex
}

反向索引表([关键词]反向索引表)

type KeywordIndex ¶

// KeywordIndex is an inverted-index entry; in effect it identifies one
// (search key, document) pair.
type KeywordIndex struct {
	// UTF-8 text of the search key.
	Text string

	// Term frequency of the search key.
	Frequency float32

	// Starting byte positions of the search key in the document, in
	// ascending order.
	Starts []int
}

反向索引项，这实际上标注了一个（搜索键，文档）对。

type KeywordIndices ¶

// KeywordIndices is one row of the inverted index table: it collects all
// documents in which a search key occurs, sorted by DocId in ascending order.
type KeywordIndices struct {
	// Whether the slices below are empty depends on the IndexType chosen at
	// initialization.
	DocIds      []uint64  // present for every index type
	Frequencies []float32 // IndexType == FrequenciesIndex
	Locations   [][]int   // IndexType == LocationsIndex
}

反向索引表的一行，收集了一个搜索键出现的所有文档，按照DocId从小到大排序。

type RankByBM25 ¶

// RankByBM25 is a simple scoring rule in which a document's score is its BM25.
type RankByBM25 struct {
}

一个简单的评分规则，文档分数为BM25

func (RankByBM25) Score ¶

func (rule RankByBM25) Score(doc IndexedDocument, fields interface{}) []float32

type RankOptions ¶

// RankOptions controls how search results are scored, ordered and paged.
type RankOptions struct {
	// Scoring rule for documents; when nil, the rule set at Engine
	// initialization is used.
	ScoringCriteria ScoringCriteria

	// By default (ReverseOrder=false) results are sorted by score from high to
	// low; otherwise from low to high.
	ReverseOrder bool

	// Index of the first result to output.
	OutputOffset int

	// Maximum number of search results to output; 0 means no limit.
	MaxOutputs int
}

type ScoredDocument ¶

// ScoredDocument is a single document in a search result, together with its
// scores and, for location indexes, keyword position data.
type ScoredDocument struct {
	DocId uint64

	// Scores of the document.
	// Search results are sorted by the values in Scores: first by the first
	// number, then by the second if those are equal, and so on.
	Scores []float32

	// Byte positions in the text of the keywords used to generate a snippet;
	// this slice has the same length as SearchResponse.Tokens.
	// Non-empty only when IndexType == LocationsIndex.
	TokenSnippetLocations []int

	// Positions at which the keywords occur.
	// Non-empty only when IndexType == LocationsIndex.
	TokenLocations [][]int
}

type ScoredDocuments ¶

// ScoredDocuments is a slice of ScoredDocument. Its Len, Less and Swap methods
// have the signatures required by sort.Interface.
type ScoredDocuments []ScoredDocument

func (ScoredDocuments) Len ¶

func (docs ScoredDocuments) Len() int

func (ScoredDocuments) Less ¶

func (docs ScoredDocuments) Less(i, j int) bool

func (ScoredDocuments) Swap ¶

func (docs ScoredDocuments) Swap(i, j int)

type ScoringCriteria ¶

// ScoringCriteria is the generic interface for scoring rules.
type ScoringCriteria interface {
	// Score scores a document. When documents are sorted, the first score is
	// compared first; if equal, comparison moves on to the second score, and
	// so on.
	// Returning an empty slice indicates that the document should be removed
	// from the final sorted results.
	Score(doc IndexedDocument, fields interface{}) []float32
}

评分规则通用接口

type SearchRequest ¶

// SearchRequest describes a search: the query text or tokens, optional labels
// and document-id filter, rank options, timeout and output-mode flags.
type SearchRequest struct {
	// Phrase to search for (must be UTF-8); it will be segmented.
	// When this is the empty string, keywords are read from Tokens below.
	Text string

	// Keywords (must be UTF-8); when Text is non-empty, Text takes precedence.
	// You normally do not need to specify keywords yourself unless you run
	// your own segmentation program.
	Tokens []string

	// Document labels (must be UTF-8); labels do not exist in the document
	// text but are also a kind of search key.
	Labels []string

	// When non-nil, search only among the keys contained in this DocIds map
	// (the values are ignored).
	DocIds map[uint64]bool

	// Rank options.
	RankOptions *RankOptions

	// Timeout in milliseconds (thousandths of a second). No timeout is set
	// when this value is less than or equal to zero.
	// Partial ranked results may still be returned when a search times out.
	Timeout int

	// When true, only count the documents found; do not return the documents
	// themselves.
	CountDocsOnly bool

	// Do not sort; suitable when sorting can be done outside the engine (for
	// example on the client).
	// Enabling this option can save significant time when many documents are
	// returned.
	Orderless bool
}

type SearchResponse ¶

// SearchResponse is the result of a search: the keywords used, the documents
// found, a timeout flag and the total match count.
type SearchResponse struct {
	// Keywords used in the search.
	Tokens []string

	// Documents found, already sorted.
	Docs []ScoredDocument

	// Whether the search timed out. Partial results may still be returned
	// after a timeout.
	Timeout bool

	// Number of documents found. Note that this is the number of documents
	// matching the conditions among all documents, and may be larger than the
	// number of documents returned.
	NumDocs int
}

type TokenData ¶

// TokenData is one keyword of a document.
type TokenData struct {
	// String of the keyword.
	Text string

	// Positions in the document at which the first byte of the keyword occurs.
	Locations []int
}

文档的一个关键词

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL