types

package

v0.0.0-...-ecbd39f Latest Latest Go to latest Published: Aug 24, 2021 License: Apache-2.0 Imports: 6 Imported by: 59

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/huichen/wukong

Documentation ¶

Index ¶

Constants
type BM25Parameters
type DocumentIndex
type DocumentIndexData
type DocumentsId
- func (docs DocumentsId) Len() int
- func (docs DocumentsId) Less(i, j int) bool
- func (docs DocumentsId) Swap(i, j int)
type DocumentsIndex
- func (docs DocumentsIndex) Close()
- func (docs DocumentsIndex) Len() int
- func (docs DocumentsIndex) Less(i, j int) bool
- func (docs DocumentsIndex) Swap(i, j int)
type EngineInitOptions
- func (options *EngineInitOptions) Init()
type IndexedDocument
type IndexerInitOptions
- func (options *IndexerInitOptions) Init()
type KeywordIndex
type RankByBM25
- func (rule RankByBM25) Score(doc IndexedDocument, fields interface{}) []float32
type RankOptions
type ScoredDocument
type ScoredDocuments
- func (docs ScoredDocuments) Len() int
- func (docs ScoredDocuments) Less(i, j int) bool
- func (docs ScoredDocuments) Swap(i, j int)
type ScoringCriteria
type SearchRequest
type SearchResponse
type StopTokens
- func (st *StopTokens) Close()
- func (st *StopTokens) Init(stopTokenFile string)
- func (st *StopTokens) IsStopToken(token string) bool
type TokenData

Constants ¶

View Source

// Kinds of data the inverted index table can store.
const (
	// DocIdsIndex stores only the docId of each document.
	DocIdsIndex = iota

	// FrequenciesIndex stores keyword frequencies, which are used to
	// compute BM25.
	FrequenciesIndex

	// LocationsIndex stores the exact byte positions (possibly several) at
	// which a keyword occurs in a document. Keyword proximity data is only
	// available with this index type.
	LocationsIndex
)

这些常数定义了反向索引表存储的数据类型

Variables ¶

This section is empty.

Functions ¶

This section is empty.

Types ¶

type BM25Parameters ¶

// BM25Parameters holds the BM25 parameters; see
// http://en.wikipedia.org/wiki/Okapi_BM25. Default values are defined in
// engine_init_options.go.
type BM25Parameters struct {
	K1 float32
	B  float32
}

见 http://en.wikipedia.org/wiki/Okapi_BM25 。默认值见 engine_init_options.go。

type DocumentIndex ¶

// DocumentIndex is the index entry of a single document.
type DocumentIndex struct {
	// DocId of the text.
	DocId uint64

	// Keyword length of the text.
	TokenLength float32

	// Index keys to add.
	Keywords []KeywordIndex
}

type DocumentIndexData ¶

// DocumentIndexData is the per-document input: text, tokens, labels and
// scoring fields.
type DocumentIndexData struct {
	// Full text of the document (must be UTF-8), used to generate the
	// keywords to be indexed.
	Content string

	// Keywords of the document.
	// When Content is not empty, keywords are obtained by segmenting Content
	// in preference to this field.
	// Tokens exists so that callers can bypass wukong's built-in segmenter
	// and perform segmentation and preprocessing outside the engine.
	Tokens []TokenData

	// Document labels (must be UTF-8), e.g. category attributes of the
	// document; these labels do not appear in the document text.
	Labels []string

	// Scoring fields of the document; a struct of any type is accepted.
	Fields interface{}
}

type DocumentsId ¶

// DocumentsId is a list of document IDs, provided to make batch deletion of
// document indexes convenient. Its Len, Less and Swap methods give it the
// shape of sort.Interface.
type DocumentsId []uint64

方便批量删除文档索引

func (DocumentsId) Len ¶

func (docs DocumentsId) Len() int

func (DocumentsId) Less ¶

func (docs DocumentsId) Less(i, j int) bool

func (DocumentsId) Swap ¶

func (docs DocumentsId) Swap(i, j int)

type DocumentsIndex ¶

// DocumentsIndex is a list of DocumentIndex pointers, provided to make batch
// insertion of document indexes convenient. Its Len, Less and Swap methods
// give it the shape of sort.Interface.
type DocumentsIndex []*DocumentIndex

方便批量加入文档索引

func (DocumentsIndex) Close ¶

func (docs DocumentsIndex) Close()

释放资源

func (DocumentsIndex) Len ¶

func (docs DocumentsIndex) Len() int

func (DocumentsIndex) Less ¶

func (docs DocumentsIndex) Less(i, j int) bool

func (DocumentsIndex) Swap ¶

func (docs DocumentsIndex) Swap(i, j int)

type EngineInitOptions ¶

// EngineInitOptions collects the options used to initialize the engine.
// Its Init method replaces options the user left unset with default values.
type EngineInitOptions struct {
	// Controls whether the segmenter is used.
	// The segmenter is used by default; otherwise the SegmenterDictionaries
	// and StopTokenFile settings are skipped during startup.
	// Set this option to true if you do not need segmentation inside the
	// engine.
	// Note: without the segmenter, the Content field of DocumentIndexData is
	// ignored when IndexDocument is called.
	NotUsingSegmenter bool

	// Externally supplied segmenter to try first; when nil, the dictionary
	// files below are used instead.
	Segmenter *sego.Segmenter
	// Comma-separated (half-width comma) list of dictionary files; for
	// details see the comment on sego.Segmenter.LoadDictionary.
	SegmenterDictionaries string

	// Stop tokens; when nil, loading from the file below is attempted.
	StopTokens    *StopTokens
	StopTokenFile string

	// Number of segmenter threads.
	NumSegmenterThreads int

	// Number of shards for the indexer and the ranker.
	// Documents being searched/ranked are distributed evenly across shards.
	NumShards int

	// Channel buffer length of the indexer.
	IndexerBufferLength int

	// Number of threads allocated to each indexer shard.
	NumIndexerThreadsPerShard int

	// Channel buffer length of the ranker.
	RankerBufferLength int

	// Number of threads allocated to each ranker shard.
	NumRankerThreadsPerShard int

	// Indexer initialization options.
	IndexerInitOptions *IndexerInitOptions

	// Default search (rank) options.
	DefaultRankOptions *RankOptions

	// Whether to use a persistent database, plus the folder in which the
	// database files are stored and the number of shards they are split into.
	UsePersistentStorage    bool
	PersistentStorageFolder string
	PersistentStorageShards int
}

func (*EngineInitOptions) Init ¶

func (options *EngineInitOptions) Init()

初始化EngineInitOptions，当用户未设定某个选项的值时用默认值取代

type IndexedDocument ¶

// IndexedDocument is the result returned by the indexer.
type IndexedDocument struct {
	DocId uint64

	// BM25 score; valid only when the index type is FrequenciesIndex or
	// LocationsIndex.
	BM25 float32

	// Proximity distance of the keywords within the document; see the
	// comment on computeTokenProximity for its meaning.
	// Valid only when the index type is LocationsIndex.
	TokenProximity int32

	// Keyword positions obtained from the proximity computation; same length
	// as, and in one-to-one correspondence with, the tokens passed to Lookup.
	// Valid only when the index type is LocationsIndex.
	TokenSnippetLocations []int

	// Exact positions of the keywords in the text.
	// Valid only when the index type is LocationsIndex.
	TokenLocations [][]int
}

索引器返回结果

type IndexerInitOptions ¶

// IndexerInitOptions holds the options used to initialize the indexer.
type IndexerInitOptions struct {
	// Type of the index table; see the DocIdsIndex, FrequenciesIndex and
	// LocationsIndex constants.
	IndexType int

	// Cache size for documents waiting to be inserted into the index table.
	DocCacheSize int

	// BM25 parameters.
	BM25Parameters *BM25Parameters
}

初始化索引器选项

func (*IndexerInitOptions) Init ¶

func (options *IndexerInitOptions) Init()

type KeywordIndex ¶

// KeywordIndex is an inverted index entry; in effect it marks one
// (search key, document) pair.
type KeywordIndex struct {
	// UTF-8 text of the search key.
	Text string

	// Frequency of the search key.
	Frequency float32

	// Starting byte positions of the search key in the document, in
	// ascending order.
	Starts []int
}

反向索引项，这实际上标注了一个（搜索键，文档）对。

type RankByBM25 ¶

// RankByBM25 is a simple scoring rule: the document's score is its BM25.
type RankByBM25 struct {
}

一个简单的评分规则，文档分数为BM25

func (RankByBM25) Score ¶

func (rule RankByBM25) Score(doc IndexedDocument, fields interface{}) []float32

type RankOptions ¶

// RankOptions controls how search results are scored, ordered and paged.
type RankOptions struct {
	// Scoring rule for documents; when nil, the rule configured at Engine
	// initialization is used.
	ScoringCriteria ScoringCriteria

	// By default (ReverseOrder=false) results are sorted by score from high
	// to low; otherwise from low to high.
	ReverseOrder bool

	// Index of the first result to output.
	OutputOffset int

	// Maximum number of search results to output; 0 means unlimited.
	MaxOutputs int
}

type ScoredDocument ¶

// ScoredDocument is a single search result together with its scores.
type ScoredDocument struct {
	DocId uint64

	// Scores of the document.
	// Search results are ordered by Scores: first by the first value, then,
	// on a tie, by the second value, and so on.
	Scores []float32

	// Byte positions in the text of the keywords used to generate a snippet;
	// this slice has the same length as SearchResponse.Tokens.
	// Non-empty only when IndexType == LocationsIndex.
	TokenSnippetLocations []int

	// Positions at which the keywords occur.
	// Non-empty only when IndexType == LocationsIndex.
	TokenLocations [][]int
}

type ScoredDocuments ¶

// ScoredDocuments is a list of ScoredDocument values. Its Len, Less and Swap
// methods give it the shape of sort.Interface.
type ScoredDocuments []ScoredDocument

func (ScoredDocuments) Len ¶

func (docs ScoredDocuments) Len() int

func (ScoredDocuments) Less ¶

func (docs ScoredDocuments) Less(i, j int) bool

func (ScoredDocuments) Swap ¶

func (docs ScoredDocuments) Swap(i, j int)

type ScoringCriteria ¶

// ScoringCriteria is the general interface for scoring rules.
type ScoringCriteria interface {
	// Score scores a document. When documents are ranked, the first score
	// value is compared first; if it is equal the comparison moves on to the
	// second value, and so on.
	// Returning an empty slice indicates that the document should be removed
	// from the final ranked results.
	Score(doc IndexedDocument, fields interface{}) []float32
}

评分规则通用接口

type SearchRequest ¶

// SearchRequest describes a single search: the query, optional filters,
// ranking options and execution flags.
type SearchRequest struct {
	// Phrase to search for (must be UTF-8); it will be segmented.
	// When this is the empty string, keywords are read from Tokens below.
	Text string

	// Keywords (must be UTF-8); when Text is not empty, Text takes priority.
	// You normally do not need to specify keywords yourself unless you run
	// your own segmenter.
	Tokens []string

	// Document labels (must be UTF-8); labels are not present in the document
	// text but are also a kind of search key.
	Labels []string

	// When not nil, search only among the keys contained in DocIds (the map
	// values are ignored).
	DocIds map[uint64]bool

	// Ranking options.
	RankOptions *RankOptions

	// Timeout in milliseconds (thousandths of a second). No timeout is set
	// when this value is less than or equal to zero.
	// A search that times out may still return partial ranked results.
	Timeout int

	// When true, only count the matching documents; the documents themselves
	// are not returned.
	CountDocsOnly bool

	// Do not sort; suitable when sorting can be done outside the engine
	// (e.g. on the client).
	// Enabling this option can save considerable time when many documents
	// are returned.
	Orderless bool
}

type SearchResponse ¶

// SearchResponse is the result of a search.
type SearchResponse struct {
	// Keywords used by the search.
	Tokens []string

	// Documents found, already sorted.
	Docs []ScoredDocument

	// Whether the search timed out. Partial results may still be returned on
	// timeout.
	Timeout bool

	// Number of documents found. Note that this counts every document that
	// satisfies the query, so it may be larger than the number of documents
	// returned.
	NumDocs int
}

type StopTokens ¶

// StopTokens represents the stop tokens. Per the method listing, Init reads
// them from a stop-token file and IsStopToken reports whether a token is one
// of them; the fields themselves are not visible here.
type StopTokens struct {
	// contains filtered or unexported fields
}

func (*StopTokens) Close ¶

func (st *StopTokens) Close()

释放资源

func (*StopTokens) Init ¶

func (st *StopTokens) Init(stopTokenFile string)

从stopTokenFile中读入停用词，一个词一行。文档索引建立时会跳过这些停用词。

func (*StopTokens) IsStopToken ¶

func (st *StopTokens) IsStopToken(token string) bool

type TokenData ¶

// TokenData is one keyword of a document.
type TokenData struct {
	// String of the keyword.
	Text string

	// Positions in the document at which the first byte of the keyword
	// occurs.
	Locations []int
}

文档的一个关键词

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL