types

package

v0.0.0-...-360e525 Latest Latest Go to latest Published: May 21, 2019 License: Apache-2.0 Imports: 3 Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/oGre222/tea

Links

Open Source Insights

Documentation ¶

Overview ¶

Package types defines the types used by riot.

Index ¶

Constants
type Attri
type BM25Parameters
type BaseResp
type Content
type DocData
type DocIndex
type DocIndexData
type DocInfo
type DocInfosShard
type DocsId
- func (docs DocsId) Len() int
- func (docs DocsId) Less(i, j int) bool
- func (docs DocsId) Swap(i, j int)
type DocsIndex
- func (docs DocsIndex) Len() int
- func (docs DocsIndex) Less(i, j int) bool
- func (docs DocsIndex) Swap(i, j int)
type EngineOpts
- func (options *EngineOpts) Init()
type Expr
type IndexedDoc
type IndexerOpts
- func (options *IndexerOpts) Init()
type InvertedIndexShard
type KeywordIndex
type KeywordIndices
type Logic
type RankByBM25
- func (rule RankByBM25) Score(doc IndexedDoc, fields interface{}) []float32
type RankOpts
type ScoredDoc
type ScoredDocs
- func (docs ScoredDocs) Len() int
- func (docs ScoredDocs) Less(i, j int) bool
- func (docs ScoredDocs) Swap(i, j int)
type ScoredID
type ScoredIDs
- func (docs ScoredIDs) Len() int
- func (docs ScoredIDs) Less(i, j int) bool
- func (docs ScoredIDs) Swap(i, j int)
type ScoringCriteria
type SearchDoc
type SearchID
type SearchReq
type SearchResp
type TokenData

Constants ¶

View Source

// Index types: these constants define which kind of data the inverted
// index table stores.
const (
	// DocIdsIndex stores only the docId of each document.
	DocIdsIndex = iota

	// FrequenciesIndex stores keyword term frequencies, which are used
	// to compute BM25.
	FrequenciesIndex

	// LocsIndex stores the exact byte positions (there may be several) at
	// which a keyword occurs in a document.
	// If you want keyword proximity data, you must use an index of type
	// LocsIndex.
	LocsIndex
)

这些常数定义了反向索引表存储的数据类型

Variables ¶

This section is empty.

Functions ¶

This section is empty.

Types ¶

type Attri ¶

// Attri holds a document's attributes. Each field is serialized to JSON
// under the lower-case key given in its struct tag.
type Attri struct {
	Title  string `json:"title"`
	Author string `json:"author"`
	Time   string `json:"time"`
	// NOTE(review): presumably a Unix timestamp matching Time — confirm.
	Ts     int64  `json:"ts"`
}

Attri doc attribute

type BM25Parameters ¶

// BM25Parameters holds the BM25 parameters; see
// http://en.wikipedia.org/wiki/Okapi_BM25.
// The default values are in engine_init_options.go.
type BM25Parameters struct {
	// NOTE(review): K1 and B presumably correspond to k1 and b in the
	// BM25 formula — confirm against the scoring code.
	K1 float32
	B  float32
}

BM25Parameters holds the BM25 parameters; see http://en.wikipedia.org/wiki/Okapi_BM25. The default values are in engine_init_options.go.

type BaseResp ¶

// BaseResp holds the fields common to the search responses that embed it
// (SearchDoc, SearchID and SearchResp).
type BaseResp struct {
	// Tokens are the keywords that were used by the search.
	Tokens []string

	// Timeout reports whether the search timed out. Partial results may
	// still be returned when a timeout occurs.
	Timeout bool

	// NumDocs is the number of documents found. Note that this counts all
	// documents that satisfy the conditions, so it may be larger than the
	// number of documents returned.
	NumDocs int
}

BaseResp search response options

type Content ¶

// Content is the search content of a document.
type Content struct {
	// new: the Content text
	Content string

	// new: the attributes (Attri)
	Attri interface{}

	// new: the returned scoring fields
	Fields interface{}
}

Content search content

type DocData ¶

// DocData is the document index data struct: the data supplied for a
// document that is to be indexed.
type DocData struct {
	// Content is the full text of the document (must be UTF-8); it is
	// used to generate the keywords to be indexed.
	Content string

	// new: category
	// Class string
	// new: attributes
	Attri interface{}

	// Tokens are the document's keywords.
	// When Content is not empty, keywords are first obtained by segmenting
	// Content, and Tokens are then added on top of them.
	// Tokens exists so that riot's built-in segmenter can be bypassed and
	// segmentation and preprocessing can be done outside the engine.
	// Tokens []*TokenData
	Tokens []TokenData

	// Labels are document labels (must be UTF-8), such as the document's
	// category attributes; these labels do not appear in the document text.
	Labels []string

	// Fields are the document's scoring fields; a struct of any type is
	// accepted.
	Fields interface{}
}

DocData is the document index data struct.

type DocIndex ¶

// DocIndex is a document's index.
type DocIndex struct {
	// DocId is the DocId of the text.
	DocId string

	// TokenLen is the keyword length of the text.
	TokenLen float32

	// Keywords are the index keys that are added.
	Keywords []KeywordIndex
}

DocIndex document's index

type DocIndexData ¶

// DocIndexData is the document index data struct. It is a type alias of
// DocData, so the two names denote the same type.
type DocIndexData = DocData

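DocIndexData is the document index data struct. It is an alias of DocData (declared as `type DocIndexData = DocData`, not as the distinct type `type DocIndexData DocData`).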

type DocInfo ¶

// DocInfo holds document info.
type DocInfo struct {
	// NOTE(review): presumably the scoring fields supplied via
	// DocData.Fields — confirm against the indexer.
	Fields    interface{}
	// NOTE(review): presumably the document's keyword length, as in
	// DocIndex.TokenLen — confirm.
	TokenLens float32
}

DocInfo document info

type DocInfosShard ¶

// DocInfosShard holds document info keyed by id ([id]info).
//
// The struct embeds a sync.RWMutex, so values must not be copied after
// first use; pass a *DocInfosShard instead.
type DocInfosShard struct {
	DocInfos map[string]*DocInfo
	NumDocs  uint64 // this is actually an approximation of the total number of documents
	// NOTE(review): presumably guards DocInfos and NumDocs — confirm
	// against the callers.
	sync.RWMutex
}

DocInfosShard 文档信息[id]info

type DocsId ¶

// DocsId is a slice of document ids that makes it convenient to delete
// document indexes in batch.
type DocsId []string

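DocsId makes it convenient to delete document indexes in batch. Its element type is string (it is not declared as `type DocsId []uint64`).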

func (DocsId) Len ¶

func (docs DocsId) Len() int

func (DocsId) Less ¶

func (docs DocsId) Less(i, j int) bool

func (DocsId) Swap ¶

func (docs DocsId) Swap(i, j int)

type DocsIndex ¶

// DocsIndex is a slice of *DocIndex that makes it convenient to add
// document indexes in batch.
type DocsIndex []*DocIndex

DocsIndex 方便批量加入文档索引

func (DocsIndex) Len ¶

func (docs DocsIndex) Len() int

func (DocsIndex) Less ¶

func (docs DocsIndex) Less(i, j int) bool

func (DocsIndex) Swap ¶

func (docs DocsIndex) Swap(i, j int)

type EngineOpts ¶

// EngineOpts holds the options used to initialize the engine.
type EngineOpts struct {
	// NotUseGse controls whether the segmenter is used.
	// The segmenter is used by default; otherwise the GseDict and
	// StopTokenFile settings are skipped during startup.
	// If you do not need segmentation inside the engine, set this option
	// to true.
	// Note that if you do not use the segmenter, the Content of
	// DocIndexData is ignored when IndexDoc is called.
	// Not use the gse segment
	NotUseGse bool `toml:"not_use_gse"`

	// new: the segmentation rule
	Using int `toml:"using"`

	// GseDict is a list of dictionary files separated by half-width
	// commas ","; for details see the comment on gse.Segmenter.LoadDict.
	GseDict string `toml:"gse_dict"`
	PinYin  bool   `toml:"pin_yin"`

	// StopTokenFile is the stop-word file.
	StopTokenFile string `toml:"stop_file"`
	// Gse search mode
	GseMode bool   `toml:"gse_mode"`
	Hmm     bool   `toml:"hmm"`
	Model   string `toml:"model"`

	// NumGseThreads is the number of segmenter threads.
	// NumSegmenterThreads int
	NumGseThreads int

	// NumShards is the number of shards of the indexer and the ranker.
	// Documents to be searched/ranked are distributed evenly across the
	// shards.
	NumShards int

	// IndexerBufLen is the channel buffer length of the indexer.
	IndexerBufLen int

	// NumIndexerThreads is the number of threads allocated to each
	// indexer shard.
	NumIndexerThreads int

	// RankerBufLen is the channel buffer length of the ranker.
	RankerBufLen int

	// NumRankerThreads is the number of threads allocated to each ranker
	// shard.
	NumRankerThreads int

	// IndexerOpts are the indexer initialization options.
	IndexerOpts *IndexerOpts

	// DefRankOpts are the default search (rank) options.
	DefRankOpts *RankOpts

	// Whether to use a persistent database, together with the directory
	// in which the database files are saved and the number of shards.
	StoreOnly bool `toml:"store_only"`
	UseStore  bool `toml:"use_store"`

	StoreFolder string `toml:"store_folder"`
	StoreShards int    `toml:"store_shards"`
	StoreEngine string `toml:"store_engine"`

	IDOnly bool `toml:"id_only"`

	// UseTiKv stores the inverted index in tikv.
	// NOTE(review): the toml key is "user_tikv", not "use_tikv" — possibly
	// a typo; confirm before changing it, since existing config files may
	// rely on the current key.
	UseTiKv bool `toml:"user_tikv"`
	// TiKvPrefix is the storage key prefix.
	TiKvPrefix string `toml:"tikv_prefix"`
}

EngineOpts init engine options

func (*EngineOpts) Init ¶

func (options *EngineOpts) Init()

Init initializes the engine options (EngineOpts): any option whose value the user has not set is replaced by its default value.

type Expr ¶

// Expr holds the logic expression options: the keyword lists for the
// AND, OR and NOT parts of a query.
type Expr struct {

	// Must is the AND query: all of these must be present.
	Must []string

	// Should is the OR query: it is enough for one of these to be present.
	Should []string

	// NotIn is the NOT query: these must not be contained.
	NotIn []string
}

Expr logic expression options

type IndexedDoc ¶

// IndexedDoc is the result returned by the indexer.
type IndexedDoc struct {
	// DocId document id
	DocId string

	// BM25 holds a valid value only when the index type is
	// FrequenciesIndex or LocsIndex.
	BM25 float32

	// TokenProximity is the proximity distance of the keywords in the
	// document; for the meaning of proximity distance see the comment on
	// computeTokenProximity.
	// It holds a valid value only when the index type is LocsIndex.
	TokenProximity int32

	// TokenSnippetLocs are the keyword positions obtained from the
	// proximity distance computation; it has the same length as the
	// tokens passed to the Lookup function, in one-to-one correspondence.
	// It holds a valid value only when the index type is LocsIndex.
	TokenSnippetLocs []int

	// TokenLocs are the exact positions of the keywords in the text.
	// It holds a valid value only when the index type is LocsIndex.
	TokenLocs [][]int
}

IndexedDoc 索引器返回结果

type IndexerOpts ¶

// IndexerOpts holds the options used to initialize the indexer.
type IndexerOpts struct {
	// IndexType is the type of the index table; see the DocIdsIndex,
	// FrequenciesIndex and LocsIndex constants.
	IndexType int

	// DocCacheSize is the cache size for documents waiting to be inserted
	// into the index table.
	DocCacheSize int

	// BM25Parameters are the BM25 parameters.
	BM25Parameters *BM25Parameters
}

IndexerOpts 初始化索引器选项

func (*IndexerOpts) Init ¶

func (options *IndexerOpts) Init()

Init initializes the IndexerOpts.

type InvertedIndexShard ¶

// InvertedIndexShard is an inverted index table, mapping each keyword to
// its inverted index row ([keyword]KeywordIndices).
//
// The struct embeds a sync.RWMutex, so values must not be copied after
// first use; pass a *InvertedIndexShard instead.
type InvertedIndexShard struct {
	InvertedIndex map[string]*KeywordIndices
	TotalTokenLen float32 // total number of keywords
	// NOTE(review): presumably guards InvertedIndex and TotalTokenLen —
	// confirm against the callers.
	sync.RWMutex
}

InvertedIndexShard 反向索引表([关键词]反向索引表)

type KeywordIndex ¶

// KeywordIndex is an inverted index entry; it effectively marks one
// (search key, document) pair.
type KeywordIndex struct {
	// Text is the UTF-8 text of the search key.
	Text string

	// Frequency is the term frequency of the search key.
	Frequency float32

	// Starts are the starting byte positions of the search key in the
	// document, in ascending order.
	Starts []int
}

KeywordIndex 反向索引项，这实际上标注了一个（搜索键，文档）对。

type KeywordIndices ¶

// KeywordIndices is one row of the inverted index table: it collects all
// documents in which a search key occurs, sorted by DocId in ascending
// order.
type KeywordIndices struct {
	// Whether the slices below are empty depends on the value of
	// IndexType at initialization.
	// NOTE(review): DocIds is []uint64 while DocIndex.DocId and
	// IndexedDoc.DocId are strings — confirm this is intended.
	DocIds      []uint64  // present for all index types
	Frequencies []float32 // IndexType == FrequenciesIndex
	Locations   [][]int   // IndexType == LocsIndex
}

KeywordIndices 反向索引表的一行，收集了一个搜索键出现的所有文档，按照 DocId 从小到大排序。

type Logic ¶

// Logic holds the logic options of a search. It embeds Expr, whose Must,
// Should and NotIn string slices are shadowed by the boolean fields of
// the same names declared here and therefore have to be reached through
// the Expr field (for example logic.Expr.Must).
type Logic struct {

	// Must is the AND query: all must be present.
	Must bool

	// Should is the OR query: it is enough for one to be present.
	Should bool

	// NotIn is the NOT query: must not be contained.
	NotIn bool

	Expr
}

Logic logic options

type RankByBM25 ¶

// RankByBM25 is a simple scoring rule: the score of a document is its
// BM25 value.
type RankByBM25 struct {
}

RankByBM25 一个简单的评分规则，文档分数为BM25

func (RankByBM25) Score ¶

func (rule RankByBM25) Score(doc IndexedDoc, fields interface{}) []float32

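Score returns the scores of the given document under the RankByBM25 rule, in which a document's score is its BM25 value.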

type RankOpts ¶

// RankOpts holds the rank (sorting) options.
type RankOpts struct {
	// ScoringCriteria is the scoring rule for documents; when it is nil
	// the rule set at Engine initialization is used.
	ScoringCriteria ScoringCriteria

	// ReverseOrder: by default (ReverseOrder = false) results are sorted
	// by score from high to low; otherwise from low to high.
	ReverseOrder bool

	// OutputOffset is the index of the first result to output.
	OutputOffset int

	// MaxOutputs is the maximum number of search results to output;
	// 0 means no limit.
	MaxOutputs int
}

RankOpts rank options

type ScoredDoc ¶

// ScoredDoc is a scored document: it embeds ScoredID and adds the
// document's content, attributes and scoring fields.
type ScoredDoc struct {
	ScoredID

	// new: the returned document Content
	Content string
	// new: the returned document attributes (Attri)
	Attri interface{}
	// new: the returned scoring fields
	Fields interface{}
}

ScoredDoc scored the document

type ScoredDocs ¶

// ScoredDocs is a slice of ScoredDoc, defined for the convenience of
// sorting.
type ScoredDocs []ScoredDoc

ScoredDocs 为了方便排序

func (ScoredDocs) Len ¶

func (docs ScoredDocs) Len() int

func (ScoredDocs) Less ¶

func (docs ScoredDocs) Less(i, j int) bool

func (ScoredDocs) Swap ¶

func (docs ScoredDocs) Swap(i, j int)

type ScoredID ¶

// ScoredID is a scored document that carries only the document id,
// without content.
type ScoredID struct {
	DocId string

	// Scores are the document's score values.
	// Search results are sorted by the values in Scores: first by the
	// first number, then by the second number if those are equal, and
	// so on.
	Scores []float32

	// TokenSnippetLocs are the byte positions in the text of the keywords
	// used to generate the snippet.
	// The slice has the same length as SearchResp.Tokens.
	// It is non-empty only when IndexType == LocsIndex.
	TokenSnippetLocs []int

	// TokenLocs are the positions at which the keywords occur.
	// It is non-empty only when IndexType == LocsIndex.
	TokenLocs [][]int
}

ScoredID scored doc only id

type ScoredIDs ¶

// ScoredIDs is a slice of ScoredID, defined for the convenience of
// sorting.
type ScoredIDs []ScoredID

ScoredIDs 为了方便排序

func (ScoredIDs) Len ¶

func (docs ScoredIDs) Len() int

func (ScoredIDs) Less ¶

func (docs ScoredIDs) Less(i, j int) bool

func (ScoredIDs) Swap ¶

func (docs ScoredIDs) Swap(i, j int)

type ScoringCriteria ¶

// ScoringCriteria is the general interface for scoring rules.
type ScoringCriteria interface {
	// Score scores a document. When documents are sorted, the first score
	// value is compared first; if those are equal the comparison moves on
	// to the second value, and so on.
	// Returning an empty slice indicates that the document should be
	// removed from the final ranking.
	Score(doc IndexedDoc, fields interface{}) []float32
}

ScoringCriteria 评分规则通用接口

type SearchDoc ¶

// SearchDoc is a search response whose documents are returned as
// ScoredDoc values.
type SearchDoc struct {
	BaseResp
	// Docs are the documents found, already sorted.
	Docs []ScoredDoc
}

SearchDoc search response options

type SearchID ¶

// SearchID is a search response whose documents are returned as
// ScoredID values.
type SearchID struct {
	BaseResp
	// Docs are the documents found, already sorted.
	Docs []ScoredID
}

SearchID search response options

type SearchReq ¶

// SearchReq holds the options of a search request.
type SearchReq struct {
	// Text is the phrase to search for (must be UTF-8); it will be
	// segmented.
	// When it is the empty string, the keywords are read from Tokens
	// below.
	Text string

	// Tokens are the keywords (must be UTF-8); when Text is not empty,
	// Text takes precedence.
	// You normally do not need to specify the keywords yourself unless
	// you run your own segmentation program.
	Tokens []string

	// Labels are document labels (must be UTF-8); labels do not exist in
	// the document text, but they are also a kind of search key.
	Labels []string

	// Logic is the logical retrieval expression.
	Logic Logic

	// DocIds: when it is not nil, the search is restricted to the keys
	// contained in DocIds (the values are ignored).
	DocIds map[string]bool

	// RankOpts are the sorting options.
	RankOpts *RankOpts

	// Timeout is the timeout in milliseconds (thousandths of a second).
	// No timeout is set when this value is less than or equal to zero.
	// Partial sorted results may still be returned when the search
	// times out.
	Timeout int

	// CountDocsOnly: when set to true, only the number of documents found
	// is counted, and the documents themselves are not returned.
	CountDocsOnly bool

	// Orderless disables sorting; it is suitable when sorting can be done
	// outside the engine (for example in the client).
	// Turning this option on when many documents are returned can
	// effectively save time.
	Orderless bool
}

SearchReq search request options

type SearchResp ¶

// SearchResp is a search response whose documents are held in an
// untyped field.
type SearchResp struct {
	BaseResp
	// Docs are the documents found, already sorted.
	// NOTE(review): presumably holds either ScoredDocs or ScoredIDs —
	// confirm against the engine's search code.
	Docs interface{}
}

SearchResp search response options

type TokenData ¶

// TokenData is one keyword of a document.
type TokenData struct {
	// Text is the keyword's string.
	Text string

	// Locations are the positions in the document at which the first byte
	// of the keyword occurs.
	Locations []int
}

TokenData 文档的一个关键词

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL