Documentation ¶
Index ¶
- Constants
- Variables
- func CalDocScore(frequency int32, pagerank int) float64
- func CalIDF(docNum int, df int) float64
- func Drain(idx Index, file string)
- func IfElseInt(condition bool, o1 int, o2 int) int
- func Load(file string) (chan *KVPair, error)
- func LoadDocumentStream(path string) (chan *Document, error)
- type BTreeIndex
- func (bt *BTreeIndex) Add(docs []Document)
- func (bt *BTreeIndex) Clear()
- func (bt *BTreeIndex) Close()
- func (bt *BTreeIndex) Get(term string) []Doc
- func (bt *BTreeIndex) Insert(key string, pl PostingList)
- func (bt *BTreeIndex) Keys() []string
- func (bt *BTreeIndex) Load()
- func (bt *BTreeIndex) Lookup(token string, dirty bool) PostingList
- func (bt *BTreeIndex) Property() *Property
- func (bt *BTreeIndex) Retrieval(must []string, should []string, not []string, k int, r int, m SearchModel) []Doc
- func (bt *BTreeIndex) Save()
- func (bt *BTreeIndex) SetProperty(p Property)
- type DataRange
- type Doc
- type Document
- type HashMapIndex
- func (idx *HashMapIndex) Add(docs []Document)
- func (idx *HashMapIndex) Clear()
- func (idx *HashMapIndex) Get(term string) []Doc
- func (idx *HashMapIndex) Keys() []string
- func (idx *HashMapIndex) Map() map[string]PostingList
- func (idx *HashMapIndex) Property() *Property
- func (idx *HashMapIndex) Retrieval(must []string, should []string, not []string, k int, r int, m SearchModel) []Doc
- type Index
- type KVPair
- type PostingList
- func (pl *PostingList) Append(docs ...Doc)
- func (pl PostingList) Bytes() []byte
- func (pl *PostingList) Filter(docs []Doc)
- func (pl PostingList) Find(id int) *Doc
- func (pl *PostingList) FromBytes(buf []byte)
- func (pl PostingList) IDs() []int
- func (pl *PostingList) Inter(docs []Doc)
- func (pl PostingList) Len() int
- func (pl PostingList) Less(i, j int) bool
- func (pl PostingList) Swap(i, j int)
- func (pl *PostingList) Union(docs []Doc)
- type Property
- type SearchModel
- type TF
- type TFIDF
- type Term
Constants ¶
const VirtualQueryDocId int32 = -10000
Variables ¶
var DefaultConfig = btree.Config{ IndexConfig: btree.IndexConfig{ Sectorsize: 512, Flistsize: 1000 * btree.OFFSET_SIZE, Blocksize: 512, }, Maxlevel: 4, RebalanceThrs: 30, AppendRatio: 0.7, DrainRate: 100, MaxLeafCache: 0, Sync: false, Nocache: false, }
Functions ¶
func CalDocScore ¶
CalDocScore TODO: calculate the doc's static score from PageRank + frequency.
func LoadDocumentStream ¶
Types ¶
type BTreeIndex ¶
type BTreeIndex struct { //skip-list vs btree: //https://stackoverflow.com/questions/256511/skip-list-vs-binary-search-tree/28270537#28270537 BT *btree.BTree IndexFile string // contains filtered or unexported fields }
func NewBTreeIndex ¶
func NewBTreeIndex(file string) *BTreeIndex
func (*BTreeIndex) Add ¶
func (bt *BTreeIndex) Add(docs []Document)
Add 该方法比较低效:批量插入文档会在 posting list 后不断追加新文档,但 posting list 并未预留空间,因此需要移动到新的空间,导致文件数据拷贝。
func (*BTreeIndex) Clear ¶
func (bt *BTreeIndex) Clear()
func (*BTreeIndex) Close ¶
func (bt *BTreeIndex) Close()
func (*BTreeIndex) Get ¶
func (bt *BTreeIndex) Get(term string) []Doc
func (*BTreeIndex) Insert ¶
func (bt *BTreeIndex) Insert(key string, pl PostingList)
func (*BTreeIndex) Keys ¶
func (bt *BTreeIndex) Keys() []string
func (*BTreeIndex) Load ¶
func (bt *BTreeIndex) Load()
func (*BTreeIndex) Lookup ¶
func (bt *BTreeIndex) Lookup(token string, dirty bool) PostingList
func (*BTreeIndex) Property ¶
func (bt *BTreeIndex) Property() *Property
func (*BTreeIndex) Retrieval ¶
func (bt *BTreeIndex) Retrieval(must []string, should []string, not []string, k int, r int, m SearchModel) []Doc
func (*BTreeIndex) Save ¶
func (bt *BTreeIndex) Save()
func (*BTreeIndex) SetProperty ¶
func (bt *BTreeIndex) SetProperty(p Property)
type Doc ¶
type Doc struct { ID int32 //doc id DocLen int32 //doc length TF int32 //词频, eg. 在倒排表term->[doc1,doc2,doc3]中,仅表示term在docX中的词频 QualityScore float64 //静态分、质量分 Score float64 //bm25/Cosine score used by sort }
func CalCosine ¶
CalCosine 余弦相似度,参见 https://blog.csdn.net/weixin_42398658/article/details/85063004
func DoRetrieval ¶
func DoRetrieval(idx Index, must []string, should []string, not []string, k int, r int, model SearchModel) []Doc
DoRetrieval returns the top k docs sorted by the boolean model. TODO: compress the posting list and optimize intersection/union rt; see https://blog.csdn.net/weixin_39890629/article/details/111268898
type Document ¶
type Document struct { Title string `xml:"title"` URL string `xml:"url"` Text string `xml:"abstract"` Timestamp int ID int }
Document represents a Wikipedia abstract dump document.
func LoadDocuments ¶
LoadDocuments loads a Wikipedia abstract dump and returns a slice of documents. Dump example from https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-abstract1.xml.gz
type HashMapIndex ¶
type HashMapIndex struct {
// contains filtered or unexported fields
}
HashMapIndex is an inverted index. It maps tokens to document IDs.
func NewHashMapIndex ¶
func NewHashMapIndex() *HashMapIndex
func (*HashMapIndex) Add ¶
func (idx *HashMapIndex) Add(docs []Document)
Add adds documents to the index. TODO: support indexing multiple document fields.
func (*HashMapIndex) Get ¶
func (idx *HashMapIndex) Get(term string) []Doc
func (*HashMapIndex) Keys ¶
func (idx *HashMapIndex) Keys() []string
func (*HashMapIndex) Map ¶
func (idx *HashMapIndex) Map() map[string]PostingList
func (*HashMapIndex) Property ¶
func (idx *HashMapIndex) Property() *Property
func (*HashMapIndex) Retrieval ¶
func (idx *HashMapIndex) Retrieval(must []string, should []string, not []string, k int, r int, m SearchModel) []Doc
type KVPair ¶
type KVPair struct { Key string Value PostingList }
type PostingList ¶
type PostingList []Doc
func (*PostingList) Append ¶
func (pl *PostingList) Append(docs ...Doc)
func (PostingList) Bytes ¶
func (pl PostingList) Bytes() []byte
func (*PostingList) Filter ¶
func (pl *PostingList) Filter(docs []Doc)
func (PostingList) Find ¶
func (pl PostingList) Find(id int) *Doc
func (*PostingList) FromBytes ¶
func (pl *PostingList) FromBytes(buf []byte)
func (PostingList) IDs ¶
func (pl PostingList) IDs() []int
func (*PostingList) Inter ¶
func (pl *PostingList) Inter(docs []Doc)
func (PostingList) Len ¶
func (pl PostingList) Len() int
func (PostingList) Less ¶
func (pl PostingList) Less(i, j int) bool
func (PostingList) Swap ¶
func (pl PostingList) Swap(i, j int)
func (*PostingList) Union ¶
func (pl *PostingList) Union(docs []Doc)
type Property ¶
type Property struct {
// contains filtered or unexported fields
}