Documentation ¶
Index ¶
- Constants
- type ICorpus
- type TCorpusImpl
- func (p *TCorpusImpl) Build(fname string) (err error)
- func (p *TCorpusImpl) BuildFromString(strA string) (err error)
- func (z *TCorpusImpl) DecodeMsg(dc *msgp.Reader) (err error)
- func (z *TCorpusImpl) EncodeMsg(en *msgp.Writer) (err error)
- func (p *TCorpusImpl) GetAllDocWords() (docs [][]*TWordItem)
- func (p *TCorpusImpl) GetAllDocWordsIdx() [][]int32
- func (p *TCorpusImpl) GetAllWords() (words TWordItemSlice)
- func (p *TCorpusImpl) GetDocCnt() int
- func (p *TCorpusImpl) GetDocWordsByDocid(id string) (doc []*TWordItem)
- func (p *TCorpusImpl) GetDocWordsByIdx(i int) (doc []*TWordItem)
- func (p *TCorpusImpl) GetVocabCnt() int
- func (p *TCorpusImpl) GetWordIdx(word string) (idx int32, ok bool)
- func (p *TCorpusImpl) GetWordItemByIdx(i int) (item *TWordItem)
- func (p *TCorpusImpl) GetWordsCnt() int
- func (z *TCorpusImpl) MarshalMsg(b []byte) (o []byte, err error)
- func (z *TCorpusImpl) Msgsize() (s int)
- func (p *TCorpusImpl) String() string
- func (p *TCorpusImpl) Transform(content string) (wordsidx []int32)
- func (z *TCorpusImpl) UnmarshalMsg(bts []byte) (o []byte, err error)
- type TWordItem
- type TWordItemSlice
- func (z *TWordItemSlice) DecodeMsg(dc *msgp.Reader) (err error)
- func (z TWordItemSlice) EncodeMsg(en *msgp.Writer) (err error)
- func (p TWordItemSlice) Len() int
- func (p TWordItemSlice) Less(i, j int) bool
- func (z TWordItemSlice) MarshalMsg(b []byte) (o []byte, err error)
- func (z TWordItemSlice) Msgsize() (s int)
- func (p TWordItemSlice) Swap(i, j int)
- func (z *TWordItemSlice) UnmarshalMsg(bts []byte) (o []byte, err error)
Constants ¶
View Source
// VOCAB_HASH_SIZE is 30,000,000 (30M).
// NOTE(review): presumably the capacity of the vocabulary hash table; its use is not visible here — confirm.
const VOCAB_HASH_SIZE int = 30000000
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type ICorpus ¶
type ICorpus interface { Build(fname string) (err error) BuildFromString(strA string) (err error) GetVocabCnt() int //排重后的词库大小 GetDocCnt() int //doc个数, 按docid排重 GetWordsCnt() int //排重前的词数 GetWordIdx(word string) (idx int32, ok bool) GetWordItemByIdx(i int) (item *TWordItem) GetAllWords() (words TWordItemSlice) GetAllDocWordsIdx() [][]int32 GetAllDocWords() (doc [][]*TWordItem) GetDocWordsByDocid(id string) (doc []*TWordItem) GetDocWordsByIdx(i int) (doc []*TWordItem) Transform(content string) (wordsidx []int32) msgp.Encodable msgp.Decodable msgp.Marshaler msgp.Unmarshaler msgp.Sizer }
type TCorpusImpl ¶
type TCorpusImpl struct { Words TWordItemSlice //Vocab Word2Idx map[string]int32 //word -> words中的下标 Doc2WordsIdx [][]int32 // Doc2Idx map[string]int32 //docid -> Doc2WordsIdx中的下表 MinReduce int32 MinCnt int32 WordsCnt int //未排重的词数 }
func (*TCorpusImpl) Build ¶
func (p *TCorpusImpl) Build(fname string) (err error)
func (*TCorpusImpl) BuildFromString ¶
func (p *TCorpusImpl) BuildFromString(strA string) (err error)
func (*TCorpusImpl) DecodeMsg ¶
func (z *TCorpusImpl) DecodeMsg(dc *msgp.Reader) (err error)
DecodeMsg implements msgp.Decodable
func (*TCorpusImpl) EncodeMsg ¶
func (z *TCorpusImpl) EncodeMsg(en *msgp.Writer) (err error)
EncodeMsg implements msgp.Encodable
func (*TCorpusImpl) GetAllDocWords ¶
func (p *TCorpusImpl) GetAllDocWords() (docs [][]*TWordItem)
func (*TCorpusImpl) GetAllDocWordsIdx ¶
func (p *TCorpusImpl) GetAllDocWordsIdx() [][]int32
func (*TCorpusImpl) GetAllWords ¶
func (p *TCorpusImpl) GetAllWords() (words TWordItemSlice)
func (*TCorpusImpl) GetDocCnt ¶
func (p *TCorpusImpl) GetDocCnt() int
func (*TCorpusImpl) GetDocWordsByDocid ¶
func (p *TCorpusImpl) GetDocWordsByDocid(id string) (doc []*TWordItem)
func (*TCorpusImpl) GetDocWordsByIdx ¶
func (p *TCorpusImpl) GetDocWordsByIdx(i int) (doc []*TWordItem)
func (*TCorpusImpl) GetVocabCnt ¶
func (p *TCorpusImpl) GetVocabCnt() int
func (*TCorpusImpl) GetWordIdx ¶
func (p *TCorpusImpl) GetWordIdx(word string) (idx int32, ok bool)
func (*TCorpusImpl) GetWordItemByIdx ¶
func (p *TCorpusImpl) GetWordItemByIdx(i int) (item *TWordItem)
func (*TCorpusImpl) GetWordsCnt ¶
func (p *TCorpusImpl) GetWordsCnt() int
func (*TCorpusImpl) MarshalMsg ¶
func (z *TCorpusImpl) MarshalMsg(b []byte) (o []byte, err error)
MarshalMsg implements msgp.Marshaler
func (*TCorpusImpl) Msgsize ¶
func (z *TCorpusImpl) Msgsize() (s int)
Msgsize returns an upper bound estimate of the number of bytes occupied by the serialized message
func (*TCorpusImpl) String ¶
func (p *TCorpusImpl) String() string
func (*TCorpusImpl) Transform ¶
func (p *TCorpusImpl) Transform(content string) (wordsidx []int32)
func (*TCorpusImpl) UnmarshalMsg ¶
func (z *TCorpusImpl) UnmarshalMsg(bts []byte) (o []byte, err error)
UnmarshalMsg implements msgp.Unmarshaler
type TWordItem ¶
// TWordItem is one vocabulary entry: a word, its frequency, and its
// position in a Huffman tree.
type TWordItem struct {
	// Cnt is the term frequency.
	Cnt int32
	// Point is the path through the Huffman tree as node indices, covering
	// [root, leaf). The tree has n leaves plus n inner nodes, root included.
	Point []int32
	// Code is the Huffman code as 0/1 values, covering (root, leaf].
	Code []bool
	// Word is the word text.
	Word string
}
func (*TWordItem) MarshalMsg ¶
MarshalMsg implements msgp.Marshaler
type TWordItemSlice ¶
// TWordItemSlice is a slice of TWordItem values. Its Len, Less and Swap
// methods have the signatures required by sort.Interface.
// NOTE(review): the ordering Less imposes is not visible here — confirm before relying on sort order.
type TWordItemSlice []TWordItem
func (*TWordItemSlice) DecodeMsg ¶
func (z *TWordItemSlice) DecodeMsg(dc *msgp.Reader) (err error)
DecodeMsg implements msgp.Decodable
func (TWordItemSlice) EncodeMsg ¶
func (z TWordItemSlice) EncodeMsg(en *msgp.Writer) (err error)
EncodeMsg implements msgp.Encodable
func (TWordItemSlice) Len ¶
func (p TWordItemSlice) Len() int
func (TWordItemSlice) Less ¶
func (p TWordItemSlice) Less(i, j int) bool
func (TWordItemSlice) MarshalMsg ¶
func (z TWordItemSlice) MarshalMsg(b []byte) (o []byte, err error)
MarshalMsg implements msgp.Marshaler
func (TWordItemSlice) Msgsize ¶
func (z TWordItemSlice) Msgsize() (s int)
Msgsize returns an upper bound estimate of the number of bytes occupied by the serialized message
func (TWordItemSlice) Swap ¶
func (p TWordItemSlice) Swap(i, j int)
func (*TWordItemSlice) UnmarshalMsg ¶
func (z *TWordItemSlice) UnmarshalMsg(bts []byte) (o []byte, err error)
UnmarshalMsg implements msgp.Unmarshaler
Click to show internal directories.
Click to hide internal directories.