corpus

package
v0.0.0-...-b167170 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jun 23, 2020 License: Apache-2.0 Imports: 11 Imported by: 2

Documentation

Index

Constants

View Source
const (
	// VOCAB_HASH_SIZE is the constant 30,000,000.
	// NOTE(review): the name suggests a capacity for a vocabulary hash
	// table; its uses are not visible here — confirm.
	VOCAB_HASH_SIZE int = 30000000 // 30 million (30M)
)

Variables

This section is empty.

Functions

This section is empty.

Types

type ICorpus

// ICorpus is the corpus interface returned by NewCorpus. In addition to the
// methods below it embeds msgp.Encodable, msgp.Decodable, msgp.Marshaler,
// msgp.Unmarshaler and msgp.Sizer, so implementations must provide those
// serialization methods as well.
type ICorpus interface {
	// Build and BuildFromString populate the corpus.
	// NOTE(review): presumably fname is an input file path and strA is the
	// raw input text; the expected input format is not visible here — confirm.
	Build(fname string) (err error)
	BuildFromString(strA string) (err error)
	GetVocabCnt() int // vocabulary size after deduplication
	GetDocCnt() int   // number of docs, deduplicated by docid
	GetWordsCnt() int // number of words before deduplication
	// Word lookups.
	// NOTE(review): presumably idx and i index the vocabulary returned by
	// GetAllWords, and ok is false for an unknown word — confirm.
	GetWordIdx(word string) (idx int32, ok bool)
	GetWordItemByIdx(i int) (item *TWordItem)
	GetAllWords() (words TWordItemSlice)
	// Per-document accessors, returning word indices or word items.
	// NOTE(review): presumably one inner slice per document, selectable by
	// docid or by document position — confirm.
	GetAllDocWordsIdx() [][]int32
	GetAllDocWords() (doc [][]*TWordItem)
	GetDocWordsByDocid(id string) (doc []*TWordItem)
	GetDocWordsByIdx(i int) (doc []*TWordItem)
	// Transform converts content into a slice of word indices.
	// NOTE(review): tokenization and the handling of unknown words are not
	// visible here — confirm.
	Transform(content string) (wordsidx []int32)
	// Serialization interfaces from the msgp package.
	msgp.Encodable
	msgp.Decodable
	msgp.Marshaler
	msgp.Unmarshaler
	msgp.Sizer
}

func NewCorpus

func NewCorpus() ICorpus

type TCorpusImpl

// TCorpusImpl is the concrete corpus type; its pointer type declares the
// ICorpus methods plus String.
//
// NOTE(review): MinReduce and MinCnt carry no comments in the original;
// presumably they are word-frequency thresholds applied while building the
// vocabulary — confirm against Build.
type TCorpusImpl struct {
	Words        TWordItemSlice   // vocabulary
	Word2Idx     map[string]int32 // word -> index into Words
	Doc2WordsIdx [][]int32        // per-document word indices; NOTE(review): presumably indices into Words — confirm
	Doc2Idx      map[string]int32 // docid -> index into Doc2WordsIdx
	MinReduce    int32
	MinCnt       int32
	WordsCnt     int // number of words, not deduplicated
}

func (*TCorpusImpl) Build

func (p *TCorpusImpl) Build(fname string) (err error)

func (*TCorpusImpl) BuildFromString

func (p *TCorpusImpl) BuildFromString(strA string) (err error)

func (*TCorpusImpl) DecodeMsg

func (z *TCorpusImpl) DecodeMsg(dc *msgp.Reader) (err error)

DecodeMsg implements msgp.Decodable

func (*TCorpusImpl) EncodeMsg

func (z *TCorpusImpl) EncodeMsg(en *msgp.Writer) (err error)

EncodeMsg implements msgp.Encodable

func (*TCorpusImpl) GetAllDocWords

func (p *TCorpusImpl) GetAllDocWords() (docs [][]*TWordItem)

func (*TCorpusImpl) GetAllDocWordsIdx

func (p *TCorpusImpl) GetAllDocWordsIdx() [][]int32

func (*TCorpusImpl) GetAllWords

func (p *TCorpusImpl) GetAllWords() (words TWordItemSlice)

func (*TCorpusImpl) GetDocCnt

func (p *TCorpusImpl) GetDocCnt() int

func (*TCorpusImpl) GetDocWordsByDocid

func (p *TCorpusImpl) GetDocWordsByDocid(id string) (doc []*TWordItem)

func (*TCorpusImpl) GetDocWordsByIdx

func (p *TCorpusImpl) GetDocWordsByIdx(i int) (doc []*TWordItem)

func (*TCorpusImpl) GetVocabCnt

func (p *TCorpusImpl) GetVocabCnt() int

func (*TCorpusImpl) GetWordIdx

func (p *TCorpusImpl) GetWordIdx(word string) (idx int32, ok bool)

func (*TCorpusImpl) GetWordItemByIdx

func (p *TCorpusImpl) GetWordItemByIdx(i int) (item *TWordItem)

func (*TCorpusImpl) GetWordsCnt

func (p *TCorpusImpl) GetWordsCnt() int

func (*TCorpusImpl) MarshalMsg

func (z *TCorpusImpl) MarshalMsg(b []byte) (o []byte, err error)

MarshalMsg implements msgp.Marshaler

func (*TCorpusImpl) Msgsize

func (z *TCorpusImpl) Msgsize() (s int)

Msgsize returns an upper bound estimate of the number of bytes occupied by the serialized message

func (*TCorpusImpl) String

func (p *TCorpusImpl) String() string

func (*TCorpusImpl) Transform

func (p *TCorpusImpl) Transform(content string) (wordsidx []int32)

func (*TCorpusImpl) UnmarshalMsg

func (z *TCorpusImpl) UnmarshalMsg(bts []byte) (o []byte, err error)

UnmarshalMsg implements msgp.Unmarshaler

type TWordItem

// TWordItem is one vocabulary entry: a word, its term frequency, and its
// Huffman-tree path and code.
//
// NOTE(review): Point and Code are presumably used for hierarchical softmax
// in word2vec-style training and have equal length — confirm.
type TWordItem struct {
	Cnt   int32   // term frequency
	Point []int32 // Huffman tree (n leaves + n inner nodes, including root) path: node indices over [root, leaf)
	Code  []bool  // Huffman code: 0/1 codes over (root, leaf]
	Word  string  // the word text
}

func (*TWordItem) DecodeMsg

func (z *TWordItem) DecodeMsg(dc *msgp.Reader) (err error)

DecodeMsg implements msgp.Decodable

func (*TWordItem) EncodeMsg

func (z *TWordItem) EncodeMsg(en *msgp.Writer) (err error)

EncodeMsg implements msgp.Encodable

func (*TWordItem) MarshalMsg

func (z *TWordItem) MarshalMsg(b []byte) (o []byte, err error)

MarshalMsg implements msgp.Marshaler

func (*TWordItem) Msgsize

func (z *TWordItem) Msgsize() (s int)

Msgsize returns an upper bound estimate of the number of bytes occupied by the serialized message

func (*TWordItem) UnmarshalMsg

func (z *TWordItem) UnmarshalMsg(bts []byte) (o []byte, err error)

UnmarshalMsg implements msgp.Unmarshaler

type TWordItemSlice

// TWordItemSlice is a slice of TWordItem. It declares Len, Less and Swap
// (the sort.Interface method set) in addition to the msgp serialization
// methods.
// NOTE(review): the ordering implemented by Less is not visible here — confirm.
type TWordItemSlice []TWordItem

func (*TWordItemSlice) DecodeMsg

func (z *TWordItemSlice) DecodeMsg(dc *msgp.Reader) (err error)

DecodeMsg implements msgp.Decodable

func (TWordItemSlice) EncodeMsg

func (z TWordItemSlice) EncodeMsg(en *msgp.Writer) (err error)

EncodeMsg implements msgp.Encodable

func (TWordItemSlice) Len

func (p TWordItemSlice) Len() int

func (TWordItemSlice) Less

func (p TWordItemSlice) Less(i, j int) bool

func (TWordItemSlice) MarshalMsg

func (z TWordItemSlice) MarshalMsg(b []byte) (o []byte, err error)

MarshalMsg implements msgp.Marshaler

func (TWordItemSlice) Msgsize

func (z TWordItemSlice) Msgsize() (s int)

Msgsize returns an upper bound estimate of the number of bytes occupied by the serialized message

func (TWordItemSlice) Swap

func (p TWordItemSlice) Swap(i, j int)

func (*TWordItemSlice) UnmarshalMsg

func (z *TWordItemSlice) UnmarshalMsg(bts []byte) (o []byte, err error)

UnmarshalMsg implements msgp.Unmarshaler

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL