tokenizer

package
v0.0.0-...-063466f Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Nov 14, 2023 License: MIT Imports: 16 Imported by: 0

Documentation

Index

Constants

View Source
// File names of the dictionary, IDF and stop-word resources, the regular
// expression patterns used to classify and split text, and the default
// capacity of a segmentation result.
// NOTE(review): the pattern descriptions below assume Go regexp (RE2) syntax — confirm
// that these constants are compiled with the standard regexp package.
const (
	DictStdFile       = "dict_std_utf8.txt"        // standard dictionary file
	DictUserFile      = "dict_user_utf8.txt"       // user-defined dictionary file
	IDFStdFile        = "idf_std_utf8.txt"         // standard IDF file
	StopWordsStdFile  = "stop_words_std_utf8.txt"  // standard stop words file
	StopWordsUserFile = "stop_words_user_utf8.txt" // user-defined stop words file

	RegExpEnglish   = "([a-zA-Z0-9])+"                     // one or more ASCII letters or digits
	RegExpChinese   = "([\u4e00-\u9fa5])+"                 // one or more characters in the range U+4E00..U+9FA5
	RegExpText      = "([\u4e00-\u9fa5a-zA-Z0-9+#&._%-])+" // one or more of: U+4E00..U+9FA5, ASCII letters, digits, + # & . _ % -
	RegExpNumber    = "[a-zA-Z0-9]+(\\.\\d+)?%?"           // ASCII letters/digits, then an optional ".digits" fraction and an optional trailing %
	RegExpDelimiter = "[\\r\\n\\s\\t]"                     // a single delimiter character: \r, \n, \t or any \s whitespace

	DefaultWordsLen = 32 // default slice size of the word segmentation result
)
View Source
const (
	// DefaultIDFSize is a default size of 300000 entries.
	// NOTE(review): presumably the initial capacity of the IDF table loaded
	// from IDFStdFile — confirm against the IDF loader.
	DefaultIDFSize = 300000
)

Variables

This section is empty.

Functions

func CutAccurateW

func CutAccurateW(s string, words *[]string)

func CutFullW

func CutFullW(s string, words *[]string)

func CutNoHMMW

func CutNoHMMW(s string, words *[]string)

func CutSymbolW

func CutSymbolW(s string, words *[]string)

func GetDictFS

func GetDictFS() fs.FS

func GetDictFile

func GetDictFile(file string) (string, error)

Get the dictionary file directory

func GetDictPath

func GetDictPath() string

func Init

func Init(dictPath string)

func InitDictionary

func InitDictionary()

func InitFSToken

func InitFSToken()

func InitTFIDF

func InitTFIDF()

func InitWithFS

func InitWithFS(fs fs.FS)

func IsChineseChars

func IsChineseChars(s string) bool

func IsEnglishChars

func IsEnglishChars(s string) bool

func IsTextChars

func IsTextChars(s string) bool

func SetDictFS

func SetDictFS(fs fs.FS)

func SetDictPath

func SetDictPath(path string)

func SplitChineseSeg

func SplitChineseSeg(s string) []string

SplitChineseSeg splits a sentence into segments according to Chinese text.

func SplitNumberSeg

func SplitNumberSeg(s string) []string

SplitNumberSeg splits a sentence into segments according to numbers.

func SplitTextSeg

func SplitTextSeg(s string) []string

SplitTextSeg splits a sentence into segments according to normal text.

Types

type Dictionary

type Dictionary struct {
	// contains filtered or unexported fields
}

func GetDictionary

func GetDictionary() *Dictionary

func (*Dictionary) AddWord

func (d *Dictionary) AddWord(
	word string,
	freq int,
	prop string,
) (exist bool, err error)

func (*Dictionary) Exist

func (d *Dictionary) Exist(word string) bool

func (*Dictionary) GetTotalFreq

func (d *Dictionary) GetTotalFreq() float64

func (*Dictionary) GetWord

func (d *Dictionary) GetWord(word string) (int, bool)

type FinalSeg

type FinalSeg struct {
	// contains filtered or unexported fields
}

func GetFinalSeg

func GetFinalSeg() *FinalSeg

func (*FinalSeg) Cut

func (fs *FinalSeg) Cut(sentence string) []string

type IDFLoader

type IDFLoader struct {
	// contains filtered or unexported fields
}

type Keyword

// Keyword pairs a word with a floating-point weight. Both fields are
// exported and carry JSON tags, so a Keyword encodes as
// {"word": ..., "weight": ...}.
type Keyword struct {
	// Word is the keyword text; JSON key "word".
	Word   string  `json:"word"`
	// Weight is the numeric weight attached to Word; JSON key "weight".
	// NOTE(review): presumably the TF-IDF score — confirm against TFIDF.ExtractKeywords.
	Weight float64 `json:"weight"`
}

type Keywords

// Keywords is a slice of Keyword. Its Len, Less and Swap methods have the
// signatures required by sort.Interface.
// NOTE(review): the ordering implemented by Less is not visible here — confirm before relying on it.
type Keywords []Keyword

func (Keywords) Len

func (k Keywords) Len() int

func (Keywords) Less

func (k Keywords) Less(i, j int) bool

func (Keywords) Swap

func (k Keywords) Swap(i, j int)

type NodeDAG

// NodeDAG is the element type of the slice returned by Sentence.CalcDAG.
type NodeDAG struct {
	// NOTE(review): presumably a score or log-probability for the node — confirm against CalcDAG.
	X float64
	// NOTE(review): presumably an index into the sentence — confirm against CalcDAG.
	Y int
}

type Sentence

type Sentence struct {
	// contains filtered or unexported fields
}

func NewSentence

func NewSentence(s string) *Sentence

func (*Sentence) CalcDAG

func (s *Sentence) CalcDAG() []NodeDAG

func (*Sentence) GetChar

func (s *Sentence) GetChar(i int) string

func (*Sentence) GetDAG

func (s *Sentence) GetDAG() [][]int

func (*Sentence) GetWord

func (s *Sentence) GetWord(start, end int) string

func (*Sentence) Len

func (s *Sentence) Len() int

type StopWords

type StopWords struct {
	// contains filtered or unexported fields
}

type TFIDF

type TFIDF struct {
	// contains filtered or unexported fields
}

func GetTFIDF

func GetTFIDF() *TFIDF

func (*TFIDF) AddStopWord

func (t *TFIDF) AddStopWord(word string) (exist bool, err error)

func (*TFIDF) ExistStopWord

func (t *TFIDF) ExistStopWord(word string) bool

func (*TFIDF) ExtractKeywords

func (t *TFIDF) ExtractKeywords(
	s string,
	count int,
	withWeight bool,
) interface{}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL