Documentation
Overview

Package gse: Go efficient text segmentation (high-performance word segmentation in Go).

Index
- Constants
- Variables
- func DictPaths(dictDir, filePath string) (files []string)
- func GetVersion() string
- func IsJp(segText string) bool
- func Join(text []Text) string
- func ToSlice(segs []Segment, searchMode ...bool) (output []string)
- func ToString(segs []Segment, searchMode ...bool) (output string)
- type Dictionary
- type Prob
- type Segment
- type Segmenter
- func (seg *Segmenter) AddToken(text string, frequency int, pos ...string)
- func (seg *Segmenter) AddTokenForce(text string, frequency int, pos ...string)
- func (seg *Segmenter) CalcToken()
- func (seg *Segmenter) Cut(str string, hmm ...bool) []string
- func (seg *Segmenter) CutAll(str string) []string
- func (seg *Segmenter) CutSearch(str string, hmm ...bool) []string
- func (seg *Segmenter) Dictionary() *Dictionary
- func (seg *Segmenter) Find(str string) (int, bool)
- func (seg *Segmenter) HMMCut(str string) []string
- func (seg *Segmenter) HMMCutMod(str string, prob ...map[rune]float64) []string
- func (seg *Segmenter) LoadDict(files ...string) error
- func (seg *Segmenter) LoadModel(prob ...map[rune]float64)
- func (seg *Segmenter) ModeSegment(bytes []byte, searchMode ...bool) []Segment
- func (seg *Segmenter) Read(file string) error
- func (seg *Segmenter) Segment(bytes []byte) []Segment
- func (seg *Segmenter) Slice(bytes []byte, searchMode ...bool) []string
- func (seg *Segmenter) String(bytes []byte, searchMode ...bool) string
- type Text
- type Token
- type TokenJson
Constants

const (
	// RatioWord is the ratio of words to letters
	RatioWord float32 = 1.5
	// RatioWordFull is the full ratio of words to letters
	RatioWordFull float32 = 1
)
Variables

var (
	// LoadNoFreq loads dictionary words that have no frequency
	LoadNoFreq bool
	// MinTokenFreq is the minimum token frequency to load
	MinTokenFreq = 2
)
Functions

func ToSlice
ToSlice converts segmentation results to a string slice.

There are two output modes; take "山达尔星联邦共和国" as an example:

Normal mode (searchMode=false) outputs the single word "[山达尔星联邦共和国]". Search mode (searchMode=true) outputs a finer-grained split of the normal-mode result: "[山达尔星 联邦 共和 国 共和国 联邦共和国 山达尔星联邦共和国]".

The default is searchMode=false. Search mode is mainly used to provide search engines with as many keywords as possible; see the comments on the Token struct for details.
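As a rough illustration of what search mode adds, the sketch below enumerates every dictionary sub-word inside a segment. The `searchModeSplit` helper and its toy dictionary are hypothetical, not part of gse's API, and this is not gse's actual algorithm.

```go
package main

import "fmt"

// searchModeSplit enumerates every rune substring of segment that appears
// in dict, mimicking how search mode emits sub-words in addition to the
// full segment. A toy illustration only, not gse's implementation.
func searchModeSplit(segment string, dict map[string]bool) []string {
	runes := []rune(segment)
	var out []string
	for i := 0; i < len(runes); i++ {
		for j := i + 1; j <= len(runes); j++ {
			if w := string(runes[i:j]); dict[w] {
				out = append(out, w)
			}
		}
	}
	return out
}

func main() {
	dict := map[string]bool{"联邦": true, "共和": true, "共和国": true, "联邦共和国": true}
	fmt.Println(searchModeSplit("联邦共和国", dict))
	// → [联邦 联邦共和国 共和 共和国]
}
```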
Types

type Dictionary
type Dictionary struct {
// contains filtered or unexported fields
}
The Dictionary struct implements a string prefix trie; a word may end at a leaf node or at an internal (non-leaf) node.
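A minimal sketch of the idea (not gse's actual Dictionary implementation): a rune-keyed trie where an `isWord` flag marks nodes that terminate a word, so words can end at internal nodes as well as leaves. All names here are illustrative.

```go
package main

import "fmt"

// trieNode is a minimal rune-keyed prefix-trie node. isWord marks nodes
// that terminate a dictionary word, so a word can end at an internal node
// (e.g. "共和" inside "共和国"). A sketch only, not gse's Dictionary.
type trieNode struct {
	children map[rune]*trieNode
	isWord   bool
}

func newTrie() *trieNode { return &trieNode{children: map[rune]*trieNode{}} }

// insert adds word to the trie, creating nodes as needed.
func (t *trieNode) insert(word string) {
	n := t
	for _, r := range word {
		if n.children[r] == nil {
			n.children[r] = newTrie()
		}
		n = n.children[r]
	}
	n.isWord = true
}

// find reports whether word was inserted as a complete word.
func (t *trieNode) find(word string) bool {
	n := t
	for _, r := range word {
		if n = n.children[r]; n == nil {
			return false
		}
	}
	return n.isWord
}

func main() {
	trie := newTrie()
	trie.insert("共和")
	trie.insert("共和国")
	// "共和" ends at an internal node, "共和国" at a leaf; "共" is only a prefix.
	fmt.Println(trie.find("共和"), trie.find("共和国"), trie.find("共")) // → true true false
}
```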
type Segmenter
type Segmenter struct {
// contains filtered or unexported fields
}
Segmenter is the word-segmenter struct.
func (*Segmenter) AddTokenForce

AddTokenForce adds new text as a token and forces the addition.
func (*Segmenter) Cut

Cut cuts str into words using accurate mode. The hmm parameter controls whether to use the HMM (Hidden Markov Model) or the user's model.
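As a simplified picture of dictionary-driven cutting (gse's accurate mode is more sophisticated than this), here is a forward maximum-matching sketch; `fmmCut` is a hypothetical helper, not part of gse's API.

```go
package main

import "fmt"

// fmmCut splits text greedily by forward maximum matching against dict:
// at each position it takes the longest dictionary word (up to maxLen
// runes), falling back to a single rune. A simplified illustration only.
func fmmCut(text string, dict map[string]bool, maxLen int) []string {
	runes := []rune(text)
	var words []string
	for i := 0; i < len(runes); {
		n := 1
		for l := maxLen; l > 1; l-- {
			if i+l <= len(runes) && dict[string(runes[i:i+l])] {
				n = l
				break
			}
		}
		words = append(words, string(runes[i:i+n]))
		i += n
	}
	return words
}

func main() {
	dict := map[string]bool{"世界": true, "人口": true, "七十亿": true}
	fmt.Println(fmmCut("世界有七十亿人口", dict, 3))
	// → [世界 有 七十亿 人口]
}
```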
func (*Segmenter) LoadDict

LoadDict loads the dictionary from one or more files.

The dictionary format is (one word per line):

word text, frequency, part of speech

Multiple dictionary files can be loaded, with the file names separated by ","; dictionaries listed first take priority when loading words, for example:

"user_dictionary.txt,common_dictionary.txt"

When a word appears in both the user dictionary and the common dictionary, the user dictionary takes priority.
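A sketch of parsing one dictionary line in the format above, assuming whitespace-separated fields; `parseDictLine` and `dictEntry` are hypothetical helpers, not part of gse's API.

```go
package main

import (
	"fmt"
	"strconv"
	"strings"
)

// dictEntry holds one parsed dictionary line: word text, frequency,
// and an optional part of speech. A hypothetical helper type.
type dictEntry struct {
	Text string
	Freq int
	Pos  string
}

// parseDictLine parses a whitespace-separated line "text freq [pos]".
func parseDictLine(line string) (dictEntry, error) {
	fields := strings.Fields(line)
	if len(fields) < 2 {
		return dictEntry{}, fmt.Errorf("need at least text and frequency: %q", line)
	}
	freq, err := strconv.Atoi(fields[1])
	if err != nil {
		return dictEntry{}, fmt.Errorf("bad frequency %q: %w", fields[1], err)
	}
	e := dictEntry{Text: fields[0], Freq: freq}
	if len(fields) > 2 {
		e.Pos = fields[2]
	}
	return e, nil
}

func main() {
	e, _ := parseDictLine("世界 34387 n")
	fmt.Println(e.Text, e.Freq, e.Pos) // → 世界 34387 n
}
```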
func (*Segmenter) LoadModel

LoadModel loads the HMM model.
Use the user's model:
seg.LoadModel(B, E, M, S map[rune]float64)
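To illustrate how per-state emission maps like `B, E, M, S map[rune]float64` can drive HMM-based labeling, here is a toy Viterbi sketch over the B/E/M/S tag set (Begin, End, Middle, Single) with made-up transition probabilities; it is not gse's implementation, and all values are invented for the demo.

```go
package main

import (
	"fmt"
	"math"
)

// states of the BEMS labeling scheme: Begin, End, Middle, Single.
var states = []byte{'B', 'E', 'M', 'S'}

// viterbi labels each rune of text with B/E/M/S given per-state
// log-emission maps, using fixed toy transition log-probabilities.
// A toy sketch of HMM tagging, not gse's implementation.
func viterbi(text []rune, emit map[byte]map[rune]float64) string {
	const neg = -3.14e100 // effectively log(0): an impossible transition
	trans := map[byte]map[byte]float64{
		'B': {'E': -0.5, 'M': -0.9, 'B': neg, 'S': neg},
		'M': {'E': -0.7, 'M': -0.7, 'B': neg, 'S': neg},
		'E': {'B': -0.6, 'S': -0.8, 'E': neg, 'M': neg},
		'S': {'B': -0.6, 'S': -0.8, 'E': neg, 'M': neg},
	}
	start := map[byte]float64{'B': -0.6, 'S': -0.8, 'E': neg, 'M': neg}

	type cell struct {
		score float64
		prev  byte
	}
	v := make([]map[byte]cell, len(text))
	v[0] = map[byte]cell{}
	for _, s := range states {
		v[0][s] = cell{start[s] + emit[s][text[0]], 0}
	}
	for t := 1; t < len(text); t++ {
		v[t] = map[byte]cell{}
		for _, s := range states {
			best := cell{neg * 2, 'S'}
			for _, p := range states {
				if sc := v[t-1][p].score + trans[p][s]; sc > best.score {
					best = cell{sc, p}
				}
			}
			best.score += emit[s][text[t]]
			v[t][s] = best
		}
	}
	// Backtrack from the best final state.
	labels := make([]byte, len(text))
	bestS, bestScore := byte('S'), math.Inf(-1)
	for _, s := range states {
		if v[len(text)-1][s].score > bestScore {
			bestS, bestScore = s, v[len(text)-1][s].score
		}
	}
	for t := len(text) - 1; t >= 0; t-- {
		labels[t] = bestS
		bestS = v[t][bestS].prev
	}
	return string(labels)
}

// toyEmit builds invented log-emission maps that make "你好" label as BE.
func toyEmit() map[byte]map[rune]float64 {
	low := -2.0
	return map[byte]map[rune]float64{
		'B': {'你': -0.1, '好': low},
		'E': {'你': low, '好': -0.1},
		'M': {'你': low, '好': low},
		'S': {'你': low, '好': low},
	}
}

func main() {
	fmt.Println(viterbi([]rune("你好"), toyEmit())) // → BE
}
```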
func (*Segmenter) ModeSegment

ModeSegment segments the bytes, using search mode when searchMode is true.
type Text
type Text []byte
Text is a string type that can represent:

- a character, such as "世" or "界"; in English, one element is one word
- a word, such as "世界" or "人口"
- a piece of text, such as "世界有七十亿人口"
Source Files
Directories
Path | Synopsis
---|---
hmm | Package hmm is the Golang HMM cut module; the model data is from https://github.com/fxsjy/jieba