Documentation ¶
Overview ¶
Package filter prepares the tokenizer's inputs and outputs: it provides sentence splitting for input text and part-of-speech, feature, and word filters for the resulting tokens.
Index ¶
- func Drop(tokens *[]tokenizer.Token, match func(t tokenizer.Token) bool)
- func Keep(tokens *[]tokenizer.Token, match func(t tokenizer.Token) bool)
- func ScanSentences(data []byte, atEOF bool) (advance int, token []byte, err error)
- type Feature
- type Features
- type FeaturesFilter
- type POS
- type POSFilter
- type SentenceSplitter
- type WordFilter
Examples ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func ScanSentences ¶
ScanSentences implements the SplitFunc interface of bufio.Scanner that returns each sentence of text. See https://pkg.go.dev/bufio#SplitFunc.
Example ¶
package main import ( "bufio" "fmt" "strings" "github.com/ikawaha/kagome/v2/filter" ) func main() { sampleText := ` 人魚は、南の方の海にばかり棲んでいるのではあ りません。北の海にも棲んでいたのであります。 北方の海うみの色は、青うございました。ある とき、岩の上に、女の人魚があがって、あたりの景 色をながめながら休んでいました。 小川未明作 赤い蝋燭と人魚より` scanner := bufio.NewScanner(strings.NewReader(sampleText)) scanner.Split(filter.ScanSentences) for scanner.Scan() { fmt.Println(scanner.Text()) } if err := scanner.Err(); err != nil { panic(err) } }
Output: 人魚は、南の方の海にばかり棲んでいるのではありません。 北の海にも棲んでいたのであります。 北方の海うみの色は、青うございました。 あるとき、岩の上に、女の人魚があがって、あたりの景色をながめながら休んでいました。 小川未明作赤い蝋燭と人魚より
Types ¶
type Feature ¶
type Feature = string
Feature represents a feature.
const Any Feature = "\x00"
Any represents an arbitrary feature.
type FeaturesFilter ¶
type FeaturesFilter struct {
// contains filtered or unexported fields
}
FeaturesFilter represents a filter that filters a vector of features.
func NewFeaturesFilter ¶
func NewFeaturesFilter(fs ...Features) *FeaturesFilter
NewFeaturesFilter returns a features filter.
func (*FeaturesFilter) Match ¶
func (f *FeaturesFilter) Match(fs Features) bool
Match returns true if a filter matches the given features.
func (*FeaturesFilter) String ¶
func (f *FeaturesFilter) String() string
String implements the fmt.Stringer interface.
type POSFilter ¶
type POSFilter struct {
// contains filtered or unexported fields
}
POSFilter represents a part-of-speech filter.
Example ¶
package main import ( "fmt" "github.com/ikawaha/kagome-dict/dict" "github.com/ikawaha/kagome/v2/filter" "github.com/ikawaha/kagome/v2/tokenizer" ) const testDictPath = "../testdata/ipa.dict" func main() { d, err := dict.LoadDictFile(testDictPath) if err != nil { panic(err) } t, err := tokenizer.New(d, tokenizer.OmitBosEos()) if err != nil { panic(err) } posFilter := filter.NewPOSFilter([]filter.POS{ {"名詞", filter.Any, "人名"}, {"形容詞"}, }...) tokens := t.Tokenize("赤い蝋燭と人魚。小川未明") posFilter.Keep(&tokens) for _, v := range tokens { fmt.Println(v.Surface, v.POS()) } }
Output: 赤い [形容詞 自立 * *] 小川 [名詞 固有名詞 人名 姓] 未明 [名詞 固有名詞 人名 名]
func NewPOSFilter ¶
NewPOSFilter returns a part-of-speech filter.
type SentenceSplitter ¶
type SentenceSplitter struct { Delim []rune // delimiter set. ex. {'。','.'} Follower []rune // allow following after delimiters. ex. {'」','』'} SkipWhiteSpace bool // eliminate white space or not DoubleLineFeedSplit bool // split at '\n\n' or not MaxRuneLen int // max sentence length }
SentenceSplitter is a tiny sentence splitter for Japanese texts.
func (SentenceSplitter) ScanSentences ¶
func (s SentenceSplitter) ScanSentences(data []byte, atEOF bool) (advance int, token []byte, err error)
ScanSentences is a split function for a bufio.Scanner that returns each sentence of text.
type WordFilter ¶
type WordFilter struct {
// contains filtered or unexported fields
}
WordFilter represents a word filter.
Example ¶
d, err := dict.LoadDictFile(testDictPath) if err != nil { panic(err) } t, err := tokenizer.New(d, tokenizer.OmitBosEos()) if err != nil { panic(err) } stopWords := filter.NewWordFilter([]string{"私", "は", "が", "の", "。"}) tokens := t.Tokenize("私の猫の名前はアプロです。") stopWords.Drop(&tokens) for _, v := range tokens { fmt.Println(v.Surface) }
Output: 猫 名前 アプロ です
func NewWordFilter ¶
func NewWordFilter(words []string) *WordFilter
NewWordFilter returns a word filter.
func (WordFilter) Drop ¶
func (f WordFilter) Drop(tokens *[]tokenizer.Token)
Drop drops a token if a filter matches the token's surface.
func (WordFilter) Keep ¶
func (f WordFilter) Keep(tokens *[]tokenizer.Token)
Keep keeps a token if a filter matches the token's surface.
func (WordFilter) Match ¶
func (f WordFilter) Match(w string) bool
Match returns true if a filter matches a given word.