Documentation ¶
Overview ¶
Package filter prepares the tokenizer's inputs and outputs: it provides sentence splitting for input text and part-of-speech, feature, and word filters for the resulting tokens.
Index ¶
- func Drop(tokens *[]tokenizer.Token, match func(t tokenizer.Token) bool)
- func Keep(tokens *[]tokenizer.Token, match func(t tokenizer.Token) bool)
- func ScanSentences(data []byte, atEOF bool) (advance int, token []byte, err error)
- type Feature
- type Features
- type FeaturesFilter
- type POS
- type POSFilter
- type SentenceSplitter
- type WordFilter
Examples ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func ScanSentences ¶
ScanSentences implements the SplitFunc interface of bufio.Scanner that returns each sentence of text. See https://pkg.go.dev/bufio#SplitFunc.
Example ¶
package main import ( "bufio" "fmt" "strings" "github.com/ikawaha/kagome/v2/filter" ) func main() { sampleText := ` 人魚は、南の方の海にばかり棲んでいるのではあ りません。北の海にも棲んでいたのであります。 北方の海うみの色は、青うございました。ある とき、岩の上に、女の人魚があがって、あたりの景 色をながめながら休んでいました。 小川未明作 赤い蝋燭と人魚より` scanner := bufio.NewScanner(strings.NewReader(sampleText)) scanner.Split(filter.ScanSentences) for scanner.Scan() { fmt.Println(scanner.Text()) } if err := scanner.Err(); err != nil { panic(err) } }
Output: 人魚は、南の方の海にばかり棲んでいるのではありません。 北の海にも棲んでいたのであります。 北方の海うみの色は、青うございました。 あるとき、岩の上に、女の人魚があがって、あたりの景色をながめながら休んでいました。 小川未明作赤い蝋燭と人魚より
Types ¶
type Feature ¶
type Feature = string
Feature represents a feature.
const Any Feature = "\x00"
Any represents an arbitrary feature.
type FeaturesFilter ¶
type FeaturesFilter struct {
// contains filtered or unexported fields
}
FeaturesFilter represents a filter that filters a vector of features.
func NewFeaturesFilter ¶
func NewFeaturesFilter(fs ...Features) *FeaturesFilter
NewFeaturesFilter returns a features filter.
func (*FeaturesFilter) Match ¶
func (f *FeaturesFilter) Match(fs Features) bool
Match returns true if a filter matches the given features.
func (*FeaturesFilter) String ¶
func (f *FeaturesFilter) String() string
String implements the fmt.Stringer interface.
type POSFilter ¶
type POSFilter struct {
// contains filtered or unexported fields
}
POSFilter represents a part-of-speech filter.
Example ¶
package main import ( "fmt" "github.com/ikawaha/kagome-dict/dict" "github.com/ikawaha/kagome/v2/filter" "github.com/ikawaha/kagome/v2/tokenizer" ) const testDictPath = "../testdata/ipa.dict" func main() { d, err := dict.LoadDictFile(testDictPath) if err != nil { panic(err) } t, err := tokenizer.New(d, tokenizer.OmitBosEos()) if err != nil { panic(err) } posFilter := filter.NewPOSFilter([]filter.POS{ {"名詞", filter.Any, "人名"}, {"形容詞"}, }...) tokens := t.Tokenize("赤い蝋燭と人魚。小川未明") posFilter.Keep(&tokens) for _, v := range tokens { fmt.Println(v.Surface, v.POS()) } }
Output: 赤い [形容詞 自立 * *] 小川 [名詞 固有名詞 人名 姓] 未明 [名詞 固有名詞 人名 名]
func NewPOSFilter ¶
NewPOSFilter returns a part-of-speech filter.
type SentenceSplitter ¶
type SentenceSplitter struct { Delim []rune // delimiter set. ex. {'。','.'} Follower []rune // allow following after delimiters. ex. {'」','』'} SkipWhiteSpace bool // eliminate white space or not DoubleLineFeedSplit bool // split at '\n\n' or not MaxRuneLen int // max sentence length }
SentenceSplitter is a tiny sentence splitter for Japanese texts.
func (SentenceSplitter) ScanSentences ¶
func (s SentenceSplitter) ScanSentences(data []byte, atEOF bool) (advance int, token []byte, err error)
ScanSentences is a split function for a bufio.Scanner that returns each sentence of text.
type WordFilter ¶
type WordFilter struct {
// contains filtered or unexported fields
}
WordFilter represents a word filter.
Example ¶
d, err := dict.LoadDictFile(testDictPath) if err != nil { panic(err) } t, err := tokenizer.New(d, tokenizer.OmitBosEos()) if err != nil { panic(err) } stopWords := filter.NewWordFilter([]string{"私", "は", "が", "の", "。"}) tokens := t.Tokenize("私の猫の名前はアプロです。") stopWords.Drop(&tokens) for _, v := range tokens { fmt.Println(v.Surface) }
Output: 猫 名前 アプロ です
func NewWordFilter ¶
func NewWordFilter(words []string) *WordFilter
NewWordFilter returns a word filter.
func (WordFilter) Drop ¶
func (f WordFilter) Drop(tokens *[]tokenizer.Token)
Drop drops a token if a filter matches the token's surface.
func (WordFilter) Keep ¶
func (f WordFilter) Keep(tokens *[]tokenizer.Token)
Keep keeps a token if a filter matches the token's surface.
func (WordFilter) Match ¶
func (f WordFilter) Match(w string) bool
Match returns true if a filter matches a given word.