Documentation ¶
Index ¶
- Constants
- Variables
- func NewJapaneseTokenizer(dict *dict.Dict, opts ...TokenizerOption) analysis.Tokenizer
- func NewUnicodeNormalizeCharFilter(form norm.Form) analysis.CharFilter
- func StopTagsTokenMapConstructor(_ map[string]any, _ *registry.Cache) (analysis.TokenMap, error)
- func StopWordsTokenFilterConstructor(_ map[string]any, cache *registry.Cache) (analysis.TokenFilter, error)
- func StopWordsTokenMapConstructor(_ map[string]any, _ *registry.Cache) (analysis.TokenMap, error)
- func TokenizerConstructor(config map[string]any, cache *registry.Cache) (analysis.Tokenizer, error)
- func UnicodeNormalizeCharFilterConstructor(config map[string]any, _ *registry.Cache) (analysis.CharFilter, error)
- type JapaneseTokenizer
- type TokenizerOption
- type UnicodeNormalizeCharFilter
Constants ¶
const ( Name = "ja_kagome" DictIPA = "ipa" DictUni = "uni" )
const NormalizeCharFilterName = "ja_normalize_unicode"
const StopTagsName = "stop_tags_ja"
const StopWordsName = "stop_words_ja"
StopWordsName is the name of the stop words filter.
Variables ¶
DefaultInflected represents POSs (parts of speech) which have an inflected form.
var StopTagsBytes []byte
StopTagsBytes is a stop tag list. See https://github.com/apache/lucene-solr/blob/master/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/stoptags.txt
var StopWordsBytes []byte
StopWordsBytes is a stop word list. See https://github.com/apache/lucene-solr/blob/master/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/stopwords.txt
Functions ¶
func NewJapaneseTokenizer ¶
func NewJapaneseTokenizer(dict *dict.Dict, opts ...TokenizerOption) analysis.Tokenizer
NewJapaneseTokenizer returns a Japanese tokenizer.
func NewUnicodeNormalizeCharFilter ¶
func NewUnicodeNormalizeCharFilter(form norm.Form) analysis.CharFilter
NewUnicodeNormalizeCharFilter returns a Unicode normalization char filter.
func StopTagsTokenMapConstructor ¶
StopTagsTokenMapConstructor returns a token map for stop tags (for IPA dict).
func StopWordsTokenFilterConstructor ¶
func StopWordsTokenFilterConstructor(_ map[string]any, cache *registry.Cache) (analysis.TokenFilter, error)
StopWordsTokenFilterConstructor returns a token filter for stop words.
func StopWordsTokenMapConstructor ¶
StopWordsTokenMapConstructor returns a token map for stop words.
func TokenizerConstructor ¶
Types ¶
type JapaneseTokenizer ¶
JapaneseTokenizer represents a Japanese tokenizer with filters.
func (*JapaneseTokenizer) Tokenize ¶
func (t *JapaneseTokenizer) Tokenize(input []byte) analysis.TokenStream
Tokenize tokenizes the input and filters them.
type TokenizerOption ¶
type TokenizerOption func(t *JapaneseTokenizer)
TokenizerOption represents an option of the Japanese tokenizer.
func BaseFormFilter ¶
func BaseFormFilter(m analysis.TokenMap) TokenizerOption
BaseFormFilter returns a base form filter option.
func StopTagsFilter ¶
func StopTagsFilter(m analysis.TokenMap) TokenizerOption
StopTagsFilter returns a stop tags filter option.
type UnicodeNormalizeCharFilter ¶
type UnicodeNormalizeCharFilter struct {
// contains filtered or unexported fields
}
UnicodeNormalizeCharFilter represents a Unicode normalization char filter.
func (UnicodeNormalizeCharFilter) Filter ¶
func (f UnicodeNormalizeCharFilter) Filter(input []byte) []byte
Filter applies per-char normalization.