Documentation ¶
Overview ¶
Package lowercase implements a TokenFilter which converts tokens to lower case according to Unicode rules.
Package stop implements a TokenFilter removing tokens found in a TokenMap.
Its constructor takes the following argument:
"stop_token_map" (string): the name of the token map identifying tokens to remove.
Index ¶
- Constants
- type ApostropheFilter
- type CamelCaseFilter
- type DictionaryCompoundFilter
- type EdgeNgramFilter
- type ElisionFilter
- type KeyWordMarkerFilter
- type LengthFilter
- type LowerCaseFilter
- type LowerCaseState
- type NgramFilter
- type NonAlphaNumericCaseState
- type NumberCaseState
- type Parser
- type PorterStemmer
- type ReverseFilter
- type ShingleFilter
- type Side
- type State
- type StopTokensFilter
- type TruncateTokenFilter
- type UnicodeNormalizeFilter
- type UniqueTermFilter
- type UpperCaseState
Constants ¶
const Apostrophe = '\''
const Apostrophes = string(Apostrophe) + string(RightSingleQuotationMark)
const RightSingleQuotationMark = '’'
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type ApostropheFilter ¶
type ApostropheFilter struct{}
func NewApostropheFilter ¶
func NewApostropheFilter() *ApostropheFilter
func (*ApostropheFilter) Filter ¶
func (s *ApostropheFilter) Filter(input analysis.TokenStream) analysis.TokenStream
type CamelCaseFilter ¶
type CamelCaseFilter struct{}
CamelCaseFilter splits a given token into a set of tokens where each resulting token falls into one of the following classes:
- Upper case followed by lower case letters. Terminated by a number, an upper case letter, or a non alpha-numeric symbol.
- Upper case followed by upper case letters. Terminated by a number, an upper case letter followed by a lower case letter, or a non alpha-numeric symbol.
- Lower case followed by lower case letters. Terminated by a number, an upper case letter, or a non alpha-numeric symbol.
- Number followed by numbers. Terminated by a letter or a non alpha-numeric symbol.
- Non alpha-numeric symbol followed by non alpha-numeric symbols. Terminated by a number or a letter.
It does a one-time sequential pass over an input token, from left to right. The scan is greedy and generates the longest substring that fits into one of the classes.
See the test file for examples of classes and their parsings.
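As a rough illustration, splitting a single mixed-case token might look like the sketch below; the analysis import path and Token fields are assumptions, and the exact output should be confirmed against the tests.

import "github.com/blevesearch/bleve/v2/analysis" // assumed import path

func camelCaseExample() analysis.TokenStream {
	filter := NewCamelCaseFilter()
	input := analysis.TokenStream{
		{Term: []byte("wiFiRouter2000")},
	}
	// By the rules above this is expected to split roughly into
	// "wi", "Fi", "Router" and "2000".
	return filter.Filter(input)
}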
func NewCamelCaseFilter ¶
func NewCamelCaseFilter() *CamelCaseFilter
func (*CamelCaseFilter) Filter ¶
func (f *CamelCaseFilter) Filter(input analysis.TokenStream) analysis.TokenStream
type DictionaryCompoundFilter ¶
type DictionaryCompoundFilter struct {
// contains filtered or unexported fields
}
func NewDictionaryCompoundFilter ¶
func NewDictionaryCompoundFilter(dict analysis.TokenMap, minWordSize, minSubWordSize, maxSubWordSize int, onlyLongestMatch bool) *DictionaryCompoundFilter
func (*DictionaryCompoundFilter) Filter ¶
func (f *DictionaryCompoundFilter) Filter(input analysis.TokenStream) analysis.TokenStream
type EdgeNgramFilter ¶
type EdgeNgramFilter struct {
// contains filtered or unexported fields
}
func NewEdgeNgramFilter ¶
func NewEdgeNgramFilter(side Side, minLength, maxLength int) *EdgeNgramFilter
func (*EdgeNgramFilter) Filter ¶
func (s *EdgeNgramFilter) Filter(input analysis.TokenStream) analysis.TokenStream
type ElisionFilter ¶
type ElisionFilter struct {
// contains filtered or unexported fields
}
func NewElisionFilter ¶
func NewElisionFilter(articles analysis.TokenMap) *ElisionFilter
func (*ElisionFilter) Filter ¶
func (s *ElisionFilter) Filter(input analysis.TokenStream) analysis.TokenStream
type KeyWordMarkerFilter ¶
type KeyWordMarkerFilter struct {
// contains filtered or unexported fields
}
func NewKeyWordMarkerFilter ¶
func NewKeyWordMarkerFilter(keyWords analysis.TokenMap) *KeyWordMarkerFilter
func (*KeyWordMarkerFilter) Filter ¶
func (f *KeyWordMarkerFilter) Filter(input analysis.TokenStream) analysis.TokenStream
type LengthFilter ¶
type LengthFilter struct {
// contains filtered or unexported fields
}
func NewLengthFilter ¶
func NewLengthFilter(min, max int) *LengthFilter
func (*LengthFilter) Filter ¶
func (f *LengthFilter) Filter(input analysis.TokenStream) analysis.TokenStream
type LowerCaseFilter ¶
type LowerCaseFilter struct{}
func NewLowerCaseFilter ¶
func NewLowerCaseFilter() *LowerCaseFilter
func (*LowerCaseFilter) Filter ¶
func (f *LowerCaseFilter) Filter(input analysis.TokenStream) analysis.TokenStream
type LowerCaseState ¶
type LowerCaseState struct{}
func (*LowerCaseState) StartSym ¶
func (s *LowerCaseState) StartSym(sym rune) bool
type NgramFilter ¶
type NgramFilter struct {
// contains filtered or unexported fields
}
func NewNgramFilter ¶
func NewNgramFilter(minLength, maxLength int) *NgramFilter
func (*NgramFilter) Filter ¶
func (s *NgramFilter) Filter(input analysis.TokenStream) analysis.TokenStream
type NonAlphaNumericCaseState ¶
type NonAlphaNumericCaseState struct{}
func (*NonAlphaNumericCaseState) Member ¶
func (s *NonAlphaNumericCaseState) Member(sym rune, peek *rune) bool
func (*NonAlphaNumericCaseState) StartSym ¶
func (s *NonAlphaNumericCaseState) StartSym(sym rune) bool
type NumberCaseState ¶
type NumberCaseState struct{}
func (*NumberCaseState) StartSym ¶
func (s *NumberCaseState) StartSym(sym rune) bool
type Parser ¶
type Parser struct {
// contains filtered or unexported fields
}
Parser accepts a symbol and passes it to the current state (representing a class). The state either accepts it (and accumulates it), or the parser creates a new state that starts with the pushed symbol.
The parser emits a new resulting token every time it switches state. Use FlushTokens() to collect the results after the last symbol has been pushed.
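The push/flush mechanism can be pictured with the stripped-down, standalone sketch below. It is not the package's Parser (whose constructor and push method are not documented on this page); it only tracks a digit/non-digit class to show the accumulate-and-switch idea.

// toyParser is a simplified, hypothetical illustration: it accumulates
// symbols and starts a new token whenever the class of the incoming
// symbol changes, then returns everything on flush.
type toyParser struct {
	current []rune
	tokens  []string
	isDigit bool
}

func (p *toyParser) push(sym rune) {
	digit := sym >= '0' && sym <= '9'
	if len(p.current) > 0 && digit != p.isDigit {
		// Class switch: emit the accumulated token, start a new one.
		p.tokens = append(p.tokens, string(p.current))
		p.current = p.current[:0]
	}
	p.isDigit = digit
	p.current = append(p.current, sym)
}

func (p *toyParser) flushTokens() []string {
	if len(p.current) > 0 {
		p.tokens = append(p.tokens, string(p.current))
		p.current = nil
	}
	return p.tokens
}

Pushing the symbols of "abc123" one by one and then calling flushTokens yields ["abc", "123"].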
func (*Parser) FlushTokens ¶
type PorterStemmer ¶
type PorterStemmer struct{}
func NewPorterStemmer ¶
func NewPorterStemmer() *PorterStemmer
func (*PorterStemmer) Filter ¶
func (s *PorterStemmer) Filter(input analysis.TokenStream) analysis.TokenStream
type ReverseFilter ¶
type ReverseFilter struct{}
func NewReverseFilter ¶
func NewReverseFilter() *ReverseFilter
func (*ReverseFilter) Filter ¶
func (f *ReverseFilter) Filter(input analysis.TokenStream) analysis.TokenStream
type ShingleFilter ¶
type ShingleFilter struct {
// contains filtered or unexported fields
}
func NewShingleFilter ¶
func NewShingleFilter(min, max int, outputOriginal bool, sep, fill string) *ShingleFilter
func (*ShingleFilter) Filter ¶
func (s *ShingleFilter) Filter(input analysis.TokenStream) analysis.TokenStream
type State ¶
type State interface {
	// is _sym_ the start character
	StartSym(sym rune) bool
	// is _sym_ a member of a class.
	// peek, the next sym on the tape, can also be used to determine a class.
	Member(sym rune, peek *rune) bool
}
States codify the classes that the parser recognizes.
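For illustration, a hypothetical state recognizing a run of lower-case ASCII vowels could satisfy the interface as follows (it is not one of the states shipped with the package):

import "strings"

// vowelCaseState is a hypothetical example state: a token segment in
// this class starts with a vowel and consists only of vowels.
type vowelCaseState struct{}

func (s *vowelCaseState) StartSym(sym rune) bool {
	return strings.ContainsRune("aeiou", sym)
}

func (s *vowelCaseState) Member(sym rune, peek *rune) bool {
	// peek, the next symbol on the tape, is available for
	// look-ahead decisions but is not needed here.
	return strings.ContainsRune("aeiou", sym)
}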
type StopTokensFilter ¶
type StopTokensFilter struct {
// contains filtered or unexported fields
}
func NewStopTokensFilter ¶
func NewStopTokensFilter(stopTokens analysis.TokenMap) *StopTokensFilter
func (*StopTokensFilter) Filter ¶
func (f *StopTokensFilter) Filter(input analysis.TokenStream) analysis.TokenStream
type TruncateTokenFilter ¶
type TruncateTokenFilter struct {
// contains filtered or unexported fields
}
func NewTruncateTokenFilter ¶
func NewTruncateTokenFilter(length int) *TruncateTokenFilter
func (*TruncateTokenFilter) Filter ¶
func (s *TruncateTokenFilter) Filter(input analysis.TokenStream) analysis.TokenStream
type UnicodeNormalizeFilter ¶
type UnicodeNormalizeFilter struct {
// contains filtered or unexported fields
}
func NewUnicodeNormalizeFilter ¶
func NewUnicodeNormalizeFilter(form norm.Form) *UnicodeNormalizeFilter
func (*UnicodeNormalizeFilter) Filter ¶
func (s *UnicodeNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream
type UniqueTermFilter ¶
type UniqueTermFilter struct{}
UniqueTermFilter retains only the tokens which mark the first occurrence of a term. Tokens whose term appears in a preceding token are dropped.
func NewUniqueTermFilter ¶
func NewUniqueTermFilter() *UniqueTermFilter
func (*UniqueTermFilter) Filter ¶
func (f *UniqueTermFilter) Filter(input analysis.TokenStream) analysis.TokenStream
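A short sketch of the effect, with the analysis import path and Token fields assumed as in the earlier examples:

import "github.com/blevesearch/bleve/v2/analysis" // assumed import path

func uniqueTermExample() analysis.TokenStream {
	filter := NewUniqueTermFilter()
	input := analysis.TokenStream{
		{Term: []byte("to")},
		{Term: []byte("be")},
		{Term: []byte("or")},
		{Term: []byte("not")},
		{Term: []byte("to")}, // repeated term, expected to be dropped
		{Term: []byte("be")}, // repeated term, expected to be dropped
	}
	return filter.Filter(input)
}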
type UpperCaseState ¶
type UpperCaseState struct {
// contains filtered or unexported fields
}
func (*UpperCaseState) StartSym ¶
func (s *UpperCaseState) StartSym(sym rune) bool