Documentation ¶
Index ¶
Constants ¶
const (
	ALPHANUM        = 0
	NUM             = 6
	ACRONYM_DEP     = 8 // deprecated 3.1
	SOUTHEAST_ASIAN = 9
	IDEOGRAPHIC     = 10
	HIRAGANA        = 11
	KATAKANA        = 12
	HANGUL          = 13
)
const (
	ZZ_UNKNOWN_ERROR = 0
	ZZ_NO_MATCH      = 1
)
error codes
const (
	WORD_TYPE             = ALPHANUM
	NUMERIC_TYPE          = NUM
	SOUTH_EAST_ASIAN_TYPE = SOUTHEAST_ASIAN
	IDEOGRAPHIC_TYPE      = IDEOGRAPHIC
	HIRAGANA_TYPE         = HIRAGANA
	KATAKANA_TYPE         = KATAKANA
	HANGUL_TYPE           = HANGUL
)
const DEFAULT_MAX_TOKEN_LENGTH = 255
Default maximum allowed token length
const YYEOF = -1
This character denotes the end of file
const YYINITIAL = 0
lexical states
const ZZ_BUFFERSIZE = 255
initial size of the lookahead buffer
Variables ¶
var STOP_WORDS_SET = ENGLISH_STOP_WORDS_SET
An unmodifiable set containing some common English words that are usually not useful for searching
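Presumably the set shares the map[string]bool shape that NewStandardAnalyzerWithStopWords accepts (an assumption; the element type is not shown here), so membership can be checked directly:

	if STOP_WORDS_SET["the"] { // assumed map[string]bool
		// "the" will be removed by the default StopFilter
	}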
var TOKEN_TYPES = []string{
"<ALPHANUM>",
"<APOSTROPHE>",
"<ACRONYM>",
"<COMPANY>",
"<EMAIL>",
"<HOST>",
"<NUM>",
"<CJ>",
"<ACRONYM_DEP>",
"<SOUTHEAST_ASIAN>",
"<IDEOGRAPHIC>",
"<HIRAGANA>",
"<KATAKANA>",
"<HANGUL>",
}
String token types that correspond to token type int constants
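The token type int constants above index directly into this slice; for example:

	fmt.Println(TOKEN_TYPES[ALPHANUM]) // "<ALPHANUM>" (ALPHANUM == 0)
	fmt.Println(TOKEN_TYPES[NUM])      // "<NUM>" (NUM == 6)
	fmt.Println(TOKEN_TYPES[HANGUL])   // "<HANGUL>" (HANGUL == 13)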
var ZZ_ACTION = zzUnpackAction([]int{
001, 000, 001, 001, 001, 002, 001, 003, 001, 004, 001, 005, 001, 001, 001, 006,
001, 007, 001, 002, 001, 001, 001, 010, 001, 002, 001, 000, 001, 002, 001, 000,
001, 004, 001, 000, 002, 002, 002, 000, 001, 001, 001, 0,
})
Translates DFA states to action switch labels.
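zzUnpackAction is unexported, but by the JFlex convention it follows (as do zzUnpackTrans and zzUnpackAttribute), the packed slice alternates [count, value] pairs that are run-length decoded. A sketch of that decoding, assumed from JFlex rather than read from this package's actual code:

	// unpackRLE expands alternating [count, value] pairs,
	// e.g. {001, 000, 002, 001} -> {0, 1, 1}.
	func unpackRLE(packed []int) []int {
		var out []int
		for i := 0; i+1 < len(packed); i += 2 {
			count, value := packed[i], packed[i+1]
			for ; count > 0; count-- {
				out = append(out, value)
			}
		}
		return out
	}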
var ZZ_ATTRIBUTE = zzUnpackAttribute([]int{
001, 000, 001, 011, 013, 001, 001, 000, 001, 001, 001, 000, 001, 001, 001, 000,
002, 001, 002, 000, 001, 001, 001, 0,
})
ZZ_ATTRIBUTE[aState] contains the attributes of state aState
var ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED)
Translates characters to character classes
var ZZ_CMAP_PACKED = []int{} /* 2836 elements not displayed */
Translates characters to character classes
var ZZ_ERROR_MSG = [3]string{
"Unkown internal scanner error",
"Error: could not match input",
"Error: pushback value was too large",
}
error messages for the codes above
var ZZ_LEXSTATE = [2]int{0, 0}
ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l. ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l at the beginning of a line. l is of the form l = 2*k, where k is a non-negative integer.
var ZZ_ROWMAP = zzUnpackRowMap([]int{
	000, 000, 000, 022, 000, 044, 000, 066, 000, 0110, 000, 0132, 000, 0154, 000, 0176,
	000, 0220, 000, 0242, 000, 0264, 000, 0306, 000, 0330, 000, 0352, 000, 0374, 000, int('\u010e'),
	000, int('\u0120'), 000, 0154, 000, int('\u0132'), 000, int('\u0144'), 000, int('\u0156'), 000, 0264,
	000, int('\u0168'), 000, int('\u017a'),
})
Translates a state to a row index in the transition table
var ZZ_TRANS = zzUnpackTrans([]int{
001, 002, 001, 003, 001, 004, 001, 002, 001, 005, 001, 006, 003, 002, 001, 007,
001, 010, 001, 011, 002, 002, 001, 012, 001, 013, 002, 014, 023, 000, 003, 003,
001, 015, 001, 000, 001, 016, 001, 000, 001, 016, 001, 017, 002, 000, 001, 016,
001, 000, 001, 012, 002, 000, 001, 003, 001, 000, 001, 003, 002, 004, 001, 015,
001, 000, 001, 016, 001, 000, 001, 016, 001, 017, 002, 000, 001, 016, 001, 000,
001, 012, 002, 000, 001, 004, 001, 000, 002, 003, 002, 005, 002, 000, 002, 020,
001, 021, 002, 000, 001, 020, 001, 000, 001, 012, 002, 000, 001, 005, 003, 000,
001, 006, 001, 000, 001, 006, 003, 000, 001, 017, 007, 000, 001, 006, 001, 000,
002, 003, 001, 022, 001, 005, 001, 023, 003, 000, 001, 022, 004, 000, 001, 012,
002, 000, 001, 022, 003, 000, 001, 010, 015, 000, 001, 010, 003, 000, 001, 011,
015, 000, 001, 011, 001, 000, 002, 003, 001, 012, 001, 015, 001, 000, 001, 016,
001, 000, 001, 016, 001, 017, 002, 000, 001, 024, 001, 025, 001, 012, 002, 000,
001, 012, 003, 000, 001, 026, 013, 000, 001, 027, 001, 000, 001, 026, 003, 000,
001, 014, 014, 000, 002, 014, 001, 000, 002, 003, 002, 015, 002, 000, 002, 030,
001, 017, 002, 000, 001, 030, 001, 000, 001, 012, 002, 000, 001, 015, 001, 000,
002, 003, 001, 016, 012, 000, 001, 003, 002, 000, 001, 016, 001, 000, 002, 003,
001, 017, 001, 015, 001, 023, 003, 000, 001, 017, 004, 000, 001, 012, 002, 000,
001, 017, 003, 000, 001, 020, 001, 005, 014, 000, 001, 020, 001, 000, 002, 003,
001, 021, 001, 005, 001, 023, 003, 000, 001, 021, 004, 000, 001, 012, 002, 000,
001, 021, 003, 000, 001, 023, 001, 000, 001, 023, 003, 000, 001, 017, 007, 000,
001, 023, 001, 000, 002, 003, 001, 024, 001, 015, 004, 000, 001, 017, 004, 000,
001, 012, 002, 000, 001, 024, 003, 000, 001, 025, 012, 000, 001, 024, 002, 000,
001, 025, 003, 000, 001, 027, 013, 000, 001, 027, 001, 000, 001, 027, 003, 000,
001, 030, 001, 015, 014, 000, 001, 030,
})
The transition table of the DFA
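As a sketch of how these tables cooperate in a JFlex-style scanner loop (the indexing convention is assumed from JFlex-generated code, not read from this package's unexported scanner):

	// next DFA state for the current state on input rune ch:
	// ZZ_CMAP maps the rune to a character class, ZZ_ROWMAP selects
	// the state's row, and ZZ_TRANS holds the flattened table.
	next := ZZ_TRANS[ZZ_ROWMAP[state]+ZZ_CMAP[ch]]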
Functions ¶
This section is empty.
Types ¶
type StandardAnalyzer ¶
type StandardAnalyzer struct {
	*StopwordAnalyzerBase
	// contains filtered or unexported fields
}
Filters StandardTokenizer with StandardFilter, LowerCaseFilter and StopFilter, using a list of English stop words.
You may specify the Version compatibility when creating StandardAnalyzer:
- GoLucene supports 4.5+ only.
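A minimal usage sketch, using only the constructors and method documented below; the import path is an assumption and may differ in your checkout:

	package main

	import (
		"fmt"
		"strings"

		// assumed import path for GoLucene's standard analysis package
		std "github.com/balzaczyy/golucene/analysis/std"
	)

	func main() {
		// Analyzer with the default English stop words (STOP_WORDS_SET).
		analyzer := std.NewStandardAnalyzer()

		// CreateComponents wires the StandardTokenizer / StandardFilter /
		// LowerCaseFilter / StopFilter chain over the given reader;
		// *strings.Reader satisfies io.RuneReader.
		components := analyzer.CreateComponents("body", strings.NewReader("The Quick Brown Fox"))
		fmt.Printf("%T\n", components) // *TokenStreamComponents
	}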
func NewStandardAnalyzer ¶
func NewStandardAnalyzer() *StandardAnalyzer
Builds an analyzer with the default stop words (STOP_WORDS_SET).
func NewStandardAnalyzerWithStopWords ¶
func NewStandardAnalyzerWithStopWords(stopWords map[string]bool) *StandardAnalyzer
Builds an analyzer with the given stop words.
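For example, with an illustrative three-word stop list:

	stopWords := map[string]bool{"the": true, "a": true, "an": true}
	analyzer := NewStandardAnalyzerWithStopWords(stopWords)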
func (*StandardAnalyzer) CreateComponents ¶
func (a *StandardAnalyzer) CreateComponents(fieldName string, reader io.RuneReader) *TokenStreamComponents
type StandardFilter ¶
type StandardFilter struct {
	*TokenFilter
	// contains filtered or unexported fields
}
Normalizes tokens extracted with StandardTokenizer
func (*StandardFilter) IncrementToken ¶
func (f *StandardFilter) IncrementToken() (bool, error)
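IncrementToken advances the stream one token at a time, returning false when the stream is exhausted. A minimal drain loop, assuming filter is an initialized *StandardFilter:

	for {
		ok, err := filter.IncrementToken()
		if err != nil || !ok {
			break // scanner error or end of stream
		}
		// the current token has been normalized; inspect it here
	}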
type StandardTokenizer ¶
type StandardTokenizer struct {
	*Tokenizer
	// contains filtered or unexported fields
}
A grammar-based tokenizer constructed with JFlex.
As of Lucene version 3.1, this class implements the Word Break rules from the Unicode Text Segmentation algorithm, as specified in Unicode Standard Annex #29.
Many applications have specific tokenizer needs. If this tokenizer does not suit your application, please consider copying this source code directory to your project and maintaining your own grammar-based tokenizer.
You may specify the Version compatibility when creating StandardTokenizer:
- As of 3.4, Hiragana and Han characters are no longer wrongly split from their combining characters. If you use a previous version number, you get the exact broken behavior for backwards compatibility.
- As of 3.1, StandardTokenizer implements Unicode text segmentation. If you use a previous version number, you get the exact behavior of ClassicTokenizer for backwards compatibility.
func (*StandardTokenizer) Close ¶
func (t *StandardTokenizer) Close() error
func (*StandardTokenizer) End ¶
func (t *StandardTokenizer) End() error
func (*StandardTokenizer) IncrementToken ¶
func (t *StandardTokenizer) IncrementToken() (bool, error)
func (*StandardTokenizer) Reset ¶
func (t *StandardTokenizer) Reset() error
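Taken together, the methods above form the usual tokenizer lifecycle. A sketch, assuming t is a *StandardTokenizer obtained from the analyzer's components:

	// consume drains a *StandardTokenizer, following the documented
	// Reset / IncrementToken / End / Close lifecycle.
	func consume(t *StandardTokenizer) error {
		if err := t.Reset(); err != nil { // prepare for a new input
			return err
		}
		for {
			ok, err := t.IncrementToken()
			if err != nil {
				return err
			}
			if !ok {
				break // no more tokens
			}
			// inspect the current token's attributes here
		}
		if err := t.End(); err != nil { // finalize end-of-stream state
			return err
		}
		return t.Close() // release the underlying reader
	}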
type StandardTokenizerImpl ¶
type StandardTokenizerImpl struct {
// contains filtered or unexported fields
}
This class implements Word Break rules from the Unicode Text Segmentation algorithm, as specified in Unicode Standard Annex #29.
Tokens produced are of the following types:
- <ALPHANUM>: A sequence of alphabetic and numeric characters
- <NUM>: A number
- <SOUTHEAST_ASIAN>: A sequence of characters from South and Southeast Asian languages, including Thai, Lao, Myanmar, and Khmer
- IDEOGRAPHIC>: A single CJKV ideographic character
- <HIRAGANA>: A single hiragana character
Technically it should be auto-generated by JFlex, but there is no GoFlex yet, so this is a line-by-line port.
type StandardTokenizerInterface ¶
type StandardTokenizerInterface interface {
// contains filtered or unexported methods
}
Internal interface for supporting versioned grammars.