Documentation
¶
Overview ¶
Copyright 2023 Huawei Cloud Computing Technologies Co., Ltd.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
Copyright 2023 Huawei Cloud Computing Technologies Co., Ltd.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
Copyright 2023 Huawei Cloud Computing Technologies Co., Ltd.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
Index ¶
- Constants
- Variables
- func BuildSplitTable(splitChar string) ([]byte, uint8)
- func FreeSimpleGramTokenizer(t Tokenizer)
- func GetFullTextOption(ir *influxql.IndexRelation) *influxql.IndexOption
- func Hash(bytes []byte) uint64
- type SimpleGramTokenizer
- type SimpleGramTokenizerV0
- type SimpleGramTokenizerV1
- type SimpleTokenFinder
- type SimpleTokenizer
- type SimpleUtf8Tokenizer
- type StandardTokenizer
- type TokenFilter
- type TokenFinder
- type Tokenizer
Constants ¶
const (
	ConjunctionLength        = 3
	VersionLatest     uint32 = 4
	VersionBefore     uint32 = 3
	CONTENT_SPLITTER         = " \n\t`-=~!@#$%^&*()_+[]{}\\|;':\",.<>/?"
	TAGS_SPLITTER_CHAR       = byte(6)
	TAGS_SPLITTER            = string(TAGS_SPLITTER_CHAR)
	TAGS_SPLITTER_BEFORE     = " \t,"
)
const (
Prime_64 uint64 = 0x9E3779B185EBCA87
)
Variables ¶
var (
	ZeroConjunction = []uint64{0, 0, 0}
	ZeroSymbol      = []bool{false, false, false}
	SetSymbol       = []bool{true, true, true}
)
var CONTENT_SPLIT_TABLE []byte = make([]byte, 256)
var MISS_CONTENT_SPLIT_TABLE_INDEX uint8
var MISS_TAGS_SPLIT_TABLE_INDEX uint8
var MISS_TAGS_SPLIT_TABLE_INDEX_BEFORE uint8
var ROTATE_TABLE []byte = make([]byte, 256)
var TAGS_SPLIT_TABLE []byte = make([]byte, 256)
var TAGS_SPLIT_TABLE_BEFORE []byte = make([]byte, 256)
Functions ¶
func BuildSplitTable ¶
func BuildSplitTable(splitChar string) ([]byte, uint8)
func FreeSimpleGramTokenizer ¶
func FreeSimpleGramTokenizer(t Tokenizer)
func GetFullTextOption ¶ added in v1.3.0
func GetFullTextOption(ir *influxql.IndexRelation) *influxql.IndexOption
Types ¶
type SimpleGramTokenizer ¶
type SimpleGramTokenizer struct {
	SimpleUtf8Tokenizer
	// contains filtered or unexported fields
}
func (*SimpleGramTokenizer) InitInput ¶
func (t *SimpleGramTokenizer) InitInput(bytes []byte)
func (*SimpleGramTokenizer) Next ¶
func (t *SimpleGramTokenizer) Next() bool
type SimpleGramTokenizerV0 ¶
type SimpleGramTokenizerV0 struct {
*SimpleGramTokenizer
}
func (*SimpleGramTokenizerV0) Next ¶
func (t *SimpleGramTokenizerV0) Next() bool
type SimpleGramTokenizerV1 ¶
type SimpleGramTokenizerV1 struct {
	SimpleUtf8Tokenizer
	// contains filtered or unexported fields
}
func NewSimpleGramTokenizerV1 ¶
func NewSimpleGramTokenizerV1(splitTable []byte, defaultNilSplit uint8) *SimpleGramTokenizerV1
func (*SimpleGramTokenizerV1) CurrentHash ¶
func (t *SimpleGramTokenizerV1) CurrentHash() uint64
func (*SimpleGramTokenizerV1) InitInput ¶
func (t *SimpleGramTokenizerV1) InitInput(bytes []byte)
func (*SimpleGramTokenizerV1) Next ¶
func (t *SimpleGramTokenizerV1) Next() bool
type SimpleTokenFinder ¶
type SimpleTokenFinder struct {
// contains filtered or unexported fields
}
func NewSimpleTokenFinder ¶
func NewSimpleTokenFinder(split []byte) *SimpleTokenFinder
func (*SimpleTokenFinder) CurrentOffset ¶
func (t *SimpleTokenFinder) CurrentOffset() int
func (*SimpleTokenFinder) InitInput ¶
func (t *SimpleTokenFinder) InitInput(content []byte, token []byte)
func (*SimpleTokenFinder) Next ¶
func (t *SimpleTokenFinder) Next() bool
type SimpleTokenizer ¶
type SimpleTokenizer struct {
// contains filtered or unexported fields
}
func NewSimpleTokenizer ¶
func NewSimpleTokenizer(splitTable []byte) *SimpleTokenizer
func (*SimpleTokenizer) CurrentHash ¶
func (t *SimpleTokenizer) CurrentHash() uint64
func (*SimpleTokenizer) FreeSimpleGramTokenizer ¶
func (t *SimpleTokenizer) FreeSimpleGramTokenizer()
func (*SimpleTokenizer) InitInput ¶
func (t *SimpleTokenizer) InitInput(bytes []byte)
func (*SimpleTokenizer) Next ¶
func (t *SimpleTokenizer) Next() bool
func (*SimpleTokenizer) ProcessTokenizerBatch ¶
func (t *SimpleTokenizer) ProcessTokenizerBatch(input, output []byte, offsets, lens []int32) int
type SimpleUtf8Tokenizer ¶
type SimpleUtf8Tokenizer struct {
	SimpleTokenizer
	// contains filtered or unexported fields
}
func NewSimpleUtf8Tokenizer ¶
func NewSimpleUtf8Tokenizer(splitTable []byte) *SimpleUtf8Tokenizer
func (*SimpleUtf8Tokenizer) Next ¶
func (t *SimpleUtf8Tokenizer) Next() bool
type StandardTokenizer ¶ added in v1.3.0
type StandardTokenizer struct {
// contains filtered or unexported fields
}
func NewStandardTokenizer ¶ added in v1.3.0
func NewStandardTokenizer(split string) *StandardTokenizer
func (*StandardTokenizer) Split ¶ added in v1.3.0
func (t *StandardTokenizer) Split(str string) []string
type TokenFilter ¶
type TokenFilter struct {
// contains filtered or unexported fields
}
func NewTokenFilter ¶
func (*TokenFilter) Filter ¶
func (t *TokenFilter) Filter(words [][]byte) bool
func (*TokenFilter) FilterRowByField ¶
func (t *TokenFilter) FilterRowByField(words [][]byte, expr influxql.Expr) bool