tokenizer

package
v1.2.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 29, 2024 License: Apache-2.0 Imports: 5 Imported by: 0

Documentation

Overview

Copyright 2023 Huawei Cloud Computing Technologies Co., Ltd.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

Copyright 2023 Huawei Cloud Computing Technologies Co., Ltd.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

Copyright 2023 Huawei Cloud Computing Technologies Co., Ltd.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

Index

Constants

View Source
const (
	ConjunctionLength           = 3
	VersionLatest        uint32 = 4
	VersionBefore        uint32 = 3
	CONTENT_SPLITTER            = " \n\t`-=~!@#$%^&*()_+[]{}\\|;':\",.<>/?"
	TAGS_SPLITTER_CHAR          = byte(6)
	TAGS_SPLITTER               = string(TAGS_SPLITTER_CHAR)
	TAGS_SPLITTER_BEFORE        = " \t,"
)
View Source
const (
	Prime_64 uint64 = 0x9E3779B185EBCA87
)

Variables

View Source
var (
	ZeroConjunction = []uint64{0, 0, 0}
	ZeroSymbol      = []bool{false, false, false}
	SetSymbol       = []bool{true, true, true}
)
View Source
var CONTENT_SPLIT_TABLE []byte = make([]byte, 256)
View Source
var MISS_CONTENT_SPLIT_TABLE_INDEX uint8
View Source
var MISS_TAGS_SPLIT_TABLE_INDEX uint8
View Source
var MISS_TAGS_SPLIT_TABLE_INDEX_BEFORE uint8
View Source
var ROTATE_TABLE []byte = make([]byte, 256)
View Source
var TAGS_SPLIT_TABLE []byte = make([]byte, 256)
View Source
var TAGS_SPLIT_TABLE_BEFORE []byte = make([]byte, 256)

Functions

func BuildSplitTable

func BuildSplitTable(splitChar string) ([]byte, uint8)

func FreeSimpleGramTokenizer

func FreeSimpleGramTokenizer(t Tokenizer)

func Hash

func Hash(bytes []byte) uint64

Types

type SimpleGramTokenizer

type SimpleGramTokenizer struct {
	SimpleUtf8Tokenizer
	// contains filtered or unexported fields
}

func (*SimpleGramTokenizer) InitInput

func (t *SimpleGramTokenizer) InitInput(bytes []byte)

func (*SimpleGramTokenizer) Next

func (t *SimpleGramTokenizer) Next() bool

type SimpleGramTokenizerV0

type SimpleGramTokenizerV0 struct {
	*SimpleGramTokenizer
}

func (*SimpleGramTokenizerV0) Next

func (t *SimpleGramTokenizerV0) Next() bool

type SimpleGramTokenizerV1

type SimpleGramTokenizerV1 struct {
	SimpleUtf8Tokenizer
	// contains filtered or unexported fields
}

func NewSimpleGramTokenizerV1

func NewSimpleGramTokenizerV1(splitTable []byte, defaultNilSplit uint8) *SimpleGramTokenizerV1

func (*SimpleGramTokenizerV1) CurrentHash

func (t *SimpleGramTokenizerV1) CurrentHash() uint64

func (*SimpleGramTokenizerV1) InitInput

func (t *SimpleGramTokenizerV1) InitInput(bytes []byte)

func (*SimpleGramTokenizerV1) Next

func (t *SimpleGramTokenizerV1) Next() bool

type SimpleTokenFinder

type SimpleTokenFinder struct {
	// contains filtered or unexported fields
}

func NewSimpleTokenFinder

func NewSimpleTokenFinder(split []byte) *SimpleTokenFinder

func (*SimpleTokenFinder) CurrentOffset

func (t *SimpleTokenFinder) CurrentOffset() int

func (*SimpleTokenFinder) InitInput

func (t *SimpleTokenFinder) InitInput(content []byte, token []byte)

func (*SimpleTokenFinder) Next

func (t *SimpleTokenFinder) Next() bool

type SimpleTokenizer

type SimpleTokenizer struct {
	// contains filtered or unexported fields
}

func NewSimpleTokenizer

func NewSimpleTokenizer(splitTable []byte) *SimpleTokenizer

func (*SimpleTokenizer) CurrentHash

func (t *SimpleTokenizer) CurrentHash() uint64

func (*SimpleTokenizer) FreeSimpleGramTokenizer

func (t *SimpleTokenizer) FreeSimpleGramTokenizer()

func (*SimpleTokenizer) InitInput

func (t *SimpleTokenizer) InitInput(bytes []byte)

func (*SimpleTokenizer) Next

func (t *SimpleTokenizer) Next() bool

func (*SimpleTokenizer) ProcessTokenizerBatch

func (t *SimpleTokenizer) ProcessTokenizerBatch(input, output []byte, offsets, lens []int32) int

type SimpleUtf8Tokenizer

type SimpleUtf8Tokenizer struct {
	SimpleTokenizer
	// contains filtered or unexported fields
}

func NewSimpleUtf8Tokenizer

func NewSimpleUtf8Tokenizer(splitTable []byte) *SimpleUtf8Tokenizer

func (*SimpleUtf8Tokenizer) Next

func (t *SimpleUtf8Tokenizer) Next() bool

type TokenFilter

type TokenFilter struct {
	// contains filtered or unexported fields
}

func NewTokenFilter

func NewTokenFilter(schemas record.Schemas, expr influxql.Expr, split map[string][]byte) *TokenFilter

func (*TokenFilter) Filter

func (t *TokenFilter) Filter(words [][]byte) bool

func (*TokenFilter) FilterRowByField

func (t *TokenFilter) FilterRowByField(words [][]byte, expr influxql.Expr) bool

type TokenFinder

type TokenFinder interface {
	Next() bool
	CurrentOffset() int
}

type Tokenizer

type Tokenizer interface {
	InitInput([]byte)
	Next() bool
	CurrentHash() uint64
	ProcessTokenizerBatch(input, output []byte, offsets, lens []int32) int
	FreeSimpleGramTokenizer()
}

func NewGramTokenizer

func NewGramTokenizer(splitChars string, seed uint64, version uint32) Tokenizer

func NewSimpleGramTokenizer

func NewSimpleGramTokenizer(splitTable []byte, version uint32, defaultNilSplit uint8) Tokenizer

func NewSimpleGramTokenizerWithSeed

func NewSimpleGramTokenizerWithSeed(splitTable []byte, version uint32, defaultNilSplit uint8, seed uint64) Tokenizer

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL