tokenizer

package
v1.2.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 29, 2024 License: Apache-2.0 Imports: 5 Imported by: 0

Documentation

Overview

Copyright 2023 Huawei Cloud Computing Technologies Co., Ltd.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

Copyright 2023 Huawei Cloud Computing Technologies Co., Ltd.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

Copyright 2023 Huawei Cloud Computing Technologies Co., Ltd.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

Index

Constants

View Source
const (
	ConjunctionLength           = 3
	VersionLatest        uint32 = 4
	VersionBefore        uint32 = 3
	CONTENT_SPLITTER            = " \n\t`-=~!@#$%^&*()_+[]{}\\|;':\",.<>/?"
	TAGS_SPLITTER_CHAR          = byte(6)
	TAGS_SPLITTER               = string(TAGS_SPLITTER_CHAR)
	TAGS_SPLITTER_BEFORE        = " \t,"
)
View Source
const (
	Prime_64 uint64 = 0x9E3779B185EBCA87
)

Variables

View Source
var (
	ZeroConjunction = []uint64{0, 0, 0}
	ZeroSymbol      = []bool{false, false, false}
	SetSymbol       = []bool{true, true, true}
)
View Source
var CONTENT_SPLIT_TABLE []byte = make([]byte, 256)
View Source
var MISS_CONTENT_SPLIT_TABLE_INDEX uint8
View Source
var MISS_TAGS_SPLIT_TABLE_INDEX uint8
View Source
var MISS_TAGS_SPLIT_TABLE_INDEX_BEFORE uint8
View Source
var ROTATE_TABLE []byte = make([]byte, 256)
View Source
var TAGS_SPLIT_TABLE []byte = make([]byte, 256)
View Source
var TAGS_SPLIT_TABLE_BEFORE []byte = make([]byte, 256)

Functions

func BuildSplitTable

func BuildSplitTable(splitChar string) ([]byte, uint8)

func FreeSimpleGramTokenizer

func FreeSimpleGramTokenizer(t Tokenizer)

func Hash

func Hash(bytes []byte) uint64

Types

type SimpleGramTokenizer

type SimpleGramTokenizer struct {
	SimpleUtf8Tokenizer
	// contains filtered or unexported fields
}

func (*SimpleGramTokenizer) InitInput

func (t *SimpleGramTokenizer) InitInput(bytes []byte)

func (*SimpleGramTokenizer) Next

func (t *SimpleGramTokenizer) Next() bool

type SimpleGramTokenizerV0

type SimpleGramTokenizerV0 struct {
	*SimpleGramTokenizer
}

func (*SimpleGramTokenizerV0) Next

func (t *SimpleGramTokenizerV0) Next() bool

type SimpleGramTokenizerV1

type SimpleGramTokenizerV1 struct {
	SimpleUtf8Tokenizer
	// contains filtered or unexported fields
}

func NewSimpleGramTokenizerV1

func NewSimpleGramTokenizerV1(splitTable []byte, defaultNilSplit uint8) *SimpleGramTokenizerV1

func (*SimpleGramTokenizerV1) CurrentHash

func (t *SimpleGramTokenizerV1) CurrentHash() uint64

func (*SimpleGramTokenizerV1) InitInput

func (t *SimpleGramTokenizerV1) InitInput(bytes []byte)

func (*SimpleGramTokenizerV1) Next

func (t *SimpleGramTokenizerV1) Next() bool

type SimpleTokenFinder

type SimpleTokenFinder struct {
	// contains filtered or unexported fields
}

func NewSimpleTokenFinder

func NewSimpleTokenFinder(split []byte) *SimpleTokenFinder

func (*SimpleTokenFinder) CurrentOffset

func (t *SimpleTokenFinder) CurrentOffset() int

func (*SimpleTokenFinder) InitInput

func (t *SimpleTokenFinder) InitInput(content []byte, token []byte)

func (*SimpleTokenFinder) Next

func (t *SimpleTokenFinder) Next() bool

type SimpleTokenizer

type SimpleTokenizer struct {
	// contains filtered or unexported fields
}

func NewSimpleTokenizer

func NewSimpleTokenizer(splitTable []byte) *SimpleTokenizer

func (*SimpleTokenizer) CurrentHash

func (t *SimpleTokenizer) CurrentHash() uint64

func (*SimpleTokenizer) FreeSimpleGramTokenizer

func (t *SimpleTokenizer) FreeSimpleGramTokenizer()

func (*SimpleTokenizer) InitInput

func (t *SimpleTokenizer) InitInput(bytes []byte)

func (*SimpleTokenizer) Next

func (t *SimpleTokenizer) Next() bool

func (*SimpleTokenizer) ProcessTokenizerBatch

func (t *SimpleTokenizer) ProcessTokenizerBatch(input, output []byte, offsets, lens []int32) int

type SimpleUtf8Tokenizer

type SimpleUtf8Tokenizer struct {
	SimpleTokenizer
	// contains filtered or unexported fields
}

func NewSimpleUtf8Tokenizer

func NewSimpleUtf8Tokenizer(splitTable []byte) *SimpleUtf8Tokenizer

func (*SimpleUtf8Tokenizer) Next

func (t *SimpleUtf8Tokenizer) Next() bool

type TokenFilter

type TokenFilter struct {
	// contains filtered or unexported fields
}

func NewTokenFilter

func NewTokenFilter(schemas record.Schemas, expr influxql.Expr, split map[string][]byte) *TokenFilter

func (*TokenFilter) Filter

func (t *TokenFilter) Filter(words [][]byte) bool

func (*TokenFilter) FilterRowByField

func (t *TokenFilter) FilterRowByField(words [][]byte, expr influxql.Expr) bool

type TokenFinder

type TokenFinder interface {
	Next() bool
	CurrentOffset() int
}

type Tokenizer

type Tokenizer interface {
	InitInput([]byte)
	Next() bool
	CurrentHash() uint64
	ProcessTokenizerBatch(input, output []byte, offsets, lens []int32) int
	FreeSimpleGramTokenizer()
}

func NewGramTokenizer

func NewGramTokenizer(splitChars string, seed uint64, version uint32) Tokenizer

func NewSimpleGramTokenizer

func NewSimpleGramTokenizer(splitTable []byte, version uint32, defaultNilSplit uint8) Tokenizer

func NewSimpleGramTokenizerWithSeed

func NewSimpleGramTokenizerWithSeed(splitTable []byte, version uint32, defaultNilSplit uint8, seed uint64) Tokenizer

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL