bert

package
v1.4.6 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jul 27, 2023 License: MIT Imports: 9 Imported by: 0

Documentation

Index

Constants

View Source
const (
	DefaultMaxSeqLength                      int    = 48
	ModelRespBodyOutputBinaryDataKey         string = "binary_data"
	ModelRespBodyOutputClassificationDataKey string = "classification"
	ModelBertModelSegmentIdsKey              string = "segment_ids"
	ModelBertModelInputIdsKey                string = "input_ids"
	ModelBertModelInputMaskKey               string = "input_mask"
	ModelInt32DataType                       string = "INT32"
	ModelInt64DataType                       string = "INT64"
)
View Source
const (
	DefaultCLS          string = "[CLS]"
	DefaultSEP          string = "[SEP]"
	DefaultUNK          string = "[UNK]"
	DefaultMask         string = "[MASK]"
	NumPadToken         string = "##"
	DefaultMaxWordChars int    = 200
	DataSplitString     string = " ||| "
)

Variables

This section is empty.

Functions

func GetStrings

func GetStrings(tokens []StringOffsetsPair) []string

GetStrings returns a sequence of string values from the given slice of StringOffsetsPair.

func IsDefaultSpecial

func IsDefaultSpecial(word string) bool

IsDefaultSpecial reports whether the word matches a special token.

Types

type BaseTokenizer

type BaseTokenizer struct {
	// contains filtered or unexported fields
}

BaseTokenizer is a straightforward tokenizer implementation, which splits by whitespace and punctuation characters.

func NewBaseTokenizer

func NewBaseTokenizer(opts ...OptionV1) *BaseTokenizer

NewBaseTokenizer returns a new base tokenizer ready to use.

func (*BaseTokenizer) Tokenize

func (t *BaseTokenizer) Tokenize(text string) []StringOffsetsPair

Tokenize converts the input text to a slice of tokens, where each token is a white-separated word, a number or a punctuation sign. The resulting tokens preserve the alignment with the portion of the original text they belong to.

func (*BaseTokenizer) TokenizeChinese

func (t *BaseTokenizer) TokenizeChinese(text string) []StringOffsetsPair

TokenizeChinese is like Tokenize but focuses on Chinese text.

func (*BaseTokenizer) TokenizeChineseCharMode added in v1.4.4

func (t *BaseTokenizer) TokenizeChineseCharMode(text string) []StringOffsetsPair

TokenizeChineseCharMode is like TokenizeChinese but focuses on Chinese NER.

type Dict

type Dict struct {
	// contains filtered or unexported fields
}

Dict is a container for tokens NOTE: python uses an OrderedDict, unsure of implications.

func New

func New(tokens []string) Dict

New will return a vocab dict from the given tokens, IDs will match index.

func VocabFromFile

func VocabFromFile(path string) (Dict, error)

VocabFromFile will read a newline delimited file into a Dict.

func VocabFromSlice added in v1.3.0

func VocabFromSlice(vocabArr []string) (Dict, error)

VocabFromSlice will read vocab from config into a Dict.

func (Dict) Add

func (v Dict) Add(token string)

Add will add an item to the vocabulary, is not thread-safe.

func (Dict) ConvertItems

func (v Dict) ConvertItems(items []string) []ID

ConvertItems converts items to IDs.

func (Dict) ConvertTokens

func (v Dict) ConvertTokens(tokens []string) []ID

ConvertTokens converts tokens to IDs.

func (Dict) GetID

func (v Dict) GetID(token string) ID

GetID will return the ID of the token in the vocab. Will be negative if it doesn't exist.

func (Dict) IsInVocab

func (v Dict) IsInVocab(token string) bool

IsInVocab reports whether the token is in the vocab.

func (Dict) LongestSubstring

func (v Dict) LongestSubstring(token string) string

LongestSubstring returns the longest token that is a substring of the token.

func (Dict) Size

func (v Dict) Size() int

Size returns the size of the vocabulary.

type GenerateModelInferOutputRequest

type GenerateModelInferOutputRequest func(params ...interface{}) []*nvidia_inferenceserver.ModelInferRequest_InferRequestedOutputTensor

GenerateModelInferOutputRequest model output callback.

type GenerateModelInferRequest

type GenerateModelInferRequest func(batchSize, maxSeqLength int) []*nvidia_inferenceserver.ModelInferRequest_InferInputTensor

GenerateModelInferRequest model input callback.

type HTTPBatchInput

type HTTPBatchInput struct {
	Name     string    `json:"name"`
	Shape    []int64   `json:"shape"`
	DataType string    `json:"datatype"`
	Data     [][]int32 `json:"data"`
}

HTTPBatchInput Model HTTP Batch Request Input Struct (supports batch size 1).

type HTTPOutput

type HTTPOutput struct {
	Name       string               `json:"name"`
	Parameters InferOutputParameter `json:"parameters"`
}

HTTPOutput Model HTTP Request Output Struct.

type HTTPRequestBody

type HTTPRequestBody struct {
	Inputs  []HTTPBatchInput `json:"inputs"`
	Outputs []HTTPOutput     `json:"outputs"`
}

HTTPRequestBody Model HTTP Request Body.

type ID

type ID int32

ID is used to identify vocab items.

func (ID) Int64

func (id ID) Int64() int64

Int64 converts the int32 ID to an int64.

type InferOutputParameter

type InferOutputParameter struct {
	BinaryData     bool  `json:"binary_data"`
	Classification int64 `json:"classification"`
}

InferOutputParameter triton inference server infer parameters.

type InputFeature

type InputFeature struct {
	Text     string   // origin text
	Tokens   []string // token. like CLS/SEP after tokenizer
	TokenIDs []int32  // input_ids
	Mask     []int32  // input_mask
	TypeIDs  []int32  // segment_ids
}

InputFeature Bert InputFeature.

type InputObjects

type InputObjects struct {
	Input    string
	Tokens   []string
	PosArray []OffsetsType
}

InputObjects bert input objects for position record.

type ModelService

type ModelService struct {
	BertVocab     Dict
	BertTokenizer *WordPieceTokenizer
	// contains filtered or unexported fields
}

func NewModelService

func NewModelService(
	bertVocabPath, httpAddr string,
	httpClient *fasthttp.Client, grpcConn *grpc.ClientConn,
	modelInputCallback GenerateModelInferRequest,
	modelOutputCallback GenerateModelInferOutputRequest,
	modelInferCallback nvidia_inferenceserver.DecoderFunc,
) (*ModelService, error)

func (*ModelService) CheckModelReady

func (m *ModelService) CheckModelReady(
	modelName, modelVersion string, requestTimeout time.Duration,
) (bool, error)

CheckModelReady check model is ready.

func (*ModelService) CheckServerAlive

func (m *ModelService) CheckServerAlive(requestTimeout time.Duration) (bool, error)

CheckServerAlive check server is alive.

func (*ModelService) CheckServerReady

func (m *ModelService) CheckServerReady(requestTimeout time.Duration) (bool, error)

CheckServerReady check server is ready.

func (*ModelService) GetAllModelInfo

func (m *ModelService) GetAllModelInfo(
	repoName string, isReady bool, requestTimeout time.Duration,
) (*nvidia_inferenceserver.RepositoryIndexResponse, error)

GetAllModelInfo get all model info.

func (*ModelService) GetModelConfig

func (m *ModelService) GetModelConfig(
	modelName, modelVersion string, requestTimeout time.Duration,
) (interface{}, error)

GetModelConfig get model config.

func (*ModelService) GetModelInferIsGRPC added in v1.2.7

func (m *ModelService) GetModelInferIsGRPC() bool

GetModelInferIsGRPC returns the isGRPC flag.

func (*ModelService) GetModelInferStats

func (m *ModelService) GetModelInferStats(
	modelName, modelVersion string, requestTimeout time.Duration,
) (*nvidia_inferenceserver.ModelStatisticsResponse, error)

GetModelInferStats get model infer stats.

func (*ModelService) GetModelMeta

func (m *ModelService) GetModelMeta(
	modelName, modelVersion string, requestTimeout time.Duration,
) (*nvidia_inferenceserver.ModelMetadataResponse, error)

GetModelMeta get model meta.

func (*ModelService) GetModelName

func (m *ModelService) GetModelName() string

GetModelName returns the model name.

func (*ModelService) GetServerMeta

func (m *ModelService) GetServerMeta(
	requestTimeout time.Duration,
) (*nvidia_inferenceserver.ServerMetadataResponse, error)

GetServerMeta get server meta.

func (*ModelService) GetTokenizerIsChineseMode added in v1.2.7

func (m *ModelService) GetTokenizerIsChineseMode() bool

GetTokenizerIsChineseMode returns the isChinese flag.

func (*ModelService) ModelInfer

func (m *ModelService) ModelInfer(
	inferData []string,
	modelName, modelVersion string,
	requestTimeout time.Duration,
	params ...interface{},
) ([]interface{}, error)

ModelInfer API to call Triton Inference Server.

func (*ModelService) SetChineseTokenize

func (m *ModelService) SetChineseTokenize(isCharMode bool) *ModelService

SetChineseTokenize enables Chinese tokenization when tokenizing infer data.

func (*ModelService) SetJsonDecoder added in v1.4.6

func (m *ModelService) SetJsonDecoder(decoder utils.JSONUnmarshal) *ModelService

SetJsonDecoder sets the JSON decoder.

func (*ModelService) SetJsonEncoder added in v1.4.6

func (m *ModelService) SetJsonEncoder(encoder utils.JSONMarshal) *ModelService

SetJsonEncoder sets the JSON encoder.

func (*ModelService) SetMaxSeqLength

func (m *ModelService) SetMaxSeqLength(maxSeqLen int) *ModelService

SetMaxSeqLength Set model infer max sequence length.

func (*ModelService) SetModelInferWithGRPC

func (m *ModelService) SetModelInferWithGRPC() *ModelService

SetModelInferWithGRPC Use grpc to call triton.

func (*ModelService) SetModelName

func (m *ModelService) SetModelName(modelPrefix, modelName string) *ModelService

SetModelName Set model name must equal to Triton config.pbtxt model name.

func (*ModelService) SetSecondaryServerURL added in v1.4.2

func (m *ModelService) SetSecondaryServerURL(url string) *ModelService

SetSecondaryServerURL sets the secondary server URL (HTTP only).

func (*ModelService) SetTokenizerReturnPosInfo added in v1.3.3

func (m *ModelService) SetTokenizerReturnPosInfo() *ModelService

SetTokenizerReturnPosInfo Set tokenizer return pos info.

func (*ModelService) UnsetChineseTokenize

func (m *ModelService) UnsetChineseTokenize() *ModelService

UnsetChineseTokenize disables Chinese tokenization when tokenizing infer data.

func (*ModelService) UnsetModelInferWithGRPC

func (m *ModelService) UnsetModelInferWithGRPC() *ModelService

UnsetModelInferWithGRPC disables using gRPC to call Triton.

func (*ModelService) UnsetTokenizerReturnPosInfo added in v1.3.3

func (m *ModelService) UnsetTokenizerReturnPosInfo() *ModelService

UnsetTokenizerReturnPosInfo Un-set tokenizer return pos info.

type OffsetsType

type OffsetsType struct {
	Start int
	End   int
}

OffsetsType represents a (start, end) offsets pair. It usually represents a lower inclusive index position, and an upper exclusive position.

func GetOffsets

func GetOffsets(tokens []StringOffsetsPair) []OffsetsType

GetOffsets returns a sequence of offsets values from the given slice of StringOffsetsPair.

type OptionV1

type OptionV1 func(*BaseTokenizer)

OptionV1 allows to configure a new BaseTokenizer with your specific needs.

func RegisterSpecialWords

func RegisterSpecialWords(specialWords ...string) OptionV1

RegisterSpecialWords is an option to register a special word.

type Provider

type Provider interface {
	Vocab() Dict
}

Provider is an interface for exposing a vocab.

type StringOffsetsPair

type StringOffsetsPair struct {
	String  string
	Offsets OffsetsType
}

StringOffsetsPair represents a string value paired with offsets bounds. It usually represents a token string and its offsets positions in the original string.

func MakeOffsetPairsFromGroups

func MakeOffsetPairsFromGroups(text string, tokens []StringOffsetsPair, groups []TokensRange) []StringOffsetsPair

MakeOffsetPairsFromGroups creates a sequence of tokenizers.StringOffsetsPair elements from the given groups.

type TokenizerV1

type TokenizerV1 interface {
	Tokenize(text string) []StringOffsetsPair
	TokenizeChinese(text string) []StringOffsetsPair
	TokenizeChineseCharMode(text string) []StringOffsetsPair
}

TokenizerV1 is implemented by any value that has the Tokenize method.

type TokensRange

type TokensRange struct {
	Start int
	End   int
}

TokensRange represents an index offsets pair of a token.

func GroupPieces

func GroupPieces(tokens []StringOffsetsPair) []TokensRange

GroupPieces returns a list of tokens range each of which represents the start and the end index of the tokens that form a complete word.

type WordPieceTokenizer

type WordPieceTokenizer struct {
	// contains filtered or unexported fields
}

WordPieceTokenizer is a tokenizer that breaks tokens into sub-word units based on a supplied vocabulary. See https://arxiv.org/pdf/1609.08144.pdf Section 4.1 for details. WordPieceTokenizers uses BaseTokenizer to preprocess the input text.

func NewWordPieceTokenizer

func NewWordPieceTokenizer(vocabulary Dict) *WordPieceTokenizer

NewWordPieceTokenizer returns a new WordPieceTokenizer.

func (*WordPieceTokenizer) Tokenize

func (t *WordPieceTokenizer) Tokenize(text string) []StringOffsetsPair

Tokenize converts the input text to a slice of words or sub-words token units based on the supplied vocabulary. The resulting tokens preserve the alignment with the portion of the original text they belong to.

func (*WordPieceTokenizer) TokenizeChinese

func (t *WordPieceTokenizer) TokenizeChinese(text string) []StringOffsetsPair

TokenizeChinese is like Tokenize but focuses on Chinese text.

func (*WordPieceTokenizer) TokenizeChineseCharMode added in v1.4.4

func (t *WordPieceTokenizer) TokenizeChineseCharMode(text string) []StringOffsetsPair

TokenizeChineseCharMode is like TokenizeChinese but focuses on Chinese NER.

func (*WordPieceTokenizer) WordPieceTokenize

func (t *WordPieceTokenizer) WordPieceTokenize(tokens []StringOffsetsPair) []StringOffsetsPair

WordPieceTokenize transforms the input token in a new slice of words or sub-words units based on the supplied vocabulary. The resulting tokens preserve the alignment with the portion of the original text they belong to.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL