Documentation ¶
Index ¶
- Constants
- func GetStrings(tokens []StringOffsetsPair) []string
- func IsDefaultSpecial(word string) bool
- type BaseTokenizer
- type BertModelService
- type Dict
- type HTTPBatchInput
- type HTTPOutput
- type HTTPRequestBody
- type ID
- type InferOutputParameter
- type InputFeature
- type InputObjects
- type OffsetsType
- type OptionV1
- type Provider
- type StringOffsetsPair
- type TokenizerV1
- type TokensRange
- type W2NERInputFeature
- type W2NerModelService
- type WordPieceTokenizer
- func (t *WordPieceTokenizer) Tokenize(text string) []StringOffsetsPair
- func (t *WordPieceTokenizer) TokenizeChinese(text string) []StringOffsetsPair
- func (t *WordPieceTokenizer) TokenizeChineseCharMode(text string) []StringOffsetsPair
- func (t *WordPieceTokenizer) WordPieceTokenize(tokens []StringOffsetsPair) []StringOffsetsPair
Constants ¶
const ( DefaultMaxSeqLength int = 48 ModelRespBodyOutputBinaryDataKey string = "binary_data" ModelRespBodyOutputClassificationDataKey string = "classification" ModelBertModelSegmentIdsKey string = "segment_ids" ModelBertModelInputIdsKey string = "input_ids" ModelBertModelInputMaskKey string = "input_mask" ModelInt32DataType string = "INT32" ModelInt64DataType string = "INT64" ModelBoolDataType string = "BOOL" )
Variables ¶
This section is empty.
Functions ¶
func GetStrings ¶
func GetStrings(tokens []StringOffsetsPair) []string
GetStrings returns a sequence of string values from the given slice of StringOffsetsPair.
func IsDefaultSpecial ¶
IsDefaultSpecial reports whether the word matches a special token.
Types ¶
type BaseTokenizer ¶
type BaseTokenizer struct {
// contains filtered or unexported fields
}
BaseTokenizer is a straightforward tokenizer implementation, which splits by whitespace and punctuation characters.
func NewBaseTokenizer ¶
func NewBaseTokenizer(opts ...OptionV1) *BaseTokenizer
NewBaseTokenizer returns a new base tokenizer ready to use.
func (*BaseTokenizer) Tokenize ¶
func (t *BaseTokenizer) Tokenize(text string) []StringOffsetsPair
Tokenize converts the input text to a slice of tokens, where each token is a white-separated word, a number or a punctuation sign. The resulting tokens preserve the alignment with the portion of the original text they belong to.
func (*BaseTokenizer) TokenizeChinese ¶
func (t *BaseTokenizer) TokenizeChinese(text string) []StringOffsetsPair
TokenizeChinese is like Tokenize but focuses on Chinese.
func (*BaseTokenizer) TokenizeChineseCharMode ¶
func (t *BaseTokenizer) TokenizeChineseCharMode(text string) []StringOffsetsPair
TokenizeChineseCharMode is like TokenizeChinese but focuses on Chinese NER.
type BertModelService ¶
type BertModelService struct { models.ModelService BertVocab Dict BertTokenizer *WordPieceTokenizer }
func NewBertModelService ¶
func NewBertModelService( bertVocabPath, httpAddr string, httpClient *fasthttp.Client, grpcConn *grpc.ClientConn, modelInputCallback models.GenerateModelInferRequest, modelOutputCallback models.GenerateModelInferOutputRequest, modelInferCallback nvidia_inferenceserver.DecoderFunc, ) (*BertModelService, error)
func (*BertModelService) ModelInfer ¶
func (m *BertModelService) ModelInfer( inferData []string, modelName, modelVersion string, requestTimeout time.Duration, params ...interface{}, ) ([]interface{}, error)
ModelInfer API to call Triton Inference Server.
type Dict ¶
type Dict struct {
// contains filtered or unexported fields
}
Dict is a container for tokens. NOTE: Python uses an OrderedDict here; the implications of using a plain map are unclear.
func VocabFromFile ¶
VocabFromFile will read a newline delimited file into a Dict.
func VocabFromSlice ¶
VocabFromSlice will read vocab from config into a Dict.
func (Dict) ConvertItems ¶
ConvertItems converts items to IDs.
func (Dict) ConvertTokens ¶
ConvertTokens converts a token to its ID.
func (Dict) GetID ¶
GetID will return the ID of the token in the vocab. Will be negative if it doesn't exist.
func (Dict) LongestSubstring ¶
LongestSubstring returns the longest token that is a substring of the token.
type HTTPBatchInput ¶
type HTTPBatchInput struct { Name string `json:"name"` Shape []int64 `json:"shape"` DataType string `json:"datatype"` Data any `json:"data"` }
HTTPBatchInput is the model HTTP batch request input struct (supports a batch size of 1).
type HTTPOutput ¶
type HTTPOutput struct { Name string `json:"name"` Parameters InferOutputParameter `json:"parameters,omitempty"` }
HTTPOutput Model HTTP Request Output Struct.
type HTTPRequestBody ¶
type HTTPRequestBody struct { Inputs []HTTPBatchInput `json:"inputs"` Outputs []HTTPOutput `json:"outputs"` }
HTTPRequestBody Model HTTP Request Body.
type InferOutputParameter ¶
type InferOutputParameter struct { BinaryData bool `json:"binary_data,omitempty"` Classification int64 `json:"classification,omitempty"` }
InferOutputParameter triton inference server infer parameters.
type InputFeature ¶
type InputFeature struct { Text string // origin text Tokens []string // token. like CLS/SEP after tokenizer TokenIDs []int32 // input_ids Mask []int32 // input_mask TypeIDs []int32 // segment_ids }
InputFeature Bert InputFeature.
type InputObjects ¶
type InputObjects struct { Input string Tokens []string PosArray []OffsetsType }
InputObjects bert input objects for position record.
type OffsetsType ¶
OffsetsType represents a (start, end) offsets pair. It usually represents a lower inclusive index position, and an upper exclusive position.
func GetOffsets ¶
func GetOffsets(tokens []StringOffsetsPair) []OffsetsType
GetOffsets returns a sequence of offsets values from the given slice of StringOffsetsPair.
type OptionV1 ¶
type OptionV1 func(*BaseTokenizer)
OptionV1 allows to configure a new BaseTokenizer with your specific needs.
func RegisterSpecialWords ¶
RegisterSpecialWords is an option to register a special word.
type Provider ¶
type Provider interface {
Vocab() Dict
}
Provider is an interface for exposing a vocab.
type StringOffsetsPair ¶
type StringOffsetsPair struct { String string Offsets OffsetsType }
StringOffsetsPair represents a string value paired with offsets bounds. It usually represents a token string and its offsets positions in the original string.
func MakeOffsetPairsFromGroups ¶
func MakeOffsetPairsFromGroups(text string, tokens []StringOffsetsPair, groups []TokensRange) []StringOffsetsPair
MakeOffsetPairsFromGroups creates a sequence of tokenizers.StringOffsetsPair elements from the given groups.
type TokenizerV1 ¶
type TokenizerV1 interface { Tokenize(text string) []StringOffsetsPair TokenizeChinese(text string) []StringOffsetsPair TokenizeChineseCharMode(text string) []StringOffsetsPair }
TokenizerV1 is implemented by any value that has the Tokenize method.
type TokensRange ¶
TokensRange represents an index offsets pair of a token.
func GroupPieces ¶
func GroupPieces(tokens []StringOffsetsPair) []TokensRange
GroupPieces returns a list of tokens range each of which represents the start and the end index of the tokens that form a complete word.
type W2NERInputFeature ¶
type W2NerModelService ¶
type W2NerModelService struct {
*BertModelService
}
func NewW2NERModelService ¶
func NewW2NERModelService( bertVocabPath, httpAddr string, httpClient *fasthttp.Client, grpcConn *grpc.ClientConn, modelInputCallback models.GenerateModelInferRequest, modelOutputCallback models.GenerateModelInferOutputRequest, modelInferCallback nvidia_inferenceserver.DecoderFunc, ) (*W2NerModelService, error)
func (*W2NerModelService) ModelInfer ¶
func (w *W2NerModelService) ModelInfer( inferData [][]string, modelName, modelVersion string, requestTimeout time.Duration, params ...interface{}, ) ([]interface{}, error)
ModelInfer API to call Triton Inference Server.
type WordPieceTokenizer ¶
type WordPieceTokenizer struct {
// contains filtered or unexported fields
}
WordPieceTokenizer is a tokenizer that breaks tokens into sub-word units based on a supplied vocabulary. See https://arxiv.org/pdf/1609.08144.pdf Section 4.1 for details. WordPieceTokenizer uses BaseTokenizer to preprocess the input text.
func NewWordPieceTokenizer ¶
func NewWordPieceTokenizer(vocabulary Dict) *WordPieceTokenizer
NewWordPieceTokenizer returns a new WordPieceTokenizer.
func (*WordPieceTokenizer) Tokenize ¶
func (t *WordPieceTokenizer) Tokenize(text string) []StringOffsetsPair
Tokenize converts the input text to a slice of words or sub-words token units based on the supplied vocabulary. The resulting tokens preserve the alignment with the portion of the original text they belong to.
func (*WordPieceTokenizer) TokenizeChinese ¶
func (t *WordPieceTokenizer) TokenizeChinese(text string) []StringOffsetsPair
TokenizeChinese is like Tokenize but focuses on Chinese.
func (*WordPieceTokenizer) TokenizeChineseCharMode ¶
func (t *WordPieceTokenizer) TokenizeChineseCharMode(text string) []StringOffsetsPair
TokenizeChineseCharMode is like TokenizeChinese but focuses on Chinese NER.
func (*WordPieceTokenizer) WordPieceTokenize ¶
func (t *WordPieceTokenizer) WordPieceTokenize(tokens []StringOffsetsPair) []StringOffsetsPair
WordPieceTokenize transforms the input token in a new slice of words or sub-words units based on the supplied vocabulary. The resulting tokens preserve the alignment with the portion of the original text they belong to.