transformers

package
v2.0.2 Latest
Published: May 27, 2024 License: MIT Imports: 11 Imported by: 0

Documentation

Constants

const (
	DefaultMaxSeqLength                      int    = 48
	ModelRespBodyOutputBinaryDataKey         string = "binary_data"
	ModelRespBodyOutputClassificationDataKey string = "classification"
	ModelBertModelSegmentIdsKey              string = "segment_ids"
	ModelBertModelInputIdsKey                string = "input_ids"
	ModelBertModelInputMaskKey               string = "input_mask"
	ModelInt32DataType                       string = "INT32"
	ModelInt64DataType                       string = "INT64"
	ModelBoolDataType                        string = "BOOL"
)

const (
	DefaultCLS          string = "[CLS]"
	DefaultSEP          string = "[SEP]"
	DefaultUNK          string = "[UNK]"
	DefaultMask         string = "[MASK]"
	NumPadToken         string = "##"
	DefaultMaxWordChars int    = 200
	DataSplitString     string = " ||| "
)

Variables

This section is empty.

Functions

func GetStrings

func GetStrings(tokens []StringOffsetsPair) []string

GetStrings returns a sequence of string values from the given slice of StringOffsetsPair.
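
A minimal illustrative sketch, assuming this package is imported as transformers (the literal values are made up):

pairs := []transformers.StringOffsetsPair{
	{String: "hello", Offsets: transformers.OffsetsType{Start: 0, End: 5}},
	{String: "world", Offsets: transformers.OffsetsType{Start: 6, End: 11}},
}
words := transformers.GetStrings(pairs) // []string{"hello", "world"}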

func IsDefaultSpecial

func IsDefaultSpecial(word string) bool

IsDefaultSpecial returns whether the word matches a special token.
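
For example, with the default special tokens declared in the constants above, one would expect (illustrative, not verified output):

transformers.IsDefaultSpecial(transformers.DefaultCLS) // presumably true
transformers.IsDefaultSpecial("hello")                 // presumably false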

Types

type BaseTokenizer

type BaseTokenizer struct {
	// contains filtered or unexported fields
}

BaseTokenizer is a straightforward tokenizer implementation that splits by whitespace and punctuation characters.

func NewBaseTokenizer

func NewBaseTokenizer(opts ...OptionV1) *BaseTokenizer

NewBaseTokenizer returns a new base tokenizer ready to use.

func (*BaseTokenizer) Tokenize

func (t *BaseTokenizer) Tokenize(text string) []StringOffsetsPair

Tokenize converts the input text to a slice of tokens, where each token is a white-separated word, a number or a punctuation sign. The resulting tokens preserve the alignment with the portion of the original text they belong to.
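
An illustrative sketch; the token boundaries suggested in the comment are assumptions based on the whitespace/punctuation splitting described above:

tok := transformers.NewBaseTokenizer()
for _, p := range tok.Tokenize("Hello, world!") {
	// expect tokens along the lines of "Hello", ",", "world", "!"
	fmt.Printf("%q [%d, %d)\n", p.String, p.Offsets.Start, p.Offsets.End)
}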

func (*BaseTokenizer) TokenizeChinese

func (t *BaseTokenizer) TokenizeChinese(text string) []StringOffsetsPair

TokenizeChinese is like Tokenize but focuses on Chinese text.

func (*BaseTokenizer) TokenizeChineseCharMode

func (t *BaseTokenizer) TokenizeChineseCharMode(text string) []StringOffsetsPair

TokenizeChineseCharMode is like TokenizeChinese but focuses on Chinese NER.

type BertModelService

type BertModelService struct {
	models.ModelService

	BertVocab     Dict
	BertTokenizer *WordPieceTokenizer
}

func NewBertModelService

func NewBertModelService(
	bertVocabPath, httpAddr string,
	httpClient *fasthttp.Client, grpcConn *grpc.ClientConn,
	modelInputCallback models.GenerateModelInferRequest,
	modelOutputCallback models.GenerateModelInferOutputRequest,
	modelInferCallback nvidia_inferenceserver.DecoderFunc,
) (*BertModelService, error)

func (*BertModelService) ModelInfer

func (m *BertModelService) ModelInfer(
	inferData []string,
	modelName, modelVersion string,
	requestTimeout time.Duration,
	params ...interface{},
) ([]interface{}, error)

ModelInfer calls the Triton Inference Server inference API.
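
A hedged sketch of a call, assuming svc is a *BertModelService previously built with NewBertModelService, and that the model name and version below are placeholders for whatever is actually deployed on the Triton server:

results, err := svc.ModelInfer(
	[]string{"今天天气真好"},    // one raw text per inference item
	"bert_base_chinese", "1", // hypothetical model name and version
	5*time.Second,
)
if err != nil {
	log.Fatal(err)
}
_ = results // shape and type depend on the DecoderFunc passed to NewBertModelService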

type Dict

type Dict struct {
	// contains filtered or unexported fields
}

Dict is a container for tokens. NOTE: the Python implementation uses an OrderedDict; the implications of that difference here are unclear.

func New

func New(tokens []string) Dict

New will return a vocab Dict from the given tokens; IDs will match the token indices.

func VocabFromFile

func VocabFromFile(path string) (Dict, error)

VocabFromFile will read a newline-delimited vocabulary file into a Dict.

func VocabFromSlice

func VocabFromSlice(vocabArr []string) (Dict, error)

VocabFromSlice will read a vocabulary slice from config into a Dict.
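
A small sketch of building a vocabulary either way; the token list and file path are illustrative:

vocab, err := transformers.VocabFromSlice([]string{"[PAD]", "[UNK]", "[CLS]", "[SEP]", "hello", "##llo"})
if err != nil {
	log.Fatal(err)
}
fmt.Println(vocab.Size()) // 6

// or from a newline-delimited vocabulary file:
fileVocab, err := transformers.VocabFromFile("vocab.txt")
_ = fileVocab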

func (Dict) Add

func (v Dict) Add(token string)

Add will add an item to the vocabulary; it is not thread-safe.

func (Dict) ConvertItems

func (v Dict) ConvertItems(items []string) []ID

ConvertItems converts items to IDs.

func (Dict) ConvertTokens

func (v Dict) ConvertTokens(tokens []string) []ID

ConvertTokens converts tokens to IDs.

func (Dict) GetID

func (v Dict) GetID(token string) ID

GetID will return the ID of the token in the vocab. The ID will be negative if the token doesn't exist.
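
A typical lookup pattern, falling back to the unknown token when a token is missing (a sketch, assuming vocab contains DefaultUNK):

id := vocab.GetID("unseenword")
if id < 0 {
	id = vocab.GetID(transformers.DefaultUNK) // fall back to "[UNK]"
}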

func (Dict) IsInVocab

func (v Dict) IsInVocab(token string) bool

IsInVocab reports whether the token is in the vocabulary.

func (Dict) LongestSubstring

func (v Dict) LongestSubstring(token string) string

LongestSubstring returns the longest vocabulary token that is a substring of the given token.

func (Dict) Size

func (v Dict) Size() int

Size returns the size of the vocabulary.

type HTTPBatchInput

type HTTPBatchInput struct {
	Name     string  `json:"name"`
	Shape    []int64 `json:"shape"`
	DataType string  `json:"datatype"`
	Data     any     `json:"data"`
}

HTTPBatchInput is the model HTTP batch-request input struct (supports batch size 1).

type HTTPOutput

type HTTPOutput struct {
	Name       string               `json:"name"`
	Parameters InferOutputParameter `json:"parameters,omitempty"`
}

HTTPOutput is the model HTTP request output struct.

type HTTPRequestBody

type HTTPRequestBody struct {
	Inputs  []HTTPBatchInput `json:"inputs"`
	Outputs []HTTPOutput     `json:"outputs"`
}

HTTPRequestBody is the model HTTP request body.
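
A hedged sketch of assembling a request body for a BERT-style model; tokenIDs, mask and typeIDs are assumed []int32 slices (e.g. taken from an InputFeature), and the output tensor name "output" and the shapes are hypothetical and depend on the deployed model configuration:

body := transformers.HTTPRequestBody{
	Inputs: []transformers.HTTPBatchInput{
		{Name: transformers.ModelBertModelInputIdsKey, Shape: []int64{1, 48}, DataType: transformers.ModelInt32DataType, Data: tokenIDs},
		{Name: transformers.ModelBertModelInputMaskKey, Shape: []int64{1, 48}, DataType: transformers.ModelInt32DataType, Data: mask},
		{Name: transformers.ModelBertModelSegmentIdsKey, Shape: []int64{1, 48}, DataType: transformers.ModelInt32DataType, Data: typeIDs},
	},
	Outputs: []transformers.HTTPOutput{
		{Name: "output", Parameters: transformers.InferOutputParameter{BinaryData: false}},
	},
}
payload, err := json.Marshal(body) // encoding/json; payload becomes the HTTP request body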

type ID

type ID int32

ID is used to identify vocab items.

func (ID) Int64

func (id ID) Int64() int64

Int64 converts the int32 ID to an int64.

type InferOutputParameter

type InferOutputParameter struct {
	BinaryData     bool  `json:"binary_data,omitempty"`
	Classification int64 `json:"classification,omitempty"`
}

InferOutputParameter holds Triton Inference Server inference output parameters.

type InputFeature

type InputFeature struct {
	Text     string   // origin text
	Tokens   []string // token. like CLS/SEP after tokenizer
	TokenIDs []int32  // input_ids
	Mask     []int32  // input_mask
	TypeIDs  []int32  // segment_ids
}

InputFeature represents the BERT model input features.

type InputObjects

type InputObjects struct {
	Input    string
	Tokens   []string
	PosArray []OffsetsType
}

InputObjects holds the BERT input objects used for position recording.

type OffsetsType

type OffsetsType struct {
	Start int
	End   int
}

OffsetsType represents a (start, end) offsets pair. It usually represents a lower inclusive index position, and an upper exclusive position.
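
For example, assuming ASCII text (where byte and rune positions coincide), the pair can slice the token back out of the original string:

text := "Hello world"
off := transformers.OffsetsType{Start: 6, End: 11}
fmt.Println(text[off.Start:off.End]) // "world"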

func GetOffsets

func GetOffsets(tokens []StringOffsetsPair) []OffsetsType

GetOffsets returns a sequence of offsets values from the given slice of StringOffsetsPair.

type OptionV1

type OptionV1 func(*BaseTokenizer)

OptionV1 allows configuring a new BaseTokenizer to your specific needs.

func RegisterSpecialWords

func RegisterSpecialWords(specialWords ...string) OptionV1

RegisterSpecialWords is an option to register one or more special words.
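
For instance, a sketch that registers the BERT marker tokens, presumably so the punctuation-splitting rules leave them intact:

tok := transformers.NewBaseTokenizer(
	transformers.RegisterSpecialWords(
		transformers.DefaultCLS,
		transformers.DefaultSEP,
		transformers.DefaultMask,
	),
)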

type Provider

type Provider interface {
	Vocab() Dict
}

Provider is an interface for exposing a vocab.

type StringOffsetsPair

type StringOffsetsPair struct {
	String  string
	Offsets OffsetsType
}

StringOffsetsPair represents a string value paired with offsets bounds. It usually represents a token string and its offsets positions in the original string.

func MakeOffsetPairsFromGroups

func MakeOffsetPairsFromGroups(text string, tokens []StringOffsetsPair, groups []TokensRange) []StringOffsetsPair

MakeOffsetPairsFromGroups creates a sequence of StringOffsetsPair elements from the given groups.

type TokenizerV1

type TokenizerV1 interface {
	Tokenize(text string) []StringOffsetsPair
	TokenizeChinese(text string) []StringOffsetsPair
	TokenizeChineseCharMode(text string) []StringOffsetsPair
}

TokenizerV1 is implemented by any value that has the Tokenize method.

type TokensRange

type TokensRange struct {
	Start int
	End   int
}

TokensRange represents a pair of start and end token indices.

func GroupPieces

func GroupPieces(tokens []StringOffsetsPair) []TokensRange

GroupPieces returns a list of token ranges, each of which represents the start and end index of the tokens that form a complete word.
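
A sketch of recovering word-level offsets from word-piece output by combining GroupPieces with MakeOffsetPairsFromGroups, assuming wp is a *WordPieceTokenizer built from a suitable vocabulary:

text := "unaffable"
pieces := wp.Tokenize(text)
groups := transformers.GroupPieces(pieces)
words := transformers.MakeOffsetPairsFromGroups(text, pieces, groups)
for _, w := range words {
	fmt.Println(w.String, w.Offsets.Start, w.Offsets.End)
}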

type W2NERInputFeature

type W2NERInputFeature struct {
	Text        string
	TokenIDs    []int32   // input_ids
	GridMask2D  [][]bool  // grid_mask2d
	DistInputs  [][]int32 // dist_inputs
	Pieces2Word [][]bool  // pieces2word
}

type W2NerModelService

type W2NerModelService struct {
	*BertModelService
}

func NewW2NERModelService

func NewW2NERModelService(
	bertVocabPath, httpAddr string,
	httpClient *fasthttp.Client, grpcConn *grpc.ClientConn,
	modelInputCallback models.GenerateModelInferRequest,
	modelOutputCallback models.GenerateModelInferOutputRequest,
	modelInferCallback nvidia_inferenceserver.DecoderFunc,
) (*W2NerModelService, error)

func (*W2NerModelService) ModelInfer

func (w *W2NerModelService) ModelInfer(
	inferData [][]string,
	modelName, modelVersion string,
	requestTimeout time.Duration,
	params ...interface{},
) ([]interface{}, error)

ModelInfer calls the Triton Inference Server inference API.

type WordPieceTokenizer

type WordPieceTokenizer struct {
	// contains filtered or unexported fields
}

WordPieceTokenizer is a tokenizer that breaks tokens into sub-word units based on a supplied vocabulary. See https://arxiv.org/pdf/1609.08144.pdf Section 4.1 for details. WordPieceTokenizer uses a BaseTokenizer to preprocess the input text.

func NewWordPieceTokenizer

func NewWordPieceTokenizer(vocabulary Dict) *WordPieceTokenizer

NewWordPieceTokenizer returns a new WordPieceTokenizer.

func (*WordPieceTokenizer) Tokenize

func (t *WordPieceTokenizer) Tokenize(text string) []StringOffsetsPair

Tokenize converts the input text to a slice of words or sub-words token units based on the supplied vocabulary. The resulting tokens preserve the alignment with the portion of the original text they belong to.
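
A fuller end-to-end sketch; the import path and vocabulary contents are placeholders, and the sub-word split suggested in the final comment assumes those pieces exist in the vocabulary:

package main

import (
	"fmt"
	"log"

	transformers "example.com/your/module/transformers" // placeholder import path
)

func main() {
	vocab, err := transformers.VocabFromSlice([]string{"[UNK]", "[CLS]", "[SEP]", "un", "##aff", "##able"})
	if err != nil {
		log.Fatal(err)
	}
	wp := transformers.NewWordPieceTokenizer(vocab)
	for _, p := range wp.Tokenize("unaffable") {
		fmt.Printf("%s [%d, %d)\n", p.String, p.Offsets.Start, p.Offsets.End)
	}
	// expected pieces along the lines of: un, ##aff, ##able
}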

func (*WordPieceTokenizer) TokenizeChinese

func (t *WordPieceTokenizer) TokenizeChinese(text string) []StringOffsetsPair

TokenizeChinese is like Tokenize but focuses on Chinese text.

func (*WordPieceTokenizer) TokenizeChineseCharMode

func (t *WordPieceTokenizer) TokenizeChineseCharMode(text string) []StringOffsetsPair

TokenizeChineseCharMode is like TokenizeChinese but focuses on Chinese NER.

func (*WordPieceTokenizer) WordPieceTokenize

func (t *WordPieceTokenizer) WordPieceTokenize(tokens []StringOffsetsPair) []StringOffsetsPair

WordPieceTokenize transforms the input tokens into a new slice of word or sub-word units based on the supplied vocabulary. The resulting tokens preserve the alignment with the portion of the original text they belong to.
