transformers

package
v2.0.2 Latest
Published: May 27, 2024 License: MIT Imports: 11 Imported by: 0

Documentation

Constants

const (
	DefaultMaxSeqLength                      int    = 48
	ModelRespBodyOutputBinaryDataKey         string = "binary_data"
	ModelRespBodyOutputClassificationDataKey string = "classification"
	ModelBertModelSegmentIdsKey              string = "segment_ids"
	ModelBertModelInputIdsKey                string = "input_ids"
	ModelBertModelInputMaskKey               string = "input_mask"
	ModelInt32DataType                       string = "INT32"
	ModelInt64DataType                       string = "INT64"
	ModelBoolDataType                        string = "BOOL"
)

const (
	DefaultCLS          string = "[CLS]"
	DefaultSEP          string = "[SEP]"
	DefaultUNK          string = "[UNK]"
	DefaultMask         string = "[MASK]"
	NumPadToken         string = "##"
	DefaultMaxWordChars int    = 200
	DataSplitString     string = " ||| "
)

Variables

This section is empty.

Functions

func GetStrings

func GetStrings(tokens []StringOffsetsPair) []string

GetStrings returns a sequence of string values from the given slice of StringOffsetsPair.
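
A minimal illustrative sketch, assuming this package is imported as transformers (the literal values are made up):

pairs := []transformers.StringOffsetsPair{
	{String: "hello", Offsets: transformers.OffsetsType{Start: 0, End: 5}},
	{String: "world", Offsets: transformers.OffsetsType{Start: 6, End: 11}},
}
words := transformers.GetStrings(pairs) // []string{"hello", "world"}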

func IsDefaultSpecial

func IsDefaultSpecial(word string) bool

IsDefaultSpecial returns whether the word matches a special token.
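
For example, with the default special tokens declared in the constants above, one would expect (illustrative, not verified output):

transformers.IsDefaultSpecial(transformers.DefaultCLS) // presumably true
transformers.IsDefaultSpecial("hello")                 // presumably false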

Types

type BaseTokenizer

type BaseTokenizer struct {
	// contains filtered or unexported fields
}

BaseTokenizer is a straightforward tokenizer implementation that splits by whitespace and punctuation characters.

func NewBaseTokenizer

func NewBaseTokenizer(opts ...OptionV1) *BaseTokenizer

NewBaseTokenizer returns a new base tokenizer ready to use.

func (*BaseTokenizer) Tokenize

func (t *BaseTokenizer) Tokenize(text string) []StringOffsetsPair

Tokenize converts the input text to a slice of tokens, where each token is a white-separated word, a number or a punctuation sign. The resulting tokens preserve the alignment with the portion of the original text they belong to.
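
An illustrative sketch; the token boundaries suggested in the comment are assumptions based on the whitespace/punctuation splitting described above:

tok := transformers.NewBaseTokenizer()
for _, p := range tok.Tokenize("Hello, world!") {
	// expect tokens along the lines of "Hello", ",", "world", "!"
	fmt.Printf("%q [%d, %d)\n", p.String, p.Offsets.Start, p.Offsets.End)
}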

func (*BaseTokenizer) TokenizeChinese

func (t *BaseTokenizer) TokenizeChinese(text string) []StringOffsetsPair

TokenizeChinese is like Tokenize but focuses on Chinese text.

func (*BaseTokenizer) TokenizeChineseCharMode

func (t *BaseTokenizer) TokenizeChineseCharMode(text string) []StringOffsetsPair

TokenizeChineseCharMode is like TokenizeChinese but focuses on Chinese NER.

type BertModelService

type BertModelService struct {
	models.ModelService

	BertVocab     Dict
	BertTokenizer *WordPieceTokenizer
}

func NewBertModelService

func NewBertModelService(
	bertVocabPath, httpAddr string,
	httpClient *fasthttp.Client, grpcConn *grpc.ClientConn,
	modelInputCallback models.GenerateModelInferRequest,
	modelOutputCallback models.GenerateModelInferOutputRequest,
	modelInferCallback nvidia_inferenceserver.DecoderFunc,
) (*BertModelService, error)

func (*BertModelService) ModelInfer

func (m *BertModelService) ModelInfer(
	inferData []string,
	modelName, modelVersion string,
	requestTimeout time.Duration,
	params ...interface{},
) ([]interface{}, error)

ModelInfer calls the Triton Inference Server inference API.
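
A hedged sketch of a call, assuming svc is a *BertModelService previously built with NewBertModelService, and that the model name and version below are placeholders for whatever is actually deployed on the Triton server:

results, err := svc.ModelInfer(
	[]string{"今天天气真好"},    // one raw text per inference item
	"bert_base_chinese", "1", // hypothetical model name and version
	5*time.Second,
)
if err != nil {
	log.Fatal(err)
}
_ = results // shape and type depend on the DecoderFunc passed to NewBertModelService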

type Dict

type Dict struct {
	// contains filtered or unexported fields
}

Dict is a container for tokens. NOTE: the Python implementation uses an OrderedDict; the implications of that difference here are unclear.

func New

func New(tokens []string) Dict

New will return a vocab Dict from the given tokens; IDs will match the token indices.

func VocabFromFile

func VocabFromFile(path string) (Dict, error)

VocabFromFile will read a newline-delimited vocabulary file into a Dict.

func VocabFromSlice

func VocabFromSlice(vocabArr []string) (Dict, error)

VocabFromSlice will read a vocabulary slice from config into a Dict.
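
A small sketch of building a vocabulary either way; the token list and file path are illustrative:

vocab, err := transformers.VocabFromSlice([]string{"[PAD]", "[UNK]", "[CLS]", "[SEP]", "hello", "##llo"})
if err != nil {
	log.Fatal(err)
}
fmt.Println(vocab.Size()) // 6

// or from a newline-delimited vocabulary file:
fileVocab, err := transformers.VocabFromFile("vocab.txt")
_ = fileVocab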

func (Dict) Add

func (v Dict) Add(token string)

Add will add an item to the vocabulary; it is not thread-safe.

func (Dict) ConvertItems

func (v Dict) ConvertItems(items []string) []ID

ConvertItems converts items to IDs.

func (Dict) ConvertTokens

func (v Dict) ConvertTokens(tokens []string) []ID

ConvertTokens converts tokens to IDs.

func (Dict) GetID

func (v Dict) GetID(token string) ID

GetID will return the ID of the token in the vocab. The ID will be negative if the token doesn't exist.
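
A typical lookup pattern, falling back to the unknown token when a token is missing (a sketch, assuming vocab contains DefaultUNK):

id := vocab.GetID("unseenword")
if id < 0 {
	id = vocab.GetID(transformers.DefaultUNK) // fall back to "[UNK]"
}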

func (Dict) IsInVocab

func (v Dict) IsInVocab(token string) bool

IsInVocab reports whether the token is in the vocabulary.

func (Dict) LongestSubstring

func (v Dict) LongestSubstring(token string) string

LongestSubstring returns the longest vocabulary token that is a substring of the given token.

func (Dict) Size

func (v Dict) Size() int

Size returns the size of the vocabulary.

type HTTPBatchInput

type HTTPBatchInput struct {
	Name     string  `json:"name"`
	Shape    []int64 `json:"shape"`
	DataType string  `json:"datatype"`
	Data     any     `json:"data"`
}

HTTPBatchInput is the model HTTP batch-request input struct (supports batch size 1).

type HTTPOutput

type HTTPOutput struct {
	Name       string               `json:"name"`
	Parameters InferOutputParameter `json:"parameters,omitempty"`
}

HTTPOutput is the model HTTP request output struct.

type HTTPRequestBody

type HTTPRequestBody struct {
	Inputs  []HTTPBatchInput `json:"inputs"`
	Outputs []HTTPOutput     `json:"outputs"`
}

HTTPRequestBody is the model HTTP request body.
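
A hedged sketch of assembling a request body for a BERT-style model; tokenIDs, mask and typeIDs are assumed []int32 slices (e.g. taken from an InputFeature), and the output tensor name "output" and the shapes are hypothetical and depend on the deployed model configuration:

body := transformers.HTTPRequestBody{
	Inputs: []transformers.HTTPBatchInput{
		{Name: transformers.ModelBertModelInputIdsKey, Shape: []int64{1, 48}, DataType: transformers.ModelInt32DataType, Data: tokenIDs},
		{Name: transformers.ModelBertModelInputMaskKey, Shape: []int64{1, 48}, DataType: transformers.ModelInt32DataType, Data: mask},
		{Name: transformers.ModelBertModelSegmentIdsKey, Shape: []int64{1, 48}, DataType: transformers.ModelInt32DataType, Data: typeIDs},
	},
	Outputs: []transformers.HTTPOutput{
		{Name: "output", Parameters: transformers.InferOutputParameter{BinaryData: false}},
	},
}
payload, err := json.Marshal(body) // encoding/json; payload becomes the HTTP request body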

type ID

type ID int32

ID is used to identify vocab items.

func (ID) Int64

func (id ID) Int64() int64

Int64 converts the int32 ID to an int64.

type InferOutputParameter

type InferOutputParameter struct {
	BinaryData     bool  `json:"binary_data,omitempty"`
	Classification int64 `json:"classification,omitempty"`
}

InferOutputParameter holds Triton Inference Server inference output parameters.

type InputFeature

type InputFeature struct {
	Text     string   // origin text
	Tokens   []string // token. like CLS/SEP after tokenizer
	TokenIDs []int32  // input_ids
	Mask     []int32  // input_mask
	TypeIDs  []int32  // segment_ids
}

InputFeature represents the BERT model input features.

type InputObjects

type InputObjects struct {
	Input    string
	Tokens   []string
	PosArray []OffsetsType
}

InputObjects holds the BERT input objects used for position recording.

type OffsetsType

type OffsetsType struct {
	Start int
	End   int
}

OffsetsType represents a (start, end) offsets pair. It usually represents a lower inclusive index position, and an upper exclusive position.
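
For example, assuming ASCII text (where byte and rune positions coincide), the pair can slice the token back out of the original string:

text := "Hello world"
off := transformers.OffsetsType{Start: 6, End: 11}
fmt.Println(text[off.Start:off.End]) // "world"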

func GetOffsets

func GetOffsets(tokens []StringOffsetsPair) []OffsetsType

GetOffsets returns a sequence of offsets values from the given slice of StringOffsetsPair.

type OptionV1

type OptionV1 func(*BaseTokenizer)

OptionV1 allows configuring a new BaseTokenizer to your specific needs.

func RegisterSpecialWords

func RegisterSpecialWords(specialWords ...string) OptionV1

RegisterSpecialWords is an option to register one or more special words.
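
For instance, a sketch that registers the BERT marker tokens, presumably so the punctuation-splitting rules leave them intact:

tok := transformers.NewBaseTokenizer(
	transformers.RegisterSpecialWords(
		transformers.DefaultCLS,
		transformers.DefaultSEP,
		transformers.DefaultMask,
	),
)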

type Provider

type Provider interface {
	Vocab() Dict
}

Provider is an interface for exposing a vocab.

type StringOffsetsPair

type StringOffsetsPair struct {
	String  string
	Offsets OffsetsType
}

StringOffsetsPair represents a string value paired with offsets bounds. It usually represents a token string and its offsets positions in the original string.

func MakeOffsetPairsFromGroups

func MakeOffsetPairsFromGroups(text string, tokens []StringOffsetsPair, groups []TokensRange) []StringOffsetsPair

MakeOffsetPairsFromGroups creates a sequence of StringOffsetsPair elements from the given groups.

type TokenizerV1

type TokenizerV1 interface {
	Tokenize(text string) []StringOffsetsPair
	TokenizeChinese(text string) []StringOffsetsPair
	TokenizeChineseCharMode(text string) []StringOffsetsPair
}

TokenizerV1 is implemented by any value that has the Tokenize method.

type TokensRange

type TokensRange struct {
	Start int
	End   int
}

TokensRange represents a pair of start and end token indices.

func GroupPieces

func GroupPieces(tokens []StringOffsetsPair) []TokensRange

GroupPieces returns a list of token ranges, each of which represents the start and end index of the tokens that form a complete word.
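
A sketch of recovering word-level offsets from word-piece output by combining GroupPieces with MakeOffsetPairsFromGroups, assuming wp is a *WordPieceTokenizer built from a suitable vocabulary:

text := "unaffable"
pieces := wp.Tokenize(text)
groups := transformers.GroupPieces(pieces)
words := transformers.MakeOffsetPairsFromGroups(text, pieces, groups)
for _, w := range words {
	fmt.Println(w.String, w.Offsets.Start, w.Offsets.End)
}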

type W2NERInputFeature

type W2NERInputFeature struct {
	Text        string
	TokenIDs    []int32   // input_ids
	GridMask2D  [][]bool  // grid_mask2d
	DistInputs  [][]int32 // dist_inputs
	Pieces2Word [][]bool  // pieces2word
}

type W2NerModelService

type W2NerModelService struct {
	*BertModelService
}

func NewW2NERModelService

func NewW2NERModelService(
	bertVocabPath, httpAddr string,
	httpClient *fasthttp.Client, grpcConn *grpc.ClientConn,
	modelInputCallback models.GenerateModelInferRequest,
	modelOutputCallback models.GenerateModelInferOutputRequest,
	modelInferCallback nvidia_inferenceserver.DecoderFunc,
) (*W2NerModelService, error)

func (*W2NerModelService) ModelInfer

func (w *W2NerModelService) ModelInfer(
	inferData [][]string,
	modelName, modelVersion string,
	requestTimeout time.Duration,
	params ...interface{},
) ([]interface{}, error)

ModelInfer calls the Triton Inference Server inference API.

type WordPieceTokenizer

type WordPieceTokenizer struct {
	// contains filtered or unexported fields
}

WordPieceTokenizer is a tokenizer that breaks tokens into sub-word units based on a supplied vocabulary. See https://arxiv.org/pdf/1609.08144.pdf Section 4.1 for details. WordPieceTokenizer uses a BaseTokenizer to preprocess the input text.

func NewWordPieceTokenizer

func NewWordPieceTokenizer(vocabulary Dict) *WordPieceTokenizer

NewWordPieceTokenizer returns a new WordPieceTokenizer.

func (*WordPieceTokenizer) Tokenize

func (t *WordPieceTokenizer) Tokenize(text string) []StringOffsetsPair

Tokenize converts the input text to a slice of words or sub-words token units based on the supplied vocabulary. The resulting tokens preserve the alignment with the portion of the original text they belong to.
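
A fuller end-to-end sketch; the import path and vocabulary contents are placeholders, and the sub-word split suggested in the final comment assumes those pieces exist in the vocabulary:

package main

import (
	"fmt"
	"log"

	transformers "example.com/your/module/transformers" // placeholder import path
)

func main() {
	vocab, err := transformers.VocabFromSlice([]string{"[UNK]", "[CLS]", "[SEP]", "un", "##aff", "##able"})
	if err != nil {
		log.Fatal(err)
	}
	wp := transformers.NewWordPieceTokenizer(vocab)
	for _, p := range wp.Tokenize("unaffable") {
		fmt.Printf("%s [%d, %d)\n", p.String, p.Offsets.Start, p.Offsets.End)
	}
	// expected pieces along the lines of: un, ##aff, ##able
}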

func (*WordPieceTokenizer) TokenizeChinese

func (t *WordPieceTokenizer) TokenizeChinese(text string) []StringOffsetsPair

TokenizeChinese is like Tokenize but focuses on Chinese text.

func (*WordPieceTokenizer) TokenizeChineseCharMode

func (t *WordPieceTokenizer) TokenizeChineseCharMode(text string) []StringOffsetsPair

TokenizeChineseCharMode is like TokenizeChinese but focuses on Chinese NER.

func (*WordPieceTokenizer) WordPieceTokenize

func (t *WordPieceTokenizer) WordPieceTokenize(tokens []StringOffsetsPair) []StringOffsetsPair

WordPieceTokenize transforms the input tokens into a new slice of word or sub-word units based on the supplied vocabulary. The resulting tokens preserve the alignment with the portion of the original text they belong to.
