Versions in this module

v1
v1.0.0
Sep 20, 2024

Changes in this version

+ const ConfigName
+ const HFpath
+ const PretokenizedCowInput
+ const PretokenizedInput
+ const PretokenizedOwnedInput
+ const RawInput
+ const SecondSequenceNotProvided
+ const SequenceTooShort
+ const TokenizerName
+ const WeightName
+ var CachedDir string = "NOT_SETTING"
+ var DUMMY_INPUT [][]int64 = [][]int64
+ func CachedPath(modelNameOrPath, fileName string) (resolvedPath string, err error)
+ func CleanCache() error
+ type ATOption func(at *AddedToken)
    + func WithLStrip(lstrip bool) ATOption
    + func WithNormalized(normalized bool) ATOption
    + func WithRStrip(rstrip bool) ATOption
    + func WithSingleWord(singleWord bool) ATOption
+ type AddedToken struct
    + Content string
    + LStrip bool
    + Normalized bool
    + RStrip bool
    + SingleWord bool
    + func DefaultAddedToken() (retVal AddedToken)
    + func NewAddedToken(s string, special bool, opts ...ATOption) (retVal AddedToken)
    + func (at AddedToken) GetPattern(n normalizer.Normalizer) (retVal string)
    + func (at AddedToken) SetLStrip(lstrip bool) (retVal AddedToken)
    + func (at AddedToken) SetNormalized(normalized bool) (retVal AddedToken)
    + func (at AddedToken) SetRStrip(rstrip bool) (retVal AddedToken)
    + func (at AddedToken) SetSingleWord(singleWord bool) (retVal AddedToken)
+ type AddedTokenWithId struct
    + Id int
    + Special bool
    + Token AddedToken
+ type AddedVocabulary struct
    + func NewAddedVocabulary() (retVal AddedVocabulary)
    + func (av *AddedVocabulary) AddSpecialTokens(tokens []AddedToken, model Model, normalizer normalizer.Normalizer) (retVal int)
    + func (av *AddedVocabulary) AddTokens(tokens []AddedToken, model Model, normalizer normalizer.Normalizer) (retVal int)
    + func (av *AddedVocabulary) ExtractAndNormalize(sequence string, n normalizer.Normalizer) *PreTokenizedString
    + func (av *AddedVocabulary) GetVocab() (retVal map[string]int)
    + func (av *AddedVocabulary) IdToToken(id int, model Model) (retVal string, ok bool)
    + func (av *AddedVocabulary) IsSpecialToken(token string) bool
    + func (av *AddedVocabulary) Len() int
    + func (av *AddedVocabulary) TokenToId(token string, model Model) (retVal int, ok bool)
+ type BytesToCharOffsetConverter struct
    + func NewBytesToCharOffsetConverter(sequence string) *BytesToCharOffsetConverter
    + func (c *BytesToCharOffsetConverter) Convert(offsets []int) ([]int, error)
+ type Config struct
    + AddedTokens []TokenConfig
    + Decoder map[string]interface{}
    + Model map[string]interface{}
    + Normalizer map[string]interface{}
    + Padding map[string]interface{}
    + PostProcessor map[string]interface{}
    + PreTokenizer map[string]interface{}
    + Truncation map[string]interface{}
    + Version string
    + func ConfigFromFile(file string) (*Config, error)
+ type Decoder interface
    + Decode func(tokens []string) string
    + DecodeChain func(tokens []string) []string
+ type DecoderConfig struct
    + Decoders []map[string]interface{}
    + Type string
+ type Dual struct
    + Pair InputSequence
    + Sentence InputSequence
+ type EncodeInput interface
    + func NewDualEncodeInput(sentence, pairSentence InputSequence) (retVal EncodeInput)
    + func NewSingleEncodeInput(sentence InputSequence) (retVal EncodeInput)
+ type Encoding struct
    + AttentionMask []int
    + Ids []int
    + Offsets [][]int
    + Overflowing []Encoding
    + SequenceRanges map[int]Range
    + SpecialTokenMask []int
    + Tokens []string
    + TypeIds []int
    + Words []int
    + func DefaultEncoding() *Encoding
    + func DefaultProcess(encoding, pairEncoding *Encoding, addSpecialTokens bool) *Encoding
    + func MergeEncodings(encodings []Encoding, growingOffsets bool) *Encoding
    + func NewEncoding(ids []int, typeIds []int, tokens []string, offsets [][]int, ...) *Encoding
    + func NewEncodingFromTokens(tokens []Token, typeId int) (retVal *Encoding)
    + func NewEncodingWithCapacity(l int) (retVal *Encoding)
    + func PadEncodings(encodings []Encoding, params PaddingParams) []Encoding
    + func PrepareEncodings(encoding, pairEncoding *Encoding) (out []Encoding)
    + func TruncateEncodings(encoding, pairEncoding *Encoding, params *TruncationParams) (tEncoding, tPairEncoding *Encoding)
    + func (e *Encoding) Char2Token(pos int) (retVal int, ok bool)
    + func (e *Encoding) Char2Word(pos int) (retVal int, ok bool)
    + func (e *Encoding) Clone() *Encoding
    + func (e *Encoding) GetAttentionMask() []int
    + func (e *Encoding) GetIds() []int
    + func (e *Encoding) GetOffsets() [][]int
    + func (e *Encoding) GetOverflowing() []Encoding
    + func (e *Encoding) GetSequenceIds() []int
    + func (e *Encoding) GetSpecialTokenMask() []int
    + func (e *Encoding) GetTokens() []string
    + func (e *Encoding) GetTypeIds() []int
    + func (e *Encoding) GetWords() []int
    + func (e *Encoding) IsEmpty() (retVal bool)
    + func (e *Encoding) Len() (retVal int)
    + func (e *Encoding) Merge(encodings []Encoding, growingOffsets bool) (retVal *Encoding)
    + func (e *Encoding) MergeWith(pair *Encoding, growingOffsets bool) (retVal *Encoding)
    + func (e *Encoding) NSequences() int
    + func (e *Encoding) Pad(targetLength, padId, padTypeId int, padToken string, ...) *Encoding
    + func (e *Encoding) SequenceRange(sequencId int) (Range, error)
    + func (e *Encoding) SetOverflowing(overflowing []Encoding)
    + func (e *Encoding) SetSequenceIds(sequenceId int)
    + func (e *Encoding) SetTypeIds(typeIds []int)
    + func (e *Encoding) SetWord(index int, val int)
    + func (e *Encoding) TakeOverflowing() []Encoding
    + func (e *Encoding) Token2Chars(tokenIdx int) (retVal []int, ok bool)
    + func (e *Encoding) Token2Sequence(token int) (int, bool)
    + func (e *Encoding) Token2Word(tokenIdx int) (retVal int, ok bool)
    + func (e *Encoding) Truncate(maxLen int, stride int) (retVal *Encoding, err error)
    + func (e *Encoding) Word2Chars(word int) (retVal []int, ok bool)
    + func (e *Encoding) Word2Tokens(word int) (startTok, endTok int, ok bool)
+ type EncodingOpt func(o *EncodingOpts)
    + func WithSequenceRangeEncodingOpt(v map[int]Range) EncodingOpt
    + func WithWordsEncodingOpt(v []int) EncodingOpt
+ type EncodingOpts struct
    + SequenceRange map[int]Range
    + Words []int
    + func DefaultEncodingOpts() *EncodingOpts
+ type InputSequence struct
    + func NewInputSequence(input interface{}) (retVal InputSequence)
+ type InputType int
+ type Model interface
    + GetVocab func() map[string]int
    + GetVocabSize func() int
    + IdToToken func(id int) (token string, ok bool)
    + Save func(path string, prefixOpt ...string) error
    + TokenToId func(token string) (id int, ok bool)
    + Tokenize func(sequence string) ([]Token, error)
+ type ModelConfig struct
    + ByteFallback bool
    + ContinuingSubwordPrefix interface{}
    + Dropout interface{}
    + EndOfWordSuffix interface{}
    + FuseUnk bool
    + MaxInputCharsPerWord interface{}
    + Merges []string
    + Type string
    + UnkToken string
    + Vocab map[string]int
+ type NormalizerConfig struct
    + Normalizers []map[string]interface{}
    + Type string
+ type OffsetConverter interface
    + Convert func(offsets []int) ([]int, error)
+ type OffsetType int
    + const Byte
    + const Char
+ type PaddingDirection int
    + const Left
    + const Right
+ type PaddingParams struct
    + Direction PaddingDirection
    + PadId int
    + PadToken string
    + PadTypeId int
    + Strategy PaddingStrategy
+ type PaddingStrategy struct
    + Name string
    + Value interface{}
    + func NewPaddingStrategy(opts ...PaddingStrategyOption) *PaddingStrategy
+ type PaddingStrategyOption func(*PaddingStrategy)
    + func WithBatchLongest() PaddingStrategyOption
    + func WithFixed(size int) PaddingStrategyOption
+ type PostProcessor interface
    + AddedTokens func(isPair bool) int
    + Process func(encoding, pairEncoding *Encoding, addSpecialTokens bool) *Encoding
+ type PostProcessorConfig struct
    + Pair []map[string]interface{}
    + Single []map[string]interface{}
    + SpecialTokens map[string]interface{}
    + Type string
+ type PreToken struct
    + Offsets []int
    + Tokens []Token
    + Value string
+ type PreTokenizedString struct
    + func NewPreTokenizedString(s string) *PreTokenizedString
    + func NewPreTokenizedStringFromNS(n *normalizer.NormalizedString) *PreTokenizedString
    + func (pt *PreTokenizedString) GetSplits(offsetRef normalizer.IndexOn, offsetType OffsetType) []PreToken
    + func (pt *PreTokenizedString) IntoEncoding(typeId int, wordIdx int, offsetType OffsetType) (*Encoding, error)
    + func (pt *PreTokenizedString) Normalize(nFn func(*normalizer.NormalizedString) *normalizer.NormalizedString) *PreTokenizedString
    + func (pt *PreTokenizedString) Split(splitFn SplitFn) *PreTokenizedString
    + func (pt *PreTokenizedString) Tokenize(tokFn func(*normalizer.NormalizedString) ([]Token, error)) (*PreTokenizedString, error)
+ type PreTokenizer interface
    + PreTokenize func(*PreTokenizedString) (*PreTokenizedString, error)
+ type PreTokenizerConfig struct
+ type Range []int
    + func NewRange(start, end int) Range
    + func (r Range) Contains(item int) bool
    + func (r Range) IsEmpty() bool
    + func (r Range) Len() int
+ type Single struct
    + Sentence InputSequence
+ type Split struct
    + func NewSplit(normalized *normalizer.NormalizedString, tokens []Token) Split
+ type SplitFn func(int, *normalizer.NormalizedString) []SplitIdx
+ type SplitIdx struct
    + Normalized *normalizer.NormalizedString
    + Tokens []Token
+ type Token struct
    + Id int
    + Offsets []int
    + Value string
    + func NewToken(id int, value string, offsets []int) Token
+ type TokenConfig struct
    + Content string
    + Id int64
    + Lstrip bool
    + Normalized bool
    + Rstrip bool
    + SingleWord bool
    + Special bool
+ type Tokenizer struct
    + func NewTokenizer(model Model) *Tokenizer
    + func NewTokenizerFromFile(file string) (retVal *Tokenizer)
    + func (t *Tokenizer) AddSpecialTokens(tokens []AddedToken) (retVal int)
    + func (t *Tokenizer) AddTokens(tokens []AddedToken) (retVal int)
    + func (t *Tokenizer) Decode(ids []int, skipSpecialTokens bool) (retVal string)
    + func (t *Tokenizer) DecodeBatch(sentences [][]int, skipSpecialTokens bool) []string
    + func (t *Tokenizer) Encode(input EncodeInput, addSpecialTokens bool) (retVal *Encoding, err error)
    + func (t *Tokenizer) EncodeBatch(inputs []EncodeInput, addSpecialTokens bool) (retVal []Encoding, err error)
    + func (t *Tokenizer) EncodeCharOffsets(input EncodeInput, addSpecialTokens bool) (*Encoding, error)
    + func (t *Tokenizer) EncodePair(input, pair string, addSpecialTokensOpt ...bool) (*Encoding, error)
    + func (t *Tokenizer) EncodeSingle(input string, addSpecialTokensOpt ...bool) (*Encoding, error)
    + func (t *Tokenizer) EncodeSingleSequence(sequence InputSequence, typeId int, offsetType OffsetType) (*Encoding, error)
    + func (t *Tokenizer) GetDecoder() Decoder
    + func (t *Tokenizer) GetModel() Model
    + func (t *Tokenizer) GetNormalizer() normalizer.Normalizer
    + func (t *Tokenizer) GetPadding() (retVal *PaddingParams)
    + func (t *Tokenizer) GetPostProcessor() PostProcessor
    + func (t *Tokenizer) GetPreTokenizer() PreTokenizer
    + func (t *Tokenizer) GetSpecialTokens() []string
    + func (t *Tokenizer) GetTruncation() *TruncationParams
    + func (t *Tokenizer) GetVocab(withAddedTokens bool) map[string]int
    + func (t *Tokenizer) GetVocabSize(withAddedTokens bool) int
    + func (t *Tokenizer) IdToToken(id int) (token string, ok bool)
    + func (t *Tokenizer) PostProcess(encoding, pairEncoding *Encoding, addSpecialTokens bool) (retVal *Encoding)
    + func (t *Tokenizer) Save(path string, pretty bool) (err error)
    + func (t *Tokenizer) Serialize(pretty bool) (retVal string)
    + func (t *Tokenizer) TokenToId(token string) (id int, ok bool)
    + func (t *Tokenizer) Tokenize(input string, addSpecialTokensOpt ...bool) ([]string, error)
    + func (t *Tokenizer) Train(trainer Trainer, files []string) error
    + func (t *Tokenizer) TrainAndReplace(trainer Model, files []string) (err error)
    + func (t *Tokenizer) WithDecoder(decoder Decoder)
    + func (t *Tokenizer) WithModel(model Model)
    + func (t *Tokenizer) WithNormalizer(n normalizer.Normalizer)
    + func (t *Tokenizer) WithPadding(padding *PaddingParams)
    + func (t *Tokenizer) WithPostProcessor(postProcessor PostProcessor)
    + func (t *Tokenizer) WithPreTokenizer(preTokenizer PreTokenizer)
    + func (t *Tokenizer) WithTruncation(trunc *TruncationParams)
+ type Trainer interface
    + ProcessTokens func(words map[string]int, tokens []string)
    + Train func(words map[string]int) (Model, []AddedToken)
    + WithProgressBar func() bool
+ type TruncationParams struct
    + MaxLength int
    + Strategy TruncationStrategy
    + Stride int
+ type TruncationStrategy int
    + const LongestFirst
    + const OnlyFirst
    + const OnlySecond
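
A minimal usage sketch based on the Tokenizer API listed above. It assumes the module is imported as github.com/sugarme/tokenizer (the import path is inferred, not stated in the listing) and that a tokenizer.json config file is available locally; the file path and input sentence are placeholders.

package main

import (
	"fmt"
	"log"

	"github.com/sugarme/tokenizer"
)

func main() {
	// Build a Tokenizer from a HuggingFace-style tokenizer.json file (path assumed).
	tk := tokenizer.NewTokenizerFromFile("path/to/tokenizer.json")

	// EncodeSingle tokenizes one sentence; the optional bool asks the
	// post-processor to add special tokens (e.g. [CLS]/[SEP]) if it defines any.
	en, err := tk.EncodeSingle("Hello, tokenizer in Go!", true)
	if err != nil {
		log.Fatal(err)
	}

	fmt.Println(en.GetTokens()) // sub-word tokens
	fmt.Println(en.GetIds())    // vocabulary ids

	// Decode maps ids back to text, here skipping special tokens.
	fmt.Println(tk.Decode(en.GetIds(), true))
}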
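
The CachedPath and CleanCache helpers, together with the CachedDir variable, manage a local download cache for pretrained files. The sketch below resolves a pretrained tokenizer file through that cache; the model id "bert-base-uncased" and the assumption that the TokenizerName constant refers to the tokenizer.json file are illustrative only.

package main

import (
	"fmt"
	"log"

	"github.com/sugarme/tokenizer"
)

func main() {
	// CachedPath downloads the file on first use (or reuses the cached copy
	// under CachedDir) and returns a local path NewTokenizerFromFile can read.
	configFile, err := tokenizer.CachedPath("bert-base-uncased", tokenizer.TokenizerName)
	if err != nil {
		log.Fatal(err)
	}

	tk := tokenizer.NewTokenizerFromFile(configFile)
	fmt.Println("vocab size (with added tokens):", tk.GetVocabSize(true))

	// CleanCache removes the cached downloads if a fresh start is needed.
	// _ = tokenizer.CleanCache()
}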
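
Padding and truncation are configured through PaddingParams and TruncationParams before encoding. The sketch below sets fixed-length right padding plus longest-first truncation and then encodes a small batch; the concrete values (length 64, pad id 0, "[PAD]") are assumptions and must match the underlying model's vocabulary.

package main

import (
	"fmt"
	"log"

	"github.com/sugarme/tokenizer"
)

func main() {
	tk := tokenizer.NewTokenizerFromFile("path/to/tokenizer.json") // path assumed

	// Pad every encoding on the right up to 64 tokens.
	tk.WithPadding(&tokenizer.PaddingParams{
		Strategy:  *tokenizer.NewPaddingStrategy(tokenizer.WithFixed(64)),
		Direction: tokenizer.Right,
		PadId:     0,
		PadTypeId: 0,
		PadToken:  "[PAD]", // assumed pad token
	})

	// Truncate anything longer than 64 tokens, longest sequence first.
	tk.WithTruncation(&tokenizer.TruncationParams{
		MaxLength: 64,
		Strategy:  tokenizer.LongestFirst,
		Stride:    0,
	})

	inputs := []tokenizer.EncodeInput{
		tokenizer.NewSingleEncodeInput(tokenizer.NewInputSequence("a short sentence")),
		tokenizer.NewSingleEncodeInput(tokenizer.NewInputSequence("a somewhat longer second sentence")),
	}

	encodings, err := tk.EncodeBatch(inputs, true)
	if err != nil {
		log.Fatal(err)
	}
	for _, en := range encodings {
		// With fixed padding, every encoding comes back at the same length.
		fmt.Println(len(en.GetIds()), en.GetAttentionMask())
	}
}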