Documentation ¶
Overview ¶
Package tokenizer represents a tokenization pipeline.
Index ¶
- Constants
- Variables
- func CachedPath(modelNameOrPath, fileName string) (resolvedPath string, err error)
- func CleanCache() error
- type ATOption
- type AddedToken
- func (at AddedToken) GetPattern(n normalizer.Normalizer) (retVal string)
- func (at AddedToken) SetLStrip(lstrip bool) (retVal AddedToken)
- func (at AddedToken) SetNormalized(normalized bool) (retVal AddedToken)
- func (at AddedToken) SetRStrip(rstrip bool) (retVal AddedToken)
- func (at AddedToken) SetSingleWord(singleWord bool) (retVal AddedToken)
- type AddedTokenWithId
- type AddedVocabulary
- func (av *AddedVocabulary) AddSpecialTokens(tokens []AddedToken, model Model, normalizer normalizer.Normalizer) (retVal int)
- func (av *AddedVocabulary) AddTokens(tokens []AddedToken, model Model, normalizer normalizer.Normalizer) (retVal int)
- func (av *AddedVocabulary) ExtractAndNormalize(sequence string, n normalizer.Normalizer) *PreTokenizedString
- func (av *AddedVocabulary) GetVocab() (retVal map[string]int)
- func (av *AddedVocabulary) IdToToken(id int, model Model) (retVal string, ok bool)
- func (av *AddedVocabulary) IsSpecialToken(token string) bool
- func (av *AddedVocabulary) Len() int
- func (av *AddedVocabulary) TokenToId(token string, model Model) (retVal int, ok bool)
- type BytesToCharOffsetConverter
- type Config
- type Decoder
- type DecoderConfig
- type Dual
- type EncodeInput
- type Encoding
- func DefaultEncoding() *Encoding
- func DefaultProcess(encoding, pairEncoding *Encoding, addSpecialTokens bool) *Encoding
- func MergeEncodings(encodings []Encoding, growingOffsets bool) *Encoding
- func NewEncoding(ids []int, typeIds []int, tokens []string, offsets [][]int, ...) *Encoding
- func NewEncodingFromTokens(tokens []Token, typeId int) (retVal *Encoding)
- func NewEncodingWithCapacity(l int) (retVal *Encoding)
- func PadEncodings(encodings []Encoding, params PaddingParams) []Encoding
- func PrepareEncodings(encoding, pairEncoding *Encoding) (out []Encoding)
- func TruncateEncodings(encoding, pairEncoding *Encoding, params *TruncationParams) (tEncoding, tPairEncoding *Encoding)
- func (e *Encoding) Char2Token(pos int) (retVal int, ok bool)
- func (e *Encoding) Char2Word(pos int) (retVal int, ok bool)
- func (e *Encoding) Clone() *Encoding
- func (e *Encoding) GetAttentionMask() []int
- func (e *Encoding) GetIds() []int
- func (e *Encoding) GetOffsets() [][]int
- func (e *Encoding) GetOverflowing() []Encoding
- func (e *Encoding) GetSequenceIds() []int
- func (e *Encoding) GetSpecialTokenMask() []int
- func (e *Encoding) GetTokens() []string
- func (e *Encoding) GetTypeIds() []int
- func (e *Encoding) GetWords() []int
- func (e *Encoding) IsEmpty() (retVal bool)
- func (e *Encoding) Len() (retVal int)
- func (e *Encoding) Merge(encodings []Encoding, growingOffsets bool) (retVal *Encoding)
- func (e *Encoding) MergeWith(pair *Encoding, growingOffsets bool) (retVal *Encoding)
- func (e *Encoding) NSequences() int
- func (e *Encoding) Pad(targetLength, padId, padTypeId int, padToken string, ...) *Encoding
- func (e *Encoding) SequenceRange(sequencId int) (Range, error)
- func (e *Encoding) SetOverflowing(overflowing []Encoding)
- func (e *Encoding) SetSequenceIds(sequenceId int)
- func (e *Encoding) SetTypeIds(typeIds []int)
- func (e *Encoding) SetWord(index int, val int)
- func (e *Encoding) TakeOverflowing() []Encoding
- func (e *Encoding) Token2Chars(tokenIdx int) (retVal []int, ok bool)
- func (e *Encoding) Token2Sequence(token int) (int, bool)
- func (e *Encoding) Token2Word(tokenIdx int) (retVal int, ok bool)
- func (e *Encoding) Truncate(maxLen int, stride int) (retVal *Encoding, err error)
- func (e *Encoding) Word2Chars(word int) (retVal []int, ok bool)
- func (e *Encoding) Word2Tokens(word int) (startTok, endTok int, ok bool)
- type EncodingOpt
- type EncodingOpts
- type InputSequence
- type InputType
- type Model
- type ModelConfig
- type NormalizerConfig
- type OffsetConverter
- type OffsetType
- type PaddingDirection
- type PaddingParams
- type PaddingStrategy
- type PaddingStrategyOption
- type PostProcessor
- type PostProcessorConfig
- type PreToken
- type PreTokenizedString
- func (pt *PreTokenizedString) GetSplits(offsetRef normalizer.IndexOn, offsetType OffsetType) []PreToken
- func (pt *PreTokenizedString) IntoEncoding(typeId int, wordIdx int, offsetType OffsetType) (*Encoding, error)
- func (pt *PreTokenizedString) Normalize(nFn func(*normalizer.NormalizedString) *normalizer.NormalizedString) *PreTokenizedString
- func (pt *PreTokenizedString) Split(splitFn SplitFn) *PreTokenizedString
- func (pt *PreTokenizedString) Tokenize(tokFn func(*normalizer.NormalizedString) ([]Token, error)) (*PreTokenizedString, error)
- type PreTokenizer
- type PreTokenizerConfig
- type Range
- type Single
- type Split
- type SplitFn
- type SplitIdx
- type Token
- type TokenConfig
- type Tokenizer
- func (t *Tokenizer) AddSpecialTokens(tokens []AddedToken) (retVal int)
- func (t *Tokenizer) AddTokens(tokens []AddedToken) (retVal int)
- func (t *Tokenizer) Decode(ids []int, skipSpecialTokens bool) (retVal string)
- func (t *Tokenizer) DecodeBatch(sentences [][]int, skipSpecialTokens bool) []string
- func (t *Tokenizer) Encode(input EncodeInput, addSpecialTokens bool) (retVal *Encoding, err error)
- func (t *Tokenizer) EncodeBatch(inputs []EncodeInput, addSpecialTokens bool) (retVal []Encoding, err error)
- func (t *Tokenizer) EncodeCharOffsets(input EncodeInput, addSpecialTokens bool) (*Encoding, error)
- func (t *Tokenizer) EncodePair(input, pair string, addSpecialTokensOpt ...bool) (*Encoding, error)
- func (t *Tokenizer) EncodeSingle(input string, addSpecialTokensOpt ...bool) (*Encoding, error)
- func (t *Tokenizer) EncodeSingleSequence(sequence InputSequence, typeId int, offsetType OffsetType) (*Encoding, error)
- func (t *Tokenizer) GetDecoder() Decoder
- func (t *Tokenizer) GetModel() Model
- func (t *Tokenizer) GetNormalizer() normalizer.Normalizer
- func (t *Tokenizer) GetPadding() (retVal *PaddingParams)
- func (t *Tokenizer) GetPostProcessor() PostProcessor
- func (t *Tokenizer) GetPreTokenizer() PreTokenizer
- func (t *Tokenizer) GetSpecialTokens() []string
- func (t *Tokenizer) GetTruncation() *TruncationParams
- func (t *Tokenizer) GetVocab(withAddedTokens bool) map[string]int
- func (t *Tokenizer) GetVocabSize(withAddedTokens bool) int
- func (t *Tokenizer) IdToToken(id int) (token string, ok bool)
- func (t *Tokenizer) PostProcess(encoding, pairEncoding *Encoding, addSpecialTokens bool) (retVal *Encoding)
- func (t *Tokenizer) Save(path string, pretty bool) (err error)
- func (t *Tokenizer) Serialize(pretty bool) (retVal string)
- func (t *Tokenizer) TokenToId(token string) (id int, ok bool)
- func (t *Tokenizer) Tokenize(input string, addSpecialTokensOpt ...bool) ([]string, error)
- func (t *Tokenizer) Train(trainer Trainer, files []string) error
- func (t *Tokenizer) TrainAndReplace(trainer Model, files []string) (err error)
- func (t *Tokenizer) WithDecoder(decoder Decoder)
- func (t *Tokenizer) WithModel(model Model)
- func (t *Tokenizer) WithNormalizer(n normalizer.Normalizer)
- func (t *Tokenizer) WithPadding(padding *PaddingParams)
- func (t *Tokenizer) WithPostProcessor(postProcessor PostProcessor)
- func (t *Tokenizer) WithPreTokenizer(preTokenizer PreTokenizer)
- func (t *Tokenizer) WithTruncation(trunc *TruncationParams)
- type Trainer
- type TruncationParams
- type TruncationStrategy
Examples ¶
Constants ¶
const (
	WeightName    = "pytorch_model.gt"
	ConfigName    = "config.json"
	TokenizerName = "tokenizer.json"

	// NOTE. URL form := `$HFpath/ModelName/resolve/main/WeightName`
	HFpath = "https://huggingface.co"
)
const (
	RawInput = iota
	PretokenizedInput
	PretokenizedOwnedInput
	PretokenizedCowInput
)
const (
	SecondSequenceNotProvided = "Truncation error: Second sequence not provided"
	SequenceTooShort          = "Truncation error: Sequence to truncate too short to respect the provided max_length"
)
Variables ¶
var (
CachedDir string = "NOT_SETTING"
)
var (
DUMMY_INPUT [][]int64 = [][]int64{
{7, 6, 0, 0, 1},
{1, 2, 3, 0, 0},
{0, 0, 0, 4, 5},
}
)
Functions ¶
func CachedPath ¶
CachedPath resolves and caches data based on the input string, then returns the full path to the cached data.

Parameters:
- `modelNameOrPath`: model name (e.g., "bert-base-uncased") or path to a directory containing model/config files.
- `fileName`: model or config file name, e.g., "pytorch_model.gt", "config.json".

CachedPath performs the following steps in order:
1. Resolves the input string to a full-path cached filename candidate.
2. Checks whether the candidate exists in the cache; if so, returns it.
3. Otherwise, retrieves the data, caches it under `CachedDir`, and returns the path to the cached data.

NOTE: the default `CachedDir` is "{$HOME}/.cache/transformer". A custom `CachedDir` can be set via the `GO_TRANSFORMER` environment variable.
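A minimal usage sketch (the model repo and file name are just examples; any Hugging Face repo that publishes the requested file works the same way):

// Optionally point the cache somewhere else before resolving paths.
// os.Setenv("GO_TRANSFORMER", "/tmp/transformer-cache")

tokFile, err := CachedPath("bert-base-uncased", "tokenizer.json")
if err != nil {
	log.Fatal(err)
}
fmt.Println(tokFile) // full path inside `CachedDir`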
func CleanCache ¶
func CleanCache() error
CleanCache removes all files cached in transformer cache directory `CachedDir`.
NOTE: a custom `CachedDir` can be set via the `GO_TRANSFORMER` environment variable.
Types ¶
type ATOption ¶
type ATOption func(at *AddedToken)
func WithLStrip ¶
WithLStrip specifies whether this token should include all the whitespace on its left, in order to strip it out.
func WithNormalized ¶
WithNormalized specifies whether this token should be normalized and match against its normalized version in the input text.
func WithRStrip ¶
WithRStrip specifies whether this token should include all the whitespace on its right, in order to strip it out.
func WithSingleWord ¶
WithSingleWord specifies whether this token should only match on whole single words, and never part of a word.
type AddedToken ¶
type AddedToken struct {
	// Content is the content of the added token
	Content string
	// Whether this token only matches single words or can break words
	SingleWord bool
	// Whether this token should strip whitespace on its left
	LStrip bool
	// Whether this token should strip whitespace on its right
	RStrip bool
	// Whether this token should be normalized
	Normalized bool
}
AddedToken represents a token added by the user on top of the existing model vocabulary.
AddedToken can be configured to specify the behaviour it should have in various situations, e.g.:
- whether it should only match single words
- whether to include any whitespace on its left or right
func DefaultAddedToken ¶
func DefaultAddedToken() (retVal AddedToken)
DefaultAddedToken initiates a default AddedToken
func NewAddedToken ¶
func NewAddedToken(s string, special bool, opts ...ATOption) (retVal AddedToken)
NewAddedToken builds an AddedToken from the given content, specifying whether it is intended to be a special token. NOTE: special tokens are not normalized by default.
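A minimal sketch of building a custom token with options (the token content is arbitrary, and the option constructors are assumed to take a bool, as their descriptions suggest):

// A user-defined token that only matches whole words and is not normalized.
tok := NewAddedToken("<custom>", false,
	WithSingleWord(true),
	WithNormalized(false),
)
fmt.Println(tok.Content, tok.SingleWord, tok.Normalized)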
func (AddedToken) GetPattern ¶
func (at AddedToken) GetPattern(n normalizer.Normalizer) (retVal string)
GetPattern retrieves the pattern built for this token, according to all the specified parameters.
NOTE. normalizer input is optional
func (AddedToken) SetLStrip ¶
func (at AddedToken) SetLStrip(lstrip bool) (retVal AddedToken)
SetLStrip specifies whether this token should include all the whitespace on its left, in order to strip it out.
func (AddedToken) SetNormalized ¶
func (at AddedToken) SetNormalized(normalized bool) (retVal AddedToken)
SetNormalized specifies whether this token should be normalized and matched against its normalized version in the input text.
func (AddedToken) SetRStrip ¶
func (at AddedToken) SetRStrip(rstrip bool) (retVal AddedToken)
SetRStrip specifies whether this token should include all the whitespace on its right, in order to strip it out.
func (AddedToken) SetSingleWord ¶
func (at AddedToken) SetSingleWord(singleWord bool) (retVal AddedToken)
SetSingleWord specifies whether this token should only match whole single words, and never part of a word.
type AddedTokenWithId ¶
type AddedTokenWithId struct {
	Id      int        // Id assigned to this token
	Special bool       // Whether this is a special token
	Token   AddedToken // The target AddedToken
}
type AddedVocabulary ¶
type AddedVocabulary struct {
// contains filtered or unexported fields
}
AddedVocabulary is a vocabulary built on top of the Model.

It provides a way to add new vocabulary to a Tokenizer that has already been trained, in a previous process, possibly by someone else. This is especially useful for fine-tuning, where we want to fine-tune a model while adding new functionality with new special tokens, or add tokens to cover previously unknown ones.

One of the reasons we need to handle these tokens outside of the model is simply that, for many models, it is not possible to add new tokens after the training process. For example, with BPE the training process generates merge pairs along with the vocabulary, and any token in the vocabulary can be decomposed into other tokens, down to the original alphabet. If we added new tokens after training, we could not guarantee that the required merge pairs exist.
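A minimal sketch of registering special tokens on an existing tokenizer (the pretrained BERT loader mirrors the Encode example further below; the token content is arbitrary):

tk := pretrained.BertBaseUncased()

// Register a new special token; it is matched as-is and can be skipped when decoding.
n := tk.AddSpecialTokens([]AddedToken{NewAddedToken("<obj>", true)})
fmt.Printf("added %d token(s), vocab size: %d\n", n, tk.GetVocabSize(true))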
func NewAddedVocabulary ¶
func NewAddedVocabulary() (retVal AddedVocabulary)
func (*AddedVocabulary) AddSpecialTokens ¶
func (av *AddedVocabulary) AddSpecialTokens(tokens []AddedToken, model Model, normalizer normalizer.Normalizer) (retVal int)
AddSpecialTokens adds some special tokens to the vocabulary. It returns the number of tokens added.
func (*AddedVocabulary) AddTokens ¶
func (av *AddedVocabulary) AddTokens(tokens []AddedToken, model Model, normalizer normalizer.Normalizer) (retVal int)
AddTokens adds some tokens to the vocabulary. It returns the number of tokens added.
func (*AddedVocabulary) ExtractAndNormalize ¶
func (av *AddedVocabulary) ExtractAndNormalize(sequence string, n normalizer.Normalizer) *PreTokenizedString
ExtractAndNormalize extracts the additional vocabulary from the given sentence, normalizing it along the way.
Some tokens should match against their normalized representation as well as the non-normalized one. For example, when we expect to extract the token `yesterday` from the input sentence `I read a book Yesterday`, and the normalizer lowercases everything, we expect a match.
func (*AddedVocabulary) GetVocab ¶
func (av *AddedVocabulary) GetVocab() (retVal map[string]int)
GetVocab gets the additional vocabulary
func (*AddedVocabulary) IdToToken ¶
func (av *AddedVocabulary) IdToToken(id int, model Model) (retVal string, ok bool)
IdToToken gets the token matching the given id, if it exists.
func (*AddedVocabulary) IsSpecialToken ¶
func (av *AddedVocabulary) IsSpecialToken(token string) bool
IsSpecialToken checks whether a token is a special token.
func (*AddedVocabulary) Len ¶
func (av *AddedVocabulary) Len() int
Len returns the size of the additional vocabulary.
type BytesToCharOffsetConverter ¶
type BytesToCharOffsetConverter struct {
// contains filtered or unexported fields
}
func NewBytesToCharOffsetConverter ¶
func NewBytesToCharOffsetConverter(sequence string) *BytesToCharOffsetConverter
type Config ¶
type Config struct {
	Version       string                 `json:"version"`
	Truncation    map[string]interface{} `json:"truncation"`
	Padding       map[string]interface{} `json:"padding"`
	AddedTokens   []TokenConfig          `json:"added_tokens"`
	Normalizer    map[string]interface{} `json:"normalizer"`
	PreTokenizer  map[string]interface{} `json:"pre_tokenizer"`
	PostProcessor map[string]interface{} `json:"post_processor"`
	Decoder       map[string]interface{} `json:"decoder"`
	Model         map[string]interface{} `json:"model"`
}
Config holds the configuration for creating a Tokenizer.
Example ¶
tokFile, err := CachedPath("hf-internal-testing/llama-tokenizer", "tokenizer.json")
if err != nil {
	panic(err)
}
f, err := os.Open(tokFile)
if err != nil {
	panic(err)
}

dec := json.NewDecoder(f)
var config *Config
err = dec.Decode(&config)
if err != nil {
	panic(err)
}

modelConfig := util.NewParams(config.Model)
modelType := modelConfig.Get("type", "").(string)
fmt.Println(modelType)
Output: BPE
func ConfigFromFile ¶
ConfigFromFile loads config from file.
type DecoderConfig ¶
type Dual ¶
type Dual struct {
	Sentence InputSequence
	Pair     InputSequence
}
type EncodeInput ¶
type EncodeInput interface {
// contains filtered or unexported methods
}
func NewDualEncodeInput ¶
func NewDualEncodeInput(sentence, pairSentence InputSequence) (retVal EncodeInput)
func NewSingleEncodeInput ¶
func NewSingleEncodeInput(sentence InputSequence) (retVal EncodeInput)
type Encoding ¶
type Encoding struct {
	Ids              []int         // IDs produced by the `tokenizer`
	TypeIds          []int         // Type of each ID
	Tokens           []string      // Tokens associated with each ID
	Offsets          [][]int       // Offsets of the token/ID from the NormalizedString
	SpecialTokenMask []int         // Mask identifying special tokens
	AttentionMask    []int         // Mask identifying padding tokens for the attention mechanism
	Overflowing      []Encoding    // A list of overflowing encodings generated when truncating
	Words            []int         // Optional - Indexes of the word associated with each token/ID. None value = -1
	SequenceRanges   map[int]Range // Range of tokens covered by each sequence. If empty -> only one sequence that covers the entire range.
}
Encoding represents the output of the tokenizer.
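A minimal sketch of inspecting an Encoding (the pretrained BERT tokenizer, as in the Encode example below, is an assumption):

tk := pretrained.BertBaseUncased()

en, err := tk.EncodeSingle("Hello world", true)
if err != nil {
	log.Fatal(err)
}

fmt.Println(en.GetTokens())        // tokens, including special tokens such as [CLS]/[SEP]
fmt.Println(en.GetIds())           // the corresponding vocabulary ids
fmt.Println(en.GetAttentionMask()) // 1 for real tokens, 0 for padding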
func DefaultEncoding ¶
func DefaultEncoding() *Encoding
DefaultEncoding creates an encoding with default values.
func DefaultProcess ¶
DefaultProcess is a helper for a PostProcessor's Process method. It provides a fast track by simply merging the encoding and its pair.
func MergeEncodings ¶
MergeEncodings merges slice of encodings together.
func NewEncoding ¶
func NewEncoding(ids []int, typeIds []int, tokens []string, offsets [][]int, specialTokenMask []int, attentionMask []int, overflowing []Encoding, opts ...EncodingOpt) *Encoding
NewEncoding initiates a new encoding from input data.
func NewEncodingFromTokens ¶
NewEncodingFromTokens initiates an Encoding from input tokens.
func NewEncodingWithCapacity ¶
func PadEncodings ¶
func PadEncodings(encodings []Encoding, params PaddingParams) []Encoding
func PrepareEncodings ¶
PrepareEncodings prepares encoding and pairEncoding if any before `ProcessEncodings` call.
func TruncateEncodings ¶
func TruncateEncodings(encoding, pairEncoding *Encoding, params *TruncationParams) (tEncoding, tPairEncoding *Encoding)
func (*Encoding) Char2Token ¶
Char2Token returns the index of the token that contains the given `char` position.
func (*Encoding) GetAttentionMask ¶
GetAttentionMask returns attentionMask from encoding
func (*Encoding) GetOffsets ¶
GetOffsets returns offsets from encoding
func (*Encoding) GetOverflowing ¶
GetOverflowing returns overflowing from encoding
func (*Encoding) GetSequenceIds ¶
func (*Encoding) GetSpecialTokenMask ¶
GetSpecialTokenMask returns specialTokenMask from encoding
func (*Encoding) GetTypeIds ¶
GetTypeIds returns type Ids from encoding
func (*Encoding) NSequences ¶
NSequences returns number of sequences combined in this encoding.
func (*Encoding) Pad ¶
func (e *Encoding) Pad(targetLength, padId, padTypeId int, padToken string, direction PaddingDirection) *Encoding
Pad pads the current encoding to the given target length with the given pad values, in either the Left or Right direction.
func (*Encoding) SequenceRange ¶
SequenceRange returns the range to target to retrieve something (word id, offsets, ...) related to the given sequence id.
func (*Encoding) SetOverflowing ¶
SetOverflowing set overflowing.
func (*Encoding) SetSequenceIds ¶
SetSequenceIds sets the given sequence id for the whole range of tokens contained in this Encoding.
func (*Encoding) SetTypeIds ¶
func (*Encoding) TakeOverflowing ¶
TakeOverflowing returns the overflowing encodings and resets them to empty on the encoding.
func (*Encoding) Token2Chars ¶
Token2Chars gets the offsets of the token at the given index.
func (*Encoding) Token2Sequence ¶
Token2Sequence returns the index of the sequence containing the given token.
func (*Encoding) Token2Word ¶
Token2Word gets the word index of the corresponding token, if it exists.
func (*Encoding) Word2Chars ¶
Word2Chars gets the offsets of the word at the given index in the input sequence.
func (*Encoding) Word2Tokens ¶
Word2Tokens gets the encoded tokens corresponding to the word at the given index in the input sequence, in the form `(startToken, endToken + 1)`.

NOTE: e.Words is optional, so there may be no result; in that case, `ok` will be false.
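A minimal sketch of mapping between words and tokens (the pretrained BERT tokenizer and the sentence are assumptions):

tk := pretrained.BertBaseUncased()

en, err := tk.EncodeSingle("unbelievable results")
if err != nil {
	log.Fatal(err)
}

// Which tokens encode the first word?
if start, end, ok := en.Word2Tokens(0); ok {
	fmt.Println(en.GetTokens()[start:end]) // sub-word tokens of "unbelievable"
}

// Which word does a given token belong to?
if word, ok := en.Token2Word(1); ok {
	fmt.Println(word)
}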
type EncodingOpt ¶
type EncodingOpt func(o *EncodingOpts)
func WithSequenceRangeEncodingOpt ¶
func WithSequenceRangeEncodingOpt(v map[int]Range) EncodingOpt
func WithWordsEncodingOpt ¶
func WithWordsEncodingOpt(v []int) EncodingOpt
type EncodingOpts ¶
func DefaultEncodingOpts ¶
func DefaultEncodingOpts() *EncodingOpts
type InputSequence ¶
type InputSequence struct {
// contains filtered or unexported fields
}
func NewInputSequence ¶
func NewInputSequence(input interface{}) (retVal InputSequence)
NewInputSequence creates a new InputSequence from the input. A valid input can be a string (RawInput) or a slice of strings (PretokenizedInput).
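A minimal sketch of building inputs for Encode (the pretrained BERT tokenizer and the sentences are assumptions):

tk := pretrained.BertBaseUncased()

// A raw string wrapped as a single-sequence input.
single := NewSingleEncodeInput(NewInputSequence("Hello world"))

// A pair of sequences; the second one is already pre-tokenized.
pair := NewDualEncodeInput(
	NewInputSequence("Hello world"),
	NewInputSequence([]string{"How", "are", "you", "?"}),
)

if en, err := tk.Encode(single, true); err == nil {
	fmt.Println(en.GetTokens())
}
if en, err := tk.Encode(pair, true); err == nil {
	fmt.Println(en.GetTokens())
}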
type Model ¶
type Model interface {
	// Tokenize tokenizes the given sequence into multiple underlying `Token`s.
	// The `offsets` on the `Token`s are expected to be relative to the given sequence.
	Tokenize(sequence string) ([]Token, error)
	// TokenToId finds the ID associated with a string token.
	TokenToId(token string) (id int, ok bool)
	// IdToToken finds the string token associated with an ID.
	IdToToken(id int) (token string, ok bool)
	// GetVocab retrieves the entire vocabulary mapping (token -> id).
	GetVocab() map[string]int
	// GetVocabSize retrieves the size of the vocabulary.
	GetVocabSize() int
	// Save saves the current `Model` in the given folder, using the
	// given `prefixOpt` for various files that need to be saved.
	Save(path string, prefixOpt ...string) error
}
Model represents a model used during tokenization (i.e., BPE, Word, or Unigram)
type ModelConfig ¶
type ModelConfig struct {
	Type                    string         `json:"type"`
	Dropout                 interface{}    `json:"dropout"`
	UnkToken                string         `json:"unk_token"`
	ContinuingSubwordPrefix interface{}    `json:"continuing_subword_prefix"`
	EndOfWordSuffix         interface{}    `json:"end_of_word_suffix"`
	FuseUnk                 bool           `json:"fuse_unk"`
	ByteFallback            bool           `json:"byte_fallback"`
	Vocab                   map[string]int `json:"vocab"`
	Merges                  []string       `json:"merges"`
	MaxInputCharsPerWord    interface{}    `json:"max_input_chars_per_word"`
}
type NormalizerConfig ¶
type OffsetConverter ¶
type OffsetType ¶
type OffsetType int
OffsetType is an enum-like type for the possible kinds of offsets.
const (
	Byte OffsetType = iota
	Char
)
type PaddingParams ¶
type PaddingParams struct {
	Strategy  PaddingStrategy
	Direction PaddingDirection
	PadId     int
	PadTypeId int
	PadToken  string
}
type PaddingStrategy ¶
type PaddingStrategy struct {
	Value interface{}
	Name  string
}
PaddingStrategy is an enum-like type: either the string `BatchLongest`, or `Fixed(uint)` which holds a fixed size. Example:
func main() {
	ps := NewPaddingStrategy(WithFixed(3))
	fmt.Println(ps.Value)
}
func NewPaddingStrategy ¶
func NewPaddingStrategy(opts ...PaddingStrategyOption) *PaddingStrategy
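A minimal sketch of configuring padding on a tokenizer via WithPadding (the pad token/id values and the `Right` padding direction constant are assumptions matching a common BERT-style setup):

tk := pretrained.BertBaseUncased()

ps := NewPaddingStrategy(WithFixed(16))
tk.WithPadding(&PaddingParams{
	Strategy:  *ps,
	Direction: Right, // assumed PaddingDirection value; see Encoding.Pad
	PadId:     0,
	PadTypeId: 0,
	PadToken:  "[PAD]",
})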
type PaddingStrategyOption ¶
type PaddingStrategyOption func(*PaddingStrategy)
func WithBatchLongest ¶
func WithBatchLongest() PaddingStrategyOption
func WithFixed ¶
func WithFixed(size int) PaddingStrategyOption
type PostProcessor ¶
type PostProcessor interface {
	// AddedTokens returns the number of tokens that will be added during the processing step
	AddedTokens(isPair bool) int
	// Process processes both encodings and returns a new merged one
	// NOTE: pairEncoding is optional
	Process(encoding, pairEncoding *Encoding, addSpecialTokens bool) *Encoding
}
PostProcessor is in charge of post-processing an encoded output of the `Tokenizer`. It adds any special tokens that a language model would require.
type PostProcessorConfig ¶
type PreTokenizedString ¶
type PreTokenizedString struct {
// contains filtered or unexported fields
}
The `PreTokenizedString` is in charge of splitting an underlying string, making sure everything is fine while doing so, and providing ways to normalize and tokenize these splits.
Once everything has been normalized and tokenized, the `PreTokenizedString` is able to build an `Encoding` with all the relevant offsets and word ids, relative to the original string.
func NewPreTokenizedString ¶
func NewPreTokenizedString(s string) *PreTokenizedString
NewPreTokenizedString creates a new PreTokenizedString from an input string.
func NewPreTokenizedStringFromNS ¶
func NewPreTokenizedStringFromNS(n *normalizer.NormalizedString) *PreTokenizedString
NewPreTokenizedStringFromNS creates a PreTokenizedString from an input NormalizedString.
func (*PreTokenizedString) GetSplits ¶
func (pt *PreTokenizedString) GetSplits(offsetRef normalizer.IndexOn, offsetType OffsetType) []PreToken
GetSplits returns a list of splits: each is a slice of the normalized string with the associated offsets, in either the original or normalized referential, as well as any potential tokens.
func (*PreTokenizedString) IntoEncoding ¶
func (pt *PreTokenizedString) IntoEncoding(typeId int, wordIdx int, offsetType OffsetType) (*Encoding, error)
IntoEncoding transforms the current `PreTokenizedString` into an `Encoding`.
If a `wordIdx` is provided, any word in the generated `Encoding` will be set to this value. This is generally used with pre-tokenized input, which does not need the `PreTokenizedString` to generate word ids.

This method will fail if some splits do not have an associated `Token`.
func (*PreTokenizedString) Normalize ¶
func (pt *PreTokenizedString) Normalize(nFn func(*normalizer.NormalizedString) *normalizer.NormalizedString) *PreTokenizedString
Normalize normalizes all the splits that do not have attached `Tokens`, using the provided `normalize` function.
func (*PreTokenizedString) Split ¶
func (pt *PreTokenizedString) Split(splitFn SplitFn) *PreTokenizedString
Split splits the `PreTokenizedString` by providing a `SplitFn`, which is in charge of splitting each substring (`NormalizedString`) into multiple parts.
func (*PreTokenizedString) Tokenize ¶
func (pt *PreTokenizedString) Tokenize(tokFn func(*normalizer.NormalizedString) ([]Token, error)) (*PreTokenizedString, error)
Tokenize tokenizes all the splits that do not have attached `Tokens`, using the provided `tokenize` function
type PreTokenizer ¶
type PreTokenizer interface {
PreTokenize(*PreTokenizedString) (*PreTokenizedString, error)
}
PreTokenizer is in charge of doing the pre-segmentation step. It splits the given string in multiple substrings, keeping track of the offsets of said substrings from the `NormalizedString`. In some occasions, the `PreTokenizer` might need to modify the given `NormalizedString` to ensure we can entirely keep track of the offsets and the mapping with the original string.
type PreTokenizerConfig ¶
type PreTokenizerConfig struct{}
type Single ¶
type Single struct {
Sentence InputSequence
}
type Split ¶
type Split struct {
// contains filtered or unexported fields
}
Split contains the underlying `NormalizedString` as well as its offsets in the original string. These offsets are in the `original` referential. It also contains any `Token` associated with the current split.
func NewSplit ¶
func NewSplit(normalized *normalizer.NormalizedString, tokens []Token) Split
NewSplit creates a new Split from an input NormalizedString.
type SplitFn ¶
type SplitFn func(int, *normalizer.NormalizedString) []SplitIdx
SplitFn takes a `NormalizedString` and returns an iterator over the produced `NormalizedString`s.

NOTE: SplitFn is free to modify these `NormalizedString`s as long as the produced `NormalizedString`s, when combined back together, have the same `original` string as the one given to `SplitFn`. This means that, for offset tracking to work as expected, `SplitFn` must produce "splits" of the ORIGINAL string.
type SplitIdx ¶
type SplitIdx struct {
	Normalized *normalizer.NormalizedString
	Tokens     []Token
}
type TokenConfig ¶
type Tokenizer ¶
type Tokenizer struct {
// contains filtered or unexported fields
}
Tokenizer represents a tokenization pipeline. It can implement any encoding or decoding of any text.
func NewTokenizerFromFile ¶
NewTokenizerFromFile instantiates a new Tokenizer from the given file
func (*Tokenizer) AddSpecialTokens ¶
func (t *Tokenizer) AddSpecialTokens(tokens []AddedToken) (retVal int)
AddSpecialTokens registers the given tokens as special tokens. This is especially useful for removing these special tokens while decoding
func (*Tokenizer) AddTokens ¶
func (t *Tokenizer) AddTokens(tokens []AddedToken) (retVal int)
AddTokens adds the given tokens to the added vocabulary
func (*Tokenizer) DecodeBatch ¶
DecodeBatch decodes all sentences concurrently.
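A minimal sketch of a decode round trip (the pretrained BERT tokenizer is an assumption):

tk := pretrained.BertBaseUncased()

en, err := tk.EncodeSingle("Hello world", true)
if err != nil {
	log.Fatal(err)
}

// Decode back to text, skipping special tokens such as [CLS]/[SEP].
fmt.Println(tk.Decode(en.GetIds(), true))

// Decode several sequences concurrently.
fmt.Println(tk.DecodeBatch([][]int{en.GetIds(), en.GetIds()}, true))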
func (*Tokenizer) Encode ¶
func (t *Tokenizer) Encode(input EncodeInput, addSpecialTokens bool) (retVal *Encoding, err error)
Encode encodes the given input. This method accepts both single sequences and pair sequences. A sequence can be a raw string or already pre-tokenized input:
Example ¶
package main

import (
	"fmt"
	"log"

	"github.com/danmolitor/tokenizer/pretrained"
)

func main() {
	tk := pretrained.BertBaseUncased()

	sentence := `Yesterday I saw a [MASK] far away`

	en, err := tk.EncodeSingle(sentence)
	if err != nil {
		log.Fatal(err)
	}

	fmt.Printf("tokens: %v\n", en.GetTokens())
	fmt.Printf("offsets: %v\n", en.GetOffsets())
}
Output: tokens: [yesterday i saw a [MASK] far away] offsets: [[0 9] [10 11] [12 15] [16 17] [18 24] [25 28] [29 33]]
func (*Tokenizer) EncodeBatch ¶
func (t *Tokenizer) EncodeBatch(inputs []EncodeInput, addSpecialTokens bool) (retVal []Encoding, err error)
EncodeBatch encodes all sentences concurrently.
func (*Tokenizer) EncodeCharOffsets ¶
func (t *Tokenizer) EncodeCharOffsets(input EncodeInput, addSpecialTokens bool) (*Encoding, error)
EncodeCharOffsets encodes the given input, using offsets relative to chars instead of bytes. This method accepts both single sequences and pair sequences. A sequence can be a raw string or already pre-tokenized input:
func (*Tokenizer) EncodePair ¶
EncodePair encodes a pair of string sequences.
Params:
- input: the sequence string to be tokenized
- pair: the pair sequence string to be tokenized with
- addSpecialTokensOpt: optional (default = false); whether to add special tokens, e.g. `[CLS]`, `[UNK]` or `[SEP]` in a BERT model
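A minimal sketch of encoding a pair of sequences (the pretrained BERT tokenizer and the sentences are assumptions):

tk := pretrained.BertBaseUncased()

en, err := tk.EncodePair("What is a tokenizer?", "A tokenizer splits text into tokens.", true)
if err != nil {
	log.Fatal(err)
}

fmt.Println(en.GetTokens())
fmt.Println(en.GetTypeIds()) // 0 for the first sequence, 1 for the pair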
func (*Tokenizer) EncodeSingle ¶
EncodeSingle encodes a single input string.
Params:
- input: the input string to be tokenized
- addSpecialTokensOpt: optional (default = false); whether to add special tokens, e.g. `[CLS]`, `[UNK]` or `[SEP]` in a BERT model
func (*Tokenizer) EncodeSingleSequence ¶
func (t *Tokenizer) EncodeSingleSequence(sequence InputSequence, typeId int, offsetType OffsetType) (*Encoding, error)
EncodeSingleSequence encodes a single sequence
func (*Tokenizer) GetDecoder ¶
func (*Tokenizer) GetNormalizer ¶
func (t *Tokenizer) GetNormalizer() normalizer.Normalizer
func (*Tokenizer) GetPadding ¶
func (t *Tokenizer) GetPadding() (retVal *PaddingParams)
func (*Tokenizer) GetPostProcessor ¶
func (t *Tokenizer) GetPostProcessor() PostProcessor
func (*Tokenizer) GetPreTokenizer ¶
func (t *Tokenizer) GetPreTokenizer() PreTokenizer
func (*Tokenizer) GetSpecialTokens ¶
GetSpecialTokens returns a slice of special tokens.
func (*Tokenizer) GetTruncation ¶
func (t *Tokenizer) GetTruncation() *TruncationParams
func (*Tokenizer) GetVocabSize ¶
GetVocabSize gets the size of the vocabulary.
func (*Tokenizer) PostProcess ¶
func (t *Tokenizer) PostProcess(encoding, pairEncoding *Encoding, addSpecialTokens bool) (retVal *Encoding)
PostProcess does post-processing logic, handling the case where there is no PostProcessor set
func (*Tokenizer) Tokenize ¶
Tokenize slices input string into tokens.
Params:
- input: the input string to be tokenized
- addSpecialTokensOpt: optional (default = false); whether to add special tokens, e.g. `[CLS]`, `[UNK]` or `[SEP]` in a BERT model
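A minimal sketch of Tokenize (the pretrained BERT tokenizer and the sentence are assumptions):

tk := pretrained.BertBaseUncased()

toks, err := tk.Tokenize("Yesterday I saw a cat")
if err != nil {
	log.Fatal(err)
}
fmt.Println(toks) // token strings only, without ids or offsets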
func (*Tokenizer) Train ¶
Train trains a model and replaces the current model using the given trainer. The tokenizer performs the following steps:

- Concurrently reads training data (text) from files, normalizes the text using the specified normalizer, and generates a slice of words and their frequency (count).
- Trains the tokenizer model on the word-count slice generated in the previous step, using the specified tokenizer configuration, to create `vocab` and `merges` data (files).
- Updates the current tokenizer with the newly generated model (`vocab` and `merges` data).
func (*Tokenizer) TrainAndReplace ¶
TrainAndReplace trains a model and replaces the current Model, using the given Trainer.
func (*Tokenizer) WithDecoder ¶
func (*Tokenizer) WithNormalizer ¶
func (t *Tokenizer) WithNormalizer(n normalizer.Normalizer)
func (*Tokenizer) WithPadding ¶
func (t *Tokenizer) WithPadding(padding *PaddingParams)
func (*Tokenizer) WithPostProcessor ¶
func (t *Tokenizer) WithPostProcessor(postProcessor PostProcessor)
func (*Tokenizer) WithPreTokenizer ¶
func (t *Tokenizer) WithPreTokenizer(preTokenizer PreTokenizer)
func (*Tokenizer) WithTruncation ¶
func (t *Tokenizer) WithTruncation(trunc *TruncationParams)
type Trainer ¶
type Trainer interface {
	// Whether showing progress bar or not
	WithProgressBar() bool
	// Actual training method. It will return a trained model and
	// a list of `special tokens` to be added directly to the tokenizer
	// along with the model
	Train(words map[string]int) (Model, []AddedToken)
	// ProcessTokens processes a bunch of tokens and counts them as relevant
	ProcessTokens(words map[string]int, tokens []string)
}
Trainer is responsible for training a model. It takes lines/sentences and returns a tokenizer `Model` when done.
type TruncationParams ¶
type TruncationParams struct {
	MaxLength int
	Strategy  TruncationStrategy
	Stride    int
}
type TruncationStrategy ¶
type TruncationStrategy int
TruncationStrategy is an int-based enum representing the truncation strategy.
const (
	LongestFirst TruncationStrategy = iota
	OnlyFirst
	OnlySecond
)
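A minimal sketch of configuring truncation on a tokenizer via WithTruncation (the max length is an arbitrary example; the pretrained BERT tokenizer is an assumption):

tk := pretrained.BertBaseUncased()

tk.WithTruncation(&TruncationParams{
	MaxLength: 128,
	Strategy:  LongestFirst,
	Stride:    0,
})

// Encodings longer than MaxLength are truncated; the removed part is kept in Encoding.Overflowing.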