Versions in this module

v1

v1.0.0 (Sep 20, 2024)

Changes in this version

+ var BytesChar map[uint8]string = GenerateBytesChar()
+ var CharBytes map[string]uint8 = func() map[string]uint8 { ... }()
+ func DefaultSplit() normalizer.SplitDelimiterBehavior
+ func FixedScript(c rune) string
+ func GenerateBytesChar() map[uint8]string
+ func GetScript(r rune) string
+ func ProcessOffsets(encoding *tokenizer.Encoding, addPrefixSpace bool) *tokenizer.Encoding
+ type BertPreTokenizer struct
    + func NewBertPreTokenizer() *BertPreTokenizer
    + func (bt *BertPreTokenizer) PreTokenize(pretokenized *tokenizer.PreTokenizedString) (*tokenizer.PreTokenizedString, error)
+ type ByteLevel struct
    + AddPrefixSpace bool
    + TrimOffsets bool
    + func NewByteLevel() *ByteLevel
    + func (bl *ByteLevel) AddedToken(isPair bool) int
    + func (bl *ByteLevel) Alphabet() map[string]struct{}
    + func (bl *ByteLevel) Decode(tokens []string) string
    + func (bl *ByteLevel) DecodeChain(tokens []string) []string
    + func (bl *ByteLevel) PreTokenize(pretokenized *tokenizer.PreTokenizedString) (*tokenizer.PreTokenizedString, error)
    + func (bl *ByteLevel) Process(encoding, pairEncoding *tokenizer.Encoding, addSpecialTokens bool) *tokenizer.Encoding
    + func (bl *ByteLevel) SetAddPrefixSpace(v bool)
    + func (bl *ByteLevel) SetTrimOffsets(v bool)
+ type CharDelimiterSplit struct
    + Delimiter rune
    + func NewCharDelimiterSplit(delimiter rune) *CharDelimiterSplit
    + func (d *CharDelimiterSplit) PreTokenize(pretokenized *tokenizer.PreTokenizedString) (*tokenizer.PreTokenizedString, error)
+ type Digits struct
    + IndividualDigits bool
    + func DefaultDigits() *Digits
    + func NewDigits(individualDigits bool) *Digits
    + func (p *Digits) PreTokenize(pretokenized *tokenizer.PreTokenizedString) (*tokenizer.PreTokenizedString, error)
+ type Metaspace struct
    + AddPrefixSpace bool
    + Replacement string
    + StrRep string
    + func DefaultMetaspace() *Metaspace
    + func NewMetaspace(replacement string, addPrefixSpace bool) *Metaspace
    + func (m *Metaspace) Decode(tokens []string) string
    + func (m *Metaspace) DecodeChain(tokens []string) []string
    + func (m *Metaspace) GetReplacement() string
    + func (m *Metaspace) PreTokenize(pretokenized *tokenizer.PreTokenizedString) (*tokenizer.PreTokenizedString, error)
    + func (m *Metaspace) SetReplacement(replacement string)
+ type Punctuation struct
    + Behavior normalizer.SplitDelimiterBehavior
    + func DefaultPunctuation() *Punctuation
    + func NewPunctuation(behavior normalizer.SplitDelimiterBehavior) *Punctuation
    + func (p *Punctuation) PreTokenize(pretokenized *tokenizer.PreTokenizedString) (*tokenizer.PreTokenizedString, error)
+ type Sequence struct
    + func NewSequence(pretokenizers []tokenizer.PreTokenizer) *Sequence
    + func (p *Sequence) PreTokenize(v *tokenizer.PreTokenizedString) (*tokenizer.PreTokenizedString, error)
+ type Split struct
    + Behavior normalizer.SplitDelimiterBehavior
    + Invert bool
    + Pattern normalizer.Pattern
    + func NewSplit(pattern normalizer.Pattern, behavior normalizer.SplitDelimiterBehavior, ...) *Split
    + func (s *Split) PreTokenize(pretokenized *tokenizer.PreTokenizedString) (*tokenizer.PreTokenizedString, error)
+ type UnicodeScript struct
    + func DefaultUnicodeScript() *UnicodeScript
    + func NewUnicodeScript() *UnicodeScript
    + func (us *UnicodeScript) PreTokenize(pretokenized *tokenizer.PreTokenizedString) (*tokenizer.PreTokenizedString, error)
+ type Whitespace struct
    + func DefaultWhitespace() *Whitespace
    + func NewWhitespace() *Whitespace
    + func (p *Whitespace) PreTokenize(pretokenized *tokenizer.PreTokenizedString) (*tokenizer.PreTokenizedString, error)
+ type WhitespaceSplit struct
    + func NewWhitespaceSplit() *WhitespaceSplit
    + func (p *WhitespaceSplit) PreTokenize(pretokenized *tokenizer.PreTokenizedString) (*tokenizer.PreTokenizedString, error)
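
The listing above is only the API surface. As a minimal usage sketch, the pre-tokenizers listed here can be composed through NewSequence; the sketch assumes the package lives at github.com/sugarme/tokenizer/pretokenizer inside the github.com/sugarme/tokenizer module and that tokenizer.NewPreTokenizedString exists as the constructor for *tokenizer.PreTokenizedString (neither the import paths nor that constructor appear in this listing).

	package main

	import (
		"fmt"
		"log"

		"github.com/sugarme/tokenizer"              // assumed module path
		"github.com/sugarme/tokenizer/pretokenizer" // assumed package path
	)

	func main() {
		// Chain two of the listed pre-tokenizers: split on whitespace first,
		// then split runs of digits into individual digit tokens.
		seq := pretokenizer.NewSequence([]tokenizer.PreTokenizer{
			pretokenizer.NewWhitespace(),
			pretokenizer.NewDigits(true),
		})

		// Assumed constructor: wraps the raw input so it can be pre-tokenized.
		pretok := tokenizer.NewPreTokenizedString("Call 911 now")

		out, err := seq.PreTokenize(pretok)
		if err != nil {
			log.Fatal(err)
		}
		fmt.Printf("%+v\n", out)
	}

Every pre-tokenizer in the listing exposes the same PreTokenize(*tokenizer.PreTokenizedString) (*tokenizer.PreTokenizedString, error) method, so any of them can be used on its own or slotted into a Sequence in the same way.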
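
Byte-level pre-tokenization (as in GPT-2 style byte-level BPE) represents every byte value with a printable stand-in character; the exported BytesChar and CharBytes maps and GenerateBytesChar in this listing appear to provide that table and its inverse. A small sketch under the same assumed import path:

	package main

	import (
		"fmt"

		"github.com/sugarme/tokenizer/pretokenizer" // assumed package path
	)

	func main() {
		// GenerateBytesChar builds the byte -> stand-in character table
		// (the same mapping exported as the BytesChar package variable).
		bytesChar := pretokenizer.GenerateBytesChar()
		spaceRep := bytesChar[32] // stand-in character for the space byte (0x20)

		// CharBytes maps each stand-in character back to its original byte.
		fmt.Println(spaceRep, pretokenizer.CharBytes[spaceRep]) // second value is 32
	}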