Documentation
¶
Index ¶
- Variables
- func DefaultSplit() normalizer.SplitDelimiterBehavior
- func FixedScript(c rune) string
- func GenerateBytesChar() map[uint8]string
- func GetScript(r rune) string
- func ProcessOffsets(encoding *tokenizer.Encoding, addPrefixSpace bool) *tokenizer.Encoding
- type BertPreTokenizer
- type ByteLevel
- func (bl *ByteLevel) AddedToken(isPair bool) int
- func (bl *ByteLevel) Alphabet() map[string]struct{}
- func (bl *ByteLevel) Decode(tokens []string) string
- func (bl *ByteLevel) DecodeChain(tokens []string) []string
- func (bl *ByteLevel) PreTokenize(pretokenized *tokenizer.PreTokenizedString) (*tokenizer.PreTokenizedString, error)
- func (bl *ByteLevel) Process(encoding, pairEncoding *tokenizer.Encoding, addSpecialTokens bool) *tokenizer.Encoding
- func (bl *ByteLevel) SetAddPrefixSpace(v bool)
- func (bl *ByteLevel) SetTrimOffsets(v bool)
- type CharDelimiterSplit
- type Digits
- type Metaspace
- func (m *Metaspace) Decode(tokens []string) string
- func (m *Metaspace) DecodeChain(tokens []string) []string
- func (m *Metaspace) GetReplacement() string
- func (m *Metaspace) PreTokenize(pretokenized *tokenizer.PreTokenizedString) (*tokenizer.PreTokenizedString, error)
- func (m *Metaspace) SetReplacement(replacement string)
- type Punctuation
- type Sequence
- type Split
- type UnicodeScript
- type Whitespace
- type WhitespaceSplit
Constants ¶
This section is empty.
Variables ¶
var BytesChar map[uint8]string = GenerateBytesChar()
Functions ¶
func DefaultSplit ¶
func DefaultSplit() normalizer.SplitDelimiterBehavior
func FixedScript ¶
func GenerateBytesChar ¶
BytesChar maps each of the 256 byte values (0-255) to a Unicode `char`. Ref. https://en.wikipedia.org/wiki/List_of_Unicode_characters Ref. https://rosettacode.org/wiki/UTF-8_encode_and_decode See example: https://play.golang.org/p/_1W0ni2uZWm
Types ¶
type BertPreTokenizer ¶
type BertPreTokenizer struct{}
func NewBertPreTokenizer ¶
func NewBertPreTokenizer() *BertPreTokenizer
func (*BertPreTokenizer) PreTokenize ¶
func (bt *BertPreTokenizer) PreTokenize(pretokenized *tokenizer.PreTokenizedString) (*tokenizer.PreTokenizedString, error)
PreTokenize implements PreTokenizer interface for BertPreTokenizer
type ByteLevel ¶
type ByteLevel struct { // whether to add a leading space to the first word. // It allows to treat the leading word just as any other words. AddPrefixSpace bool // Whether the post processing step should trim offsets // to avoid including whitespaces. TrimOffsets bool }
ByteLevel provides all the necessary steps to handle the BPE tokenization at byte-level. It takes care of all the required processing steps to transform a UTF-8 string as needed before and after the BPE model does its job.
func NewByteLevel ¶
func NewByteLevel() *ByteLevel
NewByteLevel returns a default ByteLevel with both AddPrefixSpace and TrimOffsets set to true.
func (*ByteLevel) AddedToken ¶
func (*ByteLevel) Decode ¶
Decode converts any byte-level characters to their Unicode counterpart before merging everything back into a single string.
func (*ByteLevel) DecodeChain ¶
func (*ByteLevel) PreTokenize ¶
func (bl *ByteLevel) PreTokenize(pretokenized *tokenizer.PreTokenizedString) (*tokenizer.PreTokenizedString, error)
PreTokenize implements the PreTokenizer interface: as a `PreTokenizer`, `ByteLevel` is in charge of transforming all the Unicode characters into their byte-level counterpart. It also splits the input according to the configured regex.
func (*ByteLevel) SetAddPrefixSpace ¶
SetAddPrefixSpace sets the `AddPrefixSpace` property.
func (*ByteLevel) SetTrimOffsets ¶
SetTrimOffsets sets the `TrimOffsets` property.
type CharDelimiterSplit ¶
type CharDelimiterSplit struct {
Delimiter rune
}
func NewCharDelimiterSplit ¶
func NewCharDelimiterSplit(delimiter rune) *CharDelimiterSplit
func (*CharDelimiterSplit) PreTokenize ¶
func (d *CharDelimiterSplit) PreTokenize(pretokenized *tokenizer.PreTokenizedString) (*tokenizer.PreTokenizedString, error)
type Digits ¶
type Digits struct {
IndividualDigits bool
}
func DefaultDigits ¶
func DefaultDigits() *Digits
func (*Digits) PreTokenize ¶
func (p *Digits) PreTokenize(pretokenized *tokenizer.PreTokenizedString) (*tokenizer.PreTokenizedString, error)
PreTokenize implements tokenizer.PreTokenizer.
type Metaspace ¶
Metaspace is a pre-tokenizer that replaces all the whitespaces with the provided meta character and then splits on this character.
func DefaultMetaspace ¶
func DefaultMetaspace() *Metaspace
func NewMetaspace ¶
func (*Metaspace) DecodeChain ¶
DecodeChain implements Decoder interface.
func (*Metaspace) GetReplacement ¶
func (*Metaspace) PreTokenize ¶
func (m *Metaspace) PreTokenize(pretokenized *tokenizer.PreTokenizedString) (*tokenizer.PreTokenizedString, error)
PreTokenize implements PreTokenizer interface
func (*Metaspace) SetReplacement ¶
type Punctuation ¶
type Punctuation struct {
Behavior normalizer.SplitDelimiterBehavior
}
func DefaultPunctuation ¶
func DefaultPunctuation() *Punctuation
func NewPunctuation ¶
func NewPunctuation(behavior normalizer.SplitDelimiterBehavior) *Punctuation
func (*Punctuation) PreTokenize ¶
func (p *Punctuation) PreTokenize(pretokenized *tokenizer.PreTokenizedString) (*tokenizer.PreTokenizedString, error)
This mirrors the Rust reference implementation: `impl PreTokenizer for Punctuation { fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> { pretokenized.split(|_, s| s.split(is_punc, self.behavior)) } }`
PreTokenize implements tokenizer.PreTokenizer.
type Sequence ¶
type Sequence struct {
// contains filtered or unexported fields
}
func NewSequence ¶
func NewSequence(pretokenizers []tokenizer.PreTokenizer) *Sequence
func (*Sequence) PreTokenize ¶
func (p *Sequence) PreTokenize(v *tokenizer.PreTokenizedString) (*tokenizer.PreTokenizedString, error)
type Split ¶
type Split struct { Pattern normalizer.Pattern Behavior normalizer.SplitDelimiterBehavior Invert bool }
func NewSplit ¶
func NewSplit(pattern normalizer.Pattern, behavior normalizer.SplitDelimiterBehavior, invert bool) *Split
func (*Split) PreTokenize ¶
func (s *Split) PreTokenize(pretokenized *tokenizer.PreTokenizedString) (*tokenizer.PreTokenizedString, error)
type UnicodeScript ¶
type UnicodeScript struct{}
func DefaultUnicodeScript ¶
func DefaultUnicodeScript() *UnicodeScript
func NewUnicodeScript ¶
func NewUnicodeScript() *UnicodeScript
func (*UnicodeScript) PreTokenize ¶
func (us *UnicodeScript) PreTokenize(pretokenized *tokenizer.PreTokenizedString) (*tokenizer.PreTokenizedString, error)
type Whitespace ¶
type Whitespace struct{}
func DefaultWhitespace ¶
func DefaultWhitespace() *Whitespace
func NewWhitespace ¶
func NewWhitespace() *Whitespace
func (*Whitespace) PreTokenize ¶
func (p *Whitespace) PreTokenize(pretokenized *tokenizer.PreTokenizedString) (*tokenizer.PreTokenizedString, error)
type WhitespaceSplit ¶
type WhitespaceSplit struct{}
func NewWhitespaceSplit ¶
func NewWhitespaceSplit() *WhitespaceSplit
func (*WhitespaceSplit) PreTokenize ¶
func (p *WhitespaceSplit) PreTokenize(pretokenized *tokenizer.PreTokenizedString) (*tokenizer.PreTokenizedString, error)