pretokenizer

package
v1.0.1 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Sep 17, 2024 License: Apache-2.0 Imports: 6 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
// BytesChar maps each byte value 0-255 to a printable unicode
// character, as produced by GenerateBytesChar.
var BytesChar map[uint8]string = GenerateBytesChar()
View Source
// CharBytes is the inverse of BytesChar: it maps each printable
// unicode character back to its original byte value. It is built
// once at package initialization.
var CharBytes map[string]uint8 = func() map[string]uint8 {
	var bc = GenerateBytesChar()
	// Pre-size the inverse map: it holds exactly one entry per
	// entry of bc, so the final length is known up front.
	var cb map[string]uint8 = make(map[string]uint8, len(bc))
	for b, c := range bc {
		cb[c] = b
	}
	return cb
}()

Functions

func FixedScript

func FixedScript(c rune) string

func GenerateBytesChar

func GenerateBytesChar() map[uint8]string

BytesChar maps first 0-255 (byte) to first 0-255 `char` in unicode Ref. https://en.wikipedia.org/wiki/List_of_Unicode_characters Ref. https://rosettacode.org/wiki/UTF-8_encode_and_decode See example: https://play.golang.org/p/_1W0ni2uZWm

func GetScript

func GetScript(r rune) string

GetScript returns key to script in `unicode.Scripts`.

func ProcessOffsets

func ProcessOffsets(encoding *tokenizer.Encoding, addPrefixSpace bool) *tokenizer.Encoding

Types

type BertPreTokenizer

// BertPreTokenizer is a stateless pre-tokenizer for BERT-style
// models; see its PreTokenize method for the splitting behavior.
type BertPreTokenizer struct{}

func NewBertPreTokenizer

func NewBertPreTokenizer() *BertPreTokenizer

func (*BertPreTokenizer) PreTokenize

PreTokenize implements PreTokenizer interface for BertPreTokenizer

type ByteLevel

type ByteLevel struct {
	// AddPrefixSpace indicates whether to add a leading space to the
	// first word. It allows treating the leading word just as any
	// other word.
	AddPrefixSpace bool

	// TrimOffsets indicates whether the post-processing step should
	// trim offsets to avoid including whitespaces.
	TrimOffsets bool
}

ByteLevel provides all the necessary steps to handle the BPE tokenization at byte-level. It takes care of all the required processing steps to transform a utf-8 string as needed before and after the BPE model does its job.

func NewByteLevel

func NewByteLevel() *ByteLevel

NewByteLevel returns a default ByteLevel with both AddPrefixSpace and TrimOffsets set to true

func (*ByteLevel) AddedToken

func (bl *ByteLevel) AddedToken(isPair bool) int

func (*ByteLevel) Alphabet

func (bl *ByteLevel) Alphabet() map[string]struct{}

Alphabet returns set of first 256 unicode `char`

func (*ByteLevel) Decode

func (bl *ByteLevel) Decode(tokens []string) string

Decode converts any byte-level characters to their unicode counterpart before merging everything back into a single string

func (*ByteLevel) DecodeChain

func (bl *ByteLevel) DecodeChain(tokens []string) []string

func (*ByteLevel) PreTokenize

func (bl *ByteLevel) PreTokenize(pretokenized *tokenizer.PreTokenizedString) (*tokenizer.PreTokenizedString, error)

PreTokenize implements the `PreTokenizer` interface: `ByteLevel` is in charge of transforming all the unicode characters into their byte-level counterpart. It also splits the input according to the configured regex.

func (*ByteLevel) Process

func (bl *ByteLevel) Process(encoding, pairEncoding *tokenizer.Encoding, addSpecialTokens bool) *tokenizer.Encoding

func (*ByteLevel) SetAddPrefixSpace

func (bl *ByteLevel) SetAddPrefixSpace(v bool)

SetAddPrefixSpace sets the `AddPrefixSpace` property

func (*ByteLevel) SetTrimOffsets

func (bl *ByteLevel) SetTrimOffsets(v bool)

SetTrimOffsets sets the `TrimOffsets` property

type CharDelimiterSplit

// CharDelimiterSplit is a pre-tokenizer that splits the input on the
// configured Delimiter rune; see PreTokenize.
type CharDelimiterSplit struct {
	// Delimiter is the rune on which the input is split.
	Delimiter rune
}

func NewCharDelimiterSplit

func NewCharDelimiterSplit(delimiter rune) *CharDelimiterSplit

func (*CharDelimiterSplit) PreTokenize

type Digits

// Digits is a pre-tokenizer that separates digits from surrounding
// text; see PreTokenize.
type Digits struct {
	// IndividualDigits, when true, presumably splits each digit into
	// its own token instead of keeping digit runs together —
	// NOTE(review): confirm against PreTokenize.
	IndividualDigits bool
}

func DefaultDigits

func DefaultDigits() *Digits

func NewDigits

func NewDigits(individualDigits bool) *Digits

func (*Digits) PreTokenize

func (p *Digits) PreTokenize(pretokenized *tokenizer.PreTokenizedString) (*tokenizer.PreTokenizedString, error)

PreTokenize implements tokenizer.PreTokenizer.

type Metaspace

// Metaspace is a pre-tokenizer that replaces all whitespaces by the
// provided meta character and then splits on this character.
type Metaspace struct {
	// Replacement is the meta character substituted for whitespace.
	Replacement    string
	// AddPrefixSpace indicates whether to add a leading space before
	// pre-tokenizing — NOTE(review): confirm exact effect in PreTokenize.
	AddPrefixSpace bool
	// StrRep — purpose not visible here; presumably a cached string
	// form of Replacement. Verify against NewMetaspace.
	StrRep         string
}

Metaspace is a pre-tokenizer that replaces all the whitespaces by the provided meta character and then splits on this character.

func DefaultMetaspace

func DefaultMetaspace() *Metaspace

func NewMetaspace

func NewMetaspace(replacement string, addPrefixSpace bool) *Metaspace

func (*Metaspace) Decode

func (m *Metaspace) Decode(tokens []string) string

func (*Metaspace) DecodeChain

func (m *Metaspace) DecodeChain(tokens []string) []string

DecodeChain implements Decoder interface.

func (*Metaspace) GetReplacement

func (m *Metaspace) GetReplacement() string

func (*Metaspace) PreTokenize

func (m *Metaspace) PreTokenize(pretokenized *tokenizer.PreTokenizedString) (*tokenizer.PreTokenizedString, error)

PreTokenize implements PreTokenizer interface

func (*Metaspace) SetReplacement

func (m *Metaspace) SetReplacement(replacement string)

type Punctuation

// Punctuation is a pre-tokenizer that splits the input on
// punctuation characters; see PreTokenize.
type Punctuation struct {
	// Behavior controls how the punctuation delimiters are kept,
	// dropped, or merged when splitting
	// (see normalizer.SplitDelimiterBehavior).
	Behavior normalizer.SplitDelimiterBehavior
}

func DefaultPunctuation

func DefaultPunctuation() *Punctuation

func NewPunctuation

func NewPunctuation(behavior normalizer.SplitDelimiterBehavior) *Punctuation

func (*Punctuation) PreTokenize

func (p *Punctuation) PreTokenize(pretokenized *tokenizer.PreTokenizedString) (*tokenizer.PreTokenizedString, error)
Equivalent reference implementation from the upstream Rust `tokenizers` library:

impl PreTokenizer for Punctuation {
    fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
        pretokenized.split(|_, s| s.split(is_punc, self.behavior))
    }
}

PreTokenize implements tokenizer.PreTokenizer.

type Sequence

// Sequence chains multiple pre-tokenizers, applying them in order;
// construct it with NewSequence, which takes a []tokenizer.PreTokenizer.
type Sequence struct {
	// contains filtered or unexported fields
}

func NewSequence

func NewSequence(pretokenizers []tokenizer.PreTokenizer) *Sequence

func (*Sequence) PreTokenize

type Split

// Split is a generic pre-tokenizer that splits the input on a
// configurable pattern; see PreTokenize.
type Split struct {
	// Pattern is the pattern on which the input is split.
	Pattern  normalizer.Pattern
	// Behavior controls how matched delimiters are handled when
	// splitting (see normalizer.SplitDelimiterBehavior).
	Behavior normalizer.SplitDelimiterBehavior
	// Invert, when true, presumably inverts the pattern match (split
	// on non-matches) — NOTE(review): confirm in PreTokenize.
	Invert   bool
}

func NewSplit

func NewSplit(pattern normalizer.Pattern, behavior normalizer.SplitDelimiterBehavior, invert bool) *Split

func (*Split) PreTokenize

func (s *Split) PreTokenize(pretokenized *tokenizer.PreTokenizedString) (*tokenizer.PreTokenizedString, error)

type UnicodeScript

// UnicodeScript is a stateless pre-tokenizer; presumably it splits
// the input at unicode script boundaries (see GetScript and
// PreTokenize) — confirm against the implementation.
type UnicodeScript struct{}

func DefaultUnicodeScript

func DefaultUnicodeScript() *UnicodeScript

func NewUnicodeScript

func NewUnicodeScript() *UnicodeScript

func (*UnicodeScript) PreTokenize

func (us *UnicodeScript) PreTokenize(pretokenized *tokenizer.PreTokenizedString) (*tokenizer.PreTokenizedString, error)

type Whitespace

// Whitespace is a stateless pre-tokenizer; see PreTokenize for its
// exact splitting behavior.
type Whitespace struct{}

func DefaultWhitespace

func DefaultWhitespace() *Whitespace

func NewWhitespace

func NewWhitespace() *Whitespace

func (*Whitespace) PreTokenize

func (p *Whitespace) PreTokenize(pretokenized *tokenizer.PreTokenizedString) (*tokenizer.PreTokenizedString, error)

type WhitespaceSplit

// WhitespaceSplit is a stateless pre-tokenizer; presumably it splits
// on whitespace only (unlike Whitespace) — see PreTokenize to confirm.
type WhitespaceSplit struct{}

func NewWhitespaceSplit

func NewWhitespaceSplit() *WhitespaceSplit

func (*WhitespaceSplit) PreTokenize

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL