tokenizer

package module

Version: v0.0.0-...-33467e6 (latest) | Published: Feb 10, 2023 | License: MIT | Imports: 8 | Imported by: 2

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/go-aie/tokenizer

Links

Open Source Insights

README ¶

tokenizer

NLP tokenizers in Go.

Installation

$ go get -u github.com/go-aie/tokenizer

Documentation

Check out the documentation.

License

MIT

Documentation ¶

Index ¶

func NewWordLevel(vocab map[string]int, unkToken string) *wordlevel.WordLevel
type RuneLevel
- func NewRuneLevel(vocab RuneLevelVocab) *RuneLevel
- func (rl *RuneLevel) Tokenize(token string) ([]tokenizer.Token, error)
type RuneLevelVocab
type Tokenizer
- func (t *Tokenizer) EncodeBatchSerially(inputs []tokenizer.EncodeInput, addSpecialTokens bool) ([]tokenizer.Encoding, error)
- func (t *Tokenizer) EncodeBatchTexts(texts []string, addSpecialTokens bool) ([]tokenizer.Encoding, error)
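type Vocab
- func NewVocabFromFile[T constraints.Integer](filename, separator, unkToken string) (*Vocab[T], error)
- func NewVocabFromSlice[T constraints.Integer](lines []string, separator, unkToken string) (*Vocab[T], error)
- func (v *Vocab[T]) IDToToken(id T) (string, error)
- func (v *Vocab[T]) IDsToTokens(ids []T) (tokens []string, err error)
- func (v *Vocab[T]) TokenToID(token string) (T, error)
- func (v *Vocab[T]) TokensToIDs(tokens []string) (ids []T, err error)
- func (v *Vocab[T]) UnkToken() string
- func (v *Vocab[T]) Vocab() map[string]T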

Constants ¶

This section is empty.

Variables ¶

This section is empty.

Functions ¶

func NewWordLevel ¶

func NewWordLevel(vocab map[string]int, unkToken string) *wordlevel.WordLevel

NewWordLevel creates a WordLevel model from a given vocab.

Types ¶

type RuneLevel ¶

type RuneLevel struct {
	*wordlevel.WordLevel
	// contains filtered or unexported fields
}

RuneLevel is a model tokenizer that splits each word into runes and maps runes to IDs.

func NewRuneLevel ¶

func NewRuneLevel(vocab RuneLevelVocab) *RuneLevel

func (*RuneLevel) Tokenize ¶

func (rl *RuneLevel) Tokenize(token string) ([]tokenizer.Token, error)

Tokenize transforms given input token into a list of rune-level sub-tokens.

type RuneLevelVocab ¶

type RuneLevelVocab interface {
	Vocab() map[string]int
	UnkToken() string
	TokenToID(token string) (int, error)
}

type Tokenizer ¶

type Tokenizer struct {
	*tokenizer.Tokenizer
}

func (*Tokenizer) EncodeBatchSerially ¶

func (t *Tokenizer) EncodeBatchSerially(inputs []tokenizer.EncodeInput, addSpecialTokens bool) ([]tokenizer.Encoding, error)

EncodeBatchSerially encodes all sentences serially.

func (*Tokenizer) EncodeBatchTexts ¶

func (t *Tokenizer) EncodeBatchTexts(texts []string, addSpecialTokens bool) ([]tokenizer.Encoding, error)

type Vocab ¶

type Vocab[T constraints.Integer] struct {
	// contains filtered or unexported fields
}

func NewVocabFromFile ¶

func NewVocabFromFile[T constraints.Integer](filename, separator, unkToken string) (*Vocab[T], error)

func NewVocabFromSlice ¶

func NewVocabFromSlice[T constraints.Integer](lines []string, separator, unkToken string) (*Vocab[T], error)

func (*Vocab[T]) IDToToken ¶

func (v *Vocab[T]) IDToToken(id T) (string, error)

func (*Vocab[T]) IDsToTokens ¶

func (v *Vocab[T]) IDsToTokens(ids []T) (tokens []string, err error)

func (*Vocab[T]) TokenToID ¶

func (v *Vocab[T]) TokenToID(token string) (T, error)

func (*Vocab[T]) TokensToIDs ¶

func (v *Vocab[T]) TokensToIDs(tokens []string) (ids []T, err error)

func (*Vocab[T]) UnkToken ¶

func (v *Vocab[T]) UnkToken() string

func (*Vocab[T]) Vocab ¶

func (v *Vocab[T]) Vocab() map[string]T

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL