sentencepiece

package
v1.1.1 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jan 21, 2021 License: MIT Imports: 14 Imported by: 4

Documentation

Index

Constants

View Source
// Default values for TrainerSpec fields.
//
// Each constant carries the Go type of the TrainerSpec field it belongs to
// (int32, float32, bool, string, or TrainerSpec_ModelType).
const (
	Default_TrainerSpec_ModelType                  = TrainerSpec_UNIGRAM
	Default_TrainerSpec_VocabSize                  = int32(8000)
	Default_TrainerSpec_SelfTestSampleSize         = int32(0)
	Default_TrainerSpec_CharacterCoverage          = float32(0.9994999766349792) // float32 rendering of the 0.9995 default declared in the field's protobuf tag
	Default_TrainerSpec_InputSentenceSize          = int32(0)
	Default_TrainerSpec_ShuffleInputSentence       = bool(true)
	Default_TrainerSpec_SeedSentencepieceSize      = int32(1000000)
	Default_TrainerSpec_ShrinkingFactor            = float32(0.75)
	Default_TrainerSpec_MaxSentenceLength          = int32(4192)
	Default_TrainerSpec_NumThreads                 = int32(16)
	Default_TrainerSpec_NumSubIterations           = int32(2)
	Default_TrainerSpec_MaxSentencepieceLength     = int32(16)
	Default_TrainerSpec_SplitByUnicodeScript       = bool(true)
	Default_TrainerSpec_SplitByNumber              = bool(true)
	Default_TrainerSpec_SplitByWhitespace          = bool(true)
	Default_TrainerSpec_TreatWhitespaceAsSuffix    = bool(false)
	Default_TrainerSpec_SplitDigits                = bool(false)
	Default_TrainerSpec_ByteFallback               = bool(false)
	Default_TrainerSpec_VocabularyOutputPieceScore = bool(true)
	Default_TrainerSpec_HardVocabLimit             = bool(true)
	Default_TrainerSpec_UseAllVocab                = bool(false)
	Default_TrainerSpec_UnkId                      = int32(0)
	Default_TrainerSpec_BosId                      = int32(1)
	Default_TrainerSpec_EosId                      = int32(2)
	Default_TrainerSpec_PadId                      = int32(-1) // the TrainerSpec field comments state that -1 "is not used"
	Default_TrainerSpec_UnkPiece                   = string("<unk>")
	Default_TrainerSpec_BosPiece                   = string("<s>")
	Default_TrainerSpec_EosPiece                   = string("</s>")
	Default_TrainerSpec_PadPiece                   = string("<pad>")
	Default_TrainerSpec_UnkSurface                 = string(" ⁇ ")
	Default_TrainerSpec_TrainExtremelyLargeCorpus  = bool(false)
)

Default values for TrainerSpec fields.

View Source
// Default values for NormalizerSpec fields.
//
// All three boolean options default to true.
const (
	Default_NormalizerSpec_AddDummyPrefix         = bool(true)
	Default_NormalizerSpec_RemoveExtraWhitespaces = bool(true)
	Default_NormalizerSpec_EscapeWhitespaces      = bool(true)
)

Default values for NormalizerSpec fields.

View Source
// Default values for ModelProto_SentencePiece fields.
//
// The Type field defaults to ModelProto_SentencePiece_NORMAL.
const (
	Default_ModelProto_SentencePiece_Type = ModelProto_SentencePiece_NORMAL
)

Default values for ModelProto_SentencePiece fields.

Variables

View Source
// TrainerSpec_ModelType_name maps each TrainerSpec_ModelType enum number to
// its name.
var TrainerSpec_ModelType_name = map[int32]string{
	1: "UNIGRAM",
	2: "BPE",
	3: "WORD",
	4: "CHAR",
}

// TrainerSpec_ModelType_value is the inverse of TrainerSpec_ModelType_name:
// it maps each enum name to its number.
var TrainerSpec_ModelType_value = map[string]int32{
	"UNIGRAM": 1,
	"BPE":     2,
	"WORD":    3,
	"CHAR":    4,
}

Enum value maps for TrainerSpec_ModelType.

View Source
// ModelProto_SentencePiece_Type_name maps each ModelProto_SentencePiece_Type
// enum number to its name.
var ModelProto_SentencePiece_Type_name = map[int32]string{
	1: "NORMAL",
	2: "UNKNOWN",
	3: "CONTROL",
	4: "USER_DEFINED",
	5: "UNUSED",
	6: "BYTE",
}

// ModelProto_SentencePiece_Type_value is the inverse of
// ModelProto_SentencePiece_Type_name: it maps each enum name to its number.
var ModelProto_SentencePiece_Type_value = map[string]int32{
	"NORMAL":       1,
	"UNKNOWN":      2,
	"CONTROL":      3,
	"USER_DEFINED": 4,
	"UNUSED":       5,
	"BYTE":         6,
}

Enum value maps for ModelProto_SentencePiece_Type.

View Source
// File_sentencepiece_sentencepiece_model_proto is a package-level
// protoreflect.FileDescriptor.
// NOTE(review): the name suggests it describes
// sentencepiece/sentencepiece_model.proto — confirm against the generated file.
var File_sentencepiece_sentencepiece_model_proto protoreflect.FileDescriptor

Functions

This section is empty.

Types

type ModelProto

// ModelProto stores model parameters.
//
// SentencePieceProcessor is supposed to be self-contained: all settings and
// parameters which may change its behavior must be encoded in ModelProto.
type ModelProto struct {

	// Sentence pieces with scores.
	Pieces []*ModelProto_SentencePiece `protobuf:"bytes,1,rep,name=pieces" json:"pieces,omitempty"`
	// Spec used to generate this model file.
	TrainerSpec *TrainerSpec `protobuf:"bytes,2,opt,name=trainer_spec,json=trainerSpec" json:"trainer_spec,omitempty"`
	// Spec for text normalization.
	NormalizerSpec *NormalizerSpec `protobuf:"bytes,3,opt,name=normalizer_spec,json=normalizerSpec" json:"normalizer_spec,omitempty"`
	// Stores sample input and its expected segmentation to verify the model.
	SelfTestData *SelfTestData `protobuf:"bytes,4,opt,name=self_test_data,json=selfTestData" json:"self_test_data,omitempty"`
	// Spec for text de-normalization. It uses the same NormalizerSpec type as
	// the NormalizerSpec field.
	DenormalizerSpec *NormalizerSpec `protobuf:"bytes,5,opt,name=denormalizer_spec,json=denormalizerSpec" json:"denormalizer_spec,omitempty"`
	// contains filtered or unexported fields
}

ModelProto stores model parameters. SentencePieceProcessor is supposed to be self-contained. All settings/parameters which may change the behavior must be encoded in ModelProto.

func (*ModelProto) Descriptor deprecated

func (*ModelProto) Descriptor() ([]byte, []int)

Deprecated: Use ModelProto.ProtoReflect.Descriptor instead.

func (*ModelProto) ExtensionRangeArray deprecated

func (*ModelProto) ExtensionRangeArray() []protoiface.ExtensionRangeV1

Deprecated: Use ModelProto.ProtoReflect.Descriptor.ExtensionRanges instead.

func (*ModelProto) GetDenormalizerSpec

func (x *ModelProto) GetDenormalizerSpec() *NormalizerSpec

func (*ModelProto) GetNormalizerSpec

func (x *ModelProto) GetNormalizerSpec() *NormalizerSpec

func (*ModelProto) GetPieces

func (x *ModelProto) GetPieces() []*ModelProto_SentencePiece

func (*ModelProto) GetSelfTestData

func (x *ModelProto) GetSelfTestData() *SelfTestData

func (*ModelProto) GetTrainerSpec

func (x *ModelProto) GetTrainerSpec() *TrainerSpec

func (*ModelProto) ProtoMessage

func (*ModelProto) ProtoMessage()

func (*ModelProto) ProtoReflect

func (x *ModelProto) ProtoReflect() protoreflect.Message

func (*ModelProto) Reset

func (x *ModelProto) Reset()

func (*ModelProto) String

func (x *ModelProto) String() string

type ModelProto_SentencePiece

// ModelProto_SentencePiece is one sentence piece of a ModelProto: its text
// (Piece), its Score, and its Type.
//
// All three fields are optional pointers. The protobuf tag on Type declares
// def=1, which is the value of ModelProto_SentencePiece_NORMAL.
type ModelProto_SentencePiece struct {
	Piece *string                        `protobuf:"bytes,1,opt,name=piece" json:"piece,omitempty"` // piece must not be empty.
	Score *float32                       `protobuf:"fixed32,2,opt,name=score" json:"score,omitempty"`
	Type  *ModelProto_SentencePiece_Type `protobuf:"varint,3,opt,name=type,enum=sentencepiece.ModelProto_SentencePiece_Type,def=1" json:"type,omitempty"`
	// contains filtered or unexported fields
}

func (*ModelProto_SentencePiece) Descriptor deprecated

func (*ModelProto_SentencePiece) Descriptor() ([]byte, []int)

Deprecated: Use ModelProto_SentencePiece.ProtoReflect.Descriptor instead.

func (*ModelProto_SentencePiece) ExtensionRangeArray deprecated

func (*ModelProto_SentencePiece) ExtensionRangeArray() []protoiface.ExtensionRangeV1

Deprecated: Use ModelProto_SentencePiece.ProtoReflect.Descriptor.ExtensionRanges instead.

func (*ModelProto_SentencePiece) GetPiece

func (x *ModelProto_SentencePiece) GetPiece() string

func (*ModelProto_SentencePiece) GetScore

func (x *ModelProto_SentencePiece) GetScore() float32

func (*ModelProto_SentencePiece) GetType

func (*ModelProto_SentencePiece) ProtoMessage

func (*ModelProto_SentencePiece) ProtoMessage()

func (*ModelProto_SentencePiece) ProtoReflect

func (x *ModelProto_SentencePiece) ProtoReflect() protoreflect.Message

func (*ModelProto_SentencePiece) Reset

func (x *ModelProto_SentencePiece) Reset()

func (*ModelProto_SentencePiece) String

func (x *ModelProto_SentencePiece) String() string

type ModelProto_SentencePiece_Type

// ModelProto_SentencePiece_Type is the enum type of the Type field of
// ModelProto_SentencePiece; it classifies a sentence piece.
type ModelProto_SentencePiece_Type int32
const (
	ModelProto_SentencePiece_NORMAL       ModelProto_SentencePiece_Type = 1 // normal symbol
	ModelProto_SentencePiece_UNKNOWN      ModelProto_SentencePiece_Type = 2 // unknown symbol. only <unk> for now.
	ModelProto_SentencePiece_CONTROL      ModelProto_SentencePiece_Type = 3 // control symbols. </s>, <s>, <2ja> etc.
	ModelProto_SentencePiece_USER_DEFINED ModelProto_SentencePiece_Type = 4 // user defined symbols; typical usage is as a placeholder.
	// BYTE (6) is declared before UNUSED (5), so declaration order is not
	// numeric order.
	ModelProto_SentencePiece_BYTE   ModelProto_SentencePiece_Type = 6 // byte symbols. Used when `byte_fallback` is true.
	ModelProto_SentencePiece_UNUSED ModelProto_SentencePiece_Type = 5 // this piece is not used.
)

func (ModelProto_SentencePiece_Type) Descriptor

func (ModelProto_SentencePiece_Type) Enum

func (ModelProto_SentencePiece_Type) EnumDescriptor deprecated

func (ModelProto_SentencePiece_Type) EnumDescriptor() ([]byte, []int)

Deprecated: Use ModelProto_SentencePiece_Type.Descriptor instead.

func (ModelProto_SentencePiece_Type) Number

func (ModelProto_SentencePiece_Type) String

func (ModelProto_SentencePiece_Type) Type

func (*ModelProto_SentencePiece_Type) UnmarshalJSON deprecated

func (x *ModelProto_SentencePiece_Type) UnmarshalJSON(b []byte) error

Deprecated: Do not use.

type NormalizerSpec

// NormalizerSpec encodes various parameters for string normalization.
type NormalizerSpec struct {

	// Name of the normalization rule.
	Name *string `protobuf:"bytes,1,opt,name=name" json:"name,omitempty"`
	// Pre-compiled normalization rule created by the
	// Builder::GetPrecompiledCharsMap() or Builder::CompileCharsMap() method.
	// Usually this field is set by the Builder::GetNormalizerSpec() method.
	PrecompiledCharsmap []byte `protobuf:"bytes,2,opt,name=precompiled_charsmap,json=precompiledCharsmap" json:"precompiled_charsmap,omitempty"`
	// Adds a dummy whitespace at the beginning of the text in order to
	// treat "world" in "world" and "hello world" in the same way.
	AddDummyPrefix *bool `protobuf:"varint,3,opt,name=add_dummy_prefix,json=addDummyPrefix,def=1" json:"add_dummy_prefix,omitempty"`
	// Removes leading, trailing, and duplicate internal whitespace.
	RemoveExtraWhitespaces *bool `` /* 129-byte string literal not displayed */
	// Replaces whitespace with a meta symbol.
	// This field must be true to train a sentence piece model.
	EscapeWhitespaces *bool `protobuf:"varint,5,opt,name=escape_whitespaces,json=escapeWhitespaces,def=1" json:"escape_whitespaces,omitempty"`
	// Custom normalization rule file in TSV format.
	// https://github.com/google/sentencepiece/blob/master/doc/normalization.md
	// This field is only used in the SentencePieceTrainer::Train() method, which
	// compiles the rule into the binary rule stored in `precompiled_charsmap`.
	NormalizationRuleTsv *string `protobuf:"bytes,6,opt,name=normalization_rule_tsv,json=normalizationRuleTsv" json:"normalization_rule_tsv,omitempty"`
	// contains filtered or unexported fields
}

NormalizerSpec encodes various parameters for string normalization.

func (*NormalizerSpec) Descriptor deprecated

func (*NormalizerSpec) Descriptor() ([]byte, []int)

Deprecated: Use NormalizerSpec.ProtoReflect.Descriptor instead.

func (*NormalizerSpec) ExtensionRangeArray deprecated

func (*NormalizerSpec) ExtensionRangeArray() []protoiface.ExtensionRangeV1

Deprecated: Use NormalizerSpec.ProtoReflect.Descriptor.ExtensionRanges instead.

func (*NormalizerSpec) GetAddDummyPrefix

func (x *NormalizerSpec) GetAddDummyPrefix() bool

func (*NormalizerSpec) GetEscapeWhitespaces

func (x *NormalizerSpec) GetEscapeWhitespaces() bool

func (*NormalizerSpec) GetName

func (x *NormalizerSpec) GetName() string

func (*NormalizerSpec) GetNormalizationRuleTsv

func (x *NormalizerSpec) GetNormalizationRuleTsv() string

func (*NormalizerSpec) GetPrecompiledCharsmap

func (x *NormalizerSpec) GetPrecompiledCharsmap() []byte

func (*NormalizerSpec) GetRemoveExtraWhitespaces

func (x *NormalizerSpec) GetRemoveExtraWhitespaces() bool

func (*NormalizerSpec) ProtoMessage

func (*NormalizerSpec) ProtoMessage()

func (*NormalizerSpec) ProtoReflect

func (x *NormalizerSpec) ProtoReflect() protoreflect.Message

func (*NormalizerSpec) Reset

func (x *NormalizerSpec) Reset()

func (*NormalizerSpec) String

func (x *NormalizerSpec) String() string

type SelfTestData

// SelfTestData is the proto that stores samples for self-testing.
type SelfTestData struct {
	// Samples is the repeated list of self-test samples.
	Samples []*SelfTestData_Sample `protobuf:"bytes,1,rep,name=samples" json:"samples,omitempty"`
	// contains filtered or unexported fields
}

Proto to store samples for self-testing.

func (*SelfTestData) Descriptor deprecated

func (*SelfTestData) Descriptor() ([]byte, []int)

Deprecated: Use SelfTestData.ProtoReflect.Descriptor instead.

func (*SelfTestData) ExtensionRangeArray deprecated

func (*SelfTestData) ExtensionRangeArray() []protoiface.ExtensionRangeV1

Deprecated: Use SelfTestData.ProtoReflect.Descriptor.ExtensionRanges instead.

func (*SelfTestData) GetSamples

func (x *SelfTestData) GetSamples() []*SelfTestData_Sample

func (*SelfTestData) ProtoMessage

func (*SelfTestData) ProtoMessage()

func (*SelfTestData) ProtoReflect

func (x *SelfTestData) ProtoReflect() protoreflect.Message

func (*SelfTestData) Reset

func (x *SelfTestData) Reset()

func (*SelfTestData) String

func (x *SelfTestData) String() string

type SelfTestData_Sample

// SelfTestData_Sample is one self-test sample: an Input string paired with
// its Expected string.
// NOTE(review): ModelProto's comment on its SelfTestData field describes the
// data as "sample input and its expected segmentation"; the encoding of
// Expected is not shown here — confirm.
type SelfTestData_Sample struct {
	Input    *string `protobuf:"bytes,1,opt,name=input" json:"input,omitempty"`
	Expected *string `protobuf:"bytes,2,opt,name=expected" json:"expected,omitempty"`
	// contains filtered or unexported fields
}

func (*SelfTestData_Sample) Descriptor deprecated

func (*SelfTestData_Sample) Descriptor() ([]byte, []int)

Deprecated: Use SelfTestData_Sample.ProtoReflect.Descriptor instead.

func (*SelfTestData_Sample) GetExpected

func (x *SelfTestData_Sample) GetExpected() string

func (*SelfTestData_Sample) GetInput

func (x *SelfTestData_Sample) GetInput() string

func (*SelfTestData_Sample) ProtoMessage

func (*SelfTestData_Sample) ProtoMessage()

func (*SelfTestData_Sample) ProtoReflect

func (x *SelfTestData_Sample) ProtoReflect() protoreflect.Message

func (*SelfTestData_Sample) Reset

func (x *SelfTestData_Sample) Reset()

func (*SelfTestData_Sample) String

func (x *SelfTestData_Sample) String() string

type Sentencepiece

// Sentencepiece holds the model.
//
// It has no exported fields; values are obtained from NewEmptySentencepiece
// or NewSentencepieceFromFile.
type Sentencepiece struct {
	// contains filtered or unexported fields
}

Sentencepiece holds the model

func NewEmptySentencepiece

func NewEmptySentencepiece(lowercase bool) Sentencepiece

NewEmptySentencepiece creates an empty sentencepiece model

func NewSentencepieceFromFile

func NewSentencepieceFromFile(filename string, lowercase bool) (Sentencepiece, error)

NewSentencepieceFromFile creates sentencepiece from file.

func (*Sentencepiece) GetControlWord added in v1.1.0

func (s *Sentencepiece) GetControlWord(word string) (int32, bool)

GetControlWord gets the index for the given control word

func (*Sentencepiece) GetUnknownIndex added in v1.1.0

func (s *Sentencepiece) GetUnknownIndex() int32

GetUnknownIndex gets the index of the unknown id

func (*Sentencepiece) SetControlWord added in v1.1.0

func (s *Sentencepiece) SetControlWord(word string, index int32)

SetControlWord sets the index for the given control word

func (*Sentencepiece) SetUnknownIndex

func (s *Sentencepiece) SetUnknownIndex(index int32)

SetUnknownIndex sets the index for the unknown id

func (*Sentencepiece) Tokenize

func (s *Sentencepiece) Tokenize(text string) []Token

Tokenize tokenizes text into pieces

func (*Sentencepiece) TokenizeToIDs

func (s *Sentencepiece) TokenizeToIDs(text string) []int32

TokenizeToIDs tokenizes text into ids from the vocab

type Token

// Token holds a unit of a tokenized word. Sentencepiece.Tokenize returns its
// result as a slice of Token values.
type Token struct {
	ID   int32  // presumably the piece's ID in the vocabulary — confirm against Tokenize
	Text string // text of the piece
}

Token holds a unit of a tokenized word

type TrainerSpec

// TrainerSpec encodes various parameters for SentencePiece training.
//
// Optional scalar fields are pointers; the defaults declared in the protobuf
// tags (def=...) are also exposed as the Default_TrainerSpec_* constants.
type TrainerSpec struct {

	///////////////////////////////////////////////////////////////////
	// General parameters
	//
	// Input corpus files.
	//  Trainer accepts the following two formats:
	//  A) Monolingual: plain text, one sentence per line.
	//  B) Bilingual:   TSV, source sentence <tab> target sentence
	//  When bilingual data is passed, a shared vocabulary model is built.
	//  Note that the input file must be a raw corpus, not a preprocessed corpus.
	//  Trainer only loads the first `input_sentence_size` sentences specified
	//  with this parameter.
	Input []string `protobuf:"bytes,1,rep,name=input" json:"input,omitempty"`
	// Input corpus format:
	// "text": one-sentence-per-line text format (default)
	// "tsv":  sentence <tab> freq
	InputFormat *string `protobuf:"bytes,7,opt,name=input_format,json=inputFormat" json:"input_format,omitempty"`
	// Output model file prefix.
	// <model_prefix>.model and <model_prefix>.vocab are generated.
	ModelPrefix *string                `protobuf:"bytes,2,opt,name=model_prefix,json=modelPrefix" json:"model_prefix,omitempty"`
	ModelType   *TrainerSpec_ModelType `` /* 129-byte string literal not displayed */
	// Vocabulary size. 8k is the default size.
	VocabSize *int32 `protobuf:"varint,4,opt,name=vocab_size,json=vocabSize,def=8000" json:"vocab_size,omitempty"`
	// List of the languages this model can accept.
	// Since the model is language-agnostic, this field is used as a reference.
	AcceptLanguage []string `protobuf:"bytes,5,rep,name=accept_language,json=acceptLanguage" json:"accept_language,omitempty"`
	// Size of self-test samples, which are encoded in the model file.
	SelfTestSampleSize *int32 `protobuf:"varint,6,opt,name=self_test_sample_size,json=selfTestSampleSize,def=0" json:"self_test_sample_size,omitempty"`
	///////////////////////////////////////////////////////////////////
	// Training parameters.
	//
	// Uses characters which cover the corpus with the ratio of
	// `character_coverage`.
	// This parameter determines the set of basic Alphabet of sentence piece.
	// 1.0 - `character_coverage` characters are treated as UNK.
	// See also required_chars field.
	CharacterCoverage *float32 `protobuf:"fixed32,10,opt,name=character_coverage,json=characterCoverage,def=0.9995" json:"character_coverage,omitempty"`
	// Maximum size of sentences the trainer loads from `input` parameter.
	// Trainer simply loads the `input` files in sequence.
	// It is better to shuffle the input corpus randomly.
	InputSentenceSize    *int32 `protobuf:"varint,11,opt,name=input_sentence_size,json=inputSentenceSize,def=0" json:"input_sentence_size,omitempty"`
	ShuffleInputSentence *bool  `protobuf:"varint,19,opt,name=shuffle_input_sentence,json=shuffleInputSentence,def=1" json:"shuffle_input_sentence,omitempty"`
	// Maximum size of sentences to make seed sentence pieces.
	// Extended suffix array is constructed to extract frequent
	// sub-strings from the corpus. This uses 20N working space,
	// where N is the size of corpus.
	//
	// Deprecated: Do not use.
	MiningSentenceSize *int32 `protobuf:"varint,12,opt,name=mining_sentence_size,json=miningSentenceSize" json:"mining_sentence_size,omitempty"`
	// Maximum size of sentences to train sentence pieces.
	//
	// Deprecated: Do not use.
	TrainingSentenceSize *int32 `protobuf:"varint,13,opt,name=training_sentence_size,json=trainingSentenceSize" json:"training_sentence_size,omitempty"`
	// The size of seed sentencepieces.
	// `seed_sentencepiece_size` must be larger than `vocab_size`.
	SeedSentencepieceSize *int32 `` /* 133-byte string literal not displayed */
	// In every EM sub-iteration, keeps the top
	// `shrinking_factor` * `current sentencepieces size` with respect to
	// the loss of the sentence piece. This value should be smaller than 1.0.
	ShrinkingFactor *float32 `protobuf:"fixed32,15,opt,name=shrinking_factor,json=shrinkingFactor,def=0.75" json:"shrinking_factor,omitempty"`
	// The maximum sentence length in bytes. Sentences with a length
	// larger than `max_sentence_length` are simply ignored.
	// Longer input tends to bring the following risks:
	//  * Overflow during EM training (unigram language model only)
	//  * Performance drop because of O(n log n) cost in BPE.
	MaxSentenceLength *int32 `protobuf:"varint,18,opt,name=max_sentence_length,json=maxSentenceLength,def=4192" json:"max_sentence_length,omitempty"`
	// Number of threads in the training.
	NumThreads *int32 `protobuf:"varint,16,opt,name=num_threads,json=numThreads,def=16" json:"num_threads,omitempty"`
	// Number of EM sub iterations.
	NumSubIterations *int32 `protobuf:"varint,17,opt,name=num_sub_iterations,json=numSubIterations,def=2" json:"num_sub_iterations,omitempty"`
	///////////////////////////////////////////////////////////////////
	// SentencePiece parameters which control the shapes of sentence piece.
	//
	// Maximum length of sentencepiece.
	MaxSentencepieceLength *int32 `` /* 131-byte string literal not displayed */
	// Uses Unicode script to split sentence pieces.
	// When `split_by_unicode_script` is true, we do not allow sentence piece to
	// include multiple Unicode scripts, e.g. "F1" is not a valid piece.
	// Exception: CJ characters (Hiragana/Katakana/Han) are all handled
	// as one script type, since Japanese word can consist of multiple scripts.
	// This exception is always applied regardless of the accept-language
	// parameter.
	SplitByUnicodeScript *bool `` /* 126-byte string literal not displayed */
	// When `split_by_number` is true, put a boundary between number and
	// non-number transition. If we want to treat "F1" as one token, set this flag
	// to be false.
	SplitByNumber *bool `protobuf:"varint,23,opt,name=split_by_number,json=splitByNumber,def=1" json:"split_by_number,omitempty"`
	// Use a white space to split sentence pieces.
	// When `split_by_whitespace` is false, we may have the piece containing
	// a white space in the middle. e.g., "in_the".
	SplitByWhitespace *bool `protobuf:"varint,22,opt,name=split_by_whitespace,json=splitByWhitespace,def=1" json:"split_by_whitespace,omitempty"`
	// Adds whitespace symbol (_) as a suffix instead of prefix. e.g., _hello =>
	// hello_. When `treat_whitespace_as_suffix` is true,
	// NormalizerSpec::add_dummy_prefix will add the dummy whitespace to the end
	// of sentence.
	TreatWhitespaceAsSuffix *bool `` /* 135-byte string literal not displayed */
	// Split all digits (0-9) into separate pieces.
	SplitDigits *bool `protobuf:"varint,25,opt,name=split_digits,json=splitDigits,def=0" json:"split_digits,omitempty"`
	///////////////////////////////////////////////////////////////////
	// Vocabulary management
	//
	// Defines control symbols used as an indicator to
	// change the behavior of the decoder. <s> and </s> are pre-defined.
	// We can use this field to encode various meta information,
	// including language indicator in multilingual model.
	// These symbols are not visible to users, but visible to
	// the decoder. Note that when the input sentence contains control symbols,
	// they are not treated as one token, but segmented into normal pieces.
	// Control symbols must be inserted independently from the segmentation.
	ControlSymbols []string `protobuf:"bytes,30,rep,name=control_symbols,json=controlSymbols" json:"control_symbols,omitempty"`
	// Defines user defined symbols.
	// These symbols are added with extremely high score
	// so they are always treated as one unique symbol in any context.
	// Typical usage of user_defined_symbols is placeholder for named entities.
	UserDefinedSymbols []string `protobuf:"bytes,31,rep,name=user_defined_symbols,json=userDefinedSymbols" json:"user_defined_symbols,omitempty"`
	// Defines required characters. Each UTF8 character in this string is included
	// in the character set regardless of character_coverage value. Unlike
	// user_defined_symbols, these characters have scores based on the frequency
	// on input sentences, and the model can form subwords using characters
	// in this field.
	RequiredChars *string `protobuf:"bytes,36,opt,name=required_chars,json=requiredChars" json:"required_chars,omitempty"`
	// Decomposes unknown pieces into UTF-8 bytes.
	ByteFallback *bool `protobuf:"varint,35,opt,name=byte_fallback,json=byteFallback,def=0" json:"byte_fallback,omitempty"`
	// When creating the vocabulary file, defines whether or not to additionally
	// output the score for each piece.
	VocabularyOutputPieceScore *bool `` /* 144-byte string literal not displayed */
	// `vocab_size` is treated as a hard limit. Crash if
	// the model can not produce the vocab of size `vocab_size`.
	// When `hard_vocab_limit` is false, vocab_size is treated
	// as a soft limit. Note that when model_type=char,
	// always assumes hard_vocab_limit = false.
	HardVocabLimit *bool `protobuf:"varint,33,opt,name=hard_vocab_limit,json=hardVocabLimit,def=1" json:"hard_vocab_limit,omitempty"`
	// Use all symbols for vocab extraction. This flag is valid
	// if model type is either CHAR or WORD.
	UseAllVocab *bool `protobuf:"varint,34,opt,name=use_all_vocab,json=useAllVocab,def=0" json:"use_all_vocab,omitempty"`
	///////////////////////////////////////////////////////////////////
	// Reserved special meta tokens.
	// * -1 is not used.
	// * unk_id must not be -1.
	// Ids must start at 0 and be contiguous.
	UnkId    *int32  `protobuf:"varint,40,opt,name=unk_id,json=unkId,def=0" json:"unk_id,omitempty"`  // <unk>
	BosId    *int32  `protobuf:"varint,41,opt,name=bos_id,json=bosId,def=1" json:"bos_id,omitempty"`  // <s>
	EosId    *int32  `protobuf:"varint,42,opt,name=eos_id,json=eosId,def=2" json:"eos_id,omitempty"`  // </s>
	PadId    *int32  `protobuf:"varint,43,opt,name=pad_id,json=padId,def=-1" json:"pad_id,omitempty"` // <pad> (padding)
	UnkPiece *string `protobuf:"bytes,45,opt,name=unk_piece,json=unkPiece,def=<unk>" json:"unk_piece,omitempty"`
	BosPiece *string `protobuf:"bytes,46,opt,name=bos_piece,json=bosPiece,def=<s>" json:"bos_piece,omitempty"`
	EosPiece *string `protobuf:"bytes,47,opt,name=eos_piece,json=eosPiece,def=</s>" json:"eos_piece,omitempty"`
	PadPiece *string `protobuf:"bytes,48,opt,name=pad_piece,json=padPiece,def=<pad>" json:"pad_piece,omitempty"`
	// Encodes <unk> into U+2047 (DOUBLE QUESTION MARK),
	// since this character can be useful both for user and
	// developer. We can easily figure out that <unk> is emitted.
	UnkSurface *string `protobuf:"bytes,44,opt,name=unk_surface,json=unkSurface,def= ⁇ " json:"unk_surface,omitempty"`
	// Increase bit depth to allow unigram model training on large
	// (>10M sentences) corpora. A side effect of enabling this flag
	// is increased memory usage.
	TrainExtremelyLargeCorpus *bool `` /* 141-byte string literal not displayed */
	// contains filtered or unexported fields
}

TrainerSpec encodes various parameters for SentencePiece training.

func (*TrainerSpec) Descriptor deprecated

func (*TrainerSpec) Descriptor() ([]byte, []int)

Deprecated: Use TrainerSpec.ProtoReflect.Descriptor instead.

func (*TrainerSpec) ExtensionRangeArray deprecated

func (*TrainerSpec) ExtensionRangeArray() []protoiface.ExtensionRangeV1

Deprecated: Use TrainerSpec.ProtoReflect.Descriptor.ExtensionRanges instead.

func (*TrainerSpec) GetAcceptLanguage

func (x *TrainerSpec) GetAcceptLanguage() []string

func (*TrainerSpec) GetBosId

func (x *TrainerSpec) GetBosId() int32

func (*TrainerSpec) GetBosPiece

func (x *TrainerSpec) GetBosPiece() string

func (*TrainerSpec) GetByteFallback

func (x *TrainerSpec) GetByteFallback() bool

func (*TrainerSpec) GetCharacterCoverage

func (x *TrainerSpec) GetCharacterCoverage() float32

func (*TrainerSpec) GetControlSymbols

func (x *TrainerSpec) GetControlSymbols() []string

func (*TrainerSpec) GetEosId

func (x *TrainerSpec) GetEosId() int32

func (*TrainerSpec) GetEosPiece

func (x *TrainerSpec) GetEosPiece() string

func (*TrainerSpec) GetHardVocabLimit

func (x *TrainerSpec) GetHardVocabLimit() bool

func (*TrainerSpec) GetInput

func (x *TrainerSpec) GetInput() []string

func (*TrainerSpec) GetInputFormat

func (x *TrainerSpec) GetInputFormat() string

func (*TrainerSpec) GetInputSentenceSize

func (x *TrainerSpec) GetInputSentenceSize() int32

func (*TrainerSpec) GetMaxSentenceLength

func (x *TrainerSpec) GetMaxSentenceLength() int32

func (*TrainerSpec) GetMaxSentencepieceLength

func (x *TrainerSpec) GetMaxSentencepieceLength() int32

func (*TrainerSpec) GetMiningSentenceSize deprecated

func (x *TrainerSpec) GetMiningSentenceSize() int32

Deprecated: Do not use.

func (*TrainerSpec) GetModelPrefix

func (x *TrainerSpec) GetModelPrefix() string

func (*TrainerSpec) GetModelType

func (x *TrainerSpec) GetModelType() TrainerSpec_ModelType

func (*TrainerSpec) GetNumSubIterations

func (x *TrainerSpec) GetNumSubIterations() int32

func (*TrainerSpec) GetNumThreads

func (x *TrainerSpec) GetNumThreads() int32

func (*TrainerSpec) GetPadId

func (x *TrainerSpec) GetPadId() int32

func (*TrainerSpec) GetPadPiece

func (x *TrainerSpec) GetPadPiece() string

func (*TrainerSpec) GetRequiredChars

func (x *TrainerSpec) GetRequiredChars() string

func (*TrainerSpec) GetSeedSentencepieceSize

func (x *TrainerSpec) GetSeedSentencepieceSize() int32

func (*TrainerSpec) GetSelfTestSampleSize

func (x *TrainerSpec) GetSelfTestSampleSize() int32

func (*TrainerSpec) GetShrinkingFactor

func (x *TrainerSpec) GetShrinkingFactor() float32

func (*TrainerSpec) GetShuffleInputSentence

func (x *TrainerSpec) GetShuffleInputSentence() bool

func (*TrainerSpec) GetSplitByNumber

func (x *TrainerSpec) GetSplitByNumber() bool

func (*TrainerSpec) GetSplitByUnicodeScript

func (x *TrainerSpec) GetSplitByUnicodeScript() bool

func (*TrainerSpec) GetSplitByWhitespace

func (x *TrainerSpec) GetSplitByWhitespace() bool

func (*TrainerSpec) GetSplitDigits

func (x *TrainerSpec) GetSplitDigits() bool

func (*TrainerSpec) GetTrainExtremelyLargeCorpus

func (x *TrainerSpec) GetTrainExtremelyLargeCorpus() bool

func (*TrainerSpec) GetTrainingSentenceSize deprecated

func (x *TrainerSpec) GetTrainingSentenceSize() int32

Deprecated: Do not use.

func (*TrainerSpec) GetTreatWhitespaceAsSuffix

func (x *TrainerSpec) GetTreatWhitespaceAsSuffix() bool

func (*TrainerSpec) GetUnkId

func (x *TrainerSpec) GetUnkId() int32

func (*TrainerSpec) GetUnkPiece

func (x *TrainerSpec) GetUnkPiece() string

func (*TrainerSpec) GetUnkSurface

func (x *TrainerSpec) GetUnkSurface() string

func (*TrainerSpec) GetUseAllVocab

func (x *TrainerSpec) GetUseAllVocab() bool

func (*TrainerSpec) GetUserDefinedSymbols

func (x *TrainerSpec) GetUserDefinedSymbols() []string

func (*TrainerSpec) GetVocabSize

func (x *TrainerSpec) GetVocabSize() int32

func (*TrainerSpec) GetVocabularyOutputPieceScore

func (x *TrainerSpec) GetVocabularyOutputPieceScore() bool

func (*TrainerSpec) ProtoMessage

func (*TrainerSpec) ProtoMessage()

func (*TrainerSpec) ProtoReflect

func (x *TrainerSpec) ProtoReflect() protoreflect.Message

func (*TrainerSpec) Reset

func (x *TrainerSpec) Reset()

func (*TrainerSpec) String

func (x *TrainerSpec) String() string

type TrainerSpec_ModelType

// TrainerSpec_ModelType is the enum type of TrainerSpec's ModelType field.
// Its values are TrainerSpec_UNIGRAM, TrainerSpec_BPE, TrainerSpec_WORD and
// TrainerSpec_CHAR.
type TrainerSpec_ModelType int32

Model type. The enum defines UNIGRAM (the default), BPE, WORD, and CHAR.

// Values of TrainerSpec_ModelType. TrainerSpec_UNIGRAM is the value used by
// Default_TrainerSpec_ModelType.
const (
	TrainerSpec_UNIGRAM TrainerSpec_ModelType = 1 // Unigram language model with dynamic algorithm
	TrainerSpec_BPE     TrainerSpec_ModelType = 2 // Byte Pair Encoding
	TrainerSpec_WORD    TrainerSpec_ModelType = 3 // Delimited by whitespace.
	TrainerSpec_CHAR    TrainerSpec_ModelType = 4 // tokenizes into character sequence
)

func (TrainerSpec_ModelType) Descriptor

func (TrainerSpec_ModelType) Enum

func (TrainerSpec_ModelType) EnumDescriptor deprecated

func (TrainerSpec_ModelType) EnumDescriptor() ([]byte, []int)

Deprecated: Use TrainerSpec_ModelType.Descriptor instead.

func (TrainerSpec_ModelType) Number

func (TrainerSpec_ModelType) String

func (x TrainerSpec_ModelType) String() string

func (TrainerSpec_ModelType) Type

func (*TrainerSpec_ModelType) UnmarshalJSON deprecated

func (x *TrainerSpec_ModelType) UnmarshalJSON(b []byte) error

Deprecated: Do not use.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL