bart


Documentation

Overview

Package bart implements the transformer model introduced by Mike Lewis et al., 2019. "BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension" https://arxiv.org/abs/1910.13461

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type Cache

type Cache [][2]multiheadattention.Cache

Cache contains the cache of each DecoderLayer. For each layer, the cache holds the queries, keys and values used by the self-attention (index 0) and by the cross-attention (index 1).

func (Cache) Layer

func (c Cache) Layer(i int) [2]multiheadattention.Cache

Layer returns the cache at the given index.
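
For illustration, a minimal sketch of reading a layer's cache, assuming the package is imported as bart and multiheadattention is the attention package this module builds on:

// inspectLayerCache returns the two documented sub-caches of a decoder layer:
// index 0 holds the self-attention cache, index 1 the cross-attention cache.
func inspectLayerCache(c bart.Cache, layer int) (self, cross multiheadattention.Cache) {
	lc := c.Layer(layer)
	return lc[0], lc[1]
}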

type Classifier

type Classifier struct {
	nn.Module
	// Config is the configuration of the classifier.
	Config ClassifierConfig
	// Layers is the list of layers of the MLP.
	Layers nn.ModuleList[nn.StandardModel]
}

Classifier is a Bart head model for sentence-level classification tasks.

func NewClassifier

func NewClassifier[T float.DType](c ClassifierConfig) *Classifier

NewClassifier returns a new Classifier.

func (*Classifier) Forward

func (m *Classifier) Forward(xs mat.Tensor) mat.Tensor

Forward implements the forward pass of the Classifier.

type ClassifierConfig

type ClassifierConfig struct {
	// InputSize is the input size of the classifier.
	InputSize int
	// HiddenSize is the hidden size of the classifier.
	HiddenSize int
	// OutputSize is the output size of the classifier.
	OutputSize int
	// PoolerDropout is the dropout rate for the classifier.
	PoolerDropout float64
}

ClassifierConfig provides configuration settings for a Classifier, the Bart head for sentence-level classification tasks.
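
As a sketch, the head can be built and applied to a pooled state directly; the sizes and dropout below are illustrative (not defaults), and pooledState stands in for a mat.Tensor produced elsewhere:

head := bart.NewClassifier[float32](bart.ClassifierConfig{
	InputSize:     1024, // should match the model's DModel
	HiddenSize:    1024,
	OutputSize:    3, // e.g. contradiction / neutral / entailment
	PoolerDropout: 0.1,
})
logits := head.Forward(pooledState) // pooledState is a mat.Tensor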

type Config

type Config struct {
	NumLabels                  int               `json:"_num_labels,omitempty"`
	ActivationDropout          float64           `json:"activation_dropout,omitempty"`
	ActivationFunction         string            `json:"activation_function,omitempty"`
	BiasLogits                 bool              `json:"add_bias_logits,omitempty"`
	FinalLayerNorm             bool              `json:"add_final_layer_norm,omitempty"`
	Architecture               []string          `json:"architectures,omitempty"`
	AttentionDropout           float64           `json:"attention_dropout,omitempty"`
	BosTokenID                 int               `json:"bos_token_id,omitempty"`
	ClassifierDropout          float64           `json:"classif_dropout,omitempty"`
	DModel                     int               `json:"d_model,omitempty"`
	DecoderAttentionHeads      int               `json:"decoder_attention_heads,omitempty"`
	DecoderFFNDim              int               `json:"decoder_ffn_dim,omitempty"`
	DecoderLayerDrop           float64           `json:"decoder_layerdrop,omitempty"`
	DecoderLayers              int               `json:"decoder_layers,omitempty"`
	DecoderStartTokenID        int               `json:"decoder_start_token_id,omitempty"`
	Dropout                    float64           `json:"dropout,omitempty"`
	EncoderAttentionHeads      int               `json:"encoder_attention_heads,omitempty"`
	EncoderFFNDim              int               `json:"encoder_ffn_dim,omitempty"`
	EncoderLayerDrop           float64           `json:"encoder_layerdrop,omitempty"`
	EncoderLayers              int               `json:"encoder_layers,omitempty"`
	EosTokenID                 int               `json:"eos_token_id,omitempty"`
	FineTuningTask             string            `json:"finetuning_task,omitempty"`
	ForceBosTokenToBeGenerated bool              `json:"force_bos_token_to_be_generated,omitempty"`
	ID2Label                   map[string]string `json:"id2label,omitempty"`
	InitStd                    float64           `json:"init_std,omitempty"`
	IsEncoderDecoder           bool              `json:"is_encoder_decoder,omitempty"`
	Label2ID                   map[string]int    `json:"label2id,omitempty"`
	LengthPenalty              float64           `json:"length_penalty,omitempty"`
	MaxPositionEmbeddings      int               `json:"max_position_embeddings,omitempty"`
	ModelType                  string            `json:"model_type,omitempty"`
	NormalizeBefore            bool              `json:"normalize_before,omitempty"`
	NormalizeEmbedding         bool              `json:"normalize_embedding,omitempty"`
	NumHiddenLayers            int               `json:"num_hidden_layers,omitempty"`
	OutputPast                 bool              `json:"output_past,omitempty"`
	PadTokenID                 int               `json:"pad_token_id,omitempty"`
	ScaleEmbedding             bool              `json:"scale_embedding,omitempty"`
	StaticPositionEmbeddings   bool              `json:"static_position_embeddings,omitempty"`
	TotalFlos                  float64           `json:"total_flos,omitempty"`
	VocabSize                  int               `json:"vocab_size,omitempty"`
	NumBeams                   int               `json:"num_beams,omitempty"`
	MaxLength                  int               `json:"max_length,omitempty"`
	MinLength                  int               `json:"min_length,omitempty"`
	BadWordsIDs                [][]int           `json:"bad_words_ids,omitempty"`
	EarlyStopping              bool              `json:"early_stopping,omitempty"`
	NoRepeatNGramSize          int               `json:"no_repeat_ngram_size,omitempty"`
	ExtraSpecialTokens         map[int]string    `json:"extra_special_tokens,omitempty"`
	Cybertron                  struct {
		Training                           bool   `json:"training,omitempty"`
		PositionalEncoderOffset            int    `json:"positional_encoder_offset,omitempty"`
		SharedEmbeddingsStoreName          string `json:"shared_embeddings_store_name,omitempty"`
		DecoderPositionalEncodingStoreName string `json:"decoder_positional_encoding_store_name,omitempty"`
		EncoderPositionalEncodingStoreName string `json:"encoder_positional_encoding_store_name,omitempty"`
	}
}

Config contains the global configuration of the Bart model and the heads for fine-tuning tasks. The configuration matches that of Hugging Face to facilitate compatibility between the two implementations.

func ConfigFromFile

func ConfigFromFile(file string) (Config, error)

ConfigFromFile loads a Bart model Config from file.
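
A minimal sketch of loading a Hugging Face-style config.json (the path is illustrative):

cfg, err := bart.ConfigFromFile("/models/bart-large-mnli/config.json")
if err != nil {
	log.Fatal(err)
}
fmt.Println(cfg.DModel, cfg.EncoderLayers, cfg.DecoderLayers)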

func (*Config) ContradictionID

func (c *Config) ContradictionID() (int, error)

ContradictionID returns the id of the `contradiction` label.

func (*Config) EntailmentID

func (c *Config) EntailmentID() (int, error)

EntailmentID returns the id of the `entailment` label.
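
In an NLI or zero-shot setting it is typical to resolve both label indices once, up front; a sketch, assuming cfg is a Config loaded as above:

entailmentID, err := cfg.EntailmentID()
if err != nil {
	log.Fatal(err) // e.g. the config's Label2ID has no `entailment` entry
}
contradictionID, err := cfg.ContradictionID()
if err != nil {
	log.Fatal(err)
}
_, _ = entailmentID, contradictionID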

type CrossAttentionBlock

type CrossAttentionBlock struct {
	nn.Module
	Attention *multiheadattention.Model
	Norm      *layernorm.Model
}

CrossAttentionBlock implements a cross-attention block.

type CrossAttentionBlockConfig

type CrossAttentionBlockConfig struct {
	Dim             int
	NumOfHeads      int
	NormalizeBefore bool
}

CrossAttentionBlockConfig provides configuration settings for a CrossAttentionBlock.

type Decoder

type Decoder struct {
	nn.Module
	// Embeddings is the embedding module.
	Embeddings *Embeddings
	// Layers is the list of decoder layers.
	Layers []*DecoderLayer
	// LayerNorm is the layer normalization module.
	LayerNorm *layernorm.Model
	// Config is the configuration of the decoder.
	Config Config
}

Decoder implements a Bart decoder.

func NewDecoder

func NewDecoder[T float.DType](c Config, shared embedding.Shared) *Decoder

NewDecoder returns a new Decoder.

func (*Decoder) Decode

func (m *Decoder) Decode(encoderStates []mat.Tensor, inputIDs []int, cache Cache, curLen int) ([]mat.Tensor, Cache)

Decode performs the decoding considering the encoder output and the decoder input.
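
A sketch of incremental decoding with the cache; encoderStates comes from Encoder.Encode, maxLen is an assumed length budget, pickNextToken is a hypothetical selection step (greedy, beam search, ...), and stopping criteria are elided:

var cache bart.Cache // the zero value starts an empty cache
inputIDs := []int{cfg.DecoderStartTokenID}
for curLen := 1; curLen <= maxLen; curLen++ {
	var states []mat.Tensor
	states, cache = decoder.Decode(encoderStates, inputIDs, cache, curLen)
	next := pickNextToken(states)
	inputIDs = []int{next} // with a warm cache, only the new token is fed
}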

type DecoderLayer

type DecoderLayer struct {
	nn.Module
	// SelfAttention is the self-attention block.
	SelfAttention ResidualNormSelfAttention
	// CrossAttention is the cross-attention block.
	CrossAttention ResidualNormCrossAttention
	// FF is the feed-forward block with normalization and residual connection.
	FF ResidualNormFeedForward
	// Config is the configuration of the decoder layer.
	Config Config
}

DecoderLayer implements a Bart decoder layer.

func NewDecoderLayer

func NewDecoderLayer[T float.DType](c Config) *DecoderLayer

NewDecoderLayer returns a new decoder layer.

func (*DecoderLayer) Forward

func (m *DecoderLayer) Forward(cache [2]multiheadattention.Cache, seq1 []mat.Tensor, seq2 []mat.Tensor) ([]mat.Tensor, [2]multiheadattention.Cache)

Forward performs the forward pass.

type DecodingInput

type DecodingInput struct {
	// InputIDs are the input IDs for the decoder.
	InputIDs []int
	// CurLen is the current length of the generating sequence.
	CurLen int
	// Cache is the cache for the decoder.
	Cache Cache
}

DecodingInput is the input for the decoding function of the model for conditional generation.

type DecodingOutput

type DecodingOutput struct {
	// LogProbRaw is the raw (not processed) log probability of the generated token.
	LogProbRaw mat.Tensor
	// LogProbValue is the post-processed log probability of the generated token.
	LogProbValue mat.Matrix
	// NextCache is the next cache.
	NextCache Cache
}

DecodingOutput is the output of the decoding function of the model for conditional generation.

type Embeddings

type Embeddings struct {
	nn.Module
	// SharedEmbeddings is the shared embedding module.
	SharedEmbeddings embedding.Shared
	// PositionalEncoder is the positional encoder module.
	PositionalEncoder *PositionalEncoder
	// Norm is the normalization module.
	Norm *layernorm.Model
	// ScaleFactor is the scaling factor for the shared embeddings.
	ScaleFactor *nn.Buffer
	// Config is the configuration of the embeddings.
	Config Config
}

func NewEmbeddings

func NewEmbeddings[T float.DType](c Config, shared embedding.Shared, isDecoder bool) *Embeddings

NewEmbeddings returns a new Embeddings.

func (*Embeddings) Encode

func (m *Embeddings) Encode(inputIDs []int, offset int) []mat.Tensor

Encode performs the Bart initial input encoding.

type Encoder

type Encoder struct {
	nn.Module
	// Embeddings is the embedding layer.
	Embeddings *Embeddings
	// Layers is the list of encoder layers.
	Layers nn.ModuleList[*EncoderLayer]
	// LayerNorm is the layer normalization module.
	LayerNorm *layernorm.Model
	// Config is the configuration of the encoder.
	Config Config
}

Encoder implements a Bart encoder.

func NewEncoder

func NewEncoder[T float.DType](c Config, shared embedding.Shared) *Encoder

NewEncoder returns a new Encoder.

func (*Encoder) Encode

func (m *Encoder) Encode(inputIDs []int) []mat.Tensor

Encode performs the Bart encoding.
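
A sketch of encoding a tokenized sequence; inputIDs come from a tokenizer, which is outside this package:

encoderStates := encoder.Encode(inputIDs)
fmt.Println(len(encoderStates)) // one mat.Tensor per input position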

type EncoderLayer

type EncoderLayer struct {
	nn.Module
	// SelfAttention is the self-attention block.
	SelfAttention ResidualNormSelfAttention
	// FF is the feed-forward block with normalization and residual connection.
	FF ResidualNormFeedForward
	// Config is the configuration of the encoder layer.
	Config Config
}

EncoderLayer implements a Bart encoder layer.

func NewEncoderLayer

func NewEncoderLayer[T float.DType](c Config) *EncoderLayer

NewEncoderLayer returns a new encoder layer.

func (*EncoderLayer) Forward

func (m *EncoderLayer) Forward(xs ...mat.Tensor) []mat.Tensor

Forward performs the forward pass.

type FeedForwardBlock

type FeedForwardBlock struct {
	nn.Module
	FFN  nn.ModuleList[nn.StandardModel]
	Norm *layernorm.Model
}

FeedForwardBlock is a feed-forward block with normalization and residual connection.

type Model

type Model struct {
	nn.Module
	// Config is the model configuration.
	Config Config
	// Encoder is the encoder model.
	Encoder *Encoder
	// Decoder is the decoder model.
	Decoder *Decoder
	// Embeddings contains the embeddings shared between the encoder and the decoder.
	Embeddings *embedding.Model
}

Model implements a base Bart encoder-decoder model without any head on top.

func New

func New[T float.DType](c Config) *Model

New returns a new Bart model.

func (*Model) Forward

func (m *Model) Forward(inputIDs []int) []mat.Tensor

Forward performs encoding-decoding over the same input sequence producing the final encoded sequence.
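
Putting the pieces together, a sketch of building a model from a loaded Config and running the full encoder-decoder pass; the path is illustrative, and loading pretrained weights is handled outside this package:

cfg, err := bart.ConfigFromFile("/models/bart-base/config.json")
if err != nil {
	log.Fatal(err)
}
model := bart.New[float32](cfg)
encoded := model.Forward(inputIDs) // inputIDs []int from a tokenizer
_ = encoded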

type ModelForConditionalGeneration

type ModelForConditionalGeneration struct {
	nn.Module
	// Bart is the fine-tuned BART model.
	Bart *Model
	// Projection is the projection layer from the decoder output to the vocabulary.
	Projection *linear.Model
	// PadMask is the mask for the pad token.
	PadMask *nn.Buffer
	// EosMask is the mask for the EOS token.
	EosMask *nn.Buffer
}

ModelForConditionalGeneration is a model for conditional generation tasks which embeds a Bart fine-tuned model.

func NewModelForConditionalGeneration

func NewModelForConditionalGeneration[T float.DType](bart *Model) *ModelForConditionalGeneration

NewModelForConditionalGeneration returns a new model for conditional generation.

func (*ModelForConditionalGeneration) DecodingFunc

func (m *ModelForConditionalGeneration) DecodingFunc(encoderInputIDs []int, scoreProc generationutils.ScoreProcessor, inference bool) func(batch []*DecodingInput) []*DecodingOutput

DecodingFunc returns a decoding function that works using the encoder states derived from the input. During inference, it adjusts the logits to avoid impossible tokens.
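
A sketch of driving the decoding function for one step; scoreProc is a generationutils.ScoreProcessor built elsewhere (for instance from the generation settings in Config), and token selection is elided:

decode := m.DecodingFunc(encoderInputIDs, scoreProc, true)
in := &bart.DecodingInput{
	InputIDs: []int{cfg.DecoderStartTokenID},
	CurLen:   1,
	Cache:    nil, // empty cache on the first step
}
out := decode([]*bart.DecodingInput{in})[0]
_ = out.LogProbValue     // post-processed log probabilities for the next token
in.Cache = out.NextCache // reuse on the following step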

type ModelForSequenceClassification

type ModelForSequenceClassification struct {
	nn.Module
	// Bart is the Bart fine-tuned model.
	Bart *Model
	// Classifier is the final classifier layer.
	Classifier *Classifier
}

ModelForSequenceClassification is a model for sequence classification tasks which embeds a Bart fine-tuned model.

func NewModelForSequenceClassification

func NewModelForSequenceClassification[T float.DType](bart *Model) *ModelForSequenceClassification

NewModelForSequenceClassification returns a new model for sentence-level classification.

func (*ModelForSequenceClassification) Forward

func (m *ModelForSequenceClassification) Forward(inputIds []int) mat.Tensor

Forward performs the classification using the last transformed state.
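
A sketch of sequence classification over one tokenized input, assuming model is a *bart.Model with fine-tuned weights already in place:

clf := bart.NewModelForSequenceClassification[float32](model)
scores := clf.Forward(inputIDs) // one score per label (see Config.ID2Label)
_ = scores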

type NewFeedForwardBlockConfig

type NewFeedForwardBlockConfig struct {
	// Dim is the dimension of the input.
	Dim int
	// HiddenDim is the dimension of the hidden layer.
	HiddenDim int
	// Activation is the activation function.
	Activation activation.Activation
	// NormalizeBefore is whether to normalize the input before the MLP.
	NormalizeBefore bool
}

NewFeedForwardBlockConfig is the configuration of a feed-forward block.

type PositionalEncoder

type PositionalEncoder struct {
	nn.Module
	// Embeddings contains the embeddings for each position.
	Embeddings *embedding.Model
	// Config contains the configuration settings.
	Config PositionalEncoderConfig
}

PositionalEncoder contains positional embeddings fine-tuned during the training phase.

func NewPositionalEncoder

func NewPositionalEncoder[T float.DType](config PositionalEncoderConfig) *PositionalEncoder

NewPositionalEncoder returns a new PositionalEncoder.

func (*PositionalEncoder) Encode

func (m *PositionalEncoder) Encode(positions []int) []mat.Tensor

Encode performs the forward step for each input and returns the result.

type PositionalEncoderConfig

type PositionalEncoderConfig struct {
	NumEmbeddings int
	EmbeddingDim  int
	PaddingIDX    int
	Offset        int
	StoreName     string
	Trainable     bool
}

PositionalEncoderConfig provides configuration settings for a PositionalEncoder model.
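
A sketch of a learned positional encoder; the values below are illustrative (the offset of 2 mirrors the original BART implementation):

pe := bart.NewPositionalEncoder[float32](bart.PositionalEncoderConfig{
	NumEmbeddings: 1024,
	EmbeddingDim:  768,
	PaddingIDX:    1,
	Offset:        2,
	Trainable:     true,
})
vectors := pe.Encode([]int{0, 1, 2, 3}) // one mat.Tensor per position
_ = vectors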

type PostNormCrossAttentionBlock

type PostNormCrossAttentionBlock struct {
	*CrossAttentionBlock
}

PostNormCrossAttentionBlock embeds a cross-attention block to perform cross-attention with post-normalization.

func (PostNormCrossAttentionBlock) Forward

Forward performs the forward pass.

type PostNormFeedForwardBlock

type PostNormFeedForwardBlock struct {
	*FeedForwardBlock
}

PostNormFeedForwardBlock is a feed-forward block with post-normalization.

func (PostNormFeedForwardBlock) Forward

func (m PostNormFeedForwardBlock) Forward(xs []mat.Tensor) []mat.Tensor

Forward performs the forward pass.

type PostNormSelfAttentionBlock

type PostNormSelfAttentionBlock struct {
	*SelfAttentionBlock
}

PostNormSelfAttentionBlock embeds a self-attention block to perform self-attention with post-normalization.

func (PostNormSelfAttentionBlock) Forward

Forward performs the forward pass.

type PreNormCrossAttentionBlock

type PreNormCrossAttentionBlock struct {
	*CrossAttentionBlock
}

PreNormCrossAttentionBlock embeds a cross-attention block to perform cross-attention with pre-normalization.

func (PreNormCrossAttentionBlock) Forward

Forward performs the forward pass.

type PreNormFeedForwardBlock

type PreNormFeedForwardBlock struct {
	*FeedForwardBlock
}

PreNormFeedForwardBlock is a feed-forward block with pre-normalization.

func (PreNormFeedForwardBlock) Forward

func (m PreNormFeedForwardBlock) Forward(xs []mat.Tensor) []mat.Tensor

Forward performs the forward pass.

type PreNormSelfAttentionBlock

type PreNormSelfAttentionBlock struct {
	*SelfAttentionBlock
}

PreNormSelfAttentionBlock embeds a self-attention block to perform self-attention with pre-normalization.

func (PreNormSelfAttentionBlock) Forward

Forward performs the forward pass.

type ResidualNormCrossAttention

type ResidualNormCrossAttention interface {
	// Forward performs the forward pass.
	Forward(cache multiheadattention.Cache, seq1 []mat.Tensor, seq2 []mat.Tensor) ([]mat.Tensor, multiheadattention.Cache)
}

ResidualNormCrossAttention is a cross-attention block with normalization and residual connection.

func NewCrossAttentionBlock

NewCrossAttentionBlock returns a new CrossAttentionBlock.

type ResidualNormFeedForward

type ResidualNormFeedForward interface {
	Forward(xs []mat.Tensor) []mat.Tensor
}

ResidualNormFeedForward is a feed-forward block with normalization and residual connection.

func NewFeedForwardBlock

NewFeedForwardBlock returns a new PreNormFeedForwardBlock or PostNormFeedForwardBlock depending on the configuration.

type ResidualNormSelfAttention

type ResidualNormSelfAttention interface {
	Forward(cache multiheadattention.Cache, xs []mat.Tensor) ([]mat.Tensor, multiheadattention.Cache)
}

ResidualNormSelfAttention is a self-attention block with normalization and residual connection.

func NewSelfAttentionBlock

NewSelfAttentionBlock returns a new PreNormSelfAttentionBlock or PostNormSelfAttentionBlock depending on the configuration.

type SelfAttentionBlock

type SelfAttentionBlock struct {
	nn.Module
	// Attention is the multi-head attention module.
	Attention *multiheadattention.Model
	// Norm is the layer normalization module.
	Norm *layernorm.Model
}

SelfAttentionBlock implements a self-attention block.

type SelfAttentionBlockConfig

type SelfAttentionBlockConfig struct {
	// Dim is the dimension of the input and output.
	Dim int
	// NumOfHeads is the number of heads.
	NumOfHeads int
	// NormalizeBefore indicates whether the normalization is applied before or after the attention.
	NormalizeBefore bool
	// UseCausalMask indicates whether to use a causal mask.
	UseCausalMask bool
}

SelfAttentionBlockConfig is the configuration of a self-attention block.
