Documentation ¶
Index ¶
- Constants
- Variables
- func CalculateContext(config ModelConfig, memory, bpw float64, kvCacheQuant KVCacheQuantisation) (int, error)
- func CalculateVRAM(config ModelConfig, bpw float64, context int, kvCacheQuant KVCacheQuantisation) (float64, error)
- func CalculateVRAMRaw(config ModelConfig, bpwValues BPWValues, context int, numGPUs int, gqa bool) float64
- func DownloadFile(url, filePath string, headers map[string]string) error
- func GetAvailableMemory() (float64, error)
- func GetOllamaQuantLevel(modelName string) (string, error)
- func GetSystemRAM() (float64, error)
- func ParseBPW(bpw string) float64
- func ParseBPWOrQuant(input string) (float64, error)
- func PrintFormattedTable(table QuantResultTable) string
- type BPWValues
- type ContextVRAM
- type KVCacheQuantisation
- type ModelConfig
- type OllamaModelInfo
- type QuantRecommendations
- type QuantResult
- type QuantResultTable
- type VRAMEstimation
Constants ¶
const ( DefaultVRAM = 24.0 DefaultContextSize = 8192 DefaultQuantLevel = "Q4_K_M" )
Default values for VRAM, context size and quantisation level if not provided.
const (
CUDASize = 500 * 1024 * 1024 // 500 MB
)
Variables ¶
var EXL2Options []float64
EXL2Options contains the EXL2 quantisation options
var GGUFMapping = map[string]float64{
"Q8_0": 8.5,
"Q6_K": 6.59,
"Q5_K_L": 5.75,
"Q5_K_M": 5.69,
"Q5_K_S": 5.54,
"Q5_0": 5.54,
"Q4_K_L": 4.9,
"Q4_K_M": 4.85,
"Q4_K_S": 4.58,
"Q4_0": 4.55,
"IQ4_NL": 4.5,
"Q3_K_L": 4.27,
"IQ4_XS": 4.25,
"Q3_K_M": 3.91,
"IQ3_M": 3.7,
"IQ3_S": 3.5,
"Q3_K_S": 3.5,
"Q2_K": 3.35,
"IQ3_XS": 3.3,
"IQ3_XXS": 3.06,
"IQ2_M": 2.7,
"IQ2_S": 2.5,
"IQ2_XS": 2.31,
"IQ2_XXS": 2.06,
"IQ1_S": 1.56,
}
GGUFMapping maps GGUF quantisation types to their corresponding bits per weight
var Version string
Version can be set at build time
Functions ¶
func CalculateContext ¶
func CalculateContext(config ModelConfig, memory, bpw float64, kvCacheQuant KVCacheQuantisation) (int, error)
CalculateContext calculates the maximum context for a given memory constraint
Parameters:
- config: A ModelConfig struct containing the model configuration.
- memory: A float64 representing the available VRAM in GB.
- bpw: A float64 representing the bits per weight.
- kvCacheQuant: The KV cache quantisation level.
Returns:
- int: An integer representing the maximum context size.
- error: An error if the calculation fails.
Example:
context, err := CalculateContext(config, 24.0, 8.0, KVCacheFP16) if err != nil { log.Fatal(err) }
func CalculateVRAM ¶
func CalculateVRAM(config ModelConfig, bpw float64, context int, kvCacheQuant KVCacheQuantisation) (float64, error)
CalculateVRAM calculates the VRAM usage for a given model and configuration
Parameters:
- config: A ModelConfig struct containing the model configuration.
- bpw: A float64 representing the bits per weight.
- context: An integer representing the context size.
- kvCacheQuant: The KV cache quantisation level.
Returns:
- float64: A float64 representing the VRAM usage in GB.
- error: An error if the calculation fails.
Example:
vram, _ := CalculateVRAM(config, 4.85, 8192, KVCacheFP16)
func CalculateVRAMRaw ¶
func CalculateVRAMRaw(config ModelConfig, bpwValues BPWValues, context int, numGPUs int, gqa bool) float64
CalculateVRAMRaw calculates the raw VRAM usage for a given model configuration
Parameters:
- config: A ModelConfig struct containing the model configuration.
- bpwValues: A BPWValues struct containing the bits per weight values.
- context: An integer representing the context size.
- numGPUs: An integer representing the number of GPUs.
- gqa: A boolean indicating whether the model is GQA.
Returns:
- float64: A float64 representing the VRAM usage in GB.
Example:
vram := CalculateVRAMRaw(config, bpwValues, 8192, 1, true)
func DownloadFile ¶
DownloadFile downloads a file from a URL and saves it to the specified path
func GetAvailableMemory ¶
func GetOllamaQuantLevel ¶ added in v0.0.9
GetOllamaQuantLevel takes an Ollama model name and returns its quantisation level.
func GetSystemRAM ¶
func ParseBPWOrQuant ¶
ParseBPWOrQuant takes a string and returns a float64 BPW value.
func PrintFormattedTable ¶
func PrintFormattedTable(table QuantResultTable) string
PrintFormattedTable prints a formatted table of the quantisation results.
Parameters:
- table: A QuantResultTable struct containing the quantisation results.
Returns:
- string: A string containing the formatted table.
Example:
table, _ := GenerateQuantTable(config, 24.0) fmt.Println(PrintFormattedTable(table))
Types ¶
type BPWValues ¶
BPWValues represents the bits per weight values for a given quantisation.
func GetBPWValues ¶
func GetBPWValues(bpw float64, kvCacheQuant KVCacheQuantisation) BPWValues
GetBPWValues parses the BPW values based on the input
type ContextVRAM ¶
ContextVRAM represents the VRAM usage for a given context quantisation.
type KVCacheQuantisation ¶
type KVCacheQuantisation string
KVCacheQuantisation represents the KV cache quantisation options.
const ( KVCacheFP16 KVCacheQuantisation = "fp16" KVCacheQ8_0 KVCacheQuantisation = "q8_0" KVCacheQ4_0 KVCacheQuantisation = "q4_0" )
Supported KV cache quantisation levels.
type ModelConfig ¶
type ModelConfig struct { ModelName string `json:"-"` NumParams float64 `json:"-"` MaxPositionEmbeddings int `json:"max_position_embeddings"` NumHiddenLayers int `json:"num_hidden_layers"` HiddenSize int `json:"hidden_size"` NumKeyValueHeads int `json:"num_key_value_heads"` NumAttentionHeads int `json:"num_attention_heads"` IntermediateSize int `json:"intermediate_size"` VocabSize int `json:"vocab_size"` IsOllama bool `json:"-"` QuantLevel string `json:"quant_level"` }
ModelConfig represents the configuration of a model.
func GetHFModelConfig ¶ added in v0.0.6
func GetHFModelConfig(modelID string) (ModelConfig, error)
GetHFModelConfig retrieves and parses the model configuration from Huggingface
Parameters:
- modelID: A string representing the model ID.
Returns:
- ModelConfig: A ModelConfig struct containing the model configuration.
- error: An error if the request fails.
Example:
config, err := GetHFModelConfig("meta/llama3.1") if err != nil { log.Fatal(err) }
func GetModelConfig ¶
func GetModelConfig(modelName string) (ModelConfig, error)
func GetOllamaModelConfig ¶ added in v0.0.6
func GetOllamaModelConfig(modelID string) (ModelConfig, error)
type OllamaModelInfo ¶
type OllamaModelInfo struct { Details struct { ParentModel string `json:"parent_model"` Format string `json:"format"` Family string `json:"family"` Families []string `json:"families"` ParameterSize string `json:"parameter_size"` QuantizationLevel string `json:"quantization_level"` } `json:"details"` ModelInfo struct { Architecture string `json:"general.architecture"` ParameterCount int64 `json:"general.parameter_count"` ContextLength int `json:"llama.context_length"` AttentionHeadCount int `json:"llama.attention.head_count"` AttentionHeadCountKV int `json:"llama.attention.head_count_kv"` EmbeddingLength int `json:"llama.embedding_length"` FeedForwardLength int `json:"llama.feed_forward_length"` RopeDimensionCount int `json:"llama.rope.dimension_count"` VocabSize int `json:"llama.vocab_size"` } `json:"model_info"` }
OllamaModelInfo represents the model information returned by Ollama.
func FetchOllamaModelInfo ¶
func FetchOllamaModelInfo(modelName string) (*OllamaModelInfo, error)
FetchOllamaModelInfo gets model information from Ollama.
Parameters:
- modelName: A string representing the model name.
Returns:
- *OllamaModelInfo: A pointer to an OllamaModelInfo struct containing the model information.
- error: An error if the request fails.
Example:
modelInfo, err := FetchOllamaModelInfo("llama3.1:8b") if err != nil { log.Fatal(err) } fmt.Printf("Model Info: %+v\n", modelInfo)
type QuantRecommendations ¶
QuantRecommendations holds the recommended quantizations for different context sizes
func CalculateBPW ¶
func CalculateBPW(config ModelConfig, memory float64, context int, kvCacheQuant KVCacheQuantisation, quantType string) (interface{}, QuantRecommendations, error)
CalculateBPW calculates the best BPW for a given memory and context constraint
type QuantResult ¶
type QuantResult struct { QuantType string BPW float64 Contexts map[int]ContextVRAM }
type QuantResultTable ¶
type QuantResultTable struct { ModelID string Results []QuantResult FitsVRAM float64 }
QuantResultTable represents the results of a quantisation analysis.
func GenerateQuantTable ¶
func GenerateQuantTable(config ModelConfig, fitsVRAM float64) (QuantResultTable, error)
GenerateQuantTable generates a quantisation table for a given model.
Parameters:
- config: A ModelConfig struct containing the model configuration.
- fitsVRAM: A float64 representing the available VRAM in GB.
Returns:
- QuantResultTable: A QuantResultTable struct containing the quantisation results.
- error: An error if the quantisation fails.
Example:
table, _ := GenerateQuantTable(config, 24.0)
type VRAMEstimation ¶
type VRAMEstimation struct { ModelName string ModelConfig ModelConfig ContextSize int KVCacheQuant KVCacheQuantisation AvailableVRAM float64 QuantLevel string EstimatedVRAM float64 FitsAvailable bool MaxContextSize int MaximumQuant string Recommendations map[int]string // contains filtered or unexported fields }
VRAMEstimation represents the results of a VRAM estimation.
func EstimateVRAM ¶
func EstimateVRAM( modelName *string, contextSize int, kvCacheQuant KVCacheQuantisation, availableVRAM float64, quantLevel string, ) (*VRAMEstimation, error)
EstimateVRAM calculates VRAM usage for a given model configuration.
Parameters:
- modelName: A pointer to a string representing the model name (Huggingface/ModelID or Ollama:modelName).
- contextSize: An integer representing the context size.
- kvCacheQuant: The KV cache quantization level.
- availableVRAM: A float64 representing the available VRAM in GB.
- quantLevel: A string representing the quantization level.
Returns:
- *VRAMEstimation: A pointer to a VRAMEstimation struct containing the estimation results.
- error: An error if the estimation fails.
Example:
estimation, err := quantest.EstimateVRAM( &modelName, 8192, quantest.KVCacheFP16, 24.0, "Q4_K_M", ) if err != nil { log.Fatal(err) } fmt.Printf("Max Context Size: %d\n", estimation.MaxContextSize)