Documentation ¶
Index ¶
- Constants
- Variables
- func CompleteShardGGUFFilename(name string) []string
- func DefaultCachePath() string
- func GGMLComputationGraphOverhead(nodes uint64, grads bool) uint64
- func GGMLHashSize(base uint64) uint64
- func GGMLMemoryPadding(size uint64) uint64
- func GGMLPadding(size, align uint64) uint64
- func GGMLTensorOverhead() uint64
- func GuessFLUXDiffusionModelMemoryUsage(width, height uint32, flashAttention bool) uint64
- func GuessSD1DiffusionModelMemoryUsage(width, height uint32, flashAttention bool) uint64
- func GuessSD2DiffusionModelMemoryUsage(width, height uint32, flashAttention bool) uint64
- func GuessSD35LargeDiffusionModelMemoryUsage(width, height uint32, flashAttention bool) uint64
- func GuessSD35MediumDiffusionModelMemoryUsage(width, height uint32, flashAttention bool) uint64
- func GuessSD3MediumDiffusionModelMemoryUsage(width, height uint32, flashAttention bool) uint64
- func GuessSDXLDiffusionModelMemoryUsage(width, height uint32, flashAttention bool) uint64
- func GuessSDXLRefinerDiffusionModelMemoryUsage(width, height uint32, flashAttention bool) uint64
- func IsShardGGUFFilename(name string) bool
- func OllamaRegistryAuthorize(ctx context.Context, cli *http.Client, authnToken string) (string, error)
- func OllamaRegistryAuthorizeRetry(resp *http.Response, cli *http.Client) bool
- func OllamaSingKeyLoad() (ssh.Signer, error)
- func OllamaUserAgent() string
- func ValueNumeric[T constraints.Integer | constraints.Float](kv GGUFMetadataKV) T
- func ValuesNumeric[T constraints.Integer | constraints.Float](av GGUFMetadataKVArrayValue) []T
- type BytesPerSecondScalar
- type FLOPSScalar
- type GGMLType
- type GGMLTypeTrait
- type GGUFArchitecture
- type GGUFArchitectureDiffusionAutoencoder
- type GGUFArchitectureDiffusionConditioner
- type GGUFArchitectureDiffusionConditioners
- type GGUFBitsPerWeightScalar
- type GGUFBytesScalar
- type GGUFFile
- func ParseGGUFFile(path string, opts ...GGUFReadOption) (*GGUFFile, error)
- func ParseGGUFFileFromHuggingFace(ctx context.Context, repo, file string, opts ...GGUFReadOption) (*GGUFFile, error)
- func ParseGGUFFileFromModelScope(ctx context.Context, repo, file string, opts ...GGUFReadOption) (*GGUFFile, error)
- func ParseGGUFFileFromOllama(ctx context.Context, model string, opts ...GGUFReadOption) (*GGUFFile, error)
- func ParseGGUFFileFromOllamaModel(ctx context.Context, model *OllamaModel, opts ...GGUFReadOption) (gf *GGUFFile, err error)
- func ParseGGUFFileRemote(ctx context.Context, url string, opts ...GGUFReadOption) (gf *GGUFFile, err error)
- func (gf *GGUFFile) Architecture() (ga GGUFArchitecture)
- func (gf *GGUFFile) EstimateLLaMACppRun(opts ...GGUFRunEstimateOption) (e LLaMACppRunEstimate)
- func (gf *GGUFFile) EstimateStableDiffusionCppRun(opts ...GGUFRunEstimateOption) (e StableDiffusionCppRunEstimate)
- func (gf *GGUFFile) Layers(ignores ...string) GGUFLayerTensorInfos
- func (gf *GGUFFile) Metadata() (gm GGUFMetadata)
- func (gf *GGUFFile) Tokenizer() (gt GGUFTokenizer)
- type GGUFFileCache
- type GGUFFileType
- type GGUFFilename
- type GGUFHeader
- type GGUFLayerTensorInfos
- func (ltis GGUFLayerTensorInfos) Bytes() uint64
- func (ltis GGUFLayerTensorInfos) Count() uint64
- func (ltis GGUFLayerTensorInfos) Cut(names []string) (before, after GGUFLayerTensorInfos, found bool)
- func (ltis GGUFLayerTensorInfos) Elements() uint64
- func (ltis GGUFLayerTensorInfos) Get(name string) (info GGUFTensorInfo, found bool)
- func (ltis GGUFLayerTensorInfos) GetFileType() GGUFFileType
- func (ltis GGUFLayerTensorInfos) Index(names []string) (infos map[string]GGUFTensorInfo, found int)
- func (ltis GGUFLayerTensorInfos) Match(nameRegex *regexp.Regexp) bool
- func (ltis GGUFLayerTensorInfos) Search(nameRegex *regexp.Regexp) (infos []GGUFTensorInfo)
- type GGUFMagic
- type GGUFMetadata
- type GGUFMetadataKV
- func (kv GGUFMetadataKV) ValueArray() GGUFMetadataKVArrayValue
- func (kv GGUFMetadataKV) ValueBool() bool
- func (kv GGUFMetadataKV) ValueFloat32() float32
- func (kv GGUFMetadataKV) ValueFloat64() float64
- func (kv GGUFMetadataKV) ValueInt16() int16
- func (kv GGUFMetadataKV) ValueInt32() int32
- func (kv GGUFMetadataKV) ValueInt64() int64
- func (kv GGUFMetadataKV) ValueInt8() int8
- func (kv GGUFMetadataKV) ValueString() string
- func (kv GGUFMetadataKV) ValueUint16() uint16
- func (kv GGUFMetadataKV) ValueUint32() uint32
- func (kv GGUFMetadataKV) ValueUint64() uint64
- func (kv GGUFMetadataKV) ValueUint8() uint8
- type GGUFMetadataKVArrayValue
- func (av GGUFMetadataKVArrayValue) ValuesArray() []GGUFMetadataKVArrayValue
- func (av GGUFMetadataKVArrayValue) ValuesBool() []bool
- func (av GGUFMetadataKVArrayValue) ValuesFloat32() []float32
- func (av GGUFMetadataKVArrayValue) ValuesFloat64() []float64
- func (av GGUFMetadataKVArrayValue) ValuesInt16() []int16
- func (av GGUFMetadataKVArrayValue) ValuesInt32() []int32
- func (av GGUFMetadataKVArrayValue) ValuesInt64() []int64
- func (av GGUFMetadataKVArrayValue) ValuesInt8() []int8
- func (av GGUFMetadataKVArrayValue) ValuesString() []string
- func (av GGUFMetadataKVArrayValue) ValuesUint16() []uint16
- func (av GGUFMetadataKVArrayValue) ValuesUint32() []uint32
- func (av GGUFMetadataKVArrayValue) ValuesUint64() []uint64
- func (av GGUFMetadataKVArrayValue) ValuesUint8() []uint8
- type GGUFMetadataKVs
- type GGUFMetadataValueType
- type GGUFNamedTensorInfos
- type GGUFParametersScalar
- type GGUFReadOption
- func SkipCache() GGUFReadOption
- func SkipDNSCache() GGUFReadOption
- func SkipLargeMetadata() GGUFReadOption
- func SkipProxy() GGUFReadOption
- func SkipRangeDownloadDetection() GGUFReadOption
- func SkipTLSVerification() GGUFReadOption
- func UseBearerAuth(token string) GGUFReadOption
- func UseBufferSize(size int) GGUFReadOption
- func UseCache() GGUFReadOption
- func UseCacheExpiration(expiration time.Duration) GGUFReadOption
- func UseCachePath(path string) GGUFReadOption
- func UseDebug() GGUFReadOption
- func UseMMap() GGUFReadOption
- func UseProxy(url *url.URL) GGUFReadOption
- type GGUFRunDeviceMetric
- type GGUFRunEstimateOption
- func WithDeviceMetrics(metrics []GGUFRunDeviceMetric) GGUFRunEstimateOption
- func WithFlashAttention() GGUFRunEstimateOption
- func WithLLaMACppAdapters(adp []LLaMACppRunEstimate) GGUFRunEstimateOption
- func WithLLaMACppCacheKeyType(t GGMLType) GGUFRunEstimateOption
- func WithLLaMACppCacheValueType(t GGMLType) GGUFRunEstimateOption
- func WithLLaMACppContextSize(size int32) GGUFRunEstimateOption
- func WithLLaMACppDrafter(dft *LLaMACppRunEstimate) GGUFRunEstimateOption
- func WithLLaMACppLogicalBatchSize(size int32) GGUFRunEstimateOption
- func WithLLaMACppOffloadLayers(layers uint64) GGUFRunEstimateOption
- func WithLLaMACppPhysicalBatchSize(size int32) GGUFRunEstimateOption
- func WithLLaMACppProjector(prj *LLaMACppRunEstimate) GGUFRunEstimateOption
- func WithLLaMACppSplitMode(mode LLaMACppSplitMode) GGUFRunEstimateOption
- func WithLLaMACppVisualMaxImageSize(size uint32) GGUFRunEstimateOption
- func WithMainGPUIndex(di int) GGUFRunEstimateOption
- func WithParallelSize(size int32) GGUFRunEstimateOption
- func WithRPCServers(srvs []string) GGUFRunEstimateOption
- func WithStableDiffusionCppAutoencoderTiling() GGUFRunEstimateOption
- func WithStableDiffusionCppBatchCount(count int32) GGUFRunEstimateOption
- func WithStableDiffusionCppControlNet(cn *StableDiffusionCppRunEstimate) GGUFRunEstimateOption
- func WithStableDiffusionCppFreeComputeMemoryImmediately() GGUFRunEstimateOption
- func WithStableDiffusionCppHeight(height uint32) GGUFRunEstimateOption
- func WithStableDiffusionCppOffloadLayers(layers uint64) GGUFRunEstimateOption
- func WithStableDiffusionCppUpscaler(ups *StableDiffusionCppRunEstimate) GGUFRunEstimateOption
- func WithStableDiffusionCppWidth(width uint32) GGUFRunEstimateOption
- func WithTensorSplitFraction(fractions []float64) GGUFRunEstimateOption
- func WithinLLaMACppMaxContextSize() GGUFRunEstimateOption
- func WithoutLLaMACppOffloadKVCache() GGUFRunEstimateOption
- func WithoutStableDiffusionCppOffloadAutoencoder() GGUFRunEstimateOption
- func WithoutStableDiffusionCppOffloadConditioner() GGUFRunEstimateOption
- type GGUFTensorInfo
- func (ti GGUFTensorInfo) Bytes() uint64
- func (ti GGUFTensorInfo) Count() uint64
- func (ti GGUFTensorInfo) Elements() uint64
- func (ti GGUFTensorInfo) Get(name string) (info GGUFTensorInfo, found bool)
- func (ti GGUFTensorInfo) GetFileType() GGUFFileType
- func (ti GGUFTensorInfo) Index(names []string) (infos map[string]GGUFTensorInfo, found int)
- func (ti GGUFTensorInfo) Match(nameRegex *regexp.Regexp) bool
- func (ti GGUFTensorInfo) Search(nameRegex *regexp.Regexp) (infos []GGUFTensorInfo)
- type GGUFTensorInfos
- func (tis GGUFTensorInfos) Bytes() uint64
- func (tis GGUFTensorInfos) Count() uint64
- func (tis GGUFTensorInfos) Elements() uint64
- func (tis GGUFTensorInfos) Get(name string) (info GGUFTensorInfo, found bool)
- func (tis GGUFTensorInfos) GetFileType() GGUFFileType
- func (tis GGUFTensorInfos) Index(names []string) (infos map[string]GGUFTensorInfo, found int)
- func (tis GGUFTensorInfos) Layers(ignores ...string) GGUFLayerTensorInfos
- func (tis GGUFTensorInfos) Match(nameRegex *regexp.Regexp) bool
- func (tis GGUFTensorInfos) Search(nameRegex *regexp.Regexp) (infos []GGUFTensorInfo)
- type GGUFTokenizer
- type GGUFTokensPerSecondScalar
- type GGUFVersion
- type IGGUFTensorInfos
- type LLaMACppComputationMemoryUsage
- type LLaMACppKVCacheMemoryUsage
- type LLaMACppParameterUsage
- type LLaMACppRunDeviceUsage
- type LLaMACppRunEstimate
- type LLaMACppRunEstimateMemory
- type LLaMACppRunEstimateSummary
- type LLaMACppRunEstimateSummaryItem
- type LLaMACppSplitMode
- type LLaMACppWeightMemoryUsage
- type OllamaModel
- func (om *OllamaModel) Complete(ctx context.Context, cli *http.Client) error
- func (om *OllamaModel) GetLayer(mediaType string) (OllamaModelLayer, bool)
- func (om *OllamaModel) License(ctx context.Context, cli *http.Client) ([]string, error)
- func (om *OllamaModel) Messages(ctx context.Context, cli *http.Client) ([]json.RawMessage, error)
- func (om *OllamaModel) Params(ctx context.Context, cli *http.Client) (map[string]any, error)
- func (om *OllamaModel) SearchLayers(mediaTypeRegex *regexp.Regexp) []OllamaModelLayer
- func (om *OllamaModel) String() string
- func (om *OllamaModel) System(ctx context.Context, cli *http.Client) (string, error)
- func (om *OllamaModel) Template(ctx context.Context, cli *http.Client) (string, error)
- func (om *OllamaModel) WebPageURL() *url.URL
- type OllamaModelLayer
- type OllamaModelOption
- func SetOllamaModelBaseURL(baseURL string) OllamaModelOption
- func SetOllamaModelDefaultNamespace(namespace string) OllamaModelOption
- func SetOllamaModelDefaultRegistry(registry string) OllamaModelOption
- func SetOllamaModelDefaultScheme(scheme string) OllamaModelOption
- func SetOllamaModelDefaultTag(tag string) OllamaModelOption
- type SizeScalar
- type StableDiffusionCppRunDeviceUsage
- type StableDiffusionCppRunEstimate
- func (e StableDiffusionCppRunEstimate) Summarize(mmap bool, nonUMARamFootprint, nonUMAVramFootprint uint64) (es StableDiffusionCppRunEstimateSummary)
- func (e StableDiffusionCppRunEstimate) SummarizeItem(mmap bool, nonUMARamFootprint, nonUMAVramFootprint uint64) (emi StableDiffusionCppRunEstimateSummaryItem)
- type StableDiffusionCppRunEstimateMemory
- type StableDiffusionCppRunEstimateSummary
- type StableDiffusionCppRunEstimateSummaryItem
Constants ¶
const ( // GGMLTensorSize is the size of GGML tensor in bytes, // see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/include/ggml/ggml.h#L606. GGMLTensorSize = 368 // GGMLObjectSize is the size of GGML object in bytes, // see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/include/ggml/ggml.h#L563. GGMLObjectSize = 32 )
GGML tensor constants.
const ( // GGMLComputationGraphSize is the size of GGML computation graph in bytes. GGMLComputationGraphSize = 80 // GGMLComputationGraphNodesMaximum is the maximum nodes of the computation graph, // see https://github.com/ggerganov/llama.cpp/blob/7672adeec7a79ea271058c63106c142ba84f951a/llama.cpp#L103. GGMLComputationGraphNodesMaximum = 8192 // GGMLComputationGraphNodesDefault is the default nodes of the computation graph, // see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/include/ggml/ggml.h#L237. GGMLComputationGraphNodesDefault = 2048 )
GGML computation graph constants.
const ( OllamaDefaultScheme = "https" OllamaDefaultRegistry = "registry.ollama.ai" OllamaDefaultNamespace = "library" OllamaDefaultTag = "latest" )
Variables ¶
var ( ErrGGUFFileCacheDisabled = errors.New("GGUF file cache disabled") ErrGGUFFileCacheMissed = errors.New("GGUF file cache missed") ErrGGUFFileCacheCorrupted = errors.New("GGUF file cache corrupted") )
var ( ErrOllamaInvalidModel = errors.New("ollama invalid model") ErrOllamaBaseLayerNotFound = errors.New("ollama base layer not found") )
var ErrGGUFFileInvalidFormat = errors.New("invalid GGUF format")
var GGUFBytesScalarStringInMiBytes bool
GGUFBytesScalarStringInMiBytes is the flag to show the GGUFBytesScalar string in MiB.
var GGUFFilenameRegex = regexp.MustCompile(`^(?P<BaseName>[A-Za-z\s][A-Za-z0-9._\s]*(?:(?:-(?:(?:[A-Za-z\s][A-Za-z0-9._\s]*)|(?:[0-9._\s]*)))*))-(?:(?P<SizeLabel>(?:\d+x)?(?:\d+\.)?\d+[A-Za-z](?:-[A-Za-z]+(\d+\.)?\d+[A-Za-z]+)?)(?:-(?P<FineTune>[A-Za-z][A-Za-z0-9\s_-]+[A-Za-z](?i:[^BFKIQ])))?)?(?:-(?P<Version>[vV]\d+(?:\.\d+)*))?(?i:-(?P<Encoding>(BF16|F32|F16|([KI]?Q[0-9][A-Z0-9_]*))))?(?:-(?P<Type>LoRA|vocab))?(?:-(?P<Shard>\d{5})-of-(?P<ShardTotal>\d{5}))?\.gguf$`) // nolint:lll
var ShardGGUFFilenameRegex = regexp.MustCompile(`^(?P<Prefix>.*)-(?:(?P<Shard>\d{5})-of-(?P<ShardTotal>\d{5}))\.gguf$`)
Functions ¶
func CompleteShardGGUFFilename ¶ added in v0.7.2
CompleteShardGGUFFilename returns the list of shard GGUF filenames that are related to the given shard GGUF filename.
Only available if the given filename is a shard GGUF filename.
func DefaultCachePath ¶ added in v0.7.2
func DefaultCachePath() string
DefaultCachePath returns the default cache path.
func GGMLComputationGraphOverhead ¶
GGMLComputationGraphOverhead is the overhead of GGML graph in bytes, see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/src/ggml.c#L18905-L18917.
func GGMLHashSize ¶
GGMLHashSize returns the size of the hash table for the given base, see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/src/ggml.c#L17698-L17722.
func GGMLMemoryPadding ¶
GGMLMemoryPadding returns the padded size of the given size according to GGML memory padding, see https://github.com/ggerganov/ggml/blob/0cbb7c0/include/ggml/ggml.h#L238-L243.
func GGMLPadding ¶
GGMLPadding returns the padded size of the given size according to given align, see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/include/ggml/ggml.h#L255.
func GGMLTensorOverhead ¶
func GGMLTensorOverhead() uint64
GGMLTensorOverhead is the overhead of GGML tensor in bytes, see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/src/ggml.c#L2765-L2767.
func GuessFLUXDiffusionModelMemoryUsage ¶ added in v0.13.3
GuessFLUXDiffusionModelMemoryUsage returns the memory usage in bytes for the given width and height, which is calculated by linear regression or polynomial regression.
func GuessSD1DiffusionModelMemoryUsage ¶ added in v0.13.3
GuessSD1DiffusionModelMemoryUsage returns the memory usage in bytes for the given width and height, which is calculated by linear regression or polynomial regression.
func GuessSD2DiffusionModelMemoryUsage ¶ added in v0.13.3
GuessSD2DiffusionModelMemoryUsage returns the memory usage in bytes for the given width and height, which is calculated by linear regression or polynomial regression.
func GuessSD35LargeDiffusionModelMemoryUsage ¶ added in v0.13.3
GuessSD35LargeDiffusionModelMemoryUsage returns the memory usage in bytes for the given width and height, which is calculated by linear regression or polynomial regression.
func GuessSD35MediumDiffusionModelMemoryUsage ¶ added in v0.13.3
GuessSD35MediumDiffusionModelMemoryUsage returns the memory usage in bytes for the given width and height, which is calculated by linear regression or polynomial regression.
func GuessSD3MediumDiffusionModelMemoryUsage ¶ added in v0.13.3
GuessSD3MediumDiffusionModelMemoryUsage returns the memory usage in bytes for the given width and height, which is calculated by linear regression or polynomial regression.
func GuessSDXLDiffusionModelMemoryUsage ¶ added in v0.13.3
GuessSDXLDiffusionModelMemoryUsage returns the memory usage in bytes for the given width and height, which is calculated by linear regression or polynomial regression.
func GuessSDXLRefinerDiffusionModelMemoryUsage ¶ added in v0.13.3
GuessSDXLRefinerDiffusionModelMemoryUsage returns the memory usage in bytes for the given width and height, which is calculated by linear regression or polynomial regression.
func IsShardGGUFFilename ¶ added in v0.7.2
IsShardGGUFFilename returns true if the given filename is a shard GGUF filename.
func OllamaRegistryAuthorize ¶ added in v0.6.1
func OllamaRegistryAuthorize(ctx context.Context, cli *http.Client, authnToken string) (string, error)
OllamaRegistryAuthorize authorizes the request with the given authentication token, and returns the authorization token.
func OllamaRegistryAuthorizeRetry ¶ added in v0.6.1
OllamaRegistryAuthorizeRetry returns true if the request should be retried with authorization.
OllamaRegistryAuthorizeRetry leverages OllamaRegistryAuthorize to obtain an authorization token, and configures the request with the token.
func OllamaSingKeyLoad ¶ added in v0.6.1
OllamaSingKeyLoad loads the signing key for Ollama, generating a new key if one does not exist.
func OllamaUserAgent ¶ added in v0.6.1
func OllamaUserAgent() string
OllamaUserAgent returns the user agent string for Ollama. Since llama3.1, the user agent is required to be set; otherwise the request will be rejected with a 412 response.
func ValueNumeric ¶
func ValueNumeric[T constraints.Integer | constraints.Float](kv GGUFMetadataKV) T
ValueNumeric returns the numeric value of the GGUFMetadataKV, and panics if the value type is not numeric.
ValueNumeric is a generic function, and the type T must be constraints.Integer or constraints.Float.
Compared to the GGUFMetadataKV's Value* functions, ValueNumeric casts the original value to the target type.
func ValuesNumeric ¶
func ValuesNumeric[T constraints.Integer | constraints.Float](av GGUFMetadataKVArrayValue) []T
ValuesNumeric returns the numeric values of the GGUFMetadataKVArrayValue, and panics if the value type is not numeric.
ValuesNumeric is a generic function, and the type T must be constraints.Integer or constraints.Float.
Compared to the GGUFMetadataKVArrayValue's Values* functions, ValuesNumeric casts the original values to the target type.
Types ¶
type BytesPerSecondScalar ¶ added in v0.10.0
type BytesPerSecondScalar uint64
BytesPerSecondScalar is the scalar for bytes per second (Bps).
func ParseBytesPerSecondScalar ¶ added in v0.10.0
func ParseBytesPerSecondScalar(s string) (_ BytesPerSecondScalar, err error)
ParseBytesPerSecondScalar parses the BytesPerSecondScalar from the string.
func (BytesPerSecondScalar) String ¶ added in v0.10.0
func (s BytesPerSecondScalar) String() string
type FLOPSScalar ¶ added in v0.10.0
type FLOPSScalar uint64
FLOPSScalar is the scalar for FLOPS.
func ParseFLOPSScalar ¶ added in v0.10.0
func ParseFLOPSScalar(s string) (_ FLOPSScalar, err error)
ParseFLOPSScalar parses the FLOPSScalar from the string.
func (FLOPSScalar) String ¶ added in v0.10.0
func (s FLOPSScalar) String() string
type GGMLType ¶
type GGMLType uint32
GGMLType is a type of GGML tensor, see https://github.com/ggerganov/llama.cpp/blob/b34e02348064c2f0cef1f89b44d9bee4eb15b9e7/ggml/include/ggml.h#L363-L401.
const ( GGMLTypeF32 GGMLType = iota GGMLTypeF16 GGMLTypeQ4_0 GGMLTypeQ4_1 GGMLTypeQ4_2 GGMLTypeQ4_3 GGMLTypeQ5_0 GGMLTypeQ5_1 GGMLTypeQ8_0 GGMLTypeQ8_1 GGMLTypeQ2_K GGMLTypeQ3_K GGMLTypeQ4_K GGMLTypeQ5_K GGMLTypeQ6_K GGMLTypeQ8_K GGMLTypeIQ2_XXS GGMLTypeIQ2_XS GGMLTypeIQ3_XXS GGMLTypeIQ1_S GGMLTypeIQ4_NL GGMLTypeIQ3_S GGMLTypeIQ2_S GGMLTypeIQ4_XS GGMLTypeI8 GGMLTypeI16 GGMLTypeI32 GGMLTypeI64 GGMLTypeF64 GGMLTypeIQ1_M GGMLTypeBF16 GGMLTypeQ4_0_4_4 GGMLTypeQ4_0_4_8 GGMLTypeQ4_0_8_8 GGMLTypeTQ1_0 GGMLTypeTQ2_0 GGMLTypeIQ4_NL_4_4 GGMLTypeIQ4_NL_4_8 GGMLTypeIQ4_NL_8_8 )
GGMLType constants.
GGMLTypeQ4_2, GGMLTypeQ4_3 are deprecated.
func (GGMLType) RowSizeOf ¶
RowSizeOf returns the size of the given dimensions according to the GGMLType's GGMLTypeTrait, which is inspired by https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/src/ggml.c#L3142-L3145.
The index into the given dimensions identifies the dimension, i.e. index 0 is the first dimension, index 1 is the second dimension, and so on.
The value of the item is the number of elements in the corresponding dimension.
func (GGMLType) Trait ¶
func (t GGMLType) Trait() (GGMLTypeTrait, bool)
Trait returns the GGMLTypeTrait of the GGMLType.
type GGMLTypeTrait ¶
type GGMLTypeTrait struct { BlockSize uint64 // Original is int, in order to reduce conversion, here we use uint64. TypeSize uint64 // Original is uint32, in order to reduce conversion, here we use uint64. Quantized bool }
GGMLTypeTrait holds the trait of a GGMLType, see https://github.com/ggerganov/llama.cpp/blob/b34e02348064c2f0cef1f89b44d9bee4eb15b9e7/ggml/src/ggml.c#L663-L1082.
type GGUFArchitecture ¶ added in v0.8.0
type GGUFArchitecture struct { // Type describes the type of the file, // default is "model". Type string `json:"type"` // Architecture describes what architecture this model implements. // // All lowercase ASCII. Architecture string `json:"architecture"` // MaximumContextLength(n_ctx_train) is the maximum context length of the model. // // For most architectures, this is the hard limit on the length of the input. // Architectures, like RWKV, // that are not reliant on transformer-style attention may be able to handle larger inputs, // but this is not guaranteed. MaximumContextLength uint64 `json:"maximumContextLength,omitempty"` // EmbeddingLength(n_embd) is the length of the embedding layer. EmbeddingLength uint64 `json:"embeddingLength,omitempty"` // BlockCount(n_layer) is the number of blocks of attention and feed-forward layers, // i.e. the bulk of the LLM. // This does not include the input or embedding layers. BlockCount uint64 `json:"blockCount,omitempty"` // FeedForwardLength(n_ff) is the length of the feed-forward layer. FeedForwardLength uint64 `json:"feedForwardLength,omitempty"` // ExpertFeedForwardLength(expert_feed_forward_length) is the length of the feed-forward layer in the expert model. ExpertFeedForwardLength uint64 `json:"expertFeedForwardLength,omitempty"` ExpertSharedFeedForwardLength uint64 `json:"expertSharedFeedForwardLength,omitempty"` // ExpertCount(n_expert) is the number of experts in MoE models. ExpertCount uint32 `json:"expertCount,omitempty"` // ExpertUsedCount(n_expert_used) is the number of experts used during each token evaluation in MoE models. ExpertUsedCount uint32 `json:"expertUsedCount,omitempty"` // AttentionHeadCount(n_head) is the number of attention heads. AttentionHeadCount uint64 `json:"attentionHeadCount,omitempty"` // AttentionHeadCountKV(n_head_kv) is the number of attention heads per group used in Grouped-Query-Attention. 
// // If not provided or equal to AttentionHeadCount, // the model does not use Grouped-Query-Attention. AttentionHeadCountKV uint64 `json:"attentionHeadCountKV,omitempty"` // AttentionMaxALiBIBias is the maximum bias to use for ALiBI. AttentionMaxALiBIBias float32 `json:"attentionMaxALiBIBias,omitempty"` // AttentionClampKQV describes a value `C`, // which is used to clamp the values of the `Q`, `K` and `V` tensors between `[-C, C]`. AttentionClampKQV float32 `json:"attentionClampKQV,omitempty"` // AttentionLayerNormEpsilon is the epsilon value used in the LayerNorm(Layer Normalization). AttentionLayerNormEpsilon float32 `json:"attentionLayerNormEpsilon,omitempty"` // AttentionLayerNormRMSEpsilon is the epsilon value used in the RMSNorm(Root Mean Square Layer Normalization), // which is a simplification of the original LayerNorm. AttentionLayerNormRMSEpsilon float32 `json:"attentionLayerNormRMSEpsilon,omitempty"` // AttentionKeyLength(n_embd_head_k) is the size of a key head. // // Defaults to `EmbeddingLength / AttentionHeadCount`. AttentionKeyLength uint32 `json:"attentionKeyLength,omitempty"` // AttentionValueLength(n_embd_head_v) is the size of a value head. // // Defaults to `EmbeddingLength / AttentionHeadCount`. AttentionValueLength uint32 `json:"attentionValueLength,omitempty"` // AttentionCausal is true if the attention is causal. AttentionCausal bool `json:"attentionCausal,omitempty"` // RoPEDimensionCount is the number of dimensions in the RoPE(Rotary Positional Encoding). RoPEDimensionCount uint64 `json:"ropeDimensionCount,omitempty"` // RoPEFrequencyBase is the base frequency of the RoPE. RoPEFrequencyBase float32 `json:"ropeFrequencyBase,omitempty"` // RoPEScalingType is the scaling type of the RoPE. RoPEScalingType string `json:"ropeScalingType,omitempty"` // RoPEScalingFactor is the scaling factor of the RoPE. 
RoPEScalingFactor float32 `json:"ropeScalingFactor,omitempty"` // RoPEScalingOriginalContextLength is the original context length of the RoPE scaling. RoPEScalingOriginalContextLength uint64 `json:"ropeScalingOriginalContextLength,omitempty"` // RoPEScalingFinetuned is true if the RoPE scaling is fine-tuned. RoPEScalingFinetuned bool `json:"ropeScalingFinetuned,omitempty"` // SSMConvolutionKernel is the size of the convolution kernel used in the SSM(Selective State Space Model). SSMConvolutionKernel uint32 `json:"ssmConvolutionKernel,omitempty"` // SSMInnerSize is the embedding size of the state in SSM. SSMInnerSize uint32 `json:"ssmInnerSize,omitempty"` // SSMStateSize is the size of the recurrent state in SSM. SSMStateSize uint32 `json:"ssmStateSize,omitempty"` // SSMTimeStepRank is the rank of the time steps in SSM. SSMTimeStepRank uint32 `json:"ssmTimeStepRank,omitempty"` // VocabularyLength is the size of the vocabulary. // // VocabularyLength is the same as the tokenizer's token size. VocabularyLength uint64 `json:"vocabularyLength,omitempty"` // EmbeddingGQA is the GQA of the embedding layer. EmbeddingGQA uint64 `json:"embeddingGQA,omitempty"` // EmbeddingKeyGQA is the number of key GQA in the embedding layer. EmbeddingKeyGQA uint64 `json:"embeddingKeyGQA,omitempty"` // EmbeddingValueGQA is the number of value GQA in the embedding layer. EmbeddingValueGQA uint64 `json:"embeddingValueGQA,omitempty"` // ClipProjectorType is the type of the projector used in the clip model. // // Only used when Architecture is "clip". ClipProjectorType string `json:"clipProjectorType,omitempty"` // ClipHasLLaVAProjector indicates whether the clip model has LLaVA projector or not. // // Only used when Architecture is "clip". ClipHasLLaVAProjector bool `json:"clipHasLLaVAProjector,omitempty"` // ClipHasMiniCPMVProjector indicates whether the clip model has MiniCPMV projector or not. // // Only used when Architecture is "clip". 
ClipHasMiniCPMVProjector bool `json:"clipHasMiniCPMVProject,omitempty"` // ClipMiniCPMVVersion is the version of the MiniCPMV projector. // // Only used when Architecture is "clip" and ClipHasMiniCPMVProjector is true. ClipMiniCPMVVersion int32 `json:"clipMiniCPMVVersion,omitempty"` // ClipHasQwen2VLMerger indicates whether the clip model has Qwen2VL merger or not. // // Only used when Architecture is "clip". ClipHasQwen2VLMerger bool `json:"clipHasQwen2VLMerger,omitempty"` // ClipHasTextEncoder indicates whether the clip model has text encoder or not. // // Only used when Architecture is "clip". ClipHasTextEncoder bool `json:"clipHasTextEncoder,omitempty"` // ClipHasVisionEncoder indicates whether the clip model has vision encoder or not. // // Only used when Architecture is "clip". ClipHasVisionEncoder bool `json:"clipHasVisionEncoder,omitempty"` // ClipVisionImageSize indicates the image size of vision encoder. // // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. ClipVisionImageSize uint32 `json:"clipVisionImageSize,omitempty"` // ClipVisionPatchSize indicates the patch size of vision encoder. // // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. ClipVisionPatchSize uint32 `json:"clipVisionPatchSize,omitempty"` // ClipVisionProjectionDim indicates the projection dimension of vision encoder. // // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. ClipVisionProjectionDim uint32 `json:"clipVisionProjectionDim,omitempty"` // ClipVisionMMPatchMergeType indicates the merge type of the vision encoder. // // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. ClipVisionMMPatchMergeType string `json:"clipVisionMMPatchMergeType,omitempty"` // AdapterType is the type of the adapter. // // Only used when Architecture is "adapter". AdapterType string `json:"adapterType,omitempty"` // AdapterLoRAAlpha is the alpha value of the LoRA adapter. // // Only used when AdapterType is "lora". 
AdapterLoRAAlpha float32 `json:"adapterLoRAAlpha,omitempty"` // AdapterControlVectorLayerCount is the number of layers in the control vector. // // Only used when Architecture is "control_vector". AdapterControlVectorLayerCount uint32 `json:"adapterControlVectorLayerCount,omitempty"` // DiffusionArchitecture is the actual architecture of the diffusion model. // // Only used when Architecture is "diffusion". DiffusionArchitecture string `json:"diffusionArchitecture,omitempty"` // DiffusionTransformer indicates whether the diffusion model is a diffusion transformer or not. // DiffusionTransformer bool `json:"diffusionTransformer,omitempty"` // DiffusionConditioners is the list of diffusion conditioners. // // Only used when Architecture is "diffusion". DiffusionConditioners GGUFArchitectureDiffusionConditioners `json:"diffusionConditioners,omitempty"` // DiffusionAutoencoder represents the autoencoder of the diffusion model. // // Only used when Architecture is "diffusion". DiffusionAutoencoder *GGUFArchitectureDiffusionAutoencoder `json:"diffusionAutoencoder,omitempty"` }
GGUFArchitecture represents the architecture metadata of a GGUF file.
func (GGUFArchitecture) DiffusionHasAutoencoder ¶ added in v0.13.0
func (ga GGUFArchitecture) DiffusionHasAutoencoder() bool
DiffusionHasAutoencoder returns true if the diffusion model has an autoencoder.
func (GGUFArchitecture) DiffusionHasConditioners ¶ added in v0.13.0
func (ga GGUFArchitecture) DiffusionHasConditioners() bool
DiffusionHasConditioners returns true if the diffusion model has conditioners.
type GGUFArchitectureDiffusionAutoencoder ¶ added in v0.13.0
type GGUFArchitectureDiffusionAutoencoder struct { // Architecture is the architecture of the diffusion autoencoder. // // Currently, only "VAE" is supported. Architecture string `json:"architecture"` // FileType describes the type of the majority of the tensors in the GGUF file. FileType GGUFFileType `json:"fileType"` }
GGUFArchitectureDiffusionAutoencoder represents the autoencoder metadata of the diffusion architecture.
func (GGUFArchitectureDiffusionAutoencoder) String ¶ added in v0.13.0
func (gaa GGUFArchitectureDiffusionAutoencoder) String() string
type GGUFArchitectureDiffusionConditioner ¶ added in v0.13.0
type GGUFArchitectureDiffusionConditioner struct { // Architecture is the architecture of the diffusion conditioner. Architecture string `json:"architecture"` // FileType describes the type of the majority of the tensors in the GGUF file. FileType GGUFFileType `json:"fileType"` }
GGUFArchitectureDiffusionConditioner represents the conditioner metadata of the diffusion architecture.
func (GGUFArchitectureDiffusionConditioner) String ¶ added in v0.13.0
func (gac GGUFArchitectureDiffusionConditioner) String() string
type GGUFArchitectureDiffusionConditioners ¶ added in v0.13.0
type GGUFArchitectureDiffusionConditioners []GGUFArchitectureDiffusionConditioner
GGUFArchitectureDiffusionConditioners is the list of GGUFArchitectureDiffusionConditioner.
func (GGUFArchitectureDiffusionConditioners) String ¶ added in v0.13.0
func (gacs GGUFArchitectureDiffusionConditioners) String() string
type GGUFBitsPerWeightScalar ¶
type GGUFBitsPerWeightScalar float64
GGUFBitsPerWeightScalar is the scalar for bits per weight.
func (GGUFBitsPerWeightScalar) String ¶
func (s GGUFBitsPerWeightScalar) String() string
type GGUFBytesScalar ¶
type GGUFBytesScalar uint64
GGUFBytesScalar is the scalar for bytes.
func ParseGGUFBytesScalar ¶ added in v0.10.0
func ParseGGUFBytesScalar(s string) (_ GGUFBytesScalar, err error)
ParseGGUFBytesScalar parses the GGUFBytesScalar from the string.
func (GGUFBytesScalar) String ¶
func (s GGUFBytesScalar) String() string
type GGUFFile ¶
type GGUFFile struct { // Header is the header of the GGUF file. Header GGUFHeader `json:"header"` // TensorInfos are the tensor infos of the GGUF file, // the size of TensorInfos is equal to `Header.TensorCount`. TensorInfos GGUFTensorInfos `json:"tensorInfos"` // Padding is the padding size of the GGUF file, // which is used to split Header and TensorInfos from tensor data. Padding int64 `json:"padding"` // SplitPaddings holds the padding size slice of the GGUF file splits, // each item represents splitting Header and TensorInfos from tensor data. // // The length of SplitPaddings is the number of split files. SplitPaddings []int64 `json:"splitPaddings,omitempty"` // TensorDataStartOffset is the offset in bytes of the tensor data in this file. // // The offset is the start of the file. TensorDataStartOffset int64 `json:"tensorDataStartOffset"` // SplitTensorDataStartOffsets holds the offset slice in bytes of the tensor data of the GGUF file splits, // each item represents the offset of the tensor data in the split file. // // The length of SplitTensorDataStartOffsets is the number of split files. SplitTensorDataStartOffsets []int64 `json:"splitTensorDataStartOffsets,omitempty"` // Size is the size of the GGUF file, // if the file is split, the size is the sum of all split files. Size GGUFBytesScalar `json:"size"` // SplitSizes holds the size slice of the GGUF file splits, // each item represents the size of the split file. // // The length of SplitSizes is the number of split files. SplitSizes []GGUFBytesScalar `json:"splitSizes,omitempty"` // ModelSize is the size of the model when loading. ModelSize GGUFBytesScalar `json:"modelSize"` // SplitModelSizes holds the size slice of the model, // each item represents a size when loading of the split file. // // The length of SplitModelSizes is the number of split files. SplitModelSizes []GGUFBytesScalar `json:"splitModelSizes,omitempty"` // ModelParameters is the number of the model parameters. 
ModelParameters GGUFParametersScalar `json:"modelParameters"` // ModelBitsPerWeight is the bits per weight of the model, // which describes how many bits are used to store a weight, // higher is better. ModelBitsPerWeight GGUFBitsPerWeightScalar `json:"modelBitsPerWeight"` }
GGUFFile represents a GGUF file, see https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#file-structure.
Compared with the complete GGUF file, this structure lacks the tensor data part.
func ParseGGUFFile ¶
func ParseGGUFFile(path string, opts ...GGUFReadOption) (*GGUFFile, error)
ParseGGUFFile parses a GGUF file from the given local path, and returns the GGUFFile, or an error if any.
func ParseGGUFFileFromHuggingFace ¶
func ParseGGUFFileFromHuggingFace(ctx context.Context, repo, file string, opts ...GGUFReadOption) (*GGUFFile, error)
ParseGGUFFileFromHuggingFace parses a GGUF file from Hugging Face (https://huggingface.co/), and returns a GGUFFile, or an error if any.
func ParseGGUFFileFromModelScope ¶
func ParseGGUFFileFromModelScope(ctx context.Context, repo, file string, opts ...GGUFReadOption) (*GGUFFile, error)
ParseGGUFFileFromModelScope parses a GGUF file from ModelScope (https://modelscope.cn/), and returns a GGUFFile, or an error if any.
func ParseGGUFFileFromOllama ¶
func ParseGGUFFileFromOllama(ctx context.Context, model string, opts ...GGUFReadOption) (*GGUFFile, error)
ParseGGUFFileFromOllama parses a GGUF file from Ollama model's base layer, and returns a GGUFFile, or an error if any.
func ParseGGUFFileFromOllamaModel ¶
func ParseGGUFFileFromOllamaModel(ctx context.Context, model *OllamaModel, opts ...GGUFReadOption) (gf *GGUFFile, err error)
ParseGGUFFileFromOllamaModel is similar to ParseGGUFFileFromOllama, but inputs an OllamaModel instead of a string.
The given OllamaModel will be completed (fetching MediaType, Config and Layers) after calling this function.
func ParseGGUFFileRemote ¶
func ParseGGUFFileRemote(ctx context.Context, url string, opts ...GGUFReadOption) (gf *GGUFFile, err error)
ParseGGUFFileRemote parses a GGUF file from a remote BlobURL, and returns a GGUFFile, or an error if any.
func (*GGUFFile) Architecture ¶
func (gf *GGUFFile) Architecture() (ga GGUFArchitecture)
Architecture returns the architecture metadata of the GGUF file.
func (*GGUFFile) EstimateLLaMACppRun ¶ added in v0.9.0
func (gf *GGUFFile) EstimateLLaMACppRun(opts ...GGUFRunEstimateOption) (e LLaMACppRunEstimate)
EstimateLLaMACppRun returns the inference estimated result of the GGUF file.
func (*GGUFFile) EstimateStableDiffusionCppRun ¶ added in v0.13.0
func (gf *GGUFFile) EstimateStableDiffusionCppRun(opts ...GGUFRunEstimateOption) (e StableDiffusionCppRunEstimate)
func (*GGUFFile) Layers ¶
func (gf *GGUFFile) Layers(ignores ...string) GGUFLayerTensorInfos
Layers converts the GGUFTensorInfos to GGUFLayerTensorInfos.
func (*GGUFFile) Metadata ¶ added in v0.8.0
func (gf *GGUFFile) Metadata() (gm GGUFMetadata)
Metadata returns the metadata of the GGUF file.
func (*GGUFFile) Tokenizer ¶
func (gf *GGUFFile) Tokenizer() (gt GGUFTokenizer)
Tokenizer returns the tokenizer metadata of a GGUF file.
type GGUFFileCache ¶
type GGUFFileCache string
func (GGUFFileCache) Delete ¶
func (c GGUFFileCache) Delete(key string) error
type GGUFFileType ¶
type GGUFFileType uint32
GGUFFileType is a type of GGUF file, see https://github.com/ggerganov/llama.cpp/blob/278d0e18469aacf505be18ce790a63c7cc31be26/ggml/include/ggml.h#L404-L433.
const ( GGUFFileTypeAllF32 GGUFFileType = iota // F32 GGUFFileTypeMostlyF16 // F16 GGUFFileTypeMostlyQ4_0 // Q4_0 GGUFFileTypeMostlyQ4_1 // Q4_1 GGUFFileTypeMostlyQ4_1_F16 // Q4_1_F16 GGUFFileTypeMostlyQ4_2 // Q4_2 GGUFFileTypeMostlyQ4_3 // Q4_3 GGUFFileTypeMostlyQ8_0 // Q8_0 GGUFFileTypeMostlyQ5_0 // Q5_0 GGUFFileTypeMostlyQ5_1 // Q5_1 GGUFFileTypeMostlyQ2_K // Q2_K GGUFFileTypeMostlyQ3_K // Q3_K/Q3_K_S GGUFFileTypeMostlyQ4_K // Q4_K/Q3_K_M GGUFFileTypeMostlyQ5_K // Q5_K/Q3_K_L GGUFFileTypeMostlyQ6_K // Q6_K/Q4_K_S GGUFFileTypeMostlyIQ2_XXS // IQ2_XXS/Q4_K_M GGUFFileTypeMostlyIQ2_XS // IQ2_XS/Q5_K_S GGUFFileTypeMostlyIQ3_XXS // IQ3_XXS/Q5_K_M GGUFFileTypeMostlyIQ1_S // IQ1_S/Q6_K GGUFFileTypeMostlyIQ4_NL // IQ4_NL GGUFFileTypeMostlyIQ3_S // IQ3_S GGUFFileTypeMostlyIQ2_S // IQ2_S GGUFFileTypeMostlyIQ4_XS // IQ4_XS GGUFFileTypeMostlyIQ1_M // IQ1_M GGUFFileTypeMostlyBF16 // BF16 GGUFFileTypeMostlyQ4_0_4_4 // Q4_0_4x4 GGUFFileTypeMostlyQ4_0_4_8 // Q4_0_4x8 GGUFFileTypeMostlyQ4_0_8_8 // Q4_0_8x8 GGUFFileTypeMostlyTQ1_0 // TQ1_0 GGUFFileTypeMostlyTQ2_0 // TQ2_0 GGUFFileTypeMostlyIQ4_NL_4_4 // IQ4_NL_4x4 GGUFFileTypeMostlyIQ4_NL_4_8 // IQ4_NL_4x8 GGUFFileTypeMostlyIQ4_NL_8_8 // IQ4_NL_8x8 )
GGUFFileType constants.
GGUFFileTypeMostlyQ4_2, GGUFFileTypeMostlyQ4_3 are deprecated.
GGUFFileTypeMostlyQ4_1_F16 is a special case where the majority of the tensors are Q4_1, but 'token_embd.weight' and 'output.weight' tensors are F16.
func GetFileType ¶ added in v0.13.0
func GetFileType(cm map[GGMLType]int) GGUFFileType
GetFileType returns the GGUFFileType that represents the GGMLType used by most of the tensors in the given tensor counter.
The input `cm` is a map of GGMLType to the count of tensors of that type.
func (GGUFFileType) GGMLType ¶
func (t GGUFFileType) GGMLType() GGMLType
GGMLType returns the GGMLType of the GGUFFileType, which is inspired by https://github.com/ggerganov/ggml/blob/a10a8b880c059b3b29356eb9a9f8df72f03cdb6a/src/ggml.c#L2730-L2763.
func (GGUFFileType) String ¶
func (i GGUFFileType) String() string
type GGUFFilename ¶
type GGUFFilename struct { BaseName string `json:"baseName"` SizeLabel string `json:"sizeLabel"` FineTune string `json:"fineTune"` Version string `json:"version"` Encoding string `json:"encoding"` Type string `json:"type"` Shard *int `json:"shard,omitempty"` ShardTotal *int `json:"shardTotal,omitempty"` }
GGUFFilename represents a GGUF filename, see https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#gguf-naming-convention.
func ParseGGUFFilename ¶
func ParseGGUFFilename(name string) *GGUFFilename
ParseGGUFFilename parses the given GGUF filename string, and returns the GGUFFilename, or nil if the filename is invalid.
func (GGUFFilename) IsShard ¶ added in v0.7.2
func (gn GGUFFilename) IsShard() bool
IsShard returns true if the GGUF filename is a shard.
func (GGUFFilename) String ¶
func (gn GGUFFilename) String() string
type GGUFHeader ¶
type GGUFHeader struct { // Magic is a magic number that announces that this is a GGUF file. Magic GGUFMagic `json:"magic"` // Version is a version of the GGUF file format. Version GGUFVersion `json:"version"` // TensorCount is the number of tensors in the file. TensorCount uint64 `json:"tensorCount"` // MetadataKVCount is the number of key-value pairs in the metadata. MetadataKVCount uint64 `json:"metadataKVCount"` // MetadataKV are the key-value pairs in the metadata. MetadataKV GGUFMetadataKVs `json:"metadataKV"` }
GGUFHeader represents the header of a GGUF file.
type GGUFLayerTensorInfos ¶
type GGUFLayerTensorInfos []IGGUFTensorInfos
GGUFLayerTensorInfos represents the hierarchical tensor infos of a GGUF file. It can hold GGUFNamedTensorInfos, GGUFTensorInfos, and GGUFTensorInfo items.
func (GGUFLayerTensorInfos) Bytes ¶
func (ltis GGUFLayerTensorInfos) Bytes() uint64
Bytes returns the number of bytes of the GGUFLayerTensorInfos.
func (GGUFLayerTensorInfos) Count ¶
func (ltis GGUFLayerTensorInfos) Count() uint64
Count returns the number of GGUF tensors of the GGUFLayerTensorInfos.
func (GGUFLayerTensorInfos) Cut ¶
func (ltis GGUFLayerTensorInfos) Cut(names []string) (before, after GGUFLayerTensorInfos, found bool)
Cut splits the GGUFLayerTensorInfos into two parts: it returns first the GGUFLayerTensorInfos whose names match the given names, then the GGUFLayerTensorInfos whose names do not, and finally true if the GGUFLayerTensorInfos with the given names are found, or false otherwise.
The given names support glob pattern, for example, "a*" matches "a", "ab", "abc", and so on.
func (GGUFLayerTensorInfos) Elements ¶
func (ltis GGUFLayerTensorInfos) Elements() uint64
Elements returns the number of elements of the GGUFLayerTensorInfos.
func (GGUFLayerTensorInfos) Get ¶
func (ltis GGUFLayerTensorInfos) Get(name string) (info GGUFTensorInfo, found bool)
Get returns the GGUFTensorInfo with the given name, and true if found, and false otherwise.
func (GGUFLayerTensorInfos) GetFileType ¶ added in v0.13.0
func (ltis GGUFLayerTensorInfos) GetFileType() GGUFFileType
GetFileType returns the GGUFFileType that represents the GGMLType used by most of the tensors in the GGUFLayerTensorInfos.
func (GGUFLayerTensorInfos) Index ¶
func (ltis GGUFLayerTensorInfos) Index(names []string) (infos map[string]GGUFTensorInfo, found int)
Index returns a map of the GGUFTensorInfos with the given names, keyed by name, and the number of names found.
func (GGUFLayerTensorInfos) Match ¶ added in v0.13.0
func (ltis GGUFLayerTensorInfos) Match(nameRegex *regexp.Regexp) bool
Match returns true if a tensor of GGUFLayerTensorInfos matches the given regex.
func (GGUFLayerTensorInfos) Search ¶
func (ltis GGUFLayerTensorInfos) Search(nameRegex *regexp.Regexp) (infos []GGUFTensorInfo)
Search returns a list of GGUFTensorInfo with the names that match the given regex.
type GGUFMagic ¶
type GGUFMagic uint32
GGUFMagic is a magic number of GGUF file, see https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#historical-state-of-affairs.
type GGUFMetadata ¶ added in v0.8.0
type GGUFMetadata struct { // Type describes what type this GGUF file is, // default is "model". Type string `json:"type"` // Architecture describes what architecture this GGUF file implements. // // All lowercase ASCII. Architecture string `json:"architecture"` // QuantizationVersion describes the version of the quantization format. // // Not required if the model is not quantized (i.e. no tensors are quantized). // If any tensors are quantized, this must be present. // This is separate to the quantization scheme of the tensors itself, // the quantization version may change without changing the scheme's name, // e.g. the quantization scheme is Q5_K, and the QuantizationVersion is 4. QuantizationVersion uint32 `json:"quantizationVersion,omitempty"` // Alignment describes the alignment of the GGUF file. // // This can vary to allow for different alignment schemes, but it must be a multiple of 8. // Some writers may not write the alignment. // // Default is 32. Alignment uint32 `json:"alignment"` // Name to the model. // // This should be a human-readable name that can be used to identify the GGUF file. // It should be unique within the community that the model is defined in. Name string `json:"name,omitempty"` // Author to the model. Author string `json:"author,omitempty"` // URL to the model's homepage. // // This can be a GitHub repo, a paper, etc. URL string `json:"url,omitempty"` // Description to the model. Description string `json:"description,omitempty"` // License to the model. // // This is expressed as a SPDX license expression, e.g. "MIT OR Apache-2.0". License string `json:"license,omitempty"` // FileType describes the type of the majority of the tensors in the GGUF file. FileType GGUFFileType `json:"fileType"` // LittleEndian is true if the GGUF file is little-endian, // and false for big-endian. LittleEndian bool `json:"littleEndian"` // FileSize is the size of the GGUF file in bytes. 
FileSize GGUFBytesScalar `json:"fileSize"` // Size is the model size. Size GGUFBytesScalar `json:"size"` // Parameters is the parameters of the GGUF file. Parameters GGUFParametersScalar `json:"parameters"` // BitsPerWeight is the bits per weight of the GGUF file. BitsPerWeight GGUFBitsPerWeightScalar `json:"bitsPerWeight"` }
GGUFMetadata represents the model metadata of a GGUF file.
type GGUFMetadataKV ¶
type GGUFMetadataKV struct { // Key is the key of the metadata key-value pair, // which is no larger than 64 bytes long. Key string `json:"key"` // ValueType is the type of the metadata value. ValueType GGUFMetadataValueType `json:"valueType"` // Value is the value of the metadata key-value pair. Value any `json:"value"` }
GGUFMetadataKV is a key-value pair in the metadata of a GGUF file.
func (GGUFMetadataKV) ValueArray ¶
func (kv GGUFMetadataKV) ValueArray() GGUFMetadataKVArrayValue
func (GGUFMetadataKV) ValueBool ¶
func (kv GGUFMetadataKV) ValueBool() bool
func (GGUFMetadataKV) ValueFloat32 ¶
func (kv GGUFMetadataKV) ValueFloat32() float32
func (GGUFMetadataKV) ValueFloat64 ¶
func (kv GGUFMetadataKV) ValueFloat64() float64
func (GGUFMetadataKV) ValueInt16 ¶
func (kv GGUFMetadataKV) ValueInt16() int16
func (GGUFMetadataKV) ValueInt32 ¶
func (kv GGUFMetadataKV) ValueInt32() int32
func (GGUFMetadataKV) ValueInt64 ¶
func (kv GGUFMetadataKV) ValueInt64() int64
func (GGUFMetadataKV) ValueInt8 ¶
func (kv GGUFMetadataKV) ValueInt8() int8
func (GGUFMetadataKV) ValueString ¶
func (kv GGUFMetadataKV) ValueString() string
func (GGUFMetadataKV) ValueUint16 ¶
func (kv GGUFMetadataKV) ValueUint16() uint16
func (GGUFMetadataKV) ValueUint32 ¶
func (kv GGUFMetadataKV) ValueUint32() uint32
func (GGUFMetadataKV) ValueUint64 ¶
func (kv GGUFMetadataKV) ValueUint64() uint64
func (GGUFMetadataKV) ValueUint8 ¶
func (kv GGUFMetadataKV) ValueUint8() uint8
type GGUFMetadataKVArrayValue ¶
type GGUFMetadataKVArrayValue struct { // Type is the type of the array item. Type GGUFMetadataValueType `json:"type"` // Len is the length of the array. Len uint64 `json:"len"` // Array holds all array items. Array []any `json:"array,omitempty"` // StartOffset is the offset in bytes of the GGUFMetadataKVArrayValue in the GGUFFile file. // // The offset is the start of the file. StartOffset int64 `json:"startOffset"` // Size is the size of the array in bytes. Size int64 `json:"size"` }
GGUFMetadataKVArrayValue is a value of a GGUFMetadataKV with type GGUFMetadataValueTypeArray.
func (GGUFMetadataKVArrayValue) ValuesArray ¶
func (av GGUFMetadataKVArrayValue) ValuesArray() []GGUFMetadataKVArrayValue
func (GGUFMetadataKVArrayValue) ValuesBool ¶
func (av GGUFMetadataKVArrayValue) ValuesBool() []bool
func (GGUFMetadataKVArrayValue) ValuesFloat32 ¶
func (av GGUFMetadataKVArrayValue) ValuesFloat32() []float32
func (GGUFMetadataKVArrayValue) ValuesFloat64 ¶
func (av GGUFMetadataKVArrayValue) ValuesFloat64() []float64
func (GGUFMetadataKVArrayValue) ValuesInt16 ¶
func (av GGUFMetadataKVArrayValue) ValuesInt16() []int16
func (GGUFMetadataKVArrayValue) ValuesInt32 ¶
func (av GGUFMetadataKVArrayValue) ValuesInt32() []int32
func (GGUFMetadataKVArrayValue) ValuesInt64 ¶
func (av GGUFMetadataKVArrayValue) ValuesInt64() []int64
func (GGUFMetadataKVArrayValue) ValuesInt8 ¶
func (av GGUFMetadataKVArrayValue) ValuesInt8() []int8
func (GGUFMetadataKVArrayValue) ValuesString ¶
func (av GGUFMetadataKVArrayValue) ValuesString() []string
func (GGUFMetadataKVArrayValue) ValuesUint16 ¶
func (av GGUFMetadataKVArrayValue) ValuesUint16() []uint16
func (GGUFMetadataKVArrayValue) ValuesUint32 ¶
func (av GGUFMetadataKVArrayValue) ValuesUint32() []uint32
func (GGUFMetadataKVArrayValue) ValuesUint64 ¶
func (av GGUFMetadataKVArrayValue) ValuesUint64() []uint64
func (GGUFMetadataKVArrayValue) ValuesUint8 ¶
func (av GGUFMetadataKVArrayValue) ValuesUint8() []uint8
type GGUFMetadataKVs ¶
type GGUFMetadataKVs []GGUFMetadataKV
GGUFMetadataKVs is a list of GGUFMetadataKV.
func (GGUFMetadataKVs) Get ¶
func (kvs GGUFMetadataKVs) Get(key string) (value GGUFMetadataKV, found bool)
Get returns the GGUFMetadataKV with the given key, and true if found, and false otherwise.
func (GGUFMetadataKVs) Index ¶
func (kvs GGUFMetadataKVs) Index(keys []string) (values map[string]GGUFMetadataKV, found int)
Index returns a map of the GGUFMetadataKVs with the given keys, keyed by key, and the number of keys found.
func (GGUFMetadataKVs) Search ¶
func (kvs GGUFMetadataKVs) Search(keyRegex *regexp.Regexp) (values []GGUFMetadataKV)
Search returns a list of GGUFMetadataKV with the keys that match the given regex.
type GGUFMetadataValueType ¶
type GGUFMetadataValueType uint32
GGUFMetadataValueType is a type of GGUF metadata value, see https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#file-structure.
const ( GGUFMetadataValueTypeUint8 GGUFMetadataValueType = iota GGUFMetadataValueTypeInt8 GGUFMetadataValueTypeUint16 GGUFMetadataValueTypeInt16 GGUFMetadataValueTypeUint32 GGUFMetadataValueTypeInt32 GGUFMetadataValueTypeFloat32 GGUFMetadataValueTypeBool GGUFMetadataValueTypeString GGUFMetadataValueTypeArray GGUFMetadataValueTypeUint64 GGUFMetadataValueTypeInt64 GGUFMetadataValueTypeFloat64 )
GGUFMetadataValueType constants.
func (GGUFMetadataValueType) String ¶
func (i GGUFMetadataValueType) String() string
type GGUFNamedTensorInfos ¶
type GGUFNamedTensorInfos struct { // Name is the name of the namespace. Name string `json:"name"` // GGUFLayerTensorInfos can hold GGUFNamedTensorInfos, GGUFTensorInfos, or GGUFTensorInfo. // // If an item is of type GGUFTensorInfo, it must be a leaf node. // // Branch nodes are of type GGUFNamedTensorInfos or GGUFTensorInfos, // which can be nested. // // Branch nodes are stored as pointers. GGUFLayerTensorInfos `json:"items,omitempty"` }
GGUFNamedTensorInfos is the namespace for relevant tensors, which must have a name.
type GGUFParametersScalar ¶
type GGUFParametersScalar uint64
GGUFParametersScalar is the scalar for parameters.
func (GGUFParametersScalar) String ¶
func (s GGUFParametersScalar) String() string
type GGUFReadOption ¶
type GGUFReadOption func(o *_GGUFReadOptions)
GGUFReadOption is the option for reading the file.
func SkipCache ¶
func SkipCache() GGUFReadOption
SkipCache skips the cache when reading from remote.
func SkipDNSCache ¶
func SkipDNSCache() GGUFReadOption
SkipDNSCache skips the DNS cache when reading from remote.
func SkipLargeMetadata ¶
func SkipLargeMetadata() GGUFReadOption
SkipLargeMetadata skips reading large GGUFMetadataKV items, which are not necessary for most cases.
func SkipProxy ¶
func SkipProxy() GGUFReadOption
SkipProxy skips the proxy when reading from remote.
func SkipRangeDownloadDetection ¶
func SkipRangeDownloadDetection() GGUFReadOption
SkipRangeDownloadDetection skips the range download detection when reading from remote.
func SkipTLSVerification ¶
func SkipTLSVerification() GGUFReadOption
SkipTLSVerification skips the TLS verification when reading from remote.
func UseBearerAuth ¶
func UseBearerAuth(token string) GGUFReadOption
UseBearerAuth uses the given token as a bearer auth when reading from remote.
func UseBufferSize ¶
func UseBufferSize(size int) GGUFReadOption
UseBufferSize sets the buffer size when reading from remote.
func UseCacheExpiration ¶
func UseCacheExpiration(expiration time.Duration) GGUFReadOption
UseCacheExpiration uses the given expiration to cache the remote reading result.
Disable cache expiration by setting it to 0.
func UseCachePath ¶
func UseCachePath(path string) GGUFReadOption
UseCachePath uses the given path to cache the remote reading result.
func UseProxy ¶
func UseProxy(url *url.URL) GGUFReadOption
UseProxy uses the given url as a proxy when reading from remote.
type GGUFRunDeviceMetric ¶ added in v0.13.0
type GGUFRunDeviceMetric struct { // FLOPS is the floating-point operations per second of the device. FLOPS FLOPSScalar // UpBandwidth is the bandwidth of the device to transmit data to calculate, // unit is Bps (bytes per second). UpBandwidth BytesPerSecondScalar // DownBandwidth is the bandwidth of the device to transmit calculated result to next layer, // unit is Bps (bytes per second). DownBandwidth BytesPerSecondScalar }
GGUFRunDeviceMetric holds the device metric for the estimate.
When the device represents a CPU, FLOPS refers to the floating-point operations per second of that CPU, while UpBandwidth indicates the bandwidth of the RAM (since SRAM is typically small and cannot hold all weights, the RAM here refers to the bandwidth of DRAM, unless the device's SRAM can accommodate the corresponding model weights).
When the device represents a GPU, FLOPS refers to the floating-point operations per second of that GPU, while UpBandwidth indicates the bandwidth of the VRAM.
When the device represents a specific node, FLOPS depends on whether a CPU or GPU is being used, while UpBandwidth refers to the network bandwidth between nodes.
type GGUFRunEstimateOption ¶ added in v0.13.0
type GGUFRunEstimateOption func(*_GGUFRunEstimateOptions)
GGUFRunEstimateOption is an option for the estimate.
func WithDeviceMetrics ¶ added in v0.10.0
func WithDeviceMetrics(metrics []GGUFRunDeviceMetric) GGUFRunEstimateOption
WithDeviceMetrics sets the device metrics for the estimate.
func WithFlashAttention ¶
func WithFlashAttention() GGUFRunEstimateOption
WithFlashAttention sets the flash attention flag.
func WithLLaMACppAdapters ¶ added in v0.13.0
func WithLLaMACppAdapters(adp []LLaMACppRunEstimate) GGUFRunEstimateOption
WithLLaMACppAdapters sets the adapters estimate usage.
func WithLLaMACppCacheKeyType ¶ added in v0.13.0
func WithLLaMACppCacheKeyType(t GGMLType) GGUFRunEstimateOption
WithLLaMACppCacheKeyType sets the cache key type for the estimate.
func WithLLaMACppCacheValueType ¶ added in v0.13.0
func WithLLaMACppCacheValueType(t GGMLType) GGUFRunEstimateOption
WithLLaMACppCacheValueType sets the cache value type for the estimate.
func WithLLaMACppContextSize ¶ added in v0.13.0
func WithLLaMACppContextSize(size int32) GGUFRunEstimateOption
WithLLaMACppContextSize sets the context size for the estimate.
func WithLLaMACppDrafter ¶ added in v0.13.0
func WithLLaMACppDrafter(dft *LLaMACppRunEstimate) GGUFRunEstimateOption
WithLLaMACppDrafter sets the drafter estimate usage.
func WithLLaMACppLogicalBatchSize ¶ added in v0.13.0
func WithLLaMACppLogicalBatchSize(size int32) GGUFRunEstimateOption
WithLLaMACppLogicalBatchSize sets the logical batch size for the estimate.
func WithLLaMACppOffloadLayers ¶ added in v0.13.0
func WithLLaMACppOffloadLayers(layers uint64) GGUFRunEstimateOption
WithLLaMACppOffloadLayers sets the number of layers to offload.
func WithLLaMACppPhysicalBatchSize ¶ added in v0.13.0
func WithLLaMACppPhysicalBatchSize(size int32) GGUFRunEstimateOption
WithLLaMACppPhysicalBatchSize sets the physical batch size for the estimate.
func WithLLaMACppProjector ¶ added in v0.13.0
func WithLLaMACppProjector(prj *LLaMACppRunEstimate) GGUFRunEstimateOption
WithLLaMACppProjector sets the multimodal projector estimate usage.
func WithLLaMACppSplitMode ¶ added in v0.13.0
func WithLLaMACppSplitMode(mode LLaMACppSplitMode) GGUFRunEstimateOption
WithLLaMACppSplitMode sets the split mode for the estimate.
func WithLLaMACppVisualMaxImageSize ¶ added in v0.13.7
func WithLLaMACppVisualMaxImageSize(size uint32) GGUFRunEstimateOption
WithLLaMACppVisualMaxImageSize sets the visual maximum image size input for the estimate.
func WithMainGPUIndex ¶ added in v0.7.0
func WithMainGPUIndex(di int) GGUFRunEstimateOption
WithMainGPUIndex sets the main device for the estimate.
When split mode is LLaMACppSplitModeNone, the main device is the only device. When split mode is LLaMACppSplitModeRow, the main device handles the intermediate results and KV.
WithMainGPUIndex needs to be combined with WithTensorSplitFraction.
func WithParallelSize ¶
func WithParallelSize(size int32) GGUFRunEstimateOption
WithParallelSize sets the (decoding sequences) parallel size for the estimate.
func WithRPCServers ¶ added in v0.8.0
func WithRPCServers(srvs []string) GGUFRunEstimateOption
WithRPCServers sets the RPC servers for the estimate.
func WithStableDiffusionCppAutoencoderTiling ¶ added in v0.13.0
func WithStableDiffusionCppAutoencoderTiling() GGUFRunEstimateOption
WithStableDiffusionCppAutoencoderTiling enables tiling for the autoencoder.
func WithStableDiffusionCppBatchCount ¶ added in v0.13.0
func WithStableDiffusionCppBatchCount(count int32) GGUFRunEstimateOption
WithStableDiffusionCppBatchCount sets the batch count for the estimate.
func WithStableDiffusionCppControlNet ¶ added in v0.13.0
func WithStableDiffusionCppControlNet(cn *StableDiffusionCppRunEstimate) GGUFRunEstimateOption
WithStableDiffusionCppControlNet sets the control net estimate usage.
func WithStableDiffusionCppFreeComputeMemoryImmediately ¶ added in v0.13.3
func WithStableDiffusionCppFreeComputeMemoryImmediately() GGUFRunEstimateOption
WithStableDiffusionCppFreeComputeMemoryImmediately enables freeing compute memory immediately.
func WithStableDiffusionCppHeight ¶ added in v0.13.0
func WithStableDiffusionCppHeight(height uint32) GGUFRunEstimateOption
WithStableDiffusionCppHeight sets the image height for the estimate.
func WithStableDiffusionCppOffloadLayers ¶ added in v0.13.9
func WithStableDiffusionCppOffloadLayers(layers uint64) GGUFRunEstimateOption
WithStableDiffusionCppOffloadLayers sets the number of layers to offload.
func WithStableDiffusionCppUpscaler ¶ added in v0.13.0
func WithStableDiffusionCppUpscaler(ups *StableDiffusionCppRunEstimate) GGUFRunEstimateOption
WithStableDiffusionCppUpscaler sets the upscaler estimate usage.
func WithStableDiffusionCppWidth ¶ added in v0.13.0
func WithStableDiffusionCppWidth(width uint32) GGUFRunEstimateOption
WithStableDiffusionCppWidth sets the image width for the estimate.
func WithTensorSplitFraction ¶ added in v0.7.0
func WithTensorSplitFraction(fractions []float64) GGUFRunEstimateOption
WithTensorSplitFraction sets the tensor split cumulative fractions for the estimate.
WithTensorSplitFraction accepts a slice of cumulative fractions, all fraction values must be in the range of [0, 1], and the last fraction must be 1.
For example, WithTensorSplitFraction([]float64{0.2, 0.4, 0.6, 0.8, 1}) will split the tensor into five parts with 20% each.
func WithinLLaMACppMaxContextSize ¶ added in v0.13.0
func WithinLLaMACppMaxContextSize() GGUFRunEstimateOption
WithinLLaMACppMaxContextSize limits the context size to the maximum, if the context size is over the maximum.
func WithoutLLaMACppOffloadKVCache ¶ added in v0.13.0
func WithoutLLaMACppOffloadKVCache() GGUFRunEstimateOption
WithoutLLaMACppOffloadKVCache disables offloading the KV cache.
func WithoutStableDiffusionCppOffloadAutoencoder ¶ added in v0.13.0
func WithoutStableDiffusionCppOffloadAutoencoder() GGUFRunEstimateOption
WithoutStableDiffusionCppOffloadAutoencoder disables offloading the autoencoder.
func WithoutStableDiffusionCppOffloadConditioner ¶ added in v0.13.0
func WithoutStableDiffusionCppOffloadConditioner() GGUFRunEstimateOption
WithoutStableDiffusionCppOffloadConditioner disables offloading the conditioner(text encoder).
type GGUFTensorInfo ¶
type GGUFTensorInfo struct { // Name is the name of the tensor, // which is no larger than 64 bytes long. Name string `json:"name"` // NDimensions is the number of dimensions of the tensor. NDimensions uint32 `json:"nDimensions"` // Dimensions is the dimensions of the tensor, // the length is NDimensions. Dimensions []uint64 `json:"dimensions"` // Type is the type of the tensor. Type GGMLType `json:"type"` // Offset is the offset in bytes of the tensor's data in this file. // // The offset is relative to tensor data, not to the start of the file. Offset uint64 `json:"offset"` // StartOffset is the offset in bytes of the GGUFTensorInfo in the GGUFFile file. // // The offset is the start of the file. StartOffset int64 `json:"startOffset"` }
GGUFTensorInfo represents a tensor info in a GGUF file.
func (GGUFTensorInfo) Bytes ¶
func (ti GGUFTensorInfo) Bytes() uint64
Bytes returns the number of bytes of the GGUFTensorInfo, which is inspired by https://github.com/ggerganov/ggml/blob/a10a8b880c059b3b29356eb9a9f8df72f03cdb6a/src/ggml.c#L2609-L2626.
func (GGUFTensorInfo) Count ¶
func (ti GGUFTensorInfo) Count() uint64
Count returns the number of GGUF tensors of the GGUFTensorInfo, which is always 1.
func (GGUFTensorInfo) Elements ¶
func (ti GGUFTensorInfo) Elements() uint64
Elements returns the number of elements of the GGUFTensorInfo, which is inspired by https://github.com/ggerganov/ggml/blob/a10a8b880c059b3b29356eb9a9f8df72f03cdb6a/src/ggml.c#L2597-L2601.
func (GGUFTensorInfo) Get ¶
func (ti GGUFTensorInfo) Get(name string) (info GGUFTensorInfo, found bool)
Get returns the GGUFTensorInfo with the given name, and true if found, and false otherwise.
func (GGUFTensorInfo) GetFileType ¶ added in v0.13.0
func (ti GGUFTensorInfo) GetFileType() GGUFFileType
GetFileType returns the GGUFFileType.
func (GGUFTensorInfo) Index ¶
func (ti GGUFTensorInfo) Index(names []string) (infos map[string]GGUFTensorInfo, found int)
Index returns a map containing the GGUFTensorInfo keyed by its name if it is one of the given names, and the number of names found.
func (GGUFTensorInfo) Match ¶ added in v0.13.0
func (ti GGUFTensorInfo) Match(nameRegex *regexp.Regexp) bool
Match returns true if the name of the GGUFTensorInfo matches the given regex.
func (GGUFTensorInfo) Search ¶
func (ti GGUFTensorInfo) Search(nameRegex *regexp.Regexp) (infos []GGUFTensorInfo)
Search returns a list of GGUFTensorInfo with the names that match the given regex.
type GGUFTensorInfos ¶
type GGUFTensorInfos []GGUFTensorInfo
GGUFTensorInfos is a list of GGUFTensorInfo.
func (GGUFTensorInfos) Bytes ¶
func (tis GGUFTensorInfos) Bytes() uint64
Bytes returns the number of bytes of the GGUFTensorInfos.
func (GGUFTensorInfos) Count ¶
func (tis GGUFTensorInfos) Count() uint64
Count returns the number of GGUF tensors of the GGUFTensorInfos.
func (GGUFTensorInfos) Elements ¶
func (tis GGUFTensorInfos) Elements() uint64
Elements returns the number of elements of the GGUFTensorInfos.
func (GGUFTensorInfos) Get ¶
func (tis GGUFTensorInfos) Get(name string) (info GGUFTensorInfo, found bool)
Get returns the GGUFTensorInfo with the given name, and true if found, and false otherwise.
func (GGUFTensorInfos) GetFileType ¶ added in v0.13.0
func (tis GGUFTensorInfos) GetFileType() GGUFFileType
GetFileType returns the GGUFFileType that represents the most common GGMLType among the GGUFTensorInfos.
func (GGUFTensorInfos) Index ¶
func (tis GGUFTensorInfos) Index(names []string) (infos map[string]GGUFTensorInfo, found int)
Index returns a map from each given name to its GGUFTensorInfo within the GGUFTensorInfos, and the number of names found.
func (GGUFTensorInfos) Layers ¶ added in v0.13.0
func (tis GGUFTensorInfos) Layers(ignores ...string) GGUFLayerTensorInfos
Layers converts the GGUFTensorInfos to GGUFLayerTensorInfos.
func (GGUFTensorInfos) Match ¶ added in v0.13.0
func (tis GGUFTensorInfos) Match(nameRegex *regexp.Regexp) bool
Match returns true if the name of a tensor in the GGUFTensorInfos matches the given regex.
func (GGUFTensorInfos) Search ¶
func (tis GGUFTensorInfos) Search(nameRegex *regexp.Regexp) (infos []GGUFTensorInfo)
Search returns a list of GGUFTensorInfo with the names that match the given regex.
type GGUFTokenizer ¶ added in v0.8.0
type GGUFTokenizer struct { // Model is the model of the tokenizer. Model string `json:"model"` // TokensLength is the size of tokens. TokensLength uint64 `json:"tokensLength"` // MergeLength is the size of merges. MergesLength uint64 `json:"mergesLength"` // AddedTokensLength is the size of added tokens after training. AddedTokensLength uint64 `json:"addedTokenLength"` // BOSTokenID is the ID of the beginning of sentence token. // // Use -1 if the token is not found. BOSTokenID int64 `json:"bosTokenID"` // EOSTokenID is the ID of the end of sentence token. // // Use -1 if the token is not found. EOSTokenID int64 `json:"eosTokenID"` // EOTTokenID is the ID of the end of text token. // // Use -1 if the token is not found. EOTTokenID int64 `json:"eotTokenID"` // EOMTokenID is the ID of the end of message token. // // Use -1 if the token is not found. EOMTokenID int64 `json:"eomTokenID"` // UnknownTokenID is the ID of the unknown token. // // Use -1 if the token is not found. UnknownTokenID int64 `json:"unknownTokenID"` // SeparatorTokenID is the ID of the separator token. // // Use -1 if the token is not found. SeparatorTokenID int64 `json:"separatorTokenID"` // PaddingTokenID is the ID of the padding token. // // Use -1 if the token is not found. PaddingTokenID int64 `json:"paddingTokenID"` // TokenSize is the size of tokens in bytes. TokensSize int64 `json:"tokensSize"` // MergesSize is the size of merges in bytes. MergesSize int64 `json:"mergesSize"` }
GGUFTokenizer represents the tokenizer metadata of a GGUF file.
type GGUFTokensPerSecondScalar ¶ added in v0.10.0
type GGUFTokensPerSecondScalar float64
GGUFTokensPerSecondScalar is the scalar for tokens per second.
func (GGUFTokensPerSecondScalar) String ¶ added in v0.10.0
func (s GGUFTokensPerSecondScalar) String() string
type GGUFVersion ¶
type GGUFVersion uint32
GGUFVersion is a version of GGUF file format, see https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#version-history.
const ( GGUFVersionV1 GGUFVersion = iota + 1 GGUFVersionV2 GGUFVersionV3 )
GGUFVersion constants.
func (GGUFVersion) String ¶
func (i GGUFVersion) String() string
type IGGUFTensorInfos ¶
type IGGUFTensorInfos interface { // Get returns the GGUFTensorInfo with the given name, // and true if found, and false otherwise. Get(name string) (info GGUFTensorInfo, found bool) // GetFileType returns the GGUFFileType. GetFileType() GGUFFileType // Match returns true if the name matches the given regex, and false otherwise. Match(nameRegex *regexp.Regexp) bool // Search returns a list of GGUFTensorInfo with the names that match the given regex. Search(nameRegex *regexp.Regexp) (infos []GGUFTensorInfo) // Index returns a map value to the GGUFTensorInfo with the given names, // and the number of names found. Index(names []string) (infos map[string]GGUFTensorInfo, found int) // Elements returns the number of elements(parameters). Elements() uint64 // Bytes returns the number of bytes. Bytes() uint64 // Count returns the number of tensors. Count() uint64 }
IGGUFTensorInfos is an interface for GGUF tensor infos, which includes basic operations.
type LLaMACppComputationMemoryUsage ¶ added in v0.9.0
type LLaMACppComputationMemoryUsage struct { // Footprint is the memory footprint for computation. Footprint GGUFBytesScalar `json:"footprint"` // Input is the memory usage for input. Input GGUFBytesScalar `json:"input"` // Compute is the memory usage for computation. Compute GGUFBytesScalar `json:"graph"` // Output is the memory usage for output. Output GGUFBytesScalar `json:"output"` }
LLaMACppComputationMemoryUsage represents the memory usage of computation in llama.cpp.
func (LLaMACppComputationMemoryUsage) Sum ¶ added in v0.9.0
func (u LLaMACppComputationMemoryUsage) Sum() GGUFBytesScalar
type LLaMACppKVCacheMemoryUsage ¶ added in v0.9.0
type LLaMACppKVCacheMemoryUsage struct { // Key is the memory usage for caching previous keys. Key GGUFBytesScalar `json:"key"` // Value is the memory usage for caching previous values. Value GGUFBytesScalar `json:"value"` }
LLaMACppKVCacheMemoryUsage represents the memory usage of caching previous KV in llama.cpp.
func (LLaMACppKVCacheMemoryUsage) Sum ¶ added in v0.9.0
func (u LLaMACppKVCacheMemoryUsage) Sum() GGUFBytesScalar
type LLaMACppParameterUsage ¶ added in v0.10.0
type LLaMACppParameterUsage struct { // KVCache is the parameter usage for caching previous KV. KVCache GGUFParametersScalar `json:"kvCache"` // Input is the parameter usage for input tensors. Input GGUFParametersScalar `json:"input"` // Compute is the parameter usage for compute tensors. Compute GGUFParametersScalar `json:"compute"` // Output is the parameter usage for output tensors. Output GGUFParametersScalar `json:"output"` }
LLaMACppParameterUsage represents the parameter usage for running the GGUF file in llama.cpp.
type LLaMACppRunDeviceUsage ¶ added in v0.10.0
type LLaMACppRunDeviceUsage struct { // HandleLayers is the number of layers that the device can handle. HandleLayers uint64 `json:"handleLayers"` // HandleLastLayer is the index of the last layer the device can handle. HandleLastLayer int `json:"handleLastLayer"` // HandleOutputLayer is the flag to indicate whether the device can handle the output layer, // true for handle. HandleOutputLayer bool `json:"handleOutputLayer"` // Remote is the flag to indicate whether the device is remote, // true for remote. Remote bool `json:"remote"` // Position is the relative position of the device, // starts from 0. // // If Remote is true, Position is the position of the remote devices, // Otherwise, Position is the position of the device in the local devices. Position int `json:"position"` // Footprint is the memory footprint for bootstrapping. Footprint GGUFBytesScalar `json:"footprint"` // Parameter is the running parameters that the device processes. Parameter LLaMACppParameterUsage `json:"parameter"` // Weight is the memory usage of weights that the device loads. Weight LLaMACppWeightMemoryUsage `json:"weight"` // KVCache is the memory usage of kv that the device caches. KVCache LLaMACppKVCacheMemoryUsage `json:"kvCache"` // Computation is the memory usage of computation that the device processes. Computation LLaMACppComputationMemoryUsage `json:"computation"` }
LLaMACppRunDeviceUsage represents the usage for running the GGUF file in llama.cpp.
type LLaMACppRunEstimate ¶ added in v0.9.0
type LLaMACppRunEstimate struct { // Type describes what type this GGUF file is. Type string `json:"type"` // Architecture describes what architecture this GGUF file implements. // // All lowercase ASCII. Architecture string `json:"architecture"` // ClipProjectorType is the type of the projector used in the clip model. // // Only used when Architecture is "clip". ClipProjectorType string `json:"clipProjectorType,omitempty"` // AdapterType is the type of the adapter. // // Only used when Architecture is "adapter". AdapterType string `json:"adapterType,omitempty"` // FlashAttention is the flag to indicate whether enable the flash attention, // true for enable. FlashAttention bool `json:"flashAttention"` // ContextSize is the size of the context. ContextSize uint64 `json:"contextSize"` // OffloadLayers is the number of offloaded layers. OffloadLayers uint64 `json:"offloadLayers"` // FullOffloaded is the flag to indicate whether the layers are fully offloaded, // false for partial offloaded or zero offloaded. FullOffloaded bool `json:"fullOffloaded"` // NoMMap is the flag to indicate whether support the mmap, // true for support. NoMMap bool `json:"noMMap"` // EmbeddingOnly is the flag to indicate whether the model is used for embedding only, // true for embedding only. EmbeddingOnly bool `json:"embeddingOnly"` // Reranking is the flag to indicate whether the model is used for reranking, // true for reranking. // // Only available when EmbeddingOnly is true. Reranking bool `json:"reranking"` // Distributable is the flag to indicate whether the model is distributable, // true for distributable. Distributable bool `json:"distributable"` // LogicalBatchSize is the logical batch size. LogicalBatchSize int32 `json:"logicalBatchSize"` // PhysicalBatchSize is the physical batch size. PhysicalBatchSize int32 `json:"physicalBatchSize"` // Devices represents the usage for running the GGUF file, // the first device is the CPU, and the rest are GPUs. 
Devices []LLaMACppRunDeviceUsage `json:"devices"` // Drafter is the estimated result of drafter. Drafter *LLaMACppRunEstimate `json:"drafter,omitempty"` // Projector is the estimated result of multimodal projector. Projector *LLaMACppRunEstimate `json:"projector,omitempty"` // Adapters is the estimated result of adapters. Adapters []LLaMACppRunEstimate `json:"adapters,omitempty"` // MaximumTokensPerSecond represents the maximum tokens per second for running the GGUF file. MaximumTokensPerSecond *GGUFTokensPerSecondScalar `json:"maximumTokensPerSecond,omitempty"` }
LLaMACppRunEstimate represents the estimated result of loading the GGUF file in llama.cpp.
func (LLaMACppRunEstimate) Summarize ¶ added in v0.9.0
func (e LLaMACppRunEstimate) Summarize(mmap bool, nonUMARamFootprint, nonUMAVramFootprint uint64) (es LLaMACppRunEstimateSummary)
Summarize returns the corresponding LLaMACppRunEstimateSummary with the given options.
func (LLaMACppRunEstimate) SummarizeItem ¶ added in v0.10.0
func (e LLaMACppRunEstimate) SummarizeItem(mmap bool, nonUMARamFootprint, nonUMAVramFootprint uint64) (emi LLaMACppRunEstimateSummaryItem)
SummarizeItem returns the corresponding LLaMACppRunEstimateSummaryItem with the given options.
type LLaMACppRunEstimateMemory ¶ added in v0.10.0
type LLaMACppRunEstimateMemory struct { // HandleLayers is the number of layers that the device can handle. HandleLayers uint64 `json:"handleLayers"` // HandleLastLayer is the index of the last layer the device can handle. HandleLastLayer int `json:"handleLastLayer"` // HandleOutputLayer is the flag to indicate whether the device can handle the output layer, // true for handle. HandleOutputLayer bool `json:"handleOutputLayer"` // Remote is the flag to indicate whether the device is remote, // true for remote. Remote bool `json:"remote"` // Position is the relative position of the device, // starts from 0. // // If Remote is true, Position is the position of the remote devices, // Otherwise, Position is the position of the device in the local devices. Position int `json:"position"` // UMA represents the usage of Unified Memory Architecture. UMA GGUFBytesScalar `json:"uma"` // NonUMA represents the usage of Non-Unified Memory Architecture. NonUMA GGUFBytesScalar `json:"nonuma"` }
LLaMACppRunEstimateMemory represents the memory usage for loading the GGUF file in llama.cpp.
type LLaMACppRunEstimateSummary ¶ added in v0.10.0
type LLaMACppRunEstimateSummary struct { // Items Items []LLaMACppRunEstimateSummaryItem `json:"items"` // Type describes what type this GGUF file is. Type string `json:"type"` // Architecture describes what architecture this GGUF file implements. // // All lowercase ASCII. Architecture string `json:"architecture"` // ClipProjectorType is the type of the projector used in the clip model. // // Only used when Architecture is "clip". ClipProjectorType string `json:"clipProjectorType,omitempty"` // AdapterType is the type of the adapter. // // Only used when Architecture is "adapter". AdapterType string `json:"adapterType,omitempty"` // ContextSize is the size of the context. ContextSize uint64 `json:"contextSize"` // FlashAttention is the flag to indicate whether enable the flash attention, // true for enable. FlashAttention bool `json:"flashAttention"` // NoMMap is the flag to indicate whether the file must be loaded without mmap, // true for total loaded. NoMMap bool `json:"noMMap"` // EmbeddingOnly is the flag to indicate whether the model is used for embedding only, // true for embedding only. EmbeddingOnly bool `json:"embeddingOnly"` // Reranking is the flag to indicate whether the model is used for reranking, // true for reranking. // // Only available when EmbeddingOnly is true. Reranking bool `json:"reranking"` // Distributable is the flag to indicate whether the model is distributable, // true for distributable. Distributable bool `json:"distributable"` // LogicalBatchSize is the logical batch size. LogicalBatchSize int32 `json:"logicalBatchSize"` // PhysicalBatchSize is the physical batch size. PhysicalBatchSize int32 `json:"physicalBatchSize"` }
LLaMACppRunEstimateSummary represents the summary of the usage for loading the GGUF file in llama.cpp.
type LLaMACppRunEstimateSummaryItem ¶ added in v0.10.0
type LLaMACppRunEstimateSummaryItem struct { // OffloadLayers is the number of offloaded layers. OffloadLayers uint64 `json:"offloadLayers"` // FullOffloaded is the flag to indicate whether the layers are fully offloaded, // false for partial offloaded or zero offloaded. FullOffloaded bool `json:"fullOffloaded"` // MaximumTokensPerSecond is the maximum tokens per second for running the GGUF file. MaximumTokensPerSecond *GGUFTokensPerSecondScalar `json:"maximumTokensPerSecond,omitempty"` // RAM is the memory usage for loading the GGUF file in RAM. RAM LLaMACppRunEstimateMemory `json:"ram"` // VRAMs is the memory usage for loading the GGUF file in VRAM per device. VRAMs []LLaMACppRunEstimateMemory `json:"vrams"` }
LLaMACppRunEstimateSummaryItem represents one summary item for loading the GGUF file in llama.cpp.
type LLaMACppSplitMode ¶ added in v0.7.0
type LLaMACppSplitMode uint
LLaMACppSplitMode is the split mode for LLaMACpp.
const ( LLaMACppSplitModeLayer LLaMACppSplitMode = iota LLaMACppSplitModeRow LLaMACppSplitModeNone )
type LLaMACppWeightMemoryUsage ¶ added in v0.9.0
type LLaMACppWeightMemoryUsage struct { // Input is the memory usage for loading input tensors. Input GGUFBytesScalar `json:"input"` // Compute is the memory usage for loading compute tensors. Compute GGUFBytesScalar `json:"compute"` // Output is the memory usage for loading output tensors. Output GGUFBytesScalar `json:"output"` }
LLaMACppWeightMemoryUsage represents the memory usage of loading weights in llama.cpp.
func (LLaMACppWeightMemoryUsage) Sum ¶ added in v0.9.0
func (u LLaMACppWeightMemoryUsage) Sum() GGUFBytesScalar
type OllamaModel ¶
type OllamaModel struct { Schema string `json:"schema"` Registry string `json:"registry"` Namespace string `json:"namespace"` Repository string `json:"repository"` Tag string `json:"tag"` SchemaVersion uint32 `json:"schemaVersion"` MediaType string `json:"mediaType"` Config OllamaModelLayer `json:"config"` Layers []OllamaModelLayer `json:"layers"` // Client is the http client used to complete the OllamaModel's network operations. // // When this field is nil, // it will be set to the client used by OllamaModel.Complete. // // When this field is offered, // the network operations will be done with this client. Client *http.Client `json:"-"` }
OllamaModel represents an Ollama model; its manifest (including MediaType, Config and Layers) can be completed further by calling the Complete method.
func ParseOllamaModel ¶
func ParseOllamaModel(model string, opts ...OllamaModelOption) *OllamaModel
ParseOllamaModel parses the given Ollama model string, and returns the OllamaModel, or nil if the model is invalid.
func (*OllamaModel) Complete ¶
Complete completes the OllamaModel with the given context and http client.
func (*OllamaModel) GetLayer ¶
func (om *OllamaModel) GetLayer(mediaType string) (OllamaModelLayer, bool)
GetLayer returns the OllamaModelLayer with the given media type, and true if found, and false otherwise.
func (*OllamaModel) Messages ¶
func (om *OllamaModel) Messages(ctx context.Context, cli *http.Client) ([]json.RawMessage, error)
Messages returns the messages of the OllamaModel.
func (*OllamaModel) SearchLayers ¶
func (om *OllamaModel) SearchLayers(mediaTypeRegex *regexp.Regexp) []OllamaModelLayer
SearchLayers returns a list of OllamaModelLayer with the media type that matches the given regex.
func (*OllamaModel) String ¶
func (om *OllamaModel) String() string
func (*OllamaModel) WebPageURL ¶
func (om *OllamaModel) WebPageURL() *url.URL
WebPageURL returns the Ollama web page URL of the OllamaModel.
type OllamaModelLayer ¶
type OllamaModelLayer struct { MediaType string `json:"mediaType"` Size uint64 `json:"size"` Digest string `json:"digest"` // Root points to the root OllamaModel, // which is never serialized or deserialized. // // When called OllamaModel.Complete, // this field will be set to the OllamaModel itself. // If not, this field will be nil, // and must be set manually to the root OllamaModel before calling the method of OllamaModelLayer. Root *OllamaModel `json:"-"` }
OllamaModelLayer represents an Ollama model layer; its digest can be used to download the artifact.
func (*OllamaModelLayer) BlobURL ¶
func (ol *OllamaModelLayer) BlobURL() *url.URL
BlobURL returns the blob URL of the OllamaModelLayer.
func (*OllamaModelLayer) FetchBlob ¶
FetchBlob fetches the blob of the OllamaModelLayer with the given context and http client, and returns the response body as bytes.
func (*OllamaModelLayer) FetchBlobFunc ¶
func (ol *OllamaModelLayer) FetchBlobFunc(ctx context.Context, cli *http.Client, process func(*http.Response) error) error
FetchBlobFunc fetches the blob of the OllamaModelLayer with the given context and http client, and processes the response with the given function.
type OllamaModelOption ¶ added in v0.6.4
type OllamaModelOption func(*_OllamaModelOptions)
func SetOllamaModelBaseURL ¶ added in v0.6.4
func SetOllamaModelBaseURL(baseURL string) OllamaModelOption
SetOllamaModelBaseURL parses the given base URL, and sets default schema/registry for OllamaModel.
func SetOllamaModelDefaultNamespace ¶ added in v0.6.4
func SetOllamaModelDefaultNamespace(namespace string) OllamaModelOption
SetOllamaModelDefaultNamespace sets the default namespace for OllamaModel.
func SetOllamaModelDefaultRegistry ¶ added in v0.6.4
func SetOllamaModelDefaultRegistry(registry string) OllamaModelOption
SetOllamaModelDefaultRegistry sets the default registry for OllamaModel.
func SetOllamaModelDefaultScheme ¶ added in v0.6.4
func SetOllamaModelDefaultScheme(scheme string) OllamaModelOption
SetOllamaModelDefaultScheme sets the default scheme for OllamaModel.
func SetOllamaModelDefaultTag ¶ added in v0.6.4
func SetOllamaModelDefaultTag(tag string) OllamaModelOption
SetOllamaModelDefaultTag sets the default tag for OllamaModel.
type SizeScalar ¶ added in v0.10.0
type SizeScalar uint64
SizeScalar is the scalar for size.
func ParseSizeScalar ¶ added in v0.10.0
func ParseSizeScalar(s string) (_ SizeScalar, err error)
ParseSizeScalar parses the SizeScalar from the string.
func (SizeScalar) String ¶ added in v0.10.0
func (s SizeScalar) String() string
type StableDiffusionCppRunDeviceUsage ¶ added in v0.13.0
type StableDiffusionCppRunDeviceUsage struct { // Remote is the flag to indicate whether the device is remote, // true for remote. Remote bool `json:"remote"` // Position is the relative position of the device, // starts from 0. // // If Remote is true, Position is the position of the remote devices, // Otherwise, Position is the position of the device in the local devices. Position int `json:"position"` // Footprint is the memory footprint for bootstrapping. Footprint GGUFBytesScalar `json:"footprint"` // Parameter is the running parameters that the device processes. Parameter GGUFParametersScalar `json:"parameter"` // Weight is the memory usage of weights that the device loads. Weight GGUFBytesScalar `json:"weight"` // Computation is the memory usage of computation that the device processes. Computation GGUFBytesScalar `json:"computation"` }
StableDiffusionCppRunDeviceUsage represents the usage for running the GGUF file in stable-diffusion.cpp.
type StableDiffusionCppRunEstimate ¶ added in v0.13.0
type StableDiffusionCppRunEstimate struct { // Type describes what type this GGUF file is. Type string `json:"type"` // Architecture describes what architecture this GGUF file implements. // // All lowercase ASCII. Architecture string `json:"architecture"` // FlashAttention is the flag to indicate whether enable the flash attention, // true for enable. FlashAttention bool `json:"flashAttention"` // FullOffloaded is the flag to indicate whether the layers are fully offloaded, // false for partial offloaded or zero offloaded. FullOffloaded bool `json:"fullOffloaded"` // NoMMap is the flag to indicate whether support the mmap, // true for support. NoMMap bool `json:"noMMap"` // ImageOnly is the flag to indicate whether the model is used for generating image, // true for generating image only. ImageOnly bool `json:"imageOnly"` // Distributable is the flag to indicate whether the model is distributable, // true for distributable. Distributable bool `json:"distributable"` // Devices represents the usage for running the GGUF file, // the first device is the CPU, and the rest are GPUs. Devices []StableDiffusionCppRunDeviceUsage `json:"devices"` // Autoencoder is the estimated result of the autoencoder. Autoencoder *StableDiffusionCppRunEstimate `json:"autoencoder,omitempty"` // Conditioners is the estimated result of the conditioners. Conditioners []StableDiffusionCppRunEstimate `json:"conditioners,omitempty"` // Upscaler is the estimated result of the upscaler. Upscaler *StableDiffusionCppRunEstimate `json:"upscaler,omitempty"` // ControlNet is the estimated result of the control net. ControlNet *StableDiffusionCppRunEstimate `json:"controlNet,omitempty"` }
StableDiffusionCppRunEstimate represents the estimated result of loading the GGUF file in stable-diffusion.cpp.
func (StableDiffusionCppRunEstimate) Summarize ¶ added in v0.13.0
func (e StableDiffusionCppRunEstimate) Summarize( mmap bool, nonUMARamFootprint, nonUMAVramFootprint uint64, ) (es StableDiffusionCppRunEstimateSummary)
Summarize returns the corresponding StableDiffusionCppRunEstimateSummary with the given options.
func (StableDiffusionCppRunEstimate) SummarizeItem ¶ added in v0.13.0
func (e StableDiffusionCppRunEstimate) SummarizeItem( mmap bool, nonUMARamFootprint, nonUMAVramFootprint uint64, ) (emi StableDiffusionCppRunEstimateSummaryItem)
SummarizeItem returns the corresponding StableDiffusionCppRunEstimateSummaryItem with the given options.
type StableDiffusionCppRunEstimateMemory ¶ added in v0.13.0
type StableDiffusionCppRunEstimateMemory struct { // Remote is the flag to indicate whether the device is remote, // true for remote. Remote bool `json:"remote"` // Position is the relative position of the device, // starts from 0. // // If Remote is true, Position is the position of the remote devices, // Otherwise, Position is the position of the device in the local devices. Position int `json:"position"` // UMA represents the usage of Unified Memory Architecture. UMA GGUFBytesScalar `json:"uma"` // NonUMA represents the usage of Non-Unified Memory Architecture. NonUMA GGUFBytesScalar `json:"nonuma"` }
StableDiffusionCppRunEstimateMemory represents the memory usage for loading the GGUF file in stable-diffusion.cpp.
type StableDiffusionCppRunEstimateSummary ¶ added in v0.13.0
type StableDiffusionCppRunEstimateSummary struct { // Items Items []StableDiffusionCppRunEstimateSummaryItem `json:"items"` // Type describes what type this GGUF file is. Type string `json:"type"` // Architecture describes what architecture this GGUF file implements. // // All lowercase ASCII. Architecture string `json:"architecture"` // FlashAttention is the flag to indicate whether enable the flash attention, // true for enable. FlashAttention bool `json:"flashAttention"` // NoMMap is the flag to indicate whether the file must be loaded without mmap, // true for total loaded. NoMMap bool `json:"noMMap"` // ImageOnly is the flag to indicate whether the model is used for generating image, // true for embedding only. ImageOnly bool `json:"imageOnly"` // Distributable is the flag to indicate whether the model is distributable, // true for distributable. Distributable bool `json:"distributable"` }
StableDiffusionCppRunEstimateSummary represents the estimated summary of loading the GGUF file in stable-diffusion.cpp.
type StableDiffusionCppRunEstimateSummaryItem ¶ added in v0.13.0
type StableDiffusionCppRunEstimateSummaryItem struct { // FullOffloaded is the flag to indicate whether the layers are fully offloaded, // false for partial offloaded or zero offloaded. FullOffloaded bool `json:"fullOffloaded"` // RAM is the memory usage for loading the GGUF file in RAM. RAM StableDiffusionCppRunEstimateMemory `json:"ram"` // VRAMs is the memory usage for loading the GGUF file in VRAM per device. VRAMs []StableDiffusionCppRunEstimateMemory `json:"vrams"` }
StableDiffusionCppRunEstimateSummaryItem represents the estimated summary item of loading the GGUF file in stable-diffusion.cpp.
Source Files ¶
- cache.go
- file.go
- file_architecture.go
- file_estimate__llamacpp.go
- file_estimate__stablediffusioncpp.go
- file_estimate_option.go
- file_from_distro.go
- file_from_remote.go
- file_metadata.go
- file_option.go
- file_tokenizer.go
- filename.go
- gen.go
- ggml.go
- ollama_model.go
- ollama_model_option.go
- ollama_registry_authenticate.go
- scalar.go
- zz_generated.diffusion_model_memory_usage.regression.go
- zz_generated.ggmltype.stringer.go
- zz_generated.gguffiletype.stringer.go
- zz_generated.ggufmagic.stringer.go
- zz_generated.ggufmetadatavaluetype.stringer.go
- zz_generated.ggufversion.stringer.go