Documentation ¶
Overview ¶
Copyright (c) Microsoft Corporation. Licensed under the MIT license.
Index ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type HuggingfaceTransformersParam ¶
type HuggingfaceTransformersParam struct {
	BaseCommand        string            // The initial command (e.g., 'torchrun', 'accelerate launch') used in the command line.
	TorchRunParams     map[string]string // Parameters for configuring the torchrun command.
	TorchRunRdzvParams map[string]string // Optional rendezvous parameters for distributed training/inference using torchrun (elastic).
	InferenceMainFile  string            // The main file for inference.
	ModelRunParams     map[string]string // Parameters for running the model training/inference.
}
func (*HuggingfaceTransformersParam) DeepCopy ¶
func (h *HuggingfaceTransformersParam) DeepCopy() HuggingfaceTransformersParam
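As a hedged illustration (the field values below are invented, not shipped preset values), DeepCopy returns an independent copy whose maps can be mutated without affecting the original:

hf := HuggingfaceTransformersParam{
	BaseCommand:    "torchrun", // illustrative values throughout
	TorchRunParams: map[string]string{"nnodes": "1"},
	ModelRunParams: map[string]string{"torch_dtype": "bfloat16"},
}
cp := hf.DeepCopy()
cp.ModelRunParams["torch_dtype"] = "float16"
// hf.ModelRunParams is unchanged: DeepCopy copies the maps,
// so the copy can be mutated independently.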
type Model ¶
type Model interface {
	GetInferenceParameters() *PresetParam
	GetTuningParameters() *PresetParam
	SupportDistributedInference() bool // If true, the model workload will be a StatefulSet, using the torch elastic runtime framework.
	SupportTuning() bool
}
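A minimal sketch of a preset implementing this interface; the type name and all parameter values are hypothetical, and the snippet assumes the standard library "time" package is imported:

type exampleModel struct{}

var _ Model = (*exampleModel)(nil) // compile-time interface check

func (*exampleModel) GetInferenceParameters() *PresetParam {
	return &PresetParam{
		ModelFamilyName:           "example-family", // hypothetical values throughout
		GPUCountRequirement:       "1",
		TotalGPUMemoryRequirement: "16Gi",
		ReadinessTimeout:          30 * time.Minute,
	}
}

func (*exampleModel) GetTuningParameters() *PresetParam { return nil } // tuning not supported
func (*exampleModel) SupportDistributedInference() bool { return false }
func (*exampleModel) SupportTuning() bool               { return false }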
type PresetParam ¶
type PresetParam struct {
	Tag                           string         // The model image tag.
	ModelFamilyName               string         // The name of the model family.
	ImageAccessMode               string         // Defines whether the image is public or private.
	DiskStorageRequirement        string         // Disk storage requirements for the model.
	GPUCountRequirement           string         // Number of GPUs required for the preset. Used for inference.
	TotalGPUMemoryRequirement     string         // Total GPU memory required for the preset. Used for inference.
	PerGPUMemoryRequirement       string         // GPU memory required per GPU. Used for inference.
	TuningPerGPUMemoryRequirement map[string]int // Minimum GPU memory per tuning method (batch size 1). Used for tuning.
	WorldSize                     int            // Defines the number of processes required for distributed inference.
	RuntimeParam

	// ReadinessTimeout defines the maximum duration for creating the workload.
	// This timeout accommodates the size of the image, ensuring pull completion
	// even under slower network conditions or unforeseen delays.
	ReadinessTimeout time.Duration
}
PresetParam defines the preset inference parameters for a model.
func (*PresetParam) DeepCopy ¶
func (p *PresetParam) DeepCopy() *PresetParam
func (*PresetParam) GetInferenceCommand ¶
func (p *PresetParam) GetInferenceCommand(runtime RuntimeName, skuNumGPUs string) []string
GetInferenceCommand builds the container command, e.g.: torchrun <TORCH_PARAMS> <OPTIONAL_RDZV_PARAMS> baseCommand <MODEL_PARAMS>
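A hedged usage sketch; the preset values below are invented, and the exact flag rendering is left to the package:

p := &PresetParam{
	RuntimeParam: RuntimeParam{
		Transformers: HuggingfaceTransformersParam{
			BaseCommand:       "torchrun",
			TorchRunParams:    map[string]string{"nproc_per_node": "2"}, // assumption: illustrative torchrun flag
			InferenceMainFile: "inference_api.py",                       // assumption: illustrative entrypoint
			ModelRunParams:    map[string]string{"torch_dtype": "bfloat16"},
		},
	},
}
cmd := p.GetInferenceCommand(RuntimeNameHuggingfaceTransformers, "2")
// cmd is the full argv, following the documented shape:
//   torchrun <TORCH_PARAMS> <OPTIONAL_RDZV_PARAMS> baseCommand <MODEL_PARAMS>
_ = cmd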
type RuntimeName ¶
type RuntimeName string
RuntimeName is the LLM runtime name.
const (
	RuntimeNameHuggingfaceTransformers RuntimeName = "transformers"
	RuntimeNameVLLM                    RuntimeName = "vllm"
)
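For example, a caller might dispatch on the runtime name like this (launcherFor is an illustrative helper, not part of the package):

func launcherFor(rt RuntimeName, p *PresetParam) string {
	switch rt {
	case RuntimeNameHuggingfaceTransformers:
		return p.Transformers.BaseCommand // promoted from the embedded RuntimeParam
	case RuntimeNameVLLM:
		return p.VLLM.BaseCommand
	default:
		return ""
	}
}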
type RuntimeParam ¶
type RuntimeParam struct {
	Transformers HuggingfaceTransformersParam
	VLLM         VLLMParam

	// Disable the tensor parallelism
	DisableTensorParallelism bool
}
RuntimeParam defines the LLM runtime parameters.
func (*RuntimeParam) DeepCopy ¶
func (rp *RuntimeParam) DeepCopy() RuntimeParam
type VLLMParam ¶
type VLLMParam struct {
	BaseCommand string

	// The model name used in the OpenAI serving API.
	// See https://platform.openai.com/docs/api-reference/chat/create#chat-create-model.
	ModelName string

	// Parameters for distributed inference.
	DistributionParams map[string]string

	// Parameters for running the model training/inference.
	ModelRunParams map[string]string
}
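As an illustration (the entrypoint and flags below are assumptions, not values shipped with any preset):

vllm := VLLMParam{
	BaseCommand: "python3 inference_api.py", // assumption: the real entrypoint varies by preset image
	ModelName:   "example-model",            // surfaced as the `model` field in OpenAI-style requests
	ModelRunParams: map[string]string{
		"dtype": "float16", // assumption: run parameters are passed through as CLI flags
	},
}
_ = vllm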