Documentation ¶
Index ¶
- Constants
- type AssumeRoleConfig
- type Config
- type DebugConfig
- type LeaderElectionConfig
- type ModelConfig
- type ModelConfigItem
- type ObjectStoreConfig
- type OllamaConfig
- type PersistentVolume
- type ProcessedModelConfig
- type Resources
- type RuntimeConfig
- type S3Config
- type TolerationConfig
- type WorkerConfig
- type WorkerTLSConfig
Constants ¶
const ( // RuntimeNameOllama is the Ollama runtime name. RuntimeNameOllama string = "ollama" // RuntimeNameVLLM is the VLLM runtime name. RuntimeNameVLLM string = "vllm" // RuntimeNameTriton is the runtime name for Nvidia Triton Inference Server. RuntimeNameTriton string = "triton" )
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type AssumeRoleConfig ¶
type AssumeRoleConfig struct { RoleARN string `yaml:"roleArn"` ExternalID string `yaml:"externalId"` }
AssumeRoleConfig is the assume role configuration.
type Config ¶
type Config struct { Runtime RuntimeConfig `yaml:"runtime"` Ollama OllamaConfig `yaml:"ollama"` Model ModelConfig `yaml:"model"` HealthPort int `yaml:"healthPort"` MetricsPort int `yaml:"metricsPort"` // GracefulShutdownTimeout is the duration given to runnable to stop // before the manager actually returns on stop. Default is 30 seconds. GracefulShutdownTimeout time.Duration `yaml:"gracefulShutdownTimeout"` LeaderElection LeaderElectionConfig `yaml:"leaderElection"` Autoscaler autoscaler.Config `yaml:"autoscaler"` ObjectStore ObjectStoreConfig `yaml:"objectStore"` // PreloadedModelIDs is a list of model IDs to preload. These models are downloaded locally // at the startup time. // TODO(kenji): Remove once every env uses ModelConfig. PreloadedModelIDs []string `yaml:"preloadedModelIds"` // ModelContextLengths is a map of model ID to context length. If not specified, the default // context length is used. // TODO(kenji): Remove once every env uses ModelConfig. ModelContextLengths map[string]int `yaml:"modelContextLengths"` Debug DebugConfig `yaml:"debug"` InferenceManagerServerWorkerServiceAddr string `yaml:"inferenceManagerServerWorkerServiceAddr"` ModelManagerServerWorkerServiceAddr string `yaml:"modelManagerServerWorkerServiceAddr"` Worker WorkerConfig `yaml:"worker"` // ComponentStatusSender is the configuration for the component status sender. ComponentStatusSender status.Config `yaml:"componentStatusSender"` }
Config is the configuration.
type DebugConfig ¶
type DebugConfig struct { // Standalone is true if the service is running in standalone mode (except the // dependency to inference-manager-server). Standalone bool `yaml:"standalone"` }
DebugConfig is the debug configuration.
type LeaderElectionConfig ¶
type LeaderElectionConfig struct { ID string `yaml:"id"` // LeaseDuration is the duration that non-leader candidates will // wait to force acquire leadership. This is measured against time of // last observed ack. Default is 15 seconds. LeaseDuration *time.Duration `yaml:"leaseDuration"` // RenewDeadline is the duration that the acting controlplane will retry // refreshing leadership before giving up. Default is 10 seconds. RenewDeadline *time.Duration `yaml:"renewDeadline"` // RetryPeriod is the duration the LeaderElector clients should wait // between tries of actions. Default is 2 seconds. RetryPeriod *time.Duration `yaml:"retryPeriod"` }
LeaderElectionConfig is the leader election configuration.
type ModelConfig ¶
type ModelConfig struct { Default ModelConfigItem `yaml:"default"` // Overrides is a map of model ID to the model configuration item to be overridden. Only // fields that are set in the overrides are applied. Overrides map[string]ModelConfigItem `yaml:"overrides"` }
ModelConfig is the model configuration.
type ModelConfigItem ¶
type ModelConfigItem struct { RuntimeName string `yaml:"runtimeName"` Resources Resources `yaml:"resources"` Replicas int `yaml:"replicas"` // Preloaded is true if the model is preloaded. // If this is set to true in the default model item, all models that are specified in override items // are preloaded. Preloaded bool `yaml:"preloaded"` // ContextLength is the context length for the model. If the value is 0, // the default context length is used. ContextLength int `yaml:"contextLength"` // VLLMExtraFlags is the extra flags for VLLM. VLLMExtraFlags []string `yaml:"vllmExtraFlags"` // SchedulerName is the name of the scheduler to use. // This is set when a vLLM runs on Inferentia instances and // requires Neuron scheduling extension. // See https://awsdocs-neuron.readthedocs-hosted.com/en/latest/containers/tutorials/k8s-setup.html. SchedulerName string `yaml:"schedulerName"` // ContainerRuntimeClassName is the name of a K8s Runtime Class // (https://kubernetes.io/docs/concepts/containers/runtime-class/) used by model runtime. // This is set to the Runtime Class of the Nvidia container runtime if it is not a cluster default. ContainerRuntimeClassName string `yaml:"containerRuntimeClassName"` }
ModelConfigItem is the model configuration item.
type ObjectStoreConfig ¶
type ObjectStoreConfig struct {
S3 S3Config `yaml:"s3"`
}
ObjectStoreConfig is the object store configuration.
func (*ObjectStoreConfig) Validate ¶
func (c *ObjectStoreConfig) Validate() error
Validate validates the object store configuration.
type OllamaConfig ¶
type OllamaConfig struct { // KeepAlive is the keep-alive duration for Ollama. // This controls how long Ollama keeps models in GPU memory. KeepAlive time.Duration `yaml:"keepAlive"` // NumParallel is the maximum number of requests processed in parallel. NumParallel int `yaml:"numParallel"` // ForceSpreading is true if the models should be spread across all GPUs. ForceSpreading bool `yaml:"forceSpreading"` Debug bool `yaml:"debug"` RunnersDir string `yaml:"runnersDir"` }
OllamaConfig is the Ollama configuration.
type PersistentVolume ¶
type PersistentVolume struct { // StorageClassName is the name of the storage class used for the persistent volume. StorageClassName string `yaml:"storageClassName"` Size string `yaml:"size"` AccessMode string `yaml:"accessMode"` }
PersistentVolume is the persistent volume configuration.
type ProcessedModelConfig ¶
type ProcessedModelConfig struct {
// contains filtered or unexported fields
}
ProcessedModelConfig is the processed model configuration.
func NewProcessedModelConfig ¶
func NewProcessedModelConfig(c *Config) *ProcessedModelConfig
NewProcessedModelConfig returns a new ProcessedModelConfig.
func (*ProcessedModelConfig) ModelConfigItem ¶
func (c *ProcessedModelConfig) ModelConfigItem(modelID string) ModelConfigItem
ModelConfigItem returns the model configuration item for the given model ID.
func (*ProcessedModelConfig) PreloadedModelIDs ¶
func (c *ProcessedModelConfig) PreloadedModelIDs() []string
PreloadedModelIDs returns the IDs of the models to be preloaded.
type Resources ¶
type Resources struct { Requests map[string]string `yaml:"requests"` Limits map[string]string `yaml:"limits"` Volume *PersistentVolume `yaml:"volume"` }
Resources is the resources configuration.
type RuntimeConfig ¶
type RuntimeConfig struct { PullerImage string `yaml:"pullerImage"` TritonProxyImage string `yaml:"tritonProxyImage"` RuntimeImages map[string]string `yaml:"runtimeImages"` PullerImagePullPolicy string `yaml:"pullerImagePullPolicy"` TritonProxyImagePullPolicy string `yaml:"tritonProxyImagePullPolicy"` RuntimeImagePullPolicy string `yaml:"runtimeImagePullPolicy"` ConfigMapName string `yaml:"configMapName"` AWSSecretName string `yaml:"awsSecretName"` AWSKeyIDEnvKey string `yaml:"awsKeyIdEnvKey"` AWSAccessKeyEnvKey string `yaml:"awsAccessKeyEnvKey"` LLMOWorkerSecretName string `yaml:"llmoWorkerSecretName"` LLMOKeyEnvKey string `yaml:"llmoKeyEnvKey"` ServiceAccountName string `yaml:"serviceAccountName"` PodAnnotations map[string]string `yaml:"podAnnotations"` NodeSelector map[string]string `yaml:"nodeSelector"` Tolerations []TolerationConfig `yaml:"tolerations"` UnstructuredAffinity any `yaml:"affinity"` Affinity *corev1.Affinity `yaml:"-"` }
RuntimeConfig is the runtime configuration.
type S3Config ¶
type S3Config struct { EndpointURL string `yaml:"endpointUrl"` Region string `yaml:"region"` Bucket string `yaml:"bucket"` AssumeRole *AssumeRoleConfig `yaml:"assumeRole"` }
S3Config is the S3 configuration.
type TolerationConfig ¶
type TolerationConfig struct { Key string `yaml:"key"` Operator string `yaml:"operator"` Value string `yaml:"value"` Effect string `yaml:"effect"` TolerationSeconds int64 `yaml:"tolerationSeconds"` }
TolerationConfig is the toleration configuration.
type WorkerConfig ¶
type WorkerConfig struct {
TLS WorkerTLSConfig `yaml:"tls"`
}
WorkerConfig is the worker configuration.
type WorkerTLSConfig ¶
type WorkerTLSConfig struct {
Enable bool `yaml:"enable"`
}
WorkerTLSConfig is the worker TLS configuration.