Documentation ¶
Overview ¶
Copyright 2018 The Kubeflow Authors
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License
Copyright 2018 The Kubeflow Authors ¶
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License
Index ¶
- Constants
- Variables
- type AdvancedGpuMetric
- type AllNodeInfo
- type ArenaClientArgs
- type AutoscalerOptions
- type CommonCronArgs
- type CommonGPUNodeInfo
- type CommonModelArgs
- type CommonNodeInfo
- type CommonServingArgs
- type CommonSubmitArgs
- type CommonUpdateServingArgs
- type ConcurrencyPolicy
- type ConfigFileInfo
- type CronHistoryInfo
- type CronInfo
- type CronTFJobArgs
- type CronType
- type CustomServingArgs
- type DataDirVolume
- type Destination
- type DestinationRuleCRD
- type DestinationWeight
- type DistributedServingArgs
- type Driver
- type Endpoint
- type EvaluateJobArgs
- type EvaluateJobInfo
- type EvaluateJobType
- type Executor
- type FormatStyle
- type GPUDeviceInfo
- type GPUExclusiveNodeInfo
- type GPUExclusivePodInfo
- type GPUShareNodeDevice
- type GPUShareNodeInfo
- type GPUSharePodInfo
- type GPUTopology
- type GPUTopologyNodeDevice
- type GPUTopologyNodeInfo
- type GPUTopologyPodInfo
- type GpuMetric
- type GpuMetricInfo
- type HTTPMatchRequest
- type HTTPRoute
- type HeadGroupSpec
- type JobConditionType
- type JobGpuMetric
- type K8sObject
- type KFServingArgs
- type KServeArgs
- type LimitedPodSecurityContext
- type LogArgs
- type LogLevel
- type ModelBenchmarkArgs
- type ModelEvaluateArgs
- type ModelFormat
- type ModelJobInfo
- type ModelJobInstance
- type ModelJobStatus
- type ModelJobType
- type ModelOptimizeArgs
- type ModelProfileArgs
- type ModelTypeInfo
- type ModelVersion
- type ModelVersionStatus
- type ModelVersionTag
- type NodeGpuMetric
- type NodeType
- type NodeTypeInfo
- type NormalNodeInfo
- type PodGpuMetric
- type PortSelector
- type PreprocesObject
- type PrometheusMetric
- type PrometheusMetricData
- type PrometheusMetricResult
- type PrometheusMetricValue
- type PrometheusServer
- type RayClusterSpec
- type RegisteredModel
- type RegisteredModelAlias
- type RegisteredModelTag
- type Runtime
- type ScaleETJobArgs
- type ScaleInETJobArgs
- type ScaleOutETJobArgs
- type SeldonServingArgs
- type ServingInstance
- type ServingJobInfo
- type ServingJobType
- type ServingTypeInfo
- type ServingVersionWeight
- type StringMatchPrefix
- type SubmitDeepSpeedJobArgs
- type SubmitETJobArgs
- type SubmitHorovodJobArgs
- type SubmitMPIJobArgs
- type SubmitPyTorchJobArgs
- type SubmitRayJobArgs
- type SubmitSparkJobArgs
- type SubmitSyncCodeArgs
- type SubmitTFJobArgs
- type SubmitTensorboardArgs
- type SubmitVolcanoJobArgs
- type TFRuntime
- type TensorFlowServingArgs
- type TensorRTServingArgs
- type TolerationArgs
- type TrafficRouterSplitArgs
- type TrainingJobInfo
- type TrainingJobInstance
- type TrainingJobStatus
- type TrainingJobType
- type TrainingJobTypeInfo
- type TritonServingArgs
- type UpdateCustomServingArgs
- type UpdateDistributedServingArgs
- type UpdateKServeArgs
- type UpdateTensorFlowServingArgs
- type UpdateTritonServingArgs
- type VirtualService
- type VirtualServiceCRD
- type WorkerGroupSpec
Constants ¶
const ()
// Resource names and label strings related to Aliyun GPUs and GPU topology.
const (
	// AliyunGPUResourceName is the Aliyun GPU resource name.
	AliyunGPUResourceName = "aliyun.com/gpu"
	// GPUTopologyAllocationLabel is the "gpu-group" topology key.
	GPUTopologyAllocationLabel = "topology.kubernetes.io/gpu-group"
	// GPUTopologyVisibleGPULabel is the "gpu-visible" topology key.
	GPUTopologyVisibleGPULabel = "topology.kubernetes.io/gpu-visible"
	// GPUTopologyNodeLabels is a single key=value pair.
	// NOTE(review): presumably used to select GPU-topology nodes — confirm against callers.
	GPUTopologyNodeLabels = "ack.node.gpu.schedule=topology"
)
// Keys in the arena.kubeflow.org domain used for multi-tenant user isolation.
const (
	// MultiTenantIsolationLabel is the "isolate-user" key.
	MultiTenantIsolationLabel = "arena.kubeflow.org/isolate-user"
	// UserNameIdLabel is the key that carries the user's uid.
	UserNameIdLabel = "arena.kubeflow.org/uid"
	// UserNameNameLabel is the key that carries the user's name.
	UserNameNameLabel = "arena.kubeflow.org/username"
	// SSHSecretName is the "ssh-secret" key.
	SSHSecretName = "arena.kubeflow.org/ssh-secret"
)
// CPUResourceName is the resource name for CPU.
const CPUResourceName = "cpu"
const (
	// KUBEFLOW_NAMESPACE is the namespace name "kubeflow".
	KUBEFLOW_NAMESPACE = "kubeflow"
	// KUBE_SYSTEM_NAMESPACE is the namespace name "kube-system".
	KUBE_SYSTEM_NAMESPACE = "kube-system"
	// NODE_METRIC_TMP is a format template with two %s verbs: the first fills
	// the __name__ matcher and the second fills the node_name matcher.
	NODE_METRIC_TMP = `{__name__=~"%s", node_name=~"%s"}`
)
// NvidiaGPUResourceName defines the nvidia resource name.
const NvidiaGPUResourceName = "nvidia.com/gpu"
const (
	// POD_METRIC_TMP is a format template with two %s verbs: the first fills
	// the __name__ matcher and the second fills the pod_name matcher.
	POD_METRIC_TMP = `{__name__=~"%s", pod_name=~"%s"}`
	// PROMETHEUS_INSTALL_DOC_URL is the URL of the GPU-metric user guide.
	PROMETHEUS_INSTALL_DOC_URL = "https://github.com/kubeflow/arena/blob/master/docs/userguide/9-top-job-gpu-metric.md"
	// PROMETHEUS_SCHEME is the scheme string "http".
	PROMETHEUS_SCHEME = "http"
	// PROMETHEUS_SVC_LABEL is a key=value label string naming Prometheus.
	PROMETHEUS_SVC_LABEL = "kubernetes.io/name=Prometheus"
)
// RequestGPUsOfJobAnnoKey is the key string "requestGPUsOfJobOwner".
const RequestGPUsOfJobAnnoKey = "requestGPUsOfJobOwner"
Variables ¶
// Sentinel errors for training-job operations. The message texts are kept
// exactly as they were so that any output shown to users is unchanged.
var (
	// ErrTrainingJobNotFound reports that the training job was not found.
	ErrTrainingJobNotFound = errors.New("training job not found,please use 'arena list' to make sure job is existed.")
	// ErrNoPrivilegesToOperateJob reports that the caller is not the owner of the job.
	ErrNoPrivilegesToOperateJob = errors.New("you have no privileges to operate the job,because the owner of job is not you")
)
// GPU_METRIC_LIST holds three nvidia GPU metric names: duty cycle, used
// memory bytes and total memory bytes.
var GPU_METRIC_LIST = []string{
	"nvidia_gpu_duty_cycle",
	"nvidia_gpu_memory_used_bytes",
	"nvidia_gpu_memory_total_bytes",
}
// ModelTypeMap maps each model job type to its descriptor (name, alias and
// shorthand).
var ModelTypeMap = map[ModelJobType]ModelTypeInfo{
	ModelProfileJob: {
		Name:      ModelProfileJob,
		Alias:     "Profile",
		Shorthand: "profile",
	},
	ModelOptimizeJob: {
		Name:      ModelOptimizeJob,
		Alias:     "Optimize",
		Shorthand: "optimize",
	},
	ModelBenchmarkJob: {
		Name:      ModelBenchmarkJob,
		Alias:     "Benchmark",
		Shorthand: "benchmark",
	},
	ModelEvaluateJob: {
		Name:      ModelEvaluateJob,
		Alias:     "Evaluate",
		Shorthand: "evaluate",
	},
}
ModelTypeMap collects the model job types and their aliases.
// NodeTypeSlice lists the node types with their alias and one-letter
// shorthand, in the order: normal, GPU exclusive, GPU topology, GPU share.
var NodeTypeSlice = []NodeTypeInfo{
	{
		Name:      NormalNode,
		Alias:     "none",
		Shorthand: "n",
	},
	{
		Name:      GPUExclusiveNode,
		Alias:     "exclusive",
		Shorthand: "e",
	},
	{
		Name:      GPUTopologyNode,
		Alias:     "topology",
		Shorthand: "t",
	},
	{
		Name:      GPUShareNode,
		Alias:     "share",
		Shorthand: "s",
	},
}
// SUPPORT_PROMETHEUS_SERVERS lists the known Prometheus server descriptors.
// Each entry carries the service label string, protocol, port, query path and
// the GPU metric names for that server.
var SUPPORT_PROMETHEUS_SERVERS = []*PrometheusServer{
	{
		Name:          "arms-prometheus-admin",
		ServiceLabels: "kubernetes.io/service-name=prometheus-admin",
		Protocol:      "http",
		Port:          "9335",
		Path:          "api/v1/query",
		MetricList: []string{
			"nvidia_gpu_duty_cycle",
			"nvidia_gpu_memory_used_bytes",
			"nvidia_gpu_memory_total_bytes",
		},
	},
	{
		Name:          "default",
		ServiceLabels: "kubernetes.io/service-name=prometheus-server",
		Protocol:      "http",
		Port:          "9090",
		Path:          "api/v1/query",
		MetricList: []string{
			"nvidia_gpu_duty_cycle",
			"nvidia_gpu_memory_used_bytes",
			"nvidia_gpu_memory_total_bytes",
		},
	},
	{
		Name:          "default-old",
		ServiceLabels: "kubernetes.io/name=Prometheus",
		Protocol:      "http",
		Port:          "9090",
		Path:          "api/v1/query",
		MetricList: []string{
			"nvidia_gpu_duty_cycle",
			"nvidia_gpu_memory_used_bytes",
			"nvidia_gpu_memory_total_bytes",
		},
	},
}
// ServingTypeMap maps each serving job type to its descriptor (name, alias
// and shorthand).
var ServingTypeMap = map[ServingJobType]ServingTypeInfo{
	CustomServingJob: {
		Name:      CustomServingJob,
		Alias:     "Custom",
		Shorthand: "custom",
	},
	KFServingJob: {
		Name:      KFServingJob,
		Alias:     "KFServing",
		Shorthand: "kf",
	},
	KServeJob: {
		Name:      KServeJob,
		Alias:     "KServe",
		Shorthand: "kserve",
	},
	TFServingJob: {
		Name:      TFServingJob,
		Alias:     "Tensorflow",
		Shorthand: "tf",
	},
	TRTServingJob: {
		Name:      TRTServingJob,
		Alias:     "Tensorrt",
		Shorthand: "trt",
	},
	// NOTE(review): this is the only shorthand that is capitalised; confirm
	// whether "Triton" (rather than "triton") is intentional.
	TritonServingJob: {
		Name:      TritonServingJob,
		Alias:     "Triton",
		Shorthand: "Triton",
	},
	SeldonServingJob: {
		Name:      SeldonServingJob,
		Alias:     "Seldon",
		Shorthand: "seldon",
	},
	DistributedServingJob: {
		Name:      DistributedServingJob,
		Alias:     "Distributed",
		Shorthand: "distributed",
	},
}
ServingTypeMap collects the serving job types and their aliases.
// TrainingTypeMap maps each training job type to its descriptor (name, alias
// and shorthand).
var TrainingTypeMap = map[TrainingJobType]TrainingJobTypeInfo{
	TFTrainingJob: {
		Name:      TFTrainingJob,
		Alias:     "Tensorflow",
		Shorthand: "tf",
	},
	MPITrainingJob: {
		Name:      MPITrainingJob,
		Alias:     "MPI",
		Shorthand: "mpi",
	},
	PytorchTrainingJob: {
		Name:      PytorchTrainingJob,
		Alias:     "Pytorch",
		Shorthand: "py",
	},
	HorovodTrainingJob: {
		Name:      HorovodTrainingJob,
		Alias:     "Horovod",
		Shorthand: "horovod",
	},
	VolcanoTrainingJob: {
		Name:      VolcanoTrainingJob,
		Alias:     "Volcano",
		Shorthand: "volcano",
	},
	ETTrainingJob: {
		Name:      ETTrainingJob,
		Alias:     "ElasticTraining",
		Shorthand: "et",
	},
	SparkTrainingJob: {
		Name:      SparkTrainingJob,
		Alias:     "Spark",
		Shorthand: "spark",
	},
	DeepSpeedTrainingJob: {
		Name:      DeepSpeedTrainingJob,
		Alias:     "DeepSpeed",
		Shorthand: "dp",
	},
	RayJob: {
		Name:      RayJob,
		Alias:     "RayJob",
		Shorthand: "rj",
	},
}
TrainingTypeMap collects the training job types and their aliases.
Functions ¶
This section is empty.
Types ¶
type AdvancedGpuMetric ¶
// AdvancedGpuMetric carries the metrics of one GPU (identified by Id and
// UUID): duty cycle, used memory, total memory and the pods associated with it.
type AdvancedGpuMetric struct {
	Id             string  `json:"id" yaml:"id"`
	UUID           string  `json:"uuid" yaml:"uuid"`
	GpuDutyCycle   float64 `json:"gpuDutyCycle" yaml:"gpuDutyCycle"`
	GpuMemoryUsed  float64 `json:"usedGPUMemory" yaml:"usedGPUMemory"`
	GpuMemoryTotal float64 `json:"totalGPUMemory" yaml:"totalGPUMemory"`

	// PodNames holds pod identifiers; each entry combines namespace and pod
	// name, like 'namespace/pod_name'.
	PodNames []string `json:"podNames" yaml:"podNames"`
}
type AllNodeInfo ¶
// AllNodeInfo maps a string key to a list of values of arbitrary type.
// NOTE(review): the meaning of the key is not visible here — presumably a node
// category; confirm against the code that fills this map.
type AllNodeInfo map[string][]interface{}
type ArenaClientArgs ¶
type AutoscalerOptions ¶ added in v0.11.0
// AutoscalerOptions specifies optional configuration for the Ray autoscaler.
// Every field is omitted from YAML output when it holds its zero value.
type AutoscalerOptions struct {
	// Cpu specifies optional resource request and limit overrides for the autoscaler container.
	// Default values: 500m CPU request and limit.
	Cpu string `yaml:"cpu,omitempty"`

	// Memory specifies optional resource request and limit overrides for the autoscaler container.
	// Default values: 512Mi memory request and limit.
	Memory string `yaml:"memory,omitempty"`

	// Image optionally overrides the autoscaler's container image.
	// This override is provided for autoscaler testing and development.
	Image string `yaml:"image,omitempty"`

	// ImagePullPolicy optionally overrides the autoscaler container's image pull policy.
	// This override is provided for autoscaler testing and development.
	ImagePullPolicy string `yaml:"imagePullPolicy,omitempty"`

	// IdleTimeoutSeconds is the number of seconds to wait before scaling down a worker pod
	// which is not using Ray resources. Defaults to 60 (one minute).
	// It is not read by the KubeRay operator but by the Ray autoscaler.
	IdleTimeoutSeconds int32 `yaml:"idleTimeoutSeconds,omitempty"`

	// UpscalingMode is "Conservative", "Default", or "Aggressive."
	// Conservative: Upscaling is rate-limited; the number of pending worker pods is at most the size of the Ray cluster.
	// Default: Upscaling is not rate-limited.
	// Aggressive: An alias for Default; upscaling is not rate-limited.
	// It is not read by the KubeRay operator but by the Ray autoscaler.
	// +kubebuilder:validation:Enum=Default;Aggressive;Conservative
	UpscalingMode string `yaml:"upscalingMode,omitempty"`
}
AutoscalerOptions specifies optional configuration for the Ray autoscaler.
type CommonCronArgs ¶ added in v0.8.2
// CommonCronArgs holds the cron-related arguments: schedule, concurrency
// policy, suspension, deadline and history limit.
type CommonCronArgs struct {
	// Schedule is the schedule in Cron format, see https://en.wikipedia.org/wiki/Cron.
	Schedule string `yaml:"schedule"` // --schedule

	// ConcurrencyPolicy specifies how to treat concurrent executions of a Job.
	// Valid values are:
	// - "Allow" (default): allows CronJobs to run concurrently;
	// - "Forbid": forbids concurrent runs, skipping next run if previous run hasn't finished yet;
	// - "Replace": cancels currently running job and replaces it with a new one
	// +optional
	ConcurrencyPolicy string `yaml:"concurrencyPolicy"` // --concurrency-policy

	// Suspend tells the controller to suspend subsequent executions, it does
	// not apply to already started executions. Defaults to false.
	// +optional
	Suspend bool `yaml:"suspend"` // --suspend

	// Deadline is the timestamp until which a cron job can keep scheduling.
	Deadline string `yaml:"deadline"` // --deadline

	// HistoryLimit is the number of finished job history to retain.
	// The field is a plain int, so an explicit zero cannot be distinguished
	// from an unset value.
	// +optional
	HistoryLimit int `yaml:"historyLimit"` // --history-limit
}
type CommonGPUNodeInfo ¶
type CommonGPUNodeInfo struct { TotalGPUs float64 `json:"totalGPUs" yaml:"totalGPUs"` AllocatedGPUs float64 `json:"allocatedGPUs" yaml:"allocatedGPUs"` UnhealthyGPUs float64 `json:"unhealthyGPUs" yaml:"unhealthyGPUs"` GPUMetrics []*AdvancedGpuMetric `json:"gpuMetrics" yaml:"gpuMetrics"` }
type CommonModelArgs ¶ added in v0.9.0
type CommonModelArgs struct { Name string `yaml:"name"` // --name Namespace string `yaml:"namespace"` // --namespace ModelConfigFile string `yaml:"modelConfigFile"` // --model-config-file ModelName string `yaml:"modelName"` // --model-name ModelPath string `yaml:"modelPath"` // --model-path Inputs string `yaml:"inputs"` // --inputs Outputs string `yaml:"outputs"` // --outputs Image string `yaml:"image"` // --image ImagePullPolicy string `yaml:"imagePullPolicy"` // --image-pull-policy // ImagePullSecrets stores image pull secrets,match option --image-pull-secrets ImagePullSecrets []string `yaml:"imagePullSecrets"` GPUCount int `yaml:"gpuCount"` // --gpus GPUMemory int `yaml:"gpuMemory"` // --gpumemory GPUCore int `yaml:"gpuCore"` // --gpucore Cpu string `yaml:"cpu"` // --cpu Memory string `yaml:"memory"` // --memory // DataSet stores the kubernetes pvc names DataSet map[string]string `yaml:"dataset"` // --data // DataDirs stores the files(or directories) in k8s node which will map to containers DataDirs []DataDirVolume `yaml:"dataDirs"` // --data-dir Envs map[string]string `yaml:"envs"` // --env NodeSelectors map[string]string `yaml:"nodeSelectors"` // --selector Tolerations []TolerationArgs `yaml:"tolerations"` // --toleration Annotations map[string]string `yaml:"annotations"` // --annotation Labels map[string]string `yaml:"labels"` // --label Shell string `yaml:"shell"` // --shell Command string `yaml:"command"` Type ModelJobType `yaml:"type"` // HelmOptions stores the helm options HelmOptions []string `yaml:"-"` }
type CommonNodeInfo ¶
type CommonServingArgs ¶
type CommonServingArgs struct { Name string `yaml:"servingName"` Version string `yaml:"servingVersion"` Namespace string `yaml:"-"` Type ServingJobType `yaml:"-"` Image string `yaml:"image"` ImagePullPolicy string `yaml:"imagePullPolicy"` // --imagePullPolicy GPUCount int `yaml:"gpuCount"` // --gpus GPUMemory int `yaml:"gpuMemory"` // --gpumemory GPUCore int `yaml:"gpuCore"` // --gpucore Devices map[string]string `yaml:"devices"` // --device Cpu string `yaml:"cpu"` // --cpu Memory string `yaml:"memory"` // --memory Envs map[string]string `yaml:"envs"` // --envs EnvsFromSecret map[string]string `yaml:"envsFromSecret"` // --env-from-secret Shell string `yaml:"shell"` // --shell Command string `yaml:"command"` // --command Replicas int `yaml:"replicas"` // --replicas EnableIstio bool `yaml:"enableIstio"` // --enableIstio ExposeService bool `yaml:"exposeService"` // --exposeService ModelDirs map[string]string `yaml:"modelDirs"` // --data DataSubpathExprs map[string]string `yaml:"dataSubPathExprs"` // --data-subpath-expr TempDirSubpathExpr map[string]string `yaml:"tempDirSubPathExprs"` // --temp-dir-subpath-expr TempDirs map[string]string `yaml:"tempDirs"` // --temp-dir ImagePullSecrets []string `yaml:"imagePullSecrets"` //--image-pull-secrets HostVolumes []DataDirVolume `yaml:"dataDirs"` // --data-dir NodeSelectors map[string]string `yaml:"nodeSelectors"` // --selector Tolerations []TolerationArgs `yaml:"tolerations"` // --toleration Annotations map[string]string `yaml:"annotations"` Labels map[string]string `yaml:"labels"` // --label // ConfigFiles stores the config file which is existed in client host node // and map it to container,match option --config-file ConfigFiles map[string]map[string]ConfigFileInfo `yaml:"configFiles"` // HelmOptions stores the helm options HelmOptions []string `yaml:"-"` ModelServiceExists bool `yaml:"modelServiceExists"` // --modelServiceExists ModelName string `yaml:"modelName"` // --model-name ModelVersion string `yaml:"modelVersion"` // 
--model-version }
type CommonSubmitArgs ¶
type CommonSubmitArgs struct { // Name stores the job name,match option --name Name string `yaml:"-"` // Namespace stores the namespace of job,match option --namespace Namespace string `yaml:"-"` // TrainingType stores the trainingType TrainingType TrainingJobType `yaml:"trainingType"` // NodeSelectors defines the node selectors,match option --selector NodeSelectors map[string]string `yaml:"nodeSelectors"` // ConfigFiles stores the config file which is existed in client host node // and map it to container,match option --config-file ConfigFiles map[string]map[string]ConfigFileInfo `yaml:"configFiles"` // Tolerations defines the tolerations which tolerates node taints // match option --toleration Tolerations []TolerationArgs `yaml:"tolerations"` // Image stores the docker image of job,match option --image Image string `yaml:"image"` // ImagePullPolicy stores the docker image pull policy of job,match option --image-pull-policy ImagePullPolicy string `yaml:"imagePullPolicy"` // GPUCount stores the gpu count of the job needs,match option --gpus GPUCount int `yaml:"gpuCount"` // Devices stores chip vendors and count that used for resources, such as amd.com/gpu=1 gpu.intel.com/i915=1,match option --device Devices map[string]string `yaml:"devices"` // Envs stores the envs of container in job, match option --env Envs map[string]string `yaml:"envs"` // WorkingDir stores the working directory of container in job,match option --working-dir WorkingDir string `yaml:"workingDir"` // Shell specify the linux shell type Shell string `yaml:"shell"` // Command stores the command of job Command string `yaml:"command"` // Mode is used for horovod,match option --sync-mode Mode string `yaml:"mode"` // WorkerCount stores the count of job worker,match option --workers WorkerCount int `yaml:"workers"` // Retry defines the retry times Retry int `yaml:"retry"` // DataSet stores the kubernetes pvc names DataSet map[string]string `yaml:"dataset"` // DataDirs stores the files(or directories) in 
k8s node which will map to containers // match option --data-dir DataDirs []DataDirVolume `yaml:"dataDirs"` // EnableRDMA enable rdma or not,match option --rdma EnableRDMA bool `yaml:"enableRDMA"` // EnableQueue enables the feature to queue jobs after they are scheduled. EnableQueue bool `yaml:"enableQueue"` // UseENI defines using eni or not UseENI bool `yaml:"useENI"` // Annotations defines pod annotations of job,match option --annotation Annotations map[string]string `yaml:"annotations"` // Labels specify the job labels and it is work for pods Labels map[string]string `yaml:"labels"` // IsNonRoot is root user or not IsNonRoot bool `yaml:"isNonRoot"` // PodSecurityContext defines the pod security context PodSecurityContext LimitedPodSecurityContext `yaml:"podSecurityContext"` // PriorityClassName defines the priority class PriorityClassName string `yaml:"priorityClassName"` // Coscheduling defines using Coscheduling Coscheduling bool // PodGroupName stores pod group name PodGroupName string `yaml:"podGroupName"` // PodGroupMinAvailable stores pod group min available PodGroupMinAvailable string `yaml:"podGroupMinAvailable"` // ImagePullSecrets stores image pull secrets,match option --image-pull-secrets ImagePullSecrets []string `yaml:"imagePullSecrets"` // HelmOptions stores the helm options HelmOptions []string `yaml:"-"` // EnableSpotInstance enables the feature of SuperVisor manage spot instance training. 
EnableSpotInstance bool `yaml:"enableSpotInstance"` // MaxWaitTime stores the maximum length of time a job waits for resources MaxWaitTime int `yaml:"maxWaitTime"` // SchedulerName stores the scheduler name,match option --scheduler SchedulerName string `yaml:"schedulerName"` // UseHostNetwork defines using useHostNetwork UseHostNetwork bool `yaml:"useHostNetwork"` // UseHostPID defines using useHostPID UseHostPID bool `yaml:"useHostPID"` // UseHostIPC defines using useHostIPC UseHostIPC bool `yaml:"useHostIPC"` // ModelName defines the model name associates with the job ModelName string `yaml:"modelName"` // ModelSource defines the model source ModelSource string `yaml:"modelSource"` }
CommonSubmitArgs defines the common parts of the submit arguments.
type CommonUpdateServingArgs ¶ added in v0.8.9
type CommonUpdateServingArgs struct { Name string `yaml:"servingName"` Version string `yaml:"servingVersion"` Namespace string `yaml:"-"` Type ServingJobType `yaml:"-"` Image string `yaml:"image"` GPUCount int `yaml:"gpuCount"` // --gpus GPUMemory int `yaml:"gpuMemory"` // --gpumemory GPUCore int `yaml:"gpuCore"` // --gpucore Cpu string `yaml:"cpu"` // --cpu Memory string `yaml:"memory"` // --memory Replicas int `yaml:"replicas"` // --replicas Envs map[string]string `yaml:"envs"` // --envs Annotations map[string]string `yaml:"annotations"` // --annotation Labels map[string]string `yaml:"labels"` // --label NodeSelectors map[string]string `yaml:"nodeSelectors"` // --selector Tolerations []TolerationArgs `yaml:"tolerations"` // --toleration Shell string `yaml:"shell"` // --shell Command string `yaml:"command"` // --command ModelDirs map[string]string `yaml:"modelDirs"` // --data }
type ConcurrencyPolicy ¶ added in v0.8.2
// ConcurrencyPolicy is a string type naming a concurrency policy.
type ConcurrencyPolicy string
ConcurrencyPolicy describes how the job will be handled. Only one of the following concurrency policies may be specified. If none of the following policies is specified, the default one is ConcurrencyAllow.
const ( ConcurrencyAllow ConcurrencyPolicy = "Allow" ConcurrencyForbid ConcurrencyPolicy = "Forbid" ConcurrencyReplace ConcurrencyPolicy = "Replace" )
type ConfigFileInfo ¶
// ConfigFileInfo defines a config file which will be mounted to containers:
// its file name and path inside the container, the file on the host, and a key.
type ConfigFileInfo struct {
	ContainerFileName string `yaml:"containerFileName"`
	HostFile          string `yaml:"hostFile"`
	Key               string `yaml:"key"`
	ContainerFilePath string `yaml:"containerFilePath"`
}
ConfigFileInfo defines the config files which will be mounted to containers
type CronHistoryInfo ¶ added in v0.8.2
// CronHistoryInfo describes one history entry of a cron: the identity of the
// object (name, namespace, group, kind), its status and its create/finish times.
type CronHistoryInfo struct {
	Name       string `json:"name" yaml:"name"`
	Namespace  string `json:"namespace" yaml:"namespace"`
	Group      string `json:"group" yaml:"group"`
	Kind       string `json:"kind" yaml:"kind"`
	Status     string `json:"status" yaml:"status"`
	CreateTime string `json:"createTime" yaml:"createTime"`
	FinishTime string `json:"finishTime" yaml:"finishTime"`
}
type CronInfo ¶ added in v0.8.2
type CronInfo struct { UUID string `json:"uuid" yaml:"uuid"` Name string `json:"name" yaml:"name"` Namespace string `json:"namespace" yaml:"namespace"` // Type is the job type, like TFjob、PyTorchJob Type string `json:"type" yaml:"type"` // The schedule in Cron format, see https://en.wikipedia.org/wiki/Cron. Schedule string `json:"schedule" yaml:"schedule"` // Specifies how to treat concurrent executions of a Job. // Valid values are: // - "Allow" (default): allows CronJobs to run concurrently; // - "Forbid": forbids concurrent runs, skipping next run if previous run hasn't finished yet; // - "Replace": cancels currently running job and replaces it with a new one // +optional ConcurrencyPolicy string `json:"concurrencyPolicy" yaml:"concurrencyPolicy"` // --concurrency-policy // This flag tells the controller to suspend subsequent executions, it does // not apply to already started executions. Defaults to false. // +optional Suspend bool `json:"suspend" yaml:"suspend"` // --suspend // Deadline is the timestamp that a cron job can keep scheduling util then. Deadline string `json:"deadline" yaml:"deadline"` // --deadline // The number of finished job history to retain. // This is a pointer to distinguish between explicit zero and not specified. // +optional HistoryLimit int64 `json:"historyLimit" yaml:"historyLimit"` // --history-limit // Information when was the last time the job was successfully scheduled. // +optional LastScheduleTime string `json:"lastScheduleTime" yaml:"lastScheduleTime"` // CreationTimestamp stores the creation timestamp of job CreationTimestamp string `json:"creationTimestamp" yaml:"creationTimestamp"` History []CronHistoryInfo `json:"cronHistory" yaml:"cronHistory"` }
type CronTFJobArgs ¶ added in v0.8.2
type CronTFJobArgs struct { CommonCronArgs `yaml:"cron"` SubmitTFJobArgs `yaml:"tfjob"` }
type CronType ¶ added in v0.8.2
// CronType is a string type naming a kind of cron job.
type CronType string
CronType defines the supporting job type
const ( // CronTFTrainingJob defines the cron tfjob CronTFTrainingJob CronType = "tfjob" )
type CustomServingArgs ¶
type CustomServingArgs struct { Port int `yaml:"port"` // --port RestfulPort int `yaml:"restApiPort"` // --restfulPort MetricsPort int `yaml:"metricsPort"` // --metrics-port MaxSurge string `yaml:"maxSurge"` // --maxSurge LivenessProbeAction string `yaml:"livenessProbeAction"` // --liveness-probe-action LivenessProbeActionOption []string `yaml:"livenessProbeActionOption"` // --liveness-probe-action-option LivenessProbeOption []string `yaml:"livenessProbeOption"` // --liveness-probe-option ReadinessProbeAction string `yaml:"readinessProbeAction"` // --readiness-probe-action ReadinessProbeActionOption []string `yaml:"readinessProbeActionOption"` // --readiness-probe-action-option ReadinessProbeOption []string `yaml:"readinessProbeOption"` // --readiness-probe-option StartupProbeAction string `yaml:"startupProbeAction"` // --startup-probe-action StartupProbeActionOption []string `yaml:"startupProbeActionOption"` // --startup-probe-action-option StartupProbeOption []string `yaml:"startupProbeOption"` // --startup-probe-option CommonServingArgs `yaml:",inline"` }
type DataDirVolume ¶
// DataDirVolume defines a volume by its host path, container path and name.
type DataDirVolume struct {
	// HostPath defines the host path
	HostPath string `yaml:"hostPath"`

	// ContainerPath defines container path
	ContainerPath string `yaml:"containerPath"`

	// Name defines the volume name
	Name string `yaml:"name"`
}
DataDirVolume defines the volume of kubernetes
type Destination ¶
type Destination struct { *istiov1alpha3.Destination Port *PortSelector `protobuf:"bytes,3,opt,name=port" json:"port,omitempty"` }
type DestinationRuleCRD ¶
type DestinationRuleCRD struct { // Kind is a string value representing the REST resource this object represents. // Servers may infer this from the endpoint the client submits requests to. // Cannot be updated. // In CamelCase. // More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds // +optional Kind string `json:"kind,omitempty" protobuf:"bytes,1,opt,name=kind"` // APIVersion defines the versioned schema of this representation of an object. // Servers should convert recognized schemas to the latest internal value, and // may reject unrecognized values. // More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources // +optional APIVersion string `json:"apiVersion,omitempty" protobuf:"bytes,2,opt,name=apiVersion"` metav1.ObjectMeta `json:"metadata,omitempty" yaml:"metadata,omitempty" protobuf:"bytes,1,opt,name=metadata"` Spec *istiov1alpha3.DestinationRule `json:"spec,omitempty" yaml:"spec,omitempty" protobuf:"bytes,2,opt,name=spec"` }
type DestinationWeight ¶
type DestinationWeight struct { Destination *Destination `protobuf:"bytes,1,opt,name=destination" json:"destination,omitempty"` Weight int32 `protobuf:"varint,2,opt,name=weight,proto3" json:"weight"` }
type DistributedServingArgs ¶ added in v0.12.0
type DistributedServingArgs struct { Masters int `yaml:"masters"` // --masters Workers int `yaml:"workers"` // --workers MasterCpu string `yaml:"masterCpus"` // --master-cpu WorkerCpu string `yaml:"workerCpus"` // --worker-cpu MasterGPUCount int `yaml:"masterGpus"` // --master-gpus WorkerGPUCount int `yaml:"workerGpus"` // --worker-gpus MasterMemory string `yaml:"masterMemory"` // --master-memory WorkerMemory string `yaml:"workerMemory"` // --worker-memory MasterGPUMemory int `yaml:"masterGPUMemory"` // --master-gpumemory WorkerGPUMemory int `yaml:"workerGPUMemory"` // --worker-gpumemory MasterGPUCore int `yaml:"masterGPUCore"` // --master-gpucore WorkerGPUCore int `yaml:"workerGPUCore"` // --worker-gpucore MasterCommand string `yaml:"masterCommand"` // --master-command WorkerCommand string `yaml:"workerCommand"` // --worker-command InitBackend string `yaml:"initBackend"` // --init-backend CustomServingArgs `yaml:",inline"` }
type EvaluateJobArgs ¶ added in v0.8.8
type EvaluateJobArgs struct { // Name stores the job name,match option --name Name string `yaml:"-"` // Namespace stores the namespace of job,match option --namespace Namespace string `yaml:"-"` // NodeSelectors defines the node selectors,match option --selector NodeSelectors map[string]string `yaml:"nodeSelectors"` // Tolerations defines the tolerations which tolerates node taints // match option --toleration Tolerations []TolerationArgs `yaml:"tolerations"` // Image stores the docker image of job,match option --image Image string `yaml:"image"` // Envs stores the envs of container in job, match option --env Envs map[string]string `yaml:"envs"` WorkingDir string `yaml:"workingDir"` // Command stores the command of job Command string `yaml:"command"` // DataDirs stores the files(or directories) in k8s node which will map to containers // match option --data-dir DataDirs []DataDirVolume `yaml:"dataDirs"` // DataSources stores the kubernetes pvc names DataSources map[string]string `yaml:"dataSources"` // Annotations defines pod annotations of job,match option --annotation Annotations map[string]string `yaml:"annotations"` // Labels specify the job labels and it is work for pods Labels map[string]string `yaml:"labels"` // ImagePullSecrets stores image pull secrets,match option --image-pull-secrets ImagePullSecrets []string `yaml:"imagePullSecrets"` // HelmOptions stores the helm options HelmOptions []string `yaml:"-"` ModelName string `yaml:"modelName"` // --model-name ModelPath string `yaml:"modelPath"` // --model-path ModelVersion string `yaml:"modelVersion"` // --model-version MetricsPath string `yaml:"metricsPath"` // --metrics-path DatasetPath string `yaml:"datasetPath"` // --dataset-path Cpu string `yaml:"cpu"` // --cpu Memory string `yaml:"memory"` // --memory GPUCount int `yaml:"gpuCount"` // --gpus // for sync up source code SubmitSyncCodeArgs `yaml:",inline"` }
type EvaluateJobInfo ¶ added in v0.8.8
// EvaluateJobInfo describes an evaluate job: its identity, the model and
// dataset paths it refers to, its status and its creation timestamp.
type EvaluateJobInfo struct {
	UUID              string `json:"uuid" yaml:"uuid"`
	JobID             string `json:"jobId" yaml:"jobId"`
	Name              string `json:"name" yaml:"name"`
	Namespace         string `json:"namespace" yaml:"namespace"`
	ModelName         string `json:"modelName" yaml:"modelName"`
	ModelPath         string `json:"modelPath" yaml:"modelPath"`
	ModelVersion      string `json:"modelVersion" yaml:"modelVersion"`
	MetricsPath       string `json:"metricsPath" yaml:"metricsPath"`
	DatasetPath       string `json:"datasetPath" yaml:"datasetPath"`
	Status            string `json:"status" yaml:"status"`
	CreationTimestamp string `json:"creationTimestamp" yaml:"creationTimestamp"`
}
type EvaluateJobType ¶ added in v0.8.8
// EvaluateJobType is a string type naming a kind of evaluate job.
type EvaluateJobType string
const ( // EvaluateJob defines the tensorflow serving job EvaluateJob EvaluateJobType = "evaluatejob" )
type FormatStyle ¶
// FormatStyle is a string type naming an output format.
type FormatStyle string
FormatStyle defines the format of the output; it is only used in cmd.
const ( // Wide defines the wide format WideFormat FormatStyle = "wide" // Json defines the json format JsonFormat FormatStyle = "json" // Yaml defines the yaml format YamlFormat FormatStyle = "yaml" // Unknwon defines the unknown format UnknownFormat FormatStyle = "unknown" )
type GPUDeviceInfo ¶
// GPUDeviceInfo describes one GPU device: its ID, its total, allocated and
// used GPU memory, and its duty cycle.
type GPUDeviceInfo struct {
	ID                 string  `json:"id" yaml:"id"`
	TotalGPUMemory     float64 `json:"totalGPUMemory" yaml:"totalGPUMemory"`
	AllocatedGPUMemory float64 `json:"allocatedGPUMemory" yaml:"allocatedGPUMemory"`
	UsedGPUMemory      float64 `json:"usedGPUMemory" yaml:"usedGPUMemory"`
	DutyCycle          float64 `json:"dutyCycle" yaml:"dutyCycle"`
}
type GPUExclusiveNodeInfo ¶
type GPUExclusiveNodeInfo struct { PodInfos []GPUExclusivePodInfo `json:"instances" yaml:"instances"` CommonNodeInfo `yaml:",inline" json:",inline"` CommonGPUNodeInfo `yaml:",inline" json:",inline"` }
type GPUExclusivePodInfo ¶
type GPUShareNodeDevice ¶
// GPUShareNodeDevice is declared here as a struct with no fields.
type GPUShareNodeDevice struct {}
type GPUShareNodeInfo ¶
// GPUShareNodeInfo is declared here as a struct with no fields.
type GPUShareNodeInfo struct {}
type GPUSharePodInfo ¶
// GPUSharePodInfo is shown here without any fields.
// NOTE(review): the field list may have been elided by the listing — confirm.
type GPUSharePodInfo struct {}
type GPUTopology ¶
type GPUTopologyNodeDevice ¶
type GPUTopologyNodeInfo ¶
type GPUTopologyNodeInfo struct { PodInfos []GPUTopologyPodInfo `json:"instances" yaml:"instances"` GPUTopology GPUTopology `json:"gpuTopology" yaml:"gpuTopology"` CommonGPUNodeInfo `yaml:",inline" json:",inline"` CommonNodeInfo `yaml:",inline" json:",inline"` Devices []GPUTopologyNodeDevice `json:"devices" yaml:"devices"` }
type GPUTopologyPodInfo ¶
// GPUTopologyPodInfo is the pod entry listed by GPUTopologyNodeInfo.
// Note that RequestGPU is serialized under the plural key "requestGPUs".
type GPUTopologyPodInfo struct {
	Name        string   `json:"name" yaml:"name"`
	Namespace   string   `json:"namespace" yaml:"namespace"`
	Status      string   `json:"status" yaml:"status"`
	RequestGPU  int      `json:"requestGPUs" yaml:"requestGPUs"`
	Allocation  []string `json:"allocation" yaml:"allocation"`
	VisibleGPUs []string `json:"visibleGPUs" yaml:"visibleGPUs"`
}
type GpuMetricInfo ¶
type HTTPMatchRequest ¶
type HTTPMatchRequest struct { *istiov1alpha3.HTTPMatchRequest Uri *StringMatchPrefix `protobuf:"bytes,1,opt,name=uri" json:"uri,omitempty"` }
type HTTPRoute ¶
type HTTPRoute struct { *istiov1alpha3.HTTPRoute Match []*HTTPMatchRequest `protobuf:"bytes,1,rep,name=match" json:"match,omitempty"` Route []*DestinationWeight `protobuf:"bytes,2,rep,name=route" json:"route,omitempty"` }
type HeadGroupSpec ¶ added in v0.11.0
// HeadGroupSpec is the spec for the head pod.
type HeadGroupSpec struct {
	// ServiceType is the Kubernetes service type of the head service. It is
	// used by the workers to connect to the head pod.
	ServiceType string `yaml:"serviceType,omitempty"`
	// Image is serialized under the "image" key.
	Image string `yaml:"image"`
	// Cpu is serialized under the "cpu" key.
	Cpu string `yaml:"cpu"`
	// Memory is serialized under the "memory" key.
	Memory string `yaml:"memory"`
	// Gpu is serialized under the "gpu" key.
	Gpu int `yaml:"gpu"`
}
HeadGroupSpec is the spec for the head pod.
type JobConditionType ¶ added in v0.8.2
// JobConditionType defines all kinds of types of JobStatus.
type JobConditionType string
JobConditionType defines all kinds of types of JobStatus.
const ( // JobCreated means the job has been accepted by the system, // but one or more of the pods/services has not been started. // This includes time before pods being scheduled and launched. JobCreated JobConditionType = "Created" // JobRunning means all sub-resources (e.g. services/pods) of this job // have been successfully scheduled and launched. // The training is running without error. JobRunning JobConditionType = "Running" // JobRestarting means one or more sub-resources (e.g. services/pods) of this job // reached phase failed but maybe restarted according to it's restart policy // which specified by user in v1.PodTemplateSpec. // The training is freezing/pending. JobRestarting JobConditionType = "Restarting" // JobSucceeded means all sub-resources (e.g. services/pods) of this job // reached phase have terminated in success. // The training is complete without error. JobSucceeded JobConditionType = "Succeeded" // JobFailed means one or more sub-resources (e.g. services/pods) of this job // reached phase failed with no restarting. // The training has failed its execution. JobFailed JobConditionType = "Failed" )
type JobGpuMetric ¶
// JobGpuMetric maps a string key to a PodGpuMetric.
// NOTE(review): the key is presumably the pod name — confirm against callers.
type JobGpuMetric map[string]PodGpuMetric
type K8sObject ¶ added in v0.8.7
type K8sObject struct { metav1.TypeMeta `json:",inline"` metav1.ObjectMeta `json:"metadata,omitempty"` }
type KFServingArgs ¶
type KFServingArgs struct { Port int `yaml:"port"` // --port ModelType string `yaml:"modelType"` // --modelType CanaryPercent int `yaml:"canaryPercent"` // --canaryTrafficPercent StorageUri string `yaml:"storageUri"` // --storageUri CommonServingArgs `yaml:",inline"` }
type KServeArgs ¶ added in v0.9.11
type KServeArgs struct { ModelFormat *ModelFormat `yaml:"modelFormat"` // --model-format Runtime string `yaml:"runtime"` // --runtime StorageUri string `yaml:"storageUri"` // --storageUri RuntimeVersion string `yaml:"runtimeVersion"` // --runtime-version ProtocolVersion string `yaml:"protocolVersion"` // --protocol-version MinReplicas int `yaml:"minReplicas"` // --min-replicas MaxReplicas int `yaml:"maxReplicas"` // --max-replicas ScaleTarget int `yaml:"scaleTarget"` // --scale-target ScaleMetric string `yaml:"scaleMetric"` // --scale-metric ContainerConcurrency int64 `yaml:"containerConcurrency"` // --container-concurrency TimeoutSeconds int64 `yaml:"timeout"` // --timeout CanaryTrafficPercent int64 `yaml:"canaryTrafficPercent,omitempty"` // --canary-traffic-percent Port int `yaml:"port"` // --port EnablePrometheus bool `yaml:"enablePrometheus,omitempty"` // --enable-prometheus MetricsPort int `yaml:"metricsPort,omitempty"` // --metrics-port SecurityContext map[string]string `yaml:"securityContext,omitempty"` // --security-context CommonServingArgs `yaml:",inline"` }
type LimitedPodSecurityContext ¶
// LimitedPodSecurityContext defines the Kubernetes pod security context,
// limited to the four settings below.
type LimitedPodSecurityContext struct {
	RunAsUser          int64   `yaml:"runAsUser"`
	RunAsNonRoot       bool    `yaml:"runAsNonRoot"`
	RunAsGroup         int64   `yaml:"runAsGroup"`
	SupplementalGroups []int64 `yaml:"supplementalGroups"`
}
LimitedPodSecurityContext defines the Kubernetes pod security context
type ModelBenchmarkArgs ¶ added in v0.9.0
type ModelBenchmarkArgs struct { Concurrency int `yaml:"concurrency"` // --concurrency Requests int `yaml:"requests"` // --requests Duration int `yaml:"duration"` // --duration (seconds) ReportPath string `yaml:"reportPath"` // --report-path CommonModelArgs `yaml:",inline"` }
type ModelEvaluateArgs ¶ added in v0.9.0
type ModelEvaluateArgs struct { ModelPlatform string `yaml:"modelPlatform"` // --model-platform DatasetPath string `yaml:"datasetPath"` // --dataset-path ReportPath string `yaml:"reportPath"` // --report-path BatchSize int `yaml:"batchSize"` // --batch-size CommonModelArgs `yaml:",inline"` // for sync up source code SubmitSyncCodeArgs `yaml:",inline"` }
type ModelFormat ¶ added in v0.9.11
// ModelFormat names a model format and, optionally, its version.
type ModelFormat struct {
	// Name is the name of the model format.
	// +required
	Name string `yaml:"name"`

	// Version is the version of the model format. It is used in validating
	// that a predictor is supported by a runtime, and can be "major",
	// "major.minor" or "major.minor.patch".
	// +optional
	Version *string `yaml:"version,omitempty"`
}
type ModelJobInfo ¶ added in v0.9.0
type ModelJobInfo struct { // The unique identity of the model job UUID string `json:"uuid" yaml:"uuid"` // The name of the model job Name string `json:"name" yaml:"name"` // The namespace of the model job Namespace string `json:"namespace" yaml:"namespace"` // The time of the model job Duration string `json:"duration" yaml:"duration"` // Age specifies the model job age Age string `json:"age" yaml:"age"` // The status of the model Job Status string `json:"status" yaml:"status"` // The model type of the model job Type string `json:"type" yaml:"type"` // The instances under the model job Instances []ModelJobInstance `json:"instances" yaml:"instances"` // RequestCPUs GPU count of the Job RequestCPUs int64 `json:"requestCPUs" yaml:"requestCPUs"` // RequestGPUs stores the request gpus RequestGPUs int64 `json:"requestGPUs" yaml:"requestGPUs"` // RequestGPUMemory stores the request gpus RequestGPUMemory int64 `json:"requestGPUMemory" yaml:"requestGPUMemory"` // RequestGPUCore stores the request gpus core RequestGPUCore int64 `json:"requestGPUCore" yaml:"requestGPUCore"` // CreationTimestamp stores the creation timestamp of job CreationTimestamp int64 `json:"creationTimestamp" yaml:"creationTimestamp"` // CreationTimestamp stores the job parameters Params map[string]string `json:"params" yaml:"params"` }
type ModelJobInstance ¶ added in v0.9.0
// ModelJobInstance describes one instance under a model job.
type ModelJobInstance struct {
	// Name is the instance name.
	Name string `json:"name" yaml:"name"`
	// Status is the instance status.
	Status string `json:"status" yaml:"status"`
	// Age is the instance age.
	Age string `json:"age" yaml:"age"`
	// ReadyContainer is the count of ready containers.
	ReadyContainer int `json:"readyContainers" yaml:"readyContainers"`
	// TotalContainer is the count of total containers.
	TotalContainer int `json:"totalContainers" yaml:"totalContainers"`
	// RestartCount is the count of instance restarts.
	RestartCount int `json:"restartCount" yaml:"restartCount"`
	// NodeIP is the host ip of the instance.
	NodeIP string `json:"nodeIP" yaml:"nodeIP"`
	// NodeName is the node name.
	NodeName string `json:"nodeName" yaml:"nodeName"`
	// IP is the instance ip.
	IP string `json:"ip" yaml:"ip"`
	// RequestGPUs is the requested gpus.
	RequestGPUs float64 `json:"requestGPUs" yaml:"requestGPUs"`
	// RequestGPUMemory is the requested gpu memory.
	RequestGPUMemory int `json:"requestGPUMemory" yaml:"requestGPUMemory"`
	// RequestGPUCore is the requested gpu core.
	RequestGPUCore int `json:"requestGPUCore" yaml:"requestGPUCore"`
	// CreationTimestamp is the creation timestamp of the instance.
	CreationTimestamp int64 `json:"creationTimestamp" yaml:"creationTimestamp"`
}
type ModelJobStatus ¶ added in v0.9.0
// ModelJobStatus defines all the kinds of JobStatus.
type ModelJobStatus string
ModelJobStatus defines all the kinds of JobStatus
const ( // ModelJobPending means the job is pending ModelJobPending ModelJobStatus = "PENDING" // ModelJobRunning means the job is running ModelJobRunning ModelJobStatus = "RUNNING" // ModelJobComplete means the job is complete ModelJobComplete ModelJobStatus = "COMPLETE" // ModelJobFailed means the job is failed ModelJobFailed ModelJobStatus = "FAILED" // ModelJobUnknown means the job status is unknown ModelJobUnknown ModelJobStatus = "UNKNOWN" )
type ModelJobType ¶ added in v0.9.0
// ModelJobType defines the supported model job type.
type ModelJobType string
ModelJobType defines the supporting model job type
const ( // ModelProfileJob defines the model profile job ModelProfileJob ModelJobType = "profile" // ModelOptimizeJob defines the model optimize job ModelOptimizeJob ModelJobType = "optimize" // ModelBenchmarkJob defines the model benchmark job ModelBenchmarkJob ModelJobType = "benchmark" // ModelEvaluateJob defines the model evaluate job ModelEvaluateJob ModelJobType = "evaluate" // AllModelJob defines all model job AllModelJob ModelJobType = "" // UnknownModelJob defines the unknown model job UnknownModelJob ModelJobType = "unknown" )
type ModelOptimizeArgs ¶ added in v0.9.0
type ModelOptimizeArgs struct { Optimizer string `yaml:"optimizer"` // --optimizer TargetDevice string `yaml:"targetDevice"` // --target-device ExportPath string `yaml:"exportPath"` // --export-path CommonModelArgs `yaml:",inline"` }
type ModelProfileArgs ¶ added in v0.9.0
type ModelProfileArgs struct { ReportPath string `yaml:"reportPath"` // --report-path UseTensorboard bool `yaml:"useTensorboard"` // --tensorboard TensorboardImage string `yaml:"tensorboardImage"` // --tensorboardImage CommonModelArgs `yaml:",inline"` }
type ModelTypeInfo ¶ added in v0.9.0
type ModelTypeInfo struct { Name ModelJobType Alias string Shorthand string }
type ModelVersion ¶ added in v0.9.14
type ModelVersion struct { Name string `json:"name"` Version string `json:"version,omitempty"` CreationTimestamp int64 `json:"creation_timestamp,omitempty"` LastUpdatedTimestamp int64 `json:"last_updated_timestamp,omitempty"` Description string `json:"description,omitempty"` UserId string `json:"user_id,omitempty"` CurrentStage string `json:"current_stage,omitempty"` Source string `json:"source,omitempty"` RunId string `json:"run_id,omitempty"` Status ModelVersionStatus `json:"status,omitempty"` StatusMessage string `json:"status_message,omitempty"` Tags []*ModelVersionTag `json:"tags,omitempty"` RunLink string `json:"run_link,omitempty"` Aliases []string `json:"aliases,omitempty"` }
type ModelVersionStatus ¶ added in v0.9.14
// ModelVersionStatus is the string type of the ModelVersion Status field.
type ModelVersionStatus string
const ( PENDING_REGISTRATION ModelVersionStatus = "PENDING_REGISTRATION" FAILED_REGISTRATION ModelVersionStatus = "FAILED_REGISTRATION" READY ModelVersionStatus = "READY" )
type ModelVersionTag ¶ added in v0.9.14
type NodeTypeInfo ¶
type NormalNodeInfo ¶
// NormalNodeInfo consists solely of the embedded CommonNodeInfo, which is
// tagged inline for both YAML and JSON.
type NormalNodeInfo struct {
CommonNodeInfo `yaml:",inline" json:",inline"`
}
type PodGpuMetric ¶
type PortSelector ¶
type PortSelector struct { *istiov1alpha3.PortSelector Number uint32 `protobuf:"varint,1,opt,name=number,proto3,oneof" json:"number,omitempty"` }
type PreprocesObject ¶
type PreprocesObject struct { ServiceName string Namespace string DestinationRule DestinationRuleCRD VirtualService VirtualServiceCRD }
type PrometheusMetric ¶
type PrometheusMetric struct { Status string `json:"status,inline"` Data PrometheusMetricData `json:"data,omitempty"` }
type PrometheusMetricData ¶
type PrometheusMetricData struct { Result []PrometheusMetricResult `json:"result"` ResultType string `json:"resultType"` }
type PrometheusMetricResult ¶
type PrometheusMetricResult struct { Metric map[string]string `json:"metric"` Value []PrometheusMetricValue `json:"value"` }
type PrometheusMetricValue ¶
// PrometheusMetricValue is an empty interface, so it can hold a value of
// any type.
type PrometheusMetricValue interface{}
type PrometheusServer ¶
type PrometheusServer struct { Name string ServiceLabels string Protocol string Port string Path string MetricList []string Service *v1.Service }
PrometheusServer is used to define prometheus server
type RayClusterSpec ¶ added in v0.11.0
type RayClusterSpec struct { // The version of Ray you are using. Make sure all Ray containers are running this version of Ray. RayVersion string `yaml:"rayVersion"` // EnableInTreeAutoscaling indicates whether operator should create in tree autoscaling configs EnableInTreeAutoscaling bool `yaml:"enableInTreeAutoscaling,omitempty"` // AutoscalerOptions specifies optional configuration for the Ray autoscaler. AutoscalerOptions AutoscalerOptions `yaml:"autoscalerOptions,omitempty"` HeadGroupSpec HeadGroupSpec `yaml:"head"` WorkerGroupSpec WorkerGroupSpec `yaml:"worker"` // the command that needs to be executed before stopping PreStopCmd []string `yaml:"preStopCmd"` }
type RegisteredModel ¶ added in v0.9.14
type RegisteredModel struct { Name string `json:"name"` CreationTimestamp int64 `json:"creation_timestamp,omitempty"` LastUpdatedTimestamp int64 `json:"last_updated_timestamp,omitempty"` Description string `json:"description,omitempty"` LatestVersions []*ModelVersion `json:"latest_versions,omitempty"` Tags []*RegisteredModelTag `json:"tags,omitempty"` Aliases []*RegisteredModelAlias `json:"aliases,omitempty"` }
Model Management
type RegisteredModelAlias ¶ added in v0.9.14
type RegisteredModelTag ¶ added in v0.9.14
func (RegisteredModelTag) String ¶ added in v0.9.14
func (t RegisteredModelTag) String() string
type ScaleETJobArgs ¶
type ScaleETJobArgs struct { //--name string required, et job name Name string `yaml:"etName"` // TrainingType stores the trainingType JobType TrainingJobType `yaml:"-"` // Namespace stores the namespace of job,match option --namespace Namespace string `yaml:"-"` //--timeout int timeout of callback scaler script. Timeout int `yaml:"timeout"` //--retry int retry times. Retry int `yaml:"retry"` //--count int the nums of you want to add or delete worker. Count int `yaml:"count"` //--script string script of scaling. Script string `yaml:"script"` //-e, --env stringArray the environment variables Envs map[string]string `yaml:"envs"` }
type ScaleInETJobArgs ¶
type ScaleInETJobArgs struct { // common args ScaleETJobArgs `yaml:",inline"` }
type ScaleOutETJobArgs ¶
type ScaleOutETJobArgs struct { // common args ScaleETJobArgs `yaml:",inline"` }
type SeldonServingArgs ¶ added in v0.8.0
type SeldonServingArgs struct { Implementation string `yaml:"implementation"` // --implementation ModelUri string `yaml:"modelUri"` // --modelUri CommonServingArgs `yaml:",inline"` }
type ServingInstance ¶
// ServingInstance describes one instance of a serving job.
type ServingInstance struct {
	// Name is the instance name.
	Name string `json:"name" yaml:"name"`
	// Status is the instance status.
	Status string `json:"status" yaml:"status"`
	// Age is the instance age.
	Age string `json:"age" yaml:"age"`
	// ReadyContainer is the count of ready containers.
	ReadyContainer int `json:"readyContainers" yaml:"readyContainers"`
	// TotalContainer is the count of total containers.
	TotalContainer int `json:"totalContainers" yaml:"totalContainers"`
	// RestartCount is the count of instance restarts.
	RestartCount int `json:"restartCount" yaml:"restartCount"`
	// NodeIP is the host ip of the instance.
	NodeIP string `json:"nodeIP" yaml:"nodeIP"`
	// NodeName is the node name.
	NodeName string `json:"nodeName" yaml:"nodeName"`
	// IP is the instance ip.
	IP string `json:"ip" yaml:"ip"`
	// RequestGPUs is the requested gpus.
	RequestGPUs float64 `json:"requestGPUs" yaml:"requestGPUs"`
	// RequestGPUMemory is the requested gpu memory.
	RequestGPUMemory int `json:"requestGPUMemory" yaml:"requestGPUMemory"`
	// RequestGPUCore is the requested gpu core, only for gpushare.
	RequestGPUCore int `json:"requestGPUCore" yaml:"requestGPUCore"`
	// CreationTimestamp is the creation timestamp of the instance.
	CreationTimestamp int64 `json:"creationTimestamp" yaml:"creationTimestamp"`
}
type ServingJobInfo ¶
type ServingJobInfo struct { // UUID specifies the unique identity of the serving job UUID string `json:"uuid" yaml:"uuid"` // Name specifies serving job name Name string `json:"name" yaml:"name"` // Namespace specifies serving job namespace Namespace string `json:"namespace" yaml:"namespace"` // Type specifies serving job type Type string `json:"type" yaml:"type"` // Version specifies serving job version Version string `json:"version" yaml:"version"` // Age specifies the serving job age Age string `json:"age" yaml:"age"` // Desired specifies the desired instances Desired int `json:"desiredInstances" yaml:"desiredInstances"` // Available specifies the available instances Available int `json:"availableInstances" yaml:"availableInstances"` // Endpoints specifies the endpoints Endpoints []Endpoint `json:"endpoints" yaml:"endpoints"` // IPAddress specifies the ip address IPAddress string `json:"ip" yaml:"ip"` // Instances gives the instance informations Instances []ServingInstance `json:"instances" yaml:"instances"` // RequestCPUs specifies the request cpus RequestCPUs float64 `json:"requestCPUs" yaml:"requestCPUs"` // RequestGPUs specifies the request gpus RequestGPUs float64 `json:"requestGPUs" yaml:"requestGPUs"` // RequestGPUMemory specifies the request gpu memory,only for gpushare RequestGPUMemory int `json:"requestGPUMemory" yaml:"requestGPUMemory"` // RequestGPUMemory specifies the request gpu core,only for gpushare RequestGPUCore int `json:"requestGPUCore" yaml:"requestGPUCore"` // CreationTimestamp stores the creation timestamp of job CreationTimestamp int64 `json:"creationTimestamp" yaml:"creationTimestamp"` }
ServingJobInfo displays serving job information
type ServingJobType ¶
// ServingJobType is the string type used to name a serving job type.
type ServingJobType string
ServingJobType defines the serving job type; the name must be of the form shorthand + "-serving"
const ( // TFServingJob defines the tensorflow serving job TFServingJob ServingJobType = "tf-serving" // TRTServingJob defines the tensorrt serving job TRTServingJob ServingJobType = "trt-serving" // KFServingJob defines the kfserving job KFServingJob ServingJobType = "kf-serving" // KServeJob defines the kserve job KServeJob ServingJobType = "kserve" // SeldonServingJob defines the seldon core job SeldonServingJob ServingJobType = "seldon-serving" // TritonServingJob defines the nvidia triton server job TritonServingJob ServingJobType = "triton-serving" // CustomServingJob defines the custom serving job CustomServingJob ServingJobType = "custom-serving" // DistributedServingJob defines the distributed serving job DistributedServingJob ServingJobType = "distributed-serving" // AllServingJob represents all serving job type AllServingJob ServingJobType = "" // UnknownServingJob defines the unknown serving job UnknownServingJob ServingJobType = "unknown" )
type ServingTypeInfo ¶
type ServingTypeInfo struct { Name ServingJobType Alias string Shorthand string }
type ServingVersionWeight ¶
type StringMatchPrefix ¶
// StringMatchPrefix holds a single Prefix string. It is the type of the
// Uri field declared on this package's HTTPMatchRequest.
type StringMatchPrefix struct {
	// Prefix is serialized as "prefix" and omitted from JSON when empty.
	Prefix string `protobuf:"bytes,2,opt,name=prefix,proto3,oneof" json:"prefix,omitempty"`
}
type SubmitDeepSpeedJobArgs ¶ added in v0.9.9
type SubmitDeepSpeedJobArgs struct { Cpu string `yaml:"cpu"` // --cpu Memory string `yaml:"memory"` // --memory // for common args CommonSubmitArgs `yaml:",inline"` // SubmitTensorboardArgs stores tensorboard information SubmitTensorboardArgs `yaml:",inline"` // SubmitSyncCodeArgs stores syncing code information SubmitSyncCodeArgs `yaml:",inline"` LauncherSelectors map[string]string `yaml:"launcherSelectors"` // --launcher-selector JobRestartPolicy string `yaml:"jobRestartPolicy"` // --job-restart-policy JobBackoffLimit int `yaml:"jobBackoffLimit"` // --job-backoff-limit // SSHSecret enables create secret for job. SSHSecret string `yaml:"sshSecret"` SecretData map[string]string `yaml:"secretData"` // Annotations defines launcher pod annotations of job,match option --launcher-annotation LauncherAnnotations map[string]string `yaml:"launcherAnnotations"` // Annotations defines worker pod annotations of job,match option --worker-annotation WorkerAnnotations map[string]string `yaml:"workerAnnotations"` }
type SubmitETJobArgs ¶
type SubmitETJobArgs struct { Cpu string `yaml:"cpu"` // --cpu Memory string `yaml:"memory"` // --memory // for common args CommonSubmitArgs `yaml:",inline"` // SubmitTensorboardArgs stores tensorboard information SubmitTensorboardArgs `yaml:",inline"` // SubmitSyncCodeArgs stores syncing code information SubmitSyncCodeArgs `yaml:",inline"` MaxWorkers int `yaml:"maxWorkers"` MinWorkers int `yaml:"minWorkers"` LauncherSelectors map[string]string `yaml:"launcherSelectors"` // --launcher-selector JobRestartPolicy string `yaml:"jobRestartPolicy"` // --job-restart-policy WorkerRestartPolicy string `yaml:"workerRestartPolicy"` // --worker-restart-policy JobBackoffLimit int `yaml:"jobBackoffLimit"` // --job-backoff-limit // SSHSecret enables create secret for job. SSHSecret string `yaml:"sshSecret"` SecretData map[string]string `yaml:"secretData"` // Annotations defines launcher pod annotations of job,match option --launcher-annotation LauncherAnnotations map[string]string `yaml:"launcherAnnotations"` // Annotations defines worker pod annotations of job,match option --worker-annotation WorkerAnnotations map[string]string `yaml:"workerAnnotations"` }
type SubmitHorovodJobArgs ¶
type SubmitHorovodJobArgs struct { SSHPort int `yaml:"sshPort"` Cpu string `yaml:"cpu"` // --cpu Memory string `yaml:"memory"` // --memory // for common args CommonSubmitArgs `yaml:",inline"` // for tensorboard SubmitTensorboardArgs `yaml:",inline"` // for sync up source code SubmitSyncCodeArgs `yaml:",inline"` }
type SubmitMPIJobArgs ¶
type SubmitMPIJobArgs struct { Cpu string `yaml:"cpu"` // --cpu Memory string `yaml:"memory"` // --memory // for common args CommonSubmitArgs `yaml:",inline"` // for tensorboard SubmitTensorboardArgs `yaml:",inline"` // for sync up source code SubmitSyncCodeArgs `yaml:",inline"` // enable gpu topology scheduling GPUTopology bool `yaml:"gputopology"` GPUTopologyReplica string `yaml:"gputopologyreplica"` MountsOnLauncher bool `yaml:"mountsOnLauncher"` // clean-task-policy CleanPodPolicy string `yaml:"cleanPodPolicy"` }
type SubmitPyTorchJobArgs ¶
type SubmitPyTorchJobArgs struct { Cpu string `yaml:"cpu"` // --cpu Memory string `yaml:"memory"` // --memory // for common args CommonSubmitArgs `yaml:",inline"` // for tensorboard SubmitTensorboardArgs `yaml:",inline"` // for sync up source code SubmitSyncCodeArgs `yaml:",inline"` // clean-task-policy CleanPodPolicy string `yaml:"cleanPodPolicy"` // ActiveDeadlineSeconds Specifies the duration (in seconds) since startTime during which the job can remain active // before it is terminated ActiveDeadlineSeconds int64 `yaml:"activeDeadlineSeconds,omitempty"` // Defines the TTL for cleaning up finished PytorchJobs. Defaults to infinite. TTLSecondsAfterFinished int32 `yaml:"ttlSecondsAfterFinished,omitempty"` // TrainingOperatorCRD compatible with training-operator crd. TrainingOperatorCRD bool `yaml:"trainingOperatorCRD,omitempty"` ShareMemory string `yaml:"shareMemory"` }
type SubmitRayJobArgs ¶ added in v0.11.0
type SubmitRayJobArgs struct { // for common args CommonSubmitArgs `yaml:",inline"` // for tensorboard SubmitTensorboardArgs `yaml:",inline"` // for sync up source code SubmitSyncCodeArgs `yaml:",inline"` // ShutdownAfterJobFinishes will determine whether to delete the ray cluster once rayJob succeed or failed. // default:=false ShutdownAfterJobFinishes bool `yaml:"shutdownAfterJobFinishes,omitempty"` // TTLSecondsAfterFinished is the TTL to clean up RayCluster. // It's only working when ShutdownAfterJobFinishes set to true. // default:=0 TTLSecondsAfterFinished int32 `yaml:"ttlSecondsAfterFinished,omitempty"` // ActiveDeadlineSeconds is the duration in seconds that the RayJob may be active before // KubeRay actively tries to terminate the RayJob; value must be positive integer. ActiveDeadlineSeconds int32 `yaml:"activeDeadlineSeconds,omitempty"` // suspend specifies whether the RayJob controller should create a RayCluster instance // If a job is applied with the suspend field set to true, // the RayCluster will not be created and will wait for the transition to false. // If the RayCluster is already created, it will be deleted. // In case of transition to false a new RayCluster will be created. Suspend bool `yaml:"suspend,omitempty"` RayClusterSpec `yaml:",inline"` ShareMemory string `yaml:"shareMemory"` }
type SubmitSparkJobArgs ¶
type SubmitSparkJobArgs struct { Name string `yaml:"-"` Namespace string `yaml:"-"` TrainingType TrainingJobType `yaml:"-"` Image string `yaml:"Image"` MainClass string `yaml:"MainClass"` Jar string `yaml:"Jar"` SparkVersion string `yaml:"SparkVersion"` Driver *Driver `yaml:"Driver"` Executor *Executor `yaml:"Executor"` // Annotations defines pod annotations of job,match option --annotation Annotations map[string]string `yaml:"annotations"` // Labels specify the job labels and it is work for pods Labels map[string]string `yaml:"labels"` }
type SubmitSyncCodeArgs ¶
// SubmitSyncCodeArgs stores the code-syncing options that the submit
// argument types inline.
type SubmitSyncCodeArgs struct {
	// SyncMode matches option --syncMode: rsync, hdfs, git.
	SyncMode string `yaml:"syncMode"`
	// SyncSource matches option --syncSource.
	SyncSource string `yaml:"syncSource"`
	// SyncImage matches option --syncImage.
	SyncImage string `yaml:"syncImage,omitempty"`
	// SyncGitProjectName is serialized under the "syncGitProjectName" key.
	// NOTE(review): the previous trailing comment said --syncImage, which
	// looks copy-pasted — confirm the real option name.
	SyncGitProjectName string `yaml:"syncGitProjectName,omitempty"`
}
type SubmitTFJobArgs ¶
type SubmitTFJobArgs struct { // TFNodeSelectors assigns tfjob node selectors TFNodeSelectors map[string]map[string]string `yaml:"tfNodeSelectors"` // Port defines the defaut port if workerPort and PSPort are not set Port int // WorkerImage assigns worker image,match option --worker-image WorkerImage string `yaml:"workerImage"` // WorkerPort stores worker port,match option --work-port WorkerPort int `yaml:"workerPort"` // PSPort stores the ps port,match option --ps-port PSPort int `yaml:"psPort"` // PSCount stores the ps count,--ps-count PSCount int `yaml:"ps"` // PSImage stores the ps image,--ps-image PSImage string `yaml:"psImage"` // WorkerCpu stores the cpu of job worker,match option --worker-cpu WorkerCpu string `yaml:"workerCPU"` // WorkerCpuLimit stores the cpu limit of job worker,match option --worker-cpu-limit WorkerCpuLimit string `yaml:"workerCPULimit"` //WorkerNodeSelectors map[string]string `yaml:"workerNodeSelectors"` // --worker-selector // WorkerMemory stores woker memory,match option --worker-memory WorkerMemory string `yaml:"workerMemory"` // WorkerMemoryLimit stores woker memory limit,match option --worker-memory-limit WorkerMemoryLimit string `yaml:"workerMemoryLimit"` // PSCpu stores ps cpu,match option --ps-cpu PSCpu string `yaml:"psCPU"` // PSCpuLimit stores ps cpu limit,match option --ps-cpu-limit PSCpuLimit string `yaml:"psCPULimit"` // PSGpu stores ps gpu,match option --ps-gpus PSGpu int `yaml:"psGPU"` // --ps-gpus // PSMemory stores the ps memory,match option --ps-memory PSMemory string `yaml:"psMemory"` // PSMemoryLimit stores the ps memory limit,match option --ps-memory-limit PSMemoryLimit string `yaml:"psMemoryLimit"` // SuccessPolicy defines the policy to mark the TFJob as succeeded. 
SuccessPolicy string `yaml:"successPolicy"` // CleanPodPolicy stores the cleaning pod policy,match option --clean-task-policy CleanPodPolicy string `yaml:"cleanPodPolicy"` // UseChief stores the using chief or not,match option --chief UseChief bool `yaml:",omitempty"` // --chief // ChiefCount stores the chief count of job,match option --chief-count ChiefCount int `yaml:"chief"` // UseEvaluator is used to enable evaluator or not,match option --evaluator UseEvaluator bool `yaml:",omitempty"` // ChiefPort stores the chief port,match option --chief-port ChiefPort int `yaml:"chiefPort"` //ChiefNodeSelectors map[string]string `yaml:"chiefNodeSelectors"` // --chief-selector // ChiefCpu stores the chief pod cpu,match option --chief-cpu ChiefCpu string `yaml:"chiefCPU"` // ChiefCpuLimit stores the chief pod cpu limit,match option --chief-cpu-limit ChiefCpuLimit string `yaml:"chiefCPULimit"` // ChiefMemory stores the chief pod memory,match option --chief-memory ChiefMemory string `yaml:"chiefMemory"` // ChiefMemoryLimit stores the chief pod memory limit,match option --chief-memory-limit ChiefMemoryLimit string `yaml:"chiefMemoryLimit"` // EvaluatorCpu stores the evaluator pod cpu,match option --evaluator-cpu EvaluatorCpu string `yaml:"evaluatorCPU"` // EvaluatorCpuLimit stores the evaluator pod cpu limit,match option --evaluator-cpu-limit EvaluatorCpuLimit string `yaml:"evaluatorCPULimit"` //EvaluatorNodeSelectors map[string]string `yaml:"evaluatorNodeSelectors"` // --evaluator-selector // EvaluatorMemory stores the evaluator pod memory,match option --evaluator-memory EvaluatorMemory string `yaml:"evaluatorMemory"` // --evaluatorMemory // EvaluatorMemoryLimit stores the evaluator pod memory limit,match option --evaluator-memory-limit EvaluatorMemoryLimit string `yaml:"evaluatorMemoryLimit"` // --evaluatorMemoryLimit // EvaluatorCount stores the evaluator pod count,match option --evaluator-count EvaluatorCount int `yaml:"evaluator"` // HasGangScheduler determines if it has 
gang scheduler HasGangScheduler bool `yaml:"hasGangScheduler"` // ActiveDeadlineSeconds Specifies the duration (in seconds) since startTime during which the job can remain active // before it is terminated ActiveDeadlineSeconds int64 `yaml:"activeDeadlineSeconds,omitempty"` // StartingDeadlineSeconds Specifies the duration (in seconds) since startTime during which the job can remain pending // before it is terminated StartingDeadlineSeconds int64 `yaml:"startingDeadlineSeconds,omitempty"` // Defines the TTL for cleaning up finished TFJobs. Defaults to infinite. TTLSecondsAfterFinished int32 `yaml:"ttlSecondsAfterFinished,omitempty"` ShareMemory string `yaml:"shareMemory"` // for common args CommonSubmitArgs `yaml:",inline"` // SubmitTensorboardArgs stores tensorboard information SubmitTensorboardArgs `yaml:",inline"` // SubmitSyncCodeArgs stores syncing code information SubmitSyncCodeArgs `yaml:",inline"` // TFRuntime stores the runtime TFRuntime `yaml:"-"` // TrainingOperatorCRD compatible with training-operator crd. TrainingOperatorCRD bool `yaml:"trainingOperatorCRD,omitempty"` }
type SubmitTensorboardArgs ¶
// SubmitTensorboardArgs is used to store tensorboard information.
type SubmitTensorboardArgs struct {
	// UseTensorboard matches option --tensorboard.
	UseTensorboard bool `yaml:"useTensorboard"`
	// TensorboardImage matches option --tensorboardImage.
	TensorboardImage string `yaml:"tensorboardImage,omitempty"`
	// TrainingLogdir matches option --logdir.
	TrainingLogdir string `yaml:"trainingLogdir"`
	// HostLogPath is serialized under the "hostLogPath" key.
	HostLogPath string `yaml:"hostLogPath"`
	// IsLocalLogging is serialized under the "isLocalLogging" key.
	IsLocalLogging bool `yaml:"isLocalLogging"`
}
SubmitTensorboardArgs is used to store tensorboard information
type SubmitVolcanoJobArgs ¶
type SubmitVolcanoJobArgs struct { // Name stores the job name Name string // Namespace stores the namespace of job Namespace string // TrainingType is used to accept job type TrainingType TrainingJobType // Command defines the job command Command string // The MinAvailable available pods to run for this Job MinAvailable int `yaml:"minAvailable"` // Specifies the queue that will be used in the scheduler, "default" queue is used this leaves empty. Queue string `yaml:"queue"` // SchedulerName is the default value of `tasks.template.spec.schedulerName`. SchedulerName string `yaml:"schedulerName"` // TaskName specifies the name of task TaskName string `yaml:"taskName"` // TaskImages specifies the task image TaskImages []string `yaml:"taskImages"` // TaskReplicas specifies the replicas of this Task in Job TaskReplicas int `yaml:"taskReplicas"` // TaskCPU specifies the cpu resource required for each replica of Task in Job. default is 250m TaskCPU string `yaml:"taskCPU"` // TaskMemory specifies the memory resource required for each replica of Task in Job. default is 128Mi TaskMemory string `yaml:"taskMemory"` // TaskPort specifies the task port TaskPort int `yaml:"taskPort"` // Annotations defines pod annotations of job,match option --annotation Annotations map[string]string `yaml:"annotations"` // Labels specify the job labels and it is work for pods Labels map[string]string `yaml:"labels"` }
type TFRuntime ¶
type TFRuntime interface { // check the tfjob args Check(tf *SubmitTFJobArgs) (err error) // transform the tfjob Transform(tf *SubmitTFJobArgs) (err error) Runtime }
Customized runtime for tf training
type TensorFlowServingArgs ¶
type TensorFlowServingArgs struct { VersionPolicy string `yaml:"versionPolicy"` // --version-policy ModelConfigFile string `yaml:"modelConfigFile"` // --model-config-file MonitoringConfigFile string `yaml:"monitoringConfigFile"` // --monitoring-config-file ModelPath string `yaml:"modelPath"` // --model-path Port int `yaml:"port"` // --port RestfulPort int `yaml:"restApiPort"` // --restful-port CommonServingArgs `yaml:",inline"` }
type TensorRTServingArgs ¶
type TensorRTServingArgs struct { ModelStore string `yaml:"modelStore"` // --modelStore MetricsPort int `yaml:"metricsPort"` // --metricsPort HttpPort int `yaml:"httpPort"` // --httpPort GrpcPort int `yaml:"grpcPort"` // --grpcPort AllowMetrics bool `yaml:"allowMetrics"` // --allowMetrics CommonServingArgs `yaml:",inline"` }
type TolerationArgs ¶ added in v0.9.2
type TrafficRouterSplitArgs ¶
type TrafficRouterSplitArgs struct { ServingName string `yaml:"servingName,omitempty"` //--name Namespace string `yaml:"namespace,omitempty"` //--namespace Versions string `yaml:"versions,omitempty"` //--versions Weights string `yaml:"weights,omitempty"` //--weights VersionWeights []ServingVersionWeight }
type TrainingJobInfo ¶
type TrainingJobInfo struct { // The unique identity of the training job UUID string `json:"uuid" yaml:"uuid"` // The name of the training job Name string `json:"name" yaml:"name"` // The namespace of the training job Namespace string `json:"namespace" yaml:"namespace"` // The time of the training job Duration string `json:"duration" yaml:"duration"` // The status of the training Job Status TrainingJobStatus `json:"status" yaml:"status"` // The training type of the training job Trainer TrainingJobType `json:"trainer" yaml:"trainer"` // The tensorboard of the training job Tensorboard string `json:"tensorboard" yaml:"tensorboard"` // The name of the chief Instance ChiefName string `json:"chiefName" yaml:"chiefName"` // The instances under the training job Instances []TrainingJobInstance `json:"instances" yaml:"instances"` // The priority of the training job Priority string `json:"priority" yaml:"priority"` // RequestGPU stores the request gpus RequestGPU int64 `json:"requestGPUs" yaml:"requestGPUs"` // AllocatedGPU stores the allocated gpus AllocatedGPU int64 `json:"allocatedGPUs" yaml:"allocatedGPUs"` // CreationTimestamp stores the creation timestamp of job CreationTimestamp int64 `json:"creationTimestamp" yaml:"creationTimestamp"` // Model information associated with this job ModelName string `json:"modelName"` ModelVersion string `json:"modelVersion"` ModelSource string `json:"modelSource"` }
TrainingJobInfo stores training job information
type TrainingJobInstance ¶
type TrainingJobInstance struct { // IP defines the instance ip IP string `json:"ip" yaml:"ip"` // the status of of instance Status string `json:"status"` // the name of instance Name string `json:"name"` // the age of instance Age string `json:"age"` // the node instance runs on Node string `json:"node"` // NodeIP is store the node ip NodeIP string `json:"nodeIP" yaml:"nodeIP"` // the instance is chief or not IsChief bool `json:"chief" yaml:"chief"` // RequestGPUs is used to store request gpu count RequestGPUs int `json:"requestGPUs" yaml:"requestGPUs"` // GpuDutyCycle stores the gpu metrics GPUMetrics map[string]GpuMetric `json:"gpuMetrics" yaml:"gpuMetrics"` // CreationTimestamp returns the creation timestamp of instance CreationTimestamp int64 `json:"creationTimestamp" yaml:"creationTimestamp"` }
TrainingJobInstance defines the instance of training job
type TrainingJobStatus ¶
// TrainingJobStatus is a string type that names the status of a training job.
type TrainingJobStatus string
TrainingJobStatus defines all the kinds of JobStatus
const ( // TrainingJobQueuing means the job is queuing TrainingJobQueuing TrainingJobStatus = "QUEUING" // TrainingJobPending means the job is pending TrainingJobPending TrainingJobStatus = "PENDING" // TrainingJobRunning means the job is running TrainingJobRunning TrainingJobStatus = "RUNNING" // TrainingJobSucceeded means the job is Succeeded TrainingJobSucceeded TrainingJobStatus = "SUCCEEDED" // TrainingJobFailed means the job is failed TrainingJobFailed TrainingJobStatus = "FAILED" )
type TrainingJobType ¶
// TrainingJobType is a string type that names a supported kind of training job.
type TrainingJobType string
TrainingJobType defines the supporting training job type
const ( // TFTrainingJob defines the tfjob TFTrainingJob TrainingJobType = "tfjob" // MPITrainingJob defines the mpijob MPITrainingJob TrainingJobType = "mpijob" // PytorchTrainingJob defines the pytorchjob PytorchTrainingJob TrainingJobType = "pytorchjob" // HorovodTrainingJob defines the horovod job HorovodTrainingJob TrainingJobType = "horovodjob" // VolcanoTrainingJob defines the volcano job VolcanoTrainingJob TrainingJobType = "volcanojob" // ETTrainingJob defines the etjob ETTrainingJob TrainingJobType = "etjob" // SparkTrainingJob defines the spark job SparkTrainingJob TrainingJobType = "sparkjob" // DeepSpeedTrainingJob defines the deepspeed job DeepSpeedTrainingJob TrainingJobType = "deepspeedjob" // AllTrainingJob represents all job types AllTrainingJob TrainingJobType = "" // UnknownTrainingJob defines the unknown training UnknownTrainingJob TrainingJobType = "unknown" // RayJob defines the ray job RayJob TrainingJobType = "rayjob" )
type TrainingJobTypeInfo ¶
type TrainingJobTypeInfo struct { Name TrainingJobType Alias string Shorthand string }
type TritonServingArgs ¶ added in v0.8.5
type TritonServingArgs struct { Backend string `yaml:"backend"` // --backend ModelRepository string `yaml:"modelRepository"` // --model-repository MetricsPort int `yaml:"metricsPort"` // --metrics-port HttpPort int `yaml:"httpPort"` // --http-port GrpcPort int `yaml:"grpcPort"` // --grpc-port AllowMetrics bool `yaml:"allowMetrics"` // --allow-metrics LoadModels []string `yaml:"loadModels"` // --load-model ExtendCommand string `yaml:"extendCommand"` // --extend-command CommonServingArgs `yaml:",inline"` }
type UpdateCustomServingArgs ¶ added in v0.8.9
// UpdateCustomServingArgs embeds CommonUpdateServingArgs and declares no
// fields of its own.
// NOTE(review): presumably the arguments for updating a custom serving job —
// confirm against callers.
type UpdateCustomServingArgs struct {
	// CommonUpdateServingArgs is embedded and tagged as a YAML inline field.
	CommonUpdateServingArgs `yaml:",inline"`
}
type UpdateDistributedServingArgs ¶ added in v0.12.0
type UpdateDistributedServingArgs struct { Workers int `yaml:"workers"` // --workers MasterCpu string `yaml:"masterCPU"` // --master-cpu WorkerCpu string `yaml:"workerCPU"` // --worker-cpu MasterGPUCount int `yaml:"masterGPUCount"` // master-gpus WorkerGPUCount int `yaml:"workerGPUCount"` // worker-gpus MasterMemory string `yaml:"masterMemory"` // master-memory WorkerMemory string `yaml:"workerMemory"` // worker-memory MasterGPUMemory int `yaml:"masterGPUMemory"` // master-gpumemory WorkerGPUMemory int `yaml:"workerGPUMemory"` // worker-gpumemory MasterGPUCore int `yaml:"masterGPUCore"` // master-gpucore WorkerGPUCore int `yaml:"workerGPUCore"` // worker-gpucore MasterCommand string `yaml:"masterCommand"` // master-command WorkerCommand string `yaml:"workerCommand"` // worker-command CommonUpdateServingArgs `yaml:",inline"` }
type UpdateKServeArgs ¶ added in v0.9.11
type UpdateKServeArgs struct { ModelFormat *ModelFormat `yaml:"modelFormat"` // --model-format Runtime string `yaml:"runtime"` // --runtime StorageUri string `yaml:"storageUri"` // --storageUri RuntimeVersion string `yaml:"runtimeVersion"` // --runtime-version ProtocolVersion string `yaml:"protocolVersion"` // --protocol-version MinReplicas int `yaml:"minReplicas"` // --min-replicas MaxReplicas int `yaml:"maxReplicas"` // --max-replicas ScaleTarget int `yaml:"scaleTarget"` // --scale-target ScaleMetric string `yaml:"scaleMetric"` // --scale-metric ContainerConcurrency int64 `yaml:"containerConcurrency"` // --container-concurrency TimeoutSeconds int64 `yaml:"timeout"` // --timeout CanaryTrafficPercent int64 `yaml:"canaryTrafficPercent,omitempty"` // --canary-traffic-percent Port int `yaml:"port"` // --port CommonUpdateServingArgs `yaml:",inline"` }
type UpdateTensorFlowServingArgs ¶ added in v0.8.9
type UpdateTensorFlowServingArgs struct { ModelConfigFile string `yaml:"modelConfigFile"` // --model-config-file MonitoringConfigFile string `yaml:"monitoringConfigFile"` // --monitoring-config-file ModelName string `yaml:"modelName"` // --model-name ModelPath string `yaml:"modelPath"` // --model-path CommonUpdateServingArgs `yaml:",inline"` }
type UpdateTritonServingArgs ¶ added in v0.8.9
type UpdateTritonServingArgs struct { ModelRepository string `yaml:"modelRepository"` // --model-repository AllowMetrics bool `yaml:"allowMetrics"` // --allow-metrics CommonUpdateServingArgs `yaml:",inline"` }
type VirtualService ¶
type VirtualService struct { *istiov1alpha3.VirtualService Http []*HTTPRoute `protobuf:"bytes,3,rep,name=http" json:"http,omitempty"` }
type VirtualServiceCRD ¶
type VirtualServiceCRD struct { // Kind is a string value representing the REST resource this object represents. // Servers may infer this from the endpoint the client submits requests to. // Cannot be updated. // In CamelCase. // More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds // +optional Kind string `json:"kind,omitempty" protobuf:"bytes,1,opt,name=kind"` // APIVersion defines the versioned schema of this representation of an object. // Servers should convert recognized schemas to the latest internal value, and // may reject unrecognized values. // More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources // +optional APIVersion string `json:"apiVersion,omitempty" protobuf:"bytes,2,opt,name=apiVersion"` metav1.ObjectMeta `json:"metadata,omitempty" yaml:"metadata,omitempty" protobuf:"bytes,1,opt,name=metadata"` Spec VirtualService `json:"spec,omitempty" yaml:"spec,omitempty" protobuf:"bytes,2,opt,name=spec"` }
type WorkerGroupSpec ¶ added in v0.11.0
// WorkerGroupSpec defines the specs for the worker pods of one worker group.
type WorkerGroupSpec struct {
	Image  string `yaml:"image"`
	Cpu    string `yaml:"cpu"`
	Memory string `yaml:"memory"`
	Gpu    int    `yaml:"gpu"`
	// Replicas is the number of desired Pods for this worker group.
	// +kubebuilder:default:=0
	Replicas int32 `yaml:"replicas,omitempty"`
	// MinReplicas denotes the minimum number of desired Pods for this worker group.
	// +kubebuilder:default:=0
	MinReplicas int32 `yaml:"minReplicas"`
	// MaxReplicas denotes the maximum number of desired Pods for this worker group, and the default value is maxInt32.
	// +kubebuilder:default:=2147483647
	MaxReplicas int32 `yaml:"maxReplicas"`
	// NumOfHosts denotes the number of hosts to create per replica. The default value is 1.
	// +kubebuilder:default:=1
	NumOfHosts int32 `yaml:"numOfHosts,omitempty"`
}
WorkerGroupSpec defines the specs for the worker pods
Source Files ¶
- const.go
- cron.go
- evaluatejob.go
- gpu_metric.go
- gpunode.go
- loglevel.go
- model.go
- serving.go
- submit.go
- submit_deepspeedjob.go
- submit_etjob.go
- submit_horovodjob.go
- submit_mpijob.go
- submit_pytorchjob.go
- submit_rayjob.go
- submit_sparkjob.go
- submit_tfjob.go
- submit_volcanojob.go
- traffic_router.go
- training.go
- types.go
- update_serving.go