Documentation ¶
Overview ¶
Copyright 2018 The Kubeflow Authors
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License
Copyright 2018 The Kubeflow Authors ¶
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License
Index ¶
- Constants
- Variables
- type AdvancedGpuMetric
- type AllNodeInfo
- type ArenaClientArgs
- type CommonCronArgs
- type CommonGPUNodeInfo
- type CommonNodeInfo
- type CommonServingArgs
- type CommonSubmitArgs
- type ConcurrencyPolicy
- type ConfigFileInfo
- type CronHistoryInfo
- type CronInfo
- type CronTFJobArgs
- type CronType
- type CustomServingArgs
- type DataDirVolume
- type Destination
- type DestinationRuleCRD
- type DestinationWeight
- type Driver
- type Endpoint
- type Executor
- type FormatStyle
- type GPUDeviceInfo
- type GPUExclusiveNodeInfo
- type GPUExclusivePodInfo
- type GPUShareNodeDevice
- type GPUShareNodeInfo
- type GPUSharePodInfo
- type GPUTopology
- type GPUTopologyNodeDevice
- type GPUTopologyNodeInfo
- type GPUTopologyPodInfo
- type GpuMetric
- type GpuMetricInfo
- type HTTPMatchRequest
- type HTTPRoute
- type JobConditionType
- type JobGpuMetric
- type KFServingArgs
- type LimitedPodSecurityContext
- type LogArgs
- type LogLevel
- type NodeGpuMetric
- type NodeType
- type NodeTypeInfo
- type NormalNodeInfo
- type PodGpuMetric
- type PortSelector
- type PreprocesObject
- type PrometheusMetric
- type PrometheusMetricData
- type PrometheusMetricResult
- type PrometheusMetricValue
- type PrometheusServer
- type Runtime
- type ScaleETJobArgs
- type ScaleInETJobArgs
- type ScaleOutETJobArgs
- type SeldonServingArgs
- type ServingInstance
- type ServingJobInfo
- type ServingJobType
- type ServingTypeInfo
- type ServingVersionWeight
- type StringMatchPrefix
- type SubmitETJobArgs
- type SubmitHorovodJobArgs
- type SubmitMPIJobArgs
- type SubmitPyTorchJobArgs
- type SubmitSparkJobArgs
- type SubmitSyncCodeArgs
- type SubmitTFJobArgs
- type SubmitTensorboardArgs
- type SubmitVolcanoJobArgs
- type TFRuntime
- type TensorFlowServingArgs
- type TensorRTServingArgs
- type TrafficRouterSplitArgs
- type TrainingJobInfo
- type TrainingJobInstance
- type TrainingJobStatus
- type TrainingJobType
- type TrainingJobTypeInfo
- type VirtualService
- type VirtualServiceCRD
Constants ¶
const ()
const ( AliyunGPUResourceName = "aliyun.com/gpu" GPUTopologyAllocationLabel = "topology.kubernetes.io/gpu-group" GPUTopologyVisibleGPULabel = "topology.kubernetes.io/gpu-visible" GPUTopologyNodeLabels = "ack.node.gpu.schedule=topology" )
const KUBEFLOW_NAMESPACE = "kubeflow"
const KUBE_SYSTEM_NAMESPACE = "kube-system"
const NODE_METRIC_TMP = `{__name__=~"%s", node_name=~"%s"}`
const (
// defines the nvidia resource name
NvidiaGPUResourceName = "nvidia.com/gpu"
)
const POD_METRIC_TMP = `{__name__=~"%s", pod_name=~"%s"}`
const PROMETHEUS_INSTALL_DOC_URL = "https://github.com/kubeflow/arena/blob/master/docs/userguide/9-top-job-gpu-metric.md"
const PROMETHEUS_SCHEME = "http"
const PROMETHEUS_SVC_LABEL = "kubernetes.io/name=Prometheus"
const (
RequestGPUsOfJobAnnoKey = "requestGPUsOfJobOwner"
)
Variables ¶
var ErrTrainingJobNotFound = errors.New("training job not found,please use 'arena list' to make sure job is existed.")
var GPU_METRIC_LIST = []string{"nvidia_gpu_duty_cycle", "nvidia_gpu_memory_used_bytes", "nvidia_gpu_memory_total_bytes"}
var NodeTypeSlice = []NodeTypeInfo{ { Name: NormalNode, Alias: "none", Shorthand: "n", }, { Name: GPUExclusiveNode, Alias: "exclusive", Shorthand: "e", }, { Name: GPUTopologyNode, Alias: "topology", Shorthand: "t", }, { Name: GPUShareNode, Alias: "share", Shorthand: "s", }, }
var SUPPORT_PROMETHEUS_SERVERS = []*PrometheusServer{ { Name: "arms-prometheus-admin", ServiceLabels: "kubernetes.io/service-name=prometheus-admin", Protocol: "http", Port: "9335", Path: "api/v1/query", MetricList: []string{ "nvidia_gpu_duty_cycle", "nvidia_gpu_memory_used_bytes", "nvidia_gpu_memory_total_bytes", }, }, { Name: "default", ServiceLabels: "kubernetes.io/service-name=prometheus-server", Protocol: "http", Port: "9090", Path: "api/v1/query", MetricList: []string{ "nvidia_gpu_duty_cycle", "nvidia_gpu_memory_used_bytes", "nvidia_gpu_memory_total_bytes", }, }, { Name: "default-old", ServiceLabels: "kubernetes.io/name=Prometheus", Protocol: "http", Port: "9090", Path: "api/v1/query", MetricList: []string{ "nvidia_gpu_duty_cycle", "nvidia_gpu_memory_used_bytes", "nvidia_gpu_memory_total_bytes", }, }, }
var ServingTypeMap = map[ServingJobType]ServingTypeInfo{ CustomServingJob: { Name: CustomServingJob, Alias: "Custom", Shorthand: "custom", }, KFServingJob: { Name: KFServingJob, Alias: "KFServing", Shorthand: "kf", }, TFServingJob: { Name: TFServingJob, Alias: "Tensorflow", Shorthand: "tf", }, TRTServingJob: { Name: TRTServingJob, Alias: "Tensorrt", Shorthand: "trt", }, SeldonServingJob: { Name: SeldonServingJob, Alias: "Seldon", Shorthand: "seldon", }, }
ServingTypeMap collects the serving job types and their aliases.
var TrainingTypeMap = map[TrainingJobType]TrainingJobTypeInfo{ TFTrainingJob: { Name: TFTrainingJob, Alias: "Tensorflow", Shorthand: "tf", }, MPITrainingJob: { Name: MPITrainingJob, Alias: "MPI", Shorthand: "mpi", }, PytorchTrainingJob: { Name: PytorchTrainingJob, Alias: "Pytorch", Shorthand: "py", }, HorovodTrainingJob: { Name: HorovodTrainingJob, Alias: "Horovod", Shorthand: "horovod", }, VolcanoTrainingJob: { Name: VolcanoTrainingJob, Alias: "Volcano", Shorthand: "volcano", }, ETTrainingJob: { Name: ETTrainingJob, Alias: "ElasticTraining", Shorthand: "et", }, SparkTrainingJob: { Name: SparkTrainingJob, Alias: "Spark", Shorthand: "spark", }, }
TrainingTypeMap collects the training job types and their aliases.
Functions ¶
This section is empty.
Types ¶
type AdvancedGpuMetric ¶
// AdvancedGpuMetric holds one GPU metric record: id, uuid, duty cycle, used and total GPU memory, and pod names.
type AdvancedGpuMetric struct { Id string `json:"id" yaml:"id"` UUID string `json:"uuid" yaml:"uuid"` GpuDutyCycle float64 `json:"gpuDutyCycle" yaml:"gpuDutyCycle"` GpuMemoryUsed float64 `json:"usedGPUMemory" yaml:"usedGPUMemory"` GpuMemoryTotal float64 `json:"totalGPUMemory" yaml:"totalGPUMemory"` // PodNames: each entry is combined from namespace and pod name, like 'namespace/pod_name' PodNames []string `json:"podNames" yaml:"podNames"` }
type AllNodeInfo ¶
type AllNodeInfo map[string][]interface{}
type ArenaClientArgs ¶
type CommonCronArgs ¶ added in v0.8.2
// CommonCronArgs groups the cron options below; each field's yaml tag gives its key and the trailing comment names the matching command-line flag.
type CommonCronArgs struct { // The schedule in Cron format, see https://en.wikipedia.org/wiki/Cron. Schedule string `yaml:"schedule"` // --schedule // Specifies how to treat concurrent executions of a Job. // Valid values are: // - "Allow" (default): allows CronJobs to run concurrently; // - "Forbid": forbids concurrent runs, skipping next run if previous run hasn't finished yet; // - "Replace": cancels currently running job and replaces it with a new one // +optional ConcurrencyPolicy string `yaml:"concurrencyPolicy"` // --concurrency-policy // This flag tells the controller to suspend subsequent executions, it does // not apply to already started executions. Defaults to false. // +optional Suspend bool `yaml:"suspend"` // --suspend // Deadline is the timestamp until which a cron job can keep scheduling. Deadline string `yaml:"deadline"` // --deadline // The number of finished job history to retain. // NOTE(review): declared as a plain int (not a pointer), so an explicit zero cannot be distinguished from not specified. // +optional HistoryLimit int `yaml:"historyLimit"` // --history-limit }
type CommonGPUNodeInfo ¶
type CommonGPUNodeInfo struct { TotalGPUs float64 `json:"totalGPUs" yaml:"totalGPUs"` AllocatedGPUs float64 `json:"allocatedGPUs" yaml:"allocatedGPUs"` UnhealthyGPUs float64 `json:"unhealthyGPUs" yaml:"unhealthyGPUs"` GPUMetrics []*AdvancedGpuMetric `json:"gpuMetrics" yaml:"gpuMetrics"` }
type CommonNodeInfo ¶
type CommonServingArgs ¶
type CommonServingArgs struct { Name string `yaml:"servingName"` Version string `yaml:"servingVersion"` Namespace string `yaml:"-"` Type ServingJobType `yaml:"-"` Image string `yaml:"image"` ImagePullPolicy string `yaml:"imagePullPolicy"` // --imagePullPolicy GPUCount int `yaml:"gpuCount"` // --gpus GPUMemory int `yaml:"gpuMemory"` // --gpumemory Cpu string `yaml:"cpu"` // --cpu Memory string `yaml:"memory"` // --memory Envs map[string]string `yaml:"envs"` // --envs Command string `yaml:"command"` // --command Replicas int `yaml:"replicas"` // --replicas EnableIstio bool `yaml:"enableIstio"` // --enableIstio ExposeService bool `yaml:"exposeService"` // --exposeService ModelDirs map[string]string `yaml:"modelDirs"` HostVolumes []DataDirVolume `yaml:"hostVolumes"` // --data-dir NodeSelectors map[string]string `yaml:"nodeSelectors"` // --selector Tolerations []string `yaml:"tolerations"` // --toleration Annotations map[string]string `yaml:"annotations"` ModelServiceExists bool `yaml:"modelServiceExists"` // --modelServiceExists }
type CommonSubmitArgs ¶
type CommonSubmitArgs struct { // Name stores the job name,match option --name Name string `yaml:"-"` // Namespace stores the namespace of job,match option --namespace Namespace string `yaml:"-"` // TrainingType stores the trainingType TrainingType TrainingJobType `yaml:"-"` // NodeSelectors defines the node selectors,match option --selector NodeSelectors map[string]string `yaml:"nodeSelectors"` // ConfigFiles stores the config file which is existed in client host node // and map it to container,match option --config-file ConfigFiles map[string]map[string]ConfigFileInfo `yaml:"configFiles"` // Tolerations defines the tolerations which tolerates node taints // match option --toleration Tolerations []string `yaml:"tolerations"` // Image stores the docker image of job,match option --image Image string `yaml:"image"` // GPUCount stores the gpu count of the job needs,match option --gpus GPUCount int `yaml:"gpuCount"` // Envs stores the envs of container in job, match option --env Envs map[string]string `yaml:"envs"` // WorkingDir stores the working directory of container in job,match option --working-dir WorkingDir string `yaml:"workingDir"` // Command stores the command of job Command string `yaml:"command"` // Mode is used for horovod,match option --sync-mode Mode string `yaml:"mode"` // WorkerCount stores the count of job worker,match option --workers WorkerCount int `yaml:"workers"` // Retry defines the retry times Retry int `yaml:"retry"` // DataSet stores the kubernetes pvc names DataSet map[string]string `yaml:"dataset"` // DataDirs stores the files(or directories) in k8s node which will map to containers // match option --data-dir DataDirs []DataDirVolume `yaml:"dataDirs"` // EnableRDMA enable rdma or not,match option --rdma EnableRDMA bool `yaml:"enableRDMA"` // UseENI defines using eni or not UseENI bool `yaml:"useENI"` // Annotations defines pod annotations of job,match option --annotation Annotations map[string]string `yaml:"annotations"` // IsNonRoot is 
root user or not IsNonRoot bool `yaml:"isNonRoot"` // PodSecurityContext defines the pod security context PodSecurityContext LimitedPodSecurityContext `yaml:"podSecurityContext"` // PriorityClassName defines the priority class PriorityClassName string `yaml:"priorityClassName"` // Conscheduling defines using Conscheduling Conscheduling bool // PodGroupName stores pod group name PodGroupName string `yaml:"podGroupName"` // PodGroupMinAvailable stores pod group min available PodGroupMinAvailable string `yaml:"podGroupMinAvailable"` // ImagePullSecrets stores image pull secrets,match option --image-pull-secrets ImagePullSecrets []string `yaml:"imagePullSecrets"` // HelmOptions stores the helm options HelmOptions []string `yaml:"-"` }
CommonSubmitArgs defines the common parts of the submit arguments; it is embedded by the per-job-type Submit*Args structs.
type ConcurrencyPolicy ¶ added in v0.8.2
type ConcurrencyPolicy string
ConcurrencyPolicy describes how the job will be handled. Only one of the following concurrency policies may be specified. If none of the following policies is specified, the default one is ConcurrencyAllow ("Allow").
const ( ConcurrencyAllow ConcurrencyPolicy = "Allow" ConcurrencyForbid ConcurrencyPolicy = "Forbid" ConcurrencyReplace ConcurrencyPolicy = "Replace" )
type ConfigFileInfo ¶
type ConfigFileInfo struct { ContainerFileName string `yaml:"containerFileName"` HostFile string `yaml:"hostFile"` Key string `yaml:"key"` ContainerFilePath string `yaml:"containerFilePath"` }
ConfigFileInfo defines the config files which will be mounted to containers
type CronHistoryInfo ¶ added in v0.8.2
type CronHistoryInfo struct { Name string `json:"name" yaml:"name"` Namespace string `json:"namespace" yaml:"namespace"` Group string `json:"group" yaml:"group"` Kind string `json:"kind" yaml:"kind"` Status string `json:"status" yaml:"status"` CreateTime string `json:"createTime" yaml:"createTime"` FinishTime string `json:"finishTime" yaml:"finishTime"` }
type CronInfo ¶ added in v0.8.2
// CronInfo describes a cron job with JSON and YAML keys: name, namespace, type, schedule settings, timestamps and history.
type CronInfo struct { Name string `json:"name" yaml:"name"` Namespace string `json:"namespace" yaml:"namespace"` // Type is the job type, like TFJob, PyTorchJob Type string `json:"type" yaml:"type"` // The schedule in Cron format, see https://en.wikipedia.org/wiki/Cron. Schedule string `json:"schedule" yaml:"schedule"` // Specifies how to treat concurrent executions of a Job. // Valid values are: // - "Allow" (default): allows CronJobs to run concurrently; // - "Forbid": forbids concurrent runs, skipping next run if previous run hasn't finished yet; // - "Replace": cancels currently running job and replaces it with a new one // +optional ConcurrencyPolicy string `json:"concurrencyPolicy" yaml:"concurrencyPolicy"` // --concurrency-policy // This flag tells the controller to suspend subsequent executions, it does // not apply to already started executions. Defaults to false. // +optional Suspend bool `json:"suspend" yaml:"suspend"` // --suspend // Deadline is the timestamp until which a cron job can keep scheduling. Deadline string `json:"deadline" yaml:"deadline"` // --deadline // The number of finished job history to retain. // NOTE(review): declared as a plain int64 (not a pointer), so an explicit zero cannot be distinguished from not specified. // +optional HistoryLimit int64 `json:"historyLimit" yaml:"historyLimit"` // --history-limit // Information when was the last time the job was successfully scheduled. // +optional LastScheduleTime string `json:"lastScheduleTime" yaml:"lastScheduleTime"` // CreationTimestamp stores the creation timestamp of job CreationTimestamp string `json:"creationTimestamp" yaml:"creationTimestamp"` History []CronHistoryInfo `json:"cronHistory" yaml:"cronHistory"` }
type CronTFJobArgs ¶ added in v0.8.2
type CronTFJobArgs struct { CommonCronArgs `yaml:"cron"` SubmitTFJobArgs `yaml:"tfjob"` }
type CronType ¶ added in v0.8.2
type CronType string
CronType defines the supported cron job types.
const ( // CronTFTrainingJob defines the cron tfjob CronTFTrainingJob CronType = "tfjob" )
type CustomServingArgs ¶
type CustomServingArgs struct { Port int `yaml:"port"` // --port RestfulPort int `yaml:"restApiPort"` // --restfulPort CommonServingArgs `yaml:",inline"` }
type DataDirVolume ¶
type DataDirVolume struct { // HostPath defines the host path HostPath string `yaml:"hostPath"` // ContainerPath defines container path ContainerPath string `yaml:"containerPath"` // Name defines the volume name Name string `yaml:"name"` }
DataDirVolume defines the volume of kubernetes
type Destination ¶
type Destination struct { *istiov1alpha3.Destination Port *PortSelector `protobuf:"bytes,3,opt,name=port" json:"port,omitempty"` }
type DestinationRuleCRD ¶
type DestinationRuleCRD struct { // Kind is a string value representing the REST resource this object represents. // Servers may infer this from the endpoint the client submits requests to. // Cannot be updated. // In CamelCase. // More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds // +optional Kind string `json:"kind,omitempty" protobuf:"bytes,1,opt,name=kind"` // APIVersion defines the versioned schema of this representation of an object. // Servers should convert recognized schemas to the latest internal value, and // may reject unrecognized values. // More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources // +optional APIVersion string `json:"apiVersion,omitempty" protobuf:"bytes,2,opt,name=apiVersion"` metav1.ObjectMeta `json:"metadata,omitempty" yaml:"metadata,omitempty" protobuf:"bytes,1,opt,name=metadata"` Spec istiov1alpha3.DestinationRule `json:"spec,omitempty" yaml:"spec,omitempty" protobuf:"bytes,2,opt,name=spec"` }
type DestinationWeight ¶
type DestinationWeight struct { Destination *Destination `protobuf:"bytes,1,opt,name=destination" json:"destination,omitempty"` Weight int32 `protobuf:"varint,2,opt,name=weight,proto3" json:"weight"` }
type FormatStyle ¶
type FormatStyle string
FormatStyle defines the format of output; it is only used in cmd.
const ( // WideFormat defines the wide format WideFormat FormatStyle = "wide" // JsonFormat defines the json format JsonFormat FormatStyle = "json" // YamlFormat defines the yaml format YamlFormat FormatStyle = "yaml" // UnknownFormat defines the unknown format UnknownFormat FormatStyle = "unknown" )
type GPUDeviceInfo ¶
type GPUDeviceInfo struct { ID string `json:"id" yaml:"id"` TotalGPUMemory float64 `json:"totalGPUMemory" yaml:"totalGPUMemory"` AllocatedGPUMemory float64 `json:"allocatedGPUMemory" yaml:"allocatedGPUMemory"` UsedGPUMemory float64 `json:"usedGPUMemory" yaml:"usedGPUMemory"` DutyCycle float64 `json:"dutyCycle" yaml:"dutyCycle"` }
type GPUExclusiveNodeInfo ¶
type GPUExclusiveNodeInfo struct { PodInfos []GPUExclusivePodInfo `json:"instances" yaml:"instances"` CommonNodeInfo `yaml:",inline" json:",inline"` CommonGPUNodeInfo `yaml:",inline" json:",inline"` }
type GPUExclusivePodInfo ¶
type GPUShareNodeDevice ¶
type GPUShareNodeDevice struct {}
type GPUShareNodeInfo ¶
type GPUShareNodeInfo struct {}
type GPUSharePodInfo ¶
type GPUSharePodInfo struct {}
type GPUTopology ¶
type GPUTopologyNodeDevice ¶
type GPUTopologyNodeInfo ¶
type GPUTopologyNodeInfo struct { PodInfos []GPUTopologyPodInfo `json:"instances" yaml:"instances"` GPUTopology GPUTopology `json:"gpuTopology" yaml:"gpuTopology"` CommonGPUNodeInfo `yaml:",inline" json:",inline"` CommonNodeInfo `yaml:",inline" json:",inline"` Devices []GPUTopologyNodeDevice `yaml:"devices" yaml:"devices"` }
type GPUTopologyPodInfo ¶
type GPUTopologyPodInfo struct { Name string `json:"name" yaml:"name"` Namespace string `json:"namespace" yaml:"namespace"` Status string `json:"status" yaml:"status"` RequestGPU int `json:"requestGPUs" yaml:"requestGPUs"` Allocation []string `json:"allocation" yaml:"allocation"` VisibleGPUs []string `json:"visibleGPUs" yaml:"visibleGPUs"` }
type GpuMetricInfo ¶
type HTTPMatchRequest ¶
type HTTPMatchRequest struct { *istiov1alpha3.HTTPMatchRequest Uri *StringMatchPrefix `protobuf:"bytes,1,opt,name=uri" json:"uri,omitempty"` }
type HTTPRoute ¶
type HTTPRoute struct { *istiov1alpha3.HTTPRoute Match []*HTTPMatchRequest `protobuf:"bytes,1,rep,name=match" json:"match,omitempty"` Route []*DestinationWeight `protobuf:"bytes,2,rep,name=route" json:"route,omitempty"` }
type JobConditionType ¶ added in v0.8.2
type JobConditionType string
JobConditionType defines all kinds of types of JobStatus.
const ( // JobCreated means the job has been accepted by the system, // but one or more of the pods/services has not been started. // This includes time before pods being scheduled and launched. JobCreated JobConditionType = "Created" // JobRunning means all sub-resources (e.g. services/pods) of this job // have been successfully scheduled and launched. // The training is running without error. JobRunning JobConditionType = "Running" // JobRestarting means one or more sub-resources (e.g. services/pods) of this job // reached phase failed but may be restarted according to its restart policy // which is specified by the user in v1.PodTemplateSpec. // The training is freezing/pending. JobRestarting JobConditionType = "Restarting" // JobSucceeded means all sub-resources (e.g. services/pods) of this job // have terminated in success. // The training is complete without error. JobSucceeded JobConditionType = "Succeeded" // JobFailed means one or more sub-resources (e.g. services/pods) of this job // reached phase failed with no restarting. // The training has failed its execution. JobFailed JobConditionType = "Failed" )
type JobGpuMetric ¶
type JobGpuMetric map[string]PodGpuMetric
type KFServingArgs ¶
type KFServingArgs struct { Port int `yaml:"port"` // --port ModelType string `yaml:"modelType"` // --modelType CanaryPercent int `yaml:"canaryPercent"` // --canaryTrafficPercent StorageUri string `yaml:"storageUri"` // --storageUri CommonServingArgs `yaml:",inline"` }
type LimitedPodSecurityContext ¶
type LimitedPodSecurityContext struct { RunAsUser int64 `yaml:"runAsUser"` RunAsNonRoot bool `yaml:"runAsNonRoot"` RunAsGroup int64 `yaml:"runAsGroup"` SupplementalGroups []int64 `yaml:"supplementalGroups"` }
LimitedPodSecurityContext defines the Kubernetes pod security context
type NodeTypeInfo ¶
type NormalNodeInfo ¶
type NormalNodeInfo struct {
CommonNodeInfo `yaml:",inline" json:",inline"`
}
type PodGpuMetric ¶
type PortSelector ¶
type PortSelector struct { *istiov1alpha3.PortSelector Number uint32 `protobuf:"varint,1,opt,name=number,proto3,oneof" json:"number,omitempty"` }
type PreprocesObject ¶
type PreprocesObject struct { ServiceName string Namespace string DestinationRule DestinationRuleCRD VirtualService VirtualServiceCRD }
type PrometheusMetric ¶
type PrometheusMetric struct { Status string `json:"status,inline"` Data PrometheusMetricData `json:"data,omitempty"` }
type PrometheusMetricData ¶
type PrometheusMetricData struct { Result []PrometheusMetricResult `json:"result"` ResultType string `json:"resultType"` }
type PrometheusMetricResult ¶
type PrometheusMetricResult struct { Metric map[string]string `json:"metric"` Value []PrometheusMetricValue `json:"value"` }
type PrometheusMetricValue ¶
type PrometheusMetricValue interface{}
type PrometheusServer ¶
type PrometheusServer struct { Name string ServiceLabels string Protocol string Port string Path string MetricList []string Service *v1.Service }
PrometheusServer is used to define prometheus server
type ScaleETJobArgs ¶
// ScaleETJobArgs holds the arguments shared by the ET job scale-in and scale-out argument structs, which embed it.
type ScaleETJobArgs struct { //--name string required, et job name Name string `yaml:"etName"` // JobType stores the training type JobType TrainingJobType `yaml:"-"` // Namespace stores the namespace of job,match option --namespace Namespace string `yaml:"-"` //--timeout int timeout of callback scaler script. Timeout int `yaml:"timeout"` //--retry int retry times. Retry int `yaml:"retry"` //--count int the number of workers you want to add or delete. Count int `yaml:"count"` //--script string script of scaling. Script string `yaml:"script"` //-e, --env stringArray the environment variables Envs map[string]string `yaml:"envs"` }
type ScaleInETJobArgs ¶
type ScaleInETJobArgs struct { // common args ScaleETJobArgs `yaml:",inline"` }
type ScaleOutETJobArgs ¶
type ScaleOutETJobArgs struct { // common args ScaleETJobArgs `yaml:",inline"` }
type SeldonServingArgs ¶ added in v0.8.0
type SeldonServingArgs struct { Implementation string `yaml:"implementation"` // --implementation ModelUri string `yaml:"modelUri"` // --modelUri CommonServingArgs `yaml:",inline"` }
type ServingInstance ¶
type ServingInstance struct { // Name gives the instance name Name string `json:"name" yaml:"name"` // Status gives the instance status Status string `json:"status" yaml:"status"` // Age gives the instance age Age string `json:"age" yaml:"age"` // ReadyContainer represents the count of ready containers ReadyContainer int `json:"readyContainers" yaml:"readyContainers"` // TotalContainer represents the count of total containers TotalContainer int `json:"totalContainers" yaml:"totalContainers"` // RestartCount represents the count of instance restarts RestartCount int `json:"restartCount" yaml:"restartCount"` // NodeIP specifies the host ip of the instance NodeIP string `json:"nodeIP" yaml:"nodeIP"` // NodeName returns the node name NodeName string `json:"nodeName" yaml:"nodeName"` // IP returns the instance ip IP string `json:"ip" yaml:"ip"` // RequestGPU returns the request gpus RequestGPU int `json:"requestGPUs" yaml:"requestGPUs"` // RequestGPUMemory returns the request gpu memory RequestGPUMemory int `json:"requestGPUMemory" yaml:"requestGPUMemory"` }
type ServingJobInfo ¶
type ServingJobInfo struct { // Name specifies serving job name Name string `json:"name" yaml:"name"` // Namespace specifies serving job namespace Namespace string `json:"namespace" yaml:"namespace"` // Type specifies serving job type Type string `json:"type" yaml:"type"` // Version specifies serving job version Version string `json:"version" yaml:"version"` // Age specifies the serving job age Age string `json:"age" yaml:"age"` // Desired specifies the desired instances Desired int `json:"desiredInstances" yaml:"desiredInstances"` // Available specifies the available instances Available int `json:"availableInstances" yaml:"availableInstances"` // Endpoints specifies the endpoints Endpoints []Endpoint `json:"endpoints" yaml:"endpoints"` // IPAddress specifies the ip address IPAddress string `json:"ip" yaml:"ip"` // Instances gives the instance informations Instances []ServingInstance `json:"instances" yaml:"instances"` // RequestGPU specifies the request gpus RequestGPU int `json:"requestGPUs" yaml:"requestGPUs"` // RequestGPUMemory specifies the request gpu memory,only for gpushare RequestGPUMemory int `json:"requestGPUMemory" yaml:"requestGPUMemory"` // CreationTimestamp stores the creation timestamp of job CreationTimestamp int64 `json:"creationTimestamp" yaml:"creationTimestamp"` }
ServingJobInfo displays serving job information
type ServingJobType ¶
type ServingJobType string
ServingJobType defines the serving job type; the name must have the form shorthand + "-serving"
const ( // TFServingJob defines the tensorflow serving job TFServingJob ServingJobType = "tf-serving" // TRTServingJob defines the tensorrt serving job TRTServingJob ServingJobType = "trt-serving" // KFServingJob defines the kfserving job KFServingJob ServingJobType = "kf-serving" // SeldonServingJob defines the seldon core job SeldonServingJob ServingJobType = "seldon-serving" // CustomServingJob defines the custom serving job CustomServingJob ServingJobType = "custom-serving" // AllServingJob represents all serving job type AllServingJob ServingJobType = "" // UnknownServingJob defines the unknown serving job UnknownServingJob ServingJobType = "unknown" )
type ServingTypeInfo ¶
type ServingTypeInfo struct { Name ServingJobType Alias string Shorthand string }
type ServingVersionWeight ¶
type StringMatchPrefix ¶
type StringMatchPrefix struct {
Prefix string `protobuf:"bytes,2,opt,name=prefix,proto3,oneof" json:"prefix,omitempty"`
}
type SubmitETJobArgs ¶
type SubmitETJobArgs struct { Cpu string `yaml:"cpu"` // --cpu Memory string `yaml:"memory"` // --memory // for common args CommonSubmitArgs `yaml:",inline"` // SubmitTensorboardArgs stores tensorboard information SubmitTensorboardArgs `yaml:",inline"` // SubmitSyncCodeArgs stores syncing code information SubmitSyncCodeArgs `yaml:",inline"` MaxWorkers int `yaml:"maxWorkers"` MinWorkers int `yaml:"minWorkers"` }
type SubmitHorovodJobArgs ¶
type SubmitHorovodJobArgs struct { SSHPort int `yaml:"sshPort"` Cpu string `yaml:"cpu"` // --cpu Memory string `yaml:"memory"` // --memory // for common args CommonSubmitArgs `yaml:",inline"` // for tensorboard SubmitTensorboardArgs `yaml:",inline"` // for sync up source code SubmitSyncCodeArgs `yaml:",inline"` }
type SubmitMPIJobArgs ¶
type SubmitMPIJobArgs struct { Cpu string `yaml:"cpu"` // --cpu Memory string `yaml:"memory"` // --memory // for common args CommonSubmitArgs `yaml:",inline"` // for tensorboard SubmitTensorboardArgs `yaml:",inline"` // for sync up source code SubmitSyncCodeArgs `yaml:",inline"` // enable gpu topology scheduling GPUTopology bool `yaml:"gputopology"` GPUTopologyReplica string `yaml:"gputopologyreplica"` }
type SubmitPyTorchJobArgs ¶
type SubmitPyTorchJobArgs struct { Cpu string `yaml:"cpu"` // --cpu Memory string `yaml:"memory"` // --memory // for common args CommonSubmitArgs `yaml:",inline"` // for tensorboard SubmitTensorboardArgs `yaml:",inline"` // for sync up source code SubmitSyncCodeArgs `yaml:",inline"` // clean-task-policy CleanPodPolicy string `yaml:"cleanPodPolicy"` }
type SubmitSparkJobArgs ¶
type SubmitSyncCodeArgs ¶
// SubmitSyncCodeArgs holds the code-sync settings: mode, source, image and git project name.
type SubmitSyncCodeArgs struct { SyncMode string `yaml:"syncMode"` // --syncMode: rsync, hdfs, git SyncSource string `yaml:"syncSource"` // --syncSource SyncImage string `yaml:"syncImage,omitempty"` // --syncImage // syncGitProjectName SyncGitProjectName string `yaml:"syncGitProjectName,omitempty"` // NOTE(review): the comment here previously repeated --syncImage; the actual flag is not visible here — confirm }
type SubmitTFJobArgs ¶
type SubmitTFJobArgs struct { // TFNodeSelectors assigns tfjob node selectors TFNodeSelectors map[string]map[string]string `yaml:"tfNodeSelectors"` // Port defines the default port if workerPort and PSPort are not set Port int // WorkerImage assigns worker image,match option --worker-image WorkerImage string `yaml:"workerImage"` // WorkerPort stores worker port,match option --work-port WorkerPort int `yaml:"workerPort"` // PSPort stores the ps port,match option --ps-port PSPort int `yaml:"psPort"` // PSCount stores the ps count,--ps-count PSCount int `yaml:"ps"` // PSImage stores the ps image,--ps-image PSImage string `yaml:"psImage"` // WorkerCpu stores the cpu of job worker,match option --worker-cpu WorkerCpu string `yaml:"workerCPU"` //WorkerNodeSelectors map[string]string `yaml:"workerNodeSelectors"` // --worker-selector // WorkerMemory stores worker memory,match option --worker-memory WorkerMemory string `yaml:"workerMemory"` // PSCpu stores ps cpu,match option --ps-cpu PSCpu string `yaml:"psCPU"` // PSGpu stores ps gpu,match option --ps-gpus PSGpu int `yaml:"psGPU"` // --ps-gpus // PSMemory stores the ps memory,match option --ps-memory PSMemory string `yaml:"psMemory"` // CleanPodPolicy stores the cleaning pod policy,match option --clean-task-policy CleanPodPolicy string `yaml:"cleanPodPolicy"` // UseChief stores the using chief or not,match option --chief UseChief bool `yaml:",omitempty"` // --chief // ChiefCount stores the chief count of job,match option --chief-count ChiefCount int `yaml:"chief"` // UseEvaluator is used to enable evaluator or not,match option --evaluator UseEvaluator bool `yaml:",omitempty"` // ChiefPort stores the chief port,match option --chief-port ChiefPort int `yaml:"chiefPort"` //ChiefNodeSelectors map[string]string `yaml:"chiefNodeSelectors"` // --chief-selector // ChiefCpu stores the chief pod cpu,match option --chief-cpu ChiefCpu string `yaml:"chiefCPU"` // ChiefMemory stores the chief pod memory,match option --chief-memory 
ChiefMemory string `yaml:"chiefMemory"` // EvaluatorCpu stores the evaluator pod cpu,match option --evaluator-cpu EvaluatorCpu string `yaml:"evaluatorCPU"` //EvaluatorNodeSelectors map[string]string `yaml:"evaluatorNodeSelectors"` // --evaluator-selector // EvaluatorMemory stores the evaluator pod memory,match option --evaluator-memory EvaluatorMemory string `yaml:"evaluatorMemory"` // --evaluatorMemory // EvaluatorCount stores the evaluator pod count,match option --evaluator-count EvaluatorCount int `yaml:"evaluator"` // HasGangScheduler determines if it has gang scheduler HasGangScheduler bool `yaml:"hasGangScheduler"` // for common args CommonSubmitArgs `yaml:",inline"` // SubmitTensorboardArgs stores tensorboard information SubmitTensorboardArgs `yaml:",inline"` // SubmitSyncCodeArgs stores syncing code information SubmitSyncCodeArgs `yaml:",inline"` // TFRuntime stores the runtime TFRuntime `yaml:"-"` }
type SubmitTensorboardArgs ¶
// SubmitTensorboardArgs is used to store tensorboard information.
type SubmitTensorboardArgs struct {
	// UseTensorboard corresponds to the --tensorboard option.
	UseTensorboard bool `yaml:"useTensorboard"`

	// TensorboardImage corresponds to the --tensorboardImage option.
	TensorboardImage string `yaml:"tensorboardImage"`

	// TrainingLogdir corresponds to the --logdir option.
	TrainingLogdir string `yaml:"trainingLogdir"`

	// HostLogPath is serialized under the "hostLogPath" key.
	HostLogPath string `yaml:"hostLogPath"`

	// IsLocalLogging is serialized under the "isLocalLogging" key.
	IsLocalLogging bool `yaml:"isLocalLogging"`
}
SubmitTensorboardArgs is used to store tensorboard information
type SubmitVolcanoJobArgs ¶
type SubmitVolcanoJobArgs struct { // Name stores the job name Name string // Namespace stores the namespace of job Namespace string // TrainingType is used to accept job type TrainingType TrainingJobType // Command defines the job command Command string // The MinAvailable available pods to run for this Job MinAvailable int `yaml:"minAvailable"` // Specifies the queue that will be used in the scheduler, "default" queue is used this leaves empty. Queue string `yaml:"queue"` // SchedulerName is the default value of `tasks.template.spec.schedulerName`. SchedulerName string `yaml:"schedulerName"` // TaskName specifies the name of task TaskName string `yaml:"taskName"` // TaskImages specifies the task image TaskImages []string `yaml:"taskImages"` // TaskReplicas specifies the replicas of this Task in Job TaskReplicas int `yaml:"taskReplicas"` // TaskCPU specifies the cpu resource required for each replica of Task in Job. default is 250m TaskCPU string `yaml:"taskCPU"` // TaskMemory specifies the memory resource required for each replica of Task in Job. default is 128Mi TaskMemory string `yaml:"taskMemory"` // TaskPort specifies the task port TaskPort int `yaml:"taskPort"` }
type TFRuntime ¶
type TFRuntime interface { // check the tfjob args Check(tf *SubmitTFJobArgs) (err error) // transform the tfjob Transform(tf *SubmitTFJobArgs) (err error) Runtime }
Customized runtime for tf training
type TensorFlowServingArgs ¶
type TensorFlowServingArgs struct { VersionPolicy string `yaml:"versionPolicy"` // --versionPolicy ModelConfigFile string `yaml:"modelConfigFile"` // --modelConfigFile ModelConfigFileContent string `yaml:"modelConfigFileContent"` ModelName string `yaml:"modelName"` // --modelName ModelPath string `yaml:"modelPath"` // --modelPath Port int `yaml:"port"` // --port RestfulPort int `yaml:"restApiPort"` // --restfulPort CommonServingArgs `yaml:",inline"` }
type TensorRTServingArgs ¶
type TensorRTServingArgs struct { ModelStore string `yaml:"modelStore"` // --modelStore MetricsPort int `yaml:"metricsPort"` // --metricsPort HttpPort int `yaml:"httpPort"` // --httpPort GrpcPort int `yaml:"grpcPort"` // --grpcPort AllowMetrics bool `yaml:"allowMetrics"` // --allowMetrics CommonServingArgs `yaml:",inline"` }
type TrafficRouterSplitArgs ¶
type TrafficRouterSplitArgs struct { ServingName string `yaml:"servingName,omitempty"` //--name Namespace string `yaml:"namespace,omitempty"` //--namespace Versions string `yaml:"versions,omitempty"` //--versions Weights string `yaml:"weights,omitempty"` //--weights VersionWeights []ServingVersionWeight }
type TrainingJobInfo ¶
type TrainingJobInfo struct { // The name of the training job Name string `json:"name" yaml:"name"` // The namespace of the training job Namespace string `json:"namespace" yaml:"namespace"` // The time of the training job Duration string `json:"duration" yaml:"duration"` // The status of the training Job Status TrainingJobStatus `json:"status" yaml:"status"` // The training type of the training job Trainer TrainingJobType `json:"trainer" yaml:"trainer"` // The tensorboard of the training job Tensorboard string `json:"tensorboard" yaml:"tensorboard"` // The name of the chief Instance ChiefName string `json:"chiefName" yaml:"chiefName"` // The instances under the training job Instances []TrainingJobInstance `json:"instances" yaml:"instances"` // The priority of the training job Priority string `json:"priority" yaml:"priority"` // RequestGPU stores the request gpus RequestGPU int64 `json:"requestGPUs" yaml:"requestGPUs"` // AllocatedGPU stores the allocated gpus AllocatedGPU int64 `json:"allocatedGPUs" yaml:"allocatedGPUs"` // CreationTimestamp stores the creation timestamp of job CreationTimestamp int64 `json:"creationTimestamp" yaml:"creationTimestamp"` }
TrainingJobInfo stores training job information
type TrainingJobInstance ¶
type TrainingJobInstance struct { // IP defines the instance ip IP string `json:"ip" yaml:"ip"` // the status of of instance Status string `json:"status"` // the name of instance Name string `json:"name"` // the age of instance Age string `json:"age"` // the node instance runs on Node string `json:"node"` // NodeIP is store the node ip NodeIP string `json:"nodeIP" yaml:"nodeIP"` // the instance is chief or not IsChief bool `json:"chief" yaml:"chief"` // RequestGPUs is used to store request gpu count RequestGPUs int `json:"requestGPUs" yaml:"requestGPUs"` // GpuDutyCycle stores the gpu metrics GPUMetrics map[string]GpuMetric `json:"gpuMetrics" yaml:"gpuMetrics"` }
TrainingJobInstance defines the instance of training job
type TrainingJobStatus ¶
// TrainingJobStatus defines all the kinds of JobStatus.
type TrainingJobStatus string
TrainingJobStatus defines all the kinds of JobStatus
const ( // TrainingJobPending means the job is pending TrainingJobPending TrainingJobStatus = "PENDING" // TrainingJobRunning means the job is running TrainingJobRunning TrainingJobStatus = "RUNNING" // TrainingJobSucceeded means the job is Succeeded TrainingJobSucceeded TrainingJobStatus = "SUCCEEDED" // TrainingJobFailed means the job is failed TrainingJobFailed TrainingJobStatus = "FAILED" )
type TrainingJobType ¶
// TrainingJobType defines the supporting training job type.
type TrainingJobType string
TrainingJobType defines the supporting training job type
const ( // TFTrainingJob defines the tfjob TFTrainingJob TrainingJobType = "tfjob" // MPITrainingJob defines the mpijob MPITrainingJob TrainingJobType = "mpijob" // PytorchTrainingJob defines the pytorchjob PytorchTrainingJob TrainingJobType = "pytorchjob" // HorovodTrainingJob defines the horovod job HorovodTrainingJob TrainingJobType = "horovodjob" // VolcanoTrainingJob defines the volcano job VolcanoTrainingJob TrainingJobType = "volcanojob" // ETTrainingJob defines the etjob ETTrainingJob TrainingJobType = "etjob" // SparkTrainingJob defines the spark job SparkTrainingJob TrainingJobType = "sparkjob" // AllTrainingJob represents all job types AllTrainingJob TrainingJobType = "" // UnknownTrainingJob defines the unknown training UnknownTrainingJob TrainingJobType = "unknown" )
type TrainingJobTypeInfo ¶
type TrainingJobTypeInfo struct { Name TrainingJobType Alias string Shorthand string }
type VirtualService ¶
type VirtualService struct { *istiov1alpha3.VirtualService Http []*HTTPRoute `protobuf:"bytes,3,rep,name=http" json:"http,omitempty"` }
type VirtualServiceCRD ¶
type VirtualServiceCRD struct { // Kind is a string value representing the REST resource this object represents. // Servers may infer this from the endpoint the client submits requests to. // Cannot be updated. // In CamelCase. // More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds // +optional Kind string `json:"kind,omitempty" protobuf:"bytes,1,opt,name=kind"` // APIVersion defines the versioned schema of this representation of an object. // Servers should convert recognized schemas to the latest internal value, and // may reject unrecognized values. // More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources // +optional APIVersion string `json:"apiVersion,omitempty" protobuf:"bytes,2,opt,name=apiVersion"` metav1.ObjectMeta `json:"metadata,omitempty" yaml:"metadata,omitempty" protobuf:"bytes,1,opt,name=metadata"` Spec VirtualService `json:"spec,omitempty" yaml:"spec,omitempty" protobuf:"bytes,2,opt,name=spec"` }