types

package
v0.12.1 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Nov 25, 2024 License: Apache-2.0 Imports: 7 Imported by: 40

Documentation

Overview

Copyright 2018 The Kubeflow Authors

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License

Index

Constants

View Source
// String constants used for GPU sharing: resource names, an environment
// variable name, allocation labels and a comma-separated list of node labels.
// NOTE(review): the exact role of each value is inferred from its identifier;
// confirm against the GPU-share scheduler before relying on it.
const (
	GPUShareResourceName        = "aliyun.com/gpu-mem"
	GPUCoreShareResourceName    = "aliyun.com/gpu-core.percentage"
	GPUShareCountName           = "aliyun.com/gpu-count"
	GPUShareEnvGPUID            = "ALIYUN_COM_GPU_MEM_IDX"
	GPUShareAllocationLabel     = "scheduler.framework.gpushare.allocation"
	GPUCoreShareAllocationLabel = "gpushare.alibabacloud.com/core-percentage"
	GPUShareNodeLabels          = "gpushare=true,cgpu=true,ack.node.gpu.schedule=share,ack.node.gpu.schedule=cgpu"
)
View Source
// String constants used for GPU topology scheduling: a resource name, two
// label keys and a node label selector.
// NOTE(review): the exact role of each value is inferred from its identifier;
// confirm against the consumers of these constants.
const (
	AliyunGPUResourceName      = "aliyun.com/gpu"
	GPUTopologyAllocationLabel = "topology.kubernetes.io/gpu-group"
	GPUTopologyVisibleGPULabel = "topology.kubernetes.io/gpu-visible"
	GPUTopologyNodeLabels      = "ack.node.gpu.schedule=topology"
)
View Source
// Keys under the "arena.kubeflow.org/" prefix that carry user-related
// information (isolation flag, uid, username, ssh secret).
// NOTE(review): whether each key is applied as a label or an annotation is
// not visible here; confirm against the callers.
const (
	MultiTenantIsolationLabel = "arena.kubeflow.org/isolate-user"
	UserNameIdLabel           = "arena.kubeflow.org/uid"
	UserNameNameLabel         = "arena.kubeflow.org/username"
	SSHSecretName             = "arena.kubeflow.org/ssh-secret"
)
View Source
// CPUResourceName is the resource name string "cpu".
const CPUResourceName = "cpu"
View Source
// KUBEFLOW_NAMESPACE is the namespace name "kubeflow".
const KUBEFLOW_NAMESPACE = "kubeflow"
View Source
// KUBE_SYSTEM_NAMESPACE is the namespace name "kube-system".
const KUBE_SYSTEM_NAMESPACE = "kube-system"
View Source
// NODE_METRIC_TMP is a format template with two %s verbs: the first fills the
// __name__ regex matcher and the second fills the node_name regex matcher.
// NOTE(review): presumably rendered into a Prometheus selector; confirm at the call site.
const NODE_METRIC_TMP = `{__name__=~"%s", node_name=~"%s"}`
View Source
// NvidiaGPUResourceName defines the nvidia resource name.
const NvidiaGPUResourceName = "nvidia.com/gpu"
View Source
// POD_METRIC_TMP is a format template with two %s verbs: the first fills the
// __name__ regex matcher and the second fills the pod_name regex matcher.
// NOTE(review): presumably rendered into a Prometheus selector; confirm at the call site.
const POD_METRIC_TMP = `{__name__=~"%s", pod_name=~"%s"}`
View Source
// PROMETHEUS_INSTALL_DOC_URL is the URL of the arena user-guide page
// "9-top-job-gpu-metric.md" in the kubeflow/arena repository.
const PROMETHEUS_INSTALL_DOC_URL = "https://github.com/kubeflow/arena/blob/master/docs/userguide/9-top-job-gpu-metric.md"
View Source
// PROMETHEUS_SCHEME is the URL scheme string "http".
const PROMETHEUS_SCHEME = "http"
View Source
// PROMETHEUS_SVC_LABEL is the key=value label string "kubernetes.io/name=Prometheus".
// NOTE(review): presumably used as a label selector to find the Prometheus service; confirm.
const PROMETHEUS_SVC_LABEL = "kubernetes.io/name=Prometheus"
View Source
// RequestGPUsOfJobAnnoKey is the key string "requestGPUsOfJobOwner".
// NOTE(review): the name suggests it is an annotation key; confirm against callers.
const RequestGPUsOfJobAnnoKey = "requestGPUsOfJobOwner"

Variables

View Source
// ErrTrainingJobNotFound is the sentinel error whose message tells the user
// that the training job was not found and suggests running 'arena list'.
var ErrTrainingJobNotFound = errors.New("training job not found,please use 'arena list' to make sure job is existed.")

// ErrNoPrivilegesToOperateJob is the sentinel error whose message tells the
// user that the job belongs to a different owner.
var ErrNoPrivilegesToOperateJob = errors.New("you have no privileges to operate the job,because the owner of job is not you")
View Source
// GPU_METRIC_LIST holds the three GPU metric names: duty cycle, memory used
// (bytes) and memory total (bytes).
var GPU_METRIC_LIST = []string{
	"nvidia_gpu_duty_cycle",
	"nvidia_gpu_memory_used_bytes",
	"nvidia_gpu_memory_total_bytes",
}
View Source
// ModelTypeMap collects the model job types and, for each one, its alias
// (capitalized display name) and shorthand (lower-case short name).
var ModelTypeMap = map[ModelJobType]ModelTypeInfo{
	ModelProfileJob: {
		Name:      ModelProfileJob,
		Alias:     "Profile",
		Shorthand: "profile",
	},
	ModelOptimizeJob: {
		Name:      ModelOptimizeJob,
		Alias:     "Optimize",
		Shorthand: "optimize",
	},
	ModelBenchmarkJob: {
		Name:      ModelBenchmarkJob,
		Alias:     "Benchmark",
		Shorthand: "benchmark",
	},
	ModelEvaluateJob: {
		Name:      ModelEvaluateJob,
		Alias:     "Evaluate",
		Shorthand: "evaluate",
	},
}

ModelTypeMap collects the model job types and their aliases.

View Source
// NodeTypeSlice lists the node types (normal, GPU exclusive, GPU topology,
// GPU share) together with the alias and one-letter shorthand of each.
var NodeTypeSlice = []NodeTypeInfo{
	{
		Name:      NormalNode,
		Alias:     "none",
		Shorthand: "n",
	},
	{
		Name:      GPUExclusiveNode,
		Alias:     "exclusive",
		Shorthand: "e",
	},
	{
		Name:      GPUTopologyNode,
		Alias:     "topology",
		Shorthand: "t",
	},
	{
		Name:      GPUShareNode,
		Alias:     "share",
		Shorthand: "s",
	},
}
View Source
// SUPPORT_PROMETHEUS_SERVERS lists the Prometheus server configurations. Each
// entry carries a name, a service label string, protocol, port, query path
// and the metric names to query. All three entries use the same three metric
// names.
// NOTE(review): presumably the entries are tried in order to locate a
// Prometheus service; confirm against the lookup code.
var SUPPORT_PROMETHEUS_SERVERS = []*PrometheusServer{

	// Entry listening on port 9335.
	{
		Name:          "arms-prometheus-admin",
		ServiceLabels: "kubernetes.io/service-name=prometheus-admin",
		Protocol:      "http",
		Port:          "9335",
		Path:          "api/v1/query",
		MetricList: []string{
			"nvidia_gpu_duty_cycle",
			"nvidia_gpu_memory_used_bytes",
			"nvidia_gpu_memory_total_bytes",
		},
	},
	// Entry matched by the "kubernetes.io/service-name=prometheus-server" label.
	{
		Name:          "default",
		ServiceLabels: "kubernetes.io/service-name=prometheus-server",
		Protocol:      "http",
		Port:          "9090",
		Path:          "api/v1/query",
		MetricList: []string{
			"nvidia_gpu_duty_cycle",
			"nvidia_gpu_memory_used_bytes",
			"nvidia_gpu_memory_total_bytes",
		},
	},

	// Entry matched by the "kubernetes.io/name=Prometheus" label.
	{
		Name:          "default-old",
		ServiceLabels: "kubernetes.io/name=Prometheus",
		Protocol:      "http",
		Port:          "9090",
		Path:          "api/v1/query",
		MetricList: []string{
			"nvidia_gpu_duty_cycle",
			"nvidia_gpu_memory_used_bytes",
			"nvidia_gpu_memory_total_bytes",
		},
	},
}
View Source
// ServingTypeMap collects the serving job types and, for each one, its alias
// (display name) and shorthand (short name).
var ServingTypeMap = map[ServingJobType]ServingTypeInfo{
	CustomServingJob: {
		Name:      CustomServingJob,
		Alias:     "Custom",
		Shorthand: "custom",
	},
	KFServingJob: {
		Name:      KFServingJob,
		Alias:     "KFServing",
		Shorthand: "kf",
	},
	KServeJob: {
		Name:      KServeJob,
		Alias:     "KServe",
		Shorthand: "kserve",
	},
	TFServingJob: {
		Name:      TFServingJob,
		Alias:     "Tensorflow",
		Shorthand: "tf",
	},
	TRTServingJob: {
		Name:      TRTServingJob,
		Alias:     "Tensorrt",
		Shorthand: "trt",
	},
	TritonServingJob: {
		Name:  TritonServingJob,
		Alias: "Triton",
		// NOTE(review): this is the only shorthand that is capitalized; every
		// other entry uses a lower-case shorthand. Confirm this is intended.
		Shorthand: "Triton",
	},
	SeldonServingJob: {
		Name:      SeldonServingJob,
		Alias:     "Seldon",
		Shorthand: "seldon",
	},
	DistributedServingJob: {
		Name:      DistributedServingJob,
		Alias:     "Distributed",
		Shorthand: "distributed",
	},
}

ServingTypeMap collects the serving job types and their aliases.

View Source
// TrainingTypeMap collects the training job types and, for each one, its
// alias (display name) and shorthand (short name).
var TrainingTypeMap = map[TrainingJobType]TrainingJobTypeInfo{
	TFTrainingJob: {
		Name:      TFTrainingJob,
		Alias:     "Tensorflow",
		Shorthand: "tf",
	},
	MPITrainingJob: {
		Name:      MPITrainingJob,
		Alias:     "MPI",
		Shorthand: "mpi",
	},
	PytorchTrainingJob: {
		Name:      PytorchTrainingJob,
		Alias:     "Pytorch",
		Shorthand: "py",
	},
	HorovodTrainingJob: {
		Name:      HorovodTrainingJob,
		Alias:     "Horovod",
		Shorthand: "horovod",
	},
	VolcanoTrainingJob: {
		Name:      VolcanoTrainingJob,
		Alias:     "Volcano",
		Shorthand: "volcano",
	},
	ETTrainingJob: {
		Name:      ETTrainingJob,
		Alias:     "ElasticTraining",
		Shorthand: "et",
	},
	SparkTrainingJob: {
		Name:      SparkTrainingJob,
		Alias:     "Spark",
		Shorthand: "spark",
	},
	DeepSpeedTrainingJob: {
		Name:      DeepSpeedTrainingJob,
		Alias:     "DeepSpeed",
		Shorthand: "dp",
	},
	RayJob: {
		Name:      RayJob,
		Alias:     "RayJob",
		Shorthand: "rj",
	},
}

TrainingTypeMap collects the training job types and their aliases.

Functions

This section is empty.

Types

type AdvancedGpuMetric

// AdvancedGpuMetric holds the metric values of a single GPU (id, uuid, duty
// cycle, used and total memory) together with the names of related pods.
// NOTE(review): units of the memory fields are not visible here; confirm.
type AdvancedGpuMetric struct {
	Id             string  `json:"id" yaml:"id"`
	UUID           string  `json:"uuid" yaml:"uuid"`
	GpuDutyCycle   float64 `json:"gpuDutyCycle" yaml:"gpuDutyCycle"`
	GpuMemoryUsed  float64 `json:"usedGPUMemory" yaml:"usedGPUMemory"`
	GpuMemoryTotal float64 `json:"totalGPUMemory" yaml:"totalGPUMemory"`
	// PodNames lists pods; each entry combines the namespace and the pod name,
	// like 'namespace/pod_name'.
	PodNames []string `json:"podNames" yaml:"podNames"`
}

type AllNodeInfo

// AllNodeInfo maps a string key to a list of values of arbitrary type.
// NOTE(review): the meaning of the key and the concrete element types are not
// visible here; presumably node information grouped by kind. Confirm.
type AllNodeInfo map[string][]interface{}

type ArenaClientArgs

// ArenaClientArgs groups the settings of an arena client: kubeconfig,
// namespaces, daemon-mode flag and log level.
type ArenaClientArgs struct {
	// Kubeconfig is presumably the path of the kubeconfig file (TODO confirm).
	Kubeconfig string
	// Namespace is the namespace setting of the client.
	Namespace string
	// ArenaNamespace is presumably the namespace where arena itself is
	// installed (TODO confirm).
	ArenaNamespace string
	// IsDaemonMode reports whether the client runs in daemon mode.
	IsDaemonMode bool
	// LogLevel is the log level given as a string.
	LogLevel string
}

type AutoscalerOptions added in v0.11.0

// AutoscalerOptions specifies optional configuration for the Ray autoscaler.
// Every field is omitted from the YAML output when it holds its zero value.
type AutoscalerOptions struct {
	// Cpu specifies optional resource request and limit overrides for the autoscaler container.
	// Default values: 500m CPU request and limit.
	Cpu string `yaml:"cpu,omitempty"`
	// Memory specifies optional resource request and limit overrides for the autoscaler container.
	// Default values: 512Mi memory request and limit.
	Memory string `yaml:"memory,omitempty"`
	// Image optionally overrides the autoscaler's container image. This override is provided for autoscaler testing and development.
	Image string `yaml:"image,omitempty"`
	// ImagePullPolicy optionally overrides the autoscaler container's image pull policy. This override is provided for autoscaler testing and development.
	ImagePullPolicy string `yaml:"imagePullPolicy,omitempty"`
	// IdleTimeoutSeconds is the number of seconds to wait before scaling down a worker pod which is not using Ray resources.
	// Defaults to 60 (one minute). It is not read by the KubeRay operator but by the Ray autoscaler.
	IdleTimeoutSeconds int32 `yaml:"idleTimeoutSeconds,omitempty"`
	// UpscalingMode is "Conservative", "Default", or "Aggressive."
	// Conservative: Upscaling is rate-limited; the number of pending worker pods is at most the size of the Ray cluster.
	// Default: Upscaling is not rate-limited.
	// Aggressive: An alias for Default; upscaling is not rate-limited.
	// It is not read by the KubeRay operator but by the Ray autoscaler.
	// +kubebuilder:validation:Enum=Default;Aggressive;Conservative
	UpscalingMode string `yaml:"upscalingMode,omitempty"`
}

AutoscalerOptions specifies optional configuration for the Ray autoscaler.

type CommonCronArgs added in v0.8.2

// CommonCronArgs holds the cron-related options of a scheduled job. The
// trailing comment on each field names the matching command-line flag.
type CommonCronArgs struct {
	// The schedule in Cron format, see https://en.wikipedia.org/wiki/Cron.
	Schedule string `yaml:"schedule"` // --schedule

	// Specifies how to treat concurrent executions of a Job.
	// Valid values are:
	// - "Allow" (default): allows CronJobs to run concurrently;
	// - "Forbid": forbids concurrent runs, skipping next run if previous run hasn't finished yet;
	// - "Replace": cancels currently running job and replaces it with a new one
	// +optional
	ConcurrencyPolicy string `yaml:"concurrencyPolicy"` // --concurrency-policy

	// This flag tells the controller to suspend subsequent executions, it does
	// not apply to already started executions.  Defaults to false.
	// +optional
	Suspend bool `yaml:"suspend"` // --suspend

	// Deadline is the timestamp until which a cron job can keep scheduling.
	Deadline string `yaml:"deadline"` // --deadline

	// The number of finished job history to retain.
	// NOTE(review): the field is a plain int, not a pointer, so an explicit
	// zero cannot be distinguished from "not specified".
	// +optional
	HistoryLimit int `yaml:"historyLimit"` // --history-limit
}

type CommonGPUNodeInfo

// CommonGPUNodeInfo holds the GPU counters of a node (total, allocated,
// unhealthy) and the per-GPU metrics.
type CommonGPUNodeInfo struct {
	TotalGPUs     float64              `json:"totalGPUs" yaml:"totalGPUs"`
	AllocatedGPUs float64              `json:"allocatedGPUs" yaml:"allocatedGPUs"`
	UnhealthyGPUs float64              `json:"unhealthyGPUs" yaml:"unhealthyGPUs"`
	GPUMetrics    []*AdvancedGpuMetric `json:"gpuMetrics" yaml:"gpuMetrics"`
}

type CommonModelArgs added in v0.9.0

// CommonModelArgs collects the options shared by model jobs. The trailing
// comment on a field names the matching command-line flag. HelmOptions is
// excluded from YAML output.
type CommonModelArgs struct {
	Name            string `yaml:"name"`            // --name
	Namespace       string `yaml:"namespace"`       // --namespace
	ModelConfigFile string `yaml:"modelConfigFile"` // --model-config-file
	ModelName       string `yaml:"modelName"`       // --model-name
	ModelPath       string `yaml:"modelPath"`       // --model-path
	Inputs          string `yaml:"inputs"`          // --inputs
	Outputs         string `yaml:"outputs"`         // --outputs

	Image           string `yaml:"image"`           // --image
	ImagePullPolicy string `yaml:"imagePullPolicy"` // --image-pull-policy
	// ImagePullSecrets stores image pull secrets,match option --image-pull-secrets
	ImagePullSecrets []string `yaml:"imagePullSecrets"`

	GPUCount  int    `yaml:"gpuCount"`  // --gpus
	GPUMemory int    `yaml:"gpuMemory"` // --gpumemory
	GPUCore   int    `yaml:"gpuCore"`   // --gpucore
	Cpu       string `yaml:"cpu"`       // --cpu
	Memory    string `yaml:"memory"`    // --memory

	// DataSet stores the kubernetes pvc names
	DataSet map[string]string `yaml:"dataset"` // --data
	// DataDirs stores the files(or directories) in k8s node which will map to containers
	DataDirs []DataDirVolume `yaml:"dataDirs"` // --data-dir

	Envs          map[string]string `yaml:"envs"`          // --env
	NodeSelectors map[string]string `yaml:"nodeSelectors"` // --selector
	Tolerations   []TolerationArgs  `yaml:"tolerations"`   // --toleration
	Annotations   map[string]string `yaml:"annotations"`   // --annotation
	Labels        map[string]string `yaml:"labels"`        // --label

	Shell   string `yaml:"shell"` // --shell
	Command string `yaml:"command"`

	// Type is the kind of model job.
	Type ModelJobType `yaml:"type"`
	// HelmOptions stores the helm options
	HelmOptions []string `yaml:"-"`
}

type CommonNodeInfo

// CommonNodeInfo holds the descriptive fields of a node: name, description,
// IP, status, role and node type.
type CommonNodeInfo struct {
	Name        string   `json:"name" yaml:"name"`
	Description string   `json:"description" yaml:"description"`
	IP          string   `json:"ip" yaml:"ip"`
	Status      string   `json:"status" yaml:"status"`
	Role        string   `json:"role" yaml:"role"`
	Type        NodeType `json:"type" yaml:"type"`
}

type CommonServingArgs

// CommonServingArgs collects the options shared by serving jobs. The trailing
// comment on a field names the matching command-line flag. Namespace, Type
// and HelmOptions are excluded from YAML output.
type CommonServingArgs struct {
	Name               string            `yaml:"servingName"`
	Version            string            `yaml:"servingVersion"`
	Namespace          string            `yaml:"-"`
	Type               ServingJobType    `yaml:"-"`
	Image              string            `yaml:"image"`
	ImagePullPolicy    string            `yaml:"imagePullPolicy"`     // --imagePullPolicy
	GPUCount           int               `yaml:"gpuCount"`            // --gpus
	GPUMemory          int               `yaml:"gpuMemory"`           // --gpumemory
	GPUCore            int               `yaml:"gpuCore"`             // --gpucore
	Devices            map[string]string `yaml:"devices"`             // --device
	Cpu                string            `yaml:"cpu"`                 // --cpu
	Memory             string            `yaml:"memory"`              // --memory
	Envs               map[string]string `yaml:"envs"`                // --envs
	EnvsFromSecret     map[string]string `yaml:"envsFromSecret"`      // --env-from-secret
	Shell              string            `yaml:"shell"`               // --shell
	Command            string            `yaml:"command"`             // --command
	Replicas           int               `yaml:"replicas"`            // --replicas
	EnableIstio        bool              `yaml:"enableIstio"`         // --enableIstio
	ExposeService      bool              `yaml:"exposeService"`       // --exposeService
	ModelDirs          map[string]string `yaml:"modelDirs"`           // --data
	DataSubpathExprs   map[string]string `yaml:"dataSubPathExprs"`    // --data-subpath-expr
	TempDirSubpathExpr map[string]string `yaml:"tempDirSubPathExprs"` // --temp-dir-subpath-expr
	TempDirs           map[string]string `yaml:"tempDirs"`            // --temp-dir
	ShareMemory        string            `yaml:"shareMemory"`         // --share-memory

	ImagePullSecrets []string          `yaml:"imagePullSecrets"` //--image-pull-secrets
	// HostVolumes is serialized under the YAML key "dataDirs".
	HostVolumes      []DataDirVolume   `yaml:"dataDirs"`         // --data-dir
	NodeSelectors    map[string]string `yaml:"nodeSelectors"`    // --selector
	Tolerations      []TolerationArgs  `yaml:"tolerations"`      // --toleration
	Annotations      map[string]string `yaml:"annotations"`
	Labels           map[string]string `yaml:"labels"` // --label
	// ConfigFiles stores the config files which exist on the client host node
	// and maps them into the container, match option --config-file
	ConfigFiles map[string]map[string]ConfigFileInfo `yaml:"configFiles"`
	// HelmOptions stores the helm options
	HelmOptions []string `yaml:"-"`

	ModelServiceExists bool `yaml:"modelServiceExists"` // --modelServiceExists

	ModelName    string `yaml:"modelName"`    // --model-name
	ModelVersion string `yaml:"modelVersion"` // --model-version
}

type CommonSubmitArgs

// CommonSubmitArgs defines the options shared by job submissions. Name,
// Namespace and HelmOptions are excluded from YAML output.
type CommonSubmitArgs struct {

	// Name stores the job name,match option --name
	Name string `yaml:"-"`

	// Namespace  stores the namespace of job,match option --namespace
	Namespace string `yaml:"-"`

	// TrainingType stores the trainingType
	TrainingType TrainingJobType `yaml:"trainingType"`

	// NodeSelectors defines the node selectors,match option --selector
	NodeSelectors map[string]string `yaml:"nodeSelectors"`

	// ConfigFiles stores the config files which exist on the client host node
	// and maps them into the container, match option --config-file
	ConfigFiles map[string]map[string]ConfigFileInfo `yaml:"configFiles"`

	// Tolerations defines the tolerations which tolerates node taints
	// match option --toleration
	Tolerations []TolerationArgs `yaml:"tolerations"`

	// Image stores the docker image of job,match option --image
	Image string `yaml:"image"`

	// ImagePullPolicy stores the docker image pull policy of job,match option --image-pull-policy
	ImagePullPolicy string `yaml:"imagePullPolicy"`

	// GPUCount stores the gpu count of the job needs,match option --gpus
	GPUCount int `yaml:"gpuCount"`

	// Devices stores chip vendors and count that used for resources, such as amd.com/gpu=1 gpu.intel.com/i915=1,match option --device
	Devices map[string]string `yaml:"devices"`

	// Envs stores the envs of container in job, match option --env
	Envs map[string]string `yaml:"envs"`

	// WorkingDir stores the working directory of container in job,match option --working-dir
	WorkingDir string `yaml:"workingDir"`

	// Shell specify the linux shell type
	Shell string `yaml:"shell"`

	// Command stores the command of job
	Command string `yaml:"command"`

	// Mode is used for horovod,match option --sync-mode
	Mode string `yaml:"mode"`

	// WorkerCount stores the count of job worker,match option --workers
	WorkerCount int `yaml:"workers"`

	// Retry defines the retry times
	Retry int `yaml:"retry"`

	// DataSet stores the kubernetes pvc names
	DataSet map[string]string `yaml:"dataset"`

	// DataDirs stores the files(or directories) in k8s node which will map to containers
	// match option --data-dir
	DataDirs []DataDirVolume `yaml:"dataDirs"`

	// EnableRDMA enable rdma or not,match option --rdma
	EnableRDMA bool `yaml:"enableRDMA"`

	// EnableQueue enables the feature to queue jobs after they are scheduled.
	EnableQueue bool `yaml:"enableQueue"`

	// UseENI defines using eni or not
	UseENI bool `yaml:"useENI"`

	// Annotations defines pod annotations of job,match option --annotation
	Annotations map[string]string `yaml:"annotations"`

	// Labels specify the job labels and it is work for pods
	Labels map[string]string `yaml:"labels"`

	// IsNonRoot reports whether the job runs as a non-root user
	// (NOTE(review): reading inferred from the field name; confirm).
	IsNonRoot bool `yaml:"isNonRoot"`

	// PodSecurityContext defines the pod security context
	PodSecurityContext LimitedPodSecurityContext `yaml:"podSecurityContext"`

	// PriorityClassName defines the priority class
	PriorityClassName string `yaml:"priorityClassName"`

	// Coscheduling defines using Coscheduling
	// NOTE(review): unlike every other field, this one has no yaml tag;
	// confirm whether that is intentional.
	Coscheduling bool

	// PodGroupName stores pod group name
	PodGroupName string `yaml:"podGroupName"`

	// PodGroupMinAvailable stores pod group min available
	PodGroupMinAvailable string `yaml:"podGroupMinAvailable"`

	// ImagePullSecrets stores image pull secrets,match option --image-pull-secrets
	ImagePullSecrets []string `yaml:"imagePullSecrets"`

	// HelmOptions stores the helm options
	HelmOptions []string `yaml:"-"`

	// EnableSpotInstance enables the feature of SuperVisor manage spot instance training.
	EnableSpotInstance bool `yaml:"enableSpotInstance"`

	// MaxWaitTime stores the maximum length of time a job waits for resources
	MaxWaitTime int `yaml:"maxWaitTime"`
	// SchedulerName stores the scheduler name,match option --scheduler
	SchedulerName string `yaml:"schedulerName"`

	// UseHostNetwork defines using useHostNetwork
	UseHostNetwork bool `yaml:"useHostNetwork"`

	// UseHostPID defines using useHostPID
	UseHostPID bool `yaml:"useHostPID"`

	// UseHostIPC defines using useHostIPC
	UseHostIPC bool `yaml:"useHostIPC"`

	// ModelName defines the model name associates with the job
	ModelName string `yaml:"modelName"`

	// ModelSource defines the model source
	ModelSource string `yaml:"modelSource"`
}

CommonSubmitArgs defines the common parts of the submit arguments.

type CommonUpdateServingArgs added in v0.8.9

// CommonUpdateServingArgs collects the options shared by serving-job updates.
// The trailing comment on a field names the matching command-line flag.
// Namespace and Type are excluded from YAML output.
type CommonUpdateServingArgs struct {
	Name          string            `yaml:"servingName"`
	Version       string            `yaml:"servingVersion"`
	Namespace     string            `yaml:"-"`
	Type          ServingJobType    `yaml:"-"`
	Image         string            `yaml:"image"`
	GPUCount      int               `yaml:"gpuCount"`      // --gpus
	GPUMemory     int               `yaml:"gpuMemory"`     // --gpumemory
	GPUCore       int               `yaml:"gpuCore"`       // --gpucore
	Cpu           string            `yaml:"cpu"`           // --cpu
	Memory        string            `yaml:"memory"`        // --memory
	Replicas      int               `yaml:"replicas"`      // --replicas
	Envs          map[string]string `yaml:"envs"`          // --envs
	Annotations   map[string]string `yaml:"annotations"`   // --annotation
	Labels        map[string]string `yaml:"labels"`        // --label
	NodeSelectors map[string]string `yaml:"nodeSelectors"` // --selector
	Tolerations   []TolerationArgs  `yaml:"tolerations"`   // --toleration
	Shell         string            `yaml:"shell"`         // --shell
	Command       string            `yaml:"command"`       // --command
	ModelDirs     map[string]string `yaml:"modelDirs"`     // --data
}

type ConcurrencyPolicy added in v0.8.2

// ConcurrencyPolicy is a string type describing how concurrent executions of
// a job are handled.
type ConcurrencyPolicy string

ConcurrencyPolicy describes how the job will be handled. Only one of the following concurrency policies may be specified. If none of them is specified, the default is ConcurrencyAllow ("Allow").

// Values of ConcurrencyPolicy.
const (
	// ConcurrencyAllow is the policy value "Allow".
	ConcurrencyAllow   ConcurrencyPolicy = "Allow"
	// ConcurrencyForbid is the policy value "Forbid".
	ConcurrencyForbid  ConcurrencyPolicy = "Forbid"
	// ConcurrencyReplace is the policy value "Replace".
	ConcurrencyReplace ConcurrencyPolicy = "Replace"
)

type ConfigFileInfo

// ConfigFileInfo describes one config file: its file name and path inside the
// container, the file on the host, and a key.
// NOTE(review): what Key refers to is not visible here; confirm with callers.
type ConfigFileInfo struct {
	ContainerFileName string `yaml:"containerFileName"`
	HostFile          string `yaml:"hostFile"`
	Key               string `yaml:"key"`
	ContainerFilePath string `yaml:"containerFilePath"`
}

ConfigFileInfo defines the config files which will be mounted to containers

type CronHistoryInfo added in v0.8.2

// CronHistoryInfo describes one history entry of a cron job: the object's
// name, namespace, group and kind, its status, and its create and finish
// times as strings.
type CronHistoryInfo struct {
	Name       string `json:"name" yaml:"name"`
	Namespace  string `json:"namespace" yaml:"namespace"`
	Group      string `json:"group" yaml:"group"`
	Kind       string `json:"kind" yaml:"kind"`
	Status     string `json:"status" yaml:"status"`
	CreateTime string `json:"createTime" yaml:"createTime"`
	FinishTime string `json:"finishTime" yaml:"finishTime"`
}

type CronInfo added in v0.8.2

// CronInfo describes a cron job: identity, schedule settings, timestamps (as
// strings) and the list of history entries.
type CronInfo struct {
	UUID string `json:"uuid" yaml:"uuid"`

	Name string `json:"name" yaml:"name"`

	Namespace string `json:"namespace" yaml:"namespace"`

	// Type is the job type, like TFJob or PyTorchJob
	Type string `json:"type" yaml:"type"`

	// The schedule in Cron format, see https://en.wikipedia.org/wiki/Cron.
	Schedule string `json:"schedule" yaml:"schedule"`

	// Specifies how to treat concurrent executions of a Job.
	// Valid values are:
	// - "Allow" (default): allows CronJobs to run concurrently;
	// - "Forbid": forbids concurrent runs, skipping next run if previous run hasn't finished yet;
	// - "Replace": cancels currently running job and replaces it with a new one
	// +optional
	ConcurrencyPolicy string `json:"concurrencyPolicy" yaml:"concurrencyPolicy"` // --concurrency-policy

	// This flag tells the controller to suspend subsequent executions, it does
	// not apply to already started executions.  Defaults to false.
	// +optional
	Suspend bool `json:"suspend" yaml:"suspend"` // --suspend

	// Deadline is the timestamp until which a cron job can keep scheduling.
	Deadline string `json:"deadline" yaml:"deadline"` // --deadline

	// The number of finished job history to retain.
	// NOTE(review): the field is a plain int64, not a pointer, so an explicit
	// zero cannot be distinguished from "not specified".
	// +optional
	HistoryLimit int64 `json:"historyLimit" yaml:"historyLimit"` // --history-limit

	// Information when was the last time the job was successfully scheduled.
	// +optional
	LastScheduleTime string `json:"lastScheduleTime" yaml:"lastScheduleTime"`

	// CreationTimestamp stores the creation timestamp of job
	CreationTimestamp string `json:"creationTimestamp" yaml:"creationTimestamp"`

	// History lists the history entries of the cron job.
	History []CronHistoryInfo `json:"cronHistory" yaml:"cronHistory"`
}

type CronTFJobArgs added in v0.8.2

// CronTFJobArgs combines the cron options with the TFJob submit options by
// embedding both; the embedded structs are tagged with the YAML keys "cron"
// and "tfjob" respectively.
type CronTFJobArgs struct {
	CommonCronArgs  `yaml:"cron"`
	SubmitTFJobArgs `yaml:"tfjob"`
}

type CronType added in v0.8.2

// CronType is a string type naming a supported cron job type.
type CronType string

CronType defines the supporting job type

// Values of CronType.
const (
	// CronTFTrainingJob defines the cron tfjob; its value is "tfjob".
	CronTFTrainingJob CronType = "tfjob"
)

type CustomServingArgs

// CustomServingArgs holds the options of a custom serving job: ports, rolling
// update limits and liveness/readiness/startup probe settings. It embeds
// CommonServingArgs, whose fields are inlined into the same YAML mapping.
// The trailing comment on a field names the matching command-line flag.
type CustomServingArgs struct {
	Port                       int      `yaml:"port"`                       // --port
	RestfulPort                int      `yaml:"restApiPort"`                // --restfulPort
	MetricsPort                int      `yaml:"metricsPort"`                // --metrics-port
	MaxSurge                   string   `yaml:"maxSurge"`                   // --maxSurge
	MaxUnavailable             string   `yaml:"maxUnavailable"`             // --maxUnavailable
	LivenessProbeAction        string   `yaml:"livenessProbeAction"`        // --liveness-probe-action
	LivenessProbeActionOption  []string `yaml:"livenessProbeActionOption"`  // --liveness-probe-action-option
	LivenessProbeOption        []string `yaml:"livenessProbeOption"`        // --liveness-probe-option
	ReadinessProbeAction       string   `yaml:"readinessProbeAction"`       // --readiness-probe-action
	ReadinessProbeActionOption []string `yaml:"readinessProbeActionOption"` // --readiness-probe-action-option
	ReadinessProbeOption       []string `yaml:"readinessProbeOption"`       // --readiness-probe-option
	StartupProbeAction         string   `yaml:"startupProbeAction"`         // --startup-probe-action
	StartupProbeActionOption   []string `yaml:"startupProbeActionOption"`   // --startup-probe-action-option
	StartupProbeOption         []string `yaml:"startupProbeOption"`         // --startup-probe-option
	CommonServingArgs          `yaml:",inline"`
}

type DataDirVolume

// DataDirVolume defines a volume that maps a host path to a container path
// under a volume name.
type DataDirVolume struct {
	// HostPath defines the host path
	HostPath string `yaml:"hostPath"`
	// ContainerPath defines container path
	ContainerPath string `yaml:"containerPath"`
	// Name defines the volume name
	Name string `yaml:"name"`
}

DataDirVolume defines the volume of kubernetes

type Destination

// Destination embeds the Istio v1alpha3 Destination and adds a Port selector
// field.
// NOTE(review): whether the embedded type already exposes its own port field
// is not visible here; confirm that the two do not collide when serialized.
type Destination struct {
	*istiov1alpha3.Destination
	// Port is omitted from JSON output when nil.
	Port *PortSelector `protobuf:"bytes,3,opt,name=port" json:"port,omitempty"`
}

type DestinationRuleCRD

// DestinationRuleCRD is an object wrapper around an Istio v1alpha3
// DestinationRule: kind, API version, embedded object metadata and the spec.
// NOTE(review): the protobuf tags reuse field numbers — Kind and ObjectMeta
// both use 1, APIVersion and Spec both use 2. Confirm whether the protobuf
// tags are actually consumed anywhere.
type DestinationRuleCRD struct {
	// Kind is a string value representing the REST resource this object represents.
	// Servers may infer this from the endpoint the client submits requests to.
	// Cannot be updated.
	// In CamelCase.
	// More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds
	// +optional
	Kind string `json:"kind,omitempty" protobuf:"bytes,1,opt,name=kind"`

	// APIVersion defines the versioned schema of this representation of an object.
	// Servers should convert recognized schemas to the latest internal value, and
	// may reject unrecognized values.
	// More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources
	// +optional
	APIVersion        string `json:"apiVersion,omitempty" protobuf:"bytes,2,opt,name=apiVersion"`
	metav1.ObjectMeta `json:"metadata,omitempty" yaml:"metadata,omitempty" protobuf:"bytes,1,opt,name=metadata"`
	// Spec is the wrapped Istio DestinationRule.
	Spec              *istiov1alpha3.DestinationRule `json:"spec,omitempty" yaml:"spec,omitempty" protobuf:"bytes,2,opt,name=spec"`
}

type DestinationWeight

// DestinationWeight pairs a Destination with an integer weight.
type DestinationWeight struct {
	// Destination is omitted from JSON output when nil.
	Destination *Destination `protobuf:"bytes,1,opt,name=destination" json:"destination,omitempty"`
	// Weight is always written to JSON (no omitempty), including when zero.
	Weight      int32        `protobuf:"varint,2,opt,name=weight,proto3" json:"weight"`
}

type DistributedServingArgs added in v0.12.0

// DistributedServingArgs holds the options of a distributed serving job, with
// separate counts, resources and commands for masters and workers. It embeds
// CustomServingArgs, whose fields are inlined into the same YAML mapping.
// The trailing comment on a field names the matching command-line flag.
type DistributedServingArgs struct {
	Masters           int    `yaml:"masters"`         // --masters
	Workers           int    `yaml:"workers"`         // --workers
	MasterCpu         string `yaml:"masterCpus"`      // --master-cpu
	WorkerCpu         string `yaml:"workerCpus"`      // --worker-cpu
	MasterGPUCount    int    `yaml:"masterGpus"`      // --master-gpus
	WorkerGPUCount    int    `yaml:"workerGpus"`      // --worker-gpus
	MasterMemory      string `yaml:"masterMemory"`    // --master-memory
	WorkerMemory      string `yaml:"workerMemory"`    // --worker-memory
	MasterGPUMemory   int    `yaml:"masterGPUMemory"` // --master-gpumemory
	WorkerGPUMemory   int    `yaml:"workerGPUMemory"` // --worker-gpumemory
	MasterGPUCore     int    `yaml:"masterGPUCore"`   // --master-gpucore
	WorkerGPUCore     int    `yaml:"workerGPUCore"`   // --worker-gpucore
	MasterCommand     string `yaml:"masterCommand"`   // --master-command
	WorkerCommand     string `yaml:"workerCommand"`   // --worker-command
	InitBackend       string `yaml:"initBackend"`     // --init-backend
	CustomServingArgs `yaml:",inline"`
}

type Driver

// Driver holds the CPU request, memory request and service account of a
// driver. The YAML keys are capitalized ("CPURequest", "MemoryRequest",
// "ServiceAccount").
// NOTE(review): presumably the driver of a Spark job; confirm with the users
// of this type.
type Driver struct {
	CPURequest     int    `yaml:"CPURequest"`
	MemoryRequest  string `yaml:"MemoryRequest"`
	ServiceAccount string `yaml:"ServiceAccount"`
}

type Endpoint

// Endpoint describes a named endpoint with its port and node port.
type Endpoint struct {
	// Name is the endpoint name
	Name string `json:"name" yaml:"name"`
	// Port specifies endpoint port
	Port int `json:"port" yaml:"port"`
	// NodePort specifies the node port
	NodePort int `json:"nodePort" yaml:"nodePort"`
}

type EvaluateJobArgs added in v0.8.8

// EvaluateJobArgs holds the options of an evaluate job. Name, Namespace and
// HelmOptions are excluded from YAML output; SubmitSyncCodeArgs is embedded
// and inlined into the same YAML mapping.
type EvaluateJobArgs struct {

	// Name stores the job name,match option --name
	Name string `yaml:"-"`

	// Namespace  stores the namespace of job,match option --namespace
	Namespace string `yaml:"-"`

	// NodeSelectors defines the node selectors,match option --selector
	NodeSelectors map[string]string `yaml:"nodeSelectors"`

	// Tolerations defines the tolerations which tolerates node taints
	// match option --toleration
	Tolerations []TolerationArgs `yaml:"tolerations"`

	// Image stores the docker image of job,match option --image
	Image string `yaml:"image"`

	// Envs stores the envs of container in job, match option --env
	Envs map[string]string `yaml:"envs"`

	// WorkingDir is serialized under the YAML key "workingDir".
	WorkingDir string `yaml:"workingDir"`

	// Command stores the command of job
	Command string `yaml:"command"`

	// DataDirs stores the files(or directories) in k8s node which will map to containers
	// match option --data-dir
	DataDirs []DataDirVolume `yaml:"dataDirs"`

	// DataSources stores the kubernetes pvc names
	DataSources map[string]string `yaml:"dataSources"`

	// Annotations defines pod annotations of job,match option --annotation
	Annotations map[string]string `yaml:"annotations"`

	// Labels specify the job labels and it is work for pods
	Labels map[string]string `yaml:"labels"`

	// ImagePullSecrets stores image pull secrets,match option --image-pull-secrets
	ImagePullSecrets []string `yaml:"imagePullSecrets"`

	// HelmOptions stores the helm options
	HelmOptions []string `yaml:"-"`

	ModelName string `yaml:"modelName"` // --model-name

	ModelPath string `yaml:"modelPath"` // --model-path

	ModelVersion string `yaml:"modelVersion"` // --model-version

	MetricsPath string `yaml:"metricsPath"` // --metrics-path

	DatasetPath string `yaml:"datasetPath"` // --dataset-path

	Cpu string `yaml:"cpu"` // --cpu

	Memory string `yaml:"memory"` // --memory

	GPUCount int `yaml:"gpuCount"` // --gpus

	// for sync up source code
	SubmitSyncCodeArgs `yaml:",inline"`
}

type EvaluateJobInfo added in v0.8.8

// EvaluateJobInfo is the JSON/YAML-serializable description of an evaluate job.
// Every field, including Status and CreationTimestamp, is a plain string.
type EvaluateJobInfo struct {
	// UUID is serialized as "uuid".
	UUID string `json:"uuid" yaml:"uuid"`

	// JobID is serialized as "jobId".
	JobID string `json:"jobId" yaml:"jobId"`

	// Name is the job name.
	Name string `json:"name" yaml:"name"`

	// Namespace is the job namespace.
	Namespace string `json:"namespace" yaml:"namespace"`

	// ModelName, ModelPath and ModelVersion identify the model being evaluated
	// (presumably mirroring the same-named EvaluateJobArgs fields — TODO confirm).
	ModelName string `json:"modelName" yaml:"modelName"`

	ModelPath string `json:"modelPath" yaml:"modelPath"`

	ModelVersion string `json:"modelVersion" yaml:"modelVersion"`

	// MetricsPath is serialized as "metricsPath".
	MetricsPath string `json:"metricsPath" yaml:"metricsPath"`

	// DatasetPath is serialized as "datasetPath".
	DatasetPath string `json:"datasetPath" yaml:"datasetPath"`

	// Status is the job status rendered as a string.
	Status string `json:"status" yaml:"status"`

	// CreationTimestamp is the creation time rendered as a string; the format
	// is not visible here.
	CreationTimestamp string `json:"creationTimestamp" yaml:"creationTimestamp"`
}

type EvaluateJobType added in v0.8.8

// EvaluateJobType is the string-typed identifier of an evaluate job kind.
type EvaluateJobType string

// EvaluateJob is the evaluate job type, spelled "evaluatejob".
const EvaluateJob EvaluateJobType = "evaluatejob"

type Executor

// Executor holds the replica count and the CPU/memory requests of an
// executor. Unlike most types in this package its YAML keys are capitalized
// ("Replicas", "CPURequest", "MemoryRequest").
type Executor struct {
	Replicas      int    `yaml:"Replicas"`
	CPURequest    int    `yaml:"CPURequest"`
	MemoryRequest string `yaml:"MemoryRequest"`
}

type FormatStyle

// FormatStyle is the string-typed name of an output format.
type FormatStyle string

FormatStyle defines the format of output; it is only used in cmd.

// Known FormatStyle values.
const (
	// WideFormat defines the wide format
	WideFormat FormatStyle = "wide"
	// JsonFormat defines the json format
	JsonFormat FormatStyle = "json"
	// YamlFormat defines the yaml format
	YamlFormat FormatStyle = "yaml"
	// UnknownFormat defines the unknown format
	UnknownFormat FormatStyle = "unknown"
)

type GPUDeviceInfo

// GPUDeviceInfo describes one GPU device: its ID, its total, allocated and
// used GPU memory, and its duty cycle. Units are not visible here.
type GPUDeviceInfo struct {
	ID                 string  `json:"id" yaml:"id"`
	TotalGPUMemory     float64 `json:"totalGPUMemory" yaml:"totalGPUMemory"`
	AllocatedGPUMemory float64 `json:"allocatedGPUMemory" yaml:"allocatedGPUMemory"`
	UsedGPUMemory      float64 `json:"usedGPUMemory" yaml:"usedGPUMemory"`
	DutyCycle          float64 `json:"dutyCycle" yaml:"dutyCycle"`
}

type GPUExclusiveNodeInfo

// GPUExclusiveNodeInfo describes a GPU-exclusive node. It lists the pods on
// the node (serialized as "instances") and inlines the common node and common
// GPU node information.
type GPUExclusiveNodeInfo struct {
	PodInfos          []GPUExclusivePodInfo `json:"instances" yaml:"instances"`
	CommonNodeInfo    `yaml:",inline" json:",inline"`
	CommonGPUNodeInfo `yaml:",inline" json:",inline"`
}

type GPUExclusivePodInfo

// GPUExclusivePodInfo describes a pod listed in GPUExclusiveNodeInfo: its
// name, namespace, status and requested GPU count (serialized as "requestGPUs").
type GPUExclusivePodInfo struct {
	Name       string `json:"name" yaml:"name"`
	Namespace  string `json:"namespace" yaml:"namespace"`
	Status     string `json:"status" yaml:"status"`
	RequestGPU int    `json:"requestGPUs" yaml:"requestGPUs"`
}

type GPUShareNodeDevice

// GPUShareNodeDevice describes one device of a GPU-share node: its ID plus
// total and allocated GPU memory and GPU core. Units are not visible here.
type GPUShareNodeDevice struct {
	Id                 string  `json:"id" yaml:"id"`
	TotalGPUMemory     float64 `json:"totalGPUMemory" yaml:"totalGPUMemory"`
	AllocatedGPUMemory float64 `json:"allocatedGPUMemory" yaml:"allocatedGPUMemory"`
	TotalGPUCore       int64   `json:"totalGPUCore" yaml:"totalGPUCore"`
	AllocatedGPUCore   int64   `json:"allocatedGPUCore" yaml:"allocatedGPUCore"`
}

type GPUShareNodeInfo

// GPUShareNodeInfo describes a GPU-share node. It holds the pods on the node
// (serialized as "instances"), node-level totals of GPU memory and GPU core,
// the per-device breakdown, and the inlined common node information.
type GPUShareNodeInfo struct {
	PodInfos           []GPUSharePodInfo    `json:"instances" yaml:"instances"`
	TotalGPUMemory     float64              `json:"totalGPUMemory" yaml:"totalGPUMemory"`
	AllocatedGPUMemory float64              `json:"allocatedGPUMemory" yaml:"allocatedGPUMemory"`
	TotalGPUCore       int64                `json:"totalGPUCore" yaml:"totalGPUCore"`
	AllocatedGPUCore   int64                `json:"allocatedGPUCore" yaml:"allocatedGPUCore"`
	Devices            []GPUShareNodeDevice `json:"devices" yaml:"devices"`
	CommonGPUNodeInfo  `yaml:",inline" json:",inline"`
	CommonNodeInfo     `yaml:",inline" json:",inline"`
}

type GPUSharePodInfo

// GPUSharePodInfo describes a pod listed in GPUShareNodeInfo: identity,
// status, requested GPU memory and core, and two allocation maps.
// NOTE(review): the allocation map keys are presumably GPU device IDs — confirm.
type GPUSharePodInfo struct {
	Name                string         `json:"name" yaml:"name"`
	Namespace           string         `json:"namespace" yaml:"namespace"`
	Status              string         `json:"status" yaml:"status"`
	RequestMemory       int            `json:"requestGPUMemory" yaml:"requestGPUMemory"`
	RequestCore         int            `json:"requestGPUCore" yaml:"requestGPUCore"`
	GPUMemoryAllocation map[string]int `json:"gpuMemoryAllocation" yaml:"gpuMemoryAllocation"`
	GPUCoreAllocation   map[string]int `json:"gpuCoreAllocation" yaml:"gpuCoreAllocation"`
}

type GPUTopology

// GPUTopology holds two matrices describing GPU topology: a string link
// matrix and a float32 bandwidth matrix.
// NOTE(review): both are presumably indexed by GPU pair — confirm.
type GPUTopology struct {
	LinkMatrix      [][]string  `json:"linkMatrix" yaml:"linkMatrix"`
	BandwidthMatrix [][]float32 `json:"bandwidthMatrix" yaml:"bandwidthMatrix"`
}

type GPUTopologyNodeDevice

// GPUTopologyNodeDevice describes one device of a GPU-topology node: its ID,
// a health flag and a status string.
type GPUTopologyNodeDevice struct {
	Id      string `json:"id" yaml:"id"`
	Healthy bool   `json:"healthy" yaml:"healthy"`
	Status  string `json:"status" yaml:"status"`
}

type GPUTopologyNodeInfo

// GPUTopologyNodeInfo describes a GPU-topology node. It holds the pods on the
// node (serialized as "instances"), the GPU topology, the device list, and the
// inlined common node information.
type GPUTopologyNodeInfo struct {
	PodInfos          []GPUTopologyPodInfo `json:"instances" yaml:"instances"`
	GPUTopology       GPUTopology          `json:"gpuTopology" yaml:"gpuTopology"`
	CommonGPUNodeInfo `yaml:",inline" json:",inline"`
	CommonNodeInfo    `yaml:",inline" json:",inline"`
	Devices           []GPUTopologyNodeDevice `json:"devices" yaml:"devices"`
}

type GPUTopologyPodInfo

// GPUTopologyPodInfo describes a pod listed in GPUTopologyNodeInfo: identity,
// status, requested GPU count, and the allocation and visible-GPU lists.
type GPUTopologyPodInfo struct {
	Name        string   `json:"name" yaml:"name"`
	Namespace   string   `json:"namespace" yaml:"namespace"`
	Status      string   `json:"status" yaml:"status"`
	RequestGPU  int      `json:"requestGPUs" yaml:"requestGPUs"`
	Allocation  []string `json:"allocation" yaml:"allocation"`
	VisibleGPUs []string `json:"visibleGPUs" yaml:"visibleGPUs"`
}

type GpuMetric

// GpuMetric holds GPU duty cycle plus used and total GPU memory. The memory
// fields are serialized as "usedGPUMemory" and "totalGPUMemory", which differ
// from the Go field names.
type GpuMetric struct {
	GpuDutyCycle   float64 `json:"gpuDutyCycle" yaml:"gpuDutyCycle"`
	GpuMemoryUsed  float64 `json:"usedGPUMemory" yaml:"usedGPUMemory"`
	GpuMemoryTotal float64 `json:"totalGPUMemory" yaml:"totalGPUMemory"`
}

type GpuMetricInfo

// GpuMetricInfo is one GPU metric sample: the metric name, its value as a
// string and a time, together with the pod, container, node and GPU it
// belongs to. The struct carries no serialization tags.
// NOTE(review): the unit/epoch of Time is not visible here — confirm.
type GpuMetricInfo struct {
	MetricName    string
	Value         string
	Time          float64
	PodName       string
	PodNamespace  string
	ContainerName string
	NodeName      string
	GPUUID        string
	Id            string
	AllocateMode  string
}

type HTTPMatchRequest

// HTTPMatchRequest embeds the istio v1alpha3 HTTPMatchRequest and adds a Uri
// field of the package-local StringMatchPrefix type, serialized as "uri".
type HTTPMatchRequest struct {
	*istiov1alpha3.HTTPMatchRequest
	Uri *StringMatchPrefix `protobuf:"bytes,1,opt,name=uri" json:"uri,omitempty"`
}

type HTTPRoute

// HTTPRoute embeds the istio v1alpha3 HTTPRoute and adds Match and Route
// fields that use the package-local HTTPMatchRequest and DestinationWeight types.
type HTTPRoute struct {
	*istiov1alpha3.HTTPRoute
	Match []*HTTPMatchRequest  `protobuf:"bytes,1,rep,name=match" json:"match,omitempty"`
	Route []*DestinationWeight `protobuf:"bytes,2,rep,name=route" json:"route,omitempty"`
}

type HeadGroupSpec added in v0.11.0

// HeadGroupSpec is the spec for the head pod: service type, image and
// cpu/memory/gpu settings.
type HeadGroupSpec struct {
	// ServiceType is the Kubernetes service type of the head service. It will be used by the workers to connect to the head pod.
	ServiceType string `yaml:"serviceType,omitempty"`
	Image       string `yaml:"image"`
	Cpu         string `yaml:"cpu"`
	Memory      string `yaml:"memory"`
	Gpu         int    `yaml:"gpu"`
}

HeadGroupSpec is the spec for the head pod.

type JobConditionType added in v0.8.2

// JobConditionType defines all kinds of types of JobStatus.
type JobConditionType string

JobConditionType defines all kinds of types of JobStatus.

// Known JobConditionType values.
const (
	// JobCreated means the job has been accepted by the system,
	// but one or more of the pods/services has not been started.
	// This includes time before pods being scheduled and launched.
	JobCreated JobConditionType = "Created"

	// JobRunning means all sub-resources (e.g. services/pods) of this job
	// have been successfully scheduled and launched.
	// The training is running without error.
	JobRunning JobConditionType = "Running"

	// JobRestarting means one or more sub-resources (e.g. services/pods) of this job
	// reached phase failed but may be restarted according to its restart policy,
	// which is specified by the user in v1.PodTemplateSpec.
	// The training is freezing/pending.
	JobRestarting JobConditionType = "Restarting"

	// JobSucceeded means all sub-resources (e.g. services/pods) of this job
	// have terminated in success.
	// The training is complete without error.
	JobSucceeded JobConditionType = "Succeeded"

	// JobFailed means one or more sub-resources (e.g. services/pods) of this job
	// reached phase failed with no restarting.
	// The training has failed its execution.
	JobFailed JobConditionType = "Failed"
)

type JobGpuMetric

// JobGpuMetric maps a string key to a PodGpuMetric.
// NOTE(review): the key is presumably the pod name — confirm against callers.
type JobGpuMetric map[string]PodGpuMetric

type K8sObject added in v0.8.7

// K8sObject carries only the type and object metadata of a Kubernetes
// object: TypeMeta is inlined and ObjectMeta is serialized under "metadata".
type K8sObject struct {
	metav1.TypeMeta   `json:",inline"`
	metav1.ObjectMeta `json:"metadata,omitempty"`
}

type KFServingArgs

// KFServingArgs holds the arguments specific to a kfserving job, on top of
// the inlined CommonServingArgs. The trailing comment on each field names the
// command-line option it matches.
type KFServingArgs struct {
	Port              int    `yaml:"port"`          // --port
	ModelType         string `yaml:"modelType"`     // --modelType
	CanaryPercent     int    `yaml:"canaryPercent"` // --canaryTrafficPercent
	StorageUri        string `yaml:"storageUri"`    // --storageUri
	CommonServingArgs `yaml:",inline"`
}

type KServeArgs added in v0.9.11

// KServeArgs holds the arguments specific to a kserve job, on top of the
// inlined CommonServingArgs. The trailing comment on each field names the
// command-line option it matches. TimeoutSeconds is serialized under the
// YAML key "timeout".
type KServeArgs struct {
	ModelFormat          *ModelFormat      `yaml:"modelFormat"`                    // --model-format
	Runtime              string            `yaml:"runtime"`                        // --runtime
	StorageUri           string            `yaml:"storageUri"`                     // --storageUri
	RuntimeVersion       string            `yaml:"runtimeVersion"`                 // --runtime-version
	ProtocolVersion      string            `yaml:"protocolVersion"`                // --protocol-version
	MinReplicas          int               `yaml:"minReplicas"`                    // --min-replicas
	MaxReplicas          int               `yaml:"maxReplicas"`                    // --max-replicas
	ScaleTarget          int               `yaml:"scaleTarget"`                    // --scale-target
	ScaleMetric          string            `yaml:"scaleMetric"`                    // --scale-metric
	ContainerConcurrency int64             `yaml:"containerConcurrency"`           // --container-concurrency
	TimeoutSeconds       int64             `yaml:"timeout"`                        // --timeout
	CanaryTrafficPercent int64             `yaml:"canaryTrafficPercent,omitempty"` // --canary-traffic-percent
	Port                 int               `yaml:"port"`                           // --port
	EnablePrometheus     bool              `yaml:"enablePrometheus,omitempty"`     // --enable-prometheus
	MetricsPort          int               `yaml:"metricsPort,omitempty"`          // --metrics-port
	SecurityContext      map[string]string `yaml:"securityContext,omitempty"`      // --security-context
	CommonServingArgs    `yaml:",inline"`
}

type LimitedPodSecurityContext

// LimitedPodSecurityContext defines the Kubernetes pod security context,
// limited to the user, group, non-root and supplemental-group settings.
type LimitedPodSecurityContext struct {
	RunAsUser          int64   `yaml:"runAsUser"`
	RunAsNonRoot       bool    `yaml:"runAsNonRoot"`
	RunAsGroup         int64   `yaml:"runAsGroup"`
	SupplementalGroups []int64 `yaml:"supplementalGroups"`
}

LimitedPodSecurityContext defines the Kubernetes pod security context.

type LogArgs

// LogArgs bundles the parameters of a log request: which job, instance and
// container to read from, follow/since/tail/timestamp options, retry
// settings, and the io.WriteCloser that receives the output.
// SinceSeconds, SinceTime and Tail are pointers, so nil can mean "not set".
// NOTE(review): how RetryCnt and RetryTimeout are applied is not visible here.
type LogArgs struct {
	Namespace     string
	JobName       string
	InstanceName  string
	ContainerName string
	Follow        bool
	SinceSeconds  *int64
	SinceTime     *metav1.Time
	Tail          *int64
	Timestamps    bool
	RetryCnt      int
	RetryTimeout  time.Duration
	WriterCloser  io.WriteCloser
}

type LogLevel

// LogLevel is the string-typed name of a logging level.
type LogLevel string

// LogDebug is the "debug" level.
const LogDebug LogLevel = "debug"

// LogInfo is the "info" level.
const LogInfo LogLevel = "info"

// LogWarning is the warning level; note that its value is spelled "warn".
const LogWarning LogLevel = "warn"

// LogError is the "error" level.
const LogError LogLevel = "error"

// LogUnknown is the value for an unrecognized level.
const LogUnknown LogLevel = "unknown"

type ModelBenchmarkArgs added in v0.9.0

// ModelBenchmarkArgs holds the arguments specific to a model benchmark job,
// on top of the inlined CommonModelArgs. The trailing comment on each field
// names the command-line option it matches.
type ModelBenchmarkArgs struct {
	Concurrency     int    `yaml:"concurrency"` // --concurrency
	Requests        int    `yaml:"requests"`    // --requests
	Duration        int    `yaml:"duration"`    // --duration (seconds)
	ReportPath      string `yaml:"reportPath"`  // --report-path
	CommonModelArgs `yaml:",inline"`
}

type ModelEvaluateArgs added in v0.9.0

// ModelEvaluateArgs holds the arguments specific to a model evaluate job, on
// top of the inlined CommonModelArgs and SubmitSyncCodeArgs. The trailing
// comment on each field names the command-line option it matches.
type ModelEvaluateArgs struct {
	ModelPlatform   string `yaml:"modelPlatform"` // --model-platform
	DatasetPath     string `yaml:"datasetPath"`   // --dataset-path
	ReportPath      string `yaml:"reportPath"`    // --report-path
	BatchSize       int    `yaml:"batchSize"`     // --batch-size
	CommonModelArgs `yaml:",inline"`
	// SubmitSyncCodeArgs holds the options for syncing up source code.
	SubmitSyncCodeArgs `yaml:",inline"`
}

type ModelFormat added in v0.9.11

// ModelFormat names a model format and, optionally, its version.
type ModelFormat struct {
	// Name of the model format.
	// +required
	Name string `yaml:"name"`
	// Version of the model format.
	// Used in validating that a predictor is supported by a runtime.
	// Can be "major", "major.minor" or "major.minor.patch".
	// A nil Version is omitted from YAML output (omitempty).
	// +optional
	Version *string `yaml:"version,omitempty"`
}

type ModelJobInfo added in v0.9.0

// ModelJobInfo is the JSON/YAML-serializable description of a model job,
// including its instances and requested resources.
type ModelJobInfo struct {
	// The unique identity of the model job
	UUID string `json:"uuid" yaml:"uuid"`

	// The name of the model job
	Name string `json:"name" yaml:"name"`

	// The namespace of the model job
	Namespace string `json:"namespace" yaml:"namespace"`

	// The duration of the model job
	Duration string `json:"duration" yaml:"duration"`

	// Age specifies the model job age
	Age string `json:"age" yaml:"age"`

	// The status of the model job
	Status string `json:"status" yaml:"status"`

	// The model type of the model job
	Type string `json:"type" yaml:"type"`

	// The instances under the model job
	Instances []ModelJobInstance `json:"instances" yaml:"instances"`

	// RequestCPUs stores the requested cpus of the job
	RequestCPUs int64 `json:"requestCPUs" yaml:"requestCPUs"`

	// RequestGPUs stores the requested gpus
	RequestGPUs int64 `json:"requestGPUs" yaml:"requestGPUs"`

	// RequestGPUMemory stores the requested gpu memory
	RequestGPUMemory int64 `json:"requestGPUMemory" yaml:"requestGPUMemory"`

	// RequestGPUCore stores the requested gpu core
	RequestGPUCore int64 `json:"requestGPUCore" yaml:"requestGPUCore"`

	// CreationTimestamp stores the creation timestamp of job
	CreationTimestamp int64 `json:"creationTimestamp" yaml:"creationTimestamp"`

	// Params stores the job parameters
	Params map[string]string `json:"params" yaml:"params"`
}

type ModelJobInstance added in v0.9.0

// ModelJobInstance describes one instance of a model job: its identity,
// container and restart counts, placement, and requested GPU resources.
type ModelJobInstance struct {
	// Name gives the instance name
	Name string `json:"name" yaml:"name"`
	// Status gives the instance status
	Status string `json:"status" yaml:"status"`
	// Age gives the instance age
	Age string `json:"age" yaml:"age"`
	// ReadyContainer represents the count of ready containers
	ReadyContainer int `json:"readyContainers" yaml:"readyContainers"`
	// TotalContainer represents the count of total containers
	TotalContainer int `json:"totalContainers" yaml:"totalContainers"`
	// RestartCount represents the count of instance restarts
	RestartCount int `json:"restartCount" yaml:"restartCount"`
	// NodeIP specifies the host ip of the instance
	NodeIP string `json:"nodeIP" yaml:"nodeIP"`
	// NodeName returns the node name
	NodeName string `json:"nodeName" yaml:"nodeName"`
	// IP returns the instance ip
	IP string `json:"ip" yaml:"ip"`
	// RequestGPUs returns the request gpus
	RequestGPUs float64 `json:"requestGPUs" yaml:"requestGPUs"`
	// RequestGPUMemory returns the request gpu memory
	RequestGPUMemory int `json:"requestGPUMemory" yaml:"requestGPUMemory"`
	// RequestGPUCore returns the request gpu core
	RequestGPUCore int `json:"requestGPUCore" yaml:"requestGPUCore"`
	// CreationTimestamp returns the creation timestamp of instance
	CreationTimestamp int64 `json:"creationTimestamp" yaml:"creationTimestamp"`
}

type ModelJobStatus added in v0.9.0

// ModelJobStatus defines all the kinds of model job status.
type ModelJobStatus string

ModelJobStatus defines all the kinds of JobStatus

// Known ModelJobStatus values; all of them are upper-case strings.
const (
	// ModelJobPending means the job is pending
	ModelJobPending ModelJobStatus = "PENDING"
	// ModelJobRunning means the job is running
	ModelJobRunning ModelJobStatus = "RUNNING"
	// ModelJobComplete means the job is complete
	ModelJobComplete ModelJobStatus = "COMPLETE"
	// ModelJobFailed means the job has failed
	ModelJobFailed ModelJobStatus = "FAILED"
	// ModelJobUnknown means the job status is unknown
	ModelJobUnknown ModelJobStatus = "UNKNOWN"
)

type ModelJobType added in v0.9.0

// ModelJobType defines the supported model job types.
type ModelJobType string

ModelJobType defines the supporting model job type

// Known ModelJobType values. AllModelJob is the empty string.
const (
	// ModelProfileJob defines the model profile job
	ModelProfileJob ModelJobType = "profile"
	// ModelOptimizeJob defines the model optimize job
	ModelOptimizeJob ModelJobType = "optimize"
	// ModelBenchmarkJob defines the model benchmark job
	ModelBenchmarkJob ModelJobType = "benchmark"
	// ModelEvaluateJob defines the model evaluate job
	ModelEvaluateJob ModelJobType = "evaluate"
	// AllModelJob represents all model job types
	AllModelJob ModelJobType = ""
	// UnknownModelJob defines the unknown model job
	UnknownModelJob ModelJobType = "unknown"
)

type ModelOptimizeArgs added in v0.9.0

// ModelOptimizeArgs holds the arguments specific to a model optimize job, on
// top of the inlined CommonModelArgs. The trailing comment on each field
// names the command-line option it matches.
type ModelOptimizeArgs struct {
	Optimizer       string `yaml:"optimizer"`    // --optimizer
	TargetDevice    string `yaml:"targetDevice"` // --target-device
	ExportPath      string `yaml:"exportPath"`   // --export-path
	CommonModelArgs `yaml:",inline"`
}

type ModelProfileArgs added in v0.9.0

// ModelProfileArgs holds the arguments specific to a model profile job, on
// top of the inlined CommonModelArgs. The trailing comment on each field
// names the command-line option it matches.
type ModelProfileArgs struct {
	ReportPath       string `yaml:"reportPath"`       // --report-path
	UseTensorboard   bool   `yaml:"useTensorboard"`   // --tensorboard
	TensorboardImage string `yaml:"tensorboardImage"` // --tensorboardImage

	CommonModelArgs `yaml:",inline"`
}

type ModelTypeInfo added in v0.9.0

// ModelTypeInfo associates a ModelJobType with an alias and a shorthand
// string. The struct carries no serialization tags.
type ModelTypeInfo struct {
	Name      ModelJobType
	Alias     string
	Shorthand string
}

type ModelVersion added in v0.9.14

// ModelVersion describes one version of a registered model. JSON keys are
// snake_case, and every field except Name is dropped from JSON output when
// it holds its zero value (omitempty).
// NOTE(review): the unit of the two timestamp fields is not visible here.
type ModelVersion struct {
	Name                 string             `json:"name"`
	Version              string             `json:"version,omitempty"`
	CreationTimestamp    int64              `json:"creation_timestamp,omitempty"`
	LastUpdatedTimestamp int64              `json:"last_updated_timestamp,omitempty"`
	Description          string             `json:"description,omitempty"`
	UserId               string             `json:"user_id,omitempty"`
	CurrentStage         string             `json:"current_stage,omitempty"`
	Source               string             `json:"source,omitempty"`
	RunId                string             `json:"run_id,omitempty"`
	Status               ModelVersionStatus `json:"status,omitempty"`
	StatusMessage        string             `json:"status_message,omitempty"`
	Tags                 []*ModelVersionTag `json:"tags,omitempty"`
	RunLink              string             `json:"run_link,omitempty"`
	Aliases              []string           `json:"aliases,omitempty"`
}

type ModelVersionStatus added in v0.9.14

// ModelVersionStatus is the string-typed registration status of a model version.
type ModelVersionStatus string

// PENDING_REGISTRATION is the "PENDING_REGISTRATION" status.
const PENDING_REGISTRATION ModelVersionStatus = "PENDING_REGISTRATION"

// FAILED_REGISTRATION is the "FAILED_REGISTRATION" status.
const FAILED_REGISTRATION ModelVersionStatus = "FAILED_REGISTRATION"

// READY is the "READY" status.
const READY ModelVersionStatus = "READY"

type ModelVersionTag added in v0.9.14

// ModelVersionTag is a key/value tag attached to a ModelVersion.
type ModelVersionTag struct {
	Key   string `json:"key"`
	Value string `json:"value"`
}

type NodeGpuMetric

// NodeGpuMetric maps a device id to the AdvancedGpuMetric of that device.
type NodeGpuMetric map[string]*AdvancedGpuMetric

The key of the map is the device ID.

type NodeType

// NodeType is the string-typed name of a node category.
type NodeType string

// GPUShareNode is the "GPUShare" node type.
const GPUShareNode NodeType = "GPUShare"

// GPUExclusiveNode is the "GPUExclusive" node type.
const GPUExclusiveNode NodeType = "GPUExclusive"

// GPUTopologyNode is the "GPUTopology" node type.
const GPUTopologyNode NodeType = "GPUTopology"

// NormalNode is the "Normal" node type.
const NormalNode NodeType = "Normal"

// UnknownNode is the value for an unrecognized node type; note the lower-case spelling.
const UnknownNode NodeType = "unknown"

// AllKnownNode is the empty string.
const AllKnownNode NodeType = ""

type NodeTypeInfo

// NodeTypeInfo associates a NodeType with an alias and a shorthand string.
// The struct carries no serialization tags.
type NodeTypeInfo struct {
	Name      NodeType
	Alias     string
	Shorthand string
}

type NormalNodeInfo

// NormalNodeInfo describes a normal node; it consists solely of the inlined
// CommonNodeInfo.
type NormalNodeInfo struct {
	CommonNodeInfo `yaml:",inline" json:",inline"`
}

type PodGpuMetric

// PodGpuMetric maps a string key to a GpuMetric.
// NOTE(review): the key is presumably a GPU/device id — confirm against callers.
type PodGpuMetric map[string]*GpuMetric

type PortSelector

// PortSelector embeds the istio v1alpha3 PortSelector and adds a numeric
// Number field, serialized as "number" and omitted when zero.
type PortSelector struct {
	*istiov1alpha3.PortSelector
	Number uint32 `protobuf:"varint,1,opt,name=number,proto3,oneof" json:"number,omitempty"`
}

type PreprocesObject

// PreprocesObject groups a service name and namespace with a destination
// rule and a virtual service. The struct carries no serialization tags.
type PreprocesObject struct {
	ServiceName     string
	Namespace       string
	DestinationRule DestinationRuleCRD
	VirtualService  VirtualServiceCRD
}

type PrometheusMetric

// PrometheusMetric is the top level of a Prometheus query response: a status
// string and the data payload.
// NOTE(review): "inline" is not an option defined by encoding/json, so it is
// presumably ignored on the Status tag — confirm before relying on it.
type PrometheusMetric struct {
	Status string               `json:"status,inline"`
	Data   PrometheusMetricData `json:"data,omitempty"`
}

type PrometheusMetricData

// PrometheusMetricData is the "data" part of a PrometheusMetric: the list of
// results and the result type.
type PrometheusMetricData struct {
	Result     []PrometheusMetricResult `json:"result"`
	ResultType string                   `json:"resultType"`
}

type PrometheusMetricResult

// PrometheusMetricResult is one entry of PrometheusMetricData.Result: a map
// of metric labels and a list of untyped values.
type PrometheusMetricResult struct {
	Metric map[string]string       `json:"metric"`
	Value  []PrometheusMetricValue `json:"value"`
}

type PrometheusMetricValue

// PrometheusMetricValue is an untyped element of PrometheusMetricResult.Value;
// as an empty interface it accepts any decoded value.
type PrometheusMetricValue interface{}

type PrometheusServer

// PrometheusServer is used to define a prometheus server: its name, the
// service labels, protocol, port and path, the list of metrics, and the
// Kubernetes Service object.
// NOTE(review): ServiceLabels is presumably a label selector string — confirm.
type PrometheusServer struct {
	Name          string
	ServiceLabels string
	Protocol      string
	Port          string
	Path          string
	MetricList    []string
	Service       *v1.Service
}

PrometheusServer is used to define prometheus server

type RayClusterSpec added in v0.11.0

// RayClusterSpec describes a Ray cluster: the Ray version, autoscaling
// settings, the head and worker group specs, and a pre-stop command.
type RayClusterSpec struct {
	// The version of Ray you are using. Make sure all Ray containers are running this version of Ray.
	RayVersion string `yaml:"rayVersion"`
	// EnableInTreeAutoscaling indicates whether operator should create in tree autoscaling configs
	EnableInTreeAutoscaling bool `yaml:"enableInTreeAutoscaling,omitempty"`
	// AutoscalerOptions specifies optional configuration for the Ray autoscaler.
	AutoscalerOptions AutoscalerOptions `yaml:"autoscalerOptions,omitempty"`

	// HeadGroupSpec is the spec of the head group, serialized under the YAML key "head".
	HeadGroupSpec HeadGroupSpec `yaml:"head"`

	// WorkerGroupSpec is the spec of the worker group, serialized under the YAML key "worker".
	WorkerGroupSpec WorkerGroupSpec `yaml:"worker"`
	// PreStopCmd is the command that needs to be executed before stopping.
	PreStopCmd []string `yaml:"preStopCmd"`
}

type RegisteredModel added in v0.9.14

// RegisteredModel describes a registered model together with its latest
// versions, tags and aliases. JSON keys are snake_case, and every field
// except Name is dropped from JSON output when it holds its zero value.
type RegisteredModel struct {
	Name                 string                  `json:"name"`
	CreationTimestamp    int64                   `json:"creation_timestamp,omitempty"`
	LastUpdatedTimestamp int64                   `json:"last_updated_timestamp,omitempty"`
	Description          string                  `json:"description,omitempty"`
	LatestVersions       []*ModelVersion         `json:"latest_versions,omitempty"`
	Tags                 []*RegisteredModelTag   `json:"tags,omitempty"`
	Aliases              []*RegisteredModelAlias `json:"aliases,omitempty"`
}

Model Management

type RegisteredModelAlias added in v0.9.14

// RegisteredModelAlias pairs an alias with the model version it refers to.
type RegisteredModelAlias struct {
	Alias   string `json:"alias"`
	Version string `json:"version"`
}

type RegisteredModelTag added in v0.9.14

// RegisteredModelTag is a key/value tag attached to a RegisteredModel.
type RegisteredModelTag struct {
	Key   string `json:"key"`
	Value string `json:"value"`
}

func (RegisteredModelTag) String added in v0.9.14

func (t RegisteredModelTag) String() string

type Runtime

// Runtime is implemented by runtimes that can name their chart and say
// whether they are the default runtime.
type Runtime interface {
	// GetChartName returns the chart name.
	GetChartName() string
	// IsDefault reports whether the runtime is the default one.
	IsDefault() bool
}

type ScaleETJobArgs

// ScaleETJobArgs holds the arguments shared by the scale-in and scale-out
// operations of an et job. JobType and Namespace carry the yaml tag "-".
type ScaleETJobArgs struct {
	// Name is the et job name; required, matches option --name. Serialized as "etName".
	Name string `yaml:"etName"`
	// JobType stores the training job type.
	JobType TrainingJobType `yaml:"-"`
	// Namespace stores the namespace of the job; matches option --namespace.
	Namespace string `yaml:"-"`
	// Timeout is the timeout of the callback scaler script; matches option --timeout.
	Timeout int `yaml:"timeout"`
	// Retry is the number of retries; matches option --retry.
	Retry int `yaml:"retry"`
	// Count is the number of workers to add or delete; matches option --count.
	Count int `yaml:"count"`
	// Script is the scaling script; matches option --script.
	Script string `yaml:"script"`
	// Envs stores the environment variables; matches option -e, --env.
	Envs map[string]string `yaml:"envs"`
}

type ScaleInETJobArgs

// ScaleInETJobArgs holds the arguments for scaling in an et job; it adds
// nothing to the inlined ScaleETJobArgs.
type ScaleInETJobArgs struct {
	// common args
	ScaleETJobArgs `yaml:",inline"`
}

type ScaleOutETJobArgs

// ScaleOutETJobArgs holds the arguments for scaling out an et job; it adds
// nothing to the inlined ScaleETJobArgs.
type ScaleOutETJobArgs struct {
	// common args
	ScaleETJobArgs `yaml:",inline"`
}

type SeldonServingArgs added in v0.8.0

// SeldonServingArgs holds the arguments specific to a seldon serving job, on
// top of the inlined CommonServingArgs. The trailing comment on each field
// names the command-line option it matches.
type SeldonServingArgs struct {
	Implementation    string `yaml:"implementation"` // --implementation
	ModelUri          string `yaml:"modelUri"`       // --modelUri
	CommonServingArgs `yaml:",inline"`
}

type ServingInstance

// ServingInstance describes one instance of a serving job: its identity,
// container and restart counts, placement, and requested GPU resources.
type ServingInstance struct {
	// Name gives the instance name
	Name string `json:"name" yaml:"name"`
	// Status gives the instance status
	Status string `json:"status" yaml:"status"`
	// Age gives the instance age
	Age string `json:"age" yaml:"age"`
	// ReadyContainer represents the count of ready containers
	ReadyContainer int `json:"readyContainers" yaml:"readyContainers"`
	// TotalContainer represents the count of total containers
	TotalContainer int `json:"totalContainers" yaml:"totalContainers"`
	// RestartCount represents the count of instance restarts
	RestartCount int `json:"restartCount" yaml:"restartCount"`
	// NodeIP specifies the host ip of the instance
	NodeIP string `json:"nodeIP" yaml:"nodeIP"`
	// NodeName returns the node name
	NodeName string `json:"nodeName" yaml:"nodeName"`
	// IP returns the instance ip
	IP string `json:"ip" yaml:"ip"`
	// RequestGPUs returns the request gpus
	RequestGPUs float64 `json:"requestGPUs" yaml:"requestGPUs"`
	// RequestGPUMemory returns the request gpu memory
	RequestGPUMemory int `json:"requestGPUMemory" yaml:"requestGPUMemory"`
	// RequestGPUCore specifies the request gpu core, only for gpushare
	RequestGPUCore int `json:"requestGPUCore" yaml:"requestGPUCore"`
	// CreationTimestamp returns the creation timestamp of instance
	CreationTimestamp int64 `json:"creationTimestamp" yaml:"creationTimestamp"`
}

type ServingJobInfo

// ServingJobInfo displays serving job information, including its endpoints,
// instances and requested resources.
type ServingJobInfo struct {
	// UUID specifies the unique identity of the serving job
	UUID string `json:"uuid" yaml:"uuid"`
	// Name specifies serving job name
	Name string `json:"name" yaml:"name"`
	// Namespace specifies serving job namespace
	Namespace string `json:"namespace" yaml:"namespace"`
	// Type specifies serving job type
	Type string `json:"type" yaml:"type"`
	// Version specifies serving job version
	Version string `json:"version" yaml:"version"`
	// Age specifies the serving job age
	Age string `json:"age" yaml:"age"`
	// Desired specifies the desired instances
	Desired int `json:"desiredInstances" yaml:"desiredInstances"`
	// Available specifies the available instances
	Available int `json:"availableInstances" yaml:"availableInstances"`
	// Endpoints specifies the endpoints
	Endpoints []Endpoint `json:"endpoints" yaml:"endpoints"`
	// IPAddress specifies the ip address
	IPAddress string `json:"ip" yaml:"ip"`
	// Instances gives the instance information
	Instances []ServingInstance `json:"instances" yaml:"instances"`
	// RequestCPUs specifies the request cpus
	RequestCPUs float64 `json:"requestCPUs" yaml:"requestCPUs"`
	// RequestGPUs specifies the request gpus
	RequestGPUs float64 `json:"requestGPUs" yaml:"requestGPUs"`
	// RequestGPUMemory specifies the request gpu memory, only for gpushare
	RequestGPUMemory int `json:"requestGPUMemory" yaml:"requestGPUMemory"`
	// RequestGPUCore specifies the request gpu core, only for gpushare
	RequestGPUCore int `json:"requestGPUCore" yaml:"requestGPUCore"`
	// CreationTimestamp stores the creation timestamp of job
	CreationTimestamp int64 `json:"creationTimestamp" yaml:"creationTimestamp"`
}

ServingJobInfo displays serving job information.

type ServingJobType

// ServingJobType defines the serving job type.
type ServingJobType string

ServingJobType defines the serving job type; the name must look like shorthand + "-serving".

// Known ServingJobType values. Most follow the "<shorthand>-serving" pattern;
// KServeJob ("kserve"), AllServingJob ("") and UnknownServingJob ("unknown")
// do not.
const (
	// TFServingJob defines the tensorflow serving job
	TFServingJob ServingJobType = "tf-serving"
	// TRTServingJob defines the tensorrt serving job
	TRTServingJob ServingJobType = "trt-serving"
	// KFServingJob defines the kfserving job
	KFServingJob ServingJobType = "kf-serving"
	// KServeJob defines the kserve job
	KServeJob ServingJobType = "kserve"
	// SeldonServingJob defines the seldon core job
	SeldonServingJob ServingJobType = "seldon-serving"
	// TritonServingJob defines the nvidia triton server job
	TritonServingJob ServingJobType = "triton-serving"
	// CustomServingJob defines the custom serving job
	CustomServingJob ServingJobType = "custom-serving"
	// DistributedServingJob defines the distributed serving job
	DistributedServingJob ServingJobType = "distributed-serving"
	// AllServingJob represents all serving job types
	AllServingJob ServingJobType = ""
	// UnknownServingJob defines the unknown serving job
	UnknownServingJob ServingJobType = "unknown"
)

type ServingTypeInfo

// ServingTypeInfo associates a ServingJobType with an alias and a shorthand
// string. The struct carries no serialization tags.
type ServingTypeInfo struct {
	Name      ServingJobType
	Alias     string
	Shorthand string
}

type ServingVersionWeight

// ServingVersionWeight pairs a serving version with an integer weight.
// NOTE(review): Weight is presumably a traffic percentage — confirm.
type ServingVersionWeight struct {
	Version string
	Weight  int
}

type StringMatchPrefix

// StringMatchPrefix holds a single prefix string, serialized as "prefix" and
// omitted when empty. HTTPMatchRequest.Uri uses this type.
type StringMatchPrefix struct {
	Prefix string `protobuf:"bytes,2,opt,name=prefix,proto3,oneof" json:"prefix,omitempty"`
}

type SubmitDeepSpeedJobArgs added in v0.9.9

// SubmitDeepSpeedJobArgs holds the arguments for submitting a DeepSpeed job:
// the inlined common, tensorboard and sync-code arguments plus launcher,
// restart, secret and annotation settings.
type SubmitDeepSpeedJobArgs struct {
	Cpu    string `yaml:"cpu"`    // --cpu
	Memory string `yaml:"memory"` // --memory
	// for common args
	CommonSubmitArgs `yaml:",inline"`
	// SubmitTensorboardArgs stores tensorboard information
	SubmitTensorboardArgs `yaml:",inline"`
	// SubmitSyncCodeArgs stores syncing code information
	SubmitSyncCodeArgs `yaml:",inline"`
	LauncherSelectors  map[string]string `yaml:"launcherSelectors"` // --launcher-selector
	JobRestartPolicy   string            `yaml:"jobRestartPolicy"`  // --job-restart-policy
	JobBackoffLimit    int               `yaml:"jobBackoffLimit"`   // --job-backoff-limit
	// SSHSecret enables creating a secret for the job.
	SSHSecret  string            `yaml:"sshSecret"`
	SecretData map[string]string `yaml:"secretData"`
	// LauncherAnnotations defines launcher pod annotations of job, match option --launcher-annotation
	LauncherAnnotations map[string]string `yaml:"launcherAnnotations"`
	// WorkerAnnotations defines worker pod annotations of job, match option --worker-annotation
	WorkerAnnotations map[string]string `yaml:"workerAnnotations"`
}

type SubmitETJobArgs

// SubmitETJobArgs holds the arguments for submitting an et job: the inlined
// common, tensorboard and sync-code arguments plus worker bounds, launcher,
// restart, secret and annotation settings.
type SubmitETJobArgs struct {
	Cpu    string `yaml:"cpu"`    // --cpu
	Memory string `yaml:"memory"` // --memory
	// for common args
	CommonSubmitArgs `yaml:",inline"`
	// SubmitTensorboardArgs stores tensorboard information
	SubmitTensorboardArgs `yaml:",inline"`
	// SubmitSyncCodeArgs stores syncing code information
	SubmitSyncCodeArgs  `yaml:",inline"`
	MaxWorkers          int               `yaml:"maxWorkers"`
	MinWorkers          int               `yaml:"minWorkers"`
	LauncherSelectors   map[string]string `yaml:"launcherSelectors"`   // --launcher-selector
	JobRestartPolicy    string            `yaml:"jobRestartPolicy"`    // --job-restart-policy
	WorkerRestartPolicy string            `yaml:"workerRestartPolicy"` // --worker-restart-policy
	JobBackoffLimit     int               `yaml:"jobBackoffLimit"`     // --job-backoff-limit
	// SSHSecret enables creating a secret for the job.
	SSHSecret  string            `yaml:"sshSecret"`
	SecretData map[string]string `yaml:"secretData"`
	// LauncherAnnotations defines launcher pod annotations of job, match option --launcher-annotation
	LauncherAnnotations map[string]string `yaml:"launcherAnnotations"`
	// WorkerAnnotations defines worker pod annotations of job, match option --worker-annotation
	WorkerAnnotations map[string]string `yaml:"workerAnnotations"`
}

type SubmitHorovodJobArgs

// SubmitHorovodJobArgs holds the arguments for submitting a Horovod job: an
// SSH port and cpu/memory settings plus the inlined common, tensorboard and
// sync-code arguments.
type SubmitHorovodJobArgs struct {
	SSHPort int    `yaml:"sshPort"`
	Cpu     string `yaml:"cpu"`    // --cpu
	Memory  string `yaml:"memory"` // --memory
	// for common args
	CommonSubmitArgs `yaml:",inline"`

	// for tensorboard
	SubmitTensorboardArgs `yaml:",inline"`

	// for sync up source code
	SubmitSyncCodeArgs `yaml:",inline"`
}

type SubmitMPIJobArgs

// SubmitMPIJobArgs holds the arguments for submitting an MPI job: cpu/memory
// settings, the inlined common, tensorboard and sync-code arguments, GPU
// topology scheduling options and the clean-pod policy.
type SubmitMPIJobArgs struct {
	Cpu    string `yaml:"cpu"`    // --cpu
	Memory string `yaml:"memory"` // --memory
	// for common args
	CommonSubmitArgs `yaml:",inline"`

	// for tensorboard
	SubmitTensorboardArgs `yaml:",inline"`

	// for sync up source code
	SubmitSyncCodeArgs `yaml:",inline"`

	// GPUTopology enables gpu topology scheduling.
	// NOTE(review): the exact meaning of GPUTopologyReplica and
	// MountsOnLauncher is not visible here — confirm against their users.
	GPUTopology        bool   `yaml:"gputopology"`
	GPUTopologyReplica string `yaml:"gputopologyreplica"`
	MountsOnLauncher   bool   `yaml:"mountsOnLauncher"`

	// CleanPodPolicy is the clean-task-policy.
	CleanPodPolicy string `yaml:"cleanPodPolicy"`
}

type SubmitPyTorchJobArgs

// SubmitPyTorchJobArgs holds the arguments for submitting a PyTorch job:
// cpu/memory settings, the inlined common, tensorboard and sync-code
// arguments, and lifecycle settings.
type SubmitPyTorchJobArgs struct {
	Cpu    string `yaml:"cpu"`    // --cpu
	Memory string `yaml:"memory"` // --memory
	// for common args
	CommonSubmitArgs `yaml:",inline"`

	// for tensorboard
	SubmitTensorboardArgs `yaml:",inline"`

	// for sync up source code
	SubmitSyncCodeArgs `yaml:",inline"`

	// CleanPodPolicy is the clean-task-policy.
	CleanPodPolicy string `yaml:"cleanPodPolicy"`

	// ActiveDeadlineSeconds specifies the duration (in seconds) since startTime during which the job can remain active
	// before it is terminated.
	ActiveDeadlineSeconds int64 `yaml:"activeDeadlineSeconds,omitempty"`

	// TTLSecondsAfterFinished defines the TTL for cleaning up finished PytorchJobs. Defaults to infinite.
	TTLSecondsAfterFinished int32 `yaml:"ttlSecondsAfterFinished,omitempty"`

	// TrainingOperatorCRD indicates compatibility with the training-operator crd.
	TrainingOperatorCRD bool `yaml:"trainingOperatorCRD,omitempty"`

	// ShareMemory specifies the shared memory size.
	ShareMemory string `yaml:"shareMemory"`
}

type SubmitRayJobArgs added in v0.11.0

// SubmitRayJobArgs holds the submit arguments declared for a ray job: the
// embedded common/tensorboard/sync-code groups, RayJob lifecycle options and
// the embedded RayClusterSpec.
type SubmitRayJobArgs struct {
	// CommonSubmitArgs carries the common submit arguments (inlined in yaml).
	CommonSubmitArgs `yaml:",inline"`

	// SubmitTensorboardArgs carries the tensorboard arguments (inlined in yaml).
	SubmitTensorboardArgs `yaml:",inline"`

	// SubmitSyncCodeArgs carries the source-code syncing arguments (inlined in yaml).
	SubmitSyncCodeArgs `yaml:",inline"`

	// ShutdownAfterJobFinishes determines whether to delete the ray cluster
	// once the RayJob has succeeded or failed.
	// default:=false
	ShutdownAfterJobFinishes bool `yaml:"shutdownAfterJobFinishes,omitempty"`

	// TTLSecondsAfterFinished is the TTL to clean up the RayCluster.
	// It only takes effect when ShutdownAfterJobFinishes is set to true.
	// default:=0
	TTLSecondsAfterFinished int32 `yaml:"ttlSecondsAfterFinished,omitempty"`

	// ActiveDeadlineSeconds is the duration in seconds that the RayJob may be active before
	// KubeRay actively tries to terminate the RayJob; the value must be a positive integer.
	ActiveDeadlineSeconds int32 `yaml:"activeDeadlineSeconds,omitempty"`

	// Suspend specifies whether the RayJob controller should create a RayCluster instance.
	// If a job is applied with the suspend field set to true,
	// the RayCluster will not be created and will wait for the transition to false.
	// If the RayCluster is already created, it will be deleted.
	// On transition to false a new RayCluster will be created.
	Suspend bool `yaml:"suspend,omitempty"`

	// RayClusterSpec is embedded and inlined in yaml.
	RayClusterSpec `yaml:",inline"`

	// ShareMemory specifies the shared memory size.
	ShareMemory string `yaml:"shareMemory"`
}

type SubmitSparkJobArgs

// SubmitSparkJobArgs holds the submit arguments declared for a spark job.
// Name, Namespace and TrainingType are tagged `yaml:"-"`, so they are excluded
// from yaml; the spark-specific fields use capitalized yaml keys.
type SubmitSparkJobArgs struct {
	Name         string          `yaml:"-"`
	Namespace    string          `yaml:"-"`
	TrainingType TrainingJobType `yaml:"-"`
	Image        string          `yaml:"Image"`
	MainClass    string          `yaml:"MainClass"`
	Jar          string          `yaml:"Jar"`
	SparkVersion string          `yaml:"SparkVersion"`
	Driver       *Driver         `yaml:"Driver"`
	Executor     *Executor       `yaml:"Executor"`
	// Annotations defines the pod annotations of the job; matches option --annotation.
	Annotations map[string]string `yaml:"annotations"`
	// Labels specifies the job labels, which also take effect on the pods.
	Labels map[string]string `yaml:"labels"`
}

type SubmitSyncCodeArgs

// SubmitSyncCodeArgs holds the arguments that describe how source code is
// synced: the sync mode, the source, an optional image and an optional git
// project name.
type SubmitSyncCodeArgs struct {
	SyncMode   string `yaml:"syncMode"`            // --syncMode: rsync, hdfs, git
	SyncSource string `yaml:"syncSource"`          // --syncSource
	SyncImage  string `yaml:"syncImage,omitempty"` // --syncImage
	// SyncGitProjectName is serialized as "syncGitProjectName" and omitted when empty.
	// NOTE(review): the original trailing comment named --syncImage, which looks
	// copy-pasted from the field above; confirm the flag that populates this field.
	SyncGitProjectName string `yaml:"syncGitProjectName,omitempty"`
}

type SubmitTFJobArgs

// SubmitTFJobArgs holds the submit arguments declared for a tfjob: per-role
// (worker, ps, chief, evaluator) images, ports, counts and resources, job
// policies and deadlines, and the embedded common/tensorboard/sync-code groups.
type SubmitTFJobArgs struct {
	// TFNodeSelectors assigns tfjob node selectors
	TFNodeSelectors map[string]map[string]string `yaml:"tfNodeSelectors"`
	// Port defines the default port if workerPort and PSPort are not set.
	// It declares no yaml tag, so its yaml key is whatever the yaml library
	// derives from the field name.
	Port int
	// WorkerImage assigns worker image,match option --worker-image
	WorkerImage string `yaml:"workerImage"`
	// WorkerPort stores worker port,match option --work-port
	WorkerPort int `yaml:"workerPort"`
	// PSPort stores the ps port,match option --ps-port
	PSPort int `yaml:"psPort"`
	// PSCount stores the ps count,--ps-count
	PSCount int `yaml:"ps"`
	// PSImage stores the ps image,--ps-image
	PSImage string `yaml:"psImage"`
	// WorkerCpu stores the cpu of job worker,match option --worker-cpu
	WorkerCpu string `yaml:"workerCPU"`
	// WorkerCpuLimit stores the cpu limit of job worker,match option --worker-cpu-limit
	WorkerCpuLimit string `yaml:"workerCPULimit"`
	//WorkerNodeSelectors map[string]string `yaml:"workerNodeSelectors"` // --worker-selector
	// WorkerMemory stores worker memory,match option --worker-memory
	WorkerMemory string `yaml:"workerMemory"`
	// WorkerMemoryLimit stores worker memory limit,match option --worker-memory-limit
	WorkerMemoryLimit string `yaml:"workerMemoryLimit"`
	// PSCpu stores ps cpu,match option --ps-cpu
	PSCpu string `yaml:"psCPU"`
	// PSCpuLimit stores ps cpu limit,match option --ps-cpu-limit
	PSCpuLimit string `yaml:"psCPULimit"`
	// PSGpu stores ps gpu,match option --ps-gpus
	PSGpu int `yaml:"psGPU"` // --ps-gpus
	// PSMemory stores the ps memory,match option --ps-memory
	PSMemory string `yaml:"psMemory"`
	// PSMemoryLimit stores the ps memory limit,match option --ps-memory-limit
	PSMemoryLimit string `yaml:"psMemoryLimit"`
	// SuccessPolicy defines the policy to mark the TFJob as succeeded.
	SuccessPolicy string `yaml:"successPolicy"`
	// CleanPodPolicy stores the cleaning pod policy,match option --clean-task-policy
	CleanPodPolicy string `yaml:"cleanPodPolicy"`
	// UseChief stores the using chief or not,match option --chief.
	// The tag sets only omitempty; no explicit yaml key is declared.
	UseChief bool `yaml:",omitempty"` // --chief
	// ChiefCount stores the chief count of job,match option --chief-count
	ChiefCount int `yaml:"chief"`
	// UseEvaluator is used to enable evaluator or not,match option --evaluator.
	// The tag sets only omitempty; no explicit yaml key is declared.
	UseEvaluator bool `yaml:",omitempty"`
	// ChiefPort stores the chief port,match option --chief-port
	ChiefPort int `yaml:"chiefPort"`
	//ChiefNodeSelectors map[string]string `yaml:"chiefNodeSelectors"` // --chief-selector
	// ChiefCpu stores the chief pod cpu,match option --chief-cpu
	ChiefCpu string `yaml:"chiefCPU"`
	// ChiefCpuLimit stores the chief pod cpu limit,match option --chief-cpu-limit
	ChiefCpuLimit string `yaml:"chiefCPULimit"`
	// ChiefMemory stores the chief pod memory,match option --chief-memory
	ChiefMemory string `yaml:"chiefMemory"`
	// ChiefMemoryLimit stores the chief pod memory limit,match option --chief-memory-limit
	ChiefMemoryLimit string `yaml:"chiefMemoryLimit"`
	// EvaluatorCpu stores the evaluator pod cpu,match option --evaluator-cpu
	EvaluatorCpu string `yaml:"evaluatorCPU"`
	// EvaluatorCpuLimit stores the evaluator pod cpu limit,match option --evaluator-cpu-limit
	EvaluatorCpuLimit string `yaml:"evaluatorCPULimit"`
	//EvaluatorNodeSelectors map[string]string `yaml:"evaluatorNodeSelectors"` // --evaluator-selector
	// EvaluatorMemory stores the evaluator pod memory,match option --evaluator-memory
	EvaluatorMemory string `yaml:"evaluatorMemory"` // --evaluatorMemory
	// EvaluatorMemoryLimit stores the evaluator pod memory limit,match option --evaluator-memory-limit
	EvaluatorMemoryLimit string `yaml:"evaluatorMemoryLimit"` // --evaluatorMemoryLimit
	// EvaluatorCount stores the evaluator pod count,match option --evaluator-count
	EvaluatorCount int `yaml:"evaluator"`
	// HasGangScheduler determines if it has gang scheduler
	HasGangScheduler bool `yaml:"hasGangScheduler"`
	// ActiveDeadlineSeconds Specifies the duration (in seconds) since startTime during which the job can remain active
	// before it is terminated
	ActiveDeadlineSeconds int64 `yaml:"activeDeadlineSeconds,omitempty"`
	// StartingDeadlineSeconds Specifies the duration (in seconds) since startTime during which the job can remain pending
	// before it is terminated
	StartingDeadlineSeconds int64 `yaml:"startingDeadlineSeconds,omitempty"`
	// Defines the TTL for cleaning up finished TFJobs. Defaults to infinite.
	TTLSecondsAfterFinished int32 `yaml:"ttlSecondsAfterFinished,omitempty"`
	// ShareMemory Specifies the shared memory size
	ShareMemory string `yaml:"shareMemory"`
	// CommonSubmitArgs carries the common submit arguments (inlined in yaml).
	CommonSubmitArgs `yaml:",inline"`

	// SubmitTensorboardArgs stores tensorboard information
	SubmitTensorboardArgs `yaml:",inline"`

	// SubmitSyncCodeArgs stores syncing code information
	SubmitSyncCodeArgs `yaml:",inline"`

	// TFRuntime stores the runtime; it is tagged `yaml:"-"` and so is excluded from yaml.
	TFRuntime `yaml:"-"`

	// TrainingOperatorCRD compatible with training-operator crd.
	TrainingOperatorCRD bool `yaml:"trainingOperatorCRD,omitempty"`
}

type SubmitTensorboardArgs

// SubmitTensorboardArgs stores the tensorboard-related submit arguments.
type SubmitTensorboardArgs struct {
	UseTensorboard   bool   `yaml:"useTensorboard"`             // --tensorboard
	TensorboardImage string `yaml:"tensorboardImage,omitempty"` // --tensorboardImage
	TrainingLogdir   string `yaml:"trainingLogdir"`             // --logdir
	// NOTE(review): HostLogPath and IsLocalLogging carry no flag comment in the
	// original; only their yaml keys are visible here.
	HostLogPath    string `yaml:"hostLogPath"`
	IsLocalLogging bool   `yaml:"isLocalLogging"`
}

SubmitTensorboardArgs is used to store tensorboard information

type SubmitVolcanoJobArgs

// SubmitVolcanoJobArgs holds the submit arguments declared for a volcano job.
// Name, Namespace, TrainingType and Command declare no yaml tag; the remaining
// fields use explicit camelCase yaml keys.
type SubmitVolcanoJobArgs struct {
	// Name stores the job name
	Name string
	// Namespace stores the namespace of job
	Namespace string
	// TrainingType is used to accept job type
	TrainingType TrainingJobType
	// Command defines the job command
	Command string
	// MinAvailable is the minimum number of available pods required to run this Job
	MinAvailable int `yaml:"minAvailable"`
	// Queue specifies the queue that will be used in the scheduler; the "default" queue is used if this is left empty.
	Queue string `yaml:"queue"`
	// SchedulerName is the default value of `tasks.template.spec.schedulerName`.
	SchedulerName string `yaml:"schedulerName"`
	// TaskName specifies the name of task
	TaskName string `yaml:"taskName"`
	// TaskImages specifies the task images
	TaskImages []string `yaml:"taskImages"`
	// TaskReplicas specifies the replicas of this Task in Job
	TaskReplicas int `yaml:"taskReplicas"`
	// TaskCPU specifies the cpu resource required for each replica of Task in Job. default is 250m
	TaskCPU string `yaml:"taskCPU"`
	// TaskMemory specifies the memory resource required for each replica of Task in Job. default is 128Mi
	TaskMemory string `yaml:"taskMemory"`
	// TaskPort specifies the task port
	TaskPort int `yaml:"taskPort"`
	// Annotations defines the pod annotations of the job; matches option --annotation
	Annotations map[string]string `yaml:"annotations"`

	// Labels specifies the job labels, which also take effect on the pods
	Labels map[string]string `yaml:"labels"`
}

type TFRuntime

// TFRuntime is the customized runtime interface for tf training. It embeds
// Runtime and adds two hooks that receive the tfjob submit arguments.
type TFRuntime interface {
	// Check checks the tfjob args and reports a problem through err.
	Check(tf *SubmitTFJobArgs) (err error)
	// Transform transforms the tfjob; it receives a pointer to the args.
	// NOTE(review): presumably it mutates tf in place — confirm against implementations.
	Transform(tf *SubmitTFJobArgs) (err error)
	Runtime
}

Customized runtime for tf training

type TensorFlowServingArgs

// TensorFlowServingArgs holds the arguments declared for a tensorflow serving
// job, together with the embedded CommonServingArgs (inlined in yaml).
type TensorFlowServingArgs struct {
	VersionPolicy        string `yaml:"versionPolicy"`        // --version-policy
	ModelConfigFile      string `yaml:"modelConfigFile"`      // --model-config-file
	MonitoringConfigFile string `yaml:"monitoringConfigFile"` // --monitoring-config-file
	ModelPath            string `yaml:"modelPath"`            // --model-path
	Port                 int    `yaml:"port"`                 // --port
	RestfulPort          int    `yaml:"restApiPort"`          // --restful-port
	CommonServingArgs    `yaml:",inline"`
}

type TensorRTServingArgs

// TensorRTServingArgs holds the arguments declared for a tensorrt serving job
// (model store, metrics/http/grpc ports, metrics switch), together with the
// embedded CommonServingArgs (inlined in yaml).
type TensorRTServingArgs struct {
	ModelStore        string `yaml:"modelStore"`   // --modelStore
	MetricsPort       int    `yaml:"metricsPort"`  // --metricsPort
	HttpPort          int    `yaml:"httpPort"`     // --httpPort
	GrpcPort          int    `yaml:"grpcPort"`     // --grpcPort
	AllowMetrics      bool   `yaml:"allowMetrics"` // --allowMetrics
	CommonServingArgs `yaml:",inline"`
}

type TolerationArgs added in v0.9.2

// TolerationArgs describes a single toleration by key, value, operator and
// effect. Every field is a string and is omitted from yaml when empty.
type TolerationArgs struct {
	Key      string `yaml:"key,omitempty"`
	Value    string `yaml:"value,omitempty"`
	Operator string `yaml:"operator,omitempty"`
	Effect   string `yaml:"effect,omitempty"`
}

type TrafficRouterSplitArgs

// TrafficRouterSplitArgs holds the arguments declared for splitting traffic
// between serving versions.
type TrafficRouterSplitArgs struct {
	ServingName string `yaml:"servingName,omitempty"` //--name
	Namespace   string `yaml:"namespace,omitempty"`   //--namespace
	Versions    string `yaml:"versions,omitempty"`    //--versions
	Weights     string `yaml:"weights,omitempty"`     //--weights
	// VersionWeights declares no yaml tag and no flag comment.
	// NOTE(review): presumably derived from Versions and Weights — confirm at the point of use.
	VersionWeights []ServingVersionWeight
}

type TrainingJobInfo

// TrainingJobInfo stores training job information. Most fields declare both a
// json and a yaml key with the same name; the three model fields at the end
// declare a json key only.
type TrainingJobInfo struct {
	// The unique identity of the training job
	UUID string `json:"uuid" yaml:"uuid"`
	// The name of the training job
	Name string `json:"name" yaml:"name"`
	// The namespace of the training job
	Namespace string `json:"namespace" yaml:"namespace"`
	// The duration of the training job, stored as a string
	Duration string `json:"duration" yaml:"duration"`
	// The status of the training Job
	Status TrainingJobStatus `json:"status" yaml:"status"`

	// The training type of the training job
	Trainer TrainingJobType `json:"trainer" yaml:"trainer"`
	// The tensorboard of the training job
	Tensorboard string `json:"tensorboard" yaml:"tensorboard"`

	// The name of the chief Instance
	ChiefName string `json:"chiefName" yaml:"chiefName"`

	// The instances under the training job
	Instances []TrainingJobInstance `json:"instances" yaml:"instances"`

	// The priority of the training job
	Priority string `json:"priority" yaml:"priority"`

	// RequestGPU stores the request gpus
	RequestGPU int64 `json:"requestGPUs" yaml:"requestGPUs"`

	// AllocatedGPU stores the allocated gpus
	AllocatedGPU int64 `json:"allocatedGPUs" yaml:"allocatedGPUs"`

	// CreationTimestamp stores the creation timestamp of job
	// NOTE(review): the unit of this int64 is not stated here — confirm before comparing values.
	CreationTimestamp int64 `json:"creationTimestamp" yaml:"creationTimestamp"`

	// Model information associated with this job (no yaml tag is declared on these fields)
	ModelName    string `json:"modelName"`
	ModelVersion string `json:"modelVersion"`
	ModelSource  string `json:"modelSource"`
}

TrainingJobInfo stores training job information

type TrainingJobInstance

// TrainingJobInstance defines one instance of a training job. Status, Name,
// Age and Node declare a json key only; the other fields declare matching
// json and yaml keys.
type TrainingJobInstance struct {
	// IP defines the instance ip
	IP string `json:"ip" yaml:"ip"`
	// Status is the status of the instance
	Status string `json:"status"`
	// Name is the name of the instance
	Name string `json:"name"`
	// Age is the age of the instance, stored as a string
	Age string `json:"age"`
	// Node is the node the instance runs on
	Node string `json:"node"`
	// NodeIP stores the node ip
	NodeIP string `json:"nodeIP" yaml:"nodeIP"`
	// IsChief tells whether the instance is chief or not; its json/yaml key is "chief"
	IsChief bool `json:"chief" yaml:"chief"`
	// RequestGPUs is used to store request gpu count
	RequestGPUs int `json:"requestGPUs" yaml:"requestGPUs"`
	// GPUMetrics stores the gpu metrics in a string-keyed map
	// NOTE(review): what the map key identifies is not stated here — confirm against the producer.
	GPUMetrics map[string]GpuMetric `json:"gpuMetrics" yaml:"gpuMetrics"`
	// CreationTimestamp stores the creation timestamp of the instance
	CreationTimestamp int64 `json:"creationTimestamp" yaml:"creationTimestamp"`
}

TrainingJobInstance defines the instance of training job

type TrainingJobStatus

// TrainingJobStatus is a string-backed type that names the status of a training job.
type TrainingJobStatus string

TrainingJobStatus defines all the kinds of JobStatus

// Values of TrainingJobStatus. Each constant's string value is upper-case.
const (
	// TrainingJobQueuing means the job is queuing
	TrainingJobQueuing TrainingJobStatus = "QUEUING"
	// TrainingJobPending means the job is pending
	TrainingJobPending TrainingJobStatus = "PENDING"
	// TrainingJobRunning means the job is running
	TrainingJobRunning TrainingJobStatus = "RUNNING"
	// TrainingJobSucceeded means the job has succeeded
	TrainingJobSucceeded TrainingJobStatus = "SUCCEEDED"
	// TrainingJobFailed means the job has failed
	TrainingJobFailed TrainingJobStatus = "FAILED"
)

type TrainingJobType

// TrainingJobType is a string-backed type that names a kind of training job.
type TrainingJobType string

TrainingJobType defines the supported training job types

// Values of TrainingJobType. Each constant's string value is lower-case;
// AllTrainingJob is the empty string.
const (
	// TFTrainingJob defines the tfjob
	TFTrainingJob TrainingJobType = "tfjob"
	// MPITrainingJob defines the mpijob
	MPITrainingJob TrainingJobType = "mpijob"
	// PytorchTrainingJob defines the pytorchjob
	PytorchTrainingJob TrainingJobType = "pytorchjob"
	// HorovodTrainingJob defines the horovod job
	HorovodTrainingJob TrainingJobType = "horovodjob"
	// VolcanoTrainingJob defines the volcano job
	VolcanoTrainingJob TrainingJobType = "volcanojob"
	// ETTrainingJob defines the etjob
	ETTrainingJob TrainingJobType = "etjob"
	// SparkTrainingJob defines the spark job
	SparkTrainingJob TrainingJobType = "sparkjob"
	// DeepSpeedTrainingJob defines the deepspeed job
	DeepSpeedTrainingJob TrainingJobType = "deepspeedjob"
	// AllTrainingJob represents all job types (the empty string)
	AllTrainingJob TrainingJobType = ""
	// UnknownTrainingJob defines the unknown training job type
	UnknownTrainingJob TrainingJobType = "unknown"
	// RayJob defines the ray job
	RayJob TrainingJobType = "rayjob"
)

type TrainingJobTypeInfo

// TrainingJobTypeInfo pairs a TrainingJobType with two additional string
// labels. No serialization tags are declared.
type TrainingJobTypeInfo struct {
	// Name is the training job type.
	Name TrainingJobType
	// Alias and Shorthand are alternative labels for the type.
	// NOTE(review): how they differ is not visible here — confirm at the point of use.
	Alias     string
	Shorthand string
}

type TritonServingArgs added in v0.8.5

// TritonServingArgs holds the arguments declared for a triton serving job,
// together with the embedded CommonServingArgs (inlined in yaml).
type TritonServingArgs struct {
	Backend           string   `yaml:"backend"`         // --backend
	ModelRepository   string   `yaml:"modelRepository"` // --model-repository
	MetricsPort       int      `yaml:"metricsPort"`     // --metrics-port
	HttpPort          int      `yaml:"httpPort"`        // --http-port
	GrpcPort          int      `yaml:"grpcPort"`        // --grpc-port
	AllowMetrics      bool     `yaml:"allowMetrics"`    // --allow-metrics
	LoadModels        []string `yaml:"loadModels"`      // --load-model
	ExtendCommand     string   `yaml:"extendCommand"`   // --extend-command
	CommonServingArgs `yaml:",inline"`
}

type UpdateCustomServingArgs added in v0.8.9

// UpdateCustomServingArgs holds the update arguments for a custom serving job.
// It adds no fields of its own beyond the embedded CommonUpdateServingArgs
// (inlined in yaml).
type UpdateCustomServingArgs struct {
	CommonUpdateServingArgs `yaml:",inline"`
}

type UpdateDistributedServingArgs added in v0.12.0

// UpdateDistributedServingArgs holds the update arguments for a distributed
// serving job: a worker count plus parallel master/worker resource and command
// settings, together with the embedded CommonUpdateServingArgs (inlined in yaml).
type UpdateDistributedServingArgs struct {
	Workers                 int    `yaml:"workers"`         // --workers
	MasterCpu               string `yaml:"masterCPU"`       // --master-cpu
	WorkerCpu               string `yaml:"workerCPU"`       // --worker-cpu
	MasterGPUCount          int    `yaml:"masterGPUCount"`  // --master-gpus
	WorkerGPUCount          int    `yaml:"workerGPUCount"`  // --worker-gpus
	MasterMemory            string `yaml:"masterMemory"`    // --master-memory
	WorkerMemory            string `yaml:"workerMemory"`    // --worker-memory
	MasterGPUMemory         int    `yaml:"masterGPUMemory"` // --master-gpumemory
	WorkerGPUMemory         int    `yaml:"workerGPUMemory"` // --worker-gpumemory
	MasterGPUCore           int    `yaml:"masterGPUCore"`   // --master-gpucore
	WorkerGPUCore           int    `yaml:"workerGPUCore"`   // --worker-gpucore
	MasterCommand           string `yaml:"masterCommand"`   // --master-command
	WorkerCommand           string `yaml:"workerCommand"`   // --worker-command
	CommonUpdateServingArgs `yaml:",inline"`
}

type UpdateKServeArgs added in v0.9.11

// UpdateKServeArgs holds the update arguments for a kserve serving job: model
// format/runtime/storage settings, replica and scaling settings, concurrency,
// timeout, canary traffic percent and port, together with the embedded
// CommonUpdateServingArgs (inlined in yaml).
// TimeoutSeconds is serialized under the yaml key "timeout";
// CanaryTrafficPercent is the only field omitted from yaml when zero.
type UpdateKServeArgs struct {
	ModelFormat             *ModelFormat `yaml:"modelFormat"`                    // --model-format
	Runtime                 string       `yaml:"runtime"`                        // --runtime
	StorageUri              string       `yaml:"storageUri"`                     // --storageUri
	RuntimeVersion          string       `yaml:"runtimeVersion"`                 // --runtime-version
	ProtocolVersion         string       `yaml:"protocolVersion"`                // --protocol-version
	MinReplicas             int          `yaml:"minReplicas"`                    // --min-replicas
	MaxReplicas             int          `yaml:"maxReplicas"`                    // --max-replicas
	ScaleTarget             int          `yaml:"scaleTarget"`                    // --scale-target
	ScaleMetric             string       `yaml:"scaleMetric"`                    // --scale-metric
	ContainerConcurrency    int64        `yaml:"containerConcurrency"`           // --container-concurrency
	TimeoutSeconds          int64        `yaml:"timeout"`                        // --timeout
	CanaryTrafficPercent    int64        `yaml:"canaryTrafficPercent,omitempty"` // --canary-traffic-percent
	Port                    int          `yaml:"port"`                           // --port
	CommonUpdateServingArgs `yaml:",inline"`
}

type UpdateTensorFlowServingArgs added in v0.8.9

// UpdateTensorFlowServingArgs holds the update arguments for a tensorflow
// serving job (config files, model name and model path), together with the
// embedded CommonUpdateServingArgs (inlined in yaml).
type UpdateTensorFlowServingArgs struct {
	ModelConfigFile         string `yaml:"modelConfigFile"`      // --model-config-file
	MonitoringConfigFile    string `yaml:"monitoringConfigFile"` // --monitoring-config-file
	ModelName               string `yaml:"modelName"`            // --model-name
	ModelPath               string `yaml:"modelPath"`            // --model-path
	CommonUpdateServingArgs `yaml:",inline"`
}

type UpdateTritonServingArgs added in v0.8.9

// UpdateTritonServingArgs holds the update arguments for a triton serving job
// (model repository and metrics switch), together with the embedded
// CommonUpdateServingArgs (inlined in yaml).
type UpdateTritonServingArgs struct {
	ModelRepository         string `yaml:"modelRepository"` // --model-repository
	AllowMetrics            bool   `yaml:"allowMetrics"`    // --allow-metrics
	CommonUpdateServingArgs `yaml:",inline"`
}

type VirtualService

// VirtualService embeds a pointer to the istio v1alpha3 VirtualService and
// declares its own Http route list.
type VirtualService struct {
	*istiov1alpha3.VirtualService
	// Http is serialized under the json key "http" and omitted when empty.
	// NOTE(review): presumably meant to take the place of the embedded type's
	// http routes using the package's own HTTPRoute — confirm.
	Http []*HTTPRoute `protobuf:"bytes,3,rep,name=http" json:"http,omitempty"`
}

type VirtualServiceCRD

// VirtualServiceCRD is the object wrapper around a VirtualService spec: it
// declares Kind and APIVersion, embeds metav1.ObjectMeta under the key
// "metadata", and carries the VirtualService under the key "spec".
type VirtualServiceCRD struct {
	// Kind is a string value representing the REST resource this object represents.
	// Servers may infer this from the endpoint the client submits requests to.
	// Cannot be updated.
	// In CamelCase.
	// More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds
	// +optional
	Kind string `json:"kind,omitempty" protobuf:"bytes,1,opt,name=kind"`

	// APIVersion defines the versioned schema of this representation of an object.
	// Servers should convert recognized schemas to the latest internal value, and
	// may reject unrecognized values.
	// More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources
	// +optional
	APIVersion        string `json:"apiVersion,omitempty" protobuf:"bytes,2,opt,name=apiVersion"`
	metav1.ObjectMeta `json:"metadata,omitempty" yaml:"metadata,omitempty" protobuf:"bytes,1,opt,name=metadata"`
	Spec              VirtualService `json:"spec,omitempty" yaml:"spec,omitempty" protobuf:"bytes,2,opt,name=spec"`
}

type WorkerGroupSpec added in v0.11.0

// WorkerGroupSpec is the spec for the worker pods of one worker group: image,
// cpu/memory/gpu settings and replica bounds.
type WorkerGroupSpec struct {
	Image  string `yaml:"image"`
	Cpu    string `yaml:"cpu"`
	Memory string `yaml:"memory"`
	Gpu    int    `yaml:"gpu"`
	// Replicas is the number of desired Pods for this worker group.
	// The key is omitted from yaml when the value is zero.
	// +kubebuilder:default:=0
	Replicas int32 `yaml:"replicas,omitempty"`
	// MinReplicas denotes the minimum number of desired Pods for this worker group.
	// +kubebuilder:default:=0
	MinReplicas int32 `yaml:"minReplicas"`
	// MaxReplicas denotes the maximum number of desired Pods for this worker group, and the default value is maxInt32.
	// +kubebuilder:default:=2147483647
	MaxReplicas int32 `yaml:"maxReplicas"`
	// NumOfHosts denotes the number of hosts to create per replica. The default value is 1.
	// The key is omitted from yaml when the value is zero.
	// +kubebuilder:default:=1
	NumOfHosts int32 `yaml:"numOfHosts,omitempty"`
}

WorkerGroupSpec are the specs for the worker pods

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL