Documentation ¶
Overview ¶
Copyright 2018 The Kubeflow Authors
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
Index ¶
- Constants
- Variables
- func BuildJobInfo(job TrainingJob) *types.JobInfo
- func GetJobDashboards(dashboard string, job *v1.Job, pods []corev1.Pod) []string
- func GetJobRealStatus(job TrainingJob) string
- func GetNamespace() string
- func GetPrometheusServiceName(client *kubernetes.Clientset) (name string, ns string)
- func GetResourcesEvents(client *kubernetes.Clientset, namespace string, resources []Resource) (map[string][]v1.Event, error)
- func GpuMonitoringInstalled(client *kubernetes.Clientset) bool
- func ListServing(client *kubernetes.Clientset) ([]servejob.Serving, error)
- func ListServingJobsByHelm() ([]servejob.Serving, error)
- func ListServingsByName(client *kubernetes.Clientset, name string) (servings []servejob.Serving, err error)
- func NewCommand() *cobra.Command
- func NewCompletionCommand() *cobra.Command
- func NewDataCommand() *cobra.Command
- func NewDataListCommand() *cobra.Command
- func NewDeleteCommand() *cobra.Command
- func NewGetCommand() *cobra.Command
- func NewListCommand() *cobra.Command
- func NewLogViewerCommand() *cobra.Command
- func NewLogsCommand() *cobra.Command
- func NewPruneCommand() *cobra.Command
- func NewServeCommand() *cobra.Command
- func NewServingCustomCommand() *cobra.Command
- func NewServingDeleteCommand() *cobra.Command
- func NewServingGetCommand() *cobra.Command
- func NewServingListCommand() *cobra.Command
- func NewServingLogCommand() *cobra.Command
- func NewServingTensorFlowCommand() *cobra.Command
- func NewServingTensorRTCommand() *cobra.Command
- func NewSparkApplicationCommand() *cobra.Command
- func NewSubmitCommand() *cobra.Command
- func NewSubmitHorovodJobCommand() *cobra.Command
- func NewSubmitMPIJobCommand() *cobra.Command
- func NewSubmitPyTorchJobCommand() *cobra.Command
- func NewSubmitSparkJobArgs() *submitSparkJobArgs
- func NewSubmitStandaloneJobCommand() *cobra.Command
- func NewSubmitTFJobCommand() *cobra.Command
- func NewSubmitVolcanoJobArgs() *submitVolcanoJobArgs
- func NewTopCommand() *cobra.Command
- func NewTopJobCommand() *cobra.Command
- func NewTopNodeCommand() *cobra.Command
- func NewTrafficRouterSplitCommand() *cobra.Command
- func NewVersionCmd(cliName string) *cobra.Command
- func NewVolcanoJobCommand() *cobra.Command
- func ParseMountPath(dataset []string) (err error)
- func PrintLine(w io.Writer, fields ...string)
- func SortMapKeys(podMetric PodGpuMetric) []string
- type BasicJobInfo
- type Destination
- type DestinationRuleCRD
- type DestinationWeight
- type Driver
- type Executor
- type GpuMetric
- type GpuMetricInfo
- type HTTPMatchRequest
- type HTTPRoute
- type HorovodJob
- type HorovodJobTrainer
- type JobGpuMetric
- type JobInfo
- func (ji *JobInfo) Age() time.Duration
- func (ji *JobInfo) AllPods() []v1.Pod
- func (ji *JobInfo) AllocatedGPU() int64
- func (ji *JobInfo) ChiefPod() v1.Pod
- func (ji *JobInfo) Duration() time.Duration
- func (ji *JobInfo) GetStatus() (status string)
- func (ji *JobInfo) HostIPOfChief() (hostIP string)
- func (ji *JobInfo) Name() string
- func (ji *JobInfo) Namespace() string
- func (ji *JobInfo) RequestedGPU() int64
- func (ji *JobInfo) StartTime() *metav1.Time
- func (ji *JobInfo) Trainer() string
- func (ji *JobInfo) Uid() string
- type MPIJob
- func (mj *MPIJob) Age() time.Duration
- func (mj *MPIJob) AllPods() []v1.Pod
- func (mj *MPIJob) AllocatedGPU() int64
- func (mj *MPIJob) ChiefPod() v1.Pod
- func (mj *MPIJob) Duration() time.Duration
- func (mj *MPIJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)
- func (m *MPIJob) GetPriorityClass() string
- func (mj *MPIJob) GetStatus() (status string)
- func (mj *MPIJob) HostIPOfChief() (hostIP string)
- func (mj *MPIJob) Name() string
- func (mj *MPIJob) Namespace() string
- func (mj *MPIJob) RequestedGPU() int64
- func (mj *MPIJob) StartTime() *metav1.Time
- func (mj *MPIJob) Trainer() string
- func (mj *MPIJob) Uid() string
- type MPIJobTrainer
- type NodeDescriber
- type NodeInfo
- type PodGpuMetric
- type PortSelector
- type PreprocesObject
- type PrintArgs
- type PrometheusMetric
- type PrometheusMetricData
- type PrometheusMetricResult
- type PrometheusMetricValue
- type PruneArgs
- type PyTorchJob
- func (pj *PyTorchJob) Age() time.Duration
- func (pj *PyTorchJob) AllPods() []v1.Pod
- func (pj *PyTorchJob) AllocatedGPU() int64
- func (pj *PyTorchJob) ChiefPod() v1.Pod
- func (pj *PyTorchJob) Duration() time.Duration
- func (pj *PyTorchJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)
- func (p *PyTorchJob) GetPriorityClass() string
- func (pj *PyTorchJob) GetStatus() (status string)
- func (pj *PyTorchJob) HostIPOfChief() (hostIP string)
- func (pj *PyTorchJob) Name() string
- func (pj *PyTorchJob) Namespace() string
- func (pj *PyTorchJob) RequestedGPU() int64
- func (pj *PyTorchJob) StartTime() *metav1.Time
- func (pj *PyTorchJob) Trainer() string
- func (pj *PyTorchJob) Uid() string
- type PyTorchJobTrainer
- type Resource
- type ResourceType
- type Runtime
- type ServeArgs
- type ServeCustomArgs
- type ServeTensorFlowArgs
- type ServeTensorRTArgs
- type SortPodConditionByLastTransitionTime
- type SparkJob
- func (sj *SparkJob) Age() time.Duration
- func (sj *SparkJob) AllPods() []v1.Pod
- func (sj *SparkJob) AllocatedGPU() int64
- func (sj *SparkJob) ChiefPod() v1.Pod
- func (sj *SparkJob) Duration() time.Duration
- func (sj *SparkJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)
- func (sj *SparkJob) GetPriorityClass() string
- func (sj *SparkJob) GetStatus() (status string)
- func (sj *SparkJob) HostIPOfChief() (hostIP string)
- func (sj *SparkJob) Name() string
- func (sj *SparkJob) Namespace() string
- func (sj *SparkJob) RequestedGPU() int64
- func (sj *SparkJob) StartTime() *metav1.Time
- func (sj *SparkJob) Trainer() string
- func (sj *SparkJob) Uid() string
- type SparkJobTrainer
- type StandaloneJob
- type StandaloneJobTrainer
- type StringMatchPrefix
- type TensorFlowJob
- func (tj *TensorFlowJob) Age() time.Duration
- func (tj *TensorFlowJob) AllPods() []v1.Pod
- func (tj *TensorFlowJob) AllocatedGPU() int64
- func (tj *TensorFlowJob) ChiefPod() v1.Pod
- func (tj *TensorFlowJob) Duration() time.Duration
- func (tj *TensorFlowJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)
- func (t *TensorFlowJob) GetPriorityClass() string
- func (tj *TensorFlowJob) GetStatus() (status string)
- func (tj *TensorFlowJob) HostIPOfChief() (hostIP string)
- func (tj *TensorFlowJob) Name() string
- func (tj *TensorFlowJob) Namespace() string
- func (tj *TensorFlowJob) RequestedGPU() int64
- func (tj *TensorFlowJob) StartTime() *metav1.Time
- func (tj *TensorFlowJob) Trainer() string
- func (tj *TensorFlowJob) Uid() string
- type TensorFlowJobTrainer
- type Trainer
- func NewHorovodJobTrainer(client *kubernetes.Clientset) Trainer
- func NewMPIJobTrainer(client *kubernetes.Clientset) Trainer
- func NewPyTorchJobTrainer(client *kubernetes.Clientset) Trainer
- func NewSparkJobTrainer(client *kubernetes.Clientset) Trainer
- func NewStandaloneJobTrainer(client *kubernetes.Clientset) Trainer
- func NewTensorFlowJobTrainer(client *kubernetes.Clientset) Trainer
- func NewTrainers(client *kubernetes.Clientset) []Trainer
- func NewVolcanoJobTrainer(client *kubernetes.Clientset) Trainer
- type TrainingJob
- type VirtualService
- type VirtualServiceCRD
- type VolcanoJob
- func (vj *VolcanoJob) Age() time.Duration
- func (vj *VolcanoJob) AllPods() []v1.Pod
- func (vj *VolcanoJob) AllocatedGPU() int64
- func (vj *VolcanoJob) ChiefPod() v1.Pod
- func (vj *VolcanoJob) Duration() time.Duration
- func (vj *VolcanoJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)
- func (vj *VolcanoJob) GetPriorityClass() string
- func (vj *VolcanoJob) GetStatus() (status string)
- func (vj *VolcanoJob) HostIPOfChief() (hostIP string)
- func (vj *VolcanoJob) Name() string
- func (vj *VolcanoJob) Namespace() string
- func (vj *VolcanoJob) RequestedGPU() int64
- func (vj *VolcanoJob) StartTime() *metav1.Time
- func (vj *VolcanoJob) Trainer() string
- func (vj *VolcanoJob) Uid() string
- type VolcanoJobTrainer
- func (st *VolcanoJobTrainer) GetTrainingJob(name, namespace string) (job TrainingJob, err error)
- func (st *VolcanoJobTrainer) GetTrainingJobAtSubmit(name, namespace string) (job TrainingJob, err error)
- func (st *VolcanoJobTrainer) IsSupported(name, ns string) bool
- func (st *VolcanoJobTrainer) ListTrainingJobs() (jobs []TrainingJob, err error)
- func (st *VolcanoJobTrainer) Type() string
Constants ¶
const ( RecommendedConfigPathEnvVar = "ARENA_CONFIG" DefaultArenaConfigPath = "~/.arena/config" )
const ( CHART_PKG_LOC = "CHARTREPO" // GPUResourceName is the extended name of the GPU resource since v1.8 // this uses the device plugin mechanism NVIDIAGPUResourceName = "nvidia.com/gpu" ALIYUNGPUResourceName = "aliyun.com/gpu-mem" DeprecatedNVIDIAGPUResourceName = "alpha.kubernetes.io/nvidia-gpu" )
const (
// CLIName is the name of the CLI
CLIName = "arena"
)
const KUBEFLOW_NAMESPACE = "kubeflow"
const KUBE_SYSTEM_NAMESPACE = "kube-system"
const POD_METRIC_TMP = `{__name__=~"%s", pod_name=~"%s"}`
const PROMETHEUS_INSTALL_DOC_URL = "https://github.com/kubeflow/arena/blob/master/docs/userguide/9-top-job-gpu-metric.md"
const PROMETHEUS_SCHEME = "http"
const PROMETHEUS_SVC_LABEL = "kubernetes.io/name=Prometheus"
const ResourceTypeJob = ResourceType("Job")
const ResourceTypePod = ResourceType("Pod")
const ResourceTypeStatefulSet = ResourceType("StatefulSet")
Variables ¶
var GPU_METRIC_LIST = []string{"nvidia_gpu_duty_cycle", "nvidia_gpu_memory_used_bytes", "nvidia_gpu_memory_total_bytes"}
Functions ¶
func BuildJobInfo ¶ added in v0.2.0
func BuildJobInfo(job TrainingJob) *types.JobInfo
BuildJobInfo returns the types.JobInfo for the given training job.
func GetJobDashboards ¶
func GetJobRealStatus ¶
func GetJobRealStatus(job TrainingJob) string
Get the real job status. When some pods are still pending, the tfJob still shows as Running, but it should be Pending.
func GetNamespace ¶ added in v0.2.0
func GetNamespace() string
func GetPrometheusServiceName ¶
func GetPrometheusServiceName(client *kubernetes.Clientset) (name string, ns string)
Get the Prometheus service name and its namespace, looking in different namespaces.
func GetResourcesEvents ¶ added in v0.3.0
func GetResourcesEvents(client *kubernetes.Clientset, namespace string, resources []Resource) (map[string][]v1.Event, error)
Get Event of the Job
func GpuMonitoringInstalled ¶
func GpuMonitoringInstalled(client *kubernetes.Clientset) bool
func ListServing ¶ added in v0.2.0
func ListServing(client *kubernetes.Clientset) ([]servejob.Serving, error)
ListServing returns a list of serving
func ListServingJobsByHelm ¶ added in v0.2.0
func ListServingsByName ¶ added in v0.3.0
func ListServingsByName(client *kubernetes.Clientset, name string) (servings []servejob.Serving, err error)
List Servings by name
func NewCommand ¶
NewCommand returns a new instance of an Arena command
func NewCompletionCommand ¶
func NewListCommand ¶
func NewLogViewerCommand ¶
func NewLogsCommand ¶
func NewPruneCommand ¶
func NewServeCommand ¶
func NewServingCustomCommand ¶ added in v0.3.0
func NewServingGetCommand ¶ added in v0.3.0
NewServingGetCommand starts the command
func NewServingListCommand ¶
func NewServingLogCommand ¶ added in v0.3.1
func NewServingTensorRTCommand ¶ added in v0.2.0
func NewSparkApplicationCommand ¶ added in v0.2.0
See https://github.com/GoogleCloudPlatform/spark-on-k8s-operator
SparkApplication is supported by default; ScheduledSparkApplication is not supported.
func NewSubmitCommand ¶
func NewSubmitHorovodJobCommand ¶
NewSubmitHorovodJobCommand
func NewSubmitMPIJobCommand ¶
func NewSubmitPyTorchJobCommand ¶ added in v0.5.0
func NewSubmitSparkJobArgs ¶ added in v0.2.0
func NewSubmitSparkJobArgs() *submitSparkJobArgs
func NewSubmitTFJobCommand ¶
func NewSubmitVolcanoJobArgs ¶ added in v0.2.0
func NewSubmitVolcanoJobArgs() *submitVolcanoJobArgs
func NewTopCommand ¶
func NewTopJobCommand ¶
func NewTopNodeCommand ¶
func NewVersionCmd ¶
func NewVolcanoJobCommand ¶ added in v0.2.0
func ParseMountPath ¶
func SortMapKeys ¶
func SortMapKeys(podMetric PodGpuMetric) []string
Types ¶
type BasicJobInfo ¶ added in v0.3.0
type BasicJobInfo struct {
// contains filtered or unexported fields
}
func (*BasicJobInfo) Resources ¶ added in v0.3.0
func (j *BasicJobInfo) Resources() []Resource
type Destination ¶
type Destination struct { *istiov1alpha3.Destination Port *PortSelector `protobuf:"bytes,3,opt,name=port" json:"port,omitempty"` }
type DestinationRuleCRD ¶
type DestinationRuleCRD struct { // Kind is a string value representing the REST resource this object represents. // Servers may infer this from the endpoint the client submits requests to. // Cannot be updated. // In CamelCase. // More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds // +optional Kind string `json:"kind,omitempty" protobuf:"bytes,1,opt,name=kind"` // APIVersion defines the versioned schema of this representation of an object. // Servers should convert recognized schemas to the latest internal value, and // may reject unrecognized values. // More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources // +optional APIVersion string `json:"apiVersion,omitempty" protobuf:"bytes,2,opt,name=apiVersion"` metav1.ObjectMeta `json:"metadata,omitempty" yaml:"metadata,omitempty" protobuf:"bytes,1,opt,name=metadata"` Spec istiov1alpha3.DestinationRule `json:"spec,omitempty" yaml:"spec,omitempty" protobuf:"bytes,2,opt,name=spec"` }
type DestinationWeight ¶
type DestinationWeight struct { Destination *Destination `protobuf:"bytes,1,opt,name=destination" json:"destination,omitempty"` Weight int32 `protobuf:"varint,2,opt,name=weight,proto3" json:"weight"` }
type GpuMetricInfo ¶
type GpuMetricInfo struct { MetricName string Value string Time float64 PodName string PodNamespace string ContainerName string NodeName string GPUUID string Id string }
func QueryMetricByPrometheus ¶
func QueryMetricByPrometheus(client *kubernetes.Clientset, prometheusServiceName string, namespace string, query string) ([]GpuMetricInfo, error)
type HTTPMatchRequest ¶
type HTTPMatchRequest struct { *istiov1alpha3.HTTPMatchRequest Uri *StringMatchPrefix `protobuf:"bytes,1,opt,name=uri" json:"uri,omitempty"` }
type HTTPRoute ¶
type HTTPRoute struct { *istiov1alpha3.HTTPRoute Match []*HTTPMatchRequest `protobuf:"bytes,1,rep,name=match" json:"match,omitempty"` Route []*DestinationWeight `protobuf:"bytes,2,rep,name=route" json:"route,omitempty"` }
type HorovodJob ¶
type HorovodJob struct {
*JobInfo
}
Horovod Job Information
func (*HorovodJob) AllPods ¶
func (hj *HorovodJob) AllPods() []v1.Pod
Get all the pods of the Training Job
func (*HorovodJob) GetJobDashboards ¶
func (hj *HorovodJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)
Get Dashboard url of the job
func (*HorovodJob) GetPriorityClass ¶ added in v0.3.0
func (hj *HorovodJob) GetPriorityClass() string
Get PriorityClass
func (*HorovodJob) HostIPOfChief ¶
func (hj *HorovodJob) HostIPOfChief() (hostIP string)
Get the hostIP of the chief Pod
type HorovodJobTrainer ¶
type HorovodJobTrainer struct {
// contains filtered or unexported fields
}
Horovod Job trainer
func (*HorovodJobTrainer) GetTrainingJob ¶
func (m *HorovodJobTrainer) GetTrainingJob(name, namespace string) (tj TrainingJob, err error)
func (*HorovodJobTrainer) IsSupported ¶
func (m *HorovodJobTrainer) IsSupported(name, ns string) bool
check if it's Horovod job
func (*HorovodJobTrainer) ListTrainingJobs ¶ added in v0.2.0
func (hj *HorovodJobTrainer) ListTrainingJobs() (jobs []TrainingJob, err error)
List Training jobs
func (*HorovodJobTrainer) Type ¶
func (m *HorovodJobTrainer) Type() string
type JobGpuMetric ¶
type JobGpuMetric map[string]PodGpuMetric
func GetJobGpuMetric ¶
func GetJobGpuMetric(client *kubernetes.Clientset, job TrainingJob) (jobMetric JobGpuMetric, err error)
func GetPodsGpuInfo ¶
func GetPodsGpuInfo(client *kubernetes.Clientset, prometheusServiceName string, namespace string, podNames []string) (JobGpuMetric, error)
func (JobGpuMetric) GetPodMetrics ¶
func (m JobGpuMetric) GetPodMetrics(podName string) PodGpuMetric
func (*JobGpuMetric) SetPodMetric ¶
func (m *JobGpuMetric) SetPodMetric(metric GpuMetricInfo)
type JobInfo ¶
type JobInfo struct { *BasicJobInfo // contains filtered or unexported fields }
func (*JobInfo) AllocatedGPU ¶
Allocated GPU count of the Job
func (*JobInfo) HostIPOfChief ¶
Get the hostIP of the chief Pod
func (*JobInfo) RequestedGPU ¶
Requested GPU count of the Job
type MPIJob ¶
type MPIJob struct { *BasicJobInfo // contains filtered or unexported fields }
MPI Job Information
func (*MPIJob) GetJobDashboards ¶
func (mj *MPIJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)
Get Dashboard url of the job
func (*MPIJob) GetPriorityClass ¶ added in v0.3.0
Get PriorityClass
func (*MPIJob) HostIPOfChief ¶
Get the hostIP of the chief Pod
type MPIJobTrainer ¶
type MPIJobTrainer struct {
// contains filtered or unexported fields
}
MPI Job trainer
func (*MPIJobTrainer) GetTrainingJob ¶
func (tt *MPIJobTrainer) GetTrainingJob(name, namespace string) (tj TrainingJob, err error)
Get the training job from cache or directly
func (*MPIJobTrainer) IsSupported ¶
func (tt *MPIJobTrainer) IsSupported(name, ns string) bool
check if it's MPI job
func (*MPIJobTrainer) ListTrainingJobs ¶ added in v0.2.0
func (tt *MPIJobTrainer) ListTrainingJobs() (jobs []TrainingJob, err error)
List Training jobs
type NodeDescriber ¶
type NodeDescriber struct {
// contains filtered or unexported fields
}
type PodGpuMetric ¶
type PortSelector ¶
type PortSelector struct { *istiov1alpha3.PortSelector Number uint32 `protobuf:"varint,1,opt,name=number,proto3,oneof" json:"number,omitempty"` }
type PreprocesObject ¶
type PreprocesObject struct { ServiceName string Namespace string DestinationRule DestinationRuleCRD VirtualService VirtualServiceCRD }
type PrometheusMetric ¶
type PrometheusMetric struct { Status string `json:"status,inline"` Data PrometheusMetricData `json:"data,omitempty"` }
type PrometheusMetricData ¶
type PrometheusMetricData struct { Result []PrometheusMetricResult `json:"result"` ResultType string `json:"resultType"` }
type PrometheusMetricResult ¶
type PrometheusMetricResult struct { Metric map[string]string `json:"metric"` Value []PrometheusMetricValue `json:"value"` }
type PrometheusMetricValue ¶
type PrometheusMetricValue interface{}
type PyTorchJob ¶ added in v0.5.0
type PyTorchJob struct { *BasicJobInfo // contains filtered or unexported fields }
PyTorch Job Information
func (*PyTorchJob) AllPods ¶ added in v0.5.0
func (pj *PyTorchJob) AllPods() []v1.Pod
Get all the pods of the Training Job
func (*PyTorchJob) AllocatedGPU ¶ added in v0.5.0
func (pj *PyTorchJob) AllocatedGPU() int64
Allocated GPU count of the Job
func (*PyTorchJob) ChiefPod ¶ added in v0.5.0
func (pj *PyTorchJob) ChiefPod() v1.Pod
Get the master Pod of the Job.
func (*PyTorchJob) Duration ¶ added in v0.5.0
func (pj *PyTorchJob) Duration() time.Duration
Get the Job Training Duration
func (*PyTorchJob) GetJobDashboards ¶ added in v0.5.0
func (pj *PyTorchJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)
Get Dashboard url of the job
func (*PyTorchJob) GetPriorityClass ¶ added in v0.5.0
func (p *PyTorchJob) GetPriorityClass() string
Get PriorityClass
func (*PyTorchJob) GetStatus ¶ added in v0.5.0
func (pj *PyTorchJob) GetStatus() (status string)
Get the Status of the Job: RUNNING, PENDING, SUCCEEDED, FAILED
func (*PyTorchJob) HostIPOfChief ¶ added in v0.5.0
func (pj *PyTorchJob) HostIPOfChief() (hostIP string)
Get the hostIP of the master Pod
func (*PyTorchJob) Name ¶ added in v0.5.0
func (pj *PyTorchJob) Name() string
func (*PyTorchJob) Namespace ¶ added in v0.5.0
func (pj *PyTorchJob) Namespace() string
func (*PyTorchJob) RequestedGPU ¶ added in v0.5.0
func (pj *PyTorchJob) RequestedGPU() int64
Requested GPU count of the Job
func (*PyTorchJob) StartTime ¶ added in v0.5.0
func (pj *PyTorchJob) StartTime() *metav1.Time
Get the start time
func (*PyTorchJob) Trainer ¶ added in v0.5.0
func (pj *PyTorchJob) Trainer() string
func (*PyTorchJob) Uid ¶ added in v0.5.0
func (pj *PyTorchJob) Uid() string
type PyTorchJobTrainer ¶ added in v0.5.0
type PyTorchJobTrainer struct {
// contains filtered or unexported fields
}
PyTorch Job trainer
func (*PyTorchJobTrainer) GetTrainingJob ¶ added in v0.5.0
func (tt *PyTorchJobTrainer) GetTrainingJob(name, namespace string) (tj TrainingJob, err error)
Get the training job from cache or directly
func (*PyTorchJobTrainer) IsSupported ¶ added in v0.5.0
func (tt *PyTorchJobTrainer) IsSupported(name, ns string) bool
check if it's PyTorch job
func (*PyTorchJobTrainer) ListTrainingJobs ¶ added in v0.5.0
func (tt *PyTorchJobTrainer) ListTrainingJobs() (jobs []TrainingJob, err error)
List Training jobs
func (*PyTorchJobTrainer) Type ¶ added in v0.5.0
func (tt *PyTorchJobTrainer) Type() string
Get the type
type Resource ¶ added in v0.3.0
type Resource struct { Name string Uid string ResourceType ResourceType }
type ResourceType ¶ added in v0.3.0
type ResourceType string
type Runtime ¶ added in v0.3.3
type Runtime interface {
// contains filtered or unexported methods
}
type ServeArgs ¶
type ServeArgs struct { ImagePullPolicy string `yaml:"imagePullPolicy"` // --imagePullPolicy GPUCount int `yaml:"gpuCount"` // --gpus GPUMemory int `yaml:"gpuMemory"` // --gpumemory Cpu string `yaml:"cpu"` // --cpu Memory string `yaml:"memory"` // --memory Envs map[string]string `yaml:"envs"` // --envs Command string `yaml:"command"` // --command Replicas int `yaml:"replicas"` // --replicas Port int `yaml:"port"` // --port RestfulPort int `yaml:"restApiPort"` // --restfulPort EnableIstio bool `yaml:"enableIstio"` // --enableIstio ExposeService bool `yaml:"exposeService"` // --exposeService ServingName string `yaml:"servingName"` // --servingName ServingVersion string `yaml:"servingVersion"` // --servingVersion ModelDirs map[string]string `yaml:"modelDirs"` NodeSelectors map[string]string `yaml:"nodeSelectors"` // --selector Tolerations []string `yaml:"tolerations"` // --toleration Annotations map[string]string `yaml:"annotations"` ModelServiceExists bool `yaml:"modelServiceExists"` // --modelServiceExists }
type ServeCustomArgs ¶ added in v0.3.0
type ServeTensorFlowArgs ¶
type ServeTensorFlowArgs struct { VersionPolicy string `yaml:"versionPolicy"` // --versionPolicy ModelConfigFile string `yaml:"modelConfigFile"` // --modelConfigFile ModelConfigFileContent string `yaml:"modelConfigFileContent"` Image string `yaml:"image"` // --image ModelName string `yaml:"modelName"` // --modelName ModelPath string `yaml:"modelPath"` // --modelPath ServeArgs `yaml:",inline"` }
type ServeTensorRTArgs ¶ added in v0.2.0
type ServeTensorRTArgs struct { Image string `yaml:"image"` // --image ModelStore string `yaml:"modelStore"` // --modelStore MetricsPort int `yaml:"metricsPort"` // --metricsPort HttpPort int `yaml:"httpPort"` // --httpPort GrpcPort int `yaml:"grpcPort"` // --grpcPort AllowMetrics bool `yaml:"allowMetrics"` // --allowMetrics ServeArgs `yaml:",inline"` }
type SortPodConditionByLastTransitionTime ¶ added in v0.2.0
type SortPodConditionByLastTransitionTime []v1.PodCondition
Sort the pod condition by time.
func (SortPodConditionByLastTransitionTime) Len ¶ added in v0.2.0
func (s SortPodConditionByLastTransitionTime) Len() int
func (SortPodConditionByLastTransitionTime) Less ¶ added in v0.2.0
func (s SortPodConditionByLastTransitionTime) Less(i, j int) bool
func (SortPodConditionByLastTransitionTime) Swap ¶ added in v0.2.0
func (s SortPodConditionByLastTransitionTime) Swap(i, j int)
type SparkJob ¶ added in v0.2.0
type SparkJob struct { *BasicJobInfo // contains filtered or unexported fields }
spark application wrapper
func (*SparkJob) AllocatedGPU ¶ added in v0.2.0
GPU is not supported for Spark jobs
func (*SparkJob) GetJobDashboards ¶ added in v0.2.0
func (sj *SparkJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)
func (*SparkJob) GetPriorityClass ¶ added in v0.3.0
Get PriorityClass TODO: @moyuan
func (*SparkJob) GetStatus ¶ added in v0.2.0
spark job driver state
------------------------------------------------------- NewState ApplicationStateType = "" SubmittedState ApplicationStateType = "SUBMITTED" RunningState ApplicationStateType = "RUNNING" CompletedState ApplicationStateType = "COMPLETED" FailedState ApplicationStateType = "FAILED" FailedSubmissionState ApplicationStateType = "SUBMISSION_FAILED" PendingRerunState ApplicationStateType = "PENDING_RERUN" InvalidatingState ApplicationStateType = "INVALIDATING" SucceedingState ApplicationStateType = "SUCCEEDING" FailingState ApplicationStateType = "FAILING" UnknownState ApplicationStateType = "UNKNOWN"
spark job executor state
------------------------------------------------------- ExecutorPendingState ExecutorState = "PENDING" ExecutorRunningState ExecutorState = "RUNNING" ExecutorCompletedState ExecutorState = "COMPLETED" ExecutorFailedState ExecutorState = "FAILED" ExecutorUnknownState ExecutorState = "UNKNOWN"
func (*SparkJob) HostIPOfChief ¶ added in v0.2.0
Get the hostIP of the driver Pod
func (*SparkJob) RequestedGPU ¶ added in v0.2.0
GPU is not supported for Spark jobs
type SparkJobTrainer ¶ added in v0.2.0
type SparkJobTrainer struct {
// contains filtered or unexported fields
}
spark job trainer
func (*SparkJobTrainer) GetTrainingJob ¶ added in v0.2.0
func (st *SparkJobTrainer) GetTrainingJob(name, namespace string) (job TrainingJob, err error)
func (*SparkJobTrainer) IsSupported ¶ added in v0.2.0
func (st *SparkJobTrainer) IsSupported(name, ns string) bool
func (*SparkJobTrainer) ListTrainingJobs ¶ added in v0.2.0
func (st *SparkJobTrainer) ListTrainingJobs() (jobs []TrainingJob, err error)
func (*SparkJobTrainer) Type ¶ added in v0.2.0
func (st *SparkJobTrainer) Type() string
type StandaloneJob ¶
type StandaloneJob struct {
*JobInfo
}
Standalone Job Information
func (*StandaloneJob) GetJobDashboards ¶
func (sj *StandaloneJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)
Get Dashboard url of the job
func (*StandaloneJob) GetPriorityClass ¶ added in v0.3.0
func (sj *StandaloneJob) GetPriorityClass() string
Get PriorityClass
type StandaloneJobTrainer ¶
type StandaloneJobTrainer struct {
// contains filtered or unexported fields
}
Standalone Job trainer
func (*StandaloneJobTrainer) GetTrainingJob ¶
func (s *StandaloneJobTrainer) GetTrainingJob(name, namespace string) (tj TrainingJob, err error)
func (*StandaloneJobTrainer) IsSupported ¶
func (s *StandaloneJobTrainer) IsSupported(name, ns string) bool
check if it's Standalone job
func (*StandaloneJobTrainer) ListTrainingJobs ¶ added in v0.2.0
func (s *StandaloneJobTrainer) ListTrainingJobs() (jobs []TrainingJob, err error)
List Training jobs
func (*StandaloneJobTrainer) Type ¶
func (s *StandaloneJobTrainer) Type() string
type StringMatchPrefix ¶
type StringMatchPrefix struct {
Prefix string `protobuf:"bytes,2,opt,name=prefix,proto3,oneof" json:"prefix,omitempty"`
}
type TensorFlowJob ¶
type TensorFlowJob struct { *BasicJobInfo // contains filtered or unexported fields }
TensorFlow Job Information
func (*TensorFlowJob) AllPods ¶
func (tj *TensorFlowJob) AllPods() []v1.Pod
Get all the pods of the Training Job
func (*TensorFlowJob) AllocatedGPU ¶
func (tj *TensorFlowJob) AllocatedGPU() int64
Allocated GPU count of the Job
func (*TensorFlowJob) ChiefPod ¶
func (tj *TensorFlowJob) ChiefPod() v1.Pod
Get the chief Pod of the Job.
func (*TensorFlowJob) Duration ¶ added in v0.2.0
func (tj *TensorFlowJob) Duration() time.Duration
Get the Job Training Duration
func (*TensorFlowJob) GetJobDashboards ¶
func (tj *TensorFlowJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)
Get Dashboard url of the job
func (*TensorFlowJob) GetPriorityClass ¶ added in v0.3.0
func (t *TensorFlowJob) GetPriorityClass() string
Get PriorityClass
func (*TensorFlowJob) GetStatus ¶
func (tj *TensorFlowJob) GetStatus() (status string)
Get the Status of the Job: RUNNING, PENDING, SUCCEEDED, FAILED
func (*TensorFlowJob) HostIPOfChief ¶
func (tj *TensorFlowJob) HostIPOfChief() (hostIP string)
Get the hostIP of the chief Pod
func (*TensorFlowJob) Name ¶
func (tj *TensorFlowJob) Name() string
func (*TensorFlowJob) Namespace ¶ added in v0.2.0
func (tj *TensorFlowJob) Namespace() string
func (*TensorFlowJob) RequestedGPU ¶
func (tj *TensorFlowJob) RequestedGPU() int64
Requested GPU count of the Job
func (*TensorFlowJob) StartTime ¶
func (tj *TensorFlowJob) StartTime() *metav1.Time
func (*TensorFlowJob) Trainer ¶
func (tj *TensorFlowJob) Trainer() string
func (*TensorFlowJob) Uid ¶ added in v0.3.0
func (tj *TensorFlowJob) Uid() string
type TensorFlowJobTrainer ¶
type TensorFlowJobTrainer struct {
// contains filtered or unexported fields
}
TensorFlow Job trainer
func (*TensorFlowJobTrainer) GetTrainingJob ¶
func (tt *TensorFlowJobTrainer) GetTrainingJob(name, namespace string) (tj TrainingJob, err error)
func (*TensorFlowJobTrainer) IsSupported ¶
func (tt *TensorFlowJobTrainer) IsSupported(name, ns string) bool
check if it's TensorFlow job
func (*TensorFlowJobTrainer) ListTrainingJobs ¶ added in v0.2.0
func (tt *TensorFlowJobTrainer) ListTrainingJobs() (jobs []TrainingJob, err error)
List Training jobs
func (*TensorFlowJobTrainer) Type ¶
func (tt *TensorFlowJobTrainer) Type() string
type Trainer ¶
type Trainer interface { // Check if the training job is supported IsSupported(name, ns string) bool // Get TrainingJob object directly. this method is called when `arena get` GetTrainingJob(name, namespace string) (TrainingJob, error) // Get the type of trainer Type() string ListTrainingJobs() ([]TrainingJob, error) }
func NewHorovodJobTrainer ¶
func NewHorovodJobTrainer(client *kubernetes.Clientset) Trainer
Create HorovodJob Trainer
func NewMPIJobTrainer ¶
func NewMPIJobTrainer(client *kubernetes.Clientset) Trainer
NewMPIJobTrainer
func NewPyTorchJobTrainer ¶ added in v0.5.0
func NewPyTorchJobTrainer(client *kubernetes.Clientset) Trainer
NewPyTorchJobTrainer
func NewSparkJobTrainer ¶ added in v0.2.0
func NewSparkJobTrainer(client *kubernetes.Clientset) Trainer
func NewStandaloneJobTrainer ¶
func NewStandaloneJobTrainer(client *kubernetes.Clientset) Trainer
func NewTensorFlowJobTrainer ¶
func NewTensorFlowJobTrainer(client *kubernetes.Clientset) Trainer
func NewTrainers ¶
func NewTrainers(client *kubernetes.Clientset) []Trainer
construct the trainer list
func NewVolcanoJobTrainer ¶ added in v0.2.0
func NewVolcanoJobTrainer(client *kubernetes.Clientset) Trainer
type TrainingJob ¶
type TrainingJob interface { // Get the chief Pod of the Job. ChiefPod() v1.Pod // Get the name of the Training Job Name() string // Get the namespace of the Training Job Namespace() string // Get all the pods of the Training Job AllPods() []v1.Pod // Get all the kubernetes resource of the Training Job Resources() []Resource // Get the Status of the Job: RUNNING, PENDING, GetStatus() string // Return trainer Type, support MPI, standalone, tensorflow Trainer() string // Get the Job Age Age() time.Duration // Get the Job Duration Duration() time.Duration // Get start time StartTime() *metav1.Time // Get Dashboard GetJobDashboards(client *kubernetes.Clientset) ([]string, error) // Requested GPU count of the Job RequestedGPU() int64 // Allocated GPU count of the Job AllocatedGPU() int64 // the host ip of the chief pod HostIPOfChief() string // The priority class name of the training job GetPriorityClass() string }
A TrainingJob can be a TensorFlow, MPI, or Caffe job.
type VirtualService ¶
type VirtualService struct { *istiov1alpha3.VirtualService Http []*HTTPRoute `protobuf:"bytes,3,rep,name=http" json:"http,omitempty"` }
type VirtualServiceCRD ¶
type VirtualServiceCRD struct { // Kind is a string value representing the REST resource this object represents. // Servers may infer this from the endpoint the client submits requests to. // Cannot be updated. // In CamelCase. // More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds // +optional Kind string `json:"kind,omitempty" protobuf:"bytes,1,opt,name=kind"` // APIVersion defines the versioned schema of this representation of an object. // Servers should convert recognized schemas to the latest internal value, and // may reject unrecognized values. // More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources // +optional APIVersion string `json:"apiVersion,omitempty" protobuf:"bytes,2,opt,name=apiVersion"` metav1.ObjectMeta `json:"metadata,omitempty" yaml:"metadata,omitempty" protobuf:"bytes,1,opt,name=metadata"` Spec VirtualService `json:"spec,omitempty" yaml:"spec,omitempty" protobuf:"bytes,2,opt,name=spec"` }
type VolcanoJob ¶ added in v0.2.0
type VolcanoJob struct { *BasicJobInfo // contains filtered or unexported fields }
volcano Job wrapper
func (*VolcanoJob) Age ¶ added in v0.2.0
func (vj *VolcanoJob) Age() time.Duration
func (*VolcanoJob) AllPods ¶ added in v0.2.0
func (vj *VolcanoJob) AllPods() []v1.Pod
return pods from cache
func (*VolcanoJob) AllocatedGPU ¶ added in v0.2.0
func (vj *VolcanoJob) AllocatedGPU() int64
GPUs are not supported for Volcano jobs.
func (*VolcanoJob) ChiefPod ¶ added in v0.2.0
func (vj *VolcanoJob) ChiefPod() v1.Pod
return driver pod
func (*VolcanoJob) Duration ¶ added in v0.2.0
func (vj *VolcanoJob) Duration() time.Duration
Get the Job Training Duration
func (*VolcanoJob) GetJobDashboards ¶ added in v0.2.0
func (vj *VolcanoJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)
func (*VolcanoJob) GetPriorityClass ¶ added in v0.3.0
func (vj *VolcanoJob) GetPriorityClass() string
Get PriorityClass
func (*VolcanoJob) GetStatus ¶ added in v0.2.0
func (vj *VolcanoJob) GetStatus() (status string)
func (*VolcanoJob) HostIPOfChief ¶ added in v0.2.0
func (vj *VolcanoJob) HostIPOfChief() (hostIP string)
Get the hostIP of the driver Pod
func (*VolcanoJob) Name ¶ added in v0.2.0
func (vj *VolcanoJob) Name() string
func (*VolcanoJob) Namespace ¶ added in v0.2.0
func (vj *VolcanoJob) Namespace() string
func (*VolcanoJob) RequestedGPU ¶ added in v0.2.0
func (vj *VolcanoJob) RequestedGPU() int64
GPUs are not supported for Volcano jobs.
func (*VolcanoJob) StartTime ¶ added in v0.2.0
func (vj *VolcanoJob) StartTime() *metav1.Time
func (*VolcanoJob) Trainer ¶ added in v0.2.0
func (vj *VolcanoJob) Trainer() string
return trainerType: volcano job
func (*VolcanoJob) Uid ¶ added in v0.3.0
func (vj *VolcanoJob) Uid() string
type VolcanoJobTrainer ¶ added in v0.2.0
type VolcanoJobTrainer struct {
// contains filtered or unexported fields
}
volcano job trainer
func NewVolcanoJobTrainerSubmit ¶ added in v0.2.0
func NewVolcanoJobTrainerSubmit(client *kubernetes.Clientset) *VolcanoJobTrainer
func (*VolcanoJobTrainer) GetTrainingJob ¶ added in v0.2.0
func (st *VolcanoJobTrainer) GetTrainingJob(name, namespace string) (job TrainingJob, err error)
func (*VolcanoJobTrainer) GetTrainingJobAtSubmit ¶ added in v0.2.0
func (st *VolcanoJobTrainer) GetTrainingJobAtSubmit(name, namespace string) (job TrainingJob, err error)
func (*VolcanoJobTrainer) IsSupported ¶ added in v0.2.0
func (st *VolcanoJobTrainer) IsSupported(name, ns string) bool
func (*VolcanoJobTrainer) ListTrainingJobs ¶ added in v0.2.0
func (st *VolcanoJobTrainer) ListTrainingJobs() (jobs []TrainingJob, err error)
func (*VolcanoJobTrainer) Type ¶ added in v0.2.0
func (st *VolcanoJobTrainer) Type() string
Source Files ¶
- common.go
- completion.go
- completion_bash.go
- config.go
- const.go
- dashboard_helper.go
- data.go
- data_info.go
- data_list.go
- delete.go
- get.go
- get_advanced.go
- gpu.go
- gpu_info.go
- job_info.go
- list.go
- logs.go
- logviewer.go
- pod_helper.go
- prune.go
- resource.go
- root.go
- serve.go
- serve_custom.go
- serve_delete.go
- serve_get.go
- serve_list.go
- serve_log.go
- serve_tensorflow.go
- serve_tensorrt.go
- submit.go
- submit_horovod.go
- submit_mpi.go
- submit_pytorchjob.go
- submit_spark.go
- submit_standalone.go
- submit_tfjob.go
- submit_volcano.go
- sync_code.go
- tensorboard.go
- top.go
- top_job.go
- top_node.go
- traffic_router_split.go
- trainer.go
- trainer_horovod.go
- trainer_interface.go
- trainer_mpi.go
- trainer_pytorch.go
- trainer_spark.go
- trainer_standalone.go
- trainer_tensorflow.go
- trainer_volcano.go
- training_plugin_interface.go
- version.go