Documentation ¶
Index ¶
- Constants
- func AcceptJobLog(jobName string, trainingType types.TrainingJobType, args *types.LogArgs) error
- func BuildJobInfo(job TrainingJob, showGPUs bool, services []*v1.Service, nodes []*v1.Node) *types.TrainingJobInfo
- func CheckJobIsOwnedByTrainer(labels map[string]string) error
- func CheckOperatorIsInstalled(crdName string) bool
- func CheckPrintFormat(format string) error
- func CompatibleJobCRD(crdName, fieldToCheck string) bool
- func DeleteTrainingJob(jobName, namespace string, jobType types.TrainingJobType) error
- func DisplayTrainingJobList(jobInfoList []TrainingJob, format string, allNamespaces bool)
- func GetAllTrainers() map[types.TrainingJobType]Trainer
- func GetJobDashboards(dashboard string, job *v1.Job, pods []corev1.Pod) []string
- func GetJobGpuMetric(client *kubernetes.Clientset, job TrainingJob) (jobMetric prometheus.JobGpuMetric, err error)
- func GetJobRealStatus(job TrainingJob) string
- func GetResourcesEvents(client *kubernetes.Clientset, namespace string, resources []Resource) (map[string][]v1.Event, error)
- func GetTrainingJobLabels(jobType types.TrainingJobType) string
- func PrepareServicesAndNodesForTensorboard(jobs []TrainingJob, allNamespaces bool) ([]*v1.Service, []*v1.Node)
- func PrintLine(w io.Writer, fields ...string)
- func PrintTrainingJob(job TrainingJob, modelVersion *types.ModelVersion, format string, ...)
- func PruneTrainingJobs(namespace string, allNamespaces bool, since time.Duration) error
- func SortMapKeys(podMetric map[string]types.GpuMetric) []string
- func SubmitDeepSpeedJob(namespace string, submitArgs *types.SubmitDeepSpeedJobArgs) (err error)
- func SubmitETJob(namespace string, submitArgs *types.SubmitETJobArgs) (err error)
- func SubmitHorovodJob(namespace string, submitArgs *types.SubmitHorovodJobArgs) (err error)
- func SubmitMPIJob(namespace string, submitArgs *types.SubmitMPIJobArgs) (err error)
- func SubmitPytorchJob(namespace string, submitArgs *types.SubmitPyTorchJobArgs) (err error)
- func SubmitRayJob(namespace string, submitArgs *types.SubmitRayJobArgs) (err error)
- func SubmitScaleInETJob(namespace string, submitArgs *types.ScaleInETJobArgs) error
- func SubmitScaleOutETJob(namespace string, submitArgs *types.ScaleOutETJobArgs) error
- func SubmitSparkJob(namespace string, submitArgs *types.SubmitSparkJobArgs) (err error)
- func SubmitTFJob(namespace string, submitArgs *types.SubmitTFJobArgs) (err error)
- func SubmitVolcanoJob(namespace string, submitArgs *types.SubmitVolcanoJobArgs) error
- func TopTrainingJobs(args []string, namespace string, allNamespaces bool, ...) error
- type BasicJobInfo
- type DeepSpeedJob
- func (dsj *DeepSpeedJob) Age() time.Duration
- func (dsj *DeepSpeedJob) AllPods() []*v1.Pod
- func (dsj *DeepSpeedJob) AllocatedGPU() int64
- func (dsj *DeepSpeedJob) ChiefPod() *v1.Pod
- func (dsj *DeepSpeedJob) Duration() time.Duration
- func (dsj *DeepSpeedJob) GetJobDashboards(client *kubernetes.Clientset, namespace, arenaNamespace string) ([]string, error)
- func (dsj *DeepSpeedJob) GetLabels() map[string]string
- func (dsj *DeepSpeedJob) GetPriorityClass() string
- func (dsj *DeepSpeedJob) GetStatus() (status string)
- func (dsj *DeepSpeedJob) GetTrainJob() interface{}
- func (dsj *DeepSpeedJob) GetWorkerMaxReplicas(maxWorkers int) interface{}
- func (dsj *DeepSpeedJob) GetWorkerMinReplicas(minWorkers int) interface{}
- func (dsj *DeepSpeedJob) HostIPOfChief() (hostIP string)
- func (dsj *DeepSpeedJob) Name() string
- func (dsj *DeepSpeedJob) Namespace() string
- func (dsj *DeepSpeedJob) RequestedGPU() int64
- func (dsj *DeepSpeedJob) StartTime() *metav1.Time
- func (dsj *DeepSpeedJob) Trainer() types.TrainingJobType
- func (dsj *DeepSpeedJob) Uid() string
- type DeepSpeedJobTrainer
- func (dst *DeepSpeedJobTrainer) GetTrainingJob(name, namespace string) (TrainingJob, error)
- func (dst *DeepSpeedJobTrainer) IsEnabled() bool
- func (dst *DeepSpeedJobTrainer) IsSupported(name, ns string) bool
- func (dst *DeepSpeedJobTrainer) ListTrainingJobs(namespace string, allNamespace bool) ([]TrainingJob, error)
- func (dst *DeepSpeedJobTrainer) Type() types.TrainingJobType
- type ETJob
- func (ej *ETJob) Age() time.Duration
- func (ej *ETJob) AllPods() []*v1.Pod
- func (ej *ETJob) AllocatedGPU() int64
- func (ej *ETJob) ChiefPod() *v1.Pod
- func (ej *ETJob) Duration() time.Duration
- func (ej *ETJob) GetJobDashboards(client *kubernetes.Clientset, namespace, arenaNamespace string) ([]string, error)
- func (ej *ETJob) GetLabels() map[string]string
- func (ej *ETJob) GetPriorityClass() string
- func (ej *ETJob) GetStatus() (status string)
- func (ej *ETJob) GetTrainJob() interface{}
- func (ej *ETJob) GetWorkerMaxReplicas(maxWorkers int) interface{}
- func (ej *ETJob) GetWorkerMinReplicas(minWorkers int) interface{}
- func (ej *ETJob) HostIPOfChief() (hostIP string)
- func (ej *ETJob) Name() string
- func (ej *ETJob) Namespace() string
- func (ej *ETJob) RequestedGPU() int64
- func (ej *ETJob) StartTime() *metav1.Time
- func (ej *ETJob) Trainer() types.TrainingJobType
- func (ej *ETJob) Uid() string
- type ETJobTrainer
- func (ejt *ETJobTrainer) GetTrainingJob(name, namespace string) (TrainingJob, error)
- func (ejt *ETJobTrainer) IsEnabled() bool
- func (ejt *ETJobTrainer) IsSupported(name, ns string) bool
- func (ejt *ETJobTrainer) ListTrainingJobs(namespace string, allNamespace bool) ([]TrainingJob, error)
- func (ejt *ETJobTrainer) Type() types.TrainingJobType
- type MPIJob
- func (mj *MPIJob) Age() time.Duration
- func (mj *MPIJob) AllPods() []*v1.Pod
- func (mj *MPIJob) AllocatedGPU() int64
- func (mj *MPIJob) ChiefPod() *v1.Pod
- func (mj *MPIJob) Duration() time.Duration
- func (mj *MPIJob) GetJobDashboards(client *kubernetes.Clientset, namespace, arenaNamespace string) ([]string, error)
- func (mj *MPIJob) GetLabels() map[string]string
- func (m *MPIJob) GetPriorityClass() string
- func (mj *MPIJob) GetStatus() (status string)
- func (mj *MPIJob) GetTrainJob() interface{}
- func (mj *MPIJob) HostIPOfChief() (hostIP string)
- func (mj *MPIJob) Name() string
- func (mj *MPIJob) Namespace() string
- func (mj *MPIJob) RequestedGPU() int64
- func (mj *MPIJob) StartTime() *metav1.Time
- func (mj *MPIJob) Trainer() types.TrainingJobType
- func (mj *MPIJob) Uid() string
- type MPIJobTrainer
- func (tt *MPIJobTrainer) GetTrainingJob(name, namespace string) (TrainingJob, error)
- func (tt *MPIJobTrainer) IsEnabled() bool
- func (tt *MPIJobTrainer) IsSupported(name, ns string) bool
- func (tt *MPIJobTrainer) ListTrainingJobs(namespace string, allNamespace bool) ([]TrainingJob, error)
- func (tt *MPIJobTrainer) Type() types.TrainingJobType
- type PyTorchJob
- func (pj *PyTorchJob) Age() time.Duration
- func (pj *PyTorchJob) AllPods() []*v1.Pod
- func (pj *PyTorchJob) AllocatedGPU() int64
- func (pj *PyTorchJob) ChiefPod() *v1.Pod
- func (pj *PyTorchJob) Duration() time.Duration
- func (pj *PyTorchJob) GetJobDashboards(client *kubernetes.Clientset, namespace, arenaNamespace string) ([]string, error)
- func (pj *PyTorchJob) GetLabels() map[string]string
- func (p *PyTorchJob) GetPriorityClass() string
- func (pj *PyTorchJob) GetStatus() (status string)
- func (pj *PyTorchJob) GetTrainJob() interface{}
- func (pj *PyTorchJob) HostIPOfChief() (hostIP string)
- func (pj *PyTorchJob) Name() string
- func (pj *PyTorchJob) Namespace() string
- func (pj *PyTorchJob) RequestedGPU() int64
- func (pj *PyTorchJob) StartTime() *metav1.Time
- func (pj *PyTorchJob) Trainer() types.TrainingJobType
- func (pj *PyTorchJob) Uid() string
- type PyTorchJobTrainer
- func (tt *PyTorchJobTrainer) GetTrainingJob(name, namespace string) (TrainingJob, error)
- func (tt *PyTorchJobTrainer) IsEnabled() bool
- func (tt *PyTorchJobTrainer) IsSupported(name, ns string) bool
- func (tt *PyTorchJobTrainer) ListTrainingJobs(namespace string, allNamespace bool) ([]TrainingJob, error)
- func (tt *PyTorchJobTrainer) Type() types.TrainingJobType
- type RayJob
- func (rj *RayJob) Age() time.Duration
- func (rj *RayJob) AllPods() []*v1.Pod
- func (rj *RayJob) AllocatedGPU() int64
- func (rj *RayJob) ChiefPod() *v1.Pod
- func (rj *RayJob) Duration() time.Duration
- func (rj *RayJob) GetJobDashboards(client *kubernetes.Clientset, namespace, arenaNamespace string) ([]string, error)
- func (rj *RayJob) GetLabels() map[string]string
- func (rj *RayJob) GetPriorityClass() string
- func (rj *RayJob) GetStatus() (status string)
- func (rj *RayJob) GetTrainJob() interface{}
- func (rj *RayJob) HostIPOfChief() (hostIP string)
- func (rj *RayJob) Name() string
- func (rj *RayJob) Namespace() string
- func (rj *RayJob) RequestedGPU() int64
- func (rj *RayJob) StartTime() *metav1.Time
- func (rj *RayJob) Trainer() types.TrainingJobType
- func (rj *RayJob) Uid() string
- type RayJobTrainer
- func (rjt *RayJobTrainer) GetTrainingJob(name, namespace string) (TrainingJob, error)
- func (rjt *RayJobTrainer) IsEnabled() bool
- func (rjt *RayJobTrainer) IsSupported(name, ns string) bool
- func (rjt *RayJobTrainer) ListTrainingJobs(namespace string, allNamespace bool) ([]TrainingJob, error)
- func (rjt *RayJobTrainer) Type() types.TrainingJobType
- type Resource
- type ResourceType
- type SortPodConditionByLastTransitionTime
- type SparkJob
- func (sj *SparkJob) Age() time.Duration
- func (sj *SparkJob) AllPods() []*v1.Pod
- func (sj *SparkJob) AllocatedGPU() int64
- func (sj *SparkJob) ChiefPod() *v1.Pod
- func (sj *SparkJob) Duration() time.Duration
- func (sj *SparkJob) GetJobDashboards(client *kubernetes.Clientset, namespace, arenaNamespace string) ([]string, error)
- func (sj *SparkJob) GetLabels() map[string]string
- func (sj *SparkJob) GetPriorityClass() string
- func (sj *SparkJob) GetStatus() (status string)
- func (sj *SparkJob) GetTrainJob() interface{}
- func (sj *SparkJob) HostIPOfChief() (hostIP string)
- func (sj *SparkJob) Name() string
- func (sj *SparkJob) Namespace() string
- func (sj *SparkJob) RequestedGPU() int64
- func (sj *SparkJob) StartTime() *metav1.Time
- func (sj *SparkJob) Trainer() types.TrainingJobType
- func (sj *SparkJob) Uid() string
- type SparkJobTrainer
- func (st *SparkJobTrainer) GetTrainingJob(name, namespace string) (TrainingJob, error)
- func (st *SparkJobTrainer) IsEnabled() bool
- func (st *SparkJobTrainer) IsSupported(name, ns string) bool
- func (st *SparkJobTrainer) ListTrainingJobs(namespace string, allNamespace bool) ([]TrainingJob, error)
- func (st *SparkJobTrainer) Type() types.TrainingJobType
- type TensorFlowJob
- func (tj *TensorFlowJob) Age() time.Duration
- func (tj *TensorFlowJob) AllPods() []*v1.Pod
- func (tj *TensorFlowJob) AllocatedGPU() int64
- func (tj *TensorFlowJob) ChiefPod() *v1.Pod
- func (tj *TensorFlowJob) Duration() time.Duration
- func (tj *TensorFlowJob) GetJobDashboards(client *kubernetes.Clientset, namespace, arenaNamespace string) ([]string, error)
- func (tj *TensorFlowJob) GetLabels() map[string]string
- func (t *TensorFlowJob) GetPriorityClass() string
- func (tj *TensorFlowJob) GetStatus() (status string)
- func (tj *TensorFlowJob) GetTrainJob() interface{}
- func (tj *TensorFlowJob) HostIPOfChief() (hostIP string)
- func (tj *TensorFlowJob) Name() string
- func (tj *TensorFlowJob) Namespace() string
- func (tj *TensorFlowJob) RequestedGPU() int64
- func (tj *TensorFlowJob) StartTime() *metav1.Time
- func (tj *TensorFlowJob) Trainer() types.TrainingJobType
- func (tj *TensorFlowJob) Uid() string
- type TensorFlowJobTrainer
- func (tt *TensorFlowJobTrainer) GetTrainingJob(name, namespace string) (TrainingJob, error)
- func (tt *TensorFlowJobTrainer) IsEnabled() bool
- func (tt *TensorFlowJobTrainer) IsSupported(name, namespace string) bool
- func (tt *TensorFlowJobTrainer) ListTrainingJobs(namespace string, allNamespace bool) ([]TrainingJob, error)
- func (tt *TensorFlowJobTrainer) Type() types.TrainingJobType
- type Trainer
- type TrainingJob
- type VolcanoJob
- func (vj *VolcanoJob) Age() time.Duration
- func (vj *VolcanoJob) AllPods() []*v1.Pod
- func (vj *VolcanoJob) AllocatedGPU() int64
- func (vj *VolcanoJob) ChiefPod() *v1.Pod
- func (vj *VolcanoJob) Duration() time.Duration
- func (vj *VolcanoJob) GetJobDashboards(client *kubernetes.Clientset, namespace, arenaNamespace string) ([]string, error)
- func (vj *VolcanoJob) GetLabels() map[string]string
- func (vj *VolcanoJob) GetPriorityClass() string
- func (vj *VolcanoJob) GetStatus() (status string)
- func (vj *VolcanoJob) GetTrainJob() interface{}
- func (vj *VolcanoJob) HostIPOfChief() (hostIP string)
- func (vj *VolcanoJob) Name() string
- func (vj *VolcanoJob) Namespace() string
- func (vj *VolcanoJob) RequestedGPU() int64
- func (vj *VolcanoJob) StartTime() *metav1.Time
- func (vj *VolcanoJob) Trainer() types.TrainingJobType
- func (vj *VolcanoJob) Uid() string
- type VolcanoJobTrainer
- func (st *VolcanoJobTrainer) GetTrainingJob(name, namespace string) (TrainingJob, error)
- func (st *VolcanoJobTrainer) IsEnabled() bool
- func (st *VolcanoJobTrainer) IsSupported(name, ns string) bool
- func (st *VolcanoJobTrainer) ListTrainingJobs(namespace string, allNamespace bool) ([]TrainingJob, error)
- func (st *VolcanoJobTrainer) Type() types.TrainingJobType
Constants ¶
const ( // NVIDIAGPUResourceName is the extended name of the GPU resource since v1.8 // this uses the device plugin mechanism NVIDIAGPUResourceName = "nvidia.com/gpu" DeprecatedNVIDIAGPUResourceName = "alpha.kubernetes.io/nvidia-gpu" // TrainingReplicaTypeLabel training-operator replica type label TrainingReplicaTypeLabel = "training.kubeflow.org/replica-type" // TrainingReplicaIndexLabel training-operator replica index label TrainingReplicaIndexLabel = "training.kubeflow.org/replica-index" )
const ( ETJOB_MAXWORKERS = 1000 ETJOB_MINWORKERS = 1 )
const ResourceTypeJob = ResourceType("Job")
const ResourceTypePod = ResourceType("Pod")
const ResourceTypeStatefulSet = ResourceType("StatefulSet")
Variables ¶
This section is empty.
Functions ¶
func AcceptJobLog ¶
AcceptJobLog is used for arena-go-sdk
func BuildJobInfo ¶
func BuildJobInfo(job TrainingJob, showGPUs bool, services []*v1.Service, nodes []*v1.Node) *types.TrainingJobInfo
BuildJobInfo returns types.TrainingJobInfo
func CheckJobIsOwnedByTrainer ¶ added in v0.8.7
func CheckPrintFormat ¶
func CompatibleJobCRD ¶ added in v0.9.12
CompatibleJobCRD Compatible with training-operator CRD.
func DeleteTrainingJob ¶
func DeleteTrainingJob(jobName, namespace string, jobType types.TrainingJobType) error
func DisplayTrainingJobList ¶
func DisplayTrainingJobList(jobInfoList []TrainingJob, format string, allNamespaces bool)
func GetAllTrainers ¶
func GetAllTrainers() map[types.TrainingJobType]Trainer
func GetJobDashboards ¶
func GetJobGpuMetric ¶
func GetJobGpuMetric(client *kubernetes.Clientset, job TrainingJob) (jobMetric prometheus.JobGpuMetric, err error)
func GetJobRealStatus ¶
func GetJobRealStatus(job TrainingJob) string
Get the real job status. When a job has pending pods, the tfJob still shows the Running state; it should be Pending.
func GetResourcesEvents ¶
func GetResourcesEvents(client *kubernetes.Clientset, namespace string, resources []Resource) (map[string][]v1.Event, error)
Get Event of the Job
func GetTrainingJobLabels ¶ added in v0.8.7
func GetTrainingJobLabels(jobType types.TrainingJobType) string
func PrepareServicesAndNodesForTensorboard ¶ added in v0.8.0
func PrintTrainingJob ¶
func PrintTrainingJob(job TrainingJob, modelVersion *types.ModelVersion, format string, showEvents bool, showGPUs bool)
func PruneTrainingJobs ¶
func SubmitDeepSpeedJob ¶ added in v0.9.9
func SubmitDeepSpeedJob(namespace string, submitArgs *types.SubmitDeepSpeedJobArgs) (err error)
func SubmitETJob ¶
func SubmitETJob(namespace string, submitArgs *types.SubmitETJobArgs) (err error)
func SubmitHorovodJob ¶
func SubmitHorovodJob(namespace string, submitArgs *types.SubmitHorovodJobArgs) (err error)
func SubmitMPIJob ¶
func SubmitMPIJob(namespace string, submitArgs *types.SubmitMPIJobArgs) (err error)
func SubmitPytorchJob ¶
func SubmitPytorchJob(namespace string, submitArgs *types.SubmitPyTorchJobArgs) (err error)
func SubmitRayJob ¶ added in v0.11.0
func SubmitRayJob(namespace string, submitArgs *types.SubmitRayJobArgs) (err error)
func SubmitScaleInETJob ¶
func SubmitScaleInETJob(namespace string, submitArgs *types.ScaleInETJobArgs) error
func SubmitScaleOutETJob ¶
func SubmitScaleOutETJob(namespace string, submitArgs *types.ScaleOutETJobArgs) error
func SubmitSparkJob ¶
func SubmitSparkJob(namespace string, submitArgs *types.SubmitSparkJobArgs) (err error)
func SubmitTFJob ¶
func SubmitTFJob(namespace string, submitArgs *types.SubmitTFJobArgs) (err error)
func SubmitVolcanoJob ¶
func SubmitVolcanoJob(namespace string, submitArgs *types.SubmitVolcanoJobArgs) error
func TopTrainingJobs ¶
func TopTrainingJobs(args []string, namespace string, allNamespaces bool, jobType types.TrainingJobType, instanceName string, notStop bool, format types.FormatStyle) error
Types ¶
type BasicJobInfo ¶
type BasicJobInfo struct {
// contains filtered or unexported fields
}
func (*BasicJobInfo) Resources ¶
func (j *BasicJobInfo) Resources() []Resource
type DeepSpeedJob ¶ added in v0.9.9
type DeepSpeedJob struct { *BasicJobInfo // contains filtered or unexported fields }
DeepSpeedJob Information
func (*DeepSpeedJob) Age ¶ added in v0.9.9
func (dsj *DeepSpeedJob) Age() time.Duration
Get the Job Age
func (*DeepSpeedJob) AllPods ¶ added in v0.9.9
func (dsj *DeepSpeedJob) AllPods() []*v1.Pod
Get all the pods of the Training Job
func (*DeepSpeedJob) AllocatedGPU ¶ added in v0.9.9
func (dsj *DeepSpeedJob) AllocatedGPU() int64
Allocated GPU count of the Job
func (*DeepSpeedJob) ChiefPod ¶ added in v0.9.9
func (dsj *DeepSpeedJob) ChiefPod() *v1.Pod
Get the chief Pod of the Job.
func (*DeepSpeedJob) Duration ¶ added in v0.9.9
func (dsj *DeepSpeedJob) Duration() time.Duration
Get the Job Training Duration
func (*DeepSpeedJob) GetJobDashboards ¶ added in v0.9.9
func (dsj *DeepSpeedJob) GetJobDashboards(client *kubernetes.Clientset, namespace, arenaNamespace string) ([]string, error)
func (*DeepSpeedJob) GetLabels ¶ added in v0.9.14
func (dsj *DeepSpeedJob) GetLabels() map[string]string
func (*DeepSpeedJob) GetPriorityClass ¶ added in v0.9.9
func (dsj *DeepSpeedJob) GetPriorityClass() string
GetPriorityClass Get PriorityClass
func (*DeepSpeedJob) GetStatus ¶ added in v0.9.9
func (dsj *DeepSpeedJob) GetStatus() (status string)
Get the Status of the Job: RUNNING, PENDING, SUCCEEDED, FAILED
func (*DeepSpeedJob) GetTrainJob ¶ added in v0.9.9
func (dsj *DeepSpeedJob) GetTrainJob() interface{}
func (*DeepSpeedJob) GetWorkerMaxReplicas ¶ added in v0.9.9
func (dsj *DeepSpeedJob) GetWorkerMaxReplicas(maxWorkers int) interface{}
func (*DeepSpeedJob) GetWorkerMinReplicas ¶ added in v0.9.9
func (dsj *DeepSpeedJob) GetWorkerMinReplicas(minWorkers int) interface{}
func (*DeepSpeedJob) HostIPOfChief ¶ added in v0.9.9
func (dsj *DeepSpeedJob) HostIPOfChief() (hostIP string)
Get the hostIP of the chief Pod
func (*DeepSpeedJob) Name ¶ added in v0.9.9
func (dsj *DeepSpeedJob) Name() string
func (*DeepSpeedJob) Namespace ¶ added in v0.9.9
func (dsj *DeepSpeedJob) Namespace() string
func (*DeepSpeedJob) RequestedGPU ¶ added in v0.9.9
func (dsj *DeepSpeedJob) RequestedGPU() int64
Requested GPU count of the Job
func (*DeepSpeedJob) StartTime ¶ added in v0.9.9
func (dsj *DeepSpeedJob) StartTime() *metav1.Time
Get the start time
func (*DeepSpeedJob) Trainer ¶ added in v0.9.9
func (dsj *DeepSpeedJob) Trainer() types.TrainingJobType
func (*DeepSpeedJob) Uid ¶ added in v0.9.9
func (dsj *DeepSpeedJob) Uid() string
type DeepSpeedJobTrainer ¶ added in v0.9.9
type DeepSpeedJobTrainer struct {
// contains filtered or unexported fields
}
DeepSpeedJobTrainer DeepSpeed Job trainer
func (*DeepSpeedJobTrainer) GetTrainingJob ¶ added in v0.9.9
func (dst *DeepSpeedJobTrainer) GetTrainingJob(name, namespace string) (TrainingJob, error)
func (*DeepSpeedJobTrainer) IsEnabled ¶ added in v0.9.9
func (dst *DeepSpeedJobTrainer) IsEnabled() bool
func (*DeepSpeedJobTrainer) IsSupported ¶ added in v0.9.9
func (dst *DeepSpeedJobTrainer) IsSupported(name, ns string) bool
Check if it's a DeepSpeed job
func (*DeepSpeedJobTrainer) ListTrainingJobs ¶ added in v0.9.9
func (dst *DeepSpeedJobTrainer) ListTrainingJobs(namespace string, allNamespace bool) ([]TrainingJob, error)
func (*DeepSpeedJobTrainer) Type ¶ added in v0.9.9
func (dst *DeepSpeedJobTrainer) Type() types.TrainingJobType
Get the type
type ETJob ¶
type ETJob struct { *BasicJobInfo // contains filtered or unexported fields }
ET Job Information
func (*ETJob) GetJobDashboards ¶
func (*ETJob) GetTrainJob ¶
func (ej *ETJob) GetTrainJob() interface{}
func (*ETJob) GetWorkerMaxReplicas ¶
func (*ETJob) GetWorkerMinReplicas ¶
func (*ETJob) HostIPOfChief ¶
Get the hostIP of the chief Pod
func (*ETJob) Trainer ¶
func (ej *ETJob) Trainer() types.TrainingJobType
type ETJobTrainer ¶
type ETJobTrainer struct {
// contains filtered or unexported fields
}
ET Job trainer
func (*ETJobTrainer) GetTrainingJob ¶
func (ejt *ETJobTrainer) GetTrainingJob(name, namespace string) (TrainingJob, error)
func (*ETJobTrainer) IsEnabled ¶
func (ejt *ETJobTrainer) IsEnabled() bool
func (*ETJobTrainer) IsSupported ¶
func (ejt *ETJobTrainer) IsSupported(name, ns string) bool
Check if it's an ET job
func (*ETJobTrainer) ListTrainingJobs ¶
func (ejt *ETJobTrainer) ListTrainingJobs(namespace string, allNamespace bool) ([]TrainingJob, error)
type MPIJob ¶
type MPIJob struct { *BasicJobInfo // contains filtered or unexported fields }
MPI Job Information
func (*MPIJob) GetJobDashboards ¶
func (mj *MPIJob) GetJobDashboards(client *kubernetes.Clientset, namespace, arenaNamespace string) ([]string, error)
Get Dashboard url of the job
func (*MPIJob) GetTrainJob ¶
func (mj *MPIJob) GetTrainJob() interface{}
func (*MPIJob) HostIPOfChief ¶
Get the hostIP of the chief Pod
func (*MPIJob) Trainer ¶
func (mj *MPIJob) Trainer() types.TrainingJobType
type MPIJobTrainer ¶
type MPIJobTrainer struct {
// contains filtered or unexported fields
}
MPI Job trainer
func (*MPIJobTrainer) GetTrainingJob ¶
func (tt *MPIJobTrainer) GetTrainingJob(name, namespace string) (TrainingJob, error)
func (*MPIJobTrainer) IsEnabled ¶
func (tt *MPIJobTrainer) IsEnabled() bool
IsEnabled reports whether the trainer is enabled or not
func (*MPIJobTrainer) IsSupported ¶
func (tt *MPIJobTrainer) IsSupported(name, ns string) bool
Check if it's an MPI job
func (*MPIJobTrainer) ListTrainingJobs ¶
func (tt *MPIJobTrainer) ListTrainingJobs(namespace string, allNamespace bool) ([]TrainingJob, error)
type PyTorchJob ¶
type PyTorchJob struct { *BasicJobInfo // contains filtered or unexported fields }
PyTorch Job Information
func (*PyTorchJob) AllPods ¶
func (pj *PyTorchJob) AllPods() []*v1.Pod
Get all the pods of the Training Job
func (*PyTorchJob) AllocatedGPU ¶
func (pj *PyTorchJob) AllocatedGPU() int64
Allocated GPU count of the Job
func (*PyTorchJob) ChiefPod ¶
func (pj *PyTorchJob) ChiefPod() *v1.Pod
Get the master Pod of the Job.
func (*PyTorchJob) Duration ¶
func (pj *PyTorchJob) Duration() time.Duration
Get the Job Training Duration
func (*PyTorchJob) GetJobDashboards ¶
func (pj *PyTorchJob) GetJobDashboards(client *kubernetes.Clientset, namespace, arenaNamespace string) ([]string, error)
Get Dashboard url of the job
func (*PyTorchJob) GetLabels ¶ added in v0.9.14
func (pj *PyTorchJob) GetLabels() map[string]string
func (*PyTorchJob) GetPriorityClass ¶
func (p *PyTorchJob) GetPriorityClass() string
Get PriorityClass
func (*PyTorchJob) GetStatus ¶
func (pj *PyTorchJob) GetStatus() (status string)
Get the Status of the Job: RUNNING, PENDING, SUCCEEDED, FAILED
func (*PyTorchJob) GetTrainJob ¶
func (pj *PyTorchJob) GetTrainJob() interface{}
func (*PyTorchJob) HostIPOfChief ¶
func (pj *PyTorchJob) HostIPOfChief() (hostIP string)
Get the hostIP of the master Pod
func (*PyTorchJob) Name ¶
func (pj *PyTorchJob) Name() string
func (*PyTorchJob) Namespace ¶
func (pj *PyTorchJob) Namespace() string
func (*PyTorchJob) RequestedGPU ¶
func (pj *PyTorchJob) RequestedGPU() int64
Requested GPU count of the Job
func (*PyTorchJob) Trainer ¶
func (pj *PyTorchJob) Trainer() types.TrainingJobType
func (*PyTorchJob) Uid ¶
func (pj *PyTorchJob) Uid() string
type PyTorchJobTrainer ¶
type PyTorchJobTrainer struct {
// contains filtered or unexported fields
}
PyTorch Job trainer
func (*PyTorchJobTrainer) GetTrainingJob ¶
func (tt *PyTorchJobTrainer) GetTrainingJob(name, namespace string) (TrainingJob, error)
func (*PyTorchJobTrainer) IsEnabled ¶
func (tt *PyTorchJobTrainer) IsEnabled() bool
IsEnabled reports whether the trainer is enabled or not
func (*PyTorchJobTrainer) IsSupported ¶
func (tt *PyTorchJobTrainer) IsSupported(name, ns string) bool
Check if it's a PyTorch job
func (*PyTorchJobTrainer) ListTrainingJobs ¶
func (tt *PyTorchJobTrainer) ListTrainingJobs(namespace string, allNamespace bool) ([]TrainingJob, error)
func (*PyTorchJobTrainer) Type ¶
func (tt *PyTorchJobTrainer) Type() types.TrainingJobType
Get the type
type RayJob ¶ added in v0.11.0
type RayJob struct { *BasicJobInfo // contains filtered or unexported fields }
RayJob Information
func (*RayJob) AllocatedGPU ¶ added in v0.11.0
Allocated GPU count of the Job
func (*RayJob) GetJobDashboards ¶ added in v0.11.0
func (rj *RayJob) GetJobDashboards(client *kubernetes.Clientset, namespace, arenaNamespace string) ([]string, error)
Get Dashboard url of the job
func (*RayJob) GetPriorityClass ¶ added in v0.11.0
Get PriorityClass. return the PriorityClassName of HeadPod
func (*RayJob) GetStatus ¶ added in v0.11.0
Get the Status of the rayJob: PENDING, RUNNING, STOPPED, SUCCEEDED, FAILED
func (*RayJob) GetTrainJob ¶ added in v0.11.0
func (rj *RayJob) GetTrainJob() interface{}
func (*RayJob) HostIPOfChief ¶ added in v0.11.0
Get the hostIP of the master Pod
func (*RayJob) RequestedGPU ¶ added in v0.11.0
Requested GPU count of the Job
func (*RayJob) Trainer ¶ added in v0.11.0
func (rj *RayJob) Trainer() types.TrainingJobType
type RayJobTrainer ¶ added in v0.11.0
type RayJobTrainer struct { RayJobClient *versioned.Clientset // contains filtered or unexported fields }
RayJob trainer
func (*RayJobTrainer) GetTrainingJob ¶ added in v0.11.0
func (rjt *RayJobTrainer) GetTrainingJob(name, namespace string) (TrainingJob, error)
Get the training job from cache or directly
func (*RayJobTrainer) IsEnabled ¶ added in v0.11.0
func (rjt *RayJobTrainer) IsEnabled() bool
IsEnabled reports whether the trainer is enabled or not
func (*RayJobTrainer) IsSupported ¶ added in v0.11.0
func (rjt *RayJobTrainer) IsSupported(name, ns string) bool
check if it's ray job
func (*RayJobTrainer) ListTrainingJobs ¶ added in v0.11.0
func (rjt *RayJobTrainer) ListTrainingJobs(namespace string, allNamespace bool) ([]TrainingJob, error)
func (*RayJobTrainer) Type ¶ added in v0.11.0
func (rjt *RayJobTrainer) Type() types.TrainingJobType
Get the type
type Resource ¶
type Resource struct { Name string Uid string ResourceType ResourceType }
type ResourceType ¶
type ResourceType string
type SortPodConditionByLastTransitionTime ¶
type SortPodConditionByLastTransitionTime []v1.PodCondition
Sort the pod condition by time.
func (SortPodConditionByLastTransitionTime) Len ¶
func (s SortPodConditionByLastTransitionTime) Len() int
func (SortPodConditionByLastTransitionTime) Less ¶
func (s SortPodConditionByLastTransitionTime) Less(i, j int) bool
func (SortPodConditionByLastTransitionTime) Swap ¶
func (s SortPodConditionByLastTransitionTime) Swap(i, j int)
type SparkJob ¶
type SparkJob struct { *BasicJobInfo // contains filtered or unexported fields }
spark application wrapper
func (*SparkJob) AllocatedGPU ¶
spark job without gpu supported
func (*SparkJob) GetJobDashboards ¶
func (*SparkJob) GetPriorityClass ¶
Get PriorityClass TODO: @moyuan
func (*SparkJob) GetStatus ¶
spark job driver state
------------------------------------------------------- NewState ApplicationStateType = "" SubmittedState ApplicationStateType = "SUBMITTED" RunningState ApplicationStateType = "RUNNING" CompletedState ApplicationStateType = "COMPLETED" FailedState ApplicationStateType = "FAILED" FailedSubmissionState ApplicationStateType = "SUBMISSION_FAILED" PendingRerunState ApplicationStateType = "PENDING_RERUN" InvalidatingState ApplicationStateType = "INVALIDATING" SucceedingState ApplicationStateType = "SUCCEEDING" FailingState ApplicationStateType = "FAILING" UnknownState ApplicationStateType = "UNKNOWN"
spark job executor state
------------------------------------------------------- ExecutorPendingState ExecutorState = "PENDING" ExecutorRunningState ExecutorState = "RUNNING" ExecutorCompletedState ExecutorState = "COMPLETED" ExecutorFailedState ExecutorState = "FAILED" ExecutorUnknownState ExecutorState = "UNKNOWN"
func (*SparkJob) GetTrainJob ¶
func (sj *SparkJob) GetTrainJob() interface{}
func (*SparkJob) HostIPOfChief ¶
Get the hostIP of the driver Pod
func (*SparkJob) RequestedGPU ¶
spark job without gpu supported
func (*SparkJob) Trainer ¶
func (sj *SparkJob) Trainer() types.TrainingJobType
return trainerType: sparkjob
type SparkJobTrainer ¶
type SparkJobTrainer struct {
// contains filtered or unexported fields
}
spark job trainer
func (*SparkJobTrainer) GetTrainingJob ¶
func (st *SparkJobTrainer) GetTrainingJob(name, namespace string) (TrainingJob, error)
func (*SparkJobTrainer) IsEnabled ¶
func (st *SparkJobTrainer) IsEnabled() bool
func (*SparkJobTrainer) IsSupported ¶
func (st *SparkJobTrainer) IsSupported(name, ns string) bool
func (*SparkJobTrainer) ListTrainingJobs ¶
func (st *SparkJobTrainer) ListTrainingJobs(namespace string, allNamespace bool) ([]TrainingJob, error)
func (*SparkJobTrainer) Type ¶
func (st *SparkJobTrainer) Type() types.TrainingJobType
type TensorFlowJob ¶
type TensorFlowJob struct { *BasicJobInfo // contains filtered or unexported fields }
TensorFlowJob implements the TrainingJob interface. TensorFlow Job Information
func (*TensorFlowJob) Age ¶
func (tj *TensorFlowJob) Age() time.Duration
Age returns the age of tfjob
func (*TensorFlowJob) AllPods ¶
func (tj *TensorFlowJob) AllPods() []*v1.Pod
AllPods Get all the pods of the Training Job
func (*TensorFlowJob) AllocatedGPU ¶
func (tj *TensorFlowJob) AllocatedGPU() int64
Allocated GPU count of the Job
func (*TensorFlowJob) ChiefPod ¶
func (tj *TensorFlowJob) ChiefPod() *v1.Pod
ChiefPod gets the chief Pod of the Job.
func (*TensorFlowJob) Duration ¶
func (tj *TensorFlowJob) Duration() time.Duration
Duration returns the duration of tfjob
func (*TensorFlowJob) GetJobDashboards ¶
func (tj *TensorFlowJob) GetJobDashboards(client *kubernetes.Clientset, namespace, arenaNamespace string) ([]string, error)
Get Dashboard url of the job
func (*TensorFlowJob) GetLabels ¶ added in v0.9.14
func (tj *TensorFlowJob) GetLabels() map[string]string
func (*TensorFlowJob) GetPriorityClass ¶
func (t *TensorFlowJob) GetPriorityClass() string
Get PriorityClass
func (*TensorFlowJob) GetStatus ¶
func (tj *TensorFlowJob) GetStatus() (status string)
GetStatus returns the status of the Job: RUNNING, PENDING, SUCCEEDED, FAILED
func (*TensorFlowJob) GetTrainJob ¶
func (tj *TensorFlowJob) GetTrainJob() interface{}
GetTrainJob returns the training job
func (*TensorFlowJob) HostIPOfChief ¶
func (tj *TensorFlowJob) HostIPOfChief() (hostIP string)
Get the hostIP of the chief Pod
func (*TensorFlowJob) Name ¶
func (tj *TensorFlowJob) Name() string
Name returns the TensorflowJob name
func (*TensorFlowJob) Namespace ¶
func (tj *TensorFlowJob) Namespace() string
Namespace returns the namespace of tfjob
func (*TensorFlowJob) RequestedGPU ¶
func (tj *TensorFlowJob) RequestedGPU() int64
Requested GPU count of the Job
func (*TensorFlowJob) StartTime ¶
func (tj *TensorFlowJob) StartTime() *metav1.Time
StartTime returns the start time
func (*TensorFlowJob) Trainer ¶
func (tj *TensorFlowJob) Trainer() types.TrainingJobType
Trainer returns the trainer
type TensorFlowJobTrainer ¶
type TensorFlowJobTrainer struct {
// contains filtered or unexported fields
}
TensorFlow Job trainer
func (*TensorFlowJobTrainer) GetTrainingJob ¶
func (tt *TensorFlowJobTrainer) GetTrainingJob(name, namespace string) (TrainingJob, error)
func (*TensorFlowJobTrainer) IsEnabled ¶
func (tt *TensorFlowJobTrainer) IsEnabled() bool
IsEnabled reports whether the trainer is enabled
func (*TensorFlowJobTrainer) IsSupported ¶
func (tt *TensorFlowJobTrainer) IsSupported(name, namespace string) bool
IsSupported checks whether the job is a TensorFlow job
func (*TensorFlowJobTrainer) ListTrainingJobs ¶
func (tt *TensorFlowJobTrainer) ListTrainingJobs(namespace string, allNamespace bool) ([]TrainingJob, error)
func (*TensorFlowJobTrainer) Type ¶
func (tt *TensorFlowJobTrainer) Type() types.TrainingJobType
type Trainer ¶
type Trainer interface { // IsEnabled is used to check whether the trainer is enabled or not IsEnabled() bool // Check if the training job is supported IsSupported(name, ns string) bool // Get TrainingJob object directly. this method is called when `arena get` GetTrainingJob(name, namespace string) (TrainingJob, error) // Get the type of trainer Type() types.TrainingJobType // List all training jobs ListTrainingJobs(namespace string, allNamespace bool) ([]TrainingJob, error) }
func NewDeepSpeedJobTrainer ¶ added in v0.9.9
func NewDeepSpeedJobTrainer() Trainer
NewDeepSpeedJobTrainer new deepspeed job trainer
func NewSparkJobTrainer ¶
func NewSparkJobTrainer() Trainer
func NewTensorFlowJobTrainer ¶
func NewTensorFlowJobTrainer() Trainer
func NewVolcanoJobTrainer ¶
func NewVolcanoJobTrainer() Trainer
type TrainingJob ¶
type TrainingJob interface { // Get the chief Pod of the Job. ChiefPod() *v1.Pod // Get the name of the Training Job Name() string // Get the unique identity of the Training Job Uid() string // Get the namespace of the Training Job Namespace() string // Get all the pods of the Training Job AllPods() []*v1.Pod // Get all the kubernetes resources of the Training Job Resources() []Resource // Get the Status of the Job: RUNNING, PENDING, SUCCEEDED, FAILED GetStatus() string // Return trainer Type, match the training job type Trainer() types.TrainingJobType // Get the Job Age Age() time.Duration // Get the Job Duration Duration() time.Duration // Get start time StartTime() *metav1.Time // Get Dashboard GetJobDashboards(client *kubernetes.Clientset, namespace, arenaNamespace string) ([]string, error) // Requested GPU count of the Job RequestedGPU() int64 // Allocated GPU count of the Job AllocatedGPU() int64 // the host ip of the chief pod HostIPOfChief() string // The priority class name of the training job GetPriorityClass() string GetTrainJob() interface{} GetLabels() map[string]string }
The Training Job can be TensorFlow, MPI and Caffe
func ListTrainingJobs ¶
func ListTrainingJobs(namespace string, allNamespaces bool, jobType types.TrainingJobType) ([]TrainingJob, error)
func SearchTrainingJob ¶
func SearchTrainingJob(jobName, namespace string, jobType types.TrainingJobType) (TrainingJob, error)
SearchTrainingJob searches for the training job with the given name and training type
type VolcanoJob ¶
type VolcanoJob struct { *BasicJobInfo // contains filtered or unexported fields }
volcano Job wrapper
func (*VolcanoJob) Age ¶
func (vj *VolcanoJob) Age() time.Duration
func (*VolcanoJob) AllocatedGPU ¶
func (vj *VolcanoJob) AllocatedGPU() int64
Volcano jobs do not support GPUs
func (*VolcanoJob) Duration ¶
func (vj *VolcanoJob) Duration() time.Duration
Get the Job Training Duration
func (*VolcanoJob) GetJobDashboards ¶
func (vj *VolcanoJob) GetJobDashboards(client *kubernetes.Clientset, namespace, arenaNamespace string) ([]string, error)
func (*VolcanoJob) GetLabels ¶ added in v0.9.14
func (vj *VolcanoJob) GetLabels() map[string]string
func (*VolcanoJob) GetPriorityClass ¶
func (vj *VolcanoJob) GetPriorityClass() string
Get PriorityClass
func (*VolcanoJob) GetStatus ¶
func (vj *VolcanoJob) GetStatus() (status string)
func (*VolcanoJob) GetTrainJob ¶
func (vj *VolcanoJob) GetTrainJob() interface{}
func (*VolcanoJob) HostIPOfChief ¶
func (vj *VolcanoJob) HostIPOfChief() (hostIP string)
Get the hostIP of the driver Pod
func (*VolcanoJob) Name ¶
func (vj *VolcanoJob) Name() string
func (*VolcanoJob) Namespace ¶
func (vj *VolcanoJob) Namespace() string
func (*VolcanoJob) RequestedGPU ¶
func (vj *VolcanoJob) RequestedGPU() int64
Volcano jobs do not support GPUs
func (*VolcanoJob) StartTime ¶
func (vj *VolcanoJob) StartTime() *metav1.Time
func (*VolcanoJob) Trainer ¶
func (vj *VolcanoJob) Trainer() types.TrainingJobType
return trainerType: volcano job
func (*VolcanoJob) Uid ¶
func (vj *VolcanoJob) Uid() string
type VolcanoJobTrainer ¶
type VolcanoJobTrainer struct {
// contains filtered or unexported fields
}
volcano job trainer
func (*VolcanoJobTrainer) GetTrainingJob ¶
func (st *VolcanoJobTrainer) GetTrainingJob(name, namespace string) (TrainingJob, error)
func (*VolcanoJobTrainer) IsEnabled ¶
func (st *VolcanoJobTrainer) IsEnabled() bool
IsEnabled reports whether the trainer is enabled
func (*VolcanoJobTrainer) IsSupported ¶
func (st *VolcanoJobTrainer) IsSupported(name, ns string) bool
func (*VolcanoJobTrainer) ListTrainingJobs ¶
func (st *VolcanoJobTrainer) ListTrainingJobs(namespace string, allNamespace bool) ([]TrainingJob, error)
func (*VolcanoJobTrainer) Type ¶
func (st *VolcanoJobTrainer) Type() types.TrainingJobType
Source Files ¶
- const.go
- dashboard_helper.go
- delete.go
- get.go
- get_advanced.go
- gpu.go
- list.go
- logs.go
- pod_helper.go
- prune.go
- resource.go
- submit_deepspeedjob.go
- submit_etjob.go
- submit_horovod.go
- submit_mpijob.go
- submit_pytorchjob.go
- submit_rayjob.go
- submit_sparkjob.go
- submit_tfjob.go
- submit_volcanojob.go
- tensorboard.go
- top_job.go
- trainer.go
- trainer_deepspeed.go
- trainer_et.go
- trainer_interface.go
- trainer_mpi.go
- trainer_pytorch.go
- trainer_ray.go
- trainer_spark.go
- trainer_tensorflow.go
- trainer_volcano.go