Documentation
¶
Index ¶
- Constants
- func DeleteTrainingJob(jobName string) error
- func GetJobDashboards(dashboard string, job *v1.Job, pods []corev1.Pod) []string
- func NewCommand() *cobra.Command
- func NewCompletionCommand() *cobra.Command
- func NewDeleteCommand() *cobra.Command
- func NewGetCommand() *cobra.Command
- func NewListCommand() *cobra.Command
- func NewLogViewerCommand() *cobra.Command
- func NewLogsCommand() *cobra.Command
- func NewSubmitCommand() *cobra.Command
- func NewSubmitHorovodJobCommand() *cobra.Command
- func NewSubmitStandaloneJobCommand() *cobra.Command
- func NewSubmitTFJobCommand() *cobra.Command
- func NewTopCommand() *cobra.Command
- func NewTopJobCommand() *cobra.Command
- func NewTopNodeCommand() *cobra.Command
- func NewVersionCmd(cliName string) *cobra.Command
- type HorovodJob
- type HorovodJobTrainer
- type JobInfo
- func (ji *JobInfo) Age() string
- func (ji *JobInfo) AllPods() []v1.Pod
- func (ji *JobInfo) AllocatedGPU() int64
- func (ji *JobInfo) ChiefPod() v1.Pod
- func (ji *JobInfo) GetStatus() (status string)
- func (ji *JobInfo) HostIPOfChief() (hostIP string)
- func (ji *JobInfo) Name() string
- func (ji *JobInfo) RequestedGPU() int64
- func (ji *JobInfo) StartTime() *metav1.Time
- func (ji *JobInfo) Trainer() string
- type NodeDescriber
- type NodeInfo
- type StandaloneJob
- type StandaloneJobTrainer
- type TensorFlowJob
- func (tj *TensorFlowJob) Age() string
- func (tj *TensorFlowJob) AllPods() []v1.Pod
- func (tj *TensorFlowJob) AllocatedGPU() int64
- func (tj *TensorFlowJob) ChiefPod() v1.Pod
- func (tj *TensorFlowJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)
- func (tj *TensorFlowJob) GetStatus() (status string)
- func (tj *TensorFlowJob) HostIPOfChief() (hostIP string)
- func (tj *TensorFlowJob) Name() string
- func (tj *TensorFlowJob) RequestedGPU() int64
- func (tj *TensorFlowJob) StartTime() *metav1.Time
- func (tj *TensorFlowJob) Trainer() string
- type TensorFlowJobTrainer
- type Trainer
- type TrainingJob
Constants ¶
const (
	CHART_PKG_LOC = "CHARTREPO"
	// NVIDIAGPUResourceName is the extended name of the GPU resource since v1.8;
	// this uses the device plugin mechanism.
	NVIDIAGPUResourceName = "nvidia.com/gpu"
	DeprecatedNVIDIAGPUResourceName = "alpha.kubernetes.io/nvidia-gpu"
)
const (
// CLIName is the name of the CLI
CLIName = "arena"
)
Variables ¶
This section is empty.
Functions ¶
func DeleteTrainingJob ¶
func GetJobDashboards ¶
func NewCommand ¶
NewCommand returns a new instance of an Arena command
func NewCompletionCommand ¶
func NewDeleteCommand ¶
func NewGetCommand ¶
func NewListCommand ¶
func NewLogViewerCommand ¶
func NewLogsCommand ¶
func NewSubmitCommand ¶
func NewSubmitTFJobCommand ¶
func NewTopCommand ¶
func NewTopJobCommand ¶
func NewTopNodeCommand ¶
func NewVersionCmd ¶
Types ¶
type HorovodJob ¶
type HorovodJob struct {
*JobInfo
}
Horovod Job Information
func (*HorovodJob) AllPods ¶
func (hj *HorovodJob) AllPods() []v1.Pod
Get all the pods of the Training Job
func (*HorovodJob) GetJobDashboards ¶
func (hj *HorovodJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)
Get Dashboard url of the job
func (*HorovodJob) HostIPOfChief ¶
func (hj *HorovodJob) HostIPOfChief() (hostIP string)
Get the hostIP of the chief Pod
type HorovodJobTrainer ¶
type HorovodJobTrainer struct {
// contains filtered or unexported fields
}
Horovod Job trainer
func (*HorovodJobTrainer) GetTrainingJob ¶
func (m *HorovodJobTrainer) GetTrainingJob(name, namespace string) (tj TrainingJob, err error)
func (*HorovodJobTrainer) IsSupported ¶
func (m *HorovodJobTrainer) IsSupported(name, ns string) bool
Check whether it is a Horovod job.
func (*HorovodJobTrainer) Type ¶
func (m *HorovodJobTrainer) Type() string
type JobInfo ¶
type JobInfo struct {
// contains filtered or unexported fields
}
func (*JobInfo) AllocatedGPU ¶
Allocated GPU count of the Job
func (*JobInfo) HostIPOfChief ¶
Get the hostIP of the chief Pod
func (*JobInfo) RequestedGPU ¶
Requested GPU count of the Job
type NodeDescriber ¶
type NodeDescriber struct {
// contains filtered or unexported fields
}
type StandaloneJob ¶
type StandaloneJob struct {
*JobInfo
}
Standalone Job Information
func (*StandaloneJob) GetJobDashboards ¶
func (sj *StandaloneJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)
Get Dashboard url of the job
type StandaloneJobTrainer ¶
type StandaloneJobTrainer struct {
// contains filtered or unexported fields
}
Standalone Job trainer
func (*StandaloneJobTrainer) GetTrainingJob ¶
func (s *StandaloneJobTrainer) GetTrainingJob(name, namespace string) (tj TrainingJob, err error)
func (*StandaloneJobTrainer) IsSupported ¶
func (s *StandaloneJobTrainer) IsSupported(name, ns string) bool
Check whether it is a Standalone job.
func (*StandaloneJobTrainer) Type ¶
func (s *StandaloneJobTrainer) Type() string
type TensorFlowJob ¶
type TensorFlowJob struct {
// contains filtered or unexported fields
}
TensorFlow Job Information
func (*TensorFlowJob) AllPods ¶
func (tj *TensorFlowJob) AllPods() []v1.Pod
Get all the pods of the Training Job
func (*TensorFlowJob) AllocatedGPU ¶
func (tj *TensorFlowJob) AllocatedGPU() int64
Allocated GPU count of the Job
func (*TensorFlowJob) ChiefPod ¶
func (tj *TensorFlowJob) ChiefPod() v1.Pod
Get the chief Pod of the Job.
func (*TensorFlowJob) GetJobDashboards ¶
func (tj *TensorFlowJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)
Get Dashboard url of the job
func (*TensorFlowJob) GetStatus ¶
func (tj *TensorFlowJob) GetStatus() (status string)
Get the Status of the Job: RUNNING, PENDING, SUCCEEDED, FAILED
func (*TensorFlowJob) HostIPOfChief ¶
func (tj *TensorFlowJob) HostIPOfChief() (hostIP string)
Get the hostIP of the chief Pod
func (*TensorFlowJob) Name ¶
func (tj *TensorFlowJob) Name() string
func (*TensorFlowJob) RequestedGPU ¶
func (tj *TensorFlowJob) RequestedGPU() int64
Requested GPU count of the Job
func (*TensorFlowJob) StartTime ¶
func (tj *TensorFlowJob) StartTime() *metav1.Time
func (*TensorFlowJob) Trainer ¶
func (tj *TensorFlowJob) Trainer() string
type TensorFlowJobTrainer ¶
type TensorFlowJobTrainer struct {
// contains filtered or unexported fields
}
TensorFlow Job trainer
func (*TensorFlowJobTrainer) GetTrainingJob ¶
func (tt *TensorFlowJobTrainer) GetTrainingJob(name, namespace string) (tj TrainingJob, err error)
func (*TensorFlowJobTrainer) IsSupported ¶
func (tt *TensorFlowJobTrainer) IsSupported(name, ns string) bool
Check whether it is a TensorFlow job.
func (*TensorFlowJobTrainer) Type ¶
func (tt *TensorFlowJobTrainer) Type() string
type Trainer ¶
type Trainer interface {
	// Check if the training job is supported
	IsSupported(name, ns string) bool
	// Get the TrainingJob object directly. This method is called by `arena get`.
	GetTrainingJob(name, namespace string) (TrainingJob, error)
	// Get the type of trainer
	Type() string
}
func NewHorovodJobTrainer ¶
func NewHorovodJobTrainer(client *kubernetes.Clientset) Trainer
func NewStandaloneJobTrainer ¶
func NewStandaloneJobTrainer(client *kubernetes.Clientset) Trainer
func NewTensorFlowJobTrainer ¶
func NewTensorFlowJobTrainer(client *kubernetes.Clientset) Trainer
func NewTrainers ¶
func NewTrainers(client *kubernetes.Clientset) []Trainer
construct the trainer list
type TrainingJob ¶
type TrainingJob interface {
	// Get the chief Pod of the Job.
	ChiefPod() v1.Pod
	// Get the name of the Training Job
	Name() string
	// Get all the pods of the Training Job
	AllPods() []v1.Pod
	// Get the Status of the Job: RUNNING, PENDING, SUCCEEDED, FAILED
	GetStatus() string
	// Return trainer Type, support MPI, standalone, tensorflow
	Trainer() string
	// Get the Job Age
	Age() string
	// Get start time
	StartTime() *metav1.Time
	// Get Dashboard
	GetJobDashboards(client *kubernetes.Clientset) ([]string, error)
	// Requested GPU count of the Job
	RequestedGPU() int64
	// Allocated GPU count of the Job
	AllocatedGPU() int64
	// the host ip of the chief pod
	HostIPOfChief() string
}
The Training Job can be TensorFlow, MPI and Caffe
Source Files
¶
- common.go
- completion.go
- const.go
- dashboard_helper.go
- delete.go
- get.go
- gpu.go
- job_info.go
- list.go
- logs.go
- logviewer.go
- pod_helper.go
- root.go
- submit.go
- submit_horovod.go
- submit_standalone.go
- submit_tfjob.go
- sync_code.go
- tensorboard.go
- top.go
- top_job.go
- top_node.go
- trainer.go
- trainer_horovod.go
- trainer_interface.go
- trainer_standalone.go
- trainer_tensorflow.go
- version.go