Documentation ¶
Index ¶
- Constants
- Variables
- func GetJobDashboards(dashboard string, job *v1.Job, pods []corev1.Pod) []string
- func GetJobRealStatus(job TrainingJob) string
- func GetPodEvents(client *kubernetes.Clientset, namespace string, pods []v1.Pod) (map[string][]v1.Event, error)
- func GetPrometheusServiceName(client *kubernetes.Clientset) (name string, ns string)
- func GpuMonitoringInstalled(client *kubernetes.Clientset) bool
- func NewCommand() *cobra.Command
- func NewCompletionCommand() *cobra.Command
- func NewDataCommand() *cobra.Command
- func NewDataListCommand() *cobra.Command
- func NewDeleteCommand() *cobra.Command
- func NewGetCommand() *cobra.Command
- func NewListCommand() *cobra.Command
- func NewLogViewerCommand() *cobra.Command
- func NewLogsCommand() *cobra.Command
- func NewPruneCommand() *cobra.Command
- func NewServeCommand() *cobra.Command
- func NewServingDeleteCommand() *cobra.Command
- func NewServingListCommand() *cobra.Command
- func NewServingTensorFlowCommand() *cobra.Command
- func NewSubmitCommand() *cobra.Command
- func NewSubmitHorovodJobCommand() *cobra.Command
- func NewSubmitMPIJobCommand() *cobra.Command
- func NewSubmitStandaloneJobCommand() *cobra.Command
- func NewSubmitTFJobCommand() *cobra.Command
- func NewTopCommand() *cobra.Command
- func NewTopJobCommand() *cobra.Command
- func NewTopNodeCommand() *cobra.Command
- func NewTrafficRouterSplitCommand() *cobra.Command
- func NewVersionCmd(cliName string) *cobra.Command
- func ParseMountPath(dataset []string) (err error)
- func PrintLine(w io.Writer, fields ...string)
- func SortMapKeys(podMetric PodGpuMetric) []string
- type Destination
- type DestinationRuleCRD
- type DestinationWeight
- type GpuMetric
- type GpuMetricInfo
- type HTTPMatchRequest
- type HTTPRoute
- type HorovodJob
- type HorovodJobTrainer
- type JobGpuMetric
- type JobInfo
- func (ji *JobInfo) Age() time.Duration
- func (ji *JobInfo) AllPods() []v1.Pod
- func (ji *JobInfo) AllocatedGPU() int64
- func (ji *JobInfo) ChiefPod() v1.Pod
- func (ji *JobInfo) GetStatus() (status string)
- func (ji *JobInfo) HostIPOfChief() (hostIP string)
- func (ji *JobInfo) Name() string
- func (ji *JobInfo) RequestedGPU() int64
- func (ji *JobInfo) StartTime() *metav1.Time
- func (ji *JobInfo) Trainer() string
- type MPIJob
- func (mj *MPIJob) Age() time.Duration
- func (mj *MPIJob) AllPods() []v1.Pod
- func (mj *MPIJob) AllocatedGPU() int64
- func (mj *MPIJob) ChiefPod() v1.Pod
- func (mj *MPIJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)
- func (mj *MPIJob) GetStatus() (status string)
- func (mj *MPIJob) HostIPOfChief() (hostIP string)
- func (mj *MPIJob) Name() string
- func (mj *MPIJob) RequestedGPU() int64
- func (mj *MPIJob) StartTime() *metav1.Time
- func (mj *MPIJob) Trainer() string
- type MPIJobTrainer
- type NodeDescriber
- type NodeInfo
- type PodGpuMetric
- type PortSelector
- type PreprocesObject
- type PrintArgs
- type PrometheusMetric
- type PrometheusMetricData
- type PrometheusMetricResult
- type PrometheusMetricValue
- type PruneArgs
- type ServeArgs
- type ServeTensorFlowArgs
- type StandaloneJob
- type StandaloneJobTrainer
- type StringMatchPrefix
- type TensorFlowJob
- func (tj *TensorFlowJob) Age() time.Duration
- func (tj *TensorFlowJob) AllPods() []v1.Pod
- func (tj *TensorFlowJob) AllocatedGPU() int64
- func (tj *TensorFlowJob) ChiefPod() v1.Pod
- func (tj *TensorFlowJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)
- func (tj *TensorFlowJob) GetStatus() (status string)
- func (tj *TensorFlowJob) HostIPOfChief() (hostIP string)
- func (tj *TensorFlowJob) Name() string
- func (tj *TensorFlowJob) RequestedGPU() int64
- func (tj *TensorFlowJob) StartTime() *metav1.Time
- func (tj *TensorFlowJob) Trainer() string
- type TensorFlowJobTrainer
- type Trainer
- func NewHorovodJobTrainer(client *kubernetes.Clientset) Trainer
- func NewMPIJobTrainer(client *kubernetes.Clientset) Trainer
- func NewStandaloneJobTrainer(client *kubernetes.Clientset) Trainer
- func NewTensorFlowJobTrainer(client *kubernetes.Clientset) Trainer
- func NewTrainers(client *kubernetes.Clientset) []Trainer
- type TrainingJob
- type VirtualService
- type VirtualServiceCRD
Constants ¶
const ( CHART_PKG_LOC = "CHARTREPO" // GPUResourceName is the extended name of the GPU resource since v1.8 // this uses the device plugin mechanism NVIDIAGPUResourceName = "nvidia.com/gpu" DeprecatedNVIDIAGPUResourceName = "alpha.kubernetes.io/nvidia-gpu" )
const (
// CLIName is the name of the CLI
CLIName = "arena"
)
const KUBEFLOW_NAMESPACE = "kubeflow"
const KUBE_SYSTEM_NAMESPACE = "kube-system"
const POD_METRIC_TMP = `{__name__=~"%s", pod_name=~"%s"}`
const PROMETHEUS_INSTALL_DOC_URL = "https://github.com/kubeflow/arena/blob/master/docs/userguide/9-top-job-gpu-metric.md"
const PROMETHEUS_SCHEME = "http"
const PROMETHEUS_SVC_LABEL = "kubernetes.io/name=Prometheus"
Variables ¶
var GPU_METRIC_LIST = []string{"nvidia_gpu_duty_cycle", "nvidia_gpu_memory_used_bytes", "nvidia_gpu_memory_total_bytes"}
Functions ¶
func GetJobDashboards ¶
func GetJobRealStatus ¶
func GetJobRealStatus(job TrainingJob) string
Get the real job status. When some pods are still pending, the tfJob still shows as Running, but its real status should be Pending.
func GetPodEvents ¶
func GetPodEvents(client *kubernetes.Clientset, namespace string, pods []v1.Pod) (map[string][]v1.Event, error)
Get Event of the Job
func GetPrometheusServiceName ¶
func GetPrometheusServiceName(client *kubernetes.Clientset) (name string, ns string)
Get the Prometheus service name and namespace, looking in different namespaces.
func GpuMonitoringInstalled ¶
func GpuMonitoringInstalled(client *kubernetes.Clientset) bool
func NewCommand ¶
NewCommand returns a new instance of an Arena command
func NewCompletionCommand ¶
func NewListCommand ¶
func NewLogViewerCommand ¶
func NewLogsCommand ¶
func NewPruneCommand ¶
func NewServeCommand ¶
func NewServingListCommand ¶
func NewSubmitCommand ¶
func NewSubmitHorovodJobCommand ¶
NewSubmitHorovodJobCommand returns the command for submitting a Horovod job.
func NewSubmitMPIJobCommand ¶
func NewSubmitTFJobCommand ¶
func NewTopCommand ¶
func NewTopJobCommand ¶
func NewTopNodeCommand ¶
func NewVersionCmd ¶
func ParseMountPath ¶
func SortMapKeys ¶
func SortMapKeys(podMetric PodGpuMetric) []string
Types ¶
type Destination ¶
type Destination struct { *istiov1alpha3.Destination Port *PortSelector `protobuf:"bytes,3,opt,name=port" json:"port,omitempty"` }
type DestinationRuleCRD ¶
type DestinationRuleCRD struct { // Kind is a string value representing the REST resource this object represents. // Servers may infer this from the endpoint the client submits requests to. // Cannot be updated. // In CamelCase. // More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds // +optional Kind string `json:"kind,omitempty" protobuf:"bytes,1,opt,name=kind"` // APIVersion defines the versioned schema of this representation of an object. // Servers should convert recognized schemas to the latest internal value, and // may reject unrecognized values. // More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources // +optional APIVersion string `json:"apiVersion,omitempty" protobuf:"bytes,2,opt,name=apiVersion"` metav1.ObjectMeta `json:"metadata,omitempty" yaml:"metadata,omitempty" protobuf:"bytes,1,opt,name=metadata"` Spec istiov1alpha3.DestinationRule `json:"spec,omitempty" yaml:"spec,omitempty" protobuf:"bytes,2,opt,name=spec"` }
type DestinationWeight ¶
type DestinationWeight struct { Destination *Destination `protobuf:"bytes,1,opt,name=destination" json:"destination,omitempty"` Weight int32 `protobuf:"varint,2,opt,name=weight,proto3" json:"weight"` }
type GpuMetricInfo ¶
type GpuMetricInfo struct { MetricName string Value string Time float64 PodName string PodNamespace string ContainerName string NodeName string GPUUID string Id string }
func QueryMetricByPrometheus ¶
func QueryMetricByPrometheus(client *kubernetes.Clientset, prometheusServiceName string, namespace string, query string) ([]GpuMetricInfo, error)
type HTTPMatchRequest ¶
type HTTPMatchRequest struct { *istiov1alpha3.HTTPMatchRequest Uri *StringMatchPrefix `protobuf:"bytes,1,opt,name=uri" json:"uri,omitempty"` }
type HTTPRoute ¶
type HTTPRoute struct { *istiov1alpha3.HTTPRoute Match []*HTTPMatchRequest `protobuf:"bytes,1,rep,name=match" json:"match,omitempty"` Route []*DestinationWeight `protobuf:"bytes,2,rep,name=route" json:"route,omitempty"` }
type HorovodJob ¶
type HorovodJob struct {
*JobInfo
}
Horovod Job Information
func (*HorovodJob) AllPods ¶
func (hj *HorovodJob) AllPods() []v1.Pod
Get all the pods of the Training Job
func (*HorovodJob) GetJobDashboards ¶
func (hj *HorovodJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)
Get Dashboard url of the job
func (*HorovodJob) HostIPOfChief ¶
func (hj *HorovodJob) HostIPOfChief() (hostIP string)
Get the hostIP of the chief Pod
type HorovodJobTrainer ¶
type HorovodJobTrainer struct {
// contains filtered or unexported fields
}
Horovod Job trainer
func (*HorovodJobTrainer) GetTrainingJob ¶
func (m *HorovodJobTrainer) GetTrainingJob(name, namespace string) (tj TrainingJob, err error)
func (*HorovodJobTrainer) IsSupported ¶
func (m *HorovodJobTrainer) IsSupported(name, ns string) bool
check if it's Horovod job
func (*HorovodJobTrainer) ListTrainingJobs ¶ added in v0.2.0
func (hj *HorovodJobTrainer) ListTrainingJobs() (jobs []TrainingJob, err error)
List Training jobs
func (*HorovodJobTrainer) Type ¶
func (m *HorovodJobTrainer) Type() string
type JobGpuMetric ¶
type JobGpuMetric map[string]PodGpuMetric
func GetJobGpuMetric ¶
func GetJobGpuMetric(client *kubernetes.Clientset, job TrainingJob) (jobMetric JobGpuMetric, err error)
func GetPodsGpuInfo ¶
func GetPodsGpuInfo(client *kubernetes.Clientset, prometheusServiceName string, namespace string, podNames []string) (JobGpuMetric, error)
func (JobGpuMetric) GetPodMetrics ¶
func (m JobGpuMetric) GetPodMetrics(podName string) PodGpuMetric
func (*JobGpuMetric) SetPodMetric ¶
func (m *JobGpuMetric) SetPodMetric(metric GpuMetricInfo)
type JobInfo ¶
type JobInfo struct {
// contains filtered or unexported fields
}
func (*JobInfo) AllocatedGPU ¶
Allocated GPU count of the Job
func (*JobInfo) HostIPOfChief ¶
Get the hostIP of the chief Pod
func (*JobInfo) RequestedGPU ¶
Requested GPU count of the Job
type MPIJob ¶
type MPIJob struct {
// contains filtered or unexported fields
}
MPI Job Information
func (*MPIJob) GetJobDashboards ¶
func (mj *MPIJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)
Get Dashboard url of the job
func (*MPIJob) HostIPOfChief ¶
Get the hostIP of the chief Pod
type MPIJobTrainer ¶
type MPIJobTrainer struct {
// contains filtered or unexported fields
}
MPI Job trainer
func (*MPIJobTrainer) GetTrainingJob ¶
func (tt *MPIJobTrainer) GetTrainingJob(name, namespace string) (tj TrainingJob, err error)
Get the training job from cache or directly
func (*MPIJobTrainer) IsSupported ¶
func (tt *MPIJobTrainer) IsSupported(name, ns string) bool
check if it's MPI job
func (*MPIJobTrainer) ListTrainingJobs ¶ added in v0.2.0
func (tt *MPIJobTrainer) ListTrainingJobs() (jobs []TrainingJob, err error)
List Training jobs
type NodeDescriber ¶
type NodeDescriber struct {
// contains filtered or unexported fields
}
type PodGpuMetric ¶
type PortSelector ¶
type PortSelector struct { *istiov1alpha3.PortSelector Number uint32 `protobuf:"varint,1,opt,name=number,proto3,oneof" json:"number,omitempty"` }
type PreprocesObject ¶
type PreprocesObject struct { ServiceName string Namespace string DestinationRule DestinationRuleCRD VirtualService VirtualServiceCRD }
type PrometheusMetric ¶
type PrometheusMetric struct { Status string `json:"status,inline"` Data PrometheusMetricData `json:"data,omitempty"` }
type PrometheusMetricData ¶
type PrometheusMetricData struct { Result []PrometheusMetricResult `json:"result"` ResultType string `json:"resultType"` }
type PrometheusMetricResult ¶
type PrometheusMetricResult struct { Metric map[string]string `json:"metric"` Value []PrometheusMetricValue `json:"value"` }
type PrometheusMetricValue ¶
type PrometheusMetricValue interface{}
type ServeArgs ¶
type ServeArgs struct { Image string `yaml:"image"` // --image ImagePullPolicy string `yaml:"imagePullPolicy"` // --imagePullPolicy GPUCount int `yaml:"gpuCount"` // --gpus Cpu string `yaml:"cpu"` // --cpu Memory string `yaml:"memory"` // --memory Envs map[string]string `yaml:"envs"` // --envs Command string `yaml:"command"` // --command Replicas int `yaml:"replicas"` // --replicas Port int `yaml:"port"` // --port RestfulPort int `yaml:"rest_api_port"` // --restfulPort ModelName string `yaml:"modelName"` // --modelName ModelPath string `yaml:"modelPath"` // --modelPath EnableIstio bool `yaml:"enableIstio"` // --enableIstio ExposeService bool `yaml:"exposeService"` // --exposeService ServingName string `yaml:"servingName"` // --servingName ServingVersion string `yaml:"servingVersion"` // --servingVersion ModelDirs map[string]string `yaml:"modelDirs"` }
type ServeTensorFlowArgs ¶
type ServeTensorFlowArgs struct { VersionPolicy string `yaml:"versionPolicy"` // --versionPolicy ModelConfigFile string `yaml:"modelConfigFile"` // --modelConfigFile ModelConfigFileContent string `yaml:"modelConfigFileContent"` ServeArgs `yaml:",inline"` ModelServiceExists bool `yaml:"modelServiceExists"` // --modelServiceExists }
type StandaloneJob ¶
type StandaloneJob struct {
*JobInfo
}
Standalone Job Information
func (*StandaloneJob) GetJobDashboards ¶
func (sj *StandaloneJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)
Get Dashboard url of the job
type StandaloneJobTrainer ¶
type StandaloneJobTrainer struct {
// contains filtered or unexported fields
}
Standalone Job trainer
func (*StandaloneJobTrainer) GetTrainingJob ¶
func (s *StandaloneJobTrainer) GetTrainingJob(name, namespace string) (tj TrainingJob, err error)
func (*StandaloneJobTrainer) IsSupported ¶
func (s *StandaloneJobTrainer) IsSupported(name, ns string) bool
check if it's Standalone job
func (*StandaloneJobTrainer) ListTrainingJobs ¶ added in v0.2.0
func (s *StandaloneJobTrainer) ListTrainingJobs() (jobs []TrainingJob, err error)
List Training jobs
func (*StandaloneJobTrainer) Type ¶
func (s *StandaloneJobTrainer) Type() string
type StringMatchPrefix ¶
type StringMatchPrefix struct {
Prefix string `protobuf:"bytes,2,opt,name=prefix,proto3,oneof" json:"prefix,omitempty"`
}
type TensorFlowJob ¶
type TensorFlowJob struct {
// contains filtered or unexported fields
}
TensorFlow Job Information
func (*TensorFlowJob) AllPods ¶
func (tj *TensorFlowJob) AllPods() []v1.Pod
Get all the pods of the Training Job
func (*TensorFlowJob) AllocatedGPU ¶
func (tj *TensorFlowJob) AllocatedGPU() int64
Allocated GPU count of the Job
func (*TensorFlowJob) ChiefPod ¶
func (tj *TensorFlowJob) ChiefPod() v1.Pod
Get the chief Pod of the Job.
func (*TensorFlowJob) GetJobDashboards ¶
func (tj *TensorFlowJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)
Get Dashboard url of the job
func (*TensorFlowJob) GetStatus ¶
func (tj *TensorFlowJob) GetStatus() (status string)
Get the Status of the Job: RUNNING, PENDING, SUCCEEDED, FAILED
func (*TensorFlowJob) HostIPOfChief ¶
func (tj *TensorFlowJob) HostIPOfChief() (hostIP string)
Get the hostIP of the chief Pod
func (*TensorFlowJob) Name ¶
func (tj *TensorFlowJob) Name() string
func (*TensorFlowJob) RequestedGPU ¶
func (tj *TensorFlowJob) RequestedGPU() int64
Requested GPU count of the Job
func (*TensorFlowJob) StartTime ¶
func (tj *TensorFlowJob) StartTime() *metav1.Time
func (*TensorFlowJob) Trainer ¶
func (tj *TensorFlowJob) Trainer() string
type TensorFlowJobTrainer ¶
type TensorFlowJobTrainer struct {
// contains filtered or unexported fields
}
TensorFlow Job trainer
func (*TensorFlowJobTrainer) GetTrainingJob ¶
func (tt *TensorFlowJobTrainer) GetTrainingJob(name, namespace string) (tj TrainingJob, err error)
func (*TensorFlowJobTrainer) IsSupported ¶
func (tt *TensorFlowJobTrainer) IsSupported(name, ns string) bool
check if it's TensorFlow job
func (*TensorFlowJobTrainer) ListTrainingJobs ¶ added in v0.2.0
func (tt *TensorFlowJobTrainer) ListTrainingJobs() (jobs []TrainingJob, err error)
List Training jobs
func (*TensorFlowJobTrainer) Type ¶
func (tt *TensorFlowJobTrainer) Type() string
type Trainer ¶
type Trainer interface { // Check if the training job is supported IsSupported(name, ns string) bool // Get TrainingJob object directly. this method is called when `arena get` GetTrainingJob(name, namespace string) (TrainingJob, error) // Get the type of trainer Type() string ListTrainingJobs() ([]TrainingJob, error) }
func NewHorovodJobTrainer ¶
func NewHorovodJobTrainer(client *kubernetes.Clientset) Trainer
Create HorovodJob Trainer
func NewMPIJobTrainer ¶
func NewMPIJobTrainer(client *kubernetes.Clientset) Trainer
Create MPIJob Trainer
func NewStandaloneJobTrainer ¶
func NewStandaloneJobTrainer(client *kubernetes.Clientset) Trainer
func NewTensorFlowJobTrainer ¶
func NewTensorFlowJobTrainer(client *kubernetes.Clientset) Trainer
func NewTrainers ¶
func NewTrainers(client *kubernetes.Clientset) []Trainer
construct the trainer list
type TrainingJob ¶
type TrainingJob interface { // Get the chief Pod of the Job. ChiefPod() v1.Pod // Get the name of the Training Job Name() string // Get all the pods of the Training Job AllPods() []v1.Pod // Get the Status of the Job: RUNNING, PENDING, GetStatus() string // Return trainer Type, support MPI, standalone, tensorflow Trainer() string // Get the Job Age Age() time.Duration // Get start time StartTime() *metav1.Time // Get Dashboard GetJobDashboards(client *kubernetes.Clientset) ([]string, error) // Requested GPU count of the Job RequestedGPU() int64 // Requested GPU count of the Job AllocatedGPU() int64 // the host ip of the chief pod HostIPOfChief() string }
The Training Job can be TensorFlow, MPI and Caffe
type VirtualService ¶
type VirtualService struct { *istiov1alpha3.VirtualService Http []*HTTPRoute `protobuf:"bytes,3,rep,name=http" json:"http,omitempty"` }
type VirtualServiceCRD ¶
type VirtualServiceCRD struct { // Kind is a string value representing the REST resource this object represents. // Servers may infer this from the endpoint the client submits requests to. // Cannot be updated. // In CamelCase. // More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds // +optional Kind string `json:"kind,omitempty" protobuf:"bytes,1,opt,name=kind"` // APIVersion defines the versioned schema of this representation of an object. // Servers should convert recognized schemas to the latest internal value, and // may reject unrecognized values. // More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources // +optional APIVersion string `json:"apiVersion,omitempty" protobuf:"bytes,2,opt,name=apiVersion"` metav1.ObjectMeta `json:"metadata,omitempty" yaml:"metadata,omitempty" protobuf:"bytes,1,opt,name=metadata"` Spec VirtualService `json:"spec,omitempty" yaml:"spec,omitempty" protobuf:"bytes,2,opt,name=spec"` }
Source Files ¶
- common.go
- completion.go
- completion_bash.go
- const.go
- dashboard_helper.go
- data.go
- data_info.go
- data_list.go
- delete.go
- get.go
- get_advanced.go
- gpu.go
- gpu_info.go
- job_info.go
- list.go
- logs.go
- logviewer.go
- pod_helper.go
- prune.go
- root.go
- serve.go
- serve_delete.go
- serve_list.go
- serve_tensorflow.go
- submit.go
- submit_horovod.go
- submit_mpi.go
- submit_standalone.go
- submit_tfjob.go
- sync_code.go
- tensorboard.go
- top.go
- top_job.go
- top_node.go
- traffic_router_split.go
- trainer.go
- trainer_horovod.go
- trainer_interface.go
- trainer_mpi.go
- trainer_standalone.go
- trainer_tensorflow.go
- version.go