Documentation
¶
Overview ¶
Copyright 2018 The Kubeflow Authors
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
Index ¶
- Constants
- Variables
- func BuildJobInfo(job TrainingJob) *types.JobInfo
- func GetJobDashboards(dashboard string, job *v1.Job, pods []corev1.Pod) []string
- func GetJobRealStatus(job TrainingJob) string
- func GetNamespace() string
- func GetPrometheusServiceName(client *kubernetes.Clientset) (name string, ns string)
- func GetResourcesEvents(client *kubernetes.Clientset, namespace string, resources []Resource) (map[string][]v1.Event, error)
- func GpuMonitoringInstalled(client *kubernetes.Clientset) bool
- func ListServing(client *kubernetes.Clientset) ([]servejob.Serving, error)
- func ListServingJobsByHelm() ([]servejob.Serving, error)
- func ListServingsByName(client *kubernetes.Clientset, name string) (servings []servejob.Serving, err error)
- func NewBashCommand() *cobra.Command
- func NewCommand() *cobra.Command
- func NewCompletionCommand() *cobra.Command
- func NewDataCommand() *cobra.Command
- func NewDataListCommand() *cobra.Command
- func NewDeleteCommand() *cobra.Command
- func NewExecCommand() *cobra.Command
- func NewGetCommand() *cobra.Command
- func NewListCommand() *cobra.Command
- func NewLogViewerCommand() *cobra.Command
- func NewLogsCommand() *cobra.Command
- func NewPruneCommand() *cobra.Command
- func NewRunaiJobCommand() *cobra.Command
- func NewServingDeleteCommand() *cobra.Command
- func NewServingGetCommand() *cobra.Command
- func NewServingListCommand() *cobra.Command
- func NewServingLogCommand() *cobra.Command
- func NewSubmitCommand() *cobra.Command
- func NewSubmitRunaiJobArgs() *submitRunaiJobArgs
- func NewTemplateCommand() *cobra.Command
- func NewTemplateGetCommand() *cobra.Command
- func NewTemplateListCommand() *cobra.Command
- func NewTopCommand() *cobra.Command
- func NewTopJobCommand() *cobra.Command
- func NewTopNodeCommand() *cobra.Command
- func NewUpdateCommand() *cobra.Command
- func NewVersionCmd() *cobra.Command
- func PrintLine(w io.Writer, fields ...string)
- func PrintTemplates(configs []clusterConfig.ClusterConfig)
- func SortMapKeys(podMetric PodGpuMetric) []string
- type Asset
- type BasicJobInfo
- type GithubResponse
- type GpuMetric
- type GpuMetricInfo
- type JobGpuMetric
- type JobInfo
- func (ji *JobInfo) Age() time.Duration
- func (ji *JobInfo) AllPods() []v1.Pod
- func (ji *JobInfo) AllocatedGPU() int64
- func (ji *JobInfo) ChiefPod() v1.Pod
- func (ji *JobInfo) Duration() time.Duration
- func (ji *JobInfo) GetStatus() (status string)
- func (ji *JobInfo) HostIPOfChief() (hostIP string)
- func (ji *JobInfo) Name() string
- func (ji *JobInfo) Namespace() string
- func (ji *JobInfo) RequestedGPU() int64
- func (ji *JobInfo) StartTime() *metav1.Time
- func (ji *JobInfo) Trainer() string
- func (ji *JobInfo) Uid() string
- type NodeDescriber
- type NodeInfo
- type PodGpuMetric
- type PrintArgs
- type PrometheusMetric
- type PrometheusMetricData
- type PrometheusMetricResult
- type PrometheusMetricValue
- type PruneArgs
- type Resource
- type ResourceType
- type RunaiJob
- func (rj *RunaiJob) Age() time.Duration
- func (rj *RunaiJob) AllPods() []v1.Pod
- func (rj *RunaiJob) AllocatedGPU() int64
- func (rj *RunaiJob) ChiefPod() *v1.Pod
- func (rj *RunaiJob) CreatedByCLI() bool
- func (rj *RunaiJob) Duration() time.Duration
- func (rj *RunaiJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)
- func (rj *RunaiJob) GetPriorityClass() string
- func (rj *RunaiJob) GetStatus() string
- func (rj *RunaiJob) HostIPOfChief() string
- func (rj *RunaiJob) Image() string
- func (rj *RunaiJob) Interactive() string
- func (rj *RunaiJob) Name() string
- func (rj *RunaiJob) Namespace() string
- func (rj *RunaiJob) Project() string
- func (rj *RunaiJob) RequestedGPU() int64
- func (rj *RunaiJob) Resources() []Resource
- func (rj *RunaiJob) ServiceURLs() []string
- func (rj *RunaiJob) StartTime() *metav1.Time
- func (rj *RunaiJob) Trainer() string
- func (rj *RunaiJob) User() string
- type RunaiJobInfo
- type RunaiOwnerInfo
- type RunaiTrainer
- type SortPodConditionByLastTransitionTime
- type Trainer
- type TrainingJob
Constants ¶
const ( RecommendedConfigPathEnvVar = "ARENA_CONFIG" DefaultArenaConfigPath = "~/.arena/config" )
const ( CHART_PKG_LOC = "CHARTREPO" // GPUResourceName is the extended name of the GPU resource since v1.8 // this uses the device plugin mechanism NVIDIAGPUResourceName = "nvidia.com/gpu" ALIYUNGPUResourceName = "aliyun.com/gpu-mem" DeprecatedNVIDIAGPUResourceName = "alpha.kubernetes.io/nvidia-gpu" )
const KUBEFLOW_NAMESPACE = "kubeflow"
const KUBE_SYSTEM_NAMESPACE = "kube-system"
const POD_METRIC_TMP = `{__name__=~"%s", pod_name=~"%s"}`
const PROMETHEUS_INSTALL_DOC_URL = "https://github.com/kubeflow/arena/blob/master/docs/userguide/9-top-job-gpu-metric.md"
const PROMETHEUS_SCHEME = "http"
const PROMETHEUS_SVC_LABEL = "kubernetes.io/name=Prometheus"
const ResourceTypeJob = ResourceType("Job")
const ResourceTypePod = ResourceType("Pod")
const ResourceTypeReplicaset = ResourceType("ReplicaSet")
const ResourceTypeStatefulSet = ResourceType("StatefulSet")
Variables ¶
var GPU_METRIC_LIST = []string{"nvidia_gpu_duty_cycle", "nvidia_gpu_memory_used_bytes", "nvidia_gpu_memory_total_bytes"}
Functions ¶
func BuildJobInfo ¶ added in v0.2.0
func BuildJobInfo(job TrainingJob) *types.JobInfo
BuildJobInfo returns the types.JobInfo for the given training job.
func GetJobDashboards ¶
func GetJobRealStatus ¶
func GetJobRealStatus(job TrainingJob) string
Get the real job status. When the job has pods that are still pending, the tfJob still shows a Running state, but it should be reported as Pending.
func GetNamespace ¶ added in v0.2.0
func GetNamespace() string
func GetPrometheusServiceName ¶
func GetPrometheusServiceName(client *kubernetes.Clientset) (name string, ns string)
Get the Prometheus service name and its namespace by looking in different namespaces.
func GetResourcesEvents ¶ added in v0.3.0
func GetResourcesEvents(client *kubernetes.Clientset, namespace string, resources []Resource) (map[string][]v1.Event, error)
Get the events of the Job's resources.
func GpuMonitoringInstalled ¶
func GpuMonitoringInstalled(client *kubernetes.Clientset) bool
func ListServing ¶ added in v0.2.0
func ListServing(client *kubernetes.Clientset) ([]servejob.Serving, error)
ListServing returns a list of serving
func ListServingJobsByHelm ¶ added in v0.2.0
func ListServingsByName ¶ added in v0.3.0
func ListServingsByName(client *kubernetes.Clientset, name string) (servings []servejob.Serving, err error)
List Servings by name
func NewBashCommand ¶ added in v1.1.0
func NewCommand ¶
NewCommand returns a new instance of an Arena command
func NewCompletionCommand ¶
func NewExecCommand ¶ added in v1.1.0
func NewListCommand ¶
func NewLogViewerCommand ¶
func NewLogsCommand ¶
func NewPruneCommand ¶
func NewRunaiJobCommand ¶ added in v1.0.0
func NewServingGetCommand ¶ added in v0.3.0
NewServingGetCommand starts the command
func NewServingListCommand ¶
func NewServingLogCommand ¶ added in v1.0.0
func NewSubmitCommand ¶
func NewSubmitRunaiJobArgs ¶ added in v1.0.0
func NewSubmitRunaiJobArgs() *submitRunaiJobArgs
func NewTemplateCommand ¶ added in v1.1.4
func NewTemplateGetCommand ¶ added in v1.1.4
func NewTemplateListCommand ¶ added in v1.1.4
func NewTopCommand ¶
func NewTopJobCommand ¶
func NewTopNodeCommand ¶
func NewUpdateCommand ¶ added in v1.1.0
func NewVersionCmd ¶
func PrintTemplates ¶ added in v1.1.4
func PrintTemplates(configs []clusterConfig.ClusterConfig)
func SortMapKeys ¶
func SortMapKeys(podMetric PodGpuMetric) []string
Types ¶
type BasicJobInfo ¶ added in v0.3.0
type BasicJobInfo struct {
// contains filtered or unexported fields
}
func (*BasicJobInfo) CreatedByCLI ¶ added in v1.1.0
func (*BasicJobInfo) CreatedByCLI() bool
func (*BasicJobInfo) Image ¶ added in v1.1.0
func (j *BasicJobInfo) Image() string
func (*BasicJobInfo) Interactive ¶ added in v1.1.0
func (j *BasicJobInfo) Interactive() string
func (*BasicJobInfo) Project ¶ added in v1.1.0
func (j *BasicJobInfo) Project() string
func (*BasicJobInfo) Resources ¶ added in v0.3.0
func (j *BasicJobInfo) Resources() []Resource
func (*BasicJobInfo) ServiceURLs ¶ added in v1.1.0
func (*BasicJobInfo) ServiceURLs() []string
func (*BasicJobInfo) User ¶ added in v1.1.0
func (j *BasicJobInfo) User() string
type GithubResponse ¶ added in v1.1.0
type GpuMetricInfo ¶
type GpuMetricInfo struct { MetricName string Value string Time float64 PodName string PodNamespace string ContainerName string NodeName string GPUUID string Id string }
func QueryMetricByPrometheus ¶
func QueryMetricByPrometheus(client *kubernetes.Clientset, prometheusServiceName string, namespace string, query string) ([]GpuMetricInfo, error)
type JobGpuMetric ¶
type JobGpuMetric map[string]PodGpuMetric
func GetJobGpuMetric ¶
func GetJobGpuMetric(client *kubernetes.Clientset, job TrainingJob) (jobMetric JobGpuMetric, err error)
func GetPodsGpuInfo ¶
func GetPodsGpuInfo(client *kubernetes.Clientset, prometheusServiceName string, namespace string, podNames []string) (JobGpuMetric, error)
func (JobGpuMetric) GetPodMetrics ¶
func (m JobGpuMetric) GetPodMetrics(podName string) PodGpuMetric
func (*JobGpuMetric) SetPodMetric ¶
func (m *JobGpuMetric) SetPodMetric(metric GpuMetricInfo)
type JobInfo ¶
type JobInfo struct { *BasicJobInfo // contains filtered or unexported fields }
func (*JobInfo) AllocatedGPU ¶
Allocated GPU count of the Job
func (*JobInfo) HostIPOfChief ¶
Get the hostIP of the chief Pod
func (*JobInfo) RequestedGPU ¶
Requested GPU count of the Job
type NodeDescriber ¶
type NodeDescriber struct {
// contains filtered or unexported fields
}
type PodGpuMetric ¶
type PrometheusMetric ¶
type PrometheusMetric struct { Status string `json:"status,inline"` Data PrometheusMetricData `json:"data,omitempty"` }
type PrometheusMetricData ¶
type PrometheusMetricData struct { Result []PrometheusMetricResult `json:"result"` ResultType string `json:"resultType"` }
type PrometheusMetricResult ¶
type PrometheusMetricResult struct { Metric map[string]string `json:"metric"` Value []PrometheusMetricValue `json:"value"` }
type PrometheusMetricValue ¶
type PrometheusMetricValue interface{}
type Resource ¶ added in v0.3.0
type Resource struct { Name string Uid string ResourceType ResourceType }
type ResourceType ¶ added in v0.3.0
type ResourceType string
type RunaiJob ¶ added in v1.0.0
type RunaiJob struct { *BasicJobInfo // contains filtered or unexported fields }
func NewRunaiJob ¶ added in v1.1.0
func (*RunaiJob) AllocatedGPU ¶ added in v1.0.0
Allocated GPU count of the Job
func (*RunaiJob) CreatedByCLI ¶ added in v1.1.0
func (*RunaiJob) GetJobDashboards ¶ added in v1.0.0
func (rj *RunaiJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)
Get Dashboard
func (*RunaiJob) GetPriorityClass ¶ added in v1.0.0
The priority class name of the training job
func (*RunaiJob) HostIPOfChief ¶ added in v1.0.0
the host ip of the chief pod
func (*RunaiJob) Interactive ¶ added in v1.1.0
func (*RunaiJob) RequestedGPU ¶ added in v1.0.0
Requested GPU count of the Job
func (*RunaiJob) ServiceURLs ¶ added in v1.1.0
type RunaiJobInfo ¶ added in v1.1.0
type RunaiJobInfo struct {
// contains filtered or unexported fields
}
type RunaiOwnerInfo ¶ added in v1.1.5
type RunaiTrainer ¶ added in v1.0.0
type RunaiTrainer struct {
// contains filtered or unexported fields
}
func (*RunaiTrainer) GetTrainingJob ¶ added in v1.0.0
func (rt *RunaiTrainer) GetTrainingJob(name, namespace string) (TrainingJob, error)
func (*RunaiTrainer) IsSupported ¶ added in v1.0.0
func (rt *RunaiTrainer) IsSupported(name, ns string) bool
func (*RunaiTrainer) ListTrainingJobs ¶ added in v1.0.0
func (rt *RunaiTrainer) ListTrainingJobs(namespace string) ([]TrainingJob, error)
func (*RunaiTrainer) Type ¶ added in v1.0.0
func (rt *RunaiTrainer) Type() string
type SortPodConditionByLastTransitionTime ¶ added in v0.2.0
type SortPodConditionByLastTransitionTime []v1.PodCondition
Sort the pod condition by time.
func (SortPodConditionByLastTransitionTime) Len ¶ added in v0.2.0
func (s SortPodConditionByLastTransitionTime) Len() int
func (SortPodConditionByLastTransitionTime) Less ¶ added in v0.2.0
func (s SortPodConditionByLastTransitionTime) Less(i, j int) bool
func (SortPodConditionByLastTransitionTime) Swap ¶ added in v0.2.0
func (s SortPodConditionByLastTransitionTime) Swap(i, j int)
type Trainer ¶
type Trainer interface { // Check if the training job is supported IsSupported(name, ns string) bool // Get TrainingJob object directly. this method is called when `arena get` GetTrainingJob(name, namespace string) (TrainingJob, error) // Get the type of trainer Type() string ListTrainingJobs(namespace string) ([]TrainingJob, error) }
func NewRunaiTrainer ¶ added in v1.0.0
func NewRunaiTrainer(client kubernetes.Interface) Trainer
func NewTrainers ¶
func NewTrainers(client *kubernetes.Clientset) []Trainer
NewTrainers constructs the trainer list.
type TrainingJob ¶
type TrainingJob interface { // Get the chief Pod of the Job. ChiefPod() *v1.Pod // Get the name of the Training Job Name() string // Get the namespace of the Training Job Namespace() string // Get all the pods of the Training Job AllPods() []v1.Pod // Get all the kubernetes resource of the Training Job Resources() []Resource // Get the Status of the Job: RUNNING, PENDING, GetStatus() string // Return trainer Type, support MPI, standalone, tensorflow Trainer() string // Get the Job Age Age() time.Duration // Get the Job Duration Duration() time.Duration // Get start time StartTime() *metav1.Time // Get Dashboard GetJobDashboards(client *kubernetes.Clientset) ([]string, error) // Requested GPU count of the Job RequestedGPU() int64 // Allocated GPU count of the Job AllocatedGPU() int64 // the host ip of the chief pod HostIPOfChief() string // The priority class name of the training job GetPriorityClass() string Project() string User() string Interactive() string Image() string CreatedByCLI() bool ServiceURLs() []string }
The Training Job can be TensorFlow, MPI and Caffe
Source Files
¶
- common.go
- completion.go
- completion_bash.go
- config.go
- const.go
- dashboard_helper.go
- data.go
- data_info.go
- data_list.go
- delete.go
- exec.go
- get.go
- get_advanced.go
- gpu.go
- gpu_info.go
- job_info.go
- list.go
- logs.go
- logviewer.go
- pod_helper.go
- prune.go
- resource.go
- root.go
- runai_job.go
- serve_delete.go
- serve_get.go
- serve_list.go
- serve_log.go
- submit.go
- submit_runai.go
- sync_code.go
- template.go
- template_get.go
- template_list.go
- tensorboard.go
- top.go
- top_job.go
- top_node.go
- trainer.go
- trainer_interface.go
- trainer_runai.go
- update.go
- version.go