commands

package
v0.2.0-rc.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 1, 2019 License: Apache-2.0, Apache-2.0 Imports: 41 Imported by: 0

Documentation

Index

Constants

View Source
const (
	CHART_PKG_LOC = "CHARTREPO"
	// NVIDIAGPUResourceName is the extended name of the GPU resource since v1.8;
	// this uses the device plugin mechanism
	NVIDIAGPUResourceName = "nvidia.com/gpu"

	DeprecatedNVIDIAGPUResourceName = "alpha.kubernetes.io/nvidia-gpu"
)
View Source
const (
	// CLIName is the name of the CLI
	CLIName = "arena"
)
View Source
const KUBEFLOW_NAMESPACE = "kubeflow"
View Source
const KUBE_SYSTEM_NAMESPACE = "kube-system"
View Source
const POD_METRIC_TMP = `{__name__=~"%s", pod_name=~"%s"}`
View Source
const PROMETHEUS_INSTALL_DOC_URL = "https://github.com/kubeflow/arena/blob/master/docs/userguide/9-top-job-gpu-metric.md"
View Source
const PROMETHEUS_SCHEME = "http"
View Source
const PROMETHEUS_SVC_LABEL = "kubernetes.io/name=Prometheus"

Variables

View Source
var GPU_METRIC_LIST = []string{"nvidia_gpu_duty_cycle", "nvidia_gpu_memory_used_bytes", "nvidia_gpu_memory_total_bytes"}

Functions

func GetJobDashboards

func GetJobDashboards(dashboard string, job *v1.Job, pods []corev1.Pod) []string

func GetJobRealStatus

func GetJobRealStatus(job TrainingJob) string

Get the real job status. When a job has pending pods, the tfJob still shows as Running, but its status should be Pending.

func GetPodEvents

func GetPodEvents(client *kubernetes.Clientset, namespace string, pods []v1.Pod) (map[string][]v1.Event, error)

Get the events of the Job's pods

func GetPrometheusServiceName

func GetPrometheusServiceName(client *kubernetes.Clientset) (name string, ns string)

Get the Prometheus service name and namespace, looking across different namespaces

func GpuMonitoringInstalled

func GpuMonitoringInstalled(client *kubernetes.Clientset) bool

func NewCommand

func NewCommand() *cobra.Command

NewCommand returns a new instance of an Arena command

func NewCompletionCommand

func NewCompletionCommand() *cobra.Command

func NewDataCommand

func NewDataCommand() *cobra.Command

Manage data volumes

func NewDataListCommand

func NewDataListCommand() *cobra.Command

List Data Command

func NewDeleteCommand

func NewDeleteCommand() *cobra.Command

NewDeleteCommand

func NewGetCommand

func NewGetCommand() *cobra.Command

NewGetCommand

func NewListCommand

func NewListCommand() *cobra.Command

func NewLogViewerCommand

func NewLogViewerCommand() *cobra.Command

func NewLogsCommand

func NewLogsCommand() *cobra.Command

func NewPruneCommand

func NewPruneCommand() *cobra.Command

func NewServeCommand

func NewServeCommand() *cobra.Command

func NewServingDeleteCommand

func NewServingDeleteCommand() *cobra.Command

NewServingDeleteCommand

func NewServingListCommand

func NewServingListCommand() *cobra.Command

func NewServingTensorFlowCommand

func NewServingTensorFlowCommand() *cobra.Command

func NewSubmitCommand

func NewSubmitCommand() *cobra.Command

func NewSubmitHorovodJobCommand

func NewSubmitHorovodJobCommand() *cobra.Command

NewSubmitHorovodJobCommand

func NewSubmitMPIJobCommand

func NewSubmitMPIJobCommand() *cobra.Command

func NewSubmitStandaloneJobCommand

func NewSubmitStandaloneJobCommand() *cobra.Command

func NewSubmitTFJobCommand

func NewSubmitTFJobCommand() *cobra.Command

func NewTopCommand

func NewTopCommand() *cobra.Command

func NewTopJobCommand

func NewTopJobCommand() *cobra.Command

func NewTopNodeCommand

func NewTopNodeCommand() *cobra.Command

func NewTrafficRouterSplitCommand

func NewTrafficRouterSplitCommand() *cobra.Command

func NewVersionCmd

func NewVersionCmd(cliName string) *cobra.Command

func ParseMountPath

func ParseMountPath(dataset []string) (err error)

func PrintLine

func PrintLine(w io.Writer, fields ...string)

func SortMapKeys

func SortMapKeys(podMetric PodGpuMetric) []string

Types

type Destination

type Destination struct {
	*istiov1alpha3.Destination
	Port *PortSelector `protobuf:"bytes,3,opt,name=port" json:"port,omitempty"`
}

type DestinationRuleCRD

type DestinationRuleCRD struct {
	// Kind is a string value representing the REST resource this object represents.
	// Servers may infer this from the endpoint the client submits requests to.
	// Cannot be updated.
	// In CamelCase.
	// More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds
	// +optional
	Kind string `json:"kind,omitempty" protobuf:"bytes,1,opt,name=kind"`

	// APIVersion defines the versioned schema of this representation of an object.
	// Servers should convert recognized schemas to the latest internal value, and
	// may reject unrecognized values.
	// More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources
	// +optional
	APIVersion        string `json:"apiVersion,omitempty" protobuf:"bytes,2,opt,name=apiVersion"`
	metav1.ObjectMeta `json:"metadata,omitempty" yaml:"metadata,omitempty" protobuf:"bytes,1,opt,name=metadata"`
	Spec              istiov1alpha3.DestinationRule `json:"spec,omitempty" yaml:"spec,omitempty" protobuf:"bytes,2,opt,name=spec"`
}

type DestinationWeight

type DestinationWeight struct {
	Destination *Destination `protobuf:"bytes,1,opt,name=destination" json:"destination,omitempty"`
	Weight      int32        `protobuf:"varint,2,opt,name=weight,proto3" json:"weight"`
}

type GpuMetric

type GpuMetric struct {
	GpuDutyCycle   float64
	GpuMemoryUsed  float64
	GpuMemoryTotal float64
}

type GpuMetricInfo

type GpuMetricInfo struct {
	MetricName    string
	Value         string
	Time          float64
	PodName       string
	PodNamespace  string
	ContainerName string
	NodeName      string
	GPUUID        string
	Id            string
}

func QueryMetricByPrometheus

func QueryMetricByPrometheus(client *kubernetes.Clientset, prometheusServiceName string, namespace string, query string) ([]GpuMetricInfo, error)

type HTTPMatchRequest

type HTTPMatchRequest struct {
	*istiov1alpha3.HTTPMatchRequest
	Uri *StringMatchPrefix `protobuf:"bytes,1,opt,name=uri" json:"uri,omitempty"`
}

type HTTPRoute

type HTTPRoute struct {
	*istiov1alpha3.HTTPRoute
	Match []*HTTPMatchRequest  `protobuf:"bytes,1,rep,name=match" json:"match,omitempty"`
	Route []*DestinationWeight `protobuf:"bytes,2,rep,name=route" json:"route,omitempty"`
}

type HorovodJob

type HorovodJob struct {
	*JobInfo
}

Horovod Job Information

func (*HorovodJob) AllPods

func (hj *HorovodJob) AllPods() []v1.Pod

Get all the pods of the Training Job

func (*HorovodJob) ChiefPod

func (hj *HorovodJob) ChiefPod() v1.Pod

Get the chief Pod of the Job.

func (*HorovodJob) GetJobDashboards

func (hj *HorovodJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)

Get Dashboard url of the job

func (*HorovodJob) HostIPOfChief

func (hj *HorovodJob) HostIPOfChief() (hostIP string)

Get the hostIP of the chief Pod

type HorovodJobTrainer

type HorovodJobTrainer struct {
	// contains filtered or unexported fields
}

Horovod Job trainer

func (*HorovodJobTrainer) GetTrainingJob

func (m *HorovodJobTrainer) GetTrainingJob(name, namespace string) (tj TrainingJob, err error)

func (*HorovodJobTrainer) IsSupported

func (m *HorovodJobTrainer) IsSupported(name, ns string) bool

check if it's a Horovod job

func (*HorovodJobTrainer) ListTrainingJobs added in v0.2.0

func (hj *HorovodJobTrainer) ListTrainingJobs() (jobs []TrainingJob, err error)

List training jobs

func (*HorovodJobTrainer) Type

func (m *HorovodJobTrainer) Type() string

type JobGpuMetric

type JobGpuMetric map[string]PodGpuMetric

func GetJobGpuMetric

func GetJobGpuMetric(client *kubernetes.Clientset, job TrainingJob) (jobMetric JobGpuMetric, err error)

func GetPodsGpuInfo

func GetPodsGpuInfo(client *kubernetes.Clientset, prometheusServiceName string, namespace string, podNames []string) (JobGpuMetric, error)

func (JobGpuMetric) GetPodMetrics

func (m JobGpuMetric) GetPodMetrics(podName string) PodGpuMetric

func (*JobGpuMetric) SetPodMetric

func (m *JobGpuMetric) SetPodMetric(metric GpuMetricInfo)

type JobInfo

type JobInfo struct {
	// contains filtered or unexported fields
}

func (*JobInfo) Age

func (ji *JobInfo) Age() time.Duration

func (*JobInfo) AllPods

func (ji *JobInfo) AllPods() []v1.Pod

Get all the pods of the Training Job

func (*JobInfo) AllocatedGPU

func (ji *JobInfo) AllocatedGPU() int64

Allocated GPU count of the Job

func (*JobInfo) ChiefPod

func (ji *JobInfo) ChiefPod() v1.Pod

Get the chief Pod of the Job.

func (*JobInfo) GetStatus

func (ji *JobInfo) GetStatus() (status string)

Get the Status of the Job: RUNNING, PENDING, SUCCEEDED, FAILED

func (*JobInfo) HostIPOfChief

func (ji *JobInfo) HostIPOfChief() (hostIP string)

Get the hostIP of the chief Pod

func (*JobInfo) Name

func (ji *JobInfo) Name() string

func (*JobInfo) RequestedGPU

func (ji *JobInfo) RequestedGPU() int64

Requested GPU count of the Job

func (*JobInfo) StartTime

func (ji *JobInfo) StartTime() *metav1.Time

func (*JobInfo) Trainer

func (ji *JobInfo) Trainer() string

type MPIJob

type MPIJob struct {
	// contains filtered or unexported fields
}

MPI Job Information

func (*MPIJob) Age

func (mj *MPIJob) Age() time.Duration

Get the Job Age

func (*MPIJob) AllPods

func (mj *MPIJob) AllPods() []v1.Pod

Get all the pods of the Training Job

func (*MPIJob) AllocatedGPU

func (mj *MPIJob) AllocatedGPU() int64

Allocated GPU count of the Job

func (*MPIJob) ChiefPod

func (mj *MPIJob) ChiefPod() v1.Pod

Get the chief Pod of the Job.

func (*MPIJob) GetJobDashboards

func (mj *MPIJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)

Get Dashboard url of the job

func (*MPIJob) GetStatus

func (mj *MPIJob) GetStatus() (status string)

Get the Status of the Job: RUNNING, PENDING, SUCCEEDED, FAILED

func (*MPIJob) HostIPOfChief

func (mj *MPIJob) HostIPOfChief() (hostIP string)

Get the hostIP of the chief Pod

func (*MPIJob) Name

func (mj *MPIJob) Name() string

func (*MPIJob) RequestedGPU

func (mj *MPIJob) RequestedGPU() int64

Requested GPU count of the Job

func (*MPIJob) StartTime

func (mj *MPIJob) StartTime() *metav1.Time

Get the start time

func (*MPIJob) Trainer

func (mj *MPIJob) Trainer() string

type MPIJobTrainer

type MPIJobTrainer struct {
	// contains filtered or unexported fields
}

MPI Job trainer

func (*MPIJobTrainer) GetTrainingJob

func (tt *MPIJobTrainer) GetTrainingJob(name, namespace string) (tj TrainingJob, err error)

Get the training job from cache or directly

func (*MPIJobTrainer) IsSupported

func (tt *MPIJobTrainer) IsSupported(name, ns string) bool

check if it's an MPI job

func (*MPIJobTrainer) ListTrainingJobs added in v0.2.0

func (tt *MPIJobTrainer) ListTrainingJobs() (jobs []TrainingJob, err error)

List training jobs

func (*MPIJobTrainer) Type

func (tt *MPIJobTrainer) Type() string

Get the type

type NodeDescriber

type NodeDescriber struct {
	// contains filtered or unexported fields
}

type NodeInfo

type NodeInfo struct {
	// contains filtered or unexported fields
}

type PodGpuMetric

type PodGpuMetric map[string]*GpuMetric

type PortSelector

type PortSelector struct {
	*istiov1alpha3.PortSelector
	Number uint32 `protobuf:"varint,1,opt,name=number,proto3,oneof" json:"number,omitempty"`
}

type PreprocesObject

type PreprocesObject struct {
	ServiceName     string
	Namespace       string
	DestinationRule DestinationRuleCRD
	VirtualService  VirtualServiceCRD
}

type PrintArgs

type PrintArgs struct {
	ShowEvents bool
	Output     string
}

type PrometheusMetric

type PrometheusMetric struct {
	Status string               `json:"status,inline"`
	Data   PrometheusMetricData `json:"data,omitempty"`
}

type PrometheusMetricData

type PrometheusMetricData struct {
	Result     []PrometheusMetricResult `json:"result"`
	ResultType string                   `json:"resultType"`
}

type PrometheusMetricResult

type PrometheusMetricResult struct {
	Metric map[string]string       `json:"metric"`
	Value  []PrometheusMetricValue `json:"value"`
}

type PrometheusMetricValue

type PrometheusMetricValue interface{}

type PruneArgs

type PruneArgs struct {
	// contains filtered or unexported fields
}

type ServeArgs

type ServeArgs struct {
	Image           string            `yaml:"image"`           // --image
	ImagePullPolicy string            `yaml:"imagePullPolicy"` // --imagePullPolicy
	GPUCount        int               `yaml:"gpuCount"`        // --gpus
	Cpu             string            `yaml:"cpu"`             // --cpu
	Memory          string            `yaml:"memory"`          // --memory
	Envs            map[string]string `yaml:"envs"`            // --envs
	Command         string            `yaml:"command"`         // --command
	Replicas        int               `yaml:"replicas"`        // --replicas
	Port            int               `yaml:"port"`            // --port
	RestfulPort     int               `yaml:"rest_api_port"`   // --restfulPort
	ModelName       string            `yaml:"modelName"`       // --modelName
	ModelPath       string            `yaml:"modelPath"`       // --modelPath
	EnableIstio     bool              `yaml:"enableIstio"`     // --enableIstio
	ExposeService   bool              `yaml:"exposeService"`   // --exposeService
	ServingName     string            `yaml:"servingName"`     // --servingName
	ServingVersion  string            `yaml:"servingVersion"`  // --servingVersion
	ModelDirs       map[string]string `yaml:"modelDirs"`
}

type ServeTensorFlowArgs

type ServeTensorFlowArgs struct {
	VersionPolicy          string `yaml:"versionPolicy"`   // --versionPolicy
	ModelConfigFile        string `yaml:"modelConfigFile"` // --modelConfigFile
	ModelConfigFileContent string `yaml:"modelConfigFileContent"`

	ServeArgs `yaml:",inline"`

	ModelServiceExists bool `yaml:"modelServiceExists"` // --modelServiceExists
}

type StandaloneJob

type StandaloneJob struct {
	*JobInfo
}

Standalone Job Information

func (*StandaloneJob) GetJobDashboards

func (sj *StandaloneJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)

Get Dashboard url of the job

type StandaloneJobTrainer

type StandaloneJobTrainer struct {
	// contains filtered or unexported fields
}

Standalone Job trainer

func (*StandaloneJobTrainer) GetTrainingJob

func (s *StandaloneJobTrainer) GetTrainingJob(name, namespace string) (tj TrainingJob, err error)

func (*StandaloneJobTrainer) IsSupported

func (s *StandaloneJobTrainer) IsSupported(name, ns string) bool

check if it's a Standalone job

func (*StandaloneJobTrainer) ListTrainingJobs added in v0.2.0

func (s *StandaloneJobTrainer) ListTrainingJobs() (jobs []TrainingJob, err error)

List training jobs

func (*StandaloneJobTrainer) Type

func (s *StandaloneJobTrainer) Type() string

type StringMatchPrefix

type StringMatchPrefix struct {
	Prefix string `protobuf:"bytes,2,opt,name=prefix,proto3,oneof" json:"prefix,omitempty"`
}

type TensorFlowJob

type TensorFlowJob struct {
	// contains filtered or unexported fields
}

TensorFlow Job Information

func (*TensorFlowJob) Age

func (tj *TensorFlowJob) Age() time.Duration

Get the Job Age

func (*TensorFlowJob) AllPods

func (tj *TensorFlowJob) AllPods() []v1.Pod

Get all the pods of the Training Job

func (*TensorFlowJob) AllocatedGPU

func (tj *TensorFlowJob) AllocatedGPU() int64

Allocated GPU count of the Job

func (*TensorFlowJob) ChiefPod

func (tj *TensorFlowJob) ChiefPod() v1.Pod

Get the chief Pod of the Job.

func (*TensorFlowJob) GetJobDashboards

func (tj *TensorFlowJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)

Get Dashboard url of the job

func (*TensorFlowJob) GetStatus

func (tj *TensorFlowJob) GetStatus() (status string)

Get the Status of the Job: RUNNING, PENDING, SUCCEEDED, FAILED

func (*TensorFlowJob) HostIPOfChief

func (tj *TensorFlowJob) HostIPOfChief() (hostIP string)

Get the hostIP of the chief Pod

func (*TensorFlowJob) Name

func (tj *TensorFlowJob) Name() string

func (*TensorFlowJob) RequestedGPU

func (tj *TensorFlowJob) RequestedGPU() int64

Requested GPU count of the Job

func (*TensorFlowJob) StartTime

func (tj *TensorFlowJob) StartTime() *metav1.Time

func (*TensorFlowJob) Trainer

func (tj *TensorFlowJob) Trainer() string

type TensorFlowJobTrainer

type TensorFlowJobTrainer struct {
	// contains filtered or unexported fields
}

TensorFlow Job trainer

func (*TensorFlowJobTrainer) GetTrainingJob

func (tt *TensorFlowJobTrainer) GetTrainingJob(name, namespace string) (tj TrainingJob, err error)

func (*TensorFlowJobTrainer) IsSupported

func (tt *TensorFlowJobTrainer) IsSupported(name, ns string) bool

check if it's a TensorFlow job

func (*TensorFlowJobTrainer) ListTrainingJobs added in v0.2.0

func (tt *TensorFlowJobTrainer) ListTrainingJobs() (jobs []TrainingJob, err error)

List training jobs

func (*TensorFlowJobTrainer) Type

func (tt *TensorFlowJobTrainer) Type() string

type Trainer

type Trainer interface {
	// Check if the training job is supported
	IsSupported(name, ns string) bool

	// Get TrainingJob object directly. this method is called when `arena get`
	GetTrainingJob(name, namespace string) (TrainingJob, error)

	// Get the type of trainer
	Type() string

	ListTrainingJobs() ([]TrainingJob, error)
}

func NewHorovodJobTrainer

func NewHorovodJobTrainer(client *kubernetes.Clientset) Trainer

Create HorovodJob Trainer

func NewMPIJobTrainer

func NewMPIJobTrainer(client *kubernetes.Clientset) Trainer

NewMPIJobTrainer

func NewStandaloneJobTrainer

func NewStandaloneJobTrainer(client *kubernetes.Clientset) Trainer

func NewTensorFlowJobTrainer

func NewTensorFlowJobTrainer(client *kubernetes.Clientset) Trainer

func NewTrainers

func NewTrainers(client *kubernetes.Clientset) []Trainer

construct the trainer list

type TrainingJob

type TrainingJob interface {
	// Get the chief Pod of the Job.
	ChiefPod() v1.Pod

	// Get the name of the Training Job
	Name() string

	// Get all the pods of the Training Job
	AllPods() []v1.Pod

	// Get the Status of the Job: RUNNING, PENDING, SUCCEEDED, FAILED
	GetStatus() string

	// Return trainer Type, support MPI, standalone, tensorflow
	Trainer() string

	// Get the Job Age
	Age() time.Duration

	// Get start time
	StartTime() *metav1.Time

	// Get Dashboard
	GetJobDashboards(client *kubernetes.Clientset) ([]string, error)

	// Requested GPU count of the Job
	RequestedGPU() int64

	// Allocated GPU count of the Job
	AllocatedGPU() int64

	// the host ip of the chief pod
	HostIPOfChief() string
}

The Training Job can be TensorFlow, MPI and Caffe

type VirtualService

type VirtualService struct {
	*istiov1alpha3.VirtualService
	Http []*HTTPRoute `protobuf:"bytes,3,rep,name=http" json:"http,omitempty"`
}

type VirtualServiceCRD

type VirtualServiceCRD struct {
	// Kind is a string value representing the REST resource this object represents.
	// Servers may infer this from the endpoint the client submits requests to.
	// Cannot be updated.
	// In CamelCase.
	// More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds
	// +optional
	Kind string `json:"kind,omitempty" protobuf:"bytes,1,opt,name=kind"`

	// APIVersion defines the versioned schema of this representation of an object.
	// Servers should convert recognized schemas to the latest internal value, and
	// may reject unrecognized values.
	// More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources
	// +optional
	APIVersion        string `json:"apiVersion,omitempty" protobuf:"bytes,2,opt,name=apiVersion"`
	metav1.ObjectMeta `json:"metadata,omitempty" yaml:"metadata,omitempty" protobuf:"bytes,1,opt,name=metadata"`
	Spec              VirtualService `json:"spec,omitempty" yaml:"spec,omitempty" protobuf:"bytes,2,opt,name=spec"`
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL