commands

package
v0.0.0-...-41c11b6 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: May 8, 2019 License: Apache-2.0, Apache-2.0 Imports: 47 Imported by: 0

Documentation

Index

Constants

View Source
const (
	RecommendedConfigPathEnvVar = "ARENA_CONFIG"
	DefaultArenaConfigPath      = "~/.arena/config"
)
View Source
const (
	CHART_PKG_LOC = "CHARTREPO"
	// GPUResourceName is the extended name of the GPU resource since v1.8
	// this uses the device plugin mechanism
	NVIDIAGPUResourceName = "nvidia.com/gpu"

	DeprecatedNVIDIAGPUResourceName = "alpha.kubernetes.io/nvidia-gpu"
)
View Source
const (
	// CLIName is the name of the CLI
	CLIName = "arena"
)
View Source
const KUBEFLOW_NAMESPACE = "kubeflow"
View Source
const KUBE_SYSTEM_NAMESPACE = "kube-system"
View Source
const POD_METRIC_TMP = `{__name__=~"%s", pod_name=~"%s"}`
View Source
const PROMETHEUS_INSTALL_DOC_URL = "https://github.com/kubeflow/arena/blob/master/docs/userguide/9-top-job-gpu-metric.md"
View Source
const PROMETHEUS_SCHEME = "http"
View Source
const PROMETHEUS_SVC_LABEL = "kubernetes.io/name=Prometheus"

Variables

View Source
var GPU_METRIC_LIST = []string{"nvidia_gpu_duty_cycle", "nvidia_gpu_memory_used_bytes", "nvidia_gpu_memory_total_bytes"}

Functions

func BuildJobInfo

func BuildJobInfo(job TrainingJob) *types.JobInfo

BuildJobInfo returns the types.JobInfo built from the given training job.

func GetJobDashboards

func GetJobDashboards(dashboard string, job *v1.Job, pods []corev1.Pod) []string

func GetJobRealStatus

func GetJobRealStatus(job TrainingJob) string

GetJobRealStatus gets the real status of the job. When the job has pods that are still pending, the tfJob still shows the Running state, but it should be Pending.

func GetNamespace

func GetNamespace() string

func GetPodEvents

func GetPodEvents(client *kubernetes.Clientset, namespace string, pods []v1.Pod) (map[string][]v1.Event, error)

Get Event of the Job

func GetPrometheusServiceName

func GetPrometheusServiceName(client *kubernetes.Clientset) (name string, ns string)

GetPrometheusServiceName gets the Prometheus service from different namespaces and returns its name and namespace.

func GpuMonitoringInstalled

func GpuMonitoringInstalled(client *kubernetes.Clientset) bool

func ListServing

func ListServing(client *kubernetes.Clientset) ([]types.Serving, error)

func ListServingJobsByHelm

func ListServingJobsByHelm() ([]types.Serving, error)

func NewCommand

func NewCommand() *cobra.Command

NewCommand returns a new instance of an Arena command

func NewCompletionCommand

func NewCompletionCommand() *cobra.Command

func NewDataCommand

func NewDataCommand() *cobra.Command

manage data volume

func NewDataListCommand

func NewDataListCommand() *cobra.Command

List Data Command

func NewDeleteCommand

func NewDeleteCommand() *cobra.Command

NewDeleteCommand

func NewGetCommand

func NewGetCommand() *cobra.Command

NewGetCommand

func NewListCommand

func NewListCommand() *cobra.Command

func NewLogViewerCommand

func NewLogViewerCommand() *cobra.Command

func NewLogsCommand

func NewLogsCommand() *cobra.Command

func NewPruneCommand

func NewPruneCommand() *cobra.Command

func NewServeCommand

func NewServeCommand() *cobra.Command

func NewServingDeleteCommand

func NewServingDeleteCommand() *cobra.Command

NewServingDeleteCommand

func NewServingListCommand

func NewServingListCommand() *cobra.Command

func NewServingTensorFlowCommand

func NewServingTensorFlowCommand() *cobra.Command

func NewServingTensorRTCommand

func NewServingTensorRTCommand() *cobra.Command

func NewSparkApplicationCommand

func NewSparkApplicationCommand() *cobra.Command

See https://github.com/GoogleCloudPlatform/spark-on-k8s-operator

sparkApplication is supported by default.
scheduledSparkApplication is not supported.

func NewSubmitCommand

func NewSubmitCommand() *cobra.Command

func NewSubmitHorovodJobCommand

func NewSubmitHorovodJobCommand() *cobra.Command

NewSubmitHorovodJobCommand

func NewSubmitMPIJobCommand

func NewSubmitMPIJobCommand() *cobra.Command

func NewSubmitSparkJobArgs

func NewSubmitSparkJobArgs() *submitSparkJobArgs

func NewSubmitStandaloneJobCommand

func NewSubmitStandaloneJobCommand() *cobra.Command

func NewSubmitTFJobCommand

func NewSubmitTFJobCommand() *cobra.Command

func NewTopCommand

func NewTopCommand() *cobra.Command

func NewTopJobCommand

func NewTopJobCommand() *cobra.Command

func NewTopNodeCommand

func NewTopNodeCommand() *cobra.Command

func NewTrafficRouterSplitCommand

func NewTrafficRouterSplitCommand() *cobra.Command

func NewVersionCmd

func NewVersionCmd(cliName string) *cobra.Command

func ParseMountPath

func ParseMountPath(dataset []string) (err error)

func PrintLine

func PrintLine(w io.Writer, fields ...string)

func SortMapKeys

func SortMapKeys(podMetric PodGpuMetric) []string

Types

type Destination

type Destination struct {
	*istiov1alpha3.Destination
	Port *PortSelector `protobuf:"bytes,3,opt,name=port" json:"port,omitempty"`
}

type DestinationRuleCRD

type DestinationRuleCRD struct {
	// Kind is a string value representing the REST resource this object represents.
	// Servers may infer this from the endpoint the client submits requests to.
	// Cannot be updated.
	// In CamelCase.
	// More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds
	// +optional
	Kind string `json:"kind,omitempty" protobuf:"bytes,1,opt,name=kind"`

	// APIVersion defines the versioned schema of this representation of an object.
	// Servers should convert recognized schemas to the latest internal value, and
	// may reject unrecognized values.
	// More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources
	// +optional
	APIVersion        string `json:"apiVersion,omitempty" protobuf:"bytes,2,opt,name=apiVersion"`
	metav1.ObjectMeta `json:"metadata,omitempty" yaml:"metadata,omitempty" protobuf:"bytes,1,opt,name=metadata"`
	Spec              istiov1alpha3.DestinationRule `json:"spec,omitempty" yaml:"spec,omitempty" protobuf:"bytes,2,opt,name=spec"`
}

type DestinationWeight

type DestinationWeight struct {
	Destination *Destination `protobuf:"bytes,1,opt,name=destination" json:"destination,omitempty"`
	Weight      int32        `protobuf:"varint,2,opt,name=weight,proto3" json:"weight"`
}

type Driver

type Driver struct {
	CPURequest    int    `yaml:"CPURequest"`
	MemoryRequest string `yaml:"MemoryRequest"`
}

type Executor

type Executor struct {
	Replicas      int    `yaml:"Replicas"`
	CPURequest    int    `yaml:"CPURequest"`
	MemoryRequest string `yaml:"MemoryRequest"`
}

type GpuMetric

type GpuMetric struct {
	GpuDutyCycle   float64
	GpuMemoryUsed  float64
	GpuMemoryTotal float64
}

type GpuMetricInfo

type GpuMetricInfo struct {
	MetricName    string
	Value         string
	Time          float64
	PodName       string
	PodNamespace  string
	ContainerName string
	NodeName      string
	GPUUID        string
	Id            string
}

func QueryMetricByPrometheus

func QueryMetricByPrometheus(client *kubernetes.Clientset, prometheusServiceName string, namespace string, query string) ([]GpuMetricInfo, error)

type HTTPMatchRequest

type HTTPMatchRequest struct {
	*istiov1alpha3.HTTPMatchRequest
	Uri *StringMatchPrefix `protobuf:"bytes,1,opt,name=uri" json:"uri,omitempty"`
}

type HTTPRoute

type HTTPRoute struct {
	*istiov1alpha3.HTTPRoute
	Match []*HTTPMatchRequest  `protobuf:"bytes,1,rep,name=match" json:"match,omitempty"`
	Route []*DestinationWeight `protobuf:"bytes,2,rep,name=route" json:"route,omitempty"`
}

type HorovodJob

type HorovodJob struct {
	*JobInfo
}

Horovod Job Information

func (*HorovodJob) AllPods

func (hj *HorovodJob) AllPods() []v1.Pod

Get all the pods of the Training Job

func (*HorovodJob) ChiefPod

func (hj *HorovodJob) ChiefPod() v1.Pod

Get the chief Pod of the Job.

func (*HorovodJob) GetJobDashboards

func (hj *HorovodJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)

Get Dashboard url of the job

func (*HorovodJob) HostIPOfChief

func (hj *HorovodJob) HostIPOfChief() (hostIP string)

Get the hostIP of the chief Pod

type HorovodJobTrainer

type HorovodJobTrainer struct {
	// contains filtered or unexported fields
}

Horovod Job trainer

func (*HorovodJobTrainer) GetTrainingJob

func (m *HorovodJobTrainer) GetTrainingJob(name, namespace string) (tj TrainingJob, err error)

func (*HorovodJobTrainer) IsSupported

func (m *HorovodJobTrainer) IsSupported(name, ns string) bool

Check if it's a Horovod job

func (*HorovodJobTrainer) ListTrainingJobs

func (hj *HorovodJobTrainer) ListTrainingJobs() (jobs []TrainingJob, err error)

List training jobs

func (*HorovodJobTrainer) Type

func (m *HorovodJobTrainer) Type() string

type JobGpuMetric

type JobGpuMetric map[string]PodGpuMetric

func GetJobGpuMetric

func GetJobGpuMetric(client *kubernetes.Clientset, job TrainingJob) (jobMetric JobGpuMetric, err error)

func GetPodsGpuInfo

func GetPodsGpuInfo(client *kubernetes.Clientset, prometheusServiceName string, namespace string, podNames []string) (JobGpuMetric, error)

func (JobGpuMetric) GetPodMetrics

func (m JobGpuMetric) GetPodMetrics(podName string) PodGpuMetric

func (*JobGpuMetric) SetPodMetric

func (m *JobGpuMetric) SetPodMetric(metric GpuMetricInfo)

type JobInfo

type JobInfo struct {
	// contains filtered or unexported fields
}

func (*JobInfo) Age

func (ji *JobInfo) Age() time.Duration

func (*JobInfo) AllPods

func (ji *JobInfo) AllPods() []v1.Pod

Get all the pods of the Training Job

func (*JobInfo) AllocatedGPU

func (ji *JobInfo) AllocatedGPU() int64

Allocated GPU count of the Job

func (*JobInfo) ChiefPod

func (ji *JobInfo) ChiefPod() v1.Pod

Get the chief Pod of the Job.

func (*JobInfo) Duration

func (ji *JobInfo) Duration() time.Duration

Get the Job Training Duration

func (*JobInfo) GetStatus

func (ji *JobInfo) GetStatus() (status string)

Get the Status of the Job: RUNNING, PENDING, SUCCEEDED, FAILED

func (*JobInfo) HostIPOfChief

func (ji *JobInfo) HostIPOfChief() (hostIP string)

Get the hostIP of the chief Pod

func (*JobInfo) Name

func (ji *JobInfo) Name() string

func (*JobInfo) Namespace

func (ji *JobInfo) Namespace() string

func (*JobInfo) RequestedGPU

func (ji *JobInfo) RequestedGPU() int64

Requested GPU count of the Job

func (*JobInfo) StartTime

func (ji *JobInfo) StartTime() *metav1.Time

func (*JobInfo) Trainer

func (ji *JobInfo) Trainer() string

type MPIJob

type MPIJob struct {
	// contains filtered or unexported fields
}

MPI Job Information

func (*MPIJob) Age

func (mj *MPIJob) Age() time.Duration

Get the Job Age

func (*MPIJob) AllPods

func (mj *MPIJob) AllPods() []v1.Pod

Get all the pods of the Training Job

func (*MPIJob) AllocatedGPU

func (mj *MPIJob) AllocatedGPU() int64

Allocated GPU count of the Job

func (*MPIJob) ChiefPod

func (mj *MPIJob) ChiefPod() v1.Pod

Get the chief Pod of the Job.

func (*MPIJob) Duration

func (mj *MPIJob) Duration() time.Duration

Get the Job Training Duration

func (*MPIJob) GetJobDashboards

func (mj *MPIJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)

Get Dashboard url of the job

func (*MPIJob) GetStatus

func (mj *MPIJob) GetStatus() (status string)

Get the Status of the Job: RUNNING, PENDING, SUCCEEDED, FAILED

func (*MPIJob) HostIPOfChief

func (mj *MPIJob) HostIPOfChief() (hostIP string)

Get the hostIP of the chief Pod

func (*MPIJob) Name

func (mj *MPIJob) Name() string

func (*MPIJob) Namespace

func (mj *MPIJob) Namespace() string

func (*MPIJob) RequestedGPU

func (mj *MPIJob) RequestedGPU() int64

Requested GPU count of the Job

func (*MPIJob) StartTime

func (mj *MPIJob) StartTime() *metav1.Time

Get the start time

func (*MPIJob) Trainer

func (mj *MPIJob) Trainer() string

type MPIJobTrainer

type MPIJobTrainer struct {
	// contains filtered or unexported fields
}

MPI Job trainer

func (*MPIJobTrainer) GetTrainingJob

func (tt *MPIJobTrainer) GetTrainingJob(name, namespace string) (tj TrainingJob, err error)

Get the training job from cache or directly

func (*MPIJobTrainer) IsSupported

func (tt *MPIJobTrainer) IsSupported(name, ns string) bool

Check if it's an MPI job

func (*MPIJobTrainer) ListTrainingJobs

func (tt *MPIJobTrainer) ListTrainingJobs() (jobs []TrainingJob, err error)

List training jobs

func (*MPIJobTrainer) Type

func (tt *MPIJobTrainer) Type() string

Get the type

type NodeDescriber

type NodeDescriber struct {
	// contains filtered or unexported fields
}

type NodeInfo

type NodeInfo struct {
	// contains filtered or unexported fields
}

type PodGpuMetric

type PodGpuMetric map[string]*GpuMetric

type PortSelector

type PortSelector struct {
	*istiov1alpha3.PortSelector
	Number uint32 `protobuf:"varint,1,opt,name=number,proto3,oneof" json:"number,omitempty"`
}

type PreprocesObject

type PreprocesObject struct {
	ServiceName     string
	Namespace       string
	DestinationRule DestinationRuleCRD
	VirtualService  VirtualServiceCRD
}

type PrintArgs

type PrintArgs struct {
	ShowEvents bool
	Output     string
}

type PrometheusMetric

type PrometheusMetric struct {
	Status string               `json:"status,inline"`
	Data   PrometheusMetricData `json:"data,omitempty"`
}

type PrometheusMetricData

type PrometheusMetricData struct {
	Result     []PrometheusMetricResult `json:"result"`
	ResultType string                   `json:"resultType"`
}

type PrometheusMetricResult

type PrometheusMetricResult struct {
	Metric map[string]string       `json:"metric"`
	Value  []PrometheusMetricValue `json:"value"`
}

type PrometheusMetricValue

type PrometheusMetricValue interface{}

type PruneArgs

type PruneArgs struct {
	// contains filtered or unexported fields
}

type ServeArgs

type ServeArgs struct {
	ImagePullPolicy string            `yaml:"imagePullPolicy"` // --imagePullPolicy
	GPUCount        int               `yaml:"gpuCount"`        // --gpus
	Cpu             string            `yaml:"cpu"`             // --cpu
	Memory          string            `yaml:"memory"`          // --memory
	Envs            map[string]string `yaml:"envs"`            // --envs
	Command         string            `yaml:"command"`         // --command
	Replicas        int               `yaml:"replicas"`        // --replicas
	Port            int               `yaml:"port"`            // --port
	RestfulPort     int               `yaml:"rest_api_port"`   // --restfulPort
	ModelName       string            `yaml:"modelName"`       // --modelName
	ModelPath       string            `yaml:"modelPath"`       // --modelPath
	EnableIstio     bool              `yaml:"enableIstio"`     // --enableIstio
	ExposeService   bool              `yaml:"exposeService"`   // --exposeService
	ServingName     string            `yaml:"servingName"`     // --servingName
	ServingVersion  string            `yaml:"servingVersion"`  // --servingVersion
	ModelDirs       map[string]string `yaml:"modelDirs"`
}

type ServeTensorFlowArgs

type ServeTensorFlowArgs struct {
	VersionPolicy          string `yaml:"versionPolicy"`   // --versionPolicy
	ModelConfigFile        string `yaml:"modelConfigFile"` // --modelConfigFile
	ModelConfigFileContent string `yaml:"modelConfigFileContent"`
	Image                  string `yaml:"image"` // --image

	ServeArgs `yaml:",inline"`

	ModelServiceExists bool `yaml:"modelServiceExists"` // --modelServiceExists
}

type ServeTensorRTArgs

type ServeTensorRTArgs struct {
	Image        string `yaml:"image"`        // --image
	ModelStore   string `yaml:"modelStore"`   // --modelStore
	MetricsPort  int    `yaml:"metricsPort"`  // --metricsPort
	HttpPort     int    `yaml:"httpPort"`     // --httpPort
	GrpcPort     int    `yaml:"grpcPort"`     // --grpcPort
	AllowMetrics bool   `yaml:"allowMetrics"` // --allowMetrics

	ServeArgs `yaml:",inline"`
}

type SortPodConditionByLastTransitionTime

type SortPodConditionByLastTransitionTime []v1.PodCondition

Sort the pod condition by time.

func (SortPodConditionByLastTransitionTime) Len

func (SortPodConditionByLastTransitionTime) Less

func (SortPodConditionByLastTransitionTime) Swap

type SparkJob

type SparkJob struct {
	// contains filtered or unexported fields
}

spark application wrapper

func (*SparkJob) Age

func (sj *SparkJob) Age() time.Duration

func (*SparkJob) AllPods

func (sj *SparkJob) AllPods() []v1.Pod

return pods from cache

func (*SparkJob) AllocatedGPU

func (sj *SparkJob) AllocatedGPU() int64

spark job without gpu supported

func (*SparkJob) ChiefPod

func (sj *SparkJob) ChiefPod() v1.Pod

return driver pod

func (*SparkJob) Duration

func (sj *SparkJob) Duration() time.Duration

Get the Job Training Duration

func (*SparkJob) GetJobDashboards

func (sj *SparkJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)

func (*SparkJob) GetStatus

func (sj *SparkJob) GetStatus() (status string)
Spark job driver states:

	NewState              ApplicationStateType = ""
	SubmittedState        ApplicationStateType = "SUBMITTED"
	RunningState          ApplicationStateType = "RUNNING"
	CompletedState        ApplicationStateType = "COMPLETED"
	FailedState           ApplicationStateType = "FAILED"
	FailedSubmissionState ApplicationStateType = "SUBMISSION_FAILED"
	PendingRerunState     ApplicationStateType = "PENDING_RERUN"
	InvalidatingState     ApplicationStateType = "INVALIDATING"
	SucceedingState       ApplicationStateType = "SUCCEEDING"
	FailingState          ApplicationStateType = "FAILING"
	UnknownState          ApplicationStateType = "UNKNOWN"

Spark job executor states:

	ExecutorPendingState   ExecutorState = "PENDING"
	ExecutorRunningState   ExecutorState = "RUNNING"
	ExecutorCompletedState ExecutorState = "COMPLETED"
	ExecutorFailedState    ExecutorState = "FAILED"
	ExecutorUnknownState   ExecutorState = "UNKNOWN"

func (*SparkJob) HostIPOfChief

func (sj *SparkJob) HostIPOfChief() (hostIP string)

Get the hostIP of the driver Pod

func (*SparkJob) Name

func (sj *SparkJob) Name() string

func (*SparkJob) Namespace

func (sj *SparkJob) Namespace() string

func (*SparkJob) RequestedGPU

func (sj *SparkJob) RequestedGPU() int64

spark job without gpu supported

func (*SparkJob) StartTime

func (sj *SparkJob) StartTime() *metav1.Time

func (*SparkJob) Trainer

func (sj *SparkJob) Trainer() string

return trainerType: sparkjob

type SparkJobTrainer

type SparkJobTrainer struct {
	// contains filtered or unexported fields
}

spark job trainer

func (*SparkJobTrainer) GetTrainingJob

func (st *SparkJobTrainer) GetTrainingJob(name, namespace string) (job TrainingJob, err error)

func (*SparkJobTrainer) IsSupported

func (st *SparkJobTrainer) IsSupported(name, ns string) bool

func (*SparkJobTrainer) ListTrainingJobs

func (st *SparkJobTrainer) ListTrainingJobs() (jobs []TrainingJob, err error)

func (*SparkJobTrainer) Type

func (st *SparkJobTrainer) Type() string

type StandaloneJob

type StandaloneJob struct {
	*JobInfo
}

Standalone Job Information

func (*StandaloneJob) GetJobDashboards

func (sj *StandaloneJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)

Get Dashboard url of the job

type StandaloneJobTrainer

type StandaloneJobTrainer struct {
	// contains filtered or unexported fields
}

Standalone Job trainer

func (*StandaloneJobTrainer) GetTrainingJob

func (s *StandaloneJobTrainer) GetTrainingJob(name, namespace string) (tj TrainingJob, err error)

func (*StandaloneJobTrainer) IsSupported

func (s *StandaloneJobTrainer) IsSupported(name, ns string) bool

Check if it's a Standalone job

func (*StandaloneJobTrainer) ListTrainingJobs

func (s *StandaloneJobTrainer) ListTrainingJobs() (jobs []TrainingJob, err error)

List training jobs

func (*StandaloneJobTrainer) Type

func (s *StandaloneJobTrainer) Type() string

type StringMatchPrefix

type StringMatchPrefix struct {
	Prefix string `protobuf:"bytes,2,opt,name=prefix,proto3,oneof" json:"prefix,omitempty"`
}

type TensorFlowJob

type TensorFlowJob struct {
	// contains filtered or unexported fields
}

TensorFlow Job Information

func (*TensorFlowJob) Age

func (tj *TensorFlowJob) Age() time.Duration

Get the Job Age

func (*TensorFlowJob) AllPods

func (tj *TensorFlowJob) AllPods() []v1.Pod

Get all the pods of the Training Job

func (*TensorFlowJob) AllocatedGPU

func (tj *TensorFlowJob) AllocatedGPU() int64

Allocated GPU count of the Job

func (*TensorFlowJob) ChiefPod

func (tj *TensorFlowJob) ChiefPod() v1.Pod

Get the chief Pod of the Job.

func (*TensorFlowJob) Duration

func (tj *TensorFlowJob) Duration() time.Duration

Get the Job Training Duration

func (*TensorFlowJob) GetJobDashboards

func (tj *TensorFlowJob) GetJobDashboards(client *kubernetes.Clientset) ([]string, error)

Get Dashboard url of the job

func (*TensorFlowJob) GetStatus

func (tj *TensorFlowJob) GetStatus() (status string)

Get the Status of the Job: RUNNING, PENDING, SUCCEEDED, FAILED

func (*TensorFlowJob) HostIPOfChief

func (tj *TensorFlowJob) HostIPOfChief() (hostIP string)

Get the hostIP of the chief Pod

func (*TensorFlowJob) Name

func (tj *TensorFlowJob) Name() string

func (*TensorFlowJob) Namespace

func (tj *TensorFlowJob) Namespace() string

func (*TensorFlowJob) RequestedGPU

func (tj *TensorFlowJob) RequestedGPU() int64

Requested GPU count of the Job

func (*TensorFlowJob) StartTime

func (tj *TensorFlowJob) StartTime() *metav1.Time

func (*TensorFlowJob) Trainer

func (tj *TensorFlowJob) Trainer() string

type TensorFlowJobTrainer

type TensorFlowJobTrainer struct {
	// contains filtered or unexported fields
}

TensorFlow Job trainer

func (*TensorFlowJobTrainer) GetTrainingJob

func (tt *TensorFlowJobTrainer) GetTrainingJob(name, namespace string) (tj TrainingJob, err error)

func (*TensorFlowJobTrainer) IsSupported

func (tt *TensorFlowJobTrainer) IsSupported(name, ns string) bool

Check if it's a TensorFlow job

func (*TensorFlowJobTrainer) ListTrainingJobs

func (tt *TensorFlowJobTrainer) ListTrainingJobs() (jobs []TrainingJob, err error)

List training jobs

func (*TensorFlowJobTrainer) Type

func (tt *TensorFlowJobTrainer) Type() string

type Trainer

type Trainer interface {
	// Check if the training job is supported
	IsSupported(name, ns string) bool

	// Get TrainingJob object directly. this method is called when `arena get`
	GetTrainingJob(name, namespace string) (TrainingJob, error)

	// Get the type of trainer
	Type() string

	ListTrainingJobs() ([]TrainingJob, error)
}

func NewHorovodJobTrainer

func NewHorovodJobTrainer(client *kubernetes.Clientset) Trainer

Create HorovodJob Trainer

func NewMPIJobTrainer

func NewMPIJobTrainer(client *kubernetes.Clientset) Trainer

NewMPIJobTrainer

func NewSparkJobTrainer

func NewSparkJobTrainer(client *kubernetes.Clientset) Trainer

func NewStandaloneJobTrainer

func NewStandaloneJobTrainer(client *kubernetes.Clientset) Trainer

func NewTensorFlowJobTrainer

func NewTensorFlowJobTrainer(client *kubernetes.Clientset) Trainer

func NewTrainers

func NewTrainers(client *kubernetes.Clientset) []Trainer

construct the trainer list

type TrainingJob

type TrainingJob interface {
	// Get the chief Pod of the Job.
	ChiefPod() v1.Pod

	// Get the name of the Training Job
	Name() string

	// Get the namespace of the Training Job
	Namespace() string

	// Get all the pods of the Training Job
	AllPods() []v1.Pod

	// Get the Status of the Job: RUNNING, PENDING,
	GetStatus() string

	// Return trainer Type, support MPI, standalone, tensorflow
	Trainer() string

	// Get the Job Age
	Age() time.Duration

	// Get the Job Duration
	Duration() time.Duration

	// Get start time
	StartTime() *metav1.Time

	// Get Dashboard
	GetJobDashboards(client *kubernetes.Clientset) ([]string, error)

	// Requested GPU count of the Job
	RequestedGPU() int64

	// Requested GPU count of the Job
	AllocatedGPU() int64

	// the host ip of the chief pod
	HostIPOfChief() string
}

The Training Job can be TensorFlow, MPI and Caffe

type VirtualService

type VirtualService struct {
	*istiov1alpha3.VirtualService
	Http []*HTTPRoute `protobuf:"bytes,3,rep,name=http" json:"http,omitempty"`
}

type VirtualServiceCRD

type VirtualServiceCRD struct {
	// Kind is a string value representing the REST resource this object represents.
	// Servers may infer this from the endpoint the client submits requests to.
	// Cannot be updated.
	// In CamelCase.
	// More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds
	// +optional
	Kind string `json:"kind,omitempty" protobuf:"bytes,1,opt,name=kind"`

	// APIVersion defines the versioned schema of this representation of an object.
	// Servers should convert recognized schemas to the latest internal value, and
	// may reject unrecognized values.
	// More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources
	// +optional
	APIVersion        string `json:"apiVersion,omitempty" protobuf:"bytes,2,opt,name=apiVersion"`
	metav1.ObjectMeta `json:"metadata,omitempty" yaml:"metadata,omitempty" protobuf:"bytes,1,opt,name=metadata"`
	Spec              VirtualService `json:"spec,omitempty" yaml:"spec,omitempty" protobuf:"bytes,2,opt,name=spec"`
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL