Documentation
¶
Index ¶
- Constants
- Variables
- func CRDName() string
- func Resource(resource string) schema.GroupResource
- type AcceleratorConfig
- type AcceleratorVolume
- type ContainerName
- type ControllerConfig
- type EnvironmentVariableConfig
- type JobMode
- type MxJob
- type MxJobCondition
- type MxJobConditionType
- type MxJobCopy
- type MxJobList
- type MxJobListCopy
- type MxJobPhase
- type MxJobSpec
- type MxJobStatus
- func (cs *MxJobStatus) AppendRecoveringCondition()
- func (cs *MxJobStatus) AppendRemovingDeadMember(name string)
- func (cs *MxJobStatus) AppendScalingDownCondition(from, to int)
- func (cs *MxJobStatus) AppendUpgradingCondition(to string, member string)
- func (cs *MxJobStatus) Control()
- func (cs MxJobStatus) Copy() MxJobStatus
- func (cs *MxJobStatus) IsFailed() bool
- func (cs *MxJobStatus) PauseControl()
- func (cs *MxJobStatus) SetPhase(p MxJobPhase)
- func (cs *MxJobStatus) SetReadyCondition()
- func (cs *MxJobStatus) SetReason(r string)
- func (cs *MxJobStatus) SetState(s State)
- type MxReplicaSpec
- type MxReplicaStatus
- type MxReplicaType
- type ReplicaState
- type State
Constants ¶
const ( // CRDKind k8s crd kind CRDKind = "MxJob" // CRDKindPlural k8s crd Plural CRDKindPlural = "mxjobs" // CRDGroup k8s crd group CRDGroup = "mxnet.mlkube.io" // CRDVersion k8s crd version CRDVersion = "v1beta1" // CRDApiVersion k8s crd api version CRDApiVersion = CRDGroup + "/" + CRDVersion // "mxnet.mlkube.io/v1beta1" // AppLabel Value of the APP label that gets applied to a lot of entities. AppLabel = "mxnet-job" // PsRootPort Defaults for the Spec PsRootPort = 9091 // Replicas Defaults for the Spec Replicas = 1 )
const ( // MxJobPhaseNone job phase none MxJobPhaseNone MxJobPhase = "" // MxJobPhaseCreating job phase creating MxJobPhaseCreating = "Creating" // MxJobPhaseRunning job phase running MxJobPhaseRunning = "Running" // MxJobPhaseCleanUp job phase cleanup MxJobPhaseCleanUp = "CleanUp" // MxJobPhaseFailed job phase failed MxJobPhaseFailed = "Failed" // MxJobPhaseDone job phase done MxJobPhaseDone = "Done" )
const ( MxJobConditionReady = "Ready" MxJobConditionRemovingDeadMember = "RemovingDeadMember" MxJobConditionRecovering = "Recovering" MxJobConditionScalingUp = "ScalingUp" MxJobConditionScalingDown = "ScalingDown" MxJobConditionUpgrading = "Upgrading" )
TODO(jlewi): Need to define appropriate conditions and get rid of the ones we don't need.
const ( // ReplicaStateUnknown replica state unknown ReplicaStateUnknown ReplicaState = "Unknown" // ReplicaStateStarting replica state starting ReplicaStateStarting = "Starting" // ReplicaStateRunning replica state running ReplicaStateRunning = "Running" // ReplicaStateFailed replica state failed ReplicaStateFailed = "Failed" // ReplicaStateSucceeded replica state succeeded ReplicaStateSucceeded = "Succeeded" )
Variables ¶
var ( // SchemeBuilder is the scheme builder created from addKnownTypes. SchemeBuilder = runtime.NewSchemeBuilder(addKnownTypes) // AddToScheme is the AddToScheme method of SchemeBuilder. AddToScheme = SchemeBuilder.AddToScheme )
var SchemeGroupVersion = schema.GroupVersion{Group: CRDGroup, Version: CRDVersion}
SchemeGroupVersion is the group version used to register these objects.
Functions ¶
func Resource ¶
func Resource(resource string) schema.GroupResource
Resource takes an unqualified resource and returns a Group-qualified GroupResource.
Types ¶
type AcceleratorConfig ¶
type AcceleratorConfig struct { Volumes []AcceleratorVolume EnvVars []EnvironmentVariableConfig }
AcceleratorConfig holds the volumes and environment variables for a docker container.
type AcceleratorVolume ¶
AcceleratorVolume represents a host path that must be mounted into each container that needs to use GPUs.
type ContainerName ¶
type ContainerName string
ContainerName is an enum for expected containers.
const ( // MXNET container name for mxnet training job MXNET ContainerName = "mxnet" )
type ControllerConfig ¶
type ControllerConfig struct { // Accelerators is a map from the name of the accelerator to the config for that accelerator. // This should match the value specified as a container limit. // e.g. alpha.kubernetes.io/nvidia-gpu Accelerators map[string]AcceleratorConfig }
ControllerConfig for docker container with GPU accelerator
type EnvironmentVariableConfig ¶
EnvironmentVariableConfig for container
type MxJob ¶
type MxJob struct { metav1.TypeMeta `json:",inline"` Metadata metav1.ObjectMeta `json:"metadata,omitempty"` Spec MxJobSpec `json:"spec"` Status MxJobStatus `json:"status"` }
MxJob mxnet job
func (*MxJob) AsOwner ¶
func (j *MxJob) AsOwner() metav1.OwnerReference
AsOwner returns the owner reference.
func (*MxJob) UnmarshalJSON ¶
UnmarshalJSON for MxJob
type MxJobCondition ¶
type MxJobCondition struct { Type MxJobConditionType `json:"type,omitempty"` Reason string `json:"reason,omitempty"` TransitionTime string `json:"transitionTime,omitempty"` }
MxJobCondition mxnet job condition
type MxJobConditionType ¶
type MxJobConditionType string
MxJobConditionType mxnet job condition type
type MxJobList ¶
type MxJobList struct { metav1.TypeMeta `json:",inline"` // Standard list metadata // More info: http://releases.k8s.io/HEAD/docs/devel/api-conventions.md#metadata Metadata metav1.ListMeta `json:"metadata,omitempty"` // Items is a list of third party objects Items []MxJob `json:"items"` }
MxJobList is a list of MxJobs.
func (*MxJobList) UnmarshalJSON ¶
UnmarshalJSON for MxJobList
type MxJobSpec ¶
type MxJobSpec struct { // RuntimeId job id RuntimeId string // JobMode MXNet training job mode: local, dist JobMode `json:"jobMode"` // ReplicaSpecs specifies the Mx replicas to run. ReplicaSpecs []*MxReplicaSpec `json:"replicaSpecs"` }
MxJobSpec mxnet job specification
func (*MxJobSpec) Cleanup ¶
func (c *MxJobSpec) Cleanup()
Cleanup cleans up user passed spec, e.g. defaulting, transforming fields. TODO: move this to admission controller
func (*MxJobSpec) ConfigureAccelerators ¶
func (c *MxJobSpec) ConfigureAccelerators(accelerators map[string]AcceleratorConfig) error
ConfigureAccelerators adds any accelerator specific configuration to the pods.
func (*MxJobSpec) SetDefaults ¶
SetDefaults sets any unspecified values to defaults
type MxJobStatus ¶
type MxJobStatus struct { // Phase is the MxJob running phase Phase MxJobPhase `json:"phase,omitempty"` Reason string `json:"reason,omitempty"` // ControlPaused indicates the operator pauses the control of the cluster. // TODO(jlewi): I think we can get rid of ControlPaused. ControlPaused bool `json:"controlPaused"` // Condition keeps ten most recent cluster conditions Conditions []MxJobCondition `json:"conditions,omitempty"` // State indicates the state of the job. State State `json:"state,omitempty"` // ReplicaStatuses specifies the status of each Mx replica. ReplicaStatuses []*MxReplicaStatus `json:"replicaStatuses"` }
MxJobStatus mxnet job status
func (*MxJobStatus) AppendRecoveringCondition ¶
func (cs *MxJobStatus) AppendRecoveringCondition()
AppendRecoveringCondition for mxnet job status
func (*MxJobStatus) AppendRemovingDeadMember ¶
func (cs *MxJobStatus) AppendRemovingDeadMember(name string)
AppendRemovingDeadMember for mxnet job status
func (*MxJobStatus) AppendScalingDownCondition ¶
func (cs *MxJobStatus) AppendScalingDownCondition(from, to int)
AppendScalingDownCondition for mxnet job status
func (*MxJobStatus) AppendUpgradingCondition ¶
func (cs *MxJobStatus) AppendUpgradingCondition(to string, member string)
AppendUpgradingCondition for mxnet job status
func (*MxJobStatus) IsFailed ¶
func (cs *MxJobStatus) IsFailed() bool
IsFailed returns true if the job status is failed.
func (*MxJobStatus) PauseControl ¶
func (cs *MxJobStatus) PauseControl()
PauseControl sets cs.ControlPaused to true.
func (*MxJobStatus) SetPhase ¶
func (cs *MxJobStatus) SetPhase(p MxJobPhase)
SetPhase sets the phase of the mxnet job status.
func (*MxJobStatus) SetReadyCondition ¶
func (cs *MxJobStatus) SetReadyCondition()
SetReadyCondition for mxnet job status
func (*MxJobStatus) SetReason ¶
func (cs *MxJobStatus) SetReason(r string)
SetReason for mxnet job status
func (*MxJobStatus) SetState ¶
func (cs *MxJobStatus) SetState(s State)
SetState for mxnet job status
type MxReplicaSpec ¶
type MxReplicaSpec struct { // Replicas is the number of desired replicas. // This is a pointer to distinguish between explicit zero and unspecified. // Defaults to 1. // More info: http://kubernetes.io/docs/user-guide/replication-controller#what-is-a-replication-controller // +optional Replicas *int32 `json:"replicas,omitempty" protobuf:"varint,1,opt,name=replicas"` Template *v1.PodTemplateSpec `json:"template,omitempty" protobuf:"bytes,3,opt,name=template"` // PsRootPort is the port to use for the scheduler. PsRootPort *int32 `json:"PsRootPort,omitempty" protobuf:"varint,1,opt,name=PsRootPort"` MxReplicaType `json:"mxReplicaType"` }
MxReplicaSpec mxnet replica specification
type MxReplicaStatus ¶
type MxReplicaStatus struct { MxReplicaType `json:"Mx_replica_type"` // State is the overall state of the replica State ReplicaState `json:"state"` // ReplicasStates provides the number of replicas in each status. ReplicasStates map[ReplicaState]int }
MxReplicaStatus mxnet replica status
type MxReplicaType ¶
type MxReplicaType string
MxReplicaType determines how a set of Mx processes are handled.
const ( // SCHEDULER mxnet training job replica type SCHEDULER MxReplicaType = "SCHEDULER" // SERVER mxnet training job replica type SERVER MxReplicaType = "SERVER" // WORKER mxnet training job replica type WORKER MxReplicaType = "WORKER" )