Documentation ¶
Index ¶
- Constants
- func AddImageWarmupForWorker(podTemplate *corev1.PodTemplateSpec, mainContainerName string)
- func AddMasterWaiterForWorker(podTemplate *corev1.PodTemplateSpec, param InitContainerParam) error
- func ContainMasterSpec(job *training.PyTorchJob) bool
- type InitContainerParam
- type PytorchJobReconciler
- func (r *PytorchJobReconciler) CheckpointIfNecessary(job interface{}, pods []*corev1.Pod) (completed bool, err error)
- func (r *PytorchJobReconciler) ControllerName() string
- func (r *PytorchJobReconciler) DeleteJob(job interface{}) error
- func (r *PytorchJobReconciler) EnableElasticScaling(job metav1.Object, runPolicy *v1.RunPolicy) bool
- func (r *PytorchJobReconciler) GetAPIGroupVersion() schema.GroupVersion
- func (r *PytorchJobReconciler) GetAPIGroupVersionKind() schema.GroupVersionKind
- func (r *PytorchJobReconciler) GetDefaultContainerName() string
- func (r *PytorchJobReconciler) GetDefaultContainerPortName() string
- func (r *PytorchJobReconciler) GetDefaultContainerPortNumber() int32
- func (r *PytorchJobReconciler) GetGroupNameLabelValue() string
- func (r *PytorchJobReconciler) GetJobFromAPIClient(namespace, name string) (metav1.Object, error)
- func (r *PytorchJobReconciler) GetJobFromInformerCache(namespace, name string) (metav1.Object, error)
- func (r *PytorchJobReconciler) GetNodeForModelOutput(pods []*corev1.Pod) (nodeName string)
- func (r *PytorchJobReconciler) GetPodsForJob(obj interface{}) ([]*corev1.Pod, error)
- func (r *PytorchJobReconciler) GetReconcileOrders() []v1.ReplicaType
- func (r *PytorchJobReconciler) GetServicesForJob(obj interface{}) ([]*corev1.Service, error)
- func (r *PytorchJobReconciler) IsMasterRole(replicas map[v1.ReplicaType]*v1.ReplicaSpec, rtype v1.ReplicaType, index int) bool
- func (r *PytorchJobReconciler) Reconcile(_ context.Context, req ctrl.Request) (ctrl.Result, error)
- func (r *PytorchJobReconciler) ScaleIn(job interface{}, replicas map[v1.ReplicaType]*v1.ReplicaSpec, ...) error
- func (r *PytorchJobReconciler) ScaleOut(job interface{}, replicas map[v1.ReplicaType]*v1.ReplicaSpec, ...) error
- func (r *PytorchJobReconciler) SetClusterSpec(ctx context.Context, job interface{}, podTemplate *corev1.PodTemplateSpec, ...) error
- func (r *PytorchJobReconciler) SetupWithManager(mgr ctrl.Manager) error
- func (r *PytorchJobReconciler) UpdateJobStatus(job interface{}, replicas map[v1.ReplicaType]*v1.ReplicaSpec, ...) error
- func (r *PytorchJobReconciler) UpdateJobStatusInApiServer(job interface{}, jobStatus *v1.JobStatus) error
Constants ¶
const ( AnnotationCheckpointRequestedVersion = v1.KubeDLPrefix + "/ckpt-requested-version" AnnotationCheckpointCompletedVersion = v1.KubeDLPrefix + "/ckpt-completed-version" AnnotationReadyToStartWorker = v1.KubeDLPrefix + "/ready-to-start-worker" AnnotationImmediatelyStartWorker = v1.KubeDLPrefix + "/immediately-start-worker" AnnotationWorldSize = v1.KubeDLPrefix + "/world-size" )
const ( CheckpointStartReason = "CheckpointStarted" CheckpointFinishedReason = "CheckpointSucceeded" CheckpointFailedReason = "CheckpointFailed" )
Variables ¶
This section is empty.
Functions ¶
func AddImageWarmupForWorker ¶ added in v0.4.3
func AddImageWarmupForWorker(podTemplate *corev1.PodTemplateSpec, mainContainerName string)
func AddMasterWaiterForWorker ¶ added in v0.4.3
func AddMasterWaiterForWorker(podTemplate *corev1.PodTemplateSpec, param InitContainerParam) error
func ContainMasterSpec ¶
func ContainMasterSpec(job *training.PyTorchJob) bool
Types ¶
type InitContainerParam ¶ added in v0.4.3
type PytorchJobReconciler ¶
PytorchJobReconciler reconciles a PyTorchJob object.
func NewReconciler ¶
func NewReconciler(mgr ctrl.Manager, config options.JobControllerConfiguration) *PytorchJobReconciler
func (*PytorchJobReconciler) CheckpointIfNecessary ¶ added in v0.4.3
func (r *PytorchJobReconciler) CheckpointIfNecessary(job interface{}, pods []*corev1.Pod) (completed bool, err error)
CheckpointIfNecessary triggers a checkpoint when workers are about to be preempted or evicted: it notifies AIMaster to checkpoint and drains the victim pods after the checkpoint succeeds. Each checkpoint request contains a `version` to distinguish between different checkpoint rounds, and the controller guarantees that 'checkpoint-version' <= 'job generation'. When preemption happens, the controller triggers a new round of checkpointing, takes the job generation as its version, and increments the generation after the checkpoint succeeds.
func (*PytorchJobReconciler) ControllerName ¶
func (r *PytorchJobReconciler) ControllerName() string
func (*PytorchJobReconciler) DeleteJob ¶
func (r *PytorchJobReconciler) DeleteJob(job interface{}) error
DeleteJob deletes the job
func (*PytorchJobReconciler) EnableElasticScaling ¶ added in v0.4.3
func (*PytorchJobReconciler) GetAPIGroupVersion ¶
func (r *PytorchJobReconciler) GetAPIGroupVersion() schema.GroupVersion
GetAPIGroupVersion returns the GroupVersion of the API
func (*PytorchJobReconciler) GetAPIGroupVersionKind ¶
func (r *PytorchJobReconciler) GetAPIGroupVersionKind() schema.GroupVersionKind
GetAPIGroupVersionKind returns the GroupVersionKind of the API
func (*PytorchJobReconciler) GetDefaultContainerName ¶
func (r *PytorchJobReconciler) GetDefaultContainerName() string
GetDefaultContainerName returns the default container name in pod
func (*PytorchJobReconciler) GetDefaultContainerPortName ¶
func (r *PytorchJobReconciler) GetDefaultContainerPortName() string
GetDefaultContainerPortName returns the default container port name
func (*PytorchJobReconciler) GetDefaultContainerPortNumber ¶
func (r *PytorchJobReconciler) GetDefaultContainerPortNumber() int32
GetDefaultContainerPortNumber returns the default container port number
func (*PytorchJobReconciler) GetGroupNameLabelValue ¶
func (r *PytorchJobReconciler) GetGroupNameLabelValue() string
GetGroupNameLabelValue returns the group name value used in the labels of the job
func (*PytorchJobReconciler) GetJobFromAPIClient ¶
func (r *PytorchJobReconciler) GetJobFromAPIClient(namespace, name string) (metav1.Object, error)
GetJobFromAPIClient returns the Job from API server
func (*PytorchJobReconciler) GetJobFromInformerCache ¶
func (r *PytorchJobReconciler) GetJobFromInformerCache(namespace, name string) (metav1.Object, error)
GetJobFromInformerCache returns the Job from Informer Cache
func (*PytorchJobReconciler) GetNodeForModelOutput ¶ added in v0.4.0
func (r *PytorchJobReconciler) GetNodeForModelOutput(pods []*corev1.Pod) (nodeName string)
func (*PytorchJobReconciler) GetPodsForJob ¶
func (r *PytorchJobReconciler) GetPodsForJob(obj interface{}) ([]*corev1.Pod, error)
GetPodsForJob returns the pods managed by the job. This can be achieved by selecting pods using the label key "job-name", i.e. all pods created by the job will come with the label "job-name" = <this_job_name>
func (*PytorchJobReconciler) GetReconcileOrders ¶
func (r *PytorchJobReconciler) GetReconcileOrders() []v1.ReplicaType
func (*PytorchJobReconciler) GetServicesForJob ¶
func (r *PytorchJobReconciler) GetServicesForJob(obj interface{}) ([]*corev1.Service, error)
GetServicesForJob returns the services managed by the job. This can be achieved by selecting services using the label key "job-name", i.e. all services created by the job will come with the label "job-name" = <this_job_name>
func (*PytorchJobReconciler) IsMasterRole ¶
func (r *PytorchJobReconciler) IsMasterRole(replicas map[v1.ReplicaType]*v1.ReplicaSpec, rtype v1.ReplicaType, index int) bool
IsMasterRole reports whether the replica with the specified type and index is a master role. A master-role pod will have "job-role=master" set in its labels
func (*PytorchJobReconciler) ScaleIn ¶ added in v0.4.3
func (r *PytorchJobReconciler) ScaleIn(job interface{}, replicas map[v1.ReplicaType]*v1.ReplicaSpec, activePods []*corev1.Pod, activeServices []*corev1.Service) error
func (*PytorchJobReconciler) ScaleOut ¶ added in v0.4.3
func (r *PytorchJobReconciler) ScaleOut(job interface{}, replicas map[v1.ReplicaType]*v1.ReplicaSpec, activePods []*corev1.Pod, activeServices []*corev1.Service) error
func (*PytorchJobReconciler) SetClusterSpec ¶
func (r *PytorchJobReconciler) SetClusterSpec(ctx context.Context, job interface{}, podTemplate *corev1.PodTemplateSpec, rtype, index string) error
SetClusterSpec sets the cluster spec for the pod
func (*PytorchJobReconciler) SetupWithManager ¶
func (r *PytorchJobReconciler) SetupWithManager(mgr ctrl.Manager) error
func (*PytorchJobReconciler) UpdateJobStatus ¶
func (r *PytorchJobReconciler) UpdateJobStatus(job interface{}, replicas map[v1.ReplicaType]*v1.ReplicaSpec, jobStatus *v1.JobStatus, restart bool) error
UpdateJobStatus updates the job status and job conditions
func (*PytorchJobReconciler) UpdateJobStatusInApiServer ¶
func (r *PytorchJobReconciler) UpdateJobStatusInApiServer(job interface{}, jobStatus *v1.JobStatus) error
UpdateJobStatusInApiServer updates the job status in API server