Documentation ¶
Index ¶
- Constants
- Variables
- func ContainsMasterSpec(replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec) bool
- type ElasticEnvVarGenerator
- type EnvVarGenerator
- type MasterEnvVarGenerator
- type PyTorchJobReconciler
- func (r *PyTorchJobReconciler) ControllerName() string
- func (r *PyTorchJobReconciler) DeleteJob(job interface{}) error
- func (jc *PyTorchJobReconciler) GenLabelSelector(jobName string, rtype kubeflowv1.ReplicaType) *metav1.LabelSelector
- func (r *PyTorchJobReconciler) GetAPIGroupVersion() schema.GroupVersion
- func (r *PyTorchJobReconciler) GetAPIGroupVersionKind() schema.GroupVersionKind
- func (r *PyTorchJobReconciler) GetDefaultContainerName() string
- func (r *PyTorchJobReconciler) GetDefaultContainerPortName() string
- func (r *PyTorchJobReconciler) GetFrameworkName() string
- func (r *PyTorchJobReconciler) GetGroupNameLabelValue() string
- func (r *PyTorchJobReconciler) GetJobFromAPIClient(namespace, name string) (metav1.Object, error)
- func (r *PyTorchJobReconciler) GetJobFromInformerCache(namespace, name string) (metav1.Object, error)
- func (r *PyTorchJobReconciler) GetPodsForJob(obj interface{}) ([]*corev1.Pod, error)
- func (r *PyTorchJobReconciler) GetServicesForJob(obj interface{}) ([]*corev1.Service, error)
- func (r *PyTorchJobReconciler) IsMasterRole(replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec, ...) bool
- func (r *PyTorchJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)
- func (r *PyTorchJobReconciler) ReconcileHPA(pytorchJob *kubeflowv1.PyTorchJob) error
- func (r *PyTorchJobReconciler) SetClusterSpec(job interface{}, podTemplate *corev1.PodTemplateSpec, rtype, index string) error
- func (r *PyTorchJobReconciler) SetupWithManager(mgr ctrl.Manager, controllerThreads int) error
- func (r *PyTorchJobReconciler) UpdateJobStatus(job interface{}, replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec, ...) error
- func (r *PyTorchJobReconciler) UpdateJobStatusInApiServer(job interface{}, jobStatus *kubeflowv1.JobStatus) error
Constants ¶
const ( // EnvRDZVBackend is the environment variable name for the rdzv backend. EnvRDZVBackend = "PET_RDZV_BACKEND" // EnvRDZVID is the environment variable name for the rdzv id. EnvRDZVID = "PET_RDZV_ID" // EnvRDZVConf is the environment variable name for the rdzv conf. EnvRDZVConf = "PET_RDZV_CONF" // EnvRDZVEndpoint is the environment variable name for the rdzv endpoint. EnvRDZVEndpoint = "PET_RDZV_ENDPOINT" // EnvStandalone is the environment variable name for the standalone mode. EnvStandalone = "PET_STANDALONE" // EnvMaxRestarts is the environment variable name for the maximum number of worker group restarts before failing. EnvMaxRestarts = "PET_MAX_RESTARTS" // EnvMonitorInterval is the environment variable name for the interval, in seconds, to monitor the state of workers. EnvMonitorInterval = "PET_MONITOR_INTERVAL" // EnvStartMethod is the environment variable name for the multiprocessing start method to use when creating workers, which could be fork, spawn and forkserver. EnvStartMethod = "PET_START_METHOD" // EnvNProcPerNode is the environment variable name for the number of processes per node. EnvNProcPerNode = "PET_NPROC_PER_NODE" )
const ( // EnvNprocPerNode is the environment variable name for the number of processes per node. EnvNprocPerNode = "PET_NPROC_PER_NODE" // EnvNnodes is the environment variable name for the number of nodes. EnvNnodes = "PET_NNODES" // EnvNodeRank is the environment variable name for the rank of nodes. EnvNodeRank = "PET_NODE_RANK" )
Variables ¶
var ( EnvMasterPort = "MASTER_PORT" EnvMasterAddr = "MASTER_ADDR" PETMasterPort = "PET_MASTER_PORT" PETMasterAddr = "PET_MASTER_ADDR" )
Functions ¶
func ContainsMasterSpec ¶
func ContainsMasterSpec(replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec) bool
ContainsMasterSpec returns true if the PyTorchJob contains a master spec.
Types ¶
type ElasticEnvVarGenerator ¶
type ElasticEnvVarGenerator struct{}
ElasticEnvVarGenerator is the environment variable generator for Elastic related arguments.
func (ElasticEnvVarGenerator) Generate ¶
func (e ElasticEnvVarGenerator) Generate( job *kubeflowv1.PyTorchJob) ([]corev1.EnvVar, error)
type EnvVarGenerator ¶
type EnvVarGenerator interface {
Generate(job *kubeflowv1.PyTorchJob) ([]corev1.EnvVar, error)
}
EnvVarGenerator is the environment variable generator interface.
func GetElasticEnvVarGenerator ¶
func GetElasticEnvVarGenerator() EnvVarGenerator
func GetMasterEnvVarGenerator ¶
func GetMasterEnvVarGenerator() EnvVarGenerator
type MasterEnvVarGenerator ¶
type MasterEnvVarGenerator struct { }
MasterEnvVarGenerator is the environment variable generator for Master related arguments.
func (MasterEnvVarGenerator) Generate ¶
func (e MasterEnvVarGenerator) Generate( job *kubeflowv1.PyTorchJob) ([]corev1.EnvVar, error)
type PyTorchJobReconciler ¶
type PyTorchJobReconciler struct { common.JobController client.Client Scheme *runtime.Scheme Log logr.Logger // contains filtered or unexported fields }
PyTorchJobReconciler reconciles a PyTorchJob object
func NewReconciler ¶
func NewReconciler(mgr manager.Manager, gangSchedulingSetupFunc common.GangSchedulingSetupFunc) *PyTorchJobReconciler
NewReconciler creates a PyTorchJob Reconciler
func (*PyTorchJobReconciler) ControllerName ¶
func (r *PyTorchJobReconciler) ControllerName() string
func (*PyTorchJobReconciler) DeleteJob ¶
func (r *PyTorchJobReconciler) DeleteJob(job interface{}) error
func (*PyTorchJobReconciler) GenLabelSelector ¶
func (jc *PyTorchJobReconciler) GenLabelSelector(jobName string, rtype kubeflowv1.ReplicaType) *metav1.LabelSelector
func (*PyTorchJobReconciler) GetAPIGroupVersion ¶
func (r *PyTorchJobReconciler) GetAPIGroupVersion() schema.GroupVersion
func (*PyTorchJobReconciler) GetAPIGroupVersionKind ¶
func (r *PyTorchJobReconciler) GetAPIGroupVersionKind() schema.GroupVersionKind
func (*PyTorchJobReconciler) GetDefaultContainerName ¶
func (r *PyTorchJobReconciler) GetDefaultContainerName() string
func (*PyTorchJobReconciler) GetDefaultContainerPortName ¶
func (r *PyTorchJobReconciler) GetDefaultContainerPortName() string
func (*PyTorchJobReconciler) GetFrameworkName ¶
func (r *PyTorchJobReconciler) GetFrameworkName() string
func (*PyTorchJobReconciler) GetGroupNameLabelValue ¶
func (r *PyTorchJobReconciler) GetGroupNameLabelValue() string
func (*PyTorchJobReconciler) GetJobFromAPIClient ¶
func (r *PyTorchJobReconciler) GetJobFromAPIClient(namespace, name string) (metav1.Object, error)
func (*PyTorchJobReconciler) GetJobFromInformerCache ¶
func (r *PyTorchJobReconciler) GetJobFromInformerCache(namespace, name string) (metav1.Object, error)
func (*PyTorchJobReconciler) GetPodsForJob ¶
func (r *PyTorchJobReconciler) GetPodsForJob(obj interface{}) ([]*corev1.Pod, error)
func (*PyTorchJobReconciler) GetServicesForJob ¶
func (r *PyTorchJobReconciler) GetServicesForJob(obj interface{}) ([]*corev1.Service, error)
func (*PyTorchJobReconciler) IsMasterRole ¶
func (r *PyTorchJobReconciler) IsMasterRole(replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec, rtype kubeflowv1.ReplicaType, index int) bool
func (*PyTorchJobReconciler) Reconcile ¶
func (r *PyTorchJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)
Reconcile is part of the main Kubernetes reconciliation loop, which aims to move the current state of the cluster closer to the desired state. It compares the state specified by the PyTorchJob object against the actual cluster state, and then performs operations to make the cluster state reflect the state specified by the user.
For more details, check Reconcile and its Result here: - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.8.3/pkg/reconcile
func (*PyTorchJobReconciler) ReconcileHPA ¶
func (r *PyTorchJobReconciler) ReconcileHPA(pytorchJob *kubeflowv1.PyTorchJob) error
func (*PyTorchJobReconciler) SetClusterSpec ¶
func (r *PyTorchJobReconciler) SetClusterSpec(job interface{}, podTemplate *corev1.PodTemplateSpec, rtype, index string) error
SetClusterSpec sets the cluster spec and init container for the pod
func (*PyTorchJobReconciler) SetupWithManager ¶
func (r *PyTorchJobReconciler) SetupWithManager(mgr ctrl.Manager, controllerThreads int) error
SetupWithManager sets up the controller with the Manager.
func (*PyTorchJobReconciler) UpdateJobStatus ¶
func (r *PyTorchJobReconciler) UpdateJobStatus(job interface{}, replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec, jobStatus *kubeflowv1.JobStatus) error
UpdateJobStatus updates the job status and job conditions
func (*PyTorchJobReconciler) UpdateJobStatusInApiServer ¶
func (r *PyTorchJobReconciler) UpdateJobStatusInApiServer(job interface{}, jobStatus *kubeflowv1.JobStatus) error
UpdateJobStatusInApiServer updates the job status in the cluster.