pytorch

package
v1.7.0-fix Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Nov 15, 2024 License: Apache-2.0 Imports: 46 Imported by: 0

Documentation

Index

Constants

View Source
// Names of the PET_*-prefixed environment variables declared by this package
// for rdzv settings, standalone mode, worker restart/monitoring behaviour and
// the per-node process count.
const (

	// EnvRDZVBackend is the environment variable name for the rdzv backend.
	EnvRDZVBackend = "PET_RDZV_BACKEND"
	// EnvRDZVID is the environment variable name for the rdzv id.
	EnvRDZVID = "PET_RDZV_ID"
	// EnvRDZVConf is the environment variable name for the rdzv conf.
	EnvRDZVConf = "PET_RDZV_CONF"
	// EnvRDZVEndpoint is the environment variable name for the rdzv endpoint.
	EnvRDZVEndpoint = "PET_RDZV_ENDPOINT"
	// EnvStandalone is the environment variable name for the standalone mode.
	EnvStandalone = "PET_STANDALONE"

	// EnvMaxRestarts is the environment variable name for the maximum number of
	// worker group restarts before failing.
	EnvMaxRestarts = "PET_MAX_RESTARTS"
	// EnvMonitorInterval is the environment variable name for the interval, in
	// seconds, to monitor the state of workers.
	EnvMonitorInterval = "PET_MONITOR_INTERVAL"
	// EnvStartMethod is the environment variable name for the multiprocessing
	// start method to use when creating workers, which can be fork, spawn or
	// forkserver.
	EnvStartMethod = "PET_START_METHOD"

	// EnvNProcPerNode is the environment variable name for the number of
	// processes per node.
	EnvNProcPerNode = "PET_NPROC_PER_NODE"
)
View Source
// Names of the PET_*-prefixed environment variables describing the node
// topology: processes per node, number of nodes and node rank.
const (

	// EnvNprocPerNode is the environment variable name for the number of
	// processes per node.
	//
	// NOTE(review): this carries the same value ("PET_NPROC_PER_NODE") as
	// EnvNProcPerNode, which differs only in capitalisation; confirm whether
	// both spellings are still needed.
	EnvNprocPerNode = "PET_NPROC_PER_NODE"
	// EnvNnodes is the environment variable name for the number of nodes.
	EnvNnodes = "PET_NNODES"
	// EnvNodeRank is the environment variable name for the rank of nodes.
	EnvNodeRank = "PET_NODE_RANK"
)

Variables

View Source
// Names used for the master address and port settings, in a plain form and a
// PET_-prefixed form.
//
// NOTE(review): presumably these are environment variable names like the
// PET_* constants in this package — confirm against the code that consumes
// them. They are declared with var rather than const, so they are assignable
// at runtime; confirm that this is intentional.
var (
	// EnvMasterPort holds the name "MASTER_PORT".
	EnvMasterPort = "MASTER_PORT"
	// EnvMasterAddr holds the name "MASTER_ADDR".
	EnvMasterAddr = "MASTER_ADDR"

	// PETMasterPort holds the PET_-prefixed name "PET_MASTER_PORT".
	PETMasterPort = "PET_MASTER_PORT"
	// PETMasterAddr holds the PET_-prefixed name "PET_MASTER_ADDR".
	PETMasterAddr = "PET_MASTER_ADDR"
)

Functions

func ContainsMasterSpec

func ContainsMasterSpec(replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec) bool

ContainsMasterSpec returns true if the pytorchjob contains master spec.

Types

type ElasticEnvVarGenerator

// ElasticEnvVarGenerator is the environment variable generator for Elastic
// related arguments. It is an empty struct and carries no fields.
type ElasticEnvVarGenerator struct{}

ElasticEnvVarGenerator is the environment variable generator for Elastic related arguments.

func (ElasticEnvVarGenerator) Generate

type EnvVarGenerator

// EnvVarGenerator is the environment variable generator interface.
type EnvVarGenerator interface {
	// Generate takes a PyTorchJob and returns a slice of environment
	// variables, or an error.
	Generate(job *kubeflowv1.PyTorchJob) ([]corev1.EnvVar, error)
}

EnvVarGenerator is the environment variable generator interface.

func GetElasticEnvVarGenerator

func GetElasticEnvVarGenerator() EnvVarGenerator

func GetMasterEnvVarGenerator

func GetMasterEnvVarGenerator() EnvVarGenerator

type MasterEnvVarGenerator

// MasterEnvVarGenerator is the environment variable generator for Master
// related arguments. It is an empty struct and carries no fields.
type MasterEnvVarGenerator struct {
}

MasterEnvVarGenerator is the environment variable generator for Master related arguments.

func (MasterEnvVarGenerator) Generate

type PyTorchJobReconciler

// PyTorchJobReconciler reconciles a PyTorchJob object.
type PyTorchJobReconciler struct {
	// Embedded common.JobController; its exported members are promoted onto
	// the reconciler.
	common.JobController
	// Embedded client.Client; its exported members are promoted onto the
	// reconciler.
	client.Client
	// Scheme is a runtime.Scheme pointer.
	// NOTE(review): presumably the manager's scheme — confirm in NewReconciler.
	Scheme *runtime.Scheme
	// Log is the logr.Logger held by the reconciler.
	Log    logr.Logger
	// contains filtered or unexported fields
}

PyTorchJobReconciler reconciles a PyTorchJob object

func NewReconciler

func NewReconciler(mgr manager.Manager, gangSchedulingSetupFunc common.GangSchedulingSetupFunc) *PyTorchJobReconciler

NewReconciler creates a PyTorchJob Reconciler

func (*PyTorchJobReconciler) ControllerName

func (r *PyTorchJobReconciler) ControllerName() string

func (*PyTorchJobReconciler) DeleteJob

func (r *PyTorchJobReconciler) DeleteJob(job interface{}) error

func (*PyTorchJobReconciler) GenLabelSelector

func (jc *PyTorchJobReconciler) GenLabelSelector(jobName string,
	rtype kubeflowv1.ReplicaType) *metav1.LabelSelector

func (*PyTorchJobReconciler) GetAPIGroupVersion

func (r *PyTorchJobReconciler) GetAPIGroupVersion() schema.GroupVersion

func (*PyTorchJobReconciler) GetAPIGroupVersionKind

func (r *PyTorchJobReconciler) GetAPIGroupVersionKind() schema.GroupVersionKind

func (*PyTorchJobReconciler) GetDefaultContainerName

func (r *PyTorchJobReconciler) GetDefaultContainerName() string

func (*PyTorchJobReconciler) GetDefaultContainerPortName

func (r *PyTorchJobReconciler) GetDefaultContainerPortName() string

func (*PyTorchJobReconciler) GetFrameworkName

func (r *PyTorchJobReconciler) GetFrameworkName() string

func (*PyTorchJobReconciler) GetGroupNameLabelValue

func (r *PyTorchJobReconciler) GetGroupNameLabelValue() string

func (*PyTorchJobReconciler) GetJobFromAPIClient

func (r *PyTorchJobReconciler) GetJobFromAPIClient(namespace, name string) (metav1.Object, error)

func (*PyTorchJobReconciler) GetJobFromInformerCache

func (r *PyTorchJobReconciler) GetJobFromInformerCache(namespace, name string) (metav1.Object, error)

func (*PyTorchJobReconciler) GetPodsForJob

func (r *PyTorchJobReconciler) GetPodsForJob(obj interface{}) ([]*corev1.Pod, error)

func (*PyTorchJobReconciler) GetServicesForJob

func (r *PyTorchJobReconciler) GetServicesForJob(obj interface{}) ([]*corev1.Service, error)

func (*PyTorchJobReconciler) IsMasterRole

func (r *PyTorchJobReconciler) IsMasterRole(replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec,
	rtype kubeflowv1.ReplicaType, index int) bool

func (*PyTorchJobReconciler) Reconcile

func (r *PyTorchJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)

Reconcile is part of the main kubernetes reconciliation loop which aims to move the current state of the cluster closer to the desired state. It compares the state specified by the PyTorchJob object against the actual cluster state, and then performs operations to make the cluster state reflect the state specified by the user.

For more details, check Reconcile and its Result here: https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.8.3/pkg/reconcile

func (*PyTorchJobReconciler) ReconcileHPA

func (r *PyTorchJobReconciler) ReconcileHPA(pytorchJob *kubeflowv1.PyTorchJob) error

func (*PyTorchJobReconciler) SetClusterSpec

func (r *PyTorchJobReconciler) SetClusterSpec(job interface{}, podTemplate *corev1.PodTemplateSpec, rtype, index string) error

SetClusterSpec sets the cluster spec and init container for the pod

func (*PyTorchJobReconciler) SetupWithManager

func (r *PyTorchJobReconciler) SetupWithManager(mgr ctrl.Manager, controllerThreads int) error

SetupWithManager sets up the controller with the Manager.

func (*PyTorchJobReconciler) UpdateJobStatus

func (r *PyTorchJobReconciler) UpdateJobStatus(job interface{},
	replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec,
	jobStatus *kubeflowv1.JobStatus) error

UpdateJobStatus updates the job status and job conditions

func (*PyTorchJobReconciler) UpdateJobStatusInApiServer

func (r *PyTorchJobReconciler) UpdateJobStatusInApiServer(job interface{}, jobStatus *kubeflowv1.JobStatus) error

UpdateJobStatusInApiServer updates the job status in the cluster.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL