Documentation ¶
Overview ¶
Package v2alpha1 contains API Schema definitions for the kubeflow.org v2alpha1 API group +kubebuilder:object:generate=true +groupName=kubeflow.org
Index ¶
- Constants
- Variables
- func GetOpenAPIDefinitions(ref common.ReferenceCallback) map[string]common.OpenAPIDefinition
- func RegisterDefaults(scheme *runtime.Scheme) error
- func Resource(resource string) schema.GroupResource
- func SetObjectDefaults_ClusterTrainingRuntime(in *ClusterTrainingRuntime)
- func SetObjectDefaults_ClusterTrainingRuntimeList(in *ClusterTrainingRuntimeList)
- func SetObjectDefaults_TrainJob(in *TrainJob)
- func SetObjectDefaults_TrainJobList(in *TrainJobList)
- func SetObjectDefaults_TrainingRuntime(in *TrainingRuntime)
- func SetObjectDefaults_TrainingRuntimeList(in *TrainingRuntimeList)
- type ClusterTrainingRuntime
- type ClusterTrainingRuntimeList
- type ContainerOverride
- type CoschedulingPodGroupPolicySource
- type DatasetConfig
- type InputModel
- type JobSetTemplateSpec
- type JobStatus
- type MLPolicy
- type MLPolicySource
- type MPIImplementation
- type MPIMLPolicySource
- type ModelConfig
- type OutputModel
- type PodGroupPolicy
- type PodGroupPolicySource
- type PodSpecOverride
- type PodSpecOverrideTargetJob
- type RuntimeRef
- type TorchElasticPolicy
- type TorchMLPolicySource
- type TrainJob
- type TrainJobList
- type TrainJobSpec
- type TrainJobStatus
- type Trainer
- type TrainingRuntime
- type TrainingRuntimeList
- type TrainingRuntimeSpec
Constants ¶
const ( // TrainingRuntimeKind is the Kind name for the TrainingRuntime. TrainingRuntimeKind string = "TrainingRuntime" // ClusterTrainingRuntimeKind is the Kind name for the ClusterTrainingRuntime. ClusterTrainingRuntimeKind string = "ClusterTrainingRuntime" )
const ( // TrainJobSuspended means that TrainJob is suspended. TrainJobSuspended string = "Suspended" // TrainJobComplete means that the TrainJob has completed its execution. TrainJobComplete string = "Complete" // TrainJobFailed means that the actual jobs have failed its execution. TrainJobFailed string = "Failed" // TrainJobCreated means that the actual jobs creation has succeeded. TrainJobCreated string = "Created" )
const ( // TrainJobSuspendedReason is the "Suspended" condition reason. // When the TrainJob is suspended, this is added. TrainJobSuspendedReason string = "Suspended" // TrainJobResumedReason is the "Suspended" condition reason. // When the TrainJob suspension is changed from True to False, this is added. TrainJobResumedReason string = "Resumed" // TrainJobJobsCreationSucceededReason is the "Created" condition reason. // When the creating objects succeeded after building succeeded, this is added. TrainJobJobsCreationSucceededReason string = "JobsCreationSucceeded" // TrainJobJobsBuildFailedReason is the "Created" condition reason. // When the building objects based on the TrainJob and the specified runtime failed, // this is added. TrainJobJobsBuildFailedReason string = "JobsBuildFailed" // TrainJobJobsCreationFailedReason is the "Created" condition reason. // When the creating objects failed even though building succeeded, this is added. TrainJobJobsCreationFailedReason string = "JobsCreationFailed" )
const ( // TrainJobKind is the Kind name for the TrainJob. TrainJobKind string = "TrainJob" )
Variables ¶
var ( // GroupVersion is group version used to register these objects. GroupVersion = schema.GroupVersion{Group: "kubeflow.org", Version: "v2alpha1"} // SchemeBuilder is used to add go types to the GroupVersionKind scheme. SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion} // SchemeGroupVersion is alias to GroupVersion for client-go libraries. SchemeGroupVersion = GroupVersion // AddToScheme adds the types in this group-version to the given scheme. AddToScheme = SchemeBuilder.AddToScheme )
Functions ¶
func GetOpenAPIDefinitions ¶
func GetOpenAPIDefinitions(ref common.ReferenceCallback) map[string]common.OpenAPIDefinition
func RegisterDefaults ¶
RegisterDefaults adds defaulter functions to the given scheme. Public to allow building arbitrary schemes. All generated defaulters are covering - they call all nested defaulters.
func Resource ¶
func Resource(resource string) schema.GroupResource
Resource takes an unqualified resource and returns a Group-qualified GroupResource.
func SetObjectDefaults_ClusterTrainingRuntime ¶
func SetObjectDefaults_ClusterTrainingRuntime(in *ClusterTrainingRuntime)
func SetObjectDefaults_ClusterTrainingRuntimeList ¶
func SetObjectDefaults_ClusterTrainingRuntimeList(in *ClusterTrainingRuntimeList)
func SetObjectDefaults_TrainJob ¶
func SetObjectDefaults_TrainJob(in *TrainJob)
func SetObjectDefaults_TrainJobList ¶
func SetObjectDefaults_TrainJobList(in *TrainJobList)
func SetObjectDefaults_TrainingRuntime ¶
func SetObjectDefaults_TrainingRuntime(in *TrainingRuntime)
func SetObjectDefaults_TrainingRuntimeList ¶
func SetObjectDefaults_TrainingRuntimeList(in *TrainingRuntimeList)
Types ¶
type ClusterTrainingRuntime ¶
type ClusterTrainingRuntime struct { metav1.TypeMeta `json:",inline"` // Standard object's metadata. metav1.ObjectMeta `json:"metadata,omitempty"` // Specification of the desired ClusterTrainingRuntime. Spec TrainingRuntimeSpec `json:"spec,omitempty"` }
ClusterTrainingRuntime represents a training runtime which can be referenced as part of `runtimeRef` API in TrainJob. This resource is cluster-scoped and can be referenced by a TrainJob created in *any* namespace.
func (*ClusterTrainingRuntime) DeepCopy ¶
func (in *ClusterTrainingRuntime) DeepCopy() *ClusterTrainingRuntime
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ClusterTrainingRuntime.
func (*ClusterTrainingRuntime) DeepCopyInto ¶
func (in *ClusterTrainingRuntime) DeepCopyInto(out *ClusterTrainingRuntime)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (*ClusterTrainingRuntime) DeepCopyObject ¶
func (in *ClusterTrainingRuntime) DeepCopyObject() runtime.Object
DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
type ClusterTrainingRuntimeList ¶
type ClusterTrainingRuntimeList struct { metav1.TypeMeta `json:",inline"` // Standard list metadata. metav1.ListMeta `json:"metadata,omitempty"` // List of ClusterTrainingRuntimes. Items []ClusterTrainingRuntime `json:"items"` }
ClusterTrainingRuntimeList is a collection of cluster training runtimes.
func (*ClusterTrainingRuntimeList) DeepCopy ¶
func (in *ClusterTrainingRuntimeList) DeepCopy() *ClusterTrainingRuntimeList
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ClusterTrainingRuntimeList.
func (*ClusterTrainingRuntimeList) DeepCopyInto ¶
func (in *ClusterTrainingRuntimeList) DeepCopyInto(out *ClusterTrainingRuntimeList)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (*ClusterTrainingRuntimeList) DeepCopyObject ¶
func (in *ClusterTrainingRuntimeList) DeepCopyObject() runtime.Object
DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
type ContainerOverride ¶
type ContainerOverride struct { // Name for the container. TrainingRuntime must have this container. Name string `json:"name"` // Entrypoint commands for the training container. // +listType=atomic Command []string `json:"command,omitempty"` // Arguments to the entrypoint for the training container. // +listType=atomic Args []string `json:"args,omitempty"` // List of environment variables to set in the container. // These values will be merged with the TrainingRuntime's environments. // +listType=map // +listMapKey=name Env []corev1.EnvVar `json:"env,omitempty"` // List of sources to populate environment variables in the container. // These values will be merged with the TrainingRuntime's environments. // +listType=atomic EnvFrom []corev1.EnvFromSource `json:"envFrom,omitempty"` // Pod volumes to mount into the container's filesystem. // +listType=map // +listMapKey=name VolumeMounts []corev1.VolumeMount `json:"volumeMounts,omitempty"` }
ContainerOverride represents parameters that can be overridden using PodSpecOverrides. Parameters from the Trainer, DatasetConfig, and ModelConfig will take precedence.
func (*ContainerOverride) DeepCopy ¶
func (in *ContainerOverride) DeepCopy() *ContainerOverride
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ContainerOverride.
func (*ContainerOverride) DeepCopyInto ¶
func (in *ContainerOverride) DeepCopyInto(out *ContainerOverride)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type CoschedulingPodGroupPolicySource ¶
type CoschedulingPodGroupPolicySource struct { // Time threshold to schedule PodGroup for gang-scheduling. // If the scheduling timeout is equal to 0, the default value is used. // Defaults to 60 seconds. ScheduleTimeoutSeconds *int32 `json:"scheduleTimeoutSeconds,omitempty"` }
CoschedulingPodGroupPolicySource represents configuration for coscheduling plugin. The number of min members in the PodGroupSpec is always equal to the number of nodes.
func (*CoschedulingPodGroupPolicySource) DeepCopy ¶
func (in *CoschedulingPodGroupPolicySource) DeepCopy() *CoschedulingPodGroupPolicySource
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CoschedulingPodGroupPolicySource.
func (*CoschedulingPodGroupPolicySource) DeepCopyInto ¶
func (in *CoschedulingPodGroupPolicySource) DeepCopyInto(out *CoschedulingPodGroupPolicySource)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type DatasetConfig ¶
type DatasetConfig struct { // Storage uri for the dataset provider. StorageUri *string `json:"storageUri,omitempty"` // List of environment variables to set in the dataset initializer container. // These values will be merged with the TrainingRuntime's dataset initializer environments. // +listType=map // +listMapKey=name Env []corev1.EnvVar `json:"env,omitempty"` // Reference to the secret with credentials to download dataset. // Secret must be created in the TrainJob's namespace. SecretRef *corev1.LocalObjectReference `json:"secretRef,omitempty"` }
DatasetConfig represents the desired dataset configuration. When this API is used, the training runtime must have the `dataset-initializer` container in the `Initializer` Job.
func (*DatasetConfig) DeepCopy ¶
func (in *DatasetConfig) DeepCopy() *DatasetConfig
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DatasetConfig.
func (*DatasetConfig) DeepCopyInto ¶
func (in *DatasetConfig) DeepCopyInto(out *DatasetConfig)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type InputModel ¶
type InputModel struct { // Storage uri for the model provider. StorageUri *string `json:"storageUri,omitempty"` // List of environment variables to set in the model initializer container. // These values will be merged with the TrainingRuntime's model initializer environments. // +listType=map // +listMapKey=name Env []corev1.EnvVar `json:"env,omitempty"` // Reference to the secret with credentials to download model. // Secret must be created in the TrainJob's namespace. SecretRef *corev1.LocalObjectReference `json:"secretRef,omitempty"` }
InputModel represents the desired pre-trained model configuration.
func (*InputModel) DeepCopy ¶
func (in *InputModel) DeepCopy() *InputModel
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InputModel.
func (*InputModel) DeepCopyInto ¶
func (in *InputModel) DeepCopyInto(out *InputModel)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type JobSetTemplateSpec ¶
type JobSetTemplateSpec struct { // Metadata for custom JobSet's labels and annotations. // JobSet name and namespace is equal to the TrainJob's name and namespace. metav1.ObjectMeta `json:"metadata,omitempty"` // Specification of the desired JobSet which will be created from TrainJob. Spec jobsetv1alpha2.JobSetSpec `json:"spec,omitempty"` }
JobSetTemplateSpec represents a template of the desired JobSet.
func (*JobSetTemplateSpec) DeepCopy ¶
func (in *JobSetTemplateSpec) DeepCopy() *JobSetTemplateSpec
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new JobSetTemplateSpec.
func (*JobSetTemplateSpec) DeepCopyInto ¶
func (in *JobSetTemplateSpec) DeepCopyInto(out *JobSetTemplateSpec)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type JobStatus ¶
type JobStatus struct { // Name of the child Job. Name string `json:"name"` // Ready is the number of child Jobs where the number of ready pods and completed pods // is greater than or equal to the total expected pod count for the child Job. Ready int32 `json:"ready"` // Succeeded is the number of successfully completed child Jobs. Succeeded int32 `json:"succeeded"` // Failed is the number of failed child Jobs. Failed int32 `json:"failed"` // Active is the number of child Jobs with at least 1 pod in a running or pending state // which are not marked for deletion. Active int32 `json:"active"` // Suspended is the number of child Jobs which are in a suspended state. Suspended int32 `json:"suspended"` }
func (*JobStatus) DeepCopy ¶
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new JobStatus.
func (*JobStatus) DeepCopyInto ¶
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type MLPolicy ¶
type MLPolicy struct { // Number of training nodes. // Defaults to 1. NumNodes *int32 `json:"numNodes,omitempty"` // Configuration for the runtime-specific parameters, such as Torch or MPI. // Only one of its members may be specified. MLPolicySource `json:",inline"` }
MLPolicy represents configuration for the model training with ML-specific parameters.
func (*MLPolicy) DeepCopy ¶
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MLPolicy.
func (*MLPolicy) DeepCopyInto ¶
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type MLPolicySource ¶
type MLPolicySource struct { // Configuration for the PyTorch runtime. Torch *TorchMLPolicySource `json:"torch,omitempty"` // Configuration for the MPI Runtime. MPI *MPIMLPolicySource `json:"mpi,omitempty"` }
MLPolicySource represents the runtime-specific configuration for various technologies. One of the following specs can be set.
func (*MLPolicySource) DeepCopy ¶
func (in *MLPolicySource) DeepCopy() *MLPolicySource
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MLPolicySource.
func (*MLPolicySource) DeepCopyInto ¶
func (in *MLPolicySource) DeepCopyInto(out *MLPolicySource)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type MPIImplementation ¶
type MPIImplementation string
MPIImplementation represents one of the supported MPI implementations.
const ( MPIImplementationOpenMPI MPIImplementation = "OpenMPI" MPIImplementationIntel MPIImplementation = "Intel" MPIImplementationMPICH MPIImplementation = "MPICH" )
type MPIMLPolicySource ¶
type MPIMLPolicySource struct { // Number of processes per node. // This value is equal to the number of slots for each node in the hostfile. NumProcPerNode *int32 `json:"numProcPerNode,omitempty"` // Implementation name for the MPI to create the appropriate hostfile. // Defaults to OpenMPI. MPIImplementation *MPIImplementation `json:"mpiImplementation,omitempty"` // Directory where SSH keys are mounted. SSHAuthMountPath *string `json:"sshAuthMountPath,omitempty"` // Whether to run training process on the launcher Job. // Defaults to false. RunLauncherAsNode *bool `json:"runLauncherAsNode,omitempty"` }
MPIMLPolicySource represents an MPI runtime configuration.
func (*MPIMLPolicySource) DeepCopy ¶
func (in *MPIMLPolicySource) DeepCopy() *MPIMLPolicySource
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MPIMLPolicySource.
func (*MPIMLPolicySource) DeepCopyInto ¶
func (in *MPIMLPolicySource) DeepCopyInto(out *MPIMLPolicySource)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type ModelConfig ¶
type ModelConfig struct { // Configuration of the pre-trained model. // When this API is used, the training runtime must have // the `model-initializer` container in the `Initializer` Job. Input *InputModel `json:"input,omitempty"` // Configuration of the trained model. // When this API is used, the training runtime must have // the `model-exporter` container in the `Exporter` Job. Output *OutputModel `json:"output,omitempty"` }
ModelConfig represents the desired model configuration.
func (*ModelConfig) DeepCopy ¶
func (in *ModelConfig) DeepCopy() *ModelConfig
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ModelConfig.
func (*ModelConfig) DeepCopyInto ¶
func (in *ModelConfig) DeepCopyInto(out *ModelConfig)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type OutputModel ¶
type OutputModel struct { // Storage uri for the model exporter. StorageUri *string `json:"storageUri,omitempty"` // List of environment variables to set in the model exporter container. // These values will be merged with the TrainingRuntime's model exporter environments. // +listType=map // +listMapKey=name Env []corev1.EnvVar `json:"env,omitempty"` // Reference to the secret with credentials to export model. // Secret must be created in the TrainJob's namespace. SecretRef *corev1.LocalObjectReference `json:"secretRef,omitempty"` }
OutputModel represents the desired trained model configuration.
func (*OutputModel) DeepCopy ¶
func (in *OutputModel) DeepCopy() *OutputModel
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new OutputModel.
func (*OutputModel) DeepCopyInto ¶
func (in *OutputModel) DeepCopyInto(out *OutputModel)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type PodGroupPolicy ¶
type PodGroupPolicy struct { // Configuration for gang-scheduling using various plugins. PodGroupPolicySource `json:",inline"` }
PodGroupPolicy represents a PodGroup configuration for gang-scheduling.
func (*PodGroupPolicy) DeepCopy ¶
func (in *PodGroupPolicy) DeepCopy() *PodGroupPolicy
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodGroupPolicy.
func (*PodGroupPolicy) DeepCopyInto ¶
func (in *PodGroupPolicy) DeepCopyInto(out *PodGroupPolicy)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type PodGroupPolicySource ¶
type PodGroupPolicySource struct { // Coscheduling plugin from the Kubernetes scheduler-plugins for gang-scheduling. Coscheduling *CoschedulingPodGroupPolicySource `json:"coscheduling,omitempty"` }
PodGroupPolicySource represents supported plugins for gang-scheduling. Only one of its members may be specified.
func (*PodGroupPolicySource) DeepCopy ¶
func (in *PodGroupPolicySource) DeepCopy() *PodGroupPolicySource
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodGroupPolicySource.
func (*PodGroupPolicySource) DeepCopyInto ¶
func (in *PodGroupPolicySource) DeepCopyInto(out *PodGroupPolicySource)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type PodSpecOverride ¶
type PodSpecOverride struct { // TargetJobs is the training job replicas in the training runtime template to apply the overrides. // +listType=atomic TargetJobs []PodSpecOverrideTargetJob `json:"targetJobs"` // Overrides for the containers in the desired job templates. // +listType=map // +listMapKey=name Containers []ContainerOverride `json:"containers,omitempty"` // Overrides for the init container in the desired job templates. // +listType=map // +listMapKey=name InitContainers []ContainerOverride `json:"initContainers,omitempty"` // Overrides for the Pod volume configuration. // +listType=map // +listMapKey=name Volumes []corev1.Volume `json:"volumes,omitempty"` // Override for the service account. ServiceAccountName string `json:"serviceAccountName,omitempty"` // Override for the node selector to place Pod on the specific node. NodeSelector map[string]string `json:"nodeSelector,omitempty"` // Override for the Pod's tolerations. // +listType=atomic Tolerations []corev1.Toleration `json:"tolerations,omitempty"` }
PodSpecOverride represents the custom overrides that will be applied for the TrainJob's resources.
func (*PodSpecOverride) DeepCopy ¶
func (in *PodSpecOverride) DeepCopy() *PodSpecOverride
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodSpecOverride.
func (*PodSpecOverride) DeepCopyInto ¶
func (in *PodSpecOverride) DeepCopyInto(out *PodSpecOverride)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type PodSpecOverrideTargetJob ¶
type PodSpecOverrideTargetJob struct { // Name is the target training job name for which the PodSpec is overridden. Name string `json:"name"` }
func (*PodSpecOverrideTargetJob) DeepCopy ¶
func (in *PodSpecOverrideTargetJob) DeepCopy() *PodSpecOverrideTargetJob
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodSpecOverrideTargetJob.
func (*PodSpecOverrideTargetJob) DeepCopyInto ¶
func (in *PodSpecOverrideTargetJob) DeepCopyInto(out *PodSpecOverrideTargetJob)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type RuntimeRef ¶
type RuntimeRef struct { // Name of the runtime being referenced. // When namespaced-scoped TrainingRuntime is used, the TrainJob must have // the same namespace as the deployed runtime. Name string `json:"name"` // APIGroup of the runtime being referenced. // Defaults to `kubeflow.org`. // +kubebuilder:default="kubeflow.org" APIGroup *string `json:"apiGroup,omitempty"` // Kind of the runtime being referenced. // Defaults to ClusterTrainingRuntime. // +kubebuilder:default="ClusterTrainingRuntime" Kind *string `json:"kind,omitempty"` }
RuntimeRef represents the reference to the existing training runtime.
func (*RuntimeRef) DeepCopy ¶
func (in *RuntimeRef) DeepCopy() *RuntimeRef
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RuntimeRef.
func (*RuntimeRef) DeepCopyInto ¶
func (in *RuntimeRef) DeepCopyInto(out *RuntimeRef)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type TorchElasticPolicy ¶
type TorchElasticPolicy struct { // How many times the training job can be restarted. // This value is inserted into the `--max-restarts` argument of the `torchrun` CLI and // the `.spec.failurePolicy.maxRestarts` parameter of the training Job. MaxRestarts *int32 `json:"maxRestarts,omitempty"` // Lower limit for the number of nodes to which training job can scale down. MinNodes *int32 `json:"minNodes,omitempty"` // Upper limit for the number of nodes to which training job can scale up. MaxNodes *int32 `json:"maxNodes,omitempty"` // Specification which are used to calculate the desired number of nodes. See the individual // metric source types for more information about how each type of metric must respond. // The HPA will be created to perform auto-scaling. // +listType=atomic Metrics []autoscalingv2.MetricSpec `json:"metrics,omitempty"` }
TorchElasticPolicy represents a configuration for the PyTorch elastic training. If this policy is set, the `.spec.numNodes` parameter must be omitted, since the min and max nodes are used to configure the `torchrun` CLI argument: `--nnodes=minNodes:maxNodes`. Only the `c10d` backend is supported for the Rendezvous communication.
func (*TorchElasticPolicy) DeepCopy ¶
func (in *TorchElasticPolicy) DeepCopy() *TorchElasticPolicy
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TorchElasticPolicy.
func (*TorchElasticPolicy) DeepCopyInto ¶
func (in *TorchElasticPolicy) DeepCopyInto(out *TorchElasticPolicy)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type TorchMLPolicySource ¶
type TorchMLPolicySource struct { // Number of processes per node. // This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI. // Supported values: `auto`, `cpu`, `gpu`, or int value. // TODO (andreyvelich): Add kubebuilder validation. // Defaults to `auto`. NumProcPerNode *string `json:"numProcPerNode,omitempty"` // Elastic policy for the PyTorch training. ElasticPolicy *TorchElasticPolicy `json:"elasticPolicy,omitempty"` }
TorchMLPolicySource represents a PyTorch runtime configuration.
func (*TorchMLPolicySource) DeepCopy ¶
func (in *TorchMLPolicySource) DeepCopy() *TorchMLPolicySource
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TorchMLPolicySource.
func (*TorchMLPolicySource) DeepCopyInto ¶
func (in *TorchMLPolicySource) DeepCopyInto(out *TorchMLPolicySource)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type TrainJob ¶
type TrainJob struct { metav1.TypeMeta `json:",inline"` // Standard object's metadata. metav1.ObjectMeta `json:"metadata,omitempty"` // Specification of the desired TrainJob. Spec TrainJobSpec `json:"spec,omitempty"` // Current status of TrainJob. Status TrainJobStatus `json:"status,omitempty"` }
TrainJob represents configuration of a training job.
func (*TrainJob) DeepCopy ¶
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainJob.
func (*TrainJob) DeepCopyInto ¶
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (*TrainJob) DeepCopyObject ¶
DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
type TrainJobList ¶
type TrainJobList struct { metav1.TypeMeta `json:",inline"` // Standard list metadata. metav1.ListMeta `json:"metadata,omitempty"` // List of TrainJobs. Items []TrainJob `json:"items"` }
TrainJobList is a collection of training jobs.
func (*TrainJobList) DeepCopy ¶
func (in *TrainJobList) DeepCopy() *TrainJobList
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainJobList.
func (*TrainJobList) DeepCopyInto ¶
func (in *TrainJobList) DeepCopyInto(out *TrainJobList)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (*TrainJobList) DeepCopyObject ¶
func (in *TrainJobList) DeepCopyObject() runtime.Object
DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
type TrainJobSpec ¶
type TrainJobSpec struct { // Reference to the training runtime. // The field is immutable. // +kubebuilder:validation:XValidation:rule="self == oldSelf", message="runtimeRef is immutable" RuntimeRef RuntimeRef `json:"runtimeRef"` // Configuration of the desired trainer. Trainer *Trainer `json:"trainer,omitempty"` // Configuration of the training dataset. DatasetConfig *DatasetConfig `json:"datasetConfig,omitempty"` // Configuration of the pre-trained and trained model. ModelConfig *ModelConfig `json:"modelConfig,omitempty"` // Labels to apply for the derivative JobSet and Jobs. // They will be merged with the TrainingRuntime values. Labels map[string]string `json:"labels,omitempty"` // Annotations to apply for the derivative JobSet and Jobs. // They will be merged with the TrainingRuntime values. Annotations map[string]string `json:"annotations,omitempty"` // Custom overrides for the training runtime. // +listType=atomic PodSpecOverrides []PodSpecOverride `json:"podSpecOverrides,omitempty"` // Whether the controller should suspend the running TrainJob. // Defaults to false. // +kubebuilder:default=false Suspend *bool `json:"suspend,omitempty"` // ManagedBy is used to indicate the controller or entity that manages a TrainJob. // The value must be either an empty, `kubeflow.org/trainjob-controller` or // `kueue.x-k8s.io/multikueue`. The built-in TrainJob controller reconciles TrainJob which // don't have this field at all or the field value is the reserved string // `kubeflow.org/trainjob-controller`, but delegates reconciling TrainJobs // with a 'kueue.x-k8s.io/multikueue' to the Kueue. The field is immutable. 
// Defaults to `kubeflow.org/trainjob-controller` // +kubebuilder:default="kubeflow.org/trainjob-controller" // +kubebuilder:validation:XValidation:rule="self in ['kubeflow.org/trainjob-controller', 'kueue.x-k8s.io/multikueue']", message="ManagedBy must be kubeflow.org/trainjob-controller or kueue.x-k8s.io/multikueue if set" // +kubebuilder:validation:XValidation:rule="self == oldSelf", message="ManagedBy value is immutable" ManagedBy *string `json:"managedBy,omitempty"` }
TrainJobSpec represents specification of the desired TrainJob.
func (*TrainJobSpec) DeepCopy ¶
func (in *TrainJobSpec) DeepCopy() *TrainJobSpec
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainJobSpec.
func (*TrainJobSpec) DeepCopyInto ¶
func (in *TrainJobSpec) DeepCopyInto(out *TrainJobSpec)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type TrainJobStatus ¶
type TrainJobStatus struct { // Conditions for the TrainJob. // // +optional // +listType=map // +listMapKey=type // +patchStrategy=merge // +patchMergeKey=type Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type"` // JobsStatus tracks the child Jobs in TrainJob. // +listType=map // +listMapKey=name JobsStatus []JobStatus `json:"jobsStatus,omitempty"` }
TrainJobStatus represents the current status of TrainJob.
func (*TrainJobStatus) DeepCopy ¶
func (in *TrainJobStatus) DeepCopy() *TrainJobStatus
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainJobStatus.
func (*TrainJobStatus) DeepCopyInto ¶
func (in *TrainJobStatus) DeepCopyInto(out *TrainJobStatus)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type Trainer ¶
type Trainer struct { // Docker image for the training container. Image *string `json:"image,omitempty"` // Entrypoint commands for the training container. // +listType=atomic Command []string `json:"command,omitempty"` // Arguments to the entrypoint for the training container. // +listType=atomic Args []string `json:"args,omitempty"` // List of environment variables to set in the training container. // These values will be merged with the TrainingRuntime's trainer environments. // +listType=map // +listMapKey=name Env []corev1.EnvVar `json:"env,omitempty"` // Number of training nodes. // TODO (andreyvelich): Do we want to support dynamic num of nodes in TrainJob for PyTorch elastic: `--nnodes=1:4` ? NumNodes *int32 `json:"numNodes,omitempty"` // Compute resources for each training node. ResourcesPerNode *corev1.ResourceRequirements `json:"resourcesPerNode,omitempty"` // Number of processes/workers/slots on every training node. // For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set. // For the MPI runtime only int value can be set. NumProcPerNode *string `json:"numProcPerNode,omitempty"` }
Trainer represents the desired trainer configuration. Every training runtime contains a `trainer` container which represents the Trainer.
func (*Trainer) DeepCopy ¶
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Trainer.
func (*Trainer) DeepCopyInto ¶
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type TrainingRuntime ¶
type TrainingRuntime struct { metav1.TypeMeta `json:",inline"` // Standard object's metadata. metav1.ObjectMeta `json:"metadata,omitempty"` // Specification of the desired TrainingRuntime. Spec TrainingRuntimeSpec `json:"spec,omitempty"` }
TrainingRuntime represents a training runtime which can be referenced as part of `runtimeRef` API in TrainJob. This resource is namespace-scoped and can be referenced by a TrainJob created in the *same* namespace as the TrainingRuntime.
func (*TrainingRuntime) DeepCopy ¶
func (in *TrainingRuntime) DeepCopy() *TrainingRuntime
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainingRuntime.
func (*TrainingRuntime) DeepCopyInto ¶
func (in *TrainingRuntime) DeepCopyInto(out *TrainingRuntime)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (*TrainingRuntime) DeepCopyObject ¶
func (in *TrainingRuntime) DeepCopyObject() runtime.Object
DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
type TrainingRuntimeList ¶
type TrainingRuntimeList struct { metav1.TypeMeta `json:",inline"` // Standard list metadata. metav1.ListMeta `json:"metadata,omitempty"` // List of TrainingRuntimes. Items []TrainingRuntime `json:"items"` }
TrainingRuntimeList is a collection of training runtimes.
func (*TrainingRuntimeList) DeepCopy ¶
func (in *TrainingRuntimeList) DeepCopy() *TrainingRuntimeList
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainingRuntimeList.
func (*TrainingRuntimeList) DeepCopyInto ¶
func (in *TrainingRuntimeList) DeepCopyInto(out *TrainingRuntimeList)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (*TrainingRuntimeList) DeepCopyObject ¶
func (in *TrainingRuntimeList) DeepCopyObject() runtime.Object
DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
type TrainingRuntimeSpec ¶
type TrainingRuntimeSpec struct { // Configuration for the model training with ML-specific parameters. MLPolicy *MLPolicy `json:"mlPolicy,omitempty"` // Configuration for the PodGroup to enable gang-scheduling via supported plugins. PodGroupPolicy *PodGroupPolicy `json:"podGroupPolicy,omitempty"` // JobSet template which will be used by TrainJob. Template JobSetTemplateSpec `json:"template"` }
TrainingRuntimeSpec represents a specification of the desired training runtime.
func (*TrainingRuntimeSpec) DeepCopy ¶
func (in *TrainingRuntimeSpec) DeepCopy() *TrainingRuntimeSpec
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainingRuntimeSpec.
func (*TrainingRuntimeSpec) DeepCopyInto ¶
func (in *TrainingRuntimeSpec) DeepCopyInto(out *TrainingRuntimeSpec)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.