Documentation
¶
Overview ¶
Package v1alpha1 is the v1alpha1 version of the API. +groupName=paddlepaddle.org
Index ¶
- Constants
- Variables
- func CRDName() string
- func Resource(resource string) schema.GroupResource
- type Annotations
- type Framework
- type FrameworkName
- type JobType
- type MasterSpec
- type PserverSpec
- type ResourceState
- type ScaleResults
- type TrainerJobScaleRecord
- type TrainerJobScaleRecords
- type TrainerSpec
- type TrainingJob
- func (in *TrainingJob) DeepCopy() *TrainingJob
- func (in *TrainingJob) DeepCopyInto(out *TrainingJob)
- func (in *TrainingJob) DeepCopyObject() runtime.Object
- func (s *TrainingJob) Elastic() bool
- func (s *TrainingJob) Fulfillment() float64
- func (s *TrainingJob) GPU() int
- func (s *TrainingJob) NeedGPU() bool
- func (s *TrainingJob) String() string
- func (s *TrainingJob) TrainerCPURequestMilli() int64
- func (s *TrainingJob) TrainerGPULimit() int
- func (s *TrainingJob) TrainerMemRequestMega() int64
- type TrainingJobList
- type TrainingJobPhase
- type TrainingJobSpec
- type TrainingJobStatus
- type TrainingResourceStatus
- type TrainingResourceType
Constants ¶
const ( // CRDKind is the kind of K8s CRD. CRDKind = "TrainingJob" // CRDKindPlural is the plural of CRDKind. CRDKindPlural = "trainingjobs" // CRDShortName is the short name of CRD. CRDShortName = "tj" // CRDGroup is the name of group. CRDGroup = "paddlepaddle.org" // CRDVersion is the version of CRD. CRDVersion = "v1alpha1" )
const ( // TrainingJobPhaseNone is empty TrainingJobPhase. TrainingJobPhaseNone TrainingJobPhase = "" // TrainingJobPhaseCreating is creating TrainingJobPhase. TrainingJobPhaseCreating = "Creating" // TrainingJobPhaseRunning is running TrainingJobPhase. TrainingJobPhaseRunning = "Running" // TrainingJobPhaseScaling is scaling TrainingJobPhase. TrainingJobPhaseScaling = "Scaling" // TrainingJobPhaseSucceeded is succeeded TrainingJobPhase. TrainingJobPhaseSucceeded = "Succeed" // TrainingJobPhaseFailed is failed TrainingJobPhase. TrainingJobPhaseFailed = "Failed" // TrainingJobPhaseTimeout is timeout TrainingJobPhase. TrainingJobPhaseTimeout = "Timeout" )
const ( // ResourceStateNone is the initial state of training job ResourceStateNone ResourceState = "" // ResourceStateStarting is the starting state of ResourceState. ResourceStateStarting = "starting" // ResourceStateRunning is the running state of ResourceState. ResourceStateRunning = "running" // ResourceStateFailed is the failed state of ResourceState. ResourceStateFailed = "failed" // ResourceStateSucceeded is the succeeded state of ResourceState ResourceStateSucceeded = "succeeded" )
const ( Local JobType = "local" Nccl2 = "nccl2" Multi = "multi" )
Job type constants.
Variables ¶
var ( // SchemeBuilder will call register SchemeBuilder = runtime.NewSchemeBuilder(addKnownTypes) // AddToScheme will apply all the stored functions to the scheme AddToScheme = SchemeBuilder.AddToScheme )
var SchemeGroupVersion = schema.GroupVersion{Group: CRDGroup, Version: CRDVersion}
SchemeGroupVersion is the group version used to register these objects.
Functions ¶
func Resource ¶
func Resource(resource string) schema.GroupResource
Resource takes an unqualified resource and returns a Group-qualified GroupResource.
Types ¶
type Annotations ¶
type Annotations struct { Usergroupid string `json:"usergroupid"` Userid string `json:"userid"` Priority string `json:"priority"` Scheduler string `json:"scheduler"` Walltime int `json:"walltime"` }
Annotations offers additional metadata.
func (*Annotations) DeepCopy ¶
func (in *Annotations) DeepCopy() *Annotations
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Annotations.
func (*Annotations) DeepCopyInto ¶
func (in *Annotations) DeepCopyInto(out *Annotations)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type Framework ¶
type Framework struct { Name FrameworkName `json:"name"` Type JobType `json:"type"` }
Framework is a framework that the operator supports.
func (*Framework) DeepCopy ¶
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Framework.
func (*Framework) DeepCopyInto ¶
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type FrameworkName ¶
type FrameworkName string
FrameworkName is the name of a framework that the operator supports.
const ( Paddle FrameworkName = "paddle" TensorFlow = "tensorflow" )
Framework name constants.
type MasterSpec ¶
type MasterSpec struct { EtcdEndpoint string `json:"etcd-endpoint"` Resources corev1.ResourceRequirements `json:"resources"` ReplicaSpec *v1beta1.ReplicaSet `json:"replicaSpec"` Envs map[string]string `json:"envs"` //for preStop GracePeriodSeconds *int64 `json:"grace_period_seconds"` PreStopCmd []string `json:"pre_stop_cmd"` Tolerations []corev1.Toleration `json:"tolerations"` NodeSelector map[string]string `json:"node_selector"` LivenessProbe *corev1.Probe `json:"liveness_probe"` ReadinessProbe *corev1.Probe `json:"readiness_probe"` }
MasterSpec is the spec for a master in the paddle job
func (*MasterSpec) DeepCopy ¶
func (in *MasterSpec) DeepCopy() *MasterSpec
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MasterSpec.
func (*MasterSpec) DeepCopyInto ¶
func (in *MasterSpec) DeepCopyInto(out *MasterSpec)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type PserverSpec ¶
type PserverSpec struct { Entrypoint string `json:"entrypoint"` MinInstance int `json:"min-instance"` MaxInstance int `json:"max-instance"` Resources corev1.ResourceRequirements `json:"resources"` ReplicaSpec *v1beta1.ReplicaSet `json:"replicaSpec"` ImagePullSecrets []corev1.LocalObjectReference `json:"imagePullSecrets"` Envs map[string]string `json:"envs"` //for preStop GracePeriodSeconds *int64 `json:"grace_period_seconds"` PreStopCmd []string `json:"pre_stop_cmd"` Tolerations []corev1.Toleration `json:"tolerations"` NodeSelector map[string]string `json:"node_selector"` //IndexSucceed marks if the operator has added labels to pservers successfully in the initial phase IndexSucceed bool `json:"index_succeed"` LivenessProbe *corev1.Probe `json:"liveness_probe"` ReadinessProbe *corev1.Probe `json:"readiness_probe"` }
PserverSpec is the spec for pservers in the paddle job
func (*PserverSpec) DeepCopy ¶
func (in *PserverSpec) DeepCopy() *PserverSpec
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PserverSpec.
func (*PserverSpec) DeepCopyInto ¶
func (in *PserverSpec) DeepCopyInto(out *PserverSpec)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type ScaleResults ¶
type ScaleResults string
ScaleResults is the result of scale
const ( // ScaleTrue means scale succeed. ScaleTrue ScaleResults = "True" // ScaleFalse means scale failed. ScaleFalse ScaleResults = "False" // ScaleUnknown means kubernetes can't decide if a scale succeed or not. ScaleUnknown ScaleResults = "Unknown" )
type TrainerJobScaleRecord ¶
type TrainerJobScaleRecord struct { // ScaleTimestamp is the time to scale a TrainingJob ScaleTimestamp metav1.Time `json:"scaleTimestamp"` // Additional is the additional the job to scale Additional int32 `json:"additional"` // Status is the result of the scale. Status ScaleResults `json:"status"` // reason is the reason for the scale failed. // +optional Reason string `json:"reason,omitempty"` }
TrainerJobScaleRecord is record of trainer jobs.
func (*TrainerJobScaleRecord) DeepCopy ¶
func (in *TrainerJobScaleRecord) DeepCopy() *TrainerJobScaleRecord
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainerJobScaleRecord.
func (*TrainerJobScaleRecord) DeepCopyInto ¶
func (in *TrainerJobScaleRecord) DeepCopyInto(out *TrainerJobScaleRecord)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type TrainerJobScaleRecords ¶
type TrainerJobScaleRecords struct {
ScaleRecords []*TrainerJobScaleRecord
}
TrainerJobScaleRecords is records of trainer jobs.
func (*TrainerJobScaleRecords) DeepCopy ¶
func (in *TrainerJobScaleRecords) DeepCopy() *TrainerJobScaleRecords
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainerJobScaleRecords.
func (*TrainerJobScaleRecords) DeepCopyInto ¶
func (in *TrainerJobScaleRecords) DeepCopyInto(out *TrainerJobScaleRecords)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type TrainerSpec ¶
type TrainerSpec struct { EtcdEndpoint string `json:"etcd-endpoint"` Entrypoint string `json:"entrypoint"` Workspace string `json:"workspace"` MinInstance int `json:"min-instance"` MaxInstance int `json:"max-instance"` Resources corev1.ResourceRequirements `json:"resources"` ReplicaSpec *batchv1.Job `json:"replicaSpec"` ImagePullSecrets []corev1.LocalObjectReference `json:"imagePullSecrets"` Envs map[string]string `json:"envs"` //for preStop GracePeriodSeconds *int64 `json:"grace_period_seconds"` PreStopCmd []string `json:"pre_stop_cmd"` Tolerations []corev1.Toleration `json:"tolerations"` NodeSelector map[string]string `json:"node_selector"` //IndexSucceed marks if the operator has added labels to trainers successfully in the initial phase IndexSucceed bool `json:"index_succeed"` LivenessProbe *corev1.Probe `json:"liveness_probe"` ReadinessProbe *corev1.Probe `json:"readiness_probe"` }
TrainerSpec is the spec for trainers in the paddle job
func (*TrainerSpec) DeepCopy ¶
func (in *TrainerSpec) DeepCopy() *TrainerSpec
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainerSpec.
func (*TrainerSpec) DeepCopyInto ¶
func (in *TrainerSpec) DeepCopyInto(out *TrainerSpec)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type TrainingJob ¶
type TrainingJob struct { metav1.TypeMeta `json:",inline"` metav1.ObjectMeta `json:"metadata,omitempty"` Spec TrainingJobSpec `json:"spec"` Status TrainingJobStatus `json:"status"` }
TrainingJob is a specification for a TrainingJob resource
func (*TrainingJob) DeepCopy ¶
func (in *TrainingJob) DeepCopy() *TrainingJob
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainingJob.
func (*TrainingJob) DeepCopyInto ¶
func (in *TrainingJob) DeepCopyInto(out *TrainingJob)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (*TrainingJob) DeepCopyObject ¶
func (in *TrainingJob) DeepCopyObject() runtime.Object
DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
func (*TrainingJob) Elastic ¶
func (s *TrainingJob) Elastic() bool
Elastic returns true if the job can scale to more workers.
func (*TrainingJob) Fulfillment ¶
func (s *TrainingJob) Fulfillment() float64
Fulfillment returns the fulfillment of a trainingjob
func (*TrainingJob) GPU ¶
func (s *TrainingJob) GPU() int
GPU converts the resource limit quantity to an int.
func (*TrainingJob) NeedGPU ¶
func (s *TrainingJob) NeedGPU() bool
NeedGPU returns true if the job needs GPU resources to run.
func (*TrainingJob) String ¶
func (s *TrainingJob) String() string
String returns the marshaled string of the TrainingJob.
func (*TrainingJob) TrainerCPURequestMilli ¶
func (s *TrainingJob) TrainerCPURequestMilli() int64
TrainerCPURequestMilli returns cpu request of each trainer instance
func (*TrainingJob) TrainerGPULimit ¶
func (s *TrainingJob) TrainerGPULimit() int
TrainerGPULimit returns gpu limit of each trainer instance
func (*TrainingJob) TrainerMemRequestMega ¶
func (s *TrainingJob) TrainerMemRequestMega() int64
TrainerMemRequestMega returns memory request of each trainer instance
type TrainingJobList ¶
type TrainingJobList struct { metav1.TypeMeta `json:",inline"` metav1.ListMeta `json:"metadata"` // Items means the list of paddle job/TrainingJob Items []TrainingJob `json:"items"` }
TrainingJobList is a list of TrainingJob resources
func (*TrainingJobList) DeepCopy ¶
func (in *TrainingJobList) DeepCopy() *TrainingJobList
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainingJobList.
func (*TrainingJobList) DeepCopyInto ¶
func (in *TrainingJobList) DeepCopyInto(out *TrainingJobList)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (*TrainingJobList) DeepCopyObject ¶
func (in *TrainingJobList) DeepCopyObject() runtime.Object
DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
type TrainingJobSpec ¶
type TrainingJobSpec struct { // General job attributes. Image string `json:"image"` HostNetwork bool `json:"host_network"` Port int `json:"port"` PortsNum int `json:"ports_num"` PortsNumForSparse int `json:"ports_num_for_sparse"` TrainerPort int `json:"trainer_port"` TrainerPortsNum int `json:"trainer_ports_num"` FaultTolerant bool `json:"fault_tolerant"` LocalJob bool `json:"local_job"` // LocalJob indicates if the job is local job or cluster job Passes int `json:"passes"` Volumes []corev1.Volume `json:"volumes"` VolumeMounts []corev1.VolumeMount `json:"VolumeMounts"` // TODO how to use these two params in matrix Mountpath string `json:"mountpath"` Nfsmount string `json:"nfsmount"` Annotations Annotations `json:"annotations"` //TrainingJob components. Master MasterSpec `json:"master"` Pserver PserverSpec `json:"pserver"` Trainer TrainerSpec `json:"trainer"` IsNccl bool `json:"is_nccl"` FrameWork *Framework `json:"frame_work"` //Scheduling components. SchedulerName string `json:"schedulerName,omitempty"` PodGroupName string `json:"podGroupName,omitempty"` // Matrix field indicates whether the backend container is matrix Matrix bool `json:"matrix"` }
TrainingJobSpec is the spec for a TrainingJob resource
func (*TrainingJobSpec) DeepCopy ¶
func (in *TrainingJobSpec) DeepCopy() *TrainingJobSpec
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainingJobSpec.
func (*TrainingJobSpec) DeepCopyInto ¶
func (in *TrainingJobSpec) DeepCopyInto(out *TrainingJobSpec)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type TrainingJobStatus ¶
type TrainingJobStatus struct { // Phase is phase of TrainingJob Phase TrainingJobPhase `json:"phase"` // Reason is the reason of job phase failed Reason string `json:"reason"` // ScaleStatus is autoscale status of trainer jobs // TODO(ZhengQi): this will be used in autoscale mode in future. ScaleRecords TrainerJobScaleRecords `json:"scale_records"` // ReplicaStatuses is detail status of resources // TODO(ZhengQi): should we only consider trainer job now? ReplicaStatuses []*TrainingResourceStatus `json:"replica_statuses"` // StartTime marks when the trainingjob is Running StartTime metav1.Time `json:"startTime"` // Released marks resource have been released Released bool `json:"released"` }
TrainingJobStatus is the status for a TrainingJob resource.
func (*TrainingJobStatus) DeepCopy ¶
func (in *TrainingJobStatus) DeepCopy() *TrainingJobStatus
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainingJobStatus.
func (*TrainingJobStatus) DeepCopyInto ¶
func (in *TrainingJobStatus) DeepCopyInto(out *TrainingJobStatus)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type TrainingResourceStatus ¶
type TrainingResourceStatus struct { // TrainingResourceType the type of TrainingJob resource, include MASTER PSERVER and TRAINER TrainingResourceType `json:"training_resource_type"` // State is the state of a type of resource State ResourceState `json:"state"` // ResourceStates is the number of resource in different state ResourceStates map[ResourceState]int `json:"resource_states"` }
TrainingResourceStatus is the status of every resource
func (*TrainingResourceStatus) DeepCopy ¶
func (in *TrainingResourceStatus) DeepCopy() *TrainingResourceStatus
DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainingResourceStatus.
func (*TrainingResourceStatus) DeepCopyInto ¶
func (in *TrainingResourceStatus) DeepCopyInto(out *TrainingResourceStatus)
DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
type TrainingResourceType ¶
type TrainingResourceType string
TrainingResourceType is the type of a TrainingJob resource; it includes MASTER, PSERVER and TRAINER.
const ( // MASTER is the master name of TrainingResourceType. MASTER TrainingResourceType = "master" // PSERVER is the pserver name of TrainingResourceType. PSERVER TrainingResourceType = "pserver" // TRAINER is the trainer name of TrainingResourceType. TRAINER TrainingResourceType = "trainer" )