Documentation ¶
Overview ¶
Package tensorflow provides a Kubernetes controller for a TFJob resource.
Package tensorflow provides a Kubernetes controller for a TFJob resource.
Package tensorflow provides a Kubernetes controller for a TFJob resource.
Index ¶
- Constants
- func OnOwnerDeleteAndDeletionExpectationFunc(jc job_controller.JobController) func(e event.DeleteEvent) bool
- type ClusterSpec
- type TFConfig
- type TFJobReconciler
- func (r *TFJobReconciler) ControllerName() string
- func (r *TFJobReconciler) DeleteJob(job interface{}) error
- func (r *TFJobReconciler) GetAPIGroupVersion() schema.GroupVersion
- func (r *TFJobReconciler) GetAPIGroupVersionKind() schema.GroupVersionKind
- func (r *TFJobReconciler) GetDefaultContainerName() string
- func (r *TFJobReconciler) GetDefaultContainerPortName() string
- func (r *TFJobReconciler) GetDefaultContainerPortNumber() int32
- func (r *TFJobReconciler) GetGroupNameLabelValue() string
- func (r *TFJobReconciler) GetJobFromAPIClient(namespace, name string) (metav1.Object, error)
- func (r *TFJobReconciler) GetJobFromInformerCache(namespace, name string) (metav1.Object, error)
- func (r *TFJobReconciler) GetNodeForModelOutput(pods []*corev1.Pod) string
- func (r *TFJobReconciler) GetPodsForJob(obj interface{}) ([]*corev1.Pod, error)
- func (r *TFJobReconciler) GetReconcileOrders() []v1.ReplicaType
- func (r *TFJobReconciler) GetServicesForJob(obj interface{}) ([]*corev1.Service, error)
- func (r *TFJobReconciler) IsMasterRole(replicas map[v1.ReplicaType]*v1.ReplicaSpec, rtype v1.ReplicaType, index int) bool
- func (r *TFJobReconciler) Reconcile(_ context.Context, req ctrl.Request) (ctrl.Result, error)
- func (r *TFJobReconciler) SetClusterSpec(ctx context.Context, job interface{}, podTemplateSpec *corev1.PodTemplateSpec, ...) error
- func (r *TFJobReconciler) SetupWithManager(mgr ctrl.Manager) error
- func (r *TFJobReconciler) UpdateJobStatus(job interface{}, replicas map[v1.ReplicaType]*v1.ReplicaSpec, ...) error
- func (r *TFJobReconciler) UpdateJobStatusInApiServer(job interface{}, jobStatus *v1.JobStatus) error
- type TaskSpec
Constants ¶
const ( // EnvCustomClusterDomain is the custom defined cluster domain, such as "svc.cluster.local". // Ref: https://kubernetes.io/docs/concepts/services-networking/dns-pod-service/#a-records EnvCustomClusterDomain = "CUSTOM_CLUSTER_DOMAIN" )
Variables ¶
This section is empty.
Functions ¶
func OnOwnerDeleteAndDeletionExpectationFunc ¶ added in v0.4.1
func OnOwnerDeleteAndDeletionExpectationFunc(jc job_controller.JobController) func(e event.DeleteEvent) bool
Types ¶
type ClusterSpec ¶
ClusterSpec represents a TensorFlow cluster specification. See: https://www.tensorflow.org/deploy/distributed#create_a_tftrainclusterspec_to_describe_the_cluster. It is a map from job names to network addresses.
type TFConfig ¶
type TFConfig struct { // Cluster represents a TensorFlow ClusterSpec. // See: https://www.tensorflow.org/api_docs/python/tf/train/ClusterSpec Cluster ClusterSpec `json:"cluster"` Task TaskSpec `json:"task"` // Environment is used by tensorflow.contrib.learn.python.learn in versions <= 1.3 // TODO(jlewi): I don't think it is used in versions TF >= 1.4. So we can eventually get rid of it. Environment string `json:"environment"` }
TFConfig is a struct representing the distributed TensorFlow config. This struct is turned into an environment variable TF_CONFIG which is used by TensorFlow processes to configure themselves. https://www.tensorflow.org/api_docs/python/tf/estimator/RunConfig#methods https://cloud.google.com/ml-engine/docs/tensorflow/distributed-training-details
type TFJobReconciler ¶
type TFJobReconciler struct { client.Client utilruntime.EmptyScaleImpl // contains filtered or unexported fields }
TFJobReconciler reconciles a TFJob object
func NewReconciler ¶
func NewReconciler(mgr ctrl.Manager, config options.JobControllerConfiguration) *TFJobReconciler
NewReconciler returns a new reconcile.Reconciler
func (*TFJobReconciler) ControllerName ¶
func (r *TFJobReconciler) ControllerName() string
ControllerName returns the Controller name
func (*TFJobReconciler) DeleteJob ¶
func (r *TFJobReconciler) DeleteJob(job interface{}) error
DeleteJob deletes the job
func (*TFJobReconciler) GetAPIGroupVersion ¶
func (r *TFJobReconciler) GetAPIGroupVersion() schema.GroupVersion
GetAPIGroupVersion returns the GroupVersion of the API
func (*TFJobReconciler) GetAPIGroupVersionKind ¶
func (r *TFJobReconciler) GetAPIGroupVersionKind() schema.GroupVersionKind
GetAPIGroupVersionKind returns the GroupVersionKind of the API
func (*TFJobReconciler) GetDefaultContainerName ¶
func (r *TFJobReconciler) GetDefaultContainerName() string
GetDefaultContainerName returns the default container name in pod
func (*TFJobReconciler) GetDefaultContainerPortName ¶
func (r *TFJobReconciler) GetDefaultContainerPortName() string
GetDefaultContainerPortName returns the default container port name.
func (*TFJobReconciler) GetDefaultContainerPortNumber ¶
func (r *TFJobReconciler) GetDefaultContainerPortNumber() int32
GetDefaultContainerPortNumber returns the default container port number.
func (*TFJobReconciler) GetGroupNameLabelValue ¶
func (r *TFJobReconciler) GetGroupNameLabelValue() string
GetGroupNameLabelValue returns the Group Name(value) in the labels of the job
func (*TFJobReconciler) GetJobFromAPIClient ¶
func (r *TFJobReconciler) GetJobFromAPIClient(namespace, name string) (metav1.Object, error)
GetJobFromAPIClient returns the Job from API server
func (*TFJobReconciler) GetJobFromInformerCache ¶
func (r *TFJobReconciler) GetJobFromInformerCache(namespace, name string) (metav1.Object, error)
GetJobFromInformerCache returns the Job from Informer Cache
func (*TFJobReconciler) GetNodeForModelOutput ¶ added in v0.4.0
func (r *TFJobReconciler) GetNodeForModelOutput(pods []*corev1.Pod) string
func (*TFJobReconciler) GetPodsForJob ¶
func (r *TFJobReconciler) GetPodsForJob(obj interface{}) ([]*corev1.Pod, error)
GetPodsForJob returns the set of pods that this job should manage.
func (*TFJobReconciler) GetReconcileOrders ¶
func (r *TFJobReconciler) GetReconcileOrders() []v1.ReplicaType
GetReconcileOrders returns the order in which replica types are reconciled, so that replica types with higher priority can be created earlier.
func (*TFJobReconciler) GetServicesForJob ¶
func (r *TFJobReconciler) GetServicesForJob(obj interface{}) ([]*corev1.Service, error)
GetServicesForJob returns the services managed by the job. This can be achieved by selecting services using the label key "job-name", i.e., all services created by the job will come with the label "job-name" = <this_job_name>.
func (*TFJobReconciler) IsMasterRole ¶
func (r *TFJobReconciler) IsMasterRole(replicas map[v1.ReplicaType]*v1.ReplicaSpec, rtype v1.ReplicaType, index int) bool
IsMasterRole reports whether the replica with the specified type and index has the master role. A master-role pod will have "job-role=master" set in its labels.
func (*TFJobReconciler) SetClusterSpec ¶
func (r *TFJobReconciler) SetClusterSpec(ctx context.Context, job interface{}, podTemplateSpec *corev1.PodTemplateSpec, rt, index string) error
SetClusterSpec generates and sets TF_CONFIG for the given podTemplateSpec.
func (*TFJobReconciler) SetupWithManager ¶
func (r *TFJobReconciler) SetupWithManager(mgr ctrl.Manager) error
SetupWithManager sets up the reconciler with the Manager using default RBAC. The Manager will set fields on the Controller and start it when the Manager is started.
func (*TFJobReconciler) UpdateJobStatus ¶
func (r *TFJobReconciler) UpdateJobStatus(job interface{}, replicas map[v1.ReplicaType]*v1.ReplicaSpec, jobStatus *v1.JobStatus, restart bool) error
UpdateJobStatus updates the job status and job conditions
func (*TFJobReconciler) UpdateJobStatusInApiServer ¶
func (r *TFJobReconciler) UpdateJobStatusInApiServer(job interface{}, jobStatus *v1.JobStatus) error
UpdateJobStatusInApiServer updates the job status in API server