Documentation
¶
Overview ¶
Package tensorflow provides a Kubernetes controller for a TFJob resource.
Package controller provides a Kubernetes controller for a TFJob resource.
Package controller provides a Kubernetes controller for a TFJob resource.
Package controller provides a Kubernetes controller for a TFJob resource.
Index ¶
- Constants
- Variables
- func ContainChieforMasterSpec(replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec) bool
- func GetPortFromTFJob(tfJob *tfv1.TFJob, rtype commonv1.ReplicaType) (int32, error)
- func NewUnstructuredTFJobInformer(restConfig *restclientset.Config, namespace string, resyncPeriod time.Duration) tfjobinformersv1.TFJobInformer
- type ClusterSpec
- type SparseClusterSpec
- type SparseTFConfig
- type TFConfig
- type TFController
- func (tc *TFController) ControllerName() string
- func (tc *TFController) DeleteJob(job interface{}) error
- func (tc *TFController) GetAPIGroupVersion() schema.GroupVersion
- func (tc *TFController) GetAPIGroupVersionKind() schema.GroupVersionKind
- func (tc *TFController) GetDefaultContainerName() string
- func (tc *TFController) GetDefaultContainerPortName() string
- func (tc *TFController) GetGroupNameLabelKey() string
- func (tc *TFController) GetGroupNameLabelValue() string
- func (tc *TFController) GetJobFromAPIClient(namespace, name string) (metav1.Object, error)
- func (tc *TFController) GetJobFromInformerCache(namespace, name string) (metav1.Object, error)
- func (tc *TFController) GetJobNameLabelKey() string
- func (tc *TFController) GetReplicaIndexLabelKey() string
- func (tc *TFController) GetReplicaTypeLabelKey() string
- func (tc *TFController) IsMasterRole(replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, ...) bool
- func (tc *TFController) IsWorker0Completed(tfjob *tfv1.TFJob, replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec) (bool, error)
- func (tc *TFController) NewTFJobInformer(tfJobInformerFactory tfjobinformers.SharedInformerFactory) tfjobinformersv1.TFJobInformer
- func (tc *TFController) ReconcilePods(job interface{}, jobStatus *commonv1.JobStatus, pods []*v1.Pod, ...) error
- func (tc *TFController) Run(threadiness int, stopCh <-chan struct{}) error
- func (tc *TFController) SetClusterSpec(job interface{}, podTemplate *v1.PodTemplateSpec, rtype, index string) error
- func (tc *TFController) UpdateJobStatus(job interface{}, replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, ...) error
- func (tc *TFController) UpdateJobStatusInApiServer(job interface{}, jobStatus *commonv1.JobStatus) error
- type TaskSpec
Constants ¶
const ( FailedDeleteJobReason = "FailedDeleteJob" SuccessfulDeleteJobReason = "SuccessfulDeleteJob" )
const ( // EnvCustomClusterDomain is the custom defined cluster domain, such as "svc.cluster.local". // Ref: https://kubernetes.io/docs/concepts/services-networking/dns-pod-service/#a-records EnvCustomClusterDomain = "CUSTOM_CLUSTER_DOMAIN" )
Variables ¶
var ( // KeyFunc is the short name to DeletionHandlingMetaNamespaceKeyFunc. // IndexerInformer uses a delta queue, therefore for deletes we have to use this // key function but it should be just fine for non delete events. KeyFunc = cache.DeletionHandlingMetaNamespaceKeyFunc )
Functions ¶
func ContainChieforMasterSpec ¶
func ContainChieforMasterSpec(replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec) bool
ContainChieforMasterSpec returns true if the TFJob contains a chief or master replica spec.
func GetPortFromTFJob ¶
GetPortFromTFJob gets the port of tensorflow container.
func NewUnstructuredTFJobInformer ¶
func NewUnstructuredTFJobInformer(restConfig *restclientset.Config, namespace string, resyncPeriod time.Duration) tfjobinformersv1.TFJobInformer
Types ¶
type ClusterSpec ¶
ClusterSpec represents a TensorFlow cluster specification. It is a map from job names to network addresses. See https://www.tensorflow.org/deploy/distributed#create_a_tftrainclusterspec_to_describe_the_cluster
type SparseClusterSpec ¶ added in v1.1.0
SparseClusterSpec enables a server to be configured without needing to know the identity of (for example) all other worker tasks. https://www.tensorflow.org/api_docs/python/tf/train/ClusterSpec
type SparseTFConfig ¶ added in v1.1.0
type SparseTFConfig struct { Cluster SparseClusterSpec `json:"sparseCluster"` Task TaskSpec `json:"task"` }
type TFConfig ¶
type TFConfig struct { // Cluster represents a TensorFlow ClusterSpec. // See: https://www.tensorflow.org/api_docs/python/tf/train/ClusterSpec Cluster ClusterSpec `json:"cluster"` Task TaskSpec `json:"task"` // Environment is used by tensorflow.contrib.learn.python.learn in versions <= 1.3 // TODO(jlewi): I don't think it is used in versions TF >= 1.4. So we can eventually get rid of it. Environment string `json:"environment"` }
TFConfig is a struct representing the distributed TensorFlow config. This struct is turned into an environment variable TF_CONFIG which is used by TensorFlow processes to configure themselves. https://www.tensorflow.org/api_docs/python/tf/estimator/RunConfig#methods https://cloud.google.com/ml-engine/docs/tensorflow/distributed-training-details
type TFController ¶
type TFController struct { common.JobController // contains filtered or unexported fields }
TFController is the type for TFJob Controller, which manages the lifecycle of TFJobs.
func NewTFController ¶
func NewTFController( tfJobInformer tfjobinformersv1.TFJobInformer, kubeClientSet kubeclientset.Interface, volcanoClientSet volcanoclient.Interface, tfJobClientSet tfjobclientset.Interface, kubeInformerFactory kubeinformers.SharedInformerFactory, tfJobInformerFactory tfjobinformers.SharedInformerFactory, option options.ServerOption) *TFController
NewTFController returns a new TFJob controller.
func (*TFController) ControllerName ¶
func (tc *TFController) ControllerName() string
func (*TFController) DeleteJob ¶ added in v1.1.0
func (tc *TFController) DeleteJob(job interface{}) error
DeleteJob implements ControllerInterface interface.
func (*TFController) GetAPIGroupVersion ¶
func (tc *TFController) GetAPIGroupVersion() schema.GroupVersion
func (*TFController) GetAPIGroupVersionKind ¶
func (tc *TFController) GetAPIGroupVersionKind() schema.GroupVersionKind
func (*TFController) GetDefaultContainerName ¶ added in v1.1.0
func (tc *TFController) GetDefaultContainerName() string
func (*TFController) GetDefaultContainerPortName ¶ added in v1.1.0
func (tc *TFController) GetDefaultContainerPortName() string
func (*TFController) GetGroupNameLabelKey ¶
func (tc *TFController) GetGroupNameLabelKey() string
func (*TFController) GetGroupNameLabelValue ¶
func (tc *TFController) GetGroupNameLabelValue() string
func (*TFController) GetJobFromAPIClient ¶
func (tc *TFController) GetJobFromAPIClient(namespace, name string) (metav1.Object, error)
func (*TFController) GetJobFromInformerCache ¶
func (tc *TFController) GetJobFromInformerCache(namespace, name string) (metav1.Object, error)
func (*TFController) GetJobNameLabelKey ¶
func (tc *TFController) GetJobNameLabelKey() string
Deprecated: this function is kept for backwards compatibility and should be removed later.
func (*TFController) GetReplicaIndexLabelKey ¶
func (tc *TFController) GetReplicaIndexLabelKey() string
func (*TFController) GetReplicaTypeLabelKey ¶
func (tc *TFController) GetReplicaTypeLabelKey() string
func (*TFController) IsMasterRole ¶ added in v1.1.0
func (tc *TFController) IsMasterRole(replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, rtype commonv1.ReplicaType, index int) bool
func (*TFController) IsWorker0Completed ¶ added in v1.1.0
func (tc *TFController) IsWorker0Completed(tfjob *tfv1.TFJob, replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec) (bool, error)
IsWorker0Completed returns true if the pod of worker0 succeeded and exited with code 0.
func (*TFController) NewTFJobInformer ¶
func (tc *TFController) NewTFJobInformer(tfJobInformerFactory tfjobinformers.SharedInformerFactory) tfjobinformersv1.TFJobInformer
NewTFJobInformer returns TFJobInformer from the given factory.
func (*TFController) ReconcilePods ¶ added in v1.1.0
func (tc *TFController) ReconcilePods( job interface{}, jobStatus *commonv1.JobStatus, pods []*v1.Pod, rtype commonv1.ReplicaType, spec *commonv1.ReplicaSpec, replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, ) error
ReconcilePods checks and updates pods for each given TFReplicaSpec. It will requeue the tfjob in case of an error while creating/deleting pods.
func (*TFController) Run ¶
func (tc *TFController) Run(threadiness int, stopCh <-chan struct{}) error
Run will set up the event handlers for the types we are interested in, as well as syncing informer caches and starting workers. It will block until stopCh is closed, at which point it will shut down the workqueue and wait for workers to finish processing their current work items.
func (*TFController) SetClusterSpec ¶ added in v1.1.0
func (tc *TFController) SetClusterSpec(job interface{}, podTemplate *v1.PodTemplateSpec, rtype, index string) error
SetClusterSpec generates and sets TF_CONFIG for the given podTemplateSpec.
func (*TFController) UpdateJobStatus ¶ added in v1.1.0
func (tc *TFController) UpdateJobStatus(job interface{}, replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, jobStatus *commonv1.JobStatus) error
func (*TFController) UpdateJobStatusInApiServer ¶ added in v1.1.0
func (tc *TFController) UpdateJobStatusInApiServer(job interface{}, jobStatus *commonv1.JobStatus) error
UpdateJobStatusInApiServer updates the status of the given TFJob.