Documentation ¶
Overview ¶
Package controller provides a Kubernetes controller for a TFJob resource.
Package controller provides a Kubernetes controller for a TFJob resource.
Package controller provides a Kubernetes controller for a TFJob resource.
Package controller provides a Kubernetes controller for a TFJob resource.
Package controller provides a Kubernetes controller for a TFJob resource.
Index ¶
- Constants
- Variables
- func ContainChieforMasterSpec(tfJob *tfv1.TFJob) bool
- func GetPortFromTFJob(tfJob *tfv1.TFJob, rtype tfv1.TFReplicaType) (int32, error)
- func NewUnstructuredTFJobInformer(restConfig *restclientset.Config, namespace string) tfjobinformersv1.TFJobInformer
- type ClusterSpec
- type TFConfig
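- type TFController
- func NewTFController(tfJobInformer tfjobinformersv1.TFJobInformer, kubeClientSet kubeclientset.Interface, kubeBatchClientSet kubebatchclient.Interface, tfJobClientSet tfjobclientset.Interface, kubeInformerFactory kubeinformers.SharedInformerFactory, tfJobInformerFactory tfjobinformers.SharedInformerFactory, option options.ServerOption) *TFController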
- func (tc *TFController) ControllerName() string
- func (tc *TFController) GetAPIGroupVersion() schema.GroupVersion
- func (tc *TFController) GetAPIGroupVersionKind() schema.GroupVersionKind
- func (tc *TFController) GetGroupNameLabelKey() string
- func (tc *TFController) GetGroupNameLabelValue() string
- func (tc *TFController) GetJobFromAPIClient(namespace, name string) (metav1.Object, error)
- func (tc *TFController) GetJobFromInformerCache(namespace, name string) (metav1.Object, error)
- func (tc *TFController) GetJobNameLabelKey() string
- func (tc *TFController) GetJobRoleKey() string
- func (tc *TFController) GetReplicaIndexLabelKey() string
- func (tc *TFController) GetReplicaTypeLabelKey() string
- func (tc *TFController) NewTFJobInformer(tfJobInformerFactory tfjobinformers.SharedInformerFactory) tfjobinformersv1.TFJobInformer
- func (tc *TFController) Run(threadiness int, stopCh <-chan struct{}) error
- type TaskSpec
Constants ¶
const (
	// EnvCustomClusterDomain is the custom defined cluster domain, such as "svc.cluster.local".
	// Ref: https://kubernetes.io/docs/concepts/services-networking/dns-pod-service/#a-records
	EnvCustomClusterDomain = "CUSTOM_CLUSTER_DOMAIN"
)
Variables ¶
var (
	// KeyFunc is the short name for DeletionHandlingMetaNamespaceKeyFunc.
	// IndexerInformer uses a delta queue, therefore for deletes we have to use this
	// key function but it should be just fine for non delete events.
	KeyFunc = cache.DeletionHandlingMetaNamespaceKeyFunc

	// DefaultTFControllerConfiguration is the suggested tf-operator configuration for production.
	DefaultTFControllerConfiguration = jobcontroller.JobControllerConfiguration{
		ReconcilerSyncLoopPeriod: metav1.Duration{Duration: 15 * time.Second},
		EnableGangScheduling:     false,
	}
)
Functions ¶
func ContainChieforMasterSpec ¶
ContainChieforMasterSpec returns true if the TFJob contains a chief or master spec.
func GetPortFromTFJob ¶
GetPortFromTFJob gets the port of tensorflow container.
func NewUnstructuredTFJobInformer ¶
func NewUnstructuredTFJobInformer(restConfig *restclientset.Config, namespace string) tfjobinformersv1.TFJobInformer
Types ¶
type ClusterSpec ¶
ClusterSpec represents a TensorFlow cluster specification: a map from job names to network addresses. See https://www.tensorflow.org/deploy/distributed#create_a_tftrainclusterspec_to_describe_the_cluster
type TFConfig ¶
type TFConfig struct {
	// Cluster represents a TensorFlow ClusterSpec.
	// See: https://www.tensorflow.org/api_docs/python/tf/train/ClusterSpec
	Cluster ClusterSpec `json:"cluster"`
	Task    TaskSpec    `json:"task"`
	// Environment is used by tensorflow.contrib.learn.python.learn in versions <= 1.3
	// TODO(jlewi): I don't think it is used in versions TF >= 1.4. So we can eventually get rid of it.
	Environment string `json:"environment"`
}
TFConfig is a struct representing the distributed TensorFlow config. This struct is turned into an environment variable TF_CONFIG which is used by TensorFlow processes to configure themselves. https://www.tensorflow.org/api_docs/python/tf/estimator/RunConfig#methods https://cloud.google.com/ml-engine/docs/tensorflow/distributed-training-details
type TFController ¶
type TFController struct {
	jobcontroller.JobController
	// contains filtered or unexported fields
}
TFController is the type for TFJob Controller, which manages the lifecycle of TFJobs.
func NewTFController ¶
func NewTFController( tfJobInformer tfjobinformersv1.TFJobInformer, kubeClientSet kubeclientset.Interface, kubeBatchClientSet kubebatchclient.Interface, tfJobClientSet tfjobclientset.Interface, kubeInformerFactory kubeinformers.SharedInformerFactory, tfJobInformerFactory tfjobinformers.SharedInformerFactory, option options.ServerOption) *TFController
NewTFController returns a new TFJob controller.
func (*TFController) ControllerName ¶
func (tc *TFController) ControllerName() string
func (*TFController) GetAPIGroupVersion ¶
func (tc *TFController) GetAPIGroupVersion() schema.GroupVersion
func (*TFController) GetAPIGroupVersionKind ¶
func (tc *TFController) GetAPIGroupVersionKind() schema.GroupVersionKind
func (*TFController) GetGroupNameLabelKey ¶
func (tc *TFController) GetGroupNameLabelKey() string
func (*TFController) GetGroupNameLabelValue ¶
func (tc *TFController) GetGroupNameLabelValue() string
func (*TFController) GetJobFromAPIClient ¶
func (tc *TFController) GetJobFromAPIClient(namespace, name string) (metav1.Object, error)
func (*TFController) GetJobFromInformerCache ¶
func (tc *TFController) GetJobFromInformerCache(namespace, name string) (metav1.Object, error)
func (*TFController) GetJobNameLabelKey ¶
func (tc *TFController) GetJobNameLabelKey() string
func (*TFController) GetJobRoleKey ¶
func (tc *TFController) GetJobRoleKey() string
func (*TFController) GetReplicaIndexLabelKey ¶
func (tc *TFController) GetReplicaIndexLabelKey() string
func (*TFController) GetReplicaTypeLabelKey ¶
func (tc *TFController) GetReplicaTypeLabelKey() string
func (*TFController) NewTFJobInformer ¶
func (tc *TFController) NewTFJobInformer(tfJobInformerFactory tfjobinformers.SharedInformerFactory) tfjobinformersv1.TFJobInformer
NewTFJobInformer returns TFJobInformer from the given factory.
func (*TFController) Run ¶
func (tc *TFController) Run(threadiness int, stopCh <-chan struct{}) error
Run will set up the event handlers for types we are interested in, as well as syncing informer caches and starting workers. It will block until stopCh is closed, at which point it will shut down the workqueue and wait for workers to finish processing their current work items.