tensorflow

package
v1.1.1 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jul 20, 2021 License: Apache-2.0 Imports: 39 Imported by: 0

Documentation

Overview

Package tensorflow provides a Kubernetes controller for a TFJob resource.

Package controller provides a Kubernetes controller for a TFJob resource.

Package controller provides a Kubernetes controller for a TFJob resource.

Package controller provides a Kubernetes controller for a TFJob resource.

Index

Constants

View Source
// Reason strings describing the outcome of deleting a job.
// NOTE(review): presumably recorded as Kubernetes event reasons; the call
// sites are not visible here — confirm.
const (
	// FailedDeleteJobReason is the reason string for a job deletion that failed.
	FailedDeleteJobReason     = "FailedDeleteJob"
	// SuccessfulDeleteJobReason is the reason string for a job deletion that succeeded.
	SuccessfulDeleteJobReason = "SuccessfulDeleteJob"
)
View Source
const (
	// EnvCustomClusterDomain is the name ("CUSTOM_CLUSTER_DOMAIN") of the setting that
	// carries the custom-defined cluster domain, such as "svc.cluster.local".
	// NOTE(review): judging by the Env prefix this is presumably an environment
	// variable name; where it is read is not visible here — confirm.
	// Ref: https://kubernetes.io/docs/concepts/services-networking/dns-pod-service/#a-records
	EnvCustomClusterDomain = "CUSTOM_CLUSTER_DOMAIN"
)

Variables

View Source
var (
	// KeyFunc is a short alias for cache.DeletionHandlingMetaNamespaceKeyFunc.
	// IndexerInformer uses a delta queue, so delete events must be keyed with this
	// function; it also works fine for non-delete events.
	KeyFunc = cache.DeletionHandlingMetaNamespaceKeyFunc
)

Functions

func ContainChieforMasterSpec

func ContainChieforMasterSpec(replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec) bool

ContainChieforMasterSpec returns true if the tfjob contains chief or master spec.

func GetPortFromTFJob

func GetPortFromTFJob(tfJob *tfv1.TFJob, rtype commonv1.ReplicaType) (int32, error)

GetPortFromTFJob gets the port of tensorflow container.

func NewUnstructuredTFJobInformer

func NewUnstructuredTFJobInformer(restConfig *restclientset.Config, namespace string, resyncPeriod time.Duration) tfjobinformersv1.TFJobInformer

Types

type ClusterSpec

// ClusterSpec represents a TensorFlow cluster specification: a map from each
// job name to the list of network addresses for that job.
type ClusterSpec map[string][]string

ClusterSpec represents a TensorFlow cluster specification. It is a map from job names to network addresses. See https://www.tensorflow.org/deploy/distributed#create_a_tftrainclusterspec_to_describe_the_cluster

type SparseClusterSpec added in v1.1.0

// SparseClusterSpec is a cluster description that lets a server be configured
// without knowing the identity of every other worker task.
type SparseClusterSpec struct {
	// Worker is a sparse map of worker entries, serialized under the JSON key "worker".
	// NOTE(review): presumably keyed by worker task index with the network
	// address as the value — confirm against the code that populates it.
	Worker map[int32]string `json:"worker"`
	// PS is the list of parameter-server entries, serialized under the JSON key "ps".
	// NOTE(review): presumably network addresses — confirm.
	PS     []string         `json:"ps"`
}

SparseClusterSpec enables a server to be configured without needing to know the identity of (for example) all other worker tasks. https://www.tensorflow.org/api_docs/python/tf/train/ClusterSpec

type SparseTFConfig added in v1.1.0

// SparseTFConfig pairs a SparseClusterSpec with the TaskSpec of a task.
// NOTE(review): presumably the sparse-cluster counterpart of TFConfig that is
// serialized into TF_CONFIG — confirm against the code that marshals it.
type SparseTFConfig struct {
	// Cluster is the sparse cluster description; note the JSON key is
	// "sparseCluster", not "cluster".
	Cluster SparseClusterSpec `json:"sparseCluster"`
	// Task is the task specification, serialized under the JSON key "task".
	Task    TaskSpec          `json:"task"`
}

type TFConfig

// TFConfig is the distributed TensorFlow configuration; it is turned into the
// TF_CONFIG environment variable that TensorFlow processes use to configure
// themselves.
type TFConfig struct {
	// Cluster represents a TensorFlow ClusterSpec.
	// See: https://www.tensorflow.org/api_docs/python/tf/train/ClusterSpec
	Cluster ClusterSpec `json:"cluster"`
	// Task is the task specification (type and index), serialized under the JSON key "task".
	Task    TaskSpec    `json:"task"`
	// Environment is used by tensorflow.contrib.learn.python.learn in versions <= 1.3
	// TODO(jlewi): I don't think it is used in TF versions >= 1.4, so we can eventually get rid of it.
	Environment string `json:"environment"`
}

TFConfig is a struct representing the distributed TensorFlow config. This struct is turned into an environment variable TF_CONFIG which is used by TensorFlow processes to configure themselves. https://www.tensorflow.org/api_docs/python/tf/estimator/RunConfig#methods https://cloud.google.com/ml-engine/docs/tensorflow/distributed-training-details

type TFController

// TFController is the controller type for TFJob; it manages the lifecycle of TFJobs.
type TFController struct {
	// JobController is embedded, so its exported fields and methods are
	// promoted onto TFController.
	common.JobController
	// contains filtered or unexported fields
}

TFController is the type for TFJob Controller, which manages the lifecycle of TFJobs.

func NewTFController

func NewTFController(

	tfJobInformer tfjobinformersv1.TFJobInformer,
	kubeClientSet kubeclientset.Interface,
	volcanoClientSet volcanoclient.Interface,
	tfJobClientSet tfjobclientset.Interface,
	kubeInformerFactory kubeinformers.SharedInformerFactory,

	tfJobInformerFactory tfjobinformers.SharedInformerFactory,
	option options.ServerOption) *TFController

NewTFController returns a new TFJob controller.

func (*TFController) ControllerName

func (tc *TFController) ControllerName() string

func (*TFController) DeleteJob added in v1.1.0

func (tc *TFController) DeleteJob(job interface{}) error

DeleteJob implements ControllerInterface interface.

func (*TFController) GetAPIGroupVersion

func (tc *TFController) GetAPIGroupVersion() schema.GroupVersion

func (*TFController) GetAPIGroupVersionKind

func (tc *TFController) GetAPIGroupVersionKind() schema.GroupVersionKind

func (*TFController) GetDefaultContainerName added in v1.1.0

func (tc *TFController) GetDefaultContainerName() string

func (*TFController) GetDefaultContainerPortName added in v1.1.0

func (tc *TFController) GetDefaultContainerPortName() string

func (*TFController) GetGroupNameLabelKey

func (tc *TFController) GetGroupNameLabelKey() string

func (*TFController) GetGroupNameLabelValue

func (tc *TFController) GetGroupNameLabelValue() string

func (*TFController) GetJobFromAPIClient

func (tc *TFController) GetJobFromAPIClient(namespace, name string) (metav1.Object, error)

func (*TFController) GetJobFromInformerCache

func (tc *TFController) GetJobFromInformerCache(namespace, name string) (metav1.Object, error)

func (*TFController) GetJobNameLabelKey

func (tc *TFController) GetJobNameLabelKey() string

Deprecated: GetJobNameLabelKey is kept only for backwards compatibility and is to be removed later.

func (*TFController) GetReplicaIndexLabelKey

func (tc *TFController) GetReplicaIndexLabelKey() string

func (*TFController) GetReplicaTypeLabelKey

func (tc *TFController) GetReplicaTypeLabelKey() string

func (*TFController) IsMasterRole added in v1.1.0

func (tc *TFController) IsMasterRole(replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, rtype commonv1.ReplicaType, index int) bool

func (*TFController) IsWorker0Completed added in v1.1.0

func (tc *TFController) IsWorker0Completed(tfjob *tfv1.TFJob, replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec) (bool, error)

IsWorker0Completed returns true if the pod of worker0 succeeded and exited with 0.

func (*TFController) NewTFJobInformer

func (tc *TFController) NewTFJobInformer(tfJobInformerFactory tfjobinformers.SharedInformerFactory) tfjobinformersv1.TFJobInformer

NewTFJobInformer returns TFJobInformer from the given factory.

func (*TFController) ReconcilePods added in v1.1.0

func (tc *TFController) ReconcilePods(
	job interface{},
	jobStatus *commonv1.JobStatus,
	pods []*v1.Pod,
	rtype commonv1.ReplicaType,
	spec *commonv1.ReplicaSpec,
	replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec,
) error

ReconcilePods checks and updates pods for each given TFReplicaSpec. It will requeue the tfjob in case of an error while creating/deleting pods.

func (*TFController) Run

func (tc *TFController) Run(threadiness int, stopCh <-chan struct{}) error

Run will set up the event handlers for types we are interested in, as well as syncing informer caches and starting workers. It will block until stopCh is closed, at which point it will shutdown the workqueue and wait for workers to finish processing their current work items.

func (*TFController) SetClusterSpec added in v1.1.0

func (tc *TFController) SetClusterSpec(job interface{}, podTemplate *v1.PodTemplateSpec, rtype, index string) error

SetClusterSpec generates and sets TF_CONFIG for the given podTemplateSpec.

func (*TFController) UpdateJobStatus added in v1.1.0

func (tc *TFController) UpdateJobStatus(job interface{}, replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, jobStatus *commonv1.JobStatus) error

func (*TFController) UpdateJobStatusInApiServer added in v1.1.0

func (tc *TFController) UpdateJobStatusInApiServer(job interface{}, jobStatus *commonv1.JobStatus) error

UpdateJobStatusInApiServer updates the status of the given TFJob.

type TaskSpec

// TaskSpec is the specification for a single task (PS or worker) of the TFJob.
type TaskSpec struct {
	// Type is the task's type, serialized under the JSON key "type".
	// NOTE(review): presumably the lower-cased replica type such as "ps" or
	// "worker" — confirm against the code that fills it in.
	Type  string `json:"type"`
	// Index is the task's index, serialized under the JSON key "index".
	// NOTE(review): presumably the zero-based replica index within its type — confirm.
	Index int    `json:"index"`
}

TaskSpec is the specification for a task (PS or worker) of the TFJob.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL