job_controller

package
v0.4.1 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jan 4, 2022 License: Apache-2.0 Imports: 45 Imported by: 0

Documentation

Overview

Copyright 2019 The Kubeflow Authors

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

Index

Constants

View Source
const (
	// SuccessfulDeleteJobReason is the job event reason "SuccessfulDeleteJob".
	// NOTE(review): presumably recorded after a job is deleted; confirm at call sites.
	SuccessfulDeleteJobReason = "SuccessfulDeleteJob"
	// FailedDeleteJobReason is the job event reason "FailedDeleteJob".
	// NOTE(review): presumably recorded when deleting a job fails; confirm at call sites.
	FailedDeleteJobReason = "FailedDeleteJob"
)

Reasons for job events.

View Source
const (
	// SuccessfulCreatePodReason is the reason attached to the event emitted
	// once a replica set's pod has been created successfully.
	SuccessfulCreatePodReason = "SuccessfulCreatePod"
	// FailedCreatePodReason is the reason attached to the event, and to the
	// replica set condition, when creating a replica set's pod fails.
	FailedCreatePodReason = "FailedCreatePod"
	// SuccessfulDeletePodReason is the reason attached to the event emitted
	// once a replica set's pod has been deleted successfully.
	SuccessfulDeletePodReason = "SuccessfulDeletePod"
	// FailedDeletePodReason is the reason attached to the event, and to the
	// replica set condition, when deleting a replica set's pod fails.
	FailedDeletePodReason = "FailedDeletePod"
)

Reasons for pod events

View Source
const (
	// SuccessfulCreateServiceReason is the reason string "SuccessfulCreateService".
	SuccessfulCreateServiceReason = "SuccessfulCreateService"
	// FailedCreateServiceReason is the reason string "FailedCreateService".
	FailedCreateServiceReason = "FailedCreateService"
	// SuccessfulDeleteServiceReason is the reason string "SuccessfulDeleteService".
	SuccessfulDeleteServiceReason = "SuccessfulDeleteService"
	// FailedDeleteServiceReason is the reason string "FailedDeleteService".
	FailedDeleteServiceReason = "FailedDeleteService"
)

Variables

View Source
// KeyFunc is a short alias for cache.DeletionHandlingMetaNamespaceKeyFunc.
// Because IndexerInformer is backed by a delta queue, delete events have to be
// keyed with this function; it is also fine to use for non-delete events.
var KeyFunc = cache.DeletionHandlingMetaNamespaceKeyFunc

Functions

func EnableHostNetwork added in v0.2.0

func EnableHostNetwork(job metav1.Object) bool

func GenExpectationPodsKey

func GenExpectationPodsKey(jobKey, replicaType string) string

func GenExpectationServicesKey

func GenExpectationServicesKey(jobKey, replicaType string) string

func GetHostNetworkPortFromContext added in v0.2.0

func GetHostNetworkPortFromContext(ctx context.Context, rtype, index string) (int32, bool)

func GetPodFromTemplate

func GetPodFromTemplate(template *v1.PodTemplateSpec, parentObject runtime.Object, controllerRef *metav1.OwnerReference) (*v1.Pod, error)

func GetPortFromJob

func GetPortFromJob(spec map[v1.ReplicaType]*v1.ReplicaSpec, rtype v1.ReplicaType, containerName, portName string) (int32, error)

GetPortFromJob gets the port of job default container.

func NewPodControl

func NewPodControl(client client.Client, recorder record.EventRecorder) controller.PodControlInterface

func RecheckDeletionTimestamp

func RecheckDeletionTimestamp(getObject func() (metav1.Object, error)) func() error

RecheckDeletionTimestamp returns a CanAdopt() function to recheck deletion.

The CanAdopt() function calls getObject() to fetch the latest value, and denies adoption attempts if that object has a non-nil DeletionTimestamp.

func ReplicaTypes added in v0.4.0

func ReplicaTypes(specs map[v1.ReplicaType]*v1.ReplicaSpec) []v1.ReplicaType

Types

type FakeServiceControl

// FakeServiceControl exposes the same method set as ServiceControlInterface
// (CreateServices, CreateServicesWithControllerRef, PatchService,
// DeleteService) plus Clear.
// NOTE(review): the method bodies are not visible here, so the field notes
// below are inferred from names and types only; confirm against the source.
type FakeServiceControl struct {
	// Embedded mutex; presumably guards the fields below (TODO confirm).
	sync.Mutex
	// Presumably the services passed to the create methods (TODO confirm).
	Templates         []v1.Service
	// Presumably the controller references passed to
	// CreateServicesWithControllerRef (TODO confirm).
	ControllerRefs    []metav1.OwnerReference
	// Presumably the service IDs passed to DeleteService (TODO confirm).
	DeleteServiceName []string
	// Presumably the patch payloads passed to PatchService (TODO confirm).
	Patches           [][]byte
	// Presumably an error the fake's methods return when set (TODO confirm).
	Err               error
	// Presumably an upper bound on successful create calls (TODO confirm).
	CreateLimit       int
	// Presumably the number of create calls made so far (TODO confirm).
	CreateCallCount   int
}

func (*FakeServiceControl) Clear

func (f *FakeServiceControl) Clear()

func (*FakeServiceControl) CreateServices

func (f *FakeServiceControl) CreateServices(namespace string, service *v1.Service, object runtime.Object) error

func (*FakeServiceControl) CreateServicesWithControllerRef

func (f *FakeServiceControl) CreateServicesWithControllerRef(namespace string, service *v1.Service, object runtime.Object, controllerRef *metav1.OwnerReference) error

func (*FakeServiceControl) DeleteService

func (f *FakeServiceControl) DeleteService(namespace string, serviceID string, object runtime.Object) error

func (*FakeServiceControl) PatchService

func (f *FakeServiceControl) PatchService(namespace, name string, data []byte) error

type JobController

// JobController abstracts other operators to manage the lifecycle of Jobs.
// A concrete operator supplies its ControllerInterface implementation and
// drives reconciliation through the ReconcileJobs method.
type JobController struct {
	// Controller is the ControllerInterface implementation provided by the
	// concrete job operator.
	Controller apiv1.ControllerInterface

	// Config holds the job controller configuration.
	Config options.JobControllerConfiguration

	// GangScheduler is an abstract gang scheduling clientset.
	GangScheduler gang_schedule.GangScheduler

	// Expectations is a TTLCache of the pod/service creates and deletes each
	// job expects to see.
	// The expectation key is Job namespace/name + ReplicaType + pods/services.
	// For example, given a TFJob with namespace "tf-operator" and name "tfjob-abc":
	// {
	//     "PS": {
	//         "Replicas": 2,
	//     },
	//     "Worker": {
	//         "Replicas": 4,
	//     }
	// }
	// four expectations are created:
	// - "tf-operator/tfjob-abc/ps/services", expects 2 adds.
	// - "tf-operator/tfjob-abc/ps/pods", expects 2 adds.
	// - "tf-operator/tfjob-abc/worker/services", expects 4 adds.
	// - "tf-operator/tfjob-abc/worker/pods", expects 4 adds.
	Expectations controller.ControllerExpectationsInterface

	// BackoffStatesQueue is a rate-limited queue that records backoff counts
	// for job instances whose reconciliation failed. It does not play the role
	// of the built-in work queue in controller-runtime.
	BackoffStatesQueue workqueue.RateLimitingInterface

	// Recorder is an event recorder for recording Event resources to the
	// Kubernetes API.
	Recorder record.EventRecorder

	// Metrics is a metrics exporter that exports single numerical counter values.
	Metrics *metrics.JobMetrics

	// Client talks to the api-server.
	Client client.Client
	// contains filtered or unexported fields
}

JobController abstracts other operators to manage the lifecycle of Jobs. Users first implement ControllerInterface (objectA) and then initialize a JobController struct (objectB) with objectA as a parameter. They then call objectB.ReconcileJobs, shown below, which is the entry point that triggers the job controller's reconcile logic.

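ReconcileJobs(

job interface{},
replicas map[apiv1.ReplicaType]*apiv1.ReplicaSpec,
jobStatus apiv1.JobStatus,
runPolicy *apiv1.RunPolicy,
modelVersion *v1alpha1.ModelVersionSpec,
cacheBackend *cachev1alpha1.CacheBackendSpec) (result reconcile.Result, err error)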

func NewJobController

func NewJobController(
	cli client.Client,
	controllerImpl apiv1.ControllerInterface,
	config options.JobControllerConfiguration,
	recorder record.EventRecorder,
	metrics *metrics.JobMetrics,
) JobController

func (*JobController) AdoptAndClaimPods added in v0.4.1

func (jc *JobController) AdoptAndClaimPods(job metav1.Object, podList *v1.PodList) ([]*v1.Pod, error)

func (*JobController) AdoptAndClaimServices added in v0.4.1

func (jc *JobController) AdoptAndClaimServices(job metav1.Object, serviceList *v1.ServiceList) ([]*v1.Service, error)

func (*JobController) CreateGang

func (jc *JobController) CreateGang(job metav1.Object, replicas map[apiv1.ReplicaType]*apiv1.ReplicaSpec) (runtime.Object, error)

CreateGang creates a new gang scheduling process and ensures that the relationship between the job, its managed objects, and the gang entity is always maintained, so that the consistency of gang scheduling never breaks.

func (*JobController) CreateNewService

func (jc *JobController) CreateNewService(ctx context.Context, job metav1.Object, rtype apiv1.ReplicaType,
	spec *apiv1.ReplicaSpec, index string) error

CreateNewService creates a new service for the given index and type.

func (*JobController) CreatePod added in v0.2.0

func (jc *JobController) CreatePod(job interface{}, rt, index string, podTemplate *v1.PodTemplateSpec, masterRole bool) error

CreatePod creates a new common pod for the given index and type.

func (*JobController) CreateService added in v0.4.0

func (jc *JobController) CreateService(job metav1.Object, rtype apiv1.ReplicaType,
	service *v1.Service, index string) error

CreateService creates a new common service with the given index and type.

func (*JobController) DeleteExpectations added in v0.4.1

func (jc *JobController) DeleteExpectations(job metav1.Object, specs map[apiv1.ReplicaType]*apiv1.ReplicaSpec)

func (*JobController) DeleteGang

func (jc *JobController) DeleteGang(job metav1.Object) error

func (*JobController) FilterPodsForReplicaType

func (jc *JobController) FilterPodsForReplicaType(pods []*v1.Pod, replicaType string) ([]*v1.Pod, error)

FilterPodsForReplicaType returns the pods that belong to the given replicaType.

func (*JobController) FilterServicesForReplicaType

func (jc *JobController) FilterServicesForReplicaType(services []*v1.Service, replicaType string) ([]*v1.Service, error)

FilterServicesForReplicaType returns the services that belong to the given replicaType.

func (*JobController) GenLabels

func (jc *JobController) GenLabels(jobName string) map[string]string

func (*JobController) GenOwnerReference

func (jc *JobController) GenOwnerReference(obj metav1.Object) *metav1.OwnerReference

func (*JobController) GetPodSlices

func (jc *JobController) GetPodSlices(pods []*v1.Pod, replicas int, logger *log.Entry) [][]*v1.Pod

GetPodSlices returns a slice whose elements are slices of pods.

func (*JobController) GetPortFromJob

func (jc *JobController) GetPortFromJob(spec *apiv1.ReplicaSpec) (int32, error)

GetPortFromJob gets the port of job container.

func (*JobController) GetServiceSlices

func (jc *JobController) GetServiceSlices(services []*v1.Service, replicas int, logger *log.Entry) [][]*v1.Service

GetServiceSlices returns a slice whose elements are slices of services. If the returned value is serviceSlices, then serviceSlices[i] is a slice of pointers to the Services for replica i.

func (*JobController) OnPodCreateFunc

func (jc *JobController) OnPodCreateFunc(e event.CreateEvent) bool

When a pod is created, enqueue the job that manages it and update its expectations.

func (*JobController) OnPodDeleteFunc

func (jc *JobController) OnPodDeleteFunc(e event.DeleteEvent) bool

When a pod is deleted, enqueue the job that manages the pod and update its expectations. obj could be an *v1.Pod, or a DeletionFinalStateUnknown marker item.

func (*JobController) OnPodUpdateFunc

func (jc *JobController) OnPodUpdateFunc(e event.UpdateEvent) bool

When a pod is updated, figure out what job is managing it and wake it up. If the labels of the pod have changed we need to awaken both the old and new replica set. old and new must be *v1.Pod types.

func (*JobController) OnServiceCreateFunc

func (jc *JobController) OnServiceCreateFunc(e event.CreateEvent) bool

When a service is created, enqueue the controller that manages it and update its expectations.

func (*JobController) OnServiceDeleteFunc

func (jc *JobController) OnServiceDeleteFunc(e event.DeleteEvent) bool

When a service is deleted, enqueue the job that manages the service and update its expectations. obj could be an *v1.Service, or a DeletionFinalStateUnknown marker item.

func (*JobController) OnServiceUpdateFunc

func (jc *JobController) OnServiceUpdateFunc(e event.UpdateEvent) bool

When a service is updated, figure out what job/s manage it and wake them up. If the labels of the service have changed we need to awaken both the old and new replica set. old and new must be *v1.Service types.

func (*JobController) ReconcileJobs

func (jc *JobController) ReconcileJobs(job interface{}, replicas map[apiv1.ReplicaType]*apiv1.ReplicaSpec, jobStatus apiv1.JobStatus,
	runPolicy *apiv1.RunPolicy, modelVersion *v1alpha1.ModelVersionSpec, cacheBackend *cachev1alpha1.CacheBackendSpec) (result reconcile.Result, err error)

ReconcileJobs checks and updates replicas for each given ReplicaSpec. It will requeue the job in case of an error while creating/deleting pods/services.

func (*JobController) ReconcilePods

func (jc *JobController) ReconcilePods(
	ctx context.Context,
	job interface{},
	jobStatus *apiv1.JobStatus,
	pods []*v1.Pod,
	rtype apiv1.ReplicaType,
	spec *apiv1.ReplicaSpec,
	replicas map[apiv1.ReplicaType]*apiv1.ReplicaSpec, restart *bool) error

ReconcilePods checks and updates pods for each given ReplicaSpec. It will requeue the job in case of an error while creating/deleting pods.

func (*JobController) ReconcileServices

func (jc *JobController) ReconcileServices(
	ctx context.Context,
	job metav1.Object,
	services []*v1.Service,
	rtype apiv1.ReplicaType,
	spec *apiv1.ReplicaSpec) error

ReconcileServices checks and updates services for each given ReplicaSpec. It will requeue the job in case of an error while creating/deleting services.

func (*JobController) SatisfyExpectations

func (jc *JobController) SatisfyExpectations(job metav1.Object, specs map[apiv1.ReplicaType]*apiv1.ReplicaSpec) bool

SatisfyExpectations returns true if the required adds/dels for the given job have been observed. Add/del counts are established by the controller at sync time, and updated as controllees are observed by the controller manager.

func (*JobController) SortPodsByReplicaType added in v0.4.0

func (jc *JobController) SortPodsByReplicaType(pods []*v1.Pod, rtypes []apiv1.ReplicaType) map[apiv1.ReplicaType][]*v1.Pod

type PodControl

// PodControl is the default implementation of PodControlInterface.
type PodControl struct {
	// contains filtered or unexported fields
}

PodControl is the default implementation of PodControlInterface.

func (PodControl) CreatePods

func (r PodControl) CreatePods(namespace string, template *v1.PodTemplateSpec, object runtime.Object) error

func (PodControl) CreatePodsOnNode

func (r PodControl) CreatePodsOnNode(nodeName, namespace string, template *v1.PodTemplateSpec, object runtime.Object, controllerRef *metav1.OwnerReference) error

func (PodControl) CreatePodsWithControllerRef

func (r PodControl) CreatePodsWithControllerRef(namespace string, template *v1.PodTemplateSpec, controllerObject runtime.Object, controllerRef *metav1.OwnerReference) error

func (PodControl) DeletePod

func (r PodControl) DeletePod(namespace string, name string, object runtime.Object) error

func (PodControl) PatchPod

func (r PodControl) PatchPod(namespace, name string, data []byte) error

type ServiceControl

// ServiceControl is the default implementation of ServiceControlInterface.
type ServiceControl struct {
	// contains filtered or unexported fields
}

ServiceControl is the default implementation of ServiceControlInterface.

func NewServiceControl

func NewServiceControl(client client.Client, recorder record.EventRecorder) *ServiceControl

func (ServiceControl) CreateServices

func (r ServiceControl) CreateServices(namespace string, service *v1.Service, object runtime.Object) error

func (ServiceControl) CreateServicesWithControllerRef

func (r ServiceControl) CreateServicesWithControllerRef(namespace string, service *v1.Service, controllerObject runtime.Object, controllerRef *metav1.OwnerReference) error

func (ServiceControl) DeleteService

func (r ServiceControl) DeleteService(namespace, name string, object runtime.Object) error

DeleteService deletes the service identified by name.

func (ServiceControl) PatchService

func (r ServiceControl) PatchService(namespace, name string, data []byte) error

type ServiceControlInterface

type ServiceControlInterface interface {
	// CreateServices creates new Services according to the spec.
	CreateServices(namespace string, service *v1.Service, object runtime.Object) error
	// CreateServicesWithControllerRef creates new services according to the spec, and sets object as the service's controller.
	CreateServicesWithControllerRef(namespace string, service *v1.Service, object runtime.Object, controllerRef *metav1.OwnerReference) error
	// PatchService patches the service.
	PatchService(namespace, name string, data []byte) error
	// DeleteService deletes the service identified by serviceID.
	DeleteService(namespace, serviceID string, object runtime.Object) error
}

ServiceControlInterface is an interface that knows how to add or delete Services. It is defined as an interface to allow testing.

type ServiceControllerRefManager

// ServiceControllerRefManager exposes methods to manage the controllerRef of
// services (AdoptService, ClaimServices, ReleaseService). It embeds
// controller.BaseControllerRefManager.
type ServiceControllerRefManager struct {
	controller.BaseControllerRefManager
	// contains filtered or unexported fields
}

func NewServiceControllerRefManager

func NewServiceControllerRefManager(
	serviceControl ServiceControlInterface,
	ctr metav1.Object,
	selector labels.Selector,
	controllerKind schema.GroupVersionKind,
	canAdopt func() error,
) *ServiceControllerRefManager

NewServiceControllerRefManager returns a ServiceControllerRefManager that exposes methods to manage the controllerRef of services.

The canAdopt() function can be used to perform a potentially expensive check (such as a live GET from the API server) prior to the first adoption. It will only be called (at most once) if an adoption is actually attempted. If canAdopt() returns a non-nil error, all adoptions will fail.

NOTE: Once canAdopt() is called, it will not be called again by the same ServiceControllerRefManager instance. Create a new instance if it makes sense to check canAdopt() again (e.g. in a different sync pass).

func (*ServiceControllerRefManager) AdoptService

func (m *ServiceControllerRefManager) AdoptService(service *v1.Service) error

AdoptService sends a patch to take control of the service. It returns the error if the patching fails.

func (*ServiceControllerRefManager) ClaimServices

func (m *ServiceControllerRefManager) ClaimServices(services []*v1.Service, filters ...func(*v1.Service) bool) ([]*v1.Service, error)

ClaimServices tries to take ownership of a list of Services.

It will reconcile the following:

  • Adopt orphans if the selector matches.
  • Release owned objects if the selector no longer matches.

Optional: If one or more filters are specified, a Service will only be claimed if all filters return true.

A non-nil error is returned if some form of reconciliation was attempted and failed. Usually, controllers should try again later in case reconciliation is still needed.

If the error is nil, either the reconciliation succeeded, or no reconciliation was necessary. The list of Services that you now own is returned.

func (*ServiceControllerRefManager) ReleaseService

func (m *ServiceControllerRefManager) ReleaseService(service *v1.Service) error

ReleaseService sends a patch to free the service from the control of the controller. It returns the error if the patching fails. 404 and 422 errors are ignored.

type TestJobController

// TestJobController is a controller type used for testing.
// NOTE(review): judging by its name and method set it presumably implements
// apiv1.ControllerInterface for use in tests; confirm against the source.
type TestJobController struct {
	// contains filtered or unexported fields
}

func (TestJobController) ControllerName

func (TestJobController) ControllerName() string

func (*TestJobController) CreatePod

func (t *TestJobController) CreatePod(job interface{}, pod *corev1.Pod) error

func (*TestJobController) CreateService

func (t *TestJobController) CreateService(job interface{}, service *corev1.Service) error

func (*TestJobController) DeleteJob

func (t *TestJobController) DeleteJob(job interface{}) error

func (*TestJobController) DeletePod

func (t *TestJobController) DeletePod(job interface{}, pod *corev1.Pod) error

func (*TestJobController) DeleteService

func (t *TestJobController) DeleteService(job interface{}, name string, namespace string) error

func (TestJobController) GetAPIGroupVersion

func (TestJobController) GetAPIGroupVersion() schema.GroupVersion

func (TestJobController) GetAPIGroupVersionKind

func (TestJobController) GetAPIGroupVersionKind() schema.GroupVersionKind

func (*TestJobController) GetDefaultContainerName

func (t *TestJobController) GetDefaultContainerName() string

func (TestJobController) GetDefaultContainerPortName

func (TestJobController) GetDefaultContainerPortName() string

func (TestJobController) GetDefaultContainerPortNumber

func (TestJobController) GetDefaultContainerPortNumber() int32

func (TestJobController) GetGroupNameLabelValue

func (TestJobController) GetGroupNameLabelValue() string

func (*TestJobController) GetJobFromAPIClient

func (t *TestJobController) GetJobFromAPIClient(namespace, name string) (v1.Object, error)

func (*TestJobController) GetJobFromInformerCache

func (t *TestJobController) GetJobFromInformerCache(namespace, name string) (v1.Object, error)

func (TestJobController) GetJobRoleKey

func (TestJobController) GetJobRoleKey() string

func (TestJobController) GetNodeForModelOutput added in v0.4.0

func (t TestJobController) GetNodeForModelOutput(pods []*corev1.Pod) (nodeName string)

func (TestJobController) GetPodsForJob

func (t TestJobController) GetPodsForJob(job interface{}) ([]*corev1.Pod, error)

func (TestJobController) GetReconcileOrders

func (t TestJobController) GetReconcileOrders() []apiv1.ReplicaType

func (TestJobController) GetServicesForJob

func (t TestJobController) GetServicesForJob(job interface{}) ([]*corev1.Service, error)

func (*TestJobController) IsMasterRole

func (t *TestJobController) IsMasterRole(replicas map[apiv1.ReplicaType]*apiv1.ReplicaSpec, rtype apiv1.ReplicaType, index int) bool

func (*TestJobController) SetClusterSpec

func (t *TestJobController) SetClusterSpec(ctx context.Context, job interface{}, podTemplate *corev1.PodTemplateSpec, rtype, index string) error

func (TestJobController) UpdateJobStatus

func (t TestJobController) UpdateJobStatus(job interface{}, replicas map[apiv1.ReplicaType]*apiv1.ReplicaSpec,
	jobStatus *apiv1.JobStatus, restart bool) error

func (*TestJobController) UpdateJobStatusInApiServer

func (t *TestJobController) UpdateJobStatusInApiServer(job interface{}, jobStatus *apiv1.JobStatus) error

Directories

Path Synopsis
api
v1
Package v1 is the v1 version of the API.
Package v1 is the v1 version of the API.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL