framework

package
v1.32.0-rc.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Nov 26, 2024 License: Apache-2.0 Imports: 34 Imported by: 367

Documentation

Index

Constants

View Source
const (
	// ScheduleAttemptFailure is the event when a schedule attempt fails.
	ScheduleAttemptFailure = "ScheduleAttemptFailure"
	// BackoffComplete is the event when a pod finishes backoff.
	BackoffComplete = "BackoffComplete"
	// ForceActivate is the event when a pod is moved from unschedulablePods/backoffQ
	// to activeQ. Usually it's triggered by plugin implementations.
	ForceActivate = "ForceActivate"
	// UnschedulableTimeout is the event when a pod is moved from unschedulablePods
	// due to the timeout specified at pod-max-in-unschedulable-pods-duration.
	UnschedulableTimeout = "UnschedulableTimeout"
)

Special event labels.

View Source
const (
	// MaxNodeScore is the maximum score a Score plugin is expected to return.
	MaxNodeScore int64 = 100

	// MinNodeScore is the minimum score a Score plugin is expected to return.
	MinNodeScore int64 = 0

	// MaxTotalScore is the maximum total score.
	MaxTotalScore int64 = math.MaxInt64
)
View Source
const DefaultBindAllHostIP = "0.0.0.0"

DefaultBindAllHostIP defines the default IP address used to bind to all IP addresses of the host.

View Source
const ExtenderName = "Extender"

ExtenderName is a fake plugin name put in UnschedulablePlugins when Extender rejected some Nodes.

View Source
const (
	// NoNodeAvailableMsg is used to format message when no nodes available.
	NoNodeAvailableMsg = "0/%v nodes are available"
)

Variables

View Source
var (
	// EventAssignedPodAdd is the event when an assigned pod is added.
	EventAssignedPodAdd = ClusterEvent{Resource: assignedPod, ActionType: Add}
	// EventAssignedPodUpdate is the event when an assigned pod is updated.
	EventAssignedPodUpdate = ClusterEvent{Resource: assignedPod, ActionType: Update}
	// EventAssignedPodDelete is the event when an assigned pod is deleted.
	EventAssignedPodDelete = ClusterEvent{Resource: assignedPod, ActionType: Delete}
	// EventUnscheduledPodAdd is the event when an unscheduled pod is added.
	EventUnscheduledPodAdd = ClusterEvent{Resource: unschedulablePod, ActionType: Add}
	// EventUnscheduledPodUpdate is the event when an unscheduled pod is updated.
	EventUnscheduledPodUpdate = ClusterEvent{Resource: unschedulablePod, ActionType: Update}
	// EventUnscheduledPodDelete is the event when an unscheduled pod is deleted.
	EventUnscheduledPodDelete = ClusterEvent{Resource: unschedulablePod, ActionType: Delete}
	// EventUnschedulableTimeout is the event when a pod stays in unschedulable for longer than timeout.
	EventUnschedulableTimeout = ClusterEvent{Resource: WildCard, ActionType: All, /* contains filtered or unexported fields */}
	// EventForceActivate is the event when a pod is moved from unschedulablePods/backoffQ to activeQ.
	EventForceActivate = ClusterEvent{Resource: WildCard, ActionType: All, /* contains filtered or unexported fields */}
)
View Source
var (
	// ErrNotFound is the not found error message.
	ErrNotFound = errors.New("not found")
)

Functions

func AllClusterEventLabels added in v1.32.0

func AllClusterEventLabels() []string

AllClusterEventLabels returns all possible cluster event labels given to the metrics.

func GetNamespacedName added in v1.25.0

func GetNamespacedName(namespace, name string) string

GetNamespacedName returns the string format of a namespaced resource name.

func GetPodAffinityTerms added in v1.31.0

func GetPodAffinityTerms(affinity *v1.Affinity) (terms []v1.PodAffinityTerm)

func GetPodAntiAffinityTerms added in v1.31.0

func GetPodAntiAffinityTerms(affinity *v1.Affinity) (terms []v1.PodAffinityTerm)

func GetPodKey

func GetPodKey(pod *v1.Pod) (string, error)

GetPodKey returns the string key of a pod.

Types

type ActionType added in v1.21.0

type ActionType int64

ActionType is an integer to represent one type of resource change. Different ActionTypes can be bit-wised to compose new semantics.

const (
	// Add and Delete occupy the two lowest bits; every following constant in this
	// block takes the next higher bit, so ActionTypes can be combined with bitwise OR.
	Add ActionType = 1 << iota
	Delete

	// UpdateNodeXYZ is only applicable for Node events.
	// If you use UpdateNodeXYZ,
	// your plugin's QueueingHint is only executed for the specific sub-Update event.
	// It's better to narrow down the scope of the event by using them instead of just using the Update event,
	// for better performance in requeueing.
	UpdateNodeAllocatable
	UpdateNodeLabel
	// UpdateNodeTaint is an update for node's taints or node.Spec.Unschedulable.
	UpdateNodeTaint
	UpdateNodeCondition
	UpdateNodeAnnotation

	// UpdatePodXYZ is only applicable for Pod events.
	// If you use UpdatePodXYZ,
	// your plugin's QueueingHint is only executed for the specific sub-Update event.
	// It's better to narrow down the scope of the event by using them instead of the Update event,
	// for better performance in requeueing.
	UpdatePodLabel
	// UpdatePodScaleDown is an update for pod's scale down (i.e., any resource request is reduced).
	UpdatePodScaleDown
	// UpdatePodTolerations is an addition for pod's tolerations.
	// (Due to API validation, we can add, but cannot modify or remove tolerations.)
	UpdatePodTolerations
	// UpdatePodSchedulingGatesEliminated is an update for pod's scheduling gates, which eliminates all scheduling gates in the Pod.
	UpdatePodSchedulingGatesEliminated
	// UpdatePodGeneratedResourceClaim is an update of the list of ResourceClaims generated for the pod.
	// Depends on the DynamicResourceAllocation feature gate.
	UpdatePodGeneratedResourceClaim

	// All is a mask with every bit below its own iota position set,
	// i.e. the union of all single-bit ActionTypes declared above it in this block.
	All ActionType = 1<<iota - 1

	// Use the general Update type if you either don't know or don't care which specific sub-Update type to use.
	Update = UpdateNodeAllocatable | UpdateNodeLabel | UpdateNodeTaint | UpdateNodeCondition | UpdateNodeAnnotation | UpdatePodLabel | UpdatePodScaleDown | UpdatePodTolerations | UpdatePodSchedulingGatesEliminated | UpdatePodGeneratedResourceClaim | updatePodOther
)

Constants for ActionTypes. CAUTION for contributors: When you add a new ActionType, you must update the following: - The list of basic, podOnly, and nodeOnly. - String() method.

func (ActionType) String added in v1.32.0

func (a ActionType) String() string

type AffinityTerm

type AffinityTerm struct {
	Namespaces        sets.Set[string]
	Selector          labels.Selector
	TopologyKey       string
	NamespaceSelector labels.Selector
}

AffinityTerm is a processed version of v1.PodAffinityTerm.

func GetAffinityTerms added in v1.31.0

func GetAffinityTerms(pod *v1.Pod, v1Terms []v1.PodAffinityTerm) ([]AffinityTerm, error)

GetAffinityTerms receives a Pod and affinity terms and returns the namespaces and selectors of the terms.

func (*AffinityTerm) Matches added in v1.21.0

func (at *AffinityTerm) Matches(pod *v1.Pod, nsLabels labels.Set) bool

Matches returns true if the pod matches the label selector and namespaces or namespace selector.

type BindPlugin

type BindPlugin interface {
	Plugin
	// Bind plugins will not be called until all pre-bind plugins have completed. Each
	// bind plugin is called in the configured order. A bind plugin may choose whether
	// or not to handle the given Pod. If a bind plugin chooses to handle a Pod, the
	// remaining bind plugins are skipped. When a bind plugin does not handle a pod,
	// it must return Skip in its Status code. If a bind plugin returns an Error, the
	// pod is rejected and will not be bound.
	Bind(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) *Status
}

BindPlugin is an interface that must be implemented by "Bind" plugins. Bind plugins are used to bind a pod to a Node.

type ClusterEvent added in v1.21.0

type ClusterEvent struct {
	Resource   EventResource
	ActionType ActionType
	// contains filtered or unexported fields
}

ClusterEvent abstracts how a system resource's state gets changed. Resource represents the standard API resources such as Pod, Node, etc. ActionType denotes the specific change such as Add, Update or Delete.

func NodeSchedulingPropertiesChange added in v1.31.0

func NodeSchedulingPropertiesChange(newNode *v1.Node, oldNode *v1.Node) (events []ClusterEvent)

NodeSchedulingPropertiesChange interprets the update of a node and returns corresponding UpdateNodeXYZ event(s).

func PodSchedulingPropertiesChange added in v1.31.0

func PodSchedulingPropertiesChange(newPod *v1.Pod, oldPod *v1.Pod) (events []ClusterEvent)

PodSchedulingPropertiesChange interprets the update of a pod and returns corresponding UpdatePodXYZ event(s). Once we have other pod update events, we should update here as well.

func (ClusterEvent) IsWildCard added in v1.22.0

func (ce ClusterEvent) IsWildCard() bool

IsWildCard returns true if ClusterEvent follows WildCard semantics

func (ClusterEvent) Label added in v1.22.0

func (ce ClusterEvent) Label() string

Label is used for logging and metrics.

func (ClusterEvent) Match added in v1.30.0

func (ce ClusterEvent) Match(incomingEvent ClusterEvent) bool

Match returns true if the ClusterEvent matches the incoming event. If ce.Resource is "*", there is no requirement on the incoming event's Resource. Conversely, if the incoming event's Resource is "*", it only matches when ce.Resource is also "*".

Note: we have a special case here when the coming event is a wildcard event, it will force all Pods to move to activeQ/backoffQ, but we take it as an unmatched event unless the ce is also a wildcard one.

type ClusterEventWithHint added in v1.28.0

type ClusterEventWithHint struct {
	Event ClusterEvent
	// QueueingHintFn is executed for the Pod rejected by this plugin when the above Event happens,
	// and filters out events to reduce useless retry of Pod's scheduling.
	// It's an optional field. If not set,
	// the scheduling of Pods will be always retried with backoff when this Event happens.
	// (the same as Queue)
	QueueingHintFn QueueingHintFn
}

func UnrollWildCardResource added in v1.28.0

func UnrollWildCardResource() []ClusterEventWithHint

type Code

type Code int

Code is the Status code/type which is returned from plugins.

const (
	// Success means that plugin ran correctly and found pod schedulable.
	// NOTE: A nil status is also considered as "Success".
	Success Code = iota
	// Error is one of the failures, used for internal plugin errors, unexpected input, etc.
	// Plugins shouldn't return this code for expected failures, like Unschedulable.
	// Since it's an unexpected failure, the scheduling queue registers the pod without unschedulable plugins.
	// Meaning, the Pod will be requeued to activeQ/backoffQ soon.
	Error
	// Unschedulable is one of the failures, used when a plugin finds a pod unschedulable.
	// If it's returned from PreFilter or Filter, the scheduler might attempt to
	// run other postFilter plugins like preemption to get this pod scheduled.
	// Use UnschedulableAndUnresolvable to make the scheduler skip other postFilter plugins.
	// The accompanying status message should explain why the pod is unschedulable.
	//
	// We regard the backoff as a penalty for wasting the scheduling cycle.
	// When the scheduling queue requeues Pods that were rejected with Unschedulable in the last scheduling,
	// the Pods go through backoff.
	Unschedulable
	// UnschedulableAndUnresolvable is used when a plugin finds a pod unschedulable and
	// other postFilter plugins like preemption would not change anything.
	// See the comment on PostFilter interface for more details about how PostFilter should handle this status.
	// Plugins should return Unschedulable if it is possible that the pod can get scheduled
	// after running other postFilter plugins.
	// The accompanying status message should explain why the pod is unschedulable.
	//
	// We regard the backoff as a penalty for wasting the scheduling cycle.
	// When the scheduling queue requeues Pods that were rejected with UnschedulableAndUnresolvable in the last scheduling,
	// the Pods go through backoff.
	UnschedulableAndUnresolvable
	// Wait is used when a Permit plugin finds a pod scheduling should wait.
	Wait
	// Skip is used in the following scenarios:
	// - when a Bind plugin chooses to skip binding.
	// - when a PreFilter plugin returns Skip so that coupled Filter plugin/PreFilterExtensions() will be skipped.
	// - when a PreScore plugin returns Skip so that coupled Score plugin will be skipped.
	Skip
	// Pending means that the scheduling process is finished successfully,
	// but the plugin wants to stop the scheduling cycle/binding cycle here.
	//
	// For example, the DRA plugin sometimes needs to wait for the external device driver
	// to provision the resource for the Pod.
	// It's different from when to return Unschedulable/UnschedulableAndUnresolvable,
	// because in this case, the scheduler decides where the Pod can go successfully,
	// but we need to wait for the external component to do something based on that scheduling result.
	//
	// We regard the backoff as a penalty for wasting the scheduling cycle.
	// In the case of returning Pending, we cannot say the scheduling cycle is wasted
	// because the scheduling result is used to move the Pod's scheduling forward,
	// even though that particular scheduling cycle failed.
	// So, Pods rejected for such reasons don't need to suffer a penalty (backoff).
	// When the scheduling queue requeues Pods that were rejected with Pending in the last scheduling,
	// the Pods go to activeQ directly, ignoring backoff.
	Pending
)

These are predefined codes used in a Status. Note: when you add a new status, you have to add it in `codes` slice below.

func (Code) String

func (c Code) String() string

type CycleState

type CycleState struct {

	// SkipFilterPlugins are plugins that will be skipped in the Filter extension point.
	SkipFilterPlugins sets.Set[string]
	// SkipScorePlugins are plugins that will be skipped in the Score extension point.
	SkipScorePlugins sets.Set[string]
	// contains filtered or unexported fields
}

CycleState provides a mechanism for plugins to store and retrieve arbitrary data. StateData stored by one plugin can be read, altered, or deleted by another plugin. CycleState does not provide any data protection, as all plugins are assumed to be trusted. Note: CycleState uses a sync.Map to back the storage, because it is thread safe. It's aimed to optimize for the "write once and read many times" scenarios. It is the recommended pattern used in all in-tree plugins - plugin-specific state is written once in PreFilter/PreScore and afterward read many times in Filter/Score.

func NewCycleState

func NewCycleState() *CycleState

NewCycleState initializes a new CycleState and returns its pointer.

func (*CycleState) Clone

func (c *CycleState) Clone() *CycleState

Clone creates a copy of CycleState and returns its pointer. Clone returns nil if the CycleState being cloned is nil.

func (*CycleState) Delete

func (c *CycleState) Delete(key StateKey)

Delete deletes data with the given key from CycleState.

See CycleState for notes on concurrency.

func (*CycleState) Read

func (c *CycleState) Read(key StateKey) (StateData, error)

Read retrieves data with the given "key" from CycleState. If the key is not present, ErrNotFound is returned.

See CycleState for notes on concurrency.

func (*CycleState) SetRecordPluginMetrics

func (c *CycleState) SetRecordPluginMetrics(flag bool)

SetRecordPluginMetrics sets recordPluginMetrics to the given value.

func (*CycleState) ShouldRecordPluginMetrics

func (c *CycleState) ShouldRecordPluginMetrics() bool

ShouldRecordPluginMetrics returns whether metrics.PluginExecutionDuration metrics should be recorded.

func (*CycleState) Write

func (c *CycleState) Write(key StateKey, val StateData)

Write stores the given "val" in CycleState with the given "key".

See CycleState for notes on concurrency.

type DeviceClassLister added in v1.32.0

type DeviceClassLister interface {
	// List returns a list of all DeviceClasses.
	List() ([]*resourceapi.DeviceClass, error)
	// Get returns the DeviceClass with the given className.
	Get(className string) (*resourceapi.DeviceClass, error)
}

DeviceClassLister can be used to obtain DeviceClasses.

type Diagnosis added in v1.21.0

type Diagnosis struct {
	// NodeToStatus records the status of nodes, and a generic status for absent ones,
	// if they're rejected in PreFilter (via PreFilterResult) or Filter plugins.
	// Nodes that pass PreFilter/Filter plugins are not included in this map.
	NodeToStatus *NodeToStatus
	// UnschedulablePlugins are plugins that return Unschedulable or UnschedulableAndUnresolvable.
	UnschedulablePlugins sets.Set[string]
	// PendingPlugins are plugins that return Pending.
	PendingPlugins sets.Set[string]
	// PreFilterMsg records the messages returned from PreFilter plugins.
	PreFilterMsg string
	// PostFilterMsg records the messages returned from PostFilter plugins.
	PostFilterMsg string
}

Diagnosis records the details to diagnose a scheduling failure.

func (*Diagnosis) AddPluginStatus added in v1.29.0

func (d *Diagnosis) AddPluginStatus(sts *Status)

type EnqueueExtensions added in v1.21.0

type EnqueueExtensions interface {
	Plugin
	// EventsToRegister returns a series of possible events that may make a Pod
	// failed by this plugin schedulable. Each event has a callback function that
	// filters out events to reduce useless retry of Pod's scheduling.
	// The events will be registered when instantiating the internal scheduling queue,
	// and leveraged to build event handlers dynamically.
	// When it returns an error, the scheduler fails to start.
	// Note: the returned list needs to be determined at startup,
	// and the scheduler only evaluates it once during startup.
	// Do not change the result during runtime, for example, based on the cluster's state etc.
	//
	// Appropriate implementation of this function will make Pod's re-scheduling accurate and performant.
	EventsToRegister(context.Context) ([]ClusterEventWithHint, error)
}

EnqueueExtensions is an optional interface that plugins can implement to efficiently move unschedulable Pods in internal scheduling queues. In the scheduler, Pods can be made unschedulable by PreEnqueue, PreFilter, Filter, Reserve, and Permit plugins, and Pods rejected by these plugins are requeued based on this extension point. Failures from other extension points are regarded as temporary errors (e.g., network failure), and the scheduler requeues such Pods without this extension point - it always requeues them to activeQ after backoff. This is because such temporary errors cannot be resolved by specific cluster events, and we have no choice but to keep retrying scheduling until the failure is resolved.

Plugins that make Pods unschedulable (PreEnqueue, PreFilter, Filter, Reserve, and Permit plugins) should implement this interface; otherwise the default implementation will be used, which is less efficient in requeueing Pods rejected by the plugin. If plugins at extension points other than the above implement this interface, it is simply ignored.

type EventResource added in v1.32.0

type EventResource string

EventResource is basically short for group/version/kind, which can uniquely represent a particular API resource.

const (
	// There are a couple of notes about how the scheduler notifies the events of Pods:
	// - Add: add events could be triggered by either a newly created Pod or an existing Pod that is scheduled to a Node.
	// - Delete: delete events could be triggered by:
	//           - a Pod that is deleted
	//           - a Pod that was assumed, but gets un-assumed due to some errors in the binding cycle.
	//           - an existing Pod that was unscheduled but gets scheduled to a Node.
	//
	// Note that the Pod event type includes the events for the unscheduled Pod itself.
	// i.e., when unscheduled Pods are updated, the scheduling queue checks with Pod/Update QueueingHint(s) whether the update may make the pods schedulable,
	// and requeues them to activeQ/backoffQ when at least one QueueingHint returns Queue.
	// Plugins **have to** implement a QueueingHint for Pod/Update event
	// if the rejection from them could be resolved by updating unscheduled Pods themselves.
	// Example: Pods that require excessive resources may be rejected by the noderesources plugin;
	// if such an unscheduled pod is updated to require fewer resources,
	// the previous rejection from the noderesources plugin can be resolved.
	// That plugin would implement a QueueingHint for the Pod/Update event
	// that returns Queue when such resource request changes are made in unscheduled Pods.
	Pod EventResource = "Pod"

	// A note about NodeAdd event and UpdateNodeTaint event:
	// When QHint is disabled, NodeAdd often doesn't work as expected because of the internal feature called preCheck.
	// It's definitely not something expected by plugin developers,
	// and registering UpdateNodeTaint event is the only mitigation for now.
	// So, kube-scheduler registers UpdateNodeTaint event for plugins that have a NodeAdd event but don't have an UpdateNodeTaint event.
	// It has a bad impact on the requeuing efficiency, but that is a lot better than some Pods being stuck in the
	// unschedulable pod pool.
	// This problematic preCheck feature is disabled when QHint is enabled,
	// and eventually will be removed along with QHint graduation.
	// See: https://github.com/kubernetes/kubernetes/issues/110175
	Node                  EventResource = "Node"
	PersistentVolume      EventResource = "PersistentVolume"
	PersistentVolumeClaim EventResource = "PersistentVolumeClaim"
	CSINode               EventResource = "storage.k8s.io/CSINode"
	CSIDriver             EventResource = "storage.k8s.io/CSIDriver"
	VolumeAttachment      EventResource = "storage.k8s.io/VolumeAttachment"
	CSIStorageCapacity    EventResource = "storage.k8s.io/CSIStorageCapacity"
	StorageClass          EventResource = "storage.k8s.io/StorageClass"
	ResourceClaim         EventResource = "resource.k8s.io/ResourceClaim"
	ResourceSlice         EventResource = "resource.k8s.io/ResourceSlice"
	DeviceClass           EventResource = "resource.k8s.io/DeviceClass"

	// WildCard is a special EventResource to match all resources.
	// e.g., If you register `{Resource: "*", ActionType: All}` in EventsToRegister,
	// all coming clusterEvents will be admitted. Be careful when registering it: it will
	// increase the computing pressure in requeueing, so only use it if you really need it.
	//
	// Meanwhile, if the coming clusterEvent is a wildcard one, all pods
	// will be moved from unschedulablePod pool to activeQ/backoffQ forcibly.
	WildCard EventResource = "*"
)

Constants for GVKs.

CAUTION for contributors: When you add a new EventResource, you must register a new one to allResources.

Note: - UpdatePodXYZ or UpdateNodeXYZ: triggered by updating particular parts of a Pod or a Node, e.g. updatePodLabel. Using specific events rather than general ones (updatePodLabel vs update) can make the requeueing process more efficient and consume less memory, as fewer events will be cached by the scheduler.

type Extender

type Extender interface {
	// Name returns a unique name that identifies the extender.
	Name() string

	// Filter based on extender-implemented predicate functions. The filtered list is
	// expected to be a subset of the supplied list.
	// The failedNodesMap and failedAndUnresolvable results optionally contain the
	// failed nodes and their failure reasons; nodes in the latter are
	// unresolvable.
	Filter(pod *v1.Pod, nodes []*NodeInfo) (filteredNodes []*NodeInfo, failedNodesMap extenderv1.FailedNodesMap, failedAndUnresolvable extenderv1.FailedNodesMap, err error)

	// Prioritize based on extender-implemented priority functions. The returned scores & weight
	// are used to compute the weighted score for an extender. The weighted scores are added to
	// the scores computed by Kubernetes scheduler. The total scores are used to do the host selection.
	Prioritize(pod *v1.Pod, nodes []*NodeInfo) (hostPriorities *extenderv1.HostPriorityList, weight int64, err error)

	// Bind delegates the action of binding a pod to a node to the extender.
	Bind(binding *v1.Binding) error

	// IsBinder returns whether this extender is configured for the Bind method.
	IsBinder() bool

	// IsInterested returns true if at least one extended resource requested by
	// this pod is managed by this extender.
	IsInterested(pod *v1.Pod) bool

	// IsPrioritizer returns whether this extender is configured for the Prioritize method.
	IsPrioritizer() bool

	// IsFilter returns whether this extender is configured for the Filter method.
	IsFilter() bool

	// ProcessPreemption returns nodes with their victim pods processed by extender based on
	// given:
	//   1. Pod to schedule
	//   2. Candidate nodes and victim pods (nodeNameToVictims) generated by previous scheduling process.
	// The possible changes made by extender may include:
	//   1. Subset of given candidate nodes after preemption phase of extender.
	//   2. A different set of victim pods for every given candidate node after preemption phase of extender.
	ProcessPreemption(
		pod *v1.Pod,
		nodeNameToVictims map[string]*extenderv1.Victims,
		nodeInfos NodeInfoLister,
	) (map[string]*extenderv1.Victims, error)

	// SupportsPreemption returns whether the scheduler extender supports preemption.
	SupportsPreemption() bool

	// IsIgnorable returns true to indicate that scheduling should not fail when this extender
	// is unavailable. This gives the scheduler the ability to fail fast and tolerate non-critical extenders as well.
	// Both Filter and Bind actions are supported.
	IsIgnorable() bool
}

Extender is an interface for external processes to influence scheduling decisions made by Kubernetes. This is typically needed for resources not directly managed by Kubernetes.

type FilterPlugin

type FilterPlugin interface {
	Plugin
	// Filter is called by the scheduling framework.
	// All FilterPlugins should return "Success" to declare that
	// the given node fits the pod. If Filter doesn't return "Success",
	// it will return "Unschedulable", "UnschedulableAndUnresolvable" or "Error".
	//
	// "Error" aborts pod scheduling and puts the pod into the backoff queue.
	//
	// For the node being evaluated, Filter plugins should look at the passed
	// nodeInfo reference for this particular node's information (e.g., pods
	// considered to be running on the node) instead of looking it up in the
	// NodeInfoSnapshot because we don't guarantee that they will be the same.
	// For example, during preemption, we may pass a copy of the original
	// nodeInfo object that has some pods removed from it to evaluate the
	// possibility of preempting them to schedule the target pod.
	Filter(ctx context.Context, state *CycleState, pod *v1.Pod, nodeInfo *NodeInfo) *Status
}

FilterPlugin is an interface for Filter plugins. These plugins are called at the filter extension point for filtering out hosts that cannot run a pod. This concept used to be called 'predicate' in the original scheduler. These plugins should return "Success", "Unschedulable" or "Error" in Status.code. However, the scheduler accepts other valid codes as well. Anything other than "Success" will lead to exclusion of the given host from running the pod.

type FitError added in v1.21.0

type FitError struct {
	Pod         *v1.Pod
	NumAllNodes int
	Diagnosis   Diagnosis
}

FitError describes a fit error of a pod.

func (*FitError) Error added in v1.21.0

func (f *FitError) Error() string

Error returns detailed information of why the pod failed to fit on each node. A message format is "0/X nodes are available: <PreFilterMsg>. <FilterMsg>. <PostFilterMsg>."

type Framework

type Framework interface {
	Handle

	// PreEnqueuePlugins returns the registered preEnqueue plugins.
	PreEnqueuePlugins() []PreEnqueuePlugin

	// EnqueueExtensions returns the registered Enqueue extensions.
	EnqueueExtensions() []EnqueueExtensions

	// QueueSortFunc returns the function to sort pods in the scheduling queue.
	QueueSortFunc() LessFunc

	// RunPreFilterPlugins runs the set of configured PreFilter plugins. It returns
	// *Status and its code is set to non-success if any of the plugins returns
	// anything but Success. If a non-success status is returned, then the scheduling
	// cycle is aborted.
	// It also returns a PreFilterResult, which may influence what or how many nodes to
	// evaluate downstream.
	// The third return value contains the PreFilter plugins that rejected some or all Nodes with PreFilterResult.
	// But, note that it doesn't contain any plugin when a plugin rejects this Pod with a non-success status
	// rather than with PreFilterResult.
	RunPreFilterPlugins(ctx context.Context, state *CycleState, pod *v1.Pod) (*PreFilterResult, *Status, sets.Set[string])

	// RunPostFilterPlugins runs the set of configured PostFilter plugins.
	// PostFilter plugins can either be informational, in which case they should be configured
	// to execute first and return Unschedulable status, or ones that try to change the
	// cluster state to make the pod potentially schedulable in a future scheduling cycle.
	RunPostFilterPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, filteredNodeStatusMap NodeToStatusReader) (*PostFilterResult, *Status)

	// RunPreBindPlugins runs the set of configured PreBind plugins. It returns
	// *Status and its code is set to non-success if any of the plugins returns
	// anything but Success. If the Status code is "Unschedulable", it is
	// considered as a scheduling check failure, otherwise, it is considered as an
	// internal error. In either case the pod is not going to be bound.
	RunPreBindPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string) *Status

	// RunPostBindPlugins runs the set of configured PostBind plugins.
	RunPostBindPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string)

	// RunReservePluginsReserve runs the Reserve method of the set of
	// configured Reserve plugins. If any of these calls returns an error, it
	// does not continue running the remaining ones and returns the error. In
	// such case, pod will not be scheduled.
	RunReservePluginsReserve(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string) *Status

	// RunReservePluginsUnreserve runs the Unreserve method of the set of
	// configured Reserve plugins.
	RunReservePluginsUnreserve(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string)

	// RunPermitPlugins runs the set of configured Permit plugins. If any of these
	// plugins returns a status other than "Success" or "Wait", it does not continue
	// running the remaining plugins and returns an error. Otherwise, if any of the
	// plugins returns "Wait", then this function will create and add waiting pod
	// to a map of currently waiting pods and return status with "Wait" code.
	// The pod will remain a waiting pod for the minimum duration returned by the Permit plugins.
	RunPermitPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string) *Status

	// WaitOnPermit will block, if the pod is a waiting pod, until the waiting pod is rejected or allowed.
	WaitOnPermit(ctx context.Context, pod *v1.Pod) *Status

	// RunBindPlugins runs the set of configured Bind plugins. A Bind plugin may choose
	// whether or not to handle the given Pod. If a Bind plugin chooses to skip the
	// binding, it should return code=5("skip") status. Otherwise, it should return "Error"
	// or "Success". If none of the plugins handled binding, RunBindPlugins returns
	// code=5("skip") status.
	RunBindPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string) *Status

	// HasFilterPlugins returns true if at least one Filter plugin is defined.
	HasFilterPlugins() bool

	// HasPostFilterPlugins returns true if at least one PostFilter plugin is defined.
	HasPostFilterPlugins() bool

	// HasScorePlugins returns true if at least one Score plugin is defined.
	HasScorePlugins() bool

	// ListPlugins returns a map of extension point name to list of configured Plugins.
	ListPlugins() *config.Plugins

	// ProfileName returns the profile name associated to a profile.
	ProfileName() string

	// PercentageOfNodesToScore returns percentageOfNodesToScore associated to a profile.
	PercentageOfNodesToScore() *int32

	// SetPodNominator sets the PodNominator.
	SetPodNominator(nominator PodNominator)
	// SetPodActivator sets the PodActivator.
	SetPodActivator(activator PodActivator)

	// Close calls Close method of each plugin.
	Close() error
}

Framework manages the set of plugins in use by the scheduling framework. Configured plugins are called at specified points in a scheduling context.

type Handle

type Handle interface {
	// PodNominator abstracts operations to maintain nominated Pods.
	PodNominator
	// PluginsRunner abstracts operations to run some plugins.
	PluginsRunner
	// PodActivator abstracts operations in the scheduling queue.
	PodActivator
	// SnapshotSharedLister returns listers from the latest NodeInfo Snapshot. The snapshot
	// is taken at the beginning of a scheduling cycle and remains unchanged until
	// a pod finishes "Permit" point.
	//
	// It should be used only during scheduling cycle:
	// - There is no guarantee that the information remains unchanged in the binding phase of scheduling.
	//   So, plugins shouldn't use it in the binding cycle (pre-bind/bind/post-bind/un-reserve plugin)
	//   otherwise, a concurrent read/write error might occur.
	// - There is no guarantee that the information is always up-to-date.
	//   So, plugins shouldn't use it in QueueingHint and PreEnqueue
	//   otherwise, they might make a decision based on stale information.
	//
	// Instead, they should use the resources obtained from Informers created from SharedInformerFactory().
	SnapshotSharedLister() SharedLister

	// IterateOverWaitingPods acquires a read lock and iterates over the WaitingPods map.
	IterateOverWaitingPods(callback func(WaitingPod))

	// GetWaitingPod returns a waiting pod given its UID.
	GetWaitingPod(uid types.UID) WaitingPod

	// RejectWaitingPod rejects a waiting pod given its UID.
	// The return value indicates if the pod is waiting or not.
	RejectWaitingPod(uid types.UID) bool

	// ClientSet returns a kubernetes clientSet.
	ClientSet() clientset.Interface

	// KubeConfig returns the raw kube config.
	KubeConfig() *restclient.Config

	// EventRecorder returns an event recorder.
	EventRecorder() events.EventRecorder

	// SharedInformerFactory returns the shared informer factory. Plugins should read
	// resources through Informers created from it in the places where the NodeInfo
	// snapshot must not be used (binding cycle, QueueingHint, PreEnqueue).
	SharedInformerFactory() informers.SharedInformerFactory

	// SharedDRAManager can be used to obtain DRA objects, and track modifications to them in-memory - mainly by the DRA plugin.
	// A non-default implementation can be plugged into the framework to simulate the state of DRA objects.
	SharedDRAManager() SharedDRAManager

	// RunFilterPluginsWithNominatedPods runs the set of configured filter plugins for nominated pod on the given node.
	RunFilterPluginsWithNominatedPods(ctx context.Context, state *CycleState, pod *v1.Pod, info *NodeInfo) *Status

	// Extenders returns registered scheduler extenders.
	Extenders() []Extender

	// Parallelizer returns a parallelizer holding parallelism for scheduler.
	Parallelizer() parallelize.Parallelizer
}

Handle provides data and some tools that plugins can use. It is passed to the plugin factories at the time of plugin initialization. Plugins must store and use this handle to call framework functions.

type HostPortInfo

type HostPortInfo map[string]map[ProtocolPort]struct{}

HostPortInfo stores mapping from ip to a set of ProtocolPort

func (HostPortInfo) Add

func (h HostPortInfo) Add(ip, protocol string, port int32)

Add adds (ip, protocol, port) to HostPortInfo

func (HostPortInfo) CheckConflict

func (h HostPortInfo) CheckConflict(ip, protocol string, port int32) bool

CheckConflict checks if the input (ip, protocol, port) conflicts with the existing ones in HostPortInfo.

func (HostPortInfo) Len

func (h HostPortInfo) Len() int

Len returns the total number of (ip, protocol, port) tuples in HostPortInfo

func (HostPortInfo) Remove

func (h HostPortInfo) Remove(ip, protocol string, port int32)

Remove removes (ip, protocol, port) from HostPortInfo

type ImageStateSummary

type ImageStateSummary struct {
	// Size of the image
	Size int64
	// Used to track how many nodes have this image, it is computed from the Nodes field below
	// during the execution of Snapshot.
	NumNodes int
	// A set of node names for nodes having this image present. This field is used for
	// keeping track of the nodes during update/add/remove events.
	Nodes sets.Set[string]
}

ImageStateSummary provides summarized information about the state of an image.

func (*ImageStateSummary) Snapshot added in v1.29.0

func (iss *ImageStateSummary) Snapshot() *ImageStateSummary

Snapshot returns a copy without Nodes field of ImageStateSummary

type LessFunc

type LessFunc func(podInfo1, podInfo2 *QueuedPodInfo) bool

LessFunc is the function to sort pod info

type NodeInfo

type NodeInfo struct {

	// Pods running on the node.
	Pods []*PodInfo

	// The subset of pods with affinity.
	PodsWithAffinity []*PodInfo

	// The subset of pods with required anti-affinity.
	PodsWithRequiredAntiAffinity []*PodInfo

	// Ports allocated on the node.
	UsedPorts HostPortInfo

	// Total requested resources of all pods on this node. This includes assumed
	// pods, which scheduler has sent for binding, but may not be scheduled yet.
	Requested *Resource
	// Total requested resources of all pods on this node with a minimum value
	// applied to each container's CPU and memory requests. This does not reflect
	// the actual resource requests for this node, but is used to avoid scheduling
	// many zero-request pods onto one node.
	NonZeroRequested *Resource
	// We store allocatedResources (which is Node.Status.Allocatable.*) explicitly
	// as int64, to avoid conversions and accessing map.
	Allocatable *Resource

	// ImageStates holds the entry of an image if and only if this image is on the node. The entry can be used for
	// checking an image's existence and advanced usage (e.g., image locality scheduling policy) based on the image
	// state information.
	ImageStates map[string]*ImageStateSummary

	// PVCRefCounts contains a mapping of PVC names to the number of pods on the node using it.
	// Keys are in the format "namespace/name".
	PVCRefCounts map[string]int

	// Whenever NodeInfo changes, generation is bumped.
	// This is used to avoid cloning it if the object didn't change.
	Generation int64
	// contains filtered or unexported fields
}

NodeInfo is node level aggregated information.

func NewNodeInfo

func NewNodeInfo(pods ...*v1.Pod) *NodeInfo

NewNodeInfo returns a ready to use empty NodeInfo object. If any pods are given in arguments, their information will be aggregated in the returned object.

func (*NodeInfo) AddPod

func (n *NodeInfo) AddPod(pod *v1.Pod)

AddPod is a wrapper around AddPodInfo.

func (*NodeInfo) AddPodInfo added in v1.21.0

func (n *NodeInfo) AddPodInfo(podInfo *PodInfo)

AddPodInfo adds pod information to this NodeInfo. Consider using this instead of AddPod if a PodInfo is already computed.

func (*NodeInfo) GetName added in v1.31.0

func (n *NodeInfo) GetName() string

func (*NodeInfo) GetNamespace added in v1.31.0

func (n *NodeInfo) GetNamespace() string

func (*NodeInfo) Node

func (n *NodeInfo) Node() *v1.Node

Node returns overall information about this node.

func (*NodeInfo) RemoveNode

func (n *NodeInfo) RemoveNode()

RemoveNode removes the node object, leaving all other tracking information.

func (*NodeInfo) RemovePod

func (n *NodeInfo) RemovePod(logger klog.Logger, pod *v1.Pod) error

RemovePod subtracts pod information from this NodeInfo.

func (*NodeInfo) SetNode

func (n *NodeInfo) SetNode(node *v1.Node)

SetNode sets the overall node information.

func (*NodeInfo) Snapshot added in v1.29.0

func (n *NodeInfo) Snapshot() *NodeInfo

Snapshot returns a copy of this node, except that ImageStates is copied without the Nodes field.

func (*NodeInfo) String

func (n *NodeInfo) String() string

String returns a human-readable representation of this NodeInfo.

type NodeInfoLister

type NodeInfoLister interface {
	// List returns the list of NodeInfos.
	List() ([]*NodeInfo, error)
	// HavePodsWithAffinityList returns the list of NodeInfos of nodes with pods with affinity terms.
	HavePodsWithAffinityList() ([]*NodeInfo, error)
	// HavePodsWithRequiredAntiAffinityList returns the list of NodeInfos of nodes with pods with required anti-affinity terms.
	HavePodsWithRequiredAntiAffinityList() ([]*NodeInfo, error)
	// Get returns the NodeInfo of the given node name.
	Get(nodeName string) (*NodeInfo, error)
}

NodeInfoLister interface represents anything that can list/get NodeInfo objects from node name.

type NodePluginScores added in v1.26.0

type NodePluginScores struct {
	// Name is node name.
	Name string
	// Scores is scores from plugins and extenders.
	Scores []PluginScore
	// TotalScore is the total score in Scores.
	TotalScore int64
}

NodePluginScores is a struct with node name and scores for that node.

type NodeScore

type NodeScore struct {
	// Name is the node name.
	Name  string
	// Score is the score for that node.
	Score int64
}

NodeScore is a struct with node name and score.

type NodeScoreList

type NodeScoreList []NodeScore

NodeScoreList declares a list of nodes and their scores.

type NodeToStatus added in v1.32.0

type NodeToStatus struct {
	// contains filtered or unexported fields
}

NodeToStatus contains the statuses of the Nodes where the incoming Pod was not schedulable.

func NewDefaultNodeToStatus added in v1.32.0

func NewDefaultNodeToStatus() *NodeToStatus

NewDefaultNodeToStatus creates NodeToStatus without any node in the map. The absentNodesStatus is set by default to UnschedulableAndUnresolvable.

func NewNodeToStatus added in v1.32.0

func NewNodeToStatus(nodeToStatus map[string]*Status, absentNodesStatus *Status) *NodeToStatus

NewNodeToStatus creates NodeToStatus initialized with given nodeToStatus and absentNodesStatus.

func (*NodeToStatus) AbsentNodesStatus added in v1.32.0

func (m *NodeToStatus) AbsentNodesStatus() *Status

AbsentNodesStatus returns absentNodesStatus value.

func (*NodeToStatus) ForEachExplicitNode added in v1.32.0

func (m *NodeToStatus) ForEachExplicitNode(fn func(nodeName string, status *Status))

ForEachExplicitNode runs fn for each node whose status is explicitly set. Important note: it runs fn only for nodes with a status explicitly registered, and hence may not run fn for all existing nodes. For example, if PreFilter rejects all Nodes, the scheduler would NOT set a failure status on every Node, but would set the failure status as AbsentNodesStatus. You're supposed to get that status from AbsentNodesStatus(), and treat all other nodes as rejected with it.

func (*NodeToStatus) Get added in v1.32.0

func (m *NodeToStatus) Get(nodeName string) *Status

Get returns the status for given nodeName. If the node is not in the map, the absentNodesStatus is returned.

func (*NodeToStatus) Len added in v1.32.0

func (m *NodeToStatus) Len() int

Len returns length of nodeToStatus map. It is not aware of number of absent nodes.

func (*NodeToStatus) NodesForStatusCode added in v1.32.0

func (m *NodeToStatus) NodesForStatusCode(nodeLister NodeInfoLister, code Code) ([]*NodeInfo, error)

NodesForStatusCode returns a list of NodeInfos for the nodes that match a given status code. If the absentNodesStatus matches the code, all existing nodes are fetched using nodeLister and filtered using NodeToStatus.Get. If the absentNodesStatus doesn't match the code, the nodeToStatus map is used to create a list of nodes and nodeLister.Get is used to obtain the NodeInfo for each.

func (*NodeToStatus) Set added in v1.32.0

func (m *NodeToStatus) Set(nodeName string, status *Status)

Set sets status for given nodeName.

func (*NodeToStatus) SetAbsentNodesStatus added in v1.32.0

func (m *NodeToStatus) SetAbsentNodesStatus(status *Status)

SetAbsentNodesStatus sets absentNodesStatus value.

type NodeToStatusMap

type NodeToStatusMap = NodeToStatusReader

NodeToStatusMap is an alias for NodeToStatusReader to keep partial backwards compatibility. NodeToStatusReader should be used if possible.

type NodeToStatusReader added in v1.32.0

type NodeToStatusReader interface {
	// Get returns the status for given nodeName.
	// If the node is not in the map, the AbsentNodesStatus is returned.
	Get(nodeName string) *Status
	// NodesForStatusCode returns a list of NodeInfos for the nodes that have a given status code.
	// It returns the NodeInfos for all matching nodes denoted by AbsentNodesStatus as well.
	NodesForStatusCode(nodeLister NodeInfoLister, code Code) ([]*NodeInfo, error)
}

NodeToStatusReader is a read-only interface of NodeToStatus passed to each PostFilter plugin.

type NominatingInfo added in v1.23.2

// NominatingInfo pairs a nominated node name with a NominatingMode.
// It is the argument type of PodNominator.AddNominatedPod and is embedded in
// PostFilterResult.
type NominatingInfo struct {
	// NominatedNodeName is the name of the nominated node.
	NominatedNodeName string
	// NominatingMode is the mode attached to this nomination (ModeNoop or ModeOverride).
	NominatingMode    NominatingMode
}

func (*NominatingInfo) Mode added in v1.23.2

func (ni *NominatingInfo) Mode() NominatingMode

type NominatingMode added in v1.23.2

// NominatingMode enumerates the modes a NominatingInfo can carry.
type NominatingMode int
const (
	// ModeNoop is the zero value of NominatingMode.
	// NOTE(review): presumably means the existing nomination is left untouched — confirm.
	ModeNoop NominatingMode = iota
	// ModeOverride is the second NominatingMode value.
	// NOTE(review): presumably means the nomination is replaced by
	// NominatingInfo.NominatedNodeName — confirm.
	ModeOverride
)

type PermitPlugin

type PermitPlugin interface {
	Plugin
	// Permit is called before binding a pod (and before prebind plugins). Permit
	// plugins are used to prevent or delay the binding of a Pod. A permit plugin
	// must return success or wait with timeout duration, or the pod will be rejected.
	// The pod will also be rejected if the wait times out or the pod is rejected while
	// waiting. Note that if the plugin returns "wait", the framework will wait only
	// after running the remaining plugins given that no other plugin rejects the pod.
	//
	// The returned time.Duration is the timeout that accompanies a "wait" status.
	Permit(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) (*Status, time.Duration)
}

PermitPlugin is an interface that must be implemented by "Permit" plugins. These plugins are called before a pod is bound to a node.

type Plugin

type Plugin interface {
	// Name returns the name of the plugin.
	Name() string
}

Plugin is the parent type for all the scheduling framework plugins.

type PluginScore added in v1.26.0

type PluginScore struct {
	// Name is the name of plugin or extender.
	Name  string
	// Score is the score from that plugin or extender.
	Score int64
}

PluginScore is a struct with plugin/extender name and score.

type PluginsRunner

type PluginsRunner interface {
	// RunPreScorePlugins runs the set of configured PreScore plugins. If any
	// of these plugins returns any status other than "Success", the given pod is rejected.
	RunPreScorePlugins(context.Context, *CycleState, *v1.Pod, []*NodeInfo) *Status
	// RunScorePlugins runs the set of configured scoring plugins.
	// It returns a list that stores scores from each plugin and total score for each Node.
	// It also returns *Status, which is set to non-success if any of the plugins returns
	// a non-success status.
	RunScorePlugins(context.Context, *CycleState, *v1.Pod, []*NodeInfo) ([]NodePluginScores, *Status)
	// RunFilterPlugins runs the set of configured Filter plugins for pod on
	// the given node. Note that for the node being evaluated, the passed nodeInfo
	// reference could be different from the one in NodeInfoSnapshot map (e.g., pods
	// considered to be running on the node could be different). For example, during
	// preemption, we may pass a copy of the original nodeInfo object that has some pods
	// removed from it to evaluate the possibility of preempting them to
	// schedule the target pod.
	RunFilterPlugins(context.Context, *CycleState, *v1.Pod, *NodeInfo) *Status
	// RunPreFilterExtensionAddPod calls the AddPod interface for the set of configured
	// PreFilter plugins. It returns directly if any of the plugins return any
	// status other than Success.
	RunPreFilterExtensionAddPod(ctx context.Context, state *CycleState, podToSchedule *v1.Pod, podInfoToAdd *PodInfo, nodeInfo *NodeInfo) *Status
	// RunPreFilterExtensionRemovePod calls the RemovePod interface for the set of configured
	// PreFilter plugins. It returns directly if any of the plugins return any
	// status other than Success.
	RunPreFilterExtensionRemovePod(ctx context.Context, state *CycleState, podToSchedule *v1.Pod, podInfoToRemove *PodInfo, nodeInfo *NodeInfo) *Status
}

PluginsRunner abstracts operations to run some plugins. This is used by preemption PostFilter plugins when evaluating the feasibility of scheduling the pod on nodes when certain running pods get evicted.

type PodActivator added in v1.32.0

type PodActivator interface {
	// Activate moves the given pods to activeQ.
	// If a pod isn't found in unschedulablePods or backoffQ and it's in-flight,
	// the wildcard event is registered so that the pod will be requeued when it comes back.
	// But, if a pod isn't found in unschedulablePods or backoffQ and it's not in-flight (i.e., completely unknown pod),
	// Activate would ignore the pod.
	Activate(logger klog.Logger, pods map[string]*v1.Pod)
}

PodActivator abstracts operations in the scheduling queue.

type PodInfo

type PodInfo struct {
	// Pod is the wrapped Pod.
	Pod                        *v1.Pod
	// The fields below hold the pre-computed affinity information for Pod:
	// required and preferred terms, for affinity and anti-affinity respectively.
	RequiredAffinityTerms      []AffinityTerm
	RequiredAntiAffinityTerms  []AffinityTerm
	PreferredAffinityTerms     []WeightedAffinityTerm
	PreferredAntiAffinityTerms []WeightedAffinityTerm
}

PodInfo is a wrapper to a Pod with additional pre-computed information to accelerate processing. This information is typically immutable (e.g., pre-processed inter-pod affinity selectors).

func NewPodInfo

func NewPodInfo(pod *v1.Pod) (*PodInfo, error)

NewPodInfo returns a new PodInfo.

func (*PodInfo) DeepCopy added in v1.21.0

func (pi *PodInfo) DeepCopy() *PodInfo

DeepCopy returns a deep copy of the PodInfo object.

func (*PodInfo) Update added in v1.21.0

func (pi *PodInfo) Update(pod *v1.Pod) error

Update creates a full new PodInfo by default. It only updates the pod when the PodInfo has been instantiated and the passed pod is the exact same one as the original pod.

type PodNominator

type PodNominator interface {
	// AddNominatedPod adds the given pod to the nominator or
	// updates it if it already exists.
	AddNominatedPod(logger klog.Logger, pod *PodInfo, nominatingInfo *NominatingInfo)
	// DeleteNominatedPodIfExists deletes nominatedPod from internal cache. It's a no-op if it doesn't exist.
	DeleteNominatedPodIfExists(pod *v1.Pod)
	// UpdateNominatedPod updates the <oldPod> with <newPod>.
	UpdateNominatedPod(logger klog.Logger, oldPod *v1.Pod, newPodInfo *PodInfo)
	// NominatedPodsForNode returns nominatedPods on the given node.
	NominatedPodsForNode(nodeName string) []*PodInfo
}

PodNominator abstracts operations to maintain nominated Pods.

type PodsToActivate added in v1.22.0

type PodsToActivate struct {
	// Mutex is embedded, so Lock/Unlock can be called on PodsToActivate directly.
	// NOTE(review): presumably it guards Map — confirm against callers.
	sync.Mutex
	// Map is keyed with namespaced pod name, and valued with the pod.
	Map map[string]*v1.Pod
}

PodsToActivate stores pods to be activated.

func NewPodsToActivate added in v1.22.0

func NewPodsToActivate() *PodsToActivate

NewPodsToActivate instantiates a PodsToActivate object.

func (*PodsToActivate) Clone added in v1.22.0

func (s *PodsToActivate) Clone() StateData

Clone just returns the same state.

type PostBindPlugin

type PostBindPlugin interface {
	Plugin
	// PostBind is called after a pod is successfully bound. These plugins are
	// informational. A common application of this extension point is for cleaning
	// up. If a plugin needs to clean-up its state after a pod is scheduled and
	// bound, PostBind is the extension point that it should register.
	PostBind(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string)
}

PostBindPlugin is an interface that must be implemented by "PostBind" plugins. These plugins are called after a pod is successfully bound to a node.

type PostFilterPlugin

type PostFilterPlugin interface {
	Plugin
	// PostFilter is called by the scheduling framework
	// when the scheduling cycle failed at PreFilter or Filter by Unschedulable or UnschedulableAndUnresolvable.
	// NodeToStatusReader has statuses that each Node got in PreFilter or Filter phase.
	//
	// If you're implementing a custom preemption with PostFilter, ignoring Nodes with UnschedulableAndUnresolvable is the responsibility of your plugin,
	// meaning NodeToStatusReader could have Nodes with UnschedulableAndUnresolvable
	// and the scheduling framework does call PostFilter plugins even when all Nodes in NodeToStatusReader are UnschedulableAndUnresolvable.
	//
	// A PostFilter plugin should return one of the following statuses:
	// - Unschedulable: the plugin gets executed successfully but the pod cannot be made schedulable.
	// - Success: the plugin gets executed successfully and the pod can be made schedulable.
	// - Error: the plugin aborts due to some internal error.
	//
	// Informational plugins should be configured ahead of other ones, and always return Unschedulable status.
	// Optionally, a non-nil PostFilterResult may be returned along with a Success status. For example,
	// a preemption plugin may choose to return nominatedNodeName, so that framework can reuse that to update the
	// preemptor pod's .status.nominatedNodeName field.
	PostFilter(ctx context.Context, state *CycleState, pod *v1.Pod, filteredNodeStatusMap NodeToStatusReader) (*PostFilterResult, *Status)
}

PostFilterPlugin is an interface for "PostFilter" plugins. These plugins are called after a pod cannot be scheduled.

type PostFilterResult

type PostFilterResult struct {
	// NominatingInfo is embedded; it carries the nominated node name (and mode)
	// that a PostFilter plugin, e.g. a preemption plugin, may return to the framework.
	*NominatingInfo
}

PostFilterResult wraps needed info for scheduler framework to act upon PostFilter phase.

func NewPostFilterResultWithNominatedNode added in v1.23.2

func NewPostFilterResultWithNominatedNode(name string) *PostFilterResult

type PreBindPlugin

type PreBindPlugin interface {
	Plugin
	// PreBind is called before binding a pod. All prebind plugins must return
	// success or the pod will be rejected and won't be sent for binding.
	PreBind(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) *Status
}

PreBindPlugin is an interface that must be implemented by "PreBind" plugins. These plugins are called before a pod is bound.

type PreEnqueuePlugin added in v1.26.0

type PreEnqueuePlugin interface {
	Plugin
	// PreEnqueue is called prior to adding Pods to activeQ.
	PreEnqueue(ctx context.Context, p *v1.Pod) *Status
}

PreEnqueuePlugin is an interface that must be implemented by "PreEnqueue" plugins. These plugins are called prior to adding Pods to activeQ. Note: a PreEnqueue plugin is expected to be lightweight and efficient, so it's not expected to involve expensive calls like accessing external endpoints; otherwise it'd block other Pods' enqueuing in event handlers.

type PreFilterExtensions

type PreFilterExtensions interface {
	// AddPod is called by the framework while trying to evaluate the impact
	// of adding podToAdd to the node while scheduling podToSchedule.
	AddPod(ctx context.Context, state *CycleState, podToSchedule *v1.Pod, podInfoToAdd *PodInfo, nodeInfo *NodeInfo) *Status
	// RemovePod is called by the framework while trying to evaluate the impact
	// of removing podToRemove from the node while scheduling podToSchedule.
	RemovePod(ctx context.Context, state *CycleState, podToSchedule *v1.Pod, podInfoToRemove *PodInfo, nodeInfo *NodeInfo) *Status
}

PreFilterExtensions is an interface that is included in plugins that allow specifying callbacks to make incremental updates to its supposedly pre-calculated state.

type PreFilterPlugin

type PreFilterPlugin interface {
	Plugin
	// PreFilter is called at the beginning of the scheduling cycle. All PreFilter
	// plugins must return success or the pod will be rejected. PreFilter could optionally
	// return a PreFilterResult to influence which nodes to evaluate downstream. This is useful
	// for cases where it is possible to determine the subset of nodes to process in O(1) time.
	// When PreFilterResult filters out some Nodes, the framework considers Nodes that are filtered out as getting "UnschedulableAndUnresolvable".
	// i.e., those Nodes will be out of the candidates of the preemption.
	//
	// When it returns Skip status, returned PreFilterResult and other fields in status are just ignored,
	// and coupled Filter plugin/PreFilterExtensions() will be skipped in this scheduling cycle.
	PreFilter(ctx context.Context, state *CycleState, p *v1.Pod) (*PreFilterResult, *Status)
	// PreFilterExtensions returns a PreFilterExtensions interface if the plugin implements one,
	// or nil if it does not. A Pre-filter plugin can provide extensions to incrementally
	// modify its pre-processed info. The framework guarantees that the extensions
	// AddPod/RemovePod will only be called after PreFilter, possibly on a cloned
	// CycleState, and may call those functions more than once before calling
	// Filter again on a specific node.
	PreFilterExtensions() PreFilterExtensions
}

PreFilterPlugin is an interface that must be implemented by "PreFilter" plugins. These plugins are called at the beginning of the scheduling cycle.

type PreFilterResult added in v1.24.0

type PreFilterResult struct {
	// The set of nodes that should be considered downstream; if nil then
	// all nodes are eligible.
	NodeNames sets.Set[string]
}

PreFilterResult wraps needed info for scheduler framework to act upon PreFilter phase.

func (*PreFilterResult) AllNodes added in v1.24.0

func (p *PreFilterResult) AllNodes() bool

func (*PreFilterResult) Merge added in v1.24.0

type PreScorePlugin

type PreScorePlugin interface {
	Plugin
	// PreScore is called by the scheduling framework after a list of nodes
	// passed the filtering phase. All prescore plugins must return success or
	// the pod will be rejected.
	// When it returns Skip status, other fields in status are just ignored,
	// and coupled Score plugin will be skipped in this scheduling cycle.
	PreScore(ctx context.Context, state *CycleState, pod *v1.Pod, nodes []*NodeInfo) *Status
}

PreScorePlugin is an interface for "PreScore" plugin. PreScore is an informational extension point. Plugins will be called with a list of nodes that passed the filtering phase. A plugin may use this data to update internal state or to generate logs/metrics.

type ProtocolPort

type ProtocolPort struct {
	// Protocol is the protocol half of the pair, e.g. "tcp".
	Protocol string
	// Port is the port half of the pair, e.g. 80.
	Port     int32
}

ProtocolPort represents a protocol port pair, e.g. tcp:80.

func NewProtocolPort

func NewProtocolPort(protocol string, port int32) *ProtocolPort

NewProtocolPort creates a ProtocolPort instance.

type QueueSortPlugin

type QueueSortPlugin interface {
	Plugin
	// Less is used to sort pods in the scheduling queue.
	Less(*QueuedPodInfo, *QueuedPodInfo) bool
}

QueueSortPlugin is an interface that must be implemented by "QueueSort" plugins. These plugins are used to sort pods in the scheduling queue. Only one queue sort plugin may be enabled at a time.

type QueuedPodInfo

type QueuedPodInfo struct {
	*PodInfo
	// The time the pod was added to the scheduling queue.
	Timestamp time.Time
	// Number of schedule attempts before successfully scheduled.
	// It's used to record the # attempts metric and calculate the backoff time this Pod is obliged to get before retrying.
	Attempts int
	// The time when the pod is added to the queue for the first time. The pod may be added
	// back to the queue multiple times before it's successfully scheduled.
	// It shouldn't be updated once initialized. It's used to record the e2e scheduling
	// latency for a pod.
	InitialAttemptTimestamp *time.Time
	// UnschedulablePlugins records the plugin names that the Pod failed with Unschedulable or UnschedulableAndUnresolvable status
	// at specific extension points: PreFilter, Filter, Reserve, Permit (WaitOnPermit), or PreBind.
	// If Pods are rejected at other extension points,
	// they're assumed to be unexpected errors (e.g., temporary network issue, plugin implementation issue, etc)
	// and retried soon after a backoff period.
	// That is because such failures could be solved regardless of incoming cluster events (registered in EventsToRegister).
	UnschedulablePlugins sets.Set[string]
	// PendingPlugins records the plugin names that the Pod failed with Pending status.
	PendingPlugins sets.Set[string]
	// Whether the Pod is scheduling gated (by PreEnqueuePlugins) or not.
	Gated bool
}

QueuedPodInfo is a Pod wrapper with additional information related to the pod's status in the scheduling queue, such as the timestamp when it's added to the queue.

func (*QueuedPodInfo) DeepCopy

func (pqi *QueuedPodInfo) DeepCopy() *QueuedPodInfo

DeepCopy returns a deep copy of the QueuedPodInfo object.

type QueueingHint added in v1.28.0

// QueueingHint is the hint returned by a QueueingHintFn for a cluster event.
type QueueingHint int
const (
	// QueueSkip implies that the cluster event has no impact on
	// scheduling of the pod.
	QueueSkip QueueingHint = iota

	// Queue implies that the Pod may be schedulable by the event.
	Queue
)

func (QueueingHint) String added in v1.28.0

func (s QueueingHint) String() string

type QueueingHintFn added in v1.28.0

type QueueingHintFn func(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (QueueingHint, error)

QueueingHintFn returns a hint that signals whether the event can make a Pod, which was rejected by this plugin in the past scheduling cycle, schedulable or not. It's called before a Pod gets moved from unschedulableQ to backoffQ or activeQ. If it returns an error, we'll take the returned QueueingHint as `Queue` at the caller whatever we returned here so that we can prevent the Pod from being stuck in the unschedulable pod pool.

- `pod`: the Pod to be enqueued, which was rejected by this plugin in the past.
- `oldObj` `newObj`: the object involved in that event.

  • For example, the given event is "Node deleted", the `oldObj` will be that deleted Node.
  • `oldObj` is nil if the event is add event.
  • `newObj` is nil if the event is delete event.

type ReservePlugin

type ReservePlugin interface {
	Plugin
	// Reserve is called by the scheduling framework when the scheduler cache is
	// updated. If this method returns a failed Status, the scheduler will call
	// the Unreserve method for all enabled ReservePlugins.
	Reserve(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) *Status
	// Unreserve is called by the scheduling framework when a reserved pod was
	// rejected, an error occurred during reservation of subsequent plugins, or
	// in a later phase. The Unreserve method implementation must be idempotent
	// and may be called by the scheduler even if the corresponding Reserve
	// method for the same plugin was not called.
	Unreserve(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string)
}

ReservePlugin is an interface for plugins with Reserve and Unreserve methods. These are meant to update the state of the plugin. This concept used to be called 'assume' in the original scheduler. These plugins should return only Success or Error in Status.code. However, the scheduler accepts other valid codes as well. Anything other than Success will lead to rejection of the pod.

type Resource

type Resource struct {
	MilliCPU         int64
	Memory           int64
	EphemeralStorage int64
	// We store allowedPodNumber (which is Node.Status.Allocatable.Pods().Value())
	// explicitly as int, to avoid conversions and improve performance.
	AllowedPodNumber int
	// ScalarResources maps a resource name to its quantity as an int64.
	// NOTE(review): presumably holds resources other than the dedicated fields above — confirm.
	ScalarResources map[v1.ResourceName]int64
}

Resource is a collection of compute resources.

func NewResource

func NewResource(rl v1.ResourceList) *Resource

NewResource creates a Resource from ResourceList

func (*Resource) Add

func (r *Resource) Add(rl v1.ResourceList)

Add adds ResourceList into Resource.

func (*Resource) AddScalar

func (r *Resource) AddScalar(name v1.ResourceName, quantity int64)

AddScalar adds a resource by a scalar value of this resource.

func (*Resource) Clone

func (r *Resource) Clone() *Resource

Clone returns a copy of this resource.

func (*Resource) SetMaxResource

func (r *Resource) SetMaxResource(rl v1.ResourceList)

SetMaxResource compares with ResourceList and takes max value for each Resource.

func (*Resource) SetScalar

func (r *Resource) SetScalar(name v1.ResourceName, quantity int64)

SetScalar sets a resource by a scalar value of this resource.

type ResourceClaimTracker added in v1.32.0

type ResourceClaimTracker interface {
	// List lists ResourceClaims. The result is guaranteed to immediately include any changes made via AssumeClaimAfterAPICall()
	// and SignalClaimPendingAllocation().
	List() ([]*resourceapi.ResourceClaim, error)
	// Get works like List(), but for a single claim.
	Get(namespace, claimName string) (*resourceapi.ResourceClaim, error)
	// ListAllAllocatedDevices lists all allocated Devices from allocated ResourceClaims. The result is guaranteed to immediately include
	// any changes made via AssumeClaimAfterAPICall() and SignalClaimPendingAllocation().
	ListAllAllocatedDevices() (sets.Set[structured.DeviceID], error)

	// SignalClaimPendingAllocation signals to the tracker that the given ResourceClaim will be allocated via an API call in the
	// binding phase. This change is immediately reflected in the result of List() and the other accessors.
	SignalClaimPendingAllocation(claimUID types.UID, allocatedClaim *resourceapi.ResourceClaim) error
	// ClaimHasPendingAllocation answers whether a given claim has a pending allocation during the binding phase. It can be used to avoid
	// race conditions in subsequent scheduling phases.
	ClaimHasPendingAllocation(claimUID types.UID) bool
	// RemoveClaimPendingAllocation removes the pending allocation for the given ResourceClaim from the tracker if any was signaled via
	// SignalClaimPendingAllocation(). Returns whether there was a pending allocation to remove. List() and the other accessors immediately
	// stop reflecting the pending allocation in the results.
	RemoveClaimPendingAllocation(claimUID types.UID) (deleted bool)

	// AssumeClaimAfterAPICall signals to the tracker that an API call modifying the given ResourceClaim was made in the binding phase, and the
	// changes should be reflected in informers very soon. This change is immediately reflected in the result of List() and the other accessors.
	// This mechanism can be used to avoid race conditions between the informer update and subsequent scheduling phases.
	AssumeClaimAfterAPICall(claim *resourceapi.ResourceClaim) error
	// AssumedClaimRestore signals to the tracker that something went wrong with the API call modifying the given ResourceClaim, and
	// the changes won't be reflected in informers after all. List() and the other accessors immediately stop reflecting the assumed change,
	// and go back to the informer version.
	AssumedClaimRestore(namespace, claimName string)
}

ResourceClaimTracker can be used to obtain ResourceClaims, and track changes to ResourceClaims in-memory.

If the claims are meant to be allocated in the API during the binding phase (when used by scheduler), the tracker helps avoid race conditions between scheduling and binding phases (as well as between the binding phase and the informer cache update).

If the binding phase is not run (e.g. when used by Cluster Autoscaler which only runs the scheduling phase, and simulates binding in-memory), the tracker allows the framework user to obtain the claim allocations produced by the DRA plugin, and persist them outside of the API (e.g. in-memory).

type ResourceSliceLister added in v1.32.0

type ResourceSliceLister interface {
	// List returns a list of all ResourceSlices.
	List() ([]*resourceapi.ResourceSlice, error)
}

ResourceSliceLister can be used to obtain ResourceSlices.

type ScoreExtensions

type ScoreExtensions interface {
	// NormalizeScore is called for all node scores produced by the same plugin's "Score"
	// method. A successful run of NormalizeScore will update the scores list and return
	// a success status.
	NormalizeScore(ctx context.Context, state *CycleState, p *v1.Pod, scores NodeScoreList) *Status
}

ScoreExtensions is an interface for Score extended functionality.

type ScorePlugin

type ScorePlugin interface {
	Plugin
	// Score is called on each filtered node. It must return success and an integer
	// indicating the rank of the node. All scoring plugins must return success or
	// the pod will be rejected.
	Score(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) (int64, *Status)

	// ScoreExtensions returns a ScoreExtensions interface if the plugin implements one, or nil if it does not.
	ScoreExtensions() ScoreExtensions
}

ScorePlugin is an interface that must be implemented by "Score" plugins to rank nodes that passed the filtering phase.

type SharedDRAManager added in v1.32.0

type SharedDRAManager interface {
	// ResourceClaims returns the ResourceClaimTracker.
	ResourceClaims() ResourceClaimTracker
	// ResourceSlices returns the ResourceSliceLister.
	ResourceSlices() ResourceSliceLister
	// DeviceClasses returns the DeviceClassLister.
	DeviceClasses() DeviceClassLister
}

SharedDRAManager can be used to obtain DRA objects, and track modifications to them in-memory - mainly by the DRA plugin. The plugin's default implementation obtains the objects from the API. A different implementation can be plugged into the framework in order to simulate the state of DRA objects. For example, Cluster Autoscaler can use this to provide the correct DRA object state to the DRA plugin when simulating scheduling changes in-memory.

type SharedLister

type SharedLister interface {
	// NodeInfos returns the NodeInfoLister.
	NodeInfos() NodeInfoLister
	// StorageInfos returns the StorageInfoLister.
	StorageInfos() StorageInfoLister
}

SharedLister groups scheduler-specific listers.

type StateData

type StateData interface {
	// Clone makes a copy of StateData. For performance reasons, Clone should
	// make shallow copies of members (e.g., slices or maps) that are not
	// impacted by PreFilter's optional AddPod/RemovePod methods.
	Clone() StateData
}

StateData is a generic type for arbitrary data stored in CycleState.

type StateKey

type StateKey string

StateKey is the type of keys stored in CycleState.

var PodsToActivateKey StateKey = "kubernetes.io/pods-to-activate"

PodsToActivateKey is a reserved state key for stashing pods. If the stashed pods are present in unschedulablePods or backoffQ, they will be activated (i.e., moved to activeQ) in two phases: (1) at the end of a scheduling cycle if it succeeds (they will be cleared from `PodsToActivate` if activated); (2) at the end of a binding cycle if it succeeds.

type Status

type Status struct {
	// contains filtered or unexported fields
}

Status indicates the result of running a plugin. It consists of a code, a message, (optionally) an error, and the name of the plugin that caused the failure. When the status code is not Success, the reasons should explain why. When the code is Success, all the other fields should be empty. NOTE: A nil Status is also considered as Success.

func AsStatus

func AsStatus(err error) *Status

AsStatus wraps an error in a Status.

func NewStatus

func NewStatus(code Code, reasons ...string) *Status

NewStatus makes a Status out of the given arguments and returns its pointer.

func (*Status) AppendReason

func (s *Status) AppendReason(reason string)

AppendReason appends given reason to the Status.

func (*Status) AsError

func (s *Status) AsError() error

AsError returns nil if the status is a success, a wait or a skip; otherwise it returns an "error" object whose message is the concatenation of the reasons of the Status.

func (*Status) Code

func (s *Status) Code() Code

Code returns code of the Status.

func (*Status) Equal added in v1.21.0

func (s *Status) Equal(x *Status) bool

Equal checks equality of two statuses. This is useful for testing with cmp.Equal.

func (*Status) IsRejected added in v1.29.0

func (s *Status) IsRejected() bool

IsRejected returns true if the Code of "Status" is one of the unschedulable codes (Unschedulable, UnschedulableAndUnresolvable, or Pending).

func (*Status) IsSkip added in v1.25.0

func (s *Status) IsSkip() bool

IsSkip returns true if and only if "Status" is non-nil and its Code is "Skip".

func (*Status) IsSuccess

func (s *Status) IsSuccess() bool

IsSuccess returns true if and only if "Status" is nil or Code is "Success".

func (*Status) IsWait added in v1.25.0

func (s *Status) IsWait() bool

IsWait returns true if and only if "Status" is non-nil and its Code is "Wait".

func (*Status) Message

func (s *Status) Message() string

Message returns the reasons of the Status concatenated into a single message.

func (*Status) Plugin added in v1.29.0

func (s *Status) Plugin() string

Plugin returns the plugin name which caused this status.

func (*Status) Reasons

func (s *Status) Reasons() []string

Reasons returns reasons of the Status.

func (*Status) SetPlugin added in v1.29.0

func (s *Status) SetPlugin(plugin string)

SetPlugin sets the given plugin name to s.plugin.

func (*Status) String added in v1.30.0

func (s *Status) String() string

func (*Status) WithError added in v1.27.0

func (s *Status) WithError(err error) *Status

func (*Status) WithPlugin added in v1.29.0

func (s *Status) WithPlugin(plugin string) *Status

WithPlugin sets the given plugin name to s.plugin, and returns the given status object.

type StorageInfoLister added in v1.25.0

type StorageInfoLister interface {
	// IsPVCUsedByPods reports whether the PVC identified by key is used by one or
	// more scheduled pods. The key is in the format "namespace/name".
	IsPVCUsedByPods(key string) bool
}

StorageInfoLister interface represents anything that handles storage-related operations and resources.

type WaitingPod

type WaitingPod interface {
	// GetPod returns a reference to the waiting pod.
	GetPod() *v1.Pod
	// GetPendingPlugins returns the names of the pending Permit plugins.
	GetPendingPlugins() []string
	// Allow declares that the waiting pod is allowed to be scheduled by the plugin named "pluginName".
	// If this is the last remaining plugin to allow, then a success signal is delivered
	// to unblock the pod.
	Allow(pluginName string)
	// Reject declares the waiting pod unschedulable.
	Reject(pluginName, msg string)
}

WaitingPod represents a pod currently waiting in the permit phase.

type WeightedAffinityTerm

type WeightedAffinityTerm struct {
	AffinityTerm
	Weight int32
}

WeightedAffinityTerm is a "processed" representation of v1.WeightedAffinityTerm.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL