disruption

package
v0.37.3
Warning

This package is not in the latest version of its module.

Go to latest
Published: Sep 5, 2024 License: Apache-2.0 Imports: 41 Imported by: 0

Documentation

Index

Constants

View Source
const MinInstanceTypesForSpotToSpotConsolidation = 15

MinInstanceTypesForSpotToSpotConsolidation is the minimum number of instanceTypes in a NodeClaim needed to trigger spot-to-spot single-node consolidation

View Source
const MultiNodeConsolidationTimeoutDuration = 1 * time.Minute
View Source
const SingleNodeConsolidationTimeoutDuration = 3 * time.Minute

Variables

View Source
var (
	EvaluationDurationHistogram = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: metrics.Namespace,
			Subsystem: disruptionSubsystem,
			Name:      "evaluation_duration_seconds",
			Help:      "Duration of the disruption evaluation process in seconds. Labeled by method and consolidation type.",
			Buckets:   metrics.DurationBuckets(),
		},
		[]string{methodLabel, consolidationTypeLabel},
	)
	ActionsPerformedCounter = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: metrics.Namespace,
			Subsystem: disruptionSubsystem,
			Name:      "actions_performed_total",
			Help:      "Number of disruption actions performed. Labeled by disruption action, method, and consolidation type.",
		},
		[]string{actionLabel, methodLabel, consolidationTypeLabel},
	)
	NodesDisruptedCounter = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: metrics.Namespace,
			Subsystem: disruptionSubsystem,
			Name:      "nodes_disrupted_total",
			Help:      "Total number of nodes disrupted. Labeled by NodePool, disruption action, method, and consolidation type.",
		},
		[]string{metrics.NodePoolLabel, actionLabel, methodLabel, consolidationTypeLabel},
	)
	PodsDisruptedCounter = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: metrics.Namespace,
			Subsystem: disruptionSubsystem,
			Name:      "pods_disrupted_total",
			Help:      "Total number of reschedulable pods disrupted on nodes. Labeled by NodePool, disruption action, method, and consolidation type.",
		},
		[]string{metrics.NodePoolLabel, actionLabel, methodLabel, consolidationTypeLabel},
	)
	EligibleNodesGauge = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: metrics.Namespace,
			Subsystem: disruptionSubsystem,
			Name:      "eligible_nodes",
			Help:      "Number of nodes eligible for disruption by Karpenter. Labeled by disruption method and consolidation type.",
		},
		[]string{methodLabel, consolidationTypeLabel},
	)
	ConsolidationTimeoutTotalCounter = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: metrics.Namespace,
			Subsystem: disruptionSubsystem,
			Name:      "consolidation_timeouts_total",
			Help:      "Number of times the Consolidation algorithm has reached a timeout. Labeled by consolidation type.",
		},
		[]string{consolidationTypeLabel},
	)
	BudgetsAllowedDisruptionsGauge = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: metrics.Namespace,
			Subsystem: disruptionSubsystem,
			Name:      "budgets_allowed_disruptions",
			Help:      "The number of nodes for a given NodePool that can be disrupted at a point in time. Labeled by NodePool. Note that allowed disruptions can change very rapidly, as new nodes may be created and others may be deleted at any point.",
		},
		[]string{metrics.NodePoolLabel},
	)
)

Functions

func BuildDisruptionBudgets added in v0.34.0

func BuildDisruptionBudgets(ctx context.Context, cluster *state.Cluster, clk clock.Clock, kubeClient client.Client, recorder events.Recorder) (map[string]int, error)

BuildDisruptionBudgets will return a map for nodePoolName -> numAllowedDisruptions and an error

func BuildNodePoolMap added in v0.34.0

func BuildNodePoolMap(ctx context.Context, kubeClient client.Client, cloudProvider cloudprovider.CloudProvider) (map[string]*v1beta1.NodePool, map[string]map[string]*cloudprovider.InstanceType, error)

BuildNodePoolMap builds a nodePoolName -> nodePool map and a nodePoolName -> instanceTypeName -> instance type map

func GetPodEvictionCost

func GetPodEvictionCost(ctx context.Context, p *v1.Pod) float64

GetPodEvictionCost returns the disruption cost computed for evicting the given pod.

func IsValidationError added in v0.36.1

func IsValidationError(err error) bool

func MakeConsolidation added in v0.34.0

func MakeConsolidation(clock clock.Clock, cluster *state.Cluster, kubeClient client.Client, provisioner *provisioning.Provisioner,
	cloudProvider cloudprovider.CloudProvider, recorder events.Recorder, queue *orchestration.Queue) consolidation

func SimulateScheduling added in v0.34.0

func SimulateScheduling(ctx context.Context, kubeClient client.Client, cluster *state.Cluster, provisioner *provisioning.Provisioner,
	candidates ...*Candidate,
) (pscheduling.Results, error)

Types

type Action

type Action string
var (
	NoOpAction    Action = "no-op"
	ReplaceAction Action = "replace"
	DeleteAction  Action = "delete"
)

type Candidate

type Candidate struct {
	*state.StateNode
	// contains filtered or unexported fields
}

Candidate is a state.StateNode that we are considering for disruption along with extra information to be used in making that determination

func GetCandidates

func GetCandidates(ctx context.Context, cluster *state.Cluster, kubeClient client.Client, recorder events.Recorder, clk clock.Clock,
	cloudProvider cloudprovider.CloudProvider, shouldDeprovision CandidateFilter, queue *orchestration.Queue,
) ([]*Candidate, error)

GetCandidates returns the nodes that currently appear to be disruptable based on their NodePool

func NewCandidate

func NewCandidate(ctx context.Context, kubeClient client.Client, recorder events.Recorder, clk clock.Clock, node *state.StateNode, pdbs *PDBLimits,
	nodePoolMap map[string]*v1beta1.NodePool, nodePoolToInstanceTypesMap map[string]map[string]*cloudprovider.InstanceType, queue *orchestration.Queue) (*Candidate, error)

type CandidateFilter

type CandidateFilter func(context.Context, *Candidate) bool

type Command

type Command struct {
	// contains filtered or unexported fields
}

func (Command) Action

func (c Command) Action() Action

func (Command) String

func (c Command) String() string

type Controller

type Controller struct {
	// contains filtered or unexported fields
}

func NewController

func NewController(clk clock.Clock, kubeClient client.Client, provisioner *provisioning.Provisioner,
	cp cloudprovider.CloudProvider, recorder events.Recorder, cluster *state.Cluster, queue *orchestration.Queue,
) *Controller

func (*Controller) Reconcile

func (c *Controller) Reconcile(ctx context.Context, _ reconcile.Request) (reconcile.Result, error)

func (*Controller) Register added in v0.37.0

func (c *Controller) Register(_ context.Context, m manager.Manager) error

type Drift

type Drift struct {
	// contains filtered or unexported fields
}

Drift is a subreconciler that deletes drifted candidates.

func NewDrift

func NewDrift(kubeClient client.Client, cluster *state.Cluster, provisioner *provisioning.Provisioner, recorder events.Recorder) *Drift

func (*Drift) ComputeCommand

func (d *Drift) ComputeCommand(ctx context.Context, disruptionBudgetMapping map[string]int, candidates ...*Candidate) (Command, scheduling.Results, error)

ComputeCommand generates a disruption command given candidates

func (*Drift) ConsolidationType

func (d *Drift) ConsolidationType() string

func (*Drift) ShouldDisrupt

func (d *Drift) ShouldDisrupt(ctx context.Context, c *Candidate) bool

ShouldDisrupt is a predicate used to filter candidates

func (*Drift) Type

func (d *Drift) Type() string

type Emptiness

type Emptiness struct {
	// contains filtered or unexported fields
}

Emptiness is a subreconciler that deletes empty candidates. Emptiness will respect TTLSecondsAfterEmpty

func NewEmptiness

func NewEmptiness(clk clock.Clock, recorder events.Recorder) *Emptiness

func (*Emptiness) ComputeCommand

func (e *Emptiness) ComputeCommand(_ context.Context, disruptionBudgetMapping map[string]int, candidates ...*Candidate) (Command, scheduling.Results, error)

ComputeCommand generates a disruption command given candidates

func (*Emptiness) ConsolidationType

func (e *Emptiness) ConsolidationType() string

func (*Emptiness) ShouldDisrupt

func (e *Emptiness) ShouldDisrupt(_ context.Context, c *Candidate) bool

ShouldDisrupt is a predicate used to filter candidates

func (*Emptiness) Type

func (e *Emptiness) Type() string

type EmptyNodeConsolidation

type EmptyNodeConsolidation struct {
	// contains filtered or unexported fields
}

EmptyNodeConsolidation is the consolidation controller that performs multi-nodeclaim consolidation of entirely empty nodes

func NewEmptyNodeConsolidation

func NewEmptyNodeConsolidation(consolidation consolidation) *EmptyNodeConsolidation

func (*EmptyNodeConsolidation) ComputeCommand

func (c *EmptyNodeConsolidation) ComputeCommand(ctx context.Context, disruptionBudgetMapping map[string]int, candidates ...*Candidate) (Command, scheduling.Results, error)

ComputeCommand generates a disruption command given candidates

func (*EmptyNodeConsolidation) ConsolidationType

func (c *EmptyNodeConsolidation) ConsolidationType() string

func (*EmptyNodeConsolidation) IsConsolidated added in v0.34.0

func (c *EmptyNodeConsolidation) IsConsolidated() bool

IsConsolidated returns true if nothing has changed since markConsolidated was called.

func (*EmptyNodeConsolidation) ShouldDisrupt

func (c *EmptyNodeConsolidation) ShouldDisrupt(_ context.Context, cn *Candidate) bool

ShouldDisrupt is a predicate used to filter candidates

func (*EmptyNodeConsolidation) Type

func (c *EmptyNodeConsolidation) Type() string

type Expiration

type Expiration struct {
	// contains filtered or unexported fields
}

Expiration is a subreconciler that deletes expired candidates. Expiration will respect the NodePool's expireAfter setting

func NewExpiration

func NewExpiration(clk clock.Clock, kubeClient client.Client, cluster *state.Cluster, provisioner *provisioning.Provisioner, recorder events.Recorder) *Expiration

func (*Expiration) ComputeCommand

func (e *Expiration) ComputeCommand(ctx context.Context, disruptionBudgetMapping map[string]int, candidates ...*Candidate) (Command, scheduling.Results, error)

ComputeCommand generates a disruption command given candidates

func (*Expiration) ConsolidationType

func (e *Expiration) ConsolidationType() string

func (*Expiration) ShouldDisrupt

func (e *Expiration) ShouldDisrupt(_ context.Context, c *Candidate) bool

ShouldDisrupt is a predicate used to filter candidates

func (*Expiration) Type

func (e *Expiration) Type() string

type Method

type Method interface {
	ShouldDisrupt(context.Context, *Candidate) bool
	ComputeCommand(context.Context, map[string]int, ...*Candidate) (Command, scheduling.Results, error)
	Type() string
	ConsolidationType() string
}

type MultiNodeConsolidation

type MultiNodeConsolidation struct {
	// contains filtered or unexported fields
}

func NewMultiNodeConsolidation

func NewMultiNodeConsolidation(consolidation consolidation) *MultiNodeConsolidation

func (*MultiNodeConsolidation) ComputeCommand

func (m *MultiNodeConsolidation) ComputeCommand(ctx context.Context, disruptionBudgetMapping map[string]int, candidates ...*Candidate) (Command, scheduling.Results, error)

func (*MultiNodeConsolidation) ConsolidationType

func (m *MultiNodeConsolidation) ConsolidationType() string

func (*MultiNodeConsolidation) IsConsolidated added in v0.34.0

func (c *MultiNodeConsolidation) IsConsolidated() bool

IsConsolidated returns true if nothing has changed since markConsolidated was called.

func (*MultiNodeConsolidation) ShouldDisrupt

func (c *MultiNodeConsolidation) ShouldDisrupt(_ context.Context, cn *Candidate) bool

ShouldDisrupt is a predicate used to filter candidates

func (*MultiNodeConsolidation) Type

func (m *MultiNodeConsolidation) Type() string

type PDBLimits

type PDBLimits struct {
	// contains filtered or unexported fields
}

PDBLimits is used to evaluate if evicting a list of pods is possible.

func NewPDBLimits

func NewPDBLimits(ctx context.Context, clk clock.Clock, kubeClient client.Client) (*PDBLimits, error)

func (*PDBLimits) CanEvictPods

func (s *PDBLimits) CanEvictPods(pods []*v1.Pod) (client.ObjectKey, bool)

CanEvictPods returns true if every pod in the list is evictable. They may not all be evictable simultaneously, but for every PDB that controls the pods at least one pod can be evicted.

type SingleNodeConsolidation

type SingleNodeConsolidation struct {
	// contains filtered or unexported fields
}

SingleNodeConsolidation is the consolidation controller that performs single-node consolidation.

func NewSingleNodeConsolidation

func NewSingleNodeConsolidation(consolidation consolidation) *SingleNodeConsolidation

func (*SingleNodeConsolidation) ComputeCommand

func (s *SingleNodeConsolidation) ComputeCommand(ctx context.Context, disruptionBudgetMapping map[string]int, candidates ...*Candidate) (Command, scheduling.Results, error)

ComputeCommand generates a disruption command given candidates

func (*SingleNodeConsolidation) ConsolidationType

func (s *SingleNodeConsolidation) ConsolidationType() string

func (*SingleNodeConsolidation) IsConsolidated added in v0.34.0

func (c *SingleNodeConsolidation) IsConsolidated() bool

IsConsolidated returns true if nothing has changed since markConsolidated was called.

func (*SingleNodeConsolidation) ShouldDisrupt

func (c *SingleNodeConsolidation) ShouldDisrupt(_ context.Context, cn *Candidate) bool

ShouldDisrupt is a predicate used to filter candidates

func (*SingleNodeConsolidation) Type

func (s *SingleNodeConsolidation) Type() string

type UninitializedNodeError added in v0.37.0

type UninitializedNodeError struct {
	*pscheduling.ExistingNode
}

UninitializedNodeError tracks a special pod error for disruption where pods schedule to a node that hasn't been initialized yet, meaning that we cannot confidently make a disruption decision based on it

func NewUninitializedNodeError added in v0.37.0

func NewUninitializedNodeError(node *pscheduling.ExistingNode) *UninitializedNodeError

func (*UninitializedNodeError) Error added in v0.37.0

func (u *UninitializedNodeError) Error() string

type Validation

type Validation struct {
	// contains filtered or unexported fields
}

Validation is used to perform validation on a consolidation command. It makes an assumption that when re-used, all of the commands passed to IsValid were constructed based off of the same consolidation state. This allows it to skip the validation TTL for all but the first command.

func NewValidation

func NewValidation(clk clock.Clock, cluster *state.Cluster, kubeClient client.Client, provisioner *provisioning.Provisioner,
	cp cloudprovider.CloudProvider, recorder events.Recorder, queue *orchestration.Queue) *Validation

func (*Validation) IsValid

func (v *Validation) IsValid(ctx context.Context, cmd Command, validationPeriod time.Duration) error

func (*Validation) ShouldDisrupt

func (v *Validation) ShouldDisrupt(_ context.Context, c *Candidate) bool

ShouldDisrupt is a predicate used to filter candidates

func (*Validation) ValidateCandidates added in v0.36.1

func (v *Validation) ValidateCandidates(ctx context.Context, candidates ...*Candidate) ([]*Candidate, error)

ValidateCandidates gets the current representation of the provided candidates and ensures that they are all still valid. For a candidate to still be valid, the following conditions must be met:

a. It must pass the global candidate filtering logic (no blocking PDBs, no do-not-disrupt annotation, etc)
b. It must not have any pods nominated for it
c. It must still be disruptable without violating node disruption budgets

If these conditions are met for all candidates, ValidateCandidates returns a slice with the updated representations.

func (*Validation) ValidateCommand

func (v *Validation) ValidateCommand(ctx context.Context, cmd Command, candidates []*Candidate) error

ValidateCommand validates a command for a Method

type ValidationError added in v0.36.1

type ValidationError struct {
	// contains filtered or unexported fields
}

func NewValidationError added in v0.36.1

func NewValidationError(err error) *ValidationError

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL