disruption

package
v1.0.2 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Sep 26, 2024 License: Apache-2.0 Imports: 41 Imported by: 0

Documentation

Index

Constants

View Source
// Disruption class names. The two classes differ in whether blocking pod PDBs
// and the do-not-disrupt annotation are honored, as described on each value.
const (
	GracefulDisruptionClass = "graceful" // graceful disruption always respects blocking pod PDBs and the do-not-disrupt annotation
	EventualDisruptionClass = "eventual" // eventual disruption is bounded by a NodePool's TerminationGracePeriod, regardless of blocking pod PDBs and the do-not-disrupt annotation
)
View Source
// MinInstanceTypesForSpotToSpotConsolidation is the minimum number of instance
// types a NodeClaim needs in order to trigger spot-to-spot single-node
// consolidation.
const MinInstanceTypesForSpotToSpotConsolidation = 15

MinInstanceTypesForSpotToSpotConsolidation is the minimum number of instanceTypes in a NodeClaim needed to trigger spot-to-spot single-node consolidation

View Source
// MultiNodeConsolidationTimeoutDuration is a duration of one minute.
// NOTE(review): presumably the time budget for one multi-node consolidation
// pass; confirm at the point of use.
const MultiNodeConsolidationTimeoutDuration = 1 * time.Minute
View Source
// SingleNodeConsolidationTimeoutDuration is a duration of three minutes.
// NOTE(review): presumably the time budget for one single-node consolidation
// pass; confirm at the point of use.
const SingleNodeConsolidationTimeoutDuration = 3 * time.Minute

Variables

View Source
// Prometheus collectors exported by this package. All of them use
// metrics.Namespace; all but NodePoolAllowedDisruptions use
// voluntaryDisruptionSubsystem as their subsystem.
var (
	// EvaluationDurationSeconds is a histogram of how long the disruption
	// decision evaluation process takes, in seconds, using the buckets from
	// metrics.DurationBuckets(). Label names: metrics.ReasonLabel and
	// consolidationTypeLabel.
	// NOTE(review): the Help text says "Labeled by method", but the first
	// label declared here is metrics.ReasonLabel; confirm which wording is
	// intended.
	EvaluationDurationSeconds = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: metrics.Namespace,
			Subsystem: voluntaryDisruptionSubsystem,
			Name:      "decision_evaluation_duration_seconds",
			Help:      "Duration of the disruption decision evaluation process in seconds. Labeled by method and consolidation type.",
			Buckets:   metrics.DurationBuckets(),
		},
		[]string{metrics.ReasonLabel, consolidationTypeLabel},
	)
	// DecisionsPerformedTotal is a counter of disruption decisions performed.
	// Label names: decisionLabel, metrics.ReasonLabel, and
	// consolidationTypeLabel.
	DecisionsPerformedTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: metrics.Namespace,
			Subsystem: voluntaryDisruptionSubsystem,
			Name:      "decisions_total",
			Help:      "Number of disruption decisions performed. Labeled by disruption decision, reason, and consolidation type.",
		},
		[]string{decisionLabel, metrics.ReasonLabel, consolidationTypeLabel},
	)
	// EligibleNodes is a gauge of the number of nodes eligible for disruption.
	// Label name: metrics.ReasonLabel.
	EligibleNodes = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: metrics.Namespace,
			Subsystem: voluntaryDisruptionSubsystem,
			Name:      "eligible_nodes",
			Help:      "Number of nodes eligible for disruption by Karpenter. Labeled by disruption reason.",
		},
		[]string{metrics.ReasonLabel},
	)
	// ConsolidationTimeoutsTotal is a counter of how many times the
	// consolidation algorithm has reached a timeout. Label name:
	// consolidationTypeLabel.
	ConsolidationTimeoutsTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: metrics.Namespace,
			Subsystem: voluntaryDisruptionSubsystem,
			Name:      "consolidation_timeouts_total",
			Help:      "Number of times the Consolidation algorithm has reached a timeout. Labeled by consolidation type.",
		},
		[]string{consolidationTypeLabel},
	)
	// NodePoolAllowedDisruptions is a gauge of how many nodes of a NodePool may
	// be disrupting concurrently at a point in time. Unlike the collectors
	// above, its subsystem is metrics.NodePoolSubsystem. Label names:
	// metrics.NodePoolLabel and metrics.ReasonLabel.
	// NOTE(review): the Help text mentions only the NodePool label, while
	// metrics.ReasonLabel is also declared here; confirm the Help wording.
	NodePoolAllowedDisruptions = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: metrics.Namespace,
			Subsystem: metrics.NodePoolSubsystem,
			Name:      "allowed_disruptions",
			Help:      "The number of nodes for a given NodePool that can be concurrently disrupting at a point in time. Labeled by NodePool. Note that allowed disruptions can change very rapidly, as new nodes may be created and others may be deleted at any point.",
		},
		[]string{metrics.NodePoolLabel, metrics.ReasonLabel},
	)
)

Functions

func BuildDisruptionBudgets added in v0.34.0

func BuildDisruptionBudgets(ctx context.Context, cluster *state.Cluster, clk clock.Clock, kubeClient client.Client, recorder events.Recorder) (map[string]map[v1.DisruptionReason]int, error)

BuildDisruptionBudgets prepares our disruption budget mapping. The disruption budget maps each disruption reason to the number of allowed disruptions. We calculate allowed disruptions by taking the max disruptions allowed by disruption reason and subtracting the number of nodes that are NotReady and already being deleted by that disruption reason.

func BuildNodePoolMap added in v0.34.0

func BuildNodePoolMap(ctx context.Context, kubeClient client.Client, cloudProvider cloudprovider.CloudProvider) (map[string]*v1.NodePool, map[string]map[string]*cloudprovider.InstanceType, error)

BuildNodePoolMap builds a NodePool name -> NodePool map and a NodePool name -> instance type name -> instance type map

func IsValidationError added in v0.36.1

func IsValidationError(err error) bool

func MakeConsolidation added in v0.34.0

func MakeConsolidation(clock clock.Clock, cluster *state.Cluster, kubeClient client.Client, provisioner *provisioning.Provisioner,
	cloudProvider cloudprovider.CloudProvider, recorder events.Recorder, queue *orchestration.Queue) consolidation

func SimulateScheduling added in v0.34.0

func SimulateScheduling(ctx context.Context, kubeClient client.Client, cluster *state.Cluster, provisioner *provisioning.Provisioner,
	candidates ...*Candidate,
) (pscheduling.Results, error)

Types

type Candidate

type Candidate struct {
	*state.StateNode
	// contains filtered or unexported fields
}

Candidate is a state.StateNode that we are considering for disruption along with extra information to be used in making that determination

func GetCandidates

func GetCandidates(ctx context.Context, cluster *state.Cluster, kubeClient client.Client, recorder events.Recorder, clk clock.Clock,
	cloudProvider cloudprovider.CloudProvider, shouldDeprovision CandidateFilter, disruptionClass string, queue *orchestration.Queue,
) ([]*Candidate, error)

GetCandidates returns nodes that currently appear to be disruptable based on their NodePool

func NewCandidate

func NewCandidate(ctx context.Context, kubeClient client.Client, recorder events.Recorder, clk clock.Clock, node *state.StateNode, pdbs pdb.Limits,
	nodePoolMap map[string]*v1.NodePool, nodePoolToInstanceTypesMap map[string]map[string]*cloudprovider.InstanceType, queue *orchestration.Queue, disruptionClass string) (*Candidate, error)

type CandidateFilter

// CandidateFilter is a predicate over a Candidate. GetCandidates accepts one as
// its shouldDeprovision parameter.
type CandidateFilter func(context.Context, *Candidate) bool

type Command

type Command struct {
	// contains filtered or unexported fields
}

func (Command) Decision added in v1.0.0

func (c Command) Decision() Decision

func (Command) String

func (c Command) String() string

type Controller

type Controller struct {
	// contains filtered or unexported fields
}

func NewController

func NewController(clk clock.Clock, kubeClient client.Client, provisioner *provisioning.Provisioner,
	cp cloudprovider.CloudProvider, recorder events.Recorder, cluster *state.Cluster, queue *orchestration.Queue,
) *Controller

func (*Controller) Reconcile

func (c *Controller) Reconcile(ctx context.Context) (reconcile.Result, error)

func (*Controller) Register added in v0.37.0

func (c *Controller) Register(_ context.Context, m manager.Manager) error

type Decision added in v1.0.0

// Decision is a string type; Command.Decision returns a value of this type.
type Decision string
// The Decision values declared by this package.
var (
	NoOpDecision    Decision = "no-op"
	ReplaceDecision Decision = "replace"
	DeleteDecision  Decision = "delete"
)

type Drift

type Drift struct {
	// contains filtered or unexported fields
}

Drift is a subreconciler that deletes drifted candidates.

func NewDrift

func NewDrift(kubeClient client.Client, cluster *state.Cluster, provisioner *provisioning.Provisioner, recorder events.Recorder) *Drift

func (*Drift) Class added in v1.0.0

func (d *Drift) Class() string

func (*Drift) ComputeCommand

func (d *Drift) ComputeCommand(ctx context.Context, disruptionBudgetMapping map[string]map[v1.DisruptionReason]int, candidates ...*Candidate) (Command, scheduling.Results, error)

ComputeCommand generates a disruption command given candidates

func (*Drift) ConsolidationType

func (d *Drift) ConsolidationType() string

func (*Drift) Reason added in v1.0.0

func (d *Drift) Reason() v1.DisruptionReason

func (*Drift) ShouldDisrupt

func (d *Drift) ShouldDisrupt(ctx context.Context, c *Candidate) bool

ShouldDisrupt is a predicate used to filter candidates

type Emptiness

type Emptiness struct {
	// contains filtered or unexported fields
}

Emptiness is a subreconciler that deletes empty candidates.

func NewEmptiness

func NewEmptiness(c consolidation) *Emptiness

func (*Emptiness) Class added in v1.0.0

func (e *Emptiness) Class() string

func (*Emptiness) ComputeCommand

func (e *Emptiness) ComputeCommand(ctx context.Context, disruptionBudgetMapping map[string]map[v1.DisruptionReason]int, candidates ...*Candidate) (Command, scheduling.Results, error)

ComputeCommand generates a disruption command given candidates

func (*Emptiness) ConsolidationType

func (e *Emptiness) ConsolidationType() string

func (*Emptiness) IsConsolidated added in v1.0.0

func (c *Emptiness) IsConsolidated() bool

IsConsolidated returns true if nothing has changed since markConsolidated was called.

func (*Emptiness) Reason added in v1.0.0

func (e *Emptiness) Reason() v1.DisruptionReason

func (*Emptiness) ShouldDisrupt

func (e *Emptiness) ShouldDisrupt(_ context.Context, c *Candidate) bool

ShouldDisrupt is a predicate used to filter candidates

type Method

// Method is the set of operations a disruption method provides. In this
// listing, Drift and Emptiness each declare all five methods.
type Method interface {
	// ShouldDisrupt is a predicate used to filter candidates.
	ShouldDisrupt(context.Context, *Candidate) bool
	// ComputeCommand generates a disruption command given a disruption budget
	// mapping and candidates; it also returns scheduling results and an error.
	ComputeCommand(context.Context, map[string]map[v1.DisruptionReason]int, ...*Candidate) (Command, scheduling.Results, error)
	// Reason returns the v1.DisruptionReason for this method.
	Reason() v1.DisruptionReason
	// Class returns a string.
	// NOTE(review): presumably GracefulDisruptionClass or
	// EventualDisruptionClass; confirm against the implementations.
	Class() string
	// ConsolidationType returns a string.
	// NOTE(review): presumably the value used for the consolidation type
	// metric label; confirm against the implementations.
	ConsolidationType() string
}

type MultiNodeConsolidation

type MultiNodeConsolidation struct {
	// contains filtered or unexported fields
}

func NewMultiNodeConsolidation

func NewMultiNodeConsolidation(consolidation consolidation) *MultiNodeConsolidation

func (*MultiNodeConsolidation) Class added in v1.0.0

func (m *MultiNodeConsolidation) Class() string

func (*MultiNodeConsolidation) ComputeCommand

func (m *MultiNodeConsolidation) ComputeCommand(ctx context.Context, disruptionBudgetMapping map[string]map[v1.DisruptionReason]int, candidates ...*Candidate) (Command, scheduling.Results, error)

func (*MultiNodeConsolidation) ConsolidationType

func (m *MultiNodeConsolidation) ConsolidationType() string

func (*MultiNodeConsolidation) IsConsolidated added in v0.34.0

func (c *MultiNodeConsolidation) IsConsolidated() bool

IsConsolidated returns true if nothing has changed since markConsolidated was called.

func (*MultiNodeConsolidation) Reason added in v1.0.0

func (*MultiNodeConsolidation) ShouldDisrupt

func (c *MultiNodeConsolidation) ShouldDisrupt(_ context.Context, cn *Candidate) bool

ShouldDisrupt is a predicate used to filter candidates

type SingleNodeConsolidation

type SingleNodeConsolidation struct {
	// contains filtered or unexported fields
}

SingleNodeConsolidation is the consolidation controller that performs single-node consolidation.

func NewSingleNodeConsolidation

func NewSingleNodeConsolidation(consolidation consolidation) *SingleNodeConsolidation

func (*SingleNodeConsolidation) Class added in v1.0.0

func (s *SingleNodeConsolidation) Class() string

func (*SingleNodeConsolidation) ComputeCommand

func (s *SingleNodeConsolidation) ComputeCommand(ctx context.Context, disruptionBudgetMapping map[string]map[v1.DisruptionReason]int, candidates ...*Candidate) (Command, scheduling.Results, error)

ComputeCommand generates a disruption command given candidates

func (*SingleNodeConsolidation) ConsolidationType

func (s *SingleNodeConsolidation) ConsolidationType() string

func (*SingleNodeConsolidation) IsConsolidated added in v0.34.0

func (c *SingleNodeConsolidation) IsConsolidated() bool

IsConsolidated returns true if nothing has changed since markConsolidated was called.

func (*SingleNodeConsolidation) Reason added in v1.0.0

func (*SingleNodeConsolidation) ShouldDisrupt

func (c *SingleNodeConsolidation) ShouldDisrupt(_ context.Context, cn *Candidate) bool

ShouldDisrupt is a predicate used to filter candidates

type UninitializedNodeError added in v0.37.0

// UninitializedNodeError is an error type (it has an Error method) that embeds
// the *pscheduling.ExistingNode it was created for.
type UninitializedNodeError struct {
	// ExistingNode is embedded; NewUninitializedNodeError takes a value of
	// this type as its argument.
	*pscheduling.ExistingNode
}

UninitializedNodeError tracks a special pod error for disruption where pods schedule to a node that hasn't been initialized yet, meaning that we can't confidently make a disruption decision based on it.

func NewUninitializedNodeError added in v0.37.0

func NewUninitializedNodeError(node *pscheduling.ExistingNode) *UninitializedNodeError

func (*UninitializedNodeError) Error added in v0.37.0

func (u *UninitializedNodeError) Error() string

type Validation

type Validation struct {
	// contains filtered or unexported fields
}

Validation is used to perform validation on a consolidation command. It makes an assumption that when re-used, all of the commands passed to IsValid were constructed based off of the same consolidation state. This allows it to skip the validation TTL for all but the first command.

func NewValidation

func NewValidation(clk clock.Clock, cluster *state.Cluster, kubeClient client.Client, provisioner *provisioning.Provisioner,
	cp cloudprovider.CloudProvider, recorder events.Recorder, queue *orchestration.Queue, reason v1.DisruptionReason) *Validation

func (*Validation) IsValid

func (v *Validation) IsValid(ctx context.Context, cmd Command, validationPeriod time.Duration) error

func (*Validation) ShouldDisrupt

func (v *Validation) ShouldDisrupt(_ context.Context, c *Candidate) bool

ShouldDisrupt is a predicate used to filter candidates

func (*Validation) ValidateCandidates added in v0.36.1

func (v *Validation) ValidateCandidates(ctx context.Context, candidates ...*Candidate) ([]*Candidate, error)

ValidateCandidates gets the current representation of the provided candidates and ensures that they are all still valid. For a candidate to still be valid, the following conditions must be met:

a. It must pass the global candidate filtering logic (no blocking PDBs, no do-not-disrupt annotation, etc)
b. It must not have any pods nominated for it
c. It must still be disruptable without violating node disruption budgets

If these conditions are met for all candidates, ValidateCandidates returns a slice with the updated representations.

func (*Validation) ValidateCommand

func (v *Validation) ValidateCommand(ctx context.Context, cmd Command, candidates []*Candidate) error

ValidateCommand validates a command for a Method

type ValidationError added in v0.36.1

type ValidationError struct {
	// contains filtered or unexported fields
}

func NewValidationError added in v0.36.1

func NewValidationError(err error) *ValidationError

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL