Documentation ¶
Index ¶
- Constants
- func FakeNode(instance cloudprovider.Instance, reason string) *apiv1.Node
- type AcceptableRange
- type ClusterStateRegistry
- func (csr *ClusterStateRegistry) BackoffStatusForNodeGroup(nodeGroup cloudprovider.NodeGroup, now time.Time) backoff.Status
- func (csr *ClusterStateRegistry) GetAutoscaledNodesCount() (currentSize, targetSize int)
- func (csr *ClusterStateRegistry) GetClusterReadiness() Readiness
- func (csr *ClusterStateRegistry) GetCreatedNodesWithErrors() map[string][]*apiv1.Node
- func (csr *ClusterStateRegistry) GetIncorrectNodeGroupSize(nodeGroupName string) *IncorrectNodeGroupSize
- func (csr *ClusterStateRegistry) GetScaleUpFailures() map[string][]ScaleUpFailure
- func (csr *ClusterStateRegistry) GetStatus(now time.Time) *api.ClusterAutoscalerStatus
- func (csr *ClusterStateRegistry) GetUnregisteredNodes() []UnregisteredNode
- func (csr *ClusterStateRegistry) GetUpcomingNodes() (upcomingCounts map[string]int, registeredNodeNames map[string][]string)
- func (csr *ClusterStateRegistry) HasNodeGroupStartedScaleUp(nodeGroupName string) bool
- func (csr *ClusterStateRegistry) InvalidateNodeInstancesCacheEntry(nodeGroup cloudprovider.NodeGroup)
- func (csr *ClusterStateRegistry) IsClusterHealthy() bool
- func (csr *ClusterStateRegistry) IsNodeGroupAtTargetSize(nodeGroupName string) bool
- func (csr *ClusterStateRegistry) IsNodeGroupHealthy(nodeGroupName string) bool
- func (csr *ClusterStateRegistry) IsNodeGroupRegistered(nodeGroupName string) bool
- func (csr *ClusterStateRegistry) IsNodeGroupScalingUp(nodeGroupName string) bool
- func (csr *ClusterStateRegistry) MaxNodeProvisionTime(nodeGroup cloudprovider.NodeGroup) (time.Duration, error)
- func (csr *ClusterStateRegistry) NodeGroupScaleUpSafety(nodeGroup cloudprovider.NodeGroup, now time.Time) NodeGroupScalingSafety
- func (csr *ClusterStateRegistry) PeriodicCleanup()
- func (csr *ClusterStateRegistry) Recalculate()
- func (csr *ClusterStateRegistry) RefreshCloudProviderNodeInstancesCache()
- func (csr *ClusterStateRegistry) RegisterFailedScaleDown(_ cloudprovider.NodeGroup, _ string, _ time.Time)
- func (csr *ClusterStateRegistry) RegisterFailedScaleUp(nodeGroup cloudprovider.NodeGroup, reason string, ...)
- func (csr *ClusterStateRegistry) RegisterScaleDown(nodeGroup cloudprovider.NodeGroup, nodeName string, currentTime time.Time, ...)
- func (csr *ClusterStateRegistry) RegisterScaleUp(nodeGroup cloudprovider.NodeGroup, delta int, currentTime time.Time)
- func (csr *ClusterStateRegistry) Start()
- func (csr *ClusterStateRegistry) Stop()
- func (csr *ClusterStateRegistry) UpdateNodes(nodes []*apiv1.Node, nodeInfosForGroups map[string]*framework.NodeInfo, ...) error
- func (csr *ClusterStateRegistry) UpdateScaleDownCandidates(nodes []*apiv1.Node, now time.Time)
- type ClusterStateRegistryConfig
- type IncorrectNodeGroupSize
- type NodeGroupScalingSafety
- type Readiness
- type ScaleDownRequest
- type ScaleUpFailure
- type ScaleUpRequest
- type UnregisteredNode
Constants ¶
const ( // MaxNodeStartupTime is the maximum time from the moment the node is registered to the time the node is ready. MaxNodeStartupTime = 15 * time.Minute )
Variables ¶
This section is empty.
Functions ¶
Types ¶
type AcceptableRange ¶
type AcceptableRange struct { // MinNodes is the minimum number of nodes in the group. MinNodes int // MaxNodes is the maximum number of nodes in the group. MaxNodes int // CurrentTarget is the current target size of the group. CurrentTarget int }
AcceptableRange contains information about acceptable size of a node group.
type ClusterStateRegistry ¶
ClusterStateRegistry is a structure that keeps track of the current state of the cluster.
func NewClusterStateRegistry ¶
func NewClusterStateRegistry(cloudProvider cloudprovider.CloudProvider, config ClusterStateRegistryConfig, logRecorder *utils.LogEventRecorder, backoff backoff.Backoff, nodeGroupConfigProcessor nodegroupconfig.NodeGroupConfigProcessor, asyncNodeGroupStateChecker asyncnodegroups.AsyncNodeGroupStateChecker) *ClusterStateRegistry
NewClusterStateRegistry creates new ClusterStateRegistry.
func (*ClusterStateRegistry) BackoffStatusForNodeGroup ¶
func (csr *ClusterStateRegistry) BackoffStatusForNodeGroup(nodeGroup cloudprovider.NodeGroup, now time.Time) backoff.Status
BackoffStatusForNodeGroup queries the backoff status of the node group
func (*ClusterStateRegistry) GetAutoscaledNodesCount ¶
func (csr *ClusterStateRegistry) GetAutoscaledNodesCount() (currentSize, targetSize int)
GetAutoscaledNodesCount calculates and returns the actual and the target number of nodes belonging to autoscaled node groups in the cluster.
func (*ClusterStateRegistry) GetClusterReadiness ¶
func (csr *ClusterStateRegistry) GetClusterReadiness() Readiness
GetClusterReadiness returns the current readiness stats of the cluster.
func (*ClusterStateRegistry) GetCreatedNodesWithErrors ¶
func (csr *ClusterStateRegistry) GetCreatedNodesWithErrors() map[string][]*apiv1.Node
GetCreatedNodesWithErrors returns a map from node group id to list of nodes which reported a create error.
func (*ClusterStateRegistry) GetIncorrectNodeGroupSize ¶
func (csr *ClusterStateRegistry) GetIncorrectNodeGroupSize(nodeGroupName string) *IncorrectNodeGroupSize
GetIncorrectNodeGroupSize gets IncorrectNodeGroupSizeInformation for the given node group.
func (*ClusterStateRegistry) GetScaleUpFailures ¶
func (csr *ClusterStateRegistry) GetScaleUpFailures() map[string][]ScaleUpFailure
GetScaleUpFailures returns the scale-up failures map.
func (*ClusterStateRegistry) GetStatus ¶
func (csr *ClusterStateRegistry) GetStatus(now time.Time) *api.ClusterAutoscalerStatus
GetStatus returns ClusterAutoscalerStatus with the current cluster autoscaler status.
func (*ClusterStateRegistry) GetUnregisteredNodes ¶
func (csr *ClusterStateRegistry) GetUnregisteredNodes() []UnregisteredNode
GetUnregisteredNodes returns a list of all unregistered nodes.
func (*ClusterStateRegistry) GetUpcomingNodes ¶
func (csr *ClusterStateRegistry) GetUpcomingNodes() (upcomingCounts map[string]int, registeredNodeNames map[string][]string)
GetUpcomingNodes returns how many new nodes will be added shortly to the node groups or should become ready soon. The function may overestimate the number of nodes. The second return value contains the names of upcoming nodes that are already registered in the cluster.
func (*ClusterStateRegistry) HasNodeGroupStartedScaleUp ¶
func (csr *ClusterStateRegistry) HasNodeGroupStartedScaleUp(nodeGroupName string) bool
HasNodeGroupStartedScaleUp returns true if the node group has started a scale-up, regardless of whether there are any upcoming nodes. This is useful when the node group's size reverts to its previous size before the next UpdateNodes call and we still want to know whether a scale-up for the node group has started.
func (*ClusterStateRegistry) InvalidateNodeInstancesCacheEntry ¶
func (csr *ClusterStateRegistry) InvalidateNodeInstancesCacheEntry(nodeGroup cloudprovider.NodeGroup)
InvalidateNodeInstancesCacheEntry removes a node group from the cloud provider node instances cache.
func (*ClusterStateRegistry) IsClusterHealthy ¶
func (csr *ClusterStateRegistry) IsClusterHealthy() bool
IsClusterHealthy returns true if the cluster health is within the acceptable limits
func (*ClusterStateRegistry) IsNodeGroupAtTargetSize ¶
func (csr *ClusterStateRegistry) IsNodeGroupAtTargetSize(nodeGroupName string) bool
IsNodeGroupAtTargetSize returns true if the number of nodes provisioned in the group is equal to the target number of nodes.
func (*ClusterStateRegistry) IsNodeGroupHealthy ¶
func (csr *ClusterStateRegistry) IsNodeGroupHealthy(nodeGroupName string) bool
IsNodeGroupHealthy returns true if the node group health is within the acceptable limits
func (*ClusterStateRegistry) IsNodeGroupRegistered ¶
func (csr *ClusterStateRegistry) IsNodeGroupRegistered(nodeGroupName string) bool
IsNodeGroupRegistered returns true if the node group is registered in cluster state.
func (*ClusterStateRegistry) IsNodeGroupScalingUp ¶
func (csr *ClusterStateRegistry) IsNodeGroupScalingUp(nodeGroupName string) bool
IsNodeGroupScalingUp returns true if the node group is currently scaling up.
func (*ClusterStateRegistry) MaxNodeProvisionTime ¶
func (csr *ClusterStateRegistry) MaxNodeProvisionTime(nodeGroup cloudprovider.NodeGroup) (time.Duration, error)
MaxNodeProvisionTime returns MaxNodeProvisionTime value that should be used for the given NodeGroup. TODO(BigDarkClown): remove this method entirely, it is a redundant wrapper
func (*ClusterStateRegistry) NodeGroupScaleUpSafety ¶
func (csr *ClusterStateRegistry) NodeGroupScaleUpSafety(nodeGroup cloudprovider.NodeGroup, now time.Time) NodeGroupScalingSafety
NodeGroupScaleUpSafety returns information about node group safety to be scaled up now.
func (*ClusterStateRegistry) PeriodicCleanup ¶
func (csr *ClusterStateRegistry) PeriodicCleanup()
PeriodicCleanup performs clean-ups that should be done periodically, e.g. each Autoscaler loop.
func (*ClusterStateRegistry) Recalculate ¶
func (csr *ClusterStateRegistry) Recalculate()
Recalculate cluster state after scale-ups or scale-downs were registered.
func (*ClusterStateRegistry) RefreshCloudProviderNodeInstancesCache ¶
func (csr *ClusterStateRegistry) RefreshCloudProviderNodeInstancesCache()
RefreshCloudProviderNodeInstancesCache refreshes cloud provider node instances cache.
func (*ClusterStateRegistry) RegisterFailedScaleDown ¶
func (csr *ClusterStateRegistry) RegisterFailedScaleDown(_ cloudprovider.NodeGroup, _ string, _ time.Time)
RegisterFailedScaleDown records a failed scale-down for a node group. The cluster state registry does not need to implement this function.
func (*ClusterStateRegistry) RegisterFailedScaleUp ¶
func (csr *ClusterStateRegistry) RegisterFailedScaleUp(nodeGroup cloudprovider.NodeGroup, reason string, errorMessage, gpuResourceName, gpuType string, currentTime time.Time)
RegisterFailedScaleUp should be called after getting error from cloudprovider when trying to scale-up node group. It will mark this group as not safe to autoscale for some time.
func (*ClusterStateRegistry) RegisterScaleDown ¶
func (csr *ClusterStateRegistry) RegisterScaleDown(nodeGroup cloudprovider.NodeGroup, nodeName string, currentTime time.Time, expectedDeleteTime time.Time)
RegisterScaleDown registers node scale down.
func (*ClusterStateRegistry) RegisterScaleUp ¶
func (csr *ClusterStateRegistry) RegisterScaleUp(nodeGroup cloudprovider.NodeGroup, delta int, currentTime time.Time)
RegisterScaleUp registers a scale-up for the given node group.
func (*ClusterStateRegistry) Start ¶
func (csr *ClusterStateRegistry) Start()
Start starts components running in background.
func (*ClusterStateRegistry) Stop ¶
func (csr *ClusterStateRegistry) Stop()
Stop stops components running in background.
func (*ClusterStateRegistry) UpdateNodes ¶
func (csr *ClusterStateRegistry) UpdateNodes(nodes []*apiv1.Node, nodeInfosForGroups map[string]*framework.NodeInfo, currentTime time.Time) error
UpdateNodes updates the state of the nodes in the ClusterStateRegistry and recalculates the stats
func (*ClusterStateRegistry) UpdateScaleDownCandidates ¶
func (csr *ClusterStateRegistry) UpdateScaleDownCandidates(nodes []*apiv1.Node, now time.Time)
UpdateScaleDownCandidates updates scale down candidates
type ClusterStateRegistryConfig ¶
type ClusterStateRegistryConfig struct { // Maximum percentage of unready nodes in total, if the number of unready nodes is higher than OkTotalUnreadyCount. MaxTotalUnreadyPercentage float64 // Minimum number of nodes that must be unready for MaxTotalUnreadyPercentage to apply. // This is to ensure that in very small clusters (e.g. 2 nodes) a single node's failure doesn't disable autoscaling. OkTotalUnreadyCount int }
ClusterStateRegistryConfig contains configuration information for ClusterStateRegistry.
type IncorrectNodeGroupSize ¶
type IncorrectNodeGroupSize struct { // ExpectedSize is the size of the node group measured on the cloud provider side. ExpectedSize int // CurrentSize is the size of the node group measured on the kubernetes side. CurrentSize int // FirstObserved is the time when the given difference occurred. FirstObserved time.Time }
IncorrectNodeGroupSize contains information about how much the current size of the node group differs from the expected size. Prolonged, stable mismatch is an indication of quota or startup issues.
type NodeGroupScalingSafety ¶
NodeGroupScalingSafety contains information about the safety of the node group to scale up/down.
type Readiness ¶
type Readiness struct { // Names of ready nodes. Ready []string // Names of unready nodes that broke down after they started. Unready []string // Names of nodes that are being currently deleted. They exist in K8S but // are not included in NodeGroup.TargetSize(). Deleted []string // Names of nodes that are not yet fully started. NotStarted []string // Names of all registered nodes in the group (ready/unready/deleted/etc). Registered []string // Names of nodes that failed to register within a reasonable limit. LongUnregistered []string // Names of nodes that haven't yet registered. Unregistered []string // Time when the readiness was measured. Time time.Time // Names of nodes that are Unready due to missing resources. // This field is only used for exposing information externally and // doesn't influence CA behavior. ResourceUnready []string }
Readiness contains readiness information about a group of nodes.
type ScaleDownRequest ¶
type ScaleDownRequest struct { // NodeName is the name of the node to be deleted. NodeName string // NodeGroup is the node group of the deleted node. NodeGroup cloudprovider.NodeGroup // Time is the time when the node deletion was requested. Time time.Time // ExpectedDeleteTime is the time when the node is expected to be deleted. ExpectedDeleteTime time.Time }
ScaleDownRequest contains information about the requested node deletion.
type ScaleUpFailure ¶
type ScaleUpFailure struct { NodeGroup cloudprovider.NodeGroup Reason metrics.FailedScaleUpReason Time time.Time }
ScaleUpFailure contains information about a failure of a scale-up.
type ScaleUpRequest ¶
type ScaleUpRequest struct { // NodeGroup is the node group to be scaled up. NodeGroup cloudprovider.NodeGroup // Time is the time when the request was submitted. Time time.Time // ExpectedAddTime is the time at which the request should be fulfilled. ExpectedAddTime time.Time // How much the node group is increased. Increase int }
ScaleUpRequest contains information about the requested node group scale up.
type UnregisteredNode ¶
type UnregisteredNode struct { // Node is a dummy node that contains only the name of the node. Node *apiv1.Node // UnregisteredSince is the time when the node was first spotted. UnregisteredSince time.Time }
UnregisteredNode contains information about nodes that are present on the cloud provider side but failed to register in Kubernetes.