metrics

package
v0.0.0-...-5ae08a9 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Nov 22, 2024 License: Apache-2.0 Imports: 11 Imported by: 61

Documentation

Index

Constants

View Source
const (

	// Underutilized node was removed because of low utilization
	Underutilized NodeScaleDownReason = "underutilized"
	// Empty node was removed
	Empty NodeScaleDownReason = "empty"
	// Unready node was removed
	Unready NodeScaleDownReason = "unready"

	// CloudProviderError caused scale-up to fail
	CloudProviderError FailedScaleUpReason = "cloudProviderError"
	// APIError caused scale-up to fail
	APIError FailedScaleUpReason = "apiCallError"
	// Timeout was encountered when trying to scale-up
	Timeout FailedScaleUpReason = "timeout"

	// DirectionScaleDown is the direction of skipped scaling event when scaling in (shrinking)
	DirectionScaleDown string = "down"
	// DirectionScaleUp is the direction of skipped scaling event when scaling out (growing)
	DirectionScaleUp string = "up"

	// CpuResourceLimit minimum or maximum reached, check the direction label to determine min or max
	CpuResourceLimit string = "CpuResourceLimit"
	// MemoryResourceLimit minimum or maximum reached, check the direction label to determine min or max
	MemoryResourceLimit string = "MemoryResourceLimit"

	// LogLongDurationThreshold defines the duration after which long function
	// duration will be logged (in addition to being counted in metric).
	// This is meant to help find unexpectedly long function execution times for
	// debugging purposes.
	LogLongDurationThreshold = 5 * time.Second
	// PodEvictionSucceed means creation of the pod eviction object succeeded
	PodEvictionSucceed PodEvictionResult = "succeeded"
	// PodEvictionFailed means creation of the pod eviction object failed
	PodEvictionFailed PodEvictionResult = "failed"
)

Variables

This section is empty.

Functions

func ObserveNodeTaintsCount

func ObserveNodeTaintsCount(taintType string, count float64)

ObserveNodeTaintsCount records the node taints count of given type.

func ObservePendingNodeDeletions

func ObservePendingNodeDeletions(value int)

ObservePendingNodeDeletions records the current value of nodes_pending_deletion metric

func RegisterAll

func RegisterAll(emitPerNodeGroupMetrics bool)

RegisterAll registers all metrics.

func RegisterError

func RegisterError(err errors.AutoscalerError)

RegisterError records any errors preventing Cluster Autoscaler from working. No more than one error should be recorded per loop.

func RegisterEvictions

func RegisterEvictions(podsCount int, result PodEvictionResult)

RegisterEvictions records the number of pods whose eviction succeeded or failed, according to result

func RegisterFailedScaleUp

func RegisterFailedScaleUp(reason FailedScaleUpReason, gpuResourceName, gpuType string)

RegisterFailedScaleUp records a failed scale-up operation

func RegisterNodeGroupCreation

func RegisterNodeGroupCreation()

RegisterNodeGroupCreation registers node group creation

func RegisterNodeGroupCreationWithLabelValues

func RegisterNodeGroupCreationWithLabelValues(groupType string)

RegisterNodeGroupCreationWithLabelValues registers node group creation with the provided labels

func RegisterNodeGroupDeletion

func RegisterNodeGroupDeletion()

RegisterNodeGroupDeletion registers node group deletion

func RegisterNodeGroupDeletionWithLabelValues

func RegisterNodeGroupDeletionWithLabelValues(groupType string)

RegisterNodeGroupDeletionWithLabelValues registers node group deletion with the provided labels

func RegisterOldUnregisteredNodesRemoved

func RegisterOldUnregisteredNodesRemoved(nodesCount int)

RegisterOldUnregisteredNodesRemoved records number of old unregistered nodes that have been removed by the cluster autoscaler

func RegisterScaleDown

func RegisterScaleDown(nodesCount int, gpuResourceName, gpuType string, reason NodeScaleDownReason)

RegisterScaleDown records number of nodes removed by scale down

func RegisterScaleUp

func RegisterScaleUp(nodesCount int, gpuResourceName, gpuType string)

RegisterScaleUp records number of nodes added by scale up

func RegisterSkippedScaleDownCPU

func RegisterSkippedScaleDownCPU()

RegisterSkippedScaleDownCPU increases the count of skipped scale-downs (scale ins) because of CPU resource limits

func RegisterSkippedScaleDownMemory

func RegisterSkippedScaleDownMemory()

RegisterSkippedScaleDownMemory increases the count of skipped scale-downs (scale ins) because of Memory resource limits

func RegisterSkippedScaleUpCPU

func RegisterSkippedScaleUpCPU()

RegisterSkippedScaleUpCPU increases the count of skipped scale outs because of CPU resource limits

func RegisterSkippedScaleUpMemory

func RegisterSkippedScaleUpMemory()

RegisterSkippedScaleUpMemory increases the count of skipped scale outs because of Memory resource limits

func UpdateCPULimitsCores

func UpdateCPULimitsCores(minCoresCount int64, maxCoresCount int64)

UpdateCPULimitsCores records the minimum and maximum number of cores in the cluster

func UpdateClusterCPUCurrentCores

func UpdateClusterCPUCurrentCores(coresCount int64)

UpdateClusterCPUCurrentCores records the number of cores in the cluster, excluding nodes that are being deleted

func UpdateClusterMemoryCurrentBytes

func UpdateClusterMemoryCurrentBytes(memoryCount int64)

UpdateClusterMemoryCurrentBytes records the number of bytes of memory in the cluster, excluding nodes that are being deleted

func UpdateClusterSafeToAutoscale

func UpdateClusterSafeToAutoscale(safe bool)

UpdateClusterSafeToAutoscale records if cluster is safe to autoscale

func UpdateDuration

func UpdateDuration(label FunctionLabel, duration time.Duration)

UpdateDuration records the duration of the step identified by the label

func UpdateDurationFromStart

func UpdateDurationFromStart(label FunctionLabel, start time.Time)

UpdateDurationFromStart records the duration of the step identified by the label using start time

func UpdateInconsistentInstancesMigsCount

func UpdateInconsistentInstancesMigsCount(migCount int)

UpdateInconsistentInstancesMigsCount records the observed number of migs where instance count according to InstanceGroupManagers.List() differs from the results of Instances.List(). This can happen when some instances are abandoned or a user edits instance 'created-by' metadata.

func UpdateLastTime

func UpdateLastTime(label FunctionLabel, now time.Time)

UpdateLastTime records the time the step identified by the label was started

func UpdateMaxNodesCount

func UpdateMaxNodesCount(nodesCount int)

UpdateMaxNodesCount records the current maximum number of nodes being set for all node groups

func UpdateMemoryLimitsBytes

func UpdateMemoryLimitsBytes(minMemoryCount int64, maxMemoryCount int64)

UpdateMemoryLimitsBytes records the minimum and maximum bytes of memory in the cluster

func UpdateNapEnabled

func UpdateNapEnabled(enabled bool)

UpdateNapEnabled records if NodeAutoprovisioning is enabled

func UpdateNodeGroupBackOffStatus

func UpdateNodeGroupBackOffStatus(nodeGroup string, backoffReasonStatus map[string]bool)

UpdateNodeGroupBackOffStatus records whether the node group is in backoff and therefore not being autoscaled

func UpdateNodeGroupHealthStatus

func UpdateNodeGroupHealthStatus(nodeGroup string, healthy bool)

UpdateNodeGroupHealthStatus records whether the node group is healthy for autoscaling

func UpdateNodeGroupMax

func UpdateNodeGroupMax(nodeGroup string, maxNodes int)

UpdateNodeGroupMax records the node group maximum allowed number of nodes

func UpdateNodeGroupMin

func UpdateNodeGroupMin(nodeGroup string, minNodes int)

UpdateNodeGroupMin records the node group minimum allowed number of nodes

func UpdateNodeGroupTargetSize

func UpdateNodeGroupTargetSize(targetSizes map[string]int)

UpdateNodeGroupTargetSize records the node group target size

func UpdateNodeGroupsCount

func UpdateNodeGroupsCount(autoscaled, autoprovisioned int)

UpdateNodeGroupsCount records the number of node groups managed by CA

func UpdateNodesCount

func UpdateNodesCount(ready, unready, starting, longUnregistered, unregistered int)

UpdateNodesCount records the number of nodes in cluster

func UpdateOverflowingControllers

func UpdateOverflowingControllers(count int)

UpdateOverflowingControllers sets the number of controllers that could not have their pods cached.

func UpdateScaleDownInCooldown

func UpdateScaleDownInCooldown(inCooldown bool)

UpdateScaleDownInCooldown registers if the cluster autoscaler scaledown is in cooldown

func UpdateUnneededNodesCount

func UpdateUnneededNodesCount(nodesCount int)

UpdateUnneededNodesCount records number of currently unneeded nodes

func UpdateUnremovableNodesCount

func UpdateUnremovableNodesCount(unremovableReasonCounts map[simulator.UnremovableReason]int)

UpdateUnremovableNodesCount records number of currently unremovable nodes

func UpdateUnschedulablePodsCount

func UpdateUnschedulablePodsCount(uschedulablePodsCount, schedulerUnprocessedCount int)

UpdateUnschedulablePodsCount records number of currently unschedulable pods

func UpdateUnschedulablePodsCountWithLabel

func UpdateUnschedulablePodsCountWithLabel(uschedulablePodsCount int, label string)

UpdateUnschedulablePodsCountWithLabel records the number of currently unschedulable pods, with the "type" label set to the value of label

Types

type FailedScaleUpReason

type FailedScaleUpReason string

FailedScaleUpReason describes reason of failed scale-up

type FunctionLabel

type FunctionLabel string

FunctionLabel is a name of Cluster Autoscaler operation for which we measure duration

const (
	ScaleDown                  FunctionLabel = "scaleDown"
	ScaleDownNodeDeletion      FunctionLabel = "scaleDown:nodeDeletion"
	ScaleDownFindNodesToRemove FunctionLabel = "scaleDown:findNodesToRemove"
	ScaleDownMiscOperations    FunctionLabel = "scaleDown:miscOperations"
	ScaleDownSoftTaintUnneeded FunctionLabel = "scaleDown:softTaintUnneeded"
	ScaleUp                    FunctionLabel = "scaleUp"
	BuildPodEquivalenceGroups  FunctionLabel = "scaleUp:buildPodEquivalenceGroups"
	Estimate                   FunctionLabel = "scaleUp:estimate"
	FindUnneeded               FunctionLabel = "findUnneeded"
	UpdateState                FunctionLabel = "updateClusterState"
	FilterOutSchedulable       FunctionLabel = "filterOutSchedulable"
	CloudProviderRefresh       FunctionLabel = "cloudProviderRefresh"
	Main                       FunctionLabel = "main"
	Poll                       FunctionLabel = "poll"
	Reconfigure                FunctionLabel = "reconfigure"
	Autoscaling                FunctionLabel = "autoscaling"
	LoopWait                   FunctionLabel = "loopWait"
	BulkListAllGceInstances    FunctionLabel = "bulkListInstances:listAllInstances"
	BulkListMigInstances       FunctionLabel = "bulkListInstances:listMigInstances"
)

Names of Cluster Autoscaler operations

type HealthCheck

type HealthCheck struct {
	// contains filtered or unexported fields
}

HealthCheck contains information about last time of autoscaler activity and timeout

func NewHealthCheck

func NewHealthCheck(activityTimeout, successTimeout time.Duration) *HealthCheck

NewHealthCheck builds a new HealthCheck object with the given activity and success timeouts

func (*HealthCheck) ServeHTTP

func (hc *HealthCheck) ServeHTTP(w http.ResponseWriter, r *http.Request)

ServeHTTP implements http.Handler interface to provide a health-check endpoint

func (*HealthCheck) StartMonitoring

func (hc *HealthCheck) StartMonitoring()

StartMonitoring activates checks for autoscaler inactivity

func (*HealthCheck) UpdateLastActivity

func (hc *HealthCheck) UpdateLastActivity(timestamp time.Time)

UpdateLastActivity updates last time of activity

func (*HealthCheck) UpdateLastSuccessfulRun

func (hc *HealthCheck) UpdateLastSuccessfulRun(timestamp time.Time)

UpdateLastSuccessfulRun updates last time of successful (i.e. not ending in error) activity

type NodeGroupType

type NodeGroupType string

NodeGroupType describes node group relation to CA

type NodeScaleDownReason

type NodeScaleDownReason string

NodeScaleDownReason describes reason for removing node

type PodEvictionResult

type PodEvictionResult string

PodEvictionResult describes result of the pod eviction attempt

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL