metrics

package
v0.5.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jun 2, 2022 License: Apache-2.0 Imports: 16 Imported by: 0

Documentation

Index

Constants

View Source
// Metric-name string constants for the crane-agent metrics.
const (
	// NOTE(review): these two look like the Prometheus namespace and
	// subsystem the crane-agent metrics are registered under; the
	// registration code is not visible here — confirm.
	CraneNamespace      = "crane"
	CraneAgentSubsystem = "craneAgent"

	// Names of the last-activity and step-duration metrics.
	LastActivity                = "last_activity"
	StepDurationSeconds         = "step_duration_seconds"
	StepDurationQuantileSummary = "step_duration_quantile_summary"

	// Names of the analyzer/executor status metrics and of the executor
	// error, executor evict and pod-resource error totals.
	AnalyzerStatus        = "analyzer_status"
	AnalyzerStatusTotal   = "analyzer_status_total"
	ExecutorStatus        = "executor_status"
	ExecutorStatusTotal   = "executor_status_total"
	ExecutorErrorTotal    = "executor_error_total"
	ExecutorEvictTotal    = "executor_evict_total"
	PodResourceErrorTotal = "pod_resource_error_total"

	// Names of the node-level metrics.
	NodeCpuCannotBeReclaimedSeconds = "node_cpu_cannot_be_reclaimed_seconds"
	NodeResourceRecommended         = "node_resource_recommended"
	NodeResourceRecommendedFrom     = "node_resource_recommended_from"
)

This const block defines the metric names for the crane-agent metrics.

View Source
// LogLongDurationThreshold is the step duration beyond which a step is
// logged as well as being counted in the metric. Surfacing these outliers in
// the log makes unexpectedly slow step executions easier to debug.
const LogLongDurationThreshold = time.Minute

Variables

View Source
// Metric vectors declared with namespace "crane" and subsystem "autoscaling".
// Only their construction is shown here; registration happens elsewhere.
var (
	// HPAReplicas is a gauge vector named "hpa_replicas" ("Replicas for
	// HPA"), labeled by namespace and name.
	HPAReplicas = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: "crane",
			Subsystem: "autoscaling",
			Name:      "hpa_replicas",
			Help:      "Replicas for HPA",
		},
		[]string{"namespace", "name"},
	)
	// EHPAReplicas is a gauge vector named "effective_hpa_replicas"
	// ("Replicas for Effective HPA"), labeled by namespace and name.
	EHPAReplicas = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: "crane",
			Subsystem: "autoscaling",
			Name:      "effective_hpa_replicas",
			Help:      "Replicas for Effective HPA",
		},
		[]string{"namespace", "name"},
	)
	// HPAScaleCount is a gauge vector named "hpa_scale_count" ("Scale count
	// for HPA"), labeled by namespace, name and type.
	HPAScaleCount = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: "crane",
			Subsystem: "autoscaling",
			Name:      "hpa_scale_count",
			Help:      "Scale count for HPA",
		},
		[]string{"namespace", "name", "type"},
	)
	// OOMCount is a counter vector named "oom_count" ("The count of pod oom
	// event"), labeled by pod and container.
	OOMCount = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: "crane",
			Subsystem: "autoscaling",
			Name:      "oom_count",
			Help:      "The count of pod oom event",
		},
		[]string{
			"pod",
			"container",
		},
	)
	// EVPACpuScaleUp is a gauge vector named "effective_vpa_cpu_scale_up"
	// ("The cpu scale up for Effective VPA"). It shares its six-label set
	// with the other EVPA* gauges below.
	EVPACpuScaleUp = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: "crane",
			Subsystem: "autoscaling",
			Name:      "effective_vpa_cpu_scale_up",
			Help:      "The cpu scale up for Effective VPA",
		},
		[]string{"apiversion", "owner_kind", "namespace", "owner_name", "container", "resource"},
	)
	// EVPACpuScaleDown is a gauge vector named
	// "effective_vpa_cpu_scale_down" ("The cpu scale down for Effective
	// VPA").
	EVPACpuScaleDown = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: "crane",
			Subsystem: "autoscaling",
			Name:      "effective_vpa_cpu_scale_down",
			Help:      "The cpu scale down for Effective VPA",
		},
		[]string{"apiversion", "owner_kind", "namespace", "owner_name", "container", "resource"},
	)
	// EVPAMemoryScaleUp is a gauge vector named
	// "effective_vpa_memory_scale_up" ("The memory scale up for Effective
	// VPA").
	EVPAMemoryScaleUp = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: "crane",
			Subsystem: "autoscaling",
			Name:      "effective_vpa_memory_scale_up",
			Help:      "The memory scale up for Effective VPA",
		},
		[]string{"apiversion", "owner_kind", "namespace", "owner_name", "container", "resource"},
	)
	// EVPAMemoryScaleDown is a gauge vector named
	// "effective_vpa_memory_scale_down" ("The memory scale down for
	// Effective VPA").
	EVPAMemoryScaleDown = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: "crane",
			Subsystem: "autoscaling",
			Name:      "effective_vpa_memory_scale_down",
			Help:      "The memory scale down for Effective VPA",
		},
		[]string{"apiversion", "owner_kind", "namespace", "owner_name", "container", "resource"},
	)
	// EVPAResourceRecommendation is a gauge vector named
	// "effective_vpa_resource_recommendation" ("The resource recommendation
	// for Effective VPA").
	EVPAResourceRecommendation = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: "crane",
			Subsystem: "autoscaling",
			Name:      "effective_vpa_resource_recommendation",
			Help:      "The resource recommendation for Effective VPA",
		},
		[]string{"apiversion", "owner_kind", "namespace", "owner_name", "container", "resource"},
	)
)
View Source
// Metric vectors declared with namespace "crane" and subsystem "analysis".
var (
	// ResourceRecommendation is a gauge vector named
	// "resource_recommendation" ("The containers' CPU/Memory recommended
	// value"), labeled by apiversion, owner_kind, namespace, owner_name,
	// container and resource.
	ResourceRecommendation = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: "crane",
			Subsystem: "analysis",
			Name:      "resource_recommendation",
			Help:      "The containers' CPU/Memory recommended value",
		},
		[]string{"apiversion", "owner_kind", "namespace", "owner_name", "container", "resource"},
	)
)

Functions

func AggregateSignalKey

func AggregateSignalKey(id string, labels []predictionapi.Label) string

func CustomCollectorRegister

func CustomCollectorRegister(collector ...prometheus.Collector)

func ExecutorErrorCounterInc added in v0.3.0

func ExecutorErrorCounterInc(subComponent SubComponent, stepName StepLabel)

func ExecutorEvictCountsInc added in v0.3.0

func ExecutorEvictCountsInc()

func ExecutorStatusCounterInc added in v0.3.0

func ExecutorStatusCounterInc(subComponent SubComponent, stepName StepLabel)

func PodResourceUpdateErrorCounterInc added in v0.3.0

func PodResourceUpdateErrorCounterInc(subComponent SubComponent, stepName StepLabel)

func RegisterCraneAgent added in v0.3.0

func RegisterCraneAgent()

func UpdateAnalyzerStatus added in v0.3.0

func UpdateAnalyzerStatus(typeName AnalyzeType, value float64)

func UpdateAnalyzerWithKeyStatus added in v0.3.0

func UpdateAnalyzerWithKeyStatus(typeName AnalyzeType, key string, value float64)

func UpdateDuration added in v0.3.0

func UpdateDuration(module string, stepName StepLabel, duration time.Duration)

func UpdateDurationFromStart added in v0.3.0

func UpdateDurationFromStart(module string, stepName StepLabel, start time.Time)

UpdateDurationFromStart records the duration of the step identified by the label, using the start time.

func UpdateDurationFromStartWithSubComponent added in v0.3.0

func UpdateDurationFromStartWithSubComponent(module string, subComponent string, stepName StepLabel, start time.Time)

func UpdateDurationWithSubComponent added in v0.3.0

func UpdateDurationWithSubComponent(module string, subComponent string, stepName StepLabel, duration time.Duration)

func UpdateExecutorStatus added in v0.3.0

func UpdateExecutorStatus(subComponent SubComponent, stepName StepLabel, value float64)

func UpdateLastTime added in v0.3.0

func UpdateLastTime(module string, stepName StepLabel, now time.Time)

func UpdateLastTimeWithSubComponent added in v0.3.0

func UpdateLastTimeWithSubComponent(module string, subComponent string, stepName StepLabel, now time.Time)

func UpdateNodeCpuCannotBeReclaimedSeconds added in v0.5.0

func UpdateNodeCpuCannotBeReclaimedSeconds(value float64)

func UpdateNodeResourceRecommendedFromValue added in v0.5.0

func UpdateNodeResourceRecommendedFromValue(subComponent SubComponent, stepName StepLabel, resourceName string, value float64)

func UpdateNodeResourceRecommendedValue added in v0.5.0

func UpdateNodeResourceRecommendedValue(subComponent SubComponent, stepName StepLabel, resourceName string, from string, value float64)

Types

type AnalyzeType added in v0.3.0

// AnalyzeType is a string type naming an analyzer outcome. It is the type of
// the typeName parameter of UpdateAnalyzerStatus and
// UpdateAnalyzerWithKeyStatus.
type AnalyzeType string

// Predefined AnalyzeType values.
//
// AnalyzeTypeAnalyzeError used to be "analyzeError " with a trailing space,
// which leaked stray whitespace into whatever the value was emitted as. It is
// now "analyzeError"; anything that matched on the old padded string must be
// updated.
const (
	AnalyzeTypeEnableScheduling AnalyzeType = "enableScheduling"
	AnalyzeTypeAvoidance        AnalyzeType = "avoidance"
	AnalyzeTypeRestore          AnalyzeType = "restore"
	AnalyzeTypeAnalyzeError     AnalyzeType = "analyzeError"
)

type HealthCheck added in v0.3.0

type HealthCheck struct {
	// contains filtered or unexported fields
}

HealthCheck contains information about the last time of crane-agent collect activity and the timeout.

func NewHealthCheck added in v0.3.0

func NewHealthCheck(activityTimeout time.Duration) *HealthCheck

NewHealthCheck builds new HealthCheck object with given timeout

func (*HealthCheck) ServeHTTP added in v0.3.0

func (hc *HealthCheck) ServeHTTP(w http.ResponseWriter, r *http.Request)

ServeHTTP implements http.Handler interface to provide a health-check endpoint

func (*HealthCheck) StartMonitoring added in v0.3.0

func (hc *HealthCheck) StartMonitoring()

StartMonitoring activates checks for crane-agent inactivity

func (*HealthCheck) UpdateLastActivity added in v0.3.0

func (hc *HealthCheck) UpdateLastActivity(timestamp time.Time)

UpdateLastActivity updates last time of activity

func (*HealthCheck) UpdateLastConfigUpdate added in v0.3.0

func (hc *HealthCheck) UpdateLastConfigUpdate(timestamp time.Time)

UpdateLastConfigUpdate updates last time of config update

type StepLabel added in v0.3.0

// StepLabel is a string type naming a step. It is the type of the stepName
// parameter taken by this package's update and counter-increment functions.
type StepLabel string

// Predefined StepLabel values.
const (
	StepMain               StepLabel = "main"
	StepCollect            StepLabel = "collect"
	StepAvoid              StepLabel = "avoid"
	StepRestore            StepLabel = "restore"
	StepUpdateConfig       StepLabel = "updateConfig"
	StepUpdateNodeResource StepLabel = "updateNodeResource"
	StepUpdatePodResource  StepLabel = "updatePodResource"

	// Steps for the pod resource manager.
	StepGetPeriod   StepLabel = "getPeriod"
	StepUpdateQuota StepLabel = "updateQuota"

	StepGetExtResourceRecommended StepLabel = "getExtResourceRecommended"
)

type SubComponent added in v0.3.0

// SubComponent is a string type naming a sub-component. It is the type of the
// subComponent parameter taken by the executor, pod-resource and
// node-resource update functions in this package.
type SubComponent string

// Predefined SubComponent values.
const (
	SubComponentSchedule     SubComponent = "schedule"
	SubComponentThrottle     SubComponent = "throttle"
	SubComponentEvict        SubComponent = "evict"
	SubComponentPodResource  SubComponent = "pod-resource-manager"
	SubComponentNodeResource SubComponent = "node-resource-manager"
)

type TspMetricCollector

type TspMetricCollector struct {
	client.Client
	// contains filtered or unexported fields
}

func NewTspMetricCollector

func NewTspMetricCollector(client client.Client) *TspMetricCollector

func (*TspMetricCollector) Collect

func (c *TspMetricCollector) Collect(ch chan<- prometheus.Metric)

func (*TspMetricCollector) Describe

func (c *TspMetricCollector) Describe(ch chan<- *prometheus.Desc)

Why implement a custom Prometheus collector? Time series predictions carry future timestamps, and a custom collector can push those timestamps to Prometheus explicitly. With the default Prometheus metric instrumentation, a scrape uses its own scrape timestamp, so the predicted time series could end up with wrong timestamps in Prometheus.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL