metrics

package
v0.11.0-devel Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Dec 16, 2024 License: Apache-2.0 Imports: 8 Imported by: 0

Documentation

Index

Constants

View Source
const (
	// AdmissionResultSuccess and AdmissionResultInadmissible are the
	// AdmissionResult values. Their strings match the 'success' and
	// 'inadmissible' values documented for the 'result' label in the Help
	// text of AdmissionAttemptsTotal.
	AdmissionResultSuccess      AdmissionResult = "success"
	AdmissionResultInadmissible AdmissionResult = "inadmissible"

	// PendingStatusActive and PendingStatusInadmissible are untyped string
	// constants. Their strings match the "active" and "inadmissible" values
	// documented for the 'status' label in the Help text of PendingWorkloads
	// and LocalQueuePendingWorkloads.
	PendingStatusActive       = "active"
	PendingStatusInadmissible = "inadmissible"

	// CQStatusPending means the ClusterQueue is accepted but not yet active.
	// This can be because of:
	// - a missing ResourceFlavor referenced by the ClusterQueue
	// - a missing or inactive AdmissionCheck referenced by the ClusterQueue
	// - the ClusterQueue is stopped
	// In this state, the ClusterQueue can't admit new workloads and its quota can't be borrowed
	// by other active ClusterQueues in the cohort.
	CQStatusPending ClusterQueueStatus = "pending"
	// CQStatusActive means the ClusterQueue can admit new workloads and its quota
	// can be borrowed by other ClusterQueues in the cohort.
	CQStatusActive ClusterQueueStatus = "active"
	// CQStatusTerminating means the ClusterQueue is pending deletion.
	CQStatusTerminating ClusterQueueStatus = "terminating"
)

Variables

View Source
var (
	// CQStatuses lists every ClusterQueueStatus constant declared in this
	// package: CQStatusPending, CQStatusActive and CQStatusTerminating.
	// NOTE(review): presumably iterated when reporting the cluster_queue_status
	// gauge so that each status gets a series — confirm against
	// ReportClusterQueueStatus.
	CQStatuses = []ClusterQueueStatus{CQStatusPending, CQStatusActive, CQStatusTerminating}

	// AdmissionAttemptsTotal is a counter vector for the
	// "admission_attempts_total" metric in the constants.KueueName subsystem.
	// It counts admission attempts and is partitioned by the "result" label,
	// whose documented values are 'success' and 'inadmissible'.
	AdmissionAttemptsTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Subsystem: constants.KueueName,
			Name:      "admission_attempts_total",
			Help: `The total number of attempts to admit workloads.
Each admission attempt might try to admit more than one workload.
The label 'result' can have the following values:
- 'success' means that at least one workload was admitted.
- 'inadmissible' means that no workload was admitted.`,
		}, []string{"result"},
	)

	// AdmissionCyclePreemptionSkips is a gauge vector for the
	// "admission_cycle_preemption_skips" metric in the constants.KueueName
	// subsystem, labelled by "cluster_queue". Per its Help text it reports the
	// Workloads that had preemption candidates but were skipped because other
	// ClusterQueues needed the same resources in the same cycle.
	AdmissionCyclePreemptionSkips = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: constants.KueueName,
			Name:      "admission_cycle_preemption_skips",
			Help: "The number of Workloads in the ClusterQueue that got preemption candidates " +
				"but had to be skipped because other ClusterQueues needed the same resources in the same cycle",
		}, []string{"cluster_queue"},
	)

	// PendingWorkloads is a gauge vector for the "pending_workloads" metric in
	// the constants.KueueName subsystem, labelled by "cluster_queue" and
	// "status". The documented "status" values, "active" and "inadmissible",
	// match PendingStatusActive and PendingStatusInadmissible.
	PendingWorkloads = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: constants.KueueName,
			Name:      "pending_workloads",
			Help: `The number of pending workloads, per 'cluster_queue' and 'status'.
'status' can have the following values:
- "active" means that the workloads are in the admission queue.
- "inadmissible" means there was a failed admission attempt for these workloads and they won't be retried until cluster conditions, which could make this workload admissible, change`,
		}, []string{"cluster_queue", "status"},
	)

	// LocalQueuePendingWorkloads is a gauge vector for the
	// "local_queue_pending_workloads" metric in the constants.KueueName
	// subsystem. Its labels are "name", "namespace" and "status"; there is no
	// single 'local_queue' label even though the Help text uses that term.
	LocalQueuePendingWorkloads = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: constants.KueueName,
			Name:      "local_queue_pending_workloads",
			Help: `The number of pending workloads, per 'local_queue' and 'status'.
'status' can have the following values:
- "active" means that the workloads are in the admission queue.
- "inadmissible" means there was a failed admission attempt for these workloads and they won't be retried until cluster conditions, which could make this workload admissible, change`,
		}, []string{"name", "namespace", "status"},
	)

	// QuotaReservedWorkloadsTotal is a counter vector for the
	// "quota_reserved_workloads_total" metric in the constants.KueueName
	// subsystem, labelled by "cluster_queue". It counts quota reserved
	// workloads.
	QuotaReservedWorkloadsTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Subsystem: constants.KueueName,
			Name:      "quota_reserved_workloads_total",
			Help:      "The total number of quota reserved workloads per 'cluster_queue'",
		}, []string{"cluster_queue"},
	)

	// LocalQueueQuotaReservedWorkloadsTotal is a counter vector for the
	// "local_queue_quota_reserved_workloads_total" metric in the
	// constants.KueueName subsystem. It counts quota reserved workloads and is
	// labelled by "name" and "namespace".
	LocalQueueQuotaReservedWorkloadsTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Subsystem: constants.KueueName,
			Name:      "local_queue_quota_reserved_workloads_total",
			Help:      "The total number of quota reserved workloads per 'local_queue'",
		}, []string{"name", "namespace"},
	)

	// AdmittedWorkloadsTotal is a counter vector for the
	// "admitted_workloads_total" metric in the constants.KueueName subsystem,
	// labelled by "cluster_queue". It counts admitted workloads.
	AdmittedWorkloadsTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Subsystem: constants.KueueName,
			Name:      "admitted_workloads_total",
			Help:      "The total number of admitted workloads per 'cluster_queue'",
		}, []string{"cluster_queue"},
	)

	// LocalQueueAdmittedWorkloadsTotal is a counter vector for the
	// "local_queue_admitted_workloads_total" metric in the constants.KueueName
	// subsystem. It counts admitted workloads and is labelled by "name" and
	// "namespace".
	LocalQueueAdmittedWorkloadsTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Subsystem: constants.KueueName,
			Name:      "local_queue_admitted_workloads_total",
			Help:      "The total number of admitted workloads per 'local_queue'",
		}, []string{"name", "namespace"},
	)

	// EvictedWorkloadsTotal is a counter vector for the
	// "evicted_workloads_total" metric in the constants.KueueName subsystem,
	// labelled by "cluster_queue" and "reason". The Help text enumerates the
	// documented "reason" values.
	EvictedWorkloadsTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Subsystem: constants.KueueName,
			Name:      "evicted_workloads_total",
			Help: `The number of evicted workloads per 'cluster_queue',
The label 'reason' can have the following values:
- "Preempted" means that the workload was evicted in order to free resources for a workload with a higher priority or reclamation of nominal quota.
- "PodsReadyTimeout" means that the eviction took place due to a PodsReady timeout.
- "AdmissionCheck" means that the workload was evicted because at least one admission check transitioned to False.
- "ClusterQueueStopped" means that the workload was evicted because the ClusterQueue is stopped.
- "Deactivated" means that the workload was evicted because spec.active is set to false`,
		}, []string{"cluster_queue", "reason"},
	)

	// LocalQueueEvictedWorkloadsTotal is a counter vector for the
	// "local_queue_evicted_workloads_total" metric in the constants.KueueName
	// subsystem, labelled by "name", "namespace" and "reason". The Help text
	// lists the same "reason" values as EvictedWorkloadsTotal.
	LocalQueueEvictedWorkloadsTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Subsystem: constants.KueueName,
			Name:      "local_queue_evicted_workloads_total",
			Help: `The number of evicted workloads per 'local_queue',
The label 'reason' can have the following values:
- "Preempted" means that the workload was evicted in order to free resources for a workload with a higher priority or reclamation of nominal quota.
- "PodsReadyTimeout" means that the eviction took place due to a PodsReady timeout.
- "AdmissionCheck" means that the workload was evicted because at least one admission check transitioned to False.
- "ClusterQueueStopped" means that the workload was evicted because the ClusterQueue is stopped.
- "Deactivated" means that the workload was evicted because spec.active is set to false`,
		}, []string{"name", "namespace", "reason"},
	)

	// PreemptedWorkloadsTotal is a counter vector for the
	// "preempted_workloads_total" metric in the constants.KueueName subsystem,
	// labelled by "preempting_cluster_queue" and "reason". The
	// ClusterQueue label names the preempting queue, not the queue whose
	// workload was preempted. The Help text enumerates the documented "reason"
	// values.
	PreemptedWorkloadsTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Subsystem: constants.KueueName,
			Name:      "preempted_workloads_total",
			Help: `The number of preempted workloads per 'preempting_cluster_queue',
The label 'reason' can have the following values:
- "InClusterQueue" means that the workload was preempted by a workload in the same ClusterQueue.
- "InCohortReclamation" means that the workload was preempted by a workload in the same cohort due to reclamation of nominal quota.
- "InCohortFairSharing" means that the workload was preempted by a workload in the same cohort due to fair sharing.
- "InCohortReclaimWhileBorrowing" means that the workload was preempted by a workload in the same cohort due to reclamation of nominal quota while borrowing.`,
		}, []string{"preempting_cluster_queue", "reason"},
	)

	// ReservingActiveWorkloads is a gauge vector for the
	// "reserving_active_workloads" metric in the constants.KueueName
	// subsystem, labelled by "cluster_queue". It reports the number of
	// Workloads that are reserving quota.
	ReservingActiveWorkloads = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: constants.KueueName,
			Name:      "reserving_active_workloads",
			Help:      "The number of Workloads that are reserving quota, per 'cluster_queue'",
		}, []string{"cluster_queue"},
	)

	// LocalQueueReservingActiveWorkloads is a gauge vector for the
	// "local_queue_reserving_active_workloads" metric in the
	// constants.KueueName subsystem. It reports the number of Workloads that
	// are reserving quota and is labelled by "name" and "namespace".
	LocalQueueReservingActiveWorkloads = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: constants.KueueName,
			Name:      "local_queue_reserving_active_workloads",
			Help:      "The number of Workloads that are reserving quota, per 'localQueue'",
		}, []string{"name", "namespace"},
	)

	// AdmittedActiveWorkloads is a gauge vector for the
	// "admitted_active_workloads" metric in the constants.KueueName subsystem,
	// labelled by "cluster_queue". Per its Help text, "active" here means
	// unsuspended and not finished.
	AdmittedActiveWorkloads = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: constants.KueueName,
			Name:      "admitted_active_workloads",
			Help:      "The number of admitted Workloads that are active (unsuspended and not finished), per 'cluster_queue'",
		}, []string{"cluster_queue"},
	)

	// LocalQueueAdmittedActiveWorkloads is a gauge vector for the
	// "local_queue_admitted_active_workloads" metric in the
	// constants.KueueName subsystem, labelled by "name" and "namespace". Per
	// its Help text, "active" here means unsuspended and not finished.
	LocalQueueAdmittedActiveWorkloads = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: constants.KueueName,
			Name:      "local_queue_admitted_active_workloads",
			Help:      "The number of admitted Workloads that are active (unsuspended and not finished), per 'localQueue'",
		}, []string{"name", "namespace"},
	)

	// ClusterQueueByStatus is a gauge vector for the "cluster_queue_status"
	// metric in the constants.KueueName subsystem, labelled by "cluster_queue"
	// and "status". The "status" values are the ClusterQueueStatus constants
	// declared in this package: "pending", "active" and "terminating". Per its
	// Help text, exactly one status reports 1 for a given ClusterQueue.
	ClusterQueueByStatus = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: constants.KueueName,
			Name:      "cluster_queue_status",
			// The listed values must match CQStatusPending, CQStatusActive and
			// CQStatusTerminating; the last one is "terminating", not "terminated".
			Help: `Reports 'cluster_queue' with its 'status' (with possible values 'pending', 'active' or 'terminating').
For a ClusterQueue, the metric only reports a value of 1 for one of the statuses.`,
		}, []string{"cluster_queue", "status"},
	)

	// LocalQueueByStatus is a gauge vector for the "local_queue_status" metric
	// in the constants.KueueName subsystem, labelled by "name", "namespace"
	// and "active". Per its Help text the "active" label takes the values
	// 'True', 'False' or 'Unknown', and exactly one of them reports 1 for a
	// given LocalQueue.
	LocalQueueByStatus = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: constants.KueueName,
			Name:      "local_queue_status",
			Help: `Reports 'localQueue' with its 'active' status (with possible values 'True', 'False', or 'Unknown').
For a LocalQueue, the metric only reports a value of 1 for one of the statuses.`,
		}, []string{"name", "namespace", "active"},
	)

	// ClusterQueueResourceReservations is a gauge vector for the
	// "cluster_queue_resource_reservation" metric in the constants.KueueName
	// subsystem. It reports resource reservation and is labelled by "cohort",
	// "cluster_queue", "flavor" and "resource".
	ClusterQueueResourceReservations = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: constants.KueueName,
			Name:      "cluster_queue_resource_reservation",
			Help:      `Reports the cluster_queue's total resource reservation within all the flavors`,
		}, []string{"cohort", "cluster_queue", "flavor", "resource"},
	)

	// ClusterQueueResourceUsage is a gauge vector for the
	// "cluster_queue_resource_usage" metric in the constants.KueueName
	// subsystem. It reports resource usage and is labelled by "cohort",
	// "cluster_queue", "flavor" and "resource".
	ClusterQueueResourceUsage = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: constants.KueueName,
			Name:      "cluster_queue_resource_usage",
			Help:      `Reports the cluster_queue's total resource usage within all the flavors`,
		}, []string{"cohort", "cluster_queue", "flavor", "resource"},
	)

	// LocalQueueResourceReservations is a gauge vector for the
	// "local_queue_resource_reservation" metric in the constants.KueueName
	// subsystem. It reports resource reservation and is labelled by "name",
	// "namespace", "flavor" and "resource".
	LocalQueueResourceReservations = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: constants.KueueName,
			Name:      "local_queue_resource_reservation",
			Help:      `Reports the localQueue's total resource reservation within all the flavors`,
		}, []string{"name", "namespace", "flavor", "resource"},
	)

	// LocalQueueResourceUsage is a gauge vector for the
	// "local_queue_resource_usage" metric in the constants.KueueName
	// subsystem. It reports resource usage and is labelled by "name",
	// "namespace", "flavor" and "resource".
	LocalQueueResourceUsage = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: constants.KueueName,
			Name:      "local_queue_resource_usage",
			Help:      `Reports the localQueue's total resource usage within all the flavors`,
		}, []string{"name", "namespace", "flavor", "resource"},
	)

	// ClusterQueueResourceNominalQuota is a gauge vector for the
	// "cluster_queue_nominal_quota" metric in the constants.KueueName
	// subsystem. It reports the nominal quota and is labelled by "cohort",
	// "cluster_queue", "flavor" and "resource".
	ClusterQueueResourceNominalQuota = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: constants.KueueName,
			Name:      "cluster_queue_nominal_quota",
			Help:      `Reports the cluster_queue's resource nominal quota within all the flavors`,
		}, []string{"cohort", "cluster_queue", "flavor", "resource"},
	)

	// ClusterQueueResourceBorrowingLimit is a gauge vector for the
	// "cluster_queue_borrowing_limit" metric in the constants.KueueName
	// subsystem. It reports the borrowing limit and is labelled by "cohort",
	// "cluster_queue", "flavor" and "resource".
	ClusterQueueResourceBorrowingLimit = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: constants.KueueName,
			Name:      "cluster_queue_borrowing_limit",
			Help:      `Reports the cluster_queue's resource borrowing limit within all the flavors`,
		}, []string{"cohort", "cluster_queue", "flavor", "resource"},
	)

	// ClusterQueueResourceLendingLimit is a gauge vector for the
	// "cluster_queue_lending_limit" metric in the constants.KueueName
	// subsystem. It reports the lending limit and is labelled by "cohort",
	// "cluster_queue", "flavor" and "resource".
	ClusterQueueResourceLendingLimit = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: constants.KueueName,
			Name:      "cluster_queue_lending_limit",
			Help:      `Reports the cluster_queue's resource lending limit within all the flavors`,
		}, []string{"cohort", "cluster_queue", "flavor", "resource"},
	)

	// ClusterQueueWeightedShare is a gauge vector for the
	// "cluster_queue_weighted_share" metric in the constants.KueueName
	// subsystem, labelled by "cluster_queue". Per its Help text, zero means
	// usage is below the nominal quota, and a ClusterQueue with weight zero
	// reports 9223372036854775807 (the maximum int64 value).
	ClusterQueueWeightedShare = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: constants.KueueName,
			Name:      "cluster_queue_weighted_share",
			// Lines inside the raw string carry no trailing whitespace, because
			// every byte of it is exposed verbatim as the metric's help text.
			Help: `Reports a value representing the maximum of the ratios of usage above nominal
quota to the lendable resources in the cohort, among all the resources provided by
the ClusterQueue, and divided by the weight.
If zero, it means that the usage of the ClusterQueue is below the nominal quota.
If the ClusterQueue has a weight of zero, this will return 9223372036854775807,
the maximum possible share value.`,
		}, []string{"cluster_queue"},
	)
)

Functions

func AdmissionAttempt

func AdmissionAttempt(result AdmissionResult, duration time.Duration)

func AdmissionChecksWaitTime added in v0.7.0

func AdmissionChecksWaitTime(cqName kueue.ClusterQueueReference, waitTime time.Duration)

func AdmittedWorkload

func AdmittedWorkload(cqName kueue.ClusterQueueReference, waitTime time.Duration)

func ClearCacheMetrics

func ClearCacheMetrics(cqName string)

func ClearClusterQueueMetrics added in v0.8.1

func ClearClusterQueueMetrics(cqName string)

func ClearClusterQueueResourceMetrics added in v0.5.0

func ClearClusterQueueResourceMetrics(cqName string)

func ClearClusterQueueResourceQuotas added in v0.5.0

func ClearClusterQueueResourceQuotas(cqName, flavor, resource string)

func ClearClusterQueueResourceReservations added in v0.5.0

func ClearClusterQueueResourceReservations(cqName, flavor, resource string)

func ClearClusterQueueResourceUsage added in v0.5.0

func ClearClusterQueueResourceUsage(cqName, flavor, resource string)

func ClearLocalQueueCacheMetrics added in v0.10.0

func ClearLocalQueueCacheMetrics(lq LocalQueueReference)

func ClearLocalQueueMetrics added in v0.10.0

func ClearLocalQueueMetrics(lq LocalQueueReference)

func ClearLocalQueueResourceMetrics added in v0.10.0

func ClearLocalQueueResourceMetrics(lq LocalQueueReference)

func LocalQueueAdmissionChecksWaitTime added in v0.10.0

func LocalQueueAdmissionChecksWaitTime(lq LocalQueueReference, waitTime time.Duration)

func LocalQueueAdmittedWorkload added in v0.10.0

func LocalQueueAdmittedWorkload(lq LocalQueueReference, waitTime time.Duration)

func LocalQueueQuotaReservedWorkload added in v0.10.0

func LocalQueueQuotaReservedWorkload(lq LocalQueueReference, waitTime time.Duration)

func QuotaReservedWorkload added in v0.7.0

func QuotaReservedWorkload(cqName kueue.ClusterQueueReference, waitTime time.Duration)

func Register

func Register()

func RegisterLQMetrics added in v0.10.0

func RegisterLQMetrics()

func ReportClusterQueueQuotas added in v0.5.0

func ReportClusterQueueQuotas(cohort, queue, flavor, resource string, nominal, borrowing, lending float64)

func ReportClusterQueueResourceReservations added in v0.5.0

func ReportClusterQueueResourceReservations(cohort, queue, flavor, resource string, usage float64)

func ReportClusterQueueResourceUsage added in v0.5.0

func ReportClusterQueueResourceUsage(cohort, queue, flavor, resource string, usage float64)

func ReportClusterQueueStatus

func ReportClusterQueueStatus(cqName string, cqStatus ClusterQueueStatus)

func ReportClusterQueueWeightedShare added in v0.7.0

func ReportClusterQueueWeightedShare(cq string, weightedShare int64)

func ReportEvictedWorkloads added in v0.7.0

func ReportEvictedWorkloads(cqName, reason string)

func ReportLocalQueueEvictedWorkloads added in v0.10.0

func ReportLocalQueueEvictedWorkloads(lq LocalQueueReference, reason string)

func ReportLocalQueuePendingWorkloads added in v0.10.0

func ReportLocalQueuePendingWorkloads(lq LocalQueueReference, active, inadmissible int)

func ReportLocalQueueResourceReservations added in v0.10.0

func ReportLocalQueueResourceReservations(lq LocalQueueReference, flavor, resource string, usage float64)

func ReportLocalQueueResourceUsage added in v0.10.0

func ReportLocalQueueResourceUsage(lq LocalQueueReference, flavor, resource string, usage float64)

func ReportLocalQueueStatus added in v0.10.0

func ReportLocalQueueStatus(lq LocalQueueReference, conditionStatus metav1.ConditionStatus)

func ReportPendingWorkloads

func ReportPendingWorkloads(cqName string, active, inadmissible int)

func ReportPreemption added in v0.8.0

func ReportPreemption(preemptingCqName, preemptingReason, targetCqName string)

Types

type AdmissionResult

// AdmissionResult is the string type of the AdmissionResultSuccess
// ("success") and AdmissionResultInadmissible ("inadmissible") constants, and
// the type of the result argument of AdmissionAttempt.
type AdmissionResult string

type ClusterQueueStatus

// ClusterQueueStatus is the string type of the CQStatusPending ("pending"),
// CQStatusActive ("active") and CQStatusTerminating ("terminating") constants,
// and the type of the cqStatus argument of ReportClusterQueueStatus.
type ClusterQueueStatus string

type LocalQueueReference added in v0.10.0

// LocalQueueReference pairs a Name with a Namespace. It is the argument type
// of the LocalQueue metric functions in this package and is returned by
// LQRefFromLocalQueueKey and LQRefFromWorkload.
// NOTE(review): presumably its fields supply the "name" and "namespace" label
// values of the local_queue_* metric vectors — confirm in the function bodies.
type LocalQueueReference struct {
	Name      string // presumably the LocalQueue's name — confirm against the constructors
	Namespace string // presumably the namespace containing the LocalQueue — confirm against the constructors
}

func LQRefFromLocalQueueKey added in v0.10.0

func LQRefFromLocalQueueKey(lqKey string) LocalQueueReference

func LQRefFromWorkload added in v0.10.0

func LQRefFromWorkload(wl *kueue.Workload) LocalQueueReference

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL