metrics

package
v1.17.0-pre.3 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Dec 2, 2024 License: Apache-2.0 Imports: 40 Imported by: 105

Documentation

Overview

Package metrics holds prometheus metrics objects and related utility functions. It does not abstract away the prometheus client but the caller rarely needs to refer to prometheus directly.

Index

Constants

View Source
const (
	// ErrorTimeout is the value used to notify timeout errors.
	ErrorTimeout = "timeout"

	// ErrorProxy is the value used to notify errors on Proxy.
	ErrorProxy = "proxy"

	// L7DNS is the value used to report the DNS label on metrics.
	L7DNS = "dns"

	// SubsystemBPF is the subsystem to scope metrics related to the bpf syscalls.
	SubsystemBPF = "bpf"

	// SubsystemDatapath is the subsystem to scope metrics related to management of
	// the datapath. It is prepended to metric names and separated with a '_'.
	SubsystemDatapath = "datapath"

	// SubsystemAgent is the subsystem to scope metrics related to the cilium agent itself.
	SubsystemAgent = "agent"

	// SubsystemIPCache is the subsystem to scope metrics related to the ipcache.
	SubsystemIPCache = "ipcache"

	// SubsystemK8s is the subsystem to scope metrics related to Kubernetes.
	SubsystemK8s = "k8s"

	// SubsystemK8sClient is the subsystem to scope metrics related to the kubernetes client.
	SubsystemK8sClient = "k8s_client"

	// SubsystemWorkQueue is the subsystem to scope metrics related to the workqueue.
	SubsystemWorkQueue = "k8s_workqueue"

	// SubsystemKVStore is the subsystem to scope metrics related to the kvstore.
	SubsystemKVStore = "kvstore"

	// SubsystemFQDN is the subsystem to scope metrics related to the FQDN proxy.
	SubsystemFQDN = "fqdn"

	// SubsystemNodes is the subsystem to scope metrics related to the node manager.
	SubsystemNodes = "nodes"

	// SubsystemTriggers is the subsystem to scope metrics related to the trigger package.
	SubsystemTriggers = "triggers"

	// SubsystemAPILimiter is the subsystem to scope metrics related to the API limiter package.
	SubsystemAPILimiter = "api_limiter"

	// CiliumAgentNamespace is used to scope metrics from the Cilium Agent.
	CiliumAgentNamespace = "cilium"

	// CiliumClusterMeshAPIServerNamespace is used to scope metrics from the
	// Cilium Cluster Mesh API Server.
	CiliumClusterMeshAPIServerNamespace = "cilium_clustermesh_apiserver"

	// CiliumKVStoreMeshNamespace is used to scope metrics from
	// Cilium KVStoreMesh.
	CiliumKVStoreMeshNamespace = "cilium_kvstoremesh"

	// CiliumOperatorNamespace is used to scope metrics from the Cilium Operator.
	CiliumOperatorNamespace = "cilium_operator"

	// LabelError indicates the type of error (string).
	LabelError = "error"

	// LabelOutcome indicates whether the outcome of the operation was successful or not.
	LabelOutcome = "outcome"

	// LabelAttempts is the number of attempts it took to complete the operation.
	LabelAttempts = "attempts"

	// LabelValueTrue is the string value for true metric label values.
	LabelValueTrue = "true"

	// LabelValueFalse is the string value for false metric label values.
	LabelValueFalse = "false"

	// LabelValueOutcomeSuccess is used as a successful outcome of an operation.
	LabelValueOutcomeSuccess = "success"

	// LabelValueOutcomeFail is used as an unsuccessful outcome of an operation.
	LabelValueOutcomeFail = "fail"

	// LabelValueOutcomeFailure is used as an unsuccessful outcome of an operation.
	// NOTE: This should only be used for existing metrics, new metrics should use LabelValueOutcomeFail.
	LabelValueOutcomeFailure = "failure"

	// LabelDropReason is used to describe the reason for dropping packets/bytes.
	LabelDropReason = "reason"

	// LabelEventSourceAPI marks event-related metrics that come from the API.
	LabelEventSourceAPI = "api"

	// LabelEventSourceK8s marks event-related metrics that come from k8s.
	LabelEventSourceK8s = "k8s"

	// LabelEventSourceFQDN marks event-related metrics that come from pkg/fqdn.
	LabelEventSourceFQDN = "fqdn"

	// LabelEventSourceContainerd marks event-related metrics that come from docker.
	// NOTE(review): the identifier says containerd while the label value is
	// "docker"; presumably kept for compatibility - confirm before changing.
	LabelEventSourceContainerd = "docker"

	// LabelDatapathArea marks which area the metrics are related to (eg, which BPF map).
	LabelDatapathArea = "area"

	// LabelDatapathName marks a unique identifier for this metric.
	// The name should be defined once for a given type of error.
	LabelDatapathName = "name"

	// LabelDatapathFamily marks which protocol family (IPv4, IPV6) the metric is related to.
	LabelDatapathFamily = "family"

	// LabelProtocol marks the L4 protocol (TCP, ANY) for the metric.
	LabelProtocol = "protocol"

	// LabelSignalType marks the signal name.
	LabelSignalType = "signal"

	// LabelSignalData marks the signal data.
	LabelSignalData = "data"

	// LabelStatus is the label from a completed task.
	LabelStatus = "status"

	// LabelPolicyEnforcement is the label used to see the enforcement status.
	LabelPolicyEnforcement = "enforcement"

	// LabelPolicySource is the label for the source of a policy.
	// NOTE(review): the previous comment was a copy of LabelPolicyEnforcement's;
	// confirm the exact meaning of "source" against the metrics using it.
	LabelPolicySource = "source"

	// LabelSource is the label named "source". It shares its value with
	// LabelPolicySource and LabelEventSource.
	LabelSource = "source"

	// LabelScope is the label used to define multiple scopes in the same
	// metric. For example, one counter may measure a metric over the scope of
	// the entire event (scope=global), or just part of an event
	// (scope=slow_path).
	LabelScope = "scope"

	// LabelProtocolL7 is the label used when working with layer 7 protocols.
	LabelProtocolL7 = "protocol_l7"

	// LabelBuildState is the state a build queue entry is in.
	LabelBuildState = "state"

	// LabelBuildQueueName is the name of the build queue.
	LabelBuildQueueName = "name"

	// LabelAction is the label used to define what kind of action was performed in a metric.
	LabelAction = "action"

	// LabelSubsystem is the label used to refer to any of the child processes
	// started by cilium (Envoy, monitor, etc..).
	LabelSubsystem = "subsystem"

	// LabelKind is the kind of a label.
	LabelKind = "kind"

	// LabelEventSource is the source of a label for event metrics
	// i.e. k8s, containerd, api.
	LabelEventSource = "source"

	// LabelPath is the label for the API path.
	LabelPath = "path"
	// LabelMethod is the label for the HTTP method.
	LabelMethod = "method"

	// LabelAPIReturnCode is the HTTP code returned for that API path.
	LabelAPIReturnCode = "return_code"

	// LabelOperation is the label for BPF maps operations.
	LabelOperation = "operation"

	// LabelMapName is the label for the BPF map name.
	LabelMapName = "map_name"

	// LabelMapGroup is the label for the BPF map group.
	LabelMapGroup = "map_group"

	// LabelVersion is the label for the version number.
	LabelVersion = "version"

	// LabelVersionRevision is the label for the version revision.
	LabelVersionRevision = "revision"

	// LabelArch is the label for the platform architecture (e.g. linux/amd64).
	LabelArch = "arch"

	// LabelDirection is the label for traffic direction.
	LabelDirection = "direction"

	// LabelSourceCluster is the label for source cluster name.
	LabelSourceCluster = "source_cluster"

	// LabelSourceNodeName is the label for source node name.
	LabelSourceNodeName = "source_node_name"

	// LabelTargetCluster is the label for target cluster name.
	LabelTargetCluster = "target_cluster"

	// LabelTargetNodeIP is the label for target node IP.
	LabelTargetNodeIP = "target_node_ip"

	// LabelTargetNodeName is the label for target node name.
	LabelTargetNodeName = "target_node_name"

	// LabelTargetNodeType is the label for target node type (local_node, remote_intra_cluster, vs remote_inter_cluster).
	LabelTargetNodeType = "target_node_type"

	// LabelLocationLocalNode, LabelLocationRemoteIntraCluster and
	// LabelLocationRemoteInterCluster are the node type values listed in the
	// LabelTargetNodeType comment.
	LabelLocationLocalNode          = "local_node"
	LabelLocationRemoteIntraCluster = "remote_intra_cluster"
	LabelLocationRemoteInterCluster = "remote_inter_cluster"

	// LabelL7Rule is the label for an L7 rule name.
	LabelL7Rule = "rule"

	// LabelL7ProxyType is the label for denoting a L7 proxy type.
	LabelL7ProxyType = "proxy_type"

	// LabelType is the label for type in general (e.g. endpoint, node).
	// LabelPeerEndpoint and LabelPeerNode carry the two example values;
	// presumably they are used as LabelType values - confirm against callers.
	LabelType         = "type"
	LabelPeerEndpoint = "endpoint"
	LabelPeerNode     = "node"

	// LabelTrafficHTTP and LabelTrafficICMP are label values naming a kind of
	// traffic. Which label they are paired with is not visible here.
	LabelTrafficHTTP = "http"
	LabelTrafficICMP = "icmp"

	// LabelAddressType is the label for the address type.
	// LabelAddressTypePrimary and LabelAddressTypeSecondary are presumably its
	// possible values - confirm against callers.
	LabelAddressType          = "address_type"
	LabelAddressTypePrimary   = "primary"
	LabelAddressTypeSecondary = "secondary"

	// LabelConnectivityStatus is the label for connectivity statuses.
	// LabelReachable, LabelUnreachable and LabelUnknown are presumably its
	// possible values - confirm against callers.
	LabelConnectivityStatus = "status"
	LabelReachable          = "reachable"
	LabelUnreachable        = "unreachable"
	LabelUnknown            = "unknown"
)
View Source
// DefaultMapCapacity is the default BPF map capacity assumed when emitting
// map capacity metrics: 2^16 (65536) entries, used for every map unless
// specified otherwise.
const DefaultMapCapacity = 65536

In general, most bpf maps are allocated to occupy a 16-bit key size. To reduce the number of metrics that need to be emitted for map capacity, we assume a default map size of 2^16 entries for all maps, which can be assumed unless specified otherwise.

Variables

View Source
// The NoOp* variables are placeholder implementations of the prometheus and
// metricpkg interfaces. The package-level metric variables of this package
// are initialised to them (e.g. NoOpCounterVec, NoOpGaugeVec) until real
// metrics are assigned.
// NOTE(review): the backing types (mockMetric, collector, counter, ...) are
// unexported and not shown here; presumably every operation on them does
// nothing - confirm.
var (
	// NoOpMetric and NoOpCollector are the base placeholders that the typed
	// placeholders below are built from.
	NoOpMetric    prometheus.Metric    = &mockMetric{}
	NoOpCollector prometheus.Collector = &collector{}

	// Typed placeholders, one per metric interface used in this package.
	NoOpCounter           metricpkg.Counter                       = &counter{NoOpMetric, NoOpCollector}
	NoOpCounterVec        metricpkg.Vec[metricpkg.Counter]        = &counterVec{NoOpCollector}
	NoOpObserver          metricpkg.Observer                      = &observer{}
	NoOpHistogram         metricpkg.Histogram                     = &histogram{NoOpCollector}
	NoOpObserverVec       metricpkg.Vec[metricpkg.Observer]       = &observerVec{NoOpCollector}
	NoOpGauge             metricpkg.Gauge                         = &gauge{NoOpMetric, NoOpCollector}
	NoOpGaugeVec          metricpkg.Vec[metricpkg.Gauge]          = &gaugeVec{NoOpCollector}
	NoOpGaugeDeletableVec metricpkg.DeletableVec[metricpkg.Gauge] = &gaugeDeletableVec{gaugeVec{NoOpCollector}}
)
View Source
var (
	// LabelValuesBool is metric label value set for boolean type.
	LabelValuesBool = metric.NewValues(LabelValueTrue, LabelValueFalse)

	// Namespace is used to scope metrics from cilium. It is prepended to metric
	// names and separated with a '_'.
	Namespace = CiliumAgentNamespace

	// BPFMapPressure defaults to true.
	// NOTE(review): presumably toggles the BPF map pressure metric (see
	// NewBPFMapPressureGauge) - confirm against its readers.
	BPFMapPressure = true

	// BootstrapTimes is the durations of cilium-agent bootstrap sequence.
	BootstrapTimes = NoOpObserverVec

	// APIInteractions is the total time taken to process an API call made
	// to the cilium-agent.
	APIInteractions = NoOpObserverVec

	// NodeConnectivityStatus is the connectivity status between local node to
	// other node intra or inter cluster.
	NodeConnectivityStatus = NoOpGaugeDeletableVec

	// NodeConnectivityLatency is the connectivity latency between local node to
	// other node intra or inter cluster.
	NodeConnectivityLatency = NoOpGaugeDeletableVec

	// NodeHealthConnectivityStatus is the number of connections with connectivity status
	// between local node to other node intra or inter cluster.
	NodeHealthConnectivityStatus = NoOpGaugeVec

	// NodeHealthConnectivityLatency is the histogram connectivity latency between local node to
	// other node intra or inter cluster.
	NodeHealthConnectivityLatency = NoOpObserverVec

	// Endpoint is a function used to collect this metric.
	// It must be thread-safe.
	Endpoint metric.GaugeFunc

	// EndpointMaxIfindex is the maximum observed interface index for existing endpoints.
	EndpointMaxIfindex = NoOpGauge

	// EndpointRegenerationTotal is a count of the number of times any endpoint
	// has been regenerated and success/fail outcome.
	EndpointRegenerationTotal = NoOpCounterVec

	// EndpointStateCount is the total count of the endpoints in various states.
	EndpointStateCount = NoOpGaugeVec

	// EndpointRegenerationTimeStats is the total time taken to regenerate
	// endpoints, labeled by span name and status ("success" or "failure").
	EndpointRegenerationTimeStats = NoOpObserverVec

	// EndpointPropagationDelay is the delay between creation of local CiliumEndpoint
	// and update for that CiliumEndpoint received through CiliumEndpointSlice.
	// Measure of local CEP roundtrip time with CiliumEndpointSlice feature enabled.
	EndpointPropagationDelay = NoOpObserverVec

	// Policy is the number of policies loaded into the agent.
	Policy = NoOpGauge

	// PolicyRegenerationCount is the total number of successful policy
	// regenerations.
	// Deprecated: Use EndpointRegenerationTotal.
	PolicyRegenerationCount = NoOpCounter

	// PolicyRegenerationTimeStats is the total time taken to generate policies.
	// Deprecated: Use EndpointRegenerationTimeStats.
	PolicyRegenerationTimeStats = NoOpObserverVec

	// PolicyRevision is the current policy revision number for this agent.
	PolicyRevision = NoOpGauge

	// PolicyChangeTotal is a count of policy changes by outcome ("success" or
	// "failure").
	PolicyChangeTotal = NoOpCounterVec

	// PolicyEndpointStatus is the number of endpoints with policy labeled by enforcement type.
	PolicyEndpointStatus = NoOpGaugeVec

	// PolicyImplementationDelay is a distribution of times taken from adding a
	// policy (and incrementing the policy revision) to seeing it in the datapath
	// per Endpoint. This reflects the actual delay perceived by traffic flowing
	// through the datapath. The longest times will roughly correlate with the
	// time taken to fully deploy an endpoint.
	PolicyImplementationDelay = NoOpObserverVec

	// CIDRGroupsReferenced is the number of CNPs and CCNPs referencing at least one CiliumCIDRGroup.
	// CNPs with empty or non-existing CIDRGroupRefs are not considered.
	CIDRGroupsReferenced = NoOpGauge

	// Identity is the number of identities currently in use on the node by type.
	Identity = NoOpGaugeVec

	// IdentityLabelSources is the number of identities in use on the node which
	// have a particular label source. Note that an identity may contain labels
	// from multiple sources and thus might be counted in multiple buckets.
	IdentityLabelSources = NoOpGaugeVec

	// EventTS is the time in seconds since epoch that we last received an
	// event that was handled by Cilium. This metric tracks the source of the
	// event which can be one of K8s or Cilium's API.
	EventTS = NoOpGaugeVec

	// EventLagK8s is the lag calculation for k8s Pod events.
	EventLagK8s = NoOpGauge

	// ProxyRedirects is the number of redirects labeled by protocol.
	ProxyRedirects = NoOpGaugeVec

	// ProxyPolicyL7Total is a count of all l7 requests handled by proxy.
	ProxyPolicyL7Total = NoOpCounterVec

	// ProxyUpstreamTime is how long the upstream server took to reply labeled
	// by error, protocol and span time.
	ProxyUpstreamTime = NoOpObserverVec

	// ProxyDatapathUpdateTimeout is a count of all the timeouts encountered while
	// updating the datapath due to an FQDN IP update.
	ProxyDatapathUpdateTimeout = NoOpCounter

	// ConntrackGCRuns is the number of times that the conntrack GC
	// process was run.
	ConntrackGCRuns = NoOpCounterVec

	// ConntrackGCKeyFallbacks is the number of times that the conntrack key fallback was invalid.
	ConntrackGCKeyFallbacks = NoOpCounterVec

	// ConntrackGCSize is the number of entries in the conntrack table.
	ConntrackGCSize = NoOpGaugeVec

	// NatGCSize is the number of entries in the nat table.
	NatGCSize = NoOpGaugeVec

	// ConntrackGCDuration is the duration of the conntrack GC process in milliseconds.
	ConntrackGCDuration = NoOpObserverVec

	// ConntrackDumpResets marks the count for conntrack dump resets.
	ConntrackDumpResets = NoOpCounterVec

	// SignalsHandled is the number of signals received.
	SignalsHandled = NoOpCounterVec

	// ServicesEventsCount counts the number of services.
	ServicesEventsCount = NoOpCounterVec

	// ServiceImplementationDelay is the execution duration of the service handler in milliseconds.
	// The metric reflects the time it took to program the service excluding the event queue latency.
	ServiceImplementationDelay = NoOpObserverVec

	// ErrorsWarnings is the number of errors and warnings in cilium-agent instances.
	ErrorsWarnings = NoOpCounterVec

	// ControllerRuns is the number of times that a controller process runs.
	ControllerRuns = NoOpCounterVec

	// ControllerRunsDuration is the duration of the controller process in seconds.
	ControllerRunsDuration = NoOpObserverVec

	// SubprocessStart counts subprocess starts, labeled by Subsystem.
	SubprocessStart = NoOpCounterVec

	// KubernetesEventProcessed is the number of Kubernetes events
	// processed labeled by scope, action and execution result.
	KubernetesEventProcessed = NoOpCounterVec

	// KubernetesEventReceived is the number of Kubernetes events received
	// labeled by scope, action, valid data and equalness.
	KubernetesEventReceived = NoOpCounterVec

	// KubernetesAPIInteractions is the total time taken to process an API call made
	// to the kube-apiserver.
	KubernetesAPIInteractions = NoOpObserverVec

	// KubernetesAPIRateLimiterLatency is the client side rate limiter latency metric.
	KubernetesAPIRateLimiterLatency = NoOpObserverVec

	// KubernetesAPICallsTotal is the counter for all API calls made to
	// kube-apiserver.
	KubernetesAPICallsTotal = NoOpCounterVec

	// KubernetesCNPStatusCompletion is the number of seconds it takes to
	// complete a CNP status update.
	KubernetesCNPStatusCompletion = NoOpObserverVec

	// TerminatingEndpointsEvents is the number of terminating endpoint events received from kubernetes.
	TerminatingEndpointsEvents = NoOpCounter

	// IPAMEvent is the number of IPAM events received labeled by action and
	// datapath family type.
	IPAMEvent = NoOpCounterVec

	// IPAMCapacity tracks the total number of IPs that could be allocated. To
	// get the current number of available IPs, it would be this metric
	// subtracted by IPAMEvent{allocated}.
	IPAMCapacity = NoOpGaugeVec

	// KVStoreOperationsDuration records the duration of kvstore operations.
	KVStoreOperationsDuration = NoOpObserverVec

	// KVStoreEventsQueueDuration records the duration in seconds of time
	// received event was blocked before it could be queued.
	KVStoreEventsQueueDuration = NoOpObserverVec

	// KVStoreQuorumErrors records the number of kvstore quorum errors.
	KVStoreQuorumErrors = NoOpCounterVec

	// FQDNGarbageCollectorCleanedTotal is the number of domains cleaned by the
	// GC job.
	FQDNGarbageCollectorCleanedTotal = NoOpCounter

	// FQDNActiveNames is the number of domains inside the DNS cache that have
	// not expired (by TTL), per endpoint.
	FQDNActiveNames = NoOpGaugeVec

	// FQDNActiveIPs is the number of IPs inside the DNS cache associated with
	// a domain that has not expired (by TTL) and are currently active, per
	// endpoint.
	FQDNActiveIPs = NoOpGaugeVec

	// FQDNAliveZombieConnections is the number of IPs associated with domains
	// that have expired (by TTL) yet are still associated with an active
	// connection (aka zombie), per endpoint.
	FQDNAliveZombieConnections = NoOpGaugeVec

	// FQDNSelectors is the total number of registered ToFQDN selectors.
	FQDNSelectors = NoOpGauge

	// FQDNSemaphoreRejectedTotal is the total number of DNS requests rejected
	// by the DNS proxy because too many requests were in flight, as enforced by
	// the admission semaphore.
	FQDNSemaphoreRejectedTotal = NoOpCounter

	// IPCacheErrorsTotal is the total number of IPCache events handled in
	// the IPCache subsystem that resulted in errors.
	IPCacheErrorsTotal = NoOpCounterVec

	// IPCacheEventsTotal is the total number of IPCache events handled in
	// the IPCache subsystem.
	IPCacheEventsTotal = NoOpCounterVec

	// BPFSyscallDuration is the metric for bpf syscalls duration.
	BPFSyscallDuration = NoOpObserverVec

	// BPFMapOps is the metric to measure the number of operations done to a
	// bpf map.
	BPFMapOps = NoOpCounterVec

	// BPFMapCapacity is the max capacity of bpf maps, labelled by map group classification.
	BPFMapCapacity = NoOpGaugeVec

	// TriggerPolicyUpdateTotal is the metric to count total number of
	// policy update triggers.
	TriggerPolicyUpdateTotal = NoOpCounterVec

	// TriggerPolicyUpdateFolds is the current level folding that is
	// happening when running policy update triggers.
	TriggerPolicyUpdateFolds = NoOpGauge

	// TriggerPolicyUpdateCallDuration measures the latency and call
	// duration of policy update triggers.
	TriggerPolicyUpdateCallDuration = NoOpObserverVec

	// VersionMetric is labelled by Cilium version.
	VersionMetric = NoOpGaugeVec

	// APILimiterWaitHistoryDuration is a histogram that measures the
	// individual wait durations of API limiters.
	APILimiterWaitHistoryDuration = NoOpObserverVec

	// APILimiterWaitDuration is the gauge of the current mean, min, and
	// max wait duration.
	APILimiterWaitDuration = NoOpGaugeVec

	// APILimiterProcessingDuration is the gauge of the mean and estimated
	// processing duration.
	APILimiterProcessingDuration = NoOpGaugeVec

	// APILimiterRequestsInFlight is the gauge of the current and max
	// requests in flight.
	APILimiterRequestsInFlight = NoOpGaugeVec

	// APILimiterRateLimit is the gauge of the current rate limiting
	// configuration including limit and burst.
	APILimiterRateLimit = NoOpGaugeVec

	// APILimiterAdjustmentFactor is the gauge representing the latest
	// adjustment factor that was applied.
	APILimiterAdjustmentFactor = NoOpGaugeVec

	// APILimiterProcessedRequests is the counter of the number of
	// processed (successful and failed) requests.
	APILimiterProcessedRequests = NoOpCounterVec

	// WorkQueueDepth is the depth of the workqueue.
	//
	// We set actual metrics here instead of NoOp for the workqueue metrics
	// because these metrics will be registered with workqueue.SetProvider
	// by init function in watcher.go. Otherwise, we will register NoOps.
	//
	// The single "name" label is shared by all WorkQueue* metrics below.
	WorkQueueDepth = metric.NewGaugeVec(metric.GaugeOpts{
		ConfigName: Namespace + "_" + SubsystemWorkQueue + "_depth",
		Namespace:  Namespace,
		Subsystem:  SubsystemWorkQueue,
		Name:       "depth",
		Help:       "Current depth of workqueue.",
	}, []string{"name"})

	// WorkQueueAddsTotal is the total number of adds to the workqueue.
	WorkQueueAddsTotal = metric.NewCounterVec(metric.CounterOpts{
		ConfigName: Namespace + "_" + SubsystemWorkQueue + "_adds_total",
		Namespace:  Namespace,
		Subsystem:  SubsystemWorkQueue,
		Name:       "adds_total",
		Help:       "Total number of adds handled by workqueue.",
	}, []string{"name"})

	// WorkQueueLatency is the latency of how long an item stays in the workqueue.
	// It uses 10 exponential buckets starting at 10e-9 (i.e. 1e-8) seconds,
	// each 10 times larger than the previous one.
	WorkQueueLatency = metric.NewHistogramVec(metric.HistogramOpts{
		ConfigName: Namespace + "_" + SubsystemWorkQueue + "_queue_duration_seconds",
		Namespace:  Namespace,
		Subsystem:  SubsystemWorkQueue,
		Name:       "queue_duration_seconds",
		Help:       "How long in seconds an item stays in workqueue before being requested.",
		Buckets:    prometheus.ExponentialBuckets(10e-9, 10, 10),
	}, []string{"name"})

	// WorkQueueDuration is how long processing an item from the workqueue takes.
	// It uses the same bucket layout as WorkQueueLatency.
	WorkQueueDuration = metric.NewHistogramVec(metric.HistogramOpts{
		ConfigName: Namespace + "_" + SubsystemWorkQueue + "_work_duration_seconds",
		Namespace:  Namespace,
		Subsystem:  SubsystemWorkQueue,
		Name:       "work_duration_seconds",
		Help:       "How long in seconds processing an item from workqueue takes.",
		Buckets:    prometheus.ExponentialBuckets(10e-9, 10, 10),
	}, []string{"name"})

	// WorkQueueUnfinishedWork is how many seconds of work has been done that is in progress.
	WorkQueueUnfinishedWork = metric.NewGaugeVec(metric.GaugeOpts{
		ConfigName: Namespace + "_" + SubsystemWorkQueue + "_unfinished_work_seconds",
		Namespace:  Namespace,
		Subsystem:  SubsystemWorkQueue,
		Name:       "unfinished_work_seconds",
		Help: "How many seconds of work has been done that " +
			"is in progress and hasn't been observed by work_duration. Large " +
			"values indicate stuck threads. One can deduce the number of stuck " +
			"threads by observing the rate at which this increases.",
	}, []string{"name"})

	// WorkQueueLongestRunningProcessor is the longest running processor in the workqueue.
	WorkQueueLongestRunningProcessor = metric.NewGaugeVec(metric.GaugeOpts{
		ConfigName: Namespace + "_" + SubsystemWorkQueue + "_longest_running_processor_seconds",
		Namespace:  Namespace,
		Subsystem:  SubsystemWorkQueue,
		Name:       "longest_running_processor_seconds",
		Help: "How many seconds has the longest running " +
			"processor for workqueue been running.",
	}, []string{"name"})

	// WorkQueueRetries is the number of retries handled by the workqueue.
	WorkQueueRetries = metric.NewCounterVec(metric.CounterOpts{
		ConfigName: Namespace + "_" + SubsystemWorkQueue + "_retries_total",
		Namespace:  Namespace,
		Subsystem:  SubsystemWorkQueue,
		Name:       "retries_total",
		Help:       "Total number of retries handled by workqueue.",
	}, []string{"name"})
)
View Source
// Cell is the hive module with ID "metrics". It provides the Registry via
// NewRegistry, exposes the legacy metrics via Metric(NewLegacyMetrics),
// declares the registry configuration with defaultRegistryConfig, flushes
// the logging metrics once at invoke time, and provides the result of
// newMetricsRestApiHandler.
var Cell = cell.Module("metrics", "Metrics",

	cell.Provide(NewRegistry),
	Metric(NewLegacyMetrics),
	cell.Config(defaultRegistryConfig),
	// NOTE(review): the *Registry parameter is unused; presumably it is
	// requested only so the registry is constructed before the flush - confirm.
	cell.Invoke(func(_ *Registry) {

		// Replay the logging hook counts accumulated before the
		// errors_warnings metric was registered.
		FlushLoggingMetrics()
	}),
	cell.Provide(newMetricsRestApiHandler),
)

Functions

func BoolToFloat64

func BoolToFloat64(v bool) float64

func Errno2Outcome

func Errno2Outcome(errno unix.Errno) string

Errno2Outcome converts a unix.Errno to LabelOutcome

func Error2Outcome

func Error2Outcome(err error) string

Error2Outcome converts an error to LabelOutcome

func FlushLoggingMetrics added in v1.14.11

func FlushLoggingMetrics()

FlushLoggingMetrics will cause all logging hook metrics accumulated prior to the errors_warnings metrics being registered with the Prometheus collector to be incremented to their respective errors_warnings metrics tuple.

func GetCounterValue

func GetCounterValue(m prometheus.Counter) float64

GetCounterValue returns the current value stored for the counter

func GetGaugeValue

func GetGaugeValue(m prometheus.Gauge) float64

GetGaugeValue returns the current value stored for the gauge. This function is useful in tests.

func InitOperatorMetrics added in v1.14.11

func InitOperatorMetrics()

InitOperatorMetrics is used to init legacy metrics necessary during operator init.

func LabelOutcome2Code added in v1.15.4

func LabelOutcome2Code(outcome string) int

LabelOutcome2Code converts a label outcome to a code

func Metric added in v1.16.0

func Metric[S any](ctor func() S) cell.Cell

Metric constructs a new metric cell.

This cell type provides `S` to the hive as returned by `ctor`; it also makes each individual field value available via the `hive-metrics` value group. Infrastructure components such as a registry, inspection tool, or documentation generator can collect all metrics in the hive via this value group.

The `ctor` constructor must return a struct or pointer to a struct of type `S`. The returned struct must only contain public fields. All field types should implement the `github.com/cilium/cilium/pkg/metrics/metric.WithMetadata` and `github.com/prometheus/client_golang/prometheus.Collector` interfaces.

func Register

func Register(c prometheus.Collector) error

Register registers a collector

func RegisterList

func RegisterList(list []prometheus.Collector) error

RegisterList registers a list of collectors. If registration of one collector fails, no collector is registered.

func Reinitialize

func Reinitialize()

func Unregister

func Unregister(c prometheus.Collector) bool

Unregister unregisters a collector

func UpdateMapCapacity added in v1.15.0

func UpdateMapCapacity(groupName string, capacity uint32)

Types

type APIEventTSHelper

// APIEventTSHelper is intended to be a global middleware to track metrics
// around API calls. It records the timestamp of an API call in the provided
// gauge and then chains to the next handler.
type APIEventTSHelper struct {
	// Next is the handler the request is chained to.
	Next      http.Handler
	// TSGauge is the gauge vector in which the API call timestamp is recorded.
	TSGauge   metric.Vec[metric.Gauge]
	// Histogram is an observer vector.
	// NOTE(review): presumably used to observe API call durations - confirm
	// against the ServeHTTP implementation.
	Histogram metric.Vec[metric.Observer]
}

APIEventTSHelper is intended to be a global middleware to track metrics around API calls. It records the timestamp of an API call in the provided gauge.

func (*APIEventTSHelper) ServeHTTP

func (m *APIEventTSHelper) ServeHTTP(r http.ResponseWriter, req *http.Request)

ServeHTTP implements the http.Handler interface. It records the timestamp this API call began at, then chains to the next handler.

type GaugeWithThreshold

// GaugeWithThreshold is a prometheus gauge that registers itself with
// prometheus if over a threshold value and unregisters when under.
type GaugeWithThreshold struct {
	// contains filtered or unexported fields
}

GaugeWithThreshold is a prometheus gauge that registers itself with prometheus if over a threshold value and unregisters when under.

func NewBPFMapPressureGauge

func NewBPFMapPressureGauge(mapname string, threshold float64) *GaugeWithThreshold

NewBPFMapPressureGauge creates a new GaugeWithThreshold for the cilium_bpf_map_pressure metric with the map name as constant label.

func NewGaugeWithThreshold

func NewGaugeWithThreshold(name string, subsystem string, desc string, labels map[string]string, threshold float64) *GaugeWithThreshold

NewGaugeWithThreshold creates a new GaugeWithThreshold.

func (*GaugeWithThreshold) Set

func (gwt *GaugeWithThreshold) Set(value float64)

Set the value of the GaugeWithThreshold.

type LegacyMetrics

// LegacyMetrics groups the package's metric collectors into a single struct.
// Each field has the same name as one of the package-level metric variables
// and the interface type matching it. It is constructed by NewLegacyMetrics,
// which Cell hands to Metric.
type LegacyMetrics struct {
	BootstrapTimes                   metric.Vec[metric.Observer]
	APIInteractions                  metric.Vec[metric.Observer]
	NodeConnectivityStatus           metric.DeletableVec[metric.Gauge]
	NodeConnectivityLatency          metric.DeletableVec[metric.Gauge]
	NodeHealthConnectivityStatus     metric.Vec[metric.Gauge]
	NodeHealthConnectivityLatency    metric.Vec[metric.Observer]
	Endpoint                         metric.GaugeFunc
	EndpointMaxIfindex               metric.Gauge
	EndpointRegenerationTotal        metric.Vec[metric.Counter]
	EndpointStateCount               metric.Vec[metric.Gauge]
	EndpointRegenerationTimeStats    metric.Vec[metric.Observer]
	EndpointPropagationDelay         metric.Vec[metric.Observer]
	Policy                           metric.Gauge
	PolicyRegenerationCount          metric.Counter
	PolicyRegenerationTimeStats      metric.Vec[metric.Observer]
	PolicyRevision                   metric.Gauge
	PolicyChangeTotal                metric.Vec[metric.Counter]
	PolicyEndpointStatus             metric.Vec[metric.Gauge]
	PolicyImplementationDelay        metric.Vec[metric.Observer]
	CIDRGroupsReferenced             metric.Gauge
	Identity                         metric.Vec[metric.Gauge]
	IdentityLabelSources             metric.Vec[metric.Gauge]
	EventTS                          metric.Vec[metric.Gauge]
	EventLagK8s                      metric.Gauge
	ProxyRedirects                   metric.Vec[metric.Gauge]
	ProxyPolicyL7Total               metric.Vec[metric.Counter]
	ProxyUpstreamTime                metric.Vec[metric.Observer]
	ProxyDatapathUpdateTimeout       metric.Counter
	ConntrackGCRuns                  metric.Vec[metric.Counter]
	ConntrackGCKeyFallbacks          metric.Vec[metric.Counter]
	ConntrackGCSize                  metric.Vec[metric.Gauge]
	NatGCSize                        metric.Vec[metric.Gauge]
	ConntrackGCDuration              metric.Vec[metric.Observer]
	ConntrackDumpResets              metric.Vec[metric.Counter]
	SignalsHandled                   metric.Vec[metric.Counter]
	ServicesEventsCount              metric.Vec[metric.Counter]
	ServiceImplementationDelay       metric.Vec[metric.Observer]
	ErrorsWarnings                   metric.Vec[metric.Counter]
	ControllerRuns                   metric.Vec[metric.Counter]
	ControllerRunsDuration           metric.Vec[metric.Observer]
	SubprocessStart                  metric.Vec[metric.Counter]
	KubernetesEventProcessed         metric.Vec[metric.Counter]
	KubernetesEventReceived          metric.Vec[metric.Counter]
	KubernetesAPIInteractions        metric.Vec[metric.Observer]
	KubernetesAPIRateLimiterLatency  metric.Vec[metric.Observer]
	KubernetesAPICallsTotal          metric.Vec[metric.Counter]
	KubernetesCNPStatusCompletion    metric.Vec[metric.Observer]
	TerminatingEndpointsEvents       metric.Counter
	IPAMEvent                        metric.Vec[metric.Counter]
	IPAMCapacity                     metric.Vec[metric.Gauge]
	KVStoreOperationsDuration        metric.Vec[metric.Observer]
	KVStoreEventsQueueDuration       metric.Vec[metric.Observer]
	KVStoreQuorumErrors              metric.Vec[metric.Counter]
	FQDNGarbageCollectorCleanedTotal metric.Counter
	FQDNActiveNames                  metric.Vec[metric.Gauge]
	FQDNActiveIPs                    metric.Vec[metric.Gauge]
	FQDNAliveZombieConnections       metric.Vec[metric.Gauge]
	FQDNSelectors                    metric.Gauge
	FQDNSemaphoreRejectedTotal       metric.Counter
	IPCacheErrorsTotal               metric.Vec[metric.Counter]
	IPCacheEventsTotal               metric.Vec[metric.Counter]
	BPFSyscallDuration               metric.Vec[metric.Observer]
	BPFMapOps                        metric.Vec[metric.Counter]
	BPFMapCapacity                   metric.Vec[metric.Gauge]
	TriggerPolicyUpdateTotal         metric.Vec[metric.Counter]
	TriggerPolicyUpdateFolds         metric.Gauge
	TriggerPolicyUpdateCallDuration  metric.Vec[metric.Observer]
	VersionMetric                    metric.Vec[metric.Gauge]
	APILimiterWaitHistoryDuration    metric.Vec[metric.Observer]
	APILimiterWaitDuration           metric.Vec[metric.Gauge]
	APILimiterProcessingDuration     metric.Vec[metric.Gauge]
	APILimiterRequestsInFlight       metric.Vec[metric.Gauge]
	APILimiterRateLimit              metric.Vec[metric.Gauge]
	APILimiterAdjustmentFactor       metric.Vec[metric.Gauge]
	APILimiterProcessedRequests      metric.Vec[metric.Counter]
	WorkQueueDepth                   metric.Vec[metric.Gauge]
	WorkQueueAddsTotal               metric.Vec[metric.Counter]
	WorkQueueLatency                 metric.Vec[metric.Observer]
	WorkQueueDuration                metric.Vec[metric.Observer]
	WorkQueueUnfinishedWork          metric.Vec[metric.Gauge]
	WorkQueueLongestRunningProcessor metric.Vec[metric.Gauge]
	WorkQueueRetries                 metric.Vec[metric.Counter]
}

func NewLegacyMetrics

func NewLegacyMetrics() *LegacyMetrics

type LoggingHook

type LoggingHook struct {
	// contains filtered or unexported fields
}

LoggingHook is a hook for logrus which counts error and warning messages as a Prometheus metric.

func NewLoggingHook

func NewLoggingHook() *LoggingHook

NewLoggingHook returns a new instance of LoggingHook.

func (*LoggingHook) Fire

func (h *LoggingHook) Fire(entry *logrus.Entry) error

Fire is the main method, which is called every time the logger emits an error or warning message.

func (*LoggingHook) Levels

func (h *LoggingHook) Levels() []logrus.Level

Levels returns the list of logging levels on which the hook is triggered.

type Registry

type Registry struct {
	// contains filtered or unexported fields
}

Registry is a cell around a prometheus registry. This registry starts an HTTP server as part of its lifecycle on which all enabled metrics will be available. A reference to this registry can also be used to dynamically register or unregister `prometheus.Collector`s.

func NewRegistry

func NewRegistry(params RegistryParams) *Registry

func (*Registry) DumpMetrics

func (r *Registry) DumpMetrics() ([]*models.Metric, error)

DumpMetrics gets the current Cilium metrics and dumps them all into a slice of models.Metric. If the metrics cannot be retrieved, it returns an error.

func (*Registry) MustRegister

func (r *Registry) MustRegister(c ...prometheus.Collector)

MustRegister adds the collector to the registry, exposing this metric to prometheus scrapes. It will panic on error.

func (*Registry) Register

func (r *Registry) Register(c prometheus.Collector) error

Register registers a collector.

func (*Registry) RegisterList

func (r *Registry) RegisterList(list []prometheus.Collector) error

RegisterList registers a list of collectors. If registration of one collector fails, no collector is registered.

func (*Registry) Reinitialize

func (r *Registry) Reinitialize()

Reinitialize creates a new internal registry and re-registers metrics to it.

func (*Registry) Unregister

func (r *Registry) Unregister(c prometheus.Collector) bool

Unregister unregisters a collector.

type RegistryConfig

type RegistryConfig struct {
	// PrometheusServeAddr is the IP:Port on which to serve prometheus metrics (pass ":Port" to bind on all interfaces, "" is off).
	PrometheusServeAddr string
	// Metrics is a list of metrics to be enabled or disabled; the format of each entry is `+`/`-` + `{metric name}`.
	Metrics []string
}

func (RegistryConfig) Flags

func (rc RegistryConfig) Flags(flags *pflag.FlagSet)

type RegistryParams

type RegistryParams struct {
	cell.In

	Logger     logrus.FieldLogger
	Shutdowner hive.Shutdowner
	Lifecycle  cell.Lifecycle

	AutoMetrics []metricpkg.WithMetadata `group:"hive-metrics"`
	Config      RegistryConfig

	DaemonConfig *option.DaemonConfig
}

RegistryParams are the parameters needed to construct a Registry.

type ResponderWrapper

type ResponderWrapper struct {
	http.ResponseWriter
	// contains filtered or unexported fields
}

func (*ResponderWrapper) WriteHeader

func (rw *ResponderWrapper) WriteHeader(code int)

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL