metrics

package
v1.5.0-rc1 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 19, 2019 License: Apache-2.0 Imports: 17 Imported by: 0

Documentation

Overview

Package metrics holds prometheus metrics objects and related utility functions. It does not abstract away the prometheus client but the caller rarely needs to refer to prometheus directly.

Index

Constants

View Source
// Well-known label values used when reporting metrics.
const (
	// BuildStateWaiting is the value of LabelBuildState to describe
	// the number of entries waiting in the build queue.
	BuildStateWaiting = "waiting"

	// BuildStateBlocked is the value of LabelBuildState to describe
	// the number of entries scheduled for building but blocked due to
	// build conditions.
	BuildStateBlocked = "blocked"

	// BuildStateRunning is the value of LabelBuildState to describe
	// the number of builds currently running.
	BuildStateRunning = "running"

	// ErrorTimeout is the value used to notify timeout errors.
	ErrorTimeout = "timeout"

	// ErrorProxy is the value used to notify errors on Proxy.
	ErrorProxy = "proxy"

	// L7DNS is the value used to report the DNS label on metrics.
	L7DNS = "dns"
)

Variables

View Source
// Metric scoping names, label names, label values and the Prometheus
// collectors exported by this package.
var (
	// Namespace is used to scope metrics from cilium. It is prepended to metric
	// names and separated with a '_'.
	Namespace = "cilium"

	// Datapath is the subsystem to scope metrics related to management of
	// the datapath. It is prepended to metric names and separated with a '_'.
	Datapath = "datapath"

	// Agent is the subsystem to scope metrics related to the cilium agent itself.
	Agent = "agent"

	// K8s is the subsystem to scope metrics related to Kubernetes.
	K8s = "k8s"

	// K8sClient is the subsystem to scope metrics related to the kubernetes client.
	K8sClient = "k8s_client"

	// LabelOutcome indicates whether the outcome of the operation was successful or not.
	LabelOutcome = "outcome"

	// LabelAttempts is the number of attempts it took to complete the operation.
	LabelAttempts = "attempts"

	// LabelValueOutcomeSuccess is used as a successful outcome of an operation.
	LabelValueOutcomeSuccess = "success"

	// LabelValueOutcomeFail is used as an unsuccessful outcome of an operation.
	LabelValueOutcomeFail = "fail"

	// LabelEventSourceAPI marks event-related metrics that come from the API.
	LabelEventSourceAPI = "api"

	// LabelEventSourceK8s marks event-related metrics that come from k8s.
	LabelEventSourceK8s = "k8s"

	// LabelEventSourceFQDN marks event-related metrics that come from pkg/fqdn.
	LabelEventSourceFQDN = "fqdn"

	// LabelEventSourceContainerd marks event-related metrics that come from
	// docker. The identifier says containerd, but the emitted label value
	// is "docker".
	LabelEventSourceContainerd = "docker"

	// LabelDatapathArea marks which area the metrics are related to (e.g. which BPF map).
	LabelDatapathArea = "area"

	// LabelDatapathName marks a unique identifier for this metric.
	// The name should be defined once for a given type of error.
	LabelDatapathName = "name"

	// LabelDatapathFamily marks which protocol family (IPv4, IPv6) the metric is related to.
	LabelDatapathFamily = "family"

	// LabelProtocol marks the L4 protocol (TCP, ANY) for the metric.
	LabelProtocol = "protocol"

	// LabelStatus is the label for the status of a completed task.
	LabelStatus = "status"

	// LabelPolicyEnforcement is the label used to see the enforcement status.
	LabelPolicyEnforcement = "enforcement"

	// LabelPolicySource is the label used to see the source of a policy.
	LabelPolicySource = "source"

	// LabelScope is the label used to define multiple scopes in the same
	// metric. For example, one counter may measure a metric over the scope of
	// the entire event (scope=global), or just part of an event
	// (scope=slow_path).
	LabelScope = "scope"

	// LabelProtocolL7 is the label used when working with layer 7 protocols.
	LabelProtocolL7 = "protocol_l7"

	// LabelBuildState is the state a build queue entry is in.
	LabelBuildState = "state"

	// LabelBuildQueueName is the name of the build queue. It shares the
	// value "name" with LabelDatapathName.
	LabelBuildQueueName = "name"

	// LabelAction is the label used to define what kind of action was performed in a metric.
	LabelAction = "action"

	// LabelSubsystem is the label used to refer to any of the child processes
	// started by cilium (Envoy, monitor, etc.).
	LabelSubsystem = "subsystem"

	// LabelKind is the kind of a label.
	LabelKind = "kind"

	// LabelPath is the label for the API path.
	LabelPath = "path"

	// LabelMethod is the label for the HTTP method.
	LabelMethod = "method"

	// LabelAPIReturnCode is the HTTP code returned for that API path.
	LabelAPIReturnCode = "return_code"

	// APIInteractions is the total time taken to process an API call made
	// to the cilium-agent, labeled by path, method and return code.
	APIInteractions = prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Namespace: Namespace,
		Subsystem: Agent,
		Name:      "api_process_time_seconds",
		Help:      "Duration of processed API calls labeled by path, method and return code.",
	}, []string{LabelPath, LabelMethod, LabelAPIReturnCode})

	// EndpointCount is a function used to collect this metric.
	// It must be thread-safe. It is declared without an initializer here,
	// so it is nil until assigned.
	EndpointCount prometheus.GaugeFunc

	// EndpointCountRegenerating is the number of endpoints currently
	// regenerating. The help text marks it as deprecated in favor of
	// endpoint_state.
	EndpointCountRegenerating = prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: Namespace,
		Name:      "endpoint_regenerating",
		Help:      "Number of endpoints currently regenerating. Deprecated. Use endpoint_state with proper labels instead",
	})

	// EndpointRegenerationCount is a count of the number of times any endpoint
	// has been regenerated, labeled by success/fail outcome.
	EndpointRegenerationCount = prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: Namespace,
		Name:      "endpoint_regenerations",
		Help:      "Count of all endpoint regenerations that have completed, tagged by outcome",
	}, []string{LabelOutcome})

	// EndpointStateCount is the total count of the endpoints in various states.
	EndpointStateCount = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: Namespace,
			Name:      "endpoint_state",
			Help:      "Count of all endpoints, tagged by different endpoint states",
		},
		[]string{"endpoint_state"},
	)

	// EndpointRegenerationTimeStats is the total time taken to regenerate
	// endpoints, labeled by scope and status.
	EndpointRegenerationTimeStats = prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Namespace: Namespace,
		Name:      "endpoint_regeneration_time_stats_seconds",
		Help:      "Endpoint regeneration time stats labeled by the scope",
	}, []string{LabelScope, LabelStatus})

	// PolicyCount is the number of policies loaded into the agent.
	PolicyCount = prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: Namespace,
		Name:      "policy_count",
		Help:      "Number of policies currently loaded",
	})

	// PolicyRegenerationCount is the total number of successful policy
	// regenerations.
	PolicyRegenerationCount = prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: Namespace,
		Name:      "policy_regeneration_total",
		Help:      "Total number of successful policy regenerations",
	})

	// PolicyRegenerationTime is the total time taken to generate policies.
	PolicyRegenerationTime = prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: Namespace,
		Name:      "policy_regeneration_seconds_total",
		Help:      "Total sum of successful policy regeneration times",
	})

	// PolicyRegenerationTimeSquare is the sum of squares of total time taken
	// to generate policies.
	PolicyRegenerationTimeSquare = prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: Namespace,
		Name:      "policy_regeneration_square_seconds_total",
		Help:      "Total sum of squares of successful policy regeneration times",
	})

	// PolicyRevision is the current policy revision number for this agent.
	PolicyRevision = prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: Namespace,
		Name:      "policy_max_revision",
		Help:      "Highest policy revision number in the agent",
	})

	// PolicyImportErrors is a count of failed policy imports.
	PolicyImportErrors = prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: Namespace,
		Name:      "policy_import_errors",
		Help:      "Number of times a policy import has failed",
	})

	// PolicyEndpointStatus is the number of endpoints with policy labeled by enforcement type.
	PolicyEndpointStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: Namespace,
		Name:      "policy_endpoint_enforcement_status",
		Help:      "Number of endpoints labeled by policy enforcement status",
	}, []string{LabelPolicyEnforcement})

	// PolicyImplementationDelay is a distribution of times taken from adding a
	// policy (and incrementing the policy revision) to seeing it in the datapath
	// per Endpoint. This reflects the actual delay perceived by traffic flowing
	// through the datapath. The longest times will roughly correlate with the
	// time taken to fully deploy an endpoint.
	PolicyImplementationDelay = prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Namespace: Namespace,
		Name:      "policy_implementation_delay",
		Help:      "Time between a policy change and it being fully deployed into the datapath",
	}, []string{LabelPolicySource})

	// EventTSK8s is the timestamp of k8s events.
	EventTSK8s = prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace:   Namespace,
		Name:        "event_ts",
		Help:        "Last timestamp when we received an event",
		ConstLabels: prometheus.Labels{"source": LabelEventSourceK8s},
	})

	// EventTSContainerd is the timestamp of docker events.
	EventTSContainerd = prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace:   Namespace,
		Name:        "event_ts",
		Help:        "Last timestamp when we received an event",
		ConstLabels: prometheus.Labels{"source": LabelEventSourceContainerd},
	})

	// EventTSAPI is the timestamp of API events.
	EventTSAPI = prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace:   Namespace,
		Name:        "event_ts",
		Help:        "Last timestamp when we received an event",
		ConstLabels: prometheus.Labels{"source": LabelEventSourceAPI},
	})

	// ProxyRedirects is the number of redirects labeled by L7 protocol.
	ProxyRedirects = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: Namespace,
		Name:      "proxy_redirects",
		Help:      "Number of redirects installed for endpoints, labeled by protocol",
	}, []string{LabelProtocolL7})

	// ProxyParseErrors is a count of L7 parse errors on the proxy.
	ProxyParseErrors = prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: Namespace,
		Name:      "policy_l7_parse_errors_total",
		Help:      "Number of total L7 parse errors",
	})

	// ProxyForwarded is a count of all forwarded requests by the proxy.
	ProxyForwarded = prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: Namespace,
		Name:      "policy_l7_forwarded_total",
		Help:      "Number of total L7 forwarded requests/responses",
	})

	// ProxyDenied is a count of all requests denied by policy by the proxy.
	ProxyDenied = prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: Namespace,
		Name:      "policy_l7_denied_total",
		Help:      "Number of total L7 denied requests/responses due to policy",
	})

	// ProxyReceived is a count of all received requests by the proxy.
	ProxyReceived = prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: Namespace,
		Name:      "policy_l7_received_total",
		Help:      "Number of total L7 received requests/responses",
	})

	// ProxyUpstreamTime is how long the upstream server took to reply, labeled
	// by error, L7 protocol and scope.
	ProxyUpstreamTime = prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Namespace: Namespace,
		Name:      "proxy_upstream_reply_seconds",
		Help:      "Seconds waited to get a reply from a upstream server",
	}, []string{"error", LabelProtocolL7, LabelScope})

	// DropCount is the total number of dropped packets,
	// tagged by drop reason and direction (ingress/egress).
	DropCount = prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: Namespace,
		Name:      "drop_count_total",
		Help:      "Total dropped packets, tagged by drop reason and ingress/egress direction",
	}, []string{"reason", "direction"})

	// DropBytes is the total dropped bytes,
	// tagged by drop reason and direction (ingress/egress).
	DropBytes = prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: Namespace,
		Name:      "drop_bytes_total",
		Help:      "Total dropped bytes, tagged by drop reason and ingress/egress direction",
	}, []string{"reason", "direction"})

	// ForwardCount is the total forwarded packets,
	// tagged by ingress/egress direction.
	ForwardCount = prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: Namespace,
		Name:      "forward_count_total",
		Help:      "Total forwarded packets, tagged by ingress/egress direction",
	}, []string{"direction"})

	// ForwardBytes is the total forwarded bytes,
	// tagged by ingress/egress direction.
	ForwardBytes = prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: Namespace,
		Name:      "forward_bytes_total",
		Help:      "Total forwarded bytes, tagged by ingress/egress direction",
	}, []string{"direction"})

	// DatapathErrors is the number of errors managing datapath components
	// such as BPF maps, labeled by area, name and protocol family.
	DatapathErrors = prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: Namespace,
		Subsystem: Datapath,
		Name:      "errors_total",
		Help:      "Number of errors that occurred in the datapath or datapath management",
	}, []string{LabelDatapathArea, LabelDatapathName, LabelDatapathFamily})

	// ConntrackGCRuns is the number of times that the conntrack GC
	// process was run, labeled by protocol family, protocol and status.
	ConntrackGCRuns = prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: Namespace,
		Subsystem: Datapath,
		Name:      "conntrack_gc_runs_total",
		Help: "Number of times that the conntrack garbage collector process was run " +
			"labeled by completion status",
	}, []string{LabelDatapathFamily, LabelProtocol, LabelStatus})

	// ConntrackGCKeyFallbacks is the number of times a key fallback was
	// needed when iterating over the BPF map.
	ConntrackGCKeyFallbacks = prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: Namespace,
		Subsystem: Datapath,
		Name:      "conntrack_gc_key_fallbacks_total",
		Help:      "Number of times a key fallback was needed when iterating over the BPF map",
	}, []string{LabelDatapathFamily, LabelProtocol})

	// ConntrackGCSize is the number of entries in the conntrack table at the
	// end of a garbage collector run, labeled by protocol family, protocol
	// and status.
	ConntrackGCSize = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: Namespace,
		Subsystem: Datapath,
		Name:      "conntrack_gc_entries",
		Help: "The number of alive and deleted conntrack entries at the end " +
			"of a garbage collector run labeled by datapath family.",
	}, []string{LabelDatapathFamily, LabelProtocol, LabelStatus})

	// ConntrackGCDuration is the duration of the conntrack GC process in seconds.
	ConntrackGCDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Namespace: Namespace,
		Subsystem: Datapath,
		Name:      "conntrack_gc_duration_seconds",
		Help: "Duration in seconds of the garbage collector process " +
			"labeled by datapath family and completion status",
	}, []string{LabelDatapathFamily, LabelProtocol, LabelStatus})

	// ServicesCount is the number of services events, labeled by action type.
	ServicesCount = prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: Namespace,
		Name:      "services_events_total",
		Help:      "Number of services events labeled by action type",
	}, []string{LabelAction})

	// ErrorsWarnings is the number of errors and warnings in cilium-agent
	// instances, labeled by level and subsystem.
	ErrorsWarnings = prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: Namespace,
		Name:      "errors_warnings_total",
		Help:      "Number of total errors in cilium-agent instances",
	}, []string{"level", LabelSubsystem})

	// ControllerRuns is the number of times that a controller process runs.
	ControllerRuns = prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: Namespace,
		Name:      "controllers_runs_total",
		Help:      "Number of times that a controller process was run labeled by completion status",
	}, []string{LabelStatus})

	// ControllerRunsDuration is the duration of the controller process in seconds.
	ControllerRunsDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Namespace: Namespace,
		Name:      "controllers_runs_duration_seconds",
		Help:      "Duration in seconds of the controller process labeled by completion status",
	}, []string{LabelStatus})

	// BuildQueueEntries is the number of queued, waiting and running
	// builds in the build queue, labeled by state and queue name.
	BuildQueueEntries = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: Namespace,
		Name:      "buildqueue_entries",
		Help:      "The number of queued, waiting and running builds in the build queue",
	}, []string{LabelBuildState, LabelBuildQueueName})

	// SubprocessStart is the number of times that Cilium has started a
	// subprocess, labeled by subsystem.
	SubprocessStart = prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: Namespace,
		Name:      "subprocess_start_total",
		Help:      "Number of times that Cilium has started a subprocess, labeled by subsystem",
	}, []string{LabelSubsystem})

	// KubernetesEventProcessed is the number of Kubernetes events
	// processed labeled by scope, action and execution result.
	KubernetesEventProcessed = prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: Namespace,
		Name:      "kubernetes_events_total",
		Help:      "Number of Kubernetes events processed labeled by scope, action and execution result",
	}, []string{LabelScope, LabelAction, LabelStatus})

	// KubernetesEventReceived is the number of Kubernetes events received
	// labeled by scope, action, valid data and equalness.
	KubernetesEventReceived = prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: Namespace,
		Name:      "kubernetes_events_received_total",
		// The help text describes received events and this metric's own
		// labels, rather than repeating KubernetesEventProcessed's text.
		Help: "Number of Kubernetes events received labeled by scope, action, valid data and equalness",
	}, []string{LabelScope, LabelAction, "valid", "equal"})

	// KubernetesAPIInteractions is the total time taken to process an API call made
	// to the kube-apiserver, labeled by path and method.
	KubernetesAPIInteractions = prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Namespace: Namespace,
		Subsystem: K8sClient,
		Name:      "api_latency_time_seconds",
		Help:      "Duration of processed API calls labeled by path and method.",
	}, []string{LabelPath, LabelMethod})

	// KubernetesAPICalls is the counter for all API calls made to
	// kube-apiserver, labeled by host, method and return code.
	KubernetesAPICalls = prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: Namespace,
		Subsystem: K8sClient,
		Name:      "api_calls_counter",
		Help:      "Number of API calls made to kube-apiserver labeled by host, method and return code.",
	}, []string{"host", LabelMethod, LabelAPIReturnCode})

	// KubernetesCNPStatusCompletion is the number of seconds it takes to
	// complete a CNP status update, labeled by attempts and outcome.
	KubernetesCNPStatusCompletion = prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Namespace: Namespace,
		Subsystem: K8s,
		Name:      "cnp_status_completion_seconds",
		Help:      "Duration in seconds in how long it took to complete a CNP status update",
	}, []string{LabelAttempts, LabelOutcome})

	// IpamEvent is the number of IPAM events received labeled by action and
	// datapath family type.
	IpamEvent = prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: Namespace,
		Name:      "ipam_events_total",
		Help:      "Number of IPAM events received labeled by action and datapath family type",
	}, []string{LabelAction, LabelDatapathFamily})

	// KVStoreOperationsTotal is the number of interactions with the Key-Value
	// Store, labeled by scope, kind, action and outcome.
	//
	// NOTE(review): unlike KVStoreOperationsDuration, no Namespace or
	// Subsystem is set here, so the metric name presumably lacks the
	// Namespace prefix. Left unchanged to keep the exported name stable;
	// confirm whether this is intentional.
	//
	// Deprecated: This metric can be removed in 1.6 as
	// KVStoreOperationsDuration provides the count along with duration.
	KVStoreOperationsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
		Name: "kvstore_operations_total",
		Help: "Number of interactions with the Key-Value Store, labeled by subsystem, kind of action and action",
	}, []string{LabelScope, LabelKind, LabelAction, LabelOutcome})

	// KVStoreOperationsDuration records the duration of kvstore operations,
	// labeled by scope, kind, action and outcome.
	KVStoreOperationsDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Namespace: Namespace,
		Subsystem: "kvstore",
		Name:      "operations_duration_seconds",
		Help:      "Duration in seconds of kvstore operations",
	}, []string{LabelScope, LabelKind, LabelAction, LabelOutcome})

	// KVStoreEventsQueueDuration records the duration in seconds that a
	// received event was blocked before it could be queued. It uses explicit
	// buckets ranging from 2ms to 1s.
	KVStoreEventsQueueDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Namespace: Namespace,
		Subsystem: "kvstore",
		Name:      "events_queue_seconds",
		Help:      "Duration in seconds of time received event was blocked before it could be queued",
		Buckets:   []float64{.002, .005, .01, .015, .025, .05, .1, .25, .5, .75, 1},
	}, []string{LabelScope, LabelAction})

	// FQDNGarbageCollectorCleanedTotal is the number of domains cleaned by the
	// GC job.
	//
	// NOTE(review): no Namespace is set here, so the metric name presumably
	// lacks the Namespace prefix. Left unchanged to keep the exported name
	// stable; confirm whether this is intentional.
	FQDNGarbageCollectorCleanedTotal = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "fqdn_gc_deletions_total",
		Help: "Number of FQDNs that have been cleaned on FQDN Garbage collector job",
	})
)

Functions

func DumpMetrics

func DumpMetrics() ([]*models.Metric, error)

DumpMetrics gets the current Cilium metrics and dumps them all into a models.Metrics structure. If the metrics cannot be retrieved, it returns an error.

func Enable

func Enable(addr string) <-chan error

Enable begins serving prometheus metrics on the address passed in. Addresses of the form ":8080" will bind the port on all interfaces.

func Errno2Outcome

func Errno2Outcome(errno syscall.Errno) string

Errno2Outcome converts a syscall.Errno to LabelOutcome

func Error2Outcome

func Error2Outcome(err error) string

Error2Outcome converts an error to LabelOutcome

func GetCounterValue

func GetCounterValue(m prometheus.Counter) float64

GetCounterValue returns the current value stored for the counter

func MustRegister

func MustRegister(c prometheus.Collector)

MustRegister adds the collector to the registry, exposing this metric to prometheus scrapes. It will panic on error.

func Register

func Register(c prometheus.Collector) error

Register registers a collector

func RegisterList

func RegisterList(list []prometheus.Collector) error

RegisterList registers a list of collectors. If registration of one collector fails, no collector is registered.

func Unregister

func Unregister(c prometheus.Collector) bool

Unregister unregisters a collector

Types

type APIEventTSHelper

// APIEventTSHelper is intended to be a global middleware to track metrics
// around API calls. It records the timestamp of an API call in the provided
// gauge.
type APIEventTSHelper struct {
	// Next is the handler that ServeHTTP chains to.
	Next      http.Handler
	// TSGauge is the gauge in which the timestamp of an API call is recorded.
	TSGauge   prometheus.Gauge
	// Histogram is presumably where per-call durations are observed
	// (compare APIInteractions) — TODO confirm against ServeHTTP.
	Histogram *prometheus.HistogramVec
}

APIEventTSHelper is intended to be a global middleware to track metrics around API calls. It records the timestamp of an API call in the provided gauge.

func (*APIEventTSHelper) ServeHTTP

func (m *APIEventTSHelper) ServeHTTP(r http.ResponseWriter, req *http.Request)

ServeHTTP implements the http.Handler interface. It records the timestamp this API call began at, then chains to the next handler.

type LoggingHook

// LoggingHook is a hook for logrus which counts error and warning messages
// as a Prometheus metric.
type LoggingHook struct {
	// contains filtered or unexported fields
}

LoggingHook is a hook for logrus which counts error and warning messages as a Prometheus metric.

func NewLoggingHook

func NewLoggingHook(component string) *LoggingHook

NewLoggingHook returns a new instance of LoggingHook for the given Cilium component.

func (*LoggingHook) Fire

func (h *LoggingHook) Fire(entry *logrus.Entry) error

Fire is the main method, called every time the logger emits an error or warning message.

func (*LoggingHook) Levels

func (h *LoggingHook) Levels() []logrus.Level

Levels returns the list of logging levels on which the hook is triggered.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL