metrics

package
v1.15.2-rc.1 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: May 30, 2023 License: Apache-2.0 Imports: 11 Imported by: 0

Documentation

Index

Constants

View Source
const (
	// OpenTelemetry is the app label for all otel resources.
	OpenTelemetry = "opentelemetry"

	// OtelAgentName is the name of the OpenTelemetry Agent.
	OtelAgentName = "otel-agent"

	// OtelCollectorName is the name of the OpenTelemetry Collector.
	OtelCollectorName = "otel-collector"

	// OtelCollectorGooglecloud is the name of the OpenTelemetry Collector
	// ConfigMap that contains the Googlecloud exporter.
	OtelCollectorGooglecloud = "otel-collector-googlecloud"

	// OtelCollectorCustomCM is the name of the custom OpenTelemetry Collector ConfigMap.
	OtelCollectorCustomCM = "otel-collector-custom"

	// MonitoringNamespace is the Namespace used for the OpenTelemetry Collector deployment.
	MonitoringNamespace = "config-management-monitoring"

	// CollectorConfigGooglecloud is the OpenTelemetry Collector configuration with
	// the googlecloud exporter.
	// The literal's contents are not shown in this listing; only the
	// placeholder below is visible.
	CollectorConfigGooglecloud = `` /* 6970-byte string literal not displayed */

)
View Source
const (
	// StatusSuccess is the string value for the status key (see KeyStatus)
	// indicating success.
	StatusSuccess = "success"
	// StatusError is the string value for the status key (see KeyStatus)
	// indicating failure/errors.
	StatusError = "error"
	// CommitNone is the string value for the commit key indicating that no
	// commit has been synced.
	CommitNone = "NONE"
	// ApplierController is the string value for the applier controller in the
	// multi-repo mode. It is one of the values listed for KeyController.
	ApplierController = "applier"
	// RemediatorController is the string value for the remediator controller in
	// the multi-repo mode. It is one of the values listed for KeyController.
	RemediatorController = "remediator"
)

Variables

View Source
var (
	// APICallDuration metric measures the latency of API server calls.
	APICallDuration = stats.Float64(
		"api_duration_seconds",
		"The duration of API server calls in seconds",
		stats.UnitSeconds)

	// ReconcilerErrors metric measures the number of errors in the reconciler.
	ReconcilerErrors = stats.Int64(
		"reconciler_errors",
		"The number of errors in the reconciler",
		stats.UnitDimensionless)

	// PipelineError metric measures the error by components when syncing a commit.
	// Definition here must exactly match the definition in the resource-group
	// controller, or the Prometheus exporter will error. b/247516388
	// https://github.com/GoogleContainerTools/kpt-resource-group/blob/main/controllers/metrics/metrics.go#L88
	PipelineError = stats.Int64(
		"pipeline_error_observed",
		"A boolean value indicates if error happened at readiness stage when syncing a commit",
		stats.UnitDimensionless)

	// ReconcileDuration metric measures the latency of reconcile events.
	ReconcileDuration = stats.Float64(
		"reconcile_duration_seconds",
		"The duration of reconcile events in seconds",
		stats.UnitSeconds)

	// ParserDuration metric measures the latency of the parse-apply-watch loop.
	ParserDuration = stats.Float64(
		"parser_duration_seconds",
		"The duration of the parse-apply-watch loop in seconds",
		stats.UnitSeconds)

	// LastSync metric measures the timestamp of the latest Git sync.
	LastSync = stats.Int64(
		"last_sync_timestamp",
		"The timestamp of the most recent sync from Git",
		stats.UnitDimensionless)

	// DeclaredResources metric measures the number of declared resources parsed from Git.
	DeclaredResources = stats.Int64(
		"declared_resources",
		"The number of declared resources parsed from Git",
		stats.UnitDimensionless)

	// ApplyOperations metric measures the number of applier apply events.
	ApplyOperations = stats.Int64(
		"apply_operations",
		"The number of operations that have been performed to sync resources to source of truth",
		stats.UnitDimensionless)

	// ApplyDuration metric measures the latency of applier apply events.
	ApplyDuration = stats.Float64(
		"apply_duration_seconds",
		"The duration of applier events in seconds",
		stats.UnitSeconds)

	// ResourceFights metric measures the number of resource fights.
	ResourceFights = stats.Int64(
		"resource_fights",
		"The number of resources that are being synced too frequently",
		stats.UnitDimensionless)

	// RemediateDuration metric measures the latency of remediator reconciliation events.
	// NOTE(review): unlike the other duration measures in this block, the
	// description below does not say "in seconds", although the unit is
	// stats.UnitSeconds.
	RemediateDuration = stats.Float64(
		"remediate_duration_seconds",
		"The duration of remediator reconciliation events",
		stats.UnitSeconds)

	// LastApply metric measures the timestamp of the most recent applier apply event.
	LastApply = stats.Int64(
		"last_apply_timestamp",
		"The timestamp of the most recent applier event",
		stats.UnitDimensionless)

	// ResourceConflicts metric measures the number of resource conflicts.
	ResourceConflicts = stats.Int64(
		"resource_conflicts",
		"The number of resource conflicts resulting from a mismatch between the cached resources and cluster resources",
		stats.UnitDimensionless)

	// InternalErrors metric measures the number of unexpected internal errors triggered by defensive checks in Config Sync.
	InternalErrors = stats.Int64(
		"internal_errors",
		"The number of internal errors triggered by Config Sync",
		stats.UnitDimensionless)
)
View Source
var (
	// NOTE(review): the error returned by tag.NewKey is discarded for every key
	// in this block; presumably safe because each name is a fixed literal —
	// confirm against the tag package's key-name rules.

	// KeyName groups metrics by the reconciler name. Possible values: root-reconciler, ns-reconciler-<namespace>
	// TODO b/208316928 remove this key from pipeline_error_observed metric once same metric in Resource Group Controller has this tag removed
	KeyName, _ = tag.NewKey("name")

	// KeyReconcilerType groups metrics by the reconciler type. Possible values: root, namespace.
	// TODO: replace with configsync.sync.kind resource attribute
	KeyReconcilerType, _ = tag.NewKey("reconciler")

	// KeyOperation groups metrics by their operation. Possible values: create, patch, update, delete.
	KeyOperation, _ = tag.NewKey("operation")

	// KeyController groups metrics by their controller. Possible values: applier, remediator.
	KeyController, _ = tag.NewKey("controller")

	// KeyComponent groups metrics by their component. Possible values: source, sync, rendering, readiness (from Resource Group Controller).
	KeyComponent, _ = tag.NewKey("component")

	// KeyExportedComponent groups metrics by their component.
	// The "component" metric tag overlaps with a resource tag exported by
	// resource_to_telemetry_conversion when using Prometheus. So it's renamed
	// "exported_component" when exported to Prometheus.
	// TODO: Fix this naming overlap by renaming the "component" metric tag.
	// Possible values: source, sync, rendering, readiness (from Resource Group Controller).
	KeyExportedComponent, _ = tag.NewKey("exported_component")

	// KeyErrorClass groups metrics by their error code.
	KeyErrorClass, _ = tag.NewKey("errorclass")

	// KeyStatus groups metrics by their status. Possible values: success, error.
	KeyStatus, _ = tag.NewKey("status")

	// KeyType groups metrics by their resource Kind.
	KeyType, _ = tag.NewKey("type")

	// KeyInternalErrorSource groups the InternalError metrics by their source. Possible values: parser, differ, remediator.
	// It uses the same tag name ("source") as KeyParserSource.
	KeyInternalErrorSource, _ = tag.NewKey("source")

	// KeyParserSource groups the metrics for the parser by their source. Possible values: read, parse, update.
	// It uses the same tag name ("source") as KeyInternalErrorSource.
	KeyParserSource, _ = tag.NewKey("source")

	// KeyTrigger groups metrics by their trigger. Possible values: retry, watchUpdate, managementConflict, resync, reimport.
	KeyTrigger, _ = tag.NewKey("trigger")

	// KeyCommit groups metrics by their git commit. Even though this tag has a high cardinality,
	// it is only used by the `last_sync_timestamp` and `last_apply_timestamp` metrics.
	// These are both aggregated as LastValue metrics so the number of recorded values will always be
	// at most 1 per git commit.
	KeyCommit, _ = tag.NewKey("commit")

	// KeyContainer groups metrics by their container names. Possible values: reconciler, git-sync.
	// TODO: replace with k8s.container.name resource attribute
	KeyContainer, _ = tag.NewKey("container")

	// KeyResourceType groups metrics by their resource types. Possible values: cpu, memory.
	KeyResourceType, _ = tag.NewKey("resource")
)
View Source
var (
	// ResourceKeySyncKind groups metrics by the Sync kind. Possible values: RootSync, RepoSync.
	ResourceKeySyncKind, _ = tag.NewKey("configsync_sync_kind")

	// ResourceKeySyncName groups metrics by the Sync name.
	ResourceKeySyncName, _ = tag.NewKey("configsync_sync_name")

	// ResourceKeySyncNamespace groups metrics by the Sync namespace.
	ResourceKeySyncNamespace, _ = tag.NewKey("configsync_sync_namespace")

	// ResourceKeyDeploymentName groups metrics by k8s deployment name.
	ResourceKeyDeploymentName, _ = tag.NewKey("k8s_deployment_name")

	// ResourceKeyPodName groups metrics by k8s pod name.
	ResourceKeyPodName, _ = tag.NewKey("k8s_pod_name")
)

The following metric tag keys are available from the otel-collector Prometheus exporter. They are created from resource attributes using the resource_to_telemetry_conversion feature.

View Source
var (
	// APICallDurationView aggregates the APICallDuration metric measurements
	// as a distribution over distributionBounds (not shown in this listing).
	APICallDurationView = &view.View{
		Name:        APICallDuration.Name(),
		Measure:     APICallDuration,
		Description: "The latency distribution of API server calls",
		TagKeys:     []tag.Key{KeyOperation, KeyType, KeyStatus},
		Aggregation: view.Distribution(distributionBounds...),
	}

	// ReconcilerErrorsView aggregates the ReconcilerErrors metric measurements.
	ReconcilerErrorsView = &view.View{
		Name:        ReconcilerErrors.Name(),
		Measure:     ReconcilerErrors,
		Description: "The current number of errors in the RootSync and RepoSync reconcilers",
		TagKeys:     []tag.Key{KeyComponent, KeyErrorClass},
		Aggregation: view.LastValue(),
	}

	// PipelineErrorView aggregates the PipelineError metric measurements.
	// Definition here must exactly match the definition in the resource-group
	// controller, or the Prometheus exporter will error. b/247516388
	// https://github.com/GoogleContainerTools/kpt-resource-group/blob/main/controllers/metrics/views.go#L123
	PipelineErrorView = &view.View{
		Name:        PipelineError.Name(),
		Measure:     PipelineError,
		Description: "A boolean value indicates if error happened from different stages when syncing a commit",
		TagKeys:     []tag.Key{KeyName, KeyReconcilerType, KeyComponent},
		Aggregation: view.LastValue(),
	}

	// ReconcileDurationView aggregates the ReconcileDuration metric measurements.
	ReconcileDurationView = &view.View{
		Name:        ReconcileDuration.Name(),
		Measure:     ReconcileDuration,
		Description: "The latency distribution of RootSync and RepoSync reconcile events",
		TagKeys:     []tag.Key{KeyStatus},
		Aggregation: view.Distribution(distributionBounds...),
	}

	// ParserDurationView aggregates the ParserDuration metric measurements
	// as a distribution over longDistributionBounds (not shown in this listing).
	ParserDurationView = &view.View{
		Name:        ParserDuration.Name(),
		Measure:     ParserDuration,
		Description: "The latency distribution of the parse-apply-watch loop",
		TagKeys:     []tag.Key{KeyStatus, KeyTrigger, KeyParserSource},
		Aggregation: view.Distribution(longDistributionBounds...),
	}

	// LastSyncTimestampView aggregates the LastSync metric measurements.
	LastSyncTimestampView = &view.View{
		Name:        LastSync.Name(),
		Measure:     LastSync,
		Description: "The timestamp of the most recent sync from Git",
		TagKeys:     []tag.Key{KeyCommit, KeyStatus},
		Aggregation: view.LastValue(),
	}

	// DeclaredResourcesView aggregates the DeclaredResources metric measurements.
	DeclaredResourcesView = &view.View{
		Name:        DeclaredResources.Name(),
		Measure:     DeclaredResources,
		Description: "The current number of declared resources parsed from Git",
		TagKeys:     []tag.Key{KeyCommit},
		Aggregation: view.LastValue(),
	}

	// ApplyOperationsView aggregates the ApplyOperations metric measurements.
	// The view name is the measure name with a "_total" suffix.
	ApplyOperationsView = &view.View{
		Name:        ApplyOperations.Name() + "_total",
		Measure:     ApplyOperations,
		Description: "The total number of operations that have been performed to sync resources to source of truth",
		TagKeys:     []tag.Key{KeyController, KeyOperation, KeyType, KeyStatus},
		Aggregation: view.Count(),
	}

	// ApplyDurationView aggregates the ApplyDuration metric measurements.
	ApplyDurationView = &view.View{
		Name:        ApplyDuration.Name(),
		Measure:     ApplyDuration,
		Description: "The latency distribution of applier resource sync events",
		TagKeys:     []tag.Key{KeyCommit, KeyStatus},
		Aggregation: view.Distribution(longDistributionBounds...),
	}

	// LastApplyTimestampView aggregates the LastApply metric measurements.
	LastApplyTimestampView = &view.View{
		Name:        LastApply.Name(),
		Measure:     LastApply,
		Description: "The timestamp of the most recent applier resource sync event",
		TagKeys:     []tag.Key{KeyCommit, KeyStatus},
		Aggregation: view.LastValue(),
	}

	// ResourceFightsView aggregates the ResourceFights metric measurements.
	// The view name is the measure name with a "_total" suffix, and no TagKeys
	// are set on this view.
	ResourceFightsView = &view.View{
		Name:        ResourceFights.Name() + "_total",
		Measure:     ResourceFights,
		Description: "The total number of resources that are being synced too frequently",
		Aggregation: view.Count(),
	}

	// RemediateDurationView aggregates the RemediateDuration metric measurements.
	RemediateDurationView = &view.View{
		Name:        RemediateDuration.Name(),
		Measure:     RemediateDuration,
		Description: "The latency distribution of remediator reconciliation events",
		TagKeys:     []tag.Key{KeyType, KeyStatus},
		Aggregation: view.Distribution(distributionBounds...),
	}

	// ResourceConflictsView aggregates the ResourceConflicts metric measurements.
	// The view name is the measure name with a "_total" suffix.
	ResourceConflictsView = &view.View{
		Name:        ResourceConflicts.Name() + "_total",
		Measure:     ResourceConflicts,
		Description: "The total number of resource conflicts resulting from a mismatch between the cached resources and cluster resources",
		TagKeys:     []tag.Key{KeyCommit},
		Aggregation: view.Count(),
	}

	// InternalErrorsView aggregates the InternalErrors metric measurements.
	// The view name is the measure name with a "_total" suffix.
	InternalErrorsView = &view.View{
		Name:        InternalErrors.Name() + "_total",
		Measure:     InternalErrors,
		Description: "The total number of internal errors triggered by Config Sync",
		TagKeys:     []tag.Key{KeyInternalErrorSource},
		Aggregation: view.Count(),
	}
)

Functions

func RecordAPICallDuration

func RecordAPICallDuration(ctx context.Context, operation, status, kind string, startTime time.Time)

RecordAPICallDuration produces a measurement for the APICallDuration view.

func RecordApplyDuration

func RecordApplyDuration(ctx context.Context, status, commit string, startTime time.Time)

RecordApplyDuration produces measurements for the ApplyDuration and LastApplyTimestamp views.

func RecordApplyOperation

func RecordApplyOperation(ctx context.Context, controller, operation, status, kind string)

RecordApplyOperation produces a measurement for the ApplyOperations view.

func RecordDeclaredResources

func RecordDeclaredResources(ctx context.Context, commit string, numResources int)

RecordDeclaredResources produces a measurement for the DeclaredResources view.

func RecordInternalError

func RecordInternalError(ctx context.Context, source string)

RecordInternalError produces measurements for the InternalErrors view.

func RecordLastSync

func RecordLastSync(ctx context.Context, status, commit string, timestamp time.Time)

RecordLastSync produces a measurement for the LastSync view.

func RecordParserDuration

func RecordParserDuration(ctx context.Context, trigger, source, status string, startTime time.Time)

RecordParserDuration produces a measurement for the ParserDuration view.

func RecordPipelineError

func RecordPipelineError(ctx context.Context, reconcilerType, component string, errLen int)

RecordPipelineError produces a measurement for the PipelineError view.

func RecordReconcileDuration

func RecordReconcileDuration(ctx context.Context, status string, startTime time.Time)

RecordReconcileDuration produces a measurement for the ReconcileDuration view.

func RecordReconcilerErrors

func RecordReconcilerErrors(ctx context.Context, component string, errs []v1beta1.ConfigSyncError)

RecordReconcilerErrors produces a measurement for the ReconcilerErrors view.

func RecordRemediateDuration

func RecordRemediateDuration(ctx context.Context, status, kind string, startTime time.Time)

RecordRemediateDuration produces measurements for the RemediateDuration view.

func RecordResourceConflict

func RecordResourceConflict(ctx context.Context, _, commit string)

RecordResourceConflict produces measurements for the ResourceConflicts view.

func RecordResourceFight

func RecordResourceFight(ctx context.Context, _, _ string)

RecordResourceFight produces measurements for the ResourceFights view.

func RegisterOCAgentExporter

func RegisterOCAgentExporter(containerName string) (*ocagent.Exporter, error)

RegisterOCAgentExporter creates the OC Agent metrics exporter.

func RegisterReconcilerManagerMetricsViews

func RegisterReconcilerManagerMetricsViews() error

RegisterReconcilerManagerMetricsViews registers the views so that recorded metrics can be exported in the reconciler manager.

func RegisterReconcilerMetricsViews

func RegisterReconcilerMetricsViews() error

RegisterReconcilerMetricsViews registers the views so that recorded metrics can be exported in the reconcilers.

func StatusTagKey

func StatusTagKey(err error) string

StatusTagKey returns a string representation of the error, if it exists, otherwise success.

func StatusTagValueFromSummary added in v1.15.1

func StatusTagValueFromSummary(summary *v1beta1.ErrorSummary) string

StatusTagValueFromSummary returns error if the summary indicates at least 1 error, otherwise success.

Types

This section is empty.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL