config

package
v0.7.9 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 8, 2024 License: Apache-2.0 Imports: 9 Imported by: 0

Documentation

Index

Constants

View Source
// Short string labels: core, DRAM, uncore, package, GPU, other, platform and
// frequency.
const (
	CORE      = "core"
	DRAM      = "dram"
	UNCORE    = "uncore"
	PKG       = "package"
	GPU       = "gpu"
	OTHER     = "other"
	PLATFORM  = "platform"
	FREQUENCY = "frequency"
)

// Counter metric names (attacher package).
const (
	CPUCycle       = "cpu_cycles"
	CPURefCycle    = "cpu_ref_cycles"
	CPUInstruction = "cpu_instructions"
	CacheMiss      = "cache_miss"
	TaskClock      = "task_clock_ms"
)

// BPF metric names (attacher package).
const (
	CPUTime       = "bpf_cpu_time_ms"
	PageCacheHit  = "bpf_page_cache_hit"
	IRQNetTXLabel = "bpf_net_tx_irq"
	IRQNetRXLabel = "bpf_net_rx_irq"
	IRQBlockLabel = "bpf_block_irq"
)

// Cgroup metric names (cgroup package).
const (
	CgroupfsMemory       = "cgroupfs_memory_usage_bytes"
	CgroupfsKernelMemory = "cgroupfs_kernel_memory_usage_bytes"
	CgroupfsTCPMemory    = "cgroupfs_tcp_memory_usage_bytes"
	CgroupfsCPU          = "cgroupfs_cpu_usage_us"
	CgroupfsSystemCPU    = "cgroupfs_system_cpu_usage_us"
	CgroupfsUserCPU      = "cgroupfs_user_cpu_usage_us"
	CgroupfsReadIO       = "cgroupfs_ioread_bytes"
	CgroupfsWriteIO      = "cgroupfs_iowrite_bytes"
	BytesReadIO          = "bytes_read"
	BytesWriteIO         = "bytes_writes"
	BlockDevicesIO       = "block_devices_used"
)

// CPUFrequency is the system-level metric name for the average CPU frequency.
const CPUFrequency = "avg_cpu_frequency"

// NVIDIA GPU metric names.
const (
	GPUComputeUtilization = "gpu_compute_util"
	GPUMemUtilization     = "gpu_mem_util"
)

// QATUtilization is the Intel QuickAssist Technology (QAT) metric name.
// TODO: test if different request has different energy consumption.
// NOTE(review): the value is spelled "qta", not "qat"; confirm whether any
// consumer depends on this exact string before changing it.
const QATUtilization = "qta_sample_cnt"

// Energy metrics: absolute energy and power.
const (
	AbsEnergyInCore     = "abs_energy_in_core"
	AbsEnergyInDRAM     = "abs_energy_in_dram"
	AbsEnergyInUnCore   = "abs_energy_in_uncore"
	AbsEnergyInPkg      = "abs_energy_in_pkg"
	AbsEnergyInGPU      = "abs_energy_in_gpu"
	AbsEnergyInOther    = "abs_energy_in_other"
	AbsEnergyInPlatform = "abs_energy_in_platform"
)

// Energy metrics: dynamic energy and power.
const (
	DynEnergyInCore     = "dyn_energy_in_core"
	DynEnergyInDRAM     = "dyn_energy_in_dram"
	DynEnergyInUnCore   = "dyn_energy_in_uncore"
	DynEnergyInPkg      = "dyn_energy_in_pkg"
	DynEnergyInGPU      = "dyn_energy_in_gpu"
	DynEnergyInOther    = "dyn_energy_in_other"
	DynEnergyInPlatform = "dyn_energy_in_platform"
)

// Energy metrics: idle energy and power.
const (
	IdleEnergyInCore     = "idle_energy_in_core"
	IdleEnergyInDRAM     = "idle_energy_in_dram"
	IdleEnergyInUnCore   = "idle_energy_in_uncore"
	IdleEnergyInPkg      = "idle_energy_in_pkg"
	IdleEnergyInGPU      = "idle_energy_in_gpu"
	IdleEnergyInOther    = "idle_energy_in_other"
	IdleEnergyInPlatform = "idle_energy_in_platform"
)
View Source
// MaxIRQ is the upper bound on the number of IRQs to be monitored.
const MaxIRQ = 10

Variables

View Source
// EnabledMSR starts out false.
var EnabledMSR bool

// EnabledBPFBatchDelete starts out true.
var EnabledBPFBatchDelete = true

// KernelVersion starts out as zero.
var KernelVersion float32

// KeplerNamespace is obtained through getConfig with the key
// "KEPLER_NAMESPACE"; the second argument is presumably the fallback value
// (getConfig is defined elsewhere; verify there).
var KeplerNamespace = getConfig("KEPLER_NAMESPACE", defaultNamespace)

// UseLibBPFAttacher starts out false.
var UseLibBPFAttacher bool

// Feature and exposure switches, each obtained through getBoolConfig with the
// listed key; the second argument is presumably the default (verify against
// getBoolConfig, which is defined elsewhere).
var (
	EnabledEBPFCgroupID          = getBoolConfig("ENABLE_EBPF_CGROUPID", true)
	EnabledGPU                   = getBoolConfig("ENABLE_GPU", false)
	EnabledQAT                   = getBoolConfig("ENABLE_QAT", false)
	EnableProcessStats           = getBoolConfig("ENABLE_PROCESS_METRICS", false)
	ExposeContainerStats         = getBoolConfig("EXPOSE_CONTAINER_METRICS", true)
	ExposeVMStats                = getBoolConfig("EXPOSE_VM_METRICS", true)
	ExposeHardwareCounterMetrics = getBoolConfig("EXPOSE_HW_COUNTER_METRICS", true)
	ExposeCgroupMetrics          = getBoolConfig("EXPOSE_CGROUP_METRICS", true)
	ExposeIRQCounterMetrics      = getBoolConfig("EXPOSE_IRQ_COUNTER_METRICS", true)
	ExposeIdlePowerMetrics       = getBoolConfig("EXPOSE_ESTIMATED_IDLE_POWER_METRICS", false)
	ExposeCPUFrequencyMetrics    = getBoolConfig("EXPOSE_CPU_FREQUENCY_METRICS", false)
)

// Key names for the metric path and bind address settings.
var (
	MetricPathKey  = "METRIC_PATH"
	BindAddressKey = "BIND_ADDRESS"
)

// CPU architecture override, lookup retry limit and experimental BPF sample
// rate.
var (
	CPUArchOverride = getConfig("CPU_ARCH_OVERRIDE", "")
	MaxLookupRetry  = getIntConfig("MAX_LOOKUP_RETRY", defaultMaxLookupRetry)
	BPFSampleRate   = getIntConfig("EXPERIMENTAL_BPF_SAMPLE_RATE", 0)
)

// Estimator selection and the usage metric chosen for each energy domain.
var (
	EstimatorModel        = getConfig("ESTIMATOR_MODEL", defaultMetricValue)         // auto-select
	EstimatorSelectFilter = getConfig("ESTIMATOR_SELECT_FILTER", defaultMetricValue) // no filter
	CoreUsageMetric       = getConfig("CORE_USAGE_METRIC", CPUInstruction)
	DRAMUsageMetric       = getConfig("DRAM_USAGE_METRIC", CacheMiss)
	UncoreUsageMetric     = getConfig("UNCORE_USAGE_METRIC", defaultMetricValue)  // no metric (evenly divided)
	GpuUsageMetric        = getConfig("GPU_USAGE_METRIC", GPUComputeUtilization)  // GPU compute utilization
	GeneralUsageMetric    = getConfig("GENERAL_USAGE_METRIC", defaultMetricValue) // for uncategorized energy
)

// SamplePeriodSec is the "SAMPLE_PERIOD_SEC" integer setting converted to
// uint64.
var SamplePeriodSec = uint64(getIntConfig("SAMPLE_PERIOD_SEC", defaultSamplePeriodSec))

// DCGMHostEngineEndpoint is the nvidia dcgm hostengine endpoint.
var DCGMHostEngineEndpoint = getConfig("NVIDIA_HOSTENGINE_ENDPOINT", "localhost:5555")

// Model server settings.
var (
	ModelServerEnable   = getBoolConfig("MODEL_SERVER_ENABLE", false)
	ModelServerEndpoint = SetModelServerReqEndpoint()
)

// ModelConfigValues holds the model config; it is nil until assigned.
var ModelConfigValues map[string]string

// Model parameter prefixes.
var (
	NodePlatformPowerKey        = "NODE_TOTAL"
	NodeComponentsPowerKey      = "NODE_COMPONENTS"
	ContainerPlatformPowerKey   = "CONTAINER_TOTAL"
	ContainerComponentsPowerKey = "CONTAINER_COMPONENTS"
	ProcessPlatformPowerKey     = "PROCESS_TOTAL"
	ProcessComponentsPowerKey   = "PROCESS_COMPONENTS"
)

// Model parameter attributes.
// NOTE(review): the comment on RatioEnabledKey mentions LINEAR_REGRESSION,
// while the key declared in this group is LOCAL_REGRESSOR; confirm which
// name is intended.
var (
	RatioEnabledKey          = "RATIO" // the default container power model is RATIO but ESTIMATOR or LINEAR_REGRESSION can be used
	EstimatorEnabledKey      = "ESTIMATOR"
	LocalRegressorEnabledKey = "LOCAL_REGRESSOR"
	InitModelURLKey          = "INIT_URL"
	FixedTrainerNameKey      = "TRAINER"
	FixedNodeTypeKey         = "NODE_TYPE"
	ModelFiltersKey          = "FILTERS"
	DefaultTrainerName       = "SGDRegressorTrainer"
)

// KubeConfig is used to start k8s client with the pod running outside the
// cluster. It starts out empty.
var KubeConfig string

// EnableAPIServer starts out false.
var EnableAPIServer bool

Functions

func GetBindAddress

func GetBindAddress(cmdSet string) string

func GetCGroupVersion

func GetCGroupVersion() int

GetCGroupVersion returns the cgroup version, which is either 1 or 2.

func GetDefaultPowerModelURL added in v0.5.5

func GetDefaultPowerModelURL(modelOutputType, energySource string) string

GetDefaultPowerModelURL returns the local path to the power model weight, e.g., /var/lib/kepler/data/acpi_AbsPowerModel.json

func GetKernelSourceDirs added in v0.5.1

func GetKernelSourceDirs() []string

func GetMetricPath

func GetMetricPath(cmdSet string) string

func GetModelConfigMap added in v0.5.4

func GetModelConfigMap() map[string]string

func GetRedfishCredFilePath added in v0.5.2

func GetRedfishCredFilePath() string

func GetRedfishProbeIntervalInSeconds added in v0.5.2

func GetRedfishProbeIntervalInSeconds() int

func GetRedfishSkipSSLVerify added in v0.5.2

func GetRedfishSkipSSLVerify() bool

func InitModelConfigMap

func InitModelConfigMap()

InitModelConfigMap initializes map of config from MODEL_CONFIG

func IsCgroupMetricsEnabled added in v0.5.5

func IsCgroupMetricsEnabled() bool

func IsExposeCPUFrequencyMetricsEnabled added in v0.7.3

func IsExposeCPUFrequencyMetricsEnabled() bool

IsExposeCPUFrequencyMetricsEnabled returns false if CPUFrequency metrics are disabled to minimize overhead.

func IsExposeContainerStatsEnabled added in v0.7.3

func IsExposeContainerStatsEnabled() bool

IsExposeContainerStatsEnabled returns false if container metrics are disabled to minimize overhead in the Kepler standalone mode.

func IsExposeProcessStatsEnabled added in v0.7.3

func IsExposeProcessStatsEnabled() bool

IsExposeProcessStatsEnabled returns false if process metrics are disabled to minimize overhead in the Kepler standalone mode.

func IsExposeQATMetricsEnabled added in v0.7.3

func IsExposeQATMetricsEnabled() bool

IsExposeQATMetricsEnabled returns false if QAT metrics are disabled to minimize overhead.

func IsExposeVMStatsEnabled added in v0.7.3

func IsExposeVMStatsEnabled() bool

IsExposeVMStatsEnabled returns false if VM metrics are disabled to minimize overhead.

func IsHCMetricsEnabled added in v0.7.3

func IsHCMetricsEnabled() bool

func IsIRQCounterMetricsEnabled added in v0.5.5

func IsIRQCounterMetricsEnabled() bool

func IsIdlePowerEnabled added in v0.6.1

func IsIdlePowerEnabled() bool

IsIdlePowerEnabled always returns true if Kepler has access to system power metrics. However, if pre-trained power models are being used, Kepler should only expose metrics if the user is aware of the implications.

func LogConfigs

func LogConfigs()

func SetEnableAPIServer added in v0.5.1

func SetEnableAPIServer(enabled bool)

SetEnableAPIServer enables Kepler to watch apiserver

func SetEnabledEBPFCgroupID

func SetEnabledEBPFCgroupID(enabled bool)

SetEnabledEBPFCgroupID enables the eBPF code to collect cgroup id if the system has kernel version > 4.18

func SetEnabledGPU

func SetEnabledGPU(enabled bool)

SetEnabledGPU enables the exposure of gpu metrics

func SetEnabledHardwareCounterMetrics

func SetEnabledHardwareCounterMetrics(enabled bool)

SetEnabledHardwareCounterMetrics enables the exposure of hardware counter metrics

func SetEnabledIdlePower added in v0.6.1

func SetEnabledIdlePower(enabled bool)

SetEnabledIdlePower allows enabling idle power exposure in Kepler's metrics. When direct power metrics access is available, idle power exposure is automatic. With pre-trained power models, awareness of implications is crucial. Estimated idle power is useful for bare-metal or single VM setups. In VM environments, accurately distributing idle power is tough due to unknown co-running VMs. Wrong division results in significant accuracy errors, duplicating the host idle power across all VMs. Container pre-trained models focus on dynamic power. Estimating idle power in limited information scenarios (like VMs) is complex. Idle power prediction is limited to bare-metal or single VM setups. Knowing the number of running VMs becomes crucial for achieving a fair distribution of idle power, particularly when following the GHG (Greenhouse Gas) protocol.

func SetEnabledQAT added in v0.5.4

func SetEnabledQAT(enabled bool)

SetEnabledQAT enables the exposure of qat metrics

func SetEstimatorConfig

func SetEstimatorConfig(modelName, selectFilter string)

func SetGpuUsageMetric added in v0.7.4

func SetGpuUsageMetric(metric string)

func SetKernelSourceDir added in v0.5.1

func SetKernelSourceDir(dir string) error

SetKernelSourceDir sets the directory for all kernel source. This is used for bcc. Only the top level directory is needed.

func SetKubeConfig added in v0.5.1

func SetKubeConfig(k string)

SetKubeConfig sets the kubeconfig file.

func SetModelServerEndpoint

func SetModelServerEndpoint(serverEndpoint string)

func SetModelServerReqEndpoint

func SetModelServerReqEndpoint() (modelServerReqEndpoint string)

func SetRedfishCredFilePath added in v0.5.2

func SetRedfishCredFilePath(credFilePath string)

func SetRedfishProbeIntervalInSeconds added in v0.5.2

func SetRedfishProbeIntervalInSeconds(interval string)

func SetRedfishSkipSSLVerify added in v0.5.2

func SetRedfishSkipSSLVerify(skipSSLVerify bool)

Types

type Client

type Client interface {
	// contains filtered or unexported methods
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL