Documentation ¶
Index ¶
- Constants
- Variables
- func GetBindAddress(cmdSet string) string
- func GetCGroupVersion() int
- func GetDefaultPowerModelURL(modelOutputType, energySource string) string
- func GetKernelSourceDirs() []string
- func GetMetricPath(cmdSet string) string
- func GetMockACPIPowerPath() string
- func GetModelConfigMap() map[string]string
- func GetRedfishCredFilePath() string
- func GetRedfishProbeIntervalInSeconds() int
- func GetRedfishSkipSSLVerify() bool
- func InitModelConfigMap()
- func IsExposeBPFMetricsEnabled() bool
- func IsExposeComponentPowerEnabled() bool
- func IsExposeContainerStatsEnabled() bool
- func IsExposeProcessStatsEnabled() bool
- func IsExposeQATMetricsEnabled() bool
- func IsExposeVMStatsEnabled() bool
- func IsIdlePowerEnabled() bool
- func LogConfigs()
- func SetEnableAPIServer(enabled bool)
- func SetEnabledEBPFCgroupID(enabled bool)
- func SetEnabledGPU(enabled bool)
- func SetEnabledHardwareCounterMetrics(enabled bool)
- func SetEnabledIdlePower(enabled bool)
- func SetEnabledQAT(enabled bool)
- func SetEstimatorConfig(modelName, selectFilter string)
- func SetGpuUsageMetric(metric string)
- func SetKernelSourceDir(dir string) error
- func SetKubeConfig(k string)
- func SetModelServerEndpoint(serverEndpoint string)
- func SetModelServerReqEndpoint() (modelServerReqEndpoint string)
- func SetRedfishCredFilePath(credFilePath string)
- func SetRedfishProbeIntervalInSeconds(interval string)
- func SetRedfishSkipSSLVerify(skipSSLVerify bool)
- type Client
Constants ¶
const ( CORE = "core" DRAM = "dram" UNCORE = "uncore" PKG = "package" GPU = "gpu" OTHER = "other" PLATFORM = "platform" FREQUENCY = "frequency" // counter - attacher package CPUCycle = "cpu_cycles" CPURefCycle = "cpu_ref_cycles" CPUInstruction = "cpu_instructions" CacheMiss = "cache_miss" // bpf - attacher package CPUTime = "bpf_cpu_time_ms" PageCacheHit = "bpf_page_cache_hit" IRQNetTXLabel = "bpf_net_tx_irq" IRQNetRXLabel = "bpf_net_rx_irq" IRQBlockLabel = "bpf_block_irq" // NVIDIA GPU GPUComputeUtilization = "gpu_compute_util" GPUMemUtilization = "gpu_mem_util" // Intel QuickAssist Technology (QAT) // TODO: test if different request has different energy consumption. QATUtilization = "qta_sample_cnt" // Energy Metrics // Absolute energy and power AbsEnergyInCore = "abs_energy_in_core" AbsEnergyInDRAM = "abs_energy_in_dram" AbsEnergyInUnCore = "abs_energy_in_uncore" AbsEnergyInPkg = "abs_energy_in_pkg" AbsEnergyInGPU = "abs_energy_in_gpu" AbsEnergyInOther = "abs_energy_in_other" AbsEnergyInPlatform = "abs_energy_in_platform" // Dynamic energy and power DynEnergyInCore = "dyn_energy_in_core" DynEnergyInDRAM = "dyn_energy_in_dram" DynEnergyInUnCore = "dyn_energy_in_uncore" DynEnergyInPkg = "dyn_energy_in_pkg" DynEnergyInGPU = "dyn_energy_in_gpu" DynEnergyInOther = "dyn_energy_in_other" DynEnergyInPlatform = "dyn_energy_in_platform" // Idle energy and power IdleEnergyInCore = "idle_energy_in_core" IdleEnergyInDRAM = "idle_energy_in_dram" IdleEnergyInUnCore = "idle_energy_in_uncore" IdleEnergyInPkg = "idle_energy_in_pkg" IdleEnergyInGPU = "idle_energy_in_gpu" IdleEnergyInOther = "idle_energy_in_other" IdleEnergyInPlatform = "idle_energy_in_platform" )
const (
// MaxIRQ is the maximum number of IRQs to be monitored
MaxIRQ = 10
)
Variables ¶
var ( EnabledMSR = false KernelVersion = float32(0) KeplerNamespace = getConfig("KEPLER_NAMESPACE", defaultNamespace) EnabledEBPFCgroupID = getBoolConfig("ENABLE_EBPF_CGROUPID", true) EnabledGPU = getBoolConfig("ENABLE_GPU", false) EnabledQAT = getBoolConfig("ENABLE_QAT", false) EnableProcessStats = getBoolConfig("ENABLE_PROCESS_METRICS", false) ExposeContainerStats = getBoolConfig("EXPOSE_CONTAINER_METRICS", true) ExposeVMStats = getBoolConfig("EXPOSE_VM_METRICS", true) ExposeHardwareCounterMetrics = getBoolConfig("EXPOSE_HW_COUNTER_METRICS", true) ExposeIRQCounterMetrics = getBoolConfig("EXPOSE_IRQ_COUNTER_METRICS", true) ExposeBPFMetrics = getBoolConfig("EXPOSE_BPF_METRICS", true) ExposeComponentPower = getBoolConfig("EXPOSE_COMPONENT_POWER", true) ExposeIdlePowerMetrics = getBoolConfig("EXPOSE_ESTIMATED_IDLE_POWER_METRICS", false) MockACPIPowerPath = getConfig("MOCK_ACPI_POWER_PATH", "") MetricPathKey = "METRIC_PATH" BindAddressKey = "BIND_ADDRESS" CPUArchOverride = getConfig("CPU_ARCH_OVERRIDE", "") MaxLookupRetry = getIntConfig("MAX_LOOKUP_RETRY", defaultMaxLookupRetry) BPFSampleRate = getIntConfig("EXPERIMENTAL_BPF_SAMPLE_RATE", 0) EstimatorModel = getConfig("ESTIMATOR_MODEL", defaultMetricValue) // auto-select EstimatorSelectFilter = getConfig("ESTIMATOR_SELECT_FILTER", defaultMetricValue) // no filter CoreUsageMetric = getConfig("CORE_USAGE_METRIC", CPUInstruction) DRAMUsageMetric = getConfig("DRAM_USAGE_METRIC", CacheMiss) UncoreUsageMetric = getConfig("UNCORE_USAGE_METRIC", defaultMetricValue) // no metric (evenly divided) GpuUsageMetric = getConfig("GPU_USAGE_METRIC", GPUComputeUtilization) // no metric (evenly divided) GeneralUsageMetric = getConfig("GENERAL_USAGE_METRIC", defaultMetricValue) // for uncategorized energy SamplePeriodSec = uint64(getIntConfig("SAMPLE_PERIOD_SEC", defaultSamplePeriodSec)) // nvidia dcgm hostengine endpoint DCGMHostEngineEndpoint = getConfig("NVIDIA_HOSTENGINE_ENDPOINT", "localhost:5555") 
//////////////////////////////////// ModelServerEnable = getBoolConfig("MODEL_SERVER_ENABLE", false) ModelServerEndpoint = SetModelServerReqEndpoint() // for model config ModelConfigValues map[string]string // model_parameter_prefix NodePlatformPowerKey = "NODE_TOTAL" NodeComponentsPowerKey = "NODE_COMPONENTS" ContainerPlatformPowerKey = "CONTAINER_TOTAL" ContainerComponentsPowerKey = "CONTAINER_COMPONENTS" ProcessPlatformPowerKey = "PROCESS_TOTAL" ProcessComponentsPowerKey = "PROCESS_COMPONENTS" // model_parameter_attribute RatioEnabledKey = "RATIO" // the default container power model is RATIO but ESTIMATOR or LINEAR_REGRESSION can be used EstimatorEnabledKey = "ESTIMATOR" LocalRegressorEnabledKey = "LOCAL_REGRESSOR" InitModelURLKey = "INIT_URL" FixedTrainerNameKey = "TRAINER" FixedNodeTypeKey = "NODE_TYPE" ModelFiltersKey = "FILTERS" DefaultTrainerName = "SGDRegressorTrainer" // KubeConfig is used to start k8s client with the pod running outside the cluster KubeConfig = "" EnableAPIServer = false )
Functions ¶
func GetBindAddress ¶
func GetDefaultPowerModelURL ¶ added in v0.5.5
GetDefaultPowerModelURL returns the local path to the power model weights, e.g., /var/lib/kepler/data/acpi_AbsPowerModel.json
func GetKernelSourceDirs ¶ added in v0.5.1
func GetKernelSourceDirs() []string
func GetMetricPath ¶
func GetMockACPIPowerPath ¶ added in v0.7.10
func GetMockACPIPowerPath() string
func GetModelConfigMap ¶ added in v0.5.4
func GetRedfishCredFilePath ¶ added in v0.5.2
func GetRedfishCredFilePath() string
func GetRedfishProbeIntervalInSeconds ¶ added in v0.5.2
func GetRedfishProbeIntervalInSeconds() int
func GetRedfishSkipSSLVerify ¶ added in v0.5.2
func GetRedfishSkipSSLVerify() bool
func InitModelConfigMap ¶
func InitModelConfigMap()
InitModelConfigMap initializes map of config from MODEL_CONFIG
func IsExposeBPFMetricsEnabled ¶ added in v0.7.11
func IsExposeBPFMetricsEnabled() bool
IsExposeBPFMetricsEnabled returns false if BPF metrics are disabled to minimize overhead.
func IsExposeComponentPowerEnabled ¶ added in v0.7.11
func IsExposeComponentPowerEnabled() bool
IsExposeComponentPowerEnabled returns false if component power metrics are disabled to minimize overhead.
func IsExposeContainerStatsEnabled ¶ added in v0.7.3
func IsExposeContainerStatsEnabled() bool
IsExposeContainerStatsEnabled returns false if container metrics are disabled to minimize overhead in the Kepler standalone mode.
func IsExposeProcessStatsEnabled ¶ added in v0.7.3
func IsExposeProcessStatsEnabled() bool
IsExposeProcessStatsEnabled returns false if process metrics are disabled to minimize overhead in the Kepler standalone mode.
func IsExposeQATMetricsEnabled ¶ added in v0.7.3
func IsExposeQATMetricsEnabled() bool
IsExposeQATMetricsEnabled returns false if QAT metrics are disabled to minimize overhead.
func IsExposeVMStatsEnabled ¶ added in v0.7.3
func IsExposeVMStatsEnabled() bool
IsExposeVMStatsEnabled returns false if VM metrics are disabled to minimize overhead.
func IsIdlePowerEnabled ¶ added in v0.6.1
func IsIdlePowerEnabled() bool
IsIdlePowerEnabled always returns true if Kepler has access to system power metrics. However, if pre-trained power models are being used, Kepler should only expose metrics if the user is aware of the implications.
func LogConfigs ¶
func LogConfigs()
func SetEnableAPIServer ¶ added in v0.5.1
func SetEnableAPIServer(enabled bool)
SetEnableAPIServer enables Kepler to watch apiserver
func SetEnabledEBPFCgroupID ¶
func SetEnabledEBPFCgroupID(enabled bool)
SetEnabledEBPFCgroupID enables the eBPF code to collect cgroup id if the system has kernel version > 4.18
func SetEnabledGPU ¶
func SetEnabledGPU(enabled bool)
SetEnabledGPU enables the exposure of gpu metrics
func SetEnabledHardwareCounterMetrics ¶
func SetEnabledHardwareCounterMetrics(enabled bool)
SetEnabledHardwareCounterMetrics enables the exposure of hardware counter metrics
func SetEnabledIdlePower ¶ added in v0.6.1
func SetEnabledIdlePower(enabled bool)
SetEnabledIdlePower allows enabling idle power exposure in Kepler's metrics. When direct power metrics access is available, idle power exposure is automatic. With pre-trained power models, awareness of implications is crucial. Estimated idle power is useful for bare-metal or single VM setups. In VM environments, accurately distributing idle power is tough due to unknown co-running VMs. Wrong division results in significant accuracy errors, duplicating the host idle power across all VMs. Container pre-trained models focus on dynamic power. Estimating idle power in limited information scenarios (like VMs) is complex. Idle power prediction is limited to bare-metal or single VM setups. Knowing the number of running VMs becomes crucial for achieving a fair distribution of idle power, particularly when following the GHG (Greenhouse Gas) protocol.
func SetEnabledQAT ¶ added in v0.5.4
func SetEnabledQAT(enabled bool)
SetEnabledQAT enables the exposure of qat metrics
func SetEstimatorConfig ¶
func SetEstimatorConfig(modelName, selectFilter string)
func SetGpuUsageMetric ¶ added in v0.7.4
func SetGpuUsageMetric(metric string)
func SetKernelSourceDir ¶ added in v0.5.1
SetKernelSourceDir sets the directory for all kernel source. This is used for bcc. Only the top level directory is needed.
func SetModelServerEndpoint ¶
func SetModelServerEndpoint(serverEndpoint string)
func SetModelServerReqEndpoint ¶
func SetModelServerReqEndpoint() (modelServerReqEndpoint string)
func SetRedfishCredFilePath ¶ added in v0.5.2
func SetRedfishCredFilePath(credFilePath string)
func SetRedfishProbeIntervalInSeconds ¶ added in v0.5.2
func SetRedfishProbeIntervalInSeconds(interval string)
func SetRedfishSkipSSLVerify ¶ added in v0.5.2
func SetRedfishSkipSSLVerify(skipSSLVerify bool)