Documentation ¶
Index ¶
- Constants
- Variables
- func CPUCoreIdExists(sysInfo *SystemInfo, coreId int) bool
- func CPUIdExists(sysInfo *SystemInfo, cpuId int) bool
- func CreateCoreGroupsFromSystemInfo(sysInfo SystemInfo) ([]dcgm.GroupHandle, []func(), error)
- func CreateGroupFromSystemInfo(sysInfo SystemInfo) (dcgm.GroupHandle, func(), error)
- func CreateLinkGroupsFromSystemInfo(sysInfo SystemInfo) ([]dcgm.GroupHandle, []func(), error)
- func FormatMetrics(t *template.Template, groupedMetrics MetricsByCounter) (string, error)
- func GPUIdExists(sysInfo *SystemInfo, gpuId int) bool
- func GPUInstanceIdExists(sysInfo *SystemInfo, gpuInstanceId int) bool
- func GetGPUInstanceIdentifier(sysInfo SystemInfo, gpuuuid string, gpuInstanceID uint) string
- func GetHostname(config *Config) (string, error)
- func IsCPUWatched(cpuID uint, sysInfo SystemInfo) bool
- func IsCoreWatched(coreID uint, cpuID uint, sysInfo SystemInfo) bool
- func IsDCGMExpClockEventsCountEnabled(counters []Counter) bool
- func IsDCGMExpXIDErrorsCountEnabled(counters []Counter) bool
- func IsLinkWatched(linkIndex uint, switchID uint, sysInfo SystemInfo) bool
- func IsSwitchWatched(switchID uint, sysInfo SystemInfo) bool
- func LinkIdExists(sysInfo *SystemInfo, linkId int) bool
- func NewDeviceFields(counters []Counter, entityType dcgm.Field_Entity_Group) []dcgm.Short
- func NewFieldGroup(deviceFields []dcgm.Short) (dcgm.FieldHandle, func(), error)
- func NewGroup() (dcgm.GroupHandle, func(), error)
- func PopulateMigProfileNames(sysInfo *SystemInfo, entities []dcgm.GroupEntityPair) error
- func ReadCSVFile(filename string) ([][]string, error)
- func SetGPUInstanceProfileName(sysInfo *SystemInfo, entityId uint, profileName string) bool
- func SetMigProfileNames(sysInfo *SystemInfo, values []dcgm.FieldValue_v2) error
- func SetupDcgmFieldsWatch(deviceFields []dcgm.Short, sysInfo SystemInfo, collectIntervalUsec int64) ([]dcgm.GroupHandle, dcgm.FieldHandle, []func(), error)
- func ShouldMonitorDeviceType(fields []dcgm.Short, entityType dcgm.Field_Entity_Group) bool
- func SwitchIdExists(sysInfo *SystemInfo, switchId int) bool
- func ToCPUMetric(metrics MetricsByCounter, values []dcgm.FieldValue_v1, c []Counter, ...)
- func ToMetric(metrics MetricsByCounter, values []dcgm.FieldValue_v1, c []Counter, ...)
- func ToString(value dcgm.FieldValue_v1) string
- func ToSwitchMetric(metrics MetricsByCounter, values []dcgm.FieldValue_v1, c []Counter, ...)
- func VerifyCPUDevicePresence(sysInfo *SystemInfo, sOpt DeviceOptions) error
- func VerifyDevicePresence(sysInfo *SystemInfo, gOpt DeviceOptions) error
- func VerifySwitchDevicePresence(sysInfo *SystemInfo, sOpt DeviceOptions) error
- func WaitWithTimeout(wg *sync.WaitGroup, timeout time.Duration) error
- func WatchFieldGroup(group dcgm.GroupHandle, field dcgm.FieldHandle, updateFreq int64, ...) error
- type CPUInfo
- type Collector
- type ComputeInstanceInfo
- type Config
- type Counter
- type CounterSet
- type DCGMCollector
- type DCGMCollectorConstructor
- type DeviceOptions
- type ExporterCounter
- type FieldEntityGroupTypeSystemInfo
- type FieldEntityGroupTypeSystemInfoItem
- type GPUInfo
- type GPUInstanceInfo
- type KubernetesGPUIDType
- type Metric
- type MetricsByCounter
- type MetricsPipeline
- type MetricsServer
- type MonitoringInfo
- func AddAllCPUCores(sysInfo SystemInfo) []MonitoringInfo
- func AddAllCPUs(sysInfo SystemInfo) []MonitoringInfo
- func AddAllGPUInstances(sysInfo SystemInfo, addFlexibly bool) []MonitoringInfo
- func AddAllGPUs(sysInfo SystemInfo) []MonitoringInfo
- func AddAllLinks(sysInfo SystemInfo) []MonitoringInfo
- func AddAllSwitches(sysInfo SystemInfo) []MonitoringInfo
- func GetMonitoredEntities(sysInfo SystemInfo) []MonitoringInfo
- func GetMonitoringInfoForGPU(sysInfo SystemInfo, gpuID int) *MonitoringInfo
- func GetMonitoringInfoForGPUInstance(sysInfo SystemInfo, gpuInstanceID int) *MonitoringInfo
- type PodInfo
- type PodMapper
- type Registry
- type SwitchInfo
- type SystemInfo
- func GetSystemInfo(config *Config, entityType dcgm.Field_Entity_Group) (*SystemInfo, error)
- func InitializeCPUInfo(sysInfo SystemInfo, sOpt DeviceOptions) (SystemInfo, error)
- func InitializeGPUInfo(sysInfo SystemInfo, gOpt DeviceOptions, useFakeGPUs bool) (SystemInfo, error)
- func InitializeNvSwitchInfo(sysInfo SystemInfo, sOpt DeviceOptions) (SystemInfo, error)
- func InitializeSystemInfo(gOpt DeviceOptions, sOpt DeviceOptions, cOpt DeviceOptions, useFakeGPUs bool, ...) (SystemInfo, error)
- type Transform
Constants ¶
const ( // DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE Nothing is running on the GPU and the clocks are dropping to Idle state DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE clockEventBitmask = 0x0000000000000001 // DCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING GPU clocks are limited by current setting of applications clocks DCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING clockEventBitmask = 0x0000000000000002 // DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP SW Power Scaling algorithm is reducing the clocks below requested clocks DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP clockEventBitmask = 0x0000000000000004 // DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN HW Slowdown (reducing the core clocks by a factor of 2 or more) is engaged DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN clockEventBitmask = 0x0000000000000008 // DCGM_CLOCKS_THROTTLE_REASON_SYNC_BOOST Sync Boost DCGM_CLOCKS_THROTTLE_REASON_SYNC_BOOST clockEventBitmask = 0x0000000000000010 // DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL SW Thermal Slowdown DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL clockEventBitmask = 0x0000000000000020 // DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL clockEventBitmask = 0x0000000000000040 // DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE clockEventBitmask = 0x0000000000000080 // DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS GPU clocks are limited by current setting of Display clocks DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS clockEventBitmask = 0x0000000000000100 )
Source of the const values: https://github.com/NVIDIA/DCGM/blob/master/dcgmlib/dcgm_fields.h
const ( LoggerGroupIDKey = "groupID" LoggerDumpKey = "dump" LoggerStackTrace = "stacktrace" )
Constants for logging fields
const ( PARENT_ID_IGNORED = 0 DCGM_ST_NOT_CONFIGURED = "Setting not configured" )
const ( DCGMDbgLvlNone = "NONE" DCGMDbgLvlFatal = "FATAL" DCGMDbgLvlError = "ERROR" DCGMDbgLvlWarn = "WARN" DCGMDbgLvlInfo = "INFO" DCGMDbgLvlDebug = "DEBUG" DCGMDbgLvlVerb = "VERB" )
DCGMDbgLvl is a DCGM library debug level.
Variables ¶
var ( SkipDCGMValue = "SKIPPING DCGM VALUE" FailedToConvert = "ERROR - FAILED TO CONVERT TO STRING" MIG_UUID_PREFIX = "MIG-" )
var DCGMDbgLvlValues = []string{DCGMDbgLvlNone, DCGMDbgLvlFatal, DCGMDbgLvlError, DCGMDbgLvlWarn, DCGMDbgLvlInfo, DCGMDbgLvlDebug, DCGMDbgLvlVerb, }
var DCGMFields = map[string]ExporterCounter{ DCGMXIDErrorsCount.String(): DCGMXIDErrorsCount, DCGMClockEventsCount.String(): DCGMClockEventsCount, DCGMFIUnknown.String(): DCGMFIUnknown, }
DCGMFields maps the string form of each ExporterCounter to its enum value.
var FieldEntityGroupTypeToMonitor = []dcgm.Field_Entity_Group{ dcgm.FE_GPU, dcgm.FE_SWITCH, dcgm.FE_LINK, dcgm.FE_CPU, dcgm.FE_CPU_CORE, }
FieldEntityGroupTypeToMonitor lists the supported entity group types.
Functions ¶
func CPUCoreIdExists ¶
func CPUCoreIdExists(sysInfo *SystemInfo, coreId int) bool
func CPUIdExists ¶
func CPUIdExists(sysInfo *SystemInfo, cpuId int) bool
func CreateCoreGroupsFromSystemInfo ¶
func CreateCoreGroupsFromSystemInfo(sysInfo SystemInfo) ([]dcgm.GroupHandle, []func(), error)
func CreateGroupFromSystemInfo ¶
func CreateGroupFromSystemInfo(sysInfo SystemInfo) (dcgm.GroupHandle, func(), error)
func CreateLinkGroupsFromSystemInfo ¶
func CreateLinkGroupsFromSystemInfo(sysInfo SystemInfo) ([]dcgm.GroupHandle, []func(), error)
func FormatMetrics ¶
func FormatMetrics(t *template.Template, groupedMetrics MetricsByCounter) (string, error)
FormatMetrics takes the template as a parameter so that it isn't recompiled at each iteration.
func GPUIdExists ¶
func GPUIdExists(sysInfo *SystemInfo, gpuId int) bool
func GPUInstanceIdExists ¶
func GPUInstanceIdExists(sysInfo *SystemInfo, gpuInstanceId int) bool
func GetGPUInstanceIdentifier ¶
func GetGPUInstanceIdentifier(sysInfo SystemInfo, gpuuuid string, gpuInstanceID uint) string
func GetHostname ¶
func IsCPUWatched ¶
func IsCPUWatched(cpuID uint, sysInfo SystemInfo) bool
func IsCoreWatched ¶
func IsCoreWatched(coreID uint, cpuID uint, sysInfo SystemInfo) bool
func IsDCGMExpClockEventsCountEnabled ¶
IsDCGMExpClockEventsCountEnabled checks if the DCGM_EXP_CLOCK_EVENTS_COUNT counter exists
func IsLinkWatched ¶
func IsLinkWatched(linkIndex uint, switchID uint, sysInfo SystemInfo) bool
func IsSwitchWatched ¶
func IsSwitchWatched(switchID uint, sysInfo SystemInfo) bool
func LinkIdExists ¶
func LinkIdExists(sysInfo *SystemInfo, linkId int) bool
func NewDeviceFields ¶
func NewDeviceFields(counters []Counter, entityType dcgm.Field_Entity_Group) []dcgm.Short
func NewFieldGroup ¶
func NewFieldGroup(deviceFields []dcgm.Short) (dcgm.FieldHandle, func(), error)
func NewGroup ¶
func NewGroup() (dcgm.GroupHandle, func(), error)
func PopulateMigProfileNames ¶
func PopulateMigProfileNames(sysInfo *SystemInfo, entities []dcgm.GroupEntityPair) error
func ReadCSVFile ¶
func SetGPUInstanceProfileName ¶
func SetGPUInstanceProfileName(sysInfo *SystemInfo, entityId uint, profileName string) bool
func SetMigProfileNames ¶
func SetMigProfileNames(sysInfo *SystemInfo, values []dcgm.FieldValue_v2) error
func SetupDcgmFieldsWatch ¶
func SetupDcgmFieldsWatch(deviceFields []dcgm.Short, sysInfo SystemInfo, collectIntervalUsec int64) ([]dcgm.GroupHandle, dcgm.FieldHandle, []func(), error)
func ShouldMonitorDeviceType ¶
func ShouldMonitorDeviceType(fields []dcgm.Short, entityType dcgm.Field_Entity_Group) bool
func SwitchIdExists ¶
func SwitchIdExists(sysInfo *SystemInfo, switchId int) bool
func ToCPUMetric ¶
func ToCPUMetric( metrics MetricsByCounter, values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string, )
func ToMetric ¶
func ToMetric( metrics MetricsByCounter, values []dcgm.FieldValue_v1, c []Counter, d dcgm.Device, instanceInfo *GPUInstanceInfo, useOld bool, hostname string, replaceBlanksInModelName bool, )
func ToString ¶
func ToString(value dcgm.FieldValue_v1) string
func ToSwitchMetric ¶
func ToSwitchMetric( metrics MetricsByCounter, values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string, )
func VerifyCPUDevicePresence ¶
func VerifyCPUDevicePresence(sysInfo *SystemInfo, sOpt DeviceOptions) error
func VerifyDevicePresence ¶
func VerifyDevicePresence(sysInfo *SystemInfo, gOpt DeviceOptions) error
func VerifySwitchDevicePresence ¶
func VerifySwitchDevicePresence(sysInfo *SystemInfo, sOpt DeviceOptions) error
func WatchFieldGroup ¶
func WatchFieldGroup( group dcgm.GroupHandle, field dcgm.FieldHandle, updateFreq int64, maxKeepAge float64, maxKeepSamples int32, ) error
Types ¶
type Collector ¶
type Collector interface { GetMetrics() (MetricsByCounter, error) Cleanup() }
Collector interface
func NewClockEventsCollector ¶
func NewXIDCollector ¶
type ComputeInstanceInfo ¶
type ComputeInstanceInfo struct { InstanceInfo dcgm.MigEntityInfo ProfileName string EntityId uint }
type Config ¶
type Config struct { CollectorsFile string Address string CollectInterval int Kubernetes bool KubernetesGPUIdType KubernetesGPUIDType CollectDCP bool UseOldNamespace bool UseRemoteHE bool RemoteHEInfo string GPUDevices DeviceOptions SwitchDevices DeviceOptions CPUDevices DeviceOptions NoHostname bool UseFakeGPUs bool ConfigMapData string MetricGroups []dcgm.MetricGroup WebSystemdSocket bool WebConfigFile string XIDCountWindowSize int ReplaceBlanksInModelName bool Debug bool ClockEventsCountWindowSize int EnableDCGMLog bool DCGMLogLevel string PodResourcesKubeletSocket string HPCJobMappingDir string NvidiaResourceNames []string }
type CounterSet ¶
CounterSet is the type returned by GetCounterSet.
func GetCounterSet ¶
func GetCounterSet(c *Config) (*CounterSet, error)
type DCGMCollector ¶
type DCGMCollector struct { Counters []Counter DeviceFields []dcgm.Short Cleanups []func() UseOldNamespace bool SysInfo SystemInfo Hostname string ReplaceBlanksInModelName bool }
func NewDCGMCollector ¶
func NewDCGMCollector( c []Counter, hostname string, config *Config, fieldEntityGroupTypeSystemInfo FieldEntityGroupTypeSystemInfoItem, ) (*DCGMCollector, func(), error)
func (*DCGMCollector) Cleanup ¶
func (c *DCGMCollector) Cleanup()
func (*DCGMCollector) GetMetrics ¶
func (c *DCGMCollector) GetMetrics() (MetricsByCounter, error)
type DCGMCollectorConstructor ¶
type DCGMCollectorConstructor func([]Counter, string, *Config, FieldEntityGroupTypeSystemInfoItem) (*DCGMCollector, func(), error)
type DeviceOptions ¶
type DeviceOptions struct { Flex bool // If true, then monitor all GPUs if MIG mode is disabled or all GPU instances if MIG is enabled. MajorRange []int // The indices of each GPU/NvSwitch to monitor, or -1 to monitor all MinorRange []int // The indices of each GPUInstance/NvLink to monitor, or -1 to monitor all }
type ExporterCounter ¶
type ExporterCounter uint16
const ( DCGMFIUnknown ExporterCounter = 0 DCGMXIDErrorsCount ExporterCounter = iota + 9000 DCGMClockEventsCount ExporterCounter = iota + 9000 )
func IdentifyMetricType ¶
func IdentifyMetricType(s string) (ExporterCounter, error)
func (ExporterCounter) String ¶
func (enm ExporterCounter) String() string
String converts the enum value to a string.
type FieldEntityGroupTypeSystemInfo ¶
type FieldEntityGroupTypeSystemInfo struct {
// contains filtered or unexported fields
}
FieldEntityGroupTypeSystemInfo represents a mapping between FieldEntityGroupType and SystemInfo
func NewEntityGroupTypeSystemInfo ¶
func NewEntityGroupTypeSystemInfo(c []Counter, config *Config) *FieldEntityGroupTypeSystemInfo
NewEntityGroupTypeSystemInfo creates a new instance of the FieldEntityGroupTypeSystemInfo
func (*FieldEntityGroupTypeSystemInfo) Get ¶
func (e *FieldEntityGroupTypeSystemInfo) Get(key dcgm.Field_Entity_Group) (FieldEntityGroupTypeSystemInfoItem, bool)
Get looks up a FieldEntityGroupTypeSystemInfoItem by dcgm.Field_Entity_Group and returns it together with a bool.
func (*FieldEntityGroupTypeSystemInfo) Load ¶
func (e *FieldEntityGroupTypeSystemInfo) Load(entityType dcgm.Field_Entity_Group) error
Load loads SystemInfo for a provided Field_Entity_Group
type FieldEntityGroupTypeSystemInfoItem ¶
type FieldEntityGroupTypeSystemInfoItem struct { SystemInfo SystemInfo DeviceFields []dcgm.Short }
type GPUInfo ¶
type GPUInfo struct { DeviceInfo dcgm.Device GPUInstances []GPUInstanceInfo MigEnabled bool }
type GPUInstanceInfo ¶
type GPUInstanceInfo struct { Info dcgm.MigEntityInfo ProfileName string EntityId uint ComputeInstances []ComputeInstanceInfo }
type KubernetesGPUIDType ¶
type KubernetesGPUIDType string
const ( GPUUID KubernetesGPUIDType = "uid" DeviceName KubernetesGPUIDType = "device-name" )
type MetricsByCounter ¶
MetricsByCounter represents a map where each Counter is associated with a slice of Metric objects
type MetricsPipeline ¶
type MetricsPipeline struct {
// contains filtered or unexported fields
}
func NewMetricsPipeline ¶
func NewMetricsPipeline(config *Config, counters []Counter, hostname string, newDCGMCollector DCGMCollectorConstructor, fieldEntityGroupTypeSystemInfo *FieldEntityGroupTypeSystemInfo, ) (*MetricsPipeline, func(), error)
func NewMetricsPipelineWithGPUCollector ¶
func NewMetricsPipelineWithGPUCollector(c *Config, collector *DCGMCollector) (*MetricsPipeline, func(), error)
Primarily for testing; the caller is expected to clean up the collector.
type MetricsServer ¶
func NewMetricsServer ¶
func NewMetricsServer(c *Config, metrics chan string, registry *Registry) (*MetricsServer, func(), error)
func (*MetricsServer) Health ¶
func (s *MetricsServer) Health(w http.ResponseWriter, r *http.Request)
func (*MetricsServer) Metrics ¶
func (s *MetricsServer) Metrics(w http.ResponseWriter, r *http.Request)
func (*MetricsServer) Run ¶
func (s *MetricsServer) Run(stop chan interface{}, wg *sync.WaitGroup)
type MonitoringInfo ¶
type MonitoringInfo struct { Entity dcgm.GroupEntityPair DeviceInfo dcgm.Device InstanceInfo *GPUInstanceInfo ParentId uint }
func AddAllCPUCores ¶
func AddAllCPUCores(sysInfo SystemInfo) []MonitoringInfo
func AddAllCPUs ¶
func AddAllCPUs(sysInfo SystemInfo) []MonitoringInfo
func AddAllGPUInstances ¶
func AddAllGPUInstances(sysInfo SystemInfo, addFlexibly bool) []MonitoringInfo
func AddAllGPUs ¶
func AddAllGPUs(sysInfo SystemInfo) []MonitoringInfo
func AddAllLinks ¶
func AddAllLinks(sysInfo SystemInfo) []MonitoringInfo
func AddAllSwitches ¶
func AddAllSwitches(sysInfo SystemInfo) []MonitoringInfo
func GetMonitoredEntities ¶
func GetMonitoredEntities(sysInfo SystemInfo) []MonitoringInfo
func GetMonitoringInfoForGPU ¶
func GetMonitoringInfoForGPU(sysInfo SystemInfo, gpuID int) *MonitoringInfo
func GetMonitoringInfoForGPUInstance ¶
func GetMonitoringInfoForGPUInstance(sysInfo SystemInfo, gpuInstanceID int) *MonitoringInfo
type PodMapper ¶
type PodMapper struct {
Config *Config
}
func NewPodMapper ¶
func (*PodMapper) Process ¶
func (p *PodMapper) Process(metrics MetricsByCounter, sysInfo SystemInfo) error
type Registry ¶
type Registry struct {
// contains filtered or unexported fields
}
func NewRegistry ¶
func NewRegistry() *Registry
func (*Registry) Gather ¶
func (r *Registry) Gather() (MetricsByCounter, error)
Gather gathers metrics from all registered collectors.
type SwitchInfo ¶
type SwitchInfo struct { EntityId uint NvLinks []dcgm.NvLinkStatus }
type SystemInfo ¶
type SystemInfo struct { GPUCount uint GPUs [dcgm.MAX_NUM_DEVICES]GPUInfo InfoType dcgm.Field_Entity_Group Switches []SwitchInfo CPUs []CPUInfo // contains filtered or unexported fields }
func GetSystemInfo ¶
func GetSystemInfo(config *Config, entityType dcgm.Field_Entity_Group) (*SystemInfo, error)
func InitializeCPUInfo ¶
func InitializeCPUInfo(sysInfo SystemInfo, sOpt DeviceOptions) (SystemInfo, error)
func InitializeGPUInfo ¶
func InitializeGPUInfo(sysInfo SystemInfo, gOpt DeviceOptions, useFakeGPUs bool) (SystemInfo, error)
func InitializeNvSwitchInfo ¶
func InitializeNvSwitchInfo(sysInfo SystemInfo, sOpt DeviceOptions) (SystemInfo, error)
func InitializeSystemInfo ¶
func InitializeSystemInfo( gOpt DeviceOptions, sOpt DeviceOptions, cOpt DeviceOptions, useFakeGPUs bool, entityType dcgm.Field_Entity_Group, ) (SystemInfo, error)
type Transform ¶
type Transform interface { Process(metrics MetricsByCounter, sysInfo SystemInfo) error Name() string }