dcgmexporter

package
v0.0.0-...-b97b763 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Nov 18, 2024 License: Apache-2.0 Imports: 40 Imported by: 1

Documentation

Index

Constants

View Source
// Clock throttle reason bitmask values. Each constant sets exactly one
// distinct bit of a clockEventBitmask.
// Source of the const values: https://github.com/NVIDIA/DCGM/blob/master/dcgmlib/dcgm_fields.h
const (
	// DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE Nothing is running on the GPU and the clocks are dropping to Idle state
	DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE clockEventBitmask = 0x0000000000000001
	// DCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING GPU clocks are limited by current setting of applications clocks
	DCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING clockEventBitmask = 0x0000000000000002
	// DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP SW Power Scaling algorithm is reducing the clocks below requested clocks
	DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP clockEventBitmask = 0x0000000000000004
	// DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN HW Slowdown (reducing the core clocks by a factor of 2 or more) is engaged
	DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN clockEventBitmask = 0x0000000000000008
	// DCGM_CLOCKS_THROTTLE_REASON_SYNC_BOOST Sync Boost
	DCGM_CLOCKS_THROTTLE_REASON_SYNC_BOOST clockEventBitmask = 0x0000000000000010
	// DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL SW Thermal Slowdown
	DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL clockEventBitmask = 0x0000000000000020
	// DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged
	DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL clockEventBitmask = 0x0000000000000040
	// DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged
	DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE clockEventBitmask = 0x0000000000000080
	// DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS GPU clocks are limited by current setting of Display clocks
	DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS clockEventBitmask = 0x0000000000000100
)

Source of the const values: https://github.com/NVIDIA/DCGM/blob/master/dcgmlib/dcgm_fields.h

View Source
// Constants for logging fields.
const (
	// LoggerGroupIDKey is the logging field name "groupID".
	LoggerGroupIDKey = "groupID"
	// LoggerDumpKey is the logging field name "dump".
	LoggerDumpKey    = "dump"
	// LoggerStackTrace is the logging field name "stacktrace".
	LoggerStackTrace = "stacktrace"
)

Constants for logging fields

View Source
const (
	// PARENT_ID_IGNORED is the untyped constant 0.
	// NOTE(review): presumably the value stored in MonitoringInfo.ParentId when
	// the entity has no meaningful parent — confirm against callers.
	PARENT_ID_IGNORED      = 0
	// DCGM_ST_NOT_CONFIGURED is the text "Setting not configured".
	// NOTE(review): looks like it mirrors the DCGM status message of the same
	// name — confirm where it is compared or emitted.
	DCGM_ST_NOT_CONFIGURED = "Setting not configured"
)
View Source
// DCGMDbgLvl* are the names of the DCGM library debug levels, as untyped
// string constants.
// NOTE(review): presumably the accepted values of Config.DCGMLogLevel — confirm.
const (
	DCGMDbgLvlNone  = "NONE"
	DCGMDbgLvlFatal = "FATAL"
	DCGMDbgLvlError = "ERROR"
	DCGMDbgLvlWarn  = "WARN"
	DCGMDbgLvlInfo  = "INFO"
	DCGMDbgLvlDebug = "DEBUG"
	DCGMDbgLvlVerb  = "VERB"
)

DCGMDbgLvl is a DCGM library debug level.

Variables

View Source
// Marker strings used by the exporter. They are declared with var, not const,
// so they are mutable package-level values.
var (
	// SkipDCGMValue is the marker text "SKIPPING DCGM VALUE".
	// NOTE(review): presumably produced by ToString for values that must not be
	// exported — confirm.
	SkipDCGMValue   = "SKIPPING DCGM VALUE"
	// FailedToConvert is the marker text for a value that could not be
	// rendered as a string.
	// NOTE(review): presumably also produced by ToString — confirm.
	FailedToConvert = "ERROR - FAILED TO CONVERT TO STRING"

	// MIG_UUID_PREFIX is the string "MIG-".
	// NOTE(review): presumably the prefix carried by MIG device UUIDs — confirm.
	MIG_UUID_PREFIX = "MIG-"
)

DCGMFields maps DCGMExporterMetric String to enum

FieldEntityGroupTypeToMonitor supported entity group types

Functions

func CPUCoreIdExists

func CPUCoreIdExists(sysInfo *SystemInfo, coreId int) bool

func CPUIdExists

func CPUIdExists(sysInfo *SystemInfo, cpuId int) bool

func CreateCoreGroupsFromSystemInfo

func CreateCoreGroupsFromSystemInfo(sysInfo SystemInfo) ([]dcgm.GroupHandle, []func(), error)

func CreateGroupFromSystemInfo

func CreateGroupFromSystemInfo(sysInfo SystemInfo) (dcgm.GroupHandle, func(), error)

func CreateLinkGroupsFromSystemInfo

func CreateLinkGroupsFromSystemInfo(sysInfo SystemInfo) ([]dcgm.GroupHandle, []func(), error)

func FormatMetrics

func FormatMetrics(t *template.Template, groupedMetrics MetricsByCounter) (string, error)

FormatMetrics takes the template as an argument so that it isn't recompiled at each iteration.

func GPUIdExists

func GPUIdExists(sysInfo *SystemInfo, gpuId int) bool

func GPUInstanceIdExists

func GPUInstanceIdExists(sysInfo *SystemInfo, gpuInstanceId int) bool

func GetGPUInstanceIdentifier

func GetGPUInstanceIdentifier(sysInfo SystemInfo, gpuuuid string, gpuInstanceID uint) string

func GetHostname

func GetHostname(config *Config) (string, error)

func IsCPUWatched

func IsCPUWatched(cpuID uint, sysInfo SystemInfo) bool

func IsCoreWatched

func IsCoreWatched(coreID uint, cpuID uint, sysInfo SystemInfo) bool

func IsDCGMExpClockEventsCountEnabled

func IsDCGMExpClockEventsCountEnabled(counters []Counter) bool

IsDCGMExpClockEventsCountEnabled checks if the DCGM_EXP_CLOCK_EVENTS_COUNT counter exists

func IsDCGMExpXIDErrorsCountEnabled

func IsDCGMExpXIDErrorsCountEnabled(counters []Counter) bool

func IsLinkWatched

func IsLinkWatched(linkIndex uint, switchID uint, sysInfo SystemInfo) bool

func IsSwitchWatched

func IsSwitchWatched(switchID uint, sysInfo SystemInfo) bool

func LinkIdExists

func LinkIdExists(sysInfo *SystemInfo, linkId int) bool

func NewDeviceFields

func NewDeviceFields(counters []Counter, entityType dcgm.Field_Entity_Group) []dcgm.Short

func NewFieldGroup

func NewFieldGroup(deviceFields []dcgm.Short) (dcgm.FieldHandle, func(), error)

func NewGroup

func NewGroup() (dcgm.GroupHandle, func(), error)

func PopulateMigProfileNames

func PopulateMigProfileNames(sysInfo *SystemInfo, entities []dcgm.GroupEntityPair) error

func ReadCSVFile

func ReadCSVFile(filename string) ([][]string, error)

func SetGPUInstanceProfileName

func SetGPUInstanceProfileName(sysInfo *SystemInfo, entityId uint, profileName string) bool

func SetMigProfileNames

func SetMigProfileNames(sysInfo *SystemInfo, values []dcgm.FieldValue_v2) error

func SetupDcgmFieldsWatch

func SetupDcgmFieldsWatch(deviceFields []dcgm.Short, sysInfo SystemInfo, collectIntervalUsec int64) ([]dcgm.GroupHandle, dcgm.FieldHandle, []func(), error)

func ShouldMonitorDeviceType

func ShouldMonitorDeviceType(fields []dcgm.Short, entityType dcgm.Field_Entity_Group) bool

func SwitchIdExists

func SwitchIdExists(sysInfo *SystemInfo, switchId int) bool

func ToCPUMetric

func ToCPUMetric(
	metrics MetricsByCounter,
	values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string,
)

func ToMetric

func ToMetric(
	metrics MetricsByCounter,
	values []dcgm.FieldValue_v1,
	c []Counter,
	d dcgm.Device,
	instanceInfo *GPUInstanceInfo,
	useOld bool,
	hostname string,
	replaceBlanksInModelName bool,
)

func ToString

func ToString(value dcgm.FieldValue_v1) string

func ToSwitchMetric

func ToSwitchMetric(
	metrics MetricsByCounter,
	values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string,
)

func VerifyCPUDevicePresence

func VerifyCPUDevicePresence(sysInfo *SystemInfo, sOpt DeviceOptions) error

func VerifyDevicePresence

func VerifyDevicePresence(sysInfo *SystemInfo, gOpt DeviceOptions) error

func VerifySwitchDevicePresence

func VerifySwitchDevicePresence(sysInfo *SystemInfo, sOpt DeviceOptions) error

func WaitWithTimeout

func WaitWithTimeout(wg *sync.WaitGroup, timeout time.Duration) error

func WatchFieldGroup

func WatchFieldGroup(
	group dcgm.GroupHandle, field dcgm.FieldHandle, updateFreq int64, maxKeepAge float64, maxKeepSamples int32,
) error

Types

type CPUInfo

// CPUInfo is the element type of SystemInfo.CPUs.
// NOTE(review): field meanings below are read from the names — confirm against
// InitializeCPUInfo.
type CPUInfo struct {
	EntityId uint   // presumably the DCGM entity ID of the CPU — TODO confirm
	Cores    []uint // presumably the IDs of the cores belonging to this CPU — TODO confirm
}

type Collector

// Collector is the interface accepted by Registry.Register. *DCGMCollector
// provides both methods, and NewClockEventsCollector and NewXIDCollector
// return values of this interface type.
type Collector interface {
	// GetMetrics returns the collected metrics grouped by Counter, or an error.
	GetMetrics() (MetricsByCounter, error)
	// Cleanup takes no arguments and returns nothing.
	// NOTE(review): presumably releases the collector's resources — confirm.
	Cleanup()
}

Collector interface

func NewClockEventsCollector

func NewClockEventsCollector(counters []Counter,
	hostname string,
	config *Config,
	fieldEntityGroupTypeSystemInfo FieldEntityGroupTypeSystemInfoItem) (Collector, error)

func NewXIDCollector

func NewXIDCollector(counters []Counter,
	hostname string,
	config *Config,
	fieldEntityGroupTypeSystemInfo FieldEntityGroupTypeSystemInfoItem) (Collector, error)

type ComputeInstanceInfo

// ComputeInstanceInfo is the element type of GPUInstanceInfo.ComputeInstances.
// It has the same InstanceInfo/ProfileName/EntityId shape as the first three
// fields of GPUInstanceInfo (where the MIG info field is named Info).
type ComputeInstanceInfo struct {
	InstanceInfo dcgm.MigEntityInfo
	ProfileName  string
	EntityId     uint
}

type Config

// Config groups the exporter settings. A *Config is taken by the package's
// constructors and helpers, e.g. GetHostname, GetCounterSet, GetSystemInfo,
// NewDCGMCollector, NewMetricsPipeline, NewMetricsServer and NewPodMapper.
// NOTE(review): the meaning of each field is not shown here; see the code that
// populates Config before relying on a particular field.
type Config struct {
	CollectorsFile             string
	Address                    string
	CollectInterval            int
	Kubernetes                 bool
	KubernetesGPUIdType        KubernetesGPUIDType
	CollectDCP                 bool
	UseOldNamespace            bool
	UseRemoteHE                bool
	RemoteHEInfo               string
	GPUDevices                 DeviceOptions
	SwitchDevices              DeviceOptions
	CPUDevices                 DeviceOptions
	NoHostname                 bool
	UseFakeGPUs                bool
	ConfigMapData              string
	MetricGroups               []dcgm.MetricGroup
	WebSystemdSocket           bool
	WebConfigFile              string
	XIDCountWindowSize         int
	ReplaceBlanksInModelName   bool
	Debug                      bool
	ClockEventsCountWindowSize int
	EnableDCGMLog              bool
	DCGMLogLevel               string
	PodResourcesKubeletSocket  string
	HPCJobMappingDir           string
	NvidiaResourceNames        []string
}

type Counter

// Counter describes one exported counter. It is the key type of
// MetricsByCounter, so two Counters with equal fields address the same entry.
type Counter struct {
	FieldID   dcgm.Short // DCGM field identifier
	FieldName string
	PromType  string // presumably the Prometheus metric type (e.g. gauge/counter) — TODO confirm
	Help      string // presumably the help text emitted with the metric — TODO confirm
}

func FindCounterField

func FindCounterField(c []Counter, fieldID uint) (Counter, error)

type CounterSet

// CounterSet is the result type of GetCounterSet. It keeps two separate
// Counter lists.
// NOTE(review): presumably DCGMCounters are read from DCGM itself while
// ExporterCounters are the ones computed by the exporter (see ExporterCounter)
// — confirm.
type CounterSet struct {
	DCGMCounters     []Counter
	ExporterCounters []Counter
}

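CounterSet is the value returned by GetCounterSet; it holds the DCGM counters and the exporter counters as two separate lists.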

func GetCounterSet

func GetCounterSet(c *Config) (*CounterSet, error)

type DCGMCollector

// DCGMCollector is the concrete collector created by NewDCGMCollector. Its
// pointer type has GetMetrics and Cleanup methods, i.e. the method set of the
// Collector interface.
type DCGMCollector struct {
	Counters                 []Counter
	DeviceFields             []dcgm.Short
	Cleanups                 []func() // NOTE(review): presumably invoked by Cleanup — confirm
	UseOldNamespace          bool
	SysInfo                  SystemInfo
	Hostname                 string
	ReplaceBlanksInModelName bool
}

func NewDCGMCollector

func NewDCGMCollector(
	c []Counter,
	hostname string,
	config *Config,
	fieldEntityGroupTypeSystemInfo FieldEntityGroupTypeSystemInfoItem,
) (*DCGMCollector, func(), error)

func (*DCGMCollector) Cleanup

func (c *DCGMCollector) Cleanup()

func (*DCGMCollector) GetMetrics

func (c *DCGMCollector) GetMetrics() (MetricsByCounter, error)

type DCGMCollectorConstructor

// DCGMCollectorConstructor is the function type with the same parameter and
// result types as NewDCGMCollector. NewMetricsPipeline takes a value of this
// type as its newDCGMCollector argument.
type DCGMCollectorConstructor func([]Counter, string, *Config, FieldEntityGroupTypeSystemInfoItem) (*DCGMCollector,
	func(), error)

type DeviceOptions

// DeviceOptions selects which devices to monitor. Config carries one value
// each for GPUs, switches and CPUs (GPUDevices, SwitchDevices, CPUDevices).
type DeviceOptions struct {
	Flex       bool  // If true, then monitor all GPUs if MIG mode is disabled or all GPU instances if MIG is enabled.
	MajorRange []int // The indices of each GPU/NvSwitch to monitor, or -1 to monitor all
	MinorRange []int // The indices of each GPUInstance/NvLink to monitor, or -1 to monitor all
}

type ExporterCounter

// ExporterCounter is a uint16 enum. IdentifyMetricType returns a value of this
// type for a string, and String converts a value back to a string.
type ExporterCounter uint16
const (
	// DCGMFIUnknown is the zero value.
	DCGMFIUnknown        ExporterCounter = 0
	// DCGMXIDErrorsCount is 9001: iota counts every spec in the const group, so
	// it is already 1 on this (second) line.
	DCGMXIDErrorsCount   ExporterCounter = iota + 9000
	// DCGMClockEventsCount is 9002 (iota is 2 on the third line).
	DCGMClockEventsCount ExporterCounter = iota + 9000
)

func IdentifyMetricType

func IdentifyMetricType(s string) (ExporterCounter, error)

func (ExporterCounter) String

func (enm ExporterCounter) String() string

String method to convert the enum value to a string

type FieldEntityGroupTypeSystemInfo

type FieldEntityGroupTypeSystemInfo struct {
	// contains filtered or unexported fields
}

FieldEntityGroupTypeSystemInfo represents a mapping between FieldEntityGroupType and SystemInfo

func NewEntityGroupTypeSystemInfo

func NewEntityGroupTypeSystemInfo(c []Counter, config *Config) *FieldEntityGroupTypeSystemInfo

NewEntityGroupTypeSystemInfo creates a new instance of the FieldEntityGroupTypeSystemInfo

func (*FieldEntityGroupTypeSystemInfo) Get

Get returns FieldEntityGroupTypeSystemInfoItem, bool by dcgm.Field_Entity_Group

func (*FieldEntityGroupTypeSystemInfo) Load

Load loads SystemInfo for a provided Field_Entity_Group

type FieldEntityGroupTypeSystemInfoItem

// FieldEntityGroupTypeSystemInfoItem pairs a SystemInfo with a list of DCGM
// field IDs. It is what FieldEntityGroupTypeSystemInfo.Get returns, and what
// NewDCGMCollector, NewClockEventsCollector and NewXIDCollector take as their
// last argument.
type FieldEntityGroupTypeSystemInfoItem struct {
	SystemInfo   SystemInfo
	DeviceFields []dcgm.Short
}

type GPUInfo

// GPUInfo is the element type of the SystemInfo.GPUs array.
type GPUInfo struct {
	DeviceInfo   dcgm.Device
	GPUInstances []GPUInstanceInfo
	MigEnabled   bool // NOTE(review): presumably true when MIG mode is enabled on this GPU — confirm
}

type GPUInstanceInfo

// GPUInstanceInfo is the element type of GPUInfo.GPUInstances. ToMetric and
// MonitoringInfo both reference it by pointer.
type GPUInstanceInfo struct {
	Info             dcgm.MigEntityInfo
	ProfileName      string // SetGPUInstanceProfileName takes a profileName argument; presumably stored here — TODO confirm
	EntityId         uint
	ComputeInstances []ComputeInstanceInfo
}

type KubernetesGPUIDType

// KubernetesGPUIDType is the string type of Config.KubernetesGPUIdType.
type KubernetesGPUIDType string
const (
	// GPUUID is the value "uid".
	GPUUID     KubernetesGPUIDType = "uid"
	// DeviceName is the value "device-name".
	DeviceName KubernetesGPUIDType = "device-name"
)

type Metric

// Metric is one sample: a Counter, its value as a string, and string fields
// identifying the device it belongs to. MetricsByCounter stores slices of
// Metric keyed by Counter.
type Metric struct {
	Counter Counter
	Value   string

	GPU          string
	GPUUUID      string
	GPUDevice    string
	GPUModelName string
	GPUPCIBusID  string

	UUID string

	MigProfile    string
	GPUInstanceID string
	Hostname      string

	// Free-form string maps attached to the sample.
	// NOTE(review): how Labels and Attributes differ in the output is not
	// shown here — confirm in the formatting code.
	Labels     map[string]string
	Attributes map[string]string
}

type MetricsByCounter

// MetricsByCounter represents a map where each Counter is associated with a slice of Metric objects.
type MetricsByCounter map[Counter][]Metric

MetricsByCounter represents a map where each Counter is associated with a slice of Metric objects

type MetricsPipeline

type MetricsPipeline struct {
	// contains filtered or unexported fields
}

func NewMetricsPipeline

func NewMetricsPipeline(config *Config,
	counters []Counter,
	hostname string,
	newDCGMCollector DCGMCollectorConstructor,
	fieldEntityGroupTypeSystemInfo *FieldEntityGroupTypeSystemInfo,
) (*MetricsPipeline, func(), error)

func NewMetricsPipelineWithGPUCollector

func NewMetricsPipelineWithGPUCollector(c *Config, collector *DCGMCollector) (*MetricsPipeline, func(), error)

Primarily for testing; the caller is expected to clean up the collector.

func (*MetricsPipeline) Run

func (m *MetricsPipeline) Run(out chan string, stop chan interface{}, wg *sync.WaitGroup)

type MetricsServer

type MetricsServer struct {
	sync.Mutex
	// contains filtered or unexported fields
}

func NewMetricsServer

func NewMetricsServer(c *Config, metrics chan string, registry *Registry) (*MetricsServer, func(), error)

func (*MetricsServer) Health

func (s *MetricsServer) Health(w http.ResponseWriter, r *http.Request)

func (*MetricsServer) Metrics

func (s *MetricsServer) Metrics(w http.ResponseWriter, r *http.Request)

func (*MetricsServer) Run

func (s *MetricsServer) Run(stop chan interface{}, wg *sync.WaitGroup)

type MonitoringInfo

// MonitoringInfo describes one monitored entity. The AddAll* helpers and
// GetMonitoredEntities return slices of it, and ToCPUMetric and ToSwitchMetric
// take one as their mi argument.
type MonitoringInfo struct {
	Entity       dcgm.GroupEntityPair
	DeviceInfo   dcgm.Device
	InstanceInfo *GPUInstanceInfo // pointer; NOTE(review): presumably nil when the entity is not a GPU instance — confirm
	ParentId     uint             // NOTE(review): presumably PARENT_ID_IGNORED when there is no parent — confirm
}

func AddAllCPUCores

func AddAllCPUCores(sysInfo SystemInfo) []MonitoringInfo

func AddAllCPUs

func AddAllCPUs(sysInfo SystemInfo) []MonitoringInfo

func AddAllGPUInstances

func AddAllGPUInstances(sysInfo SystemInfo, addFlexibly bool) []MonitoringInfo

func AddAllGPUs

func AddAllGPUs(sysInfo SystemInfo) []MonitoringInfo
func AddAllLinks(sysInfo SystemInfo) []MonitoringInfo

func AddAllSwitches

func AddAllSwitches(sysInfo SystemInfo) []MonitoringInfo

func GetMonitoredEntities

func GetMonitoredEntities(sysInfo SystemInfo) []MonitoringInfo

func GetMonitoringInfoForGPU

func GetMonitoringInfoForGPU(sysInfo SystemInfo, gpuID int) *MonitoringInfo

func GetMonitoringInfoForGPUInstance

func GetMonitoringInfoForGPUInstance(sysInfo SystemInfo, gpuInstanceID int) *MonitoringInfo

type PodInfo

// PodInfo holds three strings: a name, a namespace and a container.
// NOTE(review): presumably identifies the Kubernetes pod and container that a
// device is assigned to (see PodMapper) — confirm.
type PodInfo struct {
	Name      string
	Namespace string
	Container string
}

type PodMapper

// PodMapper is created by NewPodMapper from a *Config. Its pointer type has
// Name and Process methods with the same signatures as the Transform interface.
type PodMapper struct {
	Config *Config
}

func NewPodMapper

func NewPodMapper(c *Config) (*PodMapper, error)

func (*PodMapper) Name

func (p *PodMapper) Name() string

func (*PodMapper) Process

func (p *PodMapper) Process(metrics MetricsByCounter, sysInfo SystemInfo) error

type Registry

type Registry struct {
	// contains filtered or unexported fields
}

func NewRegistry

func NewRegistry() *Registry

func (*Registry) Cleanup

func (r *Registry) Cleanup()

Cleanup resources of registered collectors

func (*Registry) Gather

func (r *Registry) Gather() (MetricsByCounter, error)

Gather gathers metrics from all registered collectors.

func (*Registry) Register

func (r *Registry) Register(c Collector)

Register registers a collector with the registry.

type SwitchInfo

// SwitchInfo is the element type of SystemInfo.Switches.
type SwitchInfo struct {
	EntityId uint
	NvLinks  []dcgm.NvLinkStatus // NOTE(review): presumably the status of each NvLink on this switch — confirm
}

type SystemInfo

type SystemInfo struct {
	GPUCount uint
	GPUs     [dcgm.MAX_NUM_DEVICES]GPUInfo

	InfoType dcgm.Field_Entity_Group
	Switches []SwitchInfo
	CPUs     []CPUInfo
	// contains filtered or unexported fields
}

func GetSystemInfo

func GetSystemInfo(config *Config, entityType dcgm.Field_Entity_Group) (*SystemInfo, error)

func InitializeCPUInfo

func InitializeCPUInfo(sysInfo SystemInfo, sOpt DeviceOptions) (SystemInfo, error)

func InitializeGPUInfo

func InitializeGPUInfo(sysInfo SystemInfo, gOpt DeviceOptions, useFakeGPUs bool) (SystemInfo, error)

func InitializeNvSwitchInfo

func InitializeNvSwitchInfo(sysInfo SystemInfo, sOpt DeviceOptions) (SystemInfo, error)

func InitializeSystemInfo

func InitializeSystemInfo(
	gOpt DeviceOptions, sOpt DeviceOptions, cOpt DeviceOptions, useFakeGPUs bool, entityType dcgm.Field_Entity_Group,
) (SystemInfo, error)

type Transform

// Transform is an interface with a Process step over a MetricsByCounter and a
// Name. *PodMapper has both methods.
type Transform interface {
	// Process receives the metrics map and the SystemInfo and returns an error.
	// NOTE(review): presumably mutates metrics in place, since nothing else is
	// returned — confirm.
	Process(metrics MetricsByCounter, sysInfo SystemInfo) error
	// Name returns a string naming the transform.
	Name() string
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL