Documentation ¶
Index ¶
- Constants
- func GetAllDeviceCount() (uint, error)
- func GetSupportedDevices() ([]uint, error)
- func Init(m mode, args ...string) (err error)
- func Policy(gpuId uint, typ ...policyCondition) (<-chan PolicyViolation, error)
- func Shutdown() (err error)
- func ViolationRegistration(data unsafe.Pointer) int
- func WatchPidFields() (groupHandle, error)
- type ClockInfo
- type DcgmStatus
- type Device
- type DeviceHealth
- type DeviceIdentifiers
- type DeviceStatus
- type ECCErrorsInfo
- type MemoryInfo
- type P2PLink
- type P2PLinkType
- type PCIInfo
- type PCIStatusInfo
- type PCIThroughputInfo
- type PerfState
- type PolicyViolation
- type ProcessInfo
- type ProcessUtilInfo
- type SystemWatch
- type Time
- type UtilizationInfo
- type ViolationTime
- type XIDErrorInfo
Constants ¶
const ( Embedded mode = iota Standalone StartHostengine )
Constants for the DCGM hostengine running modes: Embedded, Standalone or StartHostengine
const ( PerfStateMax = 0 PerfStateMin = 15 PerfStateUnknown = 32 )
const ( DbePolicy = policyCondition("Double-bit ECC error") PCIePolicy = policyCondition("PCI error") MaxRtPgPolicy = policyCondition("Max Retired Pages Limit") ThermalPolicy = policyCondition("Thermal Limit") PowerPolicy = policyCondition("Power Limit") NvlinkPolicy = policyCondition("Nvlink Error") XidPolicy = policyCondition("XID Error") )
Variables ¶
This section is empty.
Functions ¶
func GetAllDeviceCount ¶
GetAllDeviceCount counts all GPUs on the system
func GetSupportedDevices ¶
GetSupportedDevices returns only DCGM supported GPUs
func Init ¶
Init starts DCGM based on the user-selected mode. DCGM can be started in 3 different modes: 1. Embedded: Start hostengine within this process 2. Standalone: Connect to an already running nv-hostengine at the specified address. The connection address can be passed as command line args: -connect "IP:PORT/Socket" -socket "isSocket" 3. StartHostengine: Open a Unix socket to start and connect to the nv-hostengine and terminate before exiting
func Policy ¶
func Policy(gpuId uint, typ ...policyCondition) (<-chan PolicyViolation, error)
Policy sets GPU usage and error policies and notifies in case of any violations via callback functions
func ViolationRegistration ¶
ViolationRegistration is a Go callback function for dcgmPolicyRegister() wrapped in C.violationNotify()
func WatchPidFields ¶
func WatchPidFields() (groupHandle, error)
WatchPidFields lets DCGM start recording stats for GPU processes. It needs to be called before calling GetProcessInfo.
Types ¶
type DcgmStatus ¶
func Introspect ¶
func Introspect() (DcgmStatus, error)
Introspect returns DCGM hostengine memory and CPU usage
type Device ¶
type Device struct { GPU uint DCGMSupported string UUID string Power *uint // W PCI PCIInfo Clocks ClockInfo Identifiers DeviceIdentifiers Topology []P2PLink CPUAffinity string }
func GetDeviceInfo ¶
GetDeviceInfo describes the given device
type DeviceHealth ¶
type DeviceHealth struct { GPU uint Status string Watches []SystemWatch }
func HealthCheckByGpuId ¶
func HealthCheckByGpuId(gpuId uint) (DeviceHealth, error)
HealthCheckByGpuId monitors GPU health for any errors/failures/warnings
type DeviceIdentifiers ¶
type DeviceStatus ¶
type DeviceStatus struct { Power *float64 // W Temperature *uint // °C Utilization UtilizationInfo Memory MemoryInfo Clocks ClockInfo PCI PCIStatusInfo Performance PerfState FanSpeed uint // % }
func GetDeviceStatus ¶
func GetDeviceStatus(gpuId uint) (DeviceStatus, error)
GetDeviceStatus monitors GPU status including its power, memory and GPU utilization
type ECCErrorsInfo ¶
type MemoryInfo ¶
type MemoryInfo struct { GlobalUsed *uint64 ECCErrors ECCErrorsInfo }
type P2PLink ¶
type P2PLink struct { GPU uint BusID string Link P2PLinkType }
func GetDeviceTopology ¶
GetDeviceTopology returns device topology corresponding to the gpuId
type P2PLinkType ¶
type P2PLinkType uint
const ( P2PLinkUnknown P2PLinkType = iota P2PLinkCrossCPU P2PLinkSameCPU P2PLinkHostBridge P2PLinkMultiSwitch P2PLinkSingleSwitch P2PLinkSameBoard SingleNVLINKLink TwoNVLINKLinks ThreeNVLINKLinks FourNVLINKLinks )
func (P2PLinkType) PCIPaths ¶
func (l P2PLinkType) PCIPaths() string
type PCIStatusInfo ¶
type PCIStatusInfo struct { BAR1Used *uint // MB Throughput PCIThroughputInfo FBUsed *uint }
type PCIThroughputInfo ¶
type PolicyViolation ¶
type ProcessInfo ¶
type ProcessInfo struct { GPU uint PID uint Name string ProcessUtilization ProcessUtilInfo PCI PCIStatusInfo Memory MemoryInfo GpuUtilization UtilizationInfo Clocks ClockInfo Violations ViolationTime XIDErrors XIDErrorInfo }
func GetProcessInfo ¶
func GetProcessInfo(group groupHandle, pid uint) ([]ProcessInfo, error)
GetProcessInfo provides detailed per GPU stats for this process
type ProcessUtilInfo ¶
type SystemWatch ¶
type UtilizationInfo ¶
type ViolationTime ¶
type ViolationTime struct { Power *uint64 Thermal *uint64 Reliability *uint64 BoardLimit *uint64 LowUtilization *uint64 SyncBoost *uint64 }
ViolationTime measures the amount of time (in ms) the GPU was at reduced clocks