dcgm

package
v0.0.2 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 14, 2020 License: Apache-2.0 Imports: 15 Imported by: 0

Documentation

Index

Constants

View Source
// Running modes for the DCGM hostengine, numbered consecutively from
// Embedded (0) via iota. One of these values is passed to Init as its
// mode argument.
const (
	Embedded mode = iota
	Standalone
	StartHostengine
)

Constants for the DCGM hostengine running modes: Embedded, Standalone, or StartHostengine.

View Source
// Reference values for performance states. Note that the "max" value is
// numerically the smallest (0) while the "min" value is 15; 32 denotes an
// unknown state.
// NOTE(review): presumably these are compared against PerfState values — confirm.
const (
	PerfStateMax     = 0
	PerfStateMin     = 15
	PerfStateUnknown = 32
)
View Source
// Policy conditions accepted by Policy. Each constant is a policyCondition
// built from a human-readable description of the condition it names.
const (
	DbePolicy     = policyCondition("Double-bit ECC error")
	PCIePolicy    = policyCondition("PCI error")
	MaxRtPgPolicy = policyCondition("Max Retired Pages Limit")
	ThermalPolicy = policyCondition("Thermal Limit")
	PowerPolicy   = policyCondition("Power Limit")
	NvlinkPolicy  = policyCondition("Nvlink Error")
	XidPolicy     = policyCondition("XID Error")
)

Variables

This section is empty.

Functions

func GetAllDeviceCount

func GetAllDeviceCount() (uint, error)

GetAllDeviceCount counts all GPUs on the system

func GetSupportedDevices

func GetSupportedDevices() ([]uint, error)

GetSupportedDevices returns only DCGM supported GPUs

func Init

func Init(m mode, args ...string) (err error)

Init starts DCGM based on the user-selected mode. DCGM can be started in 3 different modes: 1. Embedded: Start hostengine within this process. 2. Standalone: Connect to an already running nv-hostengine at the specified address. The connection address can be passed as command line args: -connect "IP:PORT/Socket" -socket "isSocket". 3. StartHostengine: Open a Unix socket to start and connect to the nv-hostengine and terminate before exiting.

func Policy

func Policy(gpuId uint, typ ...policyCondition) (<-chan PolicyViolation, error)

Policy sets GPU usage and error policies and notifies in case of any violations via callback functions

func Shutdown

func Shutdown() (err error)

Shutdown stops DCGM and destroys all connections.

func ViolationRegistration

func ViolationRegistration(data unsafe.Pointer) int

ViolationRegistration is a Go callback function for dcgmPolicyRegister(), wrapped in C.violationNotify().

func WatchPidFields

func WatchPidFields() (groupHandle, error)

WatchPidFields lets DCGM start recording stats for GPU processes. It needs to be called before calling GetProcessInfo.

Types

type ClockInfo

// ClockInfo groups a core clock and a memory clock reading, both in MHz.
// NOTE(review): the fields are pointers, presumably nil when a reading is
// unavailable — confirm.
type ClockInfo struct {
	Cores  *uint // MHz
	Memory *uint // MHz
}

type DcgmStatus

// DcgmStatus is the result type of Introspect and carries the DCGM
// hostengine's memory and CPU usage.
// NOTE(review): units of Memory and CPU are not stated here — confirm.
type DcgmStatus struct {
	Memory int64
	CPU    float64
}

func Introspect

func Introspect() (DcgmStatus, error)

Introspect returns DCGM hostengine memory and CPU usage

type Device

// Device describes a single GPU and is the result type of GetDeviceInfo.
// It combines scalar attributes (index, UUID, power in watts) with nested
// PCI, clock, identifier and topology information.
type Device struct {
	GPU           uint
	DCGMSupported string
	UUID          string
	Power         *uint // W
	PCI           PCIInfo
	Clocks        ClockInfo
	Identifiers   DeviceIdentifiers
	Topology      []P2PLink
	CPUAffinity   string
}

func GetDeviceInfo

func GetDeviceInfo(gpuId uint) (Device, error)

GetDeviceInfo describes the given device

type DeviceHealth

// DeviceHealth is the result type of HealthCheckByGpuId: a GPU index, a
// status string, and a list of SystemWatch entries.
type DeviceHealth struct {
	GPU     uint
	Status  string
	Watches []SystemWatch
}

func HealthCheckByGpuId

func HealthCheckByGpuId(gpuId uint) (DeviceHealth, error)

HealthCheckByGpuId monitors GPU health for any errors/failures/warnings

type DeviceIdentifiers

// DeviceIdentifiers holds string-valued identification data for a device:
// brand, model, serial, VBIOS, InfoROM image version and driver version.
// It is embedded in Device as the Identifiers field.
type DeviceIdentifiers struct {
	Brand               string
	Model               string
	Serial              string
	Vbios               string
	InforomImageVersion string
	DriverVersion       string
}

type DeviceStatus

// DeviceStatus is the result type of GetDeviceStatus. It reports power (W),
// temperature (°C) and fan speed (%) alongside nested utilization, memory,
// clock, PCI and performance-state information.
type DeviceStatus struct {
	Power       *float64 // W
	Temperature *uint    // °C
	Utilization UtilizationInfo
	Memory      MemoryInfo
	Clocks      ClockInfo
	PCI         PCIStatusInfo
	Performance PerfState
	FanSpeed    uint // %
}

func GetDeviceStatus

func GetDeviceStatus(gpuId uint) (DeviceStatus, error)

GetDeviceStatus monitors GPU status including its power, memory and GPU utilization

type ECCErrorsInfo

// ECCErrorsInfo holds single-bit and double-bit ECC error values.
// NOTE(review): presumably these are error counts, nil when unavailable — confirm.
type ECCErrorsInfo struct {
	SingleBit *uint
	DoubleBit *uint
}

type MemoryInfo

// MemoryInfo pairs a global memory usage value with ECC error information.
// NOTE(review): the unit of GlobalUsed is not stated here — confirm.
type MemoryInfo struct {
	GlobalUsed *uint64
	ECCErrors  ECCErrorsInfo
}
// P2PLink is one entry of the topology returned by GetDeviceTopology: a GPU
// index, its bus ID, and the type of link.
// NOTE(review): presumably GPU/BusID identify the peer device — confirm.
type P2PLink struct {
	GPU   uint
	BusID string
	Link  P2PLinkType
}

func GetDeviceTopology

func GetDeviceTopology(gpuId uint) ([]P2PLink, error)

GetDeviceTopology returns device topology corresponding to the gpuId

type P2PLinkType

// P2PLinkType is the type of the Link field of P2PLink.
type P2PLinkType uint
// Known P2PLinkType values, numbered consecutively from P2PLinkUnknown (0)
// via iota. The last four are named for NVLINK link counts (one to four).
const (
	P2PLinkUnknown P2PLinkType = iota
	P2PLinkCrossCPU
	P2PLinkSameCPU
	P2PLinkHostBridge
	P2PLinkMultiSwitch
	P2PLinkSingleSwitch
	P2PLinkSameBoard
	SingleNVLINKLink
	TwoNVLINKLinks
	ThreeNVLINKLinks
	FourNVLINKLinks
)

func (P2PLinkType) PCIPaths

func (l P2PLinkType) PCIPaths() string

type PCIInfo

// PCIInfo holds PCI attributes of a device: its bus ID, BAR1 and
// frame-buffer totals in MB, and bandwidth in MB/s.
type PCIInfo struct {
	BusID     string
	BAR1      *uint // MB
	FBTotal   *uint // MB
	Bandwidth *uint // MB/s
}

type PCIStatusInfo

// PCIStatusInfo holds PCI usage figures: BAR1 used (MB), throughput
// counters, and frame-buffer used.
// NOTE(review): FBUsed carries no unit annotation; presumably MB like
// PCIInfo.FBTotal — confirm.
type PCIStatusInfo struct {
	BAR1Used   *uint // MB
	Throughput PCIThroughputInfo
	FBUsed     *uint
}

type PCIThroughputInfo

// PCIThroughputInfo holds PCI receive and transmit figures in MB plus a
// replay value.
// NOTE(review): Replays is presumably a count of PCI replays — confirm.
type PCIThroughputInfo struct {
	Rx      *uint64 // MB
	Tx      *uint64 // MB
	Replays *uint64
}

type PerfState

// PerfState is the type of DeviceStatus.Performance; it has a String method.
type PerfState uint

func (PerfState) String

func (p PerfState) String() string

type PolicyViolation

// PolicyViolation is the element type of the channel returned by Policy.
// It records which condition was violated, when, and condition-specific
// data.
// NOTE(review): the concrete type stored in Data is not visible here;
// presumably it varies by Condition — confirm.
type PolicyViolation struct {
	Condition policyCondition
	Timestamp time.Time
	Data      interface{}
}

type ProcessInfo

// ProcessInfo is the element type returned by GetProcessInfo, which
// provides per-GPU stats for a process. Each value identifies the GPU and
// process (PID, name) and nests utilization, PCI, memory, clock, violation
// and XID error information.
type ProcessInfo struct {
	GPU                uint
	PID                uint
	Name               string
	ProcessUtilization ProcessUtilInfo
	PCI                PCIStatusInfo
	Memory             MemoryInfo
	GpuUtilization     UtilizationInfo
	Clocks             ClockInfo
	Violations         ViolationTime
	XIDErrors          XIDErrorInfo
}

func GetProcessInfo

func GetProcessInfo(group groupHandle, pid uint) ([]ProcessInfo, error)

GetProcessInfo provides detailed per GPU stats for this process

type ProcessUtilInfo

// ProcessUtilInfo holds per-process utilization data: start and end times,
// energy consumed in joules, and SM and memory utilization.
// NOTE(review): SmUtil and MemUtil are presumably percentages — confirm.
type ProcessUtilInfo struct {
	StartTime      Time
	EndTime        Time
	EnergyConsumed *uint64 // Joules
	SmUtil         *float64
	MemUtil        *float64
}

type SystemWatch

// SystemWatch is one entry of DeviceHealth.Watches, made up of three
// strings: a type, a status and an error description.
type SystemWatch struct {
	Type   string
	Status string
	Error  string
}

type Time

// Time is an unsigned 64-bit time value with a String method; it is the type
// of ProcessUtilInfo.StartTime and EndTime.
// NOTE(review): epoch and unit are not visible here — confirm.
type Time uint64

func (Time) String

func (t Time) String() string

type UtilizationInfo

// UtilizationInfo holds GPU, memory, encoder and decoder utilization, each
// expressed as a percentage.
type UtilizationInfo struct {
	GPU     *uint // %
	Memory  *uint // %
	Encoder *uint // %
	Decoder *uint // %
}

type ViolationTime

// ViolationTime measures the amount of time (in ms) the GPU was at reduced
// clocks, with one field per cause.
type ViolationTime struct {
	Power          *uint64
	Thermal        *uint64
	Reliability    *uint64
	BoardLimit     *uint64
	LowUtilization *uint64
	SyncBoost      *uint64
}

ViolationTime measures the amount of time (in ms) the GPU was at reduced clocks.

type XIDErrorInfo

// XIDErrorInfo holds an XID error count and a list of timestamps.
// NOTE(review): presumably one timestamp per error; the timestamp unit is
// not visible here — confirm.
type XIDErrorInfo struct {
	NumErrors int
	Timestamp []uint64
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL