nvml

package
v0.0.1-alpha8
Published: Sep 2, 2024 License: Apache-2.0 Imports: 20 Imported by: 0

Documentation

Overview

Package nvml implements the NVIDIA Management Library (NVML) interface. See https://docs.nvidia.com/deploy/nvml-api/nvml-api-reference.html#nvml-api-reference for more details.

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func DefaultInstanceReady

func DefaultInstanceReady() <-chan any

func GPMSupported

func GPMSupported() (bool, error)

Returns true if GPM is supported by all devices. Returns false if any device does not support GPM.

func GPMSupportedByDevice

func GPMSupportedByDevice(dev device.Device) (bool, error)

func GetGPMMetrics

func GetGPMMetrics(ctx context.Context, dev device.Device, metricIDs ...nvml.GpmMetricId) (map[nvml.GpmMetricId]float64, error)

Returns a map from metric ID to value for this device. Do not call this in parallel for multiple devices; doing so triggers a "SIGSEGV: segmentation violation" during cgo execution. ref. https://github.com/NVIDIA/go-nvml/blob/main/examples/gpm-metrics/main.go
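
A minimal sketch of collecting GPM metrics one device at a time (never in parallel, per the note above). The import paths, including the path of this package and the package providing device.Device, are assumptions for illustration:

package nvmlexample

import (
	"context"
	"fmt"

	"github.com/NVIDIA/go-nvlib/pkg/nvlib/device" // assumed source of device.Device
	gonvml "github.com/NVIDIA/go-nvml/pkg/nvml"

	nvmllib "example.com/yourmodule/nvml" // hypothetical import path of this package
)

// collectGPM queries the SM occupancy for each device sequentially,
// skipping devices that do not support GPM.
func collectGPM(ctx context.Context, devs map[string]device.Device) error {
	for uuid, dev := range devs {
		supported, err := nvmllib.GPMSupportedByDevice(dev)
		if err != nil {
			return err
		}
		if !supported {
			continue
		}
		metrics, err := nvmllib.GetGPMMetrics(ctx, dev, gonvml.GPM_METRIC_SM_OCCUPANCY)
		if err != nil {
			return err
		}
		fmt.Printf("%s SM occupancy: %.2f%%\n", uuid, metrics[gonvml.GPM_METRIC_SM_OCCUPANCY])
	}
	return nil
}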

func StartDefaultInstance

func StartDefaultInstance(ctx context.Context) error

Starts the default NVML instance.

By default, it tracks the SM occupancy metric via nvml.GPM_METRIC_SM_OCCUPANCY (NVML_GPM_METRIC_SM_OCCUPANCY), the percentage of warps that were active vs. the theoretical maximum (0.0 - 100.0). ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlGpmStructs.html#group__nvmlGpmStructs_1g168f5f2704ec9871110d22aa1879aec0
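
A minimal sketch of the default-instance lifecycle: start it, wait for readiness, take a one-shot snapshot, and shut down. The import path of this package is an assumption:

package main

import (
	"context"
	"fmt"
	"log"

	nvmllib "example.com/yourmodule/nvml" // hypothetical import path of this package
)

func main() {
	ctx := context.Background()

	// Start the default NVML instance (tracks SM occupancy by default).
	if err := nvmllib.StartDefaultInstance(ctx); err != nil {
		log.Fatal(err)
	}
	// Block until the default instance is ready.
	<-nvmllib.DefaultInstanceReady()

	instance := nvmllib.DefaultInstance()
	defer instance.Shutdown()

	// Take a one-shot snapshot of all devices.
	out, err := instance.Get()
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("NVML exists:", out.Exists, out.Message)
	for _, info := range out.DeviceInfos {
		fmt.Println(info.UUID, info.Name, info.Memory.UsedHumanized, "/", info.Memory.TotalHumanized)
	}
}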

Types

type AllECCErrorCounts

type AllECCErrorCounts struct {
	// Total ECC error counts for the device.
	// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g9748430b6aa6cdbb2349c5e835d70b0f
	Total ECCErrorCounts `json:"total"`

	// GPU L1 Cache.
	// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1g9bcbee49054a953d333d4aa11e8b9c25
	L1Cache ECCErrorCounts `json:"l1_cache"`

	// GPU L2 Cache.
	// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1g9bcbee49054a953d333d4aa11e8b9c25
	L2Cache ECCErrorCounts `json:"l2_cache"`

	// Turing+ DRAM.
	// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1g9bcbee49054a953d333d4aa11e8b9c25
	DRAM ECCErrorCounts `json:"dram"`

	// Turing+ SRAM.
	// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1g9bcbee49054a953d333d4aa11e8b9c25
	SRAM ECCErrorCounts `json:"sram"`

	// GPU Device Memory.
	// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1g9bcbee49054a953d333d4aa11e8b9c25
	GPUDeviceMemory ECCErrorCounts `json:"gpu_device_memory"`

	// GPU Texture Memory.
	// Specialized memory optimized for 2D spatial locality.
	// Read-only from kernels (in most cases).
	// Optimized for specific access patterns common in graphics/image processing.
	// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1g9bcbee49054a953d333d4aa11e8b9c25
	GPUTextureMemory ECCErrorCounts `json:"gpu_texture_memory"`

	// Shared memory. Not texture memory.
	// Used for inter-thread communication and data caching within a block.
	// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1g9bcbee49054a953d333d4aa11e8b9c25
	SharedMemory ECCErrorCounts `json:"shared_memory"`

	// GPU Register File.
	// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1g9bcbee49054a953d333d4aa11e8b9c25
	GPURegisterFile ECCErrorCounts `json:"gpu_register_file"`
}

func (AllECCErrorCounts) FindUncorrectedErrs

func (allCounts AllECCErrorCounts) FindUncorrectedErrs() []string

type ClockEvents

type ClockEvents struct {
	// Represents the GPU UUID.
	UUID string `json:"uuid"`

	// Represents the bitmask of active clocks event reasons.
	// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlClocksEventReasons.html#group__nvmlClocksEventReasons
	ReasonsBitmask uint64 `json:"reasons_bitmask"`
	// Represents the human-readable reasons for the clock events.
	Reasons []string `json:"reasons,omitempty"`

	// Set to true if the HW Slowdown reason, due to high temperature, is active.
	HWSlowdown bool `json:"hw_slowdown"`
	// Set to true if the HW Thermal Slowdown reason, due to high temperature, is active.
	HWSlowdownThermal bool `json:"hw_thermal_slowdown"`
	// Set to true if the HW Power Brake Slowdown reason, due to an external power brake assertion, is active.
	HWSlowdownPowerBrake bool `json:"hw_slowdown_power_brake"`
}

ClockEvents represents the current clock events from the nvmlDeviceGetCurrentClocksEventReasons API. ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g7e505374454a0d4fc7339b6c885656d6 ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1ga115e41a14b747cb334a0e7b49ae1941 ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlClocksEventReasons.html#group__nvmlClocksEventReasons

func GetClockEvents

func GetClockEvents(uuid string, dev device.Device) (ClockEvents, error)

func (*ClockEvents) JSON

func (evs *ClockEvents) JSON() ([]byte, error)

func (*ClockEvents) YAML

func (evs *ClockEvents) YAML() ([]byte, error)
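
A minimal sketch, under the same import-path assumptions as above, that checks a device for active hardware slowdown reasons:

package nvmlexample

import (
	"fmt"

	"github.com/NVIDIA/go-nvlib/pkg/nvlib/device" // assumed source of device.Device

	nvmllib "example.com/yourmodule/nvml" // hypothetical import path of this package
)

// reportSlowdowns prints any active HW slowdown reasons for the device.
func reportSlowdowns(uuid string, dev device.Device) error {
	evs, err := nvmllib.GetClockEvents(uuid, dev)
	if err != nil {
		return err
	}
	if evs.HWSlowdown || evs.HWSlowdownThermal || evs.HWSlowdownPowerBrake {
		fmt.Printf("%s slowdown active: %v\n", uuid, evs.Reasons)
	}
	return nil
}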

type ClockSpeed

type ClockSpeed struct {
	// Represents the GPU UUID.
	UUID string `json:"uuid"`

	GraphicsMHz uint32 `json:"graphics_mhz"`
	MemoryMHz   uint32 `json:"memory_mhz"`
}

ClockSpeed represents the data from the nvmlDeviceGetClockInfo API. Returns the graphics and memory clock speeds in MHz. ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g2efc4dd4096173f01d80b2a8bbfd97ad

func GetClockSpeed

func GetClockSpeed(uuid string, dev device.Device) (ClockSpeed, error)

type DeviceInfo

type DeviceInfo struct {
	// Note that k8s-device-plugin has a different logic for MIG devices.
	// TODO: implement MIG device UUID fetching using NVML.
	UUID string `json:"uuid"`

	// MinorNumber is the minor number of the device.
	MinorNumber int `json:"minor_number"`
	// Bus is the bus ID from the PCI info API.
	Bus uint32 `json:"bus"`
	// Device is the device ID from the PCI info API.
	Device uint32 `json:"device"`

	Name            string `json:"name"`
	GPUCores        int    `json:"gpu_cores"`
	SupportedEvents uint64 `json:"supported_events"`

	// Set true if the device supports NVML error checks (health checks).
	XidErrorSupported bool `json:"xid_error_supported"`
	// Set true if the device supports GPM metrics.
	GPMMetricsSupported bool `json:"gpm_metrics_supported"`

	ClockEvents ClockEvents `json:"clock_events"`
	ClockSpeed  ClockSpeed  `json:"clock_speed"`
	Memory      Memory      `json:"memory"`
	NVLink      NVLink      `json:"nvlink"`
	Power       Power       `json:"power"`
	Temperature Temperature `json:"temperature"`
	Utilization Utilization `json:"utilization"`
	Processes   Processes   `json:"processes"`
	ECCErrors   ECCErrors   `json:"ecc_errors"`
	// contains filtered or unexported fields
}

type ECCErrorCounts

type ECCErrorCounts struct {
	// A memory error that was corrected.
	// For ECC errors, these are single bit errors.
	// For Texture memory, these are errors fixed by resend.
	// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1gc5469bd68b9fdcf78734471d86becb24
	Corrected uint64 `json:"corrected"`

	// A memory error that was not corrected.
	// For ECC errors, these are double bit errors.
	// For Texture memory, these are errors where the resend fails.
	// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1gc5469bd68b9fdcf78734471d86becb24
	Uncorrected uint64 `json:"uncorrected"`
}

type ECCErrors

type ECCErrors struct {
	// Represents the GPU UUID.
	UUID string `json:"uuid"`

	// Aggregate counts persist across reboots (i.e. for the lifetime of the device).
	// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1g08978d1c4fb52b6a4c72b39de144f1d9
	Aggregate AllECCErrorCounts `json:"aggregate"`

	// Volatile counts are reset each time the driver loads.
	// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1g08978d1c4fb52b6a4c72b39de144f1d9
	Volatile AllECCErrorCounts `json:"volatile"`
}

func GetECCErrors

func GetECCErrors(uuid string, dev device.Device) (ECCErrors, error)

func (ECCErrors) JSON

func (es ECCErrors) JSON() ([]byte, error)

func (ECCErrors) YAML

func (es ECCErrors) YAML() ([]byte, error)
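
A minimal sketch, under the same import-path assumptions, that flags devices reporting uncorrected ECC errors since the last driver load:

package nvmlexample

import (
	"fmt"

	"github.com/NVIDIA/go-nvlib/pkg/nvlib/device" // assumed source of device.Device

	nvmllib "example.com/yourmodule/nvml" // hypothetical import path of this package
)

// checkECC reports uncorrected ECC errors from the volatile (since driver load) counts.
func checkECC(uuid string, dev device.Device) error {
	eccErrs, err := nvmllib.GetECCErrors(uuid, dev)
	if err != nil {
		return err
	}
	if uncorrected := eccErrs.Volatile.FindUncorrectedErrs(); len(uncorrected) > 0 {
		fmt.Printf("%s uncorrected ECC errors: %v\n", uuid, uncorrected)
	}
	return nil
}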

type GPMEvent

type GPMEvent struct {
	Metrics []GPMMetrics `json:"metrics"`
	Error   error        `json:"error"`
}

func (*GPMEvent) YAML

func (ev *GPMEvent) YAML() ([]byte, error)

type GPMMetrics

type GPMMetrics struct {
	// Time is the time the metrics were collected.
	Time metav1.Time `json:"time"`

	// Device UUID that these GPM metrics belong to.
	UUID string `json:"uuid"`

	// The duration of the sample.
	SampleDuration metav1.Duration `json:"sample_duration"`

	// The metrics.
	Metrics map[nvml.GpmMetricId]float64 `json:"metrics"`
}

GPMMetrics contains the GPM metrics for a device.

type Instance

type Instance interface {
	NVMLExists() bool

	Start() error

	XidErrorSupported() bool
	RecvXidEvents() <-chan *XidEvent

	GPMMetricsSupported() bool
	RecvGPMEvents() <-chan *GPMEvent

	Shutdown() error
	Get() (*Output, error)
}

func DefaultInstance

func DefaultInstance() Instance

func NewInstance

func NewInstance(ctx context.Context, opts ...OpOption) (Instance, error)
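
A minimal sketch of constructing a dedicated instance with an extra GPM metric and consuming its event channels; the import paths, and the assumption that Start must be called after NewInstance, are illustrative only:

package nvmlexample

import (
	"context"
	"fmt"

	gonvml "github.com/NVIDIA/go-nvml/pkg/nvml"

	nvmllib "example.com/yourmodule/nvml" // hypothetical import path of this package
)

// watchEvents starts an instance and drains its Xid and GPM event channels
// until the context is canceled.
func watchEvents(ctx context.Context) error {
	instance, err := nvmllib.NewInstance(ctx, nvmllib.WithGPMMetricsID(gonvml.GPM_METRIC_SM_OCCUPANCY))
	if err != nil {
		return err
	}
	defer instance.Shutdown()

	// Assumption: NewInstance does not begin polling on its own.
	if err := instance.Start(); err != nil {
		return err
	}

	xidCh := instance.RecvXidEvents()
	gpmCh := instance.RecvGPMEvents()
	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case ev := <-xidCh:
			fmt.Println("xid:", ev.Xid, "critical:", ev.XidCriticalError)
		case ev := <-gpmCh:
			fmt.Println("gpm metrics for", len(ev.Metrics), "devices")
		}
	}
}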

type Memory

type Memory struct {
	// Represents the GPU UUID.
	UUID string `json:"uuid"`

	TotalBytes     uint64 `json:"total_bytes"`
	TotalHumanized string `json:"total_humanized"`

	ReservedBytes     uint64 `json:"reserved_bytes"`
	ReservedHumanized string `json:"reserved_humanized"`

	UsedBytes     uint64 `json:"used_bytes"`
	UsedHumanized string `json:"used_humanized"`

	FreeBytes     uint64 `json:"free_bytes"`
	FreeHumanized string `json:"free_humanized"`

	UsedPercent string `json:"used_percent"`
}

func GetMemory

func GetMemory(uuid string, dev device.Device) (Memory, error)

func (Memory) GetUsedPercent

func (mem Memory) GetUsedPercent() (float64, error)
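
A minimal sketch, under the same import-path assumptions, that reports memory usage for a device:

package nvmlexample

import (
	"fmt"

	"github.com/NVIDIA/go-nvlib/pkg/nvlib/device" // assumed source of device.Device

	nvmllib "example.com/yourmodule/nvml" // hypothetical import path of this package
)

// reportMemory prints used/total memory and the parsed used percentage.
func reportMemory(uuid string, dev device.Device) error {
	mem, err := nvmllib.GetMemory(uuid, dev)
	if err != nil {
		return err
	}
	usedPct, err := mem.GetUsedPercent()
	if err != nil {
		return err
	}
	fmt.Printf("%s memory: %s / %s (%.1f%% used)\n", uuid, mem.UsedHumanized, mem.TotalHumanized, usedPct)
	return nil
}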

type NVLink

type NVLink struct {
	// Represents the GPU UUID.
	UUID string `json:"uuid"`

	// States is the list of nvlink states.
	States NVLinkStates `json:"states"`
}

func GetNVLink

func GetNVLink(uuid string, dev device.Device) (NVLink, error)

Queries the NVLink information for the device.

type NVLinkState

type NVLinkState struct {
	// Link is the nvlink link number.
	Link int `json:"link"`

	// FeatureEnabled is true if the nvlink feature is enabled.
	FeatureEnabled bool `json:"feature_enabled"`
	// ReplayErrors is the number of replay errors.
	ReplayErrors uint64 `json:"replay_errors"`
	// RecoveryErrors is the number of recovery errors.
	RecoveryErrors uint64 `json:"recovery_errors"`
	// CRCErrors is the number of crc errors.
	CRCErrors uint64 `json:"crc_errors"`

	// ThroughputRawTxBytes is the NVLink TX Data throughput + protocol overhead in bytes.
	ThroughputRawTxBytes uint64 `json:"throughput_raw_tx_bytes"`
	// ThroughputRawRxBytes is the NVLink RX Data throughput + protocol overhead in bytes.
	ThroughputRawRxBytes uint64 `json:"throughput_raw_rx_bytes"`
}

type NVLinkStates

type NVLinkStates []NVLinkState

func (NVLinkStates) AllFeatureEnabled

func (s NVLinkStates) AllFeatureEnabled() bool

func (NVLinkStates) TotalCRCErrors

func (s NVLinkStates) TotalCRCErrors() uint64

func (NVLinkStates) TotalRecoveryErrors

func (s NVLinkStates) TotalRecoveryErrors() uint64

func (NVLinkStates) TotalRelayErrors

func (s NVLinkStates) TotalRelayErrors() uint64

func (NVLinkStates) TotalThroughputRawRxBytes

func (s NVLinkStates) TotalThroughputRawRxBytes() uint64

func (NVLinkStates) TotalThroughputRawTxBytes

func (s NVLinkStates) TotalThroughputRawTxBytes() uint64
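
A minimal sketch that aggregates NVLink state across links, under the same import-path assumptions:

package nvmlexample

import (
	"fmt"

	"github.com/NVIDIA/go-nvlib/pkg/nvlib/device" // assumed source of device.Device

	nvmllib "example.com/yourmodule/nvml" // hypothetical import path of this package
)

// reportNVLink prints aggregate NVLink health for the device.
func reportNVLink(uuid string, dev device.Device) error {
	nvlink, err := nvmllib.GetNVLink(uuid, dev)
	if err != nil {
		return err
	}
	// TotalRelayErrors is the documented method name for the replay-error total.
	fmt.Printf("%s nvlink: all links enabled=%v crc=%d recovery=%d replay=%d\n",
		uuid,
		nvlink.States.AllFeatureEnabled(),
		nvlink.States.TotalCRCErrors(),
		nvlink.States.TotalRecoveryErrors(),
		nvlink.States.TotalRelayErrors(),
	)
	return nil
}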

type Op

type Op struct {
	// contains filtered or unexported fields
}

type OpOption

type OpOption func(*Op)

func WithGPMMetricsID

func WithGPMMetricsID(id nvml.GpmMetricId) OpOption

type Output

type Output struct {
	Exists      bool          `json:"exists"`
	Message     string        `json:"message"`
	DeviceInfos []*DeviceInfo `json:"device_infos"`
}

type Power

type Power struct {
	// Represents the GPU UUID.
	UUID string `json:"uuid"`

	// Represents the GPU ID.
	GPUID uint32 `json:"gpu_id"`

	UsageMilliWatts           uint32 `json:"usage_milli_watts"`
	EnforcedLimitMilliWatts   uint32 `json:"enforced_limit_milli_watts"`
	ManagementLimitMilliWatts uint32 `json:"management_limit_milli_watts"`

	UsedPercent string `json:"used_percent"`
}

func GetPower

func GetPower(uuid string, dev device.Device) (Power, error)

func (Power) GetUsedPercent

func (power Power) GetUsedPercent() (float64, error)
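
A minimal sketch reporting power draw against the enforced limit, under the same import-path assumptions:

package nvmlexample

import (
	"fmt"

	"github.com/NVIDIA/go-nvlib/pkg/nvlib/device" // assumed source of device.Device

	nvmllib "example.com/yourmodule/nvml" // hypothetical import path of this package
)

// reportPower prints the current draw, the enforced limit, and the parsed used percentage.
func reportPower(uuid string, dev device.Device) error {
	power, err := nvmllib.GetPower(uuid, dev)
	if err != nil {
		return err
	}
	usedPct, err := power.GetUsedPercent()
	if err != nil {
		return err
	}
	fmt.Printf("%s power: %d mW of %d mW (%.1f%%)\n",
		uuid, power.UsageMilliWatts, power.EnforcedLimitMilliWatts, usedPct)
	return nil
}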

type Process

type Process struct {
	PID                         uint32      `json:"pid"`
	Status                      []string    `json:"status,omitempty"`
	CmdArgs                     []string    `json:"cmd_args,omitempty"`
	CreateTime                  metav1.Time `json:"create_time,omitempty"`
	GPUUsedPercent              uint32      `json:"gpu_used_percent,omitempty"`
	GPUUsedMemoryBytes          uint64      `json:"gpu_used_memory_bytes,omitempty"`
	GPUUsedMemoryBytesHumanized string      `json:"gpu_used_memory_bytes_humanized,omitempty"`
}

type Processes

type Processes struct {
	// Represents the GPU UUID.
	UUID string `json:"uuid"`

	// A list of running processes.
	RunningProcesses []Process `json:"running_processes"`
}

Processes represents the processes currently running on the GPU device. ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html

func GetProcesses

func GetProcesses(uuid string, dev device.Device) (Processes, error)

func (*Processes) JSON

func (procs *Processes) JSON() ([]byte, error)

func (*Processes) YAML

func (procs *Processes) YAML() ([]byte, error)
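
A minimal sketch listing GPU processes for a device, under the same import-path assumptions:

package nvmlexample

import (
	"fmt"

	"github.com/NVIDIA/go-nvlib/pkg/nvlib/device" // assumed source of device.Device

	nvmllib "example.com/yourmodule/nvml" // hypothetical import path of this package
)

// listProcesses prints the PID and GPU memory use of each running process.
func listProcesses(uuid string, dev device.Device) error {
	procs, err := nvmllib.GetProcesses(uuid, dev)
	if err != nil {
		return err
	}
	for _, p := range procs.RunningProcesses {
		fmt.Printf("%s pid=%d gpu_mem=%s\n", uuid, p.PID, p.GPUUsedMemoryBytesHumanized)
	}
	return nil
}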

type Temperature

type Temperature struct {
	// Represents the GPU UUID.
	UUID string `json:"uuid"`

	// Represents the GPU ID.
	GPUID uint32 `json:"gpu_id"`

	CurrentCelsiusGPUCore uint32 `json:"current_celsius_gpu_core"`

	// Threshold at which the GPU starts to shut down to prevent hardware damage.
	ThresholdCelsiusShutdown uint32 `json:"threshold_celsius_shutdown"`
	// Threshold at which the GPU starts to throttle its performance.
	ThresholdCelsiusSlowdown uint32 `json:"threshold_celsius_slowdown"`
	// Maximum safe operating temperature for the GPU's memory.
	ThresholdCelsiusMemMax uint32 `json:"threshold_celsius_mem_max"`
	// Maximum safe operating temperature for the GPU core.
	ThresholdCelsiusGPUMax uint32 `json:"threshold_celsius_gpu_max"`

	UsedPercentShutdown string `json:"used_percent_shutdown"`
	UsedPercentSlowdown string `json:"used_percent_slowdown"`
	UsedPercentMemMax   string `json:"used_percent_mem_max"`
	UsedPercentGPUMax   string `json:"used_percent_gpu_max"`
}

func GetTemperature

func GetTemperature(uuid string, dev device.Device) (Temperature, error)

func (Temperature) GetUsedPercentGPUMax

func (temp Temperature) GetUsedPercentGPUMax() (float64, error)

func (Temperature) GetUsedPercentMemMax

func (temp Temperature) GetUsedPercentMemMax() (float64, error)

func (Temperature) GetUsedPercentShutdown

func (temp Temperature) GetUsedPercentShutdown() (float64, error)

func (Temperature) GetUsedPercentSlowdown

func (temp Temperature) GetUsedPercentSlowdown() (float64, error)
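
A minimal sketch comparing the current core temperature against the slowdown threshold, under the same import-path assumptions:

package nvmlexample

import (
	"fmt"

	"github.com/NVIDIA/go-nvlib/pkg/nvlib/device" // assumed source of device.Device

	nvmllib "example.com/yourmodule/nvml" // hypothetical import path of this package
)

// reportTemperature prints the core temperature relative to the slowdown threshold.
func reportTemperature(uuid string, dev device.Device) error {
	temp, err := nvmllib.GetTemperature(uuid, dev)
	if err != nil {
		return err
	}
	pct, err := temp.GetUsedPercentSlowdown()
	if err != nil {
		return err
	}
	fmt.Printf("%s core: %d C (slowdown at %d C, %.1f%% of threshold)\n",
		uuid, temp.CurrentCelsiusGPUCore, temp.ThresholdCelsiusSlowdown, pct)
	return nil
}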

type Utilization

type Utilization struct {
	// Represents the GPU UUID.
	UUID string `json:"uuid"`

	// Percent of time over the past sample period during which one or more kernels was executing on the GPU.
	GPUUsedPercent uint32 `json:"gpu_used_percent"`
	// Percent of time over the past sample period during which global (device) memory was being read or written.
	MemoryUsedPercent uint32 `json:"memory_used_percent"`
}

Utilization represents the utilization information for a device, from the nvmlDeviceGetUtilizationRates API. Each sample period may be between 1 second and 1/6 second, depending on the product being queried. ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g540824faa6cef45500e0d1dc2f50b321 ref. https://docs.nvidia.com/deploy/nvml-api/structnvmlUtilization__t.html#structnvmlUtilization__t c.f., "DCGM_FI_PROF_GR_ENGINE_ACTIVE" https://docs.nvidia.com/datacenter/dcgm/1.7/dcgm-api/group__dcgmFieldIdentifiers.html#group__dcgmFieldIdentifiers_1g5a93634d6e8574ab6af4bfab102709dc

func GetUtilization

func GetUtilization(uuid string, dev device.Device) (Utilization, error)
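
A minimal sketch printing GPU and memory utilization, under the same import-path assumptions:

package nvmlexample

import (
	"fmt"

	"github.com/NVIDIA/go-nvlib/pkg/nvlib/device" // assumed source of device.Device

	nvmllib "example.com/yourmodule/nvml" // hypothetical import path of this package
)

// reportUtilization prints the GPU and memory utilization percentages.
func reportUtilization(uuid string, dev device.Device) error {
	util, err := nvmllib.GetUtilization(uuid, dev)
	if err != nil {
		return err
	}
	fmt.Printf("%s gpu=%d%% mem=%d%%\n", uuid, util.GPUUsedPercent, util.MemoryUsedPercent)
	return nil
}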

type XidEvent

type XidEvent struct {
	// Time is the time the metrics were collected.
	Time metav1.Time `json:"time"`
	// The duration of the sample.
	SampleDuration metav1.Duration `json:"sample_duration"`

	EventType uint64 `json:"event_type"`

	Xid              uint64 `json:"xid"`
	XidCriticalError bool   `json:"xid_critical_error"`

	Detail *nvidia_query_xid.Detail `json:"detail,omitempty"`

	Message string `json:"message,omitempty"`

	// Set if any error happens during NVML calls.
	Error error `json:"error,omitempty"`
}

func (*XidEvent) YAML

func (ev *XidEvent) YAML() ([]byte, error)
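
A minimal sketch of consuming Xid events from an Instance and surfacing critical ones as YAML; the import path of this package is an assumption:

package nvmlexample

import (
	"fmt"

	nvmllib "example.com/yourmodule/nvml" // hypothetical import path of this package
)

// logCriticalXids prints critical Xid events as YAML until the channel is closed.
func logCriticalXids(instance nvmllib.Instance) error {
	for ev := range instance.RecvXidEvents() {
		if ev.Error != nil {
			return ev.Error
		}
		if !ev.XidCriticalError {
			continue
		}
		out, err := ev.YAML()
		if err != nil {
			return err
		}
		fmt.Printf("critical xid %d:\n%s\n", ev.Xid, out)
	}
	return nil
}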
