query

package

v0.3.0 Latest Latest Go to latest Published: Nov 30, 2024 License: Apache-2.0 Imports: 41 Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/leptonai/gpud

Links

Open Source Insights

Documentation ¶

Overview ¶

Package query implements "nvidia-smi --query" output helpers.

Package query implements various NVIDIA-related system queries. All interactions with NVIDIA data sources are implemented under the query packages.

Index ¶

Constants
func CheckFabricManagerActive(ctx context.Context, conn *systemd.DbusConn) (bool, error)
func CheckFabricManagerVersion(ctx context.Context) (string, error)
func CountAllDevicesFromDevDir() (int, error)
func CreateGet(db *sql.DB) query.GetFunc
func FabricManagerExists() bool
func FindSummaryErr(s string) []string
func GPUsInstalled(ctx context.Context) (bool, error)
func Get(ctx context.Context, db *sql.DB) (output any, err error)
func GetDefaultPoller() query.Poller
func GetLatestFabricManagerOutput(ctx context.Context) (string, error)
func GetSuccessOnce() <-chan any
func IsErrDeviceHandleUnknownError(err error) bool
func ListNVIDIAPCIs(ctx context.Context) ([]string, error)
func LoadGPUDeviceName(ctx context.Context) (string, error)
func PersistencedExists() bool
func PersistencedRunning() bool
func RunSMI(ctx context.Context, args ...string) ([]byte, error)
func SMIExists() bool
func SetDefaultPoller(db *sql.DB)
type FabricManagerOutput
type MemoryErrorManagementCapabilities
- func GetMemoryErrorManagementCapabilities(gpuProductName string) MemoryErrorManagementCapabilities
type NvidiaSMIGPU
- func (g NvidiaSMIGPU) FindAddressingModeErr() error
- func (g NvidiaSMIGPU) FindErrs() []string
- func (g NvidiaSMIGPU) FindHWSlowdownErrs() []string
- func (gpu *NvidiaSMIGPU) GetSMIGPUPersistenceMode() SMIGPUPersistenceMode
type Output
- func (o *Output) GPUCount() int
- func (o *Output) GPUCountFromNVML() int
- func (o *Output) GPUProductName() string
- func (o *Output) GPUProductNameFromNVML() string
- func (o *Output) PrintInfo(debug bool)
- func (o *Output) YAML() ([]byte, error)
type ParsedMemoryUsage
- func (u ParsedMemoryUsage) GetUsedPercent() (float64, error)
type ParsedSMIPowerReading
- func (pr ParsedSMIPowerReading) GetCurrentPowerLimitW() (float64, error)
- func (pr ParsedSMIPowerReading) GetDefaultPowerLimitW() (float64, error)
- func (pr ParsedSMIPowerReading) GetMaxPowerLimitW() (float64, error)
- func (pr ParsedSMIPowerReading) GetMinPowerLimitW() (float64, error)
- func (pr ParsedSMIPowerReading) GetPowerDrawW() (float64, error)
- func (pr ParsedSMIPowerReading) GetRequestedPowerLimitW() (float64, error)
- func (pr ParsedSMIPowerReading) GetUsedPercent() (float64, error)
type ParsedSMIRemappedRows
- func (rw ParsedSMIRemappedRows) GetRemappedDueToCorrectableError() (int64, error)
- func (rw ParsedSMIRemappedRows) GetRemappedDueToUncorrectableError() (int64, error)
- func (rw ParsedSMIRemappedRows) GetRemappingFailured() (bool, error)
- func (rw ParsedSMIRemappedRows) GetRemappingPending() (bool, error)
- func (rw ParsedSMIRemappedRows) QualifiesForRMA() (bool, error)
- func (rw ParsedSMIRemappedRows) RequiresReset() (bool, error)
type ParsedTemperature
- func (temp ParsedTemperature) GetCurrentCelsius() (float64, error)
- func (temp ParsedTemperature) GetLimitCelsius() (float64, error)
- func (temp ParsedTemperature) GetShutdownCelsius() (float64, error)
- func (temp ParsedTemperature) GetSlowdownCelsius() (float64, error)
- func (temp ParsedTemperature) GetUsedPercent() (float64, error)
type SMIClockEventReasons
type SMIECCErrorAggregate
type SMIECCErrorAggregateUncorrectableSRAMSources
type SMIECCErrorVolatile
type SMIECCErrors
- func (eccErrs SMIECCErrors) FindVolatileUncorrectableErrs() []string
type SMIECCMode
type SMIFBMemoryUsage
- func (f *SMIFBMemoryUsage) GetFreeBytes() (uint64, error)
- func (f *SMIFBMemoryUsage) GetReservedBytes() (uint64, error)
- func (f *SMIFBMemoryUsage) GetTotalBytes() (uint64, error)
- func (f *SMIFBMemoryUsage) GetUsedBytes() (uint64, error)
- func (f *SMIFBMemoryUsage) Parse() (ParsedMemoryUsage, error)
type SMIGPUPersistenceMode
type SMIGPUPowerReadings
- func (g *SMIGPUPowerReadings) GetCurrentPowerLimitW() (float64, error)
- func (g *SMIGPUPowerReadings) GetDefaultPowerLimitW() (float64, error)
- func (g *SMIGPUPowerReadings) GetMaxPowerLimitW() (float64, error)
- func (g *SMIGPUPowerReadings) GetMinPowerLimitW() (float64, error)
- func (g *SMIGPUPowerReadings) GetPowerDrawW() (float64, error)
- func (g *SMIGPUPowerReadings) GetRequestedPowerLimitW() (float64, error)
- func (g *SMIGPUPowerReadings) Parse() (ParsedSMIPowerReading, error)
type SMIGPUResetStatus
type SMIGPUTemperature
- func (tm *SMIGPUTemperature) GetCurrentCelsius() (float64, error)
- func (tm *SMIGPUTemperature) GetLimitCelsius() (float64, error)
- func (tm *SMIGPUTemperature) GetShutdownCelsius() (float64, error)
- func (tm *SMIGPUTemperature) GetShutdownLimitCelsius() (float64, error)
- func (tm *SMIGPUTemperature) GetSlowdownCelsius() (float64, error)
- func (tm *SMIGPUTemperature) GetSlowdownLimitCelsius() (float64, error)
- func (tm *SMIGPUTemperature) Parse() (ParsedTemperature, error)
type SMIOutput
- func GetSMIOutput(ctx context.Context) (*SMIOutput, error)
- func ParseSMIQueryOutput(b []byte) (*SMIOutput, error)
- func (o *SMIOutput) FindGPUErrs() []string
- func (o *SMIOutput) FindHWSlowdownErrs() []string
- func (o *SMIOutput) JSON() ([]byte, error)
- func (o *SMIOutput) YAML() ([]byte, error)
type SMIProcesses
type SMIRemappedRows
- func (rw *SMIRemappedRows) Parse() (ParsedSMIRemappedRows, error)

Constants ¶

View Source

const (
	ClockEventsActive    = "Active"
	ClockEventsNotActive = "Not Active"
)

View Source

const (
	StateKeyGPUProductName      = "gpu_product_name"
	StateKeySMIExists           = "smi_exists"
	StateKeyFabricManagerExists = "fabric_manager_exists"
	StateKeyIbstatExists        = "ibstat_exists"
)

View Source

const AddressingModeNone = "None"

Variables ¶

This section is empty.

Functions ¶

func CheckFabricManagerActive ¶

func CheckFabricManagerActive(ctx context.Context, conn *systemd.DbusConn) (bool, error)

func CheckFabricManagerVersion ¶

func CheckFabricManagerVersion(ctx context.Context) (string, error)

func CountAllDevicesFromDevDir ¶ added in v0.0.5

func CountAllDevicesFromDevDir() (int, error)

func CreateGet ¶ added in v0.1.8

func CreateGet(db *sql.DB) query.GetFunc

func FabricManagerExists ¶

func FabricManagerExists() bool

func FindSummaryErr ¶

func FindSummaryErr(s string) []string

ref. https://forums.developer.nvidia.com/t/nvidia-smi-q-shows-several-unknown-error-gpu-ignored-by-pytorch/263881/2

func GPUsInstalled ¶ added in v0.1.0

func GPUsInstalled(ctx context.Context) (bool, error)

Returns true if the local machine has NVIDIA GPUs installed.

func Get ¶

func Get(ctx context.Context, db *sql.DB) (output any, err error)

Get all nvidia component queries.

func GetDefaultPoller ¶ added in v0.1.8

func GetDefaultPoller() query.Poller

func GetLatestFabricManagerOutput ¶ added in v0.0.2

func GetLatestFabricManagerOutput(ctx context.Context) (string, error)

Returns the latest fabric manager output using journalctl. Equivalent to "journalctl -xeu nvidia-fabricmanager.service --no-pager".

func GetSuccessOnce ¶

func GetSuccessOnce() <-chan any

func IsErrDeviceHandleUnknownError ¶ added in v0.2.4

func IsErrDeviceHandleUnknownError(err error) bool

"NVIDIA Xid 79: GPU has fallen off the bus" may fail this syscall with: "error getting device handle for index '6': Unknown Error"

or "Unable to determine the device handle for GPU0000:CB:00.0: Unknown Error"

func ListNVIDIAPCIs ¶ added in v0.1.0

func ListNVIDIAPCIs(ctx context.Context) ([]string, error)

Lists all PCI devices that are compatible with NVIDIA.

func LoadGPUDeviceName ¶ added in v0.1.0

func LoadGPUDeviceName(ctx context.Context) (string, error)

Loads the product name of the NVIDIA GPU device.

func PersistencedExists ¶ added in v0.0.5

func PersistencedExists() bool

Returns true if the local machine has "nvidia-persistenced". ref. https://docs.nvidia.com/deploy/driver-persistence/index.html#usage

func PersistencedRunning ¶ added in v0.0.5

func PersistencedRunning() bool

"pidof nvidia-persistenced"

func RunSMI ¶

func RunSMI(ctx context.Context, args ...string) ([]byte, error)

func SMIExists ¶

func SMIExists() bool

Returns true if the local machine runs on Nvidia GPU by running "nvidia-smi".

func SetDefaultPoller ¶ added in v0.1.8

func SetDefaultPoller(db *sql.DB)

only set once since it relies on the kube client and specific port

Types ¶

type FabricManagerOutput ¶

type FabricManagerOutput struct {
	Version string `json:"version"`
	// Set true if the "nvidia-fabricmanager" systemd service is active.
	Active bool `json:"active"`

	// Stores the output of "journalctl -xeu nvidia-fabricmanager.service --no-pager".
	// Useful for debugging fabric manager fails to start (e.g., "Error: Fabric Manager already running with pid 7388").
	JournalOutput string `json:"journal_output,omitempty"`
}

type MemoryErrorManagementCapabilities ¶ added in v0.0.4

type MemoryErrorManagementCapabilities struct {
	// (If supported) GPU can limit the impact of uncorrectable ECC errors to GPU applications.
	// Existing/new workloads will run unaffected, both in terms of accuracy and performance.
	// Thus, does not require a GPU reset when memory errors occur.
	//
	// Note thtat there are some rarer cases, where uncorrectable errors are still uncontained
	// thus impacting all other workloads being procssed in the GPU.
	//
	// ref. https://docs.nvidia.com/deploy/a100-gpu-mem-error-mgmt/index.html#error-containments
	ErrorContainment bool `json:"error_containment"`

	// (If supported) GPU can dynamically mark the page containing uncorrectable errors
	// as unusable, and any existing or new workloads will not be allocating this page.
	//
	// Thus, does not require a GPU reset to recover from most uncorrectable ECC errors.
	//
	// ref. https://docs.nvidia.com/deploy/a100-gpu-mem-error-mgmt/index.html#dynamic-page-offlining
	DynamicPageOfflining bool `json:"dynamic_page_offlining"`

	// (If supported) GPU can replace degrading memory cells with spare ones
	// to avoid offlining regions of memory. And the row remapping is different
	// from dynamic page offlining which is fixed at a hardware level.
	//
	// The row remapping requires a GPU reset to take effect.
	//
	// ref. https://docs.nvidia.com/deploy/a100-gpu-mem-error-mgmt/index.html#row-remapping
	RowRemapping bool `json:"row_remapping"`
}

Contains information about the GPU's memory error management capabilities. ref. https://docs.nvidia.com/deploy/a100-gpu-mem-error-mgmt/index.html#supported-gpus

func GetMemoryErrorManagementCapabilities ¶ added in v0.0.4

func GetMemoryErrorManagementCapabilities(gpuProductName string) MemoryErrorManagementCapabilities

GetMemoryErrorManagementCapabilities returns the GPU memory error management capabilities based on the GPU product name. ref. https://docs.nvidia.com/deploy/a100-gpu-mem-error-mgmt/index.html#supported-gpus

type NvidiaSMIGPU ¶

type NvidiaSMIGPU struct {
	// The original GPU identifier from the nvidia-smi query output.
	// e.g., "GPU 00000000:53:00.0"
	ID string `json:"ID"`

	ProductName         string `json:"Product Name"`
	ProductBrand        string `json:"Product Brand"`
	ProductArchitecture string `json:"Product Architecture"`

	PersistenceMode string `json:"Persistence Mode"`
	AddressingMode  string `json:"Addressing Mode"`

	GPUResetStatus    *SMIGPUResetStatus    `json:"GPU Reset Status,omitempty"`
	ClockEventReasons *SMIClockEventReasons `json:"Clocks Event Reasons,omitempty"`

	ECCMode      *SMIECCMode      `json:"ECC Mode,omitempty"`
	ECCErrors    *SMIECCErrors    `json:"ECC Errors,omitempty"`
	RemappedRows *SMIRemappedRows `json:"Remapped Rows,omitempty"`

	Temperature      *SMIGPUTemperature   `json:"Temperature,omitempty"`
	GPUPowerReadings *SMIGPUPowerReadings `json:"GPU Power Readings,omitempty"`
	Processes        *SMIProcesses        `json:"Processes,omitempty"`
	FBMemoryUsage    *SMIFBMemoryUsage    `json:"FB Memory Usage"`

	FanSpeed string `json:"Fan Speed"`
}

GPU object from the nvidia-smi query. ref. "nvidia-smi --help-query-gpu"

func (NvidiaSMIGPU) FindAddressingModeErr ¶

func (g NvidiaSMIGPU) FindAddressingModeErr() error

Returns the Address Mode error if any of the GPU has "Unknown Error" Addressing Mode. It may indicate Xid 31 "GPU memory page fault", where the application crashes with: e.g., RuntimeError: CUDA unknown error - this may be due to an incorrectly set up environment, e.g. changing env variable CUDA_VISIBLE_DEVICES after program start. Setting the available devices to be zero.

func (NvidiaSMIGPU) FindErrs ¶

func (g NvidiaSMIGPU) FindErrs() []string

Returns true if the GPU has any errors. ref. https://forums.developer.nvidia.com/t/nvidia-smi-q-shows-several-unknown-error-gpu-ignored-by-pytorch/263881

func (NvidiaSMIGPU) FindHWSlowdownErrs ¶

func (g NvidiaSMIGPU) FindHWSlowdownErrs() []string

func (*NvidiaSMIGPU) GetSMIGPUPersistenceMode ¶ added in v0.0.5

func (gpu *NvidiaSMIGPU) GetSMIGPUPersistenceMode() SMIGPUPersistenceMode

type Output ¶

type Output struct {
	// GPU device count from the /dev directory.
	GPUDeviceCount int `json:"gpu_device_count"`

	// BadEnvVarsForCUDA is a map of environment variables that are known to hurt CUDA.
	// that is set globally for the host.
	// This implements "DCGM_FR_BAD_CUDA_ENV" logic in DCGM.
	BadEnvVarsForCUDA map[string]string `json:"bad_env_vars_for_cuda,omitempty"`

	PersistencedExists  bool `json:"persistenced_exists"`
	PersistencedRunning bool `json:"persistenced_running"`

	FabricManagerExists bool                 `json:"fabric_manager_exists"`
	FabricManager       *FabricManagerOutput `json:"fabric_manager,omitempty"`
	FabricManagerErrors []string             `json:"fabric_manager_errors,omitempty"`

	InfinibandClassExists bool                     `json:"infiniband_class_exists"`
	IbstatExists          bool                     `json:"ibstat_exists"`
	Ibstat                *infiniband.IbstatOutput `json:"ibstat,omitempty"`

	LsmodPeermem       *peermem.LsmodPeermemModuleOutput `json:"lsmod_peermem,omitempty"`
	LsmodPeermemErrors []string                          `json:"lsmod_peermem_errors,omitempty"`

	NVML       *nvml.Output `json:"nvml,omitempty"`
	NVMLErrors []string     `json:"nvml_errors,omitempty"`

	MemoryErrorManagementCapabilities MemoryErrorManagementCapabilities `json:"memory_error_management_capabilities,omitempty"`

	// at some point, we will deprecate "nvidia-smi" parsing
	// as the NVML API provides all the data we need
	SMIExists      bool       `json:"smi_exists"`
	SMI            *SMIOutput `json:"smi,omitempty"`
	SMIQueryErrors []string   `json:"smi_query_errors,omitempty"`
}

func (*Output) GPUCount ¶ added in v0.0.4

func (o *Output) GPUCount() int

func (*Output) GPUCountFromNVML ¶ added in v0.0.5

func (o *Output) GPUCountFromNVML() int

func (*Output) GPUProductName ¶ added in v0.0.3

func (o *Output) GPUProductName() string

func (*Output) GPUProductNameFromNVML ¶ added in v0.0.5

func (o *Output) GPUProductNameFromNVML() string

This is the same product name in nvidia-smi outputs. ref. https://developer.nvidia.com/management-library-nvml

func (*Output) PrintInfo ¶

func (o *Output) PrintInfo(debug bool)

func (*Output) YAML ¶

func (o *Output) YAML() ([]byte, error)

type ParsedMemoryUsage ¶

type ParsedMemoryUsage struct {
	ID string `json:"id"`

	TotalBytes     uint64 `json:"total_bytes"`
	TotalHumanized string `json:"total_humanized"`

	ReservedBytes     uint64 `json:"reserved_bytes"`
	ReservedHumanized string `json:"reserved_humanized"`

	UsedBytes     uint64 `json:"used_bytes"`
	UsedHumanized string `json:"used_humanized"`

	UsedPercent string `json:"used_percent"`

	FreeBytes     uint64 `json:"free_bytes"`
	FreeHumanized string `json:"free_humanized"`
}

func (ParsedMemoryUsage) GetUsedPercent ¶

func (u ParsedMemoryUsage) GetUsedPercent() (float64, error)

type ParsedSMIPowerReading ¶

type ParsedSMIPowerReading struct {
	ID string `json:"id"`

	PowerDrawW         string `json:"power_draw_w"`
	PowerDrawHumanized string `json:"power_draw_humanized"`

	CurrentPowerLimitW         string `json:"current_power_limit_w"`
	CurrentPowerLimitHumanized string `json:"current_power_limit_humanized"`

	UsedPercent string `json:"used_percent"`

	RequestedPowerLimitW         string `json:"requested_power_limit_w"`
	RequestedPowerLimitHumanized string `json:"requested_power_limit_humanized"`

	DefaultPowerLimitW         string `json:"default_power_limit_w"`
	DefaultPowerLimitHumanized string `json:"default_power_limit_humanized"`

	MinPowerLimitW         string `json:"min_power_limit_w"`
	MinPowerLimitHumanized string `json:"min_power_limit_humanized"`

	MaxPowerLimitW         string `json:"max_power_limit_w"`
	MaxPowerLimitHumanized string `json:"max_power_limit_humanized"`
}

func (ParsedSMIPowerReading) GetCurrentPowerLimitW ¶

func (pr ParsedSMIPowerReading) GetCurrentPowerLimitW() (float64, error)

func (ParsedSMIPowerReading) GetDefaultPowerLimitW ¶

func (pr ParsedSMIPowerReading) GetDefaultPowerLimitW() (float64, error)

func (ParsedSMIPowerReading) GetMaxPowerLimitW ¶

func (pr ParsedSMIPowerReading) GetMaxPowerLimitW() (float64, error)

func (ParsedSMIPowerReading) GetMinPowerLimitW ¶

func (pr ParsedSMIPowerReading) GetMinPowerLimitW() (float64, error)

func (ParsedSMIPowerReading) GetPowerDrawW ¶

func (pr ParsedSMIPowerReading) GetPowerDrawW() (float64, error)

func (ParsedSMIPowerReading) GetRequestedPowerLimitW ¶

func (pr ParsedSMIPowerReading) GetRequestedPowerLimitW() (float64, error)

func (ParsedSMIPowerReading) GetUsedPercent ¶

func (pr ParsedSMIPowerReading) GetUsedPercent() (float64, error)

type ParsedSMIRemappedRows ¶ added in v0.0.4

type ParsedSMIRemappedRows struct {
	ID string `json:"id"`

	RemappedDueToCorrectableErrors   string `json:"remapped_due_to_correctable_errors"`
	RemappedDueToUncorrectableErrors string `json:"remapped_due_to_uncorrectable_errors"`
	RemappingPending                 string `json:"remapping_pending"`
	RemappingFailed                  string `json:"remapping_failed"`
}

func (ParsedSMIRemappedRows) GetRemappedDueToCorrectableError ¶ added in v0.0.4

func (rw ParsedSMIRemappedRows) GetRemappedDueToCorrectableError() (int64, error)

func (ParsedSMIRemappedRows) GetRemappedDueToUncorrectableError ¶ added in v0.0.4

func (rw ParsedSMIRemappedRows) GetRemappedDueToUncorrectableError() (int64, error)

func (ParsedSMIRemappedRows) GetRemappingFailured ¶ added in v0.0.4

func (rw ParsedSMIRemappedRows) GetRemappingFailured() (bool, error)

func (ParsedSMIRemappedRows) GetRemappingPending ¶ added in v0.0.4

func (rw ParsedSMIRemappedRows) GetRemappingPending() (bool, error)

func (ParsedSMIRemappedRows) QualifiesForRMA ¶ added in v0.0.4

func (rw ParsedSMIRemappedRows) QualifiesForRMA() (bool, error)

Returns true if a GPU qualifies for RMA. ref. https://docs.nvidia.com/deploy/a100-gpu-mem-error-mgmt/index.html#rma-policy-thresholds-for-row-remapping

func (ParsedSMIRemappedRows) RequiresReset ¶ added in v0.0.4

func (rw ParsedSMIRemappedRows) RequiresReset() (bool, error)

type ParsedTemperature ¶

type ParsedTemperature struct {
	ID string `json:"id"`

	CurrentHumanized string `json:"current_humanized"`
	CurrentCelsius   string `json:"current_celsius"`

	LimitHumanized string `json:"limit_humanized"`
	LimitCelsius   string `json:"limit_celsius"`

	UsedPercent string `json:"used_percent"`

	ShutdownHumanized string `json:"shutdown_humanized"`
	ShutdownLimit     string `json:"shutdown_limit"`
	ShutdownCelsius   string `json:"shutdown_celsius"`

	SlowdownHumanized string `json:"slowdown_humanized"`
	SlowdownLimit     string `json:"slowdown_limit"`
	SlowdownCelsius   string `json:"slowdown_celsius"`

	MaxOperatingLimit string `json:"max_operating_limit"`

	Target                  string `json:"target"`
	MemoryCurrent           string `json:"memory_current"`
	MemoryMaxOperatingLimit string `json:"memory_max_operating_limit"`
}

func (ParsedTemperature) GetCurrentCelsius ¶

func (temp ParsedTemperature) GetCurrentCelsius() (float64, error)

func (ParsedTemperature) GetLimitCelsius ¶

func (temp ParsedTemperature) GetLimitCelsius() (float64, error)

func (ParsedTemperature) GetShutdownCelsius ¶

func (temp ParsedTemperature) GetShutdownCelsius() (float64, error)

func (ParsedTemperature) GetSlowdownCelsius ¶

func (temp ParsedTemperature) GetSlowdownCelsius() (float64, error)

func (ParsedTemperature) GetUsedPercent ¶

func (temp ParsedTemperature) GetUsedPercent() (float64, error)

type SMIClockEventReasons ¶

type SMIClockEventReasons struct {
	SWPowerCap           string `json:"SW Power Cap"`
	SWThermalSlowdown    string `json:"SW Thermal Slowdown"`
	HWSlowdown           string `json:"HW Slowdown"`
	HWThermalSlowdown    string `json:"HW Thermal Slowdown"`
	HWPowerBrakeSlowdown string `json:"HW Power Brake Slowdown"`
}

type SMIECCErrorAggregate ¶

type SMIECCErrorAggregate struct {
	DRAMCorrectable   string `json:"DRAM Correctable"`
	DRAMUncorrectable string `json:"DRAM Uncorrectable"`

	SRAMCorrectable       string `json:"SRAM Correctable"`
	SRAMThresholdExceeded string `json:"SRAM Threshold Exceeded"`

	SRAMUncorrectable       string `json:"SRAM Uncorrectable"`
	SRAMUncorrectableParity string `json:"SRAM Uncorrectable Parity"`  // for newer driver versions
	SRAMUncorrectableSECDED string `json:"SRAM Uncorrectable SEC-DED"` // for newer driver versions
}

type SMIECCErrorAggregateUncorrectableSRAMSources ¶

type SMIECCErrorAggregateUncorrectableSRAMSources struct {
	SRAML2              string `json:"SRAM L2"`
	SRAMMicrocontroller string `json:"SRAM Microcontroller"`
	SRAMOther           string `json:"SRAM Other"`
	SRAMPCIE            string `json:"SRAM PCIE"`
	SRAMSM              string `json:"SRAM SM"`
}

type SMIECCErrorVolatile ¶

type SMIECCErrorVolatile struct {
	DRAMCorrectable   string `json:"DRAM Correctable"`
	DRAMUncorrectable string `json:"DRAM Uncorrectable"`

	SRAMCorrectable   string `json:"SRAM Correctable"`
	SRAMUncorrectable string `json:"SRAM Uncorrectable"`

	SRAMUncorrectableParity string `json:"SRAM Uncorrectable Parity"`  // for newer driver versions
	SRAMUncorrectableSECDED string `json:"SRAM Uncorrectable SEC-DED"` // for newer driver versions
}

type SMIECCErrors ¶

type SMIECCErrors struct {
	ID string `json:"id"`

	Aggregate                         *SMIECCErrorAggregate                         `json:"Aggregate,omitempty"`
	AggregateUncorrectableSRAMSources *SMIECCErrorAggregateUncorrectableSRAMSources `json:"Aggregate Uncorrectable SRAM Sources,omitempty"`
	Volatile                          *SMIECCErrorVolatile                          `json:"Volatile,omitempty"`
}

func (SMIECCErrors) FindVolatileUncorrectableErrs ¶

func (eccErrs SMIECCErrors) FindVolatileUncorrectableErrs() []string

type SMIECCMode ¶ added in v0.0.4

type SMIECCMode struct {
	Current string `json:"Current"`
	Pending string `json:"Pending"`
}

type SMIFBMemoryUsage ¶

type SMIFBMemoryUsage struct {
	ID string `json:"id"`

	Total    string `json:"Total"`
	Reserved string `json:"Reserved"`
	Used     string `json:"Used"`
	Free     string `json:"Free"`
}

func (*SMIFBMemoryUsage) GetFreeBytes ¶

func (f *SMIFBMemoryUsage) GetFreeBytes() (uint64, error)

func (*SMIFBMemoryUsage) GetReservedBytes ¶

func (f *SMIFBMemoryUsage) GetReservedBytes() (uint64, error)

func (*SMIFBMemoryUsage) GetTotalBytes ¶

func (f *SMIFBMemoryUsage) GetTotalBytes() (uint64, error)

func (*SMIFBMemoryUsage) GetUsedBytes ¶

func (f *SMIFBMemoryUsage) GetUsedBytes() (uint64, error)

func (*SMIFBMemoryUsage) Parse ¶

func (f *SMIFBMemoryUsage) Parse() (ParsedMemoryUsage, error)

type SMIGPUPersistenceMode ¶ added in v0.0.5

type SMIGPUPersistenceMode struct {
	ID      string `json:"id"`
	Enabled bool   `json:"enabled"`
}

type SMIGPUPowerReadings ¶

type SMIGPUPowerReadings struct {
	ID string `json:"id"`

	PowerDraw           string `json:"Power Draw"`
	CurrentPowerLimit   string `json:"Current Power Limit"`
	RequestedPowerLimit string `json:"Requested Power Limit"`
	DefaultPowerLimit   string `json:"Default Power Limit"`
	MinPowerLimit       string `json:"Min Power Limit"`
	MaxPowerLimit       string `json:"Max Power Limit"`
}

func (*SMIGPUPowerReadings) GetCurrentPowerLimitW ¶

func (g *SMIGPUPowerReadings) GetCurrentPowerLimitW() (float64, error)

func (*SMIGPUPowerReadings) GetDefaultPowerLimitW ¶

func (g *SMIGPUPowerReadings) GetDefaultPowerLimitW() (float64, error)

func (*SMIGPUPowerReadings) GetMaxPowerLimitW ¶

func (g *SMIGPUPowerReadings) GetMaxPowerLimitW() (float64, error)

func (*SMIGPUPowerReadings) GetMinPowerLimitW ¶

func (g *SMIGPUPowerReadings) GetMinPowerLimitW() (float64, error)

func (*SMIGPUPowerReadings) GetPowerDrawW ¶

func (g *SMIGPUPowerReadings) GetPowerDrawW() (float64, error)

func (*SMIGPUPowerReadings) GetRequestedPowerLimitW ¶

func (g *SMIGPUPowerReadings) GetRequestedPowerLimitW() (float64, error)

func (*SMIGPUPowerReadings) Parse ¶

func (g *SMIGPUPowerReadings) Parse() (ParsedSMIPowerReading, error)

type SMIGPUResetStatus ¶

type SMIGPUResetStatus struct {
	ResetRequired            string `json:"Reset Required"`
	DrainAndResetRecommended string `json:"Drain and Reset Recommended"`
}

type SMIGPUTemperature ¶

type SMIGPUTemperature struct {
	ID string `json:"id"`

	Current string `json:"GPU Current Temp"`
	Limit   string `json:"GPU T.Limit Temp"`

	// Shutdown limit for older drivers (e.g., 535.129.03).
	Shutdown      string `json:"GPU Shutdown Temp"`
	ShutdownLimit string `json:"GPU Shutdown T.Limit Temp"`

	// Slowdown limit for older drivers (e.g., 535.129.03).
	Slowdown      string `json:"GPU Slowdown Temp"`
	SlowdownLimit string `json:"GPU Slowdown T.Limit Temp"`

	MaxOperatingLimit string `json:"GPU Max Operating T.Limit Temp"`

	// this value is not reliable to monitor as it's often N/A
	Target string `json:"GPU Target Temperature"`

	MemoryCurrent           string `json:"Memory Current Temp"`
	MemoryMaxOperatingLimit string `json:"Memory Max Operating T.Limit Temp"`
}

If any field shows "Unknown Error", it means GPU has some issues.

func (*SMIGPUTemperature) GetCurrentCelsius ¶

func (tm *SMIGPUTemperature) GetCurrentCelsius() (float64, error)

func (*SMIGPUTemperature) GetLimitCelsius ¶

func (tm *SMIGPUTemperature) GetLimitCelsius() (float64, error)

func (*SMIGPUTemperature) GetShutdownCelsius ¶

func (tm *SMIGPUTemperature) GetShutdownCelsius() (float64, error)

func (*SMIGPUTemperature) GetShutdownLimitCelsius ¶

func (tm *SMIGPUTemperature) GetShutdownLimitCelsius() (float64, error)

func (*SMIGPUTemperature) GetSlowdownCelsius ¶

func (tm *SMIGPUTemperature) GetSlowdownCelsius() (float64, error)

func (*SMIGPUTemperature) GetSlowdownLimitCelsius ¶

func (tm *SMIGPUTemperature) GetSlowdownLimitCelsius() (float64, error)

func (*SMIGPUTemperature) Parse ¶

func (tm *SMIGPUTemperature) Parse() (ParsedTemperature, error)

type SMIOutput ¶

type SMIOutput struct {
	Timestamp     string `json:"timestamp"`
	DriverVersion string `json:"driver_version"`
	CUDAVersion   string `json:"cuda_version"`
	AttachedGPUs  int    `json:"attached_gpus"`

	GPUs []NvidiaSMIGPU `json:"gpus,omitempty"`

	// Raw is the raw output of "nvidia-smi --query".
	// Useful for debugging.
	Raw string `json:"raw,omitempty"`

	// Summary is the "nvidia-smi" output without "--query" flag.
	// Useful for error detecting, in case the new nvidia-smi
	// version introduces breaking changes to its query output.
	Summary string `json:"summary,omitempty"`

	// Only set if "nvidia-smi" failed to run.
	SummaryFailure error `json:"summary_failure,omitempty"`
}

Represents the current nvidia status using "nvidia-smi --query", "nvidia-smi", etc.. ref. "nvidia-smi --help-query-gpu"

func GetSMIOutput ¶

func GetSMIOutput(ctx context.Context) (*SMIOutput, error)

Make sure to call this with a timeout, as a broken GPU may block the command. e.g., nvAssertOkFailedNoLog: Assertion failed: Call timed out [NV_ERR_TIMEOUT] (0x00000065) returned from pRmApi->Control(pRmApi, RES_GET_CLIENT_HANDLE(pKernelChannel), RES_GET_HANDLE(pKernelChannel),

func ParseSMIQueryOutput ¶

func ParseSMIQueryOutput(b []byte) (*SMIOutput, error)

Decodes the "nvidia-smi --query" output. ref. https://developer.nvidia.com/system-management-interface

func (*SMIOutput) FindGPUErrs ¶

func (o *SMIOutput) FindGPUErrs() []string

Returns the detail GPU errors if any.

func (*SMIOutput) FindHWSlowdownErrs ¶

func (o *SMIOutput) FindHWSlowdownErrs() []string

Returns the detail HW Slowdown message if any of the GPU has "Active" HW Slowdown event.

func (*SMIOutput) JSON ¶

func (o *SMIOutput) JSON() ([]byte, error)

func (*SMIOutput) YAML ¶

func (o *SMIOutput) YAML() ([]byte, error)

type SMIProcesses ¶

type SMIProcesses struct {
	GPUInstanceID        string `json:"GPU instance ID"`
	ComputeInstanceID    string `json:"Compute instance ID"`
	ProcessID            int64  `json:"Process ID"`
	ProcessType          string `json:"Process Type"`
	ProcessName          string `json:"Process Name"`
	ProcessUsedGPUMemory string `json:"Process Used GPU Memory"`
}

type SMIRemappedRows ¶ added in v0.0.4

type SMIRemappedRows struct {
	ID string `json:"id"`

	CorrectableError   string `json:"Correctable Error,omitempty"`
	UncorrectableError string `json:"Uncorrectable Error,omitempty"`

	// Yes/No.
	// If uncorrectable error is >0, this pending field is set to "Yes".
	// For a100/h100, it requires a GPU reset to actually remap the row.
	// ref. https://docs.nvidia.com/deploy/a100-gpu-mem-error-mgmt/index.html#rma-policy-thresholds
	Pending string `json:"Pending,omitempty"`

	// Yes/No
	RemappingFailureOccurred string `json:"Remapping Failure Occurred,omitempty"`
}

func (*SMIRemappedRows) Parse ¶ added in v0.0.4

func (rw *SMIRemappedRows) Parse() (ParsedSMIRemappedRows, error)

Source Files ¶

View all Source files

Directories ¶

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL

Path	Synopsis
fabric-manager-log Package fabricmanagerlog implements the fabric manager log poller.	Package fabricmanagerlog implements the fabric manager log poller.
infiniband Package infiniband provides utilities to query infiniband status.	Package infiniband provides utilities to query infiniband status.
metrics
clock Package clock provides the NVIDIA clock metrics collection and reporting.	Package clock provides the NVIDIA clock metrics collection and reporting.
clock-speed Package clockspeed provides the NVIDIA clock speed metrics collection and reporting.	Package clockspeed provides the NVIDIA clock speed metrics collection and reporting.
ecc Package ecc provides the NVIDIA ECC metrics collection and reporting.	Package ecc provides the NVIDIA ECC metrics collection and reporting.
gpm Package gpm provides the NVIDIA GPM metrics collection and reporting.	Package gpm provides the NVIDIA GPM metrics collection and reporting.
memory Package memory provides the NVIDIA memory metrics collection and reporting.	Package memory provides the NVIDIA memory metrics collection and reporting.
nvlink Package nvlink provides the NVIDIA nvlink metrics collection and reporting.	Package nvlink provides the NVIDIA nvlink metrics collection and reporting.
power Package power provides the NVIDIA power usage metrics collection and reporting.	Package power provides the NVIDIA power usage metrics collection and reporting.
processes Package processes provides the NVIDIA processes metrics collection and reporting.	Package processes provides the NVIDIA processes metrics collection and reporting.
remapped-rows Package remappedrows provides the NVIDIA row remapping metrics collection and reporting.	Package remappedrows provides the NVIDIA row remapping metrics collection and reporting.
temperature Package temperature provides the NVIDIA temperature metrics collection and reporting.	Package temperature provides the NVIDIA temperature metrics collection and reporting.
utilization Package utilization provides the NVIDIA GPU utilization metrics collection and reporting.	Package utilization provides the NVIDIA GPU utilization metrics collection and reporting.
nccl Package nccl contains the implementation of the NCCL (NVIDIA Collective Communications Library) query for NVIDIA GPUs.	Package nccl contains the implementation of the NCCL (NVIDIA Collective Communications Library) query for NVIDIA GPUs.
nvml Package nvml implements the NVIDIA Management Library (NVML) interface.	Package nvml implements the NVIDIA Management Library (NVML) interface.
peermem Package peermem contains the implementation of the peermem query for NVIDIA GPUs.	Package peermem contains the implementation of the peermem query for NVIDIA GPUs.
sxid Package sxid provides the NVIDIA SXID error details.	Package sxid provides the NVIDIA SXID error details.
xid Package xid provides the NVIDIA XID error details.	Package xid provides the NVIDIA XID error details.
xid-sxid-state Package xidsxidstate provides the persistent storage layer for the nvidia query results.	Package xidsxidstate provides the persistent storage layer for the nvidia query results.