Documentation ¶
Overview ¶
Package query implements "nvidia-smi --query" output helpers.
Package query implements various NVIDIA-related system queries. All interactions with NVIDIA data sources are implemented under the query packages.
Index ¶
- Constants
- Variables
- func CheckFabricManagerActive(ctx context.Context, conn *systemd.DbusConn) (bool, error)
- func CheckFabricManagerVersion(ctx context.Context) (string, error)
- func FabricManagerExists() bool
- func FindSummaryErr(s string) []string
- func Get(ctx context.Context) (output any, err error)
- func GetLatestFabricManagerOutput(ctx context.Context) (string, error)
- func GetSuccessOnce() <-chan any
- func HasLsmodInfinibandPeerMem(lsmodOutput string) bool
- func IbstatExists() bool
- func InfinibandClassExists() bool
- func IsIbcoreExpected(gpuProductName string, ibstatExists bool, infinibandClassExists bool) bool
- func RunSMI(ctx context.Context, args ...string) ([]byte, error)
- func SMIExists() bool
- func ValidateIbstatOutput(s string) error
- type FabricManagerOutput
- type IbstatOutput
- type LsmodPeermemModuleOutput
- type NvidiaSMIGPU
- type Output
- type ParsedMemoryUsage
- type ParsedSMIPowerReading
- func (pr ParsedSMIPowerReading) GetCurrentPowerLimitW() (float64, error)
- func (pr ParsedSMIPowerReading) GetDefaultPowerLimitW() (float64, error)
- func (pr ParsedSMIPowerReading) GetMaxPowerLimitW() (float64, error)
- func (pr ParsedSMIPowerReading) GetMinPowerLimitW() (float64, error)
- func (pr ParsedSMIPowerReading) GetPowerDrawW() (float64, error)
- func (pr ParsedSMIPowerReading) GetRequestedPowerLimitW() (float64, error)
- func (pr ParsedSMIPowerReading) GetUsedPercent() (float64, error)
- type ParsedTemperature
- func (temp ParsedTemperature) GetCurrentCelsius() (float64, error)
- func (temp ParsedTemperature) GetLimitCelsius() (float64, error)
- func (temp ParsedTemperature) GetShutdownCelsius() (float64, error)
- func (temp ParsedTemperature) GetSlowdownCelsius() (float64, error)
- func (temp ParsedTemperature) GetUsedPercent() (float64, error)
- type SMIClockEventReasons
- type SMIECCErrorAggregate
- type SMIECCErrorAggregateUncorrectableSRAMSources
- type SMIECCErrorVolatile
- type SMIECCErrors
- type SMIFBMemoryUsage
- type SMIGPUPowerReadings
- func (g *SMIGPUPowerReadings) GetCurrentPowerLimitW() (float64, error)
- func (g *SMIGPUPowerReadings) GetDefaultPowerLimitW() (float64, error)
- func (g *SMIGPUPowerReadings) GetMaxPowerLimitW() (float64, error)
- func (g *SMIGPUPowerReadings) GetMinPowerLimitW() (float64, error)
- func (g *SMIGPUPowerReadings) GetPowerDrawW() (float64, error)
- func (g *SMIGPUPowerReadings) GetRequestedPowerLimitW() (float64, error)
- func (g *SMIGPUPowerReadings) Parse() (ParsedSMIPowerReading, error)
- type SMIGPUResetStatus
- type SMIGPUTemperature
- func (tm *SMIGPUTemperature) GetCurrentCelsius() (float64, error)
- func (tm *SMIGPUTemperature) GetLimitCelsius() (float64, error)
- func (tm *SMIGPUTemperature) GetShutdownCelsius() (float64, error)
- func (tm *SMIGPUTemperature) GetShutdownLimitCelsius() (float64, error)
- func (tm *SMIGPUTemperature) GetSlowdownCelsius() (float64, error)
- func (tm *SMIGPUTemperature) GetSlowdownLimitCelsius() (float64, error)
- func (tm *SMIGPUTemperature) Parse() (ParsedTemperature, error)
- type SMIOutput
- type SMIProcesses
Constants ¶
const ( ClockEventsActive = "Active" ClockEventsNotActive = "Not Active" )
const ( StateKeySMIExists = "smi_exists" StateKeyFabricManagerExists = "fabric_manager_exists" StateKeyIbstatExists = "ibstat_exists" )
const AddressingModeNone = "None"
Variables ¶
var DefaultPoller = query.New( "shared-nvidia-poller", query_config.Config{ Interval: metav1.Duration{Duration: query_config.DefaultPollInterval}, QueueSize: query_config.DefaultQueueSize, State: &query_config.State{ Retention: metav1.Duration{Duration: query_config.DefaultStateRetention}, }, }, Get, )
Functions ¶
func FabricManagerExists ¶
func FabricManagerExists() bool
func FindSummaryErr ¶
func GetLatestFabricManagerOutput ¶ added in v0.0.2
Returns the latest fabric manager output using journalctl. Equivalent to "journalctl -xeu nvidia-fabricmanager.service --no-pager".
func GetSuccessOnce ¶
func GetSuccessOnce() <-chan any
func HasLsmodInfinibandPeerMem ¶
Returns true if infiniband (ib_core module) is using nvidia_peermem.
func IbstatExists ¶
func IbstatExists() bool
func InfinibandClassExists ¶
func InfinibandClassExists() bool
Checks if the "/sys/class/infiniband" directory exists.
func IsIbcoreExpected ¶
func SMIExists ¶
func SMIExists() bool
Returns true if the local machine runs on an NVIDIA GPU, as determined by running "nvidia-smi".
func ValidateIbstatOutput ¶
Types ¶
type FabricManagerOutput ¶
type FabricManagerOutput struct { Version string `json:"version"` // Set true if the "nvidia-fabricmanager" systemd service is active. Active bool `json:"active"` // Stores the output of "journalctl -xeu nvidia-fabricmanager.service --no-pager". // Useful for debugging fabric manager fails to start (e.g., "Error: Fabric Manager already running with pid 7388"). JournalOutput string `json:"journal_output,omitempty"` }
type IbstatOutput ¶
type LsmodPeermemModuleOutput ¶
type LsmodPeermemModuleOutput struct { IbstatExists bool `json:"ibstat_exists"` InfinibandClassExists bool `json:"infiniband_class_exists"` Raw string `json:"raw"` IbcoreUsingPeermemModule bool `json:"ibcore_using_peermem_module"` }
func CheckLsmodPeermemModule ¶
func CheckLsmodPeermemModule(ctx context.Context) (*LsmodPeermemModuleOutput, error)
type NvidiaSMIGPU ¶
type NvidiaSMIGPU struct { // The original GPU identifier from the nvidia-smi query output. // e.g., "GPU 00000000:53:00.0" ID string `json:"ID"` ProductName string `json:"Product Name"` ProductBrand string `json:"Product Brand"` ProductArchitecture string `json:"Product Architecture"` PersistenceMode string `json:"Persistence Mode"` AddressingMode string `json:"Addressing Mode"` GPUResetStatus *SMIGPUResetStatus `json:"GPU Reset Status,omitempty"` ClockEventReasons *SMIClockEventReasons `json:"Clocks Event Reasons,omitempty"` ECCErrors *SMIECCErrors `json:"ECC Errors,omitempty"` Temperature *SMIGPUTemperature `json:"Temperature,omitempty"` GPUPowerReadings *SMIGPUPowerReadings `json:"GPU Power Readings,omitempty"` Processes *SMIProcesses `json:"Processes,omitempty"` FBMemoryUsage *SMIFBMemoryUsage `json:"FB Memory Usage"` FanSpeed string `json:"Fan Speed"` }
GPU object from the nvidia-smi query. ref. "nvidia-smi --help-query-gpu"
func (NvidiaSMIGPU) FindAddressingModeErr ¶
func (g NvidiaSMIGPU) FindAddressingModeErr() error
Returns the Addressing Mode error if the GPU has an "Unknown Error" Addressing Mode. It may indicate Xid 31 "GPU memory page fault", where the application crashes with an error such as: "RuntimeError: CUDA unknown error - this may be due to an incorrectly set up environment, e.g. changing env variable CUDA_VISIBLE_DEVICES after program start. Setting the available devices to be zero."
func (NvidiaSMIGPU) FindErrs ¶
func (g NvidiaSMIGPU) FindErrs() []string
Returns the list of errors found on the GPU, if any. ref. https://forums.developer.nvidia.com/t/nvidia-smi-q-shows-several-unknown-error-gpu-ignored-by-pytorch/263881
func (NvidiaSMIGPU) FindHWSlowdownErrs ¶
func (g NvidiaSMIGPU) FindHWSlowdownErrs() []string
type Output ¶
type Output struct { SMIExists bool `json:"smi_exists"` SMI *SMIOutput `json:"smi,omitempty"` SMIQueryErrors []string `json:"smi_query_errors,omitempty"` FabricManagerExists bool `json:"fabric_manager_exists"` FabricManager *FabricManagerOutput `json:"fabric_manager,omitempty"` FabricManagerErrors []string `json:"fabric_manager_errors,omitempty"` InfinibandClassExists bool `json:"infiniband_class_exists"` IbstatExists bool `json:"ibstat_exists"` Ibstat *IbstatOutput `json:"ibstat,omitempty"` LsmodPeermem *LsmodPeermemModuleOutput `json:"lsmod_peermem,omitempty"` LsmodPeermemErrors []string `json:"lsmod_peermem_errors,omitempty"` NVML *nvml.Output `json:"nvml,omitempty"` NVMLErrors []string `json:"nvml_errors,omitempty"` }
type ParsedMemoryUsage ¶
type ParsedMemoryUsage struct { ID string `json:"id"` TotalBytes uint64 `json:"total_bytes"` TotalHumanized string `json:"total_humanized"` ReservedBytes uint64 `json:"reserved_bytes"` ReservedHumanized string `json:"reserved_humanized"` UsedBytes uint64 `json:"used_bytes"` UsedHumanized string `json:"used_humanized"` UsedPercent string `json:"used_percent"` FreeBytes uint64 `json:"free_bytes"` FreeHumanized string `json:"free_humanized"` }
func (ParsedMemoryUsage) GetUsedPercent ¶
func (u ParsedMemoryUsage) GetUsedPercent() (float64, error)
type ParsedSMIPowerReading ¶
type ParsedSMIPowerReading struct { ID string `json:"id"` PowerDrawW string `json:"power_draw_w"` PowerDrawHumanized string `json:"power_draw_humanized"` CurrentPowerLimitW string `json:"current_power_limit_w"` CurrentPowerLimitHumanized string `json:"current_power_limit_humanized"` UsedPercent string `json:"used_percent"` RequestedPowerLimitW string `json:"requested_power_limit_w"` RequestedPowerLimitHumanized string `json:"requested_power_limit_humanized"` DefaultPowerLimitW string `json:"default_power_limit_w"` DefaultPowerLimitHumanized string `json:"default_power_limit_humanized"` MinPowerLimitW string `json:"min_power_limit_w"` MinPowerLimitHumanized string `json:"min_power_limit_humanized"` MaxPowerLimitW string `json:"max_power_limit_w"` MaxPowerLimitHumanized string `json:"max_power_limit_humanized"` }
func (ParsedSMIPowerReading) GetCurrentPowerLimitW ¶
func (pr ParsedSMIPowerReading) GetCurrentPowerLimitW() (float64, error)
func (ParsedSMIPowerReading) GetDefaultPowerLimitW ¶
func (pr ParsedSMIPowerReading) GetDefaultPowerLimitW() (float64, error)
func (ParsedSMIPowerReading) GetMaxPowerLimitW ¶
func (pr ParsedSMIPowerReading) GetMaxPowerLimitW() (float64, error)
func (ParsedSMIPowerReading) GetMinPowerLimitW ¶
func (pr ParsedSMIPowerReading) GetMinPowerLimitW() (float64, error)
func (ParsedSMIPowerReading) GetPowerDrawW ¶
func (pr ParsedSMIPowerReading) GetPowerDrawW() (float64, error)
func (ParsedSMIPowerReading) GetRequestedPowerLimitW ¶
func (pr ParsedSMIPowerReading) GetRequestedPowerLimitW() (float64, error)
func (ParsedSMIPowerReading) GetUsedPercent ¶
func (pr ParsedSMIPowerReading) GetUsedPercent() (float64, error)
type ParsedTemperature ¶
type ParsedTemperature struct { ID string `json:"id"` CurrentHumanized string `json:"current_humanized"` CurrentCelsius string `json:"current_celsius"` LimitHumanized string `json:"limit_humanized"` LimitCelsius string `json:"limit_celsius"` UsedPercent string `json:"used_percent"` ShutdownHumanized string `json:"shutdown_humanized"` ShutdownLimit string `json:"shutdown_limit"` ShutdownCelsius string `json:"shutdown_celsius"` SlowdownHumanized string `json:"slowdown_humanized"` SlowdownLimit string `json:"slowdown_limit"` SlowdownCelsius string `json:"slowdown_celsius"` MaxOperatingLimit string `json:"max_operating_limit"` Target string `json:"target"` MemoryCurrent string `json:"memory_current"` MemoryMaxOperatingLimit string `json:"memory_max_operating_limit"` }
func (ParsedTemperature) GetCurrentCelsius ¶
func (temp ParsedTemperature) GetCurrentCelsius() (float64, error)
func (ParsedTemperature) GetLimitCelsius ¶
func (temp ParsedTemperature) GetLimitCelsius() (float64, error)
func (ParsedTemperature) GetShutdownCelsius ¶
func (temp ParsedTemperature) GetShutdownCelsius() (float64, error)
func (ParsedTemperature) GetSlowdownCelsius ¶
func (temp ParsedTemperature) GetSlowdownCelsius() (float64, error)
func (ParsedTemperature) GetUsedPercent ¶
func (temp ParsedTemperature) GetUsedPercent() (float64, error)
type SMIClockEventReasons ¶
type SMIECCErrorAggregate ¶
type SMIECCErrorAggregate struct { DRAMCorrectable string `json:"DRAM Correctable"` DRAMUncorrectable string `json:"DRAM Uncorrectable"` SRAMCorrectable string `json:"SRAM Correctable"` SRAMThresholdExceeded string `json:"SRAM Threshold Exceeded"` SRAMUncorrectable string `json:"SRAM Uncorrectable"` SRAMUncorrectableParity string `json:"SRAM Uncorrectable Parity"` // for newer driver versions SRAMUncorrectableSECDED string `json:"SRAM Uncorrectable SEC-DED"` // for newer driver versions }
type SMIECCErrorVolatile ¶
type SMIECCErrorVolatile struct { DRAMCorrectable string `json:"DRAM Correctable"` DRAMUncorrectable string `json:"DRAM Uncorrectable"` SRAMCorrectable string `json:"SRAM Correctable"` SRAMUncorrectable string `json:"SRAM Uncorrectable"` SRAMUncorrectableParity string `json:"SRAM Uncorrectable Parity"` // for newer driver versions SRAMUncorrectableSECDED string `json:"SRAM Uncorrectable SEC-DED"` // for newer driver versions }
type SMIECCErrors ¶
type SMIECCErrors struct { ID string `json:"id"` Aggregate *SMIECCErrorAggregate `json:"Aggregate,omitempty"` AggregateUncorrectableSRAMSources *SMIECCErrorAggregateUncorrectableSRAMSources `json:"Aggregate Uncorrectable SRAM Sources,omitempty"` Volatile *SMIECCErrorVolatile `json:"Volatile,omitempty"` }
func (SMIECCErrors) FindVolatileUncorrectableErrs ¶
func (eccErrs SMIECCErrors) FindVolatileUncorrectableErrs() []string
type SMIFBMemoryUsage ¶
type SMIFBMemoryUsage struct { ID string `json:"id"` Total string `json:"Total"` Reserved string `json:"Reserved"` Used string `json:"Used"` Free string `json:"Free"` }
func (*SMIFBMemoryUsage) GetFreeBytes ¶
func (f *SMIFBMemoryUsage) GetFreeBytes() (uint64, error)
func (*SMIFBMemoryUsage) GetReservedBytes ¶
func (f *SMIFBMemoryUsage) GetReservedBytes() (uint64, error)
func (*SMIFBMemoryUsage) GetTotalBytes ¶
func (f *SMIFBMemoryUsage) GetTotalBytes() (uint64, error)
func (*SMIFBMemoryUsage) GetUsedBytes ¶
func (f *SMIFBMemoryUsage) GetUsedBytes() (uint64, error)
func (*SMIFBMemoryUsage) Parse ¶
func (f *SMIFBMemoryUsage) Parse() (ParsedMemoryUsage, error)
type SMIGPUPowerReadings ¶
type SMIGPUPowerReadings struct { ID string `json:"id"` PowerDraw string `json:"Power Draw"` CurrentPowerLimit string `json:"Current Power Limit"` RequestedPowerLimit string `json:"Requested Power Limit"` DefaultPowerLimit string `json:"Default Power Limit"` MinPowerLimit string `json:"Min Power Limit"` MaxPowerLimit string `json:"Max Power Limit"` }
func (*SMIGPUPowerReadings) GetCurrentPowerLimitW ¶
func (g *SMIGPUPowerReadings) GetCurrentPowerLimitW() (float64, error)
func (*SMIGPUPowerReadings) GetDefaultPowerLimitW ¶
func (g *SMIGPUPowerReadings) GetDefaultPowerLimitW() (float64, error)
func (*SMIGPUPowerReadings) GetMaxPowerLimitW ¶
func (g *SMIGPUPowerReadings) GetMaxPowerLimitW() (float64, error)
func (*SMIGPUPowerReadings) GetMinPowerLimitW ¶
func (g *SMIGPUPowerReadings) GetMinPowerLimitW() (float64, error)
func (*SMIGPUPowerReadings) GetPowerDrawW ¶
func (g *SMIGPUPowerReadings) GetPowerDrawW() (float64, error)
func (*SMIGPUPowerReadings) GetRequestedPowerLimitW ¶
func (g *SMIGPUPowerReadings) GetRequestedPowerLimitW() (float64, error)
func (*SMIGPUPowerReadings) Parse ¶
func (g *SMIGPUPowerReadings) Parse() (ParsedSMIPowerReading, error)
type SMIGPUResetStatus ¶
type SMIGPUTemperature ¶
type SMIGPUTemperature struct { ID string `json:"id"` Current string `json:"GPU Current Temp"` Limit string `json:"GPU T.Limit Temp"` // Shutdown limit for older drivers (e.g., 535.129.03). Shutdown string `json:"GPU Shutdown Temp"` ShutdownLimit string `json:"GPU Shutdown T.Limit Temp"` // Slowdown limit for older drivers (e.g., 535.129.03). Slowdown string `json:"GPU Slowdown Temp"` SlowdownLimit string `json:"GPU Slowdown T.Limit Temp"` MaxOperatingLimit string `json:"GPU Max Operating T.Limit Temp"` // this value is not reliable to monitor as it's often N/A Target string `json:"GPU Target Temperature"` MemoryCurrent string `json:"Memory Current Temp"` MemoryMaxOperatingLimit string `json:"Memory Max Operating T.Limit Temp"` }
If any field shows "Unknown Error", it means the GPU has some issues.
func (*SMIGPUTemperature) GetCurrentCelsius ¶
func (tm *SMIGPUTemperature) GetCurrentCelsius() (float64, error)
func (*SMIGPUTemperature) GetLimitCelsius ¶
func (tm *SMIGPUTemperature) GetLimitCelsius() (float64, error)
func (*SMIGPUTemperature) GetShutdownCelsius ¶
func (tm *SMIGPUTemperature) GetShutdownCelsius() (float64, error)
func (*SMIGPUTemperature) GetShutdownLimitCelsius ¶
func (tm *SMIGPUTemperature) GetShutdownLimitCelsius() (float64, error)
func (*SMIGPUTemperature) GetSlowdownCelsius ¶
func (tm *SMIGPUTemperature) GetSlowdownCelsius() (float64, error)
func (*SMIGPUTemperature) GetSlowdownLimitCelsius ¶
func (tm *SMIGPUTemperature) GetSlowdownLimitCelsius() (float64, error)
func (*SMIGPUTemperature) Parse ¶
func (tm *SMIGPUTemperature) Parse() (ParsedTemperature, error)
type SMIOutput ¶
type SMIOutput struct { Timestamp string `json:"timestamp"` DriverVersion string `json:"driver_version"` CUDAVersion string `json:"cuda_version"` AttachedGPUs int `json:"attached_gpus"` GPUs []NvidiaSMIGPU `json:"gpus,omitempty"` // Raw is the raw output of "nvidia-smi --query". // Useful for debugging. Raw string `json:"raw,omitempty"` // Summary is the "nvidia-smi" output without "--query" flag. // Useful for error detecting, in case the new nvidia-smi // version introduces breaking changes to its query output. Summary string `json:"summary,omitempty"` // Only set if "nvidia-smi" failed to run. SummaryFailure error `json:"summary_failure,omitempty"` }
Represents the current NVIDIA status using "nvidia-smi --query", "nvidia-smi", etc. ref. "nvidia-smi --help-query-gpu"
func ParseSMIQueryOutput ¶
Decodes the "nvidia-smi --query" output. ref. https://developer.nvidia.com/system-management-interface
func (*SMIOutput) FindGPUErrs ¶
Returns the detailed GPU errors, if any.
func (*SMIOutput) FindHWSlowdownErrs ¶
Returns the detailed HW Slowdown messages if any of the GPUs has an "Active" HW Slowdown event.
type SMIProcesses ¶
type SMIProcesses struct { GPUInstanceID string `json:"GPU instance ID"` ComputeInstanceID string `json:"Compute instance ID"` ProcessID int64 `json:"Process ID"` ProcessType string `json:"Process Type"` ProcessName string `json:"Process Name"` ProcessUsedGPUMemory string `json:"Process Used GPU Memory"` }
Source Files ¶
Directories ¶
Path | Synopsis |
---|---|
Package fabricmanagerlog implements the fabric manager log poller.
|
Package fabricmanagerlog implements the fabric manager log poller. |
metrics
|
|
clock
Package clock provides the NVIDIA clock metrics collection and reporting.
|
Package clock provides the NVIDIA clock metrics collection and reporting. |
clock-speed
Package clockspeed provides the NVIDIA clock speed metrics collection and reporting.
|
Package clockspeed provides the NVIDIA clock speed metrics collection and reporting. |
ecc
Package ecc provides the NVIDIA ECC metrics collection and reporting.
|
Package ecc provides the NVIDIA ECC metrics collection and reporting. |
gpm
Package gpm provides the NVIDIA GPM metrics collection and reporting.
|
Package gpm provides the NVIDIA GPM metrics collection and reporting. |
memory
Package memory provides the NVIDIA memory metrics collection and reporting.
|
Package memory provides the NVIDIA memory metrics collection and reporting. |
nvlink
Package nvlink provides the NVIDIA nvlink metrics collection and reporting.
|
Package nvlink provides the NVIDIA nvlink metrics collection and reporting. |
power
Package power provides the NVIDIA power usage metrics collection and reporting.
|
Package power provides the NVIDIA power usage metrics collection and reporting. |
processes
Package processes provides the NVIDIA processes metrics collection and reporting.
|
Package processes provides the NVIDIA processes metrics collection and reporting. |
temperature
Package temperature provides the NVIDIA temperature metrics collection and reporting.
|
Package temperature provides the NVIDIA temperature metrics collection and reporting. |
utilization
Package utilization provides the NVIDIA GPU utilization metrics collection and reporting.
|
Package utilization provides the NVIDIA GPU utilization metrics collection and reporting. |
Package nvml implements the NVIDIA Management Library (NVML) interface.
|
Package nvml implements the NVIDIA Management Library (NVML) interface. |
Package sxid provides the NVIDIA SXID error details.
|
Package sxid provides the NVIDIA SXID error details. |
Package xid provides the NVIDIA XID error details.
|
Package xid provides the NVIDIA XID error details. |