query

package
v0.0.3 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Sep 25, 2024 License: Apache-2.0 Imports: 31 Imported by: 0

Documentation

Overview

Package query implements "nvidia-smi --query" output helpers.

Package query implements various NVIDIA-related system queries. All interactions with NVIDIA data sources are implemented under the query packages.

Index

Constants

View Source
const (
	ClockEventsActive    = "Active"
	ClockEventsNotActive = "Not Active"
)
View Source
const (
	StateKeySMIExists           = "smi_exists"
	StateKeyFabricManagerExists = "fabric_manager_exists"
	StateKeyIbstatExists        = "ibstat_exists"
)
View Source
const AddressingModeNone = "None"

Variables

View Source
var DefaultPoller = query.New(
	"shared-nvidia-poller",
	query_config.Config{
		Interval:  metav1.Duration{Duration: query_config.DefaultPollInterval},
		QueueSize: query_config.DefaultQueueSize,
		State: &query_config.State{
			Retention: metav1.Duration{Duration: query_config.DefaultStateRetention},
		},
	},
	Get,
)

Functions

func CheckFabricManagerActive

func CheckFabricManagerActive(ctx context.Context, conn *systemd.DbusConn) (bool, error)

func CheckFabricManagerVersion

func CheckFabricManagerVersion(ctx context.Context) (string, error)

func FabricManagerExists

func FabricManagerExists() bool

func Get

func Get(ctx context.Context) (output any, err error)

Get all nvidia component queries.

func GetLatestFabricManagerOutput added in v0.0.2

func GetLatestFabricManagerOutput(ctx context.Context) (string, error)

Returns the latest fabric manager output using journalctl. Equivalent to "journalctl -xeu nvidia-fabricmanager.service --no-pager".

func GetSuccessOnce

func GetSuccessOnce() <-chan any

func HasLsmodInfinibandPeerMem

func HasLsmodInfinibandPeerMem(lsmodOutput string) bool

Returns true if infiniband (ib_core module) is using nvidia_peermem.

func IbstatExists

func IbstatExists() bool

func InfinibandClassExists

func InfinibandClassExists() bool

Checks if "/sys/class/infiniband" directory exists.

func IsIbcoreExpected

func IsIbcoreExpected(gpuProductName string, ibstatExists bool, infinibandClassExists bool) bool

func RunSMI

func RunSMI(ctx context.Context, args ...string) ([]byte, error)

func SMIExists

func SMIExists() bool

Returns true if the local machine runs on an NVIDIA GPU, as determined by running "nvidia-smi".

func ValidateIbstatOutput

func ValidateIbstatOutput(s string) error

Types

type FabricManagerOutput

type FabricManagerOutput struct {
	Version string `json:"version"`
	// Set true if the "nvidia-fabricmanager" systemd service is active.
	Active bool `json:"active"`

	// Stores the output of "journalctl -xeu nvidia-fabricmanager.service --no-pager".
	// Useful for debugging when the fabric manager fails to start (e.g., "Error: Fabric Manager already running with pid 7388").
	JournalOutput string `json:"journal_output,omitempty"`
}

type IbstatOutput

type IbstatOutput struct {
	Raw    string   `json:"raw"`
	Errors []string `json:"errors,omitempty"`
}

func RunIbstat

func RunIbstat(ctx context.Context) (*IbstatOutput, error)

type LsmodPeermemModuleOutput

type LsmodPeermemModuleOutput struct {
	IbstatExists             bool   `json:"ibstat_exists"`
	InfinibandClassExists    bool   `json:"infiniband_class_exists"`
	Raw                      string `json:"raw"`
	IbcoreUsingPeermemModule bool   `json:"ibcore_using_peermem_module"`
}

func CheckLsmodPeermemModule

func CheckLsmodPeermemModule(ctx context.Context) (*LsmodPeermemModuleOutput, error)

type NvidiaSMIGPU

type NvidiaSMIGPU struct {
	// The original GPU identifier from the nvidia-smi query output.
	// e.g., "GPU 00000000:53:00.0"
	ID string `json:"ID"`

	ProductName         string `json:"Product Name"`
	ProductBrand        string `json:"Product Brand"`
	ProductArchitecture string `json:"Product Architecture"`

	PersistenceMode string `json:"Persistence Mode"`
	AddressingMode  string `json:"Addressing Mode"`

	GPUResetStatus    *SMIGPUResetStatus    `json:"GPU Reset Status,omitempty"`
	ClockEventReasons *SMIClockEventReasons `json:"Clocks Event Reasons,omitempty"`
	ECCErrors         *SMIECCErrors         `json:"ECC Errors,omitempty"`
	Temperature       *SMIGPUTemperature    `json:"Temperature,omitempty"`
	GPUPowerReadings  *SMIGPUPowerReadings  `json:"GPU Power Readings,omitempty"`
	Processes         *SMIProcesses         `json:"Processes,omitempty"`
	FBMemoryUsage     *SMIFBMemoryUsage     `json:"FB Memory Usage"`

	FanSpeed string `json:"Fan Speed"`
}

GPU object from the nvidia-smi query. ref. "nvidia-smi --help-query-gpu"

func (NvidiaSMIGPU) FindAddressingModeErr

func (g NvidiaSMIGPU) FindAddressingModeErr() error

Returns the Addressing Mode error if the GPU has an "Unknown Error" Addressing Mode. It may indicate Xid 31 "GPU memory page fault", where the application crashes with an error such as: "RuntimeError: CUDA unknown error - this may be due to an incorrectly set up environment, e.g. changing env variable CUDA_VISIBLE_DEVICES after program start. Setting the available devices to be zero."

func (NvidiaSMIGPU) FindErrs

func (g NvidiaSMIGPU) FindErrs() []string

Returns the errors found for the GPU, if any. ref. https://forums.developer.nvidia.com/t/nvidia-smi-q-shows-several-unknown-error-gpu-ignored-by-pytorch/263881

func (NvidiaSMIGPU) FindHWSlowdownErrs

func (g NvidiaSMIGPU) FindHWSlowdownErrs() []string

type Output

type Output struct {
	SMIExists      bool       `json:"smi_exists"`
	SMI            *SMIOutput `json:"smi,omitempty"`
	SMIQueryErrors []string   `json:"smi_query_errors,omitempty"`

	FabricManagerExists bool                 `json:"fabric_manager_exists"`
	FabricManager       *FabricManagerOutput `json:"fabric_manager,omitempty"`
	FabricManagerErrors []string             `json:"fabric_manager_errors,omitempty"`

	InfinibandClassExists bool          `json:"infiniband_class_exists"`
	IbstatExists          bool          `json:"ibstat_exists"`
	Ibstat                *IbstatOutput `json:"ibstat,omitempty"`

	LsmodPeermem       *LsmodPeermemModuleOutput `json:"lsmod_peermem,omitempty"`
	LsmodPeermemErrors []string                  `json:"lsmod_peermem_errors,omitempty"`

	NVML       *nvml.Output `json:"nvml,omitempty"`
	NVMLErrors []string     `json:"nvml_errors,omitempty"`
}

func (*Output) GPUCounts added in v0.0.3

func (o *Output) GPUCounts() int

func (*Output) GPUProductName added in v0.0.3

func (o *Output) GPUProductName() string

func (*Output) PrintInfo

func (o *Output) PrintInfo(debug bool)

func (*Output) YAML

func (o *Output) YAML() ([]byte, error)

type ParsedMemoryUsage

type ParsedMemoryUsage struct {
	ID string `json:"id"`

	TotalBytes     uint64 `json:"total_bytes"`
	TotalHumanized string `json:"total_humanized"`

	ReservedBytes     uint64 `json:"reserved_bytes"`
	ReservedHumanized string `json:"reserved_humanized"`

	UsedBytes     uint64 `json:"used_bytes"`
	UsedHumanized string `json:"used_humanized"`

	UsedPercent string `json:"used_percent"`

	FreeBytes     uint64 `json:"free_bytes"`
	FreeHumanized string `json:"free_humanized"`
}

func (ParsedMemoryUsage) GetUsedPercent

func (u ParsedMemoryUsage) GetUsedPercent() (float64, error)

type ParsedSMIPowerReading

type ParsedSMIPowerReading struct {
	ID string `json:"id"`

	PowerDrawW         string `json:"power_draw_w"`
	PowerDrawHumanized string `json:"power_draw_humanized"`

	CurrentPowerLimitW         string `json:"current_power_limit_w"`
	CurrentPowerLimitHumanized string `json:"current_power_limit_humanized"`

	UsedPercent string `json:"used_percent"`

	RequestedPowerLimitW         string `json:"requested_power_limit_w"`
	RequestedPowerLimitHumanized string `json:"requested_power_limit_humanized"`

	DefaultPowerLimitW         string `json:"default_power_limit_w"`
	DefaultPowerLimitHumanized string `json:"default_power_limit_humanized"`

	MinPowerLimitW         string `json:"min_power_limit_w"`
	MinPowerLimitHumanized string `json:"min_power_limit_humanized"`

	MaxPowerLimitW         string `json:"max_power_limit_w"`
	MaxPowerLimitHumanized string `json:"max_power_limit_humanized"`
}

func (ParsedSMIPowerReading) GetCurrentPowerLimitW

func (pr ParsedSMIPowerReading) GetCurrentPowerLimitW() (float64, error)

func (ParsedSMIPowerReading) GetDefaultPowerLimitW

func (pr ParsedSMIPowerReading) GetDefaultPowerLimitW() (float64, error)

func (ParsedSMIPowerReading) GetMaxPowerLimitW

func (pr ParsedSMIPowerReading) GetMaxPowerLimitW() (float64, error)

func (ParsedSMIPowerReading) GetMinPowerLimitW

func (pr ParsedSMIPowerReading) GetMinPowerLimitW() (float64, error)

func (ParsedSMIPowerReading) GetPowerDrawW

func (pr ParsedSMIPowerReading) GetPowerDrawW() (float64, error)

func (ParsedSMIPowerReading) GetRequestedPowerLimitW

func (pr ParsedSMIPowerReading) GetRequestedPowerLimitW() (float64, error)

func (ParsedSMIPowerReading) GetUsedPercent

func (pr ParsedSMIPowerReading) GetUsedPercent() (float64, error)

type ParsedTemperature

type ParsedTemperature struct {
	ID string `json:"id"`

	CurrentHumanized string `json:"current_humanized"`
	CurrentCelsius   string `json:"current_celsius"`

	LimitHumanized string `json:"limit_humanized"`
	LimitCelsius   string `json:"limit_celsius"`

	UsedPercent string `json:"used_percent"`

	ShutdownHumanized string `json:"shutdown_humanized"`
	ShutdownLimit     string `json:"shutdown_limit"`
	ShutdownCelsius   string `json:"shutdown_celsius"`

	SlowdownHumanized string `json:"slowdown_humanized"`
	SlowdownLimit     string `json:"slowdown_limit"`
	SlowdownCelsius   string `json:"slowdown_celsius"`

	MaxOperatingLimit string `json:"max_operating_limit"`

	Target                  string `json:"target"`
	MemoryCurrent           string `json:"memory_current"`
	MemoryMaxOperatingLimit string `json:"memory_max_operating_limit"`
}

func (ParsedTemperature) GetCurrentCelsius

func (temp ParsedTemperature) GetCurrentCelsius() (float64, error)

func (ParsedTemperature) GetLimitCelsius

func (temp ParsedTemperature) GetLimitCelsius() (float64, error)

func (ParsedTemperature) GetShutdownCelsius

func (temp ParsedTemperature) GetShutdownCelsius() (float64, error)

func (ParsedTemperature) GetSlowdownCelsius

func (temp ParsedTemperature) GetSlowdownCelsius() (float64, error)

func (ParsedTemperature) GetUsedPercent

func (temp ParsedTemperature) GetUsedPercent() (float64, error)

type SMIClockEventReasons

type SMIClockEventReasons struct {
	SWPowerCap           string `json:"SW Power Cap"`
	SWThermalSlowdown    string `json:"SW Thermal Slowdown"`
	HWSlowdown           string `json:"HW Slowdown"`
	HWThermalSlowdown    string `json:"HW Thermal Slowdown"`
	HWPowerBrakeSlowdown string `json:"HW Power Brake Slowdown"`
}

type SMIECCErrorAggregate

type SMIECCErrorAggregate struct {
	DRAMCorrectable   string `json:"DRAM Correctable"`
	DRAMUncorrectable string `json:"DRAM Uncorrectable"`

	SRAMCorrectable       string `json:"SRAM Correctable"`
	SRAMThresholdExceeded string `json:"SRAM Threshold Exceeded"`

	SRAMUncorrectable       string `json:"SRAM Uncorrectable"`
	SRAMUncorrectableParity string `json:"SRAM Uncorrectable Parity"`  // for newer driver versions
	SRAMUncorrectableSECDED string `json:"SRAM Uncorrectable SEC-DED"` // for newer driver versions
}

type SMIECCErrorAggregateUncorrectableSRAMSources

type SMIECCErrorAggregateUncorrectableSRAMSources struct {
	SRAML2              string `json:"SRAM L2"`
	SRAMMicrocontroller string `json:"SRAM Microcontroller"`
	SRAMOther           string `json:"SRAM Other"`
	SRAMPCIE            string `json:"SRAM PCIE"`
	SRAMSM              string `json:"SRAM SM"`
}

type SMIECCErrorVolatile

type SMIECCErrorVolatile struct {
	DRAMCorrectable   string `json:"DRAM Correctable"`
	DRAMUncorrectable string `json:"DRAM Uncorrectable"`

	SRAMCorrectable   string `json:"SRAM Correctable"`
	SRAMUncorrectable string `json:"SRAM Uncorrectable"`

	SRAMUncorrectableParity string `json:"SRAM Uncorrectable Parity"`  // for newer driver versions
	SRAMUncorrectableSECDED string `json:"SRAM Uncorrectable SEC-DED"` // for newer driver versions
}

type SMIECCErrors

type SMIECCErrors struct {
	ID string `json:"id"`

	Aggregate                         *SMIECCErrorAggregate                         `json:"Aggregate,omitempty"`
	AggregateUncorrectableSRAMSources *SMIECCErrorAggregateUncorrectableSRAMSources `json:"Aggregate Uncorrectable SRAM Sources,omitempty"`
	Volatile                          *SMIECCErrorVolatile                          `json:"Volatile,omitempty"`
}

func (SMIECCErrors) FindVolatileUncorrectableErrs

func (eccErrs SMIECCErrors) FindVolatileUncorrectableErrs() []string

type SMIFBMemoryUsage

type SMIFBMemoryUsage struct {
	ID string `json:"id"`

	Total    string `json:"Total"`
	Reserved string `json:"Reserved"`
	Used     string `json:"Used"`
	Free     string `json:"Free"`
}

func (*SMIFBMemoryUsage) GetFreeBytes

func (f *SMIFBMemoryUsage) GetFreeBytes() (uint64, error)

func (*SMIFBMemoryUsage) GetReservedBytes

func (f *SMIFBMemoryUsage) GetReservedBytes() (uint64, error)

func (*SMIFBMemoryUsage) GetTotalBytes

func (f *SMIFBMemoryUsage) GetTotalBytes() (uint64, error)

func (*SMIFBMemoryUsage) GetUsedBytes

func (f *SMIFBMemoryUsage) GetUsedBytes() (uint64, error)

func (*SMIFBMemoryUsage) Parse

type SMIGPUPowerReadings

type SMIGPUPowerReadings struct {
	ID string `json:"id"`

	PowerDraw           string `json:"Power Draw"`
	CurrentPowerLimit   string `json:"Current Power Limit"`
	RequestedPowerLimit string `json:"Requested Power Limit"`
	DefaultPowerLimit   string `json:"Default Power Limit"`
	MinPowerLimit       string `json:"Min Power Limit"`
	MaxPowerLimit       string `json:"Max Power Limit"`
}

func (*SMIGPUPowerReadings) GetCurrentPowerLimitW

func (g *SMIGPUPowerReadings) GetCurrentPowerLimitW() (float64, error)

func (*SMIGPUPowerReadings) GetDefaultPowerLimitW

func (g *SMIGPUPowerReadings) GetDefaultPowerLimitW() (float64, error)

func (*SMIGPUPowerReadings) GetMaxPowerLimitW

func (g *SMIGPUPowerReadings) GetMaxPowerLimitW() (float64, error)

func (*SMIGPUPowerReadings) GetMinPowerLimitW

func (g *SMIGPUPowerReadings) GetMinPowerLimitW() (float64, error)

func (*SMIGPUPowerReadings) GetPowerDrawW

func (g *SMIGPUPowerReadings) GetPowerDrawW() (float64, error)

func (*SMIGPUPowerReadings) GetRequestedPowerLimitW

func (g *SMIGPUPowerReadings) GetRequestedPowerLimitW() (float64, error)

func (*SMIGPUPowerReadings) Parse

type SMIGPUResetStatus

type SMIGPUResetStatus struct {
	ResetRequired            string `json:"Reset Required"`
	DrainAndResetRecommended string `json:"Drain and Reset Recommended"`
}

type SMIGPUTemperature

type SMIGPUTemperature struct {
	ID string `json:"id"`

	Current string `json:"GPU Current Temp"`
	Limit   string `json:"GPU T.Limit Temp"`

	// Shutdown limit for older drivers (e.g., 535.129.03).
	Shutdown      string `json:"GPU Shutdown Temp"`
	ShutdownLimit string `json:"GPU Shutdown T.Limit Temp"`

	// Slowdown limit for older drivers (e.g., 535.129.03).
	Slowdown      string `json:"GPU Slowdown Temp"`
	SlowdownLimit string `json:"GPU Slowdown T.Limit Temp"`

	MaxOperatingLimit string `json:"GPU Max Operating T.Limit Temp"`

	// this value is not reliable to monitor as it's often N/A
	Target string `json:"GPU Target Temperature"`

	MemoryCurrent           string `json:"Memory Current Temp"`
	MemoryMaxOperatingLimit string `json:"Memory Max Operating T.Limit Temp"`
}

If any field shows "Unknown Error", it means GPU has some issues.

func (*SMIGPUTemperature) GetCurrentCelsius

func (tm *SMIGPUTemperature) GetCurrentCelsius() (float64, error)

func (*SMIGPUTemperature) GetLimitCelsius

func (tm *SMIGPUTemperature) GetLimitCelsius() (float64, error)

func (*SMIGPUTemperature) GetShutdownCelsius

func (tm *SMIGPUTemperature) GetShutdownCelsius() (float64, error)

func (*SMIGPUTemperature) GetShutdownLimitCelsius

func (tm *SMIGPUTemperature) GetShutdownLimitCelsius() (float64, error)

func (*SMIGPUTemperature) GetSlowdownCelsius

func (tm *SMIGPUTemperature) GetSlowdownCelsius() (float64, error)

func (*SMIGPUTemperature) GetSlowdownLimitCelsius

func (tm *SMIGPUTemperature) GetSlowdownLimitCelsius() (float64, error)

func (*SMIGPUTemperature) Parse

func (tm *SMIGPUTemperature) Parse() (ParsedTemperature, error)

type SMIOutput

type SMIOutput struct {
	Timestamp     string `json:"timestamp"`
	DriverVersion string `json:"driver_version"`
	CUDAVersion   string `json:"cuda_version"`
	AttachedGPUs  int    `json:"attached_gpus"`

	GPUs []NvidiaSMIGPU `json:"gpus,omitempty"`

	// Raw is the raw output of "nvidia-smi --query".
	// Useful for debugging.
	Raw string `json:"raw,omitempty"`

	// Summary is the "nvidia-smi" output without "--query" flag.
	// Useful for detecting errors, in case the new nvidia-smi
	// version introduces breaking changes to its query output.
	Summary string `json:"summary,omitempty"`

	// Only set if "nvidia-smi" failed to run.
	SummaryFailure error `json:"summary_failure,omitempty"`
}

Represents the current NVIDIA status using "nvidia-smi --query", "nvidia-smi", etc. ref. "nvidia-smi --help-query-gpu"

func GetSMIOutput

func GetSMIOutput(ctx context.Context) (*SMIOutput, error)

func ParseSMIQueryOutput

func ParseSMIQueryOutput(b []byte) (*SMIOutput, error)

Decodes the "nvidia-smi --query" output. ref. https://developer.nvidia.com/system-management-interface

func (*SMIOutput) FindGPUErrs

func (o *SMIOutput) FindGPUErrs() []string

Returns the detailed GPU errors, if any.

func (*SMIOutput) FindHWSlowdownErrs

func (o *SMIOutput) FindHWSlowdownErrs() []string

Returns the detailed HW Slowdown messages if any of the GPUs has an "Active" HW Slowdown event.

func (*SMIOutput) JSON

func (o *SMIOutput) JSON() ([]byte, error)

func (*SMIOutput) YAML

func (o *SMIOutput) YAML() ([]byte, error)

type SMIProcesses

type SMIProcesses struct {
	GPUInstanceID        string `json:"GPU instance ID"`
	ComputeInstanceID    string `json:"Compute instance ID"`
	ProcessID            int64  `json:"Process ID"`
	ProcessType          string `json:"Process Type"`
	ProcessName          string `json:"Process Name"`
	ProcessUsedGPUMemory string `json:"Process Used GPU Memory"`
}

Directories

Path Synopsis
Package fabricmanagerlog implements the fabric manager log poller.
Package fabricmanagerlog implements the fabric manager log poller.
metrics
clock
Package clock provides the NVIDIA clock metrics collection and reporting.
Package clock provides the NVIDIA clock metrics collection and reporting.
clock-speed
Package clockspeed provides the NVIDIA clock speed metrics collection and reporting.
Package clockspeed provides the NVIDIA clock speed metrics collection and reporting.
ecc
Package ecc provides the NVIDIA ECC metrics collection and reporting.
Package ecc provides the NVIDIA ECC metrics collection and reporting.
gpm
Package gpm provides the NVIDIA GPM metrics collection and reporting.
Package gpm provides the NVIDIA GPM metrics collection and reporting.
memory
Package memory provides the NVIDIA memory metrics collection and reporting.
Package memory provides the NVIDIA memory metrics collection and reporting.
nvlink
Package nvlink provides the NVIDIA nvlink metrics collection and reporting.
Package nvlink provides the NVIDIA nvlink metrics collection and reporting.
power
Package power provides the NVIDIA power usage metrics collection and reporting.
Package power provides the NVIDIA power usage metrics collection and reporting.
processes
Package processes provides the NVIDIA processes metrics collection and reporting.
Package processes provides the NVIDIA processes metrics collection and reporting.
temperature
Package temperature provides the NVIDIA temperature metrics collection and reporting.
Package temperature provides the NVIDIA temperature metrics collection and reporting.
utilization
Package utilization provides the NVIDIA GPU utilization metrics collection and reporting.
Package utilization provides the NVIDIA GPU utilization metrics collection and reporting.
Package nvml implements the NVIDIA Management Library (NVML) interface.
Package nvml implements the NVIDIA Management Library (NVML) interface.
Package peermem contains the implementation of the peermem query for NVIDIA GPUs.
Package peermem contains the implementation of the peermem query for NVIDIA GPUs.
Package sxid provides the NVIDIA SXID error details.
Package sxid provides the NVIDIA SXID error details.
Package xid provides the NVIDIA XID error details.
Package xid provides the NVIDIA XID error details.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL