Documentation ¶
Index ¶
- Constants
- Variables
- func AddEntityToGroup(groupId GroupHandle, entityGroupId Field_Entity_Group, entityId uint) (err error)
- func AddToGroup(groupId GroupHandle, gpuId uint) (err error)
- func DestroyGroup(groupId GroupHandle) (err error)
- func FieldGroupDestroy(fieldsGroup FieldHandle) (err error)
- func FieldsInit() int
- func FieldsTerm() int
- func FindFirstNonAsciiIndex(value [4096]byte) int
- func Fv2_Blob(fv FieldValue_v2) [4096]byte
- func Fv2_Float64(fv FieldValue_v2) float64
- func Fv2_Int64(fv FieldValue_v2) int64
- func Fv2_String(fv FieldValue_v2) string
- func GetAllDeviceCount() (uint, error)
- func GetSupportedDevices() ([]uint, error)
- func Init(m mode, args ...string) (cleanup func(), err error)
- func IsInt32Blank(value int) bool
- func IsInt64Blank(value int64) bool
- func Policy(gpuId uint, typ ...policyCondition) (<-chan PolicyViolation, error)
- func Shutdown() (err error)
- func UpdateAllFields() error
- func ViolationRegistration(data unsafe.Pointer) int
- func WatchFieldsWithGroup(fieldsGroup FieldHandle, group GroupHandle) error
- type ClockInfo
- type DcgmStatus
- type Device
- type DeviceHealth
- type DeviceIdentifiers
- type DeviceStatus
- type ECCErrorsInfo
- type FieldHandle
- type FieldMeta
- type FieldValue_v1
- type FieldValue_v2
- type Field_Entity_Group
- type GroupEntityPair
- type GroupHandle
- type MemoryInfo
- type MetricGroup
- type MigEntityInfo
- type MigHierarchyInfo_v2
- type MigHierarchy_v2
- type P2PLink
- type P2PLinkType
- type PCIInfo
- type PCIStatusInfo
- type PCIThroughputInfo
- type PerfState
- type PolicyViolation
- type ProcessInfo
- type ProcessUtilInfo
- type Short
- type SystemWatch
- type Time
- type UtilizationInfo
- type ViolationTime
- type XIDErrorInfo
Constants ¶
const ( Embedded mode = iota Standalone StartHostengine )
Constants for the DCGM hostengine running modes: Embedded, Standalone, or StartHostengine.
const ( DCGM_FT_BINARY = uint('b') DCGM_FT_DOUBLE = uint('d') DCGM_FT_INT64 = uint('i') DCGM_FT_STRING = uint('s') DCGM_FT_TIMESTAMP = uint('t') DCGM_FT_INT32_BLANK = int64(2147483632) DCGM_FT_INT32_NOT_FOUND = int64(DCGM_FT_INT32_BLANK + 1) DCGM_FT_INT32_NOT_SUPPORTED = int64(DCGM_FT_INT32_BLANK + 2) DCGM_FT_INT32_NOT_PERMISSIONED = int64(DCGM_FT_INT32_BLANK + 3) DCGM_FT_INT64_BLANK = int64(9223372036854775792) DCGM_FT_INT64_NOT_FOUND = int64(DCGM_FT_INT64_BLANK + 1) DCGM_FT_INT64_NOT_SUPPORTED = int64(DCGM_FT_INT64_BLANK + 2) DCGM_FT_INT64_NOT_PERMISSIONED = int64(DCGM_FT_INT64_BLANK + 3) DCGM_FT_FP64_BLANK = 140737488355328.0 DCGM_FT_FP64_NOT_FOUND = float64(DCGM_FT_FP64_BLANK + 1.0) DCGM_FT_FP64_NOT_SUPPORTED = float64(DCGM_FT_FP64_BLANK + 2.0) DCGM_FT_FP64_NOT_PERMISSIONED = float64(DCGM_FT_FP64_BLANK + 3.0) DCGM_FT_STR_BLANK = "<<<NULL>>>" DCGM_FT_STR_NOT_FOUND = "<<<NOT_FOUND>>>" DCGM_FT_STR_NOT_SUPPORTED = "<<<NOT_SUPPORTED>>>" DCGM_FT_STR_NOT_PERMISSIONED = "<<<NOT_PERM>>>" DCGM_FI_UNKNOWN = 0 DCGM_FI_DRIVER_VERSION = 1 DCGM_FI_NVML_VERSION = 2 DCGM_FI_PROCESS_NAME = 3 DCGM_FI_DEV_COUNT = 4 DCGM_FI_DEV_NAME = 50 DCGM_FI_DEV_BRAND = 51 DCGM_FI_DEV_NVML_INDEX = 52 DCGM_FI_DEV_SERIAL = 53 DCGM_FI_DEV_UUID = 54 DCGM_FI_DEV_MINOR_NUMBER = 55 DCGM_FI_DEV_OEM_INFOROM_VER = 56 DCGM_FI_DEV_PCI_BUSID = 57 DCGM_FI_DEV_PCI_COMBINED_ID = 58 DCGM_FI_DEV_PCI_SUBSYS_ID = 59 DCGM_FI_GPU_TOPOLOGY_PCI = 60 DCGM_FI_GPU_TOPOLOGY_NVLINK = 61 DCGM_FI_GPU_TOPOLOGY_AFFINITY = 62 DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY = 63 DCGM_FI_DEV_COMPUTE_MODE = 65 DCGM_FI_DEV_CPU_AFFINITY_0 = 70 DCGM_FI_DEV_CPU_AFFINITY_1 = 71 DCGM_FI_DEV_CPU_AFFINITY_2 = 72 DCGM_FI_DEV_CPU_AFFINITY_3 = 73 DCGM_FI_DEV_ECC_INFOROM_VER = 80 DCGM_FI_DEV_POWER_INFOROM_VER = 81 DCGM_FI_DEV_INFOROM_IMAGE_VER = 82 DCGM_FI_DEV_INFOROM_CONFIG_CHECK = 83 DCGM_FI_DEV_INFOROM_CONFIG_VALID = 84 DCGM_FI_DEV_VBIOS_VERSION = 85 DCGM_FI_DEV_BAR1_TOTAL = 90 DCGM_FI_SYNC_BOOST = 91 DCGM_FI_DEV_BAR1_USED = 92 DCGM_FI_DEV_BAR1_FREE = 
93 DCGM_FI_DEV_SM_CLOCK = 100 DCGM_FI_DEV_MEM_CLOCK = 101 DCGM_FI_DEV_VIDEO_CLOCK = 102 DCGM_FI_DEV_APP_SM_CLOCK = 110 DCGM_FI_DEV_APP_MEM_CLOCK = 111 DCGM_FI_DEV_CLOCK_THROTTLE_REASONS = 112 DCGM_FI_DEV_MAX_SM_CLOCK = 113 DCGM_FI_DEV_MAX_MEM_CLOCK = 114 DCGM_FI_DEV_MAX_VIDEO_CLOCK = 115 DCGM_FI_DEV_AUTOBOOST = 120 DCGM_FI_DEV_SUPPORTED_CLOCKS = 130 DCGM_FI_DEV_MEMORY_TEMP = 140 DCGM_FI_DEV_GPU_TEMP = 150 DCGM_FI_DEV_POWER_USAGE = 155 DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION = 156 DCGM_FI_DEV_SLOWDOWN_TEMP = 158 DCGM_FI_DEV_SHUTDOWN_TEMP = 159 DCGM_FI_DEV_POWER_MGMT_LIMIT = 160 DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN = 161 DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX = 162 DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF = 163 DCGM_FI_DEV_ENFORCED_POWER_LIMIT = 164 DCGM_FI_DEV_PSTATE = 190 DCGM_FI_DEV_FAN_SPEED = 191 DCGM_FI_DEV_PCIE_TX_THROUGHPUT = 200 DCGM_FI_DEV_PCIE_RX_THROUGHPUT = 201 DCGM_FI_DEV_PCIE_REPLAY_COUNTER = 202 DCGM_FI_DEV_GPU_UTIL = 203 DCGM_FI_DEV_MEM_COPY_UTIL = 204 DCGM_FI_DEV_ACCOUNTING_DATA = 205 DCGM_FI_DEV_ENC_UTIL = 206 DCGM_FI_DEV_DEC_UTIL = 207 DCGM_FI_DEV_MEM_COPY_UTIL_SAMPLES = 210 DCGM_FI_DEV_GPU_UTIL_SAMPLES = 211 DCGM_FI_DEV_GRAPHICS_PIDS = 220 DCGM_FI_DEV_COMPUTE_PIDS = 221 DCGM_FI_DEV_XID_ERRORS = 230 DCGM_FI_DEV_PCIE_MAX_LINK_GEN = 235 DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH = 236 DCGM_FI_DEV_PCIE_LINK_GEN = 237 DCGM_FI_DEV_PCIE_LINK_WIDTH = 238 DCGM_FI_DEV_POWER_VIOLATION = 240 DCGM_FI_DEV_THERMAL_VIOLATION = 241 DCGM_FI_DEV_SYNC_BOOST_VIOLATION = 242 DCGM_FI_DEV_BOARD_LIMIT_VIOLATION = 243 DCGM_FI_DEV_LOW_UTIL_VIOLATION = 244 DCGM_FI_DEV_RELIABILITY_VIOLATION = 245 DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION = 246 DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION = 247 DCGM_FI_DEV_FB_TOTAL = 250 DCGM_FI_DEV_FB_FREE = 251 DCGM_FI_DEV_FB_USED = 252 DCGM_FI_DEV_ECC_CURRENT = 300 DCGM_FI_DEV_ECC_PENDING = 301 DCGM_FI_DEV_ECC_SBE_VOL_TOTAL = 310 DCGM_FI_DEV_ECC_DBE_VOL_TOTAL = 311 DCGM_FI_DEV_ECC_SBE_AGG_TOTAL = 312 DCGM_FI_DEV_ECC_DBE_AGG_TOTAL = 313 DCGM_FI_DEV_ECC_SBE_VOL_L1 = 314 
DCGM_FI_DEV_ECC_DBE_VOL_L1 = 315 DCGM_FI_DEV_ECC_SBE_VOL_L2 = 316 DCGM_FI_DEV_ECC_DBE_VOL_L2 = 317 DCGM_FI_DEV_ECC_SBE_VOL_DEV = 318 DCGM_FI_DEV_ECC_DBE_VOL_DEV = 319 DCGM_FI_DEV_ECC_SBE_VOL_REG = 320 DCGM_FI_DEV_ECC_DBE_VOL_REG = 321 DCGM_FI_DEV_ECC_SBE_VOL_TEX = 322 DCGM_FI_DEV_ECC_DBE_VOL_TEX = 323 DCGM_FI_DEV_ECC_SBE_AGG_L1 = 324 DCGM_FI_DEV_ECC_DBE_AGG_L1 = 325 DCGM_FI_DEV_ECC_SBE_AGG_L2 = 326 DCGM_FI_DEV_ECC_DBE_AGG_L2 = 327 DCGM_FI_DEV_ECC_SBE_AGG_DEV = 328 DCGM_FI_DEV_ECC_DBE_AGG_DEV = 329 DCGM_FI_DEV_ECC_SBE_AGG_REG = 330 DCGM_FI_DEV_ECC_DBE_AGG_REG = 331 DCGM_FI_DEV_ECC_SBE_AGG_TEX = 332 DCGM_FI_DEV_ECC_DBE_AGG_TEX = 333 DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS = 393 DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS = 394 DCGM_FI_DEV_ROW_REMAP_FAILURE = 395 DCGM_FI_DEV_RETIRED_SBE = 390 DCGM_FI_DEV_RETIRED_DBE = 391 DCGM_FI_DEV_RETIRED_PENDING = 392 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 = 400 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 = 401 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 = 402 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 = 403 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 = 404 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 = 405 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL = 409 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 = 410 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 = 411 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 = 412 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 = 413 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 = 414 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 = 415 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL = 419 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 = 420 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 = 421 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 = 422 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 = 423 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 = 424 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 = 425 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL = 429 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 = 430 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 = 431 
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 = 432 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 = 433 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 = 434 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 = 435 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL = 439 DCGM_FI_DEV_NVLINK_BANDWIDTH_L0 = 440 DCGM_FI_DEV_NVLINK_BANDWIDTH_L1 = 441 DCGM_FI_DEV_NVLINK_BANDWIDTH_L2 = 442 DCGM_FI_DEV_NVLINK_BANDWIDTH_L3 = 443 DCGM_FI_DEV_NVLINK_BANDWIDTH_L4 = 444 DCGM_FI_DEV_NVLINK_BANDWIDTH_L5 = 445 DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL = 449 DCGM_FI_DEV_GPU_NVLINK_ERRORS = 450 DCGM_FI_DEV_VIRTUAL_MODE = 500 DCGM_FI_DEV_SUPPORTED_TYPE_INFO = 501 DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS = 502 DCGM_FI_DEV_VGPU_INSTANCE_IDS = 503 DCGM_FI_DEV_VGPU_UTILIZATIONS = 504 DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION = 505 DCGM_FI_DEV_ENC_STATS = 506 DCGM_FI_DEV_FBC_STATS = 507 DCGM_FI_DEV_FBC_SESSIONS_INFO = 508 DCGM_FI_DEV_VGPU_VM_ID = 520 DCGM_FI_DEV_VGPU_VM_NAME = 521 DCGM_FI_DEV_VGPU_TYPE = 522 DCGM_FI_DEV_VGPU_UUID = 523 DCGM_FI_DEV_VGPU_DRIVER_VERSION = 524 DCGM_FI_DEV_VGPU_MEMORY_USAGE = 525 DCGM_FI_DEV_VGPU_LICENSE_STATUS = 526 DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT = 527 DCGM_FI_DEV_VGPU_ENC_STATS = 528 DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO = 529 DCGM_FI_DEV_VGPU_FBC_STATS = 530 DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO = 531 DCGM_FI_FIRST_VGPU_FIELD_ID = 520 DCGM_FI_LAST_VGPU_FIELD_ID = 570 DCGM_FI_INTERNAL_FIELDS_0_START = 600 DCGM_FI_INTERNAL_FIELDS_0_END = 699 DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P00 = 700 DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P00 = 701 DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P00 = 702 DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P00 = 703 DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P01 = 704 DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P01 = 705 DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P01 = 706 DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P01 = 707 DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P02 = 708 DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P02 = 709 DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P02 = 710 DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P02 = 711 DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P03 = 712 
DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P03 = 713 DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P03 = 714 DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P03 = 715 DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P04 = 716 DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P04 = 717 DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P04 = 718 DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P04 = 719 DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P05 = 720 DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P05 = 721 DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P05 = 722 DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P05 = 723 DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P06 = 724 DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P06 = 725 DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P06 = 726 DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P06 = 727 DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P07 = 728 DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P07 = 729 DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P07 = 730 DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P07 = 731 DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P08 = 732 DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P08 = 733 DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P08 = 734 DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P08 = 735 DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P09 = 736 DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P09 = 737 DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P09 = 738 DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P09 = 739 DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P10 = 740 DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P10 = 741 DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P10 = 742 DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P10 = 743 DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P11 = 744 DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P11 = 745 DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P11 = 746 DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P11 = 747 DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P12 = 748 DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P12 = 749 DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P12 = 750 DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P12 = 751 DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P13 = 752 DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P13 = 753 DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P13 = 754 DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P13 = 755 DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P14 = 756 DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P14 = 757 DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P14 = 758 
DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P14 = 759 DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P15 = 760 DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P15 = 761 DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P15 = 762 DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P15 = 763 DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P16 = 764 DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P16 = 765 DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P16 = 766 DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P16 = 767 DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P17 = 768 DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P17 = 769 DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P17 = 770 DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P17 = 771 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P00 = 780 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P00 = 781 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P01 = 782 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P01 = 783 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P02 = 784 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P02 = 785 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P03 = 786 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P03 = 787 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P04 = 788 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P04 = 789 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P05 = 790 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P05 = 791 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P06 = 792 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P06 = 793 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P07 = 794 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P07 = 795 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P08 = 796 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P08 = 797 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P09 = 798 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P09 = 799 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P10 = 800 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P10 = 801 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P11 = 802 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P11 = 803 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P12 = 804 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P12 = 805 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P13 = 806 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P13 = 807 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P14 = 808 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P14 = 809 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P15 = 810 
DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P15 = 811 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P16 = 812 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P16 = 813 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P17 = 814 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P17 = 815 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P00 = 820 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P00 = 821 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P01 = 822 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P01 = 823 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P02 = 824 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P02 = 825 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P03 = 826 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P03 = 827 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P04 = 828 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P04 = 829 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P05 = 830 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P05 = 831 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P06 = 832 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P06 = 833 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P07 = 834 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P07 = 835 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P08 = 836 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P08 = 837 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P09 = 838 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P09 = 839 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P10 = 840 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P10 = 841 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P11 = 842 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P11 = 843 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P12 = 844 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P12 = 845 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P13 = 846 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P13 = 847 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P14 = 848 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P14 = 849 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P15 = 850 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P15 = 851 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P16 = 852 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P16 = 853 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P17 = 854 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P17 = 855 DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS = 856 DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS = 857 
DCGM_FI_FIRST_NVSWITCH_FIELD_ID = 700 DCGM_FI_LAST_NVSWITCH_FIELD_ID = 860 DCGM_FI_PROF_GR_ENGINE_ACTIVE = 1001 DCGM_FI_PROF_SM_ACTIVE = 1002 DCGM_FI_PROF_SM_OCCUPANCY = 1003 DCGM_FI_PROF_PIPE_TENSOR_ACTIVE = 1004 DCGM_FI_PROF_DRAM_ACTIVE = 1005 DCGM_FI_PROF_PIPE_FP64_ACTIVE = 1006 DCGM_FI_PROF_PIPE_FP32_ACTIVE = 1007 DCGM_FI_PROF_PIPE_FP16_ACTIVE = 1008 DCGM_FI_PROF_PCIE_TX_BYTES = 1009 DCGM_FI_PROF_PCIE_RX_BYTES = 1010 DCGM_FI_PROF_NVLINK_TX_BYTES = 1011 DCGM_FI_PROF_NVLINK_RX_BYTES = 1012 DCGM_FI_MAX_FIELDS = 1013 )
const ( PerfStateMax = 0 PerfStateMin = 15 PerfStateUnknown = 32 )
const ( MAX_NUM_DEVICES uint = C.DCGM_MAX_NUM_DEVICES MAX_HIERARCHY_INFO uint = C.DCGM_MAX_HIERARCHY_INFO )
const ( DbePolicy = policyCondition("Double-bit ECC error") PCIePolicy = policyCondition("PCI error") MaxRtPgPolicy = policyCondition("Max Retired Pages Limit") ThermalPolicy = policyCondition("Thermal Limit") PowerPolicy = policyCondition("Power Limit") NvlinkPolicy = policyCondition("Nvlink Error") XidPolicy = policyCondition("XID Error") )
const (
DCGM_FV_FLAG_LIVE_DATA = uint(0x00000001)
)
Variables ¶
var (
DCGM_FI = map[string]Short{}/* 345 elements not displayed */
)
var (
OLD_DCGM_FI = map[string]Short{
"dcgm_sm_clock": 100,
"dcgm_memory_clock": 101,
"dcgm_memory_temp": 140,
"dcgm_gpu_temp": 150,
"dcgm_power_usage": 155,
"dcgm_total_energy_consumption": 156,
"dcgm_pcie_tx_throughput": 200,
"dcgm_pcie_rx_throughput": 201,
"dcgm_pcie_replay_counter": 202,
"dcgm_gpu_utilization": 203,
"dcgm_mem_copy_utilization": 204,
"dcgm_enc_utilization": 206,
"dcgm_dec_utilization": 207,
"dcgm_xid_errors": 230,
"dcgm_power_violation": 240,
"dcgm_thermal_violation": 241,
"dcgm_sync_boost_violation": 242,
"dcgm_board_limit_violation": 243,
"dcgm_low_util_violation": 244,
"dcgm_reliability_violation": 245,
"dcgm_fb_free": 251,
"dcgm_fb_used": 252,
"dcgm_ecc_sbe_volatile_total": 310,
"dcgm_ecc_dbe_volatile_total": 311,
"dcgm_ecc_sbe_aggregate_total": 312,
"dcgm_ecc_dbe_aggregate_total": 313,
"dcgm_retired_pages_sbe": 390,
"dcgm_retired_pages_dbe": 391,
"dcgm_retired_pages_pending": 392,
"dcgm_nvlink_flit_crc_error_count_total": 409,
"dcgm_nvlink_data_crc_error_count_total": 419,
"dcgm_nvlink_replay_error_count_total": 429,
"dcgm_nvlink_recovery_error_count_total": 439,
"dcgm_nvlink_bandwidth_total": 449,
"dcgm_fi_prof_gr_engine_active": 1001,
"dcgm_fi_prof_sm_active": 1002,
"dcgm_fi_prof_sm_occupancy": 1003,
"dcgm_fi_prof_pipe_tensor_active": 1004,
"dcgm_fi_prof_dram_active": 1005,
"dcgm_fi_prof_pcie_tx_bytes": 1009,
"dcgm_fi_prof_pcie_rx_bytes": 1010,
}
)
Functions ¶
func AddEntityToGroup ¶
func AddEntityToGroup(groupId GroupHandle, entityGroupId Field_Entity_Group, entityId uint) (err error)
func AddToGroup ¶
func AddToGroup(groupId GroupHandle, gpuId uint) (err error)
func DestroyGroup ¶
func DestroyGroup(groupId GroupHandle) (err error)
func FieldGroupDestroy ¶
func FieldGroupDestroy(fieldsGroup FieldHandle) (err error)
func FieldsInit ¶
func FieldsInit() int
func FieldsTerm ¶
func FieldsTerm() int
func FindFirstNonAsciiIndex ¶
func Fv2_Blob ¶
func Fv2_Blob(fv FieldValue_v2) [4096]byte
func Fv2_Float64 ¶
func Fv2_Float64(fv FieldValue_v2) float64
func Fv2_Int64 ¶
func Fv2_Int64(fv FieldValue_v2) int64
func Fv2_String ¶
func Fv2_String(fv FieldValue_v2) string
func GetAllDeviceCount ¶
GetAllDeviceCount counts all GPUs on the system
func GetSupportedDevices ¶
GetSupportedDevices returns only DCGM supported GPUs
func Init ¶
Init starts DCGM based on the user-selected mode. DCGM can be started in 3 different modes: 1. Embedded: Start hostengine within this process. 2. Standalone: Connect to an already running nv-hostengine at the specified address. The connection address can be passed as command line args: -connect "IP:PORT/Socket" -socket "isSocket". 3. StartHostengine: Open a Unix socket to start and connect to the nv-hostengine and terminate before exiting.
func IsInt32Blank ¶
func IsInt64Blank ¶
func Policy ¶
func Policy(gpuId uint, typ ...policyCondition) (<-chan PolicyViolation, error)
Policy sets GPU usage and error policies and notifies in case of any violations via callback functions
func UpdateAllFields ¶
func UpdateAllFields() error
func ViolationRegistration ¶
ViolationRegistration is a Go callback function for dcgmPolicyRegister() wrapped in C.violationNotify().
func WatchFieldsWithGroup ¶
func WatchFieldsWithGroup(fieldsGroup FieldHandle, group GroupHandle) error
Types ¶
type DcgmStatus ¶
func Introspect ¶
func Introspect() (DcgmStatus, error)
Introspect returns DCGM hostengine memory and CPU usage
type Device ¶
type Device struct { GPU uint DCGMSupported string UUID string Power uint // W PCI PCIInfo Identifiers DeviceIdentifiers Topology []P2PLink CPUAffinity string }
func GetDeviceInfo ¶
GetDeviceInfo describes the given device
type DeviceHealth ¶
type DeviceHealth struct { GPU uint Status string Watches []SystemWatch }
func HealthCheckByGpuId ¶
func HealthCheckByGpuId(gpuId uint) (DeviceHealth, error)
HealthCheckByGpuId monitors GPU health for any errors/failures/warnings
type DeviceIdentifiers ¶
type DeviceStatus ¶
type DeviceStatus struct { Power float64 // W Temperature int64 // °C Utilization UtilizationInfo Memory MemoryInfo Clocks ClockInfo PCI PCIStatusInfo Performance PerfState FanSpeed int64 // % }
func GetDeviceStatus ¶
func GetDeviceStatus(gpuId uint) (DeviceStatus, error)
GetDeviceStatus monitors GPU status including its power, memory and GPU utilization
type ECCErrorsInfo ¶
type FieldHandle ¶
type FieldHandle struct {
// contains filtered or unexported fields
}
func FieldGroupCreate ¶
func FieldGroupCreate(fieldsGroupName string, fields []Short) (fieldsId FieldHandle, err error)
type FieldMeta ¶
type FieldMeta struct { FieldId Short FieldType byte Size byte Tag string Scope int NvmlFieldId int EntityLevel Field_Entity_Group }
func FieldGetById ¶
func ToFieldMeta ¶
func ToFieldMeta(fieldInfo C.dcgm_field_meta_p) FieldMeta
type FieldValue_v1 ¶
type FieldValue_v1 struct { Version uint FieldId uint FieldType uint Status int Ts int64 Value [4096]byte }
func EntityGetLatestValues ¶
func EntityGetLatestValues(entityGroup Field_Entity_Group, entityId uint, fields []Short) ([]FieldValue_v1, error)
func GetLatestValuesForFields ¶
func GetLatestValuesForFields(gpu uint, fields []Short) ([]FieldValue_v1, error)
func (FieldValue_v1) Blob ¶
func (fv FieldValue_v1) Blob() [4096]byte
func (FieldValue_v1) Float64 ¶
func (fv FieldValue_v1) Float64() float64
func (FieldValue_v1) Int64 ¶
func (fv FieldValue_v1) Int64() int64
func (FieldValue_v1) String ¶
func (fv FieldValue_v1) String() string
type FieldValue_v2 ¶
type FieldValue_v2 struct { Version uint EntityGroupId Field_Entity_Group EntityId uint FieldId uint FieldType uint Status int Ts int64 Value [4096]byte StringValue *string }
func EntitiesGetLatestValues ¶
func EntitiesGetLatestValues(entities []GroupEntityPair, fields []Short, flags uint) ([]FieldValue_v2, error)
type Field_Entity_Group ¶
type Field_Entity_Group uint
const ( FE_NONE Field_Entity_Group = iota FE_GPU FE_VGPU FE_SWITCH FE_GPU_I FE_GPU_CI FE_COUNT )
type GroupEntityPair ¶
type GroupEntityPair struct { EntityGroupId Field_Entity_Group EntityId uint }
type GroupHandle ¶
type GroupHandle struct {
// contains filtered or unexported fields
}
func CreateGroup ¶
func CreateGroup(groupName string) (goGroupId GroupHandle, err error)
func NewDefaultGroup ¶
func NewDefaultGroup(groupName string) (GroupHandle, error)
func WatchFields ¶
func WatchFields(gpuId uint, fieldsGroup FieldHandle, groupName string) (groupId GroupHandle, err error)
func WatchPidFields ¶
func WatchPidFields() (GroupHandle, error)
WatchPidFields lets DCGM start recording stats for GPU processes. It needs to be called before calling GetProcessInfo.
type MemoryInfo ¶
type MemoryInfo struct { GlobalUsed int64 ECCErrors ECCErrorsInfo }
type MetricGroup ¶
type MetricGroup struct {
// contains filtered or unexported fields
}
func GetSupportedMetricGroups ¶
func GetSupportedMetricGroups(grpid uint) ([]MetricGroup, error)
Get all of the profiling metric groups for a given GPU group.
type MigEntityInfo ¶
type MigHierarchyInfo_v2 ¶
type MigHierarchyInfo_v2 struct { Entity GroupEntityPair Parent GroupEntityPair Info MigEntityInfo }
type MigHierarchy_v2 ¶
type MigHierarchy_v2 struct { Version uint Count uint EntityList [C.DCGM_MAX_HIERARCHY_INFO]MigHierarchyInfo_v2 }
func GetGpuInstanceHierarchy ¶
func GetGpuInstanceHierarchy() (hierarchy MigHierarchy_v2, err error)
type P2PLink ¶
type P2PLink struct { GPU uint BusID string Link P2PLinkType }
func GetDeviceTopology ¶
GetDeviceTopology returns device topology corresponding to the gpuId
type P2PLinkType ¶
type P2PLinkType uint
const ( P2PLinkUnknown P2PLinkType = iota P2PLinkCrossCPU P2PLinkSameCPU P2PLinkHostBridge P2PLinkMultiSwitch P2PLinkSingleSwitch P2PLinkSameBoard SingleNVLINKLink TwoNVLINKLinks ThreeNVLINKLinks FourNVLINKLinks )
func (P2PLinkType) PCIPaths ¶
func (l P2PLinkType) PCIPaths() string
type PCIStatusInfo ¶
type PCIStatusInfo struct { BAR1Used int64 // MB Throughput PCIThroughputInfo FBUsed int64 }
type PCIThroughputInfo ¶
type PolicyViolation ¶
type ProcessInfo ¶
type ProcessInfo struct { GPU uint PID uint Name string ProcessUtilization ProcessUtilInfo PCI PCIStatusInfo Memory MemoryInfo GpuUtilization UtilizationInfo Clocks ClockInfo Violations ViolationTime XIDErrors XIDErrorInfo }
func GetProcessInfo ¶
func GetProcessInfo(group GroupHandle, pid uint) ([]ProcessInfo, error)
GetProcessInfo provides detailed per GPU stats for this process
type ProcessUtilInfo ¶
type SystemWatch ¶
type UtilizationInfo ¶
type ViolationTime ¶
type ViolationTime struct { Power *uint64 Thermal *uint64 Reliability *uint64 BoardLimit *uint64 LowUtilization *uint64 SyncBoost *uint64 }
ViolationTime measures the amount of time (in ms) the GPU was at reduced clocks.