Documentation ¶
Index ¶
- Variables
- func AverageGPUUtilization(uuid string, since time.Duration) (uint, error)
- func DeviceFromName(deviceName string) (*nvml.Device, error)
- func DiscoverGPUDevices() error
- func GetAllGpuDevices() map[string]*nvml.Device
- func GetDevicesForAllContainers() (map[ContainerID][]string, error)
- type ContainerID
- type MetricServer
Constants ¶
This section is empty.
Variables ¶
var (
	// DutyCycleNodeGpu reports the percent of time when the GPU was actively processing per Node.
	DutyCycleNodeGpu = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "duty_cycle_gpu_node",
			Help: "Percent of time when the GPU was actively processing",
		},
		[]string{"make", "accelerator_id", "model"})

	// MemoryTotalNodeGpu reports the total memory available on the GPU per Node.
	MemoryTotalNodeGpu = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "memory_total_gpu_node",
			Help: "Total memory available on the GPU in bytes",
		},
		[]string{"make", "accelerator_id", "model"})

	// MemoryUsedNodeGpu reports GPU memory allocated per Node.
	MemoryUsedNodeGpu = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "memory_used_gpu_node",
			Help: "Allocated GPU memory in bytes",
		},
		[]string{"make", "accelerator_id", "model"})

	// DutyCycle reports the percent of time when the GPU was actively processing per container.
	DutyCycle = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "duty_cycle",
			Help: "Percent of time when the GPU was actively processing",
		},
		[]string{"namespace", "pod", "container", "make", "accelerator_id", "model"})

	// MemoryTotal reports the total memory available on the GPU per container.
	MemoryTotal = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "memory_total",
			Help: "Total memory available on the GPU in bytes",
		},
		[]string{"namespace", "pod", "container", "make", "accelerator_id", "model"})

	// MemoryUsed reports GPU memory allocated per container.
	MemoryUsed = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "memory_used",
			Help: "Allocated GPU memory in bytes",
		},
		[]string{"namespace", "pod", "container", "make", "accelerator_id", "model"})

	// AcceleratorRequests reports the number of GPU devices requested by the container.
	AcceleratorRequests = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "request",
			Help: "Number of accelerator devices requested by the container",
		},
		[]string{"namespace", "pod", "container", "resource_name"})
)
Functions ¶
func AverageGPUUtilization ¶
func AverageGPUUtilization(uuid string, since time.Duration) (uint, error)
AverageGPUUtilization reports the average GPU utilization over the last 10 seconds.
func DeviceFromName ¶
func DeviceFromName(deviceName string) (*nvml.Device, error)
DeviceFromName returns the device object for a given device name.
func DiscoverGPUDevices ¶
func DiscoverGPUDevices() error
DiscoverGPUDevices discovers GPUs attached to the node, and updates `gpuDevices` map.
func GetAllGpuDevices ¶
func GetAllGpuDevices() map[string]*nvml.Device
func GetDevicesForAllContainers ¶
func GetDevicesForAllContainers() (map[ContainerID][]string, error)
GetDevicesForAllContainers returns a map with container as the key and the list of devices allocated to that container as the value. It will skip time-shared GPU devices when time-sharing solution is enabled.
Types ¶
type ContainerID ¶
type ContainerID struct {
// contains filtered or unexported fields
}
ContainerID uniquely identifies a container.
type MetricServer ¶
type MetricServer struct {
// contains filtered or unexported fields
}
MetricServer exposes GPU metrics for all containers and nodes in prometheus format on the specified port.
func NewMetricServer ¶
func NewMetricServer(collectionInterval, port int, metricsEndpointPath string) *MetricServer
func (*MetricServer) Start ¶
func (m *MetricServer) Start() error
Start performs necessary initializations and starts the metric server.
func (*MetricServer) Stop ¶
func (m *MetricServer) Stop()
Stop performs cleanup operations and stops the metric server.