vgpu

package
v1.11.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jan 24, 2025 License: Apache-2.0 Imports: 20 Imported by: 0

Documentation

Index

Constants

View Source
// Prometheus naming constants shared by the metrics declared in this package.
const (
	// VolcanoNamespace - namespace in prometheus used by volcano.
	// Note that the gauges in this package pass it as GaugeOpts.Subsystem
	// (not Namespace); the value still ends up as the "volcano" prefix of
	// the metric name.
	VolcanoNamespace = "volcano"

	// OnSessionOpen label
	// NOTE(review): presumably the label value for work done when a
	// scheduling session opens — confirm against the code that uses it.
	OnSessionOpen = "OnSessionOpen"

	// OnSessionClose label
	// NOTE(review): presumably the counterpart label value for session
	// close — confirm against the code that uses it.
	OnSessionClose = "OnSessionClose"
)
View Source
// Keys, resource names, and defaults used by the vgpu package.
const (
	// The eight keys below live in the "nvidia.com/" and "volcano.sh/"
	// domains. NOTE(review): judging only by the identifier names they are
	// annotation keys carrying vGPU scheduling state (GPU type include /
	// exclude filters, assignment time, assigned device IDs, devices still
	// to allocate, assigned node, bind time, and bind phase) — confirm
	// against the code that reads and writes them.
	GPUInUse                         = "nvidia.com/use-gputype"
	GPUNoUse                         = "nvidia.com/nouse-gputype"
	AssignedTimeAnnotations          = "volcano.sh/vgpu-time"
	AssignedIDsAnnotations           = "volcano.sh/vgpu-ids-new"
	AssignedIDsToAllocateAnnotations = "volcano.sh/devices-to-allocate"
	AssignedNodeAnnotations          = "volcano.sh/vgpu-node"
	BindTimeAnnotations              = "volcano.sh/bind-time"
	DeviceBindPhase                  = "volcano.sh/bind-phase"

	// NvidiaGPUDevice is the vendor string for NVIDIA devices.
	// NOTE(review): presumably compared against device Type values — verify.
	NvidiaGPUDevice = "NVIDIA"

	// VolcanoVGPUMemory extended gpu memory
	VolcanoVGPUMemory = "volcano.sh/vgpu-memory"
	// VolcanoVGPUMemoryPercentage is the percentage-based counterpart of
	// VolcanoVGPUMemory.
	// NOTE(review): presumably a percentage of one card's memory — confirm.
	VolcanoVGPUMemoryPercentage = "volcano.sh/vgpu-memory-percentage"
	// VolcanoVGPUCores indicates utilization percentage of vgpu
	VolcanoVGPUCores = "volcano.sh/vgpu-cores"
	// VolcanoVGPUNumber virtual GPU card number
	VolcanoVGPUNumber = "volcano.sh/vgpu-number"
	// VolcanoVGPURegister virtual gpu information registered from device-plugin to scheduler
	VolcanoVGPURegister = "volcano.sh/node-vgpu-register"
	// VolcanoVGPUHandshake for vgpu
	// NOTE(review): presumably the handshake key exchanged between the
	// device-plugin and the scheduler — confirm.
	VolcanoVGPUHandshake = "volcano.sh/node-vgpu-handshake"

	// PredicateTime is the key of predicate time
	PredicateTime = "volcano.sh/predicate-time"
	// GPUIndex is the key of gpu index
	GPUIndex = "volcano.sh/gpu-index"

	// UnhealthyGPUIDs list of unhealthy gpu ids
	UnhealthyGPUIDs = "volcano.sh/gpu-unhealthy-ids"

	// DeviceName used to indicate this device
	DeviceName = "hamivgpu"

	// DefaultMemPercentage is the default memory-percentage value.
	// NOTE(review): 101 lies outside the 0-100 range, so it presumably acts
	// as a "no percentage requested" sentinel — confirm with the code that
	// fills ContainerDeviceRequest.MemPercentagereq.
	DefaultMemPercentage = 101
)

Variables

View Source
// Prometheus gauges describing vGPU usage. All of them are created through
// promauto, and all use VolcanoNamespace ("volcano") as the Subsystem, so the
// exported metric names start with "volcano_".
var (
	// VGPUDevicesSharedNumber reports, per device ("devID") and node
	// ("NodeName"), the number of vgpu tasks sharing the card.
	VGPUDevicesSharedNumber = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: VolcanoNamespace,
			Name:      "vgpu_device_shared_number",
			Help:      "The number of vgpu tasks sharing this card",
		},
		[]string{"devID", "NodeName"},
	)
	// VGPUDevicesAllocatedMemory reports, per device and node, the amount
	// of vgpu memory allocated on the card.
	VGPUDevicesAllocatedMemory = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: VolcanoNamespace,
			Name:      "vgpu_device_allocated_memory",
			Help:      "The number of vgpu memory allocated in this card",
		},
		[]string{"devID", "NodeName"},
	)
	// VGPUDevicesAllocatedCores reports, per device and node, the
	// percentage of GPU compute cores allocated on the card.
	VGPUDevicesAllocatedCores = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: VolcanoNamespace,
			Name:      "vgpu_device_allocated_cores",
			Help:      "The percentage of gpu compute cores allocated in this card",
		},
		[]string{"devID", "NodeName"},
	)
	// VGPUDevicesMemoryTotal reports, per device and node, the total device
	// memory of the card. Note the exported name ends in "memory_limit"
	// while the Go identifier says "MemoryTotal".
	VGPUDevicesMemoryTotal = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: VolcanoNamespace,
			Name:      "vgpu_device_memory_limit",
			Help:      "The number of total device memory in this card",
		},
		[]string{"devID", "NodeName"},
	)
	// VGPUPodMemoryAllocated reports the vgpu device memory allocated to a
	// specific pod; it carries an extra "podName" label.
	VGPUPodMemoryAllocated = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: VolcanoNamespace,
			Name:      "vgpu_device_memory_allocation_for_a_certain_pod",
			Help:      "The vgpu device memory allocated for a certain pod",
		},
		[]string{"devID", "NodeName", "podName"},
	)
	// VGPUPodCoreAllocated reports the vgpu device cores allocated to a
	// specific pod; it carries an extra "podName" label.
	VGPUPodCoreAllocated = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: VolcanoNamespace,
			Name:      "vgpu_device_core_allocation_for_a_certain_pod",
			Help:      "The vgpu device core allocated for a certain pod",
		},
		[]string{"devID", "NodeName", "podName"},
	)
)
View Source
// NodeLockEnable is a package-level switch; its zero value is false.
// NOTE(review): presumably enables node-level locking while devices are
// being allocated — confirm against the code that sets and reads it.
var NodeLockEnable bool
View Source
// VGPUEnable is a package-level switch; its zero value is false.
// NOTE(review): presumably turns vGPU scheduling support on or off —
// confirm against the code that sets and reads it.
var VGPUEnable bool

Functions

func NewClient

func NewClient() (kubernetes.Interface, error)

NewClient connects to an API server

func ResetDeviceMetrics added in v1.10.0

func ResetDeviceMetrics(UUID string, nodeName string, memory float64)

Types

type ContainerDevice

// ContainerDevice records one device together with the memory and cores used
// on it.
// NOTE(review): presumably one entry per device assigned to a container —
// confirm with the allocation code.
type ContainerDevice struct {
	// UUID of the device.
	UUID string
	// Type of the device.
	Type string
	// Usedmem is the device memory used (unit not visible here — TODO confirm).
	Usedmem int32
	// Usedcores is the amount of cores used.
	// NOTE(review): presumably a percentage, like VolcanoVGPUCores — confirm.
	Usedcores int32
}

type ContainerDeviceRequest

// ContainerDeviceRequest describes a device request: how many devices of
// which type, and how much memory and how many cores are asked for.
// NOTE(review): presumably built per container from its resource limits —
// confirm with the code that constructs it.
type ContainerDeviceRequest struct {
	// Nums is the number of devices requested.
	Nums int32
	// Type is the requested device type.
	Type string
	// Memreq is the requested device memory (unit not visible here — TODO confirm).
	Memreq int32
	// MemPercentagereq is the requested device memory as a percentage.
	// NOTE(review): likely defaults to DefaultMemPercentage when unset — verify.
	MemPercentagereq int32
	// Coresreq is the requested amount of cores.
	Coresreq int32
}

type ContainerDevices

// ContainerDevices is a list of ContainerDevice values.
// NOTE(review): presumably all devices belonging to a single container —
// confirm with callers.
type ContainerDevices []ContainerDevice

type GPUDevice

// GPUDevice describes a single GPU card: its identity, capacity, health, and
// the usage of the pods that are sharing it.
type GPUDevice struct {
	// GPU ID
	ID int
	// Node this GPU device belongs to
	Node string
	// GPU Unique ID
	UUID string
	// The resource usage by pods that are sharing this GPU.
	// NOTE(review): the map key is presumably a pod identifier — confirm.
	PodMap map[string]*GPUUsage
	// memory per card (unit not visible here — TODO confirm)
	Memory uint
	// max sharing number
	Number uint
	// type of this device (the earlier comment said "number", which looks
	// like a typo — confirm)
	Type string
	// Health condition of this GPU
	Health bool
	// number of allocated
	// NOTE(review): presumably the count of tasks currently sharing the
	// card, bounded by Number — confirm.
	UsedNum uint
	// number of device memory allocated
	UsedMem uint
	// number of core used
	UsedCore uint
}

GPUDevice includes the GPU ID, memory, and the pods that are sharing it.

func NewGPUDevice

func NewGPUDevice(id int, mem uint) *GPUDevice

NewGPUDevice creates a device

type GPUDevices

// GPUDevices groups a set of GPUDevice entries under one name and caches a
// scheduling score for them.
// NOTE(review): presumably one GPUDevices value per node, since
// NewGPUDevices takes a *v1.Node — confirm.
type GPUDevices struct {
	// Name of this device set.
	// NOTE(review): likely the node name — verify against NewGPUDevices.
	Name string

	// We cache score in filter step according to schedulePolicy, to avoid recalculating in score
	Score float64

	// Device maps an integer key to its GPUDevice.
	// NOTE(review): the key is presumably the GPU ID/index — confirm.
	Device map[int]*GPUDevice
}

func NewGPUDevices

func NewGPUDevices(name string, node *v1.Node) *GPUDevices

func (*GPUDevices) AddPodMetrics added in v1.10.0

func (gs *GPUDevices) AddPodMetrics(index int, PodName string)

func (*GPUDevices) AddResource

func (gs *GPUDevices) AddResource(pod *v1.Pod)

AddResource adds the pod to the GPU pool if it is assigned.

func (*GPUDevices) Allocate

func (gs *GPUDevices) Allocate(kubeClient kubernetes.Interface, pod *v1.Pod) error

func (*GPUDevices) FilterNode

func (gs *GPUDevices) FilterNode(pod *v1.Pod, schedulePolicy string) (int, string, error)

func (*GPUDevices) GetIgnoredDevices

func (gs *GPUDevices) GetIgnoredDevices() []string

func (*GPUDevices) GetStatus

func (gs *GPUDevices) GetStatus() string

func (*GPUDevices) HasDeviceRequest

func (gs *GPUDevices) HasDeviceRequest(pod *v1.Pod) bool

func (*GPUDevices) Release

func (gs *GPUDevices) Release(kubeClient kubernetes.Interface, pod *v1.Pod) error

func (*GPUDevices) ScoreNode added in v1.9.0

func (gs *GPUDevices) ScoreNode(pod *v1.Pod, schedulePolicy string) float64

func (*GPUDevices) SubPodMetrics added in v1.10.0

func (gs *GPUDevices) SubPodMetrics(index int, PodName string)

func (*GPUDevices) SubResource

func (gs *GPUDevices) SubResource(pod *v1.Pod)

SubResource frees the GPU held by the pod.

type GPUUsage added in v1.10.0

// GPUUsage holds the memory and core usage of one entry in
// GPUDevice.PodMap, i.e. the usage of a pod sharing that GPU.
type GPUUsage struct {
	// UsedMem is the device memory used (unit not visible here — TODO confirm).
	UsedMem uint
	// UsedCore is the amount of cores used.
	UsedCore uint
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL