vgpu

package

Version: v1.11.0 (Latest) | Go to latest | Published: Jan 24, 2025 | License: Apache-2.0 | Imports: 20 | Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/volcano-sh/volcano

Documentation ¶

Index ¶

Constants
Variables
func NewClient() (kubernetes.Interface, error)
func ResetDeviceMetrics(UUID string, nodeName string, memory float64)
type ContainerDevice
type ContainerDeviceRequest
type ContainerDevices
type GPUDevice
- func NewGPUDevice(id int, mem uint) *GPUDevice
type GPUDevices
- func NewGPUDevices(name string, node *v1.Node) *GPUDevices
type GPUUsage

Constants ¶

View Source

// String constants for the metrics namespace and the session open/close labels.
const (
	// VolcanoNamespace is the "volcano" prefix used for Prometheus metrics
	// exposed by volcano.
	// NOTE(review): despite the name, the gauge vectors declared in this
	// package pass it as GaugeOpts.Subsystem rather than GaugeOpts.Namespace.
	VolcanoNamespace = "volcano"

	// OnSessionOpen is the label string "OnSessionOpen".
	// NOTE(review): presumably marks work done when a scheduling session
	// opens; its consumers are not visible here — confirm.
	OnSessionOpen = "OnSessionOpen"

	// OnSessionClose is the label string "OnSessionClose".
	// NOTE(review): presumably the counterpart of OnSessionOpen for session
	// close — confirm against callers.
	OnSessionClose = "OnSessionClose"
)

View Source

// String keys and defaults used by the vgpu device implementation.
const (
	// The following keys live under the "nvidia.com/" and "volcano.sh/"
	// prefixes. NOTE(review): judging by the identifier names they are
	// annotation keys (GPU type selection, assignment time/IDs/node, bind time
	// and bind phase); the code that reads and writes them is not visible
	// here — confirm before relying on this description.
	GPUInUse                         = "nvidia.com/use-gputype"
	GPUNoUse                         = "nvidia.com/nouse-gputype"
	AssignedTimeAnnotations          = "volcano.sh/vgpu-time"
	AssignedIDsAnnotations           = "volcano.sh/vgpu-ids-new"
	AssignedIDsToAllocateAnnotations = "volcano.sh/devices-to-allocate"
	AssignedNodeAnnotations          = "volcano.sh/vgpu-node"
	BindTimeAnnotations              = "volcano.sh/bind-time"
	DeviceBindPhase                  = "volcano.sh/bind-phase"

	// NvidiaGPUDevice is the string "NVIDIA".
	// NOTE(review): presumably the vendor/type name compared against a
	// device's Type field — confirm against callers.
	NvidiaGPUDevice = "NVIDIA"

	// VolcanoVGPUMemory is the key for extended GPU memory.
	VolcanoVGPUMemory = "volcano.sh/vgpu-memory"
	// VolcanoVGPUMemoryPercentage is the key for extended GPU memory
	// expressed as a percentage.
	VolcanoVGPUMemoryPercentage = "volcano.sh/vgpu-memory-percentage"
	// VolcanoVGPUCores indicates the utilization percentage of a vgpu.
	VolcanoVGPUCores = "volcano.sh/vgpu-cores"
	// VolcanoVGPUNumber is the key for the virtual GPU card number.
	VolcanoVGPUNumber = "volcano.sh/vgpu-number"
	// VolcanoVGPURegister is the key for virtual GPU information registered
	// from the device-plugin to the scheduler.
	VolcanoVGPURegister = "volcano.sh/node-vgpu-register"
	// VolcanoVGPUHandshake is the handshake key for vgpu.
	VolcanoVGPUHandshake = "volcano.sh/node-vgpu-handshake"

	// PredicateTime is the key of predicate time.
	PredicateTime = "volcano.sh/predicate-time"
	// GPUIndex is the key of gpu index.
	GPUIndex = "volcano.sh/gpu-index"

	// UnhealthyGPUIDs is the key for the list of unhealthy gpu ids.
	UnhealthyGPUIDs = "volcano.sh/gpu-unhealthy-ids"

	// DeviceName is the name used to indicate this device.
	DeviceName = "hamivgpu"

	// DefaultMemPercentage is the default memory percentage value.
	// NOTE(review): 101 lies outside the 0-100 range, so this is presumably a
	// sentinel meaning "no percentage requested" — confirm against the code
	// that populates ContainerDeviceRequest.MemPercentagereq.
	DefaultMemPercentage = 101
)

Variables ¶

View Source

// Prometheus gauge vectors describing vgpu usage per card and per pod.
//
// Each gauge is created with promauto and sets only Subsystem
// (VolcanoNamespace, i.e. "volcano") and Name, so the exported metric name is
// the subsystem and name joined with an underscore, e.g.
// "volcano_vgpu_device_shared_number".
var (
	// VGPUDevicesSharedNumber reports the number of vgpu tasks sharing a
	// card, labeled by devID and NodeName.
	VGPUDevicesSharedNumber = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: VolcanoNamespace,
			Name:      "vgpu_device_shared_number",
			Help:      "The number of vgpu tasks sharing this card",
		},
		[]string{"devID", "NodeName"},
	)
	// VGPUDevicesAllocatedMemory reports the amount of vgpu memory allocated
	// on a card, labeled by devID and NodeName.
	VGPUDevicesAllocatedMemory = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: VolcanoNamespace,
			Name:      "vgpu_device_allocated_memory",
			Help:      "The number of vgpu memory allocated in this card",
		},
		[]string{"devID", "NodeName"},
	)
	// VGPUDevicesAllocatedCores reports the percentage of GPU compute cores
	// allocated on a card, labeled by devID and NodeName.
	VGPUDevicesAllocatedCores = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: VolcanoNamespace,
			Name:      "vgpu_device_allocated_cores",
			Help:      "The percentage of gpu compute cores allocated in this card",
		},
		[]string{"devID", "NodeName"},
	)
	// VGPUDevicesMemoryTotal reports the total device memory of a card,
	// labeled by devID and NodeName. Note that the metric name ends in
	// "memory_limit" while the variable is named "MemoryTotal".
	VGPUDevicesMemoryTotal = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: VolcanoNamespace,
			Name:      "vgpu_device_memory_limit",
			Help:      "The number of total device memory in this card",
		},
		[]string{"devID", "NodeName"},
	)
	// VGPUPodMemoryAllocated reports the vgpu device memory allocated for a
	// single pod, labeled by devID, NodeName and podName.
	VGPUPodMemoryAllocated = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: VolcanoNamespace,
			Name:      "vgpu_device_memory_allocation_for_a_certain_pod",
			Help:      "The vgpu device memory allocated for a certain pod",
		},
		[]string{"devID", "NodeName", "podName"},
	)
	// VGPUPodCoreAllocated reports the vgpu device cores allocated for a
	// single pod, labeled by devID, NodeName and podName.
	VGPUPodCoreAllocated = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: VolcanoNamespace,
			Name:      "vgpu_device_core_allocation_for_a_certain_pod",
			Help:      "The vgpu device core allocated for a certain pod",
		},
		[]string{"devID", "NodeName", "podName"},
	)
)

View Source

// NodeLockEnable is a package-level boolean switch (zero value false).
// NOTE(review): presumably toggles node-level locking during device
// allocation; where it is set and read is not visible here — confirm.
var NodeLockEnable bool

View Source

// VGPUEnable is a package-level boolean switch (zero value false).
// NOTE(review): presumably enables vgpu handling in the scheduler; where it
// is set and read is not visible here — confirm.
var VGPUEnable bool

Functions ¶

func NewClient ¶

func NewClient() (kubernetes.Interface, error)

NewClient connects to an API server

func ResetDeviceMetrics ¶ added in v1.10.0

func ResetDeviceMetrics(UUID string, nodeName string, memory float64)

Types ¶

type ContainerDevice ¶

// ContainerDevice records one device by UUID and type together with its used
// memory and used cores figures.
// NOTE(review): presumably one entry per device assigned to a container;
// the units of Usedmem and Usedcores are not visible here — confirm.
type ContainerDevice struct {
	// UUID identifies the device.
	UUID string
	// Type is the device type string.
	Type string
	// Usedmem is the amount of device memory used.
	Usedmem int32
	// Usedcores is the amount of device cores used.
	Usedcores int32
}

type ContainerDeviceRequest ¶

// ContainerDeviceRequest holds the device quantities requested: a device
// count, a device type, and memory, memory-percentage and cores requests.
// NOTE(review): presumably built per container from its resource requests;
// units and valid ranges are not visible here — confirm.
type ContainerDeviceRequest struct {
	// Nums is the number of devices requested.
	Nums int32
	// Type is the requested device type string.
	Type string
	// Memreq is the requested device memory.
	Memreq int32
	// MemPercentagereq is the requested device memory as a percentage.
	MemPercentagereq int32
	// Coresreq is the requested device cores.
	Coresreq int32
}

type ContainerDevices ¶

// ContainerDevices is a list of ContainerDevice entries.
type ContainerDevices []ContainerDevice

type GPUDevice ¶

// GPUDevice includes the GPU ID, its memory, and the pods that are sharing it.
type GPUDevice struct {
	// GPU ID
	ID int
	// Node this GPU device belongs to
	Node string
	// GPU unique ID
	UUID string
	// The resource usage of the pods that are sharing this GPU.
	// NOTE(review): the key is a string, presumably a pod identifier — confirm.
	PodMap map[string]*GPUUsage
	// Memory per card
	Memory uint
	// Max sharing number
	Number uint
	// Type of this GPU.
	// NOTE(review): the original comment read "type of this number", which
	// looks like a copy-paste slip — confirm.
	Type string
	// Health condition of this GPU
	Health bool
	// Number of allocations on this GPU
	UsedNum uint
	// Amount of device memory allocated
	UsedMem uint
	// Amount of cores used
	UsedCore uint
}

GPUDevice includes the GPU ID, its memory, and the pods that are sharing it.

func NewGPUDevice ¶

func NewGPUDevice(id int, mem uint) *GPUDevice

NewGPUDevice creates a device

type GPUDevices ¶

// GPUDevices groups a set of GPUDevice entries under a name, together with a
// cached score.
type GPUDevices struct {
	// Name of this device set.
	// NOTE(review): NewGPUDevices takes (name, node), so this is presumably
	// the node name — confirm.
	Name string

	// Score is cached in the filter step according to schedulePolicy, to
	// avoid recalculating it in the score step.
	Score float64

	// Device maps an int key to its GPUDevice.
	// NOTE(review): the key is presumably the GPU ID/index — confirm.
	Device map[int]*GPUDevice
}

func NewGPUDevices ¶

func NewGPUDevices(name string, node *v1.Node) *GPUDevices

func (*GPUDevices) AddPodMetrics ¶ added in v1.10.0

func (gs *GPUDevices) AddPodMetrics(index int, PodName string)

func (*GPUDevices) AddResource ¶

func (gs *GPUDevices) AddResource(pod *v1.Pod)

AddResource adds the pod to the GPU pool if it is assigned.

func (*GPUDevices) Allocate ¶

func (gs *GPUDevices) Allocate(kubeClient kubernetes.Interface, pod *v1.Pod) error

func (*GPUDevices) FilterNode ¶

func (gs *GPUDevices) FilterNode(pod *v1.Pod, schedulePolicy string) (int, string, error)

func (*GPUDevices) GetIgnoredDevices ¶

func (gs *GPUDevices) GetIgnoredDevices() []string

func (*GPUDevices) GetStatus ¶

func (gs *GPUDevices) GetStatus() string

func (*GPUDevices) HasDeviceRequest ¶

func (gs *GPUDevices) HasDeviceRequest(pod *v1.Pod) bool

func (*GPUDevices) Release ¶

func (gs *GPUDevices) Release(kubeClient kubernetes.Interface, pod *v1.Pod) error

func (*GPUDevices) ScoreNode ¶ added in v1.9.0

func (gs *GPUDevices) ScoreNode(pod *v1.Pod, schedulePolicy string) float64

func (*GPUDevices) SubPodMetrics ¶ added in v1.10.0

func (gs *GPUDevices) SubPodMetrics(index int, PodName string)

func (*GPUDevices) SubResource ¶

func (gs *GPUDevices) SubResource(pod *v1.Pod)

SubResource frees the GPU held by the pod.

type GPUUsage ¶ added in v1.10.0

// GPUUsage holds used memory and used core amounts. It is the value type of
// GPUDevice.PodMap, which tracks the resource usage of pods sharing a GPU.
type GPUUsage struct {
	// UsedMem is the amount of device memory used.
	UsedMem uint
	// UsedCore is the amount of cores used.
	UsedCore uint
}

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL