deviceshare

package
v1.6.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 17, 2025 License: Apache-2.0 Imports: 40 Imported by: 0

Documentation

Index

Constants

View Source
// Reason strings describing why a GPU device request cannot be satisfied.
// The messages prefixed with "node(s)" are phrased about nodes; the others
// describe the request or the available devices.
// NOTE(review): presumably surfaced as scheduling status reasons — the call
// sites are not visible in this listing; confirm before changing any text.
const (
	ErrNoGPURequirements                    = "No GPU Requirements"
	ErrInsufficientPartitionedDevice        = "Insufficient Partitioned GPU Devices"
	ErrInsufficientTopologyScopedGPUDevices = "Insufficient Topology Scoped GPU Devices"
	ErrInsufficientGPUDevices               = "Insufficient GPU Devices"
	ErrNodeMissingGPUPartitionTable         = "node(s) missing GPU Partition Table"
	ErrUnsupportedGPURequests               = "node(s) Unsupported number of GPU requests"
	ErrUnsupportedMultiSharedGPU            = "node(s) Unsupported Multi-Shared GPU"
	ErrNodeMissingGPUDeviceTopologyTree     = "node(s) missing GPU Device Topology Tree"
)
View Source
// Bit flags identifying the kinds of device resources a pod requests.
// Each constant is a distinct power of two (NvidiaGPU = 1, AMDGPU = 2, ...,
// RDMA = 512), so several kinds can be OR-ed into a single combination value.
// ResourceCombinationsMapper is keyed by such values, e.g. GPUCore | GPUMemory.
// The order of the names determines their numeric values; inserting or
// reordering entries changes every value after the edit point.
const (
	NvidiaGPU = 1 << iota
	AMDGPU
	HygonDCU
	KoordGPU
	GPUShared
	GPUCore
	GPUMemory
	GPUMemoryRatio
	FPGA
	RDMA
)
View Source
// ErrInsufficientNUMAScopedDevices is the reason string for a shortage of
// NUMA-scoped devices.
// NOTE(review): the code that reports this message is not visible here.
const (
	ErrInsufficientNUMAScopedDevices = "Insufficient NUMA Scoped Devices"
)
View Source
// Plugin identity.
const (
	// Name is the name of the plugin used in the plugin registry and configurations.
	// It is a runtime string: changing it changes how the plugin is referenced.
	Name = "DeviceShare"
)

Variables

View Source
var (
	// GPUPartitionIndexOfNVIDIAHopper is a static partition table for an
	// 8-GPU node (minors 0-7). The map key is the number of GPUs in each
	// partition (1, 2, 4 or 8); every key holds a single
	// PartitionsOfAllocationScore group. All partitions use apiext.GPUNVLink
	// and carry AllocationScore 1, as does each group.
	// GetDesignatedGPUPartitionIndexer selects this table for the node GPU
	// models "H100", "H800" and "H20".
	GPUPartitionIndexOfNVIDIAHopper = GPUPartitionIndexer{
		// Single-GPU partitions: one per minor, 0 through 7.
		1: []*PartitionsOfAllocationScore{
			{
				Partitions: []*apiext.GPUPartition{
					{
						Minors:          []int{0},
						GPULinkType:     apiext.GPUNVLink,
						MinorsHash:      hashMinors([]int{0}),
						AllocationScore: 1,
					},
					{
						Minors:          []int{1},
						GPULinkType:     apiext.GPUNVLink,
						MinorsHash:      hashMinors([]int{1}),
						AllocationScore: 1,
					},
					{
						Minors:          []int{2},
						GPULinkType:     apiext.GPUNVLink,
						MinorsHash:      hashMinors([]int{2}),
						AllocationScore: 1,
					},
					{
						Minors:          []int{3},
						GPULinkType:     apiext.GPUNVLink,
						MinorsHash:      hashMinors([]int{3}),
						AllocationScore: 1,
					},
					{
						Minors:          []int{4},
						GPULinkType:     apiext.GPUNVLink,
						MinorsHash:      hashMinors([]int{4}),
						AllocationScore: 1,
					},
					{
						Minors:          []int{5},
						GPULinkType:     apiext.GPUNVLink,
						MinorsHash:      hashMinors([]int{5}),
						AllocationScore: 1,
					},
					{
						Minors:          []int{6},
						GPULinkType:     apiext.GPUNVLink,
						MinorsHash:      hashMinors([]int{6}),
						AllocationScore: 1,
					},
					{
						Minors:          []int{7},
						GPULinkType:     apiext.GPUNVLink,
						MinorsHash:      hashMinors([]int{7}),
						AllocationScore: 1,
					},
				},
				AllocationScore: 1,
			},
		},
		// Two-GPU partitions: adjacent pairs (0,1), (2,3), (4,5), (6,7).
		2: []*PartitionsOfAllocationScore{
			{
				Partitions: []*apiext.GPUPartition{
					{
						Minors:          []int{0, 1},
						GPULinkType:     apiext.GPUNVLink,
						MinorsHash:      hashMinors([]int{0, 1}),
						AllocationScore: 1,
					},
					{
						Minors:          []int{2, 3},
						GPULinkType:     apiext.GPUNVLink,
						MinorsHash:      hashMinors([]int{2, 3}),
						AllocationScore: 1,
					},
					{
						Minors:          []int{4, 5},
						GPULinkType:     apiext.GPUNVLink,
						MinorsHash:      hashMinors([]int{4, 5}),
						AllocationScore: 1,
					},
					{
						Minors:          []int{6, 7},
						GPULinkType:     apiext.GPUNVLink,
						MinorsHash:      hashMinors([]int{6, 7}),
						AllocationScore: 1,
					},
				},
				AllocationScore: 1,
			},
		},
		// Four-GPU partitions: the two halves 0-3 and 4-7.
		4: []*PartitionsOfAllocationScore{
			{
				Partitions: []*apiext.GPUPartition{
					{
						Minors:          []int{0, 1, 2, 3},
						GPULinkType:     apiext.GPUNVLink,
						MinorsHash:      hashMinors([]int{0, 1, 2, 3}),
						AllocationScore: 1,
					},
					{
						Minors:          []int{4, 5, 6, 7},
						GPULinkType:     apiext.GPUNVLink,
						MinorsHash:      hashMinors([]int{4, 5, 6, 7}),
						AllocationScore: 1,
					},
				},
				AllocationScore: 1,
			},
		},
		// Eight-GPU partition: all minors 0-7 together.
		8: []*PartitionsOfAllocationScore{
			{
				Partitions: []*apiext.GPUPartition{
					{
						Minors:          []int{0, 1, 2, 3, 4, 5, 6, 7},
						GPULinkType:     apiext.GPUNVLink,
						MinorsHash:      hashMinors([]int{0, 1, 2, 3, 4, 5, 6, 7}),
						AllocationScore: 1,
					},
				},
				AllocationScore: 1,
			},
		},
	}

	GetDesignatedGPUPartitionIndexer = func(node *corev1.Node) (GPUPartitionIndexer, bool) {
		var partitionIndexer GPUPartitionIndexer
		partitionPolicy := apiext.GetGPUPartitionPolicy(node)
		model := node.Labels[apiext.LabelGPUModel]
		switch model {
		case "H100", "H800", "H20":
			partitionIndexer = GPUPartitionIndexOfNVIDIAHopper
		}
		return partitionIndexer, partitionPolicy == apiext.GPUPartitionPolicyHonor
	}
)
View Source
// ResourceCombinationsMapper maps a combination of device resource flags
// (NvidiaGPU, GPUCore, GPUMemory, ... OR-ed together) to a function that
// builds the device resource list from a pod's request.
//
// Three shapes of conversion appear below:
//   - pass-through: the listed resources are copied from the request as-is;
//   - KoordGPU: the single apiext.ResourceGPU quantity is used for both GPU
//     core and GPU memory ratio;
//   - vendor whole-device requests (NvidiaGPU, AMDGPU, HygonDCU): the integer
//     Value() of the request is multiplied by 100 and used for both GPU core
//     and GPU memory ratio.
//
// A resource missing from the request yields the zero Quantity for its key.
var ResourceCombinationsMapper = map[uint]func(podRequest corev1.ResourceList) corev1.ResourceList{
	GPUMemory: func(req corev1.ResourceList) corev1.ResourceList {
		memory := req[apiext.ResourceGPUMemory]
		return corev1.ResourceList{apiext.ResourceGPUMemory: memory}
	},
	GPUMemoryRatio: func(req corev1.ResourceList) corev1.ResourceList {
		ratio := req[apiext.ResourceGPUMemoryRatio]
		return corev1.ResourceList{apiext.ResourceGPUMemoryRatio: ratio}
	},
	GPUCore | GPUMemory: func(req corev1.ResourceList) corev1.ResourceList {
		core, memory := req[apiext.ResourceGPUCore], req[apiext.ResourceGPUMemory]
		return corev1.ResourceList{
			apiext.ResourceGPUCore:   core,
			apiext.ResourceGPUMemory: memory,
		}
	},
	GPUCore | GPUMemoryRatio: func(req corev1.ResourceList) corev1.ResourceList {
		core, ratio := req[apiext.ResourceGPUCore], req[apiext.ResourceGPUMemoryRatio]
		return corev1.ResourceList{
			apiext.ResourceGPUCore:        core,
			apiext.ResourceGPUMemoryRatio: ratio,
		}
	},
	KoordGPU: func(req corev1.ResourceList) corev1.ResourceList {
		// One requested quantity drives both derived resources.
		gpu := req[apiext.ResourceGPU]
		return corev1.ResourceList{
			apiext.ResourceGPUCore:        gpu,
			apiext.ResourceGPUMemoryRatio: gpu,
		}
	},
	GPUShared | GPUMemory: func(req corev1.ResourceList) corev1.ResourceList {
		shared, memory := req[apiext.ResourceGPUShared], req[apiext.ResourceGPUMemory]
		return corev1.ResourceList{
			apiext.ResourceGPUShared: shared,
			apiext.ResourceGPUMemory: memory,
		}
	},
	GPUShared | GPUMemoryRatio: func(req corev1.ResourceList) corev1.ResourceList {
		shared, ratio := req[apiext.ResourceGPUShared], req[apiext.ResourceGPUMemoryRatio]
		return corev1.ResourceList{
			apiext.ResourceGPUShared:      shared,
			apiext.ResourceGPUMemoryRatio: ratio,
		}
	},
	GPUShared | GPUCore | GPUMemory: func(req corev1.ResourceList) corev1.ResourceList {
		shared := req[apiext.ResourceGPUShared]
		core := req[apiext.ResourceGPUCore]
		memory := req[apiext.ResourceGPUMemory]
		return corev1.ResourceList{
			apiext.ResourceGPUShared: shared,
			apiext.ResourceGPUCore:   core,
			apiext.ResourceGPUMemory: memory,
		}
	},
	GPUShared | GPUCore | GPUMemoryRatio: func(req corev1.ResourceList) corev1.ResourceList {
		shared := req[apiext.ResourceGPUShared]
		core := req[apiext.ResourceGPUCore]
		ratio := req[apiext.ResourceGPUMemoryRatio]
		return corev1.ResourceList{
			apiext.ResourceGPUShared:      shared,
			apiext.ResourceGPUCore:        core,
			apiext.ResourceGPUMemoryRatio: ratio,
		}
	},
	NvidiaGPU: func(req corev1.ResourceList) corev1.ResourceList {
		count := req[apiext.ResourceNvidiaGPU]
		// Quantity is stored by value in the map, so one scaled value serves both keys.
		scaled := *resource.NewQuantity(count.Value()*100, resource.DecimalSI)
		return corev1.ResourceList{
			apiext.ResourceGPUCore:        scaled,
			apiext.ResourceGPUMemoryRatio: scaled,
		}
	},
	AMDGPU: func(req corev1.ResourceList) corev1.ResourceList {
		count := req[apiext.ResourceAMDGPU]
		scaled := *resource.NewQuantity(count.Value()*100, resource.DecimalSI)
		return corev1.ResourceList{
			apiext.ResourceGPUCore:        scaled,
			apiext.ResourceGPUMemoryRatio: scaled,
		}
	},
	HygonDCU: func(req corev1.ResourceList) corev1.ResourceList {
		count := req[apiext.ResourceHygonDCU]
		scaled := *resource.NewQuantity(count.Value()*100, resource.DecimalSI)
		return corev1.ResourceList{
			apiext.ResourceGPUCore:        scaled,
			apiext.ResourceGPUMemoryRatio: scaled,
		}
	},
	FPGA: func(req corev1.ResourceList) corev1.ResourceList {
		fpga := req[apiext.ResourceFPGA]
		return corev1.ResourceList{apiext.ResourceFPGA: fpga}
	},
	RDMA: func(req corev1.ResourceList) corev1.ResourceList {
		rdma := req[apiext.ResourceRDMA]
		return corev1.ResourceList{apiext.ResourceRDMA: rdma}
	},
}

Functions

func ConvertDeviceRequest added in v1.3.0

func ConvertDeviceRequest(podRequest corev1.ResourceList, combination uint) corev1.ResourceList

func GetPodDeviceRequests added in v1.4.0

func GetPodDeviceRequests(pod *corev1.Pod) (map[schedulingv1alpha1.DeviceType]corev1.ResourceList, error)

func New

func New(obj runtime.Object, handle framework.Handle) (framework.Plugin, error)

func ValidDeviceResourceCombinationsDefaultTrue added in v1.6.0

func ValidDeviceResourceCombinationsDefaultTrue(podRequest corev1.ResourceList) bool

func ValidDeviceResourceCombinationsGPUPercentage added in v1.6.0

func ValidDeviceResourceCombinationsGPUPercentage(podRequest corev1.ResourceList) bool

func ValidDeviceResourceCombinationsGPUShared added in v1.6.0

func ValidDeviceResourceCombinationsGPUShared(podRequest corev1.ResourceList) bool

func ValidateDeviceRequest added in v1.3.0

func ValidateDeviceRequest(podRequest corev1.ResourceList) (uint, error)

func ValidateLessThan100Times added in v1.6.0

func ValidateLessThan100Times(a, b resource.Quantity) bool

func ValidateMultiple added in v1.6.0

func ValidateMultiple(a, b resource.Quantity) bool

func ValidatePercentageResource added in v1.3.0

func ValidatePercentageResource(q resource.Quantity) bool

Types

type AllocateContext added in v1.6.0

type AllocateContext struct {
	// contains filtered or unexported fields
}

type AutopilotAllocator added in v1.4.0

type AutopilotAllocator struct {
	// contains filtered or unexported fields
}

func (*AutopilotAllocator) Allocate added in v1.4.0

func (a *AutopilotAllocator) Allocate(
	required, preferred map[schedulingv1alpha1.DeviceType]sets.Int,
	requiredDeviceResources, preemptibleDeviceResources map[schedulingv1alpha1.DeviceType]deviceResources,
) (apiext.DeviceAllocations, *framework.Status)

func (*AutopilotAllocator) Prepare added in v1.4.0

func (a *AutopilotAllocator) Prepare() *framework.Status

type DefaultDeviceHandler added in v1.4.0

type DefaultDeviceHandler struct {
	// contains filtered or unexported fields
}

func (*DefaultDeviceHandler) CalcDesiredRequestsAndCount added in v1.4.0

func (h *DefaultDeviceHandler) CalcDesiredRequestsAndCount(node *corev1.Node, pod *corev1.Pod, podRequests corev1.ResourceList, nodeDevice *nodeDevice, hint *apiext.DeviceHint, state *preFilterState) (corev1.ResourceList, int, *framework.Status)

type DeviceAllocator added in v1.4.0

// DeviceAllocator allocates devices on a node for a request.
// GPUAllocator in this package declares an Allocate method with this exact
// signature.
type DeviceAllocator interface {
	// Allocate takes the request context, the node's device state, a desired
	// and a maximum desired device count, and a set of preferred PCIe
	// identifiers. It returns the resulting device allocations together with
	// a framework status.
	// NOTE(review): how desiredCount and maxDesiredCount interact is not
	// visible in this listing — confirm against an implementation.
	Allocate(requestCtx *requestContext, nodeDevice *nodeDevice, desiredCount int, maxDesiredCount int, preferredPCIEs sets.String) ([]*apiext.DeviceAllocation, *framework.Status)
}

type DeviceHandler added in v1.4.0

// DeviceHandler computes the per-device-type request for a pod on a node.
// DefaultDeviceHandler and GPUHandler in this package both declare a
// CalcDesiredRequestsAndCount method with this exact signature.
type DeviceHandler interface {
	// CalcDesiredRequestsAndCount receives the node, the pod, the pod's
	// requests, the node's device state, an optional device hint and the
	// pre-filter state. It returns a resource list, an integer count and a
	// framework status.
	// NOTE(review): presumably the resource list is the per-device request
	// and the int is the number of devices wanted — confirm against an
	// implementation.
	CalcDesiredRequestsAndCount(node *corev1.Node, pod *corev1.Pod, podRequests corev1.ResourceList, nodeDevice *nodeDevice, hint *apiext.DeviceHint, state *preFilterState) (corev1.ResourceList, int, *framework.Status)
}

type DeviceLevelContext added in v1.6.0

type DeviceLevelContext struct {
	// contains filtered or unexported fields
}

type GPUAllocator added in v1.6.0

// GPUAllocator is a field-less type whose Allocate method has the same
// signature as DeviceAllocator.Allocate.
type GPUAllocator struct {
}

func (*GPUAllocator) Allocate added in v1.6.0

func (a *GPUAllocator) Allocate(requestCtx *requestContext, nodeDevice *nodeDevice, desiredCount int, maxDesiredCount int, preferredPCIEs sets.String) ([]*apiext.DeviceAllocation, *framework.Status)

type GPUHandler added in v1.4.0

// GPUHandler is a field-less type whose CalcDesiredRequestsAndCount method has
// the same signature as DeviceHandler.CalcDesiredRequestsAndCount.
type GPUHandler struct {
}

func (*GPUHandler) CalcDesiredRequestsAndCount added in v1.4.0

func (h *GPUHandler) CalcDesiredRequestsAndCount(node *corev1.Node, pod *corev1.Pod, podRequests corev1.ResourceList, nodeDevice *nodeDevice, hint *apiext.DeviceHint, state *preFilterState) (corev1.ResourceList, int, *framework.Status)

type GPUPartitionIndexer added in v1.6.0

// GPUPartitionIndexer maps an int key to groups of GPU partitions. In the
// built-in GPUPartitionIndexOfNVIDIAHopper table the key equals the number of
// minors in every partition stored under it (1, 2, 4 or 8).
type GPUPartitionIndexer map[int][]*PartitionsOfAllocationScore

func GetGPUPartitionIndexer added in v1.6.0

func GetGPUPartitionIndexer(table apiext.GPUPartitionTable) GPUPartitionIndexer

type GPURequirements added in v1.6.0

type GPURequirements struct {
	// contains filtered or unexported fields
}

type GPUTopologyScope added in v1.6.0

type GPUTopologyScope struct {
	// contains filtered or unexported fields
}

func GetGPUTopologyScope added in v1.6.0

func GetGPUTopologyScope(deviceInfos []*schedulingv1alpha1.DeviceInfo, nodeDeviceResources deviceResources) *GPUTopologyScope

type NUMATopology added in v1.4.0

type NUMATopology struct {
	// contains filtered or unexported fields
}

type NodeDeviceSummary

// NodeDeviceSummary is a JSON-serializable snapshot of a node's devices. Each
// field is emitted under the key given in its json tag.
type NodeDeviceSummary struct {
	// Total, free and used quantities, keyed by resource name.
	DeviceTotal map[v1.ResourceName]*resource.Quantity `json:"deviceTotal"`
	DeviceFree  map[v1.ResourceName]*resource.Quantity `json:"deviceFree"`
	DeviceUsed  map[v1.ResourceName]*resource.Quantity `json:"deviceUsed"`

	// The same three views, keyed by device type and holding deviceResources.
	DeviceTotalDetail map[schedulingv1alpha1.DeviceType]deviceResources `json:"deviceTotalDetail"`
	DeviceFreeDetail  map[schedulingv1alpha1.DeviceType]deviceResources `json:"deviceFreeDetail"`
	DeviceUsedDetail  map[schedulingv1alpha1.DeviceType]deviceResources `json:"deviceUsedDetail"`

	// Allocations keyed by device type, then a string, then an int.
	// NOTE(review): presumably the string identifies the owner (e.g. a pod)
	// and the int is the device minor — confirm against the code that fills it.
	AllocateSet map[schedulingv1alpha1.DeviceType]map[string]map[int]v1.ResourceList `json:"allocateSet"`
}

func NewNodeDeviceSummary

func NewNodeDeviceSummary() *NodeDeviceSummary

type PCIe added in v1.4.0

type PCIe struct {
	PCIeIndex
	// contains filtered or unexported fields
}

type PCIeIndex added in v1.4.0

type PCIeIndex struct {
	// contains filtered or unexported fields
}

type PartitionsOfAllocationScore added in v1.6.0

// PartitionsOfAllocationScore groups GPU partitions under one allocation
// score. GPUPartitionIndexer stores slices of these groups per key.
// NOTE(review): in the built-in Hopper table the group score equals the score
// of every partition inside it (1); whether that is an invariant is not
// visible here.
type PartitionsOfAllocationScore struct {
	Partitions      []*apiext.GPUPartition
	AllocationScore int
}

type Plugin

type Plugin struct {
	// contains filtered or unexported fields
}

func (*Plugin) AddPod added in v1.2.0

func (p *Plugin) AddPod(ctx context.Context, cycleState *framework.CycleState, podToSchedule *corev1.Pod, podInfoToAdd *framework.PodInfo, nodeInfo *framework.NodeInfo) *framework.Status

func (*Plugin) Allocate added in v1.4.0

func (p *Plugin) Allocate(ctx context.Context, cycleState *framework.CycleState, affinity topologymanager.NUMATopologyHint, pod *corev1.Pod, nodeName string) *framework.Status

func (*Plugin) EventsToRegister added in v1.3.0

func (p *Plugin) EventsToRegister() []framework.ClusterEventWithHint

func (*Plugin) Filter

func (p *Plugin) Filter(ctx context.Context, cycleState *framework.CycleState, pod *corev1.Pod, nodeInfo *framework.NodeInfo) *framework.Status

func (*Plugin) FilterNominateReservation added in v1.6.0

func (p *Plugin) FilterNominateReservation(ctx context.Context, cycleState *framework.CycleState, pod *corev1.Pod, reservationInfo *frameworkext.ReservationInfo, nodeName string) *framework.Status

func (*Plugin) FilterReservation added in v1.2.0

func (p *Plugin) FilterReservation(ctx context.Context, cycleState *framework.CycleState, pod *corev1.Pod, reservationInfo *frameworkext.ReservationInfo, nodeInfo *framework.NodeInfo) *framework.Status

func (*Plugin) FinalRestoreReservation added in v1.3.0

func (p *Plugin) FinalRestoreReservation(ctx context.Context, cycleState *framework.CycleState, pod *corev1.Pod, nodeToStates frameworkext.NodeReservationRestoreStates) *framework.Status

DEPRECATED

func (*Plugin) GetPodTopologyHints added in v1.4.0

func (p *Plugin) GetPodTopologyHints(ctx context.Context, cycleState *framework.CycleState, pod *corev1.Pod, nodeName string) (map[string][]topologymanager.NUMATopologyHint, *framework.Status)

func (*Plugin) Name

func (p *Plugin) Name() string

func (*Plugin) NormalizeReservationScore added in v1.4.0

func (p *Plugin) NormalizeReservationScore(ctx context.Context, cycleState *framework.CycleState, pod *corev1.Pod, scores frameworkext.ReservationScoreList) *framework.Status

func (*Plugin) NormalizeScore added in v1.3.0

func (p *Plugin) NormalizeScore(ctx context.Context, cycleState *framework.CycleState, pod *corev1.Pod, scores framework.NodeScoreList) *framework.Status

func (*Plugin) PreBind

func (p *Plugin) PreBind(ctx context.Context, cycleState *framework.CycleState, pod *corev1.Pod, nodeName string) *framework.Status

func (*Plugin) PreBindReservation added in v1.2.0

func (p *Plugin) PreBindReservation(ctx context.Context, cycleState *framework.CycleState, reservation *schedulingv1alpha1.Reservation, nodeName string) *framework.Status

func (*Plugin) PreFilter

func (p *Plugin) PreFilter(ctx context.Context, cycleState *framework.CycleState, pod *corev1.Pod) (*framework.PreFilterResult, *framework.Status)

func (*Plugin) PreFilterExtensions

func (p *Plugin) PreFilterExtensions() framework.PreFilterExtensions

func (*Plugin) PreRestoreReservation added in v1.3.0

func (p *Plugin) PreRestoreReservation(ctx context.Context, cycleState *framework.CycleState, pod *corev1.Pod) *framework.Status

func (*Plugin) PreScore added in v1.6.0

func (p *Plugin) PreScore(ctx context.Context, cycleState *framework.CycleState, pod *corev1.Pod, nodes []*corev1.Node) *framework.Status

func (*Plugin) RegisterEndpoints

func (p *Plugin) RegisterEndpoints(group *gin.RouterGroup)

func (*Plugin) RemovePod added in v1.2.0

func (p *Plugin) RemovePod(ctx context.Context, cycleState *framework.CycleState, podToSchedule *corev1.Pod, podInfoToRemove *framework.PodInfo, nodeInfo *framework.NodeInfo) *framework.Status

func (*Plugin) ReservationScoreExtensions added in v1.4.0

func (p *Plugin) ReservationScoreExtensions() frameworkext.ReservationScoreExtensions

func (*Plugin) Reserve

func (p *Plugin) Reserve(ctx context.Context, cycleState *framework.CycleState, pod *corev1.Pod, nodeName string) *framework.Status

func (*Plugin) ResizePod added in v1.4.0

func (p *Plugin) ResizePod(ctx context.Context, cycleState *framework.CycleState, pod *corev1.Pod, nodeName string) *framework.Status

func (*Plugin) RestoreReservation added in v1.3.0

func (p *Plugin) RestoreReservation(ctx context.Context, cycleState *framework.CycleState, podToSchedule *corev1.Pod, matched []*frameworkext.ReservationInfo, unmatched []*frameworkext.ReservationInfo, nodeInfo *framework.NodeInfo) (interface{}, *framework.Status)

func (*Plugin) Score added in v1.3.0

func (p *Plugin) Score(ctx context.Context, cycleState *framework.CycleState, pod *corev1.Pod, nodeName string) (int64, *framework.Status)

func (*Plugin) ScoreExtensions added in v1.3.0

func (p *Plugin) ScoreExtensions() framework.ScoreExtensions

func (*Plugin) ScoreReservation added in v1.3.0

func (p *Plugin) ScoreReservation(ctx context.Context, cycleState *framework.CycleState, pod *corev1.Pod, reservationInfo *frameworkext.ReservationInfo, nodeName string) (int64, *framework.Status)

func (*Plugin) Unreserve

func (p *Plugin) Unreserve(ctx context.Context, cycleState *framework.CycleState, pod *corev1.Pod, nodeName string)

type ScopeLevelAllocateResult added in v1.6.0

type ScopeLevelAllocateResult struct {
	// contains filtered or unexported fields
}

type ScopeLevelContext added in v1.6.0

type ScopeLevelContext struct {
	// contains filtered or unexported fields
}

type VFAllocation added in v1.4.0

type VFAllocation struct {
	// contains filtered or unexported fields
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL