plugin

package
v6.0.0+incompatible Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Dec 31, 2024 License: Apache-2.0, Apache-2.0 Imports: 28 Imported by: 0

Documentation

Overview

Package plugin is used for Huawei Ascend pin affinity scheduling.

Package plugin is used for Huawei Ascend pin affinity scheduling.

Package plugin is used for the Huawei Ascend pin affinity scheduling framework.

Package plugin is used for the Huawei Ascend pin affinity scheduling framework.

Package plugin is used for the Huawei Ascend pin affinity scheduling framework.

Package plugin is used for the Huawei Ascend pin affinity scheduling framework.

Package plugin is used for Huawei Ascend pin affinity scheduling.

Package plugin is used for Huawei Ascend pin affinity scheduling.

Index

Constants

View Source
// Constants for tor info configmaps, tor affinity values, job deletion marks
// and NSLB settings.
const (
	// TorNodeCMName is the name of the tor info configmap.
	TorNodeCMName = "basic-tor-node-cm"
	// TorShareCMName is the name of the tor share info configmap.
	TorShareCMName = "tor-share-cm"
	// TorInfoCMKey is the key of the tor info in the configmap.
	TorInfoCMKey = "tor_info"
	// TorLevelCMKey is the key of the tor level in the configmap.
	TorLevelCMKey = "tor_level"
	// SingleLayer is the tor level value in the configmap that denotes a single layer switch.
	SingleLayer = "single_layer"
	// TorAffinityKey is the key of tor affinity.
	TorAffinityKey = "tor-affinity"
	// GlobalTorInfoKey is the key of the tor share info in the configmap.
	GlobalTorInfoKey = "global-tor-info"
	// LargeModelTag is the value that denotes the large model schema.
	LargeModelTag = "large-model-schema"
	// NormalSchema is the value of normal tor affinity.
	NormalSchema = "normal-schema"
	// NullTag is the value that means tor affinity is not used.
	NullTag = "null"
	// JobDeleteFlag is the flag that marks a job as deleted.
	JobDeleteFlag = "fault-job-delete"
	// JobDelete is the value that marks a job as deleted.
	JobDelete = "deleted"

	// NSLB2Version is the NSLB 2.0 version string.
	NSLB2Version = "2.0"

	// SharedTorIp is the shared tor IP string (where it is used is not visible here).
	SharedTorIp = "sharedTorIp"
	// PodRankIndexKey is the rank index key.
	PodRankIndexKey = "hccl/rankIndex"
	// ReplicaSetType is the ReplicaSet type name.
	ReplicaSetType = "ReplicaSet"
)
View Source
// Chip type names of the 910B series.
const (

	// ChipTypeB1 is the chip type 910B1.
	ChipTypeB1 = "910B1"
	// ChipTypeB2C is the chip type 910B2C.
	ChipTypeB2C = "910B2C"
	// ChipTypeB3 is the chip type 910B3.
	ChipTypeB3 = "910B3"
	// ChipTypeB4 is the chip type 910B4.
	ChipTypeB4 = "910B4"
)
View Source
// VNPU templates of 910B1/910B2C.
const (
	// VNPUTempVir06 is the vir06_1c_16g template.
	VNPUTempVir06 = "vir06_1c_16g"
	// VNPUTempVir03 is the vir03_1c_8g template.
	VNPUTempVir03 = "vir03_1c_8g"
	// VNPUTempVir12 is the vir12_3c_32g template.
	VNPUTempVir12 = "vir12_3c_32g"
)

The VNPU templates of 910B1/910B2C.

View Source
// VNPU templates of 910B3.
const (
	// VNPUTempVir05 is the vir05_1c_16g template.
	VNPUTempVir05 = "vir05_1c_16g"
	// VNPUTempVir10 is the vir10_3c_32g template.
	VNPUTempVir10 = "vir10_3c_32g"
)

The VNPU templates of 910B3.

View Source
// VNPU templates of 910B4.
const (
	// VNPUB4TempVir05 is the vir05_1c_8g template.
	VNPUB4TempVir05 = "vir05_1c_8g"
	// VNPUB4TempVir10C3NM is the vir10_3c_16g_nm template.
	VNPUB4TempVir10C3NM = "vir10_3c_16g_nm"
	// VNPUB4TempVir10C4M is the vir10_4c_16g_m template.
	VNPUB4TempVir10C4M = "vir10_4c_16g_m"
	// VNPUB4TempVir10 is the vir10_3c_16g template.
	VNPUB4TempVir10 = "vir10_3c_16g"
)

The VNPU templates of 910B4.

View Source
// Exit values.
const (
	// GraceExitValue is the grace exit value.
	GraceExitValue = 1
	// DefaultExitValue is the default exit value.
	DefaultExitValue = 0
)
View Source
// Plugin name, VNPU level/DVPP settings, VNPU template names and reset
// configmap names.
const (
	// PluginName is the Huawei NPU plugin name.
	PluginName = "huaweiNPU"

	// FormatIncorrectError is the "format incorrect" error text.
	FormatIncorrectError = "format incorrect"

	// AscendVNPULevel is the vnpu level key.
	AscendVNPULevel = "vnpu-level"
	// AscendVNPULevelLow is the low vnpu level.
	AscendVNPULevelLow = "low"
	// AscendVNPULevelHigh is the high vnpu level.
	AscendVNPULevelHigh = "high"
	// AscendVNPUPrefix is the "vir" prefix.
	AscendVNPUPrefix = "vir"
	// AscendVNPUDVPP is the dvpp enable key.
	AscendVNPUDVPP = "vnpu-dvpp"
	// AscendDVPPEnabledOff means dvpp is off.
	AscendDVPPEnabledOff = "no"
	// AscendDVPPEnabledNull is the null dvpp setting.
	AscendDVPPEnabledNull = "null"
	// AscendDVPPEnabledOn means dvpp is on.
	AscendDVPPEnabledOn = "yes"
	// AscendNDVPPValue is the ndvpp value.
	AscendNDVPPValue = "ndvpp"
	// AscendDVPPValue is the dvpp value.
	AscendDVPPValue = "dvpp"
	// VNPUTempVir01 is the vir01 template.
	VNPUTempVir01 = "vir01"
	// VNPUTempVir02 is the vir02 template.
	VNPUTempVir02 = "vir02"
	// VNPUTempVir02C1 is the vir02_1c template.
	VNPUTempVir02C1 = "vir02_1c"
	// VNPUTempVir04 is the vir04 template.
	VNPUTempVir04 = "vir04"
	// VNPUTempVir04C3 is the vir04_3c template.
	VNPUTempVir04C3 = "vir04_3c"
	// VNPUTempVir04C3NDVPP is the vir04_3c_ndvpp template.
	VNPUTempVir04C3NDVPP = "vir04_3c_ndvpp"
	// VNPUTempVir04C4cDVPP is the vir04_4c_dvpp template.
	VNPUTempVir04C4cDVPP = "vir04_4c_dvpp"
	// VNPUTempVir08 is the vir08 template, only for 910.
	VNPUTempVir08 = "vir08"
	// VNPUTempVir16 is the vir16 template, only for 910.
	VNPUTempVir16 = "vir16"
	// Ascend310P is the 310P template name.
	Ascend310P = "Ascend310P"
	// Ascend910 is the 910 template name.
	Ascend910 = "Ascend910"

	// ResetInfoCMNamePrefix is the name prefix of the reset configmap.
	ResetInfoCMNamePrefix = "reset-config-"
	// ResetInfoCMDataKey is the data key of the reset configmap.
	ResetInfoCMDataKey = "reset.json"
	// ResetInfoTypeKey is the type key of the reset configmap.
	ResetInfoTypeKey = "restartType"
	// PodRescheduleRestartType is the restart type used for hot reset.
	PodRescheduleRestartType = "podReschedule"
)

Variables

This section is empty.

Functions

func CheckNetSliceIsMeetJobRequire

func CheckNetSliceIsMeetJobRequire(sJob SchedulerJob, sHandler *ScheduleHandler,
	nodeMaps map[string]*api.NodeInfo) error

CheckNetSliceIsMeetJobRequire checks whether the net slice meets the job requirement in NSLB 1.0 and sets the job server list.

func GetCardPhysicsIDFromAscendCore

func GetCardPhysicsIDFromAscendCore(pod *v1.Pod, isWholeCard bool) ([]int, error)

GetCardPhysicsIDFromAscendCore get card physics id from 0,1/0-vir04

func GetJobInfoAllocatedTaskNum

func GetJobInfoAllocatedTaskNum(jobInfo *api.JobInfo) int32

GetJobInfoAllocatedTaskNum get job allocated task num

func GetJobLabelFromVcJob

func GetJobLabelFromVcJob(job *api.JobInfo) map[string]string

GetJobLabelFromVcJob get job's label, not task's.

func GetJobNPUTasks

func GetJobNPUTasks(vcJob *api.JobInfo) map[api.TaskID]util.NPUTask

GetJobNPUTasks get NPUTask from jobInfo.

func GetJobSelectorFromVcJob

func GetJobSelectorFromVcJob(job *api.JobInfo) map[string]string

GetJobSelectorFromVcJob get job selector.

func GetLargeModelMaxServerNum

func GetLargeModelMaxServerNum(tors []*Tor, sharedTorNum int) int

GetLargeModelMaxServerNum get the node num that nslb 2.0 job can use at most

func GetMaxSharedTorServerNum

func GetMaxSharedTorServerNum(tors []*Tor, sharedTorNum int) int

GetMaxSharedTorServerNum get max shared tor num a job can use

func GetPhysicCardNameFromVChip

func GetPhysicCardNameFromVChip(realCardName string) string

GetPhysicCardNameFromVChip get cardName from whole Ascend310P-0/Ascend310P-1c-400-3_0

func GetResourceFromTemplate

func GetResourceFromTemplate(nodeType string, templateString string,
	taskTemplate map[string]map[string]util.VResource) *util.VResource

GetResourceFromTemplate nodeType like Ascend310P, templateString like "vir04_3c_ndvpp"

func GetTaskLabels

func GetTaskLabels(task *api.TaskInfo) map[string]string

GetTaskLabels get task's Labels.

func GetTaskSelectors

func GetTaskSelectors(task *api.TaskInfo) map[string]string

GetTaskSelectors get task's selector.

func GetVCJobReqNPUTypeFromJobInfo

func GetVCJobReqNPUTypeFromJobInfo(vcJob *api.JobInfo) (string, int, error)

GetVCJobReqNPUTypeFromJobInfo get job request resource, only NPU.

func GetVCTaskReqNPUTypeFromTaskInfo

func GetVCTaskReqNPUTypeFromTaskInfo(vcTask *api.TaskInfo) (string, int)

GetVCTaskReqNPUTypeFromTaskInfo get task request resource, only NPU.

func GetWholeCardIDFromAscendReal

func GetWholeCardIDFromAscendReal(cardNameStr string) (int, error)

GetWholeCardIDFromAscendReal get card physics id from Ascend910-0

func IsJobInitial

func IsJobInitial(job *api.JobInfo) bool

IsJobInitial Determine if the task is ready.

func IsJobRestarted

func IsJobRestarted(job *api.JobInfo) bool

IsJobRestarted used for rescheduling, judge if job restarted

func IsNPUTask

func IsNPUTask(nT *api.TaskInfo) bool

IsNPUTask judges whether the task is an NPU task or not.

func IsPodWholeCardFromAscendCore

func IsPodWholeCardFromAscendCore(coreCardName string) bool

IsPodWholeCardFromAscendCore judge if card is whole card 0,1/0-vir04

func IsPodWholeCardFromAscendReal

func IsPodWholeCardFromAscendReal(realCardName string) bool

IsPodWholeCardFromAscendReal judge if card is whole card Ascend310P-0/Ascend310P-1c-400-3_0

func SetJobServerList

func SetJobServerList(sJob SchedulerJob, sHandler *ScheduleHandler,
	nodeMaps map[string]*api.NodeInfo) error

SetJobServerList checks whether the single layer tor meets the job requirement and sets the job server list.

Types

type AllocNodeRankOccurrence

// AllocNodeRankOccurrence records a node's rank index and whether that index
// was re-allocated to a new node.
type AllocNodeRankOccurrence struct {
	NodeName   string
	RankIndex  string
	IsFault    bool
	Occurrence int
}

AllocNodeRankOccurrence object recording node rankIndex and whether index re-allocated to new node

type CommonNode

// CommonNode holds the common NPU node properties.
// Capability, Allocate and Idle are float64 quantities keyed by resource name.
type CommonNode struct {
	Name           string
	Capability     map[v1.ResourceName]float64
	Allocate       map[v1.ResourceName]float64
	Idle           map[v1.ResourceName]float64
	BaseDeviceInfo string
	Annotation     map[string]string
	Label          map[string]string
	Address        string
	SuperPodID     int32
}

CommonNode common npu node properties

type DevFaultInfo

// DevFaultInfo is the fault info of a device.
type DevFaultInfo struct {
	LogicId       int32
	Status        string
	Policy        string
	InitialPolicy string
	ErrorCode     []int64
	ErrorCodeHex  string
}

DevFaultInfo is the fault info of device

type DeviceInfosWithMutex

// DeviceInfosWithMutex holds device information for the current plugin.
// It embeds sync.Mutex, so values of this type must not be copied.
type DeviceInfosWithMutex struct {
	sync.Mutex
	// Devices maps a string key to NodeDeviceInfoWithID
	// (presumably keyed by node name; TODO confirm).
	Devices map[string]NodeDeviceInfoWithID
}

DeviceInfosWithMutex information for the current plugin

type FaultRankIdData

// FaultRankIdData is the fault rank id data: a name, a namespace and a
// string-to-string data map.
type FaultRankIdData struct {
	Name, Namespace string
	Data            map[string]string
}

FaultRankIdData fault rank id data

type ISchedulerPlugin

// ISchedulerPlugin is the interface for a volcano-npu plugin; it combines
// ISchedulerPluginBase and ISchedulerPluginNeed.
type ISchedulerPlugin interface {
	ISchedulerPluginBase
	ISchedulerPluginNeed
}

ISchedulerPlugin is the set of functions a volcano-npu plugin has.

type ISchedulerPluginBase

// ISchedulerPluginBase is what the frame plugin needs to implement: getters
// and setters for the plugin name, the annotation pre-value, the annotation
// name and the default job scheduler config.
type ISchedulerPluginBase interface {
	GetPluginName() string
	SetPluginName(string)
	GetAnnoPreVal() string
	SetAnnoPreVal(string)
	GetAnnoName() string
	SetAnnoName(string)
	GetDefaultJobSchedulerConfig() map[string]string
	SetDefaultJobSchedulerConfig(map[string]string)
}

ISchedulerPluginBase is the interface the frame plugin needs to implement.

type ISchedulerPluginNeed

// ISchedulerPluginNeed is the interface that each specific plug-in needs to
// implement.
type ISchedulerPluginNeed interface {
	// ValidNPUJob validates the job against the npu scheduler policy; a job
	// that does not pass is disallowed.
	ValidNPUJob() *api.ValidateResult
	CheckNodeNPUByTask(*api.TaskInfo, NPUNode) error
	ScoreBestNPUNodes(*api.TaskInfo, []*api.NodeInfo, map[string]float64) error
	UseAnnotation(*api.TaskInfo, NPUNode) *NPUNode
	ReleaseAnnotation(*api.TaskInfo, NPUNode) *NPUNode
	PreStartAction(i interface{}, ssn *framework.Session) error
	PreStopAction(*ScheduleEnv) error
	InitMyJobPlugin(util.SchedulerJobAttr, ScheduleEnv) error
	GetReHandle() interface{}
}

ISchedulerPluginNeed The interface that the specific plug-in needs to implement.

type JobServers

// JobServers is a job's server list, a slice of *Server.
type JobServers []*Server

JobServers job server

func (JobServers) Len

func (s JobServers) Len() int

Len get length

func (JobServers) Less

func (s JobServers) Less(i, j int) bool

Less define rule

func (JobServers) Swap

func (s JobServers) Swap(i, j int)

Swap swap element

type LogicTorList

// LogicTorList is the logic tor list, a slice of *Server slices.
type LogicTorList [][]*Server

LogicTorList logic tor list

func (LogicTorList) Len

func (tp LogicTorList) Len() int

Len get length

func (LogicTorList) Less

func (tp LogicTorList) Less(i, j int) bool

Less define rule

func (LogicTorList) Swap

func (tp LogicTorList) Swap(i, j int)

Swap swap element

type NPUBuilder

// NPUBuilder is the plugin builder used for plugin management. It is a type
// alias for a function that takes a string and returns an ISchedulerPlugin
// (the string is presumably the plugin name; TODO confirm).
type NPUBuilder = func(string2 string) ISchedulerPlugin

NPUBuilder PluginBuilder plugin management

type NPUNode

// NPUNode is the node info defined by the plugin. It embeds CommonNode and
// VNode, so their exported fields are promoted onto NPUNode.
type NPUNode struct {
	CommonNode
	VNode
	IsUnhealthy bool
	// contains filtered or unexported fields
}

NPUNode the plugin define node info.

func (NPUNode) CheckNPUResourceStable

func (n NPUNode) CheckNPUResourceStable(vcJob SchedulerJob) error

CheckNPUResourceStable check resource stabilize.

func (NPUNode) CheckNPUResourceStableReScheduling

func (n NPUNode) CheckNPUResourceStableReScheduling(vcJob util.SchedulerJobAttr) error

CheckNPUResourceStableReScheduling check resource stabilize.

func (NPUNode) GetChipKindFromNpuNode

func (n NPUNode) GetChipKindFromNpuNode() (string, error)

GetChipKindFromNpuNode input huawei-Ascend910 return Ascend910/Ascend310p/Ascend310

func (*NPUNode) GetNewNPUNodeAnnotation

func (n *NPUNode) GetNewNPUNodeAnnotation(usedTop []int, resourceName, resourceNamePre string) (string, error)

GetNewNPUNodeAnnotation get new annotation after allocate

func (NPUNode) IsNodeChipResEnough

func (n NPUNode) IsNodeChipResEnough(vRes util.VResource) bool

IsNodeChipResEnough judge if chip on node can be allocated to job

func (NPUNode) IsNodeNotMeetRes

func (n NPUNode) IsNodeNotMeetRes(taskResReq util.VResource) bool

IsNodeNotMeetRes judge the node meet resource or not.

func (NPUNode) IsNodeTotalResEnough

func (n NPUNode) IsNodeTotalResEnough(vRes util.VResource) bool

IsNodeTotalResEnough judge node total resource enough

type NodeDNodeInfo

// NodeDNodeInfo is the node information reported by noded: a list of faulty
// devices plus the node status.
type NodeDNodeInfo struct {
	// FaultDevList holds one anonymous-struct entry per faulty device.
	FaultDevList []struct {
		DeviceType string
		DeviceId   int
		FaultCode  []string
		FaultLevel string
	}
	NodeStatus string
}

NodeDNodeInfo is the node information reported by noded

type NodeDeviceInfo

// NodeDeviceInfo is the device info of a node, laid out like a node annotation.
type NodeDeviceInfo struct {
	DeviceList map[string]string
	// UpdateTime is an int64 timestamp (unit not visible here; TODO confirm).
	UpdateTime int64
}

NodeDeviceInfo like node annotation.

type NodeDeviceInfoWithDevPlugin

// NodeDeviceInfoWithDevPlugin is the per-node device info obtained via cm
// (one per node). SuperPodID and ServerIndex are tagged omitempty.
type NodeDeviceInfoWithDevPlugin struct {
	DeviceInfo  NodeDeviceInfo
	CheckCode   string
	SuperPodID  int32 `json:"SuperPodID,omitempty"`
	ServerIndex int32 `json:"ServerIndex,omitempty"`
}

NodeDeviceInfoWithDevPlugin a node has one by cm.

type NodeDeviceInfoWithID

// NodeDeviceInfoWithID is the node information reported by cm: the embedded
// NodeDeviceInfo plus the SuperPodID.
type NodeDeviceInfoWithID struct {
	NodeDeviceInfo
	SuperPodID int32
}

NodeDeviceInfoWithID is the node information reported by cm.

type NodeDeviceInfoWithTime

// NodeDeviceInfoWithTime is node device info with time: the embedded
// NodeDeviceInfoWithID plus a host update time.
type NodeDeviceInfoWithTime struct {
	NodeDeviceInfoWithID
	HostUpdateTime int64
}

NodeDeviceInfoWithTime is node device info with time

type NodeInfoWithNodeD

// NodeInfoWithNodeD is the node information and checkCode reported by noded.
type NodeInfoWithNodeD struct {
	NodeInfo  NodeDNodeInfo
	CheckCode string
}

NodeInfoWithNodeD is the node information and checkCode reported by noded

type NodeInfosFromCmWithMutex

// NodeInfosFromCmWithMutex is node info with a mutex.
// It embeds sync.Mutex, so values of this type must not be copied.
type NodeInfosFromCmWithMutex struct {
	sync.Mutex
	// Nodes maps a string key to NodeDNodeInfo
	// (presumably keyed by node name; TODO confirm).
	Nodes map[string]NodeDNodeInfo
}

NodeInfosFromCmWithMutex node info with mutex

type NodeJobInfo

// NodeJobInfo is the job info of a node: the node IP, the node name and a
// list of job names.
type NodeJobInfo struct {
	NodeIp   string
	NodeName string
	JobName  []string
}

NodeJobInfo node job info

type NslbParameters

// NslbParameters holds the parameters of nslb; all of its fields are unexported.
type NslbParameters struct {
	// contains filtered or unexported fields
}

NslbParameters the parameters of nslb

type OwnerInfo

// OwnerInfo is the owner info of a job: the embedded v1.OwnerReference plus
// the owner's annotations and an optional (pointer) replica count.
type OwnerInfo struct {
	v1.OwnerReference
	Annotations map[string]string
	Replicas    *int32
}

OwnerInfo the owner info of job

type RankIndexInfo

// RankIndexInfo is the info of the ranks used by a job.
type RankIndexInfo struct {
	HealthTorRankIndex map[string]string
	// FaultRankIndex is a set of int keys (map to empty struct).
	FaultRankIndex map[int]struct{}
}

RankIndexInfo the info of job used rank

type ScheduleCache

// ScheduleCache is the set of plugin-defined caches that save cm data.
type ScheduleCache struct {
	// special, name, value
	Names, Namespaces map[string]string
	Data              map[string]map[string]string
	// FaultConfigMaps holds one *FaultRankIdData per job ID.
	FaultConfigMaps map[api.JobID]*FaultRankIdData
}

ScheduleCache the plugin defined caches saving cm data

type ScheduleEnv

// ScheduleEnv is the job scheduler context.
type ScheduleEnv struct {
	IsFirstSession    *bool // the scheduler's first session message is unreliable
	Jobs              map[api.JobID]SchedulerJob
	JobReplicas       map[api.JobID]int32
	Nodes             map[string]NPUNode
	NodesNotInSsn     map[string]*corev1.Node
	JobSinglePodFlag  map[api.JobID]bool
	JobSeverInfos     map[api.JobID]struct{}
	JobDeleteFlag     map[api.JobID]struct{}
	DeviceInfos       *DeviceInfosWithMutex
	DeleteJobInfos    map[api.JobID]*api.JobInfo
	NodeInfosFromCm   *NodeInfosFromCmWithMutex   // NodeInfosFromCm is obtained from the kube-system/node-info- configmap
	SwitchInfosFromCm *SwitchInfosFromCmWithMutex // SwitchInfosFromCm is obtained from the mindx-dl/device-info- configmap
	FrameAttr         VolcanoFrame
	Cache             ScheduleCache
	Tors              *TorList
	NslbAttr          *NslbParameters
	SuperPodInfo      *SuperPodInfo
	JobPendingMessage map[api.JobID]map[string]map[string]struct{}
}

ScheduleEnv for job scheduler context.

type ScheduleHandler

// ScheduleHandler holds information for the current plugin. It embeds
// ScheduleEnv and sync.Once; because of the embedded sync.Once, values of
// this type must not be copied after first use.
type ScheduleHandler struct {
	// NPUPlugins maps a string key to a plugin builder
	// (presumably keyed by plugin name; TODO confirm).
	NPUPlugins map[string]NPUBuilder
	ScheduleEnv
	BaseHandle ISchedulerPlugin
	sync.Once
}

ScheduleHandler information for the current plugin

func (*ScheduleHandler) BatchNodeOrderFn

func (sHandle *ScheduleHandler) BatchNodeOrderFn(task *api.TaskInfo,
	nodes []*api.NodeInfo) (map[string]float64, error)

BatchNodeOrderFn Score the selected nodes.

func (*ScheduleHandler) BeforeCloseHandler

func (sHandle *ScheduleHandler) BeforeCloseHandler()

BeforeCloseHandler do the action before ssn close.

func (*ScheduleHandler) CacheToShareCM

func (sHandle *ScheduleHandler) CacheToShareCM() error

CacheToShareCM cache tors info to configmap

func (*ScheduleHandler) GetJobTemplate

func (sHandle *ScheduleHandler) GetJobTemplate() map[string]map[string]util.VResource

GetJobTemplate get template of all possible segmentation jobs

func (*ScheduleHandler) GetNPUScheduler

func (sHandle *ScheduleHandler) GetNPUScheduler(name string) (ISchedulerPlugin, bool)

GetNPUScheduler get the NPU scheduler by name

func (*ScheduleHandler) InitCache

func (sHandle *ScheduleHandler) InitCache()

InitCache init ScheduleHandler's cache.

func (*ScheduleHandler) InitDeleteJobInfos

func (sHandle *ScheduleHandler) InitDeleteJobInfos()

InitDeleteJobInfos init empty deleted jobinfos.

func (*ScheduleHandler) InitJobsFromSsn

func (sHandle *ScheduleHandler) InitJobsFromSsn(ssn *framework.Session)

InitJobsFromSsn init all jobs in ssn.

func (*ScheduleHandler) InitJobsPlugin

func (sHandle *ScheduleHandler) InitJobsPlugin()

InitJobsPlugin init job by plugins.

func (*ScheduleHandler) InitNPUSession

func (sHandle *ScheduleHandler) InitNPUSession(ssn *framework.Session) error

InitNPUSession init npu plugin and nodes.

func (*ScheduleHandler) InitNSLB2

func (sHandle *ScheduleHandler) InitNSLB2(ssn *framework.Session)

InitNSLB2 Init NSLB 2.0

func (*ScheduleHandler) InitNodesFromSsn

func (sHandle *ScheduleHandler) InitNodesFromSsn(ssn *framework.Session)

InitNodesFromSsn init all nodes in ssn.

func (*ScheduleHandler) InitReschedulerFromSsn

func (sHandle *ScheduleHandler) InitReschedulerFromSsn(ssn *framework.Session)

InitReschedulerFromSsn initialize re-scheduler

func (*ScheduleHandler) InitTorNodeInfo

func (sHandle *ScheduleHandler) InitTorNodeInfo(ssn *framework.Session)

InitTorNodeInfo init tor node if basic tor node configmap exists

func (*ScheduleHandler) InitVolcanoFrameFromSsn

func (sHandle *ScheduleHandler) InitVolcanoFrameFromSsn(ssn *framework.Session)

InitVolcanoFrameFromSsn init frame parameter from ssn.

func (*ScheduleHandler) IsPluginRegistered

func (sHandle *ScheduleHandler) IsPluginRegistered(name string) bool

IsPluginRegistered Determine if the plug-in is registered.

func (ScheduleHandler) IsTaskNeedNPUAllocated

func (sHandle ScheduleHandler) IsTaskNeedNPUAllocated(task *api.TaskInfo) bool

IsTaskNeedNPUAllocated to judge the task is static cut. true is dynamic cut.

func (*ScheduleHandler) JobValid

func (sHandle *ScheduleHandler) JobValid(obj interface{}) *api.ValidateResult

JobValid the job valid, used by volcano frame.

func (ScheduleHandler) NPUAllocateFunc

func (sHandle ScheduleHandler) NPUAllocateFunc(task *api.TaskInfo)

NPUAllocateFunc Allocate npu and called by volcano frame.

func (*ScheduleHandler) NPUDeallocateFunc

func (sHandle *ScheduleHandler) NPUDeallocateFunc(task *api.TaskInfo)

NPUDeallocateFunc Free assigned npu, if allocate failed by volcano frame.

func (*ScheduleHandler) NodePredicate

func (sHandle *ScheduleHandler) NodePredicate(taskInfo *api.TaskInfo, nodeInfo *api.NodeInfo) error

NodePredicate Predicate nodes.

func (*ScheduleHandler) PreStartPlugin

func (sHandle *ScheduleHandler) PreStartPlugin(ssn *framework.Session)

PreStartPlugin preStart plugin action.

func (*ScheduleHandler) RecordJobPendingMessage

func (sHandle *ScheduleHandler) RecordJobPendingMessage(vcJob SchedulerJob)

RecordJobPendingMessage record the job pending message to log

func (*ScheduleHandler) RegisterNPUScheduler

func (sHandle *ScheduleHandler) RegisterNPUScheduler(name string, pc NPUBuilder)

RegisterNPUScheduler register the plugin,like factory.

func (ScheduleHandler) SetJobPendReasonByNodesCase

func (sHandle ScheduleHandler) SetJobPendReasonByNodesCase(job *api.JobInfo)

SetJobPendReasonByNodesCase In nodes select case, set node failed and add failed reason.

func (*ScheduleHandler) SetJobPendingReason

func (sHandle *ScheduleHandler) SetJobPendingReason(vcJob *api.JobInfo, reason interface{}) error

SetJobPendingReason set the pod and podGroup pending reason.

func (*ScheduleHandler) SetSingleLayerTorAffinityJobNodesScore

func (sHandle *ScheduleHandler) SetSingleLayerTorAffinityJobNodesScore(task *api.TaskInfo,
	nodeMaps map[string]*api.NodeInfo, vcJob SchedulerJob, scoreMap map[string]float64) (map[string]float64, error)

SetSingleLayerTorAffinityJobNodesScore single layer switch networking rule

func (*ScheduleHandler) SetTorAffinityJobNodesScore

func (sHandle *ScheduleHandler) SetTorAffinityJobNodesScore(task *api.TaskInfo, nodeMaps map[string]*api.NodeInfo,
	vcJob SchedulerJob, label string, scoreMap map[string]float64) (map[string]float64, error)

SetTorAffinityJobNodesScore nslb 1.0 rule

func (*ScheduleHandler) SetTorAffinityJobNodesScoreV2

func (sHandle *ScheduleHandler) SetTorAffinityJobNodesScoreV2(task *api.TaskInfo, nodeMaps map[string]*api.NodeInfo,
	vcJob SchedulerJob, scoreMap map[string]float64) (map[string]float64, error)

SetTorAffinityJobNodesScoreV2 nslb 2.0 rule

func (*ScheduleHandler) UnRegisterNPUScheduler

func (sHandle *ScheduleHandler) UnRegisterNPUScheduler(name string) error

UnRegisterNPUScheduler unRegister the plugin

func (*ScheduleHandler) UpdateConfigMap

func (sHandle *ScheduleHandler) UpdateConfigMap(obj interface{}, operator string)

UpdateConfigMap update deviceInfo in cache

func (*ScheduleHandler) UpdatePodGroupPendingReason

func (sHandle *ScheduleHandler) UpdatePodGroupPendingReason(job *api.JobInfo, reason string)

UpdatePodGroupPendingReason update pg

type SchedulerJob

// SchedulerJob is the job info defined by the plugin. It embeds
// util.SchedulerJobAttr, RankIndexInfo and UnschedulableReason.
type SchedulerJob struct {
	util.SchedulerJobAttr
	RankIndexInfo
	UnschedulableReason

	ServerList []*Tor
	// TorBlackMaps is a set of string keys (map to empty struct).
	TorBlackMaps map[string]struct{}
	JobReadyTag  bool
	SuperPods    map[string][]SuperNode
	Owner        OwnerInfo
	// contains filtered or unexported fields
}

SchedulerJob the plugin define job info

func (*SchedulerJob) CheckNodeNum

func (sJob *SchedulerJob) CheckNodeNum(taskInfo *api.TaskInfo, vcNode NPUNode) error

CheckNodeNum Check whether the number of cards on the node meets the task requirements.

func (SchedulerJob) CheckTorJobSinglePodDeleteV1

func (sJob SchedulerJob) CheckTorJobSinglePodDeleteV1(sHandler *ScheduleHandler,
	taskInfo *api.TaskInfo, vcNode NPUNode) error

CheckTorJobSinglePodDeleteV1 valid node.

func (SchedulerJob) CheckTorJobSinglePodDeleteV2

func (sJob SchedulerJob) CheckTorJobSinglePodDeleteV2(sHandler *ScheduleHandler, vcNode NPUNode) error

CheckTorJobSinglePodDeleteV2 valid node.

func (SchedulerJob) GetAnnoName

func (sJob SchedulerJob) GetAnnoName() (string, error)

GetAnnoName get job AnnoName, include vNPU job.

func (*SchedulerJob) GetEnableServerList

func (sJob *SchedulerJob) GetEnableServerList(nodes map[string]*api.NodeInfo, sHandler *ScheduleHandler)

GetEnableServerList get global tor list ,mark the server a job can be scheduled

func (*SchedulerJob) GetFullTorNumFromTorInfo

func (sJob *SchedulerJob) GetFullTorNumFromTorInfo(sHandler *ScheduleHandler) int

GetFullTorNumFromTorInfo get the num of full tor

func (SchedulerJob) GetLogicTorList

func (sJob SchedulerJob) GetLogicTorList(sHandler *ScheduleHandler, netSliceNum int) [][]*Server

GetLogicTorList get logic tor list by global tor list

func (SchedulerJob) GetPhyTosList

func (sJob SchedulerJob) GetPhyTosList(sHandler *ScheduleHandler, logicList [][]*Server) ([]*Tor, int)

GetPhyTosList transpose the logic tor list

func (SchedulerJob) GetReqCardNameFromRingController

func (sJob SchedulerJob) GetReqCardNameFromRingController() string

GetReqCardNameFromRingController Get request card name from RingController.

func (*SchedulerJob) Init

func (sJob *SchedulerJob) Init(vcJob *api.JobInfo, sHandle *ScheduleHandler) error

Init the SchedulerJob's init.

func (SchedulerJob) IsNPUJob

func (sJob SchedulerJob) IsNPUJob() bool

IsNPUJob check SchedulerJob is npu job

func (*SchedulerJob) IsTorAffinityJob

func (sJob *SchedulerJob) IsTorAffinityJob() bool

IsTorAffinityJob check job is tor affinity job

func (*SchedulerJob) MarkMulJobServerList

func (sJob *SchedulerJob) MarkMulJobServerList()

MarkMulJobServerList mark the job if the server job used is over 1 tor

func (*SchedulerJob) MarkTorListByJob

func (sJob *SchedulerJob) MarkTorListByJob(nodes map[string]*api.NodeInfo, sHandler *ScheduleHandler)

MarkTorListByJob mark the global tor list by node list a job can be scheduled

func (SchedulerJob) PreCheckTorEnv

func (sJob SchedulerJob) PreCheckTorEnv(sHandler *ScheduleHandler, nodeMaps map[string]*api.NodeInfo) error

PreCheckTorEnv prechecks whether the cluster env is ready for a tor affinity job

func (*SchedulerJob) SetFaultJobRankIndex

func (sJob *SchedulerJob) SetFaultJobRankIndex()

SetFaultJobRankIndex set rank index for fault job's fault task

func (SchedulerJob) SetFillJobServerList

func (sJob SchedulerJob) SetFillJobServerList(sHandler *ScheduleHandler, Tors []*Tor, taskNum int) error

SetFillJobServerList set the fill job server list in nslb 1.0 and single layer switch networking rule

func (*SchedulerJob) SetFillJobServerListV2

func (sJob *SchedulerJob) SetFillJobServerListV2(Tors []*Tor, taskNum int) error

SetFillJobServerListV2 set the fill job server list in nslb 2.0

func (*SchedulerJob) SetJobRankIndex

func (sJob *SchedulerJob) SetJobRankIndex()

SetJobRankIndex set rank index for job

func (*SchedulerJob) SetJobServerCacheTosHandler

func (sJob *SchedulerJob) SetJobServerCacheTosHandler(sHandler *ScheduleHandler,
	pyTor []*Tor, taskRow, taskColumn int)

SetJobServerCacheTosHandler set job server list and update the job in sHandler

func (*SchedulerJob) SetNormalJobServerList

func (sJob *SchedulerJob) SetNormalJobServerList(sHandler *ScheduleHandler)

SetNormalJobServerList set the server list of normal job in nslb 1.0

func (SchedulerJob) SortJobServerListBySliceId

func (sJob SchedulerJob) SortJobServerListBySliceId() []*Tor

SortJobServerListBySliceId sort JobServer list by SliceId

func (*SchedulerJob) UpdateJobPendingMessage

func (sJob *SchedulerJob) UpdateJobPendingMessage(message, nodeName string)

UpdateJobPendingMessage update job pending message

func (SchedulerJob) ValidJobFn

func (sJob SchedulerJob) ValidJobFn() *api.ValidateResult

ValidJobFn valid job.

type SchedulerPlugin

// SchedulerPlugin is the base for all volcano-npu plugins; all of its fields
// are unexported.
type SchedulerPlugin struct {
	// contains filtered or unexported fields
}

SchedulerPlugin for all volcano-npu plugin.

func (SchedulerPlugin) GetAnnoName

func (sp SchedulerPlugin) GetAnnoName() string

GetAnnoName get AnnoName.

func (SchedulerPlugin) GetAnnoPreVal

func (sp SchedulerPlugin) GetAnnoPreVal() string

GetAnnoPreVal get AnnoPreVal.

func (SchedulerPlugin) GetDefaultJobSchedulerConfig

func (sp SchedulerPlugin) GetDefaultJobSchedulerConfig() map[string]string

GetDefaultJobSchedulerConfig get DefaultJobSchedulerConfig.

func (SchedulerPlugin) GetPluginName

func (sp SchedulerPlugin) GetPluginName() string

GetPluginName get PluginName.

func (*SchedulerPlugin) SetAnnoName

func (sp *SchedulerPlugin) SetAnnoName(annoName string)

SetAnnoName set AnnoName.

func (*SchedulerPlugin) SetAnnoPreVal

func (sp *SchedulerPlugin) SetAnnoPreVal(value string)

SetAnnoPreVal set AnnoPreVal.

func (*SchedulerPlugin) SetDefaultJobSchedulerConfig

func (sp *SchedulerPlugin) SetDefaultJobSchedulerConfig(conf map[string]string)

SetDefaultJobSchedulerConfig set DefaultJobSchedulerConfig.

func (*SchedulerPlugin) SetPluginName

func (sp *SchedulerPlugin) SetPluginName(name string)

SetPluginName set PluginName.

type Server

// Server is the server info. IsUsedByMulJob and NodeRank carry the json tag
// "-" (skipped by encoding/json); IP, Count and SliceId are mapped to
// "server_ip", "npu_count" and "slice_id".
type Server struct {
	IsUsedByMulJob bool   `json:"-"`
	NodeRank       string `json:"-"`
	IP             string `json:"server_ip"`
	Count          int    `json:"npu_count"`
	SliceId        int    `json:"slice_id"`
	Jobs           map[api.JobID]SchedulerJob
	CurrentJob     *api.JobID
	Name           string
}

Server server info

type ServerList

// ServerList is the generic form of one tor's server list: the tor id
// ("tor_id") and its servers ("server") as untyped key/value maps.
type ServerList struct {
	Id      int                      `json:"tor_id"`
	Servers []map[string]interface{} `json:"server"`
}

ServerList server interface

type Servers

// Servers holds the basic tor list together with its version, server count
// and tor count. ServerList uses the unexported basicTor type.
type Servers struct {
	Version     string      `json:"version"`
	ServerCount int         `json:"server_count"`
	TorCount    int         `json:"tor_count"`
	ServerList  []*basicTor `json:"server_list"`
}

Servers include basic tor

type Slice

// Slice groups the servers of one slice.
type Slice struct {
	Idle int
	Id   int
	// Nodes maps a string key to *Server
	// (presumably keyed by node name; TODO confirm).
	Nodes map[string]*Server
}

Slice include server

type SuperNode

// SuperNode is a node (by name) together with its SuperPodID.
type SuperNode struct {
	Name       string
	SuperPodID int32
}

SuperNode node with SuperPodID

type SuperPodInfo

// SuperPodInfo caches super pod info for pod rescheduling; every map is
// keyed by job ID.
type SuperPodInfo struct {
	SuperPodReschdInfo        map[api.JobID]map[string][]SuperNode // cache super pod re-schd info
	SuperPodFaultTaskNodes    map[api.JobID][]string               // cache fault task nodes info
	SuperPodMapFaultTaskNodes map[api.JobID]map[string]string      // cache task and nodes for stage2
}

SuperPodInfo cache super pod info for pod rescheduling

type SwitchFaultInfo

// SwitchFaultInfo is the fault info of a switch.
type SwitchFaultInfo struct {
	FaultCode  []string
	FaultLevel string
	UpdateTime int64
	NodeStatus string
}

SwitchFaultInfo Switch Fault Info

type SwitchInfosFromCmWithMutex

// SwitchInfosFromCmWithMutex is the switch infos from cm with a mutex.
// It embeds sync.Mutex, so values of this type must not be copied.
type SwitchInfosFromCmWithMutex struct {
	sync.Mutex
	// Switches maps a string key to SwitchFaultInfo
	// (presumably keyed by node name; TODO confirm).
	Switches map[string]SwitchFaultInfo
}

SwitchInfosFromCmWithMutex SwitchInfos From Cm WithMutex

type TaskDevInfo

// TaskDevInfo is the device info of a task: a rank id plus the embedded
// DevFaultInfo.
type TaskDevInfo struct {
	RankId int
	DevFaultInfo
}

TaskDevInfo is the device info of a task

type TaskResetInfo

// TaskResetInfo records the reset device information of a task.
type TaskResetInfo struct {
	RankList      []*TaskDevInfo
	UpdateTime    int64
	RetryTime     int
	FaultFlushing bool
	// GracefulExit is an int (presumably GraceExitValue or DefaultExitValue; TODO confirm).
	GracefulExit int
}

TaskResetInfo record task reset device information

type Tor

// Tor is the tor info, including its servers. Id, IP and Servers are mapped
// to the json names "tor_id", "tor_ip" and "server". IsHealthy and
// IsSharedTor are ints, not bools; their value meanings are not visible here.
type Tor struct {
	FreeServerCount int
	IsHealthy       int
	IsSharedTor     int
	Id              int       `json:"tor_id"`
	IP              string    `json:"tor_ip"`
	Servers         []*Server `json:"server"`
	Jobs            map[api.JobID]SchedulerJob
}

Tor tor info include server

func GetHealthyTorUsedByNormalJob

func GetHealthyTorUsedByNormalJob(tors []*Tor, sortType string) []*Tor

GetHealthyTorUsedByNormalJob get shared tors only used by normal job

func GetNotShareAndFreeTorServer

func GetNotShareAndFreeTorServer(tors []*Tor, sortType string) []*Tor

GetNotShareAndFreeTorServer get the free tors

func GetNotShareTorServer

func GetNotShareTorServer(tors []*Tor, sortType string) []*Tor

GetNotShareTorServer get the exclusiveTor tors

func GetSharedTorServer

func GetSharedTorServer(tors []*Tor, sortType string) []*Tor

GetSharedTorServer get healthy shared tors

func GetTorServer

func GetTorServer(tors []*Tor, sortType string) []*Tor

GetTorServer get all healthy tor

func GetUnhealthyTorServer

func GetUnhealthyTorServer(tors []*Tor, sortType string) []*Tor

GetUnhealthyTorServer get unhealthy shared tors

func (*Tor) GetNodeByNodeIP

func (t *Tor) GetNodeByNodeIP(ip string) *Server

GetNodeByNodeIP get node by node IP

func (*Tor) GetNodeByNodeName

func (t *Tor) GetNodeByNodeName(name string) *Server

GetNodeByNodeName get node by node name

func (*Tor) HasAcrossJob

func (t *Tor) HasAcrossJob(isNSLBv2 bool, jobName api.JobID) bool

HasAcrossJob whether has across job

func (*Tor) IsUsedByAcrossLargeModelJob

func (t *Tor) IsUsedByAcrossLargeModelJob() bool

IsUsedByAcrossLargeModelJob whether used by across large model job

type TorList

// TorList is the tor info about nodes. Note that Tors is mapped to the json
// name "server_list".
type TorList struct {
	Version  string `json:"version"`
	TorCount int    `json:"tor_count"`
	Tors     []*Tor `json:"server_list"`
	// contains filtered or unexported fields
}

TorList tor info about nodes

type TorListInfo

// TorListInfo holds tor list information for the current plugin: status,
// version, server and tor counts, and the per-tor server lists.
type TorListInfo struct {
	Status      string       `json:"status"`
	Version     string       `json:"version"`
	ServerCount int          `json:"server_count"`
	TorCount    int          `json:"tor_count"`
	ServerList  []ServerList `json:"server_list"`
}

TorListInfo information for the current plugin

type TorLs

// TorLs is a tor list, a slice of *Tor.
type TorLs []*Tor

TorLs tor list

func (TorLs) Len

func (tp TorLs) Len() int

Len get length

func (TorLs) Less

func (tp TorLs) Less(i, j int) bool

Less define rule

func (TorLs) Swap

func (tp TorLs) Swap(i, j int)

Swap swap element

type TorShare

// TorShare is the tor share info. NodeJobs is mapped to the json name "nodes".
type TorShare struct {
	IsHealthy   int
	IsSharedTor int
	NodeJobs    []NodeJobInfo `json:"nodes"`
}

TorShare tor share info

type UnschedulableReason

// UnschedulableReason is the message explaining why a pod is pending.
type UnschedulableReason struct {
	// Reason is a two-level map whose inner level is a set of string keys
	// (what each level is keyed by is not visible here; TODO confirm).
	Reason map[string]map[string]struct{}
	// The mutex is embedded as a pointer, so it must be non-nil before
	// Lock/Unlock is called.
	*sync.Mutex
}

UnschedulableReason the message of pod pending

type VChip

// VChip is the vnpu chip class.
type VChip struct {
	// PodMap maps a string key to a pod
	// (what the key is is not visible here; TODO confirm).
	PodMap map[string]*v1.Pod
	ID     []string
	// Name is the chip name, e.g. Ascend910-0.
	Name string
	// Kind is one of Ascend910/Ascend310/Ascend310P.
	Kind        string
	IsDual      bool
	Unstable    bool
	CoreNum     int
	SegmentFlag bool
	TotalRes    util.VResource
	UsedRes     util.VResource
	FreeRes     util.VResource
}

VChip vnpu chip class

func (*VChip) IsChipMeetResReq

func (vChip *VChip) IsChipMeetResReq(vRes util.VResource) bool

IsChipMeetResReq check chip resource can be allocated as the task requires

func (*VChip) IsPodResUnstable

func (vChip *VChip) IsPodResUnstable(pod *v1.Pod) bool

IsPodResUnstable return true if chip stable (note: the function name suggests "unstable"; verify against the implementation)

func (*VChip) UpdateDVPP

func (vChip *VChip) UpdateDVPP(podResDVPP string)

UpdateDVPP update dvpp according to pod resource

type VNode

// VNode is the vnpu node class.
type VNode struct {
	// Chips maps a chip ID to its VChip.
	Chips map[int]*VChip
	// ChipKind is one of Ascend910/310/310p.
	ChipKind string
	// UnhealthyChipIds is the set of unhealthy chip ids on this node.
	UnhealthyChipIds map[int]struct{}
	// ServerType has the form cardType-cardCoreNum-duo, e.g. Ascend310p-10-dual.
	ServerType string
	// TotalChipNum is the total number of chips, taken from capacity.
	TotalChipNum int
	// AiCorePerChip is the number of aicores on each chip.
	AiCorePerChip int
	// FreeChipNum is the number of free chips, taken from device-info.
	FreeChipNum int
	// TotalRes is the total resource on the node.
	TotalRes util.VResource
	// ValidVNode reports that the node was initialized successfully.
	ValidVNode bool
	// ChipType is one of 910B1/910B2C/910B3/910B4.
	ChipType string
}

VNode vnpu node class

func (VNode) GetNodeTopForWholeCard

func (vNode VNode) GetNodeTopForWholeCard() []int

GetNodeTopForWholeCard get node top for whole card

func (*VNode) IsResourceWholeCard

func (vNode *VNode) IsResourceWholeCard(aiCore int) bool

IsResourceWholeCard judge if resource is whole card by node total resource

func (*VNode) NewVChip

func (vNode *VNode) NewVChip(id int, totalRes util.VResource) *VChip

NewVChip create new vChip

func (*VNode) SelectChipFromNode

func (vNode *VNode) SelectChipFromNode(vRes util.VResource) (string, error)

SelectChipFromNode get chip with least resource that meets vRes requirements

type VolcanoFrame

// VolcanoFrame holds what is passed in by the volcano frame.
type VolcanoFrame struct {
	UID        types.UID
	Confs      []config.Configuration
	KubeClient kubernetes.Interface
	// VJobTemplate is a two-level map ending in a util.VResource (presumably
	// node type, then template name; TODO confirm).
	VJobTemplate   map[string]map[string]util.VResource
	SuperPodSize   int
	ReservePodSize int
}

VolcanoFrame passed in by the volcano frame.

func (*VolcanoFrame) CheckUseCIMByConfig

func (vf *VolcanoFrame) CheckUseCIMByConfig() bool

CheckUseCIMByConfig check use cluster info manager by config, default true

func (*VolcanoFrame) CheckVNPUSegmentEnableByConfig

func (vf *VolcanoFrame) CheckVNPUSegmentEnableByConfig() bool

CheckVNPUSegmentEnableByConfig Check VNPU segmentEnable by init plugin parameters, return true if static

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL