Documentation
¶
Overview ¶
Package plugin is used for Huawei Ascend pin affinity scheduling.
Package plugin is used for Huawei Ascend pin affinity scheduling.
Package plugin is used for Huawei Ascend pin affinity scheduling frame.
Package plugin is used for Huawei Ascend pin affinity scheduling frame.
Package plugin is used for Huawei Ascend pin affinity scheduling frame.
Package plugin is used for Huawei Ascend pin affinity scheduling frame.
Package plugin is used for Huawei Ascend pin affinity scheduling.
Package plugin is used for Huawei Ascend pin affinity scheduling.
Index ¶
- Constants
- func CheckNetSliceIsMeetJobRequire(sJob SchedulerJob, sHandler *ScheduleHandler, ...) error
- func GetCardPhysicsIDFromAscendCore(pod *v1.Pod, isWholeCard bool) ([]int, error)
- func GetJobInfoAllocatedTaskNum(jobInfo *api.JobInfo) int32
- func GetJobLabelFromVcJob(job *api.JobInfo) map[string]string
- func GetJobNPUTasks(vcJob *api.JobInfo) map[api.TaskID]util.NPUTask
- func GetJobSelectorFromVcJob(job *api.JobInfo) map[string]string
- func GetLargeModelMaxServerNum(tors []*Tor, sharedTorNum int) int
- func GetMaxSharedTorServerNum(tors []*Tor, sharedTorNum int) int
- func GetPhysicCardNameFromVChip(realCardName string) string
- func GetResourceFromTemplate(nodeType string, templateString string, ...) *util.VResource
- func GetTaskLabels(task *api.TaskInfo) map[string]string
- func GetTaskSelectors(task *api.TaskInfo) map[string]string
- func GetVCJobReqNPUTypeFromJobInfo(vcJob *api.JobInfo) (string, int, error)
- func GetVCTaskReqNPUTypeFromTaskInfo(vcTask *api.TaskInfo) (string, int)
- func GetWholeCardIDFromAscendReal(cardNameStr string) (int, error)
- func IsJobInitial(job *api.JobInfo) bool
- func IsJobRestarted(job *api.JobInfo) bool
- func IsNPUTask(nT *api.TaskInfo) bool
- func IsPodWholeCardFromAscendCore(coreCardName string) bool
- func IsPodWholeCardFromAscendReal(realCardName string) bool
- func SetJobServerList(sJob SchedulerJob, sHandler *ScheduleHandler, ...) error
- type AllocNodeRankOccurrence
- type CommonNode
- type DevFaultInfo
- type DeviceInfosWithMutex
- type FaultRankIdData
- type ISchedulerPlugin
- type ISchedulerPluginBase
- type ISchedulerPluginNeed
- type JobServers
- type LogicTorList
- type NPUBuilder
- type NPUNode
- func (n NPUNode) CheckNPUResourceStable(vcJob SchedulerJob) error
- func (n NPUNode) CheckNPUResourceStableReScheduling(vcJob util.SchedulerJobAttr) error
- func (n NPUNode) GetChipKindFromNpuNode() (string, error)
- func (n *NPUNode) GetNewNPUNodeAnnotation(usedTop []int, resourceName, resourceNamePre string) (string, error)
- func (n NPUNode) IsNodeChipResEnough(vRes util.VResource) bool
- func (n NPUNode) IsNodeNotMeetRes(taskResReq util.VResource) bool
- func (n NPUNode) IsNodeTotalResEnough(vRes util.VResource) bool
- type NodeDNodeInfo
- type NodeDeviceInfo
- type NodeDeviceInfoWithDevPlugin
- type NodeDeviceInfoWithID
- type NodeDeviceInfoWithTime
- type NodeInfoWithNodeD
- type NodeInfosFromCmWithMutex
- type NodeJobInfo
- type NslbParameters
- type OwnerInfo
- type RankIndexInfo
- type ScheduleCache
- type ScheduleEnv
- type ScheduleHandler
- func (sHandle *ScheduleHandler) BatchNodeOrderFn(task *api.TaskInfo, nodes []*api.NodeInfo) (map[string]float64, error)
- func (sHandle *ScheduleHandler) BeforeCloseHandler()
- func (sHandle *ScheduleHandler) CacheToShareCM() error
- func (sHandle *ScheduleHandler) GetJobTemplate() map[string]map[string]util.VResource
- func (sHandle *ScheduleHandler) GetNPUScheduler(name string) (ISchedulerPlugin, bool)
- func (sHandle *ScheduleHandler) InitCache()
- func (sHandle *ScheduleHandler) InitDeleteJobInfos()
- func (sHandle *ScheduleHandler) InitJobsFromSsn(ssn *framework.Session)
- func (sHandle *ScheduleHandler) InitJobsPlugin()
- func (sHandle *ScheduleHandler) InitNPUSession(ssn *framework.Session) error
- func (sHandle *ScheduleHandler) InitNSLB2(ssn *framework.Session)
- func (sHandle *ScheduleHandler) InitNodesFromSsn(ssn *framework.Session)
- func (sHandle *ScheduleHandler) InitReschedulerFromSsn(ssn *framework.Session)
- func (sHandle *ScheduleHandler) InitTorNodeInfo(ssn *framework.Session)
- func (sHandle *ScheduleHandler) InitVolcanoFrameFromSsn(ssn *framework.Session)
- func (sHandle *ScheduleHandler) IsPluginRegistered(name string) bool
- func (sHandle ScheduleHandler) IsTaskNeedNPUAllocated(task *api.TaskInfo) bool
- func (sHandle *ScheduleHandler) JobValid(obj interface{}) *api.ValidateResult
- func (sHandle ScheduleHandler) NPUAllocateFunc(task *api.TaskInfo)
- func (sHandle *ScheduleHandler) NPUDeallocateFunc(task *api.TaskInfo)
- func (sHandle *ScheduleHandler) NodePredicate(taskInfo *api.TaskInfo, nodeInfo *api.NodeInfo) error
- func (sHandle *ScheduleHandler) PreStartPlugin(ssn *framework.Session)
- func (sHandle *ScheduleHandler) RecordJobPendingMessage(vcJob SchedulerJob)
- func (sHandle *ScheduleHandler) RegisterNPUScheduler(name string, pc NPUBuilder)
- func (sHandle ScheduleHandler) SetJobPendReasonByNodesCase(job *api.JobInfo)
- func (sHandle *ScheduleHandler) SetJobPendingReason(vcJob *api.JobInfo, reason interface{}) error
- func (sHandle *ScheduleHandler) SetSingleLayerTorAffinityJobNodesScore(task *api.TaskInfo, nodeMaps map[string]*api.NodeInfo, vcJob SchedulerJob, ...) (map[string]float64, error)
- func (sHandle *ScheduleHandler) SetTorAffinityJobNodesScore(task *api.TaskInfo, nodeMaps map[string]*api.NodeInfo, vcJob SchedulerJob, ...) (map[string]float64, error)
- func (sHandle *ScheduleHandler) SetTorAffinityJobNodesScoreV2(task *api.TaskInfo, nodeMaps map[string]*api.NodeInfo, vcJob SchedulerJob, ...) (map[string]float64, error)
- func (sHandle *ScheduleHandler) UnRegisterNPUScheduler(name string) error
- func (sHandle *ScheduleHandler) UpdateConfigMap(obj interface{}, operator string)
- func (sHandle *ScheduleHandler) UpdatePodGroupPendingReason(job *api.JobInfo, reason string)
- type SchedulerJob
- func (sJob *SchedulerJob) CheckNodeNum(taskInfo *api.TaskInfo, vcNode NPUNode) error
- func (sJob SchedulerJob) CheckTorJobSinglePodDeleteV1(sHandler *ScheduleHandler, taskInfo *api.TaskInfo, vcNode NPUNode) error
- func (sJob SchedulerJob) CheckTorJobSinglePodDeleteV2(sHandler *ScheduleHandler, vcNode NPUNode) error
- func (sJob SchedulerJob) GetAnnoName() (string, error)
- func (sJob *SchedulerJob) GetEnableServerList(nodes map[string]*api.NodeInfo, sHandler *ScheduleHandler)
- func (sJob *SchedulerJob) GetFullTorNumFromTorInfo(sHandler *ScheduleHandler) int
- func (sJob SchedulerJob) GetLogicTorList(sHandler *ScheduleHandler, netSliceNum int) [][]*Server
- func (sJob SchedulerJob) GetPhyTosList(sHandler *ScheduleHandler, logicList [][]*Server) ([]*Tor, int)
- func (sJob SchedulerJob) GetReqCardNameFromRingController() string
- func (sJob *SchedulerJob) Init(vcJob *api.JobInfo, sHandle *ScheduleHandler) error
- func (sJob SchedulerJob) IsNPUJob() bool
- func (sJob *SchedulerJob) IsTorAffinityJob() bool
- func (sJob *SchedulerJob) MarkMulJobServerList()
- func (sJob *SchedulerJob) MarkTorListByJob(nodes map[string]*api.NodeInfo, sHandler *ScheduleHandler)
- func (sJob SchedulerJob) PreCheckTorEnv(sHandler *ScheduleHandler, nodeMaps map[string]*api.NodeInfo) error
- func (sJob *SchedulerJob) SetFaultJobRankIndex()
- func (sJob SchedulerJob) SetFillJobServerList(sHandler *ScheduleHandler, Tors []*Tor, taskNum int) error
- func (sJob *SchedulerJob) SetFillJobServerListV2(Tors []*Tor, taskNum int) error
- func (sJob *SchedulerJob) SetJobRankIndex()
- func (sJob *SchedulerJob) SetJobServerCacheTosHandler(sHandler *ScheduleHandler, pyTor []*Tor, taskRow, taskColumn int)
- func (sJob *SchedulerJob) SetNormalJobServerList(sHandler *ScheduleHandler)
- func (sJob SchedulerJob) SortJobServerListBySliceId() []*Tor
- func (sJob *SchedulerJob) UpdateJobPendingMessage(message, nodeName string)
- func (sJob SchedulerJob) ValidJobFn() *api.ValidateResult
- type SchedulerPlugin
- func (sp SchedulerPlugin) GetAnnoName() string
- func (sp SchedulerPlugin) GetAnnoPreVal() string
- func (sp SchedulerPlugin) GetDefaultJobSchedulerConfig() map[string]string
- func (sp SchedulerPlugin) GetPluginName() string
- func (sp *SchedulerPlugin) SetAnnoName(annoName string)
- func (sp *SchedulerPlugin) SetAnnoPreVal(value string)
- func (sp *SchedulerPlugin) SetDefaultJobSchedulerConfig(conf map[string]string)
- func (sp *SchedulerPlugin) SetPluginName(name string)
- type Server
- type ServerList
- type Servers
- type Slice
- type SuperNode
- type SuperPodInfo
- type SwitchFaultInfo
- type SwitchInfosFromCmWithMutex
- type TaskDevInfo
- type TaskResetInfo
- type Tor
- func GetHealthyTorUsedByNormalJob(tors []*Tor, sortType string) []*Tor
- func GetNotShareAndFreeTorServer(tors []*Tor, sortType string) []*Tor
- func GetNotShareTorServer(tors []*Tor, sortType string) []*Tor
- func GetSharedTorServer(tors []*Tor, sortType string) []*Tor
- func GetTorServer(tors []*Tor, sortType string) []*Tor
- func GetUnhealthyTorServer(tors []*Tor, sortType string) []*Tor
- type TorList
- type TorListInfo
- type TorLs
- type TorShare
- type UnschedulableReason
- type VChip
- type VNode
- type VolcanoFrame
Constants ¶
const ( // TorNodeCMName the Name of tor info configmap TorNodeCMName = "basic-tor-node-cm" TorShareCMName = "tor-share-cm" // TorInfoCMKey the key of tor info in configmap TorInfoCMKey = "tor_info" // TorLevelCMKey the key of tor level in configmap TorLevelCMKey = "tor_level" // SingleLayer the single layer switch value of tor level in configmap SingleLayer = "single_layer" // TorAffinityKey the key of tor affinity TorAffinityKey = "tor-affinity" // GlobalTorInfoKey the key of tor share info in configmap GlobalTorInfoKey = "global-tor-info" // LargeModelTag the value of large model LargeModelTag = "large-model-schema" // NormalSchema the value of normal tor affinity NormalSchema = "normal-schema" // NullTag the value means not use tor affinity NullTag = "null" // JobDeleteFlag the flag mark job is deleted JobDeleteFlag = "fault-job-delete" // JobDelete the value of mark job is deleted JobDelete = "deleted" // NSLB2Version nslb 2.0 version NSLB2Version = "2.0" SharedTorIp = "sharedTorIp" // PodRankIndexKey rank index key PodRankIndexKey = "hccl/rankIndex" // ReplicaSetType replicaset type ReplicaSetType = "ReplicaSet" )
const ( // ChipTypeB1 chip type 910B1 ChipTypeB1 = "910B1" // ChipTypeB2C chip type 910B2C ChipTypeB2C = "910B2C" // ChipTypeB3 chip type 910B3 ChipTypeB3 = "910B3" // ChipTypeB4 chip type 910B4 ChipTypeB4 = "910B4" )
const ( // VNPUTempVir06 vir06_1c_16g VNPUTempVir06 = "vir06_1c_16g" // VNPUTempVir03 vir03_1c_8g VNPUTempVir03 = "vir03_1c_8g" // VNPUTempVir12 vir12_3c_32g VNPUTempVir12 = "vir12_3c_32g" )
the templates of 910B1/910B2C
const ( // VNPUTempVir05 vir05_1c_16g VNPUTempVir05 = "vir05_1c_16g" // VNPUTempVir10 vir10_3c_32g VNPUTempVir10 = "vir10_3c_32g" )
the templates of 910B3
const ( // VNPUB4TempVir05 vir05_1c_8g VNPUB4TempVir05 = "vir05_1c_8g" // VNPUB4TempVir10C3NM vir10_3c_16g_nm VNPUB4TempVir10C3NM = "vir10_3c_16g_nm" // VNPUB4TempVir10C4M vir10_4c_16g_m VNPUB4TempVir10C4M = "vir10_4c_16g_m" // VNPUB4TempVir10 vir10_3c_16g VNPUB4TempVir10 = "vir10_3c_16g" )
the templates of 910B4
const ( // GraceExitValue grace exit value GraceExitValue = 1 // DefaultExitValue default exit value DefaultExitValue = 0 )
const ( // PluginName the HuaWei NPU 's plugin name. PluginName = "huaweiNPU" // FormatIncorrectError format incorrect error FormatIncorrectError = "format incorrect" // AscendVNPULevel vnpu level AscendVNPULevel = "vnpu-level" // AscendVNPULevelLow low AscendVNPULevelLow = "low" // AscendVNPULevelHigh high AscendVNPULevelHigh = "high" // AscendVNPUPrefix vir AscendVNPUPrefix = "vir" // AscendVNPUDVPP dvpp enable AscendVNPUDVPP = "vnpu-dvpp" // AscendDVPPEnabledOff off AscendDVPPEnabledOff = "no" // AscendDVPPEnabledNull null AscendDVPPEnabledNull = "null" // AscendDVPPEnabledOn on AscendDVPPEnabledOn = "yes" // AscendNDVPPValue value AscendNDVPPValue = "ndvpp" // AscendDVPPValue value AscendDVPPValue = "dvpp" // VNPUTempVir01 vir01 VNPUTempVir01 = "vir01" // VNPUTempVir02 vir02 VNPUTempVir02 = "vir02" // VNPUTempVir02C1 vir02_1c VNPUTempVir02C1 = "vir02_1c" // VNPUTempVir04 vir04 VNPUTempVir04 = "vir04" // VNPUTempVir04C3 vir04_3c VNPUTempVir04C3 = "vir04_3c" // VNPUTempVir04C3NDVPP vir04_3c_ndvpp VNPUTempVir04C3NDVPP = "vir04_3c_ndvpp" // VNPUTempVir04C4cDVPP vir04_4c_dvpp VNPUTempVir04C4cDVPP = "vir04_4c_dvpp" // VNPUTempVir08 vir08 only 910 VNPUTempVir08 = "vir08" // VNPUTempVir16 vir16 only 910 VNPUTempVir16 = "vir16" // Ascend310P 310P template name Ascend310P = "Ascend310P" // Ascend910 910 template name Ascend910 = "Ascend910" // ResetInfoCMNamePrefix for reset configmap name prefix ResetInfoCMNamePrefix = "reset-config-" // ResetInfoCMDataKey for reset configmap data key ResetInfoCMDataKey = "reset.json" // ResetInfoTypeKey for reset configmap type key ResetInfoTypeKey = "restartType" // PodRescheduleRestartType for hot reset restart type PodRescheduleRestartType = "podReschedule" )
Variables ¶
This section is empty.
Functions ¶
func CheckNetSliceIsMeetJobRequire ¶
func CheckNetSliceIsMeetJobRequire(sJob SchedulerJob, sHandler *ScheduleHandler, nodeMaps map[string]*api.NodeInfo) error
CheckNetSliceIsMeetJobRequire checks whether the net slice meets the job requirement in nslb 1.0 and sets the job server list
func GetCardPhysicsIDFromAscendCore ¶
GetCardPhysicsIDFromAscendCore get card physics id from 0,1/0-vir04
func GetJobInfoAllocatedTaskNum ¶
GetJobInfoAllocatedTaskNum get job allocated task num
func GetJobLabelFromVcJob ¶
GetJobLabelFromVcJob get job's label, not task's.
func GetJobNPUTasks ¶
GetJobNPUTasks get NPUTask from jobInfo.
func GetJobSelectorFromVcJob ¶
GetJobSelectorFromVcJob get job selector.
func GetLargeModelMaxServerNum ¶
GetLargeModelMaxServerNum get the node num that nslb 2.0 job can use at most
func GetMaxSharedTorServerNum ¶
GetMaxSharedTorServerNum get max shared tor num a job can use
func GetPhysicCardNameFromVChip ¶
GetPhysicCardNameFromVChip get cardName from whole Ascend310P-0/Ascend310P-1c-400-3_0
func GetResourceFromTemplate ¶
func GetResourceFromTemplate(nodeType string, templateString string, taskTemplate map[string]map[string]util.VResource) *util.VResource
GetResourceFromTemplate nodeType like Ascend310P, templateString like "vir04_3c_ndvpp"
func GetTaskLabels ¶
GetTaskLabels get task's Labels.
func GetTaskSelectors ¶
GetTaskSelectors get task's selector.
func GetVCJobReqNPUTypeFromJobInfo ¶
GetVCJobReqNPUTypeFromJobInfo get job request resource, only NPU.
func GetVCTaskReqNPUTypeFromTaskInfo ¶
GetVCTaskReqNPUTypeFromTaskInfo get task request resource, only NPU.
func GetWholeCardIDFromAscendReal ¶
GetWholeCardIDFromAscendReal get card physics id from Ascend910-0
func IsJobInitial ¶
IsJobInitial Determine if the task is ready.
func IsJobRestarted ¶
IsJobRestarted used for rescheduling, judge if job restarted
func IsPodWholeCardFromAscendCore ¶
IsPodWholeCardFromAscendCore judge if card is whole card 0,1/0-vir04
func IsPodWholeCardFromAscendReal ¶
IsPodWholeCardFromAscendReal judge if card is whole card Ascend310P-0/Ascend310P-1c-400-3_0
func SetJobServerList ¶
func SetJobServerList(sJob SchedulerJob, sHandler *ScheduleHandler, nodeMaps map[string]*api.NodeInfo) error
SetJobServerList checks whether the single layer tor meets the job requirement and sets the job server list
Types ¶
type AllocNodeRankOccurrence ¶
type AllocNodeRankOccurrence struct { NodeName string RankIndex string IsFault bool Occurrence int }
AllocNodeRankOccurrence object recording node rankIndex and whether index re-allocated to new node
type CommonNode ¶
type CommonNode struct { Name string Capability map[v1.ResourceName]float64 Allocate map[v1.ResourceName]float64 Idle map[v1.ResourceName]float64 BaseDeviceInfo string Annotation map[string]string Label map[string]string Address string SuperPodID int32 }
CommonNode common npu node properties
type DevFaultInfo ¶
type DevFaultInfo struct { LogicId int32 Status string Policy string InitialPolicy string ErrorCode []int64 ErrorCodeHex string }
DevFaultInfo is the fault info of device
type DeviceInfosWithMutex ¶
type DeviceInfosWithMutex struct { sync.Mutex Devices map[string]NodeDeviceInfoWithID }
DeviceInfosWithMutex information for the current plugin
type FaultRankIdData ¶
FaultRankIdData fault rank id data
type ISchedulerPlugin ¶
type ISchedulerPlugin interface { ISchedulerPluginBase ISchedulerPluginNeed }
ISchedulerPlugin for volcano-npu plugin has function.
type ISchedulerPluginBase ¶
type ISchedulerPluginBase interface { GetPluginName() string SetPluginName(string) GetAnnoPreVal() string SetAnnoPreVal(string) GetAnnoName() string SetAnnoName(string) GetDefaultJobSchedulerConfig() map[string]string SetDefaultJobSchedulerConfig(map[string]string) }
ISchedulerPluginBase the frame plugin need implement.
type ISchedulerPluginNeed ¶
type ISchedulerPluginNeed interface { // ValidNPUJob Valid the job part of npu scheduler policy, if not, disallowed. ValidNPUJob() *api.ValidateResult CheckNodeNPUByTask(*api.TaskInfo, NPUNode) error ScoreBestNPUNodes(*api.TaskInfo, []*api.NodeInfo, map[string]float64) error UseAnnotation(*api.TaskInfo, NPUNode) *NPUNode ReleaseAnnotation(*api.TaskInfo, NPUNode) *NPUNode PreStartAction(i interface{}, ssn *framework.Session) error PreStopAction(*ScheduleEnv) error InitMyJobPlugin(util.SchedulerJobAttr, ScheduleEnv) error GetReHandle() interface{} }
ISchedulerPluginNeed The interface that the specific plug-in needs to implement.
type NPUBuilder ¶
type NPUBuilder = func(string2 string) ISchedulerPlugin
NPUBuilder PluginBuilder plugin management
type NPUNode ¶
type NPUNode struct { CommonNode VNode IsUnhealthy bool // contains filtered or unexported fields }
NPUNode the plugin define node info.
func (NPUNode) CheckNPUResourceStable ¶
func (n NPUNode) CheckNPUResourceStable(vcJob SchedulerJob) error
CheckNPUResourceStable check resource stabilize.
func (NPUNode) CheckNPUResourceStableReScheduling ¶
func (n NPUNode) CheckNPUResourceStableReScheduling(vcJob util.SchedulerJobAttr) error
CheckNPUResourceStableReScheduling check resource stabilize.
func (NPUNode) GetChipKindFromNpuNode ¶
GetChipKindFromNpuNode input huawei-Ascend910 return Ascend910/Ascend310p/Ascend310
func (*NPUNode) GetNewNPUNodeAnnotation ¶
func (n *NPUNode) GetNewNPUNodeAnnotation(usedTop []int, resourceName, resourceNamePre string) (string, error)
GetNewNPUNodeAnnotation get new annotation after allocate
func (NPUNode) IsNodeChipResEnough ¶
IsNodeChipResEnough judge if chip on node can be allocated to job
func (NPUNode) IsNodeNotMeetRes ¶
IsNodeNotMeetRes judge the node meet resource or not.
type NodeDNodeInfo ¶
type NodeDNodeInfo struct { FaultDevList []struct { DeviceType string DeviceId int FaultCode []string FaultLevel string } NodeStatus string }
NodeDNodeInfo is the node information reported by noded
type NodeDeviceInfo ¶
NodeDeviceInfo like node annotation.
type NodeDeviceInfoWithDevPlugin ¶
type NodeDeviceInfoWithDevPlugin struct { DeviceInfo NodeDeviceInfo CheckCode string SuperPodID int32 `json:"SuperPodID,omitempty"` ServerIndex int32 `json:"ServerIndex,omitempty"` }
NodeDeviceInfoWithDevPlugin a node has one by cm.
type NodeDeviceInfoWithID ¶
type NodeDeviceInfoWithID struct { NodeDeviceInfo SuperPodID int32 }
NodeDeviceInfoWithID is the node information reported by cm.
type NodeDeviceInfoWithTime ¶
type NodeDeviceInfoWithTime struct { NodeDeviceInfoWithID HostUpdateTime int64 }
NodeDeviceInfoWithTime is node device info with time
type NodeInfoWithNodeD ¶
type NodeInfoWithNodeD struct { NodeInfo NodeDNodeInfo CheckCode string }
NodeInfoWithNodeD is the node information and checkCode reported by noded
type NodeInfosFromCmWithMutex ¶
type NodeInfosFromCmWithMutex struct { sync.Mutex Nodes map[string]NodeDNodeInfo }
NodeInfosFromCmWithMutex node info with mutex
type NodeJobInfo ¶
NodeJobInfo node job info
type NslbParameters ¶
type NslbParameters struct {
// contains filtered or unexported fields
}
NslbParameters the parameters of nslb
type OwnerInfo ¶
type OwnerInfo struct { v1.OwnerReference Annotations map[string]string Replicas *int32 }
OwnerInfo the owner info of job
type RankIndexInfo ¶
RankIndexInfo the info of job used rank
type ScheduleCache ¶
type ScheduleCache struct { // special, name, value Names, Namespaces map[string]string Data map[string]map[string]string FaultConfigMaps map[api.JobID]*FaultRankIdData }
ScheduleCache the plugin defined caches saving cm data
type ScheduleEnv ¶
type ScheduleEnv struct { IsFirstSession *bool // scheduler first session message is unreliable Jobs map[api.JobID]SchedulerJob JobReplicas map[api.JobID]int32 Nodes map[string]NPUNode NodesNotInSsn map[string]*corev1.Node JobSinglePodFlag map[api.JobID]bool JobSeverInfos map[api.JobID]struct{} JobDeleteFlag map[api.JobID]struct{} DeviceInfos *DeviceInfosWithMutex DeleteJobInfos map[api.JobID]*api.JobInfo NodeInfosFromCm *NodeInfosFromCmWithMutex // NodeInfos is get from kube-system/node-info- configmap SwitchInfosFromCm *SwitchInfosFromCmWithMutex // SwitchInfosFromCm is get from mindx-dl/device-info- configmap FrameAttr VolcanoFrame Cache ScheduleCache Tors *TorList NslbAttr *NslbParameters SuperPodInfo *SuperPodInfo JobPendingMessage map[api.JobID]map[string]map[string]struct{} }
ScheduleEnv for job scheduler context.
type ScheduleHandler ¶
type ScheduleHandler struct { NPUPlugins map[string]NPUBuilder ScheduleEnv BaseHandle ISchedulerPlugin sync.Once }
ScheduleHandler information for the current plugin
func (*ScheduleHandler) BatchNodeOrderFn ¶
func (sHandle *ScheduleHandler) BatchNodeOrderFn(task *api.TaskInfo, nodes []*api.NodeInfo) (map[string]float64, error)
BatchNodeOrderFn Score the selected nodes.
func (*ScheduleHandler) BeforeCloseHandler ¶
func (sHandle *ScheduleHandler) BeforeCloseHandler()
BeforeCloseHandler do the action before ssn close.
func (*ScheduleHandler) CacheToShareCM ¶
func (sHandle *ScheduleHandler) CacheToShareCM() error
CacheToShareCM cache tors info to configmap
func (*ScheduleHandler) GetJobTemplate ¶
func (sHandle *ScheduleHandler) GetJobTemplate() map[string]map[string]util.VResource
GetJobTemplate get template of all possible segmentation jobs
func (*ScheduleHandler) GetNPUScheduler ¶
func (sHandle *ScheduleHandler) GetNPUScheduler(name string) (ISchedulerPlugin, bool)
GetNPUScheduler get the NPU scheduler by name
func (*ScheduleHandler) InitCache ¶
func (sHandle *ScheduleHandler) InitCache()
InitCache init ScheduleHandler's cache.
func (*ScheduleHandler) InitDeleteJobInfos ¶
func (sHandle *ScheduleHandler) InitDeleteJobInfos()
InitDeleteJobInfos init empty deleted jobinfos.
func (*ScheduleHandler) InitJobsFromSsn ¶
func (sHandle *ScheduleHandler) InitJobsFromSsn(ssn *framework.Session)
InitJobsFromSsn init all jobs in ssn.
func (*ScheduleHandler) InitJobsPlugin ¶
func (sHandle *ScheduleHandler) InitJobsPlugin()
InitJobsPlugin init job by plugins.
func (*ScheduleHandler) InitNPUSession ¶
func (sHandle *ScheduleHandler) InitNPUSession(ssn *framework.Session) error
InitNPUSession init npu plugin and nodes.
func (*ScheduleHandler) InitNSLB2 ¶
func (sHandle *ScheduleHandler) InitNSLB2(ssn *framework.Session)
InitNSLB2 Init NSLB 2.0
func (*ScheduleHandler) InitNodesFromSsn ¶
func (sHandle *ScheduleHandler) InitNodesFromSsn(ssn *framework.Session)
InitNodesFromSsn init all nodes in ssn.
func (*ScheduleHandler) InitReschedulerFromSsn ¶
func (sHandle *ScheduleHandler) InitReschedulerFromSsn(ssn *framework.Session)
InitReschedulerFromSsn initialize re-scheduler
func (*ScheduleHandler) InitTorNodeInfo ¶
func (sHandle *ScheduleHandler) InitTorNodeInfo(ssn *framework.Session)
InitTorNodeInfo init tor node if basic tor node configmap exists
func (*ScheduleHandler) InitVolcanoFrameFromSsn ¶
func (sHandle *ScheduleHandler) InitVolcanoFrameFromSsn(ssn *framework.Session)
InitVolcanoFrameFromSsn init frame parameter from ssn.
func (*ScheduleHandler) IsPluginRegistered ¶
func (sHandle *ScheduleHandler) IsPluginRegistered(name string) bool
IsPluginRegistered Determine if the plug-in is registered.
func (ScheduleHandler) IsTaskNeedNPUAllocated ¶
func (sHandle ScheduleHandler) IsTaskNeedNPUAllocated(task *api.TaskInfo) bool
IsTaskNeedNPUAllocated to judge the task is static cut. true is dynamic cut.
func (*ScheduleHandler) JobValid ¶
func (sHandle *ScheduleHandler) JobValid(obj interface{}) *api.ValidateResult
JobValid the job valid, used by volcano frame.
func (ScheduleHandler) NPUAllocateFunc ¶
func (sHandle ScheduleHandler) NPUAllocateFunc(task *api.TaskInfo)
NPUAllocateFunc Allocate npu and called by volcano frame.
func (*ScheduleHandler) NPUDeallocateFunc ¶
func (sHandle *ScheduleHandler) NPUDeallocateFunc(task *api.TaskInfo)
NPUDeallocateFunc Free assigned npu, if allocate failed by volcano frame.
func (*ScheduleHandler) NodePredicate ¶
NodePredicate Predicate nodes.
func (*ScheduleHandler) PreStartPlugin ¶
func (sHandle *ScheduleHandler) PreStartPlugin(ssn *framework.Session)
PreStartPlugin preStart plugin action.
func (*ScheduleHandler) RecordJobPendingMessage ¶
func (sHandle *ScheduleHandler) RecordJobPendingMessage(vcJob SchedulerJob)
RecordJobPendingMessage record the job pending message to log
func (*ScheduleHandler) RegisterNPUScheduler ¶
func (sHandle *ScheduleHandler) RegisterNPUScheduler(name string, pc NPUBuilder)
RegisterNPUScheduler register the plugin, like a factory.
func (ScheduleHandler) SetJobPendReasonByNodesCase ¶
func (sHandle ScheduleHandler) SetJobPendReasonByNodesCase(job *api.JobInfo)
SetJobPendReasonByNodesCase In nodes select case, set node failed and add failed reason.
func (*ScheduleHandler) SetJobPendingReason ¶
func (sHandle *ScheduleHandler) SetJobPendingReason(vcJob *api.JobInfo, reason interface{}) error
SetJobPendingReason set the pod and podGroup pending reason.
func (*ScheduleHandler) SetSingleLayerTorAffinityJobNodesScore ¶
func (sHandle *ScheduleHandler) SetSingleLayerTorAffinityJobNodesScore(task *api.TaskInfo, nodeMaps map[string]*api.NodeInfo, vcJob SchedulerJob, scoreMap map[string]float64) (map[string]float64, error)
SetSingleLayerTorAffinityJobNodesScore single layer switch networking rule
func (*ScheduleHandler) SetTorAffinityJobNodesScore ¶
func (sHandle *ScheduleHandler) SetTorAffinityJobNodesScore(task *api.TaskInfo, nodeMaps map[string]*api.NodeInfo, vcJob SchedulerJob, label string, scoreMap map[string]float64) (map[string]float64, error)
SetTorAffinityJobNodesScore nslb 1.0 rule
func (*ScheduleHandler) SetTorAffinityJobNodesScoreV2 ¶
func (sHandle *ScheduleHandler) SetTorAffinityJobNodesScoreV2(task *api.TaskInfo, nodeMaps map[string]*api.NodeInfo, vcJob SchedulerJob, scoreMap map[string]float64) (map[string]float64, error)
SetTorAffinityJobNodesScoreV2 nslb 2.0 rule
func (*ScheduleHandler) UnRegisterNPUScheduler ¶
func (sHandle *ScheduleHandler) UnRegisterNPUScheduler(name string) error
UnRegisterNPUScheduler unRegister the plugin
func (*ScheduleHandler) UpdateConfigMap ¶
func (sHandle *ScheduleHandler) UpdateConfigMap(obj interface{}, operator string)
UpdateConfigMap update deviceInfo in cache
func (*ScheduleHandler) UpdatePodGroupPendingReason ¶
func (sHandle *ScheduleHandler) UpdatePodGroupPendingReason(job *api.JobInfo, reason string)
UpdatePodGroupPendingReason update pg
type SchedulerJob ¶
type SchedulerJob struct { util.SchedulerJobAttr RankIndexInfo UnschedulableReason ServerList []*Tor TorBlackMaps map[string]struct{} JobReadyTag bool SuperPods map[string][]SuperNode Owner OwnerInfo // contains filtered or unexported fields }
SchedulerJob the plugin define job info
func (*SchedulerJob) CheckNodeNum ¶
func (sJob *SchedulerJob) CheckNodeNum(taskInfo *api.TaskInfo, vcNode NPUNode) error
CheckNodeNum Check whether the number of cards on the node meets the task requirements.
func (SchedulerJob) CheckTorJobSinglePodDeleteV1 ¶
func (sJob SchedulerJob) CheckTorJobSinglePodDeleteV1(sHandler *ScheduleHandler, taskInfo *api.TaskInfo, vcNode NPUNode) error
CheckTorJobSinglePodDeleteV1 valid node.
func (SchedulerJob) CheckTorJobSinglePodDeleteV2 ¶
func (sJob SchedulerJob) CheckTorJobSinglePodDeleteV2(sHandler *ScheduleHandler, vcNode NPUNode) error
CheckTorJobSinglePodDeleteV2 valid node.
func (SchedulerJob) GetAnnoName ¶
func (sJob SchedulerJob) GetAnnoName() (string, error)
GetAnnoName get job AnnoName, include vNPU job.
func (*SchedulerJob) GetEnableServerList ¶
func (sJob *SchedulerJob) GetEnableServerList(nodes map[string]*api.NodeInfo, sHandler *ScheduleHandler)
GetEnableServerList get global tor list, mark the servers a job can be scheduled on
func (*SchedulerJob) GetFullTorNumFromTorInfo ¶
func (sJob *SchedulerJob) GetFullTorNumFromTorInfo(sHandler *ScheduleHandler) int
GetFullTorNumFromTorInfo get the num of full tor
func (SchedulerJob) GetLogicTorList ¶
func (sJob SchedulerJob) GetLogicTorList(sHandler *ScheduleHandler, netSliceNum int) [][]*Server
GetLogicTorList get logic tor list by global tor list
func (SchedulerJob) GetPhyTosList ¶
func (sJob SchedulerJob) GetPhyTosList(sHandler *ScheduleHandler, logicList [][]*Server) ([]*Tor, int)
GetPhyTosList transpose the logic tor list
func (SchedulerJob) GetReqCardNameFromRingController ¶
func (sJob SchedulerJob) GetReqCardNameFromRingController() string
GetReqCardNameFromRingController Get request card name from RingController.
func (*SchedulerJob) Init ¶
func (sJob *SchedulerJob) Init(vcJob *api.JobInfo, sHandle *ScheduleHandler) error
Init the SchedulerJob's init.
func (SchedulerJob) IsNPUJob ¶
func (sJob SchedulerJob) IsNPUJob() bool
IsNPUJob check SchedulerJob is npu job
func (*SchedulerJob) IsTorAffinityJob ¶
func (sJob *SchedulerJob) IsTorAffinityJob() bool
IsTorAffinityJob check job is tor affinity job
func (*SchedulerJob) MarkMulJobServerList ¶
func (sJob *SchedulerJob) MarkMulJobServerList()
MarkMulJobServerList mark the job if the servers the job uses span more than 1 tor
func (*SchedulerJob) MarkTorListByJob ¶
func (sJob *SchedulerJob) MarkTorListByJob(nodes map[string]*api.NodeInfo, sHandler *ScheduleHandler)
MarkTorListByJob mark the global tor list by node list a job can be scheduled
func (SchedulerJob) PreCheckTorEnv ¶
func (sJob SchedulerJob) PreCheckTorEnv(sHandler *ScheduleHandler, nodeMaps map[string]*api.NodeInfo) error
PreCheckTorEnv precheck whether the env of the cluster is ready for tor affinity job
func (*SchedulerJob) SetFaultJobRankIndex ¶
func (sJob *SchedulerJob) SetFaultJobRankIndex()
SetFaultJobRankIndex set rank index for fault job's fault task
func (SchedulerJob) SetFillJobServerList ¶
func (sJob SchedulerJob) SetFillJobServerList(sHandler *ScheduleHandler, Tors []*Tor, taskNum int) error
SetFillJobServerList set the fill job server list in nslb 1.0 and single layer switch networking rule
func (*SchedulerJob) SetFillJobServerListV2 ¶
func (sJob *SchedulerJob) SetFillJobServerListV2(Tors []*Tor, taskNum int) error
SetFillJobServerListV2 set the fill job server list in nslb 2.0
func (*SchedulerJob) SetJobRankIndex ¶
func (sJob *SchedulerJob) SetJobRankIndex()
SetJobRankIndex set rank index for job
func (*SchedulerJob) SetJobServerCacheTosHandler ¶
func (sJob *SchedulerJob) SetJobServerCacheTosHandler(sHandler *ScheduleHandler, pyTor []*Tor, taskRow, taskColumn int)
SetJobServerCacheTosHandler set job server list and update the job in sHandler
func (*SchedulerJob) SetNormalJobServerList ¶
func (sJob *SchedulerJob) SetNormalJobServerList(sHandler *ScheduleHandler)
SetNormalJobServerList set the server list of normal job in nslb 1.0
func (SchedulerJob) SortJobServerListBySliceId ¶
func (sJob SchedulerJob) SortJobServerListBySliceId() []*Tor
SortJobServerListBySliceId sort JobServer list by SliceId
func (*SchedulerJob) UpdateJobPendingMessage ¶
func (sJob *SchedulerJob) UpdateJobPendingMessage(message, nodeName string)
UpdateJobPendingMessage update job pending message
func (SchedulerJob) ValidJobFn ¶
func (sJob SchedulerJob) ValidJobFn() *api.ValidateResult
ValidJobFn valid job.
type SchedulerPlugin ¶
type SchedulerPlugin struct {
// contains filtered or unexported fields
}
SchedulerPlugin for all volcano-npu plugin.
func (SchedulerPlugin) GetAnnoName ¶
func (sp SchedulerPlugin) GetAnnoName() string
GetAnnoName get AnnoName.
func (SchedulerPlugin) GetAnnoPreVal ¶
func (sp SchedulerPlugin) GetAnnoPreVal() string
GetAnnoPreVal get AnnoPreVal.
func (SchedulerPlugin) GetDefaultJobSchedulerConfig ¶
func (sp SchedulerPlugin) GetDefaultJobSchedulerConfig() map[string]string
GetDefaultJobSchedulerConfig get DefaultJobSchedulerConfig.
func (SchedulerPlugin) GetPluginName ¶
func (sp SchedulerPlugin) GetPluginName() string
GetPluginName get PluginName.
func (*SchedulerPlugin) SetAnnoName ¶
func (sp *SchedulerPlugin) SetAnnoName(annoName string)
SetAnnoName set AnnoName.
func (*SchedulerPlugin) SetAnnoPreVal ¶
func (sp *SchedulerPlugin) SetAnnoPreVal(value string)
SetAnnoPreVal set AnnoPreVal.
func (*SchedulerPlugin) SetDefaultJobSchedulerConfig ¶
func (sp *SchedulerPlugin) SetDefaultJobSchedulerConfig(conf map[string]string)
SetDefaultJobSchedulerConfig set DefaultJobSchedulerConfig.
func (*SchedulerPlugin) SetPluginName ¶
func (sp *SchedulerPlugin) SetPluginName(name string)
SetPluginName set PluginName.
type Server ¶
type Server struct { IsUsedByMulJob bool `json:"-"` NodeRank string `json:"-"` IP string `json:"server_ip"` Count int `json:"npu_count"` SliceId int `json:"slice_id"` Jobs map[api.JobID]SchedulerJob CurrentJob *api.JobID Name string }
Server server info
type ServerList ¶
ServerList server interface
type Servers ¶
type Servers struct { Version string `json:"version"` ServerCount int `json:"server_count"` TorCount int `json:"tor_count"` ServerList []*basicTor `json:"server_list"` }
Servers include basic tor
type SuperPodInfo ¶
type SuperPodInfo struct { SuperPodReschdInfo map[api.JobID]map[string][]SuperNode // cache super pod re-schd info SuperPodFaultTaskNodes map[api.JobID][]string // cache fault task nodes info SuperPodMapFaultTaskNodes map[api.JobID]map[string]string // cache task and nodes for stage2 }
SuperPodInfo cache super pod info for pod rescheduling
type SwitchFaultInfo ¶
type SwitchFaultInfo struct { FaultCode []string FaultLevel string UpdateTime int64 NodeStatus string }
SwitchFaultInfo Switch Fault Info
type SwitchInfosFromCmWithMutex ¶
type SwitchInfosFromCmWithMutex struct { sync.Mutex Switches map[string]SwitchFaultInfo }
SwitchInfosFromCmWithMutex holds the switch fault infos from cm together with an embedded mutex
type TaskDevInfo ¶
type TaskDevInfo struct { RankId int DevFaultInfo }
TaskDevInfo is the device info of a task
type TaskResetInfo ¶
type TaskResetInfo struct { RankList []*TaskDevInfo UpdateTime int64 RetryTime int FaultFlushing bool GracefulExit int }
TaskResetInfo record task reset device information
type Tor ¶
type Tor struct { FreeServerCount int IsHealthy int Id int `json:"tor_id"` IP string `json:"tor_ip"` Servers []*Server `json:"server"` Jobs map[api.JobID]SchedulerJob }
Tor tor info include server
func GetHealthyTorUsedByNormalJob ¶
GetHealthyTorUsedByNormalJob get shared tors only used by normal job
func GetNotShareAndFreeTorServer ¶
GetNotShareAndFreeTorServer get the free tors
func GetNotShareTorServer ¶
GetNotShareTorServer gets the exclusive (not shared) tors
func GetSharedTorServer ¶
GetSharedTorServer get healthy shared tors
func GetTorServer ¶
GetTorServer get all healthy tor
func GetUnhealthyTorServer ¶
GetUnhealthyTorServer get unhealthy shared tors
func (*Tor) GetNodeByNodeIP ¶
GetNodeByNodeIP get node by node IP
func (*Tor) GetNodeByNodeName ¶
GetNodeByNodeName get node by node name
func (*Tor) HasAcrossJob ¶
HasAcrossJob whether has across job
func (*Tor) IsUsedByAcrossLargeModelJob ¶
IsUsedByAcrossLargeModelJob whether used by across large model job
type TorList ¶
type TorList struct { Version string `json:"version"` TorCount int `json:"tor_count"` Tors []*Tor `json:"server_list"` // contains filtered or unexported fields }
TorList tor info about nodes
type TorListInfo ¶
type TorListInfo struct { Status string `json:"status"` Version string `json:"version"` ServerCount int `json:"server_count"` TorCount int `json:"tor_count"` ServerList []ServerList `json:"server_list"` }
TorListInfo information for the current plugin
type UnschedulableReason ¶
UnschedulableReason the message of pod pending
type VChip ¶
type VChip struct { PodMap map[string]*v1.Pod ID []string // Name Ascend910-0 Name string // Kind Ascend910/Ascend310/Ascend310P Kind string IsDual bool Unstable bool CoreNum int SegmentFlag bool TotalRes util.VResource UsedRes util.VResource FreeRes util.VResource }
VChip vnpu chip class
func (*VChip) IsChipMeetResReq ¶
IsChipMeetResReq check chip resource can be allocated as the task requires
func (*VChip) IsPodResUnstable ¶
IsPodResUnstable returns true if the pod's chip resource is unstable
func (*VChip) UpdateDVPP ¶
UpdateDVPP update dvpp according to pod resource
type VNode ¶
type VNode struct { // Chips map chipID to VChip class Chips map[int]*VChip // ChipKind Ascend910/310/310p ChipKind string // UnhealthyChipIds the card unhealthy chip ids in this node UnhealthyChipIds map[int]struct{} // ServerType Ascend310p-10-dual cardType-cardCoreNum-duo ServerType string // TotalChipNum num of total chips, get from capacity TotalChipNum int // AiCorePerChip num of aicore on each chip AiCorePerChip int // FreeChipNum num of free chips get from device-info FreeChipNum int // TotalRes total resource on node TotalRes util.VResource // ValidVNode node init success ValidVNode bool // Chip type 910B1/910B2C/910B3/910B4 ChipType string }
VNode vnpu node class
func (VNode) GetNodeTopForWholeCard ¶
GetNodeTopForWholeCard get node top for whole card
func (*VNode) IsResourceWholeCard ¶
IsResourceWholeCard judge if resource is whole card by node total resource
type VolcanoFrame ¶
type VolcanoFrame struct { UID types.UID Confs []config.Configuration KubeClient kubernetes.Interface VJobTemplate map[string]map[string]util.VResource SuperPodSize int ReservePodSize int }
VolcanoFrame passed in by the volcano frame.
func (*VolcanoFrame) CheckUseCIMByConfig ¶
func (vf *VolcanoFrame) CheckUseCIMByConfig() bool
CheckUseCIMByConfig checks by config whether to use the cluster info manager; the default is true
func (*VolcanoFrame) CheckVNPUSegmentEnableByConfig ¶
func (vf *VolcanoFrame) CheckVNPUSegmentEnableByConfig() bool
CheckVNPUSegmentEnableByConfig Check VNPU segmentEnable by init plugin parameters, return true if static