rescheduling

package
v6.0.0+incompatible Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Dec 31, 2024 License: Apache-2.0, Apache-2.0 Imports: 18 Imported by: 0

Documentation

Overview

Package rescheduling is used for Huawei Ascend pin fault rescheduling.

Package rescheduling is used for Huawei Ascend pin fault rescheduling.

Package rescheduling is used for Huawei Ascend pin fault rescheduling.

Package rescheduling is used for Huawei Ascend pin fault rescheduling.

Package rescheduling is used for Huawei Ascend pin fault rescheduling.

Package rescheduling is used for Huawei Ascend pin fault rescheduling.

Package rescheduling is used for Huawei Ascend pin affinity schedule utilities.

Index

Constants

View Source
const (
	// RePropertyName name specifying re-scheduler cm
	RePropertyName = "re-scheduling"
	// ReschedulingReasonKey is used to record the reason of rescheduling
	ReschedulingReasonKey = "rescheduling-reason"
	// CmName Name of ReSchedulerConfigmap
	CmName = "vcjob-fault-npu-cm"
	// CmNameSpace Namespace of ReSchedulerConfigmap
	CmNameSpace = "volcano-system"
	// RescheduleReasonCmName Name of RescheduleReasonConfigmap
	RescheduleReasonCmName = "job-reschedule-reason"
	// RescheduleReasonCmNamespace Namespace of RescheduleReasonConfigmap
	RescheduleReasonCmNamespace = "mindx-dl"

	// JobRescheduleLabelKey key word of re-scheduling configuration
	JobRescheduleLabelKey = "fault-scheduling"
	// JobGraceRescheduleLabelValue Grace delete reschedule job, possible value of re-scheduling configuration
	JobGraceRescheduleLabelValue = "grace"
	// JobForceRescheduleLabelValue Force delete reschedule job, possible value of re-scheduling configuration
	JobForceRescheduleLabelValue = "force"
	// JobOffRescheduleLabelValue do not delete reschedule job, possible value of re-scheduling configuration
	JobOffRescheduleLabelValue = "off"
	// GraceOverTimeKey for GraceOverTime config by user
	GraceOverTimeKey = "grace-over-time"
	// ElasticSchedulingKey for distinguishing whether a job is enabled with elastic scheduling
	ElasticSchedulingKey = "elastic-scheduling"
	// JobOnElasticScheduling job enabled with elastic scheduling
	JobOnElasticScheduling = "on"
	// JobOffElasticScheduling job not enabled with elastic scheduling
	JobOffElasticScheduling = "off"

	// CmFaultNodeKind key in configmap which saves the FaultNode cache
	CmFaultNodeKind = "fault-node"
	// CmFaultJob910bx2Kind key in configmap which saves the 910bx2 FaultJob cache
	CmFaultJob910bx2Kind = "fault-job-910bx2"
	// CmFaultJob910bx2InferKind key in configmap which saves the 910bx2-infer FaultJob cache
	CmFaultJob910bx2InferKind = "fault-job-910bx2-infer"
	// CmFaultJob910bx8Kind key in configmap which saves the 910bx8 FaultJob cache
	CmFaultJob910bx8Kind = "fault-job-910bx8"
	// CmFaultJob910bx16Kind key in configmap which saves the 910bx16 FaultJob cache
	CmFaultJob910bx16Kind = "fault-job-910bx16"
	// CmFaultJob910x8Kind key in configmap which saves the 910x8 FaultJob cache
	CmFaultJob910x8Kind = "fault-job-910x8"
	// CmFaultJob910x4Kind key in configmap which saves the 910x4 FaultJob cache
	CmFaultJob910x4Kind = "fault-job-910x4"
	// CmFaultJob910x2Kind key in configmap which saves the 910x2 FaultJob cache
	CmFaultJob910x2Kind = "fault-job-910x2"
	// CmFaultJob310x4Kind key in configmap which saves the 310x4 FaultJob cache
	CmFaultJob310x4Kind = "fault-job-310x4"
	// CmFaultJob310PKind key in configmap which saves the 310P FaultJob cache
	CmFaultJob310PKind = "fault-job-310P"
	// CmJobRemainRetryTimes key in configmap which saves remain retry times of job
	CmJobRemainRetryTimes = "remain-retry-times"
	// MaxRescheduleRecordsNum the maximum number of reschedule records kept in the cm; the oldest record
	// is deleted once there are more than MaxRescheduleRecordsNum records
	MaxRescheduleRecordsNum = 10
	// MaxKbOfRescheduleRecords the upper limit on the size of the reschedule records kept in the cm.
	// NOTE(review): the value is 950 * 1024, so despite the "Kb" in the name it is presumably a byte
	// count (950 KiB) — TODO confirm the unit against the code that compares against it.
	MaxKbOfRescheduleRecords = 950 * 1024
	// ReduceRetryTimeLimit is the time limit of reduce loop
	ReduceRetryTimeLimit = 20
	// CmJobRescheduleReasonsKey keeping recent MaxRescheduleRecordsNum records of rescheduling
	CmJobRescheduleReasonsKey = "recent-reschedule-records"
	// CmNodeRankTimeMapKind record map jobUID rankIndex node and times of occurrence
	CmNodeRankTimeMapKind = "node-rankIndex-Occurrence"
	// CmCheckCode Check code key
	CmCheckCode = "checkCode"

	// CmFaultJob key in configmap which saves the FaultJob cache
	CmFaultJob = "fault-job"

	// DefaultGraceOverTime time interval for grace delete
	DefaultGraceOverTime = 900

	// CardHealthy represents a healthy card
	CardHealthy = "Healthy"
	// CardUnhealthy represents an unhealthy card
	CardUnhealthy = "Unhealthy"
	// CardNetworkUnhealthy represents a network unhealthy card
	CardNetworkUnhealthy = "NetworkUnhealthy"
	// NodeHealthy represents node is available for scheduling
	NodeHealthy = "Healthy"
	// NodeUnhealthy represents node is unhealthy
	NodeUnhealthy = "NodeUnhealthy"
	// NodeCardUnhealthy represents node is unhealthy because the card is unhealthy
	NodeCardUnhealthy = "CardUnhealthy"
	// NodeCardNetworkUnhealthy represents node is unhealthy because the card is network unhealthy
	NodeCardNetworkUnhealthy = "CardNetworkUnhealthy"
	// NoFaultJobsErr none fault jobs
	NoFaultJobsErr = "none fault jobs to be restarted in cache"
	// JobFaultRankIDCMPre the job cm name prefix, for retraining
	JobFaultRankIDCMPre = "fault-config-"
	// JobFaultRankIDCMDataKey the job cm value key.
	JobFaultRankIDCMDataKey = "fault-npus"
	// JobRecovery Name of cm for recovery
	JobRecovery = "job-recovery"
	// DeviceFaultCmKey the key of DeviceFault info
	DeviceFaultCmKey = "huawei.com/Ascend910-Fault"
	// PodFailed the state of failed pod
	PodFailed = "pod-failed"
	// PodHealthy the state of healthy pod
	PodHealthy = "pod-healthy"

	// FaultRetryTimesKey key of fault-retry-times label
	FaultRetryTimesKey = "fault-retry-times"
)
View Source
const (
	// PreSeparateNPU fault type waiting for the user to check
	PreSeparateNPU = "PreSeparateNPU"
	// NotHandleFault fault type that is not handled
	NotHandleFault = "NotHandleFault"
	// NodeFaultCode fault type nodeUnhealthy
	NodeFaultCode = "heartbeatTimeOut"
	// AcJobTag the tag of AcJob
	AcJobTag = "group-name"
	// AcJobVersion the api version of AcJob
	AcJobVersion = "mindxdl.gitee.com"
	// SubHealthFault subHealth code
	SubHealthFault = "SubHealthFault"
)
View Source
const (

	// SuperPodAnnoKey is the annotation key of a super pod
	SuperPodAnnoKey = "sp-block"
)

Variables

This section is empty.

Functions

func GetTaskRestartReason

func GetTaskRestartReason(reasonList []FaultReasonList) string

GetTaskRestartReason converts the given fault reason list to a JSON string

Types

type AllocNodeRankOccurrence

// AllocNodeRankOccurrence records a node name together with a rank index and a fault flag.
type AllocNodeRankOccurrence struct {
	NodeName  string
	RankIndex string
	// NOTE(review): presumably marks whether the rank index was re-allocated to a new node — TODO confirm.
	IsFault   bool
}

AllocNodeRankOccurrence records a node's rankIndex and whether that index was re-allocated to a new node

type DealReSchedulerCache

// DealReSchedulerCache is the re-scheduler cache object. It embeds *DealReSchedulerConfigmap,
// so the configmap's fields are promoted onto the cache.
type DealReSchedulerCache struct {
	*DealReSchedulerConfigmap
	FaultNodes                 []FaultNode
	// FaultNodeMaps is tagged `json:"-"`, so encoding/json skips it when marshalling.
	FaultNodeMaps              map[string]SimpleFNodeInfo `json:"-"`
	FaultJobs                  []FaultJob
	// RealFaultJobs is tagged `json:"-"`, so encoding/json skips it when marshalling.
	RealFaultJobs              []FaultJob `json:"-"`
	// The three maps below are keyed by job ID.
	AllocNodeRankOccurrenceMap map[api.JobID][]*AllocNodeRankOccurrence
	JobRemainRetryTimes        map[api.JobID]*RemainRetryTimes
	JobRecentRescheduleRecords map[api.JobID]*RescheduleReason
}

DealReSchedulerCache object with method for re-scheduler cache

func (DealReSchedulerCache) GetRealFaultNodes

func (reCache DealReSchedulerCache) GetRealFaultNodes() []FaultNode

GetRealFaultNodes returns the nodes whose IsFaultNode property is true

func (*DealReSchedulerCache) SetFaultJobsFromCM

func (reCache *DealReSchedulerCache) SetFaultJobsFromCM(jobType string) error

SetFaultJobsFromCM unmarshal FaultJobs from string into struct and set the value

func (*DealReSchedulerCache) SetFaultNodesFromCM

func (reCache *DealReSchedulerCache) SetFaultNodesFromCM() error

SetFaultNodesFromCM unmarshal FaultNodes from string into struct and set the value

func (*DealReSchedulerCache) SetJobRecentRescheduleRecords

func (reCache *DealReSchedulerCache) SetJobRecentRescheduleRecords(firstStartup *bool,
	client kubernetes.Interface) error

SetJobRecentRescheduleRecords get already recorded rescheduling records from cm, and cache it

func (*DealReSchedulerCache) SetNodeRankOccurrenceMapFromCM

func (reCache *DealReSchedulerCache) SetNodeRankOccurrenceMapFromCM() error

SetNodeRankOccurrenceMapFromCM unmarshal NodeRankOccurrenceMap from string into struct and set the value

func (*DealReSchedulerCache) SetRetryTimesFromCM

func (reCache *DealReSchedulerCache) SetRetryTimesFromCM() error

SetRetryTimesFromCM unmarshal RetryTimes from string into struct and set the value

func (*DealReSchedulerCache) WriteReSchedulerCacheToEnvCache

func (reCache *DealReSchedulerCache) WriteReSchedulerCacheToEnvCache(env *plugin.ScheduleEnv, jobType string) error

WriteReSchedulerCacheToEnvCache write the modifications on cache data to env to update re-scheduling configmap

type DealReSchedulerConfigmap

// DealReSchedulerConfigmap is the re-scheduler configmap object: a name, a namespace,
// and a string-to-string data map.
type DealReSchedulerConfigmap struct {
	CMName      string
	CMNameSpace string
	CMData      map[string]string
}

DealReSchedulerConfigmap object with method for re-scheduler configmap

type FaultCard

// FaultCard is the card object used for re-scheduling.
type FaultCard struct {
	IsFaultCard bool
	NPUName     string
	NodeName    string
	// NOTE(review): presumably one of the Card* health constants — TODO confirm.
	FaultType   string
}

FaultCard card object for re-scheduling

type FaultDeviceList

// FaultDeviceList is the fault reason of a card.
// Every field carries an explicit snake_case JSON key via its struct tag.
type FaultDeviceList struct {
	FaultType            string `json:"fault_type"`
	NPUName              string `json:"npu_name"`
	FaultLevel           string `json:"fault_level"`
	FaultHandling        string `json:"fault_handling"`
	LargeModelFaultLevel string `json:"large_model_fault_level"`
	FaultCode            string `json:"fault_code"`
}

FaultDeviceList is the fault reason of card

func GetNodeDeviceFaultFromDeviceInfo

func GetNodeDeviceFaultFromDeviceInfo(node *plugin.NPUNode) ([]FaultDeviceList, error)

GetNodeDeviceFaultFromDeviceInfo get device fault from device info

type FaultJob

// FaultJob is the job object used for re-scheduling.
type FaultJob struct {
	ReScheduleKey       string // one of off/grace/force
	SubHealthyStrategy  string
	IsSubHealthFault    bool
	PendingSessionNum   int
	IsFaultJob          bool
	IsInSession         bool
	JobName             string
	JobUID              api.JobID
	JobNamespace        string
	JobRankIds          []string // useCardIndex + 8*NodeRankIndex
	NodeNames           []string
	SuperPods           map[string][]plugin.SuperNode
	NodeNameMaps        map[string]struct{}
	FaultTasks          []FaultTask
	UpdateTime          int64
	JobRankIdCreateTime int64 // stops updating once the job becomes a real fault one
	FaultTypes          []string
	DeleteExecutedFlag  bool
	// NOTE(review): presumably "on" or "off" (JobOnElasticScheduling / JobOffElasticScheduling) — TODO confirm.
	ElasticScheduling   string
	ReferenceName       string
	FaultRetryTimes     int

	UUID types.UID
	// contains filtered or unexported fields
}

FaultJob job object for re-scheduling

func (*FaultJob) CheckJobExistsInKubernetes

func (fJob *FaultJob) CheckJobExistsInKubernetes(ssn *framework.Session) bool

CheckJobExistsInKubernetes check whether job recorded in cache can be traced in kubernetes

func (*FaultJob) ForceDeleteJob

func (fJob *FaultJob) ForceDeleteJob(ssn *framework.Session, schedulerJob *plugin.SchedulerJob,
	env plugin.ScheduleEnv) error

ForceDeleteJob force delete jobs includes labelled force delete ones and grace delete failed ones

func (*FaultJob) GetJobElasticSchedulingLabel

func (fJob *FaultJob) GetJobElasticSchedulingLabel(job *plugin.SchedulerJob) string

GetJobElasticSchedulingLabel get job's elastic scheduling label

func (*FaultJob) GetJobFaultNPUTaskNum

func (fJob *FaultJob) GetJobFaultNPUTaskNum() int

GetJobFaultNPUTaskNum gets the job's fault NPU task num

func (*FaultJob) GetJobFaultRescheduleLabel

func (fJob *FaultJob) GetJobFaultRescheduleLabel(job *plugin.SchedulerJob) string

GetJobFaultRescheduleLabel Get job's fault reschedule label.

func (*FaultJob) GraceDeleteJob

func (fJob *FaultJob) GraceDeleteJob(ssn *framework.Session, npuJob *plugin.SchedulerJob,
	env plugin.ScheduleEnv) error

GraceDeleteJob grace delete jobs labelled to be deleted gracefully

func (*FaultJob) IsJobSingleRescheduling

func (fJob *FaultJob) IsJobSingleRescheduling(sJob *plugin.SchedulerJob) bool

IsJobSingleRescheduling valid job.

func (*FaultJob) IsNormalJobNeedRestart

func (fJob *FaultJob) IsNormalJobNeedRestart() bool

IsNormalJobNeedRestart reports whether the job has the key of PreSeparateNPU or has a software fault

func (*FaultJob) IsProcessReschedulingJob

func (fJob *FaultJob) IsProcessReschedulingJob(sJob *plugin.SchedulerJob) bool

IsProcessReschedulingJob valid job.

type FaultNode

// FaultNode is the node object used for re-scheduling.
type FaultNode struct {
	SuperPodID              int32
	NodeName                string
	NPUName                 string
	FaultDeviceList         []FaultDeviceList
	UpdateTime              int64
	UnhealthyNPU            []string
	NetworkUnhealthyNPU     []string
	IsFaultNode             bool
	NodeDEnable             bool
	// NOTE(review): presumably one of the Node* health constants — TODO confirm.
	NodeHealthState         string
	AllCards                []string
	FaultCards              []FaultCard
	HasSwitchSubHealthFault bool
	HasCardSubHealthFault   bool
}

FaultNode node object for re-scheduling

type FaultNodeInfoToCm

// FaultNodeInfoToCm is the fault node info written to the cm.
// Its fields are a subset of the fields declared on FaultNode.
type FaultNodeInfoToCm struct {
	FaultDeviceList     []FaultDeviceList
	NodeName            string
	UnhealthyNPU        []string
	NetworkUnhealthyNPU []string
	NodeDEnable         bool
	NodeHealthState     string
	UpdateTime          int64
}

FaultNodeInfoToCm fault node info to cm

type FaultRankIdsJobCMData

// FaultRankIdsJobCMData is used by RestoreManager for every job.
type FaultRankIdsJobCMData struct {
	FaultRankIds []string
	// NOTE(review): "CreatTime" looks like a misspelling of "CreateTime"; the exported name
	// cannot be changed here without breaking callers and any serialized data.
	CreatTime    int64
}

FaultRankIdsJobCMData used by RestoreManager for every job.

type FaultReasonList

// FaultReasonList is the node fault device list: a node name, a task name and a list of
// fault ranks, plus the embedded FaultDeviceList fields.
type FaultReasonList struct {
	NodeName      string `json:"node_name"`
	TaskName      string `json:"task_name"`
	// FaultRankList has no JSON tag, so encoding/json uses the Go field name as its key,
	// unlike the snake_case keys of the neighbouring fields.
	FaultRankList []string
	// FaultDeviceList is embedded without a tag, so its fields are promoted.
	FaultDeviceList
}

FaultReasonList node Fault Device List

type FaultTask

// FaultTask is the task object dealing with a node for rescheduling.
type FaultTask struct {
	Reason             []FaultReasonList
	IsFaultTask        bool
	IsFaultRetryEnable bool
	HasSubHealthFault  bool
	IsSoftwareFault    bool
	TaskUID            api.TaskID
	TaskName           string
	TaskNamespace      string
	NodeName           string
	JobName            string
	NodeRankIndex      string
	UseCardName        []string
	PodCreateTime      int64
	PodUID             types.UID
	// contains filtered or unexported fields
}

FaultTask object dealing with node for rescheduling

func (*FaultTask) DeleteRealPodByTask

func (fTask *FaultTask) DeleteRealPodByTask(ssn *framework.Session, waitTime int64) error

DeleteRealPodByTask delete pod from kubernetes of tasks

type ReScheduler

// ReScheduler is the object for re-scheduling. It embeds *DealReSchedulerCache,
// so the cache's fields and methods are promoted onto the re-scheduler.
type ReScheduler struct {
	*DealReSchedulerCache
	GraceDeleteTime int64
	Level           string
	// Jobs is keyed by job ID.
	Jobs            map[api.JobID]plugin.SchedulerJob
	// NOTE(review): presumably keyed by node name — TODO confirm.
	Nodes           map[string]plugin.NPUNode
	IsFirstSession  *bool
	// contains filtered or unexported fields
}

ReScheduler object for re-scheduling

func New

func New(env *plugin.ScheduleEnv, jobType string) *ReScheduler

New Initialisation of ReScheduler

func (*ReScheduler) AddFaultJobWithSession

func (reScheduler *ReScheduler) AddFaultJobWithSession(
	jobs map[api.JobID]*api.JobInfo, env plugin.ScheduleEnv) error

AddFaultJobWithSession read all running jobs of given card types and create the corresponding FaultJob objects

func (*ReScheduler) AddFaultNodeWithSession

func (reScheduler *ReScheduler) AddFaultNodeWithSession()

AddFaultNodeWithSession Add FaultNode objects for new nodes in session not in cache

func (*ReScheduler) CheckNodeNPUByTask

func (reScheduler *ReScheduler) CheckNodeNPUByTask(task *api.TaskInfo, vcNode plugin.NPUNode, npuName string) error

CheckNodeNPUByTask used in the predicate process of task and node

func (*ReScheduler) GenerateNodeRankIndexTaskMap

func (reScheduler *ReScheduler) GenerateNodeRankIndexTaskMap()

GenerateNodeRankIndexTaskMap get the nodeName, rankIndex, and Occurrence of nodes in a job

func (ReScheduler) GetFaultJobOfGivenTaskInfoFromCache

func (reScheduler ReScheduler) GetFaultJobOfGivenTaskInfoFromCache(task *api.TaskInfo) *FaultJob

GetFaultJobOfGivenTaskInfoFromCache get fault job from task info

func (*ReScheduler) GetGraceDeleteTime

func (reScheduler *ReScheduler) GetGraceDeleteTime(Conf []config.Configuration) (int64, error)

GetGraceDeleteTime Get the graceful delete time from configuration

func (*ReScheduler) GetNeedForceDeleteDelayingNPUJobs

func (reScheduler *ReScheduler) GetNeedForceDeleteDelayingNPUJobs(
	schedulerJobs map[api.JobID]plugin.SchedulerJob, ssn *framework.Session) ([]plugin.SchedulerJob, error)

GetNeedForceDeleteDelayingNPUJobs gets fault jobs that have the grace label but haven't been evicted successfully

func (*ReScheduler) GetRunningJobs

func (reScheduler *ReScheduler) GetRunningJobs(
	ssn *framework.Session) (map[api.JobID]*api.JobInfo, error)

GetRunningJobs get all the running jobs of <UseCardName> type

func (*ReScheduler) InitFaultNodeMap

func (reScheduler *ReScheduler) InitFaultNodeMap()

InitFaultNodeMap init the node map of fault node

func (*ReScheduler) New910ReScheduler

func (reScheduler *ReScheduler) New910ReScheduler()

New910ReScheduler initialise ReScheduler.FaultJobs for 910x8

func (*ReScheduler) NewCommonReScheduler

func (reScheduler *ReScheduler) NewCommonReScheduler(jobType string)

NewCommonReScheduler initialise ReScheduler.FaultJobs for non 910x8

func (*ReScheduler) RestartFaultJobs

func (reScheduler *ReScheduler) RestartFaultJobs(ssn *framework.Session, env plugin.ScheduleEnv) error

RestartFaultJobs Restart fault jobs by its corresponding strategy grace,force,off

func (*ReScheduler) RestartNeedForceDeleteJobs

func (reScheduler *ReScheduler) RestartNeedForceDeleteJobs(ssn *framework.Session, env plugin.ScheduleEnv) error

RestartNeedForceDeleteJobs Restart jobs that need to be force deleted

func (*ReScheduler) ScoreBestNPUNodes

func (reScheduler *ReScheduler) ScoreBestNPUNodes(task *api.TaskInfo, scoreMap map[string]float64) error

ScoreBestNPUNodes add scores on scoreMap for normal nodes used by re-scheduling tasks

func (*ReScheduler) SynCacheFaultJobWithSession

func (reScheduler *ReScheduler) SynCacheFaultJobWithSession(ssn *framework.Session)

SynCacheFaultJobWithSession Synchronise FaultJobs in cache by updating the information using current session

func (*ReScheduler) SynCacheFaultNodeWithSession

func (reScheduler *ReScheduler) SynCacheFaultNodeWithSession()

SynCacheFaultNodeWithSession Synchronise FaultNodes in cache by updating the information using current session

func (*ReScheduler) SynCacheNodeRankOccMapWithSession

func (reScheduler *ReScheduler) SynCacheNodeRankOccMapWithSession(ssn *framework.Session)

SynCacheNodeRankOccMapWithSession Synchronise the node rank occurrence map in cache by updating the information using current session

func (*ReScheduler) SyncJobRecentRescheduleReason

func (reScheduler *ReScheduler) SyncJobRecentRescheduleReason(ssn *framework.Session)

SyncJobRecentRescheduleReason sync recent reschedule records with ssn, to ensure cache is new and sync

func (*ReScheduler) SyncJobRemainRetryTimes

func (reScheduler *ReScheduler) SyncJobRemainRetryTimes(ssn *framework.Session)

SyncJobRemainRetryTimes Synchronise job remain retry times in cache by updating the information using current session

func (*ReScheduler) ValidJobByReschedule

func (reScheduler *ReScheduler) ValidJobByReschedule(curSchedulerJob util.SchedulerJobAttr) *api.ValidateResult

ValidJobByReschedule valid job by reschedule

type RemainRetryTimes

// RemainRetryTimes holds the remaining retry times together with a UID.
type RemainRetryTimes struct {
	// NOTE(review): presumably the UID of the job these retries belong to — TODO confirm.
	UUID  types.UID
	Times int
}

RemainRetryTimes remained retry times

type RescheduleReason

// RescheduleReason shows the reason for a job's rescheduling.
type RescheduleReason struct {
	// JobID the job id of this record
	JobID api.JobID
	// TotalRescheduleTimes shows how many times rescheduling has happened since the job was created
	TotalRescheduleTimes int
	// RescheduleRecords keeps the most recent MaxRescheduleRecordsNum rescheduling records
	RescheduleRecords []RescheduleRecord
	// AdditionalInfo provides additional information, e.g. that some records were dropped to reduce
	// length; the omitempty tag leaves it out of the JSON output when it is empty
	AdditionalInfo string `json:",omitempty"`
}

RescheduleReason shows the reason of this job rescheduling

type RescheduleRecord

// RescheduleRecord records one rescheduling of a job.
type RescheduleRecord struct {
	// LogFileFormatTime is the formatted time, to make it convenient to read and locate logs
	LogFileFormatTime string
	// RescheduleTimeStamp time.now.unix() indicates when the rescheduling happened
	RescheduleTimeStamp int64
	// ReasonOfTask records the per-task reasons of this rescheduling
	ReasonOfTask []RescheduleTaskReason
}

RescheduleRecord records a job's rescheduling events

type RescheduleTaskReason

// RescheduleTaskReason records why a task was rescheduled.
type RescheduleTaskReason struct {
	// RescheduleReason the fault type of this rescheduling
	RescheduleReason string
	// PodName the fault task that caused this rescheduling
	PodName string
	// NodeName the fault node that caused this rescheduling
	NodeName string
	// NodeRankIndex the rank index of the fault task
	NodeRankIndex string
}

RescheduleTaskReason record the reason of this rescheduling of task

type SimpleFNodeInfo

// SimpleFNodeInfo is simple fault node info.
// Its fields are a subset of the fields declared on FaultNode.
type SimpleFNodeInfo struct {
	NodeName                string
	IsFaultNode             bool
	HasCardSubHealthFault   bool
	HasSwitchSubHealthFault bool
	NodeHealthState         string
}

SimpleFNodeInfo simple fault node info

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL