Documentation
¶
Overview ¶
Package rescheduling is used for Huawei Ascend pin fault rescheduling.
Package rescheduling is used for Huawei Ascend pin fault rescheduling.
Package rescheduling is used for Huawei Ascend pin fault rescheduling.
Package rescheduling is used for Huawei Ascend pin fault rescheduling.
Package rescheduling is used for Huawei Ascend pin fault rescheduling.
Package rescheduling is used for Huawei Ascend pin fault rescheduling.
Package rescheduling is used for Huawei Ascend pin affinity schedule utilities.
Index ¶
- Constants
- func GetTaskRestartReason(reasonList []FaultReasonList) string
- type AllocNodeRankOccurrence
- type DealReSchedulerCache
- func (reCache DealReSchedulerCache) GetRealFaultNodes() []FaultNode
- func (reCache *DealReSchedulerCache) SetFaultJobsFromCM(jobType string) error
- func (reCache *DealReSchedulerCache) SetFaultNodesFromCM() error
- func (reCache *DealReSchedulerCache) SetJobRecentRescheduleRecords(firstStartup *bool, client kubernetes.Interface) error
- func (reCache *DealReSchedulerCache) SetNodeRankOccurrenceMapFromCM() error
- func (reCache *DealReSchedulerCache) SetRetryTimesFromCM() error
- func (reCache *DealReSchedulerCache) WriteReSchedulerCacheToEnvCache(env *plugin.ScheduleEnv, jobType string) error
- type DealReSchedulerConfigmap
- type FaultCard
- type FaultDeviceList
- type FaultJob
- func (fJob *FaultJob) CheckJobExistsInKubernetes(ssn *framework.Session) bool
- func (fJob *FaultJob) ForceDeleteJob(ssn *framework.Session, schedulerJob *plugin.SchedulerJob, ...) error
- func (fJob *FaultJob) GetJobElasticSchedulingLabel(job *plugin.SchedulerJob) string
- func (fJob *FaultJob) GetJobFaultNPUTaskNum() int
- func (fJob *FaultJob) GetJobFaultRescheduleLabel(job *plugin.SchedulerJob) string
- func (fJob *FaultJob) GraceDeleteJob(ssn *framework.Session, npuJob *plugin.SchedulerJob, env plugin.ScheduleEnv) error
- func (fJob *FaultJob) IsJobSingleRescheduling(sJob *plugin.SchedulerJob) bool
- func (fJob *FaultJob) IsNormalJobNeedRestart() bool
- func (fJob *FaultJob) IsProcessReschedulingJob(sJob *plugin.SchedulerJob) bool
- type FaultNode
- type FaultNodeInfoToCm
- type FaultRankIdsJobCMData
- type FaultReasonList
- type FaultTask
- type ReScheduler
- func (reScheduler *ReScheduler) AddFaultJobWithSession(jobs map[api.JobID]*api.JobInfo, env plugin.ScheduleEnv) error
- func (reScheduler *ReScheduler) AddFaultNodeWithSession()
- func (reScheduler *ReScheduler) CheckNodeNPUByTask(task *api.TaskInfo, vcNode plugin.NPUNode, npuName string) error
- func (reScheduler *ReScheduler) GenerateNodeRankIndexTaskMap()
- func (reScheduler ReScheduler) GetFaultJobOfGivenTaskInfoFromCache(task *api.TaskInfo) *FaultJob
- func (reScheduler *ReScheduler) GetGraceDeleteTime(Conf []config.Configuration) (int64, error)
- func (reScheduler *ReScheduler) GetNeedForceDeleteDelayingNPUJobs(schedulerJobs map[api.JobID]plugin.SchedulerJob, ssn *framework.Session) ([]plugin.SchedulerJob, error)
- func (reScheduler *ReScheduler) GetRunningJobs(ssn *framework.Session) (map[api.JobID]*api.JobInfo, error)
- func (reScheduler *ReScheduler) InitFaultNodeMap()
- func (reScheduler *ReScheduler) New910ReScheduler()
- func (reScheduler *ReScheduler) NewCommonReScheduler(jobType string)
- func (reScheduler *ReScheduler) RestartFaultJobs(ssn *framework.Session, env plugin.ScheduleEnv) error
- func (reScheduler *ReScheduler) RestartNeedForceDeleteJobs(ssn *framework.Session, env plugin.ScheduleEnv) error
- func (reScheduler *ReScheduler) ScoreBestNPUNodes(task *api.TaskInfo, scoreMap map[string]float64) error
- func (reScheduler *ReScheduler) SynCacheFaultJobWithSession(ssn *framework.Session)
- func (reScheduler *ReScheduler) SynCacheFaultNodeWithSession()
- func (reScheduler *ReScheduler) SynCacheNodeRankOccMapWithSession(ssn *framework.Session)
- func (reScheduler *ReScheduler) SyncJobRecentRescheduleReason(ssn *framework.Session)
- func (reScheduler *ReScheduler) SyncJobRemainRetryTimes(ssn *framework.Session)
- func (reScheduler *ReScheduler) ValidJobByReschedule(curSchedulerJob util.SchedulerJobAttr) *api.ValidateResult
- type RemainRetryTimes
- type RescheduleReason
- type RescheduleRecord
- type RescheduleTaskReason
- type SimpleFNodeInfo
Constants ¶
const ( // RePropertyName name specifying re-scheduler cm RePropertyName = "re-scheduling" // ReschedulingReasonKey is used to record the reason of rescheduling ReschedulingReasonKey = "rescheduling-reason" // CmName Name of ReSchedulerConfigmap CmName = "vcjob-fault-npu-cm" // CmNameSpace Namespace of ReSchedulerConfigmap CmNameSpace = "volcano-system" // RescheduleReasonCmName Name of RescheduleReasonConfigmap RescheduleReasonCmName = "job-reschedule-reason" // RescheduleReasonCmNamespace Namespace of RescheduleReasonConfigmap RescheduleReasonCmNamespace = "mindx-dl" // JobRescheduleLabelKey key word of re-scheduling configuration JobRescheduleLabelKey = "fault-scheduling" // JobGraceRescheduleLabelValue Grace delete reschedule job, possible value of re-scheduling configuration JobGraceRescheduleLabelValue = "grace" // JobForceRescheduleLabelValue Force delete reschedule job, possible value of re-scheduling configuration JobForceRescheduleLabelValue = "force" // JobOffRescheduleLabelValue not delete reschedule job, possible value of re-scheduling configuration JobOffRescheduleLabelValue = "off" // GraceOverTimeKey for GraceOverTime config by user GraceOverTimeKey = "grace-over-time" // ElasticSchedulingKey for distinguishing whether a job is enabled with elastic scheduling ElasticSchedulingKey = "elastic-scheduling" // JobOnElasticScheduling job enabled with elastic scheduling JobOnElasticScheduling = "on" // JobOffElasticScheduling job not enabled with elastic scheduling JobOffElasticScheduling = "off" // CmFaultNodeKind key in configmap which saves the FaultNode cache CmFaultNodeKind = "fault-node" // CmFaultJob910bx2Kind key in configmap which saves the 910bx2 FaultJob cache CmFaultJob910bx2Kind = "fault-job-910bx2" // CmFaultJob910bx2InferKind key in configmap which saves the 910bx2-infer FaultJob cache CmFaultJob910bx2InferKind = "fault-job-910bx2-infer" // CmFaultJob910bx8Kind key in configmap which saves the 910bx8 FaultJob cache CmFaultJob910bx8Kind = 
"fault-job-910bx8" // CmFaultJob910bx16Kind key in configmap which saves the 910bx16 FaultJob cache CmFaultJob910bx16Kind = "fault-job-910bx16" // CmFaultJob910x8Kind key in configmap which saves the 910x8 FaultJob cache CmFaultJob910x8Kind = "fault-job-910x8" // CmFaultJob910x4Kind key in configmap which saves the 910x8 FaultJob cache CmFaultJob910x4Kind = "fault-job-910x4" // CmFaultJob910x2Kind key in configmap which saves the 910x8 FaultJob cache CmFaultJob910x2Kind = "fault-job-910x2" // CmFaultJob310x4Kind key in configmap which saves the 310x4 FaultJob cache CmFaultJob310x4Kind = "fault-job-310x4" // CmFaultJob310PKind key in configmap which saves the 310P FaultJob cache CmFaultJob310PKind = "fault-job-310P" // CmJobRemainRetryTimes key in configmap which saves remain retry times of job CmJobRemainRetryTimes = "remain-retry-times" // MaxRescheduleRecordsNum the upper limit of the cm kept reschedule records, oldest record will be deleted // if record more than MaxRescheduleRecordsNum records MaxRescheduleRecordsNum = 10 // MaxKbOfRescheduleRecords the upper limit words of the cm kept reschedule records MaxKbOfRescheduleRecords = 950 * 1024 // ReduceRetryTimeLimit is the time limit of reduce loop ReduceRetryTimeLimit = 20 // CmJobRescheduleReasonsKey keeping recent MaxRescheduleRecordsNum records of rescheduling CmJobRescheduleReasonsKey = "recent-reschedule-records" // CmNodeRankTimeMapKind record map jobUID rankIndex node and times of occurrence CmNodeRankTimeMapKind = "node-rankIndex-Occurrence" // CmCheckCode Check code key CmCheckCode = "checkCode" // CmFaultJob key in configmap which saves the FaultJob cache CmFaultJob = "fault-job" // DefaultGraceOverTime time interval for grace delete DefaultGraceOverTime = 900 // CardHealthy represents a healthy card CardHealthy = "Healthy" // CardUnhealthy represents an unhealthy card CardUnhealthy = "Unhealthy" // CardNetworkUnhealthy represents a network unhealthy card CardNetworkUnhealthy = "NetworkUnhealthy" // 
NodeHealthy represents node is available for scheduling NodeHealthy = "Healthy" // NodeUnhealthy represents node is unhealthy NodeUnhealthy = "NodeUnhealthy" // NodeCardUnhealthy represents node is unhealthy because of the card is unhealthy NodeCardUnhealthy = "CardUnhealthy" // NodeCardNetworkUnhealthy represents node is unhealthy because of card is network unhealthy NodeCardNetworkUnhealthy = "CardNetworkUnhealthy" // NoFaultJobsErr none fault jobs NoFaultJobsErr = "none fault jobs to be restarted in cache" // JobFaultRankIDCMPre the job cm name prefix, for retraining JobFaultRankIDCMPre = "fault-config-" // JobFaultRankIDCMDataKey the job cm value key. JobFaultRankIDCMDataKey = "fault-npus" // JobRecovery Name of cm for recovery JobRecovery = "job-recovery" // DeviceFaultCmKey the key of DeviceFault info DeviceFaultCmKey = "huawei.com/Ascend910-Fault" // PodFailed the state of failed pod PodFailed = "pod-failed" // PodHealthy the state of healthy pod PodHealthy = "pod-healthy" // FaultRetryTimesKey key of fault-retry-times label FaultRetryTimesKey = "fault-retry-times" )
const ( // PreSeparateNPU fault type waiting user check PreSeparateNPU = "PreSeparateNPU" // NotHandleFault fault type not handle NotHandleFault = "NotHandleFault" // NodeFaultCode fault type nodeUnhealthy NodeFaultCode = "heartbeatTimeOut" // AcJobTag the tag of AcJob AcJobTag = "group-name" // AcJobVersion the api version of AcJob AcJobVersion = "mindxdl.gitee.com" // SubHealthFault subHealth code SubHealthFault = "SubHealthFault" )
const (
// SuperPodAnnoKey annotation key of super pod
SuperPodAnnoKey = "sp-block"
)
Variables ¶
This section is empty.
Functions ¶
func GetTaskRestartReason ¶
func GetTaskRestartReason(reasonList []FaultReasonList) string
GetTaskRestartReason converts the given fault reason list to a JSON string.
Types ¶
type AllocNodeRankOccurrence ¶
AllocNodeRankOccurrence object recording node rankIndex and whether index re-allocated to new node
type DealReSchedulerCache ¶
type DealReSchedulerCache struct { *DealReSchedulerConfigmap FaultNodes []FaultNode FaultNodeMaps map[string]SimpleFNodeInfo `json:"-"` FaultJobs []FaultJob RealFaultJobs []FaultJob `json:"-"` AllocNodeRankOccurrenceMap map[api.JobID][]*AllocNodeRankOccurrence JobRemainRetryTimes map[api.JobID]*RemainRetryTimes JobRecentRescheduleRecords map[api.JobID]*RescheduleReason }
DealReSchedulerCache object with method for re-scheduler cache
func (DealReSchedulerCache) GetRealFaultNodes ¶
func (reCache DealReSchedulerCache) GetRealFaultNodes() []FaultNode
GetRealFaultNodes gets the nodes whose IsFaultNode property is true.
func (*DealReSchedulerCache) SetFaultJobsFromCM ¶
func (reCache *DealReSchedulerCache) SetFaultJobsFromCM(jobType string) error
SetFaultJobsFromCM unmarshal FaultJobs from string into struct and set the value
func (*DealReSchedulerCache) SetFaultNodesFromCM ¶
func (reCache *DealReSchedulerCache) SetFaultNodesFromCM() error
SetFaultNodesFromCM unmarshal FaultNodes from string into struct and set the value
func (*DealReSchedulerCache) SetJobRecentRescheduleRecords ¶
func (reCache *DealReSchedulerCache) SetJobRecentRescheduleRecords(firstStartup *bool, client kubernetes.Interface) error
SetJobRecentRescheduleRecords get already recorded rescheduling records from cm, and cache it
func (*DealReSchedulerCache) SetNodeRankOccurrenceMapFromCM ¶
func (reCache *DealReSchedulerCache) SetNodeRankOccurrenceMapFromCM() error
SetNodeRankOccurrenceMapFromCM unmarshal NodeRankOccurrenceMap from string into struct and set the value
func (*DealReSchedulerCache) SetRetryTimesFromCM ¶
func (reCache *DealReSchedulerCache) SetRetryTimesFromCM() error
SetRetryTimesFromCM unmarshal RetryTimes from string into struct and set the value
func (*DealReSchedulerCache) WriteReSchedulerCacheToEnvCache ¶
func (reCache *DealReSchedulerCache) WriteReSchedulerCacheToEnvCache(env *plugin.ScheduleEnv, jobType string) error
WriteReSchedulerCacheToEnvCache write the modifications on cache data to env to update re-scheduling configmap
type DealReSchedulerConfigmap ¶
DealReSchedulerConfigmap object with method for re-scheduler configmap
type FaultDeviceList ¶
type FaultDeviceList struct { FaultType string `json:"fault_type"` NPUName string `json:"npu_name"` FaultLevel string `json:"fault_level"` FaultHandling string `json:"fault_handling"` LargeModelFaultLevel string `json:"large_model_fault_level"` FaultCode string `json:"fault_code"` }
FaultDeviceList is the fault reason of card
func GetNodeDeviceFaultFromDeviceInfo ¶
func GetNodeDeviceFaultFromDeviceInfo(node *plugin.NPUNode) ([]FaultDeviceList, error)
GetNodeDeviceFaultFromDeviceInfo get device fault from device info
type FaultJob ¶
type FaultJob struct { ReScheduleKey string // values taken off/grace/force SubHealthyStrategy string IsSubHealthFault bool PendingSessionNum int IsFaultJob bool IsInSession bool JobName string JobUID api.JobID JobNamespace string JobRankIds []string // useCardIndex + 8*NodeRankIndex NodeNames []string SuperPods map[string][]plugin.SuperNode NodeNameMaps map[string]struct{} FaultTasks []FaultTask UpdateTime int64 JobRankIdCreateTime int64 // stop updating when job becomes a real fault one FaultTypes []string DeleteExecutedFlag bool ElasticScheduling string ReferenceName string FaultRetryTimes int UUID types.UID // contains filtered or unexported fields }
FaultJob job object for re-scheduling
func (*FaultJob) CheckJobExistsInKubernetes ¶
CheckJobExistsInKubernetes check whether job recorded in cache can be traced in kubernetes
func (*FaultJob) ForceDeleteJob ¶
func (fJob *FaultJob) ForceDeleteJob(ssn *framework.Session, schedulerJob *plugin.SchedulerJob, env plugin.ScheduleEnv) error
ForceDeleteJob force delete jobs includes labelled force delete ones and grace delete failed ones
func (*FaultJob) GetJobElasticSchedulingLabel ¶
func (fJob *FaultJob) GetJobElasticSchedulingLabel(job *plugin.SchedulerJob) string
GetJobElasticSchedulingLabel get job's elastic scheduling label
func (*FaultJob) GetJobFaultNPUTaskNum ¶
GetJobFaultNPUTaskNum gets the job's fault NPU task number.
func (*FaultJob) GetJobFaultRescheduleLabel ¶
func (fJob *FaultJob) GetJobFaultRescheduleLabel(job *plugin.SchedulerJob) string
GetJobFaultRescheduleLabel Get job's fault reschedule label.
func (*FaultJob) GraceDeleteJob ¶
func (fJob *FaultJob) GraceDeleteJob(ssn *framework.Session, npuJob *plugin.SchedulerJob, env plugin.ScheduleEnv) error
GraceDeleteJob grace delete jobs labelled to be deleted gracefully
func (*FaultJob) IsJobSingleRescheduling ¶
func (fJob *FaultJob) IsJobSingleRescheduling(sJob *plugin.SchedulerJob) bool
IsJobSingleRescheduling valid job.
func (*FaultJob) IsNormalJobNeedRestart ¶
IsNormalJobNeedRestart reports whether the job has the key of PreSeparateNPU or the job has a software fault.
func (*FaultJob) IsProcessReschedulingJob ¶
func (fJob *FaultJob) IsProcessReschedulingJob(sJob *plugin.SchedulerJob) bool
IsProcessReschedulingJob valid job.
type FaultNode ¶
type FaultNode struct { SuperPodID int32 NodeName string NPUName string FaultDeviceList []FaultDeviceList UpdateTime int64 UnhealthyNPU []string NetworkUnhealthyNPU []string IsFaultNode bool NodeDEnable bool NodeHealthState string AllCards []string FaultCards []FaultCard HasSwitchSubHealthFault bool HasCardSubHealthFault bool }
FaultNode node object for re-scheduling
type FaultNodeInfoToCm ¶
type FaultNodeInfoToCm struct { FaultDeviceList []FaultDeviceList NodeName string UnhealthyNPU []string NetworkUnhealthyNPU []string NodeDEnable bool NodeHealthState string UpdateTime int64 }
FaultNodeInfoToCm fault node info to cm
type FaultRankIdsJobCMData ¶
FaultRankIdsJobCMData used by RestoreManager for every job.
type FaultReasonList ¶
type FaultReasonList struct { NodeName string `json:"node_name"` TaskName string `json:"task_name"` FaultRankList []string FaultDeviceList }
FaultReasonList node Fault Device List
type FaultTask ¶
type FaultTask struct { Reason []FaultReasonList IsFaultTask bool IsFaultRetryEnable bool HasSubHealthFault bool IsSoftwareFault bool TaskUID api.TaskID TaskName string TaskNamespace string NodeName string JobName string NodeRankIndex string UseCardName []string PodCreateTime int64 PodUID types.UID // contains filtered or unexported fields }
FaultTask object dealing with node for rescheduling
type ReScheduler ¶
type ReScheduler struct { *DealReSchedulerCache GraceDeleteTime int64 Level string Jobs map[api.JobID]plugin.SchedulerJob Nodes map[string]plugin.NPUNode IsFirstSession *bool // contains filtered or unexported fields }
ReScheduler object for re-scheduling
func New ¶
func New(env *plugin.ScheduleEnv, jobType string) *ReScheduler
New Initialisation of ReScheduler
func (*ReScheduler) AddFaultJobWithSession ¶
func (reScheduler *ReScheduler) AddFaultJobWithSession( jobs map[api.JobID]*api.JobInfo, env plugin.ScheduleEnv) error
AddFaultJobWithSession read all running jobs of given card types and create the corresponding FaultJob objects
func (*ReScheduler) AddFaultNodeWithSession ¶
func (reScheduler *ReScheduler) AddFaultNodeWithSession()
AddFaultNodeWithSession Add FaultNode objects for new nodes in session not in cache
func (*ReScheduler) CheckNodeNPUByTask ¶
func (reScheduler *ReScheduler) CheckNodeNPUByTask(task *api.TaskInfo, vcNode plugin.NPUNode, npuName string) error
CheckNodeNPUByTask used in the predicate process of task and node
func (*ReScheduler) GenerateNodeRankIndexTaskMap ¶
func (reScheduler *ReScheduler) GenerateNodeRankIndexTaskMap()
GenerateNodeRankIndexTaskMap get the nodeName, rankIndex, and Occurrence of nodes in a job
func (ReScheduler) GetFaultJobOfGivenTaskInfoFromCache ¶
func (reScheduler ReScheduler) GetFaultJobOfGivenTaskInfoFromCache(task *api.TaskInfo) *FaultJob
GetFaultJobOfGivenTaskInfoFromCache get fault job from task info
func (*ReScheduler) GetGraceDeleteTime ¶
func (reScheduler *ReScheduler) GetGraceDeleteTime(Conf []config.Configuration) (int64, error)
GetGraceDeleteTime Get the graceful delete time from configuration
func (*ReScheduler) GetNeedForceDeleteDelayingNPUJobs ¶
func (reScheduler *ReScheduler) GetNeedForceDeleteDelayingNPUJobs( schedulerJobs map[api.JobID]plugin.SchedulerJob, ssn *framework.Session) ([]plugin.SchedulerJob, error)
GetNeedForceDeleteDelayingNPUJobs get fault jobs with grace label but haven't been evicted successfully
func (*ReScheduler) GetRunningJobs ¶
func (reScheduler *ReScheduler) GetRunningJobs( ssn *framework.Session) (map[api.JobID]*api.JobInfo, error)
GetRunningJobs get all the running jobs of <UseCardName> type
func (*ReScheduler) InitFaultNodeMap ¶
func (reScheduler *ReScheduler) InitFaultNodeMap()
InitFaultNodeMap init the node map of fault node
func (*ReScheduler) New910ReScheduler ¶
func (reScheduler *ReScheduler) New910ReScheduler()
New910ReScheduler initialise ReScheduler.FaultJobs for 910x8
func (*ReScheduler) NewCommonReScheduler ¶
func (reScheduler *ReScheduler) NewCommonReScheduler(jobType string)
NewCommonReScheduler initialise ReScheduler.FaultJobs for non 910x8
func (*ReScheduler) RestartFaultJobs ¶
func (reScheduler *ReScheduler) RestartFaultJobs(ssn *framework.Session, env plugin.ScheduleEnv) error
RestartFaultJobs Restart fault jobs by its corresponding strategy grace,force,off
func (*ReScheduler) RestartNeedForceDeleteJobs ¶
func (reScheduler *ReScheduler) RestartNeedForceDeleteJobs(ssn *framework.Session, env plugin.ScheduleEnv) error
RestartNeedForceDeleteJobs Restart jobs that need to be force deleted
func (*ReScheduler) ScoreBestNPUNodes ¶
func (reScheduler *ReScheduler) ScoreBestNPUNodes(task *api.TaskInfo, scoreMap map[string]float64) error
ScoreBestNPUNodes add scores on scoreMap for normal nodes used by re-scheduling tasks
func (*ReScheduler) SynCacheFaultJobWithSession ¶
func (reScheduler *ReScheduler) SynCacheFaultJobWithSession(ssn *framework.Session)
SynCacheFaultJobWithSession Synchronise FaultJobs in cache by updating the information using current session
func (*ReScheduler) SynCacheFaultNodeWithSession ¶
func (reScheduler *ReScheduler) SynCacheFaultNodeWithSession()
SynCacheFaultNodeWithSession Synchronise FaultNodes in cache by updating the information using current session
func (*ReScheduler) SynCacheNodeRankOccMapWithSession ¶
func (reScheduler *ReScheduler) SynCacheNodeRankOccMapWithSession(ssn *framework.Session)
SynCacheNodeRankOccMapWithSession Synchronise the node rank occurrence map in cache by updating the information using current session
func (*ReScheduler) SyncJobRecentRescheduleReason ¶
func (reScheduler *ReScheduler) SyncJobRecentRescheduleReason(ssn *framework.Session)
SyncJobRecentRescheduleReason sync recent reschedule records with ssn, to ensure cache is new and sync
func (*ReScheduler) SyncJobRemainRetryTimes ¶
func (reScheduler *ReScheduler) SyncJobRemainRetryTimes(ssn *framework.Session)
SyncJobRemainRetryTimes Synchronise job remain retry times in cache by updating the information using current session
func (*ReScheduler) ValidJobByReschedule ¶
func (reScheduler *ReScheduler) ValidJobByReschedule(curSchedulerJob util.SchedulerJobAttr) *api.ValidateResult
ValidJobByReschedule valid job by reschedule
type RemainRetryTimes ¶
RemainRetryTimes remained retry times
type RescheduleReason ¶
type RescheduleReason struct { // JobID the job id of this record JobID api.JobID // TotalRescheduleTimes to show how many times reschedule has happened since job created TotalRescheduleTimes int // RescheduleRecords keep recent MaxRescheduleRecordsNum records of rescheduling RescheduleRecords []RescheduleRecord // AdditionalInfo is used to provide additional information, such as for length concern reduce some records AdditionalInfo string `json:",omitempty"` }
RescheduleReason shows the reason of this job rescheduling
type RescheduleRecord ¶
type RescheduleRecord struct { // LogFileFormatTime is the formatted time, to make it convenient to read and locate log LogFileFormatTime string // RescheduleTimeStamp time.now.unix() indicates when the rescheduling happened RescheduleTimeStamp int64 // ReasonOfTask record the reason of this rescheduling of task ReasonOfTask []RescheduleTaskReason }
RescheduleRecord records the rescheduling history of a job.
type RescheduleTaskReason ¶
type RescheduleTaskReason struct { // RescheduleReason the fault type of this rescheduling RescheduleReason string // PodName the fault task caused this rescheduling PodName string // NodeName the fault node caused this rescheduling NodeName string // NodeRankIndex the rank index of the fault task NodeRankIndex string }
RescheduleTaskReason record the reason of this rescheduling of task