scheduler

package
v0.14.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Oct 19, 2023 License: GPL-3.0 Imports: 8 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type Executor

// Executor is the single-method interface used to run a command on behalf of
// a user.
type Executor interface {
	// ExecAs takes a context, a user and a command string, and returns a
	// string and an error.
	//
	// NOTE(review): the returned string is presumably the command output and
	// user presumably a UNIX user name — confirm against the implementations.
	ExecAs(ctx context.Context, user string, cmd string) (string, error)
}

type FindSpecOption

// FindSpecOption is a functional option that mutates a findSpecOptions value.
//
// Options of this type are accepted by the Scheduler methods FindMemPerNode,
// FindGPUsPerNode, FindCPUsPerNode and FindTotalNodes.
type FindSpecOption func(*findSpecOptions)

func WithOnlyResponding

func WithOnlyResponding() FindSpecOption

type JobDefinition

// JobDefinition describes the resource allocations and the script of a job.
type JobDefinition struct {
	// TimeLimit is the time allocation of the job. When it is exhausted, the
	// running job is killed.
	//
	// TimeLimit is in minutes.
	TimeLimit uint64
	// NTasks indicates the number of parallel tasks.
	NTasks uint64
	// NTasksPerNode indicates the number of parallel tasks per node.
	//
	// If NTasks is not 0, NTasksPerNode is a maximum. Otherwise, it is strictly
	// the number of tasks per node.
	NTasksPerNode uint64
	// MinNodes indicates the minimum number of allocated nodes.
	MinNodes uint64
	// MaxNodes indicates the maximum number of allocated nodes.
	// MinNodes is required.
	MaxNodes uint64
	// GPUsPerTask indicates the number of requested GPUs per task.
	//
	// NOTE(review): pointer type; nil presumably means "not set" — confirm
	// against the code that consumes this struct.
	GPUsPerTask *uint64
	// GPUsPerNode indicates the number of requested GPUs per node.
	GPUsPerNode uint64
	// CPUsPerTask indicates the number of requested CPUs per task.
	CPUsPerTask uint64
	// CPUsPerNode indicates the minimum number of CPUs per node.
	CPUsPerNode uint64
	// MemoryPerCPU indicates the number of requested MB of memory per CPU.
	MemoryPerCPU uint64
	// Memory indicates the number of requested MB of memory.
	//
	// NOTE(review): pointer type; nil presumably means "not set", and the
	// scope of the request (per node or per job) is not stated here — confirm.
	Memory *uint64
	// Body is the body of the job, written as an sbatch script.
	Body string
	// Wait makes the submission wait for the job to end. The exit code of the
	// sbatch will be the exit code of the job.
	Wait bool
}

type Scheduler

// Scheduler is the interface to a job scheduler: it submits, cancels and
// extends jobs, and reports the resources (memory, GPUs, CPUs, nodes) of the
// cluster.
type Scheduler interface {
	// HealthCheck verifies if the scheduler accepts jobs.
	HealthCheck(ctx context.Context) error
	// Submit submits a job to the scheduler.
	Submit(ctx context.Context, req *SubmitRequest) (string, error)
	// CancelJob kills a job.
	CancelJob(ctx context.Context, name string, user string) error
	// TopUp increases the time limit of a job by additionalTime, in minutes.
	TopUp(ctx context.Context, name string, additionalTime uint64) error
	// FindMemPerNode finds the memory (MB) per node.
	FindMemPerNode(ctx context.Context, opts ...FindSpecOption) ([]uint64, error)
	// FindGPUsPerNode finds the number of GPUs per node.
	FindGPUsPerNode(ctx context.Context, opts ...FindSpecOption) ([]uint64, error)
	// FindCPUsPerNode finds the number of CPUs per node.
	FindCPUsPerNode(ctx context.Context, opts ...FindSpecOption) ([]uint64, error)
	// FindTotalMem finds the total amount of memory (MB) available.
	FindTotalMem(ctx context.Context) (uint64, error)
	// FindTotalGPUs finds the total number of GPUs available.
	FindTotalGPUs(ctx context.Context) (uint64, error)
	// FindTotalCPUs finds the total number of CPUs available.
	FindTotalCPUs(ctx context.Context) (uint64, error)
	// FindTotalNodes finds the total number of nodes available.
	FindTotalNodes(ctx context.Context, opts ...FindSpecOption) (uint64, error)
	// FindRunningJobByName finds a running job using squeue.
	//
	// Returns 0 if not found.
	FindRunningJobByName(
		ctx context.Context,
		name string,
		user string,
	) (int, error)
}

func NewSlurm

func NewSlurm(
	executor Executor,
	adminUser string,
	supervisorPublicAddress string,
	partition string,
	opts ...SlurmOption,
) Scheduler

type Slurm

// Slurm embeds an Executor, so the Executor's ExecAs method is promoted onto
// it. *Slurm declares the methods listed in the Scheduler interface.
type Slurm struct {
	Executor
	// contains filtered or unexported fields
}

func (*Slurm) CancelJob

func (s *Slurm) CancelJob(ctx context.Context, name string, user string) error

CancelJob kills a job using the scancel command.

func (*Slurm) FindCPUsPerNode

func (s *Slurm) FindCPUsPerNode(ctx context.Context, opts ...FindSpecOption) ([]uint64, error)

func (*Slurm) FindGPUsPerNode

func (s *Slurm) FindGPUsPerNode(ctx context.Context, opts ...FindSpecOption) ([]uint64, error)

func (*Slurm) FindMemPerNode

func (s *Slurm) FindMemPerNode(ctx context.Context, opts ...FindSpecOption) ([]uint64, error)

func (*Slurm) FindRunningJobByName

func (s *Slurm) FindRunningJobByName(
	ctx context.Context,
	name string,
	user string,
) (int, error)

FindRunningJobByName finds a running job using squeue.

func (*Slurm) FindTotalCPUs

func (s *Slurm) FindTotalCPUs(ctx context.Context) (uint64, error)

func (*Slurm) FindTotalGPUs

func (s *Slurm) FindTotalGPUs(ctx context.Context) (uint64, error)

func (*Slurm) FindTotalMem

func (s *Slurm) FindTotalMem(ctx context.Context) (uint64, error)

func (*Slurm) FindTotalNodes

func (s *Slurm) FindTotalNodes(ctx context.Context, opts ...FindSpecOption) (uint64, error)

func (*Slurm) HealthCheck

func (s *Slurm) HealthCheck(ctx context.Context) error

HealthCheck runs squeue to check if the queue is running.

func (*Slurm) Submit

func (s *Slurm) Submit(ctx context.Context, req *SubmitRequest) (string, error)

Submit sends an sbatch definition script to the SLURM controller using the sbatch command.

func (*Slurm) TopUp

func (s *Slurm) TopUp(ctx context.Context, name string, additionalTime uint64) error

type SlurmOption

// SlurmOption is a functional option that configures a *Slurm.
//
// NewSlurm accepts a variadic list of SlurmOption values.
type SlurmOption func(*Slurm)

func WithNVidiaSMI

func WithNVidiaSMI(path string) SlurmOption

func WithSBatch

func WithSBatch(path string) SlurmOption

func WithSCancel

func WithSCancel(path string) SlurmOption

func WithSControl

func WithSControl(path string) SlurmOption

func WithSInfo

func WithSInfo(path string) SlurmOption

func WithSQueue

func WithSQueue(path string) SlurmOption

type SubmitRequest

// SubmitRequest is the request passed to Scheduler.Submit.
type SubmitRequest struct {
	// Name of the job
	Name string
	// User is a UNIX User used for impersonation.
	User string
	// Prefix is appended to the log and comment.
	Prefix string
	// JobDefinition specifies the job allocations.
	//
	// It is an embedded pointer, so its fields are promoted onto
	// SubmitRequest; accessing them panics while the pointer is nil.
	*JobDefinition
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL