Documentation ¶
Index ¶
- type Executor
- type FindSpecOption
- type JobDefinition
- type Scheduler
- type Slurm
- func (s *Slurm) CancelJob(ctx context.Context, name string, user string) error
- func (s *Slurm) CancelJobByID(ctx context.Context, id uint64) error
- func (s *Slurm) FindCPUsPerNode(ctx context.Context, opts ...FindSpecOption) ([]uint64, error)
- func (s *Slurm) FindGPUsPerNode(ctx context.Context, opts ...FindSpecOption) ([]uint64, error)
- func (s *Slurm) FindMemPerNode(ctx context.Context, opts ...FindSpecOption) ([]uint64, error)
- func (s *Slurm) FindRunningJobByName(ctx context.Context, name string, user string) (int, error)
- func (s *Slurm) FindTotalCPUs(ctx context.Context) (uint64, error)
- func (s *Slurm) FindTotalGPUs(ctx context.Context) (uint64, error)
- func (s *Slurm) FindTotalMem(ctx context.Context) (uint64, error)
- func (s *Slurm) FindTotalNodes(ctx context.Context, opts ...FindSpecOption) (uint64, error)
- func (s *Slurm) HealthCheck(ctx context.Context) error
- func (s *Slurm) Submit(ctx context.Context, req *SubmitRequest) (string, error)
- func (s *Slurm) TopUp(ctx context.Context, name string, additionalTime uint64) error
- type SlurmOption
- type SubmitRequest
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type FindSpecOption ¶
type FindSpecOption func(*findSpecOptions)
func WithOnlyResponding ¶
func WithOnlyResponding() FindSpecOption
type JobDefinition ¶
type JobDefinition struct { // TimeLimit is the time allocation of the job; when it expires, the running job is killed. // // TimeLimit is in minutes. TimeLimit uint64 // NTasks indicates the number of parallel tasks. NTasks uint64 // NTasksPerNode indicates the number of parallel tasks per node. // // If NTasks is not 0, NTasksPerNode is a maximum. Otherwise, it is strictly the number of tasks per node. NTasksPerNode uint64 // MinNodes indicates the minimum number of allocated nodes. MinNodes uint64 // MaxNodes indicates the maximum number of allocated nodes. // MinNodes is required. MaxNodes uint64 // GPUs indicates the number of requested GPUs per job. GPUs *uint64 // GPUsPerTask indicates the number of requested GPUs per task. GPUsPerTask *uint64 // GPUsPerNode indicates the number of requested GPUs per node. GPUsPerNode uint64 // CPUsPerTask indicates the number of requested CPUs per task. CPUsPerTask uint64 // CPUsPerNode indicates the minimum number of CPUs per node. CPUsPerNode uint64 // MemoryPerCPU indicates the number of requested MB of memory per CPU. MemoryPerCPU uint64 // Memory indicates the number of requested MB of memory. Memory *uint64 // Body of the job, as an sbatch script. Body string // Wait for the job to end. The exit code of the sbatch will be the exit code // of the job. Wait bool }
type Scheduler ¶
type Scheduler interface { // HealthCheck verifies if the scheduler accepts jobs. HealthCheck(ctx context.Context) error // Submit a job to the scheduler. Submit(ctx context.Context, req *SubmitRequest) (string, error) // CancelJob kills a job. CancelJob(ctx context.Context, name string, user string) error // CancelJobByID kills a job using admin user and ID. CancelJobByID(ctx context.Context, id uint64) error // TopUp increases the time limit in minutes of a job. TopUp(ctx context.Context, name string, additionalTime uint64) error // Find the memory (MB) per node. FindMemPerNode(ctx context.Context, opts ...FindSpecOption) ([]uint64, error) // Find the number of GPUs per node. FindGPUsPerNode(ctx context.Context, opts ...FindSpecOption) ([]uint64, error) // Find the number of CPUs per node. FindCPUsPerNode(ctx context.Context, opts ...FindSpecOption) ([]uint64, error) // Find the total amount of memory (MB) available. FindTotalMem(ctx context.Context) (uint64, error) // Find the total number of GPUs available. FindTotalGPUs(ctx context.Context) (uint64, error) // Find the total number of CPUs available. FindTotalCPUs(ctx context.Context) (uint64, error) // Find the total number of nodes available. FindTotalNodes(ctx context.Context, opts ...FindSpecOption) (uint64, error) // FindRunningJobByName finds a running job using squeue. // // Returns 0 if not found. FindRunningJobByName( ctx context.Context, name string, user string, ) (int, error) }
type Slurm ¶
type Slurm struct { Executor // contains filtered or unexported fields }
func (*Slurm) CancelJobByID ¶ added in v0.16.2
CancelJobByID kills a job using scancel command.
func (*Slurm) FindCPUsPerNode ¶
func (*Slurm) FindGPUsPerNode ¶
func (*Slurm) FindMemPerNode ¶
func (*Slurm) FindRunningJobByName ¶
FindRunningJobByName finds a running job using squeue.
func (*Slurm) FindTotalNodes ¶
func (*Slurm) HealthCheck ¶
HealthCheck runs squeue to check if the queue is running.
type SlurmOption ¶
type SlurmOption func(*Slurm)
func WithNVidiaSMI ¶
func WithNVidiaSMI(path string) SlurmOption
func WithSBatch ¶
func WithSBatch(path string) SlurmOption
func WithSCancel ¶
func WithSCancel(path string) SlurmOption
func WithSControl ¶
func WithSControl(path string) SlurmOption
func WithSInfo ¶
func WithSInfo(path string) SlurmOption
func WithSQueue ¶
func WithSQueue(path string) SlurmOption
type SubmitRequest ¶
type SubmitRequest struct { // Name of the job Name string // User is a UNIX User used for impersonation. User string // Prefix is appended to the log and comment. Prefix string // JobDefinition specifies the job allocations *JobDefinition }
Click to show internal directories.
Click to hide internal directories.