Documentation ¶
Index ¶
- func AddAllocationAcceleratorData(ctx context.Context, accData model.AcceleratorData) error
- func InsertNTSCAllocationWorkspaceRecord(ctx context.Context, allocationID model.AllocationID, workspaceID int, ...) error
- func InsertTrialAllocationWorkspaceRecord(ctx context.Context, experimentID int, allocationID model.AllocationID) error
- type AllocationExited
- type AllocationService
- type AllocationSignal
- type AllocationState
- type AllocationUnfulfilledError
- type AlreadyCancelledError
- type BehaviorDisabledError
- type BehaviorUnsupportedError
- type NoAllocationError
- type RendezvousInfoOrError
- type RendezvousWatcher
- type StaleContainerError
- type StaleResourcesError
- type StaleResourcesReceivedError
- type TimeoutExceededError
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func AddAllocationAcceleratorData ¶
func AddAllocationAcceleratorData(ctx context.Context, accData model.AcceleratorData, ) error
AddAllocationAcceleratorData stores accelerator data for an allocation.
func InsertNTSCAllocationWorkspaceRecord ¶
func InsertNTSCAllocationWorkspaceRecord( ctx context.Context, allocationID model.AllocationID, workspaceID int, workspaceName string, ) error
InsertNTSCAllocationWorkspaceRecord inserts a record linking an NTSC task's allocation to its respective workspace.
func InsertTrialAllocationWorkspaceRecord ¶
func InsertTrialAllocationWorkspaceRecord( ctx context.Context, experimentID int, allocationID model.AllocationID, ) error
InsertTrialAllocationWorkspaceRecord inserts a record linking a trial's allocation to its respective workspace and experiment.
Types ¶
type AllocationExited ¶
type AllocationExited struct { // userRequestedStop is when a container unexpectedly exits with 0. UserRequestedStop bool Err error FinalState AllocationState }
AllocationExited summarizes the exit status of an allocation.
func (*AllocationExited) String ¶
func (a *AllocationExited) String() string
type AllocationService ¶
type AllocationService interface { GetAllAllocationIDs() []model.AllocationID StartAllocation( logCtx logger.Context, req sproto.AllocateRequest, db db.DB, rm rm.ResourceManager, specifier tasks.TaskSpecifier, onExit func(*AllocationExited), ) error AwaitTermination(id model.AllocationID) Signal( id model.AllocationID, sig AllocationSignal, reason string, ) error State(id model.AllocationID) (AllocationState, error) SetReady(ctx context.Context, id model.AllocationID) error SetWaiting(ctx context.Context, id model.AllocationID) error SetProxyAddress( ctx context.Context, id model.AllocationID, addr string, ) error GetAllocation( ctx context.Context, allocallocationID string, ) (*model.Allocation, error) SetAcceleratorData( ctx context.Context, accData model.AcceleratorData, ) error WatchRendezvous( ctx context.Context, id model.AllocationID, rID sproto.ResourcesID, ) (*trialv1.RendezvousInfo, error) SetResourcesAsDaemon( ctx context.Context, id model.AllocationID, rID sproto.ResourcesID, ) error AllGather( ctx context.Context, allocationID model.AllocationID, id uuid.UUID, numPeers int, data any, ) ([]any, error) WatchPreemption(ctx context.Context, id model.AllocationID) (bool, error) AckPreemption(ctx context.Context, id model.AllocationID) error SendLog( ctx context.Context, id model.AllocationID, log *sproto.ContainerLog, ) WaitForRestore(ctx context.Context, id model.AllocationID) error Detach(id model.AllocationID) error }
AllocationService allows callers to launch, direct and query allocations.
var DefaultService AllocationService = newAllocationService()
DefaultService is the singleton default allocationService.
type AllocationSignal ¶
type AllocationSignal string
AllocationSignal is the type of signals that can be sent to an allocation.
const ( // KillAllocation is the signal to kill an allocation; analogous to SIGKILL. KillAllocation AllocationSignal = "kill" // TerminateAllocation is the signal to kill an allocation; analogous to SIGTERM. TerminateAllocation AllocationSignal = "terminate" )
type AllocationState ¶
type AllocationState struct { State model.AllocationState Resources map[sproto.ResourcesID]sproto.ResourcesSummary Ready bool Addresses map[sproto.ResourcesID][]cproto.Address Containers map[sproto.ResourcesID][]cproto.Container }
AllocationState requests allocation state. A copy is filled and returned.
func (AllocationState) SingleContainer ¶
func (a AllocationState) SingleContainer() *cproto.Container
SingleContainer returns a single random container from the allocation state.
func (AllocationState) SingleContainerAddresses ¶
func (a AllocationState) SingleContainerAddresses() []cproto.Address
SingleContainerAddresses returns a single random container's addresses from the allocation state.
type AllocationUnfulfilledError ¶
type AllocationUnfulfilledError struct {
Action string
}
AllocationUnfulfilledError is returned when an operation is tried without an active allocation.
func (AllocationUnfulfilledError) Error ¶
func (e AllocationUnfulfilledError) Error() string
type AlreadyCancelledError ¶
type AlreadyCancelledError struct{}
AlreadyCancelledError is returned to the allocation when it tries to take an action but has an unread cancellation in its inbox.
func (AlreadyCancelledError) Error ¶
func (e AlreadyCancelledError) Error() string
type BehaviorDisabledError ¶
type BehaviorDisabledError struct {
Behavior string
}
BehaviorDisabledError is returned when an operation is tried without the behavior being enabled.
func (BehaviorDisabledError) Error ¶
func (e BehaviorDisabledError) Error() string
type BehaviorUnsupportedError ¶
type BehaviorUnsupportedError struct {
Behavior string
}
BehaviorUnsupportedError is returned when an operation is tried without the behavior being supported.
func (BehaviorUnsupportedError) Error ¶
func (e BehaviorUnsupportedError) Error() string
type NoAllocationError ¶
type NoAllocationError struct {
Action string
}
NoAllocationError is returned when an operation is tried without a requested allocation.
func (NoAllocationError) Error ¶
func (e NoAllocationError) Error() string
type RendezvousInfoOrError ¶
type RendezvousInfoOrError struct { Info *trialv1.RendezvousInfo Err error }
RendezvousInfoOrError contains either rendezvous info or an error from failing to materialize it.
type RendezvousWatcher ¶
type RendezvousWatcher struct {
C <-chan RendezvousInfoOrError
}
RendezvousWatcher contains a channel which can be polled for rendezvous info.
type StaleContainerError ¶
StaleContainerError is returned when an operation was attempted by a stale container.
func (StaleContainerError) Error ¶
func (e StaleContainerError) Error() string
type StaleResourcesError ¶
type StaleResourcesError struct {
ID sproto.ResourcesID
}
StaleResourcesError is returned when an operation was attempted by stale resources.
func (StaleResourcesError) Error ¶
func (e StaleResourcesError) Error() string
type StaleResourcesReceivedError ¶
type StaleResourcesReceivedError struct{}
StaleResourcesReceivedError is returned when the scheduler gives an allocation resources between when it requests them and when it decides, for some reason or another, that they are not needed.
func (StaleResourcesReceivedError) Error ¶
func (e StaleResourcesReceivedError) Error() string
type TimeoutExceededError ¶
type TimeoutExceededError struct {
Message string
}
TimeoutExceededError is returned, with a bit of detail, when a timeout is exceeded.
func (TimeoutExceededError) Error ¶
func (e TimeoutExceededError) Error() string