Documentation ¶
Index ¶
- func BestFit(req *sproto.AllocateRequest, agent *agentState) float64
- func MakeFitFunction(fittingPolicy string) func(*sproto.AllocateRequest, *agentState) float64
- func Single[K comparable, V any](m map[K]V) (kr K, vr V, ok bool)
- func WorstFit(req *sproto.AllocateRequest, agent *agentState) float64
- type HardConstraint
- type ResourceManager
- func (a *ResourceManager) Allocate(msg sproto.AllocateRequest) (*sproto.ResourcesSubscription, error)
- func (a *ResourceManager) CheckMaxSlotsExceeded(v *sproto.ValidateResourcesRequest) (bool, error)
- func (a *ResourceManager) CreateNamespace(string, string, bool) error
- func (a *ResourceManager) DefaultNamespace(string) (*string, error)
- func (*ResourceManager) DeleteJob(sproto.DeleteJob) (sproto.DeleteJobResponse, error)
- func (a *ResourceManager) DeleteNamespace(namespaceName string) error
- func (a *ResourceManager) DisableAgent(msg *apiv1.DisableAgentRequest) (*apiv1.DisableAgentResponse, error)
- func (a *ResourceManager) DisableSlot(req *apiv1.DisableSlotRequest) (*apiv1.DisableSlotResponse, error)
- func (a *ResourceManager) EnableAgent(msg *apiv1.EnableAgentRequest) (*apiv1.EnableAgentResponse, error)
- func (a *ResourceManager) EnableSlot(req *apiv1.EnableSlotRequest) (*apiv1.EnableSlotResponse, error)
- func (*ResourceManager) ExternalPreemptionPending(sproto.PendingPreemption) error
- func (a *ResourceManager) GetAgent(msg *apiv1.GetAgentRequest) (*apiv1.GetAgentResponse, error)
- func (a *ResourceManager) GetAgents() (*apiv1.GetAgentsResponse, error)
- func (a *ResourceManager) GetAllocationSummaries() (map[model.AllocationID]sproto.AllocationSummary, error)
- func (a *ResourceManager) GetDefaultAuxResourcePool() (rm.ResourcePoolName, error)
- func (a *ResourceManager) GetDefaultComputeResourcePool() (rm.ResourcePoolName, error)
- func (*ResourceManager) GetExternalJobs(rm.ResourcePoolName) ([]*jobv1.Job, error)
- func (a *ResourceManager) GetJobQ(rpName rm.ResourcePoolName) (map[model.JobID]*sproto.RMJobInfo, error)
- func (a *ResourceManager) GetJobQueueStatsRequest(msg *apiv1.GetJobQueueStatsRequest) (*apiv1.GetJobQueueStatsResponse, error)
- func (a *ResourceManager) GetNamespaceResourceQuota(string, string) (*float64, error)
- func (a *ResourceManager) GetResourcePools() (*apiv1.GetResourcePoolsResponse, error)
- func (a *ResourceManager) GetSlot(req *apiv1.GetSlotRequest) (*apiv1.GetSlotResponse, error)
- func (a *ResourceManager) GetSlots(msg *apiv1.GetSlotsRequest) (*apiv1.GetSlotsResponse, error)
- func (a *ResourceManager) HealthCheck() []model.ResourceManagerHealth
- func (*ResourceManager) IsReattachableOnlyAfterStarted() bool
- func (*ResourceManager) NotifyContainerRunning(sproto.NotifyContainerRunning) error
- func (a *ResourceManager) RecoverJobPosition(msg sproto.RecoverJobPosition)
- func (a *ResourceManager) Release(msg sproto.ResourcesReleased)
- func (a *ResourceManager) RemoveEmptyNamespace(string, string) error
- func (a *ResourceManager) ResolveResourcePool(name rm.ResourcePoolName, workspaceID int, slots int) (rm.ResourcePoolName, error)
- func (a *ResourceManager) SetGroupMaxSlots(msg sproto.SetGroupMaxSlots)
- func (a *ResourceManager) SetGroupPriority(msg sproto.SetGroupPriority) error
- func (a *ResourceManager) SetGroupWeight(msg sproto.SetGroupWeight) error
- func (a *ResourceManager) SetResourceQuota(int, string, string) error
- func (a *ResourceManager) TaskContainerDefaults(resourcePoolName rm.ResourcePoolName, ...) (model.TaskContainerDefaultsConfig, error)
- func (a *ResourceManager) ValidateResourcePool(name rm.ResourcePoolName) error
- func (a *ResourceManager) ValidateResources(msg sproto.ValidateResourcesRequest) ([]command.LaunchWarning, error)
- func (a *ResourceManager) VerifyNamespaceExists(string, string) error
- type Scheduler
- type SoftConstraint
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func BestFit ¶
func BestFit(req *sproto.AllocateRequest, agent *agentState) float64
BestFit returns a float affinity score between 0 and 1 for the affinity between the task and the agent. This method attempts to allocate tasks to the agent that is both most utilized and offers the fewest slots. This method should be used when the cluster is dominated by multi-slot applications.
func MakeFitFunction ¶
func MakeFitFunction(fittingPolicy string) func( *sproto.AllocateRequest, *agentState) float64
MakeFitFunction returns the corresponding fitting function.
func Single ¶
func Single[K comparable, V any](m map[K]V) (kr K, vr V, ok bool)
Single asserts there's a single element in the map and takes it.
func WorstFit ¶
func WorstFit(req *sproto.AllocateRequest, agent *agentState) float64
WorstFit returns a float affinity score between 0 and 1 for the affinity between the task and the agent. This method attempts to allocate tasks to the agent that is least utilized. This method should be used when the cluster is dominated by single-slot applications.
Types ¶
type HardConstraint ¶
type HardConstraint func(req *sproto.AllocateRequest, agent *agentState) bool
HardConstraint returns true if the task can be assigned to the agent and false otherwise.
type ResourceManager ¶
type ResourceManager struct {
// contains filtered or unexported fields
}
A ResourceManager manages many resource pools and routes requests for resources to them.
func New ¶
func New( db *db.PgDB, e *echo.Echo, config *config.ResourceManagerWithPoolsConfig, opts *aproto.MasterSetAgentOptions, cert *tls.Certificate, ) (*ResourceManager, error)
New returns a new ResourceManager, which manages communicating with and scheduling on Determined agents.
func (*ResourceManager) Allocate ¶
func (a *ResourceManager) Allocate(msg sproto.AllocateRequest) (*sproto.ResourcesSubscription, error)
Allocate implements rm.ResourceManager.
func (*ResourceManager) CheckMaxSlotsExceeded ¶
func (a *ResourceManager) CheckMaxSlotsExceeded(v *sproto.ValidateResourcesRequest) (bool, error)
CheckMaxSlotsExceeded checks if the job exceeded the maximum number of slots.
func (*ResourceManager) CreateNamespace ¶
func (a *ResourceManager) CreateNamespace(string, string, bool) error
CreateNamespace is not supported.
func (*ResourceManager) DefaultNamespace ¶
func (a *ResourceManager) DefaultNamespace(string) (*string, error)
DefaultNamespace is not supported.
func (*ResourceManager) DeleteJob ¶
func (*ResourceManager) DeleteJob(sproto.DeleteJob) (sproto.DeleteJobResponse, error)
DeleteJob implements rm.ResourceManager.
func (*ResourceManager) DeleteNamespace ¶
func (a *ResourceManager) DeleteNamespace(namespaceName string) error
DeleteNamespace is not supported.
func (*ResourceManager) DisableAgent ¶
func (a *ResourceManager) DisableAgent(msg *apiv1.DisableAgentRequest) (*apiv1.DisableAgentResponse, error)
DisableAgent implements rm.ResourceManager.
func (*ResourceManager) DisableSlot ¶
func (a *ResourceManager) DisableSlot(req *apiv1.DisableSlotRequest) (*apiv1.DisableSlotResponse, error)
DisableSlot implements rm.ResourceManager.
func (*ResourceManager) EnableAgent ¶
func (a *ResourceManager) EnableAgent(msg *apiv1.EnableAgentRequest) (*apiv1.EnableAgentResponse, error)
EnableAgent implements rm.ResourceManager.
func (*ResourceManager) EnableSlot ¶
func (a *ResourceManager) EnableSlot(req *apiv1.EnableSlotRequest) (*apiv1.EnableSlotResponse, error)
EnableSlot implements rm.ResourceManager.
func (*ResourceManager) ExternalPreemptionPending ¶ added in v0.750.0
func (*ResourceManager) ExternalPreemptionPending(sproto.PendingPreemption) error
ExternalPreemptionPending implements rm.ResourceManager.
func (*ResourceManager) GetAgent ¶
func (a *ResourceManager) GetAgent(msg *apiv1.GetAgentRequest) (*apiv1.GetAgentResponse, error)
GetAgent implements rm.ResourceManager.
func (*ResourceManager) GetAgents ¶
func (a *ResourceManager) GetAgents() (*apiv1.GetAgentsResponse, error)
GetAgents implements rm.ResourceManager.
func (*ResourceManager) GetAllocationSummaries ¶
func (a *ResourceManager) GetAllocationSummaries() (map[model.AllocationID]sproto.AllocationSummary, error)
GetAllocationSummaries implements rm.ResourceManager.
func (*ResourceManager) GetDefaultAuxResourcePool ¶
func (a *ResourceManager) GetDefaultAuxResourcePool() (rm.ResourcePoolName, error)
GetDefaultAuxResourcePool implements rm.ResourceManager.
func (*ResourceManager) GetDefaultComputeResourcePool ¶
func (a *ResourceManager) GetDefaultComputeResourcePool() (rm.ResourcePoolName, error)
GetDefaultComputeResourcePool implements rm.ResourceManager.
func (*ResourceManager) GetExternalJobs ¶
func (*ResourceManager) GetExternalJobs(rm.ResourcePoolName) ([]*jobv1.Job, error)
GetExternalJobs implements rm.ResourceManager.
func (*ResourceManager) GetJobQ ¶
func (a *ResourceManager) GetJobQ(rpName rm.ResourcePoolName) (map[model.JobID]*sproto.RMJobInfo, error)
GetJobQ implements rm.ResourceManager.
func (*ResourceManager) GetJobQueueStatsRequest ¶
func (a *ResourceManager) GetJobQueueStatsRequest( msg *apiv1.GetJobQueueStatsRequest, ) (*apiv1.GetJobQueueStatsResponse, error)
GetJobQueueStatsRequest implements rm.ResourceManager.
func (*ResourceManager) GetNamespaceResourceQuota ¶
func (a *ResourceManager) GetNamespaceResourceQuota(string, string) (*float64, error)
GetNamespaceResourceQuota is not supported.
func (*ResourceManager) GetResourcePools ¶
func (a *ResourceManager) GetResourcePools() (*apiv1.GetResourcePoolsResponse, error)
GetResourcePools implements rm.ResourceManager.
func (*ResourceManager) GetSlot ¶
func (a *ResourceManager) GetSlot(req *apiv1.GetSlotRequest) (*apiv1.GetSlotResponse, error)
GetSlot implements rm.ResourceManager.
func (*ResourceManager) GetSlots ¶
func (a *ResourceManager) GetSlots(msg *apiv1.GetSlotsRequest) (*apiv1.GetSlotsResponse, error)
GetSlots implements rm.ResourceManager.
func (*ResourceManager) HealthCheck ¶
func (a *ResourceManager) HealthCheck() []model.ResourceManagerHealth
HealthCheck always returns healthy for agentrm.
func (*ResourceManager) IsReattachableOnlyAfterStarted ¶
func (*ResourceManager) IsReattachableOnlyAfterStarted() bool
IsReattachableOnlyAfterStarted implements rm.ResourceManager.
func (*ResourceManager) NotifyContainerRunning ¶
func (*ResourceManager) NotifyContainerRunning(sproto.NotifyContainerRunning) error
NotifyContainerRunning implements rm.ResourceManager.
func (*ResourceManager) RecoverJobPosition ¶
func (a *ResourceManager) RecoverJobPosition(msg sproto.RecoverJobPosition)
RecoverJobPosition implements rm.ResourceManager.
func (*ResourceManager) Release ¶
func (a *ResourceManager) Release(msg sproto.ResourcesReleased)
Release implements rm.ResourceManager.
func (*ResourceManager) RemoveEmptyNamespace ¶
func (a *ResourceManager) RemoveEmptyNamespace(string, string) error
RemoveEmptyNamespace is not supported.
func (*ResourceManager) ResolveResourcePool ¶
func (a *ResourceManager) ResolveResourcePool(name rm.ResourcePoolName, workspaceID int, slots int) ( rm.ResourcePoolName, error, )
ResolveResourcePool implements rm.ResourceManager.
func (*ResourceManager) SetGroupMaxSlots ¶
func (a *ResourceManager) SetGroupMaxSlots(msg sproto.SetGroupMaxSlots)
SetGroupMaxSlots implements rm.ResourceManager.
func (*ResourceManager) SetGroupPriority ¶
func (a *ResourceManager) SetGroupPriority(msg sproto.SetGroupPriority) error
SetGroupPriority implements rm.ResourceManager.
func (*ResourceManager) SetGroupWeight ¶
func (a *ResourceManager) SetGroupWeight(msg sproto.SetGroupWeight) error
SetGroupWeight implements rm.ResourceManager.
func (*ResourceManager) SetResourceQuota ¶
func (a *ResourceManager) SetResourceQuota(int, string, string) error
SetResourceQuota is not supported.
func (*ResourceManager) TaskContainerDefaults ¶
func (a *ResourceManager) TaskContainerDefaults( resourcePoolName rm.ResourcePoolName, defaultConfig model.TaskContainerDefaultsConfig, ) (model.TaskContainerDefaultsConfig, error)
TaskContainerDefaults implements rm.ResourceManager.
func (*ResourceManager) ValidateResourcePool ¶
func (a *ResourceManager) ValidateResourcePool(name rm.ResourcePoolName) error
ValidateResourcePool implements rm.ResourceManager.
func (*ResourceManager) ValidateResources ¶
func (a *ResourceManager) ValidateResources( msg sproto.ValidateResourcesRequest, ) ([]command.LaunchWarning, error)
ValidateResources implements rm.ResourceManager.
func (*ResourceManager) VerifyNamespaceExists ¶
func (a *ResourceManager) VerifyNamespaceExists(string, string) error
VerifyNamespaceExists is not supported.
type Scheduler ¶
type Scheduler interface { Schedule(rp *resourcePool) ([]*sproto.AllocateRequest, []model.AllocationID) JobQInfo(rp *resourcePool) map[model.JobID]*sproto.RMJobInfo }
Scheduler schedules tasks on agents. Its Schedule method is called to determine which pending requests can be fulfilled and which scheduled tasks can be terminated. Schedule is expected to be called every time there is a change to the cluster status, for example, new agents being connected, devices being disabled, etc. Schedule should avoid unnecessarily shuffling tasks between agents to avoid the overhead of restarting a preempted task.
func MakeScheduler ¶
func MakeScheduler(conf *config.SchedulerConfig) (Scheduler, error)
MakeScheduler returns the corresponding scheduler implementation.
func NewFairShareScheduler ¶
func NewFairShareScheduler() Scheduler
NewFairShareScheduler creates a new scheduler that schedules tasks according to the max-min fairness of groups. For groups that are above their fair share, the scheduler requests them to terminate their idle tasks until they have achieved their fair share.
func NewPriorityScheduler ¶
func NewPriorityScheduler(config *config.SchedulerConfig) Scheduler
NewPriorityScheduler creates a new scheduler that schedules tasks via priority.
type SoftConstraint ¶
type SoftConstraint func(req *sproto.AllocateRequest, agent *agentState) float64
SoftConstraint returns a score from 0 (lowest) to 1 (highest) representing how optimal the state of the cluster would be if the task were assigned to the agent.