Documentation ¶
Index ¶
- Constants
- type DefaultRespoolSpec
- type EventPublisher
- type EventPublisherConfig
- type Metrics
- type PerProcedureMetrics
- type PerResponseCodeMetrics
- type RespoolLoader
- type RespoolLoaderConfig
- type Server
- type ServiceHandler
- func (h *ServiceHandler) AbortJobUpdate(ctx context.Context, key *api.JobUpdateKey, message *string) (*api.Response, error)
- func (h *ServiceHandler) AddInstances(ctx context.Context, key *api.InstanceKey, count *int32) (*api.Response, error)
- func (h *ServiceHandler) CreateJob(ctx context.Context, description *api.JobConfiguration) (*api.Response, error)
- func (h *ServiceHandler) DescheduleCronJob(ctx context.Context, job *api.JobKey) (*api.Response, error)
- func (h *ServiceHandler) GetConfigSummary(ctx context.Context, job *api.JobKey) (*api.Response, error)
- func (h *ServiceHandler) GetJobSummary(ctx context.Context, role *string) (*api.Response, error)
- func (h *ServiceHandler) GetJobUpdateDetails(ctx context.Context, key *api.JobUpdateKey, query *api.JobUpdateQuery) (*api.Response, error)
- func (h *ServiceHandler) GetJobUpdateDiff(ctx context.Context, request *api.JobUpdateRequest) (*api.Response, error)
- func (h *ServiceHandler) GetJobUpdateSummaries(ctx context.Context, query *api.JobUpdateQuery) (*api.Response, error)
- func (h *ServiceHandler) GetJobs(ctx context.Context, ownerRole *string) (*api.Response, error)
- func (h *ServiceHandler) GetPendingReason(ctx context.Context, query *api.TaskQuery) (*api.Response, error)
- func (h *ServiceHandler) GetQuota(ctx context.Context, ownerRole *string) (*api.Response, error)
- func (h *ServiceHandler) GetRoleSummary(ctx context.Context) (*api.Response, error)
- func (h *ServiceHandler) GetTasksStatus(ctx context.Context, query *api.TaskQuery) (*api.Response, error)
- func (h *ServiceHandler) GetTasksWithoutConfigs(ctx context.Context, query *api.TaskQuery) (*api.Response, error)
- func (h *ServiceHandler) GetTierConfigs(ctx context.Context) (*api.Response, error)
- func (h *ServiceHandler) KillTasks(ctx context.Context, job *api.JobKey, instances map[int32]struct{}, ...) (*api.Response, error)
- func (h *ServiceHandler) PauseJobUpdate(ctx context.Context, key *api.JobUpdateKey, message *string) (*api.Response, error)
- func (h *ServiceHandler) PopulateJobConfig(ctx context.Context, description *api.JobConfiguration) (*api.Response, error)
- func (h *ServiceHandler) PulseJobUpdate(ctx context.Context, key *api.JobUpdateKey) (*api.Response, error)
- func (h *ServiceHandler) ReplaceCronTemplate(ctx context.Context, config *api.JobConfiguration) (*api.Response, error)
- func (h *ServiceHandler) RestartShards(ctx context.Context, job *api.JobKey, shardIds map[int32]struct{}) (*api.Response, error)
- func (h *ServiceHandler) ResumeJobUpdate(ctx context.Context, key *api.JobUpdateKey, message *string) (*api.Response, error)
- func (h *ServiceHandler) RollbackJobUpdate(ctx context.Context, key *api.JobUpdateKey, message *string) (*api.Response, error)
- func (h *ServiceHandler) ScheduleCronJob(ctx context.Context, description *api.JobConfiguration) (*api.Response, error)
- func (h *ServiceHandler) StartCronJob(ctx context.Context, job *api.JobKey) (*api.Response, error)
- func (h *ServiceHandler) StartJobUpdate(ctx context.Context, request *api.JobUpdateRequest, message *string) (*api.Response, error)
- type ServiceHandlerConfig
Constants ¶
const ( ProcedureAbortJobUpdate = "auroraschedulermanager__abortjobupdate" ProcedureGetConfigSummary = "readonlyscheduler__getconfigsummary" ProcedureGetJobSummary = "readonlyscheduler__getjobsummary" ProcedureGetJobUpdateDetails = "readonlyscheduler__getjobupdatedetails" ProcedureGetJobUpdateDiff = "readonlyscheduler__getjobupdatediff" ProcedureGetJobUpdateSummaries = "readonlyscheduler__getjobupdatesummaries" ProcedureGetJobs = "readonlyscheduler__getjobs" ProcedureGetTasksWithoutConfigs = "readonlyscheduler__gettaskswithoutconfigs" ProcedureGetTierConfigs = "readonlyscheduler__gettierconfigs" ProcedureKillTasks = "auroraschedulermanager__killtasks" ProcedurePauseJobUpdate = "auroraschedulermanager__pausejobupdate" ProcedurePulseJobUpdate = "auroraschedulermanager__pulsejobupdate" ProcedureResumeJobUpdate = "auroraschedulermanager__resumejobupdate" ProcedureRollbackJobUpdate = "auroraschedulermanager__rollbackjobupdate" ProcedureStartJobUpdate = "auroraschedulermanager__startjobupdate" // Metric tag names TagProcedure = "procedure" // handler procedure name TagResponseCode = "responsecode" // handler response code TagService = "updateservice" // input service name // Metric names MetricNameCalls = "calls" MetricNameCallLatency = "call_latency" )
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type DefaultRespoolSpec ¶
type DefaultRespoolSpec struct { OwningTeam string `yaml:"owning_team"` LDAPGroups []string `yaml:"ldap_groups"` Description string `yaml:"description"` Resources []*respool.ResourceConfig `yaml:"resources"` Policy respool.SchedulingPolicy `yaml:"policy"` ControllerLimit *respool.ControllerLimit `yaml:"controller_limit"` SlackLimit *respool.SlackLimit `yaml:"slack_limit"` }
DefaultRespoolSpec defines parameters used to create a default respool for bridge when bootstrapping a new cluster.
type EventPublisher ¶
type EventPublisher interface { // Start the watch on pod state changes and // publishing to kafka Start() // Stop the watch on pod state changes and // publishing to kafka Stop() }
EventPublisher sets up a watch on pod state change events and then publishes them to a kafka stream.
func NewEventPublisher ¶
func NewEventPublisher( kafkaURL string, jobClient statelesssvc.JobServiceYARPCClient, podClient podsvc.PodServiceYARPCClient, watchClient watchsvc.WatchServiceYARPCClient, client *http.Client, publishEvents bool, ) EventPublisher
NewEventPublisher returns an event publisher that streams pod state changes to kafka.
type EventPublisherConfig ¶
type EventPublisherConfig struct { // KafkaURL represents the stream on which task state changes to publish KafkaURL string `yaml:"kafka_url"` // PublishEvents defines whether to publish task state changes to kafka PublishEvents bool `yaml:"publish_events"` // GRPCMsgSize defines the max payload size that can be send and recv GRPCMsgSize int `yaml:"grpc_msg_size"` }
EventPublisherConfig represents config for publishing task state change events to kafka.
type Metrics ¶
type Metrics struct {
Procedures map[string]*PerProcedureMetrics
}
Metrics is the struct containing all metrics relevant for aurora api parity.
func NewMetrics ¶
NewMetrics returns a new Metrics struct, with all metrics initialized and rooted at the given tally.Scope
type PerProcedureMetrics ¶
type PerProcedureMetrics struct {
ResponseCodes map[api.ResponseCode]*PerResponseCodeMetrics
}
type PerResponseCodeMetrics ¶
type RespoolLoader ¶
RespoolLoader lazily loads a resource pool. If the resource pool does not exist, it bootstraps one with provided defaults.
func NewRespoolLoader ¶
func NewRespoolLoader( config RespoolLoaderConfig, client respool.ResourceManagerYARPCClient, ) RespoolLoader
NewRespoolLoader creates a new RespoolLoader.
type RespoolLoaderConfig ¶
type RespoolLoaderConfig struct { RetryInterval time.Duration `yaml:"retry_interval"` RespoolPath string `yaml:"respool_path"` GPURespoolPath string `yaml:"gpu_respool_path"` DefaultRespoolSpec DefaultRespoolSpec `yaml:"default_respool_spec"` }
RespoolLoaderConfig defines RespoolLoader configuration.
type Server ¶
Server contains all structs necessary to run an aurorabridge server. This struct also implements leader.Node interface so that it can perform leader election among multiple aurorabridge server instances.
func NewServer ¶
func NewServer( httpPort int, cfg leader.ElectionConfig, eventPublisher EventPublisher, role string) (*Server, error)
NewServer creates an aurorabridge Server instance.
func (*Server) GainedLeadershipCallback ¶
GainedLeadershipCallback is the callback when the current node becomes the leader
func (*Server) GetID ¶
GetID function returns jobmgr app address. This implements leader.Nomination.
func (*Server) HasGainedLeadership ¶
HasGainedLeadership returns true only once GainedLeadershipCallback has completed.
func (*Server) LostLeadershipCallback ¶
LostLeadershipCallback is the callback when the current node lost leadership
func (*Server) ShutDownCallback ¶
ShutDownCallback is the callback to shut down gracefully if possible
type ServiceHandler ¶
type ServiceHandler struct {
// contains filtered or unexported fields
}
ServiceHandler implements a partial Aurora API. Various unneeded methods have been left intentionally unimplemented.
func NewServiceHandler ¶
func NewServiceHandler( config ServiceHandlerConfig, parent tally.Scope, jobClient statelesssvc.JobServiceYARPCClient, jobmgrClient jobmgrsvc.JobManagerServiceYARPCClient, podClient podsvc.PodServiceYARPCClient, respoolLoader RespoolLoader, random common.Random, jobIdCache cache.JobIDCache, ) (*ServiceHandler, error)
NewServiceHandler creates a new ServiceHandler.
func (*ServiceHandler) AbortJobUpdate ¶
func (h *ServiceHandler) AbortJobUpdate( ctx context.Context, key *api.JobUpdateKey, message *string, ) (*api.Response, error)
AbortJobUpdate permanently aborts the job update. Does not remove the update history.
func (*ServiceHandler) AddInstances ¶
func (h *ServiceHandler) AddInstances( ctx context.Context, key *api.InstanceKey, count *int32) (*api.Response, error)
AddInstances will remain unimplemented.
func (*ServiceHandler) CreateJob ¶
func (h *ServiceHandler) CreateJob( ctx context.Context, description *api.JobConfiguration) (*api.Response, error)
CreateJob will remain unimplemented.
func (*ServiceHandler) DescheduleCronJob ¶
func (h *ServiceHandler) DescheduleCronJob( ctx context.Context, job *api.JobKey) (*api.Response, error)
DescheduleCronJob will remain unimplemented.
func (*ServiceHandler) GetConfigSummary ¶
func (h *ServiceHandler) GetConfigSummary( ctx context.Context, job *api.JobKey) (*api.Response, error)
GetConfigSummary fetches the configuration summary of active tasks for the specified job.
func (*ServiceHandler) GetJobSummary ¶
GetJobSummary returns a summary of jobs, optionally only those owned by a specific role.
func (*ServiceHandler) GetJobUpdateDetails ¶
func (h *ServiceHandler) GetJobUpdateDetails( ctx context.Context, key *api.JobUpdateKey, query *api.JobUpdateQuery, ) (*api.Response, error)
GetJobUpdateDetails gets job update details. jobUpdateKey is marked as deprecated in Aurora and is not used by Aggregator, so it is ignored when getting job update details.
func (*ServiceHandler) GetJobUpdateDiff ¶
func (h *ServiceHandler) GetJobUpdateDiff( ctx context.Context, request *api.JobUpdateRequest) (*api.Response, error)
GetJobUpdateDiff gets the diff between client (desired) and server (current) job states. TaskConfig is not set in GetJobUpdateDiffResult, since caller is not using it and fetching previous podspec is expensive
func (*ServiceHandler) GetJobUpdateSummaries ¶
func (h *ServiceHandler) GetJobUpdateSummaries( ctx context.Context, query *api.JobUpdateQuery, ) (*api.Response, error)
GetJobUpdateSummaries gets job update summaries.
func (*ServiceHandler) GetJobs ¶
GetJobs fetches the status of jobs. ownerRole is optional, in which case all jobs are returned.
func (*ServiceHandler) GetPendingReason ¶
func (h *ServiceHandler) GetPendingReason( ctx context.Context, query *api.TaskQuery) (*api.Response, error)
GetPendingReason will remain unimplemented.
func (*ServiceHandler) GetRoleSummary ¶
GetRoleSummary will remain unimplemented.
func (*ServiceHandler) GetTasksStatus ¶
func (h *ServiceHandler) GetTasksStatus( ctx context.Context, query *api.TaskQuery) (*api.Response, error)
GetTasksStatus will remain unimplemented.
func (*ServiceHandler) GetTasksWithoutConfigs ¶
func (h *ServiceHandler) GetTasksWithoutConfigs( ctx context.Context, query *api.TaskQuery, ) (*api.Response, error)
GetTasksWithoutConfigs is the same as getTasksStatus but without the TaskConfig.ExecutorConfig data set.
func (*ServiceHandler) GetTierConfigs ¶
GetTierConfigs is a no-op. It is only used to determine liveness of the scheduler.
func (*ServiceHandler) KillTasks ¶
func (h *ServiceHandler) KillTasks( ctx context.Context, job *api.JobKey, instances map[int32]struct{}, message *string, ) (*api.Response, error)
KillTasks initiates a kill on tasks.
func (*ServiceHandler) PauseJobUpdate ¶
func (h *ServiceHandler) PauseJobUpdate( ctx context.Context, key *api.JobUpdateKey, message *string, ) (*api.Response, error)
PauseJobUpdate pauses the specified job update. Can be resumed by resumeUpdate call.
func (*ServiceHandler) PopulateJobConfig ¶
func (h *ServiceHandler) PopulateJobConfig( ctx context.Context, description *api.JobConfiguration) (*api.Response, error)
PopulateJobConfig will remain unimplemented.
func (*ServiceHandler) PulseJobUpdate ¶
func (h *ServiceHandler) PulseJobUpdate( ctx context.Context, key *api.JobUpdateKey, ) (*api.Response, error)
PulseJobUpdate allows progress of the job update in case blockIfNoPulsesAfterMs is specified in JobUpdateSettings. Unblocks progress if the update was previously blocked. Responds with ResponseCode.INVALID_REQUEST in case an unknown update key is specified.
func (*ServiceHandler) ReplaceCronTemplate ¶
func (h *ServiceHandler) ReplaceCronTemplate( ctx context.Context, config *api.JobConfiguration) (*api.Response, error)
ReplaceCronTemplate will remain unimplemented.
func (*ServiceHandler) RestartShards ¶
func (h *ServiceHandler) RestartShards( ctx context.Context, job *api.JobKey, shardIds map[int32]struct{}) (*api.Response, error)
RestartShards will remain unimplemented.
func (*ServiceHandler) ResumeJobUpdate ¶
func (h *ServiceHandler) ResumeJobUpdate( ctx context.Context, key *api.JobUpdateKey, message *string, ) (*api.Response, error)
ResumeJobUpdate resumes progress of a previously paused job update.
func (*ServiceHandler) RollbackJobUpdate ¶
func (h *ServiceHandler) RollbackJobUpdate( ctx context.Context, key *api.JobUpdateKey, message *string, ) (*api.Response, error)
RollbackJobUpdate rolls back the specified active job update to the initial state.
func (*ServiceHandler) ScheduleCronJob ¶
func (h *ServiceHandler) ScheduleCronJob( ctx context.Context, description *api.JobConfiguration) (*api.Response, error)
ScheduleCronJob will remain unimplemented.
func (*ServiceHandler) StartCronJob ¶
StartCronJob will remain unimplemented.
func (*ServiceHandler) StartJobUpdate ¶
func (h *ServiceHandler) StartJobUpdate( ctx context.Context, request *api.JobUpdateRequest, message *string, ) (*api.Response, error)
StartJobUpdate starts update of the existing service job.
type ServiceHandlerConfig ¶
type ServiceHandlerConfig struct { GetJobUpdateWorkers int `yaml:"get_job_update_workers"` GetJobsWorkers int `yaml:"get_jobs_workers"` GetJobSummaryWorkers int `yaml:"get_job_summary_workers"` StopPodWorkers int `yaml:"stop_pod_workers"` CreateJobSpecForUpdateWorkers int `yaml:"create_job_spec_for_update_workers"` // Config for number of workers for getTasksWithoutConfigs endpoint. GetTasksWithoutConfigsWorkers int `yaml:"get_tasks_without_configs_workers"` GetTasksWithoutConfigsMediumWorkers int `yaml:"get_tasks_without_configs_medium_workers"` GetTasksWithoutConfigsMediumThreshold int `yaml:"get_tasks_without_configs_medium_threshold"` GetTasksWithoutConfigsLargeWorkers int `yaml:"get_tasks_without_configs_large_workers"` GetTasksWithoutConfigsLargeThreshold int `yaml:"get_tasks_without_configs_large_threshold"` // getTasksWithoutConfigs task querying depth. It limits the number // of pods to be included in the return result - return pods from // current run if set to 1, return pods from current and previous one // run if set to 2, etc. // In Aurora, getTasksWithoutConfigs will return all the current and // completed tasks stored in the DB. However, in Peloton, since we // keep the complete history of pods, this parameter is used to limit // the number of pods returned. PodRunsDepth int `yaml:"pod_runs_depth"` // Maximum number of pods that will get returned, while meeting // minPodRunsDepth requirement. 
GetTasksPodMax int `yaml:"get_tasks_pod_max"` // QueryJobsLimit specifies Limit parameter passed to QueryJobs request QueryJobsLimit uint32 `yaml:"query_jobs_limit"` // InstanceEventsLimit specifies the limit on number of events per instance InstanceEventsLimit uint32 `yaml:"instance_events_limit"` // UpdatesLimit specifies the limit on number of updates to include per job UpdatesLimit uint32 `yaml:"updates_limit"` // ThermosExecutor is config used to generate mesos CommandInfo / ExecutorInfo // for Thermos executor ThermosExecutor config.ThermosExecutorConfig `yaml:"thermos_executor"` // Enable Peloton inplace update EnableInPlace bool `yaml:"enable-inplace-update"` }
ServiceHandlerConfig defines ServiceHandler configuration.