trainer

package
v0.0.0-...-4e3790f Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Aug 30, 2019 License: Apache-2.0, CC-BY-4.0, MIT Imports: 43 Imported by: 1

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func ConnectMongo

func ConnectMongo(mongoURI string, database string, username string, password string, cert string) (*mgo.Session, error)

ConnectMongo connects to a mongo database, using the provided username, password, and certificate file. It returns a pointer to the session, or an error if the connection attempt fails. TODO: This function could potentially be moved to a central utility package

func LockName

func LockName(gpuType string) string

LockName returns the name of the lock collection in mongo based on the GPU type

func QueueName

func QueueName(gpuType string) string

QueueName returns the name of the queue collection in mongo based on the GPU type

func TransformResourceName

func TransformResourceName(resource string) string

TransformResourceName performs replacement and capitalization so resource names are consistent

Types

type Entry

// Entry represents a single training job in the queue.
//
// The struct tags give the key each field is stored under when the value is
// encoded as BSON or JSON.
type Entry struct {
	// ID is stored under "_id" in BSON and "id" in JSON.
	ID         bson.ObjectId `bson:"_id" json:"id"`
	// TrainingID is the training job id held by this queue entry.
	TrainingID string        `bson:"training_id" json:"training_id"`
	// Submitted is presumably the time the job was added to the queue — TODO confirm.
	Submitted  time.Time     `bson:"submitted" json:"submitted"`
}

Entry represents a single training job in the queue

type JobHistoryEntry

// JobHistoryEntry stores training job status history in the Mongo collection
// "job_history".
//
// Every field except TrainingID carries omitempty on its BSON tag; ID is the
// only optional-in-BSON field whose JSON tag does not also carry omitempty.
type JobHistoryEntry struct {
	// ID is stored under "_id" in BSON and "id" in JSON.
	ID            bson.ObjectId          `bson:"_id,omitempty" json:"id"`
	// TrainingID is the id of the training job this history entry belongs to.
	TrainingID    string                 `bson:"training_id" json:"training_id"`
	// Timestamp is kept as a string; its format is not visible here — TODO confirm.
	Timestamp     string                 `bson:"timestamp,omitempty" json:"timestamp,omitempty"`
	// Status uses the Status type from the generated grpc_trainer_v2 package.
	Status        grpc_trainer_v2.Status `bson:"status,omitempty" json:"status,omitempty"`
	StatusMessage string                 `bson:"status_message,omitempty" json:"status_message,omitempty"`
	ErrorCode     string                 `bson:"error_code,omitempty" json:"error_code,omitempty"`
}

JobHistoryEntry stores training job status history in the Mongo collection "job_history"

type JobQueue

// JobQueue represents the functionality of a queue of training job ids,
// together with a lock that guards access to it.
//
// The per-method notes below follow the documented behavior of
// TrainingJobQueue, the mongo-backed implementation in this package.
type JobQueue interface {
	// Enqueue adds a training job id to the queue.
	Enqueue(string) error
	// Dequeue returns a single training job id and removes it from the queue.
	Dequeue() (string, error)
	// Peek returns a single training job id and leaves it in the queue.
	Peek() (string, error)
	// Delete removes a training job id from any position in the queue.
	// NOTE(review): the bool presumably reports whether an entry was removed — confirm.
	Delete(string) (bool, error)
	// Size returns the number of elements in the queue.
	Size() (int, error)
	// Empty reports on whether the queue holds any jobs.
	// NOTE(review): presumably true means the queue has no jobs — confirm.
	Empty() (bool, error)
	// Lock acquires the queue lock.
	Lock() error
	// Unlock releases the queue lock.
	Unlock() error
}

JobQueue represents the functionality of a queue

type LockEntry

// LockEntry represents which trainer service currently has a lock on the queue.
//
// The struct tags give the key each field is stored under when the value is
// encoded as BSON or JSON.
type LockEntry struct {
	// ID is stored under "_id" in BSON and "id" in JSON.
	ID      bson.ObjectId `bson:"_id" json:"id"`
	// LockID is stored under the key "lock" (not "lock_id").
	LockID  string        `bson:"lock" json:"lock"`
	// Owner presumably identifies the trainer service holding the lock — TODO confirm.
	Owner   string        `bson:"owner" json:"owner"`
	// Expires is presumably the time after which the lock is no longer valid — TODO confirm.
	Expires time.Time     `bson:"expires" json:"expires"`
}

LockEntry represents which trainer service currently has a lock on the queue

type ResettableMetrics

// ResettableMetrics is a struct of resettable metrics. Instances are created
// with NewResettableMetrics, and gauges are created through its NewGauge method.
type ResettableMetrics struct {
	// contains filtered or unexported fields
}

ResettableMetrics is a struct of resettable metrics.

func NewResettableMetrics

func NewResettableMetrics() *ResettableMetrics

NewResettableMetrics creates a new instance of resettable metrics.

func (*ResettableMetrics) NewGauge

func (r *ResettableMetrics) NewGauge(name string, help string) metrics.Gauge

NewGauge creates a resettable gauge with the given name and help text.

type Service

// Service represents the functionality of the trainer service.
//
// It combines two embedded interfaces with one method of its own.
type Service interface {
	// TrainerServer is the server interface from the generated grpc_trainer_v2 package.
	grpc_trainer_v2.TrainerServer
	// LifecycleHandler comes from the service package; its methods are not visible here.
	service.LifecycleHandler
	// StopTrainer stops the trainer; it takes no arguments and returns nothing.
	// NOTE(review): exact shutdown semantics are not visible here — confirm.
	StopTrainer()
}

Service represents the functionality of the trainer service

func NewService

func NewService() Service

NewService creates a new trainer service.

func NewTestService

func NewTestService(ds storage.DataStore, repo repository, jobHistoryRepo jobHistoryRepository,
	lcm client.LcmClient, tds tdsClient.TrainingDataClient, ratelimiter rlClient.RatelimiterClient, queues map[string]*queueHandler) Service

NewTestService creates a new service instance for testing

type TrainingJobQueue

// TrainingJobQueue is a JobQueue backed by mongo. Its pointer type provides
// the Enqueue, Dequeue, Peek, Delete, Size, Empty, Lock and Unlock methods.
type TrainingJobQueue struct {
	// contains filtered or unexported fields
}

TrainingJobQueue is a JobQueue backed by mongo

func (*TrainingJobQueue) Delete

func (q *TrainingJobQueue) Delete(id string) (bool, error)

Delete removes a training job id from any position in the queue. The trainer should acquire the lock before calling Delete().

func (*TrainingJobQueue) Dequeue

func (q *TrainingJobQueue) Dequeue() (string, error)

Dequeue returns a single training job id and removes it from the queue. The trainer should acquire the lock before calling Dequeue().

func (*TrainingJobQueue) Empty

func (q *TrainingJobQueue) Empty() (bool, error)

Empty returns whether the queue has any jobs

func (*TrainingJobQueue) Enqueue

func (q *TrainingJobQueue) Enqueue(id string) error

Enqueue adds a training job id to the queue. The trainer should acquire the lock before calling Enqueue().

func (*TrainingJobQueue) Lock

func (q *TrainingJobQueue) Lock() error

Lock acquires a distributed lock in mongo. The trainer should use this when pulling jobs, so that multiple trainers do not peek/submit the same job to lcm.

func (*TrainingJobQueue) Peek

func (q *TrainingJobQueue) Peek() (string, error)

Peek returns a single training job id and leaves it in the queue

func (*TrainingJobQueue) Size

func (q *TrainingJobQueue) Size() (int, error)

Size returns the number of elements in the queue.

func (*TrainingJobQueue) Unlock

func (q *TrainingJobQueue) Unlock() error

Unlock releases the lock in mongo

type TrainingRecord

// TrainingRecord is the data structure we store in the Mongo collection
// "training_jobs".
//
// The struct tags give the key each field is stored under when the value is
// encoded as BSON or JSON. Fields whose BSON tag carries omitempty are left
// out of the stored document when they hold their zero value; none of the
// JSON tags carry omitempty.
type TrainingRecord struct {
	// ID is stored under "_id" in BSON and "id" in JSON.
	ID                    bson.ObjectId                    `bson:"_id,omitempty" json:"id"`
	// TrainingID, UserID and JobID are always written to BSON (no omitempty).
	TrainingID            string                           `bson:"training_id" json:"training_id"`
	UserID                string                           `bson:"user_id" json:"user_id"`
	JobID                 string                           `bson:"job_id" json:"job_id"`
	// The following fields use types from the generated grpc_trainer_v2 package.
	ModelDefinition       *grpc_trainer_v2.ModelDefinition `bson:"model_definition,omitempty" json:"model_definition"`
	Training              *grpc_trainer_v2.Training        `bson:"training,omitempty" json:"training"`
	// Datastores is stored under "data_stores" (with an underscore).
	Datastores            []*grpc_trainer_v2.Datastore     `bson:"data_stores,omitempty" json:"data_stores"`
	TrainingStatus        *grpc_trainer_v2.TrainingStatus  `bson:"training_status,omitempty" json:"training_status"`
	Metrics               *grpc_trainer_v2.Metrics         `bson:"metrics,omitempty" json:"metrics"`
	// Deleted is presumably a soft-delete marker — TODO confirm against the repository code.
	Deleted               bool                             `bson:"deleted,omitempty" json:"deleted"`
	// EvaluationMetricsSpec is kept as a string; its format is not visible here.
	EvaluationMetricsSpec string                           `bson:"evaluation_metrics_spec,omitempty" json:"evaluation_metrics_spec"`
	Zone                  string                           `bson:"zone" json:"zone"`
}

TrainingRecord is the data structure we store in the Mongo collection "training_jobs"

Directories

Path Synopsis
Package grpc_trainer_v2 is a generated protocol buffer package.
Package grpc_trainer_v2 is a generated protocol buffer package.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL