types

package
v0.0.1 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jul 12, 2023 License: AGPL-3.0 Imports: 8 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
// InstanceStates maps an instance state name to an integer code.
// NOTE(review): there is a "stopping" entry but no "stopped" entry, and
// "stopping" (4) is numbered after "terminated" (3) — confirm both are
// intentional before relying on the ordering of the codes.
var InstanceStates = map[string]int{
	"pending":       0,
	"running":       1,
	"shutting-down": 2,
	"terminated":    3,
	"stopping":      4,
}
View Source
// ProviderNames lists the provider identifiers known to this package.
// NOTE(review): presumably these are the values expected in
// Instance.Provider and returned by Provider.Name — confirm against callers.
var ProviderNames = []string{
	"aws",
	"gcp",
	"azure",
	"paperspace",
}

Functions

This section is empty.

Types

type CapacityError

// CapacityError is an error value (it has an Error method) that carries a
// code, a message and a region.
// NOTE(review): presumably returned when a provider lacks capacity in
// Region — confirm against the providers that construct it.
type CapacityError struct {
	Code    string
	Message string
	Region  string
}

func (CapacityError) Error

func (e CapacityError) Error() string

type CedanaCluster

// CedanaCluster is a GORM model (it embeds gorm.Model) that groups a set of
// worker instances under a cluster UUID.
type CedanaCluster struct {
	gorm.Model
	// ClusterID is stored in a column of type uuid and serialized as "cluster_id".
	ClusterID uuid.UUID  `json:"cluster_id" gorm:"type:uuid"`
	// Workers is associated through the CedanaID foreign key (see the gorm tag).
	Workers   []Instance `json:"workers" gorm:"foreignKey:CedanaID"`
}

type CedanaState

// CedanaState bundles client information, process information and
// checkpoint metadata (type, path and state). Every field carries matching
// json and mapstructure keys.
type CedanaState struct {
	ClientInfo     ClientInfo     `json:"client_info" mapstructure:"client_info"`
	ProcessInfo    ProcessInfo    `json:"process_info" mapstructure:"process_info"`
	CheckpointType CheckpointType `json:"checkpoint_type" mapstructure:"checkpoint_type"`
	// either local or remote checkpoint path (url vs filesystem path)
	CheckpointPath string `json:"checkpoint_path" mapstructure:"checkpoint_path"`
	// process state at time of checkpoint
	// NOTE(review): the CheckpointState constants describe checkpoint/restore
	// outcomes (CHECKPOINTED, RESTORE_FAILED, ...) rather than a process
	// state — confirm which meaning is intended.
	CheckpointState CheckpointState `json:"checkpoint_state" mapstructure:"checkpoint_state"`
}

CedanaState encapsulates a CRIU checkpoint and includes filesystem state for a full restore. It is typically serialized and sent over the wire.

type CheckpointReason

// CheckpointReason is a string-typed label naming why a checkpoint was
// taken. It is carried in MetaState.
type CheckpointReason string
const (
	// Checkpoint attributed to instance termination.
	CheckpointReasonInstanceTermination CheckpointReason = "instance_termination"
	// Checkpoint attributed to job termination.
	CheckpointReasonJobTermination      CheckpointReason = "job_termination"
	// Checkpoint attributed to a heartbeat.
	CheckpointReasonHeartbeat           CheckpointReason = "heartbeat"
)

type CheckpointState

// CheckpointState is a string-typed label for the outcome of a checkpoint
// or restore operation. It is carried in CedanaState.
type CheckpointState string
const (
	// Checkpoint outcomes.
	CheckpointSuccess CheckpointState = "CHECKPOINTED"
	CheckpointFailed  CheckpointState = "CHECKPOINT_FAILED"
	// Restore outcomes.
	RestoreSuccess    CheckpointState = "RESTORED"
	RestoreFailed     CheckpointState = "RESTORE_FAILED"
)

type CheckpointType

// CheckpointType is a string-typed label for the kind of checkpoint
// recorded in CedanaState: none, CRIU or PyTorch.
type CheckpointType string
const (
	CheckpointTypeNone    CheckpointType = "none"
	CheckpointTypeCRIU    CheckpointType = "criu"
	CheckpointTypePytorch CheckpointType = "pytorch"
)

type ClientInfo

// ClientInfo holds identifying and resource fields for a client: ID,
// hostname, platform, OS, uptime and remaining memory. Every field carries
// matching json and mapstructure keys.
// NOTE(review): Go convention spells the initialism as ID, not Id; renaming
// would break callers, so it is only flagged here.
type ClientInfo struct {
	Id              string `json:"id" mapstructure:"id"`
	Hostname        string `json:"hostname" mapstructure:"hostname"`
	Platform        string `json:"platform" mapstructure:"platform"`
	OS              string `json:"os" mapstructure:"os"`
	// NOTE(review): unit not shown here (presumably seconds) — confirm at the producer.
	Uptime          uint64 `json:"uptime" mapstructure:"uptime"`
	// NOTE(review): unit not shown here (presumably bytes) — confirm at the producer.
	RemainingMemory uint64 `json:"remaining_memory" mapstructure:"remaining_memory"`
}

type Commands

// Commands wraps a list of command strings that is decoded from the "run"
// key (see the mapstructure tag).
type Commands struct {
	C []string `mapstructure:"run"`
}

type GPU

// GPU describes one GPU entry: its name, manufacturer, count and memory
// size in MiB. JSON keys are PascalCase and identical to the field names.
// NOTE(review): presumably mirrors a provider API payload — confirm the source.
type GPU struct {
	Name         string `json:"Name"`
	Manufacturer string `json:"Manufacturer"`
	Count        int    `json:"Count"`
	// MemoryInfo is an anonymous nested struct holding the memory size in MiB.
	MemoryInfo   struct {
		SizeInMiB int `json:"SizeInMiB"`
	} `json:"MemoryInfo"`
}

type GpuInfo

// GpuInfo holds a list of GPU entries together with a total GPU memory
// figure in MiB. It is the return type of Instance.GetGPUs.
type GpuInfo struct {
	Gpus                []GPU `json:"Gpus"`
	TotalGpuMemoryInMiB int   `json:"TotalGpuMemoryInMiB"`
}

type Instance

// Instance is a GORM model (it embeds gorm.Model) describing a provider
// instance: identifiers, hardware specification, location, price, address
// and state.
//
// NOTE(review): the JSON keys mix snake_case ("allocated_id", "ip_addr")
// with PascalCase ("InstanceType", "Region"); changing them would alter the
// wire format, so the inconsistency is only flagged here.
type Instance struct {
	gorm.Model
	CedanaID         string  `json:"-"`            // ignore json unmarshal. Cedana ID used for NATS messages
	AllocatedID      string  `json:"allocated_id"` // id allocated by the provider, not to be used as a key
	Provider         string  `json:"provider"`
	InstanceType     string  `json:"InstanceType"`
	AcceleratorName  string  `json:"AcceleratorName"`
	AcceleratorCount int     `json:"AcceleratorCount"`
	VCPUs            float64 `json:"vCPUs"`
	MemoryGiB        float64 `json:"MemoryGiB"`
	// NOTE(review): stored as a string under the JSON key "GPU", while
	// GetGPUs returns a GpuInfo — presumably this holds serialized GPU
	// information; confirm against GetGPUs.
	GPUs             string  `json:"GPU"`
	Region           string  `json:"Region"`
	AvailabilityZone string  `json:"AvailabilityZone"`
	// PricingModel is documented as populating this field.
	Price            float64 `json:"Price"`
	IPAddress        string  `json:"ip_addr"`
	State            string  `json:"state"`
	Tag              string  `json:"-"` // tag instance as orch or client
}

func (*Instance) DeserializeSelf

func (i *Instance) DeserializeSelf(data []byte) (Instance, error)

func (*Instance) GetGPUs

func (i *Instance) GetGPUs() GpuInfo

func (*Instance) SerializeSelf

func (i *Instance) SerializeSelf() ([]byte, error)

type Job

// Job is a GORM model (it embeds gorm.Model) recording a job's ID, job file
// path, serialized instances, state and checkpoint bookkeeping.
type Job struct {
	gorm.Model
	JobID              string    `json:"job_id"`        // NOTE(review): serialized as "job_id"; an earlier comment said "ignore json unmarshal", which this tag does not do
	JobFilePath        string    `json:"job_file_path"` // absolute path of job file
	Instances          string    `json:"instances"`     // serialized instances. TODO: need to figure out associations!!
	State              JobState  `json:"state"`
	Checkpointed       bool      `json:"checkpointed"`
	LastCheckpointedAt time.Time `json:"last_checkpointed_at"` // latest checkpoint
	Bucket             string    `json:"bucket"`
}

Foreign keys are awkward in GORM, so for now a Job just stores the IDs of its instances (serialized in the Instances field).

func (*Job) AppendInstance

func (j *Job) AppendInstance(id string) error

These methods should ideally be called from the DB layer, which keeps things consistent.

func (*Job) GetInstanceIds

func (j *Job) GetInstanceIds() ([]SerializedInstance, error)

type JobFile

// JobFile is the decoded form of a job specification. Fields are populated
// through their mapstructure keys. InitJobFile returns a *JobFile for a
// given file path.
type JobFile struct {
	JobFilePath       string            `mapstructure:"job_file_path"`
	WorkDir           string            `mapstructure:"work_dir"` // TODO NR - should be s3 syncable?
	// Requested instance characteristics, decoded from "instance_specs".
	UserInstanceSpecs UserInstanceSpecs `mapstructure:"instance_specs"`
	// Command lists decoded from the "setup", "task" and "restored_task" keys.
	SetupCommands     Commands          `mapstructure:"setup"`
	Task              Commands          `mapstructure:"task"`
	RestoredTask      Commands          `mapstructure:"restored_task"`
}

JobFile is the job type used to run a job on an instance; it is user-defined and should be supplied as a YAML spec.

func InitJobFile

func InitJobFile(filepath string) (*JobFile, error)

type JobState

// JobState is a string-typed label for the state of a Job (see Job.State).
type JobState string
const (
	JobStatePending     JobState = "PENDING"
	JobStateRunning     JobState = "RUNNING"
	JobStateFailed      JobState = "FAILED"
	JobStateDone        JobState = "DONE"
	// Distinct from JobStateFailed: names a failure during setup.
	JobStateSetupFailed JobState = "SETUP_FAILED"
)

type MetaState

// MetaState pairs a ProviderEvent with a CheckpointReason. Both fields
// carry matching json and mapstructure keys.
type MetaState struct {
	Event            ProviderEvent    `json:"provider_event" mapstructure:"provider_event"`
	CheckpointReason CheckpointReason `json:"checkpoint_reason" mapstructure:"checkpoint_reason"`
}

type PricingModel

// PricingModel is implemented by types that can return a list of instances
// with their Price field populated.
type PricingModel interface {
	// GetPrices returns the instances known to the model.
	// NOTE(review): Go style omits the Get prefix on getters; renaming would
	// break implementers, so it is only flagged here.
	GetPrices() []Instance
}

PricingModel populates Instance.Price

type ProcessInfo

// ProcessInfo is a snapshot of a process: PID, hardware-accelerator
// attachment, open file descriptors, write-only file paths, network
// connections, memory use and run status. Every field carries matching json
// and mapstructure keys.
// NOTE(review): process.OpenFilesStat and net.ConnectionStat come from
// imports not visible here (presumably gopsutil) — confirm.
type ProcessInfo struct {
	PID                     int32                   `json:"pid" mapstructure:"pid"`
	AttachedToHardwareAccel bool                    `json:"attached_to_hardware_accel" mapstructure:"attached_to_hardware_accel"`
	OpenFds                 []process.OpenFilesStat `json:"open_fds" mapstructure:"open_fds"` // list of open FDs
	OpenWriteOnlyFilePaths  []string                `json:"open_write_only" mapstructure:"open_write_only"`
	OpenConnections         []net.ConnectionStat    `json:"open_connections" mapstructure:"open_connections"` // open network connections
	MemoryPercent           float32                 `json:"memory_percent" mapstructure:"memory_percent"`     // % of total RAM used
	IsRunning               bool                    `json:"is_running" mapstructure:"is_running"`
	Status                  string                  `json:"status" mapstructure:"status"`
}

type Provider

// Provider is the generic interface that each commodity provider (e.g. AWS,
// GCP) implements: naming, instance creation and destruction, description
// and status polling.
type Provider interface {
	// Name returns a string identifying the provider.
	// NOTE(review): presumably one of ProviderNames — confirm.
	Name() string
	// CreateInstance creates an instance from the given candidate.
	// NOTE(review): the original comment said this takes a list of "optimal"
	// instances, to circumvent capacity issues, but the signature accepts a
	// single candidate — confirm which is intended.
	CreateInstance(Candidate *Instance) (*Instance, error)
	// DestroyInstance takes the instance by value and reports failure via error.
	DestroyInstance(i Instance) error
	// Anywhere describeInstance is called, the entry in the db should be updated with the latest information
	DescribeInstance(Instances []*Instance, filter string) error
	// should encapsulate all events or state changes on the instance. Function that is used for state polling
	// regularly, so keep efficiency in mind when designing for a provider.
	GetInstanceStatus(i Instance) (*ProviderEvent, error)
}

Types for commodity providers (e.g. AWS, GCP, etc.). Provider is the generic interface that each commodity provider we broker between (AWS, GCP, etc.) implements.

type ProviderEvent

// ProviderEvent describes an event reported for an instance. It is the
// result type of Provider.GetInstanceStatus and is carried in MetaState.
type ProviderEvent struct {
	InstanceID string `json:"instance_id"`
	FaultCode  string `json:"fault_code"`
	// the below fields are derivatives of the above, we keep the fault code for any downstream processing
	MarkedForTermination bool  `json:"marked_for_termination"`
	// NOTE(review): unit/epoch not shown here (presumably a Unix timestamp) — confirm.
	TerminationTime      int64 `json:"termination_time"`
}

type SerializedInstance

// SerializedInstance is the reduced form of an instance that carries only
// its ID, serialized as "instance_id". Job.GetInstanceIds returns a slice
// of these.
type SerializedInstance struct {
	InstanceID string `json:"instance_id"`
}

Only the instance ID is serialized; the full instance can be found by a reverse lookup on that ID.

type ServerCommand

// ServerCommand carries a command string, a heartbeat flag and a
// CedanaState. Every field carries matching json and mapstructure keys.
// NOTE(review): presumably the message a server sends to a client —
// confirm direction against the messaging code.
type ServerCommand struct {
	Command     string      `json:"command" mapstructure:"command"`
	Heartbeat   bool        `json:"heartbeat" mapstructure:"heartbeat"`
	CedanaState CedanaState `json:"cedana_state" mapstructure:"cedana_state"`
}

type UserCommands

// UserCommands groups user-supplied command lists by hook. Each field is a
// Commands value decoded from the key named in its mapstructure tag:
// setup, post_setup, pre/post_checkpoint and pre/post_restore.
type UserCommands struct {
	SetupCommands     Commands `mapstructure:"setup"`
	PostSetupCommands Commands `mapstructure:"post_setup"`
	PreCheckpoint     Commands `mapstructure:"pre_checkpoint"`
	PostCheckpoint    Commands `mapstructure:"post_checkpoint"`
	PreRestore        Commands `mapstructure:"pre_restore"`
	PostRestore       Commands `mapstructure:"post_restore"`
}

Because of the key-value nature of YAML, a nested Commands struct is needed.

type UserInstanceSpecs

// UserInstanceSpecs holds the instance characteristics requested in a job
// file (see JobFile.UserInstanceSpecs). The mapstructure keys name the
// units: memory_gb, cpu_cores, vram_gb and max_price_usd_hour.
type UserInstanceSpecs struct {
	InstanceType string  `mapstructure:"instance_type"`
	// Memory in GB, per the "memory_gb" key.
	Memory       int     `mapstructure:"memory_gb"`
	// Number of CPU cores, per the "cpu_cores" key.
	VCPUs        int     `mapstructure:"cpu_cores"`
	// GPU memory in GB, per the "vram_gb" key.
	VRAM         int     `mapstructure:"vram_gb"`
	GPU          string  `mapstructure:"gpu"`
	// Maximum price in USD per hour, per the "max_price_usd_hour" key.
	MaxPrice     float64 `mapstructure:"max_price_usd_hour"`
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL