config

package
v0.4.1 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 10, 2025 License: Apache-2.0 Imports: 67 Imported by: 0

Documentation

Overview

Package config provides the gpud configuration data for the server.

Index

Constants

View Source
// DefaultAPIVersion is the API version string used by default.
const DefaultAPIVersion = "v1"

// DefaultGPUdPort is the port number used by default.
// NOTE(review): presumably the port the gpud server listens on — confirm against the server setup.
const DefaultGPUdPort = 15132

Variables

View Source
// DefaultRefreshPeriod is a refresh period of one minute.
// NOTE(review): presumably the default for Web.RefreshPeriod — confirm in DefaultConfig.
var DefaultRefreshPeriod = metav1.Duration{Duration: time.Minute}

// DefaultRetentionPeriod keeps the metrics only for the last 3 hours.
var DefaultRetentionPeriod = metav1.Duration{Duration: 3 * time.Hour}

// DefaultCompactPeriod is the interval between compact/vacuum runs.
//
// Compact/vacuum is disruptive to existing queries (including reads),
// but it is necessary to keep the state database from growing indefinitely.
//
// TODO: disabled for now (zero duration), until we have a better way to
// detect the performance issue.
var DefaultCompactPeriod = metav1.Duration{Duration: 0}

// DefaultRefreshComponentsInterval is a components refresh interval of one minute.
var DefaultRefreshComponentsInterval = metav1.Duration{Duration: time.Minute}
View Source
// DefaultNVIDIALibraries lists the NVIDIA shared library file names
// used by default.
// NOTE(review): presumably these are looked up under
// DefaultNVIDIALibrariesSearchDirs — confirm against the caller.
var DefaultNVIDIALibraries = []string{
	"libnvidia-ml.so",
	"libcuda.so",
}

// DefaultNVIDIALibrariesSearchDirs lists the directories used by default
// when searching for NVIDIA libraries. It covers the root directory plus the
// x86_64 and aarch64 library directories under both /usr/lib and /lib.
var DefaultNVIDIALibrariesSearchDirs = []string{
	"/",
	"/usr/lib64",
	"/usr/lib/x86_64-linux-gnu",
	"/usr/lib/aarch64-linux-gnu",
	"/usr/lib/x86_64-linux-gnu/nvidia/current",
	"/usr/lib/aarch64-linux-gnu/nvidia/current",
	"/lib64",
	"/lib/x86_64-linux-gnu",
	"/lib/aarch64-linux-gnu",
	"/lib/x86_64-linux-gnu/nvidia/current",
	"/lib/aarch64-linux-gnu/nvidia/current",
}
View Source
// ErrInvalidAutoUpdateExitCode is the sentinel error for an auto update exit
// code that is configured while auto update is not enabled.
// NOTE(review): presumably returned by (*Config).Validate — confirm.
var ErrInvalidAutoUpdateExitCode = errors.New("auto_update_exit_code is only valid when auto_update is enabled")

Functions

func DefaultConfigFile

func DefaultConfigFile() (string, error)

func DefaultContainerdComponent added in v0.0.4

func DefaultContainerdComponent(ctx context.Context) (any, bool)

func DefaultDmesgComponent added in v0.0.4

func DefaultDmesgComponent(ctx context.Context) (any, bool, error)

func DefaultDockerContainerComponent added in v0.0.4

func DefaultDockerContainerComponent(ctx context.Context, ignoreConnectionErrors bool) (any, bool)

func DefaultFifoFile

func DefaultFifoFile() (string, error)

func DefaultK8sPodComponent added in v0.0.4

func DefaultK8sPodComponent(ctx context.Context, ignoreConnectionErrors bool) (any, bool)

func DefaultStateFile

func DefaultStateFile() (string, error)

Types

type Config

// Config provides gpud configuration data for the server.
// Every field is encoded to JSON under the snake_case key given in its tag.
type Config struct {
	// API version of the configuration.
	// NOTE(review): presumably defaults to DefaultAPIVersion ("v1") — confirm in DefaultConfig.
	APIVersion string `json:"api_version"`

	// Basic server annotations (e.g., machine id, host name, etc.).
	// Omitted from the JSON output when empty.
	Annotations map[string]string `json:"annotations,omitempty"`

	// Address for the server to listen on.
	Address string `json:"address"`

	// Component specific configurations.
	// Omitted from the JSON output when empty.
	// NOTE(review): the map is presumably keyed by component name — confirm against the caller.
	Components map[string]any `json:"components,omitempty"`

	// State file that persists the latest status.
	// If empty, the states are not persisted to file.
	State string `json:"state"`

	// Amount of time to retain states/metrics for.
	// Once elapsed, old states/metrics are purged/compacted.
	RetentionPeriod metav1.Duration `json:"retention_period"`

	// Interval at which to compact the state database.
	// NOTE(review): DefaultCompactPeriod is zero and described as "disabled for now",
	// so a zero value presumably disables compaction — confirm.
	CompactPeriod metav1.Duration `json:"compact_period"`

	// Interval at which to refresh selected components.
	// Disables refresh if not set.
	RefreshComponentsInterval metav1.Duration `json:"refresh_components_interval"`

	// Set true to enable profiler.
	Pprof bool `json:"pprof"`

	// Configures the local web configuration.
	// Omitted from the JSON output when nil.
	Web *Web `json:"web,omitempty"`

	// Overwrites the tool binaries for testing.
	ToolOverwriteOptions ToolOverwriteOptions `json:"tool_overwrite_options"`

	// Set false to disable auto update.
	EnableAutoUpdate bool `json:"enable_auto_update"`

	// Exit code to exit with when auto updating.
	// Only valid when the auto update is enabled.
	// Set -1 to disable the auto update by exit code.
	AutoUpdateExitCode int `json:"auto_update_exit_code"`
}

Config provides the gpud configuration data for the server.

func DefaultConfig

func DefaultConfig(ctx context.Context, opts ...OpOption) (*Config, error)

func (*Config) Validate

func (config *Config) Validate() error

type Op added in v0.0.4

// Op holds the optional settings that OpOption functions modify.
//
// It carries the files and kernel modules to check, the flags for ignoring
// Docker and kubelet connection errors, and the embedded
// nvidia_common.ToolOverwrites.
// NOTE(review): each field is presumably set by the matching With* option
// (e.g. WithFilesToCheck, WithKernelModulesToCheck) — confirm in the option
// implementations.
type Op struct {
	FilesToCheck                  []string
	KernelModulesToCheck          []string
	DockerIgnoreConnectionErrors  bool
	KubeletIgnoreConnectionErrors bool

	nvidia_common.ToolOverwrites
}

func (*Op) ApplyOpts added in v0.0.5

func (op *Op) ApplyOpts(opts []OpOption) error

type OpOption added in v0.0.4

// OpOption is a function that modifies an Op.
// DefaultConfig and (*Op).ApplyOpts both accept a list of these.
type OpOption func(*Op)

func WithDockerIgnoreConnectionErrors added in v0.0.5

func WithDockerIgnoreConnectionErrors(b bool) OpOption

func WithFilesToCheck added in v0.0.4

func WithFilesToCheck(files ...string) OpOption

func WithIbstatCommand added in v0.4.0

func WithIbstatCommand(p string) OpOption

Specifies the ibstat binary path to overwrite the default path.

func WithKernelModulesToCheck added in v0.2.0

func WithKernelModulesToCheck(modules ...string) OpOption

func WithKubeletIgnoreConnectionErrors added in v0.0.5

func WithKubeletIgnoreConnectionErrors(b bool) OpOption

func WithNvidiaSMICommand added in v0.4.0

func WithNvidiaSMICommand(p string) OpOption

Specifies the nvidia-smi binary path to overwrite the default path.

func WithNvidiaSMIQueryCommand added in v0.4.0

func WithNvidiaSMIQueryCommand(p string) OpOption

type ToolOverwriteOptions added in v0.4.0

// ToolOverwriteOptions holds the command strings that overwrite the tool
// binaries (nvidia-smi, nvidia-smi query, and ibstat); Config documents this
// as being for testing.
// NOTE(review): an empty field presumably means the default binary path is
// used — confirm against the consumer.
type ToolOverwriteOptions struct {
	NvidiaSMICommand      string `json:"nvidia_smi_command"`
	NvidiaSMIQueryCommand string `json:"nvidia_smi_query_command"`
	IbstatCommand         string `json:"ibstat_command"`
}

type Web

// Web configures the local web interface.
type Web struct {
	// Enable the web interface.
	Enable bool `json:"enable"`

	// Enable the admin interface.
	Admin bool `json:"admin"`

	// RefreshPeriod is the time period to refresh metrics.
	// NOTE(review): DefaultRefreshPeriod (one minute) is presumably the default — confirm in DefaultConfig.
	RefreshPeriod metav1.Duration `json:"refresh_period"`

	// SincePeriod is the time period to start displaying metrics from.
	SincePeriod metav1.Duration `json:"since_period"`
}

Web holds the settings for the local web interface.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL