libcontainer

package
v0.0.0-...-bf3866f Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Aug 9, 2023 License: Apache-2.0 Imports: 55 Imported by: 0

README

libcontainer

Go Reference

Libcontainer provides a native Go implementation for creating containers with namespaces, cgroups, capabilities, and filesystem access controls. It allows you to manage the lifecycle of the container, performing additional operations after the container is created.

Container

A container is a self-contained execution environment that shares the kernel of the host system and which is (optionally) isolated from other containers in the system.

Using libcontainer

Because containers are spawned in a two-step process, you will need a binary that will be executed as the init process for the container. In libcontainer, we use the current binary (/proc/self/exe) as the init process and pass it the argument "init". We call this first step "bootstrap", so you always need an "init" function as the entry point of "bootstrap".

In addition to the Go init function, the early-stage bootstrap is handled by importing nsenter.

For details on how runc implements such "init", see init.go and libcontainer/init_linux.go.

Then to create a container you first have to create a configuration struct describing how the container is to be created. A sample would look similar to this:

defaultMountFlags := unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
var devices []*devices.Rule
for _, device := range specconv.AllowedDevices {
	devices = append(devices, &device.Rule)
}
config := &configs.Config{
	Rootfs: "/your/path/to/rootfs",
	Capabilities: &configs.Capabilities{
		Bounding: []string{
			"CAP_CHOWN",
			"CAP_DAC_OVERRIDE",
			"CAP_FSETID",
			"CAP_FOWNER",
			"CAP_MKNOD",
			"CAP_NET_RAW",
			"CAP_SETGID",
			"CAP_SETUID",
			"CAP_SETFCAP",
			"CAP_SETPCAP",
			"CAP_NET_BIND_SERVICE",
			"CAP_SYS_CHROOT",
			"CAP_KILL",
			"CAP_AUDIT_WRITE",
		},
		Effective: []string{
			"CAP_CHOWN",
			"CAP_DAC_OVERRIDE",
			"CAP_FSETID",
			"CAP_FOWNER",
			"CAP_MKNOD",
			"CAP_NET_RAW",
			"CAP_SETGID",
			"CAP_SETUID",
			"CAP_SETFCAP",
			"CAP_SETPCAP",
			"CAP_NET_BIND_SERVICE",
			"CAP_SYS_CHROOT",
			"CAP_KILL",
			"CAP_AUDIT_WRITE",
		},
		Permitted: []string{
			"CAP_CHOWN",
			"CAP_DAC_OVERRIDE",
			"CAP_FSETID",
			"CAP_FOWNER",
			"CAP_MKNOD",
			"CAP_NET_RAW",
			"CAP_SETGID",
			"CAP_SETUID",
			"CAP_SETFCAP",
			"CAP_SETPCAP",
			"CAP_NET_BIND_SERVICE",
			"CAP_SYS_CHROOT",
			"CAP_KILL",
			"CAP_AUDIT_WRITE",
		},
		Ambient: []string{
			"CAP_CHOWN",
			"CAP_DAC_OVERRIDE",
			"CAP_FSETID",
			"CAP_FOWNER",
			"CAP_MKNOD",
			"CAP_NET_RAW",
			"CAP_SETGID",
			"CAP_SETUID",
			"CAP_SETFCAP",
			"CAP_SETPCAP",
			"CAP_NET_BIND_SERVICE",
			"CAP_SYS_CHROOT",
			"CAP_KILL",
			"CAP_AUDIT_WRITE",
		},
	},
	Namespaces: configs.Namespaces([]configs.Namespace{
		{Type: configs.NEWNS},
		{Type: configs.NEWUTS},
		{Type: configs.NEWIPC},
		{Type: configs.NEWPID},
		{Type: configs.NEWUSER},
		{Type: configs.NEWNET},
		{Type: configs.NEWCGROUP},
	}),
	Cgroups: &configs.Cgroup{
		Name:   "test-container",
		Parent: "system",
		Resources: &configs.Resources{
			MemorySwappiness: nil,
			Devices:          devices,
		},
	},
	MaskPaths: []string{
		"/proc/kcore",
		"/sys/firmware",
	},
	ReadonlyPaths: []string{
		"/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus",
	},
	Devices:  specconv.AllowedDevices,
	Hostname: "testing",
	Mounts: []*configs.Mount{
		{
			Source:      "proc",
			Destination: "/proc",
			Device:      "proc",
			Flags:       defaultMountFlags,
		},
		{
			Source:      "tmpfs",
			Destination: "/dev",
			Device:      "tmpfs",
			Flags:       unix.MS_NOSUID | unix.MS_STRICTATIME,
			Data:        "mode=755",
		},
		{
			Source:      "devpts",
			Destination: "/dev/pts",
			Device:      "devpts",
			Flags:       unix.MS_NOSUID | unix.MS_NOEXEC,
			Data:        "newinstance,ptmxmode=0666,mode=0620,gid=5",
		},
		{
			Device:      "tmpfs",
			Source:      "shm",
			Destination: "/dev/shm",
			Data:        "mode=1777,size=65536k",
			Flags:       defaultMountFlags,
		},
		{
			Source:      "mqueue",
			Destination: "/dev/mqueue",
			Device:      "mqueue",
			Flags:       defaultMountFlags,
		},
		{
			Source:      "sysfs",
			Destination: "/sys",
			Device:      "sysfs",
			Flags:       defaultMountFlags | unix.MS_RDONLY,
		},
	},
	UIDMappings: []configs.IDMap{
		{
			ContainerID: 0,
			HostID: 1000,
			Size: 65536,
		},
	},
	GIDMappings: []configs.IDMap{
		{
			ContainerID: 0,
			HostID: 1000,
			Size: 65536,
		},
	},
	Networks: []*configs.Network{
		{
			Type:    "loopback",
			Address: "127.0.0.1/0",
			Gateway: "localhost",
		},
	},
	Rlimits: []configs.Rlimit{
		{
			Type: unix.RLIMIT_NOFILE,
			Hard: uint64(1025),
			Soft: uint64(1025),
		},
	},
}

Once you have the configuration populated you can create a container with a specified ID under a specified state directory:

// Create the container with ID "container-id" inside the state directory
// "/run/containers", using the configuration built above.
container, err := libcontainer.Create("/run/containers", "container-id", config)
if err != nil {
	logrus.Fatal(err)
	return
}

To spawn bash as the initial process inside the container and have the process's pid returned in order to wait, signal, or kill the process:

// Describe the container's first process: bash, wired to the caller's
// standard streams.
process := &libcontainer.Process{
	Args:   []string{"/bin/bash"},
	Env:    []string{"PATH=/bin"},
	User:   "daemon",
	Stdin:  os.Stdin,
	Stdout: os.Stdout,
	Stderr: os.Stderr,
	Init:   true,
}

// Start the process inside the container. err is scoped to the if statement
// so that the snippet compiles whether or not an err variable already exists
// in the enclosing scope.
if err := container.Run(process); err != nil {
	container.Destroy()
	logrus.Fatal(err)
	return
}

// wait for the process to finish.
if _, err := process.Wait(); err != nil {
	logrus.Fatal(err)
}

// destroy the container.
container.Destroy()

Additional ways to interact with a running container are:

// return all the pids for all processes running inside the container.
processes, err := container.Processes()

// get detailed cpu, memory, io, and network statistics for the container and
// its processes.
stats, err := container.Stats()

// pause all processes inside the container.
container.Pause()

// resume all paused processes.
container.Resume()

// send signal to container's init process.
container.Signal(signal)

// update container resource constraints. Set takes a configs.Config value,
// so the *configs.Config built earlier is dereferenced.
container.Set(*config)

// get current status of the container.
status, err := container.Status()

// get current container's state information.
state, err := container.State()
Checkpoint & Restore

libcontainer now integrates CRIU for checkpointing and restoring containers. This lets you save the state of a process running inside a container to disk, and then restore that state into a new process, on the same machine or on another machine.

criu version 1.5.2 or higher is required to use checkpoint and restore. If you don't already have criu installed, you can build it from source, following the online instructions. criu is also installed in the docker image generated when building libcontainer with docker.

Code and documentation copyright 2014 Docker, inc. The code and documentation are released under the Apache 2.0 license. The documentation is also released under Creative Commons Attribution 4.0 International License. You may obtain a copy of the license, titled CC-BY-4.0, at http://creativecommons.org/licenses/by/4.0/.

Documentation

Overview

Package libcontainer provides a native Go implementation for creating containers with namespaces, cgroups, capabilities, and filesystem access controls. It allows you to manage the lifecycle of the container, performing additional operations after the container is created.

Index

Constants

View Source
// Known message types sent to the bootstrap program. The numeric values were
// chosen at random so that they do not conflict with known netlink types.
const (
	InitMsg          uint16 = 62000
	CloneFlagsAttr   uint16 = 27281
	NsPathsAttr      uint16 = 27282
	UidmapAttr       uint16 = 27283
	GidmapAttr       uint16 = 27284
	SetgroupAttr     uint16 = 27285
	OomScoreAdjAttr  uint16 = 27286
	RootlessEUIDAttr uint16 = 27287
	UidmapPathAttr   uint16 = 27288
	GidmapPathAttr   uint16 = 27289
	MountSourcesAttr uint16 = 27290
	IdmapSourcesAttr uint16 = 27291
)

List of known message types we want to send to the bootstrap program. The numbers are randomly chosen so as not to conflict with known netlink types.

Variables

View Source
// Error values exported by the package. Each is created with errors.New and
// carries the fixed message shown.
var (
	ErrExist      = errors.New("container with given ID already exists")
	ErrInvalidID  = errors.New("invalid container ID format")
	ErrNotExist   = errors.New("container does not exist")
	ErrPaused     = errors.New("container paused")
	ErrRunning    = errors.New("container still running")
	ErrNotRunning = errors.New("container not running")
	ErrNotPaused  = errors.New("container not paused")
)

Functions

func Init

func Init()

Init is part of "runc init" implementation.

Types

type BaseState

// BaseState represents the platform agnostic pieces relating to a running
// container's state.
type BaseState struct {
	// ID is the container ID.
	ID string `json:"id"`

	// InitProcessPid is the init process id in the parent namespace.
	InitProcessPid int `json:"init_process_pid"`

	// InitProcessStartTime is the init process start time in clock cycles since boot time.
	InitProcessStartTime uint64 `json:"init_process_start"`

	// Created is the creation time of the container in UTC.
	Created time.Time `json:"created"`

	// Config is the container's configuration.
	Config configs.Config `json:"config"`
}

BaseState represents the platform agnostic pieces relating to a running container's state

type Boolmsg

// Boolmsg pairs a uint16 message type with a boolean value.
// NOTE(review): Type is presumably one of the package's *Attr constants, and
// the wire layout presumably mirrors Bytemsg/Int32msg — confirm.
type Boolmsg struct {
	Type  uint16
	Value bool
}

func (*Boolmsg) Len

func (msg *Boolmsg) Len() int

func (*Boolmsg) Serialize

func (msg *Boolmsg) Serialize() []byte

type Bytemsg

// Bytemsg pairs a uint16 message type with a byte-slice value. It has the
// following representation:
//
//	| nlattr len | nlattr type |
//	| value              | pad |
type Bytemsg struct {
	Type  uint16
	Value []byte
}

Bytemsg has the following representation:

| nlattr len | nlattr type |
| value              | pad |

func (*Bytemsg) Len

func (msg *Bytemsg) Len() int

func (*Bytemsg) Serialize

func (msg *Bytemsg) Serialize() []byte

type Container

// Container is a libcontainer container object.
type Container struct {
	// contains filtered or unexported fields
}

Container is a libcontainer container object.

func Create

func Create(root, id string, config *configs.Config) (*Container, error)

Create creates a new container with the given id inside a given state directory (root), and returns a Container object.

The root is a state directory which many containers can share. It can be used later to get the list of containers, or to get information about a particular container (see Load).

The id must not be empty and must consist of only the following characters: ASCII letters, digits, underscore, plus, minus, period. The id must be unique and non-existent for the given root path.

func Load

func Load(root, id string) (*Container, error)

Load takes a path to the state directory (root) and an id of an existing container, and returns a Container object reconstructed from the saved state. This presents a read only view of the container.

func (*Container) Checkpoint

func (c *Container) Checkpoint(criuOpts *CriuOpts) error

func (*Container) Config

func (c *Container) Config() configs.Config

Config returns the container's configuration

func (*Container) Destroy

func (c *Container) Destroy() error

Destroy destroys the container, if it's in a valid state.

Any event registrations are removed before the container is destroyed. No error is returned if the container is already destroyed.

Running containers must first be stopped using Signal. Paused containers must first be resumed using Resume.

func (*Container) Exec

func (c *Container) Exec() error

Exec signals the container to exec the user's process at the end of the init.

func (*Container) ID

func (c *Container) ID() string

ID returns the container's unique ID

func (*Container) NotifyMemoryPressure

func (c *Container) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error)

NotifyMemoryPressure returns a read-only channel signaling when the container reaches a given pressure level.

func (*Container) NotifyOOM

func (c *Container) NotifyOOM() (<-chan struct{}, error)

NotifyOOM returns a read-only channel signaling when the container receives an OOM notification.

func (*Container) OCIState

func (c *Container) OCIState() (*specs.State, error)

OCIState returns the current container's state information.

func (*Container) Pause

func (c *Container) Pause() error

Pause pauses the container, if its state is RUNNING or CREATED, changing its state to PAUSED. If the state is already PAUSED, does nothing.

func (*Container) Processes

func (c *Container) Processes() ([]int, error)

Processes returns the PIDs inside this container. The PIDs are in the namespace of the calling process.

Some of the returned PIDs may no longer refer to processes in the container, unless the container state is PAUSED in which case every PID in the slice is valid.

func (*Container) Restore

func (c *Container) Restore(process *Process, criuOpts *CriuOpts) error

Restore restores the checkpointed container to a running state using the criu(8) utility.

func (*Container) Resume

func (c *Container) Resume() error

Resume resumes the execution of any user processes in the container before setting the container state to RUNNING. This is only performed if the current state is PAUSED. If the Container state is RUNNING, does nothing.

func (*Container) Run

func (c *Container) Run(process *Process) error

Run immediately starts the process inside the container. Returns an error if the process fails to start. It does not block waiting for the exec fifo after start returns but opens the fifo after start returns.

func (*Container) Set

func (c *Container) Set(config configs.Config) error

Set resources of container as configured. Can be used to change resources when the container is running.

func (*Container) Signal

func (c *Container) Signal(s os.Signal) error

Signal sends a specified signal to container's init.

When s is SIGKILL and the container does not have its own PID namespace, all the container's processes are killed. In this scenario, the libcontainer user may be required to implement a proper child reaper.

func (*Container) Start

func (c *Container) Start(process *Process) error

Start starts a process inside the container. Returns error if process fails to start. You can track process lifecycle with passed Process structure.

func (*Container) State

func (c *Container) State() (*State, error)

State returns the current container's state information.

func (*Container) Stats

func (c *Container) Stats() (*Stats, error)

Stats returns statistics for the container.

func (*Container) Status

func (c *Container) Status() (Status, error)

Status returns the current status of the container.

type CriuOpts

// CriuOpts holds the options accepted by (*Container).Checkpoint and
// (*Container).Restore.
type CriuOpts struct {
	ImagesDirectory         string             // directory for storing image files
	WorkDirectory           string             // directory to cd and write logs/pidfiles/stats to
	ParentImage             string             // directory for storing parent image files in pre-dump and dump
	LeaveRunning            bool               // leave container in running state after checkpoint
	TcpEstablished          bool               // checkpoint/restore established TCP connections
	ExternalUnixConnections bool               // allow external unix connections
	ShellJob                bool               // allow to dump and restore shell jobs
	FileLocks               bool               // handle file locks, for safety
	PreDump                 bool               // call criu predump to perform iterative checkpoint
	PageServer              CriuPageServerInfo // allow to dump to criu page server
	VethPairs               []VethPairName     // pass the veth to criu when restore
	ManageCgroupsMode       criu.CriuCgMode    // dump or restore cgroup mode
	EmptyNs                 uint32             // don't c/r properties for namespace from this mask
	AutoDedup               bool               // auto deduplication for incremental dumps
	LazyPages               bool               // restore memory pages lazily using userfaultfd
	StatusFd                int                // fd for feedback when lazy server is ready
	LsmProfile              string             // LSM profile used to restore the container
	LsmMountContext         string             // LSM mount context value to use during restore
}

type CriuPageServerInfo

// CriuPageServerInfo is the address and port of a CRIU page server; it is the
// type of the CriuOpts.PageServer field.
type CriuPageServerInfo struct {
	Address string // IP address of CRIU page server
	Port    int32  // port number of CRIU page server
}

type IO

// IO holds the process's STDIO. It is the value returned by
// (*Process).InitializeIO, which creates pipes for the process's stdio and
// returns the opposite side of each: Stdin is written to, while Stdout and
// Stderr are read from.
type IO struct {
	Stdin  io.WriteCloser
	Stdout io.ReadCloser
	Stderr io.ReadCloser
}

IO holds the process's STDIO

type Int32msg

// Int32msg pairs a uint16 message type with a 32-bit value (declared as
// uint32 despite the type's name). It has the following representation:
//
//	| nlattr len | nlattr type |
//	| uint32 value             |
type Int32msg struct {
	Type  uint16
	Value uint32
}

func (*Int32msg) Len

func (msg *Int32msg) Len() int

func (*Int32msg) Serialize

func (msg *Int32msg) Serialize() []byte

Serialize serializes the message. Int32msg has the following representation:

| nlattr len | nlattr type |
| uint32 value             |

type PressureLevel

// PressureLevel identifies a memory pressure level; it is the argument
// accepted by (*Container).NotifyMemoryPressure.
type PressureLevel uint

// Pressure levels, numbered from zero in the order listed.
const (
	LowPressure PressureLevel = iota
	MediumPressure
	CriticalPressure
)

type Process

// Process specifies the configuration and IO for a process inside a container.
type Process struct {
	// The command to be run followed by any arguments.
	Args []string

	// Env specifies the environment variables for the process.
	Env []string

	// User will set the uid and gid of the executing process running inside the container
	// local to the container's user and group configuration.
	User string

	// AdditionalGroups specifies the gids that should be added to supplementary groups
	// in addition to those that the user belongs to.
	AdditionalGroups []string

	// Cwd will change the process's current working directory inside the container's rootfs.
	Cwd string

	// Stdin is a pointer to a reader which provides the standard input stream.
	Stdin io.Reader

	// Stdout is a pointer to a writer which receives the standard output stream.
	Stdout io.Writer

	// Stderr is a pointer to a writer which receives the standard error stream.
	Stderr io.Writer

	// ExtraFiles specifies additional open files to be inherited by the container.
	ExtraFiles []*os.File

	// Initial sizings for the console.
	ConsoleWidth  uint16
	ConsoleHeight uint16

	// Capabilities specify the capabilities to keep when executing the process inside the container.
	// All capabilities not specified will be dropped from the process's capability mask.
	Capabilities *configs.Capabilities

	// AppArmorProfile specifies the profile to apply to the process and is
	// changed at the time the process is execed.
	AppArmorProfile string

	// Label specifies the label to apply to the process.  It is commonly used by selinux.
	Label string

	// NoNewPrivileges controls whether processes can gain additional privileges.
	NoNewPrivileges *bool

	// Rlimits specifies the resource limits, such as max open files, to set in the container.
	// If Rlimits are not set, the container will inherit rlimits from the parent process.
	Rlimits []configs.Rlimit

	// ConsoleSocket provides the masterfd console.
	ConsoleSocket *os.File

	// Init specifies whether the process is the first process in the container.
	Init bool

	// LogLevel is a string containing a numeric representation of the current
	// log level (i.e. "4", but never "info"). It is passed on to runc init as
	// _LIBCONTAINER_LOGLEVEL environment variable.
	LogLevel string

	// SubCgroupPaths specifies sub-cgroups to run the process in.
	// Map keys are controller names, map values are paths (relative to
	// container's top-level cgroup).
	//
	// If empty, the default top-level container's cgroup is used.
	//
	// For cgroup v2, the only key allowed is "".
	SubCgroupPaths map[string]string
	// contains filtered or unexported fields
}

Process specifies the configuration and IO for a process inside a container.

func (*Process) InitializeIO

func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error)

InitializeIO creates pipes for use with the process's stdio and returns the opposite side for each. Do not use this if you want to have a pseudoterminal set up for you by libcontainer (TODO: fix that too). TODO: This is mostly unnecessary, and should be handled by clients.

func (Process) Pid

func (p Process) Pid() (int, error)

Pid returns the process ID

func (Process) Signal

func (p Process) Signal(sig os.Signal) error

Signal sends a signal to the Process.

func (Process) Wait

func (p Process) Wait() (*os.ProcessState, error)

Wait waits for the process to exit. Wait releases any resources associated with the Process

type State

// State represents a running container's state. It embeds BaseState and adds
// the fields below.
type State struct {
	BaseState

	// Specified if the container was started under the rootless mode.
	// Set to true if BaseState.Config.RootlessEUID && BaseState.Config.RootlessCgroups
	Rootless bool `json:"rootless"`

	// Paths to all the container's cgroups, as returned by (*cgroups.Manager).GetPaths
	//
	// For cgroup v1, a key is cgroup subsystem name, and the value is the path
	// to the cgroup for this subsystem.
	//
	// For cgroup v2 unified hierarchy, a key is "", and the value is the unified path.
	CgroupPaths map[string]string `json:"cgroup_paths"`

	// NamespacePaths are filepaths to the container's namespaces. Key is the namespace type
	// with the value as the path.
	NamespacePaths map[configs.NamespaceType]string `json:"namespace_paths"`

	// Container's standard descriptors (std{in,out,err}), needed for checkpoint and restore
	ExternalDescriptors []string `json:"external_descriptors,omitempty"`

	// Intel RDT "resource control" filesystem path
	IntelRdtPath string `json:"intel_rdt_path"`
}

State represents a running container's state

type Stats

// Stats is the statistics value returned by (*Container).Stats. It groups
// network interface entries with cgroup and Intel RDT statistics.
type Stats struct {
	Interfaces    []*types.NetworkInterface
	CgroupStats   *cgroups.Stats
	IntelRdtStats *intelrdt.Stats
}

type Status

// Status is the status of a container.
type Status int

Status is the status of a container.

// Possible Status values, numbered from zero in the order listed.
const (
	// Created is the status that denotes the container exists but has not been run yet.
	Created Status = iota
	// Running is the status that denotes the container exists and is running.
	Running
	// Paused is the status that denotes the container exists, but all its processes are paused.
	Paused
	// Stopped is the status that denotes the container does not have a created or running process.
	Stopped
)

func (Status) String

func (s Status) String() string

type VethPairName

// VethPairName holds a container interface name and a host interface name.
// CriuOpts.VethPairs carries a list of these to pass to criu on restore.
type VethPairName struct {
	ContainerInterfaceName string
	HostInterfaceName      string
}

Directories

Path Synopsis
devices
Implements creation of eBPF device filter program.
Implements creation of eBPF device filter program.
fs
fs2
integration is used for integration testing of libcontainer
integration is used for integration testing of libcontainer
Package specconv implements conversion of specifications to libcontainer configurations
Package specconv implements conversion of specifications to libcontainer configurations

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL