Documentation ¶
Overview ¶
Package kernel provides an emulation of the Linux kernel.
See README.md for a detailed overview.
Lock order (outermost locks must be taken first):
Kernel.extMu
  TaskSet.mu
    SignalHandlers.mu
      Task.mu
Locking SignalHandlers.mu in multiple SignalHandlers requires locking TaskSet.mu exclusively first. Locking Task.mu in multiple Tasks at the same time requires locking all of their signal mutexes first.
Index ¶
- Constants
- Variables
- func ContextCanTrace(ctx context.Context, t *Task, attach bool) bool
- func RegisterSyscallTable(s *SyscallTable)
- type AbstractSocketNamespace
- type Auxmap
- type CloneOptions
- type CreateProcessArgs
- type ExitStatus
- type FDFlags
- type FDMap
- func (f *FDMap) DecRef()
- func (f *FDMap) Fork() *FDMap
- func (f *FDMap) GetDescriptor(fd kdefs.FD) (*fs.File, FDFlags)
- func (f *FDMap) GetFDs() FDs
- func (f *FDMap) GetFile(fd kdefs.FD) *fs.File
- func (f *FDMap) GetRefs() []*fs.File
- func (f *FDMap) ID() uint64
- func (f *FDMap) NewFDAt(fd kdefs.FD, file *fs.File, flags FDFlags, limitSet *limits.LimitSet) error
- func (f *FDMap) NewFDFrom(fd kdefs.FD, file *fs.File, flags FDFlags, limitSet *limits.LimitSet) (kdefs.FD, error)
- func (f *FDMap) Remove(fd kdefs.FD) (*fs.File, bool)
- func (f *FDMap) RemoveIf(cond func(*fs.File, FDFlags) bool)
- func (f *FDMap) SetFlags(fd kdefs.FD, flags FDFlags)
- func (f *FDMap) Size() int
- func (f *FDMap) String() string
- type FDs
- type FSContext
- func (f *FSContext) DecRef()
- func (f *FSContext) Fork() *FSContext
- func (f *FSContext) RootDirectory() *fs.Dirent
- func (f *FSContext) SetRootDirectory(d *fs.Dirent)
- func (f *FSContext) SetWorkingDirectory(d *fs.Dirent)
- func (f *FSContext) SwapUmask(mask uint) uint
- func (f *FSContext) Umask() uint
- func (f *FSContext) WorkingDirectory() *fs.Dirent
- type IPCNamespace
- type InitKernelArgs
- type Kernel
- func (k *Kernel) ApplicationCores() uint
- func (k *Kernel) CPUClockNow() uint64
- func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, error)
- func (k *Kernel) Destroy()
- func (k *Kernel) ExitError() error
- func (k *Kernel) FeatureSet() *cpuid.FeatureSet
- func (k *Kernel) GenerateInotifyCookie() uint32
- func (k *Kernel) GlobalInit() *ThreadGroup
- func (k *Kernel) Init(args InitKernelArgs) error
- func (k *Kernel) Kill(es ExitStatus)
- func (k *Kernel) LoadFrom(r io.Reader, p platform.Platform, net inet.Stack) error
- func (k *Kernel) LoadTaskImage(ctx context.Context, mounts *fs.MountNamespace, root, wd *fs.Dirent, ...) (*TaskContext, error)
- func (k *Kernel) MonotonicClock() ktime.Clock
- func (k *Kernel) NetlinkPorts() *port.Manager
- func (k *Kernel) NetworkStack() inet.Stack
- func (k *Kernel) NewFDMap() *FDMap
- func (k *Kernel) NowNanoseconds() int64
- func (k *Kernel) Pause()
- func (k *Kernel) RealtimeClock() ktime.Clock
- func (k *Kernel) RootIPCNamespace() *IPCNamespace
- func (k *Kernel) RootMountNamespace() *fs.MountNamespace
- func (k *Kernel) RootUTSNamespace() *UTSNamespace
- func (k *Kernel) RootUserNamespace() *auth.UserNamespace
- func (k *Kernel) SaveTo(w io.Writer) error
- func (k *Kernel) SendExternalSignal(info *arch.SignalInfo, context string) bool
- func (k *Kernel) SetExitError(err error)
- func (k *Kernel) SetRootMountNamespace(mounts *fs.MountNamespace)
- func (k *Kernel) Start() error
- func (k *Kernel) SupervisorContext() context.Context
- func (k *Kernel) Syslog() *syslog
- func (k *Kernel) TaskSet() *TaskSet
- func (k *Kernel) Timekeeper() *Timekeeper
- func (k *Kernel) UniqueID() uint64
- func (k *Kernel) Unpause()
- func (k *Kernel) WaitExited()
- type MissingFn
- type PIDNamespace
- func (pidns *PIDNamespace) IDOfProcessGroup(pg *ProcessGroup) ProcessGroupID
- func (pidns *PIDNamespace) IDOfSession(s *Session) SessionID
- func (ns *PIDNamespace) IDOfTask(t *Task) ThreadID
- func (ns *PIDNamespace) IDOfThreadGroup(tg *ThreadGroup) ThreadID
- func (ns *PIDNamespace) NewChild(userns *auth.UserNamespace) *PIDNamespace
- func (pidns *PIDNamespace) ProcessGroupWithID(id ProcessGroupID) *ProcessGroup
- func (pidns *PIDNamespace) SessionWithID(id SessionID) *Session
- func (ns *PIDNamespace) TaskWithID(tid ThreadID) *Task
- func (ns *PIDNamespace) Tasks() []*Task
- func (ns *PIDNamespace) ThreadGroupWithID(tid ThreadID) *ThreadGroup
- func (ns *PIDNamespace) ThreadGroups() []*ThreadGroup
- func (ns *PIDNamespace) UserNamespace() *auth.UserNamespace
- type ProcessGroup
- type ProcessGroupID
- type RSEQCriticalRegion
- type Session
- type SessionID
- type SharingOptions
- type SignalAction
- type SignalHandlers
- type Stracer
- type SyscallControl
- type SyscallFlagsTable
- type SyscallFn
- type SyscallRestartBlock
- type SyscallRestartErrno
- type SyscallTable
- type Task
- func (t *Task) AbstractSockets() *AbstractSocketNamespace
- func (t *Task) Activate()
- func (t *Task) AppendSyscallFilter(p bpf.Program) error
- func (t *Task) Arch() arch.Context
- func (t *Task) AsyncContext() context.Context
- func (t *Task) BeginExternalStop()
- func (t *Task) Block(C chan struct{}) error
- func (t *Task) BlockWithDeadline(C chan struct{}, haveDeadline bool, deadline ktime.Time) error
- func (t *Task) BlockWithTimeout(C chan struct{}, haveTimeout bool, timeout time.Duration) (time.Duration, error)
- func (t *Task) BlockWithTimer(C chan struct{}, tchan <-chan struct{}) error
- func (t *Task) CPU() int32
- func (t *Task) CPUClock() ktime.Clock
- func (t *Task) CPUMask() sched.CPUSet
- func (t *Task) CPUStats() usage.CPUStats
- func (t *Task) CanTrace(target *Task, attach bool) bool
- func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error)
- func (t *Task) CopyIn(addr usermem.Addr, dst interface{}) (int, error)
- func (t *Task) CopyInBytes(addr usermem.Addr, dst []byte) (int, error)
- func (t *Task) CopyInIovecs(addr usermem.Addr, numIovecs int) (usermem.AddrRangeSeq, error)
- func (t *Task) CopyInSignalAct(addr usermem.Addr) (arch.SignalAct, error)
- func (t *Task) CopyInSignalStack(addr usermem.Addr) (arch.SignalStack, error)
- func (t *Task) CopyInString(addr usermem.Addr, maxlen int) (string, error)
- func (t *Task) CopyInVector(addr usermem.Addr, maxElemSize, maxTotalSize int) ([]string, error)
- func (t *Task) CopyOut(addr usermem.Addr, src interface{}) (int, error)
- func (t *Task) CopyOutBytes(addr usermem.Addr, src []byte) (int, error)
- func (t *Task) CopyOutIovecs(addr usermem.Addr, src usermem.AddrRangeSeq) error
- func (t *Task) CopyOutSignalAct(addr usermem.Addr, s *arch.SignalAct) error
- func (t *Task) CopyOutSignalStack(addr usermem.Addr, s *arch.SignalStack) error
- func (t *Task) CopyScratchBuffer(size int) []byte
- func (t *Task) Credentials() auth.Credentials
- func (t *Task) Deactivate()
- func (t *Task) DebugDumpState()
- func (t *Task) Debugf(fmt string, v ...interface{})
- func (t *Task) DropBoundingCapability(cp linux.Capability) error
- func (t *Task) EndExternalStop()
- func (t *Task) Execve(newTC *TaskContext) (*SyscallControl, error)
- func (t *Task) ExitState() TaskExitState
- func (t *Task) ExitStatus() ExitStatus
- func (t *Task) ExtractErrno(err error, sysno int) int
- func (t *Task) ExtractTask() *Task
- func (t *Task) FDMap() *FDMap
- func (t *Task) FSContext() *FSContext
- func (t *Task) Futex() *futex.Manager
- func (t *Task) FutexWaiter() *futex.Waiter
- func (t *Task) HasCapability(cp linux.Capability) bool
- func (t *Task) HasCapabilityIn(cp linux.Capability, ns *auth.UserNamespace) bool
- func (t *Task) IOUsage() *usage.IO
- func (t *Task) IPCNamespace() *IPCNamespace
- func (t *Task) Infof(fmt string, v ...interface{})
- func (t *Task) IovecsIOSequence(addr usermem.Addr, iovcnt int, opts usermem.IOOpts) (usermem.IOSequence, error)
- func (t *Task) IsChrooted() bool
- func (t *Task) IsLogging(level log.Level) bool
- func (t *Task) IsNetworkNamespaced() bool
- func (t *Task) Kernel() *Kernel
- func (t *Task) Limits() *limits.LimitSet
- func (t *Task) MaxRSS(which int32) uint64
- func (t *Task) MemoryManager() *mm.MemoryManager
- func (t *Task) MountNamespace() *fs.MountNamespace
- func (t *Task) Name() string
- func (t *Task) NetworkContext() inet.Stack
- func (t *Task) Niceness() int
- func (t *Task) NumaPolicy() (policy int32, nodeMask uint32)
- func (t *Task) OnSignalStack(s arch.SignalStack) bool
- func (t *Task) PIDNamespace() *PIDNamespace
- func (t *Task) Parent() *Task
- func (t *Task) ParentDeathSignal() linux.Signal
- func (t *Task) PendingSignals() linux.SignalSet
- func (t *Task) PrepareExit(es ExitStatus)
- func (t *Task) PrepareGroupExit(es ExitStatus)
- func (t *Task) Priority() int
- func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error
- func (t *Task) RSEQAvailable() bool
- func (t *Task) RSEQCPUAddr() usermem.Addr
- func (t *Task) RSEQCriticalRegion() RSEQCriticalRegion
- func (t *Task) SeccompMode() int
- func (t *Task) SendGroupSignal(info *arch.SignalInfo) error
- func (t *Task) SendSignal(info *arch.SignalInfo) error
- func (t *Task) SetCPUMask(mask sched.CPUSet) error
- func (t *Task) SetCapabilitySets(permitted, inheritable, effective auth.CapabilitySet) error
- func (t *Task) SetClearTID(addr usermem.Addr)
- func (t *Task) SetExtraGIDs(gids []auth.GID) error
- func (t *Task) SetGID(gid auth.GID) error
- func (t *Task) SetKeepCaps(k bool)
- func (t *Task) SetName(name string)
- func (t *Task) SetNiceness(n int)
- func (t *Task) SetNumaPolicy(policy int32, nodeMask uint32)
- func (t *Task) SetParentDeathSignal(sig linux.Signal)
- func (t *Task) SetREGID(r, e auth.GID) error
- func (t *Task) SetRESGID(r, e, s auth.GID) error
- func (t *Task) SetRESUID(r, e, s auth.UID) error
- func (t *Task) SetREUID(r, e auth.UID) error
- func (t *Task) SetRSEQCPUAddr(addr usermem.Addr) error
- func (t *Task) SetRSEQCriticalRegion(rscr RSEQCriticalRegion) error
- func (t *Task) SetSavedSignalMask(mask linux.SignalSet)
- func (t *Task) SetSignalMask(mask linux.SignalSet)
- func (t *Task) SetSignalStack(alt arch.SignalStack) error
- func (t *Task) SetSyscallRestartBlock(r SyscallRestartBlock)
- func (t *Task) SetUID(uid auth.UID) error
- func (t *Task) SetUserNamespace(ns *auth.UserNamespace) error
- func (t *Task) SignalMask() linux.SignalSet
- func (t *Task) SignalReturn(rt bool) (*SyscallControl, error)
- func (t *Task) SignalStack() arch.SignalStack
- func (t *Task) SingleIOSequence(addr usermem.Addr, length int, opts usermem.IOOpts) (usermem.IOSequence, error)
- func (t *Task) SleepFinish(success bool)
- func (t *Task) SleepStart() <-chan struct{}
- func (t *Task) Stack() *arch.Stack
- func (t *Task) Start(tid ThreadID)
- func (t *Task) StartTime() ktime.Time
- func (t *Task) StateStatus() string
- func (t *Task) SyscallRestartBlock() SyscallRestartBlock
- func (t *Task) SyscallTable() *SyscallTable
- func (t *Task) TakeSignal(mask linux.SignalSet) *arch.SignalInfo
- func (t *Task) TaskContext() *TaskContext
- func (t *Task) TaskGoroutineSchedInfo() TaskGoroutineSchedInfo
- func (t *Task) TaskResources() *TaskResources
- func (t *Task) TaskSet() *TaskSet
- func (t *Task) ThreadGroup() *ThreadGroup
- func (t *Task) ThreadID() ThreadID
- func (t *Task) Timekeeper() *Timekeeper
- func (t *Task) Tracer() *Task
- func (t *Task) UTSNamespace() *UTSNamespace
- func (t *Task) UninterruptibleSleepFinish(activate bool)
- func (t *Task) UninterruptibleSleepStart(deactivate bool)
- func (t *Task) Unshare(opts *SharingOptions) error
- func (t *Task) UserCPUClock() ktime.Clock
- func (t *Task) UserNamespace() *auth.UserNamespace
- func (t *Task) Value(key interface{}) interface{}
- func (t *Task) Wait(opts *WaitOptions) (*WaitResult, error)
- func (t *Task) Warningf(fmt string, v ...interface{})
- func (t *Task) WithMuLocked(f func(*Task))
- func (t *Task) Yield()
- type TaskConfig
- type TaskContext
- type TaskExitState
- type TaskGoroutineSchedInfo
- type TaskGoroutineState
- type TaskMaybe
- type TaskResources
- type TaskSet
- type TaskStop
- type ThreadGroup
- func (tg *ThreadGroup) CPUClock() ktime.Clock
- func (tg *ThreadGroup) CPUStats() usage.CPUStats
- func (tg *ThreadGroup) Count() int
- func (tg *ThreadGroup) CreateProcessGroup() error
- func (tg *ThreadGroup) CreateSession() error
- func (tg *ThreadGroup) ExitStatus() ExitStatus
- func (tg *ThreadGroup) ID() ThreadID
- func (tg *ThreadGroup) IOUsage() *usage.IO
- func (tg *ThreadGroup) JoinProcessGroup(pidns *PIDNamespace, pgid ProcessGroupID, checkExec bool) error
- func (tg *ThreadGroup) JoinedChildCPUStats() usage.CPUStats
- func (tg *ThreadGroup) Leader() *Task
- func (tg *ThreadGroup) Limits() *limits.LimitSet
- func (tg *ThreadGroup) MemberIDs(pidns *PIDNamespace) []ThreadID
- func (tg *ThreadGroup) PIDNamespace() *PIDNamespace
- func (tg *ThreadGroup) ProcessGroup() *ProcessGroup
- func (tg *ThreadGroup) SendSignal(info *arch.SignalInfo) error
- func (tg *ThreadGroup) SendTimerSignal(info *arch.SignalInfo, includeSys bool) error
- func (tg *ThreadGroup) Session() *Session
- func (tg *ThreadGroup) SetCPUTimer(l *limits.Limit)
- func (tg *ThreadGroup) SetSignalAct(sig linux.Signal, actptr *arch.SignalAct) (arch.SignalAct, error)
- func (tg *ThreadGroup) SignalHandlers() *SignalHandlers
- func (tg *ThreadGroup) TaskSet() *TaskSet
- func (tg *ThreadGroup) TerminationSignal() linux.Signal
- func (tg *ThreadGroup) Timer() *TimerManager
- func (tg *ThreadGroup) UserCPUClock() ktime.Clock
- func (tg *ThreadGroup) WaitExited()
- type ThreadID
- type Timekeeper
- type TimerManager
- type UTSNamespace
- func (u *UTSNamespace) Clone(userns *auth.UserNamespace) *UTSNamespace
- func (u *UTSNamespace) DomainName() string
- func (u *UTSNamespace) HostName() string
- func (u *UTSNamespace) SetDomainName(domain string)
- func (u *UTSNamespace) SetHostName(host string)
- func (u *UTSNamespace) UserNamespace() *auth.UserNamespace
- type VDSOParamPage
- type Version
- type WaitOptions
- type WaitResult
Constants ¶
const ( // CtxCanTrace is a Context.Value key for a function with the same // signature and semantics as kernel.Task.CanTrace. CtxCanTrace contextID = iota // CtxKernel is a Context.Value key for a Kernel. CtxKernel // CtxPIDNamespace is a Context.Value key for a PIDNamespace. CtxPIDNamespace // CtxTask is a Context.Value key for a Task. CtxTask // CtxUTSNamespace is a Context.Value key for a UTSNamespace. CtxUTSNamespace // CtxIPCNamespace is a Context.Value key for an IPCNamespace. CtxIPCNamespace )
const ( PTRACE_SEIZE = 0x4206 PTRACE_INTERRUPT = 0x4207 PTRACE_LISTEN = 0x4208 PTRACE_PEEKSIGINFO = 0x4209 PTRACE_GETSIGMASK = 0x420a PTRACE_SETSIGMASK = 0x420b )
ptrace constants from Linux's include/uapi/linux/ptrace.h.
const ( // StraceEnableLog enables syscall log tracing. StraceEnableLog // StraceEnableEvent enables syscall event tracing. StraceEnableEvent // ExternalBeforeEnable enables the external hook before syscall execution. ExternalBeforeEnable // ExternalAfterEnable enables the external hook after syscall execution. ExternalAfterEnable )
Possible flags for SyscallFlagsTable.enable.
const ( // EventExit represents an exit notification generated for a child thread // group leader or a tracee under the conditions specified in the comment // above runExitNotify. EventExit waiter.EventMask = 1 << iota // EventChildGroupStop occurs when a child thread group completes a group // stop (i.e. all tasks in the child thread group have entered a stopped // state as a result of a group stop). EventChildGroupStop // EventTraceeStop occurs when a task that is ptraced by a task in the // notified thread group enters a ptrace stop (see ptrace(2)). EventTraceeStop // EventGroupContinue occurs when a child thread group, or a thread group // whose leader is ptraced by a task in the notified thread group, that had // initiated or completed a group stop leaves the group stop, due to the // child thread group or any task in the child thread group being sent // SIGCONT. EventGroupContinue )
Task events that can be waited for.
const ( // ERESTARTSYS is returned by an interrupted syscall to indicate that it // should be converted to EINTR if interrupted by a signal delivered to a // user handler without SA_RESTART set, and restarted otherwise. ERESTARTSYS = SyscallRestartErrno(512) // ERESTARTNOINTR is returned by an interrupted syscall to indicate that it // should always be restarted. ERESTARTNOINTR = SyscallRestartErrno(513) // ERESTARTNOHAND is returned by an interrupted syscall to indicate that it // should be converted to EINTR if interrupted by a signal delivered to a // user handler, and restarted otherwise. ERESTARTNOHAND = SyscallRestartErrno(514) // ERESTART_RESTARTBLOCK is returned by an interrupted syscall to indicate // that it should be restarted using a custom function. The interrupted // syscall must register a custom restart function by calling // Task.SetSyscallRestartBlock. ERESTART_RESTARTBLOCK = SyscallRestartErrno(516) )
These numeric values are significant because ptrace syscall exit tracing can observe them.
For all of the following errnos, if the syscall is not interrupted by a signal delivered to a user handler, the syscall is restarted.
const SignalPanic = linux.SIGUSR2
SignalPanic is used to panic the running threads. It is a signal which cannot be used by the application: it must be caught and ignored by the runtime (in order to catch possible races).
const StraceEnableBits = StraceEnableLog | StraceEnableEvent
StraceEnableBits combines both strace log and event flags.
const TasksLimit = (1 << 16)
TasksLimit is the maximum number of threads for an untrusted application. Linux doesn't really limit this directly, rather it is limited by total memory size, stacks allocated and a global maximum. There's no real reason for us to limit it either, (esp. since threads are backed by goroutines), and we would expect to hit resource limits long before hitting this number. However, for correctness, we still check that the user doesn't exceed this number.
Note that because of the way futexes are implemented, there *are* in fact serious restrictions on valid thread IDs. They are limited to 2^30 - 1 (kernel/fork.c:MAX_THREADS).
Variables ¶
var ( // CtrlDoExit is returned by the implementations of the exit and exit_group // syscalls to enter the task exit path directly, skipping syscall exit // tracing. CtrlDoExit = &SyscallControl{next: (*runExit)(nil), ignoreReturn: true} )
var ErrNoSyscalls = errors.New("no syscall table found")
ErrNoSyscalls is returned if there is no syscall table.
var ErrNoWaitableEvent = errors.New("non-blocking Wait found eligible threads but no waitable events")
ErrNoWaitableEvent is returned by non-blocking Task.Waits (e.g. waitpid(WNOHANG)) that find no waitable events, but determine that waitable events may exist in the future. (In contrast, if a non-blocking or blocking Wait determines that there are no tasks that can produce a waitable event, Task.Wait returns ECHILD.)
var StopSignals = linux.MakeSignalSet(linux.SIGSTOP, linux.SIGTSTP, linux.SIGTTIN, linux.SIGTTOU)
StopSignals is the set of signals whose default action is SignalActionStop.
var UnblockableSignals = linux.MakeSignalSet(linux.SIGKILL, linux.SIGSTOP)
UnblockableSignals contains the set of signals which cannot be blocked.
Functions ¶
func ContextCanTrace ¶
ContextCanTrace returns true if ctx is permitted to trace t, in the same sense as kernel.Task.CanTrace.
func RegisterSyscallTable ¶
func RegisterSyscallTable(s *SyscallTable)
RegisterSyscallTable registers a new syscall table for use by a Kernel.
Types ¶
type AbstractSocketNamespace ¶
type AbstractSocketNamespace struct {
// contains filtered or unexported fields
}
AbstractSocketNamespace is used to implement the Linux abstract socket functionality.
func NewAbstractSocketNamespace ¶
func NewAbstractSocketNamespace() *AbstractSocketNamespace
NewAbstractSocketNamespace returns a new AbstractSocketNamespace.
func (*AbstractSocketNamespace) Bind ¶
func (a *AbstractSocketNamespace) Bind(name string, ep unix.BoundEndpoint, rc refs.RefCounter) error
Bind binds the given socket.
When the last reference managed by rc is dropped, ep may be removed from the namespace.
func (*AbstractSocketNamespace) BoundEndpoint ¶
func (a *AbstractSocketNamespace) BoundEndpoint(name string) unix.BoundEndpoint
BoundEndpoint retrieves the endpoint bound to the given name. The return value is nil if no endpoint was bound.
type CloneOptions ¶
type CloneOptions struct { // SharingOptions defines the set of resources that the new task will share // with its parent. SharingOptions // Stack is the initial stack pointer of the new task. If Stack is 0, the // new task will start with the same stack pointer as its parent. Stack usermem.Addr // If SetTLS is true, set the new task's TLS (thread-local storage) // descriptor to TLS. If SetTLS is false, TLS is ignored. SetTLS bool TLS usermem.Addr // If ChildClearTID is true, when the child exits, 0 is written to the // address ChildTID in the child's memory, and if the write is successful a // futex wake on the same address is performed. // // If ChildSetTID is true, the child's thread ID (in the child's PID // namespace) is written to address ChildTID in the child's memory. (As in // Linux, failed writes are silently ignored.) ChildClearTID bool ChildSetTID bool ChildTID usermem.Addr // If ParentSetTID is true, the child's thread ID (in the parent's PID // namespace) is written to address ParentTID in the parent's memory. (As // in Linux, failed writes are silently ignored.) // // Older versions of the clone(2) man page state that CLONE_PARENT_SETTID // causes the child's thread ID to be written to ptid in both the parent // and child's memory, but this is a documentation error fixed by // 87ab04792ced ("clone.2: Fix description of CLONE_PARENT_SETTID"). ParentSetTID bool ParentTID usermem.Addr // If Vfork is true, place the parent in vforkStop until the cloned task // releases its TaskContext. Vfork bool // If Untraced is true, do not report PTRACE_EVENT_CLONE/FORK/VFORK for // this clone(), and do not ptrace-attach the caller's tracer to the new // task. (PTRACE_EVENT_VFORK_DONE will still be reported if appropriate). Untraced bool // If InheritTracer is true, ptrace-attach the caller's tracer to the new // task, even if no PTRACE_EVENT_CLONE/FORK/VFORK event would be reported // for it. 
If both Untraced and InheritTracer are true, no event will be // reported, but tracer inheritance will still occur. InheritTracer bool }
CloneOptions controls the behavior of Task.Clone.
type CreateProcessArgs ¶
type CreateProcessArgs struct { // Filename is the filename to load. // // If this is provided as "", then the file will be guessed via Argv[0]. Filename string // Argv is a list of arguments. Argv []string // Envv is a list of environment variables. Envv []string // WorkingDirectory is the initial working directory. // // This defaults to the root if empty. WorkingDirectory string // Credentials is the initial credentials. Credentials *auth.Credentials // FDMap is the initial set of file descriptors. If CreateProcess succeeds, // it takes a reference on FDMap. FDMap *FDMap // Umask is the initial umask. Umask uint // Limits is the initial resource limits. Limits *limits.LimitSet // MaxSymlinkTraversals is the maximum number of symlinks to follow // during resolution. MaxSymlinkTraversals uint // UTSNamespace is the initial UTS namespace. UTSNamespace *UTSNamespace // IPCNamespace is the initial IPC namespace. IPCNamespace *IPCNamespace }
CreateProcessArgs holds arguments to kernel.CreateProcess.
func (*CreateProcessArgs) NewContext ¶
func (args *CreateProcessArgs) NewContext(k *Kernel) *createProcessContext
NewContext returns a context.Context that represents the task that will be created by k.CreateProcess(args).
type ExitStatus ¶
type ExitStatus struct { // Code is the numeric value passed to the call to exit or exit_group that // caused the exit. If the exit was not caused by such a call, Code is 0. Code int // Signo is the signal that caused the exit. If the exit was not caused by // a signal, Signo is 0. Signo int }
An ExitStatus is a value communicated from an exiting task or thread group to the party that reaps it.
func (ExitStatus) ShellExitCode ¶
func (es ExitStatus) ShellExitCode() int
ShellExitCode returns the numeric exit code that Bash would return for an exit status of es.
func (ExitStatus) Signaled ¶
func (es ExitStatus) Signaled() bool
Signaled returns true if the ExitStatus indicates that the exiting task or thread group was killed by a signal.
func (ExitStatus) Status ¶
func (es ExitStatus) Status() uint32
Status returns the numeric representation of the ExitStatus returned by e.g. the wait4() system call.
type FDFlags ¶
type FDFlags struct { // CloseOnExec indicates the descriptor should be closed on exec. CloseOnExec bool }
FDFlags define flags for an individual descriptor.
type FDMap ¶
type FDMap struct { refs.AtomicRefCount // contains filtered or unexported fields }
FDMap is used to manage File references and flags.
func (*FDMap) DecRef ¶
func (f *FDMap) DecRef()
DecRef implements RefCounter.DecRef with destructor f.destroy.
func (*FDMap) GetDescriptor ¶
GetDescriptor returns a reference to the file and the flags for the FD. It bumps its reference count as well. It returns nil if there is no File for the FD, i.e. if the FD is invalid. The caller must use DecRef when they are done.
func (*FDMap) GetFile ¶
GetFile returns a reference to the File for the FD and bumps its reference count as well. It returns nil if there is no File for the FD, i.e. if the FD is invalid. The caller must use DecRef when they are done.
func (*FDMap) GetRefs ¶
GetRefs returns a stable slice of references to all files and bumps the reference count on each. The caller must use DecRef on each reference when they're done using the slice.
func (*FDMap) NewFDAt ¶
NewFDAt sets the file reference for the given FD. If there is an active reference for that FD, the ref count for that existing reference is decremented.
func (*FDMap) NewFDFrom ¶
func (f *FDMap) NewFDFrom(fd kdefs.FD, file *fs.File, flags FDFlags, limitSet *limits.LimitSet) (kdefs.FD, error)
NewFDFrom allocates a new FD guaranteed to be the lowest number available greater than or equal to fd. This property is important as Unix programs tend to count on this allocation order.
func (*FDMap) Remove ¶
Remove removes an FD from the FDMap, and returns (File, true) if a File was found. Callers are expected to decrement the reference count on the File. Otherwise returns (nil, false).
type FSContext ¶
type FSContext struct { refs.AtomicRefCount // contains filtered or unexported fields }
FSContext contains filesystem context.
This includes umask and working directory.
func (*FSContext) DecRef ¶
func (f *FSContext) DecRef()
DecRef implements RefCounter.DecRef with destructor f.destroy.
func (*FSContext) RootDirectory ¶
RootDirectory returns the current filesystem root. You should call DecRef on the returned Dirent when finished.
This will return nil if called after destroy().
func (*FSContext) SetRootDirectory ¶
SetRootDirectory sets the root directory. This will take an extra reference on the Dirent.
This is not a valid call after destroy.
func (*FSContext) SetWorkingDirectory ¶
SetWorkingDirectory sets the current working directory. This will take an extra reference on the Dirent.
This is not a valid call after destroy.
func (*FSContext) SwapUmask ¶
SwapUmask atomically sets the current umask and returns the old umask.
func (*FSContext) WorkingDirectory ¶
WorkingDirectory returns the current working directory. You should call DecRef on the returned Dirent when finished.
This will return nil if called after destroy().
type IPCNamespace ¶
type IPCNamespace struct {
// contains filtered or unexported fields
}
IPCNamespace represents an IPC namespace.
func IPCNamespaceFromContext ¶
func IPCNamespaceFromContext(ctx context.Context) *IPCNamespace
IPCNamespaceFromContext returns the IPC namespace in which ctx is executing, or nil if there is no such IPC namespace.
func NewIPCNamespace ¶
func NewIPCNamespace() *IPCNamespace
NewIPCNamespace creates a new IPC namespace.
func (*IPCNamespace) SemaphoreRegistry ¶
func (i *IPCNamespace) SemaphoreRegistry() *semaphore.Registry
SemaphoreRegistry returns the semaphore set registry for this namespace.
type InitKernelArgs ¶
type InitKernelArgs struct { // FeatureSet is the emulated CPU feature set. FeatureSet *cpuid.FeatureSet // Timekeeper manages time for all tasks in the system. Timekeeper *Timekeeper // RootUserNamespace is the root user namespace. RootUserNamespace *auth.UserNamespace // NetworkStack is the TCP/IP network stack. NetworkStack may be nil. NetworkStack inet.Stack // ApplicationCores is the number of logical CPUs visible to sandboxed // applications. The set of logical CPU IDs is [0, ApplicationCores); thus // ApplicationCores is analogous to Linux's nr_cpu_ids, the index of the // most significant bit in cpu_possible_mask + 1. ApplicationCores uint // If UseHostCores is true, Task.CPU() returns the task goroutine's CPU // instead of a virtualized CPU number, and Task.CopyToCPUMask() is a // no-op. If ApplicationCores is less than hostcpu.MaxPossibleCPU(), it // will be overridden. UseHostCores bool // ExtraAuxv contains additional auxiliary vector entries that are added to // each process by the ELF loader. ExtraAuxv []arch.AuxEntry // Vdso holds the VDSO and its parameter page. Vdso *loader.VDSO // RootUTSNamespace is the root UTS namespace. RootUTSNamespace *UTSNamespace // RootIPCNamespace is the root IPC namespace. RootIPCNamespace *IPCNamespace }
InitKernelArgs holds arguments to Init.
type Kernel ¶
type Kernel struct { // Platform is the platform that is used to execute tasks in the // created Kernel. It is embedded so that Kernel can directly serve as // Platform in mm logic and also serve as platform.MemoryProvider in // filemem S/R logic. platform.Platform `state:"nosave"` // contains filtered or unexported fields }
Kernel represents an emulated Linux kernel. It must be initialized by calling Init() or LoadFrom().
func KernelFromContext ¶
KernelFromContext returns the Kernel in which ctx is executing, or nil if there is no such Kernel.
func (*Kernel) ApplicationCores ¶
ApplicationCores returns the number of CPUs visible to sandboxed applications.
func (*Kernel) CPUClockNow ¶
CPUClockNow returns the current value of k.cpuClock.
func (*Kernel) CreateProcess ¶
func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, error)
CreateProcess creates a new task in a new thread group with the given options. The new task has no parent and is in the root PID namespace.
If k.Start() has already been called, the created task will begin running immediately. Otherwise, it will be started when k.Start() is called.
CreateProcess has no analogue in Linux; it is used to create the initial application task, as well as processes started by the control server.
func (*Kernel) Destroy ¶
func (k *Kernel) Destroy()
Destroy releases resources owned by k.
Preconditions: There must be no task goroutines running in k.
func (*Kernel) FeatureSet ¶
func (k *Kernel) FeatureSet() *cpuid.FeatureSet
FeatureSet returns the FeatureSet.
func (*Kernel) GenerateInotifyCookie ¶
GenerateInotifyCookie generates a unique inotify event cookie.
Returned values may overlap with previously returned values if the value space is exhausted. 0 is not a valid cookie value, all other values representable in a uint32 are allowed.
func (*Kernel) GlobalInit ¶
func (k *Kernel) GlobalInit() *ThreadGroup
GlobalInit returns the thread group with ID 1 in the root PID namespace, or nil if no such thread group exists. GlobalInit may return a thread group containing no tasks if the thread group has already exited.
func (*Kernel) Init ¶
func (k *Kernel) Init(args InitKernelArgs) error
Init initializes the Kernel with no tasks.
Callers must manually set Kernel.Platform before calling Init.
func (*Kernel) Kill ¶
func (k *Kernel) Kill(es ExitStatus)
Kill requests that all tasks in k immediately exit as if group exiting with status es. Kill does not wait for tasks to exit.
func (*Kernel) LoadTaskImage ¶
func (k *Kernel) LoadTaskImage(ctx context.Context, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals uint, filename string, argv, envv []string, fs *cpuid.FeatureSet) (*TaskContext, error)
LoadTaskImage loads filename into a new TaskContext.
It takes several arguments:
- mounts: MountNamespace to lookup filename in
- root: Root to lookup filename under
- wd: Working directory to lookup filename under
- maxTraversals: maximum number of symlinks to follow
- filename: path to binary to load
- argv: Binary argv
- envv: Binary envv
- fs: Binary FeatureSet
func (*Kernel) MonotonicClock ¶
MonotonicClock returns the application CLOCK_MONOTONIC clock.
func (*Kernel) NetlinkPorts ¶
NetlinkPorts returns the netlink port manager.
func (*Kernel) NetworkStack ¶
NetworkStack returns the network stack. NetworkStack may return nil if no network stack is available.
func (*Kernel) NowNanoseconds ¶
NowNanoseconds implements tcpip.Clock.NowNanoseconds.
func (*Kernel) Pause ¶
func (k *Kernel) Pause()
Pause requests that all tasks in k temporarily stop executing, and blocks until all tasks in k have stopped. Multiple calls to Pause nest and require an equal number of calls to Unpause to resume execution.
func (*Kernel) RealtimeClock ¶
RealtimeClock returns the application CLOCK_REALTIME clock.
func (*Kernel) RootIPCNamespace ¶
func (k *Kernel) RootIPCNamespace() *IPCNamespace
RootIPCNamespace returns the root IPCNamespace.
func (*Kernel) RootMountNamespace ¶
func (k *Kernel) RootMountNamespace() *fs.MountNamespace
RootMountNamespace returns the MountNamespace.
func (*Kernel) RootUTSNamespace ¶
func (k *Kernel) RootUTSNamespace() *UTSNamespace
RootUTSNamespace returns the root UTSNamespace.
func (*Kernel) RootUserNamespace ¶
func (k *Kernel) RootUserNamespace() *auth.UserNamespace
RootUserNamespace returns the root UserNamespace.
func (*Kernel) SaveTo ¶
SaveTo saves the state of k to w.
Preconditions: The kernel must be paused throughout the call to SaveTo.
func (*Kernel) SendExternalSignal ¶
func (k *Kernel) SendExternalSignal(info *arch.SignalInfo, context string) bool
SendExternalSignal injects a signal into the kernel.
context is used only for debugging to describe how the signal was received.
Returns false if signal could not be sent because the Kernel is not fully initialized yet.
func (*Kernel) SetExitError ¶
SetExitError sets the sandbox error that caused the kernel to exit, if one is not already set.
func (*Kernel) SetRootMountNamespace ¶
func (k *Kernel) SetRootMountNamespace(mounts *fs.MountNamespace)
SetRootMountNamespace sets the MountNamespace.
func (*Kernel) Start ¶
Start starts execution of all tasks in k.
Preconditions: Start may be called exactly once.
func (*Kernel) SupervisorContext ¶
SupervisorContext returns a Context with maximum privileges in k. It should only be used by goroutines outside the control of the emulated kernel defined by k.
Callers are responsible for ensuring that the returned Context is not used concurrently with changes to the Kernel.
func (*Kernel) Timekeeper ¶
func (k *Kernel) Timekeeper() *Timekeeper
Timekeeper returns the Timekeeper.
func (*Kernel) Unpause ¶
func (k *Kernel) Unpause()
Unpause ends the effect of a previous call to Pause. If Unpause is called without a matching preceding call to Pause, Unpause may panic.
func (*Kernel) WaitExited ¶
func (k *Kernel) WaitExited()
WaitExited blocks until all tasks in k have exited.
type PIDNamespace ¶
type PIDNamespace struct {
// contains filtered or unexported fields
}
A PIDNamespace represents a PID namespace, a bimap between thread IDs and tasks. See the pid_namespaces(7) man page for further details.
N.B. A task is said to be visible in a PID namespace if the PID namespace contains a thread ID that maps to that task.
func PIDNamespaceFromContext ¶
func PIDNamespaceFromContext(ctx context.Context) *PIDNamespace
PIDNamespaceFromContext returns the PID namespace in which ctx is executing, or nil if there is no such PID namespace.
func (*PIDNamespace) IDOfProcessGroup ¶
func (pidns *PIDNamespace) IDOfProcessGroup(pg *ProcessGroup) ProcessGroupID
IDOfProcessGroup returns the process group ID assigned to pg in PID namespace pidns.
The same constraints apply as IDOfSession.
func (*PIDNamespace) IDOfSession ¶
func (pidns *PIDNamespace) IDOfSession(s *Session) SessionID
IDOfSession returns the session ID assigned to s in PID namespace pidns.
If this group isn't visible in this namespace, zero will be returned. It is the caller's responsibility to check that before using this function.
func (*PIDNamespace) IDOfTask ¶
func (ns *PIDNamespace) IDOfTask(t *Task) ThreadID
IDOfTask returns the TID assigned to the given task in PID namespace ns. If the task is not visible in that namespace, IDOfTask returns 0. (This return value is significant in some cases, e.g. getppid() is documented as returning 0 if the caller's parent is in an ancestor namespace and consequently not visible to the caller.) If the task is nil, IDOfTask returns 0.
func (*PIDNamespace) IDOfThreadGroup ¶
func (ns *PIDNamespace) IDOfThreadGroup(tg *ThreadGroup) ThreadID
IDOfThreadGroup returns the TID assigned to tg's leader in PID namespace ns. If the task is not visible in that namespace, IDOfThreadGroup returns 0.
func (*PIDNamespace) NewChild ¶
func (ns *PIDNamespace) NewChild(userns *auth.UserNamespace) *PIDNamespace
NewChild returns a new, empty PID namespace that is a child of ns. Authority over the new PID namespace is controlled by userns.
func (*PIDNamespace) ProcessGroupWithID ¶
func (pidns *PIDNamespace) ProcessGroupWithID(id ProcessGroupID) *ProcessGroup
ProcessGroupWithID returns the ProcessGroup with the given ID in the PID namespace ns, or nil if that given ID is not defined in this namespace.
A reference is not taken on the process group.
func (*PIDNamespace) SessionWithID ¶
func (pidns *PIDNamespace) SessionWithID(id SessionID) *Session
SessionWithID returns the Session with the given ID in the PID namespace ns, or nil if that given ID is not defined in this namespace.
A reference is not taken on the session.
func (*PIDNamespace) TaskWithID ¶
func (ns *PIDNamespace) TaskWithID(tid ThreadID) *Task
TaskWithID returns the task with thread ID tid in PID namespace ns. If no task has that TID, TaskWithID returns nil.
func (*PIDNamespace) Tasks ¶
func (ns *PIDNamespace) Tasks() []*Task
Tasks returns a snapshot of the tasks in ns.
func (*PIDNamespace) ThreadGroupWithID ¶
func (ns *PIDNamespace) ThreadGroupWithID(tid ThreadID) *ThreadGroup
ThreadGroupWithID returns the thread group led by the task with thread ID tid in PID namespace ns. If no task has that TID, or if the task with that TID is not a thread group leader, ThreadGroupWithID returns nil.
func (*PIDNamespace) ThreadGroups ¶
func (ns *PIDNamespace) ThreadGroups() []*ThreadGroup
ThreadGroups returns a snapshot of the thread groups in ns.
func (*PIDNamespace) UserNamespace ¶
func (ns *PIDNamespace) UserNamespace() *auth.UserNamespace
UserNamespace returns the user namespace associated with PID namespace ns.
type ProcessGroup ¶
type ProcessGroup struct {
// contains filtered or unexported fields
}
ProcessGroup contains an originator threadgroup and a parent Session.
type RSEQCriticalRegion ¶
type RSEQCriticalRegion struct { // When a task in this thread group has its CPU preempted (as defined by // platform.ErrContextCPUPreempted) or has a signal delivered to an // application handler while its instruction pointer is in CriticalSection, // set the instruction pointer to Restart and application register r10 (on // amd64) to the former instruction pointer. CriticalSection usermem.AddrRange Restart usermem.Addr }
RSEQCriticalRegion describes a restartable sequence critical region.
type Session ¶
type Session struct {
// contains filtered or unexported fields
}
Session contains a leader threadgroup and a list of ProcessGroups.
type SharingOptions ¶
type SharingOptions struct { // If NewAddressSpace is true, the task should have an independent virtual // address space. NewAddressSpace bool // If NewSignalHandlers is true, the task should use an independent set of // signal handlers. NewSignalHandlers bool // If NewThreadGroup is true, the task should be the leader of its own // thread group. TerminationSignal is the signal that the thread group // will send to its parent when it exits. If NewThreadGroup is false, // TerminationSignal is ignored. NewThreadGroup bool TerminationSignal linux.Signal // If NewPIDNamespace is true: // // - In the context of Task.Clone, the new task should be the init task // (TID 1) in a new PID namespace. // // - In the context of Task.Unshare, the task should create a new PID // namespace, and all subsequent clones of the task should be members of // the new PID namespace. NewPIDNamespace bool // If NewUserNamespace is true, the task should have an independent user // namespace. NewUserNamespace bool // If NewNetworkNamespace is true, the task should have an independent // network namespace. (Note that network namespaces are not really // implemented; see comment on Task.netns for details.) NewNetworkNamespace bool // If NewFiles is true, the task should use an independent file descriptor // table. NewFiles bool // If NewFSContext is true, the task should have an independent FSContext. NewFSContext bool // If NewUTSNamespace is true, the task should have an independent UTS // namespace. NewUTSNamespace bool // If NewIPCNamespace is true, the task should have an independent IPC // namespace. NewIPCNamespace bool }
SharingOptions controls what resources are shared by a new task created by Task.Clone, or an existing task affected by Task.Unshare.
type SignalAction ¶
type SignalAction int
SignalAction is an internal signal action.
const ( SignalActionTerm SignalAction = iota SignalActionCore SignalActionStop SignalActionIgnore SignalActionHandler )
Available signal actions. Note that although we refer to the complete set internally, the application is only capable of using the Default and Ignore actions from the system call interface.
type SignalHandlers ¶
type SignalHandlers struct {
// contains filtered or unexported fields
}
SignalHandlers holds information about signal actions.
func NewSignalHandlers ¶
func NewSignalHandlers() *SignalHandlers
NewSignalHandlers returns a new SignalHandlers specifying all default actions.
func (*SignalHandlers) CopyForExec ¶
func (sh *SignalHandlers) CopyForExec() *SignalHandlers
CopyForExec returns a copy of sh for a thread group that is undergoing an execve. (See comments in Task.finishExec.)
func (*SignalHandlers) Fork ¶
func (sh *SignalHandlers) Fork() *SignalHandlers
Fork returns a copy of sh for a new thread group.
type Stracer ¶
type Stracer interface { // SyscallEnter is called on syscall entry. // // The returned private data is passed to SyscallExit. // // TODO: remove kernel imports from the strace package so // that the type can be used directly. SyscallEnter(t *Task, sysno uintptr, args arch.SyscallArguments, flags uint32) interface{} // SyscallExit is called on syscall exit. SyscallExit(context interface{}, t *Task, sysno, rval uintptr, err error) }
Stracer traces syscall execution.
type SyscallControl ¶
type SyscallControl struct {
// contains filtered or unexported fields
}
SyscallControl is returned by syscalls to control the behavior of Task.doSyscallInvoke.
type SyscallFlagsTable ¶
type SyscallFlagsTable struct {
// contains filtered or unexported fields
}
SyscallFlagsTable manages a set of enable/disable bit fields on a per-syscall basis.
func (*SyscallFlagsTable) Enable ¶
func (e *SyscallFlagsTable) Enable(bit uint32, s map[uintptr]bool, missingEnable bool)
Enable sets enable bit bit for all syscalls based on s.
Syscalls missing from s are disabled.
Syscalls missing from the initial table passed to Init cannot be added as individual syscalls. If present in s they will be ignored.
Callers to Word may see either the old or new value while this function is executing.
func (*SyscallFlagsTable) EnableAll ¶
func (e *SyscallFlagsTable) EnableAll(bit uint32)
EnableAll sets enable bit bit for all syscalls, present and missing.
func (*SyscallFlagsTable) Word ¶
func (e *SyscallFlagsTable) Word(sysno uintptr) uint32
Word returns the enable bitfield for sysno.
type SyscallFn ¶
type SyscallFn func(t *Task, args arch.SyscallArguments) (uintptr, *SyscallControl, error)
SyscallFn is a syscall implementation.
type SyscallRestartBlock ¶
SyscallRestartBlock represents the restart block for a syscall restartable with a custom function. It encapsulates the state required to restart a syscall across a S/R.
type SyscallRestartErrno ¶
type SyscallRestartErrno int
SyscallRestartErrno represents an ERESTART* errno defined in the Linux kernel's include/linux/errno.h. These errnos are never returned to userspace directly, but are used to communicate the expected behavior of an interrupted syscall from the syscall to signal handling.
func SyscallRestartErrnoFromReturn ¶
func SyscallRestartErrnoFromReturn(rv uintptr) (SyscallRestartErrno, bool)
SyscallRestartErrnoFromReturn returns the SyscallRestartErrno represented by rv, the value in a syscall return register.
func (SyscallRestartErrno) Error ¶
func (e SyscallRestartErrno) Error() string
Error implements error.Error.
type SyscallTable ¶
type SyscallTable struct { // OS is the operating system that this syscall table implements. OS abi.OS `state:"wait"` // Arch is the architecture that this syscall table targets. Arch arch.Arch `state:"wait"` // The OS version that this syscall table implements. Version Version `state:"manual"` // AuditNumber is a numeric constant that represents the syscall table. If // non-zero, auditNumber must be one of the AUDIT_ARCH_* values defined by // linux/audit.h. AuditNumber uint32 `state:"manual"` // Table is the collection of functions. Table map[uintptr]SyscallFn `state:"manual"` // Emulate is a collection of instruction addresses to emulate. The // keys are addresses, and the values are system call numbers. Emulate map[usermem.Addr]uintptr `state:"manual"` // The function to call in case of a missing system call. Missing MissingFn `state:"manual"` // Stracer traces this syscall table. Stracer Stracer `state:"manual"` // External is used to handle an external callback. External func(*Kernel) `state:"manual"` // ExternalFilterBefore is called before External is called before the syscall is executed. // External is not called if it returns false. ExternalFilterBefore func(*Task, uintptr, arch.SyscallArguments) bool `state:"manual"` // ExternalFilterAfter is called before External is called after the syscall is executed. // External is not called if it returns false. ExternalFilterAfter func(*Task, uintptr, arch.SyscallArguments) bool `state:"manual"` // FeatureEnable stores the strace and one-shot enable bits. FeatureEnable SyscallFlagsTable `state:"manual"` // contains filtered or unexported fields }
SyscallTable is a lookup table of system calls. Critically, a SyscallTable is *immutable*. In order to make supporting suspend and resume sane, they must be uniquely registered and may not change during operation.
func LookupSyscallTable ¶
LookupSyscallTable returns the SyscallTable for the OS/Arch combination.
func SyscallTables ¶
func SyscallTables() []*SyscallTable
SyscallTables returns a read-only slice of registered SyscallTables.
func (*SyscallTable) Lookup ¶
func (s *SyscallTable) Lookup(sysno uintptr) SyscallFn
Lookup returns the syscall implementation, if one exists.
func (*SyscallTable) LookupEmulate ¶
func (s *SyscallTable) LookupEmulate(addr usermem.Addr) (uintptr, bool)
LookupEmulate looks up an emulation syscall number.
type Task ¶
type Task struct {
// contains filtered or unexported fields
}
Task represents a thread of execution in the untrusted app. It includes registers and any thread-specific state that you would normally expect.
Each task is associated with a goroutine, called the task goroutine, that executes code (application code, system calls, etc.) on behalf of that task. See Task.run (task_run.go).
All fields that are "owned by the task goroutine" can only be mutated by the task goroutine while it is running. The task goroutine does not require synchronization to read these fields, although it still requires synchronization as described for those fields to mutate them.
All fields that are "exclusive to the task goroutine" can only be accessed by the task goroutine while it is running. The task goroutine does not require synchronization to read or write these fields.
func TaskFromContext ¶
TaskFromContext returns the Task associated with ctx, or nil if there is no such Task.
func (*Task) AbstractSockets ¶
func (t *Task) AbstractSockets() *AbstractSocketNamespace
AbstractSockets returns t's AbstractSocketNamespace.
func (*Task) Activate ¶
func (t *Task) Activate()
Activate ensures that the task has an active address space.
func (*Task) AppendSyscallFilter ¶
AppendSyscallFilter adds BPF program p as a system call filter.
Preconditions: The caller must be running on the task goroutine.
func (*Task) Arch ¶
Arch returns t's arch.Context.
Preconditions: The caller must be running on the task goroutine, or t.mu must be locked.
func (*Task) AsyncContext ¶
AsyncContext returns a context.Context that may be used by goroutines that do work on behalf of t and therefore share its contextual values, but are not t's task goroutine (e.g. asynchronous I/O).
func (*Task) BeginExternalStop ¶
func (t *Task) BeginExternalStop()
BeginExternalStop indicates the start of an external stop that applies to t. BeginExternalStop does not wait for t's task goroutine to stop.
func (*Task) Block ¶
Block blocks t until an event is received from C or t is interrupted. It returns nil if an event is received from C and syserror.ErrInterrupted if t is interrupted.
Preconditions: The caller must be running on the task goroutine.
func (*Task) BlockWithDeadline ¶
BlockWithDeadline blocks t until an event is received from C, the application monotonic clock indicates a time of deadline (only if haveDeadline is true), or t is interrupted. It returns nil if an event is received from C, ETIMEDOUT if the deadline expired, and syserror.ErrInterrupted if t is interrupted.
Preconditions: The caller must be running on the task goroutine.
func (*Task) BlockWithTimeout ¶
func (t *Task) BlockWithTimeout(C chan struct{}, haveTimeout bool, timeout time.Duration) (time.Duration, error)
BlockWithTimeout blocks t until an event is received from C, the application monotonic clock indicates that timeout has elapsed (only if haveTimeout is true), or t is interrupted. It returns:
- The remaining timeout, which is guaranteed to be 0 if the timeout expired, and is unspecified if haveTimeout is false.
- An error which is nil if an event is received from C, ETIMEDOUT if the timeout expired, and syserror.ErrInterrupted if t is interrupted.
func (*Task) BlockWithTimer ¶
BlockWithTimer blocks t until an event is received from C or tchan, or t is interrupted. It returns nil if an event is received from C, ETIMEDOUT if an event is received from tchan, and syserror.ErrInterrupted if t is interrupted.
Most clients should use BlockWithDeadline or BlockWithTimeout instead.
Preconditions: The caller must be running on the task goroutine.
func (*Task) CPUClock ¶
CPUClock returns a clock measuring the CPU time the task has spent executing application and "kernel" code.
func (*Task) CanTrace ¶
CanTrace checks that t is permitted to access target's state, as defined by ptrace(2), subsection "Ptrace access mode checking". If attach is true, it checks for access mode PTRACE_MODE_ATTACH; otherwise, it checks for access mode PTRACE_MODE_READ.
func (*Task) Clone ¶
func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error)
Clone implements the clone(2) syscall and returns the thread ID of the new task in t's PID namespace. Clone may return both a non-zero thread ID and a non-nil error.
Preconditions: The caller must be running Task.doSyscallInvoke on the task goroutine.
func (*Task) CopyIn ¶
CopyIn copies a fixed-size value or slice of fixed-size values in from the task's memory. The copy will fail with syscall.EFAULT if it traverses user memory that is unmapped or not readable by the user.
This Task's AddressSpace must be active.
func (*Task) CopyInBytes ¶
CopyInBytes is a fast version of CopyIn if the caller can serialize the data without reflection and pass in a byte slice.
This Task's AddressSpace must be active.
func (*Task) CopyInIovecs ¶
CopyInIovecs copies an array of numIovecs struct iovecs from the memory mapped at addr, converts them to usermem.AddrRanges, and returns them as a usermem.AddrRangeSeq.
CopyInIovecs shares the following properties with Linux's lib/iov_iter.c:import_iovec() => fs/read_write.c:rw_copy_check_uvector():
- If the length of any AddrRange would exceed the range of an ssize_t, CopyInIovecs returns EINVAL.
- If the length of any AddrRange would cause its end to overflow, CopyInIovecs returns EFAULT.
- The combined length of all AddrRanges is limited to _MAX_RW_COUNT. If the combined length of all AddrRanges would otherwise exceed this amount, ranges beyond _MAX_RW_COUNT are silently truncated.
Preconditions: As for usermem.IO.CopyIn. The caller must be running on the task goroutine. t's AddressSpace must be active.
func (*Task) CopyInSignalAct ¶
CopyInSignalAct copies an architecture-specific sigaction type from task memory and then converts it into a SignalAct.
func (*Task) CopyInSignalStack ¶
CopyInSignalStack copies an architecture-specific stack_t from task memory and then converts it into a SignalStack.
func (*Task) CopyInString ¶
CopyInString copies a NUL-terminated string of length at most maxlen in from the task's memory. The copy will fail with syscall.EFAULT if it traverses user memory that is unmapped or not readable by the user.
This Task's AddressSpace must be active.
func (*Task) CopyInVector ¶
CopyInVector copies a NULL-terminated vector of strings from the task's memory. The copy will fail with syscall.EFAULT if it traverses user memory that is unmapped or not readable by the user.
maxElemSize is the maximum size of each individual element.
maxTotalSize is the maximum total length of all elements plus the total number of elements. For example, the following strings correspond to the following set of sizes:
{ "a", "b", "c" } => 6 (3 for lengths, 3 for elements) { "abc" } => 4 (3 for length, 1 for elements)
This Task's AddressSpace must be active.
func (*Task) CopyOut ¶
CopyOut copies a fixed-size value or slice of fixed-size values out to the task's memory. The copy will fail with syscall.EFAULT if it traverses user memory that is unmapped or not writeable by the user.
This Task's AddressSpace must be active.
func (*Task) CopyOutBytes ¶
CopyOutBytes is a fast version of CopyOut if the caller can serialize the data without reflection and pass in a byte slice.
This Task's AddressSpace must be active.
func (*Task) CopyOutIovecs ¶
CopyOutIovecs converts src to an array of struct iovecs and copies it to the memory mapped at addr.
Preconditions: As for usermem.IO.CopyOut. The caller must be running on the task goroutine. t's AddressSpace must be active.
func (*Task) CopyOutSignalAct ¶
CopyOutSignalAct converts the given SignalAct into an architecture-specific type and then copies it out to task memory.
func (*Task) CopyOutSignalStack ¶
CopyOutSignalStack converts the given SignalStack into an architecture-specific type and then copies it out to task memory.
func (*Task) CopyScratchBuffer ¶
CopyScratchBuffer returns a scratch buffer to be used in CopyIn/CopyOut functions. It must only be used within those functions and can only be used by the task goroutine; it exists to improve performance and thus intentionally lacks any synchronization.
Callers should pass a constant value as an argument, which will allow the compiler to inline and optimize out the if statement below.
func (*Task) Credentials ¶
func (t *Task) Credentials() auth.Credentials
Credentials returns t's credentials by value.
func (*Task) Deactivate ¶
func (t *Task) Deactivate()
Deactivate relinquishes the task's active address space.
func (*Task) DebugDumpState ¶
func (t *Task) DebugDumpState()
DebugDumpState logs task state at log level debug.
Preconditions: The caller must be running on the task goroutine.
func (*Task) DropBoundingCapability ¶
func (t *Task) DropBoundingCapability(cp linux.Capability) error
DropBoundingCapability attempts to drop capability cp from t's capability bounding set.
func (*Task) EndExternalStop ¶
func (t *Task) EndExternalStop()
EndExternalStop indicates the end of an external stop started by a previous call to Task.BeginExternalStop. EndExternalStop does not wait for t's task goroutine to resume.
func (*Task) Execve ¶
func (t *Task) Execve(newTC *TaskContext) (*SyscallControl, error)
Execve implements the execve(2) syscall by killing all other tasks in its thread group and switching to newTC. Execve always takes ownership of newTC.
Preconditions: The caller must be running Task.doSyscallInvoke on the task goroutine.
func (*Task) ExitState ¶
func (t *Task) ExitState() TaskExitState
ExitState returns t's current progress through the exit path.
func (*Task) ExitStatus ¶
func (t *Task) ExitStatus() ExitStatus
ExitStatus returns t's exit status, which is only guaranteed to be meaningful if t.ExitState() != TaskExitNone.
func (*Task) ExtractErrno ¶
ExtractErrno extracts an integer error number from the error. The syscall number is purely for context in the error case. Use -1 if syscall number is unknown.
func (*Task) ExtractTask ¶
ExtractTask implements TaskMaybe.ExtractTask.
func (*Task) FDMap ¶
FDMap returns t's FDMap.
Preconditions: The caller must be running on the task goroutine, or t.mu must be locked.
func (*Task) FSContext ¶
FSContext returns t's FSContext.
Preconditions: The caller must be running on the task goroutine, or t.mu must be locked.
func (*Task) Futex ¶
Futex returns t's futex manager.
Preconditions: The caller must be running on the task goroutine, or t.mu must be locked.
func (*Task) FutexWaiter ¶
FutexWaiter returns the Task's futex.Waiter.
func (*Task) HasCapability ¶
func (t *Task) HasCapability(cp linux.Capability) bool
HasCapability checks if the task has capability cp in its user namespace.
func (*Task) HasCapabilityIn ¶
func (t *Task) HasCapabilityIn(cp linux.Capability, ns *auth.UserNamespace) bool
HasCapabilityIn checks if the task has capability cp in user namespace ns.
func (*Task) IPCNamespace ¶
func (t *Task) IPCNamespace() *IPCNamespace
IPCNamespace returns the task's IPC namespace.
func (*Task) IovecsIOSequence ¶
func (t *Task) IovecsIOSequence(addr usermem.Addr, iovcnt int, opts usermem.IOOpts) (usermem.IOSequence, error)
IovecsIOSequence returns a usermem.IOSequence representing the array of iovcnt struct iovecs at addr in t's address space. opts applies to the returned IOSequence, not the reading of the struct iovec array.
IovecsIOSequence is analogous to Linux's lib/iov_iter.c:import_iovec().
Preconditions: As for Task.CopyInIovecs.
func (*Task) IsChrooted ¶
IsChrooted returns true if the root directory of t's FSContext is not the root directory of t's MountNamespace.
Preconditions: The caller must be running on the task goroutine, or t.mu must be locked.
func (*Task) IsNetworkNamespaced ¶
IsNetworkNamespaced returns true if t is in a non-root network namespace.
func (*Task) MaxRSS ¶
MaxRSS returns the maximum resident set size of the task in bytes. The which argument should be one of RUSAGE_SELF, RUSAGE_CHILDREN, RUSAGE_THREAD, or RUSAGE_BOTH. See getrusage(2) for documentation on the behavior of these flags.
func (*Task) MemoryManager ¶
func (t *Task) MemoryManager() *mm.MemoryManager
MemoryManager returns t's MemoryManager. MemoryManager does not take an additional reference on the returned MM.
Preconditions: The caller must be running on the task goroutine, or t.mu must be locked.
func (*Task) MountNamespace ¶
func (t *Task) MountNamespace() *fs.MountNamespace
MountNamespace returns t's MountNamespace. MountNamespace does not take an additional reference on the returned MountNamespace.
func (*Task) NetworkContext ¶
NetworkContext returns the network stack used by the task. NetworkContext may return nil if no network stack is available.
func (*Task) NumaPolicy ¶
NumaPolicy returns t's current numa policy.
func (*Task) OnSignalStack ¶
func (t *Task) OnSignalStack(s arch.SignalStack) bool
OnSignalStack returns true if, when the task resumes running, it will run on the task-private signal stack.
func (*Task) PIDNamespace ¶
func (t *Task) PIDNamespace() *PIDNamespace
PIDNamespace returns the PID namespace containing t.
func (*Task) ParentDeathSignal ¶
ParentDeathSignal returns t's parent death signal.
func (*Task) PendingSignals ¶
PendingSignals returns the set of pending signals.
func (*Task) PrepareExit ¶
func (t *Task) PrepareExit(es ExitStatus)
PrepareExit indicates an exit with status es.
Preconditions: The caller must be running on the task goroutine.
func (*Task) PrepareGroupExit ¶
func (t *Task) PrepareGroupExit(es ExitStatus)
PrepareGroupExit indicates a group exit with status es to t's thread group.
PrepareGroupExit is analogous to Linux's do_group_exit(), except that it does not tail-call do_exit() and that it *does* set Task.exitStatus. (Linux does not do so until within do_exit(), since it reuses exit_code for ptrace.)
Preconditions: The caller must be running on the task goroutine.
func (*Task) RSEQAvailable ¶
RSEQAvailable returns true if t supports restartable sequences.
func (*Task) RSEQCPUAddr ¶
RSEQCPUAddr returns the address that RSEQ will keep updated with t's CPU number.
Preconditions: The caller must be running on the task goroutine.
func (*Task) RSEQCriticalRegion ¶
func (t *Task) RSEQCriticalRegion() RSEQCriticalRegion
RSEQCriticalRegion returns a copy of t's thread group's current restartable sequence.
func (*Task) SeccompMode ¶
SeccompMode returns a SECCOMP_MODE_* constant indicating the task's current seccomp syscall filtering mode, appropriate for both prctl(PR_GET_SECCOMP) and /proc/[pid]/status.
func (*Task) SendGroupSignal ¶
func (t *Task) SendGroupSignal(info *arch.SignalInfo) error
SendGroupSignal sends the given signal to t's thread group.
func (*Task) SendSignal ¶
func (t *Task) SendSignal(info *arch.SignalInfo) error
SendSignal sends the given signal to t.
The following errors may be returned:
syserror.ESRCH - The task has exited. syserror.EINVAL - The signal is not valid. syserror.EAGAIN - The signal is realtime, and cannot be queued.
func (*Task) SetCPUMask ¶
SetCPUMask sets t's allowed CPU mask based on mask. It takes ownership of mask.
Preconditions: mask.Size() == sched.CPUSetSize(t.Kernel().ApplicationCores()).
func (*Task) SetCapabilitySets ¶
func (t *Task) SetCapabilitySets(permitted, inheritable, effective auth.CapabilitySet) error
SetCapabilitySets attempts to change t's permitted, inheritable, and effective capability sets.
func (*Task) SetClearTID ¶
SetClearTID sets t's cleartid.
Preconditions: The caller must be running on the task goroutine.
func (*Task) SetExtraGIDs ¶
SetExtraGIDs attempts to change t's supplemental groups. All IDs are interpreted as being in t's user namespace.
func (*Task) SetKeepCaps ¶
SetKeepCaps will set the keep capabilities flag PR_SET_KEEPCAPS.
func (*Task) SetNumaPolicy ¶
SetNumaPolicy sets t's numa policy.
func (*Task) SetParentDeathSignal ¶
SetParentDeathSignal sets t's parent death signal.
func (*Task) SetRSEQCPUAddr ¶
SetRSEQCPUAddr replaces the address that RSEQ will keep updated with t's CPU number.
Preconditions: t.RSEQAvailable() == true. The caller must be running on the task goroutine. t's AddressSpace must be active.
func (*Task) SetRSEQCriticalRegion ¶
func (t *Task) SetRSEQCriticalRegion(rscr RSEQCriticalRegion) error
SetRSEQCriticalRegion replaces t's thread group's restartable sequence.
Preconditions: t.RSEQAvailable() == true.
func (*Task) SetSavedSignalMask ¶
SetSavedSignalMask sets the saved signal mask (see Task.savedSignalMask's comment).
Preconditions: SetSavedSignalMask can only be called by the task goroutine.
func (*Task) SetSignalMask ¶
SetSignalMask sets t's signal mask.
Preconditions: SetSignalMask can only be called by the task goroutine. t.exitState < TaskExitZombie.
func (*Task) SetSignalStack ¶
func (t *Task) SetSignalStack(alt arch.SignalStack) error
SetSignalStack sets the task-private signal stack and clears the SignalStackFlagDisable, since we have a signal stack.
func (*Task) SetSyscallRestartBlock ¶
func (t *Task) SetSyscallRestartBlock(r SyscallRestartBlock)
SetSyscallRestartBlock sets the restart block for use in restart_syscall(2). After registering a restart block, a syscall should return ERESTART_RESTARTBLOCK to request a restart using the block.
Precondition: The caller must be running on the task goroutine.
func (*Task) SetUserNamespace ¶
func (t *Task) SetUserNamespace(ns *auth.UserNamespace) error
SetUserNamespace attempts to move t into ns.
func (*Task) SignalMask ¶
SignalMask returns a copy of t's signal mask.
func (*Task) SignalReturn ¶
func (t *Task) SignalReturn(rt bool) (*SyscallControl, error)
SignalReturn implements sigreturn(2) (if rt is false) or rt_sigreturn(2) (if rt is true).
func (*Task) SignalStack ¶
func (t *Task) SignalStack() arch.SignalStack
SignalStack returns the task-private signal stack.
func (*Task) SingleIOSequence ¶
func (t *Task) SingleIOSequence(addr usermem.Addr, length int, opts usermem.IOOpts) (usermem.IOSequence, error)
SingleIOSequence returns a usermem.IOSequence representing [addr, addr+length) in t's address space. If length exceeds _MAX_RW_COUNT, it is silently truncated.
SingleIOSequence is analogous to Linux's lib/iov_iter.c:import_single_range(). (Note that the non-vectorized read and write syscalls in Linux do not use import_single_range(), but are still truncated to _MAX_RW_COUNT by fs/read_write.c:rw_verify_area().)
func (*Task) SleepFinish ¶
SleepFinish implements amutex.Sleeper.SleepFinish.
func (*Task) SleepStart ¶
func (t *Task) SleepStart() <-chan struct{}
SleepStart implements amutex.Sleeper.SleepStart.
func (*Task) Stack ¶
Stack returns the userspace stack.
Preconditions: The caller must be running on the task goroutine, or t.mu must be locked.
func (*Task) Start ¶
Start starts the task goroutine. Start must be called exactly once for each task returned by NewTask.
'tid' must be the task's TID in the root PID namespace and it's used for debugging purposes only (set as parameter to Task.run to make it visible in stack dumps).
func (*Task) StateStatus ¶
StateStatus returns a string representation of the task's current state, appropriate for /proc/[pid]/status.
func (*Task) SyscallRestartBlock ¶
func (t *Task) SyscallRestartBlock() SyscallRestartBlock
SyscallRestartBlock returns the currently registered restart block for use in restart_syscall(2). This function is *not* idempotent and may be called at most once per syscall. This function must not be called if a restart block has not been registered for the current syscall.
Precondition: The caller must be running on the task goroutine.
func (*Task) SyscallTable ¶
func (t *Task) SyscallTable() *SyscallTable
SyscallTable returns t's syscall table.
Preconditions: The caller must be running on the task goroutine, or t.mu must be locked.
func (*Task) TakeSignal ¶
func (t *Task) TakeSignal(mask linux.SignalSet) *arch.SignalInfo
TakeSignal returns a pending signal not blocked by mask. Signal handlers are not affected. If there are no pending signals not blocked by mask, TakeSignal returns a nil SignalInfo.
func (*Task) TaskContext ¶
func (t *Task) TaskContext() *TaskContext
TaskContext returns t's TaskContext.
Precondition: The caller must be running on the task goroutine, or t.mu must be locked.
func (*Task) TaskGoroutineSchedInfo ¶
func (t *Task) TaskGoroutineSchedInfo() TaskGoroutineSchedInfo
TaskGoroutineSchedInfo returns a copy of t's task goroutine scheduling info. Most clients should use t.CPUStats() instead.
func (*Task) TaskResources ¶
func (t *Task) TaskResources() *TaskResources
TaskResources returns t's TaskResources.
Precondition: The caller must be running on the task goroutine, or t.mu must be locked.
func (*Task) ThreadGroup ¶
func (t *Task) ThreadGroup() *ThreadGroup
ThreadGroup returns the thread group containing t.
func (*Task) ThreadID ¶
ThreadID returns t's thread ID in its own PID namespace. If the task is dead, ThreadID returns 0.
func (*Task) Timekeeper ¶
func (t *Task) Timekeeper() *Timekeeper
Timekeeper returns the system Timekeeper.
func (*Task) UTSNamespace ¶
func (t *Task) UTSNamespace() *UTSNamespace
UTSNamespace returns the task's UTS namespace.
func (*Task) UninterruptibleSleepFinish ¶
UninterruptibleSleepFinish implements context.Context.UninterruptibleSleepFinish.
func (*Task) UninterruptibleSleepStart ¶
UninterruptibleSleepStart implements context.Context.UninterruptibleSleepStart.
func (*Task) Unshare ¶
func (t *Task) Unshare(opts *SharingOptions) error
Unshare changes the set of resources t shares with other tasks, as specified by opts.
Preconditions: The caller must be running on the task goroutine.
func (*Task) UserCPUClock ¶
UserCPUClock returns a clock measuring the CPU time the task has spent executing application code.
func (*Task) UserNamespace ¶
func (t *Task) UserNamespace() *auth.UserNamespace
UserNamespace returns the user namespace associated with the task.
func (*Task) Value ¶
func (t *Task) Value(key interface{}) interface{}
Value implements context.Context.Value.
func (*Task) Wait ¶
func (t *Task) Wait(opts *WaitOptions) (*WaitResult, error)
Wait waits for an event from a thread group that is a child of t's thread group, or a task in such a thread group, or a task that is ptraced by t, subject to the options specified in opts.
func (*Task) WithMuLocked ¶
WithMuLocked executes f with t.mu locked.
type TaskConfig ¶
type TaskConfig struct { // Kernel is the owning Kernel. *Kernel // Parent is the new task's parent. Parent may be nil. Parent *Task // ThreadGroup is the ThreadGroup the new task belongs to. *ThreadGroup // TaskContext is the TaskContext of the new task. *TaskContext // TaskResources is the TaskResources of the new task. *TaskResources // Credentials is the Credentials of the new task. Credentials *auth.Credentials // Niceness is the niceness of the new task. Niceness int // If NetworkNamespaced is true, the new task should observe a non-root // network namespace. NetworkNamespaced bool // AllowedCPUMask contains the cpus that this task can run on. AllowedCPUMask sched.CPUSet // UTSNamespace is the UTSNamespace of the new task. UTSNamespace *UTSNamespace // IPCNamespace is the IPCNamespace of the new task. IPCNamespace *IPCNamespace }
TaskConfig defines the configuration of a new Task (see below).
type TaskContext ¶
type TaskContext struct { // Name is the thread name set by the prctl(PR_SET_NAME) system call. Name string // Arch is the architecture-specific context (registers, etc.) Arch arch.Context // MemoryManager is the task's address space. MemoryManager *mm.MemoryManager // contains filtered or unexported fields }
TaskContext is the subset of a task's data that is provided by the loader.
func (*TaskContext) Fork ¶
func (tc *TaskContext) Fork(ctx context.Context, shareAddressSpace bool) (*TaskContext, error)
Fork returns a duplicate of tc. The copied TaskContext always has an independent arch.Context. If shareAddressSpace is true, the copied TaskContext shares an address space with the original; otherwise, the copied TaskContext has an independent address space that is initially a duplicate of the original's.
type TaskExitState ¶
type TaskExitState int
TaskExitState represents a step in the task exit path.
"Exiting" and "exited" are often ambiguous; prefer to name specific states.
const ( // TaskExitNone indicates that the task has not begun exiting. TaskExitNone TaskExitState = iota // TaskExitInitiated indicates that the task goroutine has entered the exit // path, and the task is no longer eligible to participate in group stops // or group signal handling. TaskExitInitiated is analogous to Linux's // PF_EXITING. TaskExitInitiated // TaskExitZombie indicates that the task has released its resources, and // the task no longer prevents a sibling thread from completing execve. TaskExitZombie // TaskExitDead indicates that the task's thread IDs have been released, // and the task no longer prevents its thread group leader from being // reaped. ("Reaping" refers to the transitioning of a task from // TaskExitZombie to TaskExitDead.) TaskExitDead )
func (TaskExitState) String ¶
func (t TaskExitState) String() string
String implements fmt.Stringer.
type TaskGoroutineSchedInfo ¶
type TaskGoroutineSchedInfo struct { // Timestamp was the value of Kernel.cpuClock when this // TaskGoroutineSchedInfo was last updated. Timestamp uint64 // State is the current state of the task goroutine. State TaskGoroutineState // UserTicks is the amount of time the task goroutine has spent executing // its associated Task's application code, in units of linux.ClockTick. UserTicks uint64 // SysTicks is the amount of time the task goroutine has spent executing in // the sentry, in units of linux.ClockTick. SysTicks uint64 }
TaskGoroutineSchedInfo contains task goroutine scheduling state which must be read and updated atomically.
type TaskGoroutineState ¶
type TaskGoroutineState int
TaskGoroutineState is a coarse representation of the current execution status of a kernel.Task goroutine.
const ( // TaskGoroutineNonexistent indicates that the task goroutine has either // not yet been created by Task.Start() or has returned from Task.run(). // This must be the zero value for TaskGoroutineState. TaskGoroutineNonexistent TaskGoroutineState = iota // TaskGoroutineRunningSys indicates that the task goroutine is executing // sentry code. TaskGoroutineRunningSys // TaskGoroutineRunningApp indicates that the task goroutine is executing // application code. TaskGoroutineRunningApp // TaskGoroutineBlockedInterruptible indicates that the task goroutine is // blocked in Task.block(), and hence may be woken by Task.interrupt() // (e.g. due to signal delivery). TaskGoroutineBlockedInterruptible // TaskGoroutineBlockedUninterruptible indicates that the task goroutine is // stopped outside of Task.block() and Task.doStop(), and hence cannot be // woken by Task.interrupt(). TaskGoroutineBlockedUninterruptible // TaskGoroutineStopped indicates that the task goroutine is blocked in // Task.doStop(). TaskGoroutineStopped is similar to // TaskGoroutineBlockedUninterruptible, but is a separate state to make it // possible to determine when Task.stop is meaningful. TaskGoroutineStopped )
type TaskMaybe ¶
type TaskMaybe interface { // ExtractTask returns the Task. ExtractTask() *Task }
TaskMaybe is the interface for extracting Tasks out of things which may be or contain Task objects.
type TaskResources ¶
type TaskResources struct { // SignalMask is the set of signals whose delivery is currently blocked. // // FIXME: Determine if we also need RealSignalMask SignalMask linux.SignalSet // FSContext is the filesystem context. *FSContext // FDMap provides access to files to the task. *FDMap // Tracks abstract sockets that are in use. AbstractSockets *AbstractSocketNamespace }
TaskResources is the subset of a task's data provided by its creator that is not provided by the loader.
func (*TaskResources) Fork ¶
func (tr *TaskResources) Fork(shareFiles bool, shareFSContext bool) *TaskResources
Fork returns a duplicate of tr.
FIXME: Preconditions: When tr is owned by a Task, that task's signal mutex must be locked, or Fork must be called by the task's goroutine.
type TaskSet ¶
type TaskSet struct { // Root is the root PID namespace, in which all tasks in the TaskSet are // visible. The Root pointer is immutable. Root *PIDNamespace // contains filtered or unexported fields }
A TaskSet comprises all tasks in a system.
func (*TaskSet) BeginExternalStop ¶
func (ts *TaskSet) BeginExternalStop()
BeginExternalStop indicates the start of an external stop that applies to all current and future tasks in ts. BeginExternalStop does not wait for task goroutines to stop.
func (*TaskSet) EndExternalStop ¶
func (ts *TaskSet) EndExternalStop()
EndExternalStop indicates the end of an external stop started by a previous call to TaskSet.BeginExternalStop. EndExternalStop does not wait for task goroutines to resume.
func (*TaskSet) Kill ¶
func (ts *TaskSet) Kill(es ExitStatus)
Kill requests that all tasks in ts exit as if group exiting with status es. Kill does not wait for tasks to exit.
Kill has no analogue in Linux; it's provided for save/restore only.
func (*TaskSet) NewTask ¶
func (ts *TaskSet) NewTask(cfg *TaskConfig) (*Task, error)
NewTask creates a new task defined by TaskConfig. Whether or not NewTask is successful, it takes ownership of both TaskContext and TaskResources of the TaskConfig.
NewTask does not start the returned task; the caller must call Task.Start.
type TaskStop ¶
type TaskStop interface { // Killable returns true if Task.Kill should end the stop prematurely. // Killable is analogous to Linux's TASK_WAKEKILL. Killable() bool }
A TaskStop is a condition visible to the task control flow graph that prevents a task goroutine from running or exiting, i.e. an internal stop.
NOTE: Most TaskStops don't contain any data; they're distinguished by their type. The obvious way to implement such a TaskStop is:
type groupStop struct{} func (groupStop) Killable() bool { return true } ... t.beginInternalStop(groupStop{})
However, this doesn't work because the state package can't serialize values, only pointers. Furthermore, the correctness of save/restore depends on the ability to pass a TaskStop to endInternalStop that will compare equal to the TaskStop that was passed to beginInternalStop, even if a save/restore cycle occurred between the two. As a result, the current idiom is to always use a typecast nil for data-free TaskStops:
type groupStop struct{} func (*groupStop) Killable() bool { return true } ... t.beginInternalStop((*groupStop)(nil))
This is pretty gross, but the alternatives seem grosser.
type ThreadGroup ¶
type ThreadGroup struct {
// contains filtered or unexported fields
}
A ThreadGroup is a logical grouping of tasks that has widespread significance to other kernel features (e.g. signal handling). ("Thread groups" are usually called "processes" in userspace documentation.)
ThreadGroup is a superset of Linux's struct signal_struct.
func NewThreadGroup ¶
func NewThreadGroup(ns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet, monotonicClock *timekeeperClock) *ThreadGroup
NewThreadGroup returns a new, empty thread group in PID namespace ns. The thread group leader will send its parent terminationSignal when it exits. The new thread group isn't visible to the system until a task has been created inside of it by a successful call to TaskSet.NewTask.
func (*ThreadGroup) CPUClock ¶
func (tg *ThreadGroup) CPUClock() ktime.Clock
CPUClock returns a ktime.Clock that measures the time that a thread group has spent executing, including sentry time.
func (*ThreadGroup) CPUStats ¶
func (tg *ThreadGroup) CPUStats() usage.CPUStats
CPUStats returns the combined CPU usage statistics of all past and present threads in tg.
func (*ThreadGroup) Count ¶
func (tg *ThreadGroup) Count() int
Count returns the number of non-exited threads in the group.
func (*ThreadGroup) CreateProcessGroup ¶
func (tg *ThreadGroup) CreateProcessGroup() error
CreateProcessGroup creates a new process group.
An EPERM error will be returned if the ThreadGroup belongs to a different Session, is a Session leader or the group already exists.
func (*ThreadGroup) CreateSession ¶
func (tg *ThreadGroup) CreateSession() error
CreateSession creates a new Session, with the ThreadGroup as the leader.
EPERM may be returned if either the given ThreadGroup is already a Session leader, or a ProcessGroup already exists for the ThreadGroup's ID.
func (*ThreadGroup) ExitStatus ¶
func (tg *ThreadGroup) ExitStatus() ExitStatus
ExitStatus returns the exit status that would be returned by a consuming wait*() on tg.
func (*ThreadGroup) ID ¶
func (tg *ThreadGroup) ID() ThreadID
ID returns tg's leader's thread ID in its own PID namespace. If tg's leader is dead, ID returns 0.
func (*ThreadGroup) IOUsage ¶
func (tg *ThreadGroup) IOUsage() *usage.IO
IOUsage returns the total io usage of all dead and live threads in the group.
func (*ThreadGroup) JoinProcessGroup ¶
func (tg *ThreadGroup) JoinProcessGroup(pidns *PIDNamespace, pgid ProcessGroupID, checkExec bool) error
JoinProcessGroup joins an existing process group.
This function will return EACCES if an exec has been performed since fork by the given ThreadGroup, and EPERM if the Sessions are not the same or the group does not exist.
If checkExec is set, then the join is not permitted after the process has executed exec at least once.
func (*ThreadGroup) JoinedChildCPUStats ¶
func (tg *ThreadGroup) JoinedChildCPUStats() usage.CPUStats
JoinedChildCPUStats implements the semantics of RUSAGE_CHILDREN: "Return resource usage statistics for all children of [tg] that have terminated and been waited for. These statistics will include the resources used by grandchildren, and further removed descendants, if all of the intervening descendants waited on their terminated children."
func (*ThreadGroup) Limits ¶
func (tg *ThreadGroup) Limits() *limits.LimitSet
Limits returns tg's limits.
func (*ThreadGroup) MemberIDs ¶
func (tg *ThreadGroup) MemberIDs(pidns *PIDNamespace) []ThreadID
MemberIDs returns a snapshot of the ThreadIDs (in PID namespace pidns) for all tasks in tg.
func (*ThreadGroup) PIDNamespace ¶
func (tg *ThreadGroup) PIDNamespace() *PIDNamespace
PIDNamespace returns the PID namespace containing tg.
func (*ThreadGroup) ProcessGroup ¶
func (tg *ThreadGroup) ProcessGroup() *ProcessGroup
ProcessGroup returns the ThreadGroup's ProcessGroup.
A reference is not taken on the process group.
func (*ThreadGroup) SendSignal ¶
func (tg *ThreadGroup) SendSignal(info *arch.SignalInfo) error
SendSignal sends the given signal to tg, using tg's leader to determine if the signal is blocked.
func (*ThreadGroup) SendTimerSignal ¶
func (tg *ThreadGroup) SendTimerSignal(info *arch.SignalInfo, includeSys bool) error
SendTimerSignal mimics the process timer signal delivery behavior in Linux: signals are delivered to the thread that triggers the timer expiration (see kernel/time/posix-cpu-timers.c:check_process_timers()). This means:
- the thread is running on cpu at the time.
- a thread that runs more frequently will get more of those signals.
We approximate this behavior by selecting a running task in a round-robin fashion. Statistically, a thread running more often should have a higher probability to be selected.
func (*ThreadGroup) Session ¶
func (tg *ThreadGroup) Session() *Session
Session returns the ThreadGroup's Session.
A reference is not taken on the session.
func (*ThreadGroup) SetCPUTimer ¶
func (tg *ThreadGroup) SetCPUTimer(l *limits.Limit)
SetCPUTimer is used by setrlimit(RLIMIT_CPU) to enforce the hard and soft limits on CPU time used by this process.
func (*ThreadGroup) SetSignalAct ¶
func (tg *ThreadGroup) SetSignalAct(sig linux.Signal, actptr *arch.SignalAct) (arch.SignalAct, error)
SetSignalAct atomically sets the thread group's signal action for signal sig to *actptr (if actptr is not nil) and returns the old signal action.
func (*ThreadGroup) SignalHandlers ¶
func (tg *ThreadGroup) SignalHandlers() *SignalHandlers
SignalHandlers returns the signal handlers used by tg.
Preconditions: The caller must provide the synchronization required to read tg.signalHandlers, as described in the field's comment.
func (*ThreadGroup) TaskSet ¶
func (tg *ThreadGroup) TaskSet() *TaskSet
TaskSet returns the TaskSet containing tg.
func (*ThreadGroup) TerminationSignal ¶
func (tg *ThreadGroup) TerminationSignal() linux.Signal
TerminationSignal returns the thread group's termination signal.
func (*ThreadGroup) UserCPUClock ¶
func (tg *ThreadGroup) UserCPUClock() ktime.Clock
UserCPUClock returns a ktime.Clock that measures the time that a thread group has spent executing.
func (*ThreadGroup) WaitExited ¶
func (tg *ThreadGroup) WaitExited()
WaitExited blocks until all task goroutines in tg have exited.
WaitExited does not correspond to anything in Linux; it's provided so that external callers of Kernel.CreateProcess can wait for the created thread group to terminate.
type ThreadID ¶
type ThreadID int32
ThreadID is a generic thread identifier.
const InitTID ThreadID = 1
InitTID is the TID given to the first task added to each PID namespace. The thread group led by InitTID is called the namespace's init process. The death of a PID namespace's init process causes all tasks visible in that namespace to be killed.
type Timekeeper ¶
type Timekeeper struct {
// contains filtered or unexported fields
}
Timekeeper manages all of the kernel clocks.
func NewTimekeeper ¶
NewTimekeeper returns a Timekeeper that is automatically kept up-to-date. NewTimekeeper does not take ownership of paramPage.
SetClocks must be called on the returned Timekeeper before it is usable.
func (*Timekeeper) BootTime ¶
func (t *Timekeeper) BootTime() ktime.Time
BootTime returns the system boot real time.
func (*Timekeeper) Destroy ¶
func (t *Timekeeper) Destroy()
Destroy destroys the Timekeeper, freeing all associated resources.
func (*Timekeeper) GetTime ¶
func (t *Timekeeper) GetTime(c sentrytime.ClockID) (int64, error)
GetTime returns the current time in nanoseconds.
func (*Timekeeper) PauseUpdates ¶
func (t *Timekeeper) PauseUpdates()
PauseUpdates stops clock parameter updates. This should only be used when Tasks are not running and thus cannot access the clock.
func (*Timekeeper) ResumeUpdates ¶
func (t *Timekeeper) ResumeUpdates()
ResumeUpdates restarts clock parameter updates stopped by PauseUpdates.
func (*Timekeeper) SetClocks ¶
func (t *Timekeeper) SetClocks(c sentrytime.Clocks)
SetClocks sets the backing clock source.
SetClocks must be called before the Timekeeper is used, and it may not be called more than once, as changing the clock source without extra correction could cause time discontinuities.
It must also be called after Load.
type TimerManager ¶
type TimerManager struct { RealTimer *ktime.Timer VirtualTimer *ktime.Timer ProfTimer *ktime.Timer SoftLimitTimer *ktime.Timer HardLimitTimer *ktime.Timer // contains filtered or unexported fields }
TimerManager is a collection of supported process cpu timers.
type UTSNamespace ¶
type UTSNamespace struct {
// contains filtered or unexported fields
}
UTSNamespace represents a UTS namespace, a holder of two system identifiers: the hostname and domain name.
func NewUTSNamespace ¶
func NewUTSNamespace(hostName, domainName string, userns *auth.UserNamespace) *UTSNamespace
NewUTSNamespace creates a new UTS namespace.
func UTSNamespaceFromContext ¶
func UTSNamespaceFromContext(ctx context.Context) *UTSNamespace
UTSNamespaceFromContext returns the UTS namespace in which ctx is executing, or nil if there is no such UTS namespace.
func (*UTSNamespace) Clone ¶
func (u *UTSNamespace) Clone(userns *auth.UserNamespace) *UTSNamespace
Clone makes a copy of this UTS namespace, associating the given user namespace.
func (*UTSNamespace) DomainName ¶
func (u *UTSNamespace) DomainName() string
DomainName returns the domain name of this UTS namespace.
func (*UTSNamespace) HostName ¶
func (u *UTSNamespace) HostName() string
HostName returns the host name of this UTS namespace.
func (*UTSNamespace) SetDomainName ¶
func (u *UTSNamespace) SetDomainName(domain string)
SetDomainName sets the domain name of this UTS namespace.
func (*UTSNamespace) SetHostName ¶
func (u *UTSNamespace) SetHostName(host string)
SetHostName sets the host name of this UTS namespace.
func (*UTSNamespace) UserNamespace ¶
func (u *UTSNamespace) UserNamespace() *auth.UserNamespace
UserNamespace returns the user namespace associated with this UTS namespace.
type VDSOParamPage ¶
type VDSOParamPage struct {
// contains filtered or unexported fields
}
VDSOParamPage manages a VDSO parameter page.
Its memory layout looks like:
type page struct { // seq is a sequence counter that protects the fields below. seq uint64 vdsoParams }
Everything in the struct is 8 bytes for easy alignment.
It must be kept in sync with params in vdso/vdso_time.cc.
func NewVDSOParamPage ¶
func NewVDSOParamPage(platform platform.Platform, fr platform.FileRange) *VDSOParamPage
NewVDSOParamPage returns a VDSOParamPage.
Preconditions:
- fr is a single page allocated from platform.Memory(). VDSOParamPage does not take ownership of fr; it must remain allocated for the lifetime of the VDSOParamPage.
- VDSOParamPage must be the only writer to fr.
- platform.Memory().MapInternal(fr) must return a single safemem.Block.
func (*VDSOParamPage) Write ¶
func (v *VDSOParamPage) Write(f func() vdsoParams) error
Write updates the VDSO parameters.
Write starts a write block, calls f to get the new parameters, writes out the new parameters, then ends the write block.
type Version ¶
type Version struct { // Operating system name (e.g. "Linux"). Sysname string // Operating system release (e.g. "3.11.10-amd64"). Release string // Operating system version. On Linux this takes the shape // "#VERSION CONFIG_FLAGS TIMESTAMP" // where: // - VERSION is a sequence counter incremented on every successful build // - CONFIG_FLAGS is a space-separated list of major enabled kernel features // (e.g. "SMP" and "PREEMPT") // - TIMESTAMP is the build timestamp as returned by `date` Version string }
Version defines the application-visible system version.
type WaitOptions ¶
type WaitOptions struct { // If SpecificTID is non-zero, only events from the task with thread ID // SpecificTID are eligible to be waited for. SpecificTID is resolved in // the PID namespace of the waiter (the method receiver of Task.Wait). If // no such task exists, or that task would not otherwise be eligible to be // waited for by the waiting task, then there are no waitable tasks and // Wait will return ECHILD. SpecificTID ThreadID // If SpecificPGID is non-zero, only events from ThreadGroups with a // matching ProcessGroupID are eligible to be waited for. (Same // constraints as SpecificTID apply.) SpecificPGID ProcessGroupID // If NonCloneTasks is true, events from non-clone tasks are eligible to be // waited for. NonCloneTasks bool // If CloneTasks is true, events from clone tasks are eligible to be waited // for. CloneTasks bool // Events is a bitwise combination of the events defined above that specify // what events are of interest to the call to Wait. Events waiter.EventMask // If ConsumeEvent is true, the Wait should consume the event such that it // cannot be returned by a future Wait. Note that if a task exit is // consumed in this way, in most cases the task will be reaped. ConsumeEvent bool // If BlockInterruptErr is not nil, Wait will block until either an event // is available or there are no tasks that could produce a waitable event; // if that blocking is interrupted, Wait returns BlockInterruptErr. If // BlockInterruptErr is nil, Wait will not block. BlockInterruptErr error }
WaitOptions controls the behavior of Task.Wait.
type WaitResult ¶
type WaitResult struct { // Task is the task that reported the event. Task *Task // TID is the thread ID of Task in the PID namespace of the task that // called Wait (that is, the method receiver of the call to Task.Wait). TID // is provided because consuming exit waits cause the thread ID to be // deallocated. TID ThreadID // UID is the real UID of Task in the user namespace of the task that // called Wait. UID auth.UID // Event is exactly one of the events defined above. Event waiter.EventMask // Status is the numeric status associated with the event. Status uint32 }
WaitResult contains information about a waited-for event.
Source Files ¶
- abstract_socket_namespace.go
- context.go
- fd_map.go
- fs_context.go
- ipc_namespace.go
- kernel.go
- pending_signals.go
- ptrace.go
- rseq.go
- seccomp.go
- sessions.go
- signal.go
- signal_handlers.go
- syscalls.go
- syscalls_state.go
- syslog.go
- task.go
- task_acct.go
- task_block.go
- task_clone.go
- task_context.go
- task_exec.go
- task_exit.go
- task_identity.go
- task_log.go
- task_net.go
- task_resources.go
- task_run.go
- task_sched.go
- task_signals.go
- task_start.go
- task_stop.go
- task_syscall.go
- task_usermem.go
- thread_group.go
- threads.go
- timekeeper.go
- timekeeper_state.go
- timer.go
- uts_namespace.go
- vdso.go
- version.go
Directories ¶
Path | Synopsis |
---|---|
Package auth implements an access control model that is a subset of Linux's.
|
Package auth implements an access control model that is a subset of Linux's. |
Package epoll provides an implementation of Linux's IO event notification facility.
|
Package epoll provides an implementation of Linux's IO event notification facility. |
Package eventfd provides an implementation of Linux's file-based event notification.
|
Package eventfd provides an implementation of Linux's file-based event notification. |
Package futex provides an implementation of the futex interface as found in the Linux kernel.
|
Package futex provides an implementation of the futex interface as found in the Linux kernel. |
Package kdefs defines common kernel definitions.
|
Package kdefs defines common kernel definitions. |
Package memevent implements the memory usage events controller, which periodically emits events via the eventchannel.
|
Package memevent implements the memory usage events controller, which periodically emits events via the eventchannel. |
Package pipe provides an in-memory implementation of a unidirectional pipe.
|
Package pipe provides an in-memory implementation of a unidirectional pipe. |
Package sched implements scheduler related features.
|
Package sched implements scheduler related features. |
Package semaphore implements System V semaphores.
|
Package semaphore implements System V semaphores. |
Package time defines the Timer type, which provides a periodic timer that works by sampling a user-provided clock.
|
Package time defines the Timer type, which provides a periodic timer that works by sampling a user-provided clock. |