dmesg

package
v0.1.6 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Nov 13, 2024 License: Apache-2.0 Imports: 31 Imported by: 0

Documentation

Overview

Package dmesg scans and watches dmesg outputs for errors, as specified in the configuration (e.g., regex match NVIDIA GPU errors).

Index

Constants

View Source
// EventNameDmesgMatched is the name of the "dmesg matched" event.
const EventNameDmesgMatched = "dmesg_matched"

// Field keys that belong to the EventNameDmesgMatched event.
//
// NOTE(review): presumably these are the keys of the map accepted by
// ParseEventDmesgMatched; confirm against the implementation.
const (
	EventKeyDmesgMatchedUnixSeconds = "unix_seconds"
	EventKeyDmesgMatchedLine        = "line"
	EventKeyDmesgMatchedFilter      = "filter"
	EventKeyDmesgMatchedError       = "error"
)
View Source
// StateNameDmesg is the name of the dmesg state.
const StateNameDmesg = "dmesg"

// Field keys that belong to the StateNameDmesg state.
//
// NOTE(review): presumably these are the keys read by ParseStateDmesg;
// confirm against the implementation.
const (
	StateKeyDmesgFile           = "file"
	StateKeyDmesgLastSeekOffset = "offset"
	StateKeyDmesgLastSeekWhence = "whence"
)

// StateNameDmesgTailScanMatched is the name of the "dmesg tail matched" state.
const StateNameDmesgTailScanMatched = "dmesg_tail_matched"

// Field keys that belong to the StateNameDmesgTailScanMatched state.
//
// NOTE(review): presumably these are the keys of the map accepted by
// ParseStateDmesgTailScanMatched; confirm against the implementation.
const (
	StateKeyDmesgTailScanMatchedUnixSeconds = "unix_seconds"
	StateKeyDmesgTailScanMatchedLine        = "line"
	StateKeyDmesgTailScanMatchedFilter      = "filter"
	StateKeyDmesgTailScanMatchedError       = "error"
)
View Source
// EventOOMKill and its regular expression.
//
// Example line:
//
//	Out of memory: Killed process 123, UID 48, (httpd).
const (
	EventOOMKill      = "oom_kill"
	EventOOMKillRegex = `Out of memory:`
)

// EventOOMKillConstraint and its regular expression.
//
// Example line:
//
//	oom-kill:constraint=CONSTRAINT_MEMCG,nodemask=(null),
const (
	EventOOMKillConstraint      = "oom_kill_constraint"
	EventOOMKillConstraintRegex = `oom-kill:constraint=`
)

// EventOOMKiller and its regular expression (case-insensitive, matching
// either "invoked oom-killer" or "triggered oom-killer").
//
// Example line:
//
//	postgres invoked oom-killer: gfp_mask=0x201d2, order=0, oomkilladj=0
const (
	EventOOMKiller      = "oom_killer"
	EventOOMKillerRegex = `(?i)\b(invoked|triggered) oom-killer\b`
)

// EventOOMCgroup and its regular expression.
//
// Example line:
//
//	Memory cgroup out of memory: Killed process 123, UID 48, (httpd).
const (
	EventOOMCgroup      = "oom_cgroup"
	EventOOMCgroupRegex = `Memory cgroup out of memory`
)
View Source
// EventNvidiaNVRMXid is the event name for NVRM Xid messages.
//
// Example lines:
//
//	[...] NVRM: Xid (0000:03:00): 14, Channel 00000001
//	[...] NVRM: Xid (PCI:0000:05:00): 79, pid='<unknown>', name=<unknown>, GPU has fallen off the bus.
//	NVRM: Xid (PCI:0000:01:00): 79, GPU has fallen off the bus.
//
// Reference:
// https://docs.nvidia.com/deploy/pdf/XID_Errors.pdf
const EventNvidiaNVRMXid = "nvidia_nvrm_xid"

// EventNvidiaNVSwitchSXid is the event name for NVSwitch SXid messages.
//
// Example lines:
//
//	[111111111.111] nvidia-nvswitch3: SXid (PCI:0000:05:00.0): 12028, Non-fatal, Link 32 egress non-posted PRIV error (First)
//	[131453.740743] nvidia-nvswitch0: SXid (PCI:0000:00:00.0): 20034, Fatal, Link 30 LTSSM Fault Up
//
// Reference: "D.4 Non-Fatal NVSwitch SXid Errors" in
// https://docs.nvidia.com/datacenter/tesla/pdf/fabric-manager-user-guide.pdf
const EventNvidiaNVSwitchSXid = "nvidia_nvswitch_sxid"

// EventNvidiaPeermemInvalidContext is the event name for nvidia-peermem
// "invalid context" errors. Repeated messages may indicate a more persistent
// issue with inter-GPU communication.
//
// Example lines:
//
//	[Thu Sep 19 02:29:46 2024] nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing
//	[Thu Sep 19 02:29:46 2024] nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing
//	[Thu Sep 19 02:29:46 2024] nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing
const EventNvidiaPeermemInvalidContext = "nvidia_peermem_invalid_context"

// EventNvidiaNCCLSegfaultInLibnccl is the event name for segfaults reported
// inside libnccl. Repeated messages may indicate GPU communication issues,
// which may be caused by fabric manager issues.
//
// Example line:
//
//	[Thu Oct 10 03:06:53 2024] pt_main_thread[2536443]: segfault at 7f797fe00000 ip 00007f7c7ac69996 sp 00007f7c12fd7c30 error 4 in libnccl.so.2[7f7c7ac00000+d3d3000]
const EventNvidiaNCCLSegfaultInLibnccl = "nvidia_nccl_segfault_in_libnccl"
View Source
// DefaultDmesgFile is the default dmesg file path, "/var/log/dmesg".
const DefaultDmesgFile = "/var/log/dmesg"
View Source
// Name is the string "dmesg".
// NOTE(review): presumably the component name returned by (*Component).Name — confirm.
const Name = "dmesg"

Variables

This section is empty.

Functions

func DefaultDmesgFiltersForNvidia

func DefaultDmesgFiltersForNvidia() []*query_log_common.Filter

func DefaultLogFilters

func DefaultLogFilters(ctx context.Context) ([]*query_log_common.Filter, error)

func DmesgExists

func DmesgExists() bool

func GetDefaultLogPoller

func GetDefaultLogPoller() query_log.Poller

func New

func ParseEventDmesgMatched

func ParseEventDmesgMatched(m map[string]string) (query_log.Item, error)

func ParseStateDmesg

func ParseStateDmesg(s *State, m map[string]string) error

func ParseStateDmesgTailScanMatched

func ParseStateDmesgTailScanMatched(m map[string]string) (query_log.Item, error)

Types

type Component

// Component is the dmesg component. Its exported methods are Close, Events,
// Metrics, Name, States, and TailScan.
type Component struct {
	// contains filtered or unexported fields
}

func (*Component) Close

func (c *Component) Close() error

func (*Component) Events

func (c *Component) Events(ctx context.Context, since time.Time) ([]components.Event, error)

Events returns the real-time events from the dmesg log poller. It returns `github.com/leptonai/gpud/components/query.ErrNoData` if no event is found.

func (*Component) Metrics

func (c *Component) Metrics(ctx context.Context, since time.Time) ([]components.Metric, error)

func (*Component) Name

func (c *Component) Name() string

func (*Component) States

func (c *Component) States(ctx context.Context) ([]components.State, error)

The dmesg component fetches the latest state from the dmesg tail scanner rather than querying the log poller, which watches the real-time dmesg streaming output. This is because the tail scanner is cheaper and can read historical logs in case the dmesg log watcher has restarted. It is more important that dmesg state calls do NOT miss any logs than that the logs are available in real time. The real-time dmesg events can be fetched via the events API.

func (*Component) TailScan added in v0.1.6

func (c *Component) TailScan() (*State, error)

type Config

// Config is the configuration type of this package. Its single field, Log,
// holds a query_log_config.Config and is serialized under the "log" JSON key.
type Config struct {
	Log query_log_config.Config `json:"log"`
}

func DefaultConfig

func DefaultConfig(ctx context.Context) (Config, error)

func ParseConfig

func ParseConfig(b any, db *sql.DB) (*Config, error)

func (Config) Validate

func (cfg Config) Validate() error

type Event

// Event is the value returned by ParseEventJSON and ParseEvents. Its Matched
// field is a list of query_log.Item values serialized under the "matched"
// JSON key.
type Event struct {
	Matched []query_log.Item `json:"matched"`
}

func ParseEventJSON

func ParseEventJSON(data []byte) (*Event, error)

func ParseEvents

func ParseEvents(events ...components.Event) (*Event, error)

func (*Event) Events

func (ev *Event) Events() []components.Event

func (*Event) JSON

func (ev *Event) JSON() ([]byte, error)

type State

// State is the value returned by (*Component).TailScan, ParseStateJSON, and
// ParseStates. Its fields are serialized under the JSON keys "file",
// "last_seek_info", and "tail_scan_matched".
//
// NOTE(review): File is presumably the path of the scanned dmesg file and
// LastSeekInfo the last tail position — confirm against the implementation.
type State struct {
	File            string           `json:"file"`
	LastSeekInfo    tail.SeekInfo    `json:"last_seek_info"`
	TailScanMatched []query_log.Item `json:"tail_scan_matched"`
}

func ParseStateJSON

func ParseStateJSON(data []byte) (*State, error)

func ParseStates

func ParseStates(states ...components.State) (*State, error)

func (*State) JSON

func (s *State) JSON() ([]byte, error)

func (*State) States

func (s *State) States() []components.State

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL