dmesg

package

v0.3.8 Latest Latest Go to latest Published: Jan 8, 2025 License: Apache-2.0 Imports: 38 Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/leptonai/gpud

Links

Open Source Insights

Documentation ¶

Overview ¶

Package dmesg scans and watches dmesg outputs for errors, as specified in the configuration (e.g., regex match NVIDIA GPU errors).

Index ¶

Constants
func DefaultDmesgFiltersForCPU() []*query_log_common.Filter
func DefaultDmesgFiltersForFileDescriptor() []*query_log_common.Filter
func DefaultDmesgFiltersForMemory() []*query_log_common.Filter
func DefaultDmesgFiltersForNvidia() []*query_log_common.Filter
func DefaultLogFilters(ctx context.Context) ([]*query_log_common.Filter, error)
func DmesgExists() bool
func GetDefaultLogPoller() query_log.Poller
func New(ctx context.Context, cfg Config, ...) (components.Component, error)
func ParseStateDmesg(s *State, m map[string]string) error
func ParseStateDmesgTailScanMatched(m map[string]string) (query_log.Item, error)
type Component
- func (c *Component) Close() error
- func (c *Component) Events(ctx context.Context, since time.Time) ([]components.Event, error)
- func (c *Component) Metrics(ctx context.Context, since time.Time) ([]components.Metric, error)
- func (c *Component) Name() string
- func (c *Component) States(ctx context.Context) ([]components.State, error)
- func (c *Component) TailScan() (*State, error)
type Config
- func DefaultConfig(ctx context.Context) (Config, error)
- func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error)
- func (cfg Config) Validate() error
type Event
- func ParseEventJSON(data []byte) (*Event, error)
- func (ev *Event) Events() []components.Event
- func (ev *Event) JSON() ([]byte, error)
type State
- func ParseStateJSON(data []byte) (*State, error)
- func ParseStates(states ...components.State) (*State, error)
- func (s *State) JSON() ([]byte, error)
- func (s *State) States() []components.State

Constants ¶

View Source

const (
	// EventNameDmesgMatched is the name of the event reported for matched dmesg lines.
	EventNameDmesgMatched = "dmesg_matched"

	// EventKeyDmesgMatchedError and EventKeyDmesgMatchedLogItem are the keys for the
	// error and the matched log item of a "dmesg_matched" event
	// (presumably keys of the event's extra-info map — confirm against the component code).
	EventKeyDmesgMatchedError   = "error"
	EventKeyDmesgMatchedLogItem = "log_item"
)

View Source

const (
	// StateNameDmesg is the name of the dmesg state.
	StateNameDmesg = "dmesg"

	// Keys for the dmesg state: the file, and the offset and whence of the last seek
	// (presumably keys of the string map handled by ParseStateDmesg — confirm).
	StateKeyDmesgFile           = "file"
	StateKeyDmesgLastSeekOffset = "offset"
	StateKeyDmesgLastSeekWhence = "whence"

	// StateNameDmesgTailScanMatched is the name of the state that carries a tail-scan match.
	StateNameDmesgTailScanMatched = "dmesg_tail_matched"

	// Keys for a tail-scan match: its time in unix seconds, the matched line,
	// the filter, and the error
	// (presumably keys of the string map handled by ParseStateDmesgTailScanMatched — confirm).
	StateKeyDmesgTailScanMatchedUnixSeconds = "unix_seconds"
	StateKeyDmesgTailScanMatchedLine        = "line"
	StateKeyDmesgTailScanMatchedFilter      = "filter"
	StateKeyDmesgTailScanMatchedError       = "error"
)

View Source

const (
	// DefaultDmesgFile is the default path to the dmesg log file.
	DefaultDmesgFile = "/var/log/dmesg"
	// DefaultDmesgCmd and DefaultDmesgCmdWithSince are the default dmesg commands, without and
	// with the --since filter. Newer util-linux versions of dmesg support --since; some older ones do not.
	// DefaultScanDmesgCmd therefore tries the --since variant first and, only if that command fails,
	// falls back to the plain command piped through "tail -n 200". In shell syntax "|" binds tighter
	// than "||", so only the fallback output is truncated to its last 200 lines.
	// ref. https://github.com/leptonai/gpud/issues/32
	DefaultDmesgCmd          = "dmesg --time-format=iso --nopager --buffer-size 163920"
	DefaultDmesgCmdWithSince = "dmesg --time-format=iso --nopager --buffer-size 163920 --since '1 hour ago'"
	DefaultScanDmesgCmd      = DefaultDmesgCmdWithSince + " || " + DefaultDmesgCmd + " | tail -n 200"

	// DefaultJournalCtlCmd reads the kernel messages of the last hour via journalctl and keeps
	// the last 200 lines.
	// DefaultJournalCtlScanCmd runs the same journalctl query with -f (follow); the trailing
	// "|| true" makes the overall shell command succeed even if journalctl exits non-zero.
	// NOTE(review): the "Scan" name here is on the follow (-f) command, whereas DefaultScanDmesgCmd
	// is the one-shot "tail -n 200" command — confirm that this naming is intended.
	DefaultJournalCtlCmd     = "journalctl -qk -o short-iso --no-pager --since '1 hour ago' | tail -n 200"
	DefaultJournalCtlScanCmd = "journalctl -qk -o short-iso --no-pager --since '1 hour ago' -f || true"
)

View Source

const (
	// EventCPUBlockedTooLong identifies kernel messages that report a task
	// blocked for longer than the hung-task timeout.
	//
	// e.g.,
	// INFO: task kcompactd1:1177 blocked for more than 120 seconds.
	// INFO: task jfsmount:136986 blocked for more than 120 seconds.
	// task jfsmount:136986 blocked for more than 120 seconds.
	// "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
	// task:jfsmount        state:D stack:    0 pid: 9831 ppid:  9614 flags:0x00000004
	EventCPUBlockedTooLong = "cpu_blocked_too_long"

	// EventCPUSoftLockup identifies kernel watchdog messages that report a CPU soft lockup.
	//
	// e.g.,
	// [Sun Jan  5 18:28:55 2025] watchdog: BUG: soft lockup - CPU#18 stuck for 27s! [python3:2254956]
	// [Sun Jan  5 20:25:34 2025] watchdog: BUG: soft lockup - CPU#6 stuck for 48s! [python3:2257218]
	// [Sun Jan  5 18:33:00 2025] watchdog: BUG: soft lockup - CPU#0 stuck for 25s! [pt_data_pin:2273422]
	// [Sun Jan  5 19:42:34 2025] watchdog: BUG: soft lockup - CPU#4 stuck for 23s! [pt_autograd_0:2289563]
	// [Sun Jan  5 18:37:06 2025] watchdog: BUG: soft lockup - CPU#0 stuck for 27s! [cuda-EvtHandlr:2255424]
	EventCPUSoftLockup = "cpu_soft_lockup"
)

View Source

const (
	// EventMemoryOOM identifies kernel messages that report a process killed
	// because the system ran out of memory.
	//
	// e.g.,
	// Out of memory: Killed process 123, UID 48, (httpd).
	//
	// NOTE: this is often followed by a line like:
	// [Sun Dec  8 09:23:39 2024] oom_reaper: reaped process 345646 (vector), now anon-rss:0kB, file-rss:0kB, shmem-rss:0
	// (to reap the memory used by the OOM victim)
	EventMemoryOOM = "memory_oom"

	// EventMemoryOOMKillConstraint identifies "oom-kill:constraint=..." kernel messages.
	//
	// e.g.,
	// oom-kill:constraint=CONSTRAINT_MEMCG,nodemask=(null),
	EventMemoryOOMKillConstraint = "memory_oom_kill_constraint"

	// EventMemoryOOMKiller identifies kernel messages that report a process invoking the OOM killer.
	//
	// e.g.,
	// postgres invoked oom-killer: gfp_mask=0x201d2, order=0, oomkilladj=0
	EventMemoryOOMKiller = "memory_oom_killer"

	// EventMemoryOOMCgroup identifies kernel messages that report a process killed
	// because its memory cgroup ran out of memory.
	//
	// e.g.,
	// Memory cgroup out of memory: Killed process 123, UID 48, (httpd).
	EventMemoryOOMCgroup = "memory_oom_cgroup"

	// EventMemoryEDACCorrectableErrors identifies EDAC kernel messages that report
	// correctable (CE) memory read errors.
	//
	// e.g.,
	// [...] EDAC MC0: 1 CE memory read error
	// [...] EDAC MC1: 128 CE memory read error on CPU_SrcID#1_Ha#0_Chan#1_DIMM#1
	//
	// ref.
	// https://serverfault.com/questions/682909/how-to-find-faulty-memory-module-from-mce-message
	// https://github.com/Azure/azurehpc/blob/2d57191cb35ed638525ba9424cc2aa1b5abe1c05/experimental/aks_npd_draino/npd/deployment/node-problem-detector-config.yaml#L51C20-L51C40
	EventMemoryEDACCorrectableErrors = "memory_edac_correctable_errors"
)

View Source

const (
	// EventNvidiaNVRMXid identifies NVRM kernel messages that report an NVIDIA Xid error.
	//
	// e.g.,
	// [...] NVRM: Xid (0000:03:00): 14, Channel 00000001
	// [...] NVRM: Xid (PCI:0000:05:00): 79, pid='<unknown>', name=<unknown>, GPU has fallen off the bus.
	// NVRM: Xid (PCI:0000:01:00): 79, GPU has fallen off the bus.
	//
	// ref.
	// https://docs.nvidia.com/deploy/pdf/XID_Errors.pdf
	EventNvidiaNVRMXid = "nvidia_nvrm_xid"

	// EventNvidiaNVSwitchSXid identifies nvidia-nvswitch kernel messages that report an SXid error.
	//
	// e.g.,
	// [111111111.111] nvidia-nvswitch3: SXid (PCI:0000:05:00.0): 12028, Non-fatal, Link 32 egress non-posted PRIV error (First)
	// [131453.740743] nvidia-nvswitch0: SXid (PCI:0000:00:00.0): 20034, Fatal, Link 30 LTSSM Fault Up
	//
	// ref.
	// "D.4 Non-Fatal NVSwitch SXid Errors"
	// https://docs.nvidia.com/datacenter/tesla/pdf/fabric-manager-user-guide.pdf
	EventNvidiaNVSwitchSXid = "nvidia_nvswitch_sxid"

	// EventNvidiaPeermemInvalidContext identifies nvidia-peermem "invalid context" error messages.
	// Repeated messages may indicate a more persistent issue with inter-GPU communication.
	//
	// e.g.,
	// [Thu Sep 19 02:29:46 2024] nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing
	// [Thu Sep 19 02:29:46 2024] nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing
	// [Thu Sep 19 02:29:46 2024] nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing
	EventNvidiaPeermemInvalidContext = "nvidia_peermem_invalid_context"

	// EventNvidiaNCCLSegfaultInLibnccl identifies kernel messages that report a segfault in libnccl.
	// Repeated messages may indicate GPU communication issues, which may be caused by fabric manager issues.
	//
	// e.g.,
	// [Thu Oct 10 03:06:53 2024] pt_main_thread[2536443]: segfault at 7f797fe00000 ip 00007f7c7ac69996 sp 00007f7c12fd7c30 error 4 in libnccl.so.2[7f7c7ac00000+d3d3000]
	EventNvidiaNCCLSegfaultInLibnccl = "nvidia_nccl_segfault_in_libnccl"
)

View Source

const (
	// EventFileDescriptorVFSFileMaxLimitReached identifies kernel messages that report
	// the VFS file-max limit being reached.
	//
	// e.g.,
	// [...] VFS: file-max limit 1000000 reached
	//
	// ref.
	// https://docs.kernel.org/admin-guide/sysctl/fs.html#file-max-file-nr
	EventFileDescriptorVFSFileMaxLimitReached = "file_descriptor_vfs_file_max_limit_reached"
)

View Source

// Name is the name of the dmesg component
// (presumably the value returned by Component.Name — confirm).
const Name = "dmesg"

Variables ¶

This section is empty.

Functions ¶

func DefaultDmesgFiltersForCPU ¶ added in v0.3.8

func DefaultDmesgFiltersForCPU() []*query_log_common.Filter

func DefaultDmesgFiltersForFileDescriptor ¶ added in v0.3.1

func DefaultDmesgFiltersForFileDescriptor() []*query_log_common.Filter

func DefaultDmesgFiltersForMemory ¶ added in v0.3.5

func DefaultDmesgFiltersForMemory() []*query_log_common.Filter

func DefaultDmesgFiltersForNvidia ¶

func DefaultDmesgFiltersForNvidia() []*query_log_common.Filter

func DefaultLogFilters ¶

func DefaultLogFilters(ctx context.Context) ([]*query_log_common.Filter, error)

func DmesgExists ¶

func DmesgExists() bool

func GetDefaultLogPoller ¶

func GetDefaultLogPoller() query_log.Poller

func New ¶

func New(ctx context.Context, cfg Config, processMatched query_log_common.ProcessMatchedFunc) (components.Component, error)

func ParseStateDmesg ¶

func ParseStateDmesg(s *State, m map[string]string) error

func ParseStateDmesgTailScanMatched ¶

func ParseStateDmesgTailScanMatched(m map[string]string) (query_log.Item, error)

Types ¶

type Component ¶

// Component is the dmesg component. It exposes Name, States, Events, Metrics,
// TailScan, and Close methods; all of its fields are unexported.
type Component struct {
	// contains filtered or unexported fields
}

func (*Component) Close ¶

func (c *Component) Close() error

func (*Component) Events ¶

func (c *Component) Events(ctx context.Context, since time.Time) ([]components.Event, error)

Events returns the real-time events from the dmesg log poller. It returns `github.com/leptonai/gpud/components/query.ErrNoData` if no event is found.

func (*Component) Metrics ¶

func (c *Component) Metrics(ctx context.Context, since time.Time) ([]components.Metric, error)

func (*Component) Name ¶

func (c *Component) Name() string

func (*Component) States ¶

func (c *Component) States(ctx context.Context) ([]components.State, error)

The dmesg component fetches the latest state from the dmesg tail scanner, rather than querying the log poller, which watches the real-time dmesg streaming output. This is because the tail scanner is cheaper and can read historical logs in case the dmesg log watcher has restarted. It is more important that dmesg state calls DO NOT miss any logs than that the logs are available in real time. The real-time dmesg events can be fetched via the events API.

func (*Component) TailScan ¶ added in v0.1.6

func (c *Component) TailScan() (*State, error)

type Config ¶

// Config is the configuration of the dmesg component.
type Config struct {
	// Log is the log query configuration, serialized under the JSON key "log".
	Log query_log_config.Config `json:"log"`
}

func DefaultConfig ¶

func DefaultConfig(ctx context.Context) (Config, error)

func ParseConfig ¶

func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error)

func (Config) Validate ¶

func (cfg Config) Validate() error

type Event ¶

// Event holds the log items carried by a dmesg event.
type Event struct {
	// Matched lists the matched log items, serialized under the JSON key "matched".
	Matched []query_log.Item `json:"matched"`
}

func ParseEventJSON ¶

func ParseEventJSON(data []byte) (*Event, error)

func (*Event) Events ¶

func (ev *Event) Events() []components.Event

TODO: deprecate

func (*Event) JSON ¶

func (ev *Event) JSON() ([]byte, error)

type State ¶

// State is the state of the dmesg component.
type State struct {
	// File is serialized under the JSON key "file"
	// (presumably the path of the dmesg file being read — confirm).
	File string `json:"file"`
	// LastSeekInfo is the last seek position, serialized under the JSON key "last_seek_info".
	LastSeekInfo tail.SeekInfo `json:"last_seek_info"`
	// TailScanMatched lists the log items matched by the tail scan,
	// serialized under the JSON key "tail_scan_matched".
	TailScanMatched []query_log.Item `json:"tail_scan_matched"`
}

func ParseStateJSON ¶

func ParseStateJSON(data []byte) (*State, error)

func ParseStates ¶

func ParseStates(states ...components.State) (*State, error)

func (*State) JSON ¶

func (s *State) JSON() ([]byte, error)

func (*State) States ¶

func (s *State) States() []components.State

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL