dmesg

package
v0.0.4 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Oct 3, 2024 License: Apache-2.0 Imports: 27 Imported by: 0

Documentation

Overview

Package dmesg scans and watches dmesg output for errors, as specified in the configuration (e.g., regex matches for NVIDIA GPU errors).

Index

Constants

View Source
// Event name and payload keys for a dmesg line that matched a filter.
// NOTE(review): presumably these are the map keys read by
// ParseEventDmesgMatched (which takes a map[string]string) — confirm
// against the implementation.
const (
	// EventNameDmesgMatched is the name of the "dmesg matched" event.
	EventNameDmesgMatched = "dmesg_matched"

	// Keys of the string key-value payload attached to the event.
	// Judging by the names: the match time in Unix seconds, the matched
	// log line, the filter that matched, and an error (if any) — verify
	// the exact encodings against the producer.
	EventKeyDmesgMatchedUnixSeconds = "unix_seconds"
	EventKeyDmesgMatchedLine        = "line"
	EventKeyDmesgMatchedFilter      = "filter"
	EventKeyDmesgMatchedError       = "error"
)
View Source
// State names and payload keys for the dmesg component.
// NOTE(review): presumably these are the map keys read by ParseStateDmesg
// and ParseStateDmesgTailScanMatched — confirm against the implementation.
const (
	// StateNameDmesg is the name of the main dmesg state.
	StateNameDmesg = "dmesg"

	// Keys of the main dmesg state payload: the watched file and the
	// last seek position ("offset" and "whence").
	// NOTE(review): these look like the flattened form of State.File and
	// State.LastSeekInfo — confirm.
	StateKeyDmesgFile           = "file"
	StateKeyDmesgLastSeekOffset = "offset"
	StateKeyDmesgLastSeekWhence = "whence"

	// StateNameDmesgTailScanMatched is the name of the state that reports
	// a line matched during a tail scan.
	StateNameDmesgTailScanMatched = "dmesg_tail_matched"

	// Keys of the tail-scan-matched state payload. They use the same key
	// strings as the EventKeyDmesgMatched* constants.
	StateKeyDmesgTailScanMatchedUnixSeconds = "unix_seconds"
	StateKeyDmesgTailScanMatchedLine        = "line"
	StateKeyDmesgTailScanMatchedFilter      = "filter"
	StateKeyDmesgTailScanMatchedError       = "error"
)
View Source
// Names and match patterns for out-of-memory (OOM) related dmesg lines.
// Each Event* constant is the name of a filter/event and the corresponding
// *Regex constant is the pattern used to recognize it.
// NOTE(review): presumably consumed by DefaultLogFilters — confirm.
const (
	// EventOOMKill identifies a global "Out of memory" kill message.
	// The pattern is a case-sensitive literal, so it does not match the
	// lowercase "out of memory" in the cgroup message below.
	// e.g.,
	// Out of memory: Killed process 123, UID 48, (httpd).
	EventOOMKill      = "oom_kill"
	EventOOMKillRegex = `Out of memory:`

	// EventOOMKillConstraint identifies the "oom-kill:constraint=" summary line.
	// e.g.,
	// oom-kill:constraint=CONSTRAINT_MEMCG,nodemask=(null),
	EventOOMKillConstraint      = "oom_kill_constraint"
	EventOOMKillConstraintRegex = `oom-kill:constraint=`

	// EventOOMKiller identifies a process invoking the OOM killer.
	// The pattern is case-insensitive ("(?i)") and accepts either
	// "invoked" or "triggered" before "oom-killer", on word boundaries.
	// e.g.,
	// postgres invoked oom-killer: gfp_mask=0x201d2, order=0, oomkilladj=0
	EventOOMKiller      = "oom_killer"
	EventOOMKillerRegex = `(?i)\b(invoked|triggered) oom-killer\b`

	// EventOOMCgroup identifies an OOM kill scoped to a memory cgroup.
	// e.g.,
	// Memory cgroup out of memory: Killed process 123, UID 48, (httpd).
	EventOOMCgroup      = "oom_cgroup"
	EventOOMCgroupRegex = `Memory cgroup out of memory`
)
View Source
// Names of the NVIDIA-related dmesg events. Only the event names are
// declared here; the matching patterns are not part of this block.
// NOTE(review): presumably used by DefaultDmesgFiltersForNvidia — confirm.
const (
	// EventNvidiaNVRMXid identifies an NVRM Xid error line.
	// e.g.,
	// [...] NVRM: Xid (0000:03:00): 14, Channel 00000001
	// [...] NVRM: Xid (PCI:0000:05:00): 79, pid='<unknown>', name=<unknown>, GPU has fallen off the bus.
	// NVRM: Xid (PCI:0000:01:00): 79, GPU has fallen off the bus.
	//
	// ref.
	// https://docs.nvidia.com/deploy/pdf/XID_Errors.pdf
	EventNvidiaNVRMXid = "nvidia_nvrm_xid"

	// EventNvidiaNVSwitchSXid identifies an NVSwitch SXid error line.
	// e.g.,
	// [111111111.111] nvidia-nvswitch3: SXid (PCI:0000:05:00.0): 12028, Non-fatal, Link 32 egress non-posted PRIV error (First)
	// [131453.740743] nvidia-nvswitch0: SXid (PCI:0000:00:00.0): 20034, Fatal, Link 30 LTSSM Fault Up
	//
	// ref.
	// "D.4 Non-Fatal NVSwitch SXid Errors"
	// https://docs.nvidia.com/datacenter/tesla/pdf/fabric-manager-user-guide.pdf
	EventNvidiaNVSwitchSXid = "nvidia_nvswitch_sxid"

	// EventNvidiaPeermemInvalidContext identifies the nvidia-peermem
	// "invalid context" error line. Repeated messages may indicate a more
	// persistent issue with inter-GPU communication.
	// e.g.,
	// [Thu Sep 19 02:29:46 2024] nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing
	// [Thu Sep 19 02:29:46 2024] nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing
	// [Thu Sep 19 02:29:46 2024] nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing
	EventNvidiaPeermemInvalidContext = "nvidia_peermem_invalid_context"
)
View Source
// DefaultDmesgFile is the default path of the dmesg log file.
// NOTE(review): presumably the file checked by DmesgExists and used by
// DefaultConfig — confirm against the implementation.
const DefaultDmesgFile = "/var/log/dmesg"
View Source
// Name is the identifier of this package's component ("dmesg").
// NOTE(review): presumably the value returned by (*Component).Name — confirm.
const Name = "dmesg"

Variables

This section is empty.

Functions

func DefaultDmesgFiltersForNvidia

func DefaultDmesgFiltersForNvidia() []*query_log_filter.Filter

func DefaultLogFilters

func DefaultLogFilters() []*query_log_filter.Filter

func DmesgExists

func DmesgExists() bool

func ExtractTimeFromLogLine

func ExtractTimeFromLogLine(line []byte) (time.Time, error)

ExtractTimeFromLogLine does not return an error for now; it assumes that "dmesg --ctime" is used. TODO: once stable, return an error.

func GetDefaultLogPoller

func GetDefaultLogPoller() query_log.Poller

func New

func ParseEventDmesgMatched

func ParseEventDmesgMatched(m map[string]string) (query_log.Item, error)

func ParseStateDmesg

func ParseStateDmesg(s *State, m map[string]string) error

func ParseStateDmesgTailScanMatched

func ParseStateDmesgTailScanMatched(m map[string]string) (query_log.Item, error)

Types

type Component

// Component is the dmesg component type. All of its fields are unexported;
// it is used through its methods (Name, State, States, Events, Metrics,
// Close).
type Component struct {
	// contains filtered or unexported fields
}

func (*Component) Close

func (c *Component) Close() error

func (*Component) Events

func (c *Component) Events(ctx context.Context, since time.Time) ([]components.Event, error)

func (*Component) Metrics

func (c *Component) Metrics(ctx context.Context, since time.Time) ([]components.Metric, error)

func (*Component) Name

func (c *Component) Name() string

func (*Component) State

func (c *Component) State() (*State, error)

func (*Component) States

func (c *Component) States(ctx context.Context) ([]components.State, error)

type Config

// Config is the configuration of the dmesg component. It wraps the log
// query configuration, which is serialized under the JSON key "log".
type Config struct {
	Log query_log_config.Config `json:"log"`
}

func DefaultConfig

func DefaultConfig() Config

func ParseConfig

func ParseConfig(b any, db *sql.DB) (*Config, error)

func (Config) Validate

func (cfg Config) Validate() error

type Event

// Event is the dmesg event payload. Matched holds the matched log items
// and is serialized under the JSON key "matched".
type Event struct {
	Matched []query_log.Item `json:"matched"`
}

func ParseEventJSON

func ParseEventJSON(data []byte) (*Event, error)

func ParseEvents

func ParseEvents(events ...components.Event) (*Event, error)

func (*Event) Events

func (ev *Event) Events() []components.Event

func (*Event) JSON

func (ev *Event) JSON() ([]byte, error)

type State

// State is the dmesg component state, serialized to JSON with the keys
// "file", "last_seek_info" and "tail_scan_matched".
//
// File is the dmesg file, LastSeekInfo is the last seek position recorded
// for it, and TailScanMatched holds log items matched by a tail scan.
// NOTE(review): field meanings are inferred from their names and types —
// confirm against the code that populates State.
type State struct {
	File            string           `json:"file"`
	LastSeekInfo    tail.SeekInfo    `json:"last_seek_info"`
	TailScanMatched []query_log.Item `json:"tail_scan_matched"`
}

func ParseStateJSON

func ParseStateJSON(data []byte) (*State, error)

func ParseStates

func ParseStates(states ...components.State) (*State, error)

func (*State) JSON

func (s *State) JSON() ([]byte, error)

func (*State) States

func (s *State) States() []components.State

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL