Documentation
¶
Overview ¶
Package dmesg scans and watches dmesg output for errors, as specified in the configuration (e.g., regex-matching NVIDIA GPU errors).
Index ¶
- Constants
- func DefaultDmesgFiltersForNvidia() []*query_log_common.Filter
- func DefaultLogFilters(ctx context.Context) ([]*query_log_common.Filter, error)
- func DmesgExists() bool
- func GetDefaultLogPoller() query_log.Poller
- func New(ctx context.Context, cfg Config) (components.Component, error)
- func ParseEventDmesgMatched(m map[string]string) (query_log.Item, error)
- func ParseStateDmesg(s *State, m map[string]string) error
- func ParseStateDmesgTailScanMatched(m map[string]string) (query_log.Item, error)
- type Component
- func (c *Component) Close() error
- func (c *Component) Events(ctx context.Context, since time.Time) ([]components.Event, error)
- func (c *Component) Metrics(ctx context.Context, since time.Time) ([]components.Metric, error)
- func (c *Component) Name() string
- func (c *Component) States(ctx context.Context) ([]components.State, error)
- func (c *Component) TailScan() (*State, error)
- type Config
- type Event
- type State
Constants ¶
const ( EventNameDmesgMatched = "dmesg_matched" EventKeyDmesgMatchedUnixSeconds = "unix_seconds" EventKeyDmesgMatchedLine = "line" EventKeyDmesgMatchedFilter = "filter" EventKeyDmesgMatchedError = "error" )
const ( StateNameDmesg = "dmesg" StateKeyDmesgFile = "file" StateKeyDmesgLastSeekOffset = "offset" StateKeyDmesgLastSeekWhence = "whence" StateNameDmesgTailScanMatched = "dmesg_tail_matched" StateKeyDmesgTailScanMatchedUnixSeconds = "unix_seconds" StateKeyDmesgTailScanMatchedLine = "line" StateKeyDmesgTailScanMatchedFilter = "filter" StateKeyDmesgTailScanMatchedError = "error" )
const ( // e.g., // Out of memory: Killed process 123, UID 48, (httpd). EventOOMKill = "oom_kill" EventOOMKillRegex = `Out of memory:` // e.g., // oom-kill:constraint=CONSTRAINT_MEMCG,nodemask=(null), EventOOMKillConstraint = "oom_kill_constraint" EventOOMKillConstraintRegex = `oom-kill:constraint=` // e.g., // postgres invoked oom-killer: gfp_mask=0x201d2, order=0, oomkilladj=0 EventOOMKiller = "oom_killer" EventOOMKillerRegex = `(?i)\b(invoked|triggered) oom-killer\b` // e.g., // Memory cgroup out of memory: Killed process 123, UID 48, (httpd). EventOOMCgroup = "oom_cgroup" EventOOMCgroupRegex = `Memory cgroup out of memory` )
const ( // e.g., // [...] NVRM: Xid (0000:03:00): 14, Channel 00000001 // [...] NVRM: Xid (PCI:0000:05:00): 79, pid='<unknown>', name=<unknown>, GPU has fallen off the bus. // NVRM: Xid (PCI:0000:01:00): 79, GPU has fallen off the bus. // // ref. // https://docs.nvidia.com/deploy/pdf/XID_Errors.pdf EventNvidiaNVRMXid = "nvidia_nvrm_xid" // e.g., // [111111111.111] nvidia-nvswitch3: SXid (PCI:0000:05:00.0): 12028, Non-fatal, Link 32 egress non-posted PRIV error (First) // [131453.740743] nvidia-nvswitch0: SXid (PCI:0000:00:00.0): 20034, Fatal, Link 30 LTSSM Fault Up // // ref. // "D.4 Non-Fatal NVSwitch SXid Errors" // https://docs.nvidia.com/datacenter/tesla/pdf/fabric-manager-user-guide.pdf EventNvidiaNVSwitchSXid = "nvidia_nvswitch_sxid" // repeated messages may indicate more persistent issue on the inter-GPU communication // e.g., // [Thu Sep 19 02:29:46 2024] nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing // [Thu Sep 19 02:29:46 2024] nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing // [Thu Sep 19 02:29:46 2024] nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing EventNvidiaPeermemInvalidContext = "nvidia_peermem_invalid_context" // repeated messages may indicate GPU communication issues, which may happen due to fabric manager issues // e.g., // [Thu Oct 10 03:06:53 2024] pt_main_thread[2536443]: segfault at 7f797fe00000 ip 00007f7c7ac69996 sp 00007f7c12fd7c30 error 4 in libnccl.so.2[7f7c7ac00000+d3d3000] EventNvidiaNCCLSegfaultInLibnccl = "nvidia_nccl_segfault_in_libnccl" )
const DefaultDmesgFile = "/var/log/dmesg"
const Name = "dmesg"
Variables ¶
This section is empty.
Functions ¶
func DefaultDmesgFiltersForNvidia ¶
func DefaultDmesgFiltersForNvidia() []*query_log_common.Filter
func DefaultLogFilters ¶
func DefaultLogFilters(ctx context.Context) ([]*query_log_common.Filter, error)
func DmesgExists ¶
func DmesgExists() bool
func GetDefaultLogPoller ¶
func ParseEventDmesgMatched ¶
Types ¶
type Component ¶
type Component struct {
// contains filtered or unexported fields
}
func (*Component) Events ¶
The dmesg component's Events method returns the real-time events from the dmesg log poller. It returns `github.com/leptonai/gpud/components/query.ErrNoData` if no event is found.
func (*Component) States ¶
The dmesg component fetches the latest state from the dmesg tail scanner, rather than querying the log poller, which watches the real-time dmesg streaming output. This is because the tail scanner is cheaper and can read historical logs in case the dmesg log watcher has restarted. It is more important that dmesg state calls DO NOT miss any logs than that the logs are available in real time. The real-time dmesg events can be fetched via the events API.
type Config ¶
type Config struct {
Log query_log_config.Config `json:"log"`
}
type Event ¶
func ParseEventJSON ¶
func ParseEvents ¶
func ParseEvents(events ...components.Event) (*Event, error)
func (*Event) Events ¶
func (ev *Event) Events() []components.Event
type State ¶
type State struct { File string `json:"file"` LastSeekInfo tail.SeekInfo `json:"last_seek_info"` TailScanMatched []query_log.Item `json:"tail_scan_matched"` }
func ParseStateJSON ¶
func ParseStates ¶
func ParseStates(states ...components.State) (*State, error)
func (*State) States ¶
func (s *State) States() []components.State