Documentation ¶
Overview ¶
Package dmesg scans and watches dmesg output for errors, as specified in the configuration (e.g., regex matching of NVIDIA GPU errors).
Index ¶
- Constants
- func DefaultDmesgFiltersForNvidia() []*query_log_filter.Filter
- func DefaultLogFilters() []*query_log_filter.Filter
- func DmesgExists() bool
- func ExtractTimeFromLogLine(line []byte) (time.Time, error)
- func GetDefaultLogPoller() query_log.Poller
- func New(ctx context.Context, cfg Config) (components.Component, error)
- func ParseEventDmesgMatched(m map[string]string) (query_log.Item, error)
- func ParseStateDmesg(s *State, m map[string]string) error
- func ParseStateDmesgTailScanMatched(m map[string]string) (query_log.Item, error)
- type Component
- func (c *Component) Close() error
- func (c *Component) Events(ctx context.Context, since time.Time) ([]components.Event, error)
- func (c *Component) Metrics(ctx context.Context, since time.Time) ([]components.Metric, error)
- func (c *Component) Name() string
- func (c *Component) State() (*State, error)
- func (c *Component) States(ctx context.Context) ([]components.State, error)
- type Config
- type Event
- type State
Constants ¶
View Source
const (
	EventNameDmesgMatched           = "dmesg_matched"
	EventKeyDmesgMatchedUnixSeconds = "unix_seconds"
	EventKeyDmesgMatchedLine        = "line"
	EventKeyDmesgMatchedFilter      = "filter"
	EventKeyDmesgMatchedError       = "error"
)
View Source
const (
	StateNameDmesg              = "dmesg"
	StateKeyDmesgFile           = "file"
	StateKeyDmesgLastSeekOffset = "offset"
	StateKeyDmesgLastSeekWhence = "whence"

	StateNameDmesgTailScanMatched           = "dmesg_tail_matched"
	StateKeyDmesgTailScanMatchedUnixSeconds = "unix_seconds"
	StateKeyDmesgTailScanMatchedLine        = "line"
	StateKeyDmesgTailScanMatchedFilter      = "filter"
	StateKeyDmesgTailScanMatchedError       = "error"
)
View Source
const (
	// e.g.,
	// Out of memory: Killed process 123, UID 48, (httpd).
	EventOOMKill      = "oom_kill"
	EventOOMKillRegex = `Out of memory:`

	// e.g.,
	// oom-kill:constraint=CONSTRAINT_MEMCG,nodemask=(null),
	EventOOMKillConstraint      = "oom_kill_constraint"
	EventOOMKillConstraintRegex = `oom-kill:constraint=`

	// e.g.,
	// postgres invoked oom-killer: gfp_mask=0x201d2, order=0, oomkilladj=0
	EventOOMKiller      = "oom_killer"
	EventOOMKillerRegex = `(?i)\b(invoked|triggered) oom-killer\b`

	// e.g.,
	// Memory cgroup out of memory: Killed process 123, UID 48, (httpd).
	EventOOMCgroup      = "oom_cgroup"
	EventOOMCgroupRegex = `Memory cgroup out of memory`
)
View Source
const (
	// e.g.,
	// [...] NVRM: Xid (0000:03:00): 14, Channel 00000001
	// [...] NVRM: Xid (PCI:0000:05:00): 79, pid='<unknown>', name=<unknown>, GPU has fallen off the bus.
	// NVRM: Xid (PCI:0000:01:00): 79, GPU has fallen off the bus.
	//
	// ref.
	// https://docs.nvidia.com/deploy/pdf/XID_Errors.pdf
	EventNvidiaNVRMXid = "nvidia_nvrm_xid"

	// e.g.,
	// [111111111.111] nvidia-nvswitch3: SXid (PCI:0000:05:00.0): 12028, Non-fatal, Link 32 egress non-posted PRIV error (First)
	// [131453.740743] nvidia-nvswitch0: SXid (PCI:0000:00:00.0): 20034, Fatal, Link 30 LTSSM Fault Up
	//
	// ref.
	// "D.4 Non-Fatal NVSwitch SXid Errors"
	// https://docs.nvidia.com/datacenter/tesla/pdf/fabric-manager-user-guide.pdf
	EventNvidiaNVSwitchSXid = "nvidia_nvswitch_sxid"

	// repeated messages may indicate more persistent issue on the inter-GPU communication
	// e.g.,
	// [Thu Sep 19 02:29:46 2024] nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing
	// [Thu Sep 19 02:29:46 2024] nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing
	// [Thu Sep 19 02:29:46 2024] nvidia-peermem nv_get_p2p_free_callback:127 ERROR detected invalid context, skipping further processing
	EventNvidiaPeermemInvalidContext = "nvidia_peermem_invalid_context"
)
View Source
const DefaultDmesgFile = "/var/log/dmesg"
View Source
const Name = "dmesg"
Variables ¶
This section is empty.
Functions ¶
func DefaultDmesgFiltersForNvidia ¶
func DefaultDmesgFiltersForNvidia() []*query_log_filter.Filter
func DefaultLogFilters ¶
func DefaultLogFilters() []*query_log_filter.Filter
func DmesgExists ¶
func DmesgExists() bool
func ExtractTimeFromLogLine ¶
ExtractTimeFromLogLine does not return an error for now; it assumes "dmesg --ctime" is used. TODO: once stable, return an error.
func GetDefaultLogPoller ¶
func ParseEventDmesgMatched ¶
Types ¶
type Component ¶
type Component struct {
// contains filtered or unexported fields
}
type Config ¶
type Config struct {
Log query_log_config.Config `json:"log"`
}
func DefaultConfig ¶
func DefaultConfig() Config
type Event ¶
func ParseEventJSON ¶
func ParseEvents ¶
func ParseEvents(events ...components.Event) (*Event, error)
func (*Event) Events ¶
func (ev *Event) Events() []components.Event
type State ¶
type State struct {
	File            string           `json:"file"`
	LastSeekInfo    tail.SeekInfo    `json:"last_seek_info"`
	TailScanMatched []query_log.Item `json:"tail_scan_matched"`
}
func ParseStateJSON ¶
func ParseStates ¶
func ParseStates(states ...components.State) (*State, error)
func (*State) States ¶
func (s *State) States() []components.State
Click to show internal directories.
Click to hide internal directories.