investigators

package
v0.0.0-...-599ae38 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 8, 2024 License: Apache-2.0 Imports: 20 Imported by: 0

Documentation

Overview

Copyright FMR LLC <opensource@fidelity.com>. SPDX-License-Identifier: Apache-2.0

Copyright FMR LLC <opensource@fidelity.com>. SPDX-License-Identifier: Apache-2.0

Copyright FMR LLC <opensource@fidelity.com>. SPDX-License-Identifier: Apache-2.0

Copyright FMR LLC <opensource@fidelity.com>. SPDX-License-Identifier: Apache-2.0

Copyright FMR LLC <opensource@fidelity.com>. SPDX-License-Identifier: Apache-2.0

Copyright FMR LLC <opensource@fidelity.com>. SPDX-License-Identifier: Apache-2.0

Copyright FMR LLC <opensource@fidelity.com>. SPDX-License-Identifier: Apache-2.0

Copyright FMR LLC <opensource@fidelity.com>. SPDX-License-Identifier: Apache-2.0

Copyright FMR LLC <opensource@fidelity.com>. SPDX-License-Identifier: Apache-2.0

Copyright FMR LLC <opensource@fidelity.com>. SPDX-License-Identifier: Apache-2.0

Index

Constants

View Source
// Shared hint message and kubectl command templates. The {{...}} placeholders
// are Go template actions that read fields of the object each template is
// rendered against.
const (
	// FoundMsg is a generic hint that points the reader at the resource status
	// output shown with it.
	FoundMsg = "Please check below resource status, you can get possible root cause from messages."

	// GetPoCmd is a numbered `kubectl describe po` command for a pod.
	GetPoCmd = `
1. kubectl describe po {{.ObjectMeta.Name}} -n {{.ObjectMeta.Namespace}}
`
	// GetRsCmd is a `kubectl describe rs` command for a ReplicaSet.
	GetRsCmd = `
kubectl describe rs {{.Name}} -n {{.ObjectMeta.Namespace}}
`
	// GetDeployCmd is a `kubectl describe deploy` command for a Deployment.
	GetDeployCmd = `
kubectl describe deploy {{.Name}} -n {{.ObjectMeta.Namespace}}
`
	// GetSsCmd is a `kubectl describe sts` command for a StatefulSet.
	// kubectl's short name for statefulsets is "sts"; "ss" is not a recognized
	// resource alias and makes the suggested command fail.
	GetSsCmd = `
kubectl describe sts {{.Name}} -n {{.ObjectMeta.Namespace}}
`
	// GetDsCmd is a `kubectl describe ds` command for a DaemonSet.
	GetDsCmd = `
kubectl describe ds {{.Name}} -n {{.ObjectMeta.Namespace}}
`
	// DesSvcCmd is a `kubectl describe svc` command for a Service.
	DesSvcCmd = `
kubectl describe svc {{.Name}} -n {{.ObjectMeta.Namespace}}
`
	// GetEndpointsCmd is a `kubectl describe endpoints` command.
	GetEndpointsCmd = `
kubectl describe endpoints {{ .Name}} -n {{ .ObjectMeta.Namespace }}
`
	// GetIngCmd is a `kubectl describe ing` command for an Ingress.
	GetIngCmd = `
kubectl describe ing {{ .Name}} -n {{ .ObjectMeta.Namespace }}
`
	// GetJobCmd is a `kubectl describe job` command.
	GetJobCmd = `
kubectl describe job {{ .Name}} -n {{ .ObjectMeta.Namespace }}
`
	// GetCronjobCmd is a `kubectl describe cronjob` command.
	GetCronjobCmd = `
kubectl describe cronjob {{ .Name}} -n {{ .ObjectMeta.Namespace }}
`
	// GetNoCmd is a `kubectl describe no` command for a node; it takes no
	// namespace flag.
	GetNoCmd = `
kubectl describe no {{ .Name}}
`
)
View Source
// Constants used when diagnosing containers in CrashLoopBackOff.
const (
	// CrashLoopBackOff is the reason string this investigation is named after.
	// NOTE(review): presumably compared with a container's waiting reason —
	// confirm against caller.
	CrashLoopBackOff = "CrashLoopBackOff"

	// The constants below are the keys of the CrashLoopBackOffSolutions map.
	// NOTE(review): the *Msg values look like fragments searched for in
	// event/status messages, while the Exit* values read like human-facing
	// descriptions — confirm how callers pick a key. Do not edit the text
	// without checking every lookup.
	ExecutableNotFoundMsg = "executable file not found in $PATH: unknown"
	NoSuchFileMsg         = "no such file or directory: unknown"
	ReadinessProbeFailMsg = "Readiness probe failed"
	LivenessProbeFailMsg  = "Liveness probe failed"
	StartupProbeFailMsg   = "Startup probe failed"
	ExitWithOOM           = "Container killed due to OOM."
	ExitCode1             = "Generally application has issues, container exit code(1)."
	ExitCode126           = "Command invoked cannot execute, container exit code(126)."
	ExitCode127           = "Command not found, container exit code(127)."
	ExitCode2To128        = "Container terminated internally, EXIT with Non-Zero value(between 2 to 128)."
	ExitCode137           = "Container was killed, generally due to OOM, container exit code(137)."
	ExitCode139           = "Issues in application code or the base image, container exit code(139)"
	ExitCode129To255      = "Container terminated by external signal, EXIT with Non-Zero value(between 129 to 255)."

	// CrushLoopBackOffMsg (identifier spelling is part of the exported API) is
	// a Go template for the first, numbered line of a solution; it names the
	// container through the {{.ContainerName}} placeholder.
	CrushLoopBackOffMsg = `
1. Container {{.ContainerName}} has been restarted more than 10 times in the last few minutes.
`
	// CrashLoopBackOffDocLink is a runbook URL wrapped in newlines; it is
	// appended to DefaultSolution.
	CrashLoopBackOffDocLink = `
https://containersolutions.github.io/runbooks/posts/kubernetes/crashloopbackoff
`
)
View Source
// Solution templates for CrashLoopBackOff findings. Each value is a Go
// template; {{.ContainerName}} is filled in from the data the template is
// rendered with. Entries shown as `` followed by an "N-byte string literal not
// displayed" comment are elided by the documentation renderer, so their text
// is not visible here.
const (
	// DescribePoCmd is a command template (text elided).
	DescribePoCmd = `` /* 265-byte string literal not displayed */

	// SolutionExitCode* pair with the ExitCode* keys in
	// CrashLoopBackOffSolutions (text elided).
	SolutionExitCode1 = `` /* 176-byte string literal not displayed */

	SolutionExitCode126 = `` /* 232-byte string literal not displayed */

	SolutionExitCode127 = `` /* 210-byte string literal not displayed */

	SolutionExitCode2To128 = `` /* 200-byte string literal not displayed */

	SolutionExitCode137 = `` /* 188-byte string literal not displayed */

	SolutionExitCode139 = `` /* 207-byte string literal not displayed */

	SolutionExitCode129To255 = `` /* 186-byte string literal not displayed */

	// SolutionOOM is the step shown when the container exited as OOMKilled.
	SolutionOOM = `2. Container {{.ContainerName}} has EXITED with reason OOMKilled (1). Check the resource limits of the container.
`
	// SolutionReadinessProbeFail introduces the failed readiness probe.
	SolutionReadinessProbeFail = `2. Following readiness probe has failed for the container {{.ContainerName}}.
`
	// SolutionLivenessProbeFail introduces the failed liveness probe. The
	// message uses the Kubernetes term "liveness" (previously misspelled
	// "liveliness"), matching LivenessProbeFailMsg.
	SolutionLivenessProbeFail = `2. Following liveness probe has failed for the container {{.ContainerName}}.
`
	// SolutionStartupProbeFailMsg introduces the failed startup probe.
	SolutionStartupProbeFailMsg = `2. Following startup probe has failed for the container {{.ContainerName}}.
`
	// SolutionExecutableNotFoundMsg pairs with ExecutableNotFoundMsg (text elided).
	SolutionExecutableNotFoundMsg = `` /* 445-byte string literal not displayed */

	// SolutionNoSuchFile pairs with NoSuchFileMsg (text elided).
	SolutionNoSuchFile = `` /* 446-byte string literal not displayed */

	// DefaultSolution is the generic advice (steps 2-5) and ends with the
	// runbook link from CrashLoopBackOffDocLink.
	// NOTE(review): presumably used when no specific key matches — confirm
	// against caller.
	DefaultSolution = `2. This may due to container resource request not enough, readiness probe or liveness probe failed. Not enough initial delay.
3. Or may due to commands inside container failed, command not found in path, readonly file system, missing configurations or dependencies.
4. Use the commands below to check logs or events can help found the root cause.
5. Below docs can help understand this issue:` + CrashLoopBackOffDocLink
)
View Source
// Message strings for image pull failures. All of them are keys of the
// ImagePullBackOffSolutions map.
// NOTE(review): NotFoundMsg contains ".*", so the keys are presumably matched
// as patterns against event or status messages — confirm against caller
// before changing any text.
const (
	UnknownManifestMsg    = "The named manifest is not known"
	NoSuchHostMsg         = "No such host"
	IOTimeoutMsg          = "I/O timeout"
	NotFoundMsg           = "failed to pull and unpack image .* not found"
	ConnectionRefused     = "Connection refused"
	UnauthorizedMsg       = "Unauthorized or access denied or authentication required"
	AuthorizeFailed       = "failed to authorize"
	QuotaRateLimitMsg     = "Quota exceeded or Too Many Requests or rate limit"
	RepositoryNotExistMsg = "Repository does not exist or may require 'docker login'"
)
View Source
// Solution texts for image pull failures; each is passed to
// getImagePullBackOffSolution when ImagePullBackOffSolutions is built. The
// literal text of every entry is elided by the documentation renderer.
const (
	// UnknownManifestSolution is reused for UnknownManifestMsg,
	// RepositoryNotExistMsg and NotFoundMsg.
	UnknownManifestSolution = `` /* 305-byte string literal not displayed */

	// NoSuchHostSolution pairs with NoSuchHostMsg.
	NoSuchHostSolution = `` /* 215-byte string literal not displayed */

	// IOTimeoutSolution pairs with IOTimeoutMsg.
	IOTimeoutSolution = `` /* 438-byte string literal not displayed */

	// ConnectionRefusedSolution pairs with ConnectionRefused.
	ConnectionRefusedSolution = `` /* 405-byte string literal not displayed */

	// UnauthorizedSolution pairs with UnauthorizedMsg.
	UnauthorizedSolution = `` /* 209-byte string literal not displayed */

	// AuthorizeFailedSolution pairs with AuthorizeFailed.
	AuthorizeFailedSolution = `` /* 247-byte string literal not displayed */

	// QuotaRateLimitSolution pairs with QuotaRateLimitMsg.
	QuotaRateLimitSolution = `` /* 205-byte string literal not displayed */

)
View Source
// Command and message texts about a pod's image pull secret. Most literals
// are elided by the documentation renderer, so only their names indicate
// their purpose.
const (
	// GetPoSecretCmd is a command template (text elided).
	// NOTE(review): presumably retrieves the pod's imagePullSecret — confirm.
	GetPoSecretCmd = `` /* 292-byte string literal not displayed */

	// SecretMsg1 is a message text (elided).
	SecretMsg1 = `` /* 262-byte string literal not displayed */

	// SecretMsg2NotExist is a message text (elided).
	// NOTE(review): name suggests the "secret does not exist" case — confirm.
	SecretMsg2NotExist = `` /* 260-byte string literal not displayed */

	// SecretMsg3 tells the reader to run "command 2" to obtain the
	// imagePullSecret name; it ends with a newline.
	SecretMsg3 = `Run the following command 2 to get the imagePullSecret name:
`
)
View Source
// Solution and command templates for Deployment problems. Each is a Go
// template; {{.Name}} and {{ .ObjectMeta.Namespace }} are read from the object
// the template is rendered against.
const (
	// NotAvailableSolution reports that the deployment is not available.
	NotAvailableSolution = `
1. Deployment '{{.Name}}' is not available.
2. Please check the replica(s) status in this deployment.
`
	// MemoryQuotaSolution reports exceeded memory quotas.
	MemoryQuotaSolution = `
1. Deployment '{{.Name}}' has exceeded memory quotas.
2. Please check the memory requests/limits of your deployment.
`
	// CPUQuotaSolution reports exceeded CPU quotas.
	CPUQuotaSolution = `
1. Deployment '{{.Name}}' has exceeded CPU quotas.
2. Please check the CPU requests/limits of your deployment.
`
	// ResourceQuotaSolution reports exceeded resource quotas in general.
	ResourceQuotaSolution = `
1. Deployment '{{.Name}}' has exceeded resource quotas.
2. Please check the requests/limits of your deployment.
`
	// DescribeCmd is a `kubectl describe deploy` command for the deployment.
	DescribeCmd = `
kubectl describe deploy {{.Name}} -n {{ .ObjectMeta.Namespace }}
`
)
View Source
// Solution templates for Endpoint problems.
const (
	// NotReadyAddressSolution covers subsets that report 'NotReadyAddresses'.
	NotReadyAddressSolution = `
1. At least one subset has 'NotReadyAddresses' status.
2. Please check the health of the selected pods.
`
	// NoPodSelectedSolution is a solution text (elided by the documentation
	// renderer).
	// NOTE(review): name suggests the "no pod selected" case — confirm.
	NoPodSelectedSolution = `` /* 201-byte string literal not displayed */

	// NoServiceFoundSolution covers an Endpoint with no Service or subsets;
	// {{.Name}} is the Endpoint's name.
	NoServiceFoundSolution = `
1. No Service or subsets found for Endpoint {{.Name}}.
2. Please check if this Endpoint is necessary.
`
)
View Source
// Solution and command texts for node problems. The *Solution literals are
// elided by the documentation renderer.
// NOTE(review): the names suggest one solution per node condition handled by
// the Node*Investigator functions — confirm against callers.
const (
	// NotReadySolution is a solution text (elided).
	NotReadySolution = `` /* 360-byte string literal not displayed */

	// MemPressSolution is a solution text (elided).
	MemPressSolution = `` /* 451-byte string literal not displayed */

	// DiskPressSolution is a solution text (elided).
	DiskPressSolution = `` /* 314-byte string literal not displayed */

	// PidPressSolution is a solution text (elided).
	PidPressSolution = `` /* 402-byte string literal not displayed */

	// NetUnAvailableSolution is a solution text (elided).
	NetUnAvailableSolution = `` /* 459-byte string literal not displayed */

	// FindPoOnNoCmd lists pods in all namespaces with wide output and greps
	// for the rendered object's name.
	// NOTE(review): presumably rendered against a node so the grep matches the
	// NODE column — confirm.
	FindPoOnNoCmd = `
1. kubectl get pods -o wide -A | grep {{ .ObjectMeta.Name }}
`

	// KubeletCmd lists kubelet troubleshooting commands: view its journal,
	// check its service status, restart it.
	KubeletCmd = `
1. journalctl -u kubelet.
2. systemctl status kubelet.
3. systemctl restart kubelet.
`
)
View Source
// Texts for pod-not-ready findings. All literals are elided by the
// documentation renderer, so only the names indicate their purpose.
const (
	// ReadinessProbeFailedSolution is a solution text (elided).
	ReadinessProbeFailedSolution = `` /* 296-byte string literal not displayed */

	// ReadinessGateFailedSolution is a solution text (elided).
	ReadinessGateFailedSolution = `` /* 288-byte string literal not displayed */

	// UsefulCommands is a command text (elided).
	UsefulCommands = `` /* 164-byte string literal not displayed */

)
View Source
// Message fragments for pending pods.
// NOTE(review): several values contain ".*", so they are presumably used as
// regular expressions against scheduling or mount event messages — confirm
// against caller before changing any text.
const (
	// Scheduling-related fragments.
	NodesNotAvailable        = "0/.* nodes are available"
	PendingNodeSelector      = "didn't match .*selector"
	PendingNodeAffinity      = "didn't match .*affinity"
	PendingNodeTaint         = "untolerated taint"
	PendingNodeUnschedulable = "unschedulable"
	PendingInsufficient      = "Insufficient"
	PendingNoHostPort        = "node(s) didn't have free ports"

	// Volume, ConfigMap and Secret related fragments.
	PendingPVCGetErr      = "error getting PVC"
	PendingPVCNotFound    = "persistentvolumeclaim .* not found"
	PendingUnboundPVC     = "pod has unbound immediate PersistentVolumeClaims"
	PendingBindFailed     = "Failed to bind volumes"
	PendingCmNotFound     = "configmap .* not found"
	PendingSecretNotFound = "secret .* not found" //nolint:gosec
)
View Source
// Message, solution and command texts for pending pods. Entries shown as ""
// or `` followed by an "N-byte string literal not displayed" comment are
// elided by the documentation renderer.
// NOTE(review): the leading "%d." in many values is presumably a step number
// filled in with fmt-style formatting, and the {{...}} placeholders are Go
// template actions — confirm the order in which callers apply the two.
const (
	// FailedSchedulingMessage takes a step number (%d) and a message (%s).
	FailedSchedulingMessage          = "%d. Pod failed scheduling, message is: %s."
	NodeUnavailableSolution          = "" /* 151-byte string literal not displayed */
	PendingNodeUnschedulableSolution = "%d. Some nodes are unschedulable, try to uncordon these nodes may fix this."

	PendingNodeSelectorSolution = "" /* 204-byte string literal not displayed */
	PendingNodeTaintSolution    = "" /* 191-byte string literal not displayed */

	PendingInsufficientSolution = "" /* 147-byte string literal not displayed */
	PendingNoHostPortSolution   = "" /* 147-byte string literal not displayed */
	// PVCNotFoundSolution and PVCUnboundSolution are fixed at step 2 and both
	// end with the shared KubectlPodAndPVC text.
	PVCNotFoundSolution         = "2. Pod {{ .ObjectMeta.Name }} is pending, used PVC not found." + KubectlPodAndPVC
	PVCUnboundSolution          = "2. Pod {{ .ObjectMeta.Name }} is pending, due to use an unbound PVC." + KubectlPodAndPVC
	KubectlPodAndPVC            = `` /* 172-byte string literal not displayed */

	// ContainerFailMount takes a step number (%d) and a message (%s); the
	// three *Solution values after it take only a step number.
	ContainerFailMount         = "%d. Container failed mount, message is: %s."
	ContainerFailMountSolution = "%d. Please check your volumes of the Pod, try to change to correct and existing resources may fix this problem."
	CmNotFoundSolution         = "%d. Please check the configMap that mount, try to change to an existing configMap may fix this issue."
	SecretNotFoundSolution     = "%d. Please check the secret that mount, try to change to an existing secret may fix this issue."

	// PendingUnknownSolution is a solution text (elided).
	PendingUnknownSolution = `` /* 500-byte string literal not displayed */

	// Numbered kubectl commands for inspecting the pod, its events and the
	// cluster's nodes.
	KubeDescribePoCmd   = "%d. kubectl describe po {{.ObjectMeta.Name}} -n {{.ObjectMeta.Namespace}}"
	GetEventsCmd        = "%d. kubectl get events --field-selector involvedObject.name={{.ObjectMeta.Name}} -n {{.ObjectMeta.Namespace}}"
	GetNoAllCmd         = "%d. kubectl get no"
	GetNoAllocatableCmd = "%d. kubectl get no -o custom-columns=NAME:.metadata.name,ALLOCATABLE:.status.allocatable --no-headers"
	GetNoLabelCmd       = "%d. kubectl get no --show-labels"
	GetNoTaintCmd       = "%d. kubectl get no -o custom-columns=NAME:.metadata.name,TAINTS:.spec.taints --no-headers"
	UncordonCmd         = "%d. kubectl uncordon <node name>"

	// Numbered kubectl commands listing PVCs, ConfigMaps and Secrets in the
	// pod's namespace.
	GetPvcCmd    = "%d. kubectl get pvc -n {{ .ObjectMeta.Namespace }}"
	GetCmCmd     = "%d. kubectl get cm -n {{ .ObjectMeta.Namespace }}"
	GetSecretCmd = "%d. kubectl get secret -n {{ .ObjectMeta.Namespace }}"
)

Variables

View Source
// CrashLoopBackOffSolutions maps each CrashLoopBackOff message key to the
// function that produces its output for a given pod and container status.
// Every entry is built by getCrushLoopBackOffCommonSolution from a solution
// template plus an optional extra function (nil when there is none).
// NOTE(review): the meaning of the two returned []string values is not
// visible here — presumably solutions and commands; confirm against
// getCrushLoopBackOffCommonSolution.
var CrashLoopBackOffSolutions = map[string]func(ctx context.Context, pod *v1.Pod, status *v1.ContainerStatus) ([]string, []string){
	ExecutableNotFoundMsg: getCrushLoopBackOffCommonSolution(SolutionExecutableNotFoundMsg, nil),
	NoSuchFileMsg:         getCrushLoopBackOffCommonSolution(SolutionNoSuchFile, nil),
	// Probe failures and OOM pass an extra function as the second argument.
	ReadinessProbeFailMsg: getCrushLoopBackOffCommonSolution(SolutionReadinessProbeFail, getSolutionReadinessProbeFailMsg),
	LivenessProbeFailMsg:  getCrushLoopBackOffCommonSolution(SolutionLivenessProbeFail, getSolutionLivenessProbeFailMsg),
	StartupProbeFailMsg:   getCrushLoopBackOffCommonSolution(SolutionStartupProbeFailMsg, getSolutionStartupProbeFailMsg),
	ExitWithOOM:           getCrushLoopBackOffCommonSolution(SolutionOOM, getSolutionOOM),
	// Exit-code cases use the template alone.
	ExitCode1:             getCrushLoopBackOffCommonSolution(SolutionExitCode1, nil),
	ExitCode126:           getCrushLoopBackOffCommonSolution(SolutionExitCode126, nil),
	ExitCode127:           getCrushLoopBackOffCommonSolution(SolutionExitCode127, nil),
	ExitCode2To128:        getCrushLoopBackOffCommonSolution(SolutionExitCode2To128, nil),
	ExitCode137:           getCrushLoopBackOffCommonSolution(SolutionExitCode137, nil),
	ExitCode139:           getCrushLoopBackOffCommonSolution(SolutionExitCode139, nil),
	ExitCode129To255:      getCrushLoopBackOffCommonSolution(SolutionExitCode129To255, nil),
}
View Source
// DefaultTimespan is the default time span used in event filtering: 48 units
// of time.Hour.
// NOTE(review): presumably Timespan is multiplied by TimespanType to get the
// look-back window (48 hours) — confirm against SetStartTime.
var DefaultTimespan = problem.TimeSpan{
	Timespan:     48,
	TimespanType: time.Hour,
}

DefaultTimespan is the default time span used in event filtering.

View Source
// ImagePullBackOffReasons lists the reason strings that are treated as an
// image pull failure.
var ImagePullBackOffReasons = []string{
	"ImagePullBackOff",
	"ErrImagePull",
	"ErrImagePullBackOff",
}
View Source
// ImagePullBackOffSolutions maps each image pull failure message key to the
// function that produces its output for a given pod and container status.
// Every entry is built by getImagePullBackOffSolution from a solution text.
// NOTE(review): the meaning of the two returned []string values is not
// visible here — confirm against getImagePullBackOffSolution.
var ImagePullBackOffSolutions = map[string]func(ctx context.Context, pod *v1.Pod, status *v1.ContainerStatus) ([]string, []string){
	UnknownManifestMsg:    getImagePullBackOffSolution(UnknownManifestSolution),
	// Shares the unknown-manifest solution.
	RepositoryNotExistMsg: getImagePullBackOffSolution(UnknownManifestSolution),
	NoSuchHostMsg:         getImagePullBackOffSolution(NoSuchHostSolution),
	IOTimeoutMsg:          getImagePullBackOffSolution(IOTimeoutSolution),
	ConnectionRefused:     getImagePullBackOffSolution(ConnectionRefusedSolution),
	UnauthorizedMsg:       getImagePullBackOffSolution(UnauthorizedSolution),
	AuthorizeFailed:       getImagePullBackOffSolution(AuthorizeFailedSolution),
	QuotaRateLimitMsg:     getImagePullBackOffSolution(QuotaRateLimitSolution),
	// Shares the unknown-manifest solution.
	NotFoundMsg:           getImagePullBackOffSolution(UnknownManifestSolution),
}
View Source
// PodNotReadyEventMessage holds the event message texts associated with a pod
// that is not ready; at present only the readiness probe failure.
var PodNotReadyEventMessage = []string{"Readiness probe failed"}

Functions

func CommonInvestigator

func CommonInvestigator(ctx context.Context, wg *sync.WaitGroup, problem *problem.Problem, input *problem.DetectorCreationInput)

func ContainerCrashLoopBackoffInvestigator

func ContainerCrashLoopBackoffInvestigator(ctx context.Context, wg *sync.WaitGroup, problem *problem.Problem,
	input *problem.DetectorCreationInput)

func ContainerImagePullBackoffInvestigator

func ContainerImagePullBackoffInvestigator(ctx context.Context, wg *sync.WaitGroup, problem *problem.Problem,
	input *problem.DetectorCreationInput)

func CreateEventFilterCriteria

func CreateEventFilterCriteria(timespan problem.TimeSpan,
	filterCriteria map[string]string) observability.EventFilterCriteria

CreateEventFilterCriteria creates an observability.EventFilterCriteria from the given timespan and filter criteria.

func DeploymentGenerationMismatchInvestigator

func DeploymentGenerationMismatchInvestigator(ctx context.Context, wg *sync.WaitGroup, problem *problem.Problem,
	input *problem.DetectorCreationInput)

func DeploymentNotAvailableInvestigator

func DeploymentNotAvailableInvestigator(ctx context.Context, wg *sync.WaitGroup, problem *problem.Problem,
	input *problem.DetectorCreationInput)

func DeploymentReplicasMismatchInvestigator

func DeploymentReplicasMismatchInvestigator(ctx context.Context, wg *sync.WaitGroup, problem *problem.Problem,
	input *problem.DetectorCreationInput)

func EndpointAddressNotAvailableInvestigator

func EndpointAddressNotAvailableInvestigator(ctx context.Context, wg *sync.WaitGroup,
	problem *problem.Problem, input *problem.DetectorCreationInput)

func ExecGoTemplate

func ExecGoTemplate(ctx context.Context, template string, object interface{}) (s string, err error)

ExecGoTemplate parses the given Go template string and executes it against object, returning the rendered text or an error.

func GetSolutionsByTemplate

func GetSolutionsByTemplate(ctx context.Context, template string, object interface{}, splitIt bool) (solution []string)

GetSolutionsByTemplate is a general helper for rendering a Go template. The template is passed as a string and the rendered result is returned as a []string. If splitIt is true, the rendered result is split on \n.

func InitContainerImagePullBackoffInvestigator

func InitContainerImagePullBackoffInvestigator(ctx context.Context, wg *sync.WaitGroup, problem *problem.Problem,
	input *problem.DetectorCreationInput)

func NodeDiskPressureInvestigator

func NodeDiskPressureInvestigator(ctx context.Context, wg *sync.WaitGroup, problem *problem.Problem,
	input *problem.DetectorCreationInput)

func NodeMemoryPressureInvestigator

func NodeMemoryPressureInvestigator(ctx context.Context, wg *sync.WaitGroup, problem *problem.Problem,
	input *problem.DetectorCreationInput)

func NodeNetworkUnavailableInvestigator

func NodeNetworkUnavailableInvestigator(ctx context.Context, wg *sync.WaitGroup, problem *problem.Problem,
	input *problem.DetectorCreationInput)

func NodeNotReadyInvestigator

func NodeNotReadyInvestigator(ctx context.Context, wg *sync.WaitGroup, problem *problem.Problem,
	input *problem.DetectorCreationInput)

func NodePIDPressureInvestigator

func NodePIDPressureInvestigator(ctx context.Context, wg *sync.WaitGroup, problem *problem.Problem,
	input *problem.DetectorCreationInput)

func PodNotReadyInvestigator

func PodNotReadyInvestigator(ctx context.Context, wg *sync.WaitGroup, problem *problem.Problem, input *problem.DetectorCreationInput)

func PodNotRunningInvestigator

func PodNotRunningInvestigator(ctx context.Context, wg *sync.WaitGroup, problem *problem.Problem, input *problem.DetectorCreationInput)

func PodNotRunningSolutionsInvestigator

func PodNotRunningSolutionsInvestigator(ctx context.Context, wg *sync.WaitGroup, problem *problem.Problem, input *problem.DetectorCreationInput)

func SetStartTime

func SetStartTime(currentTime time.Time, timespan problem.TimeSpan) time.Time

Types

type CrushLoopPodInfo

// CrushLoopPodInfo (identifier spelling is part of the exported API) bundles a
// pod with the name and exit code of one of its containers.
// NOTE(review): the field name ContainerName matches the {{.ContainerName}}
// placeholder in the CrashLoopBackOff templates, so this is presumably the
// data those templates are rendered with — confirm against caller.
type CrushLoopPodInfo struct {
	// Pod is held by value.
	Pod           v1.Pod
	// ContainerName is the name of the container concerned.
	ContainerName string
	// ExitCode is the container's exit code.
	ExitCode      int32
}

type PodAndStatus

// PodAndStatus pairs a pod (held by value) with a pointer to a container
// status.
// NOTE(review): presumably the status belongs to one of the pod's containers
// — confirm against caller.
type PodAndStatus struct {
	Pod    v1.Pod
	Status *v1.ContainerStatus
}

type PodNotReady

// PodNotReady carries a pod together with three descriptive strings about why
// it is not ready.
// NOTE(review): what Config holds is not visible here — presumably a rendered
// probe or gate configuration; confirm against PodNotReadyInvestigator.
type PodNotReady struct {
	// Pod is a pointer to the pod concerned.
	Pod     *v1.Pod
	// Reason and Message describe the not-ready condition.
	Reason  string
	Message string
	Config  string
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL