Documentation
¶
Overview ¶
* Copyright FMR LLC <opensource@fidelity.com> * * SPDX-License-Identifier: Apache
* Copyright FMR LLC <opensource@fidelity.com> * * SPDX-License-Identifier: Apache
* Copyright FMR LLC <opensource@fidelity.com> * * SPDX-License-Identifier: Apache
* Copyright FMR LLC <opensource@fidelity.com> * * SPDX-License-Identifier: Apache
* Copyright FMR LLC <opensource@fidelity.com> * * SPDX-License-Identifier: Apache
* Copyright FMR LLC <opensource@fidelity.com> * * SPDX-License-Identifier: Apache
* Copyright FMR LLC <opensource@fidelity.com> * * SPDX-License-Identifier: Apache
* Copyright FMR LLC <opensource@fidelity.com> * * SPDX-License-Identifier: Apache
* Copyright FMR LLC <opensource@fidelity.com> * * SPDX-License-Identifier: Apache
* Copyright FMR LLC <opensource@fidelity.com> * * SPDX-License-Identifier: Apache
* Copyright FMR LLC <opensource@fidelity.com> * * SPDX-License-Identifier: Apache
Index ¶
- Constants
- Variables
- func CommonInvestigator(ctx context.Context, wg *sync.WaitGroup, problem *problem.Problem, ...)
- func ContainerCrashLoopBackoffInvestigator(ctx context.Context, wg *sync.WaitGroup, problem *problem.Problem, ...)
- func ContainerImagePullBackoffInvestigator(ctx context.Context, wg *sync.WaitGroup, problem *problem.Problem, ...)
- func CreateEventFilterCriteria(timespan problem.TimeSpan, filterCriteria map[string]string) observability.EventFilterCriteria
- func DeploymentGenerationMismatchInvestigator(ctx context.Context, wg *sync.WaitGroup, problem *problem.Problem, ...)
- func DeploymentNotAvailableInvestigator(ctx context.Context, wg *sync.WaitGroup, problem *problem.Problem, ...)
- func DeploymentReplicasMismatchInvestigator(ctx context.Context, wg *sync.WaitGroup, problem *problem.Problem, ...)
- func EndpointAddressNotAvailableInvestigator(ctx context.Context, wg *sync.WaitGroup, problem *problem.Problem, ...)
- func ExecGoTemplate(ctx context.Context, template string, object interface{}) (s string, err error)
- func GetResourceEvents(ctx context.Context, input *problem.DetectorCreationInput, name string, ...) ([]observability.EventRecord, error)
- func GetSolutionsByTemplate(ctx context.Context, template string, object interface{}, splitIt bool) (solution []string)
- func IngressMisconfiguredInvestigator(ctx context.Context, wg *sync.WaitGroup, problem *problem.Problem, ...)
- func IngressSolution(ctx context.Context, reason string, ing networkv1.Ingress) []string
- func InitContainerImagePullBackoffInvestigator(ctx context.Context, wg *sync.WaitGroup, problem *problem.Problem, ...)
- func NodeDiskPressureInvestigator(ctx context.Context, wg *sync.WaitGroup, problem *problem.Problem, ...)
- func NodeMemoryPressureInvestigator(ctx context.Context, wg *sync.WaitGroup, problem *problem.Problem, ...)
- func NodeNetworkUnavailableInvestigator(ctx context.Context, wg *sync.WaitGroup, problem *problem.Problem, ...)
- func NodeNotReadyInvestigator(ctx context.Context, wg *sync.WaitGroup, problem *problem.Problem, ...)
- func NodePIDPressureInvestigator(ctx context.Context, wg *sync.WaitGroup, problem *problem.Problem, ...)
- func PodNotReadyInvestigator(ctx context.Context, wg *sync.WaitGroup, problem *problem.Problem, ...)
- func PodNotRunningInvestigator(ctx context.Context, wg *sync.WaitGroup, problem *problem.Problem, ...)
- func PodNotRunningSolutionsInvestigator(ctx context.Context, wg *sync.WaitGroup, problem *problem.Problem, ...)
- func SetStartTime(currentTime time.Time, timespan problem.TimeSpan) time.Time
- type CrushLoopPodInfo
- type PodAndStatus
- type PodNotReady
Constants ¶
const ( FoundMsg = "Please check below resource status, you can get possible root cause from messages." GetPoCmd = ` 1. kubectl describe po {{.ObjectMeta.Name}} -n {{.ObjectMeta.Namespace}} ` GetRsCmd = ` kubectl describe rs {{.Name}} -n {{.ObjectMeta.Namespace}} ` GetDeployCmd = ` kubectl describe deploy {{.Name}} -n {{.ObjectMeta.Namespace}} ` GetSsCmd = ` kubectl describe ss {{.Name}} -n {{.ObjectMeta.Namespace}} ` GetDsCmd = ` kubectl describe ds {{.Name}} -n {{.ObjectMeta.Namespace}} ` DesSvcCmd = ` kubectl describe svc {{.Name}} -n {{.ObjectMeta.Namespace}} ` GetEndpointsCmd = ` kubectl describe endpoints {{ .Name}} -n {{ .ObjectMeta.Namespace }} ` GetIngCmd = ` kubectl describe ing {{ .Name}} -n {{ .ObjectMeta.Namespace }} ` GetJobCmd = ` kubectl describe job {{ .Name}} -n {{ .ObjectMeta.Namespace }} ` GetCronjobCmd = ` kubectl describe cronjob {{ .Name}} -n {{ .ObjectMeta.Namespace }} ` GetNoCmd = ` kubectl describe no {{ .Name}} ` )
const ( CrashLoopBackOff = "CrashLoopBackOff" ExecutableNotFoundMsg = "executable file not found in $PATH: unknown" NoSuchFileMsg = "no such file or directory: unknown" ReadinessProbeFailMsg = "Readiness probe failed" LivenessProbeFailMsg = "Liveness probe failed" StartupProbeFailMsg = "Startup probe failed" ExitWithOOM = "Container killed due to OOM." ExitCode1 = "Generally application has issues, container exit code(1)." ExitCode126 = "Command invoked cannot execute, container exit code(126)." ExitCode127 = "Command not found, container exit code(127)." ExitCode2To128 = "Container terminated internally, EXIT with Non-Zero value(between 2 to 128)." ExitCode137 = "Container was killed, generally due to OOM, container exit code(137)." ExitCode139 = "Issues in application code or the base image, container exit code(139)" ExitCode129To255 = "Container terminated by external signal, EXIT with Non-Zero value(between 129 to 255)." CrushLoopBackOffMsg = ` 1. Container {{.ContainerName}} has been restarted more than 10 times in the last few minutes. ` CrashLoopBackOffDocLink = ` https://containersolutions.github.io/runbooks/posts/kubernetes/crashloopbackoff ` )
const ( DescribePoCmd = `` /* 265-byte string literal not displayed */ SolutionExitCode1 = `` /* 176-byte string literal not displayed */ SolutionExitCode126 = `` /* 232-byte string literal not displayed */ SolutionExitCode127 = `` /* 210-byte string literal not displayed */ SolutionExitCode2To128 = `` /* 200-byte string literal not displayed */ SolutionExitCode137 = `` /* 188-byte string literal not displayed */ SolutionExitCode139 = `` /* 207-byte string literal not displayed */ SolutionExitCode129To255 = `` /* 186-byte string literal not displayed */ SolutionOOM = `2. Container {{.ContainerName}} has EXITED with reason OOMKilled (1). Check the resource limits of the container. ` SolutionReadinessProbeFail = `2. Following readiness probe has failed for the container {{.ContainerName}}. ` SolutionLivenessProbeFail = `2. Following liveliness probe has failed for the container {{.ContainerName}}. ` SolutionStartupProbeFailMsg = `2. Following startup probe has failed for the container {{.ContainerName}}. ` SolutionExecutableNotFoundMsg = `` /* 445-byte string literal not displayed */ SolutionNoSuchFile = `` /* 446-byte string literal not displayed */ DefaultSolution = `2. This may due to container resource request not enough, readiness probe or liveness probe failed. Not enough initial delay. 3. Or may due to commands inside container failed, command not found in path, readonly file system, missing configurations or dependencies. 4. Use the commands below to check logs or events can help found the root cause. 5. Below docs can help understand this issue:` + CrashLoopBackOffDocLink )
const ( UnknownManifestMsg = "The named manifest is not known" NoSuchHostMsg = "No such host" IOTimeoutMsg = "I/O timeout" NotFoundMsg = "failed to pull and unpack image .* not found" ConnectionRefused = "Connection refused" AuthorizeFailed = "failed to authorize" QuotaRateLimitMsg = "Quota exceeded or Too Many Requests or rate limit" RepositoryNotExistMsg = "Repository does not exist or may require 'docker login'" )
const ( UnknownManifestSolution = `` /* 305-byte string literal not displayed */ NoSuchHostSolution = `` /* 215-byte string literal not displayed */ IOTimeoutSolution = `` /* 438-byte string literal not displayed */ ConnectionRefusedSolution = `` /* 405-byte string literal not displayed */ AuthorizeFailedSolution = `` /* 247-byte string literal not displayed */ QuotaRateLimitSolution = `` /* 205-byte string literal not displayed */ )
const ( GetPoSecretCmd = `` /* 292-byte string literal not displayed */ SecretMsg1 = `` /* 262-byte string literal not displayed */ SecretMsg2NotExist = `` /* 260-byte string literal not displayed */ SecretMsg3 = `Run the following command 2 to get the imagePullSecret name: ` )
const ( NotAvailableSolution = ` 1. Deployment '{{.Name}}' is not available. 2. Please check the replica(s) status in this deployment. ` MemoryQuotaSolution = ` 1. Deployment '{{.Name}}' has exceeded memory quotas. 2. Please check the memory requests/limits of your deployment. ` CPUQuotaSolution = ` 1. Deployment '{{.Name}}' has exceeded CPU quotas. 2. Please check the CPU requests/limits of your deployment. ` ResourceQuotaSolution = ` 1. Deployment '{{.Name}}' has exceeded resource quotas. 2. Please check the requests/limits of your deployment. ` DescribeCmd = ` kubectl describe deploy {{.Name}} -n {{ .ObjectMeta.Namespace }} ` )
const ( NotReadyAddressSolution = ` 1. At least one subset has 'NotReadyAddresses' status. 2. Please check the health of the selected pods. ` NoPodSelectedSolution = `` /* 201-byte string literal not displayed */ NoServiceFoundSolution = ` 1. No Service or subsets found for Endpoint {{.Name}}. 2. Please check if this Endpoint is necessary. ` )
const ( S3Conflicts = "conflicting attributes access_logs.s3.bucket" S3PrefixConflicts = "conflicting attributes access_logs.s3.prefix" BucketNotExist = "S3Bucket.* does not exist" NoCertFound = "no certificate found" CertNotFound = "CertificateNotFound" Protocol = "protocol" SecurityGroup = "securityGroups" Description = "Ingress {{.Name}} has incorrect configuration detected by Ingress Controller." IngDefaultSolution = "1. Ingress configuration Error: %s" FixIngSolution = "%d. Fix above issue then deploy again, could possibly make Ingress work." ProtocolSolution = "%d. Check protocol config in Ingress annotations, correct protocol includes http, https." S3ConflictsSolution = "" /* 147-byte string literal not displayed */ S3PrefixConflictsSolution = "" /* 128-byte string literal not displayed */ S3Solution = "%d. Check s3.bucket config in Ingress annotations, make sure S3 bucket exists and is accessible." SecurityGroupSolution = "%d. Check securityGroups config in Ingress annotations, pass the correct securityGroup associated with the cluster." CertNotFoundSolution = "" /* 157-byte string literal not displayed */ IngressCommands = `` /* 161-byte string literal not displayed */ )
const ( NotReadySolution = `` /* 360-byte string literal not displayed */ MemPressSolution = `` /* 451-byte string literal not displayed */ DiskPressSolution = `` /* 314-byte string literal not displayed */ PidPressSolution = `` /* 402-byte string literal not displayed */ NetUnAvailableSolution = `` /* 459-byte string literal not displayed */ FindPoOnNoCmd = ` 1. kubectl get pods -o wide -A | grep {{ .ObjectMeta.Name }} ` KubeletCmd = ` 1. journalctl -u kubelet. 2. systemctl status kubelet. 3. systemctl restart kubelet. ` )
const ( ReadinessProbeFailedSolution = `` /* 296-byte string literal not displayed */ ReadinessGateFailedSolution = `` /* 288-byte string literal not displayed */ UsefulCommands = `` /* 164-byte string literal not displayed */ )
const ( NodesNotAvailable = "0/.* nodes are available" PendingNodeSelector = "didn't match .*selector" PendingNodeAffinity = "didn't match .*affinity" PendingNodeTaint = "untolerated taint" PendingNodeUnschedulable = "unschedulable" PendingInsufficient = "Insufficient" PendingNoHostPort = "node(s) didn't have free ports" PendingPVCGetErr = "error getting PVC" PendingPVCNotFound = "persistentvolumeclaim .* not found" PendingUnboundPVC = "pod has unbound immediate PersistentVolumeClaims" PendingBindFailed = "Failed to bind volumes" PendingCmNotFound = "configmap .* not found" PendingSecretNotFound = "secret .* not found" //nolint:gosec )
const ( FailedSchedulingMessage = "%d. Pod failed scheduling, message is: %s." PendingNodeUnschedulableSolution = "%d. Some nodes are unschedulable, try to uncordon these nodes may fix this." PendingNodeSelectorSolution = "" /* 204-byte string literal not displayed */ PendingNodeTaintSolution = "" /* 191-byte string literal not displayed */ PendingInsufficientSolution = "" /* 147-byte string literal not displayed */ PendingNoHostPortSolution = "" /* 147-byte string literal not displayed */ PVCNotFoundSolution = "2. Pod {{ .ObjectMeta.Name }} is pending, used PVC not found." + KubectlPodAndPVC PVCUnboundSolution = "2. Pod {{ .ObjectMeta.Name }} is pending, due to use an unbound PVC." + KubectlPodAndPVC KubectlPodAndPVC = `` /* 172-byte string literal not displayed */ ContainerFailMount = "%d. Container failed mount, message is: %s." ContainerFailMountSolution = "%d. Please check your volumes of the Pod, try to change to correct and existing resources may fix this problem." CmNotFoundSolution = "%d. Please check the configMap that mount, try to change to an existing configMap may fix this issue." SecretNotFoundSolution = "%d. Please check the secret that mount, try to change to an existing secret may fix this issue." PendingUnknownSolution = `` /* 500-byte string literal not displayed */ KubeDescribePoCmd = "%d. kubectl describe po {{.ObjectMeta.Name}} -n {{.ObjectMeta.Namespace}}" GetEventsCmd = "%d. kubectl get events --field-selector involvedObject.name={{.ObjectMeta.Name}} -n {{.ObjectMeta.Namespace}}" GetNoAllCmd = "%d. kubectl get no" GetNoAllocatableCmd = "%d. kubectl get no -o custom-columns=NAME:.metadata.name,ALLOCATABLE:.status.allocatable --no-headers" GetNoLabelCmd = "%d. kubectl get no --show-labels" GetNoTaintCmd = "%d. kubectl get no -o custom-columns=NAME:.metadata.name,TAINTS:.spec.taints --no-headers" UncordonCmd = "%d. kubectl uncordon <node name>" GetPvcCmd = "%d. kubectl get pvc -n {{ .ObjectMeta.Namespace }}" GetCmCmd = "%d. kubectl get cm -n {{ .ObjectMeta.Namespace }}" GetSecretCmd = "%d. kubectl get secret -n {{ .ObjectMeta.Namespace }}" )
Variables ¶
var CrashLoopBackOffSolutions = map[string]func(ctx context.Context, pod *v1.Pod, status *v1.ContainerStatus) ([]string, []string){ ExecutableNotFoundMsg: getCrushLoopBackOffCommonSolution(SolutionExecutableNotFoundMsg, nil), NoSuchFileMsg: getCrushLoopBackOffCommonSolution(SolutionNoSuchFile, nil), ReadinessProbeFailMsg: getCrushLoopBackOffCommonSolution(SolutionReadinessProbeFail, getSolutionReadinessProbeFailMsg), LivenessProbeFailMsg: getCrushLoopBackOffCommonSolution(SolutionLivenessProbeFail, getSolutionLivenessProbeFailMsg), StartupProbeFailMsg: getCrushLoopBackOffCommonSolution(SolutionStartupProbeFailMsg, getSolutionStartupProbeFailMsg), ExitWithOOM: getCrushLoopBackOffCommonSolution(SolutionOOM, getSolutionOOM), ExitCode1: getCrushLoopBackOffCommonSolution(SolutionExitCode1, nil), ExitCode126: getCrushLoopBackOffCommonSolution(SolutionExitCode126, nil), ExitCode127: getCrushLoopBackOffCommonSolution(SolutionExitCode127, nil), ExitCode2To128: getCrushLoopBackOffCommonSolution(SolutionExitCode2To128, nil), ExitCode137: getCrushLoopBackOffCommonSolution(SolutionExitCode137, nil), ExitCode139: getCrushLoopBackOffCommonSolution(SolutionExitCode139, nil), ExitCode129To255: getCrushLoopBackOffCommonSolution(SolutionExitCode129To255, nil), }
var DefaultTimespan = problem.TimeSpan{ Timespan: 48, TimespanType: time.Hour, }
DefaultTimespan is the default time span (48 hours) used when filtering events.
var ImagePullBackOffReasons = []string{"ImagePullBackOff", "ErrImagePull", "ErrImagePullBackOff"}
var ImagePullBackOffSolutions = map[string]func(ctx context.Context, pod *v1.Pod, status *v1.ContainerStatus) ([]string, []string){ UnknownManifestMsg: getImagePullBackOffSolution(UnknownManifestSolution), RepositoryNotExistMsg: getImagePullBackOffSolution(UnknownManifestSolution), NoSuchHostMsg: getImagePullBackOffSolution(NoSuchHostSolution), IOTimeoutMsg: getImagePullBackOffSolution(IOTimeoutSolution), ConnectionRefused: getImagePullBackOffSolution(ConnectionRefusedSolution), UnauthorizedMsg: getImagePullBackOffSolution(UnauthorizedSolution), AuthorizeFailed: getImagePullBackOffSolution(AuthorizeFailedSolution), QuotaRateLimitMsg: getImagePullBackOffSolution(QuotaRateLimitSolution), NotFoundMsg: getImagePullBackOffSolution(UnknownManifestSolution), }
var PodNotReadyEventMessage = []string{
"Readiness probe failed",
}
Functions ¶
func CommonInvestigator ¶
func CreateEventFilterCriteria ¶
func CreateEventFilterCriteria(timespan problem.TimeSpan, filterCriteria map[string]string) observability.EventFilterCriteria
CreateEventFilterCriteria creates an observability.EventFilterCriteria from the given timespan and filter criteria.
func ExecGoTemplate ¶
ExecGoTemplate parses the given Go template and executes it against object, returning the resulting string.
func GetResourceEvents ¶
func GetResourceEvents(ctx context.Context, input *problem.DetectorCreationInput, name string, namespace string) ([]observability.EventRecord, error)
func GetSolutionsByTemplate ¶
func GetSolutionsByTemplate(ctx context.Context, template string, object interface{}, splitIt bool) (solution []string)
GetSolutionsByTemplate is a general-purpose helper for parsing a Go template. The template is passed in as a string, and the parsed result is returned as a []string. If splitIt is true, the parsed result is split on \n.
func IngressSolution ¶
func PodNotReadyInvestigator ¶
Types ¶
type CrushLoopPodInfo ¶
type PodAndStatus ¶
type PodAndStatus struct { Pod v1.Pod Status *v1.ContainerStatus }
Source Files
¶
- common.go
- common_investigator.go
- container_crushloopbackoff_investigator.go
- container_imagepullbackoff_investigator.go
- deployment_investigator.go
- endpoint_investigator.go
- ingress_investigator.go
- initcontainer_investigator.go
- node_investigator.go
- pod_not_ready_investigator.go
- pod_pending_investigator.go