From 99ead985c0ec96f756567d1ae6ce5788d8c44de5 Mon Sep 17 00:00:00 2001
From: Mac Chaffee
Date: Sun, 29 Jan 2023 15:44:12 -0500
Subject: [PATCH] Create alert for OOMKill events inside containers

---
 alerts/resource_alerts.libsonnet | 17 +++++++++++++++++
 tests.yaml                       | 24 +++++++++++++++++++++++-
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/alerts/resource_alerts.libsonnet b/alerts/resource_alerts.libsonnet
index a3e4fa406..27b132913 100644
--- a/alerts/resource_alerts.libsonnet
+++ b/alerts/resource_alerts.libsonnet
@@ -15,6 +15,8 @@
     // See https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/FAQ.md#how-can-i-configure-overprovisioning-with-cluster-autoscaler
     // for more details.
     ignoringOverprovisionedWorkloadSelector: '',
+    // Maximum rate of OOMKiller events that is considered normal for any single container
+    maxOOMKillRate: 0,
   },
 
   prometheusAlerts+:: {
@@ -213,6 +215,21 @@
               summary: 'Processes experience elevated CPU throttling.',
             },
           },
+          {
+            alert: 'KubeContainerOOMEvents',
+            expr: |||
+              sum by (namespace, pod, container, %(clusterLabel)s)
+              (rate(container_oom_events_total[5m])) > %(maxOOMKillRate)s
+            ||| % $._config,
+            labels: {
+              severity: 'warning',
+            },
+            'for': '5m',
+            annotations: {
+              description: 'Processes are being killed by the Out Of Memory (OOM) Killer inside the "{{ $labels.container}}" container in pod {{ $labels.namespace }}/{{ $labels.pod }}.',
+              summary: 'Processes are being OOMKilled',
+            },
+          },
         ],
       },
     ],
diff --git a/tests.yaml b/tests.yaml
index 91cf98d3b..630727fe2 100644
--- a/tests.yaml
+++ b/tests.yaml
@@ -1113,7 +1113,7 @@ tests:
         runbook_url: "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping"
         summary: "Pod is crash looping."
   - eval_time: 20m
-    alertname: KubePodCrashLooping # alert fired for a period of 5 minutes after resolution because the alert looks back at the last 5 minutes of data and the range vector doesn't take stale samples into account
+    alertname: KubePodCrashLooping # alert fired for a period of 5 minutes after resolution because the alert looks back at the last 5 minutes of data and the range vector doesn't take stale samples into account
     exp_alerts:
     - exp_labels:
         severity: "warning"
@@ -1214,3 +1214,25 @@ tests:
         description: 'Cluster has overcommitted memory resource requests for Namespaces.'
         runbook_url: "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryquotaovercommit"
         summary: "Cluster has overcommitted memory resource requests."
+
+- interval: 1m
+  input_series:
+  - series: 'container_oom_events_total{container="dbserver", namespace="default", pod="dbserver-128fnk319a-alq1x"}'
+    values: '0 0 0 0 0 0 0'
+  - series: 'container_oom_events_total{container="webserver", namespace="default", pod="webserver-69755ddb67-dn2v5"}'
+    values: '0 2 2 4 4 4 4'
+  alert_rule_test:
+  - eval_time: 5m
+    alertname: KubeContainerOOMEvents
+  - eval_time: 6m
+    alertname: KubeContainerOOMEvents
+    exp_alerts:
+    - exp_labels:
+        container: webserver
+        namespace: default
+        pod: webserver-69755ddb67-dn2v5
+        severity: warning
+      exp_annotations:
+        summary: Processes are being OOMKilled
+        description: Processes are being killed by the Out Of Memory (OOM) Killer inside the "webserver" container in pod default/webserver-69755ddb67-dn2v5.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontaineroomevents
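
Note for reviewers: a minimal sketch of how a downstream user could tune the new threshold, assuming the consumption pattern documented in the kubernetes-mixin README; the import path, file name, and the 0.1 value are illustrative assumptions, only the maxOOMKillRate field itself comes from this patch.

    // mixin_overrides.libsonnet (hypothetical consumer file; import path assumed)
    local kubernetes = import 'kubernetes-mixin/mixin.libsonnet';

    kubernetes {
      _config+:: {
        // Tolerate short OOM-kill bursts: alert only when a single container is
        // OOM-killed at more than ~0.1 events/second averaged over 5 minutes
        // (rate() is per-second, so this value is an events-per-second threshold).
        maxOOMKillRate: 0.1,
      },
    }

With the default of 0, any container whose 5-minute OOM-kill rate stays above zero for the 5-minute 'for' window fires the warning; raising the value loosens the alert per container.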