diff --git a/cmd/cadvisor.go b/cmd/cadvisor.go index 80f76c51ed9..acb9ce65bf8 100644 --- a/cmd/cadvisor.go +++ b/cmd/cadvisor.go @@ -112,6 +112,7 @@ var ( container.CPUTopologyMetrics: struct{}{}, container.ResctrlMetrics: struct{}{}, container.CPUSetMetrics: struct{}{}, + container.OOMMetrics: struct{}{}, } ) diff --git a/cmd/cadvisor_test.go b/cmd/cadvisor_test.go index 1e22dfd9ad0..755c03f6835 100644 --- a/cmd/cadvisor_test.go +++ b/cmd/cadvisor_test.go @@ -109,6 +109,7 @@ func TestToIncludedMetrics(t *testing.T) { container.CPUTopologyMetrics: struct{}{}, container.ResctrlMetrics: struct{}{}, container.CPUSetMetrics: struct{}{}, + container.OOMMetrics: struct{}{}, }, container.AllMetrics, {}, diff --git a/container/factory.go b/container/factory.go index 56d198976ef..f04f8a1b11e 100644 --- a/container/factory.go +++ b/container/factory.go @@ -64,6 +64,7 @@ const ( CPUTopologyMetrics MetricKind = "cpu_topology" ResctrlMetrics MetricKind = "resctrl" CPUSetMetrics MetricKind = "cpuset" + OOMMetrics MetricKind = "oom_event" ) // AllMetrics represents all kinds of metrics that cAdvisor supported. @@ -89,6 +90,7 @@ var AllMetrics = MetricSet{ CPUTopologyMetrics: struct{}{}, ResctrlMetrics: struct{}{}, CPUSetMetrics: struct{}{}, + OOMMetrics: struct{}{}, } func (mk MetricKind) String() string { diff --git a/info/v1/container.go b/info/v1/container.go index 9fe04fddea2..e44f50edfb1 100644 --- a/info/v1/container.go +++ b/info/v1/container.go @@ -963,6 +963,8 @@ type ContainerStats struct { Resctrl ResctrlStats `json:"resctrl,omitempty"` CpuSet CPUSetStats `json:"cpuset,omitempty"` + + OOMEvents uint64 `json:"oom_events,omitempty"` } func timeEq(t1, t2 time.Time, tolerance time.Duration) bool { diff --git a/manager/container.go b/manager/container.go index c776e10d640..09a1923b9df 100644 --- a/manager/container.go +++ b/manager/container.go @@ -27,6 +27,7 @@ import ( "strconv" "strings" "sync" + "sync/atomic" "time" "github.com/google/cadvisor/cache/memory" @@ -102,6 +103,8 @@ type containerData struct { // resctrlCollector updates stats for resctrl controller. resctrlCollector stats.Collector + + oomEvents uint64 } // jitter returns a time.Duration between duration and duration + maxFactor * duration, @@ -668,6 +671,9 @@ func (cd *containerData) updateStats() error { klog.V(2).Infof("Failed to add summary stats for %q: %v", cd.info.Name, err) } } + + stats.OOMEvents = atomic.LoadUint64(&cd.oomEvents) + var customStatsErr error cm := cd.collectorManager.(*collector.GenericCollectorManager) if len(cm.Collectors) > 0 { diff --git a/manager/manager.go b/manager/manager.go index cf6a8a1a100..03423ab3dcb 100644 --- a/manager/manager.go +++ b/manager/manager.go @@ -24,6 +24,7 @@ import ( "strconv" "strings" "sync" + "sync/atomic" "time" "github.com/google/cadvisor/accelerators" @@ -35,7 +36,7 @@ import ( "github.com/google/cadvisor/events" "github.com/google/cadvisor/fs" info "github.com/google/cadvisor/info/v1" - "github.com/google/cadvisor/info/v2" + v2 "github.com/google/cadvisor/info/v2" "github.com/google/cadvisor/machine" "github.com/google/cadvisor/nvm" "github.com/google/cadvisor/perf" @@ -1237,6 +1238,24 @@ func (m *manager) watchForNewOoms() error { if err != nil { klog.Errorf("failed to add OOM kill event for %q: %v", oomInstance.ContainerName, err) } + + // Count OOM events for later collection by prometheus + request := v2.RequestOptions{ + IdType: v2.TypeName, + Count: 1, + } + conts, err := m.getRequestedContainers(oomInstance.ContainerName, request) + if err != nil { + klog.V(2).Infof("failed getting container info for %q: %v", oomInstance.ContainerName, err) + continue + } + if len(conts) != 1 { + klog.V(2).Info("Expected the request to match only one container") + continue + } + for _, cont := range conts { + atomic.AddUint64(&cont.oomEvents, 1) + } } }() return nil diff --git a/metrics/prometheus.go b/metrics/prometheus.go index 7d3f24a99da..66846b649d7 100644 --- a/metrics/prometheus.go +++ b/metrics/prometheus.go @@ -1757,6 +1757,17 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri }, }...) } + if includedMetrics.Has(container.OOMMetrics) { + c.containerMetrics = append(c.containerMetrics, containerMetric{ + name: "container_oom_events_total", + help: "Count of out of memory events observed for the container", + valueType: prometheus.CounterValue, + getValues: func(s *info.ContainerStats) metricValues { + return metricValues{{value: float64(s.OOMEvents), timestamp: s.Timestamp}} + }, + }) + } + return c } diff --git a/metrics/testdata/prometheus_metrics b/metrics/testdata/prometheus_metrics index 79b17a395ea..b6db0825f0b 100644 --- a/metrics/testdata/prometheus_metrics +++ b/metrics/testdata/prometheus_metrics @@ -358,6 +358,9 @@ container_network_udp_usage_total{container_env_foo_env="prod",container_label_f container_network_udp_usage_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",udp_state="listen",zone_name="hello"} 0 1395066363000 container_network_udp_usage_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",udp_state="rxqueued",zone_name="hello"} 0 1395066363000 container_network_udp_usage_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",udp_state="txqueued",zone_name="hello"} 0 1395066363000 +# HELP container_oom_events_total Count of out of memory events observed for the container +# TYPE container_oom_events_total counter +container_oom_events_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0 1395066363000 # HELP container_perf_events_total Perf event metric. # TYPE container_perf_events_total counter container_perf_events_total{container_env_foo_env="prod",container_label_foo_label="bar",cpu="0",event="instructions",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 123 1395066363000