Module: check_mk
Branch: master
Commit: e2f8ca955436c2e9b920990b24eb88517a251cdf
URL:
http://git.mathias-kettner.de/git/?p=check_mk.git;a=commit;h=e2f8ca955436c2…
Author: Sebastian Herbord <sh(a)mathias-kettner.de>
Date: Tue Aug 18 15:58:11 2015 +0200
#2529 kernel.util can now be configured to warn if a single cpu core exceeds a utilization threshold for a while
---
.werks/2529 | 9 +++++++++
ChangeLog | 1 +
checks/cpu_util.include | 35 ++++++++++++++++++++++++++++++----
checks/kernel | 28 +++++++++++++++++----------
web/plugins/wato/check_parameters.py | 14 ++++++++++++++
5 files changed, 73 insertions(+), 14 deletions(-)
diff --git a/.werks/2529 b/.werks/2529
new file mode 100644
index 0000000..fa1048d
--- /dev/null
+++ b/.werks/2529
@@ -0,0 +1,9 @@
+Title: kernel.util can now be configured to warn if a single cpu core exceeds a utilization threshold for a while
+Level: 1
+Component: checks
+Compatible: compat
+Version: 1.2.7i3
+Date: 1439906081
+Class: feature
+
+
diff --git a/ChangeLog b/ChangeLog
index b035920..03c206c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -48,6 +48,7 @@
* 2536 emc_isilon_fans, emc_isilon_temp, emc_isilon_temp.cpu: New checks for fans and temperatures of EMC Isilon
* 2527 emc_isilon_power, emc_isilon_quota: New checks for power supply and fs quotas on EMC Isilon data storage devices
* 2528 emc_isilon_cpu, emc_isilon_ifs: New checks for cpu utilization and the combined cluster storage on EMC Isilon data storage devices
+ * 2529 kernel.util can now be configured to warn if a single cpu core exceeds a utilization threshold for a while
* 2315 FIX: windows agent: BOM replacement, fixed incorrect byte offset...
* 2316 FIX: windows agent: fix garbled output of cached agent plugins...
* 2358 FIX: check_mk_agent.solaris: more correct computation of zfs used space...
diff --git a/checks/cpu_util.include b/checks/cpu_util.include
index efb4ea3..13fcbd3 100644
--- a/checks/cpu_util.include
+++ b/checks/cpu_util.include
@@ -89,18 +89,17 @@ def check_cpu_util(util, params, this_time = None):
# - 7 - steal: involuntary wait
# - 8 - guest: time spent in guest OK
# - 9 - guest_nice: time spent in niced guest OK
-def check_cpu_util_unix(values, params):
+def check_cpu_util_unix(values, params, cores = None):
this_time = int(time.time())
# Compute jiffi-differences of all relevant counters
diff_values = []
n = 0
- global g_counters
for v in values:
n += 1
countername = "cpu.util.%d" % n
- last_time, last_val = g_counters.get(countername, (0, 0))
+ last_time, last_val = get_item_state(countername, (0, 0))
diff_values.append(v - last_val)
- g_counters[countername] = (this_time, v)
+ set_item_state(countername, (this_time, v))
sum_jiffies = sum(diff_values) # do not account for steal!
if sum_jiffies == 0:
@@ -163,3 +162,31 @@ def check_cpu_util_unix(values, params):
levelstext = " (warn/crit at %.1f%%/%.1f%%)" % (warn, crit)
yield state, "total: %.1f%%" % util_total_perc + levelstext
+
+ if cores and "core_util_time" in params:
+ for core, user, nice, system, idle, iowait,\
+ irq, softirq, steal, guest, guest_nice in cores:
+
+ core_state_name = "cpu.util.core.high.%s" % core
+ total = user + nice + system + iowait + irq + softirq + steal + guest +
guest_nice
+
+ prev_total = get_item_state("cpu.util.%s.total" % core, 0)
+ total_diff = total - prev_total
+ set_item_state("cpu.util.%s.total" % core, total)
+
+ total_perc = (100.0 * total_diff) / sum_jiffies
+ threshold, warn_core, crit_core = params["core_util_time"]
+ if total_perc > threshold:
+ timestamp = get_item_state(core_state_name, 0)
+ high_load_duration = (this_time - timestamp) / 60
+ if timestamp == 0:
+ set_item_state(core_state_name, this_time)
+ elif high_load_duration > crit_core:
+ yield 2, "%s is under high load for %s minutes (warn/crit at
%s/%s minutes)" %\
+ (core, high_load_duration, warn_core, crit_core)
+ elif high_load_duration > warn_core:
+ yield 1, "%s is under high load for %s minutes (warn/crit at
%s/%s minutes)" %\
+ (core, high_load_duration, warn_core, crit_core)
+ else:
+ clear_item_state(core_state_name)
+
diff --git a/checks/kernel b/checks/kernel
index 9305bc9..9a7d25f 100644
--- a/checks/kernel
+++ b/checks/kernel
@@ -111,6 +111,13 @@ def inventory_cpu_utilization(info):
if len(x) > 0 and x[0] == 'cpu':
return [(None, {})]
+
+def transform_cpu_info(element):
+ if len(element) < 8:
+ element += ['0', '0', '0', '0'] # needed for
Linux 2.4
+
+ return [element[0]] + [int(x) for x in element[1:]]
+
# Columns of cpu usage /proc/stat:
# - cpuX: number of CPU or only 'cpu' for aggregation
# - user: normal processes executing in user mode
@@ -130,20 +137,21 @@ def kernel_check_cpu_utilization(item, params, info):
if type(params) != dict:
params = { "iowait": params }
- # Look for entry beginning with "cpu"
- f = [ l for l in info if l[0] == "cpu" ]
- if len(f) != 1:
- return 3, "More than one line with CPU info found. This check is not
cluster-enabled."
+ # Look for entry matching "cpu" (this is the combined load of all cores)
+ total = [transform_cpu_info(line)
+ for line in info
+ if line[0] == "cpu"]
- line = f[0]
- if len(line) < 8:
- line = line + ['0', '0', '0', '0'] # needed for
Linux 2.4
+ if len(total) != 1:
+ return 3, "More than one line with CPU info found. This check is not
cluster-enabled."
- # line contains now the following columns:
+ cores = [transform_cpu_info(line)
+ for line in info
+ if line[0].startswith("cpu") and len(line[0]) > 3]
+ # total contains now the following columns:
# 'cpu' user nice system idle wait hw-int sw-int (steal ...)
# convert number to int
- values = [ int(x) for x in line[1:] ]
- return check_cpu_util_unix(values, params)
+ return check_cpu_util_unix(total[0][1:], params, cores)
check_info["kernel.util"] = {
'check_function': kernel_check_cpu_utilization,
diff --git a/web/plugins/wato/check_parameters.py b/web/plugins/wato/check_parameters.py
index 7a88b1b..d336eee 100644
--- a/web/plugins/wato/check_parameters.py
+++ b/web/plugins/wato/check_parameters.py
@@ -4300,6 +4300,20 @@ register_check_parameters(
"the the bottleneck of your server is IO. Please
note that depending on the "
"applications being run this might or might not be
totally normal.")),
),
+ ( "core_util_time",
+ Tuple(
+ title = _("Alert on high utilization over an extended time
period on a single core"),
+ elements = [
+ Percentage(title = _("High utilization at "),
default_value = 100.0),
+ Integer(title = _("Warning after "), default_value =
5, unit = "min"),
+ Integer(title = _("Critical after "), default_value =
15, unit = "min"),
+ ],
+ help = _("A single thread fully utilizing a single core
(potentially due to a bug) "
+ "may go unnoticed when only monitoring the total
utilization of the CPU. "
+ "With this configuration, check_mk will alert if a
single core is "
+ "exceeding a utilization threshold over an extended
period of time.")
+ )
+ ),
]
),
forth = lambda old: type(old) != dict and { "iowait" : old } or old,