Module: check_mk
Branch: master
Commit: f8f11dc36a8056948bcf865bc5cc9a78c0811c51
URL: http://git.mathias-kettner.de/git/?p=check_mk.git;a=commit;h=f8f11dc36a8056…
Author: Sebastian Herbord <sh(a)mathias-kettner.de>
Date: Tue Aug 18 16:59:45 2015 +0200
get_counter and get_average now use the new API to store state instead of accessing g_counters directly
---
modules/check_mk_base.py | 31 ++++++++++++++-----------------
1 file changed, 14 insertions(+), 17 deletions(-)
diff --git a/modules/check_mk_base.py b/modules/check_mk_base.py
index 980222a..f789784 100644
--- a/modules/check_mk_base.py
+++ b/modules/check_mk_base.py
@@ -1016,18 +1016,19 @@ def clear_counters(pattern, older_than):
# Store arbitrary values until the next execution of a check
-def get_item_state(itemname, default=None):
- return g_counters.get(itemname, default)
-
-
def set_item_state(itemname, state):
g_counters[itemname] = state
+def get_item_state(itemname, default=None):
+ return g_counters.get(itemname, default)
+
+
def clear_item_state(itemname):
if itemname in g_counters:
del g_counters[itemname]
+
# Idea (1): We could keep global variables for the name of the checktype and item
# during a check and that way "countername" would need to be unique only
# within one checked item. So e.g. you could use "bcast" as name and not "if.%s.bcast" % item
@@ -1050,30 +1051,25 @@ def get_rate(countername, this_time, this_val, allow_negative=False, onwrap=SKIP
# Legacy. Do not use this function in checks directly any more!
def get_counter(countername, this_time, this_val, allow_negative=False, is_rate=False):
- global g_counters
+ old_state = get_item_state(countername, None)
+ set_item_state(countername, (this_time, this_val))
# First time we see this counter? Do not return
# any data!
- if not countername in g_counters:
- g_counters[countername] = (this_time, this_val)
+ if old_state is None:
# Do not suppress this check on check_mk -nv
if opt_dont_submit:
return 1.0, 0.0
raise MKCounterWrapped('Counter initialization')
- last_time, last_val = g_counters.get(countername)
+ last_time, last_val = old_state
timedif = this_time - last_time
if timedif <= 0: # do not update counter
- # Reset counter to a (hopefully) reasonable value
- g_counters[countername] = (this_time, this_val)
# Do not suppress this check on check_mk -nv
if opt_dont_submit:
return 1.0, 0.0
raise MKCounterWrapped('No time difference')
- # update counter for next time
- g_counters[countername] = (this_time, this_val)
-
if not is_rate:
valuedif = this_val - last_val
else:
@@ -1099,16 +1095,17 @@ def get_counter(countername, this_time, this_val, allow_negative=False, is_rate=
# backlog: averaging horizon in minutes
# initialize_zero: assume average of 0.0 when no previous average is stored
def get_average(itemname, this_time, this_val, backlog_minutes, initialize_zero = True):
+ old_state = get_item_state(itemname, None)
# first call: take current value as average or assume 0.0
- if not itemname in g_counters:
+ if old_state is None:
if initialize_zero:
this_val = 0
- g_counters[itemname] = (this_time, this_val)
+ set_item_state(itemname, (this_time, this_val))
return this_val # avoid time diff of 0.0 -> avoid division by zero
# Get previous value and time difference
- last_time, last_val = g_counters.get(itemname)
+ last_time, last_val = old_state
timedif = this_time - last_time
# Gracefully handle time-anomaly of target systems. We lose
@@ -1131,7 +1128,7 @@ def get_average(itemname, this_time, this_val, backlog_minutes, initialize_zero
new_val = last_val * weight + this_val * (1 - weight)
- g_counters[itemname] = (this_time, new_val)
+ set_item_state(itemname, (this_time, new_val))
return new_val
Module: check_mk
Branch: master
Commit: e2f8ca955436c2e9b920990b24eb88517a251cdf
URL: http://git.mathias-kettner.de/git/?p=check_mk.git;a=commit;h=e2f8ca955436c2…
Author: Sebastian Herbord <sh(a)mathias-kettner.de>
Date: Tue Aug 18 15:58:11 2015 +0200
#2529 kernel.util can now be configured to warn if a single cpu core exceeds a utilization threshold for a while
---
.werks/2529 | 9 +++++++++
ChangeLog | 1 +
checks/cpu_util.include | 35 ++++++++++++++++++++++++++++++----
checks/kernel | 28 +++++++++++++++++----------
web/plugins/wato/check_parameters.py | 14 ++++++++++++++
5 files changed, 73 insertions(+), 14 deletions(-)
diff --git a/.werks/2529 b/.werks/2529
new file mode 100644
index 0000000..fa1048d
--- /dev/null
+++ b/.werks/2529
@@ -0,0 +1,9 @@
+Title: kernel.util can now be configured to warn if a single cpu core exceeds a utilization threshold for a while
+Level: 1
+Component: checks
+Compatible: compat
+Version: 1.2.7i3
+Date: 1439906081
+Class: feature
+
+
diff --git a/ChangeLog b/ChangeLog
index b035920..03c206c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -48,6 +48,7 @@
* 2536 emc_isilon_fans, emc_isilon_temp, emc_isilon_temp.cpu: New checks for fans and temperatures of EMC Isilon
* 2527 emc_isilon_power, emc_isilon_quota: New checks for power supply and fs quotas on EMC Isilon data storage devices
* 2528 emc_isilon_cpu, emc_isilon_ifs: New checks for cpu utilization and the combined cluster storage on EMC Isilon data storage devices
+ * 2529 kernel.util can now be configured to warn if a single cpu core exceeds a utilization threshold for a while
* 2315 FIX: windows agent: BOM replacement, fixed incorrect byte offset...
* 2316 FIX: windows agent: fix garbled output of cached agent plugins...
* 2358 FIX: check_mk_agent.solaris: more correct computation of zfs used space...
diff --git a/checks/cpu_util.include b/checks/cpu_util.include
index efb4ea3..13fcbd3 100644
--- a/checks/cpu_util.include
+++ b/checks/cpu_util.include
@@ -89,18 +89,17 @@ def check_cpu_util(util, params, this_time = None):
# - 7 - steal: involuntary wait
# - 8 - guest: time spent in guest OK
# - 9 - guest_nice: time spent in niced guest OK
-def check_cpu_util_unix(values, params):
+def check_cpu_util_unix(values, params, cores = None):
this_time = int(time.time())
# Compute jiffy differences of all relevant counters
diff_values = []
n = 0
- global g_counters
for v in values:
n += 1
countername = "cpu.util.%d" % n
- last_time, last_val = g_counters.get(countername, (0, 0))
+ last_time, last_val = get_item_state(countername, (0, 0))
diff_values.append(v - last_val)
- g_counters[countername] = (this_time, v)
+ set_item_state(countername, (this_time, v))
sum_jiffies = sum(diff_values) # do not account for steal!
if sum_jiffies == 0:
@@ -163,3 +162,31 @@ def check_cpu_util_unix(values, params):
levelstext = " (warn/crit at %.1f%%/%.1f%%)" % (warn, crit)
yield state, "total: %.1f%%" % util_total_perc + levelstext
+
+ if cores and "core_util_time" in params:
+ for core, user, nice, system, idle, iowait,\
+ irq, softirq, steal, guest, guest_nice in cores:
+
+ core_state_name = "cpu.util.core.high.%s" % core
+ total = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice
+
+ prev_total = get_item_state("cpu.util.%s.total" % core, 0)
+ total_diff = total - prev_total
+ set_item_state("cpu.util.%s.total" % core, total)
+
+ total_perc = (100.0 * total_diff) / sum_jiffies
+ threshold, warn_core, crit_core = params["core_util_time"]
+ if total_perc > threshold:
+ timestamp = get_item_state(core_state_name, 0)
+ high_load_duration = (this_time - timestamp) / 60
+ if timestamp == 0:
+ set_item_state(core_state_name, this_time)
+ elif high_load_duration > crit_core:
+ yield 2, "%s is under high load for %s minutes (warn/crit at %s/%s minutes)" %\
+ (core, high_load_duration, warn_core, crit_core)
+ elif high_load_duration > warn_core:
+ yield 1, "%s is under high load for %s minutes (warn/crit at %s/%s minutes)" %\
+ (core, high_load_duration, warn_core, crit_core)
+ else:
+ clear_item_state(core_state_name)
+
diff --git a/checks/kernel b/checks/kernel
index 9305bc9..9a7d25f 100644
--- a/checks/kernel
+++ b/checks/kernel
@@ -111,6 +111,13 @@ def inventory_cpu_utilization(info):
if len(x) > 0 and x[0] == 'cpu':
return [(None, {})]
+
+def transform_cpu_info(element):
+ if len(element) < 8:
+ element += ['0', '0', '0', '0'] # needed for Linux 2.4
+
+ return [element[0]] + [int(x) for x in element[1:]]
+
# Columns of cpu usage /proc/stat:
# - cpuX: number of CPU or only 'cpu' for aggregation
# - user: normal processes executing in user mode
@@ -130,20 +137,21 @@ def kernel_check_cpu_utilization(item, params, info):
if type(params) != dict:
params = { "iowait": params }
- # Look for entry beginning with "cpu"
- f = [ l for l in info if l[0] == "cpu" ]
- if len(f) != 1:
- return 3, "More than one line with CPU info found. This check is not cluster-enabled."
+ # Look for entry matching "cpu" (this is the combined load of all cores)
+ total = [transform_cpu_info(line)
+ for line in info
+ if line[0] == "cpu"]
- line = f[0]
- if len(line) < 8:
- line = line + ['0', '0', '0', '0'] # needed for Linux 2.4
+ if len(total) != 1:
+ return 3, "More than one line with CPU info found. This check is not cluster-enabled."
- # line contains now the following columns:
+ cores = [transform_cpu_info(line)
+ for line in info
+ if line[0].startswith("cpu") and len(line[0]) > 3]
+ # total now contains the following columns:
# 'cpu' user nice system idle wait hw-int sw-int (steal ...)
# convert number to int
- values = [ int(x) for x in line[1:] ]
- return check_cpu_util_unix(values, params)
+ return check_cpu_util_unix(total[0][1:], params, cores)
check_info["kernel.util"] = {
'check_function': kernel_check_cpu_utilization,
diff --git a/web/plugins/wato/check_parameters.py b/web/plugins/wato/check_parameters.py
index 7a88b1b..d336eee 100644
--- a/web/plugins/wato/check_parameters.py
+++ b/web/plugins/wato/check_parameters.py
@@ -4300,6 +4300,20 @@ register_check_parameters(
"the the bottleneck of your server is IO. Please note that depending on the "
"applications being run this might or might not be totally normal.")),
),
+ ( "core_util_time",
+ Tuple(
+ title = _("Alert on high utilization over an extended time period on a single core"),
+ elements = [
+ Percentage(title = _("High utilization at "), default_value = 100.0),
+ Integer(title = _("Warning after "), default_value = 5, unit = "min"),
+ Integer(title = _("Critical after "), default_value = 15, unit = "min"),
+ ],
+ help = _("A single thread fully utilizing a single core (potentially due to a bug) "
+ "may go unnoticed when only monitoring the total utilization of the CPU. "
+ "With this configuration, check_mk will alert if a single core is "
+ "exceeding a utilization threshold over an extended period of time.")
+ )
+ ),
]
),
forth = lambda old: type(old) != dict and { "iowait" : old } or old,
Module: check_mk
Branch: master
Commit: 16fdd7d0b8f23b9d042300f01144bfd2cee0aacf
URL: http://git.mathias-kettner.de/git/?p=check_mk.git;a=commit;h=16fdd7d0b8f23b…
Author: Mathias Kettner <mk(a)mathias-kettner.de>
Date: Tue Aug 18 12:01:07 2015 +0200
Prepare new check API functions for remembering a state
---
modules/check_mk_base.py | 14 +++++++++++++-
1 file changed, 13 insertions(+), 1 deletion(-)
diff --git a/modules/check_mk_base.py b/modules/check_mk_base.py
index cd0132e..980222a 100644
--- a/modules/check_mk_base.py
+++ b/modules/check_mk_base.py
@@ -1015,6 +1015,19 @@ def clear_counters(pattern, older_than):
del g_counters[name]
+# Store arbitrary values until the next execution of a check
+def get_item_state(itemname, default=None):
+ return g_counters.get(itemname, default)
+
+
+def set_item_state(itemname, state):
+ g_counters[itemname] = state
+
+
+def clear_item_state(itemname):
+ if itemname in g_counters:
+ del g_counters[itemname]
+
# Idea (1): We could keep global variables for the name of the checktype and item
# during a check and that way "countername" would need to be unique only
# within one checked item. So e.g. you could use "bcast" as name and not "if.%s.bcast" % item
@@ -1122,7 +1135,6 @@ def get_average(itemname, this_time, this_val, backlog_minutes, initialize_zero
return new_val
-
#.
# .--Checking------------------------------------------------------------.
# | ____ _ _ _ |