Module: check_mk
Branch: master
Commit: bc791f016dd14689a6cc71782be19db3322ea689
URL:
http://git.mathias-kettner.de/git/?p=check_mk.git;a=commit;h=bc791f016dd146…
Author: Andreas Boesl <ab(a)mathias-kettner.de>
Date: Mon Jul 21 11:54:15 2014 +0200
FIX ps: now able to handle bigger process groups without constant MKCounterWrapped
Exceptions
When a monitored process group got too many volatile processes (processes
spawn/despawn)<br>
the ps check had trouble running through because there were multiple MKCounterWrapped
Exceptions
for each new process.<br>
This has been fixed. A newly detected process starts with usertime and kerneltime
delta-values of 0.
---
.werks/1084 | 12 ++++++++++++
ChangeLog | 3 ++-
checks/ps | 34 ++++++++++++++++++++++++++++------
modules/check_mk_base.py | 15 +++++++++++++++
4 files changed, 57 insertions(+), 7 deletions(-)
diff --git a/.werks/1084 b/.werks/1084
new file mode 100644
index 0000000..7353c87
--- /dev/null
+++ b/.werks/1084
@@ -0,0 +1,12 @@
+Title: ps: now able to handle bigger process groups without constant MKCounterWrapped
Exceptions
+Level: 1
+Component: checks
+Version: 1.2.5i5
+Date: 1405935795
+Class: fix
+
+When a monitored process group got too many volatile processes (processes
spawn/despawn)<br>
+the ps check had trouble running through because there were multiple MKCounterWrapped
Exceptions
+for each new process.<br>
+
+This has been fixed. A newly detected process starts with usertime and kerneltime
delta-values of 0.
diff --git a/ChangeLog b/ChangeLog
index 6842fd8..a09e7ae 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -57,8 +57,9 @@
* 0634 FIX: Max Bandwidth for PNP-Graphs of Interface checks corrected...
* 0635 FIX: fc_port: the check no longer inventorizes ports with administrative state
of 'unknown' or 'offline'
* 0636 FIX: fc_port: do not inventorize if brocade fibre channel mib is also
supported on the device...
- * 0638 FIX: windows_updates: agent plugin now always sends section header, even if no
update information provided...
* 1083 FIX: ad_replication.bat: does not return data if the server is no DC
+ * 0638 FIX: windows_updates: agent plugin now always sends section header, even if no
update information provided...
+ * 1084 FIX: ps: now able to handle bigger process groups without constant
MKCounterWrapped Exceptions...
Multisite:
* 1013 Sort host names naturally, e.g. foobar11 comes after foobar2...
diff --git a/checks/ps b/checks/ps
index 1c6f2be..596998b 100644
--- a/checks/ps
+++ b/checks/ps
@@ -323,7 +323,7 @@ def process_matches(ps, procname, l_user):
# "okmax" : 1,
# "warnmax" : 1,
# }
-def check_procs(item, params, info, with_perfdata):
+def check_ps(item, params, info, with_perfdata):
now = time.time()
cpu_cores, info = ps_parse_info(info) # parse windows wmic information
@@ -351,6 +351,11 @@ def check_procs(item, params, info, with_perfdata):
percent_cpu = 0.0
extended_perfdata = False
+ # The counter names for the ps check are quite volatile, because there is a
+ # dynamic part (the pid) in the name. Therefore we clear any counters
+ # older than one day. Affected are ps_wmic.user, ps_wmic.kernel, ps_stat.pcpu
+ clear_counters("ps_", 86400)
+
running_on = set([]) # collect information about nodes the processes run on
for line in info:
node_name = line[0]
@@ -366,8 +371,21 @@ def check_procs(item, params, info, with_perfdata):
resident_size += int(addinfo[2]) # kB
if len(addinfo) >= 10: # even more data: processId,
pagefile_usage, usermodetime, kernelmodetime, threadCount, openHandles
pid, pagefile_usage, user_c, kernel_c, handle_c = map(int,
addinfo[4:9])
- timedif, user_per_sec = get_counter("ps_wmic.user.%d"
% pid, now, user_c)
- timedif, kernel_per_sec =
get_counter("ps_wmic.kernel.%d" % pid, now, kernel_c)
+ counter_wrapped = False
+ try:
+ timedif, user_per_sec =
get_counter("ps_wmic.user.%d" % pid, now, user_c)
+ except MKCounterWrapped, e:
+ counter_wrapped = True
+
+ try:
+ timedif, kernel_per_sec =
get_counter("ps_wmic.kernel.%d" % pid, now, kernel_c)
+ except MKCounterWrapped, e:
+ counter_wrapped = True
+
+ if counter_wrapped:
+ user_per_sec = 0
+ kernel_per_sec = 0
+
user_perc = user_per_sec / 100000.0 / cpu_cores
kernel_perc = kernel_per_sec / 100000.0 / cpu_cores
percent_cpu += user_perc + kernel_perc
@@ -383,7 +401,11 @@ def check_procs(item, params, info, with_perfdata):
pid = addinfo[4]
hours, minutes, seconds = map(int,
addinfo[3].split(":"))
total_seconds = 86400 * days + 3600 * hours + 60 * minutes +
seconds
- timedif, cputime = get_counter("ps_stat.pcpu.%s" %
pid, now, total_seconds)
+ try:
+ timedif, cputime =
get_counter("ps_stat.pcpu.%s" % pid, now, total_seconds)
+ except MKCounterWrapped, e:
+ cputime = 0
+
pcpu = cputime * 100
else:
pcpu = savefloat(addinfo[3])
@@ -471,7 +493,7 @@ def check_procs(item, params, info, with_perfdata):
return state, infotext, perfdata
check_info['ps'] = {
- "check_function" : lambda i,p,n: check_procs(i,p,n,False),
+ "check_function" : lambda i,p,n: check_ps(i,p,n,False),
"inventory_function" : inventory_ps,
"service_description" : "proc_%s",
"has_perfdata" : False,
@@ -480,7 +502,7 @@ check_info['ps'] = {
}
check_info['ps.perf'] = {
- "check_function" : lambda i,p,n: check_procs(i,p,n,True),
+ "check_function" : lambda i,p,n: check_ps(i,p,n,True),
"inventory_function" : inventory_ps_perf,
"service_description" : "proc_%s",
"has_perfdata" : True,
diff --git a/modules/check_mk_base.py b/modules/check_mk_base.py
index bfea8f7..9237853 100644
--- a/modules/check_mk_base.py
+++ b/modules/check_mk_base.py
@@ -836,6 +836,21 @@ def load_counters(hostname):
except:
g_counters = {}
+
+# Deletes counters from g_counters whose name starts with the given pattern and which are
older than older_than seconds
+def clear_counters(pattern, older_than):
+ global g_counters
+ counters_to_delete = []
+ now = time.time()
+
+ for name, (timestamp, value) in g_counters.items():
+ if name.startswith(pattern):
+ if now > timestamp + older_than:
+ counters_to_delete.append(name)
+
+ for name in counters_to_delete:
+ del g_counters[name]
+
def get_counter(countername, this_time, this_val, allow_negative=False):
global g_counters