Module: check_mk
Branch: master
Commit: 0664973c5d123f7ff14b68d6c2e9c697523f7771
URL:
http://git.mathias-kettner.de/git/?p=check_mk.git;a=commit;h=0664973c5d123f…
Author: Lars Michelsen <lm(a)mathias-kettner.de>
Date: Mon Jan 11 09:27:19 2016 +0100
#2899 FIX ps: Finally improved performance impact of perfdata when having a lot of
processes
On systems with a large number of processes (several thousand, for example), monitoring
them led to a lot of performance counters being created in Check_MK. The management of
these counters was not optimal, because counters that were no longer updated were only
deleted based on time. So systems with a lot of process creations were handling many
unneeded counters during every check interval. This affected CPU and memory usage while
checking such systems.
As we get the full process table, we have now changed the logic to automatically delete
the counters of all processes that no longer exist. This reduces the number of counters
significantly.
---
.werks/2899 | 17 +++++++++++
ChangeLog | 1 +
checks/ps | 73 ++++++++++++++++++++++++++++++++++++++++++++--
checks/ps.include | 2 +-
modules/check_mk_base.py | 44 +---------------------------
5 files changed, 91 insertions(+), 46 deletions(-)
diff --git a/.werks/2899 b/.werks/2899
new file mode 100644
index 0000000..9b73c9d
--- /dev/null
+++ b/.werks/2899
@@ -0,0 +1,17 @@
+Title: ps: Finally improved performance impact of perfdata when having a lot of
processes
+Level: 1
+Component: checks
+Compatible: compat
+Version: 1.2.7i4
+Date: 1452500538
+Class: fix
+
+On systems with a large number of processes (several thousand, for example), monitoring
+them led to a lot of performance counters being created in Check_MK. The management of
+these counters was not optimal, because counters that were no longer updated were only
+deleted based on time. So systems with a lot of process creations were handling many
+unneeded counters during every check interval. This affected CPU and memory usage while checking such systems.
+
+As we get the full process table, we have now changed the logic to automatically delete
+the counters of all processes that no longer exist. This reduces the number of
+counters significantly.
diff --git a/ChangeLog b/ChangeLog
index 9329466..a1a325a 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -176,6 +176,7 @@
* 1320 FIX: fjdarye60_sum: Fixed bug in discovery function
* 2898 FIX: smart: Using normalized value for determining reallocated events
state...
* 2888 FIX: citrix_controller.licensing: ignoring double data from piggy backs
+ * 2899 FIX: ps: Finally improved performance impact of perfdata when having a lot of
processes...
Multisite:
* 2684 Added icons for downloading agent data / walks of hosts...
diff --git a/checks/ps b/checks/ps
index 78da52d..0721f78 100644
--- a/checks/ps
+++ b/checks/ps
@@ -78,8 +78,77 @@ ANY_USER = None
GRAB_USER = False
-def ps_cleanup_counters(lines):
- pass
+# Removes the counters of vanished processes from the item state: the pids
+# of the current process table are taken from "parsed" and every counter
+# reported by ps_get_counters_to_delete() for these pids is deleted.
+#
+# FIXME: Direct access to g_item_state is not allowed in checks normally,
+# but there is no such API to access and modify it like it is needed here.
+def ps_cleanup_counters(parsed):
+    # Drop the legacy key used for some kind of caching. A missing key is fine.
+    g_item_state.pop("last.cleared.ps_", None)
+
+    current_pids = ps_get_current_pids(parsed)
+    stale_idents = ps_get_counters_to_delete(current_pids)
+    for stale_ident in stale_idents:
+        del g_item_state[stale_ident]
+
+
+# Get the idents of the counters which can be deleted because the process id of
+# the counter is not found anymore in the process table.
+#
+# Handle these formats of idents:
+# Old string based keys: 'ps_stat.pcpu.669': (1448634267.875281, 1),
+# New magic keys: ('ps', None, 'ps_wmic.kernel.692'): (1448633487.573496, 1092007),
+#
+# pids: The process ids of the current process table. They are compared with
+#       the last dot separated component of the counter name.
+#
+# Returns a list of keys of g_item_state. Nothing is deleted here, so the
+# dictionary is not modified while it is being iterated.
+def ps_get_counters_to_delete(pids):
+    # One membership test is made per counter, so build a set only once
+    current_pids = set(pids)
+
+    counters_to_delete = []
+    for ident in g_item_state:
+        if isinstance(ident, tuple):
+            # Tuple keys of other checks are none of our business
+            if ident[0] != "ps":
+                continue
+            check_ident = ident[2]
+        elif ident.startswith(("ps_stat", "ps_wmic")):
+            # Old string based keys are the counter name itself
+            check_ident = ident
+        else:
+            continue
+
+        pid = check_ident.split(".")[-1]
+        if pid not in current_pids:
+            counters_to_delete.append(ident)
+    return counters_to_delete
+
+
+# Collects the process ids of the current process table. The process info
+# is the second element of each line of "parsed". Only process infos that
+# ps_has_extended_perfdata() accepts contribute; their fifth field is
+# taken as the pid.
+def ps_get_current_pids(parsed):
+    process_infos = ( line[1] for line in parsed )
+    return [ info[4] for info in process_infos
+             if ps_has_extended_perfdata(info) ]
+
+
+# Removes all counters with a given name prefix that have not been updated
+# during the last min_keep_seconds seconds.
+#
+# counter_name_prefix: Only counters whose name starts with this prefix are
+#                      handled. For tuple keys the first element of the
+#                      tuple is used as counter name.
+# min_keep_seconds:    Counters with a timestamp younger than this number
+#                      of seconds are kept.
+#
+# Counters whose state is not a tuple carry no timestamp and are skipped.
+def clear_counters(counter_name_prefix, min_keep_seconds):
+    # Local import: this function must not rely on what the file including
+    # this check has imported
+    import time
+
+    # Everything last updated before this point in time is removed
+    remove_if_min_keep_seconds = time.time() - min_keep_seconds
+
+    counters_to_delete = []
+    for name, state in g_item_state.items():
+        # unable to cleanup values without timestamp info, skip
+        if not isinstance(state, tuple) or not state:
+            continue
+
+        if isinstance(name, tuple):
+            counter_name = name[0] # never needed, since only called by ps currently
+        else:
+            counter_name = name
+
+        if not counter_name.startswith(counter_name_prefix):
+            continue
+
+        timestamp = state[0]
+        if timestamp < remove_if_min_keep_seconds:
+            counters_to_delete.append(name)
+
+    # Delete after the loop, the dictionary must not change during iteration
+    for name in counters_to_delete:
+        del g_item_state[name]
# FIXME: Refactor this function for better readability
diff --git a/checks/ps.include b/checks/ps.include
index 86b3bf6..5577347 100644
--- a/checks/ps.include
+++ b/checks/ps.include
@@ -455,7 +455,7 @@ def check_ps_common(item, params, parsed, cpu_cores = 1, info_name =
"processes"
# correct number of DSes. To just look at the first process
# in the agent output to make sure. We assume that at least
# one process is always present.
- extended_perfdata = ps_has_extended_perfdata(parsed[0])
+ extended_perfdata = ps_has_extended_perfdata(parsed[0][1])
if extended_perfdata:
perfdata += [ ("vsz", virtual_size),
diff --git a/modules/check_mk_base.py b/modules/check_mk_base.py
index a83cccb..25a4897 100644
--- a/modules/check_mk_base.py
+++ b/modules/check_mk_base.py
@@ -987,14 +987,7 @@ def load_item_state(hostname):
try:
g_item_state = eval(file(filename).read())
except:
- # Try old syntax
- try:
- lines = file(filename).readlines()
- for line in lines:
- line = line.split()
- g_item_state[' '.join(line[0:-2])] = ( int(line[-2]),
int(line[-1]) )
- except:
- g_item_state = {}
+ g_item_state = {}
def save_item_state(hostname):
@@ -1096,41 +1089,6 @@ def last_counter_wrap():
return g_last_counter_wrap
-
-# Makes sure, that no counter with a give prefix is kept longer
-# than min_keep_seconds * 2. Counter is kept at least min_keep_seconds.
-def clear_counters(counter_name_prefix, min_keep_seconds):
- global g_item_state
-
- cleared_key = "last.cleared." + counter_name_prefix
- if cleared_key in g_item_state:
- last_cleared, none = g_item_state[cleared_key]
- if last_cleared + min_keep_seconds > time.time():
- return # recent enough
- g_item_state[cleared_key] = (time.time(), None)
-
- counters_to_delete = []
- remove_if_min_keep_seconds = time.time() - min_keep_seconds
-
- for name, state in g_item_state.iteritems():
- if type(name) == tuple:
- counter_name = name[0] # never needed, since only called by ps currently
- else:
- counter_name = name
-
- if type(state) == tuple:
- timestamp, value = state
- else:
- continue # unable to cleanup values without timestamp info, skip
-
- if counter_name.startswith(counter_name_prefix):
- if timestamp < remove_if_min_keep_seconds:
- counters_to_delete.append(name)
-
- for name in counters_to_delete:
- del g_item_state[name]
-
-
# Compute average by gliding exponential algorithm
# itemname : unique ID for storing this average until the next check
# this_time : timestamp of new value