Module: check_mk
Branch: master
Commit: f6481f42db0382ce6e7180a56a6bddb5e1508cd5
URL:
http://git.mathias-kettner.de/git/?p=check_mk.git;a=commit;h=f6481f42db0382…
Author: Mathias Kettner <mk(a)mathias-kettner.de>
Date: Thu Jul 10 11:43:39 2014 +0200
FIX Gracefully restart check_mk helpers in case of memory leak
When using the Check_MK Micro Core in combination with inline SNMP then
in certain rare situations the check_mk helper processes leak memory. We
suspect the leak to be in the SNMP libs but this is just an assumption.
This fix now watches the size of each check_mk helper. If the memory usage
(VM size) of the process grows by more than 50% compared to the size after
the first 20 hosts have been checked, then the helper silently restarts itself
and leaves a message in <tt>var/log/check_mk/cmc-helper.log</tt>.
---
.werks/1045 | 16 ++++++++++++++++
ChangeLog | 1 +
modules/check_mk.py | 48 +++++++++++++++++++++++++++++++++++++++++++-----
3 files changed, 60 insertions(+), 5 deletions(-)
diff --git a/.werks/1045 b/.werks/1045
new file mode 100644
index 0000000..3cd3696
--- /dev/null
+++ b/.werks/1045
@@ -0,0 +1,16 @@
+Title: Gracefully restart check_mk helpers in case of memory leak
+Level: 2
+Component: core
+Version: 1.2.5i5
+Date: 1404985261
+Class: fix
+
+When using the Check_MK Micro Core in combination with inline SNMP then
+in certain rare situations the check_mk helper processes leak memory. We
+suspect the leak to be in the SNMP libs but this is just an assumption.
+
+This fix now watches the size of each check_mk helper. If the memory usage
+(VM size) of the process grows by more than 50% compared to the size after
+the first 20 hosts have been checked, then the helper silently restarts itself
+and leaves a message in <tt>var/log/check_mk/cmc-helper.log</tt>.
+
diff --git a/ChangeLog b/ChangeLog
index 7268491..86393e6 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -6,6 +6,7 @@
* 1035 FIX: Do not fail on errors in *.mk files anymore - except in interactive
mode...
* 0174 FIX: Fixed appending of --keepalive-fd parameters to checkhelpers...
* 1053 FIX: Fixed events check always being reporting OK state...
+ * 1045 FIX: Gracefully restart check_mk helpers in case of memory leak...
Checks & Agents:
* 0168 f5_bigip_pool: Added Wato configuration...
diff --git a/modules/check_mk.py b/modules/check_mk.py
index 8b35f86..acca7d3 100755
--- a/modules/check_mk.py
+++ b/modules/check_mk.py
@@ -5416,12 +5416,46 @@ def copy_globals():
if varname not in [ "g_service_description",
"g_multihost_checks",
"g_check_table_cache",
"g_singlehost_checks",
"total_check_outout", "g_nodesof_cache",
- "g_initial_times" ] \
+ "g_initial_times",
"g_keepalive_initial_memusage",
+ "g_dns_cache", "g_ip_lookup_cache" ] \
and type(value).__name__ not in [ "function", "module",
"SRE_Pattern" ]:
global_saved[varname] = copy.copy(value)
return global_saved
+# Determine the current (VmSize, VmRSS) of this process in bytes
+def current_memory_usage():
+ parts = file('/proc/self/stat').read().split()
+ vsize = int(parts[22]) # in Bytes
+ rss = int(parts[23]) * 4096 # stat field is in pages; assumes 4096-byte pages
+ return (vsize, rss)
+
+keepalive_memcheck_cycle = 20
+g_keepalive_initial_memusage = None
+def keepalive_check_memory(num_checks, keepalive_fd):
+ if num_checks % keepalive_memcheck_cycle != 0: # Only do this after every keepalive_memcheck_cycle checks
+ return
+
+ global g_keepalive_initial_memusage
+ if not g_keepalive_initial_memusage:
+ g_keepalive_initial_memusage = current_memory_usage()
+ else:
+ usage = current_memory_usage()
+ # Allow VM size to grow by at most 50%
+ if usage[0] > 1.5 * g_keepalive_initial_memusage[0]:
+ file(log_dir + "/cmc-helper.log", "a") \
+ .write("%s [4] check helper[%d]: memory usage increased from %s to
%s after %d check cycles. Restarting.\n" %
+ (time.strftime("%F %T", time.localtime()), os.getpid(),
+ get_bytes_human_readable(g_keepalive_initial_memusage[0]),
+ get_bytes_human_readable(usage[0]), num_checks))
+ restart_myself(keepalive_fd)
+
+
+def restart_myself(keepalive_fd):
+ sys.argv = [ x for x in sys.argv if not x.startswith('--keepalive-fd=') ]
+ os.execvp("cmk", sys.argv + [ "--keepalive-fd=%d" % keepalive_fd
])
+
+
def do_check_keepalive():
global g_initial_times
@@ -5445,6 +5479,8 @@ def do_check_keepalive():
os.dup2(devnull, 1)
os.close(devnull)
+ num_checks = 0 # count total number of check cycles
+
global total_check_output
total_check_output = ""
if opt_debug:
@@ -5456,15 +5492,15 @@ def do_check_keepalive():
cleanup_globals()
hostname = keepalive_read_line()
g_initial_times = os.times()
- if not hostname:
- break
+
hostname = hostname.strip()
if hostname == "*":
- sys.argv = [ x for x in sys.argv if not
x.startswith('--keepalive-fd=') ]
- os.execvp("cmk", sys.argv + [ "--keepalive-fd=%d" %
keepalive_fd ])
+ restart_myself(keepalive_fd)
elif not hostname:
break
+ num_checks += 1
+
timeout = int(keepalive_read_line())
try: # catch non-timeout exceptions
try: # catch timeouts
@@ -5517,6 +5553,8 @@ def do_check_keepalive():
os.write(keepalive_fd, "%03d\n%08d\n%s" %
(3, len(total_check_output), total_check_output))
+ keepalive_check_memory(num_checks, keepalive_fd)
+
# Just one lines from stdin. But: make sure that
# nothing more is read - not even into some internal