Module: check_mk
Branch: master
Commit: dcef25d470b347a793b98997cdf634f1e16b2cee
URL:
http://git.mathias-kettner.de/git/?p=check_mk.git;a=commit;h=dcef25d470b347…
Author: Mathias Kettner <mk(a)mathias-kettner.de>
Date: Thu Dec 19 12:53:46 2013 +0100
FIX Fix CMC not executing any Check_MK checks after config reload
CMC uses Check_MK in the <tt>--keepalive</tt> mode. That mode mixed up
its file descriptors after a configuration reload. That resulted in the
output of the checks not arriving at the CMC. The CMC waited and considered
the helper as busy. All Check_MK based checks got stale. A restart fixed
that.
That problem is now solved.
---
.werks/314 | 16 ++++++++++++++++
ChangeLog | 1 +
modules/check_mk.py | 43 +++++++++++++++++++++++++++++++++----------
modules/check_mk_base.py | 1 +
4 files changed, 51 insertions(+), 10 deletions(-)
diff --git a/.werks/314 b/.werks/314
new file mode 100644
index 0000000..c385528
--- /dev/null
+++ b/.werks/314
@@ -0,0 +1,16 @@
+Title: Fix CMC not executing any Check_MK checks after config reload
+Level: 2
+Component: core
+Class: fix
+State: unknown
+Version: 1.2.5i1
+Date: 1387453914
+Targetversion: future
+
+CMC uses Check_MK in the <tt>--keepalive</tt> mode. That mode mixed up
+its file descriptors after a configuration reload. That resulted in the
+output of the checks not arriving at the CMC. The CMC waited and considered
+the helper as busy. All Check_MK based checks got stale. A restart fixed
+that.
+
+That problem is now solved.
diff --git a/ChangeLog b/ChangeLog
index 2217bbe..ab48683 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -6,6 +6,7 @@
* 0379 FIX: check_mk -c: Now also rewrites the location of conf.d directory
* 0354 FIX: Catch exception when check plugins do not return a state...
* 0398 FIX: Tolerate debug output in check plugins when using CMC...
+ * 0314 FIX: Fix CMC not executing any Check_MK checks after config reload...
Checks & Agents:
* 0306 esx_vsphere_counters: added missing ramdisk levels sfcbtickets
diff --git a/modules/check_mk.py b/modules/check_mk.py
index 1b723b5..97e4228 100755
--- a/modules/check_mk.py
+++ b/modules/check_mk.py
@@ -5117,10 +5117,14 @@ def do_check_keepalive():
# 1. move the filedescriptor 1 to a parking position
# 2. re-open 0 on /dev/null
# 3. Send our answers to the Micro Core with the parked FD.
- cmc_result_fd = os.dup(1)
- devnull = os.open("/tmp/dev_null", os.O_WRONLY | os.O_CREAT)
- os.dup2(devnull, 1)
- os.close(devnull)
+ # BEWARE: this must not happen after we have execve'd ourselves!
+ if opt_keepalive_fd:
+ keepalive_fd = opt_keepalive_fd
+ else:
+ keepalive_fd = os.dup(1)
+ devnull = os.open("/tmp/dev_null", os.O_WRONLY | os.O_CREAT)
+ os.dup2(devnull, 1)
+ os.close(devnull)
global total_check_output
total_check_output = ""
@@ -5131,17 +5135,17 @@ def do_check_keepalive():
while True:
cleanup_globals()
- hostname = sys.stdin.readline()
+ hostname = keepalive_read_line()
g_initial_times = os.times()
if not hostname:
break
hostname = hostname.strip()
if hostname == "*":
- os.execvp("cmk", sys.argv)
+ os.execvp("cmk", sys.argv + [ "--keepalive-fd=%d" %
keepalive_fd ])
elif not hostname:
break
- timeout = int(sys.stdin.readline())
+ timeout = int(keepalive_read_line())
try: # catch non-timeout exceptions
try: # catch timeouts
signal.signal(signal.SIGALRM, check_timeout)
@@ -5168,7 +5172,7 @@ def do_check_keepalive():
status = 3
total_check_output = "UNKNOWN - Check_MK timed out after %d
seconds\n" % timeout
- os.write(cmc_result_fd, "%03d\n%08d\n%s" %
+ os.write(keepalive_fd, "%03d\n%08d\n%s" %
(status, len(total_check_output), total_check_output))
total_check_output = ""
cleanup_globals()
@@ -5188,10 +5192,27 @@ def do_check_keepalive():
if opt_debug:
raise
total_check_output = "UNKNOWN - %s\n" % e
- os.write(cmc_result_fd, "%03d\n%08d\n%s" %
+ os.write(keepalive_fd, "%03d\n%08d\n%s" %
(3, len(total_check_output), total_check_output))
+# Read just one line from stdin. But: make sure that
+# nothing more is read - not even into some internal
+# buffer of sys.stdin! We do this by reading every
+# single byte. I know that this is not performant,
+# but we just read hostnames - not much data.
+
+def keepalive_read_line():
+ line = ""
+ while True:
+ byte = os.read(0, 1)
+ if byte == '\n':
+ return line
+ elif not byte: # EOF
+ return ''
+ else:
+ line += byte
+
# +----------------------------------------------------------------------+
# | ____ _ __ _ |
@@ -5538,7 +5559,7 @@ if __name__ == "__main__":
"list-checks", "list-hosts",
"list-tag", "no-tcp", "cache",
"flush", "package", "localize",
"donate", "snmpwalk", "snmptranslate",
"usewalk", "scan-parents", "procs=",
"automation=", "notify",
- "snmpget=", "profile", "keepalive",
"create-rrd",
+ "snmpget=", "profile", "keepalive",
"keepalive-fd=", "create-rrd",
"no-cache", "update", "restart",
"reload", "dump", "fake-dns=",
"man", "nowiki", "config-check",
"backup=", "restore=",
"check-inventory=", "paths",
"cleanup-autochecks", "checks=",
@@ -5587,6 +5608,8 @@ if __name__ == "__main__":
fake_dns = a
elif o == '--keepalive':
opt_keepalive = True
+ elif o == '--keepalive-fd':
+ opt_keepalive_fd = int(a)
elif o == '--usewalk':
opt_use_snmp_walk = True
elif o == '--procs':
diff --git a/modules/check_mk_base.py b/modules/check_mk_base.py
index bab5a00..7d82b7d 100644
--- a/modules/check_mk_base.py
+++ b/modules/check_mk_base.py
@@ -105,6 +105,7 @@ opt_cleanup_autochecks = False
fake_dns = False
opt_keepalive = False
opt_cmc_relfilename = "config"
+opt_keepalive_fd = None
# register SIGINT handler for consistenct CTRL+C handling
def interrupt_handler(signum, frame):