Module: check_mk
Branch: master
Commit: a78a1fd6fb0be5158e856157e08b71775d218512
URL:
http://git.mathias-kettner.de/git/?p=check_mk.git;a=commit;h=a78a1fd6fb0be5…
Author: Lars Michelsen <lm(a)mathias-kettner.de>
Date: Fri Oct 13 11:35:34 2017 +0200
5395 FIX Nagios: Enable passive host check translation by default
The monitoring cores Nagios and Microcore were treating "fake check results"
that can be sent from the GUI differently.
These fake check results are in fact passive host checks. From these passive host
checks Nagios simply takes over the reported state and uses it as new host state,
ignoring the parent logic. The microcore takes the parent logic into account and
may change a DOWN check result to UNREACHABLE when all parents are down.
We have now changed the configuration option
<tt>translate_passive_host_checks</tt>
of Nagios to be enabled by default to make Nagios also check this logic.
Change-Id: Ica73530773f18c2720088a99aa83f093535ffb5b
---
.werks/5395 | 19 +++
.../nagios/skel/etc/nagios/nagios.d/freshness.cfg | 2 +-
.../test_unreachable_notifications.py | 144 +++++++++++++++++++++
tests/testlib/__init__.py | 24 ++++
tests/web/test_webapi.py | 2 +-
5 files changed, 189 insertions(+), 2 deletions(-)
diff --git a/.werks/5395 b/.werks/5395
new file mode 100644
index 0000000..be92538
--- /dev/null
+++ b/.werks/5395
@@ -0,0 +1,19 @@
+Title: Nagios: Enable passive host check translation by default
+Level: 1
+Component: core
+Compatible: compat
+Edition: cre
+Version: 1.5.0i1
+Date: 1507886901
+Class: fix
+
+The monitoring cores Nagios and Microcore were treating "fake check results"
+that can be sent from the GUI differently.
+
+These fake check results are in fact passive host checks. From these passive host
+checks Nagios simply takes over the reported state and uses it as new host state,
+ignoring the parent logic. The microcore takes the parent logic into account and
+may change a DOWN check result to UNREACHABLE when all parents are down.
+
+We have now changed the configuration option <tt>translate_passive_host_checks</tt>
+of Nagios to be enabled by default to make Nagios also check this logic.
diff --git a/omd/packages/nagios/skel/etc/nagios/nagios.d/freshness.cfg b/omd/packages/nagios/skel/etc/nagios/nagios.d/freshness.cfg
index c5fa11a..3360260 100644
--- a/omd/packages/nagios/skel/etc/nagios/nagios.d/freshness.cfg
+++ b/omd/packages/nagios/skel/etc/nagios/nagios.d/freshness.cfg
@@ -12,7 +12,7 @@
# passively into the correct state from the view of this server.
# Values: 1 = perform translation, 0 = do not translate (default)
-translate_passive_host_checks=0
+translate_passive_host_checks=1
diff --git a/tests/notifications/test_unreachable_notifications.py b/tests/notifications/test_unreachable_notifications.py
new file mode 100644
index 0000000..67e62dc
--- /dev/null
+++ b/tests/notifications/test_unreachable_notifications.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python
+# encoding: utf-8
+
+import pytest
+import time
+from testlib import web
+
+@pytest.fixture(scope="module")
+def test_cfg(web, site):
+ try:
+ print "Applying test config"
+
+ web.add_host("notify-test-parent", attributes={
+ "ipaddress": "127.0.0.1",
+ })
+
+ web.add_host("notify-test-child", attributes={
+ "ipaddress": "127.0.0.1",
+ "parents": [ "notify-test-parent" ],
+ })
+
+ web.activate_changes()
+
+ site.live.command("[%d] DISABLE_HOST_CHECK;notify-test-parent" % time.time())
+ site.live.command("[%d] DISABLE_HOST_CHECK;notify-test-child" % time.time())
+
+ yield None
+ finally:
+ #
+ # Cleanup code
+ #
+ print "Cleaning up default config"
+
+ web.delete_host("notify-test-child")
+ web.delete_host("notify-test-parent")
+
+
+def set_initial_state(site):
+ # Before each test: Set to initial state: Both UP
+ site.send_host_check_result("notify-test-child", 0, "UP")
+ site.send_host_check_result("notify-test-parent", 0, "UP")
+
+ # Before each test: Clear logs
+ site.live.command("[%d] ROTATE_LOGFILE" % time.time())
+ time.sleep(1) # TODO: Add check for rotation
+
+
+def open_history_log(core):
+ if core == "cmc":
+ return open("var/check_mk/core/history")
+ elif core == "nagios":
+ return open("var/nagios/nagios.log")
+ else:
+ raise NotImplementedError()
+
+
+STATE_UP = 0
+STATE_DOWN = 1
+STATE_UNREACHABLE = 2
+
+@pytest.mark.parametrize(("core"), [ "nagios", "cmc" ])
+def test_unreachable_child_down_before_parent_down(test_cfg, site, core):
+ site.set_core(core)
+ set_initial_state(site)
+
+ # TODO:
+ # - Set child down, expect DOWN notification
+ site.send_host_check_result("notify-test-child", STATE_DOWN, "DOWN")
+
+ assert "HOST ALERT: notify-test-child;DOWN;HARD;1;DOWN" in open_history_log(core).read()
+
+ # - Set parent down, expect DOWN notification for parent and UNREACHABLE notification for child
+ site.send_host_check_result("notify-test-parent", STATE_DOWN, "DOWN")
+
+ # Difference between nagios/cmc: when sending DOWN via PROCESS_HOST_CHECK_RESULT
+ # the nagios core needs another child down check result to report it as unreachable.
+ if core == "nagios":
+ site.send_host_check_result("notify-test-child", STATE_DOWN, "DOWN", expected_state=STATE_UNREACHABLE)
+
+ history_log = open_history_log(core).read()
+
+ assert "HOST ALERT: notify-test-parent;DOWN;HARD;1;DOWN" in history_log
+ assert "HOST ALERT: notify-test-child;UNREACHABLE;HARD;1;" in history_log
+
+ if core == "cmc":
+ assert "HOST NOTIFICATION: check-mk-notify;notify-test-parent;DOWN;check-mk-notify;" in history_log
+ assert "HOST NOTIFICATION: check-mk-notify;notify-test-child;UNREACHABLE;check-mk-notify;" in history_log
+ else:
+ # TODO: Nagios does not log the entries checked above for cmc. This may be a problem e.g. for availability.
+ pass
+
+
+@pytest.mark.parametrize(("core"), [ "nagios", "cmc" ])
+def test_unreachable_child_after_parent_is_down(test_cfg, site, core):
+ site.set_core(core)
+ set_initial_state(site)
+
+
+ # TODO:
+ # - Set parent down, expect DOWN notification
+ site.send_host_check_result("notify-test-parent", STATE_DOWN, "DOWN")
+
+ assert "HOST ALERT: notify-test-parent;DOWN;HARD;1;DOWN" in open_history_log(core).read()
+
+ # - set child down, expect UNREACHABLE notification
+ assert site.get_host_state("notify-test-child") == STATE_UP
+ site.send_host_check_result("notify-test-child", STATE_DOWN, "DOWN")
+ assert site.get_host_state("notify-test-child") == STATE_UNREACHABLE
+
+ history_log = open_history_log(core).read()
+
+ assert "HOST ALERT: notify-test-child;DOWN;HARD;1;DOWN" in history_log
+
+ #if core == "cmc":
+ # assert "HOST ALERT: notify-test-child;UNREACHABLE;HARD;1;child becomes unreachable due to state change of parent host" in history_log
+
+ # assert "HOST NOTIFICATION: check-mk-notify;notify-test-parent;DOWN;check-mk-notify;" in history_log
+ # assert "HOST NOTIFICATION: check-mk-notify;notify-test-child;UNREACHABLE;check-mk-notify;" in history_log
+ #else:
+ # # TODO: Nagios does not log the entries checked above for cmc. This may be a problem e.g. for availability.
+ # pass
+
+ # TODO:
+ # - Set parent down, expect DOWN notification
+ # - Set child up, expect no notification
+
+ # TODO:
+ # - Set parent down, expect DOWN notification
+ # - set child down, expect UNREACHABLE notification
+ # - set child up, expect UP notification
+ # - set child down, expect UNREACHABLE notification
+
+ # TODO:
+ # - Set parent down, expect DOWN notification
+ # - set child down, expect UNREACHABLE notification
+ # - Set parent up, expect UP notification and child DOWN notification
+ # - Set parent down, expect DOWN notification and UNREACHABLE child notification
+
+
+#@pytest.mark.parametrize(("core"), [ "nagios", "cmc" ])
+#def test_unreachable_disabled_by_default(test_cfg, site, core):
+# # TODO: Set child down, set parent down
+# # TODO: Check log that no UNREACHABLE notification has been created
+# pass
diff --git a/tests/testlib/__init__.py b/tests/testlib/__init__.py
index 248dc6d..74a9886 100644
--- a/tests/testlib/__init__.py
+++ b/tests/testlib/__init__.py
@@ -239,6 +239,26 @@ class Site(object):
return live
+ def send_host_check_result(self, hostname, state, output, expected_state=None):
+ if expected_state is None:
+ expected_state = state
+
+ self.live.command("[%d] PROCESS_HOST_CHECK_RESULT;%s;%d;%s" % (time.time(), hostname, state, output))
+
+ # Wait for processed command
+ timeout = 10
+ host_state = self.get_host_state(hostname)
+ while timeout and not host_state == expected_state:
+ timeout -= 1
+ time.sleep(1)
+ host_state = self.get_host_state(hostname)
+ assert host_state == expected_state, "Expected %d state, got %d state" % (expected_state, host_state)
+
+
+ def get_host_state(self, hostname):
+ return self.live.query_value("GET hosts\nColumns: state\nFilter: host_name = %s" % hostname)
+
+
def _is_running_as_site_user(self):
return pwd.getpwuid(os.getuid()).pw_name == self.id
@@ -579,6 +599,10 @@ class Site(object):
print "Started site"
+ def set_core(self, core):
+ self.set_config("CORE", core, with_restart=True)
+
+
def get_config(self, key):
p = self.execute(["omd", "config", "show", key], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = p.communicate()
diff --git a/tests/web/test_webapi.py b/tests/web/test_webapi.py
index 9b5540d..b1fcc27 100644
--- a/tests/web/test_webapi.py
+++ b/tests/web/test_webapi.py
@@ -358,7 +358,7 @@ def test_get_graph(web, site):
web.activate_changes()
# Issue a reschedule
- site.live.command("SCHEDULE_FORCED_SERVICE_CHECK;test-host-get-graph;Check_MK;%d" % int(time.time()))
+ site.live.command("[%d] SCHEDULE_FORCED_SERVICE_CHECK;test-host-get-graph;Check_MK;%d" % (int(time.time()), int(time.time())))
# Wait for RRD file creation
# Isn't this a bug that the graph is not instantly available?