Module: check_mk
Branch: master
Commit: 350c84b2d7a1f05cea52eac63b1207087b233e83
URL:
http://git.mathias-kettner.de/git/?p=check_mk.git;a=commit;h=350c84b2d7a1f0…
Author: Mathias Kettner <mk(a)mathias-kettner.de>
Date: Fri Apr 4 13:36:30 2014 +0200
FIX FIX: compute correct state transitions for notifications
This fixes a problem with the combination of
<ul>
<li>Rule based notifications</li>
<li>Maximum check attempts > 1</li>
<li>Rule conditions based on the <i>original</i> state of a host or
service</li>
</ul>
When a service got CRIT while having more than one check attempt, the previous
(soft) state would be displayed as CRIT, while it is OK in fact. This has
been fixed natively when using the Check_MK Micro Core. When using Nagios, the
previous hard state is not always known during the notification. When in doubt,
a notification is sent out rather than omitted.
---
.werks/752 | 20 ++++++++++++++++++++
ChangeLog | 3 +++
check_mk_templates.cfg | 2 ++
modules/notify.py | 49 +++++++++++++++++++++++++++++++++++++++++++-----
4 files changed, 69 insertions(+), 5 deletions(-)
diff --git a/.werks/752 b/.werks/752
new file mode 100644
index 0000000..619cda9
--- /dev/null
+++ b/.werks/752
@@ -0,0 +1,20 @@
+Title: FIX: compute correct state transitions for notifications
+Level: 2
+Component: notifications
+Class: fix
+State: unknown
+Version: 1.2.5i3
+Date: 1396610543
+
+This fixes a problem with the combination of
+<ul>
+<li>Rule based notifications</li>
+<li>Maximum check attempts > 1</li>
+<li>Rule conditions based on the <i>original</i> state of a host or service</li>
+</ul>
+
+When a service got CRIT while having more than one check attempt, the previous
+(soft) state would be displayed as CRIT, while it is OK in fact. This has
+been fixed natively when using the Check_MK Micro Core. When using Nagios, the
+previous hard state is not always known during the notification. When in doubt,
+a notification is sent out rather than omitted.
diff --git a/ChangeLog b/ChangeLog
index b7c5b87..ad0bdd2 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -16,6 +16,9 @@
WATO:
* 0781 FIX: host diag page: fixed problem with update of diagnose subwindows...
+ Notifications:
+ * 0752 FIX: FIX: compute correct state transitions for notifications...
+
Livestatus:
* 0747 FIX: livestatus table hostsbygroup: fixed bug with group_authorization
strict...
diff --git a/check_mk_templates.cfg b/check_mk_templates.cfg
index 1f295b1..406fc57 100644
--- a/check_mk_templates.cfg
+++ b/check_mk_templates.cfg
@@ -364,6 +364,7 @@ define command {
NOTIFY_HOSTNAME='$HOSTNAME$' \
NOTIFY_HOSTALIAS='$HOSTALIAS$' \
NOTIFY_HOSTADDRESS='$HOSTADDRESS$' \
+ NOTIFY_HOSTATTEMPT='$HOSTATTEMPT$' \
NOTIFY_LASTHOSTSTATE='$LASTHOSTSTATE$' \
NOTIFY_LASTHOSTSTATEID='$LASTHOSTSTATEID$' \
NOTIFY_LASTHOSTSTATECHANGE='$LASTHOSTSTATECHANGE$' \
@@ -379,6 +380,7 @@ define command {
NOTIFY_LASTSERVICESTATEID='$LASTSERVICESTATEID$' \
NOTIFY_LASTSERVICESTATECHANGE='$LASTSERVICESTATECHANGE$' \
NOTIFY_LASTSERVICEOK='$LASTSERVICEOK$' \
+ NOTIFY_SERVICEATTEMPT='$SERVICEATTEMPT$' \
NOTIFY_SERVICESTATE='$SERVICESTATE$' \
NOTIFY_SERVICESTATEID='$SERVICESTATEID$' \
NOTIFY_SERVICEOUTPUT='$SERVICEOUTPUT$' \
diff --git a/modules/notify.py b/modules/notify.py
index 6adc3ef..c638286 100644
--- a/modules/notify.py
+++ b/modules/notify.py
@@ -744,7 +744,7 @@ def rbn_match_host_event(rule, context):
return # Let this be handled by match_service_event
allowed_events = rule["match_host_event"]
state = context["HOSTSTATE"]
- last_state = context["LASTHOSTSTATE"]
+ last_state = context["PREVIOUSHOSTHARDSTATE"]
events = { "UP" : 'r', "DOWN" : 'd',
"UNREACHABLE" : 'u' }
return rbn_match_event(context, state, last_state, events, allowed_events)
@@ -758,7 +758,7 @@ def rbn_match_service_event(rule, context):
return # Let this be handled by match_host_event
allowed_events = rule["match_service_event"]
state = context["SERVICESTATE"]
- last_state = context["LASTSERVICESTATE"]
+ last_state = context["PREVIOUSSERVICEHARDSTATE"]
events = { "OK" : 'r', "WARNING" :
'w', "CRITICAL" : 'c', "UNKNOWN" : 'u' }
return rbn_match_event(context, state, last_state, events, allowed_events)
@@ -777,9 +777,16 @@ def rbn_match_event(context, state, last_state, events,
allowed_events):
else:
event = events.get(last_state, '?') + events.get(state, '?')
- if event not in allowed_events:
- return "Event type '%s' not handled by this rule. Allowed are:
%s" % (
- event, ", ".join(allowed_events))
+ notify_log("Event type is %s" % event)
+
+ # Now go through the allowed events. Handle '?' as matching all types!
+ for allowed in allowed_events:
+ if event == allowed or \
+ event[0] == '?' and event[1] == allowed[1]:
+ return
+
+ return "Event type '%s' not handled by this rule. Allowed are: %s"
% (
+ event, ", ".join(allowed_events))
def rbn_rule_contacts(rule, context):
@@ -1495,6 +1502,38 @@ def complete_raw_context(raw_context):
if not contact or contact == "check-mk-notify":
add_rulebased_macros(raw_context)
+
+ # Add the previous hard state. This is necessary for notification rules that depend on certain transitions,
+ # like OK -> WARN (but not CRIT -> WARN). The CMC sends PREVIOUSHOSTHARDSTATE and PREVIOUSSERVICEHARDSTATE.
+ # Nagios does not have this information, so we try to deduce it.
+ if "PREVIOUSHOSTHARDSTATE" not in raw_context:
+ prev_state = raw_context["LASTHOSTSTATE"]
+ # When the attempts are > 1 then the last state could be identical with
+ # the current one, e.g. both critical. In that case we assume the
+ # previous hard state to be OK.
+ if prev_state == raw_context["HOSTSTATE"]:
+ prev_state = "UP"
+ elif "HOSTATTEMPT" not in raw_context or \
+ ("HOSTATTEMPT" in raw_context and
raw_context["HOSTATTEMPT"] != "1"):
+ # Here we do not know. The transition might be OK -> WARN -> CRIT and
+ # the initial OK is completely lost. We use the artificial state "?"
+ # here, which matches all states and makes sure that when in doubt a
+ # notification is being sent out.
+ prev_state = "?"
+ notify_log("Previous host hard state not known. Allowing all
states.")
+ raw_context["PREVIOUSHOSTHARDSTATE"] = prev_state
+
+ # Same for services
+ if raw_context["WHAT"] == "SERVICE" and
"PREVIOUSSERVICEHARDSTATE" not in raw_context:
+ prev_state = raw_context["LASTSERVICESTATE"]
+ if prev_state == raw_context["SERVICESTATE"]:
+ prev_state = "OK"
+ elif "SERVICEATTEMPT" not in raw_context or \
+ ("SERVICEATTEMPT" in raw_context and
raw_context["SERVICEATTEMPT"] != "1"):
+ prev_state = "?"
+ notify_log("Previous service hard state not known. Allowing all
states.")
+ raw_context["PREVIOUSSERVICEHARDSTATE"] = prev_state
+
convert_context_to_unicode(raw_context)