Module: check_mk
Branch: master
Commit: a97c2cb3822122059c362b4216658da828cf4a3d
URL:
http://git.mathias-kettner.de/git/?p=check_mk.git;a=commit;h=a97c2cb3822122…
Author: Mathias Kettner <mk(a)mathias-kettner.de>
Date: Thu Mar 17 12:09:35 2016 +0100
3271 New option in aggregation for tuning aggregation of downtimes
When computing the state 'in scheduled downtime' for an aggregate, all
leaf nodes that are within a downtime are first assumed to be CRIT and
all others OK. Each aggregated node is then considered to be in downtime
if its state is CRIT under this assumption. You can now lower this
threshold to WARN by checking the new option {{Escalate downtimes based
on aggregated WARN state}} in the properties of an aggregation. This
setting is especially relevant if you use aggregation functions of type
<i>count</i> and want the downtime information to be escalated even when
such a node would only go into WARN state.
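
In pseudo-code, the rule described above amounts to the following (a
minimal illustrative sketch, not the shipped implementation; all names
are made up):

    OK, WARN, CRIT = 0, 1, 2

    def aggregate_in_downtime(leaves, aggr_func, downtime_aggr_warn=False):
        # First pass: leaves in downtime are assumed CRIT, all others OK.
        assumed = [ CRIT if leaf["in_downtime"] else OK for leaf in leaves ]
        state = aggr_func(assumed)
        # The aggregate counts as 'in scheduled downtime' if the resulting
        # state reaches the threshold: CRIT by default, WARN with the new
        # option enabled.
        threshold = WARN if downtime_aggr_warn else CRIT
        return state >= threshold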
---
.werks/3271 | 17 +++++++++++++++++
ChangeLog | 1 +
web/htdocs/bi.py | 37 ++++++++++++++++++++++++++-----------
web/htdocs/config.py | 1 +
web/plugins/wato/bi.py | 32 +++++++++++++++++++++++++++-----
5 files changed, 72 insertions(+), 16 deletions(-)
diff --git a/.werks/3271 b/.werks/3271
new file mode 100644
index 0000000..671c87b
--- /dev/null
+++ b/.werks/3271
@@ -0,0 +1,17 @@
+Title: New option in aggregation for tuning aggregation of downtimes
+Level: 1
+Component: bi
+Compatible: compat
+Version: 1.2.9i1
+Date: 1458212879
+Class: feature
+
+When computing the state 'in scheduled downtime' for an aggregate, all
+leaf nodes that are within a downtime are first assumed to be CRIT and
+all others OK. Each aggregated node is then considered to be in downtime
+if its state is CRIT under this assumption. You can now lower this
+threshold to WARN by checking the new option {{Escalate downtimes based
+on aggregated WARN state}} in the properties of an aggregation. This
+setting is especially relevant if you use aggregation functions of type
+<i>count</i> and want the downtime information to be escalated even when
+such a node would only go into WARN state.
diff --git a/ChangeLog b/ChangeLog
index 8f745cd..191c8a7 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -147,6 +147,7 @@
* 3253 FIX: sms: notification script sms now handles single quotes in the message in the right way
BI:
+ * 3271 New option in aggregation for tuning aggregation of downtimes...
* 3247 FIX: Do not allow multiple BI aggregates with the same title...
* 3232 FIX: BI: Only querying site of host for specific host status info...
* 3291 FIX: Improved error handling when searching for BI aggregations with invalid regex
diff --git a/web/htdocs/bi.py b/web/htdocs/bi.py
index b4c5200..be7e25f 100644
--- a/web/htdocs/bi.py
+++ b/web/htdocs/bi.py
@@ -316,6 +316,12 @@ def compile_forest(user, only_hosts = None, only_groups = None):
if entry[0] == config.DISABLED:
continue
+ if entry[0] == config.DT_AGGR_WARN:
+ downtime_aggr_warn = True
+ entry = entry[1:]
+ else:
+ downtime_aggr_warn = False
+
if entry[0] == config.HARD_STATES:
use_hard_states = True
entry = entry[1:]
@@ -345,6 +351,7 @@ def compile_forest(user, only_hosts = None, only_groups = None):
for this_entry in new_entries:
remove_empty_nodes(this_entry)
this_entry["use_hard_states"] = use_hard_states
+ this_entry["downtime_aggr_warn"] = downtime_aggr_warn
new_entries = [ e for e in new_entries if len(e["nodes"]) > 0 ]
@@ -1018,20 +1025,25 @@ service_nomatch_cache = set([])
# Execution of the trees. Returns a tree object reflecting
# the states of all nodes
def execute_tree(tree, status_info = None):
- use_hard_states = tree["use_hard_states"]
+ aggregation_options = {
+ "use_hard_states" : tree["use_hard_states"],
+ "downtime_aggr_warn" : tree["downtime_aggr_warn"],
+ }
+
if status_info == None:
required_hosts = tree["reqhosts"]
status_info = get_status_info(required_hosts)
- return execute_node(tree, status_info, use_hard_states)
+ return execute_node(tree, status_info, aggregation_options)
+
-def execute_node(node, status_info, use_hard_states):
+def execute_node(node, status_info, aggregation_options):
if node["type"] == NT_LEAF:
- return execute_leaf_node(node, status_info, use_hard_states)
+ return execute_leaf_node(node, status_info, aggregation_options)
else:
- return execute_rule_node(node, status_info, use_hard_states)
+ return execute_rule_node(node, status_info, aggregation_options)
-def execute_leaf_node(node, status_info, use_hard_states):
+def execute_leaf_node(node, status_info, aggregation_options):
site, host = node["host"]
service = node.get("service")
@@ -1065,7 +1077,7 @@ def execute_leaf_node(node, status_info, use_hard_states):
if has_been_checked == 0:
output = _("This service has not been checked yet")
state = PENDING
- if use_hard_states:
+ if aggregation_options["use_hard_states"]:
st = hard_state
else:
st = state
@@ -1098,7 +1110,7 @@ def execute_leaf_node(node, status_info, use_hard_states):
}, None, node)
else:
- if use_hard_states:
+ if aggregation_options["use_hard_states"]:
st = host_hard_state
else:
st = host_state
@@ -1123,7 +1135,7 @@ def execute_leaf_node(node, status_info, use_hard_states):
return (state, assumed_state, node)
-def execute_rule_node(node, status_info, use_hard_states):
+def execute_rule_node(node, status_info, aggregation_options):
# get aggregation function
funcspec = node["func"]
parts = funcspec.split('!')
@@ -1144,7 +1156,7 @@ def execute_rule_node(node, status_info, use_hard_states):
ack_states = [] # Needed for computing the acknowledgement of non-OK nodes
one_assumption = False
for n in node["nodes"]:
- result = execute_node(n, status_info, use_hard_states) # state, assumed_state, node [, subtrees]
+ result = execute_node(n, status_info, aggregation_options) # state, assumed_state, node [, subtrees]
subtrees.append(result)
# Assume items in downtime as CRIT when computing downtime state
@@ -1174,7 +1186,10 @@ def execute_rule_node(node, status_info, use_hard_states):
downtime_state = func(*([downtime_states] + funcargs))
host_downtime_state = func(*([host_downtime_states] + funcargs))
- state["in_downtime"] = downtime_state["state"] >= 2
+ if aggregation_options["downtime_aggr_warn"]:
+ state["in_downtime"] = downtime_state["state"] >= 1
+ else:
+ state["in_downtime"] = downtime_state["state"] >= 2
# Compute acknowledgement state
if state["state"] > 0: # Non-OK state -> compute acknowledgement
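
The effect of the new threshold is easiest to see with a count-style
aggregation function. The following is a simplified stand-in (the real
aggregation functions in bi.py operate on full node tuples and take
additional arguments; only the "state" key matters here):

    OK, WARN, CRIT = 0, 1, 2

    def count_ok(states, needed_for_ok, needed_for_warn):
        # OK if enough children are OK, WARN if at least the lower
        # bound is met, CRIT otherwise.
        ok = len([ s for s in states if s == OK ])
        if ok >= needed_for_ok:
            return { "state": OK }
        elif ok >= needed_for_warn:
            return { "state": WARN }
        else:
            return { "state": CRIT }

    # Five leaves, one of them in downtime -> assumed [CRIT, OK, OK, OK, OK]
    downtime_state = count_ok([ CRIT, OK, OK, OK, OK ], 5, 3)  # {'state': WARN}
    downtime_state["state"] >= 2  # False: by default the aggregate is not in downtime
    downtime_state["state"] >= 1  # True:  with the new option it is marked as in downtime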
diff --git a/web/htdocs/config.py b/web/htdocs/config.py
index 5bc4557..19692ae 100644
--- a/web/htdocs/config.py
+++ b/web/htdocs/config.py
@@ -92,6 +92,7 @@ class FOREACH_SERVICE: pass
class REMAINING: pass
class DISABLED: pass
class HARD_STATES: pass
+class DT_AGGR_WARN: pass
# Has to be declared here once since the functions can be assigned in
# bi.py and also in multisite.mk. "Double" declarations are no problem
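
These classes are empty marker objects: an aggregation definition is a
tuple, and prepending one of these constants toggles the corresponding
behaviour when the forest is compiled. Schematically (the actual layout
of an aggregation tuple is simplified here, and the entry is
hypothetical):

    class HARD_STATES: pass
    class DT_AGGR_WARN: pass

    # Hypothetical aggregation entry with an optional leading marker:
    entry = ( DT_AGGR_WARN, "Hostgroup", "rule spec ..." )

    # compile_forest() strips markers from the front, as in the diff above:
    downtime_aggr_warn = False
    if entry[0] == DT_AGGR_WARN:
        downtime_aggr_warn = True
        entry = entry[1:]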
diff --git a/web/plugins/wato/bi.py b/web/plugins/wato/bi.py
index cd38c59..119126b 100644
--- a/web/plugins/wato/bi.py
+++ b/web/plugins/wato/bi.py
@@ -58,6 +58,7 @@ class ModeBI(WatoMode):
'REMAINING'    : 'REMAINING-f41e728b-0bce-40dc-82ea-51091d034fc3',
'DISABLED'     : 'DISABLED-f41e728b-0bce-40dc-82ea-51091d034fc3',
'HARD_STATES'  : 'HARD_STATES-f41e728b-0bce-40dc-82ea-51091d034fc3',
+ 'DT_AGGR_WARN' : 'DT_AGGR_WARN-f41e728b-0bce-40dc-82ea-51091d034fc3',
}
self._load_config()
@@ -206,6 +207,8 @@ class ModeBI(WatoMode):
convaggr = conv + node
if aggr["hard_states"]:
convaggr = (self._bi_constants["HARD_STATES"],) + convaggr
+ if aggr["downtime_aggr_warn"]:
+ convaggr = (self._bi_constants["DT_AGGR_WARN"],) + convaggr
if aggr["disabled"]:
convaggr = (self._bi_constants["DISABLED"],) + convaggr
return convaggr
@@ -255,6 +258,12 @@ class ModeBI(WatoMode):
else:
disabled = False
+ if aggr[0] == self._bi_constants["DT_AGGR_WARN"]:
+ downtime_aggr_warn = True
+ aggr = aggr[1:]
+ else:
+ downtime_aggr_warn = False
+
if aggr[0] == self._bi_constants["HARD_STATES"]:
hard_states = True
aggr = aggr[1:]
@@ -267,11 +276,12 @@ class ModeBI(WatoMode):
groups = aggr[0]
node = self._convert_node_from_bi(aggr[1:])
return {
- "disabled" : disabled,
- "hard_states" : hard_states,
- "groups" : groups,
- "node" : node,
- "single_host" : single_host,
+ "disabled" : disabled,
+ "hard_states" : hard_states,
+ "downtime_aggr_warn" : downtime_aggr_warn,
+ "groups" : groups,
+ "node" : node,
+ "single_host" : single_host,
}
# Make some conversions so that the format of the
@@ -630,6 +640,18 @@ class ModeBI(WatoMode):
"just since one check then it's soft state is CRIT,
but its hard state is still OK."),
)
),
+ ( "downtime_aggr_warn",
+ Checkbox(
+ title = _("Aggregation of Downtimes"),
+ label = _("Escalate downtimes based on aggregated WARN
state"),
+ help = _("When computing the state 'in scheduled downtime'
for an aggregate "
+ "first all leaf nodes that are within downtime are
assumed CRIT and all others "
+ "OK. Then each aggregated node is assumed to be in
downtime if the state "
+ "is CRIT under this assumption. You can change this to
WARN. The influence of "
+ "this setting is especially relevant if you use
aggregation functions of type <i>count</i> "
+ "and want the downtime information also escalated in case
such a node would go into "
+ "WARN state."),
+ )),
( "single_host",
Checkbox(
title = _("Optimization"),