Module: check_mk
Branch: master
Commit: a6910ea6c8368953c416d076d7fbeefb7adbea51
URL:
http://git.mathias-kettner.de/git/?p=check_mk.git;a=commit;h=a6910ea6c83689…
Author: Sebastian Herbord <sh(a)mathias-kettner.de>
Date: Tue Sep 22 10:24:37 2015 +0200
#2628 check_bi_aggr service now also goes into downtime if the monitored bi is in
downtime
The service is assigned a "real" downtime even if the downtime of the BI aggregate is
derived from the
aggregated services.
The downtime is also removed automatically when the bi leaves its own downtime.
Assignment and removal of the downtime happen only when the check is run, so there will be
a delay
between the BI aggregate going into downtime and the service following.
---
.werks/2628 | 13 +++++++
ChangeLog | 1 +
checks/check_bi_aggr | 3 ++
doc/treasures/active_checks/check_bi_aggr | 57 ++++++++++++++++++++++++++++-
web/plugins/wato/active_checks.py | 11 +++++-
5 files changed, 82 insertions(+), 3 deletions(-)
diff --git a/.werks/2628 b/.werks/2628
new file mode 100644
index 0000000..2d00868
--- /dev/null
+++ b/.werks/2628
@@ -0,0 +1,13 @@
+Title: check_bi_aggr service now also goes into downtime if the monitored bi is in
downtime
+Level: 1
+Component: checks
+Compatible: compat
+Version: 1.2.7i3
+Date: 1442910035
+Class: feature
+
+the service is assigned a "real" downtime even if the downtime of the bi is
derived from the
+aggregated services.
+The downtime is also removed automatically when the bi leaves its own downtime.
+Assignment and removal of the downtime happens only when the check is run so there will
be a delay
+between bi going into downtime and the service following.
diff --git a/ChangeLog b/ChangeLog
index 6309e8b..56a644b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -85,6 +85,7 @@
* 2592 new checks netextreme_fan, netextreme_psu, netextreme_psu_in,
netextreme_psu_out, netextreme_temp for Extreme Networks Switches...
* 2625 kernel.util and winperf_processor.util checks now support an optional graph
showing utilization of individual cores...
* 2626 ps check configurable to list state of individual processes in long output...
+ * 2628 check_bi_aggr service now also goes into downtime if the monitored bi is in
downtime...
* 2315 FIX: windows agent: BOM replacement, fixed incorrect byte offset...
* 2316 FIX: windows agent: fix garbled output of cached agent plugins...
* 2358 FIX: check_mk_agent.solaris: more correct computation of zfs used space...
diff --git a/checks/check_bi_aggr b/checks/check_bi_aggr
index 47ae5be..88ba907 100644
--- a/checks/check_bi_aggr
+++ b/checks/check_bi_aggr
@@ -45,6 +45,9 @@ def check_bi_aggr_arguments(params):
if opt_params.get("acknowledged"):
args += ' --acknowledged %s' % opt_params['acknowledged']
+ if opt_params.get("track_downtimes"):
+ args += ' -r -n $HOSTNAME$'
+
return args
diff --git a/doc/treasures/active_checks/check_bi_aggr
b/doc/treasures/active_checks/check_bi_aggr
index c67a84a..434bbba 100755
--- a/doc/treasures/active_checks/check_bi_aggr
+++ b/doc/treasures/active_checks/check_bi_aggr
@@ -25,6 +25,9 @@
# Boston, MA 02110-1301 USA.
import sys, getopt, urllib2, traceback
+import os
+import time
+
# tell urllib2 not to honour "http(s)_proxy" env variables
urllib2.getproxies = lambda: {}
@@ -32,7 +35,7 @@ urllib2.getproxies = lambda: {}
def usage():
sys.stderr.write("""
USAGE: check_bi_aggr -b <BASE_URL> -a <AGGR_NAME> -u <USER> -s
<SECRET>
- [-m <AUTH_MODE>] [-t <TIMEOUT>] [-d]
+ [-m <AUTH_MODE>] [-r] [-n <HOSTNAME>] [-t
<TIMEOUT>] [-d]
OPTIONS:
-b BASE_URL The base URL to the monitoring environment, e.g.
@@ -47,6 +50,8 @@ OPTIONS:
-m AUTH_MODE Authentication mode, either "cookie", "basic" or
"digest",
defaults to "cookie"
-t TIMEOUT HTTP connect timeout in seconds (Default: 60)
+ -r track downtimes. This requires the hostname to be set.
+ -n HOSTNAME The hostname for which this check is run.
--in-downtime S S can be "ok" or "warn". Force this state if the
aggregate is in scheduled downtime. OK states will always
be unchanged.
@@ -56,7 +61,7 @@ OPTIONS:
""")
-short_options = 'b:a:u:s:m:t:dh'
+short_options = 'b:a:u:s:m:t:n:dhr'
long_options = [ "help", "in-downtime=", "acknowledged=" ]
try:
@@ -74,6 +79,9 @@ timeout = 60
debug = False
opt_in_downtime = None
opt_acknowledged = None
+track_downtime = False
+hostname = None
+
for o,a in opts:
if o in [ '-h', '--help' ]:
@@ -91,6 +99,10 @@ for o,a in opts:
auth_mode = a
elif o == '-t':
timeout = int(a)
+ elif o == '-r':
+ track_downtime = True
+ elif o == '-n':
+ hostname = a
elif o == '-d':
debug = True
elif o == '--in-downtime':
@@ -114,6 +126,13 @@ if not username or not password:
usage()
sys.exit(1)
+if track_downtime and not hostname:
+ sys.stderr.write('Please provide a hostname when using '
+ 'downtime tracking.\n')
+ usage()
+ sys.exit(1)
+
+
def init_auth():
if username and password:
passwdmngr = urllib2.HTTPPasswordMgrWithDefaultRealm()
@@ -186,6 +205,39 @@ if opt_in_downtime and row["aggr_in_downtime"] ==
'1':
else: # "warn"
aggr_state = min(aggr_state, 1)
+if track_downtime:
+ # connect to livestatus
+ try:
+ import livestatus
+ except ImportError:
+ sys.stderr.write('The python livestatus api module is missing. Please install
from\n'
+ 'Check_MK livestatus sources to a python import
path.\n')
+ sys.exit(1)
+
+ socket_path = os.environ['OMD_ROOT'] + '/tmp/run/live'
+
+ conn = livestatus.SingleSiteConnection('unix:' + socket_path)
+
+ now = time.time()
+ # find out if, according to previous tracking, there already is a downtime
+ ids = conn.query_table(("GET downtimes\n"
+ "Columns: id\n"
+ "Filter: service_description = Aggr Host %s\n"
+ "Filter: author = tracking\n"
+ "Filter: end_time > %d") % (hostname, now))
+ downtime_tracked = len(ids) > 0
+ if downtime_tracked != (row["aggr_in_downtime"] == '1'):
+        # there is a discrepancy between tracked downtime state and the real state
+ if row["aggr_in_downtime"] == '1':
+ print("schedule tracked downtime")
+ # need to track downtime
+ conn.command("[%d] SCHEDULE_SVC_DOWNTIME;%s;Aggr Host
%s;%d;%d;1;0;0;"
+ "tracking;Automatic downtime" % (now, hostname,
hostname, now, 2147483647))
+ else:
+ for dt_id in ids:
+ conn.command("[%d] DEL_SVC_DOWNTIME;%d" % (now, dt_id[0]))
+
+
if opt_acknowledged and row["aggr_acknowledged"] == '1':
aggr_output += ", is acknowledged"
if opt_acknowledged == "ok":
@@ -196,3 +248,4 @@ if opt_acknowledged and row["aggr_acknowledged"] ==
'1':
sys.stdout.write('%s\n' % aggr_output)
sys.exit(aggr_state)
+
diff --git a/web/plugins/wato/active_checks.py b/web/plugins/wato/active_checks.py
index 7769008..dae8d04 100644
--- a/web/plugins/wato/active_checks.py
+++ b/web/plugins/wato/active_checks.py
@@ -331,7 +331,7 @@ register_rule(group,
Dictionary(
title = _("Optional parameters"),
elements = [
- ("name",
+ ("name",
TextUnicode(
title = _("Alternative Service description"),
help = _("The service description will be this name
instead <i>DNS Servername</i>"),
@@ -1516,6 +1516,15 @@ register_rule(group,
( "warn", _("Force to be WARN, if aggregate is
not OK") ),
]
)),
+ ("track_downtimes",
+ Checkbox(
+ title = _("Track downtimes"),
+ label = _("Automatically track downtimes of
aggregation"),
+ help = _("If this is active, the check will automatically go
into downtime "
+ "whenever the aggregation does. This downtime is
also cleaned up "
+ "automatically when the aggregation leaves downtime.
"
+ "Downtimes you set manually for this check are
unaffected."),
+ )),
]
),
]