Module: check_mk
Branch: master
Commit: 641329290cf1b4ef1e5428b0c44c525ed0dfec1e
URL:
http://git.mathias-kettner.de/git/?p=check_mk.git;a=commit;h=641329290cf1b4…
Author: Mathias Kettner <mk(a)mathias-kettner.de>
Date: Mon Feb 29 16:46:45 2016 +0100
#3252 FIX ntp.time: better handling of temporary synchronization loss
After a reboot of the target machine or of the NTP server the check always
went CRIT for a couple of minutes, since NTP did not yet consider any of its
peers suitable. This is now tolerated for a configurable amount of time. By
default the check goes WARN after 5 minutes without a valid time source and
CRIT after 60 minutes - but only if there was a valid time before that.
---
.werks/3252 | 13 +++++++++
ChangeLog | 1 +
checks/ntp | 52 ++++++++++++++++++++++++++++++----
web/plugins/wato/check_parameters.py | 19 ++++++++++++-
4 files changed, 79 insertions(+), 6 deletions(-)
diff --git a/.werks/3252 b/.werks/3252
new file mode 100644
index 0000000..68d8463
--- /dev/null
+++ b/.werks/3252
@@ -0,0 +1,13 @@
+Title: ntp.time: better handling of temporary synchronization loss
+Level: 2
+Component: checks
+Compatible: compat
+Version: 1.2.9i1
+Date: 1456760686
+Class: fix
+
+After a reboot of the target machine or of the NTP server the check always
+went CRIT for a couple of minutes, since NTP did not yet consider any of its
+peers suitable. This is now tolerated for a configurable amount of time. By
+default the check goes WARN after 5 minutes without a valid time source and
+CRIT after 60 minutes - but only if there was a valid time before that.
diff --git a/ChangeLog b/ChangeLog
index 3d7dc78..7c50568 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -49,6 +49,7 @@
* 3245 FIX: Fix interpretation of check levels 0.0 as "no levels"...
* 3250 FIX: cisco_mem: remove bogus memory check for "Driver text"...
* 3084 FIX: windows agent: fixed crash when specifying an invalid performance
counter
+ * 3252 FIX: ntp.time: better handling of temporary synchronization loss...
Multisite:
* 3187 notification view: new filter for log command via regex
diff --git a/checks/ntp b/checks/ntp
index b1dcd62..3b90f06 100644
--- a/checks/ntp
+++ b/checks/ntp
@@ -26,6 +26,11 @@
ntp_default_levels = (10, 200.0, 500.0) # stratum, ms offset
+factory_settings["ntp_time_default_levels"] = {
+ "ntp_levels" : ntp_default_levels,
+ "alert_delay" : (300, 3600),
+}
+
# Example output from agent:
# <<<ntp>>>
# - 42.202.61.100 .INIT. 16 u - 1024 0 0.000 0.000 0.000
@@ -123,6 +128,7 @@ def check_ntp_server_state(line, params):
else:
return (0, infotext, offset, jitter)
+
def check_ntp(item, params, info):
for line in info:
if line[1] == item:
@@ -139,18 +145,53 @@ def check_ntp(item, params, info):
return (3, "peer not found")
-def check_ntp_summary(item, params, info):
+
+def check_ntp_summary(_no_item, params, info):
+ if type(params) == tuple:
+ params = {
+ "ntp_levels" : params,
+ "alert_delay" : (300, 3600),
+ }
# No information at all? NTP daemon not running or timeout in ntpq -p
if len(info) == 0:
- return (3, "no information from NTP: timeout in ntpq -p or NTP daemon not
running")
+ yield 3, "no information from NTP: timeout in ntpq -p or NTP daemon not
running"
+ return
# We only are interested in our system peer or pulse per second source (pps)
for line in info:
if line[0] in [ "*", "o" ]:
- state, text, perfdata = check_ntp(line[1], params, [line])
+ state, text, perfdata = check_ntp(line[1], params["ntp_levels"],
[line])
text += " (synchronized on %s)" % line[1]
- return (state, text, perfdata)
- return (2, "found %d peers, but none is suitable" % len(info))
+ set_item_state(None, ("sync", time.time(), state, "last
successful sync")) # remember last successfull sync
+ yield state, text, perfdata
+ return
+
+ infotext = "found %d peers, but none is suitable" % len(info)
+ yield 0, infotext
+
+ # Currently no peer is suitable. But we want to tolerate that for a while.
+ entry = get_item_state(None)
+ if entry == None:
+ set_item_state(None, ("init", time.time(), 0, infotext))
+ yield 0, "just started monitoring"
+ return
+
+ how, last_successful_sync, last_state, last_infotext = entry
+ time_since_sync = time.time() - last_successful_sync
+ infotext = "this is %s since " % get_age_human_readable(time_since_sync)
+ infotext += last_infotext
+ yield last_state, infotext
+
+ warn_time, crit_time = params["alert_delay"]
+ if time_since_sync >= crit_time:
+ status = 2
+ elif time_since_sync >= warn_time:
+ status = 1
+ else:
+ status = 0
+
+ if status:
+ yield status, "(levels at %s/%s)" % (get_age_human_readable(warn_time),
get_age_human_readable(crit_time))
@@ -168,4 +209,5 @@ check_info["ntp.time"] = {
'service_description': 'NTP Time',
'has_perfdata': True,
'group': 'ntp_time',
+ 'default_levels_variable': "ntp_time_default_levels",
}
diff --git a/web/plugins/wato/check_parameters.py b/web/plugins/wato/check_parameters.py
index 6cf9cf3..7c4a2bb 100644
--- a/web/plugins/wato/check_parameters.py
+++ b/web/plugins/wato/check_parameters.py
@@ -8730,6 +8730,7 @@ register_check_parameters(
)
ntp_params = \
Tuple(
+ title = _("Thresholds for quality of time"),
elements = [
Integer(
title = _("Critical at stratum"),
@@ -8755,7 +8756,23 @@ register_check_parameters(
subgroup_os,
"ntp_time",
_("State of NTP time synchronisation"),
- ntp_params,
+ Transform(
+ Dictionary(
+ elements = [
+ ( "ntp_levels",
+ ntp_params, ),
+ ( "alert_delay",
+ Tuple(
+ title = _("Phases without synchronization"),
+ elements = [
+ Age(title=_("Warning at"), display=["hours",
"minutes"], default_value = 300, ),
+ Age(title=_("Critical at"),
display=["hours", "minutes"], default_value = 3600,),
+ ]
+ )),
+ ]
+ ),
+ forth = lambda params: type(params) == tuple and { "ntp_levels" : params
} or params
+ ),
None,
"first"
)