Module: check_mk
Branch: master
Commit: 1618847f4db6cdbe2f8eca8ee33551f8cb60cf9e
URL:
http://git.mathias-kettner.de/git/?p=check_mk.git;a=commit;h=1618847f4db6cdbe2f8eca8ee33551f8cb60cf9e
Author: Mathias Kettner <mk(a)mathias-kettner.de>
Date: Tue Feb 21 15:57:40 2012 +0100
Livestatus-API: do reconnect until timeout is used up
---
.bugs/608 | 7 +++-
.bugs/647 | 7 +++-
ChangeLog | 3 ++
web/htdocs/htmllib.py | 2 +-
web/htdocs/index.py | 1 +
web/htdocs/livestatus.py | 62 +++++++++++++++++++++++++++++++--------------
web/htdocs/wato.py | 7 +++--
7 files changed, 62 insertions(+), 27 deletions(-)
diff --git a/.bugs/608 b/.bugs/608
index ebd2214..483c818 100644
--- a/.bugs/608
+++ b/.bugs/608
@@ -1,9 +1,9 @@
Title: Sidebar Refresh bei Nagios Reload
Component: multisite
-State: open
+Class: nastiness
+State: done
Date: 2012-01-27 16:09:02
Targetversion: 1.2.0
-Class: nastiness
Wird über Wato ein Nagios Reload anstelle des Restart ausgeführt, braucht
das Neuladen des Livestatus Modules zu lange und die Sidebar läuf in einen
@@ -12,3 +12,6 @@ Livestatus Timeout und muss neu geladen werden.
Mathias: Ich denke wir lösen das Problem in der Livestatus-API für Python.
Wenn der Connect nicht klappt, probieren wir es eine Zeitlang. Dafür
gibt es ja eine Timeout-Einstellung.
+
+2012-02-21 15:57:25: changed state open -> done
+Das ist jetzt in livestatus.py umgesetzt.
diff --git a/.bugs/647 b/.bugs/647
index 681efb7..6d0a5a4 100644
--- a/.bugs/647
+++ b/.bugs/647
@@ -1,11 +1,14 @@
Title: Sidebar Reload macht während Nagios Restart Probleme
Component: multisite
-State: open
+Class: nastiness
+State: done
Date: 2012-02-06 12:46:07
Targetversion: future
-Class: nastiness
Man könnte so lange versuchen die Livestatus Verbindung aufzubauen
bis das Connect Timeout erreicht ist.
Momentan wird es nur einmal versucht. Dann wird mit "Connection reset by peer"
abgebrochen.
+
+2012-02-21 15:56:50: changed state open -> done
+Now within the timeout a reconnect is tried.
diff --git a/ChangeLog b/ChangeLog
index d2be19b..b79678a 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -56,6 +56,9 @@
* FIX: Fixed preview table styling in view editor
* FIX: Multisite authed users without permission to multisite are
automatically logged out after showing the error message
+ * Retry livestatus connect until timeout is used up. This avoids
+ error messages when the core is being restarted
+
BI:
* New column (painter) for simplistic box display of tree.
diff --git a/web/htdocs/htmllib.py b/web/htdocs/htmllib.py
index 9d4ff21..807f7f0 100644
--- a/web/htdocs/htmllib.py
+++ b/web/htdocs/htmllib.py
@@ -978,7 +978,7 @@ class html:
return (omd_mode, omd_site)
def begin_foldable_container(self, treename, id, isopen, title, indent = True, first
= False):
- # try to get persistet state of tree
+ # try to get persisted state of tree
tree_state = weblib.get_tree_states(treename)
if id in tree_state:
diff --git a/web/htdocs/index.py b/web/htdocs/index.py
index 4c13de4..6cffb43 100644
--- a/web/htdocs/index.py
+++ b/web/htdocs/index.py
@@ -139,6 +139,7 @@ def connect_to_livestatus(html):
else:
html.live = livestatus.SingleSiteConnection("unix:" +
defaults.livestatus_unix_socket)
+ html.live.set_timeout(10) # default timeout is 10 seconds
html.site_status = { '': { "state" : "dead",
"site" : config.site('') } }
v1, v2, ps = html.live.query_row("GET status\nColumns: livestatus_version
program_version program_start")
html.site_status[''].update({ "state" : "online",
"livestatus_version": v1, "program_version" : v2,
"program_start" : ps })
diff --git a/web/htdocs/livestatus.py b/web/htdocs/livestatus.py
index eef1be3..c5211ce 100644
--- a/web/htdocs/livestatus.py
+++ b/web/htdocs/livestatus.py
@@ -194,7 +194,7 @@ class BaseConnection:
parts = url.split(":")
if parts[0] == "unix":
if len(parts) != 2:
- raise MKLivestatusConfigError("Invalid livestatus unix url: %s.
"
+ raise MKLivestatusConfigError("Invalid livestatus unix URL: %s.
"
"Correct example is
'unix:/var/run/nagios/rw/live'" % url)
self.socket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
target = parts[1]
@@ -204,21 +204,37 @@ class BaseConnection:
host = parts[1]
port = int(parts[2])
except:
- raise MKLivestatusConfigError("Invalid livestatus tcp url
'%s'. "
+ raise MKLivestatusConfigError("Invalid livestatus tcp URL
'%s'. "
"Correct example is 'tcp:somehost:6557'" %
url)
self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
target = (host, port)
else:
- raise MKLivestatusConfigError("Invalid livestatus url '%s'.
"
+ raise MKLivestatusConfigError("Invalid livestatus URL '%s'.
"
"Must begin with 'tcp:' or 'unix:'" % url)
- try:
- if self.timeout:
- self.socket.settimeout(float(self.timeout))
- self.socket.connect(target)
- except Exception, e:
- self.socket = None
- raise MKLivestatusSocketError("Cannot connect to '%s': %s"
% (self.socketurl, e))
+ # If a timeout is set, then we retry after a failure with a mild
+ # exponential backoff (the wait grows by a factor of 1.5 per attempt).
+ if self.timeout:
+ before = time.time()
+ sleep_interval = 0.1
+
+ while True:
+ try:
+ if self.timeout:
+ self.socket.settimeout(float(sleep_interval))
+ self.socket.connect(target)
+ break
+ except Exception, e:
+ if self.timeout:
+ time_left = self.timeout - (time.time() - before)
+ # only try again, if there is substantial time left
+ if time_left > sleep_interval:
+ time.sleep(sleep_interval)
+ sleep_interval *= 1.5
+ continue
+
+ self.socket = None
+ raise MKLivestatusSocketError("Cannot connect to '%s':
%s" % (self.socketurl, e))
if self.persist:
persistent_connections[self.socketurl] = self.socket
@@ -242,7 +258,7 @@ class BaseConnection:
self.send_query(query, add_headers)
return self.recv_response(query, add_headers)
- def send_query(self, query, add_headers = ""):
+ def send_query(self, query, add_headers = "", do_reconnect=True):
if self.socket == None:
self.connect()
if not query.endswith("\n"):
@@ -261,7 +277,15 @@ class BaseConnection:
del persistent_connections[self.socketurl]
self.successful_persistence = False
self.socket = None
- raise MKLivestatusSocketError(str(e))
+
+ if do_reconnect:
+ # Automatically try to reconnect in case of an error, but
+ # only once.
+ self.connect()
+ self.send_query(query, add_headers, False)
+ return
+
+ raise MKLivestatusSocketError("RC1:" + str(e))
# Reads a response from the livestatus socket. If the socket is closed
# by the livestatus server, we automatically make a reconnect and send
@@ -282,20 +306,20 @@ class BaseConnection:
raise MKLivestatusSocketError("Malformed output")
else:
raise MKLivestatusQueryError(code, data.strip())
- except MKLivestatusSocketClosed:
+
+ # In case of an IO error or the other side having
+ # closed the socket do a reconnect and try again, but
+ # only once
+ except (MKLivestatusSocketClosed, IOError), e:
self.disconnect()
if query:
+ time.sleep(0.1)
self.connect()
self.send_query(query, add_headers)
return self.recv_response() # do not send query again -> danger of
infinite loop
else:
- raise
+ raise MKLivestatusSocketError(str(e))
- except IOError, e:
- self.socket = None
- if self.persist:
- del persistent_connections[self.socketurl]
- raise MKLivestatusSocketError(str(e))
def do_command(self, command):
if self.socket == None:
diff --git a/web/htdocs/wato.py b/web/htdocs/wato.py
index 5f90ed2..5d3b2a2 100644
--- a/web/htdocs/wato.py
+++ b/web/htdocs/wato.py
@@ -5741,11 +5741,12 @@ def mode_edit_site(phase):
# Timeout
html.write("<tr><td class=legend>")
- html.write(_("Connect Timeout<br><i>This setting limits the time
Multisites waits for a connection "
+ html.write(_("Connect Timeout<br><i>This sets the time that
Multisite waits for a connection "
"to the site to be established before the site is considered to be
unreachable. "
- "If not set, the operating system defaults are begin
used.</i>"))
+ "If not set, the operating system defaults are begin used and just
one login attempt is being. "
+ "performed.</i>"))
html.write("</td><td class=content>")
- timeout = site.get("timeout", "")
+ timeout = site.get("timeout", 10)
html.number_input("timeout", timeout, size=2)
html.write(_(" seconds"))
html.write("</td></tr>")