Module: check_mk
Branch: master
Commit: f6235fb501f8a6b4988c86b302e5958b00affd79
URL:
http://git.mathias-kettner.de/git/?p=check_mk.git;a=commit;h=f6235fb501f8a6…
Author: Mathias Kettner <mk(a)mathias-kettner.de>
Date: Wed May 14 10:44:34 2014 +0200
FIX liveproxyd: handle situations with more than 1024 open files
When you are using the Livestatus Proxy Daemon for connecting lots
of sites and you have lots of concurrent users then two things could
happen:
1. You might run out of files. The reason is that by default on most
Linux systems the number of files a process is allowed to keep open
is limited to 1024. This can be increased by using
<tt>/etc/security/limits.conf</tt>.
The script launching the <tt>liveproxyd</tt> needs to issue a <tt>ulimit
-S -n unlimited</tt>
before starting. In the OMD builds of the Check_MK Monitoring System this
has been added for recent development builds. The <tt>liveproxyd</tt> now
handles
this situation better and no longer runs into a busy CPU loop. It waits for
5 seconds and restarts itself instead.
2. Even if you increased the number of allowed open files, the
<tt>liveproxyd</tt>
could never open more than 1024 files on most Python versions (due to a limit of
the system call <tt>select()</tt>). This has been fixed by using the
<tt>poll()</tt>
system call.
---
.werks/970 | 25 +++++++++++++++
ChangeLog | 3 ++
doc/treasures/liveproxy/liveproxyd | 60 +++++++++++++++++++++++++-----------
3 files changed, 70 insertions(+), 18 deletions(-)
diff --git a/.werks/970 b/.werks/970
new file mode 100644
index 0000000..0dd6b6f
--- /dev/null
+++ b/.werks/970
@@ -0,0 +1,25 @@
+Title: liveproxyd: handle situations with more then 1024 open files
+Level: 2
+Component: liveproxy
+Version: 1.2.5i3
+Date: 1400056696
+Class: fix
+
+When you are using the Livestatus Proxy Daemon for connecting lots
+of sites and you have lots of concurrent users then two things could
+happen:
+
+1. You might run out of files. The reason is that per default on most
+Linux systems the number of files a process is allowed to keep open
+is limited to 1024. The can be increased by using
<tt>/etc/security/limits.conf</tt>.
+The script launching the <tt>liveproxyd</tt> need to issue a <tt>ulimit
-S -n unlimited</tt>
+before starting. In the OMD builds of the Check_MK Monitoring System this
+has been added for recent development builds. The <tt>liveproxyd</tt> now
handles
+this situation better and does not long run into a busy CPU loop. It waits for
+5 seconds and restarts itself instead.
+
+2. Even if you increased the number of allowed open files the
<tt>liveproxyd</tt>
+could never open more than 1024 files on most Python versions (due to a limit of
+the system call <tt>select()</tt>. This has been fixed by using the
<tt>poll()</tt>
+system call.
+
diff --git a/ChangeLog b/ChangeLog
index e3b0ec6..ee78c6c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -114,6 +114,9 @@
* 0747 FIX: livestatus table hostsbygroup: fixed bug with group_authorization
strict...
* 0831 FIX: table statehist: no longer crashes on TIMEPERIOD TRANSITION entries with
an invalid syntax...
+ Livestatus-Proxy:
+ * 0970 FIX: liveproxyd: handle situations with more then 1024 open files...
+
HW/SW-Inventory:
* 0913 lnx_distro: Now able to detect SuSE distributions...
* 0886 FIX: Fix exception on non-UTF-8 encoded characters in software list
diff --git a/doc/treasures/liveproxy/liveproxyd b/doc/treasures/liveproxy/liveproxyd
index 162ebe2..7d80797 100755
--- a/doc/treasures/liveproxy/liveproxyd
+++ b/doc/treasures/liveproxy/liveproxyd
@@ -96,7 +96,15 @@ def liveproxyd_run():
if opt_debug:
raise
+ if "Too many open files" in str(e):
+ log("Too many open files! Please increase the ulimit for
'nofiles'!")
+ log("Waiting for 5 seconds...")
+ time.sleep(5)
+ log("Restarting, in the hope that files will the again be enough for
a while.")
+ do_restart()
+
log("Ignoring exception: %s: %s" % (e, traceback.format_exc()))
+ time.sleep(1) # Avoid CPU loop in case of permanent error
def initiate_connections():
# Create new channels to target sites. Nonblocking!
@@ -198,7 +206,7 @@ def disconnect_from_site(sitename):
def complete_connections(writable):
for sitename, sitestate in g_sites.items():
for channel in sitestate["channels"]:
- if channel["state"] == "connecting" and
channel["socket"] in writable:
+ if channel["state"] == "connecting" and
channel["socket"].fileno() in writable:
try:
channel["socket"].send("")
channel["socket"].setblocking(1) # avoid signals from
interrupting us
@@ -209,44 +217,60 @@ def complete_connections(writable):
channel["state"] = "error"
sitestate["last_failed_connect"] = time.time()
-# Master/Mega/Central select(). We are going to be the select() master. Harhar.
+# Master/Mega/Central poll(). We though we are the select() master. But select()
+# is limited to 1024 filedescriptors in most Python versions. So we rather use
+# poll(). If you have many sites and many users you will easily get more
+# filedescriptors...
def do_select(timeout):
- read_fds = []
- write_fds = []
+ p = select.poll()
for sitename, sitestate in g_sites.items():
# outgoing connections currently building up
for channel in sitestate["channels"]:
+ sock = channel["socket"]
if channel["state"] == "connecting":
- write_fds.append(channel["socket"])
+ p.register(sock, select.POLLOUT)
# new client connections
- read_fds.append(sitestate["socket"])
+ sock = sitestate["socket"]
+ p.register(sock, select.POLLIN)
for client in sitestate["clients"]:
+ sock = client["socket"]
+
# new requests from existing clients
if client["state"] == "idle":
- read_fds.append(client["socket"])
+ p.register(sock, select.POLLIN)
# clients ready to receive a response
- if client["state"] == "response":
- write_fds.append(client["socket"])
+ elif client["state"] == "response":
+ p.register(sock, select.POLLOUT)
# Responses from channels, also heartbeat responses
for channel in sitestate["channels"]:
if channel["state"] in [ "busy", "heartbeat"
]:
- read_fds.append(channel["socket"])
+ p.register(channel["socket"], select.POLLIN)
try:
- r_able, w_able, x_able = select.select(read_fds, write_fds, [], timeout)
- except select.error:
+ readylist = p.poll(timeout)
+ r_able = []
+ w_able = []
+ for fd, event in readylist:
+ if event & select.POLLIN:
+ r_able.append(fd)
+ if event & select.POLLOUT:
+ w_able.append(fd)
+ return r_able, w_able
+
+ except select.error, e:
+ log("Error during poll(): %s" % e)
return [], []
- return r_able, w_able
+
def accept_new_clients(readable):
for sitename, sitestate in g_sites.items():
- if sitestate["socket"] in readable:
+ if sitestate["socket"].fileno() in readable:
try:
s, addrinfo = sitestate["socket"].accept()
s.setblocking(1)
@@ -261,7 +285,7 @@ def get_new_requests(readable):
for sitename, sitestate in g_sites.items():
for client in sitestate["clients"]:
if client["state"] == "idle" and \
- (client["socket"] in readable or
client.get("nextrequest")):
+ (client["socket"].fileno() in readable or
client.get("nextrequest")):
try:
request = receive_request(sitename, client)
if not request:
@@ -419,7 +443,7 @@ def receive_request(sitename, client):
def get_responses(readable):
for sitename, sitestate in g_sites.items():
for channel in sitestate["channels"]:
- if channel["socket"] in readable:
+ if channel["socket"].fileno() in readable:
if channel["state"] == "busy":
receive_response(sitename, channel)
else:
@@ -498,7 +522,7 @@ def receive_response(sitename, channel):
def send_responses(writable):
for sitename, sitestatus in g_sites.items():
for client in sitestatus["clients"]:
- if client["state"] == "response" and
client["socket"] in writable:
+ if client["state"] == "response" and
client["socket"].fileno() in writable:
try:
# ACHTUNG: Beim senden an den Client können wir blockieren, wenn
@@ -822,7 +846,7 @@ def read_configuration():
def do_restart():
log("Restarting myself")
- for fd in range(3, 256):
+ for fd in range(3, 8192):
try:
os.close(fd)
except: