Module: check_mk
Branch: master
Commit: d3ac96af5a265902251eb5e6897cd7e2925ca651
URL:
http://git.mathias-kettner.de/git/?p=check_mk.git;a=commit;h=d3ac96af5a2659…
Author: Mathias Kettner <mk(a)mathias-kettner.de>
Date: Mon Mar 25 23:46:41 2013 +0100
FIX: avoid simultaneous activation of changes by means of a lock
Conflicts:
ChangeLog
---
ChangeLog | 1 +
modules/automation.py | 4 ++++
modules/check_mk.py | 28 ++++++++++++++++++++++++++++
web/htdocs/livestatus.py | 9 ++++++---
web/plugins/wato/check_mk_configuration.py | 17 +++++++++++++++++
5 files changed, 56 insertions(+), 3 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index 4360157..ae8cf67 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -80,6 +80,7 @@
will be added automatically if missing.
* FIX: livecheck: fixed compilation bug
* FIX: check_mk: convert service description unicode into utf-8
+ * FIX: avoid simultaneous activation of changes by means of a lock
Checks & Agents:
* FIX: jolokia_metrics.mem - now able to handle negative/missing max values
diff --git a/modules/automation.py b/modules/automation.py
index a0ed7e9..1994de8 100644
--- a/modules/automation.py
+++ b/modules/automation.py
@@ -479,6 +479,10 @@ def automation_restart(job="restart"):
sys.stdout = null_file()
try:
+ backup_path = None
+ if not lock_nagios_objects_file():
+ raise MKAutomationError("Cannot activate changes. "
+ "Another activation process is currently in progress")
if os.path.exists(nagios_objects_file):
backup_path = nagios_objects_file + ".save"
os.rename(nagios_objects_file, backup_path)
diff --git a/modules/check_mk.py b/modules/check_mk.py
index a710bb3..0b1211e 100755
--- a/modules/check_mk.py
+++ b/modules/check_mk.py
@@ -214,6 +214,7 @@ agent_ports = []
snmp_ports = [] # UDP ports used for SNMP
tcp_connect_timeout = 5.0
delay_precompile = False # delay Python compilation to Nagios
execution
+restart_locking = "abort" # also possible: "wait", None
check_submission = "file" # alternative: "pipe"
aggr_summary_hostname = "%s-s"
agent_min_version = 0 # warn, if plugin has not at least version
@@ -4066,6 +4067,12 @@ def do_reload():
def do_restart(only_reload = False):
try:
+ backup_path = None
+
+ if not lock_nagios_objects_file():
+ sys.stderr.write("Other restart currently in progress. Aborting.\n")
+ sys.exit(1)
+
# Save current configuration
if os.path.exists(nagios_objects_file):
backup_path = nagios_objects_file + ".save"
@@ -4108,6 +4115,27 @@ def do_restart(only_reload = False):
sys.stderr.write("An error occurred: %s\n" % e)
sys.exit(1)
+# File descriptor that holds the restart lock, None while no lock is held.
+restart_lock_fd = None
+
+# Take an exclusive flock() on main.mk in order to serialize restarts.
+# Returns True if the lock is held after the call, if it was already
+# taken by an earlier call in this process or if locking is disabled
+# (restart_locking is None). Returns False if the lock could not be
+# acquired. With restart_locking == "abort" the attempt is non-blocking,
+# with any other setting (i.e. "wait") the call blocks until the lock
+# is free.
+def lock_nagios_objects_file():
+    global restart_lock_fd
+    # In some bizarre cases (as cmk -RR) we need to avoid duplicate locking!
+    if not restart_locking or restart_lock_fd is not None:
+        return True
+
+    import fcntl
+    lock_file = default_config_dir + "/main.mk"
+    fd = os.open(lock_file, os.O_RDONLY)
+    # Make sure that open file is not inherited to monitoring core!
+    fcntl.fcntl(fd, fcntl.F_SETFD, fcntl.FD_CLOEXEC)
+
+    lock_mode = fcntl.LOCK_EX
+    if restart_locking == "abort":
+        lock_mode |= fcntl.LOCK_NB
+
+    try:
+        if opt_debug:
+            sys.stderr.write("Waiting for exclusive lock on %s.\n" % lock_file)
+        fcntl.flock(fd, lock_mode)
+    except (IOError, OSError):
+        # Do not keep a descriptor that does not hold the lock: otherwise
+        # the next call would find restart_lock_fd set and report success
+        # without any lock being held.
+        os.close(fd)
+        return False
+
+    restart_lock_fd = fd
+    return True
+
+
def do_donation():
donate = []
cache_files = os.listdir(tcp_cache_dir)
diff --git a/web/htdocs/livestatus.py b/web/htdocs/livestatus.py
index 6823873..bff1cfd 100644
--- a/web/htdocs/livestatus.py
+++ b/web/htdocs/livestatus.py
@@ -296,7 +296,7 @@ class BaseConnection:
# Reads a response from the livestatus socket. If the socket is closed
# by the livestatus server, we automatically make a reconnect and send
# the query again (once). This is due to timeouts during keepalive.
- def recv_response(self, query = None, add_headers = ""):
+ def recv_response(self, query = None, add_headers = "", timeout_at = None):
try:
resp = self.receive_data(16)
code = resp[0:3]
@@ -318,11 +318,14 @@ class BaseConnection:
# only once
except (MKLivestatusSocketClosed, IOError), e:
self.disconnect()
- if query:
+ now = time.time()
+ if query and (not timeout_at or timeout_at > now):
+ if timeout_at == None:
+ timeout_at = now + self.timeout
time.sleep(0.1)
self.connect()
self.send_query(query, add_headers)
- return self.recv_response() # do not send query again -> danger of
infinite loop
+ return self.recv_response(query, add_headers, timeout_at) # resending is bounded by timeout_at -> no infinite loop
else:
raise MKLivestatusSocketError(str(e))
diff --git a/web/plugins/wato/check_mk_configuration.py
b/web/plugins/wato/check_mk_configuration.py
index ef4fb86..f3e5e54 100644
--- a/web/plugins/wato/check_mk_configuration.py
+++ b/web/plugins/wato/check_mk_configuration.py
@@ -268,6 +268,7 @@ register_configvar(group,
domain = "multisite"
)
+
register_configvar(group,
"bi_precompile_on_demand",
Checkbox(title = _("Precompile aggregations on demand"),
@@ -632,6 +633,22 @@ register_configvar(group,
need_restart = True)
register_configvar(group,
+    # Behaviour when several users activate changes at the same time. The
+    # choice values must match what modules/check_mk.py accepts for
+    # restart_locking: "abort", "wait" or None.
+    "restart_locking",
+    DropdownChoice(
+        title = _("Simultaneous activation of changes"),
+        help = _("When two users simultaneously try to activate the changes then "
+                 "you can decide to abort with an error (default) or have the requests "
+                 "serialized. It is also possible - but not recommended - to turn "
+                 "off locking altogether."),
+        choices = [
+            ('abort', _("Abort with an error")),
+            ('wait' , _("Wait until the other has finished") ),
+            (None   , _("Disable locking") ),
+        ]),
+    need_restart = False
+    )
+
+register_configvar(group,
"agent_simulator",
Checkbox(title = _("SNMP Agent Simulator"),
label = _("Process stored SNMP walks with agent simulator"),