Module: check_mk
Branch: master
Commit: 6fb0b9e1c104913d5031091b2c3981a2ff9eb54b
URL: http://git.mathias-kettner.de/git/?p=check_mk.git;a=commit;h=6fb0b9e1c10491…
Author: Simon Betz <si(a)mathias-kettner.de>
Date: Mon Nov 27 11:08:14 2017 +0100
5444 FIX SNMP commands: Prevent zombie processes in case of timeouts
Change-Id: I7366ceaefa9b68ddb85388c943a87a4e8b6aac1e
---
.werks/5444 | 20 +++++++++++
cmk_base/classic_snmp.py | 87 +++++++++++++++++++++++++++++++-----------------
2 files changed, 76 insertions(+), 31 deletions(-)
diff --git a/.werks/5444 b/.werks/5444
new file mode 100644
index 0000000..0c3852f
--- /dev/null
+++ b/.werks/5444
@@ -0,0 +1,20 @@
+Title: SNMP commands: Prevent zombie processes in case of timeouts
+Level: 1
+Component: core
+Class: fix
+Compatible: compat
+Edition: cre
+State: unknown
+Version: 1.5.0i2
+Date: 1510582108
+
+This concerns the RAW edition, and the CEE/CME editions of Check_MK when
+inline SNMP is deactivated.
+
+When executing SNMP commands like get, getnext, walk or bulkwalk to get agent data
+from hosts, it may happen that these commands are left behind as zombie processes.
+Check_MK has been extended to deal with this situation and clean up these processes.
+
+Details: When the command execution takes too long, Check_MK sends a SIGTERM to the
+process group of the executed program. After sending the signal, Check_MK now
+waits for the process to finish.
diff --git a/cmk_base/classic_snmp.py b/cmk_base/classic_snmp.py
index fa4a5d9..f24f404 100644
--- a/cmk_base/classic_snmp.py
+++ b/cmk_base/classic_snmp.py
@@ -57,9 +57,40 @@ def walk(hostname, ip, oid, hex_plain=False, context_name=None):
debug_cmd = [ "''" if a == "" else a for a in command ]
console.vverbose("Running '%s'\n" % " ".join(debug_cmd))
- snmp_process = subprocess.Popen(command, close_fds=True, stdin=open(os.devnull),
- stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ snmp_process = None
+ exitstatus = None
+ rowinfo = []
+ try:
+ snmp_process = subprocess.Popen(command, close_fds=True, stdin=open(os.devnull),
+ stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+ rowinfo = _get_rowinfo_from_snmp_process(snmp_process, hex_plain)
+
+ except MKTimeout:
+ # On timeout exception try to stop the process to prevent child process "leakage"
+ if snmp_process:
+ os.kill(snmp_process.pid, signal.SIGTERM)
+ snmp_process.wait()
+ raise
+
+ finally:
+ # The stdout and stderr pipe are not closed correctly on a MKTimeout
+ # Normally these pipes getting closed after p.communicate finishes
+ # Closing them a second time in a OK scenario won't hurt neither..
+ if snmp_process:
+ exitstatus = snmp_process.wait()
+ error = snmp_process.stderr.read()
+ snmp_process.stdout.close()
+ snmp_process.stderr.close()
+
+ if exitstatus:
+ console.verbose(tty.red + tty.bold + "ERROR: " + tty.normal + "SNMP error: %s\n" % error.strip())
+ raise MKSNMPError("SNMP Error on %s: %s (Exit-Code: %d)" % (ip, error.strip(), exitstatus))
+ return rowinfo
+
+def _get_rowinfo_from_snmp_process(snmp_process, hex_plain):
+ line_iter = snmp_process.stdout.xreadlines()
# Ugly(1): in some cases snmpwalk inserts line feed within one
# dataset. This happens for example on hexdump outputs longer
# than a few bytes. Those dumps are enclosed in double quotes.
@@ -67,36 +98,30 @@ def walk(hostname, ip, oid, hex_plain=False, context_name=None):
# does not end with a double quote, we take the next line(s) as
# a continuation line.
rowinfo = []
- try:
- line_iter = snmp_process.stdout.xreadlines()
- while True:
+ while True:
+ try:
line = line_iter.next().strip()
- parts = line.split('=', 1)
- if len(parts) < 2:
- continue # broken line, must contain =
- oid = parts[0].strip()
- value = parts[1].strip()
- # Filter out silly error messages from snmpwalk >:-P
- if value.startswith('No more variables') or value.startswith('End of MIB') \
- or value.startswith('No Such Object available') or value.startswith('No Such Instance currently exists'):
- continue
-
- if value == '"' or (len(value) > 1 and value[0] == '"' and (value[-1] != '"')): # to be continued
- while True: # scan for end of this dataset
- nextline = line_iter.next().strip()
- value += " " + nextline
- if value[-1] == '"':
- break
- rowinfo.append((oid, strip_snmp_value(value, hex_plain)))
-
- except StopIteration:
- pass
-
- error = snmp_process.stderr.read()
- exitstatus = snmp_process.wait()
- if exitstatus:
- console.verbose(tty.red + tty.bold + "ERROR: " + tty.normal + "SNMP error: %s\n" % error.strip())
- raise MKSNMPError("SNMP Error on %s: %s (Exit-Code: %d)" % (ip, error.strip(), exitstatus))
+ except StopIteration:
+ break
+
+ parts = line.split('=', 1)
+ if len(parts) < 2:
+ continue # broken line, must contain =
+ oid = parts[0].strip()
+ value = parts[1].strip()
+ # Filter out silly error messages from snmpwalk >:-P
+ if value.startswith('No more variables') or value.startswith('End of MIB') \
+ or value.startswith('No Such Object available') \
+ or value.startswith('No Such Instance currently exists'):
+ continue
+
+ if value == '"' or (len(value) > 1 and value[0] == '"' and (value[-1] != '"')): # to be continued
+ while True: # scan for end of this dataset
+ nextline = line_iter.next().strip()
+ value += " " + nextline
+ if value[-1] == '"':
+ break
+ rowinfo.append((oid, strip_snmp_value(value, hex_plain)))
return rowinfo
Module: check_mk
Branch: master
Commit: 18b19b2a6e2bc575df46781da17c4b1f8a69eae0
URL: http://git.mathias-kettner.de/git/?p=check_mk.git;a=commit;h=18b19b2a6e2bc5…
Author: Lars Michelsen <lm(a)mathias-kettner.de>
Date: Mon Nov 27 10:50:59 2017 +0100
5543 Host rename: Reduced duration of core downtime during host renamings
The most time-consuming action during the renaming of hosts, the rewrite of the
monitoring history, is now done while the core is running. After this is done,
the core is stopped and the remaining history files (the current ones) are
updated.
Change-Id: I35b166a3bcc9b43cd3be910335b0335bbe9824d7
---
.werks/5543 | 13 +++++
cmk_base/automations/check_mk.py | 110 ++++++++++++++++++++++++++++-----------
2 files changed, 94 insertions(+), 29 deletions(-)
diff --git a/.werks/5543 b/.werks/5543
new file mode 100644
index 0000000..0800024
--- /dev/null
+++ b/.werks/5543
@@ -0,0 +1,13 @@
+Title: Host rename: Reduced duration of core downtime during host renamings
+Level: 1
+Component: core
+Compatible: compat
+Edition: cre
+Version: 1.5.0i2
+Date: 1511776151
+Class: feature
+
+The most time-consuming action during the renaming of hosts, the rewrite of the
+monitoring history, is now done while the core is running. After this is done,
+the core is stopped and the remaining history files (the current ones) are
+updated.
diff --git a/cmk_base/automations/check_mk.py b/cmk_base/automations/check_mk.py
index bccaa61..8e2099e 100644
--- a/cmk_base/automations/check_mk.py
+++ b/cmk_base/automations/check_mk.py
@@ -192,6 +192,10 @@ class AutomationRenameHosts(Automation):
needs_config = True
needs_checks = True
+ def __init__(self):
+ super(AutomationRenameHosts, self).__init__()
+ self._finished_history_files = {}
+
# WATO calls this automation when hosts have been renamed. We need to change
# several file and directory names. This function has no argument but reads
# Python pair-list from stdin:
@@ -201,6 +205,17 @@ class AutomationRenameHosts(Automation):
actions = []
+ # The history archive can be renamed with running core. We need to keep
+ # the list of already handled history archive files, because a new history
+ # file may be created by the core during this step. All unhandled files,
+ # including the current history files will be handled later when the core
+ # is stopped.
+ for oldname, newname in renamings:
+ self._finished_history_files[(oldname, newname)] = \
+ self._rename_host_in_core_history_archive(oldname, newname)
+ if self._finished_history_files[(oldname, newname)]:
+ actions.append("history")
+
# At this place WATO already has changed it's configuration. All further
# data might be changed by the still running core. So we need to stop
# it now.
@@ -338,7 +353,7 @@ class AutomationRenameHosts(Automation):
# This functions could be moved out of Check_MK.
def _omd_rename_host(self, oldname, newname):
- oldregex = oldname.replace(".", "[.]")
+ oldregex = self._escape_name_for_regex_matching(oldname)
actions = []
# Temporarily stop processing of performance data
@@ -386,34 +401,7 @@ class AutomationRenameHosts(Automation):
if npcd_running:
os.system("omd start npcd >/dev/null 2>&1 </dev/null")
- # Logfiles and history files of CMC and Nagios. Problem
- # here: the exact place of the hostname varies between the
- # various log entry lines
- sed_commands = r'''
-s/(INITIAL|CURRENT) (HOST|SERVICE) STATE: %(old)s;/\1 \2 STATE: %(new)s;/
-s/(HOST|SERVICE) (DOWNTIME |FLAPPING |)ALERT: %(old)s;/\1 \2ALERT: %(new)s;/
-s/PASSIVE (HOST|SERVICE) CHECK: %(old)s;/PASSIVE \1 CHECK: %(new)s;/
-s/(HOST|SERVICE) NOTIFICATION: ([^;]+);%(old)s;/\1 NOTIFICATION: \2;%(new)s;/
-''' % { "old" : oldregex, "new" : newname }
- path_patterns = [
- "var/check_mk/core/history",
- "var/check_mk/core/archive/*",
- "var/nagios/nagios.log",
- "var/nagios/archive/*",
- ]
- one_matched = False
- for path_pattern in path_patterns:
- command = ["sed", "-ri", "--file=/dev/fd/0"]
- files = glob.glob("%s/%s" % (cmk.paths.omd_root, path_pattern))
- p = subprocess.Popen(command + files, stdin=subprocess.PIPE,
- stdout=open(os.devnull, "w"), stderr=subprocess.STDOUT,
- close_fds=True)
- p.communicate(sed_commands)
- if files:
- one_matched = True
-
- if one_matched:
- actions.append("history")
+ self._rename_host_in_remaining_core_history_files(oldname, newname)
# State retention (important for Downtimes, Acknowledgements, etc.)
if config.monitoring_core == "nagios":
@@ -440,6 +428,65 @@ s/(HOST|SERVICE) NOTIFICATION: ([^;]+);%(old)s;/\1 NOTIFICATION: \2;%(new)s;/
return actions
+ def _rename_host_in_remaining_core_history_files(self, oldname, newname):
+ """Perform the rename operation in all history archive files that have not been handled yet"""
+ finished_file_paths = self._finished_history_files[(oldname, newname)]
+ all_file_paths = set(self._get_core_history_files(only_archive=True))
+ todo_file_paths = list(all_file_paths.difference(finished_file_paths))
+ return self._rename_host_in_core_history_files(todo_file_paths, oldname, newname)
+
+
+ def _rename_host_in_core_history_archive(self, oldname, newname):
+ """Perform the rename operation in all history archive files"""
+ file_paths = self._get_core_history_files(only_archive=True)
+ return self._rename_host_in_core_history_files(file_paths, oldname, newname)
+
+
+ def _get_core_history_files(self, only_archive):
+ path_patterns = [
+ "var/check_mk/core/archive/*",
+ "var/nagios/archive/*",
+ ]
+
+ if not only_archive:
+ path_patterns += [
+ "var/check_mk/core/history",
+ "var/nagios/nagios.log",
+ ]
+
+ file_paths = []
+ for path_pattern in path_patterns:
+ file_paths += glob.glob("%s/%s" % (cmk.paths.omd_root, path_pattern))
+ return file_paths
+
+
+ def _rename_host_in_core_history_files(self, file_paths, oldname, newname):
+ oldregex = self._escape_name_for_regex_matching(oldname)
+
+ # Logfiles and history files of CMC and Nagios. Problem
+ # here: the exact place of the hostname varies between the
+ # various log entry lines
+ sed_commands = r'''
+s/(INITIAL|CURRENT) (HOST|SERVICE) STATE: %(old)s;/\1 \2 STATE: %(new)s;/
+s/(HOST|SERVICE) (DOWNTIME |FLAPPING |)ALERT: %(old)s;/\1 \2ALERT: %(new)s;/
+s/PASSIVE (HOST|SERVICE) CHECK: %(old)s;/PASSIVE \1 CHECK: %(new)s;/
+s/(HOST|SERVICE) NOTIFICATION: ([^;]+);%(old)s;/\1 NOTIFICATION: \2;%(new)s;/
+''' % { "old" : oldregex, "new" : newname }
+
+ handled_files = []
+
+ command = ["sed", "-ri", "--file=/dev/fd/0"]
+ p = subprocess.Popen(command + file_paths, stdin=subprocess.PIPE,
+ stdout=open(os.devnull, "w"), stderr=subprocess.STDOUT,
+ close_fds=True)
+ p.communicate(sed_commands)
+ # TODO: error handling?
+
+ handled_files += file_paths
+
+ return handled_files
+
+
# Returns True in case files were found, otherwise False
def rename_host_in_files(self, path_pattern, old, new, extended_regex=False):
paths = glob.glob(path_pattern)
@@ -452,6 +499,11 @@ s/(HOST|SERVICE) NOTIFICATION: ([^;]+);%(old)s;/\1 NOTIFICATION: \2;%(new)s;/
return False
+ def _escape_name_for_regex_matching(self, name):
+ return name.replace(".", "[.]")
+
+
+
automations.register(AutomationRenameHosts())