Module: check_mk
Branch: master
Commit: 30d4a5ba7fc5ae7de00d96d3a381bf7d0e402660
URL: http://git.mathias-kettner.de/git/?p=check_mk.git;a=commit;h=30d4a5ba7fc5ae7de00d96d3a381bf7d0e402660
Author: Goetz Golla <gg(a)mathias-kettner.de>
Date: Fri Jun 5 18:14:15 2015 +0200
#2119 omd_status: check can now work in a cluster environment
---
.werks/2119 | 9 ++++++
ChangeLog | 1 +
checkman/omd_status | 5 +++
checks/omd_status | 86 +++++++++++++++++++++++++++++++++++++++------------
4 files changed, 81 insertions(+), 20 deletions(-)
diff --git a/.werks/2119 b/.werks/2119
new file mode 100644
index 0000000..df120d8
--- /dev/null
+++ b/.werks/2119
@@ -0,0 +1,9 @@
+Title: omd_status: check can now work in a cluster environment
+Level: 1
+Component: checks
+Compatible: compat
+Version: 1.2.7i2
+Date: 1433520823
+Class: feature
+
+
diff --git a/ChangeLog b/ChangeLog
index 244c288..fa36cee 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -6,6 +6,7 @@
* 2117 postfix_mailq: agent and check now retrieve and monitor active queue data as well
* 2325 oracle_tablespaces: correctly handle case where check runs in clustered mode...
* 2216 raritan_pdu_ocprot: New check to monitor overcurrent protectors of Raritan PDUs...
+ * 2119 omd_status: check can now work in a cluster environment
* 2323 FIX: df: Fix new graphs for all filesystem checks in case of existing inode information
* 2305 FIX: agent_vsphere, esx_vsphere_sensors: now able to handle sensor names with semicolon...
* 2118 FIX: aix_sap_processlist: agent plugin now is more general to deal with various AIX versions...
diff --git a/checkman/omd_status b/checkman/omd_status
index 362aad7..a51a9fb 100644
--- a/checkman/omd_status
+++ b/checkman/omd_status
@@ -10,6 +10,11 @@ description:
otherwise. If the site is only partially running then the check output
shows the names of the stopped services.
+ The check also works in a cluster environment, listing all nodes on which
+ the site is present together with their states. The overall state of the
+ service is then critical only when the site is partially running on one of
+ the nodes, or when the site is stopped on all nodes.
+
item:
The name of the site (string).
diff --git a/checks/omd_status b/checks/omd_status
index b3b56d0..4e47cf9 100644
--- a/checks/omd_status
+++ b/checks/omd_status
@@ -41,33 +41,79 @@
# OVERALL 2
def inventory_omd_status(info):
- for line in info:
- if line[0][0] == '[':
- yield line[0][1:-1], None
+ for site in info.keys():
+ yield site, None
-def check_omd_status(item, _no_params, info):
+def parse_omd_status(info):
active = False
+ parsed = {}
+
for line in info:
- if line[0] == '[' + item + ']':
+ if line[1][0] == '[':
+ item = line[1][1:-1]
+ # items may appear several times in clusters,
+ # so don't overwrite the previous node's result
+ if item not in parsed:
+ parsed[item] = {}
+ node = line[0]
+ parsed[item][node] = {}
+ parsed[item][node]["stopped"] = []
active = True
stopped = []
- elif active:
- if line[0] == 'OVERALL':
- if line[1] == '0':
- return (0, 'all services are running')
- elif line[1] == '1':
- return (2, 'site is stopped')
- else:
- return (2, 'partially running! stopped services: %s' % ", ".join(stopped))
- elif line[1] != '0':
- stopped.append(line[0])
- return (3, "site not existing or AUTOSTART off")
+ elif active and line[1] == 'OVERALL':
+ if line[2] == '0':
+ parsed[item][node]["overall"] = "running"
+ elif line[2] == '1':
+ parsed[item][node]["overall"] = "stopped"
+ active = False
+ elif active and line[2] != '0':
+ parsed[item][node]["stopped"].append(line[1])
+ parsed[item][node]["overall"] = "partially"
+
+ return parsed
+
+
+def check_omd_status(item, _no_params, info):
+
+ parsed_site = info[item]
+ number_nodes = len(parsed_site)
+ stopped_nodes = 0
+
+ for node, services in parsed_site.iteritems():
+ if services["overall"] == "stopped":
+ stopped_nodes += 1
+
+ # stopped sites are only CRIT when all are stopped
+ if stopped_nodes == number_nodes:
+ state = 2
+ else:
+ state = 0
+ for node, services in parsed_site.iteritems():
+ if node:
+ node_text = " on %s" % node
+ else:
+ node_text = ""
+ if services["overall"] == "running":
+ infotext = "running%s" % node_text
+ # running sites are always OK
+ yield 0, infotext
+ elif services["overall"] == "stopped":
+ infotext = "stopped%s" % node_text
+ # stopped sites are only CRIT when all are stopped
+ yield state, infotext
+ else:
+ infotext = "partially running%s, stopped services: " % node_text
+ infotext += ", ".join(services["stopped"])
+ # partially running sites are always CRIT
+ yield 2, infotext
check_info["omd_status"] = {
- 'check_function': check_omd_status,
- 'inventory_function': inventory_omd_status,
- 'service_description': 'OMD %s status',
- 'group': 'omd_status',
+ 'check_function' : check_omd_status,
+ 'inventory_function' : inventory_omd_status,
+ 'parse_function' : parse_omd_status,
+ 'service_description' : 'OMD %s status',
+ 'group' : 'omd_status',
+ 'node_info' : True,
}