Check_MK Git: check_mk: #1675 esx_vsphere_hostsystem.cpu_util_cluster: Averaged CPU utilization of all cluster nodes - Checkmk git commits

26 Jan 2015

Module: check_mk
Branch: master
Commit: 30bcc57fce9c6fec48ece4cc0fc8e5526c532f3e
URL:   
http://git.mathias-kettner.de/git/?p=check_mk.git;a=commit;h=30bcc57fce9c6f…

Author: Andreas Boesl &lt;ab(a)mathias-kettner.de&gt;
Date:   Mon Jan 26 16:56:57 2015 +0100

#1675 esx_vsphere_hostsystem.cpu_util_cluster: Averaged CPU utilization of all cluster
nodes

This new check allows you to monitor the average CPU utilization over all cluster nodes.
Each node provides information about the total available MHz and the MHz actually used for
computing.
The actual usage of a cluster is calculated by the formula <i>(total_used_mhz /
total_avail_mhz) * 100</i>

Via WATO you can configure a set of WARN and CRIT levels for specific node counts.

---

 .werks/1675                                       |   13 ++++
 ChangeLog                                         |    1 +
 checkman/esx_vsphere_hostsystem.cpu_util_cluster  |   32 +++++++++
 checkman/esx_vsphere_hostsystem.mem_usage_cluster |    5 --
 checks/esx_vsphere_hostsystem                     |   76 +++++++++++++++++++++
 web/plugins/wato/check_parameters.py              |   29 ++++++++
 6 files changed, 151 insertions(+), 5 deletions(-)

diff --git a/.werks/1675 b/.werks/1675
new file mode 100644
index 0000000..07c644a
--- /dev/null
+++ b/.werks/1675
@@ -0,0 +1,13 @@
+Title: esx_vsphere_hostsystem.cpu_util_cluster: Averaged CPU utilization of all cluster
nodes
+Level: 1
+Component: checks
+Compatible: compat
+Version: 1.2.7i1
+Date: 1422287199
+Class: feature
+
+This new check allows you to monitor the average CPU utilization over all cluster nodes.
+Each node provides information about the total available MHz and the MHz actually used for
computing.
+The actual usage of a cluster is calculated by the formula <i>(total_used_mhz /
total_avail_mhz) * 100</i>
+
+Via WATO you can configure a set of WARN and CRIT levels for specific node counts.
diff --git a/ChangeLog b/ChangeLog
index ed3e6d1..de4d9a5 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -54,6 +54,7 @@
     * 1857 ibm_svc_portsas: new check and extended special agent for IBM SVC / Storwize
V3700 / V7000 devices
     * 1918 ps: new option for checking the age of a process (on Linux)...
     * 1920 df: Linux filesystem check now supports displaying data reserved for root...
+    * 1675 esx_vsphere_hostsystem.cpu_util_cluster: Averaged CPU utilization of all
cluster nodes...
     * 1457 FIX: logins: new check renamed from "users" check...
             NOTE: Please refer to the migration notes!
     * 1762 FIX: lnx_thermal: Now ignoring trip points with level 0...
diff --git a/checkman/esx_vsphere_hostsystem.cpu_util_cluster
b/checkman/esx_vsphere_hostsystem.cpu_util_cluster
new file mode 100644
index 0000000..2d34c32
--- /dev/null
+++ b/checkman/esx_vsphere_hostsystem.cpu_util_cluster
@@ -0,0 +1,32 @@
+title: VMWare ESX host systems: CPU utilization of clusters
+agents: vsphere
+catalog: os/kernel
+license: GPL
+distribution: check_mk
+description:
+ This check measures the averaged CPU utilization for a cluster of ESX Host systems.
+ You can configure multiple WARN/CRIT levels depending on the number of nodes in this
cluster.
+ This check is only applicable for clusters and cannot be inventorized.
+ You need to create a manual check for it in WATO and assign it to a cluster host.
+
+examples:
+ static_checks['cpu_utilization_cluster'] = [
+   ( ('esx_vsphere_hostsystem.cpu_util_cluster', None,
+    [(2, (60.0, 70.0)),
+     (5, (70.0, 80.0)),
+     (7, (85.0, 90.0))
+    ]), [], ['esx_cluster'] ),
+ ] + static_checks['cpu_utilization_cluster']
+
+perfdata:
+  One value: The current usage in percent - ranging from 0 to 100.
+  The "maximum" value is not 100, but the number of CPU threads. This
+  can be used for scaling the graph in terms of the number of used CPU threads.
+
+[parameters]
+parameters (list): A list of tuples where you can define the number of
+ minimum required nodes and the corresponding levels.
+
+ One Element: ( number_of_nodes(int), ( WARN(float), CRIT(float) ) )
+
+ Please refer to the example.
diff --git a/checkman/esx_vsphere_hostsystem.mem_usage_cluster
b/checkman/esx_vsphere_hostsystem.mem_usage_cluster
index 1d019b3..4ab8bd0 100644
--- a/checkman/esx_vsphere_hostsystem.mem_usage_cluster
+++ b/checkman/esx_vsphere_hostsystem.mem_usage_cluster
@@ -17,11 +17,6 @@ examples:
     ]), [], ['esx_cluster'] ),
  ] + static_checks['mem_cluster']
 
-
-
-inventory:
- Not applicable
-
 perfdata:
  One value: the current total usage in bytes.
 
diff --git a/checks/esx_vsphere_hostsystem b/checks/esx_vsphere_hostsystem
index 7b9a57d..be09a49 100644
--- a/checks/esx_vsphere_hostsystem
+++ b/checks/esx_vsphere_hostsystem
@@ -148,7 +148,83 @@ check_info['esx_vsphere_hostsystem.mem_usage_cluster'] = {
 }
 
 
+#.
+#   .--CPU-Cluster---------------------------------------------------------.
+#   |        ____ ____  _   _        ____ _           _                    |
+#   |       / ___|  _ \| | | |      / ___| |_   _ ___| |_ ___ _ __         |
+#   |      | |   | |_) | | | |_____| |   | | | | / __| __/ _ \ '__|        |
+#   |      | |___|  __/| |_| |_____| |___| | |_| \__ \ ||  __/ |           |
+#   |       \____|_|    \___/       \____|_|\__,_|___/\__\___|_|           |
+#   |                                                                      |
+#   +----------------------------------------------------------------------+
+
+
+def check_esx_vsphere_hostsystem_cpu_util_cluster(item, params, info):
+    current_node = {}
+    def get_node_usage(node):
+        num_sockets    = int(node['hardware.cpuInfo.numCpuPackages'])
+        num_cores      = int(node['hardware.cpuInfo.numCpuCores'])
+        num_threads    = int(node['hardware.cpuInfo.numCpuThreads'])
+        used_mhz     = float(node['summary.quickStats.overallCpuUsage'])
+        mhz_per_core = float(node['hardware.cpuInfo.hz']) / 1024.0 / 1024.0
+        total_mhz = mhz_per_core * num_cores
+        return used_mhz, total_mhz, num_threads
+
+    overall_used    = []
+    overall_total   = []
+    overall_threads = []
+    for line in info:
+        if line[0] in [ "hardware.cpuInfo.numCpuPackages",
+                        "hardware.cpuInfo.numCpuCores",
+                        "hardware.cpuInfo.numCpuThreads",
+                        "summary.quickStats.overallCpuUsage",
+                        "hardware.cpuInfo.hz" ]:
+            current_node[line[0]] = line[1]
+        if len(current_node) == 5: # 5 keys -> node complete
+            node_used, node_total, node_threads = get_node_usage(current_node)
+            overall_used.append(node_used)
+            overall_total.append(node_total)
+            overall_threads.append(node_threads)
+            current_node = {}
+
+
+    sum_used    = sum(overall_used)
+    sum_total   = sum(overall_total)
+    sum_threads = sum(overall_threads)
+    usage       = sum_used / sum_total * 100
+    node_count  = len(overall_used)
+
+    # Convert legacy parameters
+    this_time = time.time()
+
+
+    sorted_params = sorted(params, reverse = True)
+    for count, levels in sorted_params:
+        if node_count >= count:
+            state, infotext, perfdata = check_cpu_util(usage, levels)
+            break
+    else:
+        state, infotext, perfdata = check_cpu_util(usage, None)
 
+    yield 0, "%d Nodes" % node_count
+    yield 0, "%.2fGHz/%.2fGHz" % (sum_used / 1024.0, sum_total / 1024.0)
+    yield 0, "%d threads" % sum_threads
+
+    # put number of threads as MAX value for first perf-data. This
+    # is needed by the PNP template.
+    perfdata_cpu = list(perfdata[0])
+    perfdata_cpu[-1] = sum_threads
+    perfdata = [ tuple(perfdata_cpu) ] + perfdata[1:]
+    yield state, infotext, perfdata
+
+
+check_info['esx_vsphere_hostsystem.cpu_util_cluster'] = {
+   "check_function"          : check_esx_vsphere_hostsystem_cpu_util_cluster,
+   "service_description"     : "CPU utilization",
+   "group"                   : "cpu_utilization_cluster",
+   "has_perfdata"            : True,
+   "includes"                : [ "cpu_util.include" ],
+}
 #   .--Memory--------------------------------------------------------------.
 #   |               __  __                                                 |
 #   |              |  \/  | ___ _ __ ___   ___  _ __ _   _                 |
diff --git a/web/plugins/wato/check_parameters.py b/web/plugins/wato/check_parameters.py
index 7455602..d3e87f0 100644
--- a/web/plugins/wato/check_parameters.py
+++ b/web/plugins/wato/check_parameters.py
@@ -2998,6 +2998,35 @@ register_check_parameters(
 )
 
 register_check_parameters(
+   subgroup_networking,
+   "cpu_utilization_cluster",
+   _("CPU Utilization of Clusters"),
+    ListOf(
+        Tuple(
+            elements = [
+                Integer(title = _("Equal or more than"), unit =
_("nodes")),
+                Tuple(
+                      elements = [
+                          Percentage(title = _("Warning at a utilization of"),
default_value = 90.0),
+                          Percentage(title = _("Critical at a utilization of"),
default_value = 95.0)
+                      ],
+                      title = _("Alert on too high CPU utilization"),
+                )
+            ]
+        ),
+        help = _("Configure levels for averaged CPU utilization depending on number
of cluster nodes. "
+                 "The CPU utilization sums up the percentages of CPU time that is
used "
+                 "for user processes and kernel routines over all available cores
within "
+                 "the last check interval. The possible range is from 0% to
100%"),
+        title = _("Memory Usage"),
+        add_label = _("Add limits")
+    ),
+   None,
+   "first",
+   False
+)
+
+register_check_parameters(
     subgroup_os,
     "esx_host_memory",
     _("Main memory usage of ESX host system"),