Module: check_mk
Branch: master
Commit: ca2dcbb41c3b87bf28fc76f982b9b1da1af28261
URL: http://git.mathias-kettner.de/git/?p=check_mk.git;a=commit;h=ca2dcbb41c3b87bf28fc76f982b9b1da1af28261
Author: Andreas Boesl <ab(a)mathias-kettner.de>
Date: Mon May 26 12:40:09 2014 +0200
ps: improved/fixed calculation of CPU utilization
Previously, the CPU utilization value was taken from the <tt>pcpu</tt> output field
of the ps command. This value did not reflect the actual utilization since the last
check, because ps defines it as follows:
<pre>
CPU usage is currently expressed as the percentage of time spent running
during the entire lifetime of a process. This is not ideal, and it does not
conform to the standards that ps otherwise conforms to. CPU usage is
unlikely to add up to exactly 100%.
</pre>
The evaluation of the <tt>pcpu</tt> field has been removed and replaced
by the field <tt>cputime</tt>, which reflects the number of CPU seconds consumed since
program start.
With <tt>cputime</tt>, the utilization since the last check can be determined correctly.
To use this new calculation method, you need to update the check_mk_agent on the
target host.
The ps check itself is able to handle both formats, <tt>pcpu</tt> and <tt>cputime</tt>.
---
.werks/925 | 24 ++++++++++++++++++++++++
ChangeLog | 1 +
agents/check_mk_agent.linux | 3 +--
checks/ps | 25 ++++++++++++++++++++-----
4 files changed, 46 insertions(+), 7 deletions(-)
diff --git a/.werks/925 b/.werks/925
new file mode 100644
index 0000000..4bca5de
--- /dev/null
+++ b/.werks/925
@@ -0,0 +1,24 @@
+Title: ps: improved/fixed calculation of CPU utilization
+Level: 2
+Component: checks
+Version: 1.2.5i3
+Date: 1401100287
+Class: feature
+
+Previously, the CPU utilization value was taken from the output <tt>pcpu</tt> from
+the ps command. This value didn't reflect the exact utilization since the last check
+because its definition is <br>
+<pre>
+CPU usage is currently expressed as the percentage of time spent running
+during the entire lifetime of a process. This is not ideal, and it does not
+conform to the standards that ps otherwise conforms to. CPU usage is
+unlikely to add up to exactly 100%.
+</pre>
+
+The evaluation of the <tt>pcpu</tt> field has been removed and got replaced
+by the field <tt>cputime</tt>, which reflects the number of cpu seconds since program start.
+With the <tt>cputime</tt> we are able to determine the correct value.
+
+To utilize this new calculation method, you need to update the check_mk_agent on the target host.
+The ps check itself is able to handle both formats, <tt>pcpu</tt> and <tt>cputime</tt>.
+
diff --git a/ChangeLog b/ChangeLog
index ff51c64..f4aff67 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -56,6 +56,7 @@
NOTE: Please refer to the migration notes!
* 0920 blade_bays: now also detects if blade server is switched off
* 0977 check_traceroute: new active check for checking presence and absence of
routes...
+ * 0925 ps: improved/fixed calculation of CPU utilization...
* 0777 FIX: special agent emcvnx: did not work with security file authentication...
* 0786 FIX: zfsget: fixed compatibility with older Solaris agents...
* 0809 FIX: brocade_fcport: Fixed recently introduced problem with port speed
detection
diff --git a/agents/check_mk_agent.linux b/agents/check_mk_agent.linux
index 1c9400a..59e9c76 100755
--- a/agents/check_mk_agent.linux
+++ b/agents/check_mk_agent.linux
@@ -171,8 +171,7 @@ grep ^/dev < /proc/mounts
# processes including username, without kernel processes
echo '<<<ps>>>'
-ps ax -o user,vsz,rss,pcpu,command --columns 10000 | sed -e 1d -e 's/ *\([^ ]*\)
*\([^ ]*\) *\([^ ]*\) *\([^ ]*\) */(\1,\2,\3,\4) /'
-
+ps ax -o user,vsz,rss,cputime,pid,command --columns 10000 | sed -e 1d -e 's/ *\([^
]*\) *\([^ ]*\) *\([^ ]*\) *\([^ ]*\) *\([^ ]*\) */(\1,\2,\3,\4,\5) /'
# Memory usage
echo '<<<mem>>>'
diff --git a/checks/ps b/checks/ps
index 01bbe5b..1c6f2be 100644
--- a/checks/ps
+++ b/checks/ps
@@ -76,13 +76,14 @@ ANY_USER = None
GRAB_USER = False
def ps_parse_info(info):
- result = []
+ ps_result = []
lines = iter(info)
wmic_info = {}
use_wmic_info = True
cpu_cores = 1
+
try:
- is_wmic = False
+ is_wmic = False
while True:
line = lines.next()
if line[-1] == '[wmic process]':
@@ -101,7 +102,7 @@ def ps_parse_info(info):
# We need to determine the number of cpu_cores without the wmic_info
if line[2].lower() == "system idle process":
cpu_cores = int(line[1][1:-1].split(",")[9])
- result.append(line)
+ ps_result.append(line)
else:
row = dict(zip(wmic_headers, line))
wmic_info[(row["node"], row["Name"],
row["ProcessId"])] = row
@@ -119,7 +120,7 @@ def ps_parse_info(info):
return value
info = []
seen_pids = set([]) # Remove duplicate entries
- for line in result:
+ for line in ps_result:
psinfo = get_ps_info(line[0], line[1])
# Get number of CPU cores from system idle process
if psinfo:
@@ -372,7 +373,21 @@ def check_procs(item, params, info, with_perfdata):
percent_cpu += user_perc + kernel_perc
handle_count += handle_c
else:
- percent_cpu += savefloat(addinfo[3])
+ if ":" in addinfo[3]:
+ if "-" in addinfo[3]:
+ tokens = addinfo[3].split("-")
+ days = int(tokens[0])
+ addinfo[3] = tokens[1]
+ else:
+ days = 0
+ pid = addinfo[4]
+ hours, minutes, seconds = map(int,
addinfo[3].split(":"))
+ total_seconds = 86400 * days + 3600 * hours + 60 * minutes +
seconds
+ timedif, cputime = get_counter("ps_stat.pcpu.%s" %
pid, now, total_seconds)
+ pcpu = cputime * 100
+ else:
+ pcpu = savefloat(addinfo[3])
+ percent_cpu += pcpu
extended_perfdata = True
if "cpulevels" in params: