Module: check_mk
Branch: master
Commit: ce6e7cff98819ae6bce5587e78bc3886c970f153
URL:
http://git.mathias-kettner.de/git/?p=check_mk.git;a=commit;h=ce6e7cff98819a…
Author: Mathias Kettner <mk(a)mathias-kettner.de>
Date: Fri Jan 23 15:38:31 2015 +0100
#1918 ps: new option for checking the age of a process (on Linux)
The process check now shows the age of processes (the total elapsed time since
the start of a process) and also allows setting upper limits on it. Currently
this is only implemented in the Linux agent. You need to update that agent if
you want to make use of the new feature.
---
.werks/1918 | 12 +++++++
ChangeLog | 1 +
agents/check_mk_agent.linux | 2 +-
checks/ps | 4 ++-
checks/ps.include | 60 +++++++++++++++++++++++++++++-----
web/plugins/wato/check_parameters.py | 9 +++++
6 files changed, 77 insertions(+), 11 deletions(-)
diff --git a/.werks/1918 b/.werks/1918
new file mode 100644
index 0000000..9a9b3ab
--- /dev/null
+++ b/.werks/1918
@@ -0,0 +1,12 @@
+Title: ps: new option for checking the age of a process (on Linux)
+Level: 1
+Component: checks
+Compatible: compat
+Version: 1.2.7i1
+Date: 1422023842
+Class: feature
+
+The process check now shows the age of processes (the total elapsed time since
+the start of a process) and also allows setting upper limits on it. Currently
+this is only implemented in the Linux agent. You need to update that agent if
+you want to make use of the new feature.
diff --git a/ChangeLog b/ChangeLog
index 8921fed..0f8fa5f 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -51,6 +51,7 @@
* 1673 netapp_volumes: now able to configure levels by magic factor
* 1854 netscaler_tcp_conns: new check to monitor tcp connections on Citrix Netscaler
Loadbalancer Appliances
* 1857 ibm_svc_portsas: new check and extended special agent for IBM SVC / Storwize
V3700 / V7000 devices
+ * 1918 ps: new option for checking the age of a process (on Linux)...
* 1457 FIX: logins: new check renamed from "users" check...
NOTE: Please refer to the migration notes!
* 1762 FIX: lnx_thermal: Now ignoring trip points with level 0...
diff --git a/agents/check_mk_agent.linux b/agents/check_mk_agent.linux
index 165170e..4d8fc0f 100755
--- a/agents/check_mk_agent.linux
+++ b/agents/check_mk_agent.linux
@@ -196,7 +196,7 @@ grep ^/dev < /proc/mounts
# processes including username, without kernel processes
echo '<<<ps>>>'
-ps ax -o user,vsz,rss,cputime,pid,command --columns 10000 | sed -e 1d -e 's/ *\([^
]*\) *\([^ ]*\) *\([^ ]*\) *\([^ ]*\) *\([^ ]*\) */(\1,\2,\3,\4,\5) /'
+ps ax -o user,vsz,rss,cputime,etime,pid,command --columns 10000 | sed -e 1d -e 's/
*\([^ ]*\) *\([^ ]*\) *\([^ ]*\) *\([^ ]*\) *\([^ ]*\) *\([^ ]*\) */(\1,\2,\3,\4\/\5,\6)
/'
# Memory usage
echo '<<<mem>>>'
diff --git a/checks/ps b/checks/ps
index 8adc9bc..091f7bf 100644
--- a/checks/ps
+++ b/checks/ps
@@ -37,6 +37,8 @@
# (user, virtual_size, resident_size, %cpu, processID, pagefile_usage, usermodetime,
kernelmodetime, openHandles, threadCount) name
#(\\KLAPPRECHNER\ab,29284,2948,0,3124,904,400576,901296,35,1) NOTEPAD.EXE
+# Sixth generation (>=1.2.7) adds an optional etime, joined by "/" with the CPU time
+
# The plugin "psperf.bat" is deprecated. As of version 1.2.5 all of this
information
# is reported by the windows agent itself. However, we still support sections from
psperf.bat
# if the agent version is lower than 1.2.5.
@@ -136,7 +138,7 @@ def ps_parse_info(info):
kernelc = int(psinfo["KernelModeTime"]) # do not resolve
counter here!
handlec = int(psinfo.get("HandleCount", 0)) # Only in
newer psperf.bat versions
threadc = int(psinfo["ThreadCount"]) # do not resolve
counter here!
- line[1:1] = [ "(unknown,%d,%d,0,%d,%d,%d,%d,%d,%d)" %
+ line[1:1] = [ "(unknown,%d,%d,0,%d,%d,%d,%d,%d,%d,)" %
(virt, resi, pid, pagefile, userc, kernelc, handlec,
threadc) ]
info.append(line)
return cpu_cores, info
diff --git a/checks/ps.include b/checks/ps.include
index b3b6a47..450e60d 100644
--- a/checks/ps.include
+++ b/checks/ps.include
@@ -191,6 +191,24 @@ def process_matches(ps, procname, l_user):
# "user" : "foo",
# "levels" : (1, 1, 99999, 99999)
# }
+# Parse a time as printed by ps ("[[DD-]HH:]MM:SS") into seconds.
+# Example 1: "12:17"      -> 737
+# Example 2: "55:12:17"   -> 198737
+# Example 3: "7-12:34:59" -> 650099 (with 7 days)
+# Leading fields that are missing count as zero, so a bare "59" yields 59.
+# Raises ValueError if a field is not an integer.
+def parse_ps_time(text):
+    days = 0
+    if "-" in text:
+        # Only the first "-" separates the day count from the clock part.
+        day_text, text = text.split("-", 1)
+        days = int(day_text)
+    # Build a real list: the result of map() has no len() on Python 3.
+    parts = [int(part) for part in text.split(":")]
+    # Left-pad with zeros so that we always get hours, minutes, seconds.
+    hours, minutes, seconds = ([0, 0, 0] + parts)[-3:]
+
+    return 86400 * days + 3600 * hours + 60 * minutes + seconds
def check_ps_common(item, params, info, cpu_cores = 1, info_name = "processes"
):
now = time.time()
@@ -219,6 +237,8 @@ def check_ps_common(item, params, info, cpu_cores = 1, info_name =
"processes" )
resident_size = 0
handle_count = 0
percent_cpu = 0.0
+ max_elapsed = None
+ min_elapsed = None
extended_perfdata = False
# The counter names for the ps check are quite volatile, because there is
@@ -240,7 +260,7 @@ def check_ps_common(item, params, info, cpu_cores = 1, info_name =
"processes" )
virtual_size += int(addinfo[1]) # kB
resident_size += int(addinfo[2]) # kB
if len(addinfo) >= 10: # even more data: processId,
pagefile_usage, usermodetime, kernelmodetime, threadCount, openHandles
- pid, pagefile_usage, user_c, kernel_c, handle_c = map(int,
addinfo[4:9])
+ pid, pagefile_usage, user_c, kernel_c, handle_c = map(saveint,
addinfo[4:9])
counter_wrapped = False
try:
user_per_sec = get_rate("ps_wmic.user.%d" % pid,
now, user_c, onwrap=RAISE)
@@ -261,16 +281,22 @@ def check_ps_common(item, params, info, cpu_cores = 1, info_name =
"processes" )
percent_cpu += user_perc + kernel_perc
handle_count += handle_c
else:
+ # addinfo[3] contains the used CPU time and possibly, separated
by /, also
+ # the total elapsed time since the birth of the process.
if ":" in addinfo[3]:
- if "-" in addinfo[3]:
- tokens = addinfo[3].split("-")
- days = int(tokens[0])
- addinfo[3] = tokens[1]
+ if '/' in addinfo[3]:
+ pcpu_text, elapsed_text = addinfo[3].split('/')
+ elapsed = parse_ps_time(elapsed_text)
+ max_elapsed = max(max_elapsed, elapsed)
+ if min_elapsed == None:
+ min_elapsed = elapsed
+ else:
+ min_elapsed = min(min_elapsed, elapsed)
else:
- days = 0
+ pcpu_text = addinfo[3]
+
+ total_seconds = parse_ps_time(pcpu_text)
pid = addinfo[4]
- hours, minutes, seconds = map(int,
addinfo[3].split(":"))
- total_seconds = 86400 * days + 3600 * hours + 60 * minutes +
seconds
try:
cputime = get_rate("ps_stat.pcpu.%s" % pid,
now, total_seconds, onwrap=RAISE)
except MKCounterWrapped, e:
@@ -280,6 +306,7 @@ def check_ps_common(item, params, info, cpu_cores = 1, info_name =
"processes" )
else:
pcpu = savefloat(addinfo[3])
percent_cpu += pcpu
+
extended_perfdata = True
if "cpulevels" in params:
@@ -287,7 +314,6 @@ def check_ps_common(item, params, info, cpu_cores = 1, info_name =
"processes" )
else:
warn_cpu, crit_cpu = None, None
-
warnmin, okmin, okmax, warnmax = params["levels"]
perfdata = [ ("count", count, okmax+1, warnmax+1, 0) ]
if count == 0 and not extended_perfdata:
@@ -306,6 +332,7 @@ def check_ps_common(item, params, info, cpu_cores = 1, info_name =
"processes" )
perfdata.append(("pcpu", percent_cpu, warn_cpu, crit_cpu))
infotext = "%d %s" % (count, info_name)
+
if running_on:
infotext += " [running on %s]" % ", ".join(running_on)
@@ -359,4 +386,19 @@ def check_ps_common(item, params, info, cpu_cores = 1, info_name =
"processes" )
state = max(state, 1)
infotext += "(!)"
+    if min_elapsed is not None:
+        if min_elapsed == max_elapsed:
+            infotext += ", running for %s" % get_age_human_readable(min_elapsed)
+        else:
+            infotext += ", youngest running for %s" % get_age_human_readable(min_elapsed)
+            infotext += ", oldest running for %s" % get_age_human_readable(max_elapsed)
+        if "max_age" in params:
+            warn_age, crit_age = params["max_age"]  # (warning, critical) upper limits
+            if max_elapsed >= crit_age:  # the oldest matching process decides
+                state = 2
+                infotext += "(!!)"
+            elif max_elapsed >= warn_age:  # warning band: warn_age <= age < crit_age
+                state = max(state, 1)
+                infotext += "(!)"
+
return state, infotext, perfdata
diff --git a/web/plugins/wato/check_parameters.py b/web/plugins/wato/check_parameters.py
index 61a3774..adf4487 100644
--- a/web/plugins/wato/check_parameters.py
+++ b/web/plugins/wato/check_parameters.py
@@ -476,6 +476,15 @@ process_level_elements = [
default_value = 15,
)
),
+ ( "max_age",
+ Tuple(
+ title = _("Maximum allowed age"),
+ help = _("Alarms you if the age of the process (not the consumed CPU time,
but the real time) exceed the configured levels."),
+ elements = [
+ Age(title=_("Warning at:"), default_value = 3600,),
+ Age(title=_("Critical at:"), default_value = 7200),
+ ]
+ )),
( "virtual_levels",
Tuple(
title = _("Virtual memory usage"),