Module: check_mk
Branch: master
Commit: c3e57e8b48441b6e7dc435215d2f6f074e87a20d
URL:
http://git.mathias-kettner.de/git/?p=check_mk.git;a=commit;h=c3e57e8b48441b…
Author: Mathias Kettner <mk(a)mathias-kettner.de>
Date: Thu Oct 27 15:14:32 2016 +0200
4036 Measurement of time used by Check_MK now includes time waited for agent
Now in the graph you can see how much time was spent waiting for the monitoring
agent.
---
.werks/4036 | 10 ++++++++++
ChangeLog | 1 +
modules/check_mk_base.py | 29 ++++++++++++++++++----------
modules/snmp.py | 2 ++
web/htdocs/metrics.py | 2 +-
web/plugins/metrics/check_mk.py | 42 +++++++++++++++++++++++++++++++++++++++--
6 files changed, 73 insertions(+), 13 deletions(-)
diff --git a/.werks/4036 b/.werks/4036
new file mode 100644
index 0000000..321f3a9
--- /dev/null
+++ b/.werks/4036
@@ -0,0 +1,10 @@
+Title: Measurement of time used by Check_MK now includes time waited for agent
+Level: 1
+Component: core
+Compatible: compat
+Version: 1.4.0i2
+Date: 1477574025
+Class: feature
+
+Now in the graph you can see how much time was spent waiting for the monitoring
+agent.
diff --git a/ChangeLog b/ChangeLog
index 6e1b24f..34fc4c8 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,6 @@
1.4.0i2:
Core & Setup:
+ * 4036 Measurement of time used by Check_MK now includes time waited for agent...
* 3648 FIX: Fixed log file parsing of host states...
* 3965 FIX: Nagios: Improved handling of broken core config...
diff --git a/modules/check_mk_base.py b/modules/check_mk_base.py
index f41cedf..3f58475 100644
--- a/modules/check_mk_base.py
+++ b/modules/check_mk_base.py
@@ -58,6 +58,7 @@ from cmk.regex import regex
import cmk.tty as tty
import cmk.render as render
import cmk.crash_reporting as crash_reporting
+import cmk.cpu_tracking as cpu_tracking
import cmk.paths
import cmk_base.agent_simulator
@@ -689,9 +690,12 @@ def get_agent_info(hostname, ipaddress, max_cache_age):
# as ssh or rsh or agent_vsphere) instead of a TCP connect.
commandline = get_datasource_program(hostname, ipaddress)
if commandline:
+ cpu_tracking.push_phase("ds")
output = get_agent_info_program(commandline)
else:
+ cpu_tracking.push_phase("agent")
output = get_agent_info_tcp(hostname, ipaddress)
+ cpu_tracking.pop_phase()
# Got new data? Write to cache file
write_cache_file(hostname, output)
@@ -1162,9 +1166,9 @@ def get_average(itemname, this_time, this_val, backlog_minutes,
initialize_zero
# This is the main check function - the central entry point to all and
# everything
def do_check(hostname, ipaddress, only_check_types = None):
- console.verbose("Check_mk version %s\n" % cmk.__version__)
+ cpu_tracking.start("busy")
+ console.verbose("Check_MK version %s\n" % cmk.__version__)
- start_time = time.time()
expected_version = agent_target_version(hostname)
@@ -1233,16 +1237,21 @@ def do_check(hostname, ipaddress, only_check_types = None):
if checkresult_file_fd != None:
close_checkresult_file()
- run_time = time.time() - start_time
- if check_mk_perfdata_with_times:
- if opt_keepalive:
- times = get_keepalive_times()
- else:
- times = os.times()
+ cpu_tracking.end()
+ phase_times = cpu_tracking.get_times()
+ total_times = phase_times["TOTAL"]
+ run_time = total_times[4]
+ if check_mk_perfdata_with_times:
output += "execution time %.1f sec|execution_time=%.3f user_time=%.3f
"\
- "system_time=%.3f children_user_time=%.3f
children_system_time=%.3f\n" %\
- (run_time, run_time, times[0], times[1], times[2], times[3])
+ "system_time=%.3f children_user_time=%.3f
children_system_time=%.3f" %\
+ (run_time, run_time, total_times[0], total_times[1], total_times[2],
total_times[3])
+
+ for phase, times in phase_times.items():
+ if phase in [ "agent", "snmp", "ds" ]:
+ t = times[4] - sum(times[:4]) # real time - CPU time
+ output += " cmk_time_%s=%.3f" % (phase, t)
+ output += "\n"
else:
output += "execution time %.1f sec|execution_time=%.3f\n" % (run_time,
run_time)
diff --git a/modules/snmp.py b/modules/snmp.py
index 8ce7743..3b0bffb 100644
--- a/modules/snmp.py
+++ b/modules/snmp.py
@@ -243,12 +243,14 @@ def perform_snmpwalk(hostname, ip, check_type, base_oid, fetchoid):
snmp_contexts = [None]
for context_name in snmp_contexts:
+ cpu_tracking.push_phase("snmp")
if is_inline_snmp_host(hostname):
rows = inline_snmpwalk_on_suboid(hostname, check_type, fetchoid, base_oid,
context_name=context_name,
ipaddress=ip)
else:
rows = snmpwalk_on_suboid(hostname, ip, fetchoid, context_name=context_name)
+ cpu_tracking.pop_phase()
# I've seen a broken device (Mikrotik Router), that broke after an
# update to RouterOS v6.22. It would return 9 time the same OID when
diff --git a/web/htdocs/metrics.py b/web/htdocs/metrics.py
index a9a17de..648f32e 100644
--- a/web/htdocs/metrics.py
+++ b/web/htdocs/metrics.py
@@ -25,7 +25,7 @@
# Boston, MA 02110-1301 USA.
# Frequently used variable names:
-# perf_data_string: Raw performance data as sent by the core, e.g
"foor=17M;1;2;4;5"
+# perf_data_string: Raw performance data as sent by the core, e.g
"foo=17M;1;2;4;5"
# perf_data: Split performance data, e.g. [("foo", "17",
"M", "1", "2", "4", "5")]
# translated_metrics: Completely parsed and translated into metrics, e.g. {
"foo" : { "value" : 17.0, "unit" : { "render" :
... }, ... } }
# color: RGB color representation ala HTML, e.g. "#ffbbc3" or
"#FFBBC3", len() is always 7!
diff --git a/web/plugins/metrics/check_mk.py b/web/plugins/metrics/check_mk.py
index 6fbbbf1..e73ae02 100644
--- a/web/plugins/metrics/check_mk.py
+++ b/web/plugins/metrics/check_mk.py
@@ -1714,7 +1714,7 @@ metric_info["hours_since_service"] = {
}
metric_info["execution_time"] = {
- "title" : _("Execution time"),
+ "title" : _("Total execution time"),
"unit" : "s",
"color" : "#d080af",
}
@@ -2285,6 +2285,25 @@ metric_info["livestatus_overflows_rate"] = {
"color" : "16/a",
}
+metric_info["cmk_time_agent"] = {
+ "title" : _("Time spent waiting for Check_MK agent"),
+ "unit" : "s",
+ "color" : "35/a",
+}
+
+metric_info["cmk_time_snmp"] = {
+ "title" : _("Time spent waiting for SNMP responses"),
+ "unit" : "s",
+ "color" : "36/a",
+}
+
+metric_info["cmk_time_ds"] = {
+ "title" : _("Time spent waiting for special agent"),
+ "unit" : "s",
+ "color" : "34/a",
+}
+
+
metric_info["num_open_events"] = {
"title" : _("Open events"),
"unit" : "count",
@@ -5907,6 +5926,25 @@ graph_info.append({
(
"user_time,children_user_time,system_time,children_system_time,+,+,+#888888",
"line", _("Total") ),
],
"omit_zero_metrics" : True,
+ "conflicting_metrics" : [ "cmk_time_agent",
"cmk_time_snmp", "cmk_time_ds" ],
+})
+
+
+graph_info.append({
+ "title" : _("Time usage by phase"),
+ "metrics" : [
+ ( "user_time,children_user_time,+", "stack", _("CPU
time in user space") ),
+ ( "system_time,children_system_time,+", "stack", _("CPU
time in operating system") ),
+ ( "cmk_time_agent", "stack" ),
+ ( "cmk_time_snmp", "stack" ),
+ ( "cmk_time_ds", "stack" ),
+ ( "execution_time", "line" ),
+ ],
+ "optional_metrics" : [
+ "cmk_time_agent",
+ "cmk_time_snmp",
+ "cmk_time_ds"
+ ],
})
graph_info.append({
@@ -5952,7 +5990,7 @@ graph_info.append({
"load1:warn",
"load1:crit",
],
- "optional_metrics" : [ "load15" ],
+ "optional_metrics" : [ "load15", ],
})
graph_info.append({