Module: check_mk
Branch: master
Commit: 9256d8286d12041815459342d9b658e50f30bb9e
URL:
http://git.mathias-kettner.de/git/?p=check_mk.git;a=commit;h=9256d8286d1204…
Author: Götz Golla <gg(a)mathias-kettner.de>
Date: Fri Jan 10 17:28:01 2014 +0100
job: check now monitors the time since last start of the job, limits can be configured in
WATO
---
.werks/78 | 8 ++++++++
ChangeLog | 1 +
checkman/job | 35 ++++++++++++++++++++++------------
checks/job | 27 +++++++++++++++++++-------
web/plugins/wato/check_parameters.py | 22 +++++++++++++++++++++
5 files changed, 74 insertions(+), 19 deletions(-)
diff --git a/.werks/78 b/.werks/78
new file mode 100644
index 0000000..a66346a
--- /dev/null
+++ b/.werks/78
@@ -0,0 +1,8 @@
+Title: job: check now monitors the time since last start of the job, limits can be configured in WATO
+Level: 1
+Component: checks
+Version: 1.2.5i1
+Date: 1389371210
+Class: feature
+
+
diff --git a/ChangeLog b/ChangeLog
index 9a5addd..a411a03 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -36,6 +36,7 @@
* 0254 agent_vsphere: Make handling of spaces in hostnames of ESX configurable...
* 0077 cmciii.psm_current, cmciii_psm_plugs, cmciii_io, cmciii.access, cmciii.temp,
cmciii.can_current, cmciii.sensor, cmciii.state: new sub checks included in one new check
cmcmiii superseding and improving several previous checks of the Rittal CMCIII device...
NOTE: Please refer to the migration notes!
+ * 0078 job: check now monitors the time since last start of the job, limits can be configured in WATO
* 0103 FIX: services: Fixed bug with service inventory defined in main.mk...
* 0299 FIX: borcade_mlx_fan: Prettified output, handling "other" state now
* 0300 FIX: cisco_fru_power: Trying not to inventorize not plugged in FRUs...
diff --git a/checkman/job b/checkman/job
index 128b82c..1d46c5c 100644
--- a/checkman/job
+++ b/checkman/job
@@ -5,25 +5,27 @@ license: GPL
distribution: check_mk
description:
This check monitors state and performance information of any linux program
- call like for example regular running cronjobs.
+ call, for example regular running cronjobs.
The check uses information provided by the wrapper program {mk-job}. This
program is shipped with the linux agent and installed to {/usr/bin}.
- {mk-job} is a wrapper which is called instead of the normal program. For
- example if you have a command line {nightly-backup >/dev/null} which gets
- executed by cronjob every night, you can change the command line to
+ {mk-job} is a wrapper which is called instead of the program. For
+ example, if you have a command line {nightly-backup >/dev/null} which gets
+ executed by a cronjob every night, you can change the command line to
{mk-job backup nightly-backup >/dev/null} to let mk-job collect information
- about the job during runtime while the string {backup} is the ident of the
- job to be executed. This ident must be an unique identifier for this job
- on each host. When the job finished, mk-job writes the collected data to
- {/var/lib/check_mk_agent/job/<job-id>}. The agent outputs
- all available data to the Check_MK server.
+ about the job during runtime. In this expression, the string {backup} is
+ the identifier of the job to be executed. It must be a unique identifier
+ for this job on each host. When the job is finished, {mk-job} writes the
+ collected data to {/var/lib/check_mk_agent/job/<identifier>}. The agent sends
+ all these data to the Check_MK server.
- At the moment this check has no parameters. It reports a {CRITICAL}
- state when the exit code of the job is not 0.
+ The check is {CRITICAL} if the exit code of the job is not {0} or the critical age
+ limit has been reached, and {WARNING} if only the warning age limit has been reached.
+
+ Limits can be configured with WATO.
item:
- The ident of the job defined by the first argument to {mk-job}.
+ The identifier of the job defined by the first argument to {mk-job}.
inventory:
One check per job will be created.
@@ -36,3 +38,12 @@ perfdata:
{writes}: Number of file system outputs by the process.
{max_res_bytes}: Maximum resident set size of the process during its lifetime.
{avg_mem_bytes}: Average total (data+stack+text) memory use of the process.
+
+[parameters]
+parameters(dict): parameters is a dictionary with one key
+
+ {"age"}: a tuple with the warning and critical limits for the time since the
+ last start of the job, in seconds. A limit of {0} disables that limit.
+
+[configuration]
+job_default_levels(dict): This variable is preset to {{ "age": ( 0, 0 ) }}, which means that the limits are disabled.
diff --git a/checks/job b/checks/job
index bf58809..046a4fd 100644
--- a/checks/job
+++ b/checks/job
@@ -48,12 +48,16 @@
#max_res_kbytes 1984
#avg_mem_kbytes 0
+factory_settings["job_default_levels"] = {
+ "age": ( 0, 0 ) # disabled as default
+}
+
def inventory_job(info):
inventory = []
for line in info:
if line[0] == '==>':
item = ' '.join(line[1:-1])
- inventory.append( (item, None) )
+ inventory.append( (item, {} ) )
return inventory
def job_parse_real_time(s):
@@ -91,7 +95,8 @@ def job_parse(item, info):
return data
-def check_job(item, _no_params, info):
+def check_job(item, params, info):
+ warn, crit = params.get('age')
job = job_parse(item, info)
if not job:
return 3, 'Got no information for this job'
@@ -124,6 +129,13 @@ def check_job(item, _no_params, info):
display_value = get_age_human_readable(value)
elif key == 'start_time':
display_value = time.strftime("%Y-%m-%d %H:%M:%S",
time.localtime(value) )
+ job_age = time.time() - value
+ if crit > 0 and job_age >= crit:
+ state = max(state, 2)
+ display_value += "(!!) (more than %s ago)" %
get_age_human_readable(crit)
+ elif warn > 0 and job_age >= warn:
+ state = max(state, 1)
+ display_value += "(!!) (more than %s ago)" %
get_age_human_readable(warn)
else:
display_value = value
@@ -133,9 +145,10 @@ def check_job(item, _no_params, info):
return state, ', '.join(output), perfdata
check_info["job"] = {
- 'check_function': check_job,
- 'inventory_function': inventory_job,
- 'service_description': 'Job %s',
- 'group': 'job',
- 'has_perfdata': True,
+ 'check_function' : check_job,
+ 'inventory_function' : inventory_job,
+ 'service_description' : 'Job %s',
+ 'default_levels_variable' : 'job_default_levels',
+ 'group' : 'job',
+ 'has_perfdata' : True,
}
diff --git a/web/plugins/wato/check_parameters.py b/web/plugins/wato/check_parameters.py
index e43c7ba..72898f8 100644
--- a/web/plugins/wato/check_parameters.py
+++ b/web/plugins/wato/check_parameters.py
@@ -2483,6 +2483,28 @@ register_check_parameters(
register_check_parameters(
subgroup_applications,
+ "job",
+ _("Age of jobs controlled by mk-job"),
+ Dictionary(
+ elements = [
+ ("age",
+ Tuple(
+ title = _("Maximum time since last start of job execution"),
+ elements = [
+ Age(title = _("Warning if above"), default_value = 0),
+ Age(title = _("Critical if above"), default_value = 0)
+ ]
+ )
+ )]
+ ),
+ TextAscii(
+ title = _("Job name"),
+ ),
+ None
+)
+
+register_check_parameters(
+ subgroup_applications,
"mssql_counters_locks",
_("MSSQL Locks"),
Dictionary(