Module: check_mk
Branch: master
Commit: 9823d40205f9321bd5977b36bfe64d078c8df597
URL:
http://git.mathias-kettner.de/git/?p=check_mk.git;a=commit;h=9823d40205f932…
Author: Mathias Kettner <mk(a)mathias-kettner.de>
Date: Fri Feb 25 12:20:18 2011 +0100
h3c_lanswitch_cpu: make conform to new guidelines
---
README.writing_checks | 45 +++++++++++++++++++++++++++++--
checks/h3c_lanswitch_cpu | 65 ++++++++++++++++++++++++++-------------------
2 files changed, 79 insertions(+), 31 deletions(-)
diff --git a/README.writing_checks b/README.writing_checks
index 308cb82..687720b 100644
--- a/README.writing_checks
+++ b/README.writing_checks
@@ -1,5 +1,7 @@
This file will help you to write *good* checks for Check_MK.
+Naming
+
* Check file names should be named short and unique. They must consist
only of lower case characters, digits and underscores and begin
with a lower case character.
@@ -16,10 +18,9 @@ This file will help you to write *good* checks for Check_MK.
not be named after the vendor but after the MIB. An example are the
hr_* checks.
-In case of SNMP checks you might use the name of the MIB you fetch data from as
- part of check name
-Header notes:
+Coding style
+
* If the check is contributed by a third party (like you), you must
add your name and your email address.
@@ -82,6 +83,44 @@ Header notes:
You can use it to get nagios state string from nagios return codes. e.g.:
nagios_state_names[0] gives you 'OK'.
+SNMP Scan function:
+
+* Every SNMP check *must* have an SNMP scan function. That function
+ should be as *minimal* as possible: It should only fire at those devices
+ that really can support the check. Reason: unnecessary SNMP walks
+ on devices not supporting that check must be avoided.
+ The scan function must on the other hand not be so strict that it
+ rules out devices where the check would work. If in conflict between
+ these two issues then rather make the scan function not too strict.
+ The scan function should avoid fetching non-standard-OIDs by any
+ means. It should rather try to use the basic SMIv2 OIDs as these will
+ already have been fetched and cached by the scan functions of other checks!
+ All scan functions of all checks together should fetch as few OIDs as
+ possible!
+
+Other issues:
+
+* Default values for check parameters (e.g. switch_cpu_default_levels) must be
+ chosen in a way that they make sense for *everybody*, not just for your special
+ case. In case you are not sure then rather choose too loose than too tight levels.
+ This helps avoid false alarms.
+
+* If the same configuration variable is used in multiple checks, *all* of them
+ must set a default value and all those values must be identical!
+
+* Your check should assume that the agent is always producing valid data. It
+ should *not* try to handle cases where the agent output is broken. This is
+ handled by Check_MK via Python exceptions. Otherwise you would make the code
+ uglier and also disable the debug handler.
+
+* int() vs. saveint(), float() vs. savefloat(): int(s) will throw an exception if
+ s is not a valid number string (or empty). Then Check_MK will catch the exception
+ and make the check result "UNKNOWN" with a corresponding error message. saveint(s) will
+ assume 0, if s is not valid. Important: use saveint() in all places, where you know
+ or suspect that some device does not supply valid data *but* the check can work
+ with the rest of the data and produce useful results. Use int() in all other cases,
+ e.g. if the check does not make any sense if you have no valid data.
+
Manpages:
*
diff --git a/checks/h3c_lanswitch_cpu b/checks/h3c_lanswitch_cpu
index 6b799bb..227411e 100644
--- a/checks/h3c_lanswitch_cpu
+++ b/checks/h3c_lanswitch_cpu
@@ -34,11 +34,38 @@
# SNMPv2-SMI::enterprises.43.45.1.6.1.1.1.3.13 = Gauge32: 16
# Reasonably low warning and crit levels
-h3c_lanswitch_cpu_default_levels = (50, 75)
+switch_cpu_default_levels = (50, 75)
+
+
+# We do not want to use the end OID as the item, since
+# we prefer "Switch 1 CPU 1" over "65537"...
+def h3c_lanswitch_cpu_genitem(item):
+ # decide switch class here (stacked or standalone/modular)
+ cpuid = int(item)
+
+ # if we have a cpuid lower than (hopefully) 256 it is not hashed with a unit ID
+ if cpuid < 256:
+ switchid = 1
+ cputype = "Slot"
+ cpunum = cpuid
+
+ # othwise, if above 64k it is a known stackable switch
+ elif cpuid >= 65536:
+ switchid = cpuid / 65536
+ cputype = "CPU"
+ cpunum = cpuid % 65536
+
+ # if we end up here 3com has added another hash method.
+ else:
+ switchid = 1
+ cputype = "Unknown"
+ cpunum = cpuid
+ return ("Switch %d %s %d" % (switchid, cputype, cpunum))
def inventory_h3c_lanswitch_cpu(checkname, info):
- return [ (h3c_lanswitch_cpu_genitem(line[0]),
"h3c_lanswitch_cpu_default_levels") for line in info ]
+ return [ (h3c_lanswitch_cpu_genitem(line[0]), "switch_cpu_default_levels")
for line in info ]
+
def check_h3c_lanswitch_cpu(item, params, info):
warn, crit = params
@@ -46,7 +73,7 @@ def check_h3c_lanswitch_cpu(item, params, info):
if h3c_lanswitch_cpu_genitem(line[0]) == item:
util = int(line[1])
infotext = (" - average usage was %d%% over last 5 minutes." % util)
- perfdata = [ ( "Usage", util, warn, crit, 0) ]
+ perfdata = [ ( "usage", util, warn, crit, 0) ]
if util > crit:
return (2, "CRIT" + infotext, perfdata)
@@ -54,38 +81,20 @@ def check_h3c_lanswitch_cpu(item, params, info):
return (1, "WARN" + infotext, perfdata)
else:
return (0, "OK" + infotext, perfdata)
- return (3, "UNKNOWN - Unit/CPU %s not found" % item)
-# decide switch class here (stacked or standalone/modular) and make a check item for it.
-def h3c_lanswitch_cpu_genitem(item):
- cpuid = int(item)
-# if we have a cpuid lower than 512 it is not hashed with a unit ID
- if cpuid < 256:
- switchid = 1
- cputype = "Slot"
- cpunum = cpuid
-# othwise, if above 64k it is a known stackable switch
- elif cpuid >= 65536:
- switchid = cpuid / 65536
- cputype = "CPU"
- cpunum = cpuid % 65536
-# if we end up here 3com has added another hash method.
- else:
- switchid = 1
- cputype = "Unknown"
- cpunum = cpuid
- return ("Switch %d %s %d" % (switchid, cputype, cpunum))
+ return (3, "UNKNOWN - %s not found" % item)
+
check_info["h3c_lanswitch_cpu"] = (check_h3c_lanswitch_cpu, "CPU Load
%s", 1, inventory_h3c_lanswitch_cpu )
-# get only the 5-min average load.
snmp_info["h3c_lanswitch_cpu"] = \
- ( "1.3.6.1.4.1.43.45.1.6.1.1.1", [ OID_END, "3" ] )
+ ( "1.3.6.1.4.1.43.45.1.6.1.1.1", [
+ OID_END,
+ "3" # 5-min average load.
+ ])
# just a rough match that will handle most devices.
snmp_scan_functions["h3c_lanswitch_cpu"] = \
- lambda oid: oid (".1.3.6.1.2.1.1.1.0").lower().startswith('3com
s')
-
-
+ lambda oid: oid(".1.3.6.1.2.1.1.1.0").lower().startswith('3com s')