Module: check_mk
Branch: master
Commit: 0512487a221448398e7cd019645dfa1aa6317da7
URL: http://git.mathias-kettner.de/git/?p=check_mk.git;a=commit;h=0512487a221448…
Author: Bastian Kuhn <bk(a)mathias-kettner.de>
Date: Tue Aug 21 14:12:23 2012 +0200
Added check_dns for wato active checks
---
checks/check_dns | 55 ++++++++++++++++++++++++++++++++++
web/plugins/wato/active_checks.py | 59 ++++++++++++++++++++++++++++++++++++-
2 files changed, 113 insertions(+), 1 deletions(-)
diff --git a/checks/check_dns b/checks/check_dns
new file mode 100644
index 0000000..4eea0b5
--- /dev/null
+++ b/checks/check_dns
@@ -0,0 +1,55 @@
+#!/usr/bin/python
+# -*- encoding: utf-8; py-indent-offset: 4 -*-
+# +------------------------------------------------------------------+
+# | ____ _ _ __ __ _ __ |
+# | / ___| |__ ___ ___| | __ | \/ | |/ / |
+# | | | | '_ \ / _ \/ __| |/ / | |\/| | ' / |
+# | | |___| | | | __/ (__| < | | | | . \ |
+# | \____|_| |_|\___|\___|_|\_\___|_| |_|_|\_\ |
+# | |
+# | Copyright Mathias Kettner 2012 mk(a)mathias-kettner.de |
+# +------------------------------------------------------------------+
+#
+# This file is part of Check_MK.
+# The official homepage is at http://mathias-kettner.de/check_mk.
+#
+# check_mk is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation in version 2. check_mk is distributed
+# in the hope that it will be useful, but WITHOUT ANY WARRANTY; with-
+# out even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE. See the GNU General Public License for more de-
+# tails. You should have received a copy of the GNU General Public
+# License along with GNU Make; see the file COPYING. If not, write
+# to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+# Boston, MA 02110-1301 USA.
+
+def check_dns_arguments(params):
+ description, settings = params
+ args = '-H %s' % quote_shell_string(description)
+
+ if "server" in settings:
+ args += ' -s %s' % quote_shell_string(settings["server"])
+
+ if "expected_address" in settings:
+ args += ' -a %s' % quote_shell_string(settings["expected_address"])
+
+ if "expected_authority" in settings:
+ args += ' -A %s' % quote_shell_string(settings["expected_authority"])
+
+ if "response_time" in settings:
+ args += ' -w %d' % int(settings["response_time"][0])
+ args += ' -c %d' % int(settings["response_time"][1])
+
+ if "timeout" in settings:
+ args += ' -t %d' % int(settings['timeout'])
+
+ return args
+
+active_check_info['dns'] = {
+ "command_line" : '$USER1$/check_dns $ARG1$',
+ "argument_function" : check_dns_arguments,
+ "service_description" : lambda params: "DNS %s" % params[0],
+ "has_perfdata" : False,
+}
+
diff --git a/web/plugins/wato/active_checks.py b/web/plugins/wato/active_checks.py
index a231038..38606f5 100644
--- a/web/plugins/wato/active_checks.py
+++ b/web/plugins/wato/active_checks.py
@@ -30,6 +30,64 @@ register_rulegroup("activechecks",
group = "activechecks"
register_rule(group,
+ "active_checks:dns",
+ Tuple(
+ title = _("Check DNS service"),
+ help = _("Check optain an IP address for a host or domain"
+ "It uses <tt>check_dns</tt> from standard plugins."),
+ elements = [
+ TextAscii(title = _("Hostname"), allow_empty = False,
+ help = _('The name or address you want to query')),
+ Dictionary(
+ title = _("Optional parameters"),
+ elements = [
+ ( "server",
+ TextAscii(
+ title = _("DNS Server"),
+ allow_empty = False,
+ help = _("Optional DNS server you want to use for the lookup"))),
+ ( "expected_address",
+ TextAscii(
+ title = _("Expected Address"),
+ allow_empty = False,
+ help = _("Optional IP-ADDRESS you expect the DNS server to return. HOST"
+ "must end with a dot (.) " )),
+ ),
+ ( "expected_authority",
+ TextAscii(
+ title = _("Expected Authority"),
+ allow_empty = False,
+ help = _("Optional expect the DNS server to be authoriative"
+ "for the lookup ")),
+ ),
+ ( "response_time",
+ Tuple(
+ title = _("Expected response time"),
+ elements = [
+ Float(
+ title = _("Warning at"),
+ unit = "sec",
+ default_value = 1),
+ Float(
+ title = _("Critical at"),
+ unit = "sec",
+ default_value = 2),
+ ])
+ ),
+ ( "timeout",
+ Integer(
+ title = _("Seconds before connection times out"),
+ unit = _("sec"),
+ default_value = 10,
+ )
+ ),
+ ]),
+ ]
+ ),
+ match = 'all')
+
+
+register_rule(group,
"active_checks:tcp",
Tuple(
title = _("Check connecting to a TCP port"),
@@ -165,7 +223,6 @@ register_rule(group,
match = 'all')
-
register_rule(group,
"active_checks:http",
Tuple(
Module: check_mk
Branch: master
Commit: f5d59f3f8d0b404341cd54f7493d7a5dac500d70
URL: http://git.mathias-kettner.de/git/?p=check_mk.git;a=commit;h=f5d59f3f8d0b40…
Author: Florian Heigl <fh(a)mathias-kettner.de>
Date: Tue Aug 21 13:27:46 2012 +0200
Add new bug entry
---
.bugs/724 | 9 +++++++++
1 files changed, 9 insertions(+), 0 deletions(-)
diff --git a/.bugs/724 b/.bugs/724
new file mode 100644
index 0000000..f84c29b
--- /dev/null
+++ b/.bugs/724
@@ -0,0 +1,9 @@
+Title: Notification delay WATO rules only save first service listed
+Component: wato
+State: open
+Date: 2012-08-21 13:25:39
+Targetversion: 1.2.0
+Class: bug
+
+WATO shows entries for multiple services if you try to define a notification
+delay, but only the first service named will also be in the stored ruleset.
Module: check_mk
Branch: master
Commit: 8b1f650c5bb673ab1e6502922c3a3bc5ac67d216
URL: http://git.mathias-kettner.de/git/?p=check_mk.git;a=commit;h=8b1f650c5bb673…
Author: Florian Heigl <fh(a)mathias-kettner.de>
Date: Mon Aug 20 17:24:17 2012 +0200
checks/megaraid_bbu: The check for BBU status now silences while a battery learn cycle is running
---
ChangeLog | 1 +
checkman/megaraid_bbu | 21 ++++++++++++++++-----
checks/megaraid_bbu | 22 ++++++++++++++--------
3 files changed, 31 insertions(+), 13 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index e34dad1..e704d88 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -9,6 +9,7 @@
* New Checks for Siemens Blades (BX600)
* New Checks for Fortigate Firewalls
* FIX: megaraid_pdisks: handle case where no enclosure device exists
+ * FIX: megaraid_bbu: handle the controller's learn cycle. No errors in that period.
* mysql_capacity: cleaned up check, levels are in MB now
* jolokia_info, jolokia_metrics: new rewritten checks for jolokia (formerly
jmx4perl). You need the new plugin mk_jokokia for using them
diff --git a/checkman/megaraid_bbu b/checkman/megaraid_bbu
index e4ea82f..0e7f83e 100644
--- a/checkman/megaraid_bbu
+++ b/checkman/megaraid_bbu
@@ -4,14 +4,25 @@ author: Florian Heigl <fh(a)mathias-kettner.de>
license: GPL
distribution: check_mk
description:
- This check monitors the existance and status of battery backup units on controllers that are based on the mid- and highend LSI Megaraid chipsets. The entry chipsets do not support BBUs. The Linux agent will try find any existing BBUs - if {MegaCli} is found in your search path.
- The BBUs might come in various types {(iBBU, BBU)} and also some other vendors are using this RAID chip. It is tested against Intel, Dell, IBM and FSC models.
+ This check monitors the existence and status of battery backup units on controllers
+ that are based on the mid- and high-end LSI Megaraid chipsets. The entry chipsets do
+ not support BBUs. The Linux agent will try to find any existing BBUs - if {MegaCli} is
+ found in your search path.
+ The BBUs might come in various types {(iBBU, BBU)} and also some other vendors are
+ using this RAID chip. It is tested against Intel, Dell, IBM and FSC models.
- The check works by matching the agent output against a dictionary of expected values. If you have MegaCli installed and some values are not detected, it might be neccessary to update your version of MegaCli.
+ The check works by matching the agent output against a dictionary of expected values.
+ If you have MegaCli installed and some values are not detected, it might be necessary
+ to update your version of MegaCli.
- {MegaCli} can be downloaded from LSI at the following URL {http://www.lsi.com/downloads/Public/MegaRAID%20Common%20Files/8.02.16_MegaCLI.zip}
+ {MegaCli} can be downloaded from LSI at the following URL
+ {http://www.lsi.com/downloads/Public/MegaRAID%20Common%20Files/8.02.16_MegaCLI.zip}
- It would be possible to make the warning / critical levels user specifiable. See the check source for this if you have a need to influence those parameters.
+ It would be possible to make the warning / critical levels user specifiable.
+ See the check source for this if you have a need to influence those parameters.
+ Most controllers run a "battery learn cycle" periodically or on user request.
+ The check detects this learn cycle and suppresses all errors while this cycle is active.
+ This affects all models that do not have a flash / capacitor based BBU system.
item:
A string "RAID Adapter/BBU" followed by the ID of the adapter as reported by MegaCli.
diff --git a/checks/megaraid_bbu b/checks/megaraid_bbu
index 1e0066b..be2f9b2 100644
--- a/checks/megaraid_bbu
+++ b/checks/megaraid_bbu
@@ -31,18 +31,19 @@
# Load a fake controller with known good values for the most
# important parameters only and try to define their importance
megaraid_bbu_refvalues = {
- 'Remaining Capacity Low' : ('No', 1),
- 'I2c Errors Detected' : ('No', 1),
+ 'Remaining Capacity Low' : ('No', 1), # nolearn
+ 'I2c Errors Detected' : ('No', 1),
'Temperature' : ('OK', 2),
'Pack is about to fail & should be replaced': ('No', 1),
- 'Charging Status' : ('None', 1),
- 'Battery State' : ('Operational', 2),
+ 'Charging Status' : ('None', 1), # nolearn
+ 'Battery State' : ('Operational', 2), # nolearn
'Learn Cycle Status' : ('OK', 1),
+ 'Learn Cycle Active' : ('Yes', 0),
'Battery Pack Missing' : ('No', 2),
'Battery Replacement required' : ('No', 1),
'Over Temperature' : ('No', 2),
'Over Charged' : ('No', 1),
- 'Voltage' : ('OK', 2),
+ 'Voltage' : ('OK', 2), # nolearn
}
@@ -80,19 +81,23 @@ def check_megaraid_bbu(item, _no_params, info):
# get current charge level
charge = (", Charge is %s" % controller['Relative State of Charge'])
+
# verify defined important parameters to current level
for varname, (refvalue, refstate) in megaraid_bbu_refvalues.items():
+ # the try/except should handle controller types that don't have certain values
+ # if your bbu chipset fails and you still get a partial response this will lead
+ # to a false result. but people asked for it :>
try:
- value = controller[varname]
- # build a list of all errors
if controller[varname] != refvalue:
broken.append("%s is %s, but should be %s(%s)" % (varname, value, refvalue, "!" * refstate))
state = max(state, refstate)
except:
pass
+ if controller["Learn Cycle Active"] == "Yes":
+ return (0, "OK - no states to check (controller is in learn cycle)" + charge)
# return assembled info
- if broken:
+ elif broken:
return (state, nagios_state_names[state] + " - " + ", ".join(broken) + charge)
else:
return (0, "OK - all states as expected" + charge)
@@ -100,4 +105,5 @@ def check_megaraid_bbu(item, _no_params, info):
return (3, "UNKNOWN - Check not implemented")
+
check_info["megaraid_bbu"] = (check_megaraid_bbu, "RAID Adapter/BBU %s", 0, inventory_megaraid_bbu)
Module: check_mk
Branch: master
Commit: f2c5a46d4d466f79094dffa89c615453a1950d33
URL: http://git.mathias-kettner.de/git/?p=check_mk.git;a=commit;h=f2c5a46d4d466f…
Author: Florian Heigl <fh(a)mathias-kettner.de>
Date: Thu Aug 16 19:16:28 2012 +0200
Add new Check for monitoring HSRP redundancy groups on Cisco routers
---
ChangeLog | 1 +
checkman/cisco_hsrp | 37 ++++++++++++++++
checks/cisco_hsrp | 117 +++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 155 insertions(+), 0 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index 0f12101..e34dad1 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -45,6 +45,7 @@
* carel_uniflair_cooling: new check for monitoring datacenter air conditioning by "CAREL"
* Added Agent for OpenBSD
* Added Checks for MasterGuard UPS devices
+ * cisco_hsrp: New Check for monitoring HSRP groups on Cisco Routers. (SMIv2 version)
WATO:
* Added permission to control the "clone host" feature in WATO
diff --git a/checkman/cisco_hsrp b/checkman/cisco_hsrp
new file mode 100644
index 0000000..77457f5
--- /dev/null
+++ b/checkman/cisco_hsrp
@@ -0,0 +1,37 @@
+title: Cisco HSRP group status
+agents: snmp
+author: Florian Heigl <fh(a)mathias-kettner.de>
+license: GPLv3
+distribution: check_mk
+description:
+ Cisco routers support virtual redundant interfaces by using HSRP
+ (Hot Standby Router Protocol).
+ HSRP is configured by assigning IP interfaces to a group that has a
+ "floating" virtual IP.
+ If, for example, a router has an interface failure on either of the routed
+ interfaces, the protocol lowers the routers priority, and if that priority
+ drops too low, it will fail over the virtual IP.
+
+
+ The check returns {OK} if the HSRP status is a good one ("active",
+ "standby") and the same as during inventory.
+ If the states flip from "active" to "standby" or vice-versa, the check goes
+ to {WARN}, assuming that HSRP is doing its job.
+
+ Should the status ever be a different one that, for example, points to HSRP
+ still initializing, the check will return {CRIT} as HSRP is assumed to be
+ inoperable in these states.
+
+perfdata:
+ none
+
+
+item:
+ The Virtual IP of the failover group.
+
+inventory:
+ The check reads the HSRP MIB and creates one service per virtual IP. It also
+ stores the HSRP state of
+ the IP as seen by the monitored device, normally either standby or active.
+ It also stores the ID of
+ the failover group. (For future use)
diff --git a/checks/cisco_hsrp b/checks/cisco_hsrp
new file mode 100644
index 0000000..5593cb0
--- /dev/null
+++ b/checks/cisco_hsrp
@@ -0,0 +1,117 @@
+#!/usr/bin/python
+
+
+
+# Good docs:
+# http://www.cisco.com/en/US/tech/tk648/tk362/technologies_tech_note09186a008…
+#.1.3.6.1.4.1.9.9.106.1.1.1.0 5
+# cHsrpGrpTable
+###########################
+#.1.3.6.1.4.1.9.9.106.1.2.1.1.2.1.192 "HSRP Secret key here"
+#.1.3.6.1.4.1.9.9.106.1.2.1.1.2.7.193 "HSRP Secret key here"
+#.1.3.6.1.4.1.9.9.106.1.2.1.1.3.1.192 100
+#.1.3.6.1.4.1.9.9.106.1.2.1.1.3.7.193 100
+#.1.3.6.1.4.1.9.9.106.1.2.1.1.4.1.192 1
+#.1.3.6.1.4.1.9.9.106.1.2.1.1.4.7.193 1
+#.1.3.6.1.4.1.9.9.106.1.2.1.1.5.1.192 300
+#.1.3.6.1.4.1.9.9.106.1.2.1.1.5.7.193 300
+#.1.3.6.1.4.1.9.9.106.1.2.1.1.6.1.192 2
+#.1.3.6.1.4.1.9.9.106.1.2.1.1.6.7.193 2
+#.1.3.6.1.4.1.9.9.106.1.2.1.1.7.1.192 0
+#.1.3.6.1.4.1.9.9.106.1.2.1.1.7.7.193 0
+#.1.3.6.1.4.1.9.9.106.1.2.1.1.8.1.192 0
+#.1.3.6.1.4.1.9.9.106.1.2.1.1.8.7.193 0
+#.1.3.6.1.4.1.9.9.106.1.2.1.1.9.1.192 3000
+#.1.3.6.1.4.1.9.9.106.1.2.1.1.9.7.193 3000
+#.1.3.6.1.4.1.9.9.106.1.2.1.1.10.1.192 10000
+#.1.3.6.1.4.1.9.9.106.1.2.1.1.10.7.193 10000
+#.1.3.6.1.4.1.9.9.106.1.2.1.1.11.1.192 192.168.10.4
+#.1.3.6.1.4.1.9.9.106.1.2.1.1.11.7.193 172.20.10.20 <- hsrp ip
+#.1.3.6.1.4.1.9.9.106.1.2.1.1.12.1.192 1
+#.1.3.6.1.4.1.9.9.106.1.2.1.1.12.7.193 1
+# HSRP Monitored IP interfaces. If any of those go down, the priority of
+# the router will be lowered.
+#.1.3.6.1.4.1.9.9.106.1.2.1.1.13.1.192 192.168.10.5 <- ip Router 1 int 1
+#.1.3.6.1.4.1.9.9.106.1.2.1.1.13.7.193 172.20.10.21 <- ip Router 2 int 7
+#.1.3.6.1.4.1.9.9.106.1.2.1.1.14.1.192 192.168.10.6 <- ip Router 1 int 1
+#.1.3.6.1.4.1.9.9.106.1.2.1.1.14.7.193 172.20.10.22 <- ip Router 2 int 7
+#.1.3.6.1.4.1.9.9.106.1.2.1.1.15.1.192 6 <- group #1 "standby" state
+#.1.3.6.1.4.1.9.9.106.1.2.1.1.15.7.193 6 <- group #2 "standby" state
+#.1.3.6.1.4.1.9.9.106.1.2.1.1.16.1.192 "00 00 0C 07 AC C0 "
+#.1.3.6.1.4.1.9.9.106.1.2.1.1.16.7.193 "00 00 0C 07 AC C1 "
+#.1.3.6.1.4.1.9.9.106.1.2.1.1.17.1.192 1
+#.1.3.6.1.4.1.9.9.106.1.2.1.1.17.7.193 1
+
+
+# we'll be alerting if the state is not either 5 or 6.
+# We could also not inventorize if the state isn't 5/6 but
+# since you have to configure a group, to even show up in the
+# MIB it's supposedly ok to alert if something isn't right there.
+# otherwise modify the inventory.
+hsrp_states = { 1: "initial", 2: "learn", 3: "listen", 4: "speak", 5: "standby", 6: "active" }
+
+
+def inventory_cisco_hsrp(info):
+
+ inventory = []
+ for line in info:
+ hsrp_grp_entry, vip, actrouter, sbrouter, hsrp_state, vmac = line
+ interface_index, hsrp_grp = hsrp_grp_entry.split(".")
+ # inventorize HSRP group name+IP and the standby state as seen from "this" box.
+ inventory.append( (vip, (hsrp_grp, int(hsrp_state))) )
+
+ return inventory
+
+
+def check_cisco_hsrp(item, params, info):
+
+ hsrp_grp_wanted, hsrp_state_wanted = params
+
+ for line in info:
+ hsrp_grp_entry, vip, actrouter, sbrouter, hsrp_state, vmac = line
+ interface_index, hsrp_grp = hsrp_grp_entry.split(".")
+ hsrp_state = int(hsrp_state)
+
+ if vip == item:
+ # FIXME: This should be shorter.
+ # Validate that the inventorized state is a "good one";
+ # if it's also the one we have now, then we're fine.
+
+ if hsrp_state_wanted == 5 and hsrp_state == hsrp_state_wanted:
+ state = 0
+ msgtxt = "Redundancy Group %s is OK" % hsrp_grp
+ elif hsrp_state_wanted == 6 and hsrp_state == hsrp_state_wanted:
+ state = 0
+ msgtxt = "Redundancy Group %s is OK" % hsrp_grp
+ # otherwise if it's a good one, but flipped, then we are in a failover
+ elif hsrp_state == 5 or hsrp_state == 6:
+ state = 1
+ msgtxt = "Redundancy Group %s has failed over" % hsrp_grp
+ # anything else must be a non-operative state already
+ else:
+ state = 2
+ msgtxt = "Redundancy Group %s is %s" % ( hsrp_grp, hsrp_states[hsrp_state])
+
+ return (state, nagios_state_names[state] + " - " + msgtxt)
+
+ return (3, "UNKNOWN - HSRP Group %s not found in Agent output" % hsrp_grp_wanted )
+
+
+
+# FIXME: Outdated format. Fix after discussing WATO options.
+check_info["cisco_hsrp"] = (check_cisco_hsrp, "HSRP Group %s", 0, inventory_cisco_hsrp)
+
+snmp_info["cisco_hsrp"] = \
+ ( ".1.3.6.1.4.1.9.9.106.1.2.1.1", [
+ OID_END,
+ "11", # cHsrpGrpVirtualIpAddr
+ "13", # cHsrpGrpActiveRouter
+ "14", # cHsrpGrpStandbyRouter
+ "15", # cHsrpGrpStandbyState
+ "16", # cHsrpGrpVirtualMacAddr
+ ])
+
+# We can't scan for the HSRP table since the entries are indexed
+# based on information we don't have without fetching all of it.
+# Instead we probe the HSRP timeout OID (.1.3.6.1.4.1.9.9.106.1.1.1.0)
+snmp_scan_functions["cisco_hsrp"] = lambda oid: "cisco" in oid(".1.3.6.1.2.1.1.1.0").lower() and oid(".1.3.6.1.4.1.9.9.106.1.1.1.0")