Module: check_mk
Branch: master
Commit: e48d9c472b48f09a52e2652ea3417baa7d69c057
URL: http://git.mathias-kettner.de/git/?p=check_mk.git;a=commit;h=e48d9c472b48f0…
Author: Lars Michelsen <lm(a)mathias-kettner.de>
Date: Fri Oct 29 11:20:24 2010 +0200
cleaned up some code in win_dhcp_pools check
---
ChangeLog | 1 +
LIESMICH.zutun | 19 -------------------
checks/win_dhcp_pools | 36 ++++++++++++------------------------
3 files changed, 13 insertions(+), 43 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index bc3c93d..b5551d3 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -21,6 +21,7 @@
* blade_health: More detailed output on problems
* blade_blades: Added new check for checking the health-, present- and
power-state of IBM Bladecenter blades
+ * win_dhcp_pools: Several cleanups in check
Multisite:
* The custom open/close states of custom links are now stored for each
diff --git a/LIESMICH.zutun b/LIESMICH.zutun
index 558032d..429882e 100644
--- a/LIESMICH.zutun
+++ b/LIESMICH.zutun
@@ -9,25 +9,6 @@ BUGS beheben ab 1.1.9i1
Wenn zwei Checks mit dem gleichen Namen existieren bei einem Host
soll mit einem Fehler abgebrochen werden.
-win_dhcp_pools sendet:
-[['MIBCounts:'],
- ['Discovers', '=', '193.'],
- ['Offers', '=', '193.'],
- ['Delayed', 'Offers', '=', '0.'],
- ['Requests', '=', '14540.'],
- ['Acks', '=', '11767.'],
- ['Naks', '=', '2.'],
- ['Declines', '=', '0.'],
- ['Releases', '=', '39.'],
- ['ServerStartTime', '=', '26', 'June', '2010', '04:47:49'],
- ['Scopes', '=', '1.'],
- ['Scopes', 'with', 'Delay', 'configured=', '0.'],
- ['Subnet', '=', '10.3.108.0.'],
- ['No.', 'of', 'Addresses', 'in', 'use', '=', '40.'],
- ['No.', 'of', 'free', 'Addresses', '=', '150.'],
- ['No.', 'of', 'pending', 'offers', '=', '0.']]
-Besser: = als Trenner nehmen, dann ist das parsen einfacher.
-
ich bin mir nicht sicher ob das nun ein Thruk Problem oder ein Livestatus Problem ist. Vermutlich beides :-)
Wenn ich im Thruk im Suchfenster einfach Enter drücke, kommt folgende Query bei raus:
GET services
diff --git a/checks/win_dhcp_pools b/checks/win_dhcp_pools
index bc96e1a..4e5d698 100644
--- a/checks/win_dhcp_pools
+++ b/checks/win_dhcp_pools
@@ -77,34 +77,26 @@ win_dhcp_pools_stats_translate = {
'Anzahl der anstehenden Angebote': 'No. of pending offers',
}
+def parse_win_dhcp_pools(info):
+ return [ ' '.join(line).rstrip('.').split(' = ') for line in info ]
+
def inventory_win_dhcp_pools(checktype, info):
inventory = []
- for line in info:
- start = line[0]
- if start in win_dhcp_pools_stats_translate:
- start = win_dhcp_pools_stats_translate[line[0]]
- if start == 'Subnet':
- inventory.append((line[2].rstrip('.'), 'win_dhcp_pools_default_levels'))
+ for line in parse_win_dhcp_pools(info):
+ if win_dhcp_pools_stats_translate.get(line[0], line[0]) == 'Subnet':
+ inventory.append((line[1], 'win_dhcp_pools_default_levels'))
return inventory
def check_win_dhcp_pools(item, params, info):
inBlock = False
poolStats = []
status = 0
- for line in info:
- # Translate if needed
- start = line[0]
- if start in win_dhcp_pools_stats_translate:
- start = win_dhcp_pools_stats_translate[line[0]]
-
- if start == 'Subnet' and line[-1] == item+'.':
+ for line in parse_win_dhcp_pools(info):
+ if win_dhcp_pools_stats_translate.get(line[0], line[0]) == 'Subnet' and line[1] == item:
inBlock = True
continue
if inBlock:
- key = ' '.join(line[:-2])
- if key in win_dhcp_pools_stats_translate:
- key = win_dhcp_pools_stats_translate[key]
- poolStats.append(saveint(line[-1].rstrip('.')))
+ poolStats.append(saveint(line[1]))
if len(poolStats) == 3:
break
@@ -140,16 +132,12 @@ def check_win_dhcp_pools_stats(item, params, info):
this_time = int(time.time())
timedif = 0
- for line in info:
+ for line in parse_win_dhcp_pools(info):
if len(line) > 0:
- if line[0] in win_dhcp_pools_stats_translate:
- key = win_dhcp_pools_stats_translate[line[0]]
- else:
- key = line[0]
-
+ key = win_dhcp_pools_stats_translate.get(line[0], line[0])
if key in [ 'Discovers', 'Offers', 'Requests', 'Acks',
'Nacks', 'Declines', 'Releases', 'Scopes' ]:
- value = saveint(line[2].rstrip('.'))
+ value = saveint(line[1])
try:
timedif, per_sec = get_counter("win_dhcp_stats.%s" % key, this_time, value)
except MKCounterWrapped:
Module: check_mk
Branch: master
Commit: e688626d32701ac5e38c5a43e2d08c8a59094ad5
URL: http://git.mathias-kettner.de/git/?p=check_mk.git;a=commit;h=e688626d32701a…
Author: Mathias Kettner <mk(a)mathias-kettner.de>
Date: Thu Oct 28 11:38:48 2010 +0200
Fix error detection for tcp+snmp hosts
---
ChangeLog | 6 ++++
modules/check_mk.py | 11 ++++++-
modules/check_mk_base.py | 72 ++++++++++++++++++++++++++++++++++++---------
modules/snmp.py | 4 +-
4 files changed, 74 insertions(+), 19 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index fc36b90..8f0fd5a 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,4 +1,10 @@
1.1.9i1
+ Core, Setup, etc.:
+ * Improve error handling: if a host is monitored via SNMP *and* TCP,
+ an error with one of the two agents prevented the checks of the
+ other one from being executed. This is fixed now. The inventory
+ check is still incomplete in that error condition.
+
Checks & Agents:
* megaraid_pdisks: Using the real enclosure number as check item now
* if/if64: Added expected interface speed to warning output
diff --git a/modules/check_mk.py b/modules/check_mk.py
index 8a69f09..b1cd68c 100755
--- a/modules/check_mk.py
+++ b/modules/check_mk.py
@@ -1618,9 +1618,16 @@ def make_inventory(checkname, hostnamelist, check_only=False):
try:
info = get_realhost_info(hostname, ipaddress, checkname_base, inventory_max_cachefile_age)
except MKAgentError, e:
- if check_only:
+ if check_only and str(e):
raise
- sys.stderr.write("Host '%s': %s\n" % (hostname, str(e)))
+ elif str(e):
+ sys.stderr.write("Host '%s': %s\n" % (hostname, str(e)))
+ continue
+ except MKSNMPError, e:
+ if check_only and str(e):
+ raise
+ elif str(e):
+ sys.stderr.write("Host '%s': %s\n" % (hostname, str(e)))
continue
except Exception, e:
if check_only or opt_debug:
diff --git a/modules/check_mk_base.py b/modules/check_mk_base.py
index 0ba0cfd..e6e719c 100755
--- a/modules/check_mk_base.py
+++ b/modules/check_mk_base.py
@@ -87,6 +87,8 @@ compiled_regexes = {} # avoid recompiling regexes
nagios_command_pipe = None # Filedescriptor to open nagios command pipe.
g_single_oid_hostname = None
g_single_oid_cache = {}
+g_broken_snmp_hosts = set([])
+g_broken_agent_hosts = set([])
# variables set later by getopt
@@ -121,6 +123,12 @@ class MKAgentError(Exception):
def __str__(self):
return self.reason
+class MKSNMPError(Exception):
+ def __init__(self, reason):
+ self.reason = reason
+ def __str__(self):
+ return self.reason
+
# +----------------------------------------------------------------------+
# | _ _ _ |
# | / \ __ _ __ _ _ __ ___ __ _ __ _| |_(_) ___ _ __ |
@@ -250,6 +258,7 @@ def get_host_info(hostname, ipaddress, checkname):
exception_texts = []
global opt_use_cachefile
opt_use_cachefile = True
+ is_snmp_error = False
for node in nodes:
# If an error with the agent occurs, we still can (and must)
# try the other node.
@@ -259,15 +268,23 @@ def get_host_info(hostname, ipaddress, checkname):
at_least_one_without_exception = True
except MKAgentError, e:
exception_texts.append(str(e))
+ g_broken_agent_hosts.add(node)
+ except MKSNMPError, e:
+ exception_texts.append(str(e))
+ g_broken_snmp_hosts.add(node)
+ is_snmp_error = True
if not at_least_one_without_exception:
- raise MKAgentError(", ".join(exception_texts))
+ if is_snmp_error:
+ raise MKSNMPError(", ".join(exception_texts))
+ else:
+ raise MKAgentError(", ".join(exception_texts))
return info
else:
return get_realhost_info(hostname, ipaddress, checkname, check_max_cachefile_age)
# Gets info from a real host (not a cluster). There are three possible
# ways: TCP, SNMP and external command. This function raises
-# MKAgentError, if there could not retrieved any data. It returns [],
+# MKAgentError or MKSNMPError if no data could be retrieved. It returns [],
# if the agent could be contacted but the data is empty (no items of
# this check type).
#
@@ -290,6 +307,12 @@ def get_realhost_info(hostname, ipaddress, checkname, max_cache_age):
content = read_cache_file(cache_relpath, max_cache_age)
if content:
return eval(content)
+ # Not cached -> need to get info via SNMP
+
+ # Try to contact host only once
+ if hostname in g_broken_snmp_hosts:
+ raise MKSNMPError("")
+
# New in 1.1.3: oid_info can now be a list: Each element
# of that list is interpreted as one real oid_info, fetches
# a separate snmp table. The overall result is then the list
@@ -320,9 +343,10 @@ def get_realhost_info(hostname, ipaddress, checkname, max_cache_age):
return table
# No SNMP check. Then we must contact the check_mk_agent. Have we already
- # to get data from the agent? If yes we must not do that again!
+ # tried to get data from the agent? If yes, we must not do that again,
+ # even if no cache file is present.
if g_agent_already_contacted.has_key(hostname):
- return []
+ raise MKAgentError("")
g_agent_already_contacted[hostname] = True
store_cached_hostinfo(hostname, []) # leave emtpy info in case of error
@@ -390,6 +414,10 @@ def get_agent_info(hostname, ipaddress, max_cache_age):
if result:
return result
+ # Try to contact every host only once
+ if hostname in g_broken_agent_hosts:
+ raise MKAgentError("")
+
# If the host ist listed in datasource_programs the data from
# that host is retrieved by calling an external program (such
# as ssh or rsy) instead of a TCP connect.
@@ -625,9 +653,12 @@ def do_check(hostname, ipaddress):
try:
load_counters(hostname)
- agent_version, num_success, num_errors = do_all_checks_on_host(hostname, ipaddress)
+ agent_version, num_success, num_errors, problems = do_all_checks_on_host(hostname, ipaddress)
save_counters(hostname)
- if num_errors > 0 and num_success > 0:
+ if problems:
+ output = "CRIT - %s" % problems
+ status = 2
+ elif num_errors > 0 and num_success > 0:
output = "WARNING - Got only %d out of %d infos" % (num_success, num_success + num_errors)
status = 1
elif num_errors > 0:
@@ -646,12 +677,6 @@ def do_check(hostname, ipaddress):
output = "UNKNOWN - %s" % e
status = 3
- except MKAgentError, e:
- if opt_debug:
- raise
- output = "CRIT - %s" % e
- status = 2
-
if aggregate_check_mk:
try:
submit_check_mk_aggregation(hostname, status, output)
@@ -675,6 +700,7 @@ def do_all_checks_on_host(hostname, ipaddress):
num_success = 0
num_errors = 0
check_table = get_sorted_check_table(hostname)
+ problems = []
for checkname, item, params, description, info in check_table:
# In case of a precompiled check table info is the aggrated
# service name. In the non-precompiled version there are the dependencies
@@ -684,7 +710,22 @@ def do_all_checks_on_host(hostname, ipaddress):
aggrname = aggregated_service_name(hostname, description)
infotype = checkname.split('.')[0]
- info = get_host_info(hostname, ipaddress, infotype)
+ try:
+ info = get_host_info(hostname, ipaddress, infotype)
+ except MKSNMPError, e:
+ if str(e):
+ problems.append(str(e))
+ num_errors += 1
+ g_broken_snmp_hosts.add(hostname)
+ continue
+
+ except MKAgentError, e:
+ if str(e):
+ problems.append(str(e))
+ num_errors += 1
+ g_broken_agent_hosts.add(hostname)
+ continue
+
if info or info == []:
num_success += 1
try:
@@ -741,10 +782,11 @@ def do_all_checks_on_host(hostname, ipaddress):
else:
agent_version = "(unknown)"
except MKAgentError, e:
- raise
+ g_broken_agent_hosts.add(hostname)
+ agent_version = "(unknown)"
except:
agent_version = "(unknown)"
- return agent_version, num_success, num_errors
+ return agent_version, num_success, num_errors, ", ".join(problems)
def nagios_pipe_open_timeout(signum, stackframe):
raise IOError("Timeout while opening pipe")
diff --git a/modules/snmp.py b/modules/snmp.py
index a4ac246..ea45083 100644
--- a/modules/snmp.py
+++ b/modules/snmp.py
@@ -108,7 +108,7 @@ def get_snmp_explicit(hostname, ipaddress, mib, baseoid, suffixes):
if exitstatus:
if opt_verbose:
sys.stderr.write(tty_red + tty_bold + "ERROR: " + tty_normal + "SNMP error\n")
- return None
+ raise MKSNMPError("SNMP Error")
return info
def snmpwalk_on_suboid(hostname, ip, oid):
@@ -153,7 +153,7 @@ def snmpwalk_on_suboid(hostname, ip, oid):
if exitstatus:
if opt_verbose:
sys.stderr.write(tty_red + tty_bold + "ERROR: " + tty_normal + "SNMP error\n")
- return []
+ raise MKSNMPError("SNMP Error")
return rowinfo
def get_snmp_table(hostname, ip, oid_info):
Module: check_mk
Branch: master
Commit: 7b4d7b64214034b33936ddd522db944876d6ef87
URL: http://git.mathias-kettner.de/git/?p=check_mk.git;a=commit;h=7b4d7b64214034…
Author: Lars Michelsen <lm(a)mathias-kettner.de>
Date: Mon Oct 25 17:14:03 2010 +0200
Improved cisco_power check
---
ChangeLog | 5 +++--
LIESMICH.zutun | 2 ++
checks/cisco_power | 13 ++++++++++---
3 files changed, 15 insertions(+), 5 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index 5a8ab9a..42177c5 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -5,8 +5,9 @@
* wut_webtherm: handle more variants of WuT Webtherms (thanks to Lefty)
* blade_bays: Also inventorizes standby blades and has a little more
verbose output.
- * blade_blowers: Can handle responses without rpm values now. Improved output.
- * cisco_fan: Does not inentorize 'notPresent' sensors anymore. Imporved output.
+ * blade_blowers: Can handle responses without rpm values now. Improved output
+ * cisco_fan: Does not inventorize 'notPresent' sensors anymore. Improved output
+ * cisco_power: Not using power source as threshold anymore. Improved output
Multisite:
* The custom open/close states of custom links are now stored for each
diff --git a/LIESMICH.zutun b/LIESMICH.zutun
index 3383b0e..27e6ac0 100644
--- a/LIESMICH.zutun
+++ b/LIESMICH.zutun
@@ -45,6 +45,8 @@ man rechts plötzlich ein anderes System. Lösung ist noch nicht in Sicht.
LARS: Bei Opera kann man kein Snapin nach ganz unten ziehen. Der Indikator
springt dann immer nach ganz oben.
+LARS: Im Opera wird sich die aktuelle Scroll-Position der Sidebar nicht korrekt gemerkt.
+
SNMP-Checks: Es wurde berichtet, dass - wenn die Community falsch ist -
der Check_MK gruen war, aber die Checks unknown. Sollte es dann nicht
genau umgekehrt sein? Wird der Exit-code von snmpwalk ausgewertet?
diff --git a/checks/cisco_power b/checks/cisco_power
index 285c790..25efbc1 100644
--- a/checks/cisco_power
+++ b/checks/cisco_power
@@ -24,6 +24,9 @@
# to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
# Boston, MA 02110-1301 USA.
+cisco_power_states = ('', 'normal', 'warning', 'critical',
+ 'shutdown', 'notPresent', 'notFunctioning')
+cisco_power_source = ( '', 'unknown', 'ac', 'dc', 'externalPowerSupply', 'internalRedundant')
def inventory_cisco_power(checkname, info):
return [ (line[0], '', '""') for line in info if 'RPS NotExist' not in line[0]]
@@ -31,10 +34,14 @@ def inventory_cisco_power(checkname, info):
def check_cisco_power(item, params, info):
for line in info:
if line[0] == item:
- if line[1] == "1" and line[2] == "5":
- return (0, "OK")
+ state, source = map(saveint, line[1:3])
+ output = 'State: %s, Source: %s' % (cisco_power_states[state], cisco_power_source[source])
+ if state == 1:
+ return (0, "OK - %s" % output)
+ elif state == 2:
+ return (1, "WARN - %s" % output)
else:
- return (2, "CRIT - Invalid state %s/%s" % (line[1], line[2]))
+ return (2, "CRIT - %s" % output)
return (3, "UNKNOWN - item not found in snmp data")
check_info['cisco_power'] = (check_cisco_power, "%s", 0, inventory_cisco_power)