Module: check_mk
Branch: master
Commit: b0246f6e1ea574b04d8e34c6e2df49c175b5ea04
URL: http://git.mathias-kettner.de/git/?p=check_mk.git;a=commit;h=b0246f6e1ea574…
Author: Lars Michelsen <lm(a)mathias-kettner.de>
Date: Fri Jan 8 14:12:43 2016 +0100
#2898 FIX smart: Using normalized value for determining reallocated events state
The smart check often reported errors for disks which had just a small number of
reallocated events but where the normalized state value (aggregated by hardware
manufacturer logic) did not report an issue.
This was caused by the logic of the check, which stores the values the agent reports
during service discovery and continuously checks the difference between these values
and the current values reported by the disk. In this process the "raw values" of
the smartctl output were used for all attributes.
But this seemed - at least for the "reallocated events count" - not a really good
approach because the raw values may slightly change during regular operation. But
small changes of the values do not necessarily mean a predicted failure. So we
decided to change the logic only for the "reallocated events" now.
Maybe we need to change it for the other values in the future. But for the moment
only the single value interpretation was changed.
---
.werks/2898 | 24 ++++++++++++++++++++++++
ChangeLog | 1 +
checks/smart | 50 +++++++++++++++++++++++++++++++++++++++++---------
3 files changed, 66 insertions(+), 9 deletions(-)
diff --git a/.werks/2898 b/.werks/2898
new file mode 100644
index 0000000..ae28eb0
--- /dev/null
+++ b/.werks/2898
@@ -0,0 +1,24 @@
+Title: smart: Using normalized value for determining reallocated events state
+Level: 1
+Component: checks
+Compatible: compat
+Version: 1.2.7i4
+Date: 1452258418
+Class: fix
+
+The smart check often reported errors for disks which had just a small number of
+reallocated events but where the normalized state value (aggregated by hardware
+manufacturer logic) did not report an issue.
+
+This was caused by the logic of the check, which stores the values the agent reports
+during service discovery and continuously checks the difference between these values
+and the current values reported by the disk. In this process the "raw values" of
+the smartctl output were used for all attributes.
+
+But this seemed - at least for the "reallocated events count" - not a really good
+approach because the raw values may slightly change during regular operation. But
+small changes of the values do not necessarily mean a predicted failure. So we
+decided to change the logic only for the "reallocated events" now.
+
+Maybe we need to change it for the other values in the future. But for the moment
+only the single value interpretation was changed.
diff --git a/ChangeLog b/ChangeLog
index 5af50e1..4e484c5 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -173,6 +173,7 @@
* 2887 FIX: docsis_channels_downstream: now handles correct the downstream power in dBmV
* 2914 FIX: hr_cpu: fixed check not being applied on some systems...
* 1320 FIX: fjdarye60_sum: Fixed bug in discovery function
+ * 2898 FIX: smart: Using normalized value for determining reallocated events state...
Multisite:
* 2684 Added icons for downloading agent data / walks of hosts...
diff --git a/checks/smart b/checks/smart
index 21e598c..a9b1ce2 100644
--- a/checks/smart
+++ b/checks/smart
@@ -24,8 +24,6 @@
# to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
# Boston, MA 02110-1301 USA.
-# Author: Lars Michelsen <lm(a)mathias-kettner.de>
-
# EXAMPLE DATA FROM: WDC SSC-D0128SC-2100
#<<<smart>>>
#/dev/sda ATA WDC_SSC-D0128SC- 1 Raw_Read_Error_Rate 0x000b 100 100 050 Pre-fail Always - 16777215
@@ -58,7 +56,7 @@ smart_stats_default_levels = {
'udma_crcs': (1, 1),
}
-def parse_smart(info):
+def parse_smart_raw_values(info):
disks = {}
disk_name = None
@@ -76,6 +74,25 @@ def parse_smart(info):
return disks
+def parse_smart_normalized_values(info):
+ disks = {}
+ disk_name = None
+
+ for line in info:
+ if len(line) >= 13:
+ if line[0] != disk_name:
+ disk_name = line[0]
+ disk = {}
+ disks[disk_name] = disk
+
+ field = line[4]
+ if field != "Unknown_Attribute":
+ value = int(line[6])
+ threshold = int(line[8])
+ disk[field] = value, threshold
+ return disks
+
+
smart_stats_fields = [
'Reallocated_Sector_Ct',
'Spin_Retry_Count',
@@ -87,7 +104,7 @@ smart_stats_fields = [
'UDMA_CRC_Error_Count', ]
def inventory_smart_stats(info):
- disks = parse_smart(info)
+ disks = parse_smart_raw_values(info)
inventory = []
for disk_name, disk in disks.items():
for field in disk.keys():
@@ -100,7 +117,10 @@ def inventory_smart_stats(info):
def check_smart_stats(item, params, info):
# params is a snapshot of all counters at the point of time of inventory
- disks = parse_smart(info)
+
+ disks = parse_smart_raw_values(info)
+ normalized = parse_smart_normalized_values(info)
+
if item not in disks:
return 3, "Disk not found"
disk = disks[item]
@@ -124,11 +144,23 @@ def check_smart_stats(item, params, info):
value = disk[field]
infos.append("%s: %d%s" % (descr, value, unit))
perfdata.append((field, value))
+
if field in params:
ref_value = params[field]
- if value > ref_value:
+
+ # For reallocated event counts we experienced to many reported errors for disks
+ # which still seem to be OK. The raw value increased by a small amount but the
+ # aggregated value remained at it's initial/ok state. So we use the aggregated
+ # value now. Only for this field.
+ if field == "Reallocated_Event_Count":
+ infos[-1] += " (was %d during discovery; normalized value looks OK)" % ref_value
+ norm_value, norm_threshold = normalized[item][field]
+ if norm_value <= norm_threshold:
+ state = 2
+
+ elif value > ref_value:
state = 2
- infos[-1] += " - was %d during inventory(!!)" % ref_value
+ infos[-1] += "(!!) (was %d during discovery)" % ref_value
return state, ", ".join(infos), perfdata
@@ -141,13 +173,13 @@ check_info["smart.stats"] = {
def inventory_smart_temp(info):
- disks = parse_smart(info)
+ disks = parse_smart_raw_values(info)
return [(disk_name, 'smart_temp_default_levels')
for disk_name, disk in disks.items()
if "Temperature_Celsius" in disk]
def check_smart_temp(item, params, info):
- disks = parse_smart(info)
+ disks = parse_smart_raw_values(info)
try:
temperature = disks[item]["Temperature_Celsius"]