Module: check_mk
Branch: master
Commit: d672f3898694778f678e906945a20601bdbce6f1
URL: http://git.mathias-kettner.de/git/?p=check_mk.git;a=commit;h=d672f3898694778f678e906945a20601bdbce6f1
Author: Jukka Aro <ja(a)mathias-kettner.de>
Date: Wed Nov 22 12:06:59 2017 +0100
5411 FIX Windows agent: handle WMI timeouts
All sections depending on WMI (Windows Management Instrumentation)
queries have been suffering from periodic freezing, the time interval
between subsequent freezes being typically 18...20 minutes. At those
moments, the Windows agent has not been delivering any output for some
of its WMI-dependent sections (e. g., ps, uptime, dotnet_clrmemory,
wmi_cpuload, msexch and wmi_webservices). The corresponding checks have
issued error messages of type "Missing agent sections...". Various
strategies have been previously used attempting to cope with the
periodic problems with WMI. Werk #4008 introduced a timeout of 10s in
order to prevent the agent from completely blocking if a WMI query
freezes. However, this led to the described problem of missing agent
output totally when no response was given to a WMI query within 10s.
Moreover, multiple WMI queries waiting for 10s one after another led to
periodic long execution times of the Windows agent.
This Werk introduces a new strategy for coping with the periodic
freezing of WMI queries. The timeout of the queries is reduced to 2.5s
instead of 10s per query, reducing the total execution time of the
Windows agent by approximately 75% when the problem occurs. Upon a WMI
timeout, the Windows agent reports it in its output so that the affected
checks can tolerate it by setting their state to UNKNOWN. In normal
cases, the check should get back to OK when the agent is contacted the
next time and the WMI freeze is most likely gone.
There seems to be a connection of the WMI freezes to the Windows service
WMI Performance Adapter. https://lokna.no/?p=1430 suggests that the
startup type of this service be set to automatic, ensuring the service
is running. Without this, the WMI Performance Adapter seems to get
started periodically when WMI is queried. Testing with WMI Performance
Adapter service running has shown clear signs of improvement, reducing
the frequency of freezing WMI queries even if not completely ending
them.
Change-Id: I83024d93ee41e47853fde1fbdd59680ad3dd07cd
---
.werks/5411 | 41 ++++++++++++++++++++++++
agents/windows/build_version | 2 +-
agents/windows/sections/SectionWMI.cc | 60 ++++++++++++++++++++++-------------
agents/windows/wmiHelper.cc | 41 +++++++++++++-----------
agents/windows/wmiHelper.h | 5 +++
checks/wmi.include | 6 ++++
6 files changed, 113 insertions(+), 42 deletions(-)
diff --git a/.werks/5411 b/.werks/5411
new file mode 100644
index 0000000..c155fe9
--- /dev/null
+++ b/.werks/5411
@@ -0,0 +1,41 @@
+Title: Windows agent: handle WMI timeouts
+Level: 1
+Component: checks
+Compatible: compat
+Edition: cre
+Version: 1.5.0i2
+Date: 1511347136
+Class: fix
+
+All sections depending on WMI (Windows Management Instrumentation)
+queries have been suffering from periodic freezing, the time interval
+between subsequent freezes being typically 18...20 minutes. At those
+moments, the Windows agent has not been delivering any output for some
+of its WMI-dependent sections (e. g., ps, uptime, dotnet_clrmemory,
+wmi_cpuload, msexch and wmi_webservices). The corresponding checks have
+issued error messages of type "Missing agent sections...". Various
+strategies have been previously used attempting to cope with the
+periodic problems with WMI. Werk #4008 introduced a timeout of 10s in
+order to prevent the agent from completely blocking if a WMI query
+freezes. However, this led to the described problem of missing agent
+output totally when no response was given to a WMI query within 10s.
+Moreover, multiple WMI queries waiting for 10s one after another led to
+periodic long execution times of the Windows agent.
+
+This Werk introduces a new strategy for coping with the periodic
+freezing of WMI queries. The timeout of the queries is reduced to 2.5s
+instead of 10s per query, reducing the total execution time of the
+Windows agent by approximately 75% when the problem occurs. Upon a WMI
+timeout, the Windows agent reports it in its output so that the affected
+checks can tolerate it by setting their state to UNKNOWN. In normal
+cases, the check should get back to OK when the agent is contacted the
+next time and the WMI freeze is most likely gone.
+
+There seems to be a connection of the WMI freezes to the Windows service
+WMI Performance Adapter. https://lokna.no/?p=1430 suggests that the
+startup type of this service be set to automatic, ensuring the service
+is running. Without this, the WMI Performance Adapter seems to get
+started periodically when WMI is queried. Testing with WMI Performance
+Adapter service running has shown clear signs of improvement, reducing
+the frequency of freezing WMI queries even if not completely ending
+them.
diff --git a/agents/windows/build_version b/agents/windows/build_version
index 13de30f..830a203 100644
--- a/agents/windows/build_version
+++ b/agents/windows/build_version
@@ -1 +1 @@
-3000
+3002
diff --git a/agents/windows/sections/SectionWMI.cc b/agents/windows/sections/SectionWMI.cc
index 5b8678f..57956fa 100644
--- a/agents/windows/sections/SectionWMI.cc
+++ b/agents/windows/sections/SectionWMI.cc
@@ -65,7 +65,10 @@ void SectionWMI::outputTable(std::ostream &out, wmi::Result &data) {
if (!data.valid()) {
return;
}
- out << Utf8(join(data.names(), L",")) << "\n";
+
+ // First use a local stream buffer...
+ std::stringstream localStream;
+ localStream << Utf8(join(data.names(), L",")) << "\n";
// output data
bool more = true;
@@ -76,13 +79,17 @@ void SectionWMI::outputTable(std::ostream &out, wmi::Result &data) {
[&data](const std::wstring &name) {
return data.get<std::wstring>(name.c_str());
});
- out << Utf8(join(values, L","));
+ localStream << Utf8(join(values, L","));
more = data.next();
+
if (more) {
- out << "\n";
+ localStream << "\n";
}
}
+
+ // ...and output local stream buffer only when no WMI timeout was thrown.
+ out << localStream.rdbuf();
}
void SectionWMI::suspend(int duration) {
@@ -94,31 +101,40 @@ bool SectionWMI::produceOutputInner(std::ostream &out) {
return false;
}
- if (_helper.get() == nullptr) {
- _helper.reset(new wmi::Helper(_winapi, _namespace.c_str()));
- }
+ bool success = true;
+
+ try {
+ if (_helper.get() == nullptr) {
+ _helper.reset(new wmi::Helper(_winapi, _namespace.c_str()));
+ }
wmi::Result result(_winapi);
- if (_columns.empty()) {
- // no columns set, return everything
- result = _helper->getClass(_object.c_str());
- } else {
- std::wstringstream query;
- query << L"SELECT " << join(_columns, L",") << L" FROM " << _object;
- result = _helper->query(query.str().c_str());
- }
+ if (_columns.empty()) {
+ // no columns set, return everything
+ result = _helper->getClass(_object.c_str());
+ } else {
+ std::wstringstream query;
+ query << L"SELECT " << join(_columns, L",") << L" FROM " << _object;
+ result = _helper->query(query.str().c_str());
+ }
- bool success = result.valid() || SUCCEEDED(result.last_error());
+ success = result.valid() || SUCCEEDED(result.last_error());
- if (_toggle_if_missing && !success) {
- // in the past, wmi tables were toggled permanently if they were
- // missing,
- // but testing occasionally shouldn't hurt.
- suspend(3600);
- }
+ if (_toggle_if_missing && !success) {
+ // in the past, wmi tables were toggled permanently if they were
+ // missing,
+ // but testing occasionally shouldn't hurt.
+ suspend(3600);
+ }
- outputTable(out, result);
+ outputTable(out, result);
+ } catch (const wmi::Timeout &t) {
+ // Output WMI timeout so that the check in question knows to handle it.
+ out << t.what() << std::endl;
+ Debug(_logger) << "SectionWMI::produceOutputInner caught " << t.what();
+ success = true;
+ }
return success;
}
diff --git a/agents/windows/wmiHelper.cc b/agents/windows/wmiHelper.cc
index 903f700..09484d7 100644
--- a/agents/windows/wmiHelper.cc
+++ b/agents/windows/wmiHelper.cc
@@ -193,27 +193,30 @@ bool Result::next() {
return false;
}
- IWbemClassObject *obj;
- ULONG numReturned;
+ IWbemClassObject *obj = nullptr;
+ ULONG numReturned = 0;
// always retrieve only one element
- HRESULT res = _enumerator->Next(10000, 1, &obj, &numReturned);
-
- if (FAILED(res)) {
- // in this case the "current" object isn't changed to guarantee that the
- // Result remains valid
- // throw ComException("Failed to retrieve element", res, _winapi);
- _last_error = res;
- return false;
- }
-
- if (numReturned == 0) {
- // no more values. the current object remains at the last element so
- // that a call to get continues to work
- return false;
+ HRESULT res = _enumerator->Next(2500, 1, &obj, &numReturned);
+
+ switch (res) {
+ case WBEM_NO_ERROR:
+ _current.reset(obj, releaseInterface);
+ return true;
+ case WBEM_S_FALSE:
+ // No more values. The current object remains at the last element so
+ // that a call to get continues to work.
+ return false;
+ case WBEM_S_TIMEDOUT:
+ // A timeout occurred before getting the object.
+ throw Timeout("WMItimeout");
+ default:
+ // Any of the four possible errors: WBEM_E_INVALID_PARAMETER,
+ // WBEM_E_OUT_OF_MEMORY, WBEM_E_UNEXPECTED or
+ // WBEM_E_TRANSPORT_FAILURE. In this case the "current" object isn't
+ // changed to guarantee that the Result remains valid.
+ _last_error = res;
+ return false;
}
-
- _current.reset(obj, releaseInterface);
- return true;
}
template <>
diff --git a/agents/windows/wmiHelper.h b/agents/windows/wmiHelper.h
index 736cfb5..67fd503 100644
--- a/agents/windows/wmiHelper.h
+++ b/agents/windows/wmiHelper.h
@@ -55,6 +55,11 @@ struct ComTypeException : public std::runtime_error {
ComTypeException(const std::string &message);
};
+class Timeout : public std::runtime_error {
+public:
+ explicit Timeout(const std::string &msg) : std::runtime_error(msg) {}
+};
+
class Variant {
VARIANT _value;
const WinApiAdaptor &_winapi;
diff --git a/checks/wmi.include b/checks/wmi.include
index ae744cf..5d88176 100644
--- a/checks/wmi.include
+++ b/checks/wmi.include
@@ -147,6 +147,9 @@ def parse_wmi_table(info, key="Name"):
timestamp = int(line[1])
frequency = int(line[2])
line = info_iter.next()
+ elif line[0] == "WMItimeout":
+ # main section got WMI timeout
+ raise MKCounterWrapped("WMI query timed out")
while line is not None:
current = None
if len(line) == 1 and line[0].startswith("["):
@@ -155,6 +158,9 @@ def parse_wmi_table(info, key="Name"):
res = {}
tablename = regex("\[(.*)\]").search(line[0]).group(1)
line = info_iter.next()
+ if line[0] == "WMItimeout":
+ # subsection got WMI timeout
+ raise MKCounterWrapped("WMI query timed out")
if tablename in res:
# known table, append to it
current = res[tablename]