Module: check_mk
Branch: master
Commit: f445a8f9e0f03d89489acbf57d94cc00a794e710
URL:
http://git.mathias-kettner.de/git/?p=check_mk.git;a=commit;h=f445a8f9e0f03d…
Author: Moritz Kiemer <mo(a)mathias-kettner.de>
Date: Mon May 6 09:03:01 2019 +0200
agent_azure: Persist metrics with resolution below one minute
CMK-1437
Change-Id: I5ed88c9317edf26163fddc5730f36426a92e6b80
---
agents/special/agent_azure | 91 ++++++++++++++++++++++++++++------------------
1 file changed, 55 insertions(+), 36 deletions(-)
diff --git a/agents/special/agent_azure b/agents/special/agent_azure
index a1d659a..3308f04 100755
--- a/agents/special/agent_azure
+++ b/agents/special/agent_azure
@@ -38,6 +38,7 @@ Special agent for monitoring azure cloud applications with Check_MK.
import json
import datetime
import calendar
+import string
import sys
import re
import argparse
@@ -46,6 +47,8 @@ import logging
from multiprocessing import Process, Lock, Queue
from Queue import Empty as QueueEmpty
+from pathlib2 import Path
+
# We have to set a null handler for logging before importing the azure stuff.
# Otherwise a warning will be sent to stderr - and if for some other reason
# the agent returns a non-zero exit code this (irrelevant) warning would be
@@ -59,6 +62,9 @@ from azure.mgmt.compute import ComputeManagementClient
from azure.common.credentials import ServicePrincipalCredentials
logging.getLogger().handlers.pop()
+from cmk.utils.paths import tmp_dir
+
+from cmk.special_agents.utils import DataCache
import cmk.utils.password_store
cmk.utils.password_store.replace_passwords()
@@ -83,6 +89,8 @@ METRICS_SELECTED = {
'Microsoft.Web/sites': [('CpuTime,AverageResponseTime,Http5xx',
'PT1M', 'total', None),],
}
+AZURE_CACHE_FILE_PATH = Path(tmp_dir) / "agents" / "agent_azure"
+
class AsyncMapper(object): # pylint: disable=too-few-public-methods
'''Create an async drop-in replacement for builtin 'map'
@@ -536,23 +544,36 @@ class AzureClient(object):
return self._cache["resources"]
-class MetricsCollecter(object):
+class MetricCache(DataCache):
NOW = datetime.datetime.utcnow()
- def __init__(self, resource, debug=False):
- super(MetricsCollecter, self).__init__()
- self.debug = debug
- self.resource = resource
- self.remaining_reads = "unknown (no metrics fetched)"
- self.timedeltas = {
+ def __init__(self, resource, metric_definition, debug=False):
+ self.metric_definition = metric_definition
+ metricnames = metric_definition[0]
+ super(MetricCache, self).__init__(self.get_cache_path(resource), metricnames,
debug=debug)
+ self.remaining_reads = None
+ self.timedelta = {
"PT1M": datetime.timedelta(minutes=1),
"PT5M": datetime.timedelta(minutes=5),
"PT1H": datetime.timedelta(hours=1),
- }
+ }[metric_definition[1]]
+
+ @staticmethod
+ def get_cache_path(resource):
+ valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
+        subdir = ''.join(c if c in valid_chars else '_' for c in resource.info['id'])
+ return AZURE_CACHE_FILE_PATH / subdir
-    def _metric_api_call(self, monitor_client, timespan, interval, metric, aggregation, filter_):
- rid = self.resource.info['id']
+ @property
+ def cache_interval(self):
+ return self.timedelta.seconds
+
+ def get_validity_from_args(self, *args):
+ return True
+
+ @staticmethod
+    def _metric_api_call(monitor_client, rid, timespan, interval, metric, aggregation, filter_):
LOG.debug(
"metrics.list(%r, timespan=%r, interval=%r, metric=%r,
aggregation=%r,"
" filter=%r, raw=True)", rid, timespan, interval, metric,
aggregation, filter_)
@@ -571,17 +592,16 @@ class MetricsCollecter(object):
raw_metrics = list(raw.output.value)
return raw.response, raw_metrics
-    def _fetch_specific_metrics(self, monitor_client, metricnames, interval, aggregation, filter_,
-                                err):
- resource_id = self.resource.info["id"]
+    def get_live_data(self, monitor_client, resource_id, err):  # pylint: disable=arguments-differ
+ metricnames, interval, aggregation, filter_ = self.metric_definition
- start = self.NOW - 3 * self.timedeltas[interval]
+ start = self.NOW - 3 * self.timedelta
timespan = "%s/%s" % (start.strftime("%Y-%m-%dT%H:%M:%SZ"),
self.NOW.strftime("%Y-%m-%dT%H:%M:%SZ"))
try:
-            response, raw_metrics = self._metric_api_call(monitor_client, timespan, interval,
-                                                          metricnames, aggregation, filter_)
+            response, raw_metrics = self._metric_api_call(
+                monitor_client, resource_id, timespan, interval, metricnames, aggregation, filter_)
except () if self.debug else ErrorResponseException as exc:
err.add("exception", resource_id, exc.message)
LOG.exception(exc)
@@ -600,36 +620,35 @@ class MetricsCollecter(object):
return metrics
- # TODO: adding the metrics as a side effect is not good.
- # I'll get the caching right, then change this back. (mo)
- def collect_metrics(self, monitor_client, err):
- metric_params = METRICS_SELECTED.get(self.resource.info["type"], [])
- for metricnames, interval, aggregation, filter_ in metric_params:
- self.resource.metrics += self._fetch_specific_metrics(
- monitor_client, metricnames, interval, aggregation, filter_, err)
+def gather_metrics(client, resource):
+ '''
+ Gather all metrics for a resource. These metrics have different time
+ resolutions, so every metric needs its own cache.
+    Along the way collect occurring errors and keep track of the remaining
+ API reads.
+ '''
+ err = IssueCollecter()
+ metric_definitions = METRICS_SELECTED.get(resource.info["type"], [])
+ remaining_api_reads = None
+ for metric_def in metric_definitions:
+ cache = MetricCache(resource, metric_def, debug=client.args.debug)
+        resource.metrics += cache.get_data(client.monitor_client, resource.info['id'], err)
+ remaining_api_reads = (min((remaining_api_reads, cache.remaining_reads)) or
+ cache.remaining_reads or remaining_api_reads)
+    remaining_api_reads = remaining_api_reads or "unknown (no metrics fetched)"
+ return remaining_api_reads, err
def process_resource(args):
resource, client = args
- #
- # process the resource with all clients specific to the resource type
- #
client.process_specific(resource)
- #
- # gather all metrics of the resource
- #
- err = IssueCollecter() # pass this to methods to collect issues
- metrics_collecter = MetricsCollecter(resource, debug=client.args.debug)
- metrics_collecter.collect_metrics(client.monitor_client, err)
+ remaining_api_reads, err = gather_metrics(client, resource)
- #
- # create all sections
- #
agent_info_section = AzureSection('agent_info')
-    agent_info_section.add(('remaining-reads', metrics_collecter.remaining_reads))
+ agent_info_section.add(('remaining-reads', remaining_api_reads))
agent_info_section.add(err.dumpinfo())
sections = [agent_info_section]