Module: check_mk
Branch: master
Commit: 8a528b3c0d25addcc138c75a1b735b348331080d
URL:
http://git.mathias-kettner.de/git/?p=check_mk.git;a=commit;h=8a528b3c0d25ad…
Author: Lars Michelsen <lm(a)mathias-kettner.de>
Date: Mon Dec 19 20:13:48 2016 +0100
Reduced load caused by GUI crawl tests; made test more reliable
Change-Id: I8d8367a3749df569150c714397ffcc25c28378fe
---
tests/testlib/__init__.py | 45 +++++++++++++++++++--------------------------
tests/web/test_crawl.py | 28 ++++++++++------------------
2 files changed, 29 insertions(+), 44 deletions(-)
diff --git a/tests/testlib/__init__.py b/tests/testlib/__init__.py
index 201934e..aef8123 100644
--- a/tests/testlib/__init__.py
+++ b/tests/testlib/__init__.py
@@ -434,6 +434,8 @@ class Site(object):
class WebSession(requests.Session):
def __init__(self):
self.transids = []
+ # Resources are only checked once per session; URLs already verified
+ self.verified_resources = set()
super(WebSession, self).__init__()
@@ -541,50 +543,41 @@ class WebSession(requests.Session):
def _check_html_page_resources(self, response):
soup = BeautifulSoup(response.text, "lxml")
- parsed_url = urlparse(response.url)
-
- base_url = parsed_url.path
- if ".py" in base_url:
- base_url = os.path.dirname(base_url)
-
# There might be other resources like iframe, audio, ... but we don't care
about them
- for img_url in self._find_resource_urls("img", "src", soup):
- assert not img_url.startswith("/"), "%s starts with /" %
img_url
- req = self.get(base_url + "/" + img_url, proto=parsed_url.scheme,
verify=False)
-
- mime_type = self._get_mime_type(req)
- assert mime_type in [ "image/png" ]
+ self._check_resources(soup, response, "img", "src", [
"image/png"])
+ self._check_resources(soup, response, "script", "src", [
"application/javascript"])
+ self._check_resources(soup, response, "link", "href", [
"text/css"], filters=[("rel", "stylesheet")])
+ self._check_resources(soup, response, "link", "href", [
"image/vnd.microsoft.icon"], filters=[("rel", "shortcut
icon")])
- for script_url in self._find_resource_urls("script", "src",
soup):
- assert not script_url.startswith("/")
- req = self.get(base_url + "/" + script_url,
proto=parsed_url.scheme, verify=False)
- mime_type = self._get_mime_type(req)
- assert mime_type in [ "application/javascript" ]
+ def _check_resources(self, soup, response, tag, attr, allowed_mime_types,
filters=None):
+ parsed_url = urlparse(response.url)
- for css_url in self._find_resource_urls("link", "href", soup,
filters=[("rel", "stylesheet")]):
- assert not css_url.startswith("/")
- req = self.get(base_url + "/" + css_url, proto=parsed_url.scheme,
verify=False)
+ base_url = parsed_url.path
+ if ".py" in base_url:
+ base_url = os.path.dirname(base_url)
- mime_type = self._get_mime_type(req)
- assert mime_type in [ "text/css" ]
+ for url in self._find_resource_urls(tag, attr, soup, filters):
+ # Only check resources once per session
+ if url in self.verified_resources:
+ continue
+ self.verified_resources.add(url)
- for url in self._find_resource_urls("link", "href", soup,
filters=[("rel", "shortcut icon")]):
assert not url.startswith("/")
req = self.get(base_url + "/" + url, proto=parsed_url.scheme,
verify=False)
mime_type = self._get_mime_type(req)
- assert mime_type in [ "image/vnd.microsoft.icon" ]
+ assert mime_type in allowed_mime_types
- def _find_resource_urls(self, tag, attribute, soup, filters=[]):
+ def _find_resource_urls(self, tag, attribute, soup, filters=None):
urls = []
for element in soup.findAll(tag):
try:
skip = False
- for attr, val in filters:
+ for attr, val in filters or []:
if element[attr] != val:
skip = True
break
diff --git a/tests/web/test_crawl.py b/tests/web/test_crawl.py
index 7af4a68..5876d6a 100644
--- a/tests/web/test_crawl.py
+++ b/tests/web/test_crawl.py
@@ -61,9 +61,9 @@ class Worker(threading.Thread):
except Exception, e:
self.error(url, "Failed to visit: %s\n%s" %
(e, traceback.format_exc()))
- self.idle = True
self.crawler.todo.task_done()
except Queue.Empty:
+ self.idle = True
time.sleep(0.5)
@@ -82,7 +82,7 @@ class Worker(threading.Thread):
started = time.time()
try:
- #print url.url_without_host()
+ #print "FETCH", url.url_without_host()
response = self.crawler.client.get(url.url_without_host())
except AssertionError, e:
if "This view can only be used in mobile mode" in "%s" %
e:
@@ -142,10 +142,9 @@ class Worker(threading.Thread):
def check_response(self, url, response):
soup = BeautifulSoup(response.text, "lxml")
+ # The referenced resources (images, stylesheets, javascript files) are checked
by
+ # the generic web client handler. This only needs to realize the crawling.
self.check_content(url, response, soup)
- self.check_images(url, soup)
- self.check_styles(url, soup)
- self.check_scripts(url, soup)
self.check_links(url, soup)
self.check_frames(url, soup)
self.check_iframes(url, soup)
@@ -181,10 +180,6 @@ class Worker(threading.Thread):
self.check_referenced(url, soup, "a", "href")
- def check_images(self, url, soup):
- self.check_referenced(url, soup, "img", "src")
-
-
def check_referenced(self, referer_url, soup, tag, attr):
elements = soup.find_all(tag)
@@ -193,17 +188,10 @@ class Worker(threading.Thread):
url = self.normalize_url(self.crawler.site.internal_url, orig_url)
if url is not None and self.is_valid_url(url) \
- and url not in self.crawler.visited:
+ and url not in self.crawler.handled:
#file("/tmp/todo", "a").write("%s (%s)\n" %
(url, referer_url.url))
self.crawler.todo.put(Url(url, orig_url=orig_url,
referer_url=referer_url.url))
-
-
- def check_styles(self, url, soup):
- pass # TODO
-
-
- def check_scripts(self, url, soup):
- pass # TODO
+ self.crawler.handled.add(url)
def is_valid_url(self, url):
@@ -276,6 +264,9 @@ class TestCrawler(object):
self.todo = SetQueue()
self.started = time.time()
self.visited = []
+ # Contains all already seen and somehow handled URLs. Something like the
+ # summary of self.todo and self.visited, but todo contains Url() objects.
+ self.handled = set()
self.errors = []
self.site = site
self.client = web
@@ -284,6 +275,7 @@ class TestCrawler(object):
self.load_stats()
self.todo.put(Url(site.internal_url))
+ self.handled.add(site.internal_url)
self.crawl()