
Benchmark suite #185

Open
wants to merge 10 commits into master
benchmark: put downloaded sites into a configurable subdir
immerrr committed Apr 6, 2015

commit 938e04b433d319f0ac43a74b34cf68173bcc70a3
4 changes: 2 additions & 2 deletions splash/benchmark/README.rst
@@ -3,7 +3,7 @@ This directory contains a preliminary version of splash benchmark suite.
To use it, do the following:

- install ``httrack``
- create a directory for downloaded files, e.g. ``files``
- run ``python download_sites.py`` in that directory to download sites to be used in the benchmark
- run ``python download_sites.py``; it will create a ``sites`` subdirectory in
  the current directory and download the sites to be used in the benchmark there
- run ``python benchmark.py`` to run the benchmark
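
For reference, the updated workflow can be driven end to end roughly like this (a sketch only: the script names and the ``--sites-dir`` flag come from the diffs below, and ``my-sites`` is an arbitrary example directory):

```python
import subprocess

# Download the benchmark sites into a configurable directory, then run the
# benchmark against that same directory ('sites' is the default for both).
subprocess.check_call(['python', 'download_sites.py', '--sites-dir', 'my-sites'])
subprocess.check_call(['python', 'benchmark.py', '--sites-dir', 'my-sites'])
```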

18 changes: 14 additions & 4 deletions splash/benchmark/benchmark.py
@@ -17,6 +17,7 @@
from multiprocessing.pool import ThreadPool
from pprint import pformat
from time import time
import re

import requests
from splash.benchmark.file_server import serve_files
@@ -65,10 +66,9 @@ def make_render_png_lua_req(splash, params):

#: Port at which static pages will be served.
PORT = 8806
#: Static pages to be used in the benchmark.
PAGES = glob('localhost_8806/*.html')
#: Combinations of width & height to test.
WIDTH_HEIGHT = [(None, None), (500, None), (None, 500), (500, 500)]
#: Splash log filename.
SPLASH_LOG = 'splash.log'
#: This script is used to collect maxrss & cpu time from splash process.
GET_PERF_STATS_SCRIPT = """
@@ -85,14 +85,24 @@ def make_render_png_lua_req(splash, params):
help='Request thread count')
parser.add_argument('--request-count', type=int, default=10,
help='Benchmark request count')
parser.add_argument('--sites-dir', type=str, default='sites',
help='Directory with downloaded sites')


def generate_requests(splash, args):
log = logging.getLogger('generate_requests')
log.info("Using pRNG seed: %s", args.seed)

# Static pages (relative to sites_dir) to be used in the benchmark.
pages = [re.sub('^%s/' % args.sites_dir, '', v)
for v in glob(os.path.join(args.sites_dir, 'localhost_8806',
'*.html'))]
for p in pages:
log.info("Using page for benchmark: %s", p)

rng = random.Random(args.seed)
for i in xrange(args.request_count):
page = rng.choice(PAGES)
page = rng.choice(pages)
width, height = rng.choice(WIDTH_HEIGHT)
req_factory = rng.choice(REQ_FACTORIES)
url = 'http://localhost:%d/%s' % (PORT, page)
@@ -140,7 +150,7 @@ def main():
'--disable-xvfb',
'--max-timeout=600'])

with splash, serve_files(PORT):
with splash, serve_files(PORT, args.sites_dir):
start_time = time()
results = parallel_map(invoke_request, generate_requests(splash, args),
args.thread_count)
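
As a side note on the page-listing logic above: the regex simply strips the ``sites_dir`` prefix so that pages are addressed relative to the served directory. An equivalent sketch using ``os.path.relpath`` (illustrative only, not part of the PR; ``list_pages`` is a made-up name):

```python
import os
from glob import glob

def list_pages(sites_dir):
    # Collect downloaded pages as paths relative to sites_dir,
    # e.g. 'localhost_8806/some_page.html', ready to append to the local URL.
    pattern = os.path.join(sites_dir, 'localhost_8806', '*.html')
    return [os.path.relpath(path, sites_dir) for path in glob(pattern)]
```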
62 changes: 39 additions & 23 deletions splash/benchmark/download_sites.py
100644 → 100755
@@ -1,4 +1,13 @@
#!/usr/bin/env python

"""
Site downloader script for Splash benchmark suite.
"""

from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
import errno
import json
import os
import re
import subprocess
from urlparse import urlsplit
@@ -9,7 +18,7 @@
from splash.benchmark.file_server import serve_files
from splash.tests.stress import lua_runonce

script_html = """
SCRIPT_HTML = """
function main(splash)
splash:set_images_enabled(false)
splash:go(splash.args.url)
@@ -18,24 +27,19 @@
end
"""

script_png = """

function main(splash)
splash:go(splash.args.url)
splash:wait(0.5)
return splash:png()
end
"""


#: This UA is used by httrack to mimic Splash requests when downloading sites.
USERAGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/534.34 (KHTML, like Gecko) Qt/4.8.1 Safari/534.34"


PORT = 8806

parser = ArgumentParser(description=__doc__,
formatter_class=ArgumentDefaultsHelpFormatter)
parser.add_argument('--sites-dir', default='sites',
help='Directory for downloaded sites')

def preprocess_main_page(url):
out = json.loads(lua_runonce(script_html, url=url,

def preprocess_main_page(sites_dir, url):
out = json.loads(lua_runonce(SCRIPT_HTML, url=url,
splash_args=['--disable-lua-sandbox',
'--disable-xvfb',
'--max-timeout=600'],
@@ -56,13 +60,13 @@ def preprocess_main_page(url):
out['html'] = html.tostring(root, encoding='utf-8',
doctype='<!DOCTYPE html>')
filename = re.sub(r'[^\w]+', '_', url) + '.html'
with open(filename, 'w') as f:
with open(os.path.join(sites_dir, filename), 'w') as f:
f.write(out['html'])
return filename


def download_sites(sites):
local_files = [preprocess_main_page(s) for s in sites]
def download_sites(sites_dir, sites):
local_files = [preprocess_main_page(sites_dir, s) for s in sites]

local_urls = [
'http://localhost:%(port)d/%(filename)s' % {
@@ -75,12 +79,20 @@ def download_sites(sites):
'-%P', # Try parsing links in non-href/src sections
'-F', USERAGENT, # Emulate splash UA
'--depth=1']
subprocess.check_call(['httrack'] + args + local_urls)


if __name__ == '__main__':
with serve_files(PORT):
download_sites([
subprocess.check_call(['httrack'] + args + local_urls, cwd=sites_dir)


def main():
args = parser.parse_args()
try:
os.makedirs(args.sites_dir)
except OSError as e:
if e.errno != errno.EEXIST:
raise
elif not os.path.isdir(args.sites_dir):
raise RuntimeError("Not a directory: %s" % args.sites_dir)
with serve_files(PORT, args.sites_dir):
download_sites(args.sites_dir, [
'http://www.wikipedia.org',
'http://www.google.com',
'http://www.reddit.com',
@@ -89,3 +101,7 @@ def download_sites(sites):
# "http://blog.pinterest.com",
# "http://imgur.com",
])


if __name__ == '__main__':
main()
23 changes: 18 additions & 5 deletions splash/benchmark/file_server.py
100644 → 100755
@@ -1,10 +1,22 @@
#!/usr/bin/env python

"""
Simple static file server.
"""

import argparse
import os
import SimpleHTTPServer

Review comment (Member):

It could be better to use Twisted (or some other async framework) because SimpleHTTPServer is single-threaded, and Splash can download multiple resources in parallel. I wonder if it is important for tests; maybe not.

With Twisted we can also simulate conditions like non-responding servers, delayed responses, etc.
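
A minimal sketch of that suggestion, assuming ``twisted.web``'s static file support (the port and the ``sites`` directory mirror this suite's defaults; not part of this PR):

```python
from twisted.internet import reactor
from twisted.web.server import Site
from twisted.web.static import File

# Serve the downloaded sites directory; twisted.web handles concurrent
# requests without blocking, unlike the single-threaded SimpleHTTPServer.
reactor.listenTCP(8806, Site(File('sites')))
reactor.run()
```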

import SocketServer
import subprocess
import sys
from contextlib import contextmanager


parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('port', type=int, help='Port number to listen at')
parser.add_argument('directory', type=str, help='Directory to serve')


class ReusingTCPServer(SocketServer.TCPServer):
allow_reuse_address = True

@@ -15,17 +27,18 @@ def address_string(self):


@contextmanager
def serve_files(port):
def serve_files(port, directory):
"""Serve files from current directory statically in a subprocess."""
site_server = subprocess.Popen(['python', '-m', __name__,
str(port)])
str(port), directory])
try:
yield
finally:
site_server.terminate()


if __name__ == '__main__':
port = int(sys.argv[1])
server = ReusingTCPServer(("", port), RequestHandler)
args = parser.parse_args()
os.chdir(args.directory)
server = ReusingTCPServer(("", args.port), RequestHandler)
server.serve_forever()
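
For completeness, a usage sketch of the updated ``serve_files`` context manager as the other benchmark scripts call it (the directory name follows the new ``sites`` default):

```python
from splash.benchmark.file_server import serve_files

# Start the static file server in a subprocess for the duration of the block;
# it is terminated automatically when the block exits.
with serve_files(8806, 'sites'):
    pass  # issue benchmark requests against http://localhost:8806/ here
```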