From 85b42571e861b7e7fb095083304d2f64ea86e790 Mon Sep 17 00:00:00 2001
From: immerrr <immerrr@gmail.com>
Date: Fri, 27 Feb 2015 21:10:27 +0300
Subject: [PATCH 01/10] Initial version of benchmark suite

---
 splash/benchmark/README.rst        |  9 +++
 splash/benchmark/benchmark.py      | 80 ++++++++++++++++++++++++++
 splash/benchmark/download_sites.py | 90 ++++++++++++++++++++++++++++++
 splash/benchmark/file_server.py    | 31 ++++++++++
 4 files changed, 210 insertions(+)
 create mode 100644 splash/benchmark/README.rst
 create mode 100755 splash/benchmark/benchmark.py
 create mode 100644 splash/benchmark/download_sites.py
 create mode 100644 splash/benchmark/file_server.py

diff --git a/splash/benchmark/README.rst b/splash/benchmark/README.rst
new file mode 100644
index 000000000..10e3b3e23
--- /dev/null
+++ b/splash/benchmark/README.rst
@@ -0,0 +1,9 @@
+This directory contains a preliminary version of the Splash benchmark suite.
+
+To use it, do the following:
+
+- install ``httrack``
+- create a directory for downloaded files, e.g. ``files``
+- run ``python download_sites.py`` in that directory to download sites to be used in the benchmark
+- run ``python benchmark.py`` to run the benchmark
+
diff --git a/splash/benchmark/benchmark.py b/splash/benchmark/benchmark.py
new file mode 100755
index 000000000..44201c1c7
--- /dev/null
+++ b/splash/benchmark/benchmark.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python
+
+"""
+Splash benchmark script.
+
+It takes a directory downloaded with splash & httrack, fires up a static file
+server and runs a series of requests via splash on those downloaded pages.
+
+"""
+
+import logging
+import random
+from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
+from glob import glob
+from multiprocessing.pool import ThreadPool
+
+import requests
+from splash.file_server import serve_files
+from splash.tests.utils import SplashServer
+
+PORT = 8806
+#: URLs to benchmark against.
+PAGES = glob('localhost_8806/*.html')
+#: Combinations of width & height to test.
+WIDTH_HEIGHT = [(None, None), (500, None), (None, 500), (500, 500)]
+# XXX: add benchmark of different API endpoints.
+SPLASH_LOG = 'splash.log'
+
+parser = ArgumentParser(description=__doc__,
+                        formatter_class=ArgumentDefaultsHelpFormatter)
+parser.add_argument('--seed', type=int, default=1234, help='PRNG seed number')
+parser.add_argument('--thread-count', type=int, default=1,
+                    help='Request thread count')
+parser.add_argument('--request-count', type=int, default=10,
+                    help='Benchmark request count')
+
+
+def generate_requests(splash, args):
+    log = logging.getLogger('generate_requests')
+    log.info("Using pRNG seed: %s", args.seed)
+    rng = random.Random(args.seed)
+    for i in xrange(args.request_count):
+        page = rng.choice(PAGES)
+        width, height = rng.choice(WIDTH_HEIGHT)
+        url = 'http://localhost:%d/%s' % (PORT, page)
+        yield (i + 1, args.request_count,
+               {'url': splash.url('render.png'),
+                'params': {'url': url, 'width': width, 'height': height}})
+
+
+def parallel_map(func, iterable, thread_count):
+    if thread_count == 1:
+        return map(func, iterable)
+    else:
+        pool = ThreadPool(thread_count)
+        return pool.map(func, iterable)
+
+
+def invoke_request(invoke_args):
+    log = logging.getLogger('bench_worker')
+    req_no, total_reqs, kwargs = invoke_args
+    log.info("Initiating request %d/%d: %s", req_no, total_reqs, kwargs)
+    return requests.get(**kwargs)
+
+
+def main():
+    args = parser.parse_args()
+    logging.basicConfig(level=logging.DEBUG)
+
+    with SplashServer(logfile=SPLASH_LOG,
+                      extra_args=['--disable-lua-sandbox',
+                                  '--disable-xvfb',
+                                  '--max-timeout=600']) as splash, \
+         serve_files(PORT):
+        parallel_map(invoke_request, generate_requests(splash, args),
+                     args.thread_count)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/splash/benchmark/download_sites.py b/splash/benchmark/download_sites.py
new file mode 100644
index 000000000..0d0bf4b0f
--- /dev/null
+++ b/splash/benchmark/download_sites.py
@@ -0,0 +1,90 @@
+from splash.tests.stress import lua_runonce
+
+import re
+from urlparse import urlsplit
+import json
+from lxml import html
+import w3lib.html
+import subprocess
+from splash.file_server import serve_files
+
+script_html = """
+function main(splash)
+splash:set_images_enabled(false)
+splash:go(splash.args.url)
+splash:wait(0.5)
+return {url=splash:url(), html=splash:html()}
+end
+"""
+
+script_png = """
+
+function main(splash)
+splash:go(splash.args.url)
+splash:wait(0.5)
+return splash:png()
+end
+"""
+
+
+USERAGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/534.34 (KHTML, like Gecko) Qt/4.8.1 Safari/534.34"
+
+
+PORT = 8806
+
+
+def preprocess_main_page(url):
+    out = json.loads(lua_runonce(script_html, url=url,
+                                 splash_args=['--disable-lua-sandbox',
+                                              '--disable-xvfb',
+                                              '--max-timeout=600'],
+                                 timeout=600.,))
+    final_url = urlsplit(out['url'])._replace(query='', fragment='').geturl()
+    if not w3lib.html.get_base_url(out['html']):
+        out['html'] = w3lib.html.remove_tags_with_content(
+            out['html'], ('script',))
+        root = html.fromstring(out['html'], parser=html.HTMLParser(),
+                               base_url=final_url)
+        try:
+            head = root.xpath('./head')[0]
+        except IndexError:
+            head = html.Element('head')
+            root.insert(0, head)
+        head.insert(0, html.Element('base', {'href': final_url}))
+        head.insert(0, html.Element('meta', {'charset': 'utf-8'}))
+        out['html'] = html.tostring(root, encoding='utf-8',
+                                    doctype='<!DOCTYPE html>')
+    filename = re.sub(r'[^\w]+', '_', url) + '.html'
+    with open(filename, 'w') as f:
+        f.write(out['html'])
+    return filename
+
+
+def download_sites(sites):
+    local_files = [preprocess_main_page(s) for s in sites]
+
+    local_urls = [
+        'http://localhost:%(port)d/%(filename)s' % {
+            'port': PORT, 'filename': filename
+        }
+        for filename in local_files
+    ]
+    args = ['--continue',
+            '--near',           # Fetch referred non-html files.
+            '-%P',              # Try parsing links in non-href/src sections
+            '-F', USERAGENT,    # Emulate splash UA
+            '--depth=1']
+    subprocess.check_call(['httrack'] + args + local_urls)
+
+
+if __name__ == '__main__':
+    with serve_files(PORT):
+        download_sites([
+            'http://www.wikipedia.org',
+            'http://www.google.com',
+            'http://www.reddit.com',
+            "http://w3.org",
+            "http://w3.org/TR/2010/REC-xhtml-basic-20101123/",
+            # "http://blog.pinterest.com",
+            # "http://imgur.com",
+        ])
diff --git a/splash/benchmark/file_server.py b/splash/benchmark/file_server.py
new file mode 100644
index 000000000..77e2b7084
--- /dev/null
+++ b/splash/benchmark/file_server.py
@@ -0,0 +1,31 @@
+import SimpleHTTPServer
+import SocketServer
+import subprocess
+import sys
+from contextlib import contextmanager
+
+
+class ReusingTCPServer(SocketServer.TCPServer):
+    allow_reuse_address = True
+
+
+class RequestHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
+    def address_string(self):
+        return "fileserver"
+
+
+@contextmanager
+def serve_files(port):
+    """Serve files from current directory statically in a subprocess."""
+    site_server = subprocess.Popen(['python', '-m', __name__,
+                                    str(port)])
+    try:
+        yield
+    finally:
+        site_server.terminate()
+
+
+if __name__ == '__main__':
+    port = int(sys.argv[1])
+    server = ReusingTCPServer(("", port), RequestHandler)
+    server.serve_forever()

From b1e0baeab54ac198ae80af5afb8bba532ed4dd39 Mon Sep 17 00:00:00 2001
From: immerrr <immerrr@gmail.com>
Date: Fri, 27 Feb 2015 22:50:46 +0300
Subject: [PATCH 02/10] Fix & organize imports

---
 splash/benchmark/__init__.py       |  0
 splash/benchmark/benchmark.py      |  2 +-
 splash/benchmark/download_sites.py | 11 ++++++-----
 3 files changed, 7 insertions(+), 6 deletions(-)
 create mode 100644 splash/benchmark/__init__.py

diff --git a/splash/benchmark/__init__.py b/splash/benchmark/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/splash/benchmark/benchmark.py b/splash/benchmark/benchmark.py
index 44201c1c7..25fac405e 100755
--- a/splash/benchmark/benchmark.py
+++ b/splash/benchmark/benchmark.py
@@ -15,7 +15,7 @@
 from multiprocessing.pool import ThreadPool
 
 import requests
-from splash.file_server import serve_files
+from splash.benchmark.file_server import serve_files
 from splash.tests.utils import SplashServer
 
 PORT = 8806
diff --git a/splash/benchmark/download_sites.py b/splash/benchmark/download_sites.py
index 0d0bf4b0f..abe921eee 100644
--- a/splash/benchmark/download_sites.py
+++ b/splash/benchmark/download_sites.py
@@ -1,12 +1,13 @@
-from splash.tests.stress import lua_runonce
-
+import json
 import re
+import subprocess
 from urlparse import urlsplit
-import json
+
 from lxml import html
+
 import w3lib.html
-import subprocess
-from splash.file_server import serve_files
+from splash.benchmark.file_server import serve_files
+from splash.tests.stress import lua_runonce
 
 script_html = """
 function main(splash)

From 9b418b775609162d0d8c399075316a0e8272b4ad Mon Sep 17 00:00:00 2001
From: immerrr <immerrr@gmail.com>
Date: Sat, 28 Feb 2015 19:20:20 +0300
Subject: [PATCH 03/10] benchmark: print some metrics

---
 splash/benchmark/benchmark.py | 51 ++++++++++++++++++++++++++++-------
 1 file changed, 42 insertions(+), 9 deletions(-)

diff --git a/splash/benchmark/benchmark.py b/splash/benchmark/benchmark.py
index 25fac405e..da4d8bf2e 100755
--- a/splash/benchmark/benchmark.py
+++ b/splash/benchmark/benchmark.py
@@ -9,22 +9,33 @@
 """
 
 import logging
+import os
 import random
+import shutil
 from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
 from glob import glob
 from multiprocessing.pool import ThreadPool
+from pprint import pformat
+from time import time
 
 import requests
 from splash.benchmark.file_server import serve_files
 from splash.tests.utils import SplashServer
 
+#: Port at which static pages will be served.
 PORT = 8806
-#: URLs to benchmark against.
+#: Static pages to be used in the benchmark.
 PAGES = glob('localhost_8806/*.html')
 #: Combinations of width & height to test.
 WIDTH_HEIGHT = [(None, None), (500, None), (None, 500), (500, 500)]
 # XXX: add benchmark of different API endpoints.
 SPLASH_LOG = 'splash.log'
+#: This script is used to collect maxrss & cpu time from splash process.
+GET_PERF_STATS_SCRIPT = """
+function main(splash)
+  return splash:get_perf_stats()
+end
+"""
 
 parser = ArgumentParser(description=__doc__,
                         formatter_class=ArgumentDefaultsHelpFormatter)
@@ -60,20 +71,42 @@ def invoke_request(invoke_args):
     log = logging.getLogger('bench_worker')
     req_no, total_reqs, kwargs = invoke_args
     log.info("Initiating request %d/%d: %s", req_no, total_reqs, kwargs)
-    return requests.get(**kwargs)
+    stime = time()
+    requests.get(**kwargs)
+    etime = time()
+    return {'start_time': stime,
+            'end_time': etime,
+            'duration': etime - stime,
+            'endpoint': kwargs['url'],
+            'site': kwargs['params']['url'],
+            'width': kwargs['params']['width'],
+            'height': kwargs['params']['height']}
 
 
 def main():
+    log = logging.getLogger("benchmark")
     args = parser.parse_args()
     logging.basicConfig(level=logging.DEBUG)
 
-    with SplashServer(logfile=SPLASH_LOG,
-                      extra_args=['--disable-lua-sandbox',
-                                  '--disable-xvfb',
-                                  '--max-timeout=600']) as splash, \
-         serve_files(PORT):
-        parallel_map(invoke_request, generate_requests(splash, args),
-                     args.thread_count)
+    splash = SplashServer(
+        logfile=SPLASH_LOG,
+        extra_args=['--disable-lua-sandbox',
+                    '--disable-xvfb',
+                    '--max-timeout=600'])
+
+    with splash, serve_files(PORT):
+        start_time = time()
+        results = parallel_map(invoke_request, generate_requests(splash, args),
+                               args.thread_count)
+        end_time = time()
+        resources = requests.get(
+            splash.url('execute'),
+            params={'lua_source': GET_PERF_STATS_SCRIPT}).json()
+
+    log.info("Request stats:\n%s", pformat(dict(enumerate(results))))
+    log.info("Splash max RSS: %s B", resources['maxrss'])
+    log.info("Splash CPU time elapsed: %.2f sec", resources['cputime'])
+    log.info("Wallclock time elapsed: %.2f sec", end_time - start_time)
 
 
 if __name__ == '__main__':

From 871d07dc5f7b6587f1b11998db54df978f5975cd Mon Sep 17 00:00:00 2001
From: immerrr <immerrr@gmail.com>
Date: Mon, 2 Mar 2015 16:47:08 +0300
Subject: [PATCH 04/10] benchmark: add different endpoints (png, json, lua-png)

---
 splash/benchmark/benchmark.py | 56 +++++++++++++++++++++++++++++++----
 1 file changed, 51 insertions(+), 5 deletions(-)

diff --git a/splash/benchmark/benchmark.py b/splash/benchmark/benchmark.py
index da4d8bf2e..b9c501713 100755
--- a/splash/benchmark/benchmark.py
+++ b/splash/benchmark/benchmark.py
@@ -22,13 +22,53 @@
 from splash.benchmark.file_server import serve_files
 from splash.tests.utils import SplashServer
 
+
+def make_render_png_req(splash, params):
+    """Prepare request for render.png endpoint."""
+    return {'url': splash.url('render.png'),
+            'params': params}
+
+
+def make_render_json_req(splash, params):
+    """Prepare request for render.json endpoint."""
+    json_params = params.copy()
+    json_params['png'] = 1
+    return {'url': splash.url('render.json'),
+            'params': json_params}
+
+
+def make_render_png_lua_req(splash, params):
+    """Prepare request for execute endpoint."""
+    lua_params = params.copy()
+    lua_params['lua_source'] = """
+function main(splash)
+  assert(splash:go(splash.args.url))
+  if splash.args.wait then
+    assert(splash:wait(splash.args.wait))
+  end
+  splash:set_result_content_type("image/png")
+  return splash:png{width=splash.args.width,
+                    height=splash.args.height,
+                    render_all=splash.args.render_all}
+end
+"""
+    return {'url': splash.url('execute'),
+            'params': lua_params}
+
+
+REQ_FACTORIES = [
+    make_render_png_req,
+    make_render_json_req,
+    make_render_png_lua_req,
+]
+
+
 #: Port at which static pages will be served.
 PORT = 8806
 #: Static pages to be used in the benchmark.
 PAGES = glob('localhost_8806/*.html')
 #: Combinations of width & height to test.
 WIDTH_HEIGHT = [(None, None), (500, None), (None, 500), (500, 500)]
-# XXX: add benchmark of different API endpoints.
 SPLASH_LOG = 'splash.log'
 #: This script is used to collect maxrss & cpu time from splash process.
 GET_PERF_STATS_SCRIPT = """
@@ -37,6 +77,7 @@
 end
 """
 
+
 parser = ArgumentParser(description=__doc__,
                         formatter_class=ArgumentDefaultsHelpFormatter)
 parser.add_argument('--seed', type=int, default=1234, help='PRNG seed number')
@@ -53,10 +94,12 @@ def generate_requests(splash, args):
     for i in xrange(args.request_count):
         page = rng.choice(PAGES)
         width, height = rng.choice(WIDTH_HEIGHT)
+        req_factory = rng.choice(REQ_FACTORIES)
         url = 'http://localhost:%d/%s' % (PORT, page)
-        yield (i + 1, args.request_count,
-               {'url': splash.url('render.png'),
-                'params': {'url': url, 'width': width, 'height': height}})
+        params = {'url': url, 'render_all': 1, 'wait': 0.1,
+                  'width': width, 'height': height}
+        log.debug("Req factory: %s, params: %s", req_factory, params)
+        yield (i + 1, args.request_count, req_factory(splash, params))
 
 
 def parallel_map(func, iterable, thread_count):
@@ -72,12 +115,15 @@ def invoke_request(invoke_args):
     req_no, total_reqs, kwargs = invoke_args
     log.info("Initiating request %d/%d: %s", req_no, total_reqs, kwargs)
     stime = time()
-    requests.get(**kwargs)
+    response = requests.get(**kwargs)
     etime = time()
+    if response.status_code != 200:
+        log.error("Non-OK response:\n%s", response.text)
     return {'start_time': stime,
             'end_time': etime,
             'duration': etime - stime,
             'endpoint': kwargs['url'],
+            'status': response.status_code,
             'site': kwargs['params']['url'],
             'width': kwargs['params']['width'],
             'height': kwargs['params']['height']}

From 938e04b433d319f0ac43a74b34cf68173bcc70a3 Mon Sep 17 00:00:00 2001
From: immerrr <immerrr@gmail.com>
Date: Mon, 2 Mar 2015 20:06:17 +0300
Subject: [PATCH 05/10] benchmark: put downloaded sites into a configurable
 subdir

---
 splash/benchmark/README.rst        |  4 +-
 splash/benchmark/benchmark.py      | 18 +++++++--
 splash/benchmark/download_sites.py | 62 +++++++++++++++++++-----------
 splash/benchmark/file_server.py    | 23 ++++++++---
 4 files changed, 73 insertions(+), 34 deletions(-)
 mode change 100644 => 100755 splash/benchmark/download_sites.py
 mode change 100644 => 100755 splash/benchmark/file_server.py

diff --git a/splash/benchmark/README.rst b/splash/benchmark/README.rst
index 10e3b3e23..39a70688e 100644
--- a/splash/benchmark/README.rst
+++ b/splash/benchmark/README.rst
@@ -3,7 +3,7 @@ This directory contains a preliminary version of splash benchmark suite.
 To use it, do the following:
 
 - install ``httrack``
-- create a directory for downloaded files, e.g. ``files``
-- run ``python download_sites.py`` in that directory to download sites to be used in the benchmark
+- run ``python download_sites.py``; it will create a ``sites`` subdirectory in
+  the current directory and download the sites used in the benchmark there
 - run ``python benchmark.py`` to run the benchmark
 
diff --git a/splash/benchmark/benchmark.py b/splash/benchmark/benchmark.py
index b9c501713..3a62e11bf 100755
--- a/splash/benchmark/benchmark.py
+++ b/splash/benchmark/benchmark.py
@@ -17,6 +17,7 @@
 from multiprocessing.pool import ThreadPool
 from pprint import pformat
 from time import time
+import re
 
 import requests
 from splash.benchmark.file_server import serve_files
@@ -65,10 +66,9 @@ def make_render_png_lua_req(splash, params):
 
 #: Port at which static pages will be served.
 PORT = 8806
-#: Static pages to be used in the benchmark.
-PAGES = glob('localhost_8806/*.html')
 #: Combinations of width & height to test.
 WIDTH_HEIGHT = [(None, None), (500, None), (None, 500), (500, 500)]
+#: Splash log filename.
 SPLASH_LOG = 'splash.log'
 #: This script is used to collect maxrss & cpu time from splash process.
 GET_PERF_STATS_SCRIPT = """
@@ -85,14 +85,24 @@ def make_render_png_lua_req(splash, params):
                     help='Request thread count')
 parser.add_argument('--request-count', type=int, default=10,
                     help='Benchmark request count')
+parser.add_argument('--sites-dir', type=str, default='sites',
+                    help='Directory with downloaded sites')
 
 
 def generate_requests(splash, args):
     log = logging.getLogger('generate_requests')
     log.info("Using pRNG seed: %s", args.seed)
+
+    # Static pages (relative to sites_dir) to be used in the benchmark.
+    pages = [re.sub('^%s/' % args.sites_dir, '', v)
+             for v in glob(os.path.join(args.sites_dir, 'localhost_8806',
+                                        '*.html'))]
+    for p in pages:
+        log.info("Using page for benchmark: %s", p)
+
     rng = random.Random(args.seed)
     for i in xrange(args.request_count):
-        page = rng.choice(PAGES)
+        page = rng.choice(pages)
         width, height = rng.choice(WIDTH_HEIGHT)
         req_factory = rng.choice(REQ_FACTORIES)
         url = 'http://localhost:%d/%s' % (PORT, page)
@@ -140,7 +150,7 @@ def main():
                     '--disable-xvfb',
                     '--max-timeout=600'])
 
-    with splash, serve_files(PORT):
+    with splash, serve_files(PORT, args.sites_dir):
         start_time = time()
         results = parallel_map(invoke_request, generate_requests(splash, args),
                                args.thread_count)
diff --git a/splash/benchmark/download_sites.py b/splash/benchmark/download_sites.py
old mode 100644
new mode 100755
index abe921eee..482469bab
--- a/splash/benchmark/download_sites.py
+++ b/splash/benchmark/download_sites.py
@@ -1,4 +1,13 @@
+#!/usr/bin/env python
+
+"""
+Site downloader script for Splash benchmark suite.
+"""
+
+from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
+import errno
 import json
+import os
 import re
 import subprocess
 from urlparse import urlsplit
@@ -9,7 +18,7 @@
 from splash.benchmark.file_server import serve_files
 from splash.tests.stress import lua_runonce
 
-script_html = """
+SCRIPT_HTML = """
 function main(splash)
 splash:set_images_enabled(false)
 splash:go(splash.args.url)
@@ -18,24 +27,19 @@
 end
 """
 
-script_png = """
-
-function main(splash)
-splash:go(splash.args.url)
-splash:wait(0.5)
-return splash:png()
-end
-"""
-
-
+#: This UA is used by httrack to mimic Splash requests when downloading sites.
 USERAGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/534.34 (KHTML, like Gecko) Qt/4.8.1 Safari/534.34"
 
-
 PORT = 8806
 
+parser = ArgumentParser(description=__doc__,
+                        formatter_class=ArgumentDefaultsHelpFormatter)
+parser.add_argument('--sites-dir', default='sites',
+                    help='Directory for downloaded sites')
 
-def preprocess_main_page(url):
-    out = json.loads(lua_runonce(script_html, url=url,
+
+def preprocess_main_page(sites_dir, url):
+    out = json.loads(lua_runonce(SCRIPT_HTML, url=url,
                                  splash_args=['--disable-lua-sandbox',
                                               '--disable-xvfb',
                                               '--max-timeout=600'],
@@ -56,13 +60,13 @@ def preprocess_main_page(url):
         out['html'] = html.tostring(root, encoding='utf-8',
                                     doctype='<!DOCTYPE html>')
     filename = re.sub(r'[^\w]+', '_', url) + '.html'
-    with open(filename, 'w') as f:
+    with open(os.path.join(sites_dir, filename), 'w') as f:
         f.write(out['html'])
     return filename
 
 
-def download_sites(sites):
-    local_files = [preprocess_main_page(s) for s in sites]
+def download_sites(sites_dir, sites):
+    local_files = [preprocess_main_page(sites_dir, s) for s in sites]
 
     local_urls = [
         'http://localhost:%(port)d/%(filename)s' % {
@@ -75,12 +79,20 @@ def download_sites(sites):
             '-%P',              # Try parsing links in non-href/src sections
             '-F', USERAGENT,    # Emulate splash UA
             '--depth=1']
-    subprocess.check_call(['httrack'] + args + local_urls)
-
-
-if __name__ == '__main__':
-    with serve_files(PORT):
-        download_sites([
+    subprocess.check_call(['httrack'] + args + local_urls, cwd=sites_dir)
+
+
+def main():
+    args = parser.parse_args()
+    try:
+        os.makedirs(args.sites_dir)
+    except OSError as e:
+        if e.errno != errno.EEXIST:
+            raise
+        elif not os.path.isdir(args.sites_dir):
+            raise RuntimeError("Not a directory: %s" % args.sites_dir)
+    with serve_files(PORT, args.sites_dir):
+        download_sites(args.sites_dir, [
             'http://www.wikipedia.org',
             'http://www.google.com',
             'http://www.reddit.com',
@@ -89,3 +101,7 @@ def download_sites(sites):
             # "http://blog.pinterest.com",
             # "http://imgur.com",
         ])
+
+
+if __name__ == '__main__':
+    main()
diff --git a/splash/benchmark/file_server.py b/splash/benchmark/file_server.py
old mode 100644
new mode 100755
index 77e2b7084..2931f41ae
--- a/splash/benchmark/file_server.py
+++ b/splash/benchmark/file_server.py
@@ -1,10 +1,22 @@
+#!/usr/bin/env python
+
+"""
+Simple static file server.
+"""
+
+import argparse
+import os
 import SimpleHTTPServer
 import SocketServer
 import subprocess
-import sys
 from contextlib import contextmanager
 
 
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument('port', type=int, help='Port number to listen at')
+parser.add_argument('directory', type=str, help='Directory to serve')
+
+
 class ReusingTCPServer(SocketServer.TCPServer):
     allow_reuse_address = True
 
@@ -15,10 +27,10 @@ def address_string(self):
 
 
 @contextmanager
-def serve_files(port):
+def serve_files(port, directory):
     """Serve files from current directory statically in a subprocess."""
     site_server = subprocess.Popen(['python', '-m', __name__,
-                                    str(port)])
+                                    str(port), directory])
     try:
         yield
     finally:
@@ -26,6 +38,7 @@ def serve_files(port):
 
 
 if __name__ == '__main__':
-    port = int(sys.argv[1])
-    server = ReusingTCPServer(("", port), RequestHandler)
+    args = parser.parse_args()
+    os.chdir(args.directory)
+    server = ReusingTCPServer(("", args.port), RequestHandler)
     server.serve_forever()

From b705bcfed8956e8d46daed3b21681f36ab809cdf Mon Sep 17 00:00:00 2001
From: immerrr <immerrr@gmail.com>
Date: Mon, 2 Mar 2015 20:45:32 +0300
Subject: [PATCH 06/10] benchmark: ignore benchmark files for pytest purposes

---
 splash/conftest.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/splash/conftest.py b/splash/conftest.py
index 2a20cc38a..f93ed2990 100644
--- a/splash/conftest.py
+++ b/splash/conftest.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import
 from splash import lua
+import glob
 
 collect_ignore = []
 
@@ -15,3 +16,8 @@
         'kernel/__main__.py',
         'kernel/__init__.py',
     ]
+
+collect_ignore.extend([
+    'benchmark/download_sites.py',
+    'benchmark/file_server.py',
+    'benchmark/benchmark.py'])

From f8b201b65952ec4a3d2727ace601fd683e5cb7f4 Mon Sep 17 00:00:00 2001
From: immerrr <immerrr@gmail.com>
Date: Mon, 2 Mar 2015 21:30:51 +0300
Subject: [PATCH 07/10] benchmark: several post-review improvements

- download_sites: fix encoding unconditionally if it is missing
- download_sites: add base/href only if it is missing
- download_sites: remove scripts unconditionally
- benchmark: specify pre-existing splash instance with --splash-server HOST:PORT
---
 splash/benchmark/benchmark.py      | 32 +++++++++++++++++++++++++-----
 splash/benchmark/download_sites.py | 31 ++++++++++++++++++-----------
 2 files changed, 46 insertions(+), 17 deletions(-)

diff --git a/splash/benchmark/benchmark.py b/splash/benchmark/benchmark.py
index 3a62e11bf..2ba766fc7 100755
--- a/splash/benchmark/benchmark.py
+++ b/splash/benchmark/benchmark.py
@@ -87,6 +87,8 @@ def make_render_png_lua_req(splash, params):
                     help='Benchmark request count')
 parser.add_argument('--sites-dir', type=str, default='sites',
                     help='Directory with downloaded sites')
+parser.add_argument('--splash-server', metavar='HOST:PORT',
+                    help='Use existing Splash instance available at HOST:PORT')
 
 
 def generate_requests(splash, args):
@@ -139,16 +141,36 @@ def invoke_request(invoke_args):
             'height': kwargs['params']['height']}
 
 
+class ExistingSplashWrapper(object):
+    """Wrapper for pre-existing Splash instance."""
+    def __init__(self, server):
+        self.server = server
+        if not self.server.startswith('http://'):
+            self.server = 'http://' + self.server
+
+    def url(self, endpoint):
+        return self.server + '/' + endpoint
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *args):
+        pass
+
+
 def main():
     log = logging.getLogger("benchmark")
     args = parser.parse_args()
     logging.basicConfig(level=logging.DEBUG)
 
-    splash = SplashServer(
-        logfile=SPLASH_LOG,
-        extra_args=['--disable-lua-sandbox',
-                    '--disable-xvfb',
-                    '--max-timeout=600'])
+    if args.splash_server:
+        splash = ExistingSplashWrapper(args.splash_server)
+    else:
+        splash = SplashServer(
+            logfile=SPLASH_LOG,
+            extra_args=['--disable-lua-sandbox',
+                        '--disable-xvfb',
+                        '--max-timeout=600'])
 
     with splash, serve_files(PORT, args.sites_dir):
         start_time = time()
diff --git a/splash/benchmark/download_sites.py b/splash/benchmark/download_sites.py
index 482469bab..e50fee33f 100755
--- a/splash/benchmark/download_sites.py
+++ b/splash/benchmark/download_sites.py
@@ -39,26 +39,33 @@
 
 
 def preprocess_main_page(sites_dir, url):
+    """
+    This function does several things:
+    - strip javascript so that downloaded pages look exactly the same
+    - add baseurl to resolve relative links properly (if it is missing)
+    - add meta charset description (if it is missing)
+    """
     out = json.loads(lua_runonce(SCRIPT_HTML, url=url,
                                  splash_args=['--disable-lua-sandbox',
                                               '--disable-xvfb',
                                               '--max-timeout=600'],
                                  timeout=600.,))
     final_url = urlsplit(out['url'])._replace(query='', fragment='').geturl()
-    if not w3lib.html.get_base_url(out['html']):
-        out['html'] = w3lib.html.remove_tags_with_content(
-            out['html'], ('script',))
-        root = html.fromstring(out['html'], parser=html.HTMLParser(),
-                               base_url=final_url)
-        try:
-            head = root.xpath('./head')[0]
-        except IndexError:
-            head = html.Element('head')
-            root.insert(0, head)
+    # Ensure there are no scripts to be executed.
+    out['html'] = w3lib.html.remove_tags_with_content(out['html'], ('script',))
+    root = html.fromstring(out['html'], parser=html.HTMLParser(),
+                           base_url=final_url)
+    try:
+        head = root.xpath('./head')[0]
+    except IndexError:
+        head = html.Element('head')
+        root.insert(0, head)
+    if not head.xpath('./base/@href'):
         head.insert(0, html.Element('base', {'href': final_url}))
+    if not head.xpath('./meta/@charset'):
         head.insert(0, html.Element('meta', {'charset': 'utf-8'}))
-        out['html'] = html.tostring(root, encoding='utf-8',
-                                    doctype='<!DOCTYPE html>')
+    out['html'] = html.tostring(root, encoding='utf-8',
+                                doctype='<!DOCTYPE html>')
     filename = re.sub(r'[^\w]+', '_', url) + '.html'
     with open(os.path.join(sites_dir, filename), 'w') as f:
         f.write(out['html'])

From 9371c2335550e5b7115f4562261a550dcf1a8ab3 Mon Sep 17 00:00:00 2001
From: immerrr <immerrr@gmail.com>
Date: Mon, 2 Mar 2015 22:04:29 +0300
Subject: [PATCH 08/10] benchmark: serve files via twisted.web.static

---
 splash/benchmark/benchmark.py   |  5 ++-
 splash/benchmark/file_server.py | 60 ++++++++++++++++++++-------------
 2 files changed, 38 insertions(+), 27 deletions(-)

diff --git a/splash/benchmark/benchmark.py b/splash/benchmark/benchmark.py
index 2ba766fc7..d326ce6c8 100755
--- a/splash/benchmark/benchmark.py
+++ b/splash/benchmark/benchmark.py
@@ -11,7 +11,6 @@
 import logging
 import os
 import random
-import shutil
 from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
 from glob import glob
 from multiprocessing.pool import ThreadPool
@@ -68,7 +67,7 @@ def make_render_png_lua_req(splash, params):
 PORT = 8806
 #: Combinations of width & height to test.
 WIDTH_HEIGHT = [(None, None), (500, None), (None, 500), (500, 500)]
-#: Splash log filename.
+#: Splash log filename (set to None to put it to stdout).
 SPLASH_LOG = 'splash.log'
 #: This script is used to collect maxrss & cpu time from splash process.
 GET_PERF_STATS_SCRIPT = """
@@ -172,7 +171,7 @@ def main():
                         '--disable-xvfb',
                         '--max-timeout=600'])
 
-    with splash, serve_files(PORT, args.sites_dir):
+    with splash, serve_files(port=PORT, directory=args.sites_dir):
         start_time = time()
         results = parallel_map(invoke_request, generate_requests(splash, args),
                                args.thread_count)
diff --git a/splash/benchmark/file_server.py b/splash/benchmark/file_server.py
index 2931f41ae..52d4532ea 100755
--- a/splash/benchmark/file_server.py
+++ b/splash/benchmark/file_server.py
@@ -1,44 +1,56 @@
 #!/usr/bin/env python
 
-"""
-Simple static file server.
-"""
+"""Simple static file server."""
 
 import argparse
 import os
-import SimpleHTTPServer
-import SocketServer
 import subprocess
+import time
 from contextlib import contextmanager
 
+from twisted.internet import reactor
+from twisted.web.server import Site
+from twisted.web.static import File
 
-parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument('port', type=int, help='Port number to listen at')
-parser.add_argument('directory', type=str, help='Directory to serve')
+import requests
 
-
-class ReusingTCPServer(SocketServer.TCPServer):
-    allow_reuse_address = True
-
-
-class RequestHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
-    def address_string(self):
-        return "fileserver"
+parser = argparse.ArgumentParser("")
+parser.add_argument('--port', type=int)
+parser.add_argument('--directory', help='Directory to be served')
 
 
 @contextmanager
-def serve_files(port, directory):
-    """Serve files from current directory statically in a subprocess."""
-    site_server = subprocess.Popen(['python', '-m', __name__,
-                                    str(port), directory])
+def serve_files(port, directory, logfile=None):
+    """Serve files from specified directory statically in a subprocess."""
+    command = ['twistd',
+               '-n',    # don't daemonize
+               'web',   # start web component
+               '--port', str(int(port)),
+               '--path', os.path.abspath(directory), ]
+    if logfile is not None:
+        command += ['--logfile', logfile]
+    site_server = subprocess.Popen(command)
     try:
+        # It might take some time to bring up the server, wait for up to 10s.
+        for i in xrange(100):
+            try:
+                requests.get('http://localhost:%d' % port)
+            except requests.ConnectionError:
+                time.sleep(0.1)
+            else:
+                break
         yield
     finally:
         site_server.terminate()
 
 
-if __name__ == '__main__':
+def main():
     args = parser.parse_args()
-    os.chdir(args.directory)
-    server = ReusingTCPServer(("", args.port), RequestHandler)
-    server.serve_forever()
+    resource = File(os.path.abspath(args.directory))
+    site = Site(resource)
+    reactor.listenTCP(args.port, site)
+    reactor.run()
+
+
+if __name__ == '__main__':
+    main()

From 4d072a696530b30ad92139da43c138cb8a4122db Mon Sep 17 00:00:00 2001
From: immerrr <immerrr@gmail.com>
Date: Thu, 5 Mar 2015 11:59:26 +0000
Subject: [PATCH 09/10] benchmark: several minor changes

- add fileserver logs, write them to file (--logfile)
- put bench results into file (--out-file)
- silence requests.packages.urllib3.connectionpool logger
- fix cputime metric for preexisting splash instances
---
 splash/benchmark/benchmark.py   | 36 ++++++++++++++++++++++++---------
 splash/benchmark/file_server.py | 23 ++++++++++++++-------
 splash/qtrender_lua.py          |  2 +-
 3 files changed, 43 insertions(+), 18 deletions(-)

diff --git a/splash/benchmark/benchmark.py b/splash/benchmark/benchmark.py
index d326ce6c8..e99380008 100755
--- a/splash/benchmark/benchmark.py
+++ b/splash/benchmark/benchmark.py
@@ -11,12 +11,13 @@
 import logging
 import os
 import random
-from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
+from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser, FileType
 from glob import glob
 from multiprocessing.pool import ThreadPool
 from pprint import pformat
 from time import time
 import re
+import sys
 
 import requests
 from splash.benchmark.file_server import serve_files
@@ -67,8 +68,9 @@ def make_render_png_lua_req(splash, params):
 PORT = 8806
 #: Combinations of width & height to test.
 WIDTH_HEIGHT = [(None, None), (500, None), (None, 500), (500, 500)]
-#: Splash log filename (set to None to put it to stdout).
+#: Splash & fileserver log filenames (set to None to log to stderr instead).
 SPLASH_LOG = 'splash.log'
+FILESERVER_LOG = 'fileserver.log'
 #: This script is used to collect maxrss & cpu time from splash process.
 GET_PERF_STATS_SCRIPT = """
 function main(splash)
@@ -88,6 +90,8 @@ def make_render_png_lua_req(splash, params):
                     help='Directory with downloaded sites')
 parser.add_argument('--splash-server', metavar='HOST:PORT',
                     help='Use existing Splash instance available at HOST:PORT')
+parser.add_argument('--out-file', type=FileType(mode='w'), default=sys.stdout,
+                    help='Write detailed request information in this file')
 
 
 def generate_requests(splash, args):
@@ -95,9 +99,10 @@ def generate_requests(splash, args):
     log.info("Using pRNG seed: %s", args.seed)
 
     # Static pages (relative to sites_dir) to be used in the benchmark.
-    pages = [re.sub('^%s/' % args.sites_dir, '', v)
-             for v in glob(os.path.join(args.sites_dir, 'localhost_8806',
-                                        '*.html'))]
+    log.info("sites dir: %s", args.sites_dir)
+    sites_found = glob(os.path.join(args.sites_dir, 'localhost_8806', '*.html'))
+    log.info("sites found: %s", sites_found)
+    pages = [re.sub('^%s/' % args.sites_dir.rstrip('/'), '', v) for v in sites_found]
     for p in pages:
         log.info("Using page for benchmark: %s", p)
 
@@ -160,6 +165,7 @@ def __exit__(self, *args):
 def main():
     log = logging.getLogger("benchmark")
     args = parser.parse_args()
+    logging.getLogger('requests.packages.urllib3.connectionpool').setLevel(logging.WARNING)
     logging.basicConfig(level=logging.DEBUG)
 
     if args.splash_server:
@@ -171,18 +177,28 @@ def main():
                         '--disable-xvfb',
                         '--max-timeout=600'])
 
-    with splash, serve_files(port=PORT, directory=args.sites_dir):
+    with splash, serve_files(port=PORT, directory=args.sites_dir, logfile=FILESERVER_LOG):
+        log.info("Servers are up, starting benchmark...")
+        start_res = requests.get(
+            splash.url('execute'),
+            params={'lua_source': GET_PERF_STATS_SCRIPT}).json()
         start_time = time()
         results = parallel_map(invoke_request, generate_requests(splash, args),
                                args.thread_count)
         end_time = time()
-        resources = requests.get(
+        end_res = requests.get(
             splash.url('execute'),
             params={'lua_source': GET_PERF_STATS_SCRIPT}).json()
 
-    log.info("Request stats:\n%s", pformat(dict(enumerate(results))))
-    log.info("Splash max RSS: %s B", resources['maxrss'])
-    log.info("Splash CPU time elapsed: %.2f sec", resources['cputime'])
+    log.info("Writing stats to %s", args.out_file.name)
+    args.out_file.write(pformat({
+                'maxrss': end_res['maxrss'],
+                'cputime': end_res['cputime'] - start_res['cputime'],
+                'walltime': end_time - start_time,
+                'requests': results}))
+    log.info("Splash max RSS: %s B", end_res['maxrss'])
+    log.info("Splash CPU time elapsed: %.2f sec",
+             end_res['cputime'] - start_res['cputime'])
     log.info("Wallclock time elapsed: %.2f sec", end_time - start_time)
 
 
diff --git a/splash/benchmark/file_server.py b/splash/benchmark/file_server.py
index 52d4532ea..5e4cb3acf 100755
--- a/splash/benchmark/file_server.py
+++ b/splash/benchmark/file_server.py
@@ -6,27 +6,35 @@
 import os
 import subprocess
 import time
+import sys
 from contextlib import contextmanager
 
 from twisted.internet import reactor
 from twisted.web.server import Site
 from twisted.web.static import File
+from twisted.python.log import startLogging
 
 import requests
 
 parser = argparse.ArgumentParser("")
-parser.add_argument('--port', type=int)
-parser.add_argument('--directory', help='Directory to be served')
-
+parser.add_argument('--port', type=int, default=8806)
+parser.add_argument('--directory', help='Directory to be served', default='.')
+parser.add_argument('--logfile', default=sys.stderr, type=argparse.FileType(mode='w'), 
+                    help='File to write logs to')
 
 @contextmanager
 def serve_files(port, directory, logfile=None):
     """Serve files from specified directory statically in a subprocess."""
-    command = ['twistd',
-               '-n',    # don't daemonize
-               'web',   # start web component
+    # command = ['twistd',
+    #            '-n',    # don't daemonize
+    #            'web',   # start web component
+    #            '--port', str(int(port)),
+    #            '--path', os.path.abspath(directory), ]
+    # if logfile is not None:
+    #     command += ['--logfile', logfile]
+    command = ['python', __file__,
                '--port', str(int(port)),
-               '--path', os.path.abspath(directory), ]
+               '--directory', os.path.abspath(directory)]
     if logfile is not None:
         command += ['--logfile', logfile]
     site_server = subprocess.Popen(command)
@@ -46,6 +54,7 @@ def serve_files(port, directory, logfile=None):
 
 def main():
     args = parser.parse_args()
+    startLogging(args.logfile)
     resource = File(os.path.abspath(args.directory))
     site = Site(resource)
     reactor.listenTCP(args.port, site)
diff --git a/splash/qtrender_lua.py b/splash/qtrender_lua.py
index 82f6f8fe5..0e12b8503 100644
--- a/splash/qtrender_lua.py
+++ b/splash/qtrender_lua.py
@@ -497,7 +497,7 @@ def get_perf_stats(self):
         rss_mul = 1 if sys.platform == 'darwin' else 1024
         return {'maxrss': rusage.ru_maxrss * rss_mul,
                 'cputime': rusage.ru_utime + rusage.ru_stime,
-                'walltime': time.time()}
+                 'walltime': time.time()}
 
     def get_real_exception(self):
         if self._exceptions:

From f7a43dada156274792047f0053e77a3082743f17 Mon Sep 17 00:00:00 2001
From: immerrr again <immerrr@gmail.com>
Date: Mon, 9 Mar 2015 09:42:21 +0000
Subject: [PATCH 10/10] benchmark: more features & fixes

- add support for preexisting file server instance (--file-server)
- add HTML endpoint benchmarks (--render-type html)
- make --sites-dir required
- dump output in proper JSON
---
 splash/benchmark/benchmark.py      | 106 ++++++++++++++++++++++-------
 splash/benchmark/download_sites.py |   9 ++-
 splash/benchmark/file_server.py    |  71 ++++++++++++-------
 3 files changed, 135 insertions(+), 51 deletions(-)

diff --git a/splash/benchmark/benchmark.py b/splash/benchmark/benchmark.py
index e99380008..ca22db518 100755
--- a/splash/benchmark/benchmark.py
+++ b/splash/benchmark/benchmark.py
@@ -8,6 +8,7 @@
 
 """
 
+import json
 import logging
 import os
 import random
@@ -20,18 +21,16 @@
 import sys
 
 import requests
-from splash.benchmark.file_server import serve_files
-from splash.tests.utils import SplashServer
 
 
 def make_render_png_req(splash, params):
-    """Prepare request for render.png endpoint."""
+    """Make PNG render request via render.png endpoint."""
     return {'url': splash.url('render.png'),
             'params': params}
 
 
 def make_render_json_req(splash, params):
-    """Prepare request for render.json endpoint."""
+    """Make PNG render request via JSON endpoint."""
     json_params = params.copy()
     json_params['png'] = 1
     return {'url': splash.url('render.json'),
@@ -39,7 +38,7 @@ def make_render_json_req(splash, params):
 
 
 def make_render_png_lua_req(splash, params):
-    """Prepare request for execute endpoint."""
+    """Make PNG render request via Lua execute endpoint."""
     lua_params = params.copy()
     lua_params['lua_source'] = """
 function main(splash)
@@ -57,11 +56,51 @@ def make_render_png_lua_req(splash, params):
             'params': lua_params}
 
 
-REQ_FACTORIES = [
-    make_render_png_req,
-    make_render_json_req,
-    make_render_png_lua_req,
-]
+def make_render_html_req(splash, params):
+    """Make HTML render request via render.html endpoint."""
+    return {'url': splash.url('render.html'),
+            'params': params}
+
+
+def make_render_html_json_req(splash, params):
+    """Make HTML render request via JSON endpoint."""
+    json_params = params.copy()
+    json_params['html'] = 1
+    return {'url': splash.url('render.json'),
+            'params': json_params}
+
+
+def make_render_html_lua_req(splash, params):
+    """Make HTML render request via Lua execute endpoint."""
+    lua_params = params.copy()
+    lua_params['lua_source'] = """
+function main(splash)
+  assert(splash:go(splash.args.url))
+  if splash.args.wait then
+    assert(splash:wait(splash.args.wait))
+  end
+  splash:set_result_content_type("text/html; charset=UTF-8")
+  return splash:html{}
+end
+"""
+    return {'url': splash.url('execute'),
+            'params': lua_params}
+
+
+#: Same resource may be rendered by various endpoints with slightly varying
+#: parameter combinations.  Request factories set those combinations up.
+REQ_FACTORIES = {
+    'png': [
+        make_render_png_req,
+        make_render_json_req,
+        make_render_png_lua_req,
+    ],
+    'html': [
+        make_render_html_req,
+        make_render_html_json_req,
+        make_render_html_lua_req,
+    ],
+}
 
 
 #: Port at which static pages will be served.
@@ -86,15 +125,20 @@ def make_render_png_lua_req(splash, params):
                     help='Request thread count')
 parser.add_argument('--request-count', type=int, default=10,
                     help='Benchmark request count')
-parser.add_argument('--sites-dir', type=str, default='sites',
+parser.add_argument('--sites-dir', type=str, default='sites', required=True,
                     help='Directory with downloaded sites')
+parser.add_argument('--file-server', metavar='HOST:PORT',
+                    help='Use existing file server instance available at HOST:PORT')
 parser.add_argument('--splash-server', metavar='HOST:PORT',
                     help='Use existing Splash instance available at HOST:PORT')
 parser.add_argument('--out-file', type=FileType(mode='w'), default=sys.stdout,
                     help='Write detailed request information in this file')
+parser.add_argument('--render-type', choices=('html', 'png'), default='png',
+                    help=('Type of rendering to benchmark'
+                          ' (either "html" or "png")'))
 
 
-def generate_requests(splash, args):
+def generate_requests(splash, file_server, args):
     log = logging.getLogger('generate_requests')
     log.info("Using pRNG seed: %s", args.seed)
 
@@ -106,12 +150,14 @@ def generate_requests(splash, args):
     for p in pages:
         log.info("Using page for benchmark: %s", p)
 
+    request_factories = REQ_FACTORIES[args.render_type]
+
     rng = random.Random(args.seed)
     for i in xrange(args.request_count):
         page = rng.choice(pages)
         width, height = rng.choice(WIDTH_HEIGHT)
-        req_factory = rng.choice(REQ_FACTORIES)
-        url = 'http://localhost:%d/%s' % (PORT, page)
+        req_factory = rng.choice(request_factories)
+        url = file_server.url(page)
         params = {'url': url, 'render_all': 1, 'wait': 0.1,
                   'width': width, 'height': height}
         log.debug("Req factory: %s, params: %s", req_factory, params)
@@ -145,7 +191,7 @@ def invoke_request(invoke_args):
             'height': kwargs['params']['height']}
 
 
-class ExistingSplashWrapper(object):
+class ExistingServerWrapper(object):
     """Wrapper for pre-existing Splash instance."""
     def __init__(self, server):
         self.server = server
@@ -165,25 +211,36 @@ def __exit__(self, *args):
 def main():
     log = logging.getLogger("benchmark")
     args = parser.parse_args()
-    logging.getLogger('requests.packages.urllib3.connectionpool').setLevel(logging.WARNING)
+    (logging.getLogger('requests.packages.urllib3.connectionpool')
+     .setLevel(logging.WARNING))
     logging.basicConfig(level=logging.DEBUG)
 
     if args.splash_server:
-        splash = ExistingSplashWrapper(args.splash_server)
+        splash = ExistingServerWrapper(args.splash_server)
     else:
+        from splash.tests.utils import SplashServer
         splash = SplashServer(
             logfile=SPLASH_LOG,
             extra_args=['--disable-lua-sandbox',
                         '--disable-xvfb',
                         '--max-timeout=600'])
 
-    with splash, serve_files(port=PORT, directory=args.sites_dir, logfile=FILESERVER_LOG):
+    if args.file_server:
+        file_server = ExistingServerWrapper(args.file_server)
+    else:
+        from splash.benchmark.file_server import FileServerSubprocess
+        file_server = FileServerSubprocess(port=PORT,
+                                           path=args.sites_dir,
+                                           logfile=FILESERVER_LOG)
+
+    with splash, file_server:
         log.info("Servers are up, starting benchmark...")
         start_res = requests.get(
             splash.url('execute'),
             params={'lua_source': GET_PERF_STATS_SCRIPT}).json()
         start_time = time()
-        results = parallel_map(invoke_request, generate_requests(splash, args),
+        results = parallel_map(invoke_request,
+                               generate_requests(splash, file_server, args),
                                args.thread_count)
         end_time = time()
         end_res = requests.get(
@@ -191,11 +248,12 @@ def main():
             params={'lua_source': GET_PERF_STATS_SCRIPT}).json()
 
     log.info("Writing stats to %s", args.out_file.name)
-    args.out_file.write(pformat({
-                'maxrss': end_res['maxrss'],
-                'cputime': end_res['cputime'] - start_res['cputime'],
-                'walltime': end_time - start_time,
-                'requests': results}))
+    args.out_file.write(json.dumps(
+        {'maxrss': end_res['maxrss'],
+         'cputime': end_res['cputime'] - start_res['cputime'],
+         'walltime': end_time - start_time,
+         'requests': results},
+        indent=2))
     log.info("Splash max RSS: %s B", end_res['maxrss'])
     log.info("Splash CPU time elapsed: %.2f sec",
              end_res['cputime'] - start_res['cputime'])
diff --git a/splash/benchmark/download_sites.py b/splash/benchmark/download_sites.py
index e50fee33f..07a9577de 100755
--- a/splash/benchmark/download_sites.py
+++ b/splash/benchmark/download_sites.py
@@ -10,12 +10,13 @@
 import os
 import re
 import subprocess
+import logging
 from urlparse import urlsplit
 
 from lxml import html
 
 import w3lib.html
-from splash.benchmark.file_server import serve_files
+from splash.benchmark.file_server import FileServerSubprocess
 from splash.tests.stress import lua_runonce
 
 SCRIPT_HTML = """
@@ -91,6 +92,10 @@ def download_sites(sites_dir, sites):
 
 def main():
     args = parser.parse_args()
+    (logging.getLogger('requests.packages.urllib3.connectionpool')
+     .setLevel(logging.WARNING))
+    logging.basicConfig(level=logging.DEBUG)
+    logging.info("Starting site download suite")
     try:
         os.makedirs(args.sites_dir)
     except OSError as e:
@@ -98,7 +103,7 @@ def main():
             raise
         elif not os.path.isdir(args.sites_dir):
             raise RuntimeError("Not a directory: %s" % args.sites_dir)
-    with serve_files(PORT, args.sites_dir):
+    with FileServerSubprocess(port=PORT, path=args.sites_dir):
         download_sites(args.sites_dir, [
             'http://www.wikipedia.org',
             'http://www.google.com',
diff --git a/splash/benchmark/file_server.py b/splash/benchmark/file_server.py
index 5e4cb3acf..bb0549ce8 100755
--- a/splash/benchmark/file_server.py
+++ b/splash/benchmark/file_server.py
@@ -7,6 +7,7 @@
 import subprocess
 import time
 import sys
+import logging
 from contextlib import contextmanager
 
 from twisted.internet import reactor
@@ -18,44 +19,64 @@
 
 parser = argparse.ArgumentParser("")
 parser.add_argument('--port', type=int, default=8806)
-parser.add_argument('--directory', help='Directory to be served', default='.')
-parser.add_argument('--logfile', default=sys.stderr, type=argparse.FileType(mode='w'), 
+parser.add_argument('--path', help='Path to be served', default='.')
+parser.add_argument('--logfile', default=sys.stderr,
+                    type=argparse.FileType(mode='w'),
                     help='File to write logs to')
 
-@contextmanager
-def serve_files(port, directory, logfile=None):
+
+class FileServerSubprocess(object):
+    logger = logging.getLogger('file_server')
+
     """Serve files from specified directory statically in a subprocess."""
-    # command = ['twistd',
-    #            '-n',    # don't daemonize
-    #            'web',   # start web component
-    #            '--port', str(int(port)),
-    #            '--path', os.path.abspath(directory), ]
-    # if logfile is not None:
-    #     command += ['--logfile', logfile]
-    command = ['python', __file__,
-               '--port', str(int(port)),
-               '--directory', os.path.abspath(directory)]
-    if logfile is not None:
-        command += ['--logfile', logfile]
-    site_server = subprocess.Popen(command)
-    try:
+    def __init__(self, port, path, logfile=None):
+        self.port = port
+        self.path = path
+        self.logfile = logfile
+        self.server = 'http://localhost:%d' % port
+
+    def url(self, endpoint):
+        return self.server + '/' + endpoint
+
+    def __enter__(self):
+        # command = ['twistd',
+        #            '-n',    # don't daemonize
+        #            'web',   # start web component
+        #            '--port', str(int(self.port)),
+        #            '--path', os.path.abspath(self.path), ]
+        # if self.logfile is not None:
+        #     command += ['--logfile', self.logfile]
+        command = ['python', __file__,
+                   '--port', str(int(self.port)),
+                   '--path', os.path.abspath(self.path)]
+        if self.logfile is not None:
+            command += ['--logfile', self.logfile]
+        self.logger.info("Starting file server subprocess: %s", command)
+        self._site_server = subprocess.Popen(command)
         # It might take some time to bring up the server, wait for up to 10s.
         for i in xrange(100):
             try:
-                requests.get('http://localhost:%d' % port)
+                self.logger.info("Checking if file server is active")
+                requests.get(self.url(''))
+                break
             except requests.ConnectionError:
                 time.sleep(0.1)
-            else:
-                break
-        yield
-    finally:
-        site_server.terminate()
+        else:
+            msg = "File server subprocess startup timed out"
+            if self.logfile:
+                with open(self.logfile, 'r') as log_f:
+                    msg += ", logs:\n" + log_f.read()
+            raise RuntimeError(msg)
+
+    def __exit__(self, *args):
+        self._site_server.kill()
+        self._site_server.wait()
 
 
 def main():
     args = parser.parse_args()
     startLogging(args.logfile)
-    resource = File(os.path.abspath(args.directory))
+    resource = File(os.path.abspath(args.path))
     site = Site(resource)
     reactor.listenTCP(args.port, site)
     reactor.run()