From 85b42571e861b7e7fb095083304d2f64ea86e790 Mon Sep 17 00:00:00 2001 From: immerrr Date: Fri, 27 Feb 2015 21:10:27 +0300 Subject: [PATCH 01/10] Initial version of benchmark suite --- splash/benchmark/README.rst | 9 +++ splash/benchmark/benchmark.py | 80 ++++++++++++++++++++++++++ splash/benchmark/download_sites.py | 90 ++++++++++++++++++++++++++++++ splash/benchmark/file_server.py | 31 ++++++++++ 4 files changed, 210 insertions(+) create mode 100644 splash/benchmark/README.rst create mode 100755 splash/benchmark/benchmark.py create mode 100644 splash/benchmark/download_sites.py create mode 100644 splash/benchmark/file_server.py diff --git a/splash/benchmark/README.rst b/splash/benchmark/README.rst new file mode 100644 index 000000000..10e3b3e23 --- /dev/null +++ b/splash/benchmark/README.rst @@ -0,0 +1,9 @@ +This directory contains a preliminary version of splash benchmark suite. + +To use it, do the following: + +- install ``httrack`` +- create a directory for downloaded files, e.g. ``files`` +- run ``python download_sites.py`` in that directory to download sites to be used in the benchmark +- run ``python benchmark.py`` to run the benchmark + diff --git a/splash/benchmark/benchmark.py b/splash/benchmark/benchmark.py new file mode 100755 index 000000000..44201c1c7 --- /dev/null +++ b/splash/benchmark/benchmark.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python + +""" +Splash benchmark script. + +It takes a directory downloaded with splash & httrack, fires up a static file +server and runs a series of requests via splash on those downloaded pages. + +""" + +import logging +import random +from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser +from glob import glob +from multiprocessing.pool import ThreadPool + +import requests +from splash.file_server import serve_files +from splash.tests.utils import SplashServer + +PORT = 8806 +#: URLs to benchmark against. +PAGES = glob('localhost_8806/*.html') +#: Combinations of width & height to test. +WIDTH_HEIGHT = [(None, None), (500, None), (None, 500), (500, 500)] +# XXX: add benchmark of different API endpoints. +SPLASH_LOG = 'splash.log' + +parser = ArgumentParser(description=__doc__, + formatter_class=ArgumentDefaultsHelpFormatter) +parser.add_argument('--seed', type=int, default=1234, help='PRNG seed number') +parser.add_argument('--thread-count', type=int, default=1, + help='Request thread count') +parser.add_argument('--request-count', type=int, default=10, + help='Benchmark request count') + + +def generate_requests(splash, args): + log = logging.getLogger('generate_requests') + log.info("Using pRNG seed: %s", args.seed) + rng = random.Random(args.seed) + for i in xrange(args.request_count): + page = rng.choice(PAGES) + width, height = rng.choice(WIDTH_HEIGHT) + url = 'http://localhost:%d/%s' % (PORT, page) + yield (i + 1, args.request_count, + {'url': splash.url('render.png'), + 'params': {'url': url, 'width': width, 'height': height}}) + + +def parallel_map(func, iterable, thread_count): + if thread_count == 1: + return map(func, iterable) + else: + pool = ThreadPool(thread_count) + return pool.map(func, iterable) + + +def invoke_request(invoke_args): + log = logging.getLogger('bench_worker') + req_no, total_reqs, kwargs = invoke_args + log.info("Initiating request %d/%d: %s", req_no, total_reqs, kwargs) + return requests.get(**kwargs) + + +def main(): + args = parser.parse_args() + logging.basicConfig(level=logging.DEBUG) + + with SplashServer(logfile=SPLASH_LOG, + extra_args=['--disable-lua-sandbox', + '--disable-xvfb', + '--max-timeout=600']) as splash, \ + serve_files(PORT): + parallel_map(invoke_request, generate_requests(splash, args), + args.thread_count) + + +if __name__ == '__main__': + main() diff --git a/splash/benchmark/download_sites.py b/splash/benchmark/download_sites.py new file mode 100644 index 000000000..0d0bf4b0f --- /dev/null +++ b/splash/benchmark/download_sites.py @@ -0,0 +1,90 @@ +from splash.tests.stress import lua_runonce + +import re +from urlparse import urlsplit +import json +from lxml import html +import w3lib.html +import subprocess +from splash.file_server import serve_files + +script_html = """ +function main(splash) +splash:set_images_enabled(false) +splash:go(splash.args.url) +splash:wait(0.5) +return {url=splash:url(), html=splash:html()} +end +""" + +script_png = """ + +function main(splash) +splash:go(splash.args.url) +splash:wait(0.5) +return splash:png() +end +""" + + +USERAGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/534.34 (KHTML, like Gecko) Qt/4.8.1 Safari/534.34" + + +PORT = 8806 + + +def preprocess_main_page(url): + out = json.loads(lua_runonce(script_html, url=url, + splash_args=['--disable-lua-sandbox', + '--disable-xvfb', + '--max-timeout=600'], + timeout=600.,)) + final_url = urlsplit(out['url'])._replace(query='', fragment='').geturl() + if not w3lib.html.get_base_url(out['html']): + out['html'] = w3lib.html.remove_tags_with_content( + out['html'], ('script',)) + root = html.fromstring(out['html'], parser=html.HTMLParser(), + base_url=final_url) + try: + head = root.xpath('./head')[0] + except IndexError: + head = html.Element('head') + root.insert(0, head) + head.insert(0, html.Element('base', {'href': final_url})) + head.insert(0, html.Element('meta', {'charset': 'utf-8'})) + out['html'] = html.tostring(root, encoding='utf-8', + doctype='') + filename = re.sub(r'[^\w]+', '_', url) + '.html' + with open(filename, 'w') as f: + f.write(out['html']) + return filename + + +def download_sites(sites): + local_files = [preprocess_main_page(s) for s in sites] + + local_urls = [ + 'http://localhost:%(port)d/%(filename)s' % { + 'port': PORT, 'filename': filename + } + for filename in local_files + ] + args = ['--continue', + '--near', # Fetch referred non-html files. + '-%P', # Try parsing links in non-href/src sections + '-F', USERAGENT, # Emulate splash UA + '--depth=1'] + subprocess.check_call(['httrack'] + args + local_urls) + + +if __name__ == '__main__': + with serve_files(PORT): + download_sites([ + 'http://www.wikipedia.org', + 'http://www.google.com', + 'http://www.reddit.com', + "http://w3.org", + "http://w3.org/TR/2010/REC-xhtml-basic-20101123/", + # "http://blog.pinterest.com", + # "http://imgur.com", + ]) diff --git a/splash/benchmark/file_server.py b/splash/benchmark/file_server.py new file mode 100644 index 000000000..77e2b7084 --- /dev/null +++ b/splash/benchmark/file_server.py @@ -0,0 +1,31 @@ +import SimpleHTTPServer +import SocketServer +import subprocess +import sys +from contextlib import contextmanager + + +class ReusingTCPServer(SocketServer.TCPServer): + allow_reuse_address = True + + +class RequestHandler(SimpleHTTPServer.SimpleHTTPRequestHandler): + def address_string(self): + return "fileserver" + + +@contextmanager +def serve_files(port): + """Serve files from current directory statically in a subprocess.""" + site_server = subprocess.Popen(['python', '-m', __name__, + str(port)]) + try: + yield + finally: + site_server.terminate() + + +if __name__ == '__main__': + port = int(sys.argv[1]) + server = ReusingTCPServer(("", port), RequestHandler) + server.serve_forever() From b1e0baeab54ac198ae80af5afb8bba532ed4dd39 Mon Sep 17 00:00:00 2001 From: immerrr Date: Fri, 27 Feb 2015 22:50:46 +0300 Subject: [PATCH 02/10] Fix & organize imports --- splash/benchmark/__init__.py | 0 splash/benchmark/benchmark.py | 2 +- splash/benchmark/download_sites.py | 11 ++++++----- 3 files changed, 7 insertions(+), 6 deletions(-) create mode 100644 splash/benchmark/__init__.py diff --git a/splash/benchmark/__init__.py b/splash/benchmark/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/splash/benchmark/benchmark.py b/splash/benchmark/benchmark.py index 44201c1c7..25fac405e 100755 --- a/splash/benchmark/benchmark.py +++ b/splash/benchmark/benchmark.py @@ -15,7 +15,7 @@ from multiprocessing.pool import ThreadPool import requests -from splash.file_server import serve_files +from splash.benchmark.file_server import serve_files from splash.tests.utils import SplashServer PORT = 8806 diff --git a/splash/benchmark/download_sites.py b/splash/benchmark/download_sites.py index 0d0bf4b0f..abe921eee 100644 --- a/splash/benchmark/download_sites.py +++ b/splash/benchmark/download_sites.py @@ -1,12 +1,13 @@ -from splash.tests.stress import lua_runonce - +import json import re +import subprocess from urlparse import urlsplit -import json + from lxml import html + import w3lib.html -import subprocess -from splash.file_server import serve_files +from splash.benchmark.file_server import serve_files +from splash.tests.stress import lua_runonce script_html = """ function main(splash) From 9b418b775609162d0d8c399075316a0e8272b4ad Mon Sep 17 00:00:00 2001 From: immerrr Date: Sat, 28 Feb 2015 19:20:20 +0300 Subject: [PATCH 03/10] benchmark: print some metrics --- splash/benchmark/benchmark.py | 51 ++++++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 9 deletions(-) diff --git a/splash/benchmark/benchmark.py b/splash/benchmark/benchmark.py index 25fac405e..da4d8bf2e 100755 --- a/splash/benchmark/benchmark.py +++ b/splash/benchmark/benchmark.py @@ -9,22 +9,33 @@ """ import logging +import os import random +import shutil from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser from glob import glob from multiprocessing.pool import ThreadPool +from pprint import pformat +from time import time import requests from splash.benchmark.file_server import serve_files from splash.tests.utils import SplashServer +#: Port at which static pages will be served. PORT = 8806 -#: URLs to benchmark against. +#: Static pages to be used in the benchmark. PAGES = glob('localhost_8806/*.html') #: Combinations of width & height to test. WIDTH_HEIGHT = [(None, None), (500, None), (None, 500), (500, 500)] # XXX: add benchmark of different API endpoints. SPLASH_LOG = 'splash.log' +#: This script is used to collect maxrss & cpu time from splash process. +GET_PERF_STATS_SCRIPT = """ +function main(splash) + return splash:get_perf_stats() +end +""" parser = ArgumentParser(description=__doc__, formatter_class=ArgumentDefaultsHelpFormatter) @@ -60,20 +71,42 @@ def invoke_request(invoke_args): log = logging.getLogger('bench_worker') req_no, total_reqs, kwargs = invoke_args log.info("Initiating request %d/%d: %s", req_no, total_reqs, kwargs) - return requests.get(**kwargs) + stime = time() + requests.get(**kwargs) + etime = time() + return {'start_time': stime, + 'end_time': etime, + 'duration': etime - stime, + 'endpoint': kwargs['url'], + 'site': kwargs['params']['url'], + 'width': kwargs['params']['width'], + 'height': kwargs['params']['height']} def main(): + log = logging.getLogger("benchmark") args = parser.parse_args() logging.basicConfig(level=logging.DEBUG) - with SplashServer(logfile=SPLASH_LOG, - extra_args=['--disable-lua-sandbox', - '--disable-xvfb', - '--max-timeout=600']) as splash, \ - serve_files(PORT): - parallel_map(invoke_request, generate_requests(splash, args), - args.thread_count) + splash = SplashServer( + logfile=SPLASH_LOG, + extra_args=['--disable-lua-sandbox', + '--disable-xvfb', + '--max-timeout=600']) + + with splash, serve_files(PORT): + start_time = time() + results = parallel_map(invoke_request, generate_requests(splash, args), + args.thread_count) + end_time = time() + resources = requests.get( + splash.url('execute'), + params={'lua_source': GET_PERF_STATS_SCRIPT}).json() + + log.info("Request stats:\n%s", pformat(dict(enumerate(results)))) + log.info("Splash max RSS: %s B", resources['maxrss']) + log.info("Splash CPU time elapsed: %.2f sec", resources['cputime']) + log.info("Wallclock time elapsed: %.2f sec", end_time - start_time) if __name__ == '__main__': From 871d07dc5f7b6587f1b11998db54df978f5975cd Mon Sep 17 00:00:00 2001 From: immerrr Date: Mon, 2 Mar 2015 16:47:08 +0300 Subject: [PATCH 04/10] benchmark: add different endpoints (png, json, lua-png) --- splash/benchmark/benchmark.py | 56 +++++++++++++++++++++++++++++++---- 1 file changed, 51 insertions(+), 5 deletions(-) diff --git a/splash/benchmark/benchmark.py b/splash/benchmark/benchmark.py index da4d8bf2e..b9c501713 100755 --- a/splash/benchmark/benchmark.py +++ b/splash/benchmark/benchmark.py @@ -22,13 +22,53 @@ from splash.benchmark.file_server import serve_files from splash.tests.utils import SplashServer + +def make_render_png_req(splash, params): + """Prepare request for render.png endpoint.""" + return {'url': splash.url('render.png'), + 'params': params} + + +def make_render_json_req(splash, params): + """Prepare request for render.json endpoint.""" + json_params = params.copy() + json_params['png'] = 1 + return {'url': splash.url('render.json'), + 'params': json_params} + + +def make_render_png_lua_req(splash, params): + """Prepare request for execute endpoint.""" + lua_params = params.copy() + lua_params['lua_source'] = """ +function main(splash) + assert(splash:go(splash.args.url)) + if splash.args.wait then + assert(splash:wait(splash.args.wait)) + end + splash:set_result_content_type("image/png") + return splash:png{width=splash.args.width, + height=splash.args.height, + render_all=splash.args.render_all} +end +""" + return {'url': splash.url('execute'), + 'params': lua_params} + + +REQ_FACTORIES = [ + make_render_png_req, + make_render_json_req, + make_render_png_lua_req, +] + + #: Port at which static pages will be served. PORT = 8806 #: Static pages to be used in the benchmark. PAGES = glob('localhost_8806/*.html') #: Combinations of width & height to test. WIDTH_HEIGHT = [(None, None), (500, None), (None, 500), (500, 500)] -# XXX: add benchmark of different API endpoints. SPLASH_LOG = 'splash.log' #: This script is used to collect maxrss & cpu time from splash process. GET_PERF_STATS_SCRIPT = """ @@ -37,6 +77,7 @@ end """ + parser = ArgumentParser(description=__doc__, formatter_class=ArgumentDefaultsHelpFormatter) parser.add_argument('--seed', type=int, default=1234, help='PRNG seed number') @@ -53,10 +94,12 @@ def generate_requests(splash, args): for i in xrange(args.request_count): page = rng.choice(PAGES) width, height = rng.choice(WIDTH_HEIGHT) + req_factory = rng.choice(REQ_FACTORIES) url = 'http://localhost:%d/%s' % (PORT, page) - yield (i + 1, args.request_count, - {'url': splash.url('render.png'), - 'params': {'url': url, 'width': width, 'height': height}}) + params = {'url': url, 'render_all': 1, 'wait': 0.1, + 'width': width, 'height': height} + log.debug("Req factory: %s, params: %s", req_factory, params) + yield (i + 1, args.request_count, req_factory(splash, params)) def parallel_map(func, iterable, thread_count): @@ -72,12 +115,15 @@ def invoke_request(invoke_args): req_no, total_reqs, kwargs = invoke_args log.info("Initiating request %d/%d: %s", req_no, total_reqs, kwargs) stime = time() - requests.get(**kwargs) + response = requests.get(**kwargs) etime = time() + if response.status_code != 200: + log.error("Non-OK response:\n%s", response.text) return {'start_time': stime, 'end_time': etime, 'duration': etime - stime, 'endpoint': kwargs['url'], + 'status': response.status_code, 'site': kwargs['params']['url'], 'width': kwargs['params']['width'], 'height': kwargs['params']['height']} From 938e04b433d319f0ac43a74b34cf68173bcc70a3 Mon Sep 17 00:00:00 2001 From: immerrr Date: Mon, 2 Mar 2015 20:06:17 +0300 Subject: [PATCH 05/10] benchmark: put downloaded sites into a configurable subdir --- splash/benchmark/README.rst | 4 +- splash/benchmark/benchmark.py | 18 +++++++-- splash/benchmark/download_sites.py | 62 +++++++++++++++++++----------- splash/benchmark/file_server.py | 23 ++++++++--- 4 files changed, 73 insertions(+), 34 deletions(-) mode change 100644 => 100755 splash/benchmark/download_sites.py mode change 100644 => 100755 splash/benchmark/file_server.py diff --git a/splash/benchmark/README.rst b/splash/benchmark/README.rst index 10e3b3e23..39a70688e 100644 --- a/splash/benchmark/README.rst +++ b/splash/benchmark/README.rst @@ -3,7 +3,7 @@ This directory contains a preliminary version of splash benchmark suite. To use it, do the following: - install ``httrack`` -- create a directory for downloaded files, e.g. ``files`` -- run ``python download_sites.py`` in that directory to download sites to be used in the benchmark +- run ``python download_sites.py``, it will create ``sites`` subdirectory in + current directory and download sites to be used in the benchmark there - run ``python benchmark.py`` to run the benchmark diff --git a/splash/benchmark/benchmark.py b/splash/benchmark/benchmark.py index b9c501713..3a62e11bf 100755 --- a/splash/benchmark/benchmark.py +++ b/splash/benchmark/benchmark.py @@ -17,6 +17,7 @@ from multiprocessing.pool import ThreadPool from pprint import pformat from time import time +import re import requests from splash.benchmark.file_server import serve_files @@ -65,10 +66,9 @@ def make_render_png_lua_req(splash, params): #: Port at which static pages will be served. PORT = 8806 -#: Static pages to be used in the benchmark. -PAGES = glob('localhost_8806/*.html') #: Combinations of width & height to test. WIDTH_HEIGHT = [(None, None), (500, None), (None, 500), (500, 500)] +#: Splash log filename. SPLASH_LOG = 'splash.log' #: This script is used to collect maxrss & cpu time from splash process. GET_PERF_STATS_SCRIPT = """ @@ -85,14 +85,24 @@ def make_render_png_lua_req(splash, params): help='Request thread count') parser.add_argument('--request-count', type=int, default=10, help='Benchmark request count') +parser.add_argument('--sites-dir', type=str, default='sites', + help='Directory with downloaded sites') def generate_requests(splash, args): log = logging.getLogger('generate_requests') log.info("Using pRNG seed: %s", args.seed) + + # Static pages (relative to sites_dir) to be used in the benchmark. + pages = [re.sub('^%s/' % args.sites_dir, '', v) + for v in glob(os.path.join(args.sites_dir, 'localhost_8806', + '*.html'))] + for p in pages: + log.info("Using page for benchmark: %s", p) + rng = random.Random(args.seed) for i in xrange(args.request_count): - page = rng.choice(PAGES) + page = rng.choice(pages) width, height = rng.choice(WIDTH_HEIGHT) req_factory = rng.choice(REQ_FACTORIES) url = 'http://localhost:%d/%s' % (PORT, page) @@ -140,7 +150,7 @@ def main(): '--disable-xvfb', '--max-timeout=600']) - with splash, serve_files(PORT): + with splash, serve_files(PORT, args.sites_dir): start_time = time() results = parallel_map(invoke_request, generate_requests(splash, args), args.thread_count) diff --git a/splash/benchmark/download_sites.py b/splash/benchmark/download_sites.py old mode 100644 new mode 100755 index abe921eee..482469bab --- a/splash/benchmark/download_sites.py +++ b/splash/benchmark/download_sites.py @@ -1,4 +1,13 @@ +#!/usr/bin/env python + +""" +Site downloader script for Splash benchmark suite. +""" + +from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter +import errno import json +import os import re import subprocess from urlparse import urlsplit @@ -9,7 +18,7 @@ from splash.benchmark.file_server import serve_files from splash.tests.stress import lua_runonce -script_html = """ +SCRIPT_HTML = """ function main(splash) splash:set_images_enabled(false) splash:go(splash.args.url) @@ -18,24 +27,19 @@ end """ -script_png = """ - -function main(splash) -splash:go(splash.args.url) -splash:wait(0.5) -return splash:png() -end -""" - - +#: This UA is used by httrack to mimic Splash requests when downloading sites. USERAGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/534.34 (KHTML, like Gecko) Qt/4.8.1 Safari/534.34" - PORT = 8806 +parser = ArgumentParser(description=__doc__, + formatter_class=ArgumentDefaultsHelpFormatter) +parser.add_argument('--sites-dir', default='sites', + help='Directory for downloaded sites') -def preprocess_main_page(url): - out = json.loads(lua_runonce(script_html, url=url, + +def preprocess_main_page(sites_dir, url): + out = json.loads(lua_runonce(SCRIPT_HTML, url=url, splash_args=['--disable-lua-sandbox', '--disable-xvfb', '--max-timeout=600'], @@ -56,13 +60,13 @@ def preprocess_main_page(url): out['html'] = html.tostring(root, encoding='utf-8', doctype='') filename = re.sub(r'[^\w]+', '_', url) + '.html' - with open(filename, 'w') as f: + with open(os.path.join(sites_dir, filename), 'w') as f: f.write(out['html']) return filename -def download_sites(sites): - local_files = [preprocess_main_page(s) for s in sites] +def download_sites(sites_dir, sites): + local_files = [preprocess_main_page(sites_dir, s) for s in sites] local_urls = [ 'http://localhost:%(port)d/%(filename)s' % { @@ -75,12 +79,20 @@ def download_sites(sites): '-%P', # Try parsing links in non-href/src sections '-F', USERAGENT, # Emulate splash UA '--depth=1'] - subprocess.check_call(['httrack'] + args + local_urls) - - -if __name__ == '__main__': - with serve_files(PORT): - download_sites([ + subprocess.check_call(['httrack'] + args + local_urls, cwd=sites_dir) + + +def main(): + args = parser.parse_args() + try: + os.makedirs(args.sites_dir) + except OSError as e: + if e.errno != errno.EEXIST: + raise + elif not os.path.isdir(args.sites_dir): + raise RuntimeError("Not a directory: %s" % args.sites_dir) + with serve_files(PORT, args.sites_dir): + download_sites(args.sites_dir, [ 'http://www.wikipedia.org', 'http://www.google.com', 'http://www.reddit.com', @@ -89,3 +101,7 @@ def download_sites(sites): # "http://blog.pinterest.com", # "http://imgur.com", ]) + + +if __name__ == '__main__': + main() diff --git a/splash/benchmark/file_server.py b/splash/benchmark/file_server.py old mode 100644 new mode 100755 index 77e2b7084..2931f41ae --- a/splash/benchmark/file_server.py +++ b/splash/benchmark/file_server.py @@ -1,10 +1,22 @@ +#!/usr/bin/env python + +""" +Simple static file server. +""" + +import argparse +import os import SimpleHTTPServer import SocketServer import subprocess -import sys from contextlib import contextmanager +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument('port', type=int, help='Port number to listen at') +parser.add_argument('directory', type=str, help='Directory to serve') + + class ReusingTCPServer(SocketServer.TCPServer): allow_reuse_address = True @@ -15,10 +27,10 @@ def address_string(self): @contextmanager -def serve_files(port): +def serve_files(port, directory): """Serve files from current directory statically in a subprocess.""" site_server = subprocess.Popen(['python', '-m', __name__, - str(port)]) + str(port), directory]) try: yield finally: @@ -26,6 +38,7 @@ def serve_files(port): if __name__ == '__main__': - port = int(sys.argv[1]) - server = ReusingTCPServer(("", port), RequestHandler) + args = parser.parse_args() + os.chdir(args.directory) + server = ReusingTCPServer(("", args.port), RequestHandler) server.serve_forever() From b705bcfed8956e8d46daed3b21681f36ab809cdf Mon Sep 17 00:00:00 2001 From: immerrr Date: Mon, 2 Mar 2015 20:45:32 +0300 Subject: [PATCH 06/10] benchmark: ignore benchmark files for pytest purposes --- splash/conftest.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/splash/conftest.py b/splash/conftest.py index 2a20cc38a..f93ed2990 100644 --- a/splash/conftest.py +++ b/splash/conftest.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import from splash import lua +import glob collect_ignore = [] @@ -15,3 +16,8 @@ 'kernel/__main__.py', 'kernel/__init__.py', ] + +collect_ignore.extend([ + 'benchmark/download_sites.py', + 'benchmark/file_server.py', + 'benchmark/benchmark.py']) From f8b201b65952ec4a3d2727ace601fd683e5cb7f4 Mon Sep 17 00:00:00 2001 From: immerrr Date: Mon, 2 Mar 2015 21:30:51 +0300 Subject: [PATCH 07/10] benchmark: several post-review improvements - download_sites: fix encoding unconditionally if it is missing - download_sites: add base/href only if it is missing - download_sites: remove scripts unconditionally - benchmark: specify pre-existing splash instance with --splash-server HOST:PORT --- splash/benchmark/benchmark.py | 32 +++++++++++++++++++++++++----- splash/benchmark/download_sites.py | 31 ++++++++++++++++++----------- 2 files changed, 46 insertions(+), 17 deletions(-) diff --git a/splash/benchmark/benchmark.py b/splash/benchmark/benchmark.py index 3a62e11bf..2ba766fc7 100755 --- a/splash/benchmark/benchmark.py +++ b/splash/benchmark/benchmark.py @@ -87,6 +87,8 @@ def make_render_png_lua_req(splash, params): help='Benchmark request count') parser.add_argument('--sites-dir', type=str, default='sites', help='Directory with downloaded sites') +parser.add_argument('--splash-server', metavar='HOST:PORT', + help='Use existing Splash instance available at HOST:PORT') def generate_requests(splash, args): @@ -139,16 +141,36 @@ def invoke_request(invoke_args): 'height': kwargs['params']['height']} +class ExistingSplashWrapper(object): + """Wrapper for pre-existing Splash instance.""" + def __init__(self, server): + self.server = server + if not self.server.startswith('http://'): + self.server = 'http://' + self.server + + def url(self, endpoint): + return self.server + '/' + endpoint + + def __enter__(self): + return self + + def __exit__(self, *args): + pass + + def main(): log = logging.getLogger("benchmark") args = parser.parse_args() logging.basicConfig(level=logging.DEBUG) - splash = SplashServer( - logfile=SPLASH_LOG, - extra_args=['--disable-lua-sandbox', - '--disable-xvfb', - '--max-timeout=600']) + if args.splash_server: + splash = ExistingSplashWrapper(args.splash_server) + else: + splash = SplashServer( + logfile=SPLASH_LOG, + extra_args=['--disable-lua-sandbox', + '--disable-xvfb', + '--max-timeout=600']) with splash, serve_files(PORT, args.sites_dir): start_time = time() diff --git a/splash/benchmark/download_sites.py b/splash/benchmark/download_sites.py index 482469bab..e50fee33f 100755 --- a/splash/benchmark/download_sites.py +++ b/splash/benchmark/download_sites.py @@ -39,26 +39,33 @@ def preprocess_main_page(sites_dir, url): + """ + This function does several things: + - strip javascript so that downloaded pages look exactly the same + - add baseurl to resolve relative links properly (if it is missing) + - add meta charset description (if it is missing) + """ out = json.loads(lua_runonce(SCRIPT_HTML, url=url, splash_args=['--disable-lua-sandbox', '--disable-xvfb', '--max-timeout=600'], timeout=600.,)) final_url = urlsplit(out['url'])._replace(query='', fragment='').geturl() - if not w3lib.html.get_base_url(out['html']): - out['html'] = w3lib.html.remove_tags_with_content( - out['html'], ('script',)) - root = html.fromstring(out['html'], parser=html.HTMLParser(), - base_url=final_url) - try: - head = root.xpath('./head')[0] - except IndexError: - head = html.Element('head') - root.insert(0, head) + # Ensure there are no scripts to be executed. + out['html'] = w3lib.html.remove_tags_with_content(out['html'], ('script',)) + root = html.fromstring(out['html'], parser=html.HTMLParser(), + base_url=final_url) + try: + head = root.xpath('./head')[0] + except IndexError: + head = html.Element('head') + root.insert(0, head) + if not head.xpath('./base/@href'): head.insert(0, html.Element('base', {'href': final_url})) + if not head.xpath('./meta/@charset'): head.insert(0, html.Element('meta', {'charset': 'utf-8'})) - out['html'] = html.tostring(root, encoding='utf-8', - doctype='') + out['html'] = html.tostring(root, encoding='utf-8', + doctype='') filename = re.sub(r'[^\w]+', '_', url) + '.html' with open(os.path.join(sites_dir, filename), 'w') as f: f.write(out['html']) From 9371c2335550e5b7115f4562261a550dcf1a8ab3 Mon Sep 17 00:00:00 2001 From: immerrr Date: Mon, 2 Mar 2015 22:04:29 +0300 Subject: [PATCH 08/10] benchmark: serve files via twisted.web.static --- splash/benchmark/benchmark.py | 5 ++- splash/benchmark/file_server.py | 60 ++++++++++++++++++++------------- 2 files changed, 38 insertions(+), 27 deletions(-) diff --git a/splash/benchmark/benchmark.py b/splash/benchmark/benchmark.py index 2ba766fc7..d326ce6c8 100755 --- a/splash/benchmark/benchmark.py +++ b/splash/benchmark/benchmark.py @@ -11,7 +11,6 @@ import logging import os import random -import shutil from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser from glob import glob from multiprocessing.pool import ThreadPool @@ -68,7 +67,7 @@ def make_render_png_lua_req(splash, params): PORT = 8806 #: Combinations of width & height to test. WIDTH_HEIGHT = [(None, None), (500, None), (None, 500), (500, 500)] -#: Splash log filename. +#: Splash log filename (set to None to put it to stdout). SPLASH_LOG = 'splash.log' #: This script is used to collect maxrss & cpu time from splash process. GET_PERF_STATS_SCRIPT = """ @@ -172,7 +171,7 @@ def main(): '--disable-xvfb', '--max-timeout=600']) - with splash, serve_files(PORT, args.sites_dir): + with splash, serve_files(port=PORT, directory=args.sites_dir): start_time = time() results = parallel_map(invoke_request, generate_requests(splash, args), args.thread_count) diff --git a/splash/benchmark/file_server.py b/splash/benchmark/file_server.py index 2931f41ae..52d4532ea 100755 --- a/splash/benchmark/file_server.py +++ b/splash/benchmark/file_server.py @@ -1,44 +1,56 @@ #!/usr/bin/env python -""" -Simple static file server. -""" +"""Simple static file server.""" import argparse import os -import SimpleHTTPServer -import SocketServer import subprocess +import time from contextlib import contextmanager +from twisted.internet import reactor +from twisted.web.server import Site +from twisted.web.static import File -parser = argparse.ArgumentParser(description=__doc__) -parser.add_argument('port', type=int, help='Port number to listen at') -parser.add_argument('directory', type=str, help='Directory to serve') +import requests - -class ReusingTCPServer(SocketServer.TCPServer): - allow_reuse_address = True - - -class RequestHandler(SimpleHTTPServer.SimpleHTTPRequestHandler): - def address_string(self): - return "fileserver" +parser = argparse.ArgumentParser("") +parser.add_argument('--port', type=int) +parser.add_argument('--directory', help='Directory to be served') @contextmanager -def serve_files(port, directory): - """Serve files from current directory statically in a subprocess.""" - site_server = subprocess.Popen(['python', '-m', __name__, - str(port), directory]) +def serve_files(port, directory, logfile=None): + """Serve files from specified directory statically in a subprocess.""" + command = ['twistd', + '-n', # don't daemonize + 'web', # start web component + '--port', str(int(port)), + '--path', os.path.abspath(directory), ] + if logfile is not None: + command += ['--logfile', logfile] + site_server = subprocess.Popen(command) try: + # It might take some time to bring up the server, wait for up to 10s. + for i in xrange(100): + try: + requests.get('http://localhost:%d' % port) + except requests.ConnectionError: + time.sleep(0.1) + else: + break yield finally: site_server.terminate() -if __name__ == '__main__': +def main(): args = parser.parse_args() - os.chdir(args.directory) - server = ReusingTCPServer(("", args.port), RequestHandler) - server.serve_forever() + resource = File(os.path.abspath(args.directory)) + site = Site(resource) + reactor.listenTCP(args.port, site) + reactor.run() + + +if __name__ == '__main__': + main() From 4d072a696530b30ad92139da43c138cb8a4122db Mon Sep 17 00:00:00 2001 From: immerrr Date: Thu, 5 Mar 2015 11:59:26 +0000 Subject: [PATCH 09/10] benchmark: several minor changes - add fileserver logs, write them to file (--logfile) - put bench results into file (--out-file) - silence requests.packages.urllib3.connectionpool logger - fix cputime metric for preexisting splash instances --- splash/benchmark/benchmark.py | 36 ++++++++++++++++++++++++--------- splash/benchmark/file_server.py | 23 ++++++++++++++------- splash/qtrender_lua.py | 2 +- 3 files changed, 43 insertions(+), 18 deletions(-) diff --git a/splash/benchmark/benchmark.py b/splash/benchmark/benchmark.py index d326ce6c8..e99380008 100755 --- a/splash/benchmark/benchmark.py +++ b/splash/benchmark/benchmark.py @@ -11,12 +11,13 @@ import logging import os import random -from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser +from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser, FileType from glob import glob from multiprocessing.pool import ThreadPool from pprint import pformat from time import time import re +import sys import requests from splash.benchmark.file_server import serve_files @@ -67,8 +68,9 @@ def make_render_png_lua_req(splash, params): PORT = 8806 #: Combinations of width & height to test. WIDTH_HEIGHT = [(None, None), (500, None), (None, 500), (500, 500)] -#: Splash log filename (set to None to put it to stdout). +#: Splash & fileserver log filenames (set to None to put it to stderr). SPLASH_LOG = 'splash.log' +FILESERVER_LOG = 'fileserver.log' #: This script is used to collect maxrss & cpu time from splash process. GET_PERF_STATS_SCRIPT = """ function main(splash) @@ -88,6 +90,8 @@ def make_render_png_lua_req(splash, params): help='Directory with downloaded sites') parser.add_argument('--splash-server', metavar='HOST:PORT', help='Use existing Splash instance available at HOST:PORT') +parser.add_argument('--out-file', type=FileType(mode='w'), default=sys.stdout, + help='Write detailed request information in this file') def generate_requests(splash, args): @@ -95,9 +99,10 @@ def generate_requests(splash, args): log.info("Using pRNG seed: %s", args.seed) # Static pages (relative to sites_dir) to be used in the benchmark. - pages = [re.sub('^%s/' % args.sites_dir, '', v) - for v in glob(os.path.join(args.sites_dir, 'localhost_8806', - '*.html'))] + log.info("sites dir: %s", args.sites_dir) + sites_found = glob(os.path.join(args.sites_dir, 'localhost_8806', '*.html')) + log.info("sites found: %s", sites_found) + pages = [re.sub('^%s/' % args.sites_dir.rstrip('/'), '', v) for v in sites_found] for p in pages: log.info("Using page for benchmark: %s", p) @@ -160,6 +165,7 @@ def __exit__(self, *args): def main(): log = logging.getLogger("benchmark") args = parser.parse_args() + logging.getLogger('requests.packages.urllib3.connectionpool').setLevel(logging.WARNING) logging.basicConfig(level=logging.DEBUG) if args.splash_server: @@ -171,18 +177,28 @@ def main(): '--disable-xvfb', '--max-timeout=600']) - with splash, serve_files(port=PORT, directory=args.sites_dir): + with splash, serve_files(port=PORT, directory=args.sites_dir, logfile=FILESERVER_LOG): + log.info("Servers are up, starting benchmark...") + start_res = requests.get( + splash.url('execute'), + params={'lua_source': GET_PERF_STATS_SCRIPT}).json() start_time = time() results = parallel_map(invoke_request, generate_requests(splash, args), args.thread_count) end_time = time() - resources = requests.get( + end_res = requests.get( splash.url('execute'), params={'lua_source': GET_PERF_STATS_SCRIPT}).json() - log.info("Request stats:\n%s", pformat(dict(enumerate(results)))) - log.info("Splash max RSS: %s B", resources['maxrss']) - log.info("Splash CPU time elapsed: %.2f sec", resources['cputime']) + log.info("Writing stats to %s", args.out_file.name) + args.out_file.write(pformat({ + 'maxrss': end_res['maxrss'], + 'cputime': end_res['cputime'] - start_res['cputime'], + 'walltime': end_time - start_time, + 'requests': results})) + log.info("Splash max RSS: %s B", end_res['maxrss']) + log.info("Splash CPU time elapsed: %.2f sec", + end_res['cputime'] - start_res['cputime']) log.info("Wallclock time elapsed: %.2f sec", end_time - start_time) diff --git a/splash/benchmark/file_server.py b/splash/benchmark/file_server.py index 52d4532ea..5e4cb3acf 100755 --- a/splash/benchmark/file_server.py +++ b/splash/benchmark/file_server.py @@ -6,27 +6,35 @@ import os import subprocess import time +import sys from contextlib import contextmanager from twisted.internet import reactor from twisted.web.server import Site from twisted.web.static import File +from twisted.python.log import startLogging import requests parser = argparse.ArgumentParser("") -parser.add_argument('--port', type=int) -parser.add_argument('--directory', help='Directory to be served') - +parser.add_argument('--port', type=int, default=8806) +parser.add_argument('--directory', help='Directory to be served', default='.') +parser.add_argument('--logfile', default=sys.stderr, type=argparse.FileType(mode='w'), + help='File to write logs to') @contextmanager def serve_files(port, directory, logfile=None): """Serve files from specified directory statically in a subprocess.""" - command = ['twistd', - '-n', # don't daemonize - 'web', # start web component + # command = ['twistd', + # '-n', # don't daemonize + # 'web', # start web component + # '--port', str(int(port)), + # '--path', os.path.abspath(directory), ] + # if logfile is not None: + # command += ['--logfile', logfile] + command = ['python', __file__, '--port', str(int(port)), - '--path', os.path.abspath(directory), ] + '--directory', os.path.abspath(directory)] if logfile is not None: command += ['--logfile', logfile] site_server = subprocess.Popen(command) @@ -46,6 +54,7 @@ def serve_files(port, directory, logfile=None): def main(): args = parser.parse_args() + startLogging(args.logfile) resource = File(os.path.abspath(args.directory)) site = Site(resource) reactor.listenTCP(args.port, site) diff --git a/splash/qtrender_lua.py b/splash/qtrender_lua.py index 82f6f8fe5..0e12b8503 100644 --- a/splash/qtrender_lua.py +++ b/splash/qtrender_lua.py @@ -497,7 +497,7 @@ def get_perf_stats(self): rss_mul = 1 if sys.platform == 'darwin' else 1024 return {'maxrss': rusage.ru_maxrss * rss_mul, 'cputime': rusage.ru_utime + rusage.ru_stime, - 'walltime': time.time()} + 'walltime': time.time()} def get_real_exception(self): if self._exceptions: From f7a43dada156274792047f0053e77a3082743f17 Mon Sep 17 00:00:00 2001 From: immerrr again Date: Mon, 9 Mar 2015 09:42:21 +0000 Subject: [PATCH 10/10] benchmark: more features & fixes - add support for preexisting file server instance (--fileserver) - add HTML endpoint benchmarks (--render-type html) - make --sites-dir required - dump output in proper JSON --- splash/benchmark/benchmark.py | 106 ++++++++++++++++++++++------- splash/benchmark/download_sites.py | 9 ++- splash/benchmark/file_server.py | 71 ++++++++++++------- 3 files changed, 135 insertions(+), 51 deletions(-) diff --git a/splash/benchmark/benchmark.py b/splash/benchmark/benchmark.py index e99380008..ca22db518 100755 --- a/splash/benchmark/benchmark.py +++ b/splash/benchmark/benchmark.py @@ -8,6 +8,7 @@ """ +import json import logging import os import random @@ -20,18 +21,16 @@ import sys import requests -from splash.benchmark.file_server import serve_files -from splash.tests.utils import SplashServer def make_render_png_req(splash, params): - """Prepare request for render.png endpoint.""" + """Make PNG render request via render.png endpoint.""" return {'url': splash.url('render.png'), 'params': params} def make_render_json_req(splash, params): - """Prepare request for render.json endpoint.""" + """Make PNG render request via JSON endpoint.""" json_params = params.copy() json_params['png'] = 1 return {'url': splash.url('render.json'), @@ -39,7 +38,7 @@ def make_render_json_req(splash, params): def make_render_png_lua_req(splash, params): - """Prepare request for execute endpoint.""" + """Make PNG render request via Lua execute endpoint.""" lua_params = params.copy() lua_params['lua_source'] = """ function main(splash) @@ -57,11 +56,51 @@ def make_render_png_lua_req(splash, params): 'params': lua_params} -REQ_FACTORIES = [ - make_render_png_req, - make_render_json_req, - make_render_png_lua_req, -] +def make_render_html_req(splash, params): + """Make HTML render request via render.html endpoint.""" + return {'url': splash.url('render.html'), + 'params': params} + + +def make_render_html_json_req(splash, params): + """Make HTML render request via JSON endpoint.""" + json_params = params.copy() + json_params['html'] = 1 + return {'url': splash.url('render.json'), + 'params': json_params} + + +def make_render_html_lua_req(splash, params): + """Make HTML render request via Lua execute endpoint.""" + lua_params = params.copy() + lua_params['lua_source'] = """ +function main(splash) + assert(splash:go(splash.args.url)) + if splash.args.wait then + assert(splash:wait(splash.args.wait)) + end + splash:set_result_content_type("text/html; charset=UTF-8") + return splash:html{} +end +""" + return {'url': splash.url('execute'), + 'params': lua_params} + + +#: Same resource may be rendered by various endpoints with slightly varying +#: parameter combinations. Request factories set those combinations up. +REQ_FACTORIES = { + 'png': [ + make_render_png_req, + make_render_json_req, + make_render_png_lua_req, + ], + 'html': [ + make_render_html_req, + make_render_html_json_req, + make_render_html_lua_req, + ], +} #: Port at which static pages will be served. @@ -86,15 +125,20 @@ def make_render_png_lua_req(splash, params): help='Request thread count') parser.add_argument('--request-count', type=int, default=10, help='Benchmark request count') -parser.add_argument('--sites-dir', type=str, default='sites', +parser.add_argument('--sites-dir', type=str, default='sites', required=True, help='Directory with downloaded sites') +parser.add_argument('--file-server', metavar='HOST:PORT', + help='Use existing file server instance available at HOST:PORT') parser.add_argument('--splash-server', metavar='HOST:PORT', help='Use existing Splash instance available at HOST:PORT') parser.add_argument('--out-file', type=FileType(mode='w'), default=sys.stdout, help='Write detailed request information in this file') +parser.add_argument('--render-type', choices=('html', 'png'), default='png', + help=('Type of rendering to benchmark' + ' (either "html" or "png")')) -def generate_requests(splash, args): +def generate_requests(splash, file_server, args): log = logging.getLogger('generate_requests') log.info("Using pRNG seed: %s", args.seed) @@ -106,12 +150,14 @@ def generate_requests(splash, args): for p in pages: log.info("Using page for benchmark: %s", p) + request_factories = REQ_FACTORIES[args.render_type] + rng = random.Random(args.seed) for i in xrange(args.request_count): page = rng.choice(pages) width, height = rng.choice(WIDTH_HEIGHT) - req_factory = rng.choice(REQ_FACTORIES) - url = 'http://localhost:%d/%s' % (PORT, page) + req_factory = rng.choice(request_factories) + url = file_server.url(page) params = {'url': url, 'render_all': 1, 'wait': 0.1, 'width': width, 'height': height} log.debug("Req factory: %s, params: %s", req_factory, params) @@ -145,7 +191,7 @@ def invoke_request(invoke_args): 'height': kwargs['params']['height']} -class ExistingSplashWrapper(object): +class ExistingServerWrapper(object): """Wrapper for pre-existing Splash instance.""" def __init__(self, server): self.server = server @@ -165,25 +211,36 @@ def __exit__(self, *args): def main(): log = logging.getLogger("benchmark") args = parser.parse_args() - logging.getLogger('requests.packages.urllib3.connectionpool').setLevel(logging.WARNING) + (logging.getLogger('requests.packages.urllib3.connectionpool') + .setLevel(logging.WARNING)) logging.basicConfig(level=logging.DEBUG) if args.splash_server: - splash = ExistingSplashWrapper(args.splash_server) + splash = ExistingServerWrapper(args.splash_server) else: + from splash.tests.utils import SplashServer splash = SplashServer( logfile=SPLASH_LOG, extra_args=['--disable-lua-sandbox', '--disable-xvfb', '--max-timeout=600']) - with splash, serve_files(port=PORT, directory=args.sites_dir, logfile=FILESERVER_LOG): + if args.file_server: + file_server = ExistingServerWrapper(args.file_server) + else: + from splash.benchmark.file_server import FileServerSubprocess + file_server = FileServerSubprocess(port=PORT, + path=args.sites_dir, + logfile=FILESERVER_LOG) + + with splash, file_server: log.info("Servers are up, starting benchmark...") start_res = requests.get( splash.url('execute'), params={'lua_source': GET_PERF_STATS_SCRIPT}).json() start_time = time() - results = parallel_map(invoke_request, generate_requests(splash, args), + results = parallel_map(invoke_request, + generate_requests(splash, file_server, args), args.thread_count) end_time = time() end_res = requests.get( @@ -191,11 +248,12 @@ def main(): params={'lua_source': GET_PERF_STATS_SCRIPT}).json() log.info("Writing stats to %s", args.out_file.name) - args.out_file.write(pformat({ - 'maxrss': end_res['maxrss'], - 'cputime': end_res['cputime'] - start_res['cputime'], - 'walltime': end_time - start_time, - 'requests': results})) + args.out_file.write(json.dumps( + {'maxrss': end_res['maxrss'], + 'cputime': end_res['cputime'] - start_res['cputime'], + 'walltime': end_time - start_time, + 'requests': results}, + indent=2)) log.info("Splash max RSS: %s B", end_res['maxrss']) log.info("Splash CPU time elapsed: %.2f sec", end_res['cputime'] - start_res['cputime']) diff --git a/splash/benchmark/download_sites.py b/splash/benchmark/download_sites.py index e50fee33f..07a9577de 100755 --- a/splash/benchmark/download_sites.py +++ b/splash/benchmark/download_sites.py @@ -10,12 +10,13 @@ import os import re import subprocess +import logging from urlparse import urlsplit from lxml import html import w3lib.html -from splash.benchmark.file_server import serve_files +from splash.benchmark.file_server import FileServerSubprocess from splash.tests.stress import lua_runonce SCRIPT_HTML = """ @@ -91,6 +92,10 @@ def download_sites(sites_dir, sites): def main(): args = parser.parse_args() + (logging.getLogger('requests.packages.urllib3.connectionpool') + .setLevel(logging.WARNING)) + logging.basicConfig(level=logging.DEBUG) + logging.info("Starting site download suite") try: os.makedirs(args.sites_dir) except OSError as e: @@ -98,7 +103,7 @@ def main(): raise elif not os.path.isdir(args.sites_dir): raise RuntimeError("Not a directory: %s" % args.sites_dir) - with serve_files(PORT, args.sites_dir): + with FileServerSubprocess(port=PORT, path=args.sites_dir): download_sites(args.sites_dir, [ 'http://www.wikipedia.org', 'http://www.google.com', diff --git a/splash/benchmark/file_server.py b/splash/benchmark/file_server.py index 5e4cb3acf..bb0549ce8 100755 --- a/splash/benchmark/file_server.py +++ b/splash/benchmark/file_server.py @@ -7,6 +7,7 @@ import subprocess import time import sys +import logging from contextlib import contextmanager from twisted.internet import reactor @@ -18,44 +19,64 @@ parser = argparse.ArgumentParser("") parser.add_argument('--port', type=int, default=8806) -parser.add_argument('--directory', help='Directory to be served', default='.') -parser.add_argument('--logfile', default=sys.stderr, type=argparse.FileType(mode='w'), +parser.add_argument('--path', help='Path to be served', default='.') +parser.add_argument('--logfile', default=sys.stderr, + type=argparse.FileType(mode='w'), help='File to write logs to') -@contextmanager -def serve_files(port, directory, logfile=None): + +class FileServerSubprocess(object): + logger = logging.getLogger('file_server') + """Serve files from specified directory statically in a subprocess.""" - # command = ['twistd', - # '-n', # don't daemonize - # 'web', # start web component - # '--port', str(int(port)), - # '--path', os.path.abspath(directory), ] - # if logfile is not None: - # command += ['--logfile', logfile] - command = ['python', __file__, - '--port', str(int(port)), - '--directory', os.path.abspath(directory)] - if logfile is not None: - command += ['--logfile', logfile] - site_server = subprocess.Popen(command) - try: + def __init__(self, port, path, logfile=None): + self.port = port + self.path = path + self.logfile = logfile + self.server = 'http://localhost:%d' % port + + def url(self, endpoint): + return self.server + '/' + endpoint + + def __enter__(self): + # command = ['twistd', + # '-n', # don't daemonize + # 'web', # start web component + # '--port', str(int(port)), + # '--path', os.path.abspath(directory), ] + # if logfile is not None: + # command += ['--logfile', logfile] + command = ['python', __file__, + '--port', str(int(self.port)), + '--path', os.path.abspath(self.path)] + if self.logfile is not None: + command += ['--logfile', self.logfile] + self.logger.info("Starting file server subprocess: %s", command) + self._site_server = subprocess.Popen(command) # It might take some time to bring up the server, wait for up to 10s. for i in xrange(100): try: - requests.get('http://localhost:%d' % port) + self.logger.info("Checking if file server is active") + requests.get(self.url('')) + break except requests.ConnectionError: time.sleep(0.1) - else: - break - yield - finally: - site_server.terminate() + else: + msg = "File server subprocess startup timed out" + if self.logfile: + with open(self.logfile, 'r') as log_f: + msg += ", logs:\n" + log_f.read() + raise RuntimeError(msg) + + def __exit__(self, *args): + self._site_server.kill() + self._site_server.wait() def main(): args = parser.parse_args() startLogging(args.logfile) - resource = File(os.path.abspath(args.directory)) + resource = File(os.path.abspath(args.path)) site = Site(resource) reactor.listenTCP(args.port, site) reactor.run()