
Benchmark suite #185

Open
wants to merge 10 commits into master
benchmark: put downloaded sites into a configurable subdir
immerrr committed Apr 6, 2015

commit 938e04b433d319f0ac43a74b34cf68173bcc70a3
4 changes: 2 additions & 2 deletions splash/benchmark/README.rst
@@ -3,7 +3,7 @@ This directory contains a preliminary version of splash benchmark suite.
To use it, do the following:

- install ``httrack``
- create a directory for downloaded files, e.g. ``files``
- run ``python download_sites.py`` in that directory to download sites to be used in the benchmark
- run ``python download_sites.py``; it will create a ``sites`` subdirectory in
  the current directory and download the sites to be used in the benchmark there
- run ``python benchmark.py`` to run the benchmark
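
For reference, the updated workflow can be driven end to end roughly like this (a sketch only: the script names and the ``--sites-dir`` flag come from the diffs below, and ``my-sites`` is an arbitrary example directory):

```python
import subprocess

# Download the benchmark sites into a configurable directory, then run the
# benchmark against that same directory ('sites' is the default for both).
subprocess.check_call(['python', 'download_sites.py', '--sites-dir', 'my-sites'])
subprocess.check_call(['python', 'benchmark.py', '--sites-dir', 'my-sites'])
```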

18 changes: 14 additions & 4 deletions splash/benchmark/benchmark.py
@@ -17,6 +17,7 @@
from multiprocessing.pool import ThreadPool
from pprint import pformat
from time import time
import re

import requests
from splash.benchmark.file_server import serve_files
@@ -65,10 +66,9 @@ def make_render_png_lua_req(splash, params):

#: Port at which static pages will be served.
PORT = 8806
#: Static pages to be used in the benchmark.
PAGES = glob('localhost_8806/*.html')
#: Combinations of width & height to test.
WIDTH_HEIGHT = [(None, None), (500, None), (None, 500), (500, 500)]
#: Splash log filename.
SPLASH_LOG = 'splash.log'
#: This script is used to collect maxrss & cpu time from splash process.
GET_PERF_STATS_SCRIPT = """
@@ -85,14 +85,24 @@ def make_render_png_lua_req(splash, params):
help='Request thread count')
parser.add_argument('--request-count', type=int, default=10,
help='Benchmark request count')
parser.add_argument('--sites-dir', type=str, default='sites',
help='Directory with downloaded sites')


def generate_requests(splash, args):
log = logging.getLogger('generate_requests')
log.info("Using pRNG seed: %s", args.seed)

# Static pages (relative to sites_dir) to be used in the benchmark.
pages = [re.sub('^%s/' % args.sites_dir, '', v)
for v in glob(os.path.join(args.sites_dir, 'localhost_8806',
'*.html'))]
for p in pages:
log.info("Using page for benchmark: %s", p)

rng = random.Random(args.seed)
for i in xrange(args.request_count):
page = rng.choice(PAGES)
page = rng.choice(pages)
width, height = rng.choice(WIDTH_HEIGHT)
req_factory = rng.choice(REQ_FACTORIES)
url = 'http://localhost:%d/%s' % (PORT, page)
@@ -140,7 +150,7 @@ def main():
'--disable-xvfb',
'--max-timeout=600'])

with splash, serve_files(PORT):
with splash, serve_files(PORT, args.sites_dir):
start_time = time()
results = parallel_map(invoke_request, generate_requests(splash, args),
args.thread_count)
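
As a side note on the page-listing logic above: the regex simply strips the ``sites_dir`` prefix so that pages are addressed relative to the served directory. An equivalent sketch using ``os.path.relpath`` (illustrative only, not part of the PR; ``list_pages`` is a made-up name):

```python
import os
from glob import glob

def list_pages(sites_dir):
    # Collect downloaded pages as paths relative to sites_dir,
    # e.g. 'localhost_8806/some_page.html', ready to append to the local URL.
    pattern = os.path.join(sites_dir, 'localhost_8806', '*.html')
    return [os.path.relpath(path, sites_dir) for path in glob(pattern)]
```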
62 changes: 39 additions & 23 deletions splash/benchmark/download_sites.py
100644 → 100755
@@ -1,4 +1,13 @@
#!/usr/bin/env python

"""
Site downloader script for Splash benchmark suite.
"""

from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
import errno
import json
import os
import re
import subprocess
from urlparse import urlsplit
@@ -9,7 +18,7 @@
from splash.benchmark.file_server import serve_files
from splash.tests.stress import lua_runonce

script_html = """
SCRIPT_HTML = """
function main(splash)
splash:set_images_enabled(false)
splash:go(splash.args.url)
@@ -18,24 +27,19 @@
end
"""

script_png = """

function main(splash)
splash:go(splash.args.url)
splash:wait(0.5)
return splash:png()
end
"""


#: This UA is used by httrack to mimic Splash requests when downloading sites.
USERAGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/534.34 (KHTML, like Gecko) Qt/4.8.1 Safari/534.34"


PORT = 8806

parser = ArgumentParser(description=__doc__,
formatter_class=ArgumentDefaultsHelpFormatter)
parser.add_argument('--sites-dir', default='sites',
help='Directory for downloaded sites')

def preprocess_main_page(url):
out = json.loads(lua_runonce(script_html, url=url,

def preprocess_main_page(sites_dir, url):
out = json.loads(lua_runonce(SCRIPT_HTML, url=url,
splash_args=['--disable-lua-sandbox',
'--disable-xvfb',
'--max-timeout=600'],
@@ -56,13 +60,13 @@ def preprocess_main_page(url):
out['html'] = html.tostring(root, encoding='utf-8',
doctype='<!DOCTYPE html>')
filename = re.sub(r'[^\w]+', '_', url) + '.html'
with open(filename, 'w') as f:
with open(os.path.join(sites_dir, filename), 'w') as f:
f.write(out['html'])
return filename


def download_sites(sites):
local_files = [preprocess_main_page(s) for s in sites]
def download_sites(sites_dir, sites):
local_files = [preprocess_main_page(sites_dir, s) for s in sites]

local_urls = [
'http://localhost:%(port)d/%(filename)s' % {
@@ -75,12 +79,20 @@ def download_sites(sites):
'-%P', # Try parsing links in non-href/src sections
'-F', USERAGENT, # Emulate splash UA
'--depth=1']
subprocess.check_call(['httrack'] + args + local_urls)


if __name__ == '__main__':
with serve_files(PORT):
download_sites([
subprocess.check_call(['httrack'] + args + local_urls, cwd=sites_dir)


def main():
args = parser.parse_args()
try:
os.makedirs(args.sites_dir)
except OSError as e:
if e.errno != errno.EEXIST:
raise
elif not os.path.isdir(args.sites_dir):
raise RuntimeError("Not a directory: %s" % args.sites_dir)
with serve_files(PORT, args.sites_dir):
download_sites(args.sites_dir, [
'http://www.wikipedia.org',
'http://www.google.com',
'http://www.reddit.com',
@@ -89,3 +101,7 @@ def download_sites(sites):
# "http://blog.pinterest.com",
# "http://imgur.com",
])


if __name__ == '__main__':
main()
23 changes: 18 additions & 5 deletions splash/benchmark/file_server.py
100644 → 100755
@@ -1,10 +1,22 @@
#!/usr/bin/env python

"""
Simple static file server.
"""

import argparse
import os
import SimpleHTTPServer

Review comment (Member):

It could be better to use Twisted (or some other async framework) because SimpleHTTPServer is single-threaded, and Splash can download multiple resources in parallel. I wonder if it is important for tests; maybe not.

With Twisted we can also simulate conditions like non-responding servers, delayed responses, etc.
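
A minimal sketch of that suggestion, assuming ``twisted.web``'s static file support (the port and the ``sites`` directory mirror this suite's defaults; not part of this PR):

```python
from twisted.internet import reactor
from twisted.web.server import Site
from twisted.web.static import File

# Serve the downloaded sites directory; twisted.web handles concurrent
# requests without blocking, unlike the single-threaded SimpleHTTPServer.
reactor.listenTCP(8806, Site(File('sites')))
reactor.run()
```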

import SocketServer
import subprocess
import sys
from contextlib import contextmanager


parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('port', type=int, help='Port number to listen at')
parser.add_argument('directory', type=str, help='Directory to serve')


class ReusingTCPServer(SocketServer.TCPServer):
allow_reuse_address = True

@@ -15,17 +27,18 @@ def address_string(self):


@contextmanager
def serve_files(port):
def serve_files(port, directory):
"""Serve files from current directory statically in a subprocess."""
site_server = subprocess.Popen(['python', '-m', __name__,
str(port)])
str(port), directory])
try:
yield
finally:
site_server.terminate()


if __name__ == '__main__':
port = int(sys.argv[1])
server = ReusingTCPServer(("", port), RequestHandler)
args = parser.parse_args()
os.chdir(args.directory)
server = ReusingTCPServer(("", args.port), RequestHandler)
server.serve_forever()
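
For completeness, a usage sketch of the updated ``serve_files`` context manager as the other benchmark scripts call it (the directory name follows the new ``sites`` default):

```python
from splash.benchmark.file_server import serve_files

# Start the static file server in a subprocess for the duration of the block;
# it is terminated automatically when the block exits.
with serve_files(8806, 'sites'):
    pass  # issue benchmark requests against http://localhost:8806/ here
```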