diff --git a/pyspider/fetcher/phantomjs_fetcher.js b/pyspider/fetcher/phantomjs_fetcher.js index 43f356072..3c61ec8af 100644 --- a/pyspider/fetcher/phantomjs_fetcher.js +++ b/pyspider/fetcher/phantomjs_fetcher.js @@ -40,6 +40,8 @@ if (system.args.length !== 2) { page_loaded = false, start_time = Date.now(), end_time = null, + redirect_urls = new Array(), + request_urls = new Array(), script_executed = false, script_result = null; @@ -95,7 +97,11 @@ if (system.args.length !== 2) { end_time = Date.now() + wait_before_end; setTimeout(make_result, wait_before_end+10, page); }; + page.onNavigationRequested = function(url) { + redirect_urls.push(url); + } page.onResourceRequested = function(request) { + request_urls.push(request.url); console.debug("Starting request: #"+request.id+" ["+request.method+"]"+request.url); end_time = null; }; @@ -160,6 +166,8 @@ if (system.args.length !== 2) { content: page.content || "", headers: {}, url: page.url || fetch.url, + redirect_urls: redirect_urls, + request_urls: request_urls, cookies: {}, time: (Date.now() - start_time) / 1000, js_script_result: null, @@ -200,6 +208,8 @@ if (system.args.length !== 2) { content: page.content, headers: headers, url: page.url, + redirect_urls: redirect_urls, + request_urls: request_urls, cookies: cookies, time: (Date.now() - start_time) / 1000, js_script_result: script_result, diff --git a/pyspider/libs/response.py b/pyspider/libs/response.py index 8975781b2..c17edeade 100644 --- a/pyspider/libs/response.py +++ b/pyspider/libs/response.py @@ -22,7 +22,7 @@ class Response(object): def __init__(self, status_code=None, url=None, orig_url=None, headers=CaseInsensitiveDict(), - content='', cookies=None, error=None, traceback=None, save=None, js_script_result=None, time=0): + content='', redirect_urls=None, request_urls=None, cookies=None, error=None, traceback=None, save=None, js_script_result=None, time=0): if cookies is None: cookies = {} self.status_code = status_code @@ -30,6 +30,8 @@ def __init__(self, status_code=None, url=None, orig_url=None, headers=CaseInsens self.orig_url = orig_url self.headers = headers self.content = content + self.redirect_urls = redirect_urls + self.request_urls = request_urls self.cookies = cookies self.error = error self.traceback = traceback @@ -197,6 +199,8 @@ def rebuild_response(r): url=r.get('url', ''), headers=CaseInsensitiveDict(r.get('headers', {})), content=r.get('content', ''), + redirect_urls=r.get('redirect_urls', []), + request_urls=r.get('request_urls', []), cookies=r.get('cookies', {}), error=r.get('error'), traceback=r.get('traceback'),