From c14061565f0709f3d83150fe39e377ac9fc54931 Mon Sep 17 00:00:00 2001 From: Atul Krishna Date: Mon, 27 Mar 2017 13:29:02 +0530 Subject: [PATCH 01/15] added SplashHtmlResponse --- scrapy_splash/response.py | 9 ++++++++- scrapy_splash/responsetypes.py | 2 +- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/scrapy_splash/response.py b/scrapy_splash/response.py index e5250c2..ac38d28 100644 --- a/scrapy_splash/response.py +++ b/scrapy_splash/response.py @@ -5,7 +5,7 @@ import base64 import re -from scrapy.http import Response, TextResponse +from scrapy.http import Response, TextResponse, HtmlResponse from scrapy import Selector from scrapy_splash.utils import headers_to_scrapy @@ -87,6 +87,13 @@ def replace(self, *args, **kwargs): return _SplashResponseMixin.replace(self, *args, **kwargs) +class SplashHtmlResponse(_SplashResponseMixin, HtmlResponse): + """ + This HtmlResponse subclass sets response.url to the URL of a remote website + instead of an URL of Splash server. "Real" response URL is still available + as ``response.real_url``. + """ + class SplashJsonResponse(SplashResponse): """ Splash Response with JSON data. It provides a convenient way to access diff --git a/scrapy_splash/responsetypes.py b/scrapy_splash/responsetypes.py index 04e9264..a510185 100644 --- a/scrapy_splash/responsetypes.py +++ b/scrapy_splash/responsetypes.py @@ -9,7 +9,7 @@ class SplashResponseTypes(ResponseTypes): CLASSES = { - 'text/html': 'scrapy_splash.response.SplashTextResponse', + 'text/html': 'scrapy_splash.response.SplashHtmlResponse', 'application/atom+xml': 'scrapy_splash.response.SplashTextResponse', 'application/rdf+xml': 'scrapy_splash.response.SplashTextResponse', 'application/rss+xml': 'scrapy_splash.response.SplashTextResponse', From d4c280af5504ec5d25962ebae33bc9c127f7ac6a Mon Sep 17 00:00:00 2001 From: Atul Krishna Date: Mon, 27 Mar 2017 14:28:11 +0530 Subject: [PATCH 02/15] fixed test_middleware --- tests/test_middleware.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_middleware.py b/tests/test_middleware.py index 66b79ce..4a67fef 100644 --- a/tests/test_middleware.py +++ b/tests/test_middleware.py @@ -7,7 +7,7 @@ import scrapy from scrapy.core.engine import ExecutionEngine from scrapy.utils.test import get_crawler -from scrapy.http import Response, TextResponse +from scrapy.http import Response, TextResponse, HtmlResponse from scrapy.downloadermiddlewares.httpcache import HttpCacheMiddleware import scrapy_splash @@ -82,14 +82,14 @@ def test_splash_request(): assert json.loads(to_native_str(req2.body)) == expected_body # check response post-processing - response = TextResponse("http://127.0.0.1:8050/render.html", + response = HtmlResponse("http://127.0.0.1:8050/render.html", # Scrapy doesn't pass request to constructor # request=req2, headers={b'Content-Type': b'text/html'}, body=b"Hello") response2 = mw.process_response(req2, response, None) response2 = cookie_mw.process_response(req2, response2, None) - assert isinstance(response2, scrapy_splash.SplashTextResponse) + assert isinstance(response2, scrapy_splash.SplashHtmlResponse) assert response2 is not response assert response2.real_url == req2.url assert response2.url == req.url From 9ff12e4825291804de4496c86387330fbc842e3b Mon Sep 17 00:00:00 2001 From: Atul Krishna Date: Mon, 27 Mar 2017 14:54:32 +0530 Subject: [PATCH 03/15] fixed test_middleware1 --- tests/test_middleware.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_middleware.py b/tests/test_middleware.py index 4a67fef..ac7988c 100644 --- a/tests/test_middleware.py +++ b/tests/test_middleware.py @@ -89,7 +89,7 @@ def test_splash_request(): body=b"Hello") response2 = mw.process_response(req2, response, None) response2 = cookie_mw.process_response(req2, response2, None) - assert isinstance(response2, scrapy_splash.SplashHtmlResponse) + assert isinstance(response2, scrapy_splash.SplashTextResponse) assert response2 is not response assert response2.real_url == req2.url assert response2.url == req.url From 67edaaee5d91b8615119fe489835c8d37684d46f Mon Sep 17 00:00:00 2001 From: Atul Krishna Date: Mon, 27 Mar 2017 15:16:08 +0530 Subject: [PATCH 04/15] fixed response --- scrapy_splash/response.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapy_splash/response.py b/scrapy_splash/response.py index ac38d28..61a91b1 100644 --- a/scrapy_splash/response.py +++ b/scrapy_splash/response.py @@ -87,7 +87,7 @@ def replace(self, *args, **kwargs): return _SplashResponseMixin.replace(self, *args, **kwargs) -class SplashHtmlResponse(_SplashResponseMixin, HtmlResponse): +class SplashHtmlResponse(_SplashResponseMixin, SplashTextResponse): """ This HtmlResponse subclass sets response.url to the URL of a remote website instead of an URL of Splash server. "Real" response URL is still available From 5a592974a74b48e6d4ffc285ace139e2a63b1793 Mon Sep 17 00:00:00 2001 From: Atul Krishna Date: Mon, 27 Mar 2017 15:21:21 +0530 Subject: [PATCH 05/15] fixed response1 --- tests/test_middleware.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_middleware.py b/tests/test_middleware.py index ac7988c..9177728 100644 --- a/tests/test_middleware.py +++ b/tests/test_middleware.py @@ -82,7 +82,7 @@ def test_splash_request(): assert json.loads(to_native_str(req2.body)) == expected_body # check response post-processing - response = HtmlResponse("http://127.0.0.1:8050/render.html", + response = TextResponse("http://127.0.0.1:8050/render.html", # Scrapy doesn't pass request to constructor # request=req2, headers={b'Content-Type': b'text/html'}, From 7b1cbcf5fea0e45b54cd18e5cec2875ee30c8e17 Mon Sep 17 00:00:00 2001 From: Atul Krishna Date: Mon, 27 Mar 2017 15:26:36 +0530 Subject: [PATCH 06/15] fixed response2 --- scrapy_splash/response.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scrapy_splash/response.py b/scrapy_splash/response.py index 61a91b1..735f126 100644 --- a/scrapy_splash/response.py +++ b/scrapy_splash/response.py @@ -87,12 +87,13 @@ def replace(self, *args, **kwargs): return _SplashResponseMixin.replace(self, *args, **kwargs) -class SplashHtmlResponse(_SplashResponseMixin, SplashTextResponse): +class SplashHtmlResponse(SplashTextResponse): """ This HtmlResponse subclass sets response.url to the URL of a remote website instead of an URL of Splash server. "Real" response URL is still available as ``response.real_url``. """ + pass class SplashJsonResponse(SplashResponse): """ From d951b56b4ecfcafabf35d192a4cb4d2535760d05 Mon Sep 17 00:00:00 2001 From: Atul Krishna Date: Tue, 28 Mar 2017 09:26:06 +0530 Subject: [PATCH 07/15] added as subclass of HtmlResponse --- scrapy_splash/response.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapy_splash/response.py b/scrapy_splash/response.py index 735f126..73b09ad 100644 --- a/scrapy_splash/response.py +++ b/scrapy_splash/response.py @@ -87,7 +87,7 @@ def replace(self, *args, **kwargs): return _SplashResponseMixin.replace(self, *args, **kwargs) -class SplashHtmlResponse(SplashTextResponse): +class SplashHtmlResponse(SplashTextResponse,HtmlResponse): """ This HtmlResponse subclass sets response.url to the URL of a remote website instead of an URL of Splash server. "Real" response URL is still available From d6584eca2c07cea4a3d33ad67be866950172f774 Mon Sep 17 00:00:00 2001 From: Atul Krishna Date: Tue, 28 Mar 2017 09:30:22 +0530 Subject: [PATCH 08/15] now support application/xhtml+xml and application/vnd.wap.xhtml+xml --- scrapy_splash/responsetypes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scrapy_splash/responsetypes.py b/scrapy_splash/responsetypes.py index a510185..71fb242 100644 --- a/scrapy_splash/responsetypes.py +++ b/scrapy_splash/responsetypes.py @@ -13,8 +13,8 @@ class SplashResponseTypes(ResponseTypes): 'application/atom+xml': 'scrapy_splash.response.SplashTextResponse', 'application/rdf+xml': 'scrapy_splash.response.SplashTextResponse', 'application/rss+xml': 'scrapy_splash.response.SplashTextResponse', - 'application/xhtml+xml': 'scrapy_splash.response.SplashTextResponse', - 'application/vnd.wap.xhtml+xml': 'scrapy_splash.response.SplashTextResponse', + 'application/xhtml+xml': 'scrapy_splash.response.SplashHtmlResponse', + 'application/vnd.wap.xhtml+xml': 'scrapy_splash.response.SplashHtmlResponse', 'application/xml': 'scrapy_splash.response.SplashTextResponse', 'application/json': 'scrapy_splash.response.SplashJsonResponse', 'application/x-json': 'scrapy_splash.response.SplashJsonResponse', From 4d0c305d7b2b56905faa88f2b8134524997b42b5 Mon Sep 17 00:00:00 2001 From: Atul Krishna Date: Tue, 28 Mar 2017 09:37:29 +0530 Subject: [PATCH 09/15] removed unused import --- tests/test_middleware.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_middleware.py b/tests/test_middleware.py index 9177728..66b79ce 100644 --- a/tests/test_middleware.py +++ b/tests/test_middleware.py @@ -7,7 +7,7 @@ import scrapy from scrapy.core.engine import ExecutionEngine from scrapy.utils.test import get_crawler -from scrapy.http import Response, TextResponse, HtmlResponse +from scrapy.http import Response, TextResponse from scrapy.downloadermiddlewares.httpcache import HttpCacheMiddleware import scrapy_splash From b2be0d5b05b75e6f6edf6fd5289911978441274f Mon Sep 17 00:00:00 2001 From: Illia Ananich Date: Thu, 18 Jan 2018 01:25:51 +0200 Subject: [PATCH 10/15] add test for `Splash*Response` types: * new `test/test_response.py` file * new `test/test_response/test_response_types` test function `test_response_types` checks for compatibility of `Splash*Response` classes with Scrapy's `*Response` classes with `issubclass` func --- scrapy_splash/response.py | 3 ++- tests/test_response.py | 10 ++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) create mode 100644 tests/test_response.py diff --git a/scrapy_splash/response.py b/scrapy_splash/response.py index 73b09ad..3d60836 100644 --- a/scrapy_splash/response.py +++ b/scrapy_splash/response.py @@ -87,7 +87,7 @@ def replace(self, *args, **kwargs): return _SplashResponseMixin.replace(self, *args, **kwargs) -class SplashHtmlResponse(SplashTextResponse,HtmlResponse): +class SplashHtmlResponse(SplashTextResponse, HtmlResponse): """ This HtmlResponse subclass sets response.url to the URL of a remote website instead of an URL of Splash server. "Real" response URL is still available @@ -95,6 +95,7 @@ class SplashHtmlResponse(SplashTextResponse,HtmlResponse): """ pass + class SplashJsonResponse(SplashResponse): """ Splash Response with JSON data. It provides a convenient way to access diff --git a/tests/test_response.py b/tests/test_response.py new file mode 100644 index 0000000..8756e17 --- /dev/null +++ b/tests/test_response.py @@ -0,0 +1,10 @@ +from scrapy.http import HtmlResponse, TextResponse, Response +from scrapy_splash.response import ( + SplashTextResponse, SplashHtmlResponse, SplashResponse, +) + + +def test_response_types(): + assert issubclass(SplashResponse, Response) + assert issubclass(SplashTextResponse, TextResponse) + assert issubclass(SplashHtmlResponse, HtmlResponse) From 692f5908825086c4857179e44b0a942bd1165b6a Mon Sep 17 00:00:00 2001 From: Illia Ananich Date: Thu, 18 Jan 2018 01:31:35 +0200 Subject: [PATCH 11/15] add `SplashHtmlResponse` to `__init__.py` --- scrapy_splash/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scrapy_splash/__init__.py b/scrapy_splash/__init__.py index cbdc7cf..d16777d 100644 --- a/scrapy_splash/__init__.py +++ b/scrapy_splash/__init__.py @@ -9,5 +9,7 @@ ) from .dupefilter import SplashAwareDupeFilter, splash_request_fingerprint from .cache import SplashAwareFSCacheStorage -from .response import SplashResponse, SplashTextResponse, SplashJsonResponse +from .response import ( + SplashResponse, SplashTextResponse, SplashHtmlResponse, SplashJsonResponse, +) from .request import SplashRequest, SplashFormRequest From 268cbc12427cfb75a824ba3b1943fbc144511c3d Mon Sep 17 00:00:00 2001 From: Illia Ananich Date: Thu, 18 Jan 2018 02:41:56 +0200 Subject: [PATCH 12/15] add `response.splash_scrapy_text_responses` tuple of pairs * use it in `test_middlewares.py` for `test_splash_request` * use it inside `SplashMiddleware._change_response_class` method --- scrapy_splash/middleware.py | 15 +++++++++++---- scrapy_splash/response.py | 6 ++++++ tests/test_middleware.py | 15 ++++++++++----- 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/scrapy_splash/middleware.py b/scrapy_splash/middleware.py index 24ab23a..9cdb1ae 100644 --- a/scrapy_splash/middleware.py +++ b/scrapy_splash/middleware.py @@ -14,6 +14,7 @@ from scrapy.exceptions import NotConfigured from scrapy.http.headers import Headers from scrapy.http.response.text import TextResponse +from scrapy.http.response.html import HtmlResponse from scrapy import signals from scrapy_splash.responsetypes import responsetypes @@ -397,19 +398,25 @@ def process_response(self, request, response, spider): return response def _change_response_class(self, request, response): - from scrapy_splash import SplashResponse, SplashTextResponse + from scrapy_splash import ( + SplashResponse, SplashTextResponse, SplashHtmlResponse, + ) + from scrapy_splash.response import splash_scrapy_text_responses + if not isinstance(response, (SplashResponse, SplashTextResponse)): # create a custom Response subclass based on response Content-Type # XXX: usually request is assigned to response only when all # downloader middlewares are executed. Here it is set earlier. # Does it have any negative consequences? respcls = responsetypes.from_args(headers=response.headers) - if isinstance(response, TextResponse) and respcls is SplashResponse: + for splash_cls, scrapy_cls in splash_scrapy_text_responses: # Even if the headers say it's binary, it has already - # been detected as a text response by scrapy (for example + # been detected as a text/html response by scrapy (for example # because it was decoded successfully), so we should not # convert it to SplashResponse. - respcls = SplashTextResponse + if isinstance(response, scrapy_cls) and respcls is SplashResponse: + respcls = splash_cls + break response = response.replace(cls=respcls, request=request) return response diff --git a/scrapy_splash/response.py b/scrapy_splash/response.py index 3d60836..4871b4c 100644 --- a/scrapy_splash/response.py +++ b/scrapy_splash/response.py @@ -194,3 +194,9 @@ def _load_from_json(self): # response.headers if 'headers' in self.data: self.headers = headers_to_scrapy(self.data['headers']) + + +splash_scrapy_text_responses = ( + (SplashTextResponse, TextResponse), + (SplashHtmlResponse, HtmlResponse), +) diff --git a/tests/test_middleware.py b/tests/test_middleware.py index 66b79ce..b0918bc 100644 --- a/tests/test_middleware.py +++ b/tests/test_middleware.py @@ -3,11 +3,12 @@ import copy import json import base64 +import pytest import scrapy from scrapy.core.engine import ExecutionEngine from scrapy.utils.test import get_crawler -from scrapy.http import Response, TextResponse +from scrapy.http import Response, TextResponse, HtmlResponse from scrapy.downloadermiddlewares.httpcache import HttpCacheMiddleware import scrapy_splash @@ -60,7 +61,11 @@ def test_nosplash(): assert response3.url == "http://example.com" -def test_splash_request(): +@pytest.mark.parametrize( + 'scrapy_response_type, splash_response_type', + scrapy_splash.response.splash_scrapy_text_responses +) +def test_splash_request(splash_response_type, scrapy_response_type): mw = _get_mw() cookie_mw = _get_cookie_mw() @@ -82,14 +87,14 @@ def test_splash_request(): assert json.loads(to_native_str(req2.body)) == expected_body # check response post-processing - response = TextResponse("http://127.0.0.1:8050/render.html", + response = scrapy_response_type("http://127.0.0.1:8050/render.html", # Scrapy doesn't pass request to constructor # request=req2, headers={b'Content-Type': b'text/html'}, body=b"Hello") response2 = mw.process_response(req2, response, None) response2 = cookie_mw.process_response(req2, response2, None) - assert isinstance(response2, scrapy_splash.SplashTextResponse) + assert isinstance(response2, splash_response_type) assert response2 is not response assert response2.real_url == req2.url assert response2.url == req.url @@ -100,7 +105,7 @@ def test_splash_request(): # check .replace method response3 = response2.replace(status=404) assert response3.status == 404 - assert isinstance(response3, scrapy_splash.SplashTextResponse) + assert isinstance(response3, splash_response_type) for attr in ['url', 'real_url', 'headers', 'body']: assert getattr(response3, attr) == getattr(response2, attr) From f16d6d38d76596028dbbb981f42482c4b77dabdf Mon Sep 17 00:00:00 2001 From: Illia Ananich Date: Fri, 19 Jan 2018 05:05:18 +0200 Subject: [PATCH 13/15] fix `SplashMiddleware._change_response_class`: * case when response is instance of any `Splash*` class in `splash_scrapy_text_response` tuple of pairs --- scrapy_splash/middleware.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/scrapy_splash/middleware.py b/scrapy_splash/middleware.py index 9cdb1ae..06c68b6 100644 --- a/scrapy_splash/middleware.py +++ b/scrapy_splash/middleware.py @@ -398,12 +398,11 @@ def process_response(self, request, response, spider): return response def _change_response_class(self, request, response): - from scrapy_splash import ( - SplashResponse, SplashTextResponse, SplashHtmlResponse, - ) + from scrapy_splash import SplashResponse from scrapy_splash.response import splash_scrapy_text_responses + splash_text_response_types = tuple(x for x, y in splash_scrapy_text_responses) - if not isinstance(response, (SplashResponse, SplashTextResponse)): + if not isinstance(response, (SplashResponse, *splash_text_response_types)): # create a custom Response subclass based on response Content-Type # XXX: usually request is assigned to response only when all # downloader middlewares are executed. Here it is set earlier. From 1815f51ad157147cc31c6d6374df359b25dcdc3a Mon Sep 17 00:00:00 2001 From: Illia Ananich Date: Fri, 19 Jan 2018 06:55:18 +0200 Subject: [PATCH 14/15] fix test_splash_request: pass content type too --- tests/test_middleware.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/tests/test_middleware.py b/tests/test_middleware.py index b0918bc..66ab83d 100644 --- a/tests/test_middleware.py +++ b/tests/test_middleware.py @@ -19,6 +19,13 @@ SlotPolicy, SplashCookiesMiddleware, SplashDeduplicateArgsMiddleware, + SplashHtmlResponse, + SplashTextResponse, +) + +splash_scrapy_content_types = ( + (SplashTextResponse, TextResponse, b'text/*'), + (SplashHtmlResponse, HtmlResponse, b'text/html'), ) @@ -62,10 +69,10 @@ def test_nosplash(): @pytest.mark.parametrize( - 'scrapy_response_type, splash_response_type', - scrapy_splash.response.splash_scrapy_text_responses + 'splash_response_type, scrapy_response_type, content_type', + splash_scrapy_content_types, ) -def test_splash_request(splash_response_type, scrapy_response_type): +def test_splash_request(splash_response_type, scrapy_response_type, content_type): mw = _get_mw() cookie_mw = _get_cookie_mw() @@ -87,11 +94,12 @@ def test_splash_request(splash_response_type, scrapy_response_type): assert json.loads(to_native_str(req2.body)) == expected_body # check response post-processing - response = scrapy_response_type("http://127.0.0.1:8050/render.html", + response = scrapy_response_type( + url="http://127.0.0.1:8050/render.html", # Scrapy doesn't pass request to constructor # request=req2, - headers={b'Content-Type': b'text/html'}, - body=b"Hello") + headers={b'Content-Type': content_type}, + body=b"Hello", ) response2 = mw.process_response(req2, response, None) response2 = cookie_mw.process_response(req2, response2, None) assert isinstance(response2, splash_response_type) @@ -100,7 +108,7 @@ def test_splash_request(splash_response_type, scrapy_response_type): assert response2.url == req.url assert response2.body == b"Hello" assert response2.css("body").extract_first() == "Hello" - assert response2.headers == {b'Content-Type': [b'text/html']} + assert response2.headers == {b'Content-Type': [content_type, ]} # check .replace method response3 = response2.replace(status=404) From 6340dad717816b1b32ef528a7d88e1508b6beb32 Mon Sep 17 00:00:00 2001 From: Illia Ananich Date: Fri, 19 Jan 2018 17:23:43 +0200 Subject: [PATCH 15/15] fix Python2.7 compatibility in `SplashMiddleware._change_response_class` --- scrapy_splash/middleware.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapy_splash/middleware.py b/scrapy_splash/middleware.py index 06c68b6..5ec2c04 100644 --- a/scrapy_splash/middleware.py +++ b/scrapy_splash/middleware.py @@ -402,7 +402,7 @@ def _change_response_class(self, request, response): from scrapy_splash.response import splash_scrapy_text_responses splash_text_response_types = tuple(x for x, y in splash_scrapy_text_responses) - if not isinstance(response, (SplashResponse, *splash_text_response_types)): + if not isinstance(response, (SplashResponse, splash_text_response_types)): # create a custom Response subclass based on response Content-Type # XXX: usually request is assigned to response only when all # downloader middlewares are executed. Here it is set earlier.