From 5b834437f144d76b28c5f83c960a3692954e408d Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 29 Apr 2022 21:28:55 +0800 Subject: [PATCH 1/6] update README regarding default params --- README.rst | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/README.rst b/README.rst index 13223d3a..34c45dd1 100644 --- a/README.rst +++ b/README.rst @@ -60,13 +60,40 @@ Here's example of the things needed inside a Scrapy project's ``settings.py`` fi TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" +To enable every request to be sent through Zyte API, you can set the following +in the ``settings.py`` file: + +.. code-block:: python + + ZYTE_API_ENABLED = True + +Moreover, it can also be set inside the spider as an attribute: + +.. code-block:: python + + class MySpider: + zyte_api_enabled = True + +The default parameters sent for every request could be declared in the ``settings.py`` +file or `any other settings `_ +as: + +.. code-block:: python + + ZYTE_API_DEFAULT_PARAMS = { + "browserHtml": True, + "geolocation": "US", + } + +You can see the full list of parameters in the `Zyte API Specification +`_. + Usage ----- -Set the ``zyte_api`` `Request.meta -`_ -key to download a request using Zyte API. Full list of parameters is provided in the -`Zyte API Specification `_. +Setting ``ZYTE_API_ENABLED=True`` would enable Zyte API for every request. On the +other hand, you could also control it on a per request basis by setting the +``zyte_api`` key in `Request.meta `_. .. code-block:: python From 10a460332b4a47b4ea479cd2378230dfd1378cf0 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 2 May 2022 20:45:44 +0800 Subject: [PATCH 2/6] remove the ZYTE_API_ENABLED setting --- README.rst | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/README.rst b/README.rst index 34c45dd1..5f844e32 100644 --- a/README.rst +++ b/README.rst @@ -46,7 +46,7 @@ Lastly, make sure to `install the asyncio-based Twisted reactor `_ in the ``settings.py`` file as well: -Here's example of the things needed inside a Scrapy project's ``settings.py`` file: +Here's an example of the things needed inside a Scrapy project's ``settings.py`` file: .. code-block:: python @@ -60,39 +60,38 @@ Here's example of the things needed inside a Scrapy project's ``settings.py`` fi TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" +Usage +----- + To enable every request to be sent through Zyte API, you can set the following -in the ``settings.py`` file: +in the ``settings.py`` file or `any other settings within Scrapy +`_: .. code-block:: python - ZYTE_API_ENABLED = True + ZYTE_API_DEFAULT_PARAMS = { + "browserHtml": True, + "geolocation": "US", + } Moreover, it can also be set inside the spider as an attribute: .. code-block:: python class MySpider: - zyte_api_enabled = True + zyte_api_default_params = { + "browserHtml": True, + "geolocation": "US", + } -The default parameters sent for every request could be declared in the ``settings.py`` -file or `any other settings `_ -as: - -.. code-block:: python - - ZYTE_API_DEFAULT_PARAMS = { - "browserHtml": True, - "geolocation": "US", - } +If the default parameters are both set in the ``settings.py`` and the **spider**, +the values within ``settings.py`` will take effect first. The values within the +**spider** will be next, essentially overwriting similar parameters. You can see the full list of parameters in the `Zyte API Specification `_. -Usage ------ - -Setting ``ZYTE_API_ENABLED=True`` would enable Zyte API for every request. On the -other hand, you could also control it on a per request basis by setting the +On the other hand, you could also control it on a per request basis by setting the ``zyte_api`` key in `Request.meta `_. .. code-block:: python From b7102fad4acf7ec4ba390095be4f8fc96057060a Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 2 May 2022 20:51:35 +0800 Subject: [PATCH 3/6] remove zyte_api_default_params in the spider --- README.rst | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/README.rst b/README.rst index 5f844e32..065e1529 100644 --- a/README.rst +++ b/README.rst @@ -74,20 +74,6 @@ in the ``settings.py`` file or `any other settings within Scrapy "geolocation": "US", } -Moreover, it can also be set inside the spider as an attribute: - -.. code-block:: python - - class MySpider: - zyte_api_default_params = { - "browserHtml": True, - "geolocation": "US", - } - -If the default parameters are both set in the ``settings.py`` and the **spider**, -the values within ``settings.py`` will take effect first. The values within the -**spider** will be next, essentially overwriting similar parameters. - You can see the full list of parameters in the `Zyte API Specification `_. From 2b4a0fb07caf7c763a0c2b3dc364290169198a45 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 2 May 2022 21:27:55 +0800 Subject: [PATCH 4/6] refactor TestAPI to have single producer of requests and responses --- tests/test_api_requests.py | 91 ++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 54 deletions(-) diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index c62640ed..69d1fd1a 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -23,17 +23,9 @@ class TestAPI: - @pytest.mark.parametrize( - "meta", - [ - {"zyte_api": {"browserHtml": True}}, - {"zyte_api": {"browserHtml": True, "geolocation": "US"}}, - {"zyte_api": {"browserHtml": True, "geolocation": "US", "echoData": 123}}, - {"zyte_api": {"browserHtml": True, "randomParameter": None}}, - ], - ) - @pytest.mark.asyncio - async def test_browser_html_request(self, meta: Dict[str, Dict[str, Any]]): + + @staticmethod + async def produce_request_response(meta): with MockServer() as server: async with make_handler({}, server.urljoin("/")) as handler: req = Request( @@ -45,14 +37,27 @@ async def test_browser_html_request(self, meta: Dict[str, Dict[str, Any]]): assert iscoroutine(coro) assert not isinstance(coro, Deferred) resp = await coro # type: ignore + return req, resp - assert isinstance(resp, TextResponse) - assert resp.request is req - assert resp.url == req.url - assert resp.status == 200 - assert "zyte-api" in resp.flags - assert resp.body == b"" - assert resp.text == "" + @pytest.mark.parametrize( + "meta", + [ + {"zyte_api": {"browserHtml": True}}, + {"zyte_api": {"browserHtml": True, "geolocation": "US"}}, + {"zyte_api": {"browserHtml": True, "geolocation": "US", "echoData": 123}}, + {"zyte_api": {"browserHtml": True, "randomParameter": None}}, + ], + ) + @pytest.mark.asyncio + async def test_browser_html_request(self, meta: Dict[str, Dict[str, Any]]): + req, resp = await self.produce_request_response(meta) + assert isinstance(resp, TextResponse) + assert resp.request is req + assert resp.url == req.url + assert resp.status == 200 + assert "zyte-api" in resp.flags + assert resp.body == b"" + assert resp.text == "" @pytest.mark.parametrize( "meta", @@ -71,24 +76,13 @@ async def test_browser_html_request(self, meta: Dict[str, Dict[str, Any]]): ) @pytest.mark.asyncio async def test_http_response_body_request(self, meta: Dict[str, Dict[str, Any]]): - with MockServer() as server: - async with make_handler({}, server.urljoin("/")) as handler: - req = Request( - "http://example.com", - method="POST", - meta=meta, - ) - coro = handler._download_request(req, Spider("test")) - assert iscoroutine(coro) - assert not isinstance(coro, Deferred) - resp = await coro # type: ignore - - assert isinstance(resp, Response) - assert resp.request is req - assert resp.url == req.url - assert resp.status == 200 - assert "zyte-api" in resp.flags - assert resp.body == b"" + req, resp = await self.produce_request_response(meta) + assert isinstance(resp, Response) + assert resp.request is req + assert resp.url == req.url + assert resp.status == 200 + assert "zyte-api" in resp.flags + assert resp.body == b"" @pytest.mark.parametrize( "meta", @@ -99,24 +93,13 @@ async def test_http_response_body_request(self, meta: Dict[str, Dict[str, Any]]) ) @pytest.mark.asyncio async def test_http_response_headers_request(self, meta: Dict[str, Dict[str, Any]]): - with MockServer() as server: - async with make_handler({}, server.urljoin("/")) as handler: - req = Request( - "http://example.com", - method="POST", - meta=meta, - ) - coro = handler._download_request(req, Spider("test")) - assert iscoroutine(coro) - assert not isinstance(coro, Deferred) - resp = await coro # type: ignore - - assert resp.request is req - assert resp.url == req.url - assert resp.status == 200 - assert "zyte-api" in resp.flags - assert resp.body == b"" - assert resp.headers == {b"Test_Header": [b"test_value"]} + req, resp = await self.produce_request_response(meta) + assert resp.request is req + assert resp.url == req.url + assert resp.status == 200 + assert "zyte-api" in resp.flags + assert resp.body == b"" + assert resp.headers == {b"Test_Header": [b"test_value"]} @pytest.mark.parametrize( "meta, api_relevant", From 97ea1e4b6ec770eeda1dcf846e7e31531ba7a64d Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 3 May 2022 10:54:32 +0800 Subject: [PATCH 5/6] implement ZYTE_API_DEFAULT_PARAMS in the settings --- README.rst | 2 ++ scrapy_zyte_api/handler.py | 12 ++++++--- tests/test_api_requests.py | 55 +++++++++++++++++++++++++++++++++++--- 3 files changed, 61 insertions(+), 8 deletions(-) diff --git a/README.rst b/README.rst index 065e1529..36ae4241 100644 --- a/README.rst +++ b/README.rst @@ -79,6 +79,8 @@ You can see the full list of parameters in the `Zyte API Specification On the other hand, you could also control it on a per request basis by setting the ``zyte_api`` key in `Request.meta `_. +When doing so, it will override any parameters that was set via the setting +named ``ZYTE_API_DEFAULT_PARAMS``. .. code-block:: python diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index 273a8e3d..a395f871 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -31,6 +31,7 @@ def __init__( ) self._stats = crawler.stats self._job_id = crawler.settings.get("JOB") + self._zyte_api_default_params = settings.getdict("ZYTE_API_DEFAULT_PARAMS") self._session = create_session() @classmethod @@ -56,11 +57,14 @@ def download_request(self, request: Request, spider: Spider) -> Deferred: async def _download_request( self, request: Request, spider: Spider ) -> Union[ZyteAPITextResponse, ZyteAPIResponse]: - api_params: Dict[str, Any] = request.meta["zyte_api"] - if not isinstance(api_params, dict): + api_params: Dict[str, Any] = self._zyte_api_default_params or {} + try: + api_params.update(request.meta.get("zyte_api") or {}) + except TypeError: logger.error( - "zyte_api parameters in the request meta should be " - f"provided as dictionary, got {type(api_params)} instead ({request.url})." + f"zyte_api parameters in the request meta should be " + f"provided as dictionary, got {type(request.meta.get('zyte_api'))} " + f"instead ({request.url})." ) raise IgnoreRequest() # Define url by default diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 69d1fd1a..fe81b308 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -1,6 +1,7 @@ import os from asyncio import iscoroutine from typing import Any, Dict +from unittest import mock import pytest from _pytest.logging import LogCaptureFixture # NOQA @@ -23,17 +24,16 @@ class TestAPI: - @staticmethod - async def produce_request_response(meta): + async def produce_request_response(meta, custom_settings=None): with MockServer() as server: - async with make_handler({}, server.urljoin("/")) as handler: + async with make_handler(custom_settings, server.urljoin("/")) as handler: req = Request( "http://example.com", method="POST", meta=meta, ) - coro = handler._download_request(req, Spider("test")) + coro = handler._download_request(req, None) assert iscoroutine(coro) assert not isinstance(coro, Deferred) resp = await coro # type: ignore @@ -101,6 +101,53 @@ async def test_http_response_headers_request(self, meta: Dict[str, Dict[str, Any assert resp.body == b"" assert resp.headers == {b"Test_Header": [b"test_value"]} + @pytest.mark.parametrize( + "meta,custom_settings,expected", + [ + ({}, {}, {}), + ({"zyte_api": {}}, {}, {}), + ( + {}, + {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}}, + {"browserHtml": True, "geolocation": "CA"}, + ), + ( + {"zyte_api": {}}, + {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}}, + {"browserHtml": True, "geolocation": "CA"}, + ), + ( + {"zyte_api": {"javascript": True, "geolocation": "US"}}, + {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}}, + {"browserHtml": True, "geolocation": "US", "javascript": True}, + ), + ], + ) + @mock.patch("tests.AsyncClient") + @pytest.mark.asyncio + async def test_empty_zyte_api_request_meta( + self, + mock_client, + meta: Dict[str, Dict[str, Any]], + custom_settings: Dict[str, str], + expected: Dict[str, str], + ): + try: + # This would always error out since the mocked client doesn't + # return the expected API response. + await self.produce_request_response(meta, custom_settings=custom_settings) + except: + pass + + request_call = [c for c in mock_client.mock_calls if "request_raw(" in str(c)] + if not request_call: + pytest.fail("The client's request_raw() method was not called.") + + args_used = request_call[0].args[0] + args_used.pop("url") + + assert args_used == expected + @pytest.mark.parametrize( "meta, api_relevant", [ From 5dd1bec7b9d181508f86df350d1a655d24166875 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 3 May 2022 11:07:44 +0800 Subject: [PATCH 6/6] fix failing tests --- README.rst | 4 ++-- tests/test_api_requests.py | 7 ++++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 36ae4241..a95c753d 100644 --- a/README.rst +++ b/README.rst @@ -79,8 +79,8 @@ You can see the full list of parameters in the `Zyte API Specification On the other hand, you could also control it on a per request basis by setting the ``zyte_api`` key in `Request.meta `_. -When doing so, it will override any parameters that was set via the setting -named ``ZYTE_API_DEFAULT_PARAMS``. +When doing so, it will override any parameters that was set in the +``ZYTE_API_DEFAULT_PARAMS`` setting. .. code-block:: python diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index fe81b308..316b0a59 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -1,4 +1,5 @@ import os +import sys from asyncio import iscoroutine from typing import Any, Dict from unittest import mock @@ -101,6 +102,9 @@ async def test_http_response_headers_request(self, meta: Dict[str, Dict[str, Any assert resp.body == b"" assert resp.headers == {b"Test_Header": [b"test_value"]} + @pytest.mark.skipif( + sys.version_info < (3, 8), reason="Python3.7 has poor support for AsyncMocks" + ) @pytest.mark.parametrize( "meta,custom_settings,expected", [ @@ -136,9 +140,10 @@ async def test_empty_zyte_api_request_meta( # This would always error out since the mocked client doesn't # return the expected API response. await self.produce_request_response(meta, custom_settings=custom_settings) - except: + except Exception: pass + # What we're interested in is the Request call in the API request_call = [c for c in mock_client.mock_calls if "request_raw(" in str(c)] if not request_call: pytest.fail("The client's request_raw() method was not called.")