diff --git a/README.rst b/README.rst index 13223d3a..a95c753d 100644 --- a/README.rst +++ b/README.rst @@ -46,7 +46,7 @@ Lastly, make sure to `install the asyncio-based Twisted reactor `_ in the ``settings.py`` file as well: -Here's example of the things needed inside a Scrapy project's ``settings.py`` file: +Here's an example of the things needed inside a Scrapy project's ``settings.py`` file: .. code-block:: python @@ -63,10 +63,24 @@ Here's example of the things needed inside a Scrapy project's ``settings.py`` fi Usage ----- -Set the ``zyte_api`` `Request.meta -`_ -key to download a request using Zyte API. Full list of parameters is provided in the -`Zyte API Specification `_. +To enable every request to be sent through Zyte API, you can set the following +in the ``settings.py`` file or `any other settings within Scrapy +`_: + +.. code-block:: python + + ZYTE_API_DEFAULT_PARAMS = { + "browserHtml": True, + "geolocation": "US", + } + +You can see the full list of parameters in the `Zyte API Specification +`_. + +On the other hand, you could also control it on a per-request basis by setting the +``zyte_api`` key in `Request.meta `_. +When doing so, it will override any parameters that were set in the +``ZYTE_API_DEFAULT_PARAMS`` setting. ..
code-block:: python diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index 273a8e3d..a395f871 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -31,6 +31,7 @@ def __init__( ) self._stats = crawler.stats self._job_id = crawler.settings.get("JOB") + self._zyte_api_default_params = settings.getdict("ZYTE_API_DEFAULT_PARAMS") self._session = create_session() @classmethod @@ -56,11 +57,14 @@ def download_request(self, request: Request, spider: Spider) -> Deferred: async def _download_request( self, request: Request, spider: Spider ) -> Union[ZyteAPITextResponse, ZyteAPIResponse]: - api_params: Dict[str, Any] = request.meta["zyte_api"] - if not isinstance(api_params, dict): + api_params: Dict[str, Any] = self._zyte_api_default_params or {} + try: + api_params.update(request.meta.get("zyte_api") or {}) + except TypeError: logger.error( - "zyte_api parameters in the request meta should be " - f"provided as dictionary, got {type(api_params)} instead ({request.url})." + f"zyte_api parameters in the request meta should be " + f"provided as dictionary, got {type(request.meta.get('zyte_api'))} " + f"instead ({request.url})." 
) raise IgnoreRequest() # Define url by default diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index c62640ed..316b0a59 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -1,6 +1,8 @@ import os +import sys from asyncio import iscoroutine from typing import Any, Dict +from unittest import mock import pytest from _pytest.logging import LogCaptureFixture # NOQA @@ -23,6 +25,21 @@ class TestAPI: + @staticmethod + async def produce_request_response(meta, custom_settings=None): + with MockServer() as server: + async with make_handler(custom_settings, server.urljoin("/")) as handler: + req = Request( + "http://example.com", + method="POST", + meta=meta, + ) + coro = handler._download_request(req, None) + assert iscoroutine(coro) + assert not isinstance(coro, Deferred) + resp = await coro # type: ignore + return req, resp + @pytest.mark.parametrize( "meta", [ @@ -34,25 +51,14 @@ class TestAPI: ) @pytest.mark.asyncio async def test_browser_html_request(self, meta: Dict[str, Dict[str, Any]]): - with MockServer() as server: - async with make_handler({}, server.urljoin("/")) as handler: - req = Request( - "http://example.com", - method="POST", - meta=meta, - ) - coro = handler._download_request(req, Spider("test")) - assert iscoroutine(coro) - assert not isinstance(coro, Deferred) - resp = await coro # type: ignore - - assert isinstance(resp, TextResponse) - assert resp.request is req - assert resp.url == req.url - assert resp.status == 200 - assert "zyte-api" in resp.flags - assert resp.body == b"" - assert resp.text == "" + req, resp = await self.produce_request_response(meta) + assert isinstance(resp, TextResponse) + assert resp.request is req + assert resp.url == req.url + assert resp.status == 200 + assert "zyte-api" in resp.flags + assert resp.body == b"" + assert resp.text == "" @pytest.mark.parametrize( "meta", @@ -71,24 +77,13 @@ async def test_browser_html_request(self, meta: Dict[str, Dict[str, Any]]): ) 
@pytest.mark.asyncio async def test_http_response_body_request(self, meta: Dict[str, Dict[str, Any]]): - with MockServer() as server: - async with make_handler({}, server.urljoin("/")) as handler: - req = Request( - "http://example.com", - method="POST", - meta=meta, - ) - coro = handler._download_request(req, Spider("test")) - assert iscoroutine(coro) - assert not isinstance(coro, Deferred) - resp = await coro # type: ignore - - assert isinstance(resp, Response) - assert resp.request is req - assert resp.url == req.url - assert resp.status == 200 - assert "zyte-api" in resp.flags - assert resp.body == b"" + req, resp = await self.produce_request_response(meta) + assert isinstance(resp, Response) + assert resp.request is req + assert resp.url == req.url + assert resp.status == 200 + assert "zyte-api" in resp.flags + assert resp.body == b"" @pytest.mark.parametrize( "meta", @@ -99,24 +94,64 @@ async def test_http_response_body_request(self, meta: Dict[str, Dict[str, Any]]) ) @pytest.mark.asyncio async def test_http_response_headers_request(self, meta: Dict[str, Dict[str, Any]]): - with MockServer() as server: - async with make_handler({}, server.urljoin("/")) as handler: - req = Request( - "http://example.com", - method="POST", - meta=meta, - ) - coro = handler._download_request(req, Spider("test")) - assert iscoroutine(coro) - assert not isinstance(coro, Deferred) - resp = await coro # type: ignore + req, resp = await self.produce_request_response(meta) + assert resp.request is req + assert resp.url == req.url + assert resp.status == 200 + assert "zyte-api" in resp.flags + assert resp.body == b"" + assert resp.headers == {b"Test_Header": [b"test_value"]} - assert resp.request is req - assert resp.url == req.url - assert resp.status == 200 - assert "zyte-api" in resp.flags - assert resp.body == b"" - assert resp.headers == {b"Test_Header": [b"test_value"]} + @pytest.mark.skipif( + sys.version_info < (3, 8), reason="Python3.7 has poor support for AsyncMocks" + ) + 
@pytest.mark.parametrize( + "meta,custom_settings,expected", + [ + ({}, {}, {}), + ({"zyte_api": {}}, {}, {}), + ( + {}, + {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}}, + {"browserHtml": True, "geolocation": "CA"}, + ), + ( + {"zyte_api": {}}, + {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}}, + {"browserHtml": True, "geolocation": "CA"}, + ), + ( + {"zyte_api": {"javascript": True, "geolocation": "US"}}, + {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}}, + {"browserHtml": True, "geolocation": "US", "javascript": True}, + ), + ], + ) + @mock.patch("tests.AsyncClient") + @pytest.mark.asyncio + async def test_empty_zyte_api_request_meta( + self, + mock_client, + meta: Dict[str, Dict[str, Any]], + custom_settings: Dict[str, str], + expected: Dict[str, str], + ): + try: + # This would always error out since the mocked client doesn't + # return the expected API response. + await self.produce_request_response(meta, custom_settings=custom_settings) + except Exception: + pass + + # What we're interested in is the Request call in the API + request_call = [c for c in mock_client.mock_calls if "request_raw(" in str(c)] + if not request_call: + pytest.fail("The client's request_raw() method was not called.") + + args_used = request_call[0].args[0] + args_used.pop("url") + + assert args_used == expected @pytest.mark.parametrize( "meta, api_relevant",