scrapy-plugins · kmike · May 30, 2022 · Apr 26, 2022 · Apr 27, 2022 · Apr 28, 2022
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -1,6 +1,14 @@
 Changes
 =======
 
+TBD
+---
+
+* Introduce ``ZyteAPIResponse`` and ``ZyteAPITextResponse`` which are subclasses
+  of ``scrapy.http.Response`` and ``scrapy.http.TextResponse`` respectively.
+  These new response classes hold the raw Zyte API response in their
+  ``zyte_api_response`` attribute.
+
 0.1.0 (2022-02-03)
 ------------------
 

diff --git a/README.rst b/README.rst
@@ -33,8 +33,8 @@ Installation
 
 This package requires Python 3.7+.
 
-How to configure
-----------------
+Configuration
+-------------
 
 Replace the default ``http`` and ``https`` in Scrapy's
 `DOWNLOAD_HANDLERS <https://docs.scrapy.org/en/latest/topics/settings.html#std-setting-DOWNLOAD_HANDLERS>`_
@@ -60,8 +60,8 @@ Here's example of the things needed inside a Scrapy project's ``settings.py`` fi
 
     TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
 
-How to use
-----------
+Usage
+-----
 
 Set the ``zyte_api`` `Request.meta
 <https://docs.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.meta>`_
@@ -70,27 +70,51 @@ key to download a request using Zyte API. Full list of parameters is provided in
 
 .. code-block:: python
 
-   import scrapy
-
-
-   class TestSpider(scrapy.Spider):
-       name = "test"
-
-       def start_requests(self):
-
-           yield scrapy.Request(
-               url="http://books.toscrape.com/",
-               callback=self.parse,
-               meta={
-                   "zyte_api": {
-                       "browserHtml": True,
-                       # You can set any GEOLocation region you want.
-                       "geolocation": "US",
-                       "javascript": True,
-                       "echoData": {"something": True},
-                   }
-               },
-           )
-
-       def parse(self, response):
-           yield {"URL": response.url, "status": response.status, "HTML": response.body}
+    import scrapy
+
+
+    class SampleQuotesSpider(scrapy.Spider):
+        name = "sample_quotes"
+
+        def start_requests(self):
+
+            yield scrapy.Request(
+                url="https://quotes.toscrape.com/",
+                callback=self.parse,
+                meta={
+                    "zyte_api": {
+                        "browserHtml": True,
+                        "geolocation": "US",  # You can set any Geolocation region you want.
+                        "javascript": True,
+                        "echoData": {"some_value_I_could_track": 123},
+                    }
+                },
+            )
+
+        def parse(self, response):
+            yield {"URL": response.url, "status": response.status, "HTML": response.body}
+
+            print(response.zyte_api_response)
+            # {
+            #     'url': 'https://quotes.toscrape.com/',
+            #     'browserHtml': '<html> ... </html>',
+            #     'echoData': {'some_value_I_could_track': 123},
+            # }
+
+            print(response.request.meta)
+            # {
+            #     'zyte_api': {
+            #         'browserHtml': True,
+            #         'geolocation': 'US',
+            #         'javascript': True,
+            #         'echoData': {'some_value_I_could_track': 123}
+            #     },
+            #     'download_timeout': 180.0,
+            #     'download_slot': 'quotes.toscrape.com'
+            # }
+
+The raw Zyte API response can be accessed via the ``zyte_api_response`` attribute
+of the response object. Note that such responses are instances of either
+``ZyteAPIResponse`` or ``ZyteAPITextResponse``, which are subclasses of
+``scrapy.http.Response`` and ``scrapy.http.TextResponse`` respectively. These
+classes are needed to hold the raw Zyte API responses.
diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py
@@ -1,21 +1,22 @@
 import json
 import logging
 import os
-from base64 import b64decode
-from typing import Any, Dict, Generator, List, Optional
+from typing import Any, Dict, Generator, Union
 
 from scrapy import Spider
 from scrapy.core.downloader.handlers.http import HTTPDownloadHandler
 from scrapy.crawler import Crawler
 from scrapy.exceptions import IgnoreRequest, NotConfigured
-from scrapy.http import Request, Response, TextResponse
+from scrapy.http import Request
 from scrapy.settings import Settings
 from scrapy.utils.defer import deferred_from_coro
 from scrapy.utils.reactor import verify_installed_reactor
 from twisted.internet.defer import Deferred, inlineCallbacks
 from zyte_api.aio.client import AsyncClient, create_session
 from zyte_api.aio.errors import RequestError
 
+from .responses import ZyteAPIResponse, ZyteAPITextResponse
+
 logger = logging.getLogger(__name__)
 
 
@@ -31,7 +32,6 @@ def __init__(
         self._stats = crawler.stats
         self._job_id = crawler.settings.get("JOB")
         self._session = create_session()
-        self._encoding = "utf-8"
 
     @classmethod
     def from_crawler(cls, crawler):
@@ -53,7 +53,9 @@ def download_request(self, request: Request, spider: Spider) -> Deferred:
         else:
             return super().download_request(request, spider)
 
-    async def _download_request(self, request: Request, spider: Spider) -> Response:
+    async def _download_request(
+        self, request: Request, spider: Spider
+    ) -> Union[ZyteAPITextResponse, ZyteAPIResponse]:
         api_params: Dict[str, Any] = request.meta["zyte_api"]
         if not isinstance(api_params, dict):
             logger.error(
@@ -81,30 +83,14 @@ async def _download_request(self, request: Request, spider: Spider) -> Response:
             )
             raise IgnoreRequest()
         self._stats.inc_value("scrapy-zyte-api/request_count")
-        headers = self._prepare_headers(api_response.get("httpResponseHeaders"))
         # browserHtml and httpResponseBody are not allowed at the same time,
         # but at least one of them should be present
         if api_response.get("browserHtml"):
             # Using TextResponse because browserHtml always returns a browser-rendered page
             # even when requesting files (like images)
-            return TextResponse(
-                url=api_response["url"],
-                status=200,
-                body=api_response["browserHtml"].encode(self._encoding),
-                encoding=self._encoding,
-                request=request,
-                flags=["zyte-api"],
-                headers=headers,
-            )
+            return ZyteAPITextResponse.from_api_response(api_response, request=request)
         else:
-            return Response(
-                url=api_response["url"],
-                status=200,
-                body=b64decode(api_response["httpResponseBody"]),
-                request=request,
-                flags=["zyte-api"],
-                headers=headers,
-            )
+            return ZyteAPIResponse.from_api_response(api_response, request=request)
 
     @inlineCallbacks
     def close(self) -> Generator:
@@ -129,9 +115,3 @@ def _get_request_error_message(error: RequestError) -> str:
         if error_data.get("detail"):
             return error_data["detail"]
         return base_message
-
-    @staticmethod
-    def _prepare_headers(init_headers: Optional[List[Dict[str, str]]]):
-        if not init_headers:
-            return None
-        return {h["name"]: h["value"] for h in init_headers}
diff --git a/scrapy_zyte_api/responses.py b/scrapy_zyte_api/responses.py
@@ -0,0 +1,73 @@
+from base64 import b64decode
+from typing import Dict, List, Optional
+
+from scrapy import Request
+from scrapy.http import Response, TextResponse
+
+_ENCODING = "utf-8"
+
+
+class ZyteAPIMixin:
+    def __init__(self, *args, zyte_api_response: Dict = None, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._zyte_api_response = zyte_api_response
+
+    def replace(self, *args, **kwargs):
+        """Create a new response with the same attributes except for those given
+        new values.
+
+        NOTE: This doesn't support replacing the ``zyte_api_response`` attribute.
+        """
+        instance = super().replace(*args, **kwargs)
+        instance._zyte_api_response = self.zyte_api_response
+        return instance
+
+    @property
+    def zyte_api_response(self) -> Optional[Dict]:
+        """Contains the raw API response from Zyte API.
+
+        To see the full list of parameters and their description, kindly refer to the
+        `Zyte API Specification <https://docs.zyte.com/zyte-api/openapi.html#zyte-openapi-spec>`_.
+        """
+        return self._zyte_api_response
+
+    @staticmethod
+    def _prepare_headers(init_headers: Optional[List[Dict[str, str]]]):
+        if not init_headers:
+            return None
+        return {h["name"]: h["value"] for h in init_headers}
+
+
+class ZyteAPITextResponse(ZyteAPIMixin, TextResponse):
+    @classmethod
+    def from_api_response(cls, api_response: Dict, *, request: Request = None):
+        """Alternative constructor to instantiate the response from the raw
+        Zyte API response.
+        """
+        return cls(
+            url=api_response["url"],
+            status=200,
+            body=api_response["browserHtml"].encode(_ENCODING),
+            encoding=_ENCODING,
+            request=request,
+            flags=["zyte-api"],
+            headers=cls._prepare_headers(api_response.get("httpResponseHeaders")),
+            zyte_api_response=api_response,
+        )
+
+
+class ZyteAPIResponse(ZyteAPIMixin, Response):
+    @classmethod
+    def from_api_response(cls, api_response: Dict, *, request: Request = None):
+        """Alternative constructor to instantiate the response from the raw
+        Zyte API response.
+        """
+        return cls(
+            url=api_response["url"],
+            status=200,
+            body=b64decode(api_response["httpResponseBody"]),
+            request=request,
+            flags=["zyte-api"],
+            headers=cls._prepare_headers(api_response.get("httpResponseHeaders")),
+            zyte_api_response=api_response,
+        )
diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py
@@ -44,7 +44,7 @@ async def test_browser_html_request(self, meta: Dict[str, Dict[str, Any]]):
                 coro = handler._download_request(req, Spider("test"))
                 assert iscoroutine(coro)
                 assert not isinstance(coro, Deferred)
-                resp = await coro  # NOQA
+                resp = await coro  # type: ignore
 
             assert isinstance(resp, TextResponse)
             assert resp.request is req
@@ -81,7 +81,7 @@ async def test_http_response_body_request(self, meta: Dict[str, Dict[str, Any]])
                 coro = handler._download_request(req, Spider("test"))
                 assert iscoroutine(coro)
                 assert not isinstance(coro, Deferred)
-                resp = await coro  # NOQA
+                resp = await coro  # type: ignore
 
             assert isinstance(resp, Response)
             assert resp.request is req
@@ -109,7 +109,7 @@ async def test_http_response_headers_request(self, meta: Dict[str, Dict[str, Any
                 coro = handler._download_request(req, Spider("test"))
                 assert iscoroutine(coro)
                 assert not isinstance(coro, Deferred)
-                resp = await coro  # NOQA
+                resp = await coro  # type: ignore
 
             assert resp.request is req
             assert resp.url == req.url