Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
9a83471
create ZyteAPITextResponse and ZyteAPIResponse
BurnzZ Apr 26, 2022
8909473
update README and CHANGES with notes on new response classes
BurnzZ Apr 27, 2022
d0dc08d
set the encoding consistently to be 'utf-8'
BurnzZ Apr 28, 2022
109dbf0
improve example and docs
BurnzZ Apr 28, 2022
9695880
override replace() to prevent 'zyte_api_response' attribute from bein…
BurnzZ Apr 28, 2022
8812a05
fix mypy failures
BurnzZ Apr 28, 2022
ba64103
enforce 'utf-8' encoding on Text responses
BurnzZ Apr 28, 2022
84dac7d
update expectation for replacing zyte_api_response attribute
BurnzZ Apr 29, 2022
5b83443
update README regarding default params
BurnzZ Apr 29, 2022
fb0b412
remove 'Content-Encoding' header when returning responses
BurnzZ May 2, 2022
10a4603
remove the ZYTE_API_ENABLED setting
BurnzZ May 2, 2022
b7102fa
remove zyte_api_default_params in the spider
BurnzZ May 2, 2022
2b4a0fb
refactor TestAPI to have single producer of requests and responses
BurnzZ May 2, 2022
97ea1e4
implement ZYTE_API_DEFAULT_PARAMS in the settings
BurnzZ May 3, 2022
5dd1bec
fix failing tests
BurnzZ May 3, 2022
052d0d6
Merge pull request #14 from scrapy-plugins/fix-decompression-error
kmike May 11, 2022
48a4766
rename zyte_api_response into zyte_api
BurnzZ May 19, 2022
2455bdf
Merge pull request #13 from scrapy-plugins/default-settings
BurnzZ May 19, 2022
910085b
add tests for css/xpath selectors
BurnzZ May 25, 2022
e3214d8
enable css/xpath selectors on httpResponseBody
BurnzZ May 26, 2022
e530053
handle empty 'browserHtml' or 'httpResponseBody'
BurnzZ May 26, 2022
27c7a7d
Fix typos in docs
BurnzZ May 27, 2022
5b7cf6f
update how replace() works
BurnzZ May 27, 2022
2adc8a6
update README in line with the ZYTE_API_DEFAULT_PARAMS expectations
BurnzZ May 27, 2022
32faf3d
add test case to ensure zyte_api is intact when replacing other attribs
BurnzZ May 27, 2022
cec0677
make process_response() private
BurnzZ May 27, 2022
e0865e7
update tests to ensure other response attribs are not updated on .rep…
BurnzZ May 27, 2022
34a427f
raise an error if zyte_api is passed to .replace()
BurnzZ May 27, 2022
37a4cc7
rename '.zyte_api' attribute as '.raw_api_response'
BurnzZ May 27, 2022
f5a9bb0
refactor to accept 'True' and '{}' to trigger Zyte API Requests
BurnzZ May 30, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
Changes
=======

TBD
---

* Introduce ``ZyteAPIResponse`` and ``ZyteAPITextResponse`` which are subclasses
of ``scrapy.http.Response`` and ``scrapy.http.TextResponse`` respectively.
These new response classes hold the raw Zyte API response in their
``zyte_api_response`` attribute.

0.1.0 (2022-02-03)
------------------

Expand Down
80 changes: 52 additions & 28 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ Installation

This package requires Python 3.7+.

How to configure
----------------
Configuration
-------------

Replace the default ``http`` and ``https`` in Scrapy's
`DOWNLOAD_HANDLERS <https://docs.scrapy.org/en/latest/topics/settings.html#std-setting-DOWNLOAD_HANDLERS>`_
Expand All @@ -60,8 +60,8 @@ Here's example of the things needed inside a Scrapy project's ``settings.py`` fi

TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"

How to use
----------
Usage
-----

Set the ``zyte_api`` `Request.meta
<https://docs.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.meta>`_
Expand All @@ -70,27 +70,51 @@ key to download a request using Zyte API. Full list of parameters is provided in

.. code-block:: python

import scrapy


class TestSpider(scrapy.Spider):
name = "test"

def start_requests(self):

yield scrapy.Request(
url="http://books.toscrape.com/",
callback=self.parse,
meta={
"zyte_api": {
"browserHtml": True,
# You can set any GEOLocation region you want.
"geolocation": "US",
"javascript": True,
"echoData": {"something": True},
}
},
)

def parse(self, response):
yield {"URL": response.url, "status": response.status, "HTML": response.body}
import scrapy


class SampleQuotesSpider(scrapy.Spider):
name = "sample_quotes"

def start_requests(self):

yield scrapy.Request(
url="http://books.toscrape.com/",
callback=self.parse,
meta={
"zyte_api": {
"browserHtml": True,
"geolocation": "US", # You can set any Geolocation region you want.
"javascript": True,
"echoData": {"some_value_I_could_track": 123},
}
},
)

def parse(self, response):
yield {"URL": response.url, "status": response.status, "HTML": response.body}

print(response.zyte_api_response)
# {
# 'url': 'https://quotes.toscrape.com/',
# 'browserHtml': '<html> ... </html>',
# 'echoData': {'some_value_I_could_track': 123},
# }

print(response.request.meta)
# {
# 'zyte_api': {
# 'browserHtml': True,
# 'geolocation': 'US',
# 'javascript': True,
# 'echoData': {'some_value_I_could_track': 123}
# },
# 'download_timeout': 180.0,
# 'download_slot': 'quotes.toscrape.com'
# }

The raw Zyte API response can be accessed via the ``zyte_api_response`` attribute
of the response object. Note that such responses are instances of either
``ZyteAPIResponse`` or ``ZyteAPITextResponse``, which are subclasses of
``scrapy.http.Response`` and ``scrapy.http.TextResponse`` respectively. These
classes are needed to hold the raw Zyte API responses.
38 changes: 9 additions & 29 deletions scrapy_zyte_api/handler.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,22 @@
import json
import logging
import os
from base64 import b64decode
from typing import Any, Dict, Generator, List, Optional
from typing import Any, Dict, Generator, Union

from scrapy import Spider
from scrapy.core.downloader.handlers.http import HTTPDownloadHandler
from scrapy.crawler import Crawler
from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.http import Request, Response, TextResponse
from scrapy.http import Request
from scrapy.settings import Settings
from scrapy.utils.defer import deferred_from_coro
from scrapy.utils.reactor import verify_installed_reactor
from twisted.internet.defer import Deferred, inlineCallbacks
from zyte_api.aio.client import AsyncClient, create_session
from zyte_api.aio.errors import RequestError

from .responses import ZyteAPIResponse, ZyteAPITextResponse

logger = logging.getLogger(__name__)


Expand All @@ -31,7 +32,6 @@ def __init__(
self._stats = crawler.stats
self._job_id = crawler.settings.get("JOB")
self._session = create_session()
self._encoding = "utf-8"

@classmethod
def from_crawler(cls, crawler):
Expand All @@ -53,7 +53,9 @@ def download_request(self, request: Request, spider: Spider) -> Deferred:
else:
return super().download_request(request, spider)

async def _download_request(self, request: Request, spider: Spider) -> Response:
async def _download_request(
self, request: Request, spider: Spider
) -> Union[ZyteAPITextResponse, ZyteAPIResponse]:
api_params: Dict[str, Any] = request.meta["zyte_api"]
if not isinstance(api_params, dict):
logger.error(
Expand Down Expand Up @@ -81,30 +83,14 @@ async def _download_request(self, request: Request, spider: Spider) -> Response:
)
raise IgnoreRequest()
self._stats.inc_value("scrapy-zyte-api/request_count")
headers = self._prepare_headers(api_response.get("httpResponseHeaders"))
# browserHtml and httpResponseBody are not allowed at the same time,
# but at least one of them should be present
if api_response.get("browserHtml"):
# Using TextResponse because browserHtml always returns a browser-rendered page
# even when requesting files (like images)
return TextResponse(
url=api_response["url"],
status=200,
body=api_response["browserHtml"].encode(self._encoding),
encoding=self._encoding,
request=request,
flags=["zyte-api"],
headers=headers,
)
return ZyteAPITextResponse.from_api_response(api_response, request=request)
else:
return Response(
url=api_response["url"],
status=200,
body=b64decode(api_response["httpResponseBody"]),
request=request,
flags=["zyte-api"],
headers=headers,
)
return ZyteAPIResponse.from_api_response(api_response, request=request)

@inlineCallbacks
def close(self) -> Generator:
Expand All @@ -129,9 +115,3 @@ def _get_request_error_message(error: RequestError) -> str:
if error_data.get("detail"):
return error_data["detail"]
return base_message

@staticmethod
def _prepare_headers(init_headers: Optional[List[Dict[str, str]]]):
if not init_headers:
return None
return {h["name"]: h["value"] for h in init_headers}
73 changes: 73 additions & 0 deletions scrapy_zyte_api/responses.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
from base64 import b64decode
from typing import Dict, List, Optional

from scrapy import Request
from scrapy.http import Response, TextResponse

_ENCODING = "utf-8"


class ZyteAPIMixin:
    """Mixin that keeps the raw Zyte API response alongside a response object.

    The mixin relies on cooperative ``super()`` calls: it forwards every
    argument except ``zyte_api_response`` to the next class in the MRO, and
    delegates ``replace()`` to it as well.
    """

    def __init__(self, *args, zyte_api_response: Optional[Dict] = None, **kwargs):
        """Initialise the underlying response and store the raw API response.

        :param zyte_api_response: raw Zyte API response dict, or ``None`` when
            there is none; it is consumed here and not passed to the parent.
        """
        super().__init__(*args, **kwargs)
        self._zyte_api_response = zyte_api_response

    def replace(self, *args, **kwargs):
        """Create a new response with the same attributes except for those given
        new values.

        NOTE: This doesn't support replacing the ``zyte_api_response`` attribute;
        the new instance always receives this instance's raw API response.
        """
        instance = super().replace(*args, **kwargs)
        # Re-attach unconditionally so the raw API response survives replace().
        instance._zyte_api_response = self.zyte_api_response
        return instance

    @property
    def zyte_api_response(self) -> Optional[Dict]:
        """Contains the raw API response from Zyte API.

        To see the full list of parameters and their description, refer to the
        `Zyte API Specification <https://docs.zyte.com/zyte-api/openapi.html#zyte-openapi-spec>`_.
        """
        return self._zyte_api_response

    @staticmethod
    def _prepare_headers(
        init_headers: Optional[List[Dict[str, str]]]
    ) -> Optional[Dict]:
        """Convert a list of ``{"name": ..., "value": ...}`` items into a dict.

        :param init_headers: header items, or ``None``/empty when there are none.
        :return: ``None`` when there is nothing to convert; otherwise a dict
            keyed by header name. A name that occurs once maps to its string
            value; a name that occurs several times maps to the list of all
            its values in their original order, so repeated headers are not
            silently dropped.
        :raises KeyError: if an item lacks the ``"name"`` or ``"value"`` key.
        """
        if not init_headers:
            return None
        grouped: Dict[str, List[str]] = {}
        for header in init_headers:
            grouped.setdefault(header["name"], []).append(header["value"])
        # Keep the plain-string form for single-valued headers so the result is
        # unchanged for inputs without repeated names.
        return {
            name: values[0] if len(values) == 1 else values
            for name, values in grouped.items()
        }


class ZyteAPITextResponse(ZyteAPIMixin, TextResponse):
    """Text response created from a Zyte API result containing ``browserHtml``."""

    @classmethod
    def from_api_response(cls, api_response: Dict, *, request: Request = None):
        """Build an instance of this class out of a raw Zyte API response.

        The ``browserHtml`` string is encoded with the module-level encoding to
        form the body, the status is fixed to 200, and the raw ``api_response``
        is stored on the resulting object.
        """
        # Look the fields up in the same order the constructor call lists them.
        target_url = api_response["url"]
        html_bytes = api_response["browserHtml"].encode(_ENCODING)
        response_headers = cls._prepare_headers(
            api_response.get("httpResponseHeaders")
        )
        return cls(
            url=target_url,
            status=200,
            body=html_bytes,
            encoding=_ENCODING,
            request=request,
            flags=["zyte-api"],
            headers=response_headers,
            zyte_api_response=api_response,
        )


class ZyteAPIResponse(ZyteAPIMixin, Response):
    """Response created from a Zyte API result containing ``httpResponseBody``."""

    @classmethod
    def from_api_response(cls, api_response: Dict, *, request: Request = None):
        """Build an instance of this class out of a raw Zyte API response.

        The base64 ``httpResponseBody`` value is decoded to form the body, the
        status is fixed to 200, and the raw ``api_response`` is stored on the
        resulting object.
        """
        # Look the fields up in the same order the constructor call lists them.
        target_url = api_response["url"]
        decoded_body = b64decode(api_response["httpResponseBody"])
        response_headers = cls._prepare_headers(
            api_response.get("httpResponseHeaders")
        )
        return cls(
            url=target_url,
            status=200,
            body=decoded_body,
            request=request,
            flags=["zyte-api"],
            headers=response_headers,
            zyte_api_response=api_response,
        )
6 changes: 3 additions & 3 deletions tests/test_api_requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ async def test_browser_html_request(self, meta: Dict[str, Dict[str, Any]]):
coro = handler._download_request(req, Spider("test"))
assert iscoroutine(coro)
assert not isinstance(coro, Deferred)
resp = await coro # NOQA
resp = await coro # type: ignore

assert isinstance(resp, TextResponse)
assert resp.request is req
Expand Down Expand Up @@ -81,7 +81,7 @@ async def test_http_response_body_request(self, meta: Dict[str, Dict[str, Any]])
coro = handler._download_request(req, Spider("test"))
assert iscoroutine(coro)
assert not isinstance(coro, Deferred)
resp = await coro # NOQA
resp = await coro # type: ignore

assert isinstance(resp, Response)
assert resp.request is req
Expand Down Expand Up @@ -109,7 +109,7 @@ async def test_http_response_headers_request(self, meta: Dict[str, Dict[str, Any
coro = handler._download_request(req, Spider("test"))
assert iscoroutine(coro)
assert not isinstance(coro, Deferred)
resp = await coro # NOQA
resp = await coro # type: ignore

assert resp.request is req
assert resp.url == req.url
Expand Down
Loading