diff --git a/LICENSE b/LICENSE index 8cf9e15..2e17b1a 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright 2020 Scrapinghub +Copyright (c) 2021 Zyte Group Ltd Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/README.md b/README.md index 3af17cc..6b29841 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,11 @@ -# Scrapy Middleware for Crawlera Simple Fetch API -[![actions](https://github.com/scrapy-plugins/scrapy-crawlera-fetch/workflows/Build/badge.svg)](https://github.com/scrapy-plugins/scrapy-crawlera-fetch/actions) -[![codecov](https://codecov.io/gh/scrapy-plugins/scrapy-crawlera-fetch/branch/master/graph/badge.svg)](https://codecov.io/gh/scrapy-plugins/scrapy-crawlera-fetch) +# Scrapy Middleware for Zyte Smart Proxy Manager Simple Fetch API +[![actions](https://github.com/scrapy-plugins/scrapy-zyte-proxy-fetch/workflows/Build/badge.svg)](https://github.com/scrapy-plugins/scrapy-zyte-proxy-fetch/actions) +[![codecov](https://codecov.io/gh/scrapy-plugins/scrapy-zyte-proxy-fetch/branch/master/graph/badge.svg)](https://codecov.io/gh/scrapy-plugins/scrapy-zyte-proxy-fetch) This package provides a Scrapy [Downloader Middleware](https://docs.scrapy.org/en/latest/topics/downloader-middleware.html) to transparently interact with the -[Crawlera Fetch API](https://doc.scrapinghub.com/crawlera-fetch-api.html). +[Zyte Smart Proxy Manager Fetch API](https://docs.zyte.com/smart-proxy-manager/fetch-api.html). ## Requirements @@ -18,70 +18,70 @@ to transparently interact with the Not yet available on PyPI. 
However, it can be installed directly from GitHub: -`pip install git+ssh://git@github.com/scrapy-plugins/scrapy-crawlera-fetch.git` +`pip install git+ssh://git@github.com/scrapy-plugins/scrapy-zyte-proxy-fetch.git` or -`pip install git+https://github.com/scrapy-plugins/scrapy-crawlera-fetch.git` +`pip install git+https://github.com/scrapy-plugins/scrapy-zyte-proxy-fetch.git` ## Configuration -Enable the `CrawleraFetchMiddleware` via the +Enable the `SmartProxyManagerFetchMiddleware` via the [`DOWNLOADER_MIDDLEWARES`](https://docs.scrapy.org/en/latest/topics/settings.html#downloader-middlewares) setting: ``` DOWNLOADER_MIDDLEWARES = { - "crawlera_fetch.CrawleraFetchMiddleware": 585, + "zyte_proxy_fetch.SmartProxyManagerFetchMiddleware": 585, } ``` Please note that the middleware needs to be placed before the built-in `HttpCompressionMiddleware` middleware (which has a priority of 590), otherwise incoming responses will be compressed and the -Crawlera middleware won't be able to handle them. +Smart Proxy Manager middleware won't be able to handle them. ### Settings -* `CRAWLERA_FETCH_ENABLED` (type `bool`, default `False`). Whether or not the middleware will be enabled, - i.e. requests should be downloaded using the Crawlera Fetch API. The `crawlera_fetch_enabled` spider +* `ZYTE_PROXY_FETCH_ENABLED` (type `bool`, default `False`). Whether or not the middleware will be enabled, + i.e. requests should be downloaded using the Smart Proxy Manager Fetch API. The `zyte_proxy_fetch_enabled` spider attribute takes precedence over this setting. -* `CRAWLERA_FETCH_APIKEY` (type `str`). API key to be used to authenticate against the Crawlera endpoint +* `ZYTE_PROXY_FETCH_APIKEY` (type `str`). API key to be used to authenticate against the Smart Proxy Manager endpoint (mandatory if enabled) -* `CRAWLERA_FETCH_URL` (Type `str`, default `"http://fetch.crawlera.com:8010/fetch/v2/"`). 
- The endpoint of a specific Crawlera instance +* `ZYTE_PROXY_FETCH_URL` (Type `str`, default `"http://fetch.crawlera.com:8010/fetch/v2/"`). + The endpoint of a specific Smart Proxy Manager instance -* `CRAWLERA_FETCH_RAISE_ON_ERROR` (type `bool`, default `True`). Whether or not the middleware will +* `ZYTE_PROXY_FETCH_RAISE_ON_ERROR` (type `bool`, default `True`). Whether or not the middleware will raise an exception if an error occurs while downloading or decoding a request. If `False`, a warning will be logged and the raw upstream response will be returned upon encountering an error. -* `CRAWLERA_FETCH_DOWNLOAD_SLOT_POLICY` (type `enum.Enum` - `crawlera_fetch.DownloadSlotPolicy`, +* `ZYTE_PROXY_FETCH_DOWNLOAD_SLOT_POLICY` (type `enum.Enum` - `zyte_proxy_fetch.DownloadSlotPolicy`, default `DownloadSlotPolicy.Domain`). Possible values are `DownloadSlotPolicy.Domain`, `DownloadSlotPolicy.Single`, `DownloadSlotPolicy.Default` (Scrapy default). If set to `DownloadSlotPolicy.Domain`, please consider setting `SCHEDULER_PRIORITY_QUEUE="scrapy.pqueues.DownloaderAwarePriorityQueue"` to make better usage of concurrency options and avoid delays. -* `CRAWLERA_FETCH_DEFAULT_ARGS` (type `dict`, default `{}`) - Default values to be sent to the Crawlera Fetch API. For instance, set to `{"device": "mobile"}` +* `ZYTE_PROXY_FETCH_DEFAULT_ARGS` (type `dict`, default `{}`) + Default values to be sent to the Smart Proxy Manager Fetch API. For instance, set to `{"device": "mobile"}` to render all requests with a mobile profile. ### Spider attributes -* `crawlera_fetch_enabled` (type `bool`, default `False`). Whether or not the middleware will be enabled. - Takes precedence over the `CRAWLERA_FETCH_ENABLED` setting. +* `zyte_proxy_fetch_enabled` (type `bool`, default `False`). Whether or not the middleware will be enabled. + Takes precedence over the `ZYTE_PROXY_FETCH_ENABLED` setting. 
### Log formatter Since the URL for outgoing requests is modified by the middleware, by default the logs will show -the URL for the Crawlera endpoint. To revert this behaviour you can enable the provided +the URL for the Smart Proxy Manager endpoint. To revert this behaviour you can enable the provided log formatter by overriding the [`LOG_FORMATTER`](https://docs.scrapy.org/en/latest/topics/settings.html#log-formatter) setting: ``` -LOG_FORMATTER = "crawlera_fetch.CrawleraFetchLogFormatter" +LOG_FORMATTER = "zyte_proxy_fetch.SmartProxyManagerLogFormatter" ``` Note that the ability to override the error messages for spider and download errors was added @@ -92,7 +92,7 @@ to the `Request.flags` attribute, which is shown in the logs by default. ## Usage If the middleware is enabled, by default all requests will be redirected to the specified -Crawlera Fetch endpoint, and modified to comply with the format expected by the Crawlera Fetch API. +Smart Proxy Manager Fetch endpoint, and modified to comply with the format expected by the Smart Proxy Manager Fetch API. The three basic processed arguments are `method`, `url` and `body`. For instance, the following request: @@ -103,7 +103,7 @@ Request(url="https://httpbin.org/post", method="POST", body="foo=bar") will be converted to: ```python -Request(url="", method="POST", +Request(url="", method="POST", body='{"url": "https://httpbin.org/post", "method": "POST", "body": "foo=bar"}', headers={"Authorization": "Basic ", "Content-Type": "application/json", @@ -112,12 +112,12 @@ Request(url="", method="POST", ### Additional arguments -Additional arguments could be specified under the `crawlera_fetch.args` `Request.meta` key. For instance: +Additional arguments could be specified under the `zyte_proxy_fetch.args` `Request.meta` key. 
For instance: ```python Request( url="https://example.org", - meta={"crawlera_fetch": {"args": {"region": "us", "device": "mobile"}}}, + meta={"zyte_proxy_fetch": {"args": {"region": "us", "device": "mobile"}}}, ) ``` @@ -127,26 +127,26 @@ is translated into the following body: '{"url": "https://example.org", "method": "GET", "body": "", "region": "us", "device": "mobile"}' ``` -Arguments set for a specific request through the `crawlera_fetch.args` key override those -set with the `CRAWLERA_FETCH_DEFAULT_ARGS` setting. +Arguments set for a specific request through the `zyte_proxy_fetch.args` key override those +set with the `ZYTE_PROXY_FETCH_DEFAULT_ARGS` setting. -### Accessing original request and raw Crawlera response +### Accessing original request and raw Zyte Smart Proxy Manager response The `url`, `method`, `headers` and `body` attributes of the original request are available under -the `crawlera_fetch.original_request` `Response.meta` key. +the `zyte_proxy_fetch.original_request` `Response.meta` key. -The `status`, `headers` and `body` attributes of the upstream Crawlera response are available under -the `crawlera_fetch.upstream_response` `Response.meta` key. +The `status`, `headers` and `body` attributes of the upstream Smart Proxy Manager response are available under +the `zyte_proxy_fetch.upstream_response` `Response.meta` key. 
### Skipping requests -You can instruct the middleware to skip a specific request by setting the `crawlera_fetch.skip` +You can instruct the middleware to skip a specific request by setting the `zyte_proxy_fetch.skip` [Request.meta](https://docs.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.meta) key: ```python Request( url="https://example.org", - meta={"crawlera_fetch": {"skip": True}}, + meta={"zyte_proxy_fetch": {"skip": True}}, ) ``` diff --git a/crawlera_fetch/__init__.py b/crawlera_fetch/__init__.py deleted file mode 100644 index f04b93b..0000000 --- a/crawlera_fetch/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .logformatter import CrawleraFetchLogFormatter # noqa: F401 -from .middleware import CrawleraFetchMiddleware, DownloadSlotPolicy # noqa: F401 diff --git a/setup.py b/setup.py index 1bbd1b3..260052e 100644 --- a/setup.py +++ b/setup.py @@ -6,15 +6,15 @@ setuptools.setup( - name="scrapy-crawlera-fetch", + name="scrapy-zyte-proxy-fetch", version="0.0.1", license="BSD", - description="Scrapy downloader middleware to interact with Crawlera Simple Fetch API", + description="Scrapy downloader middleware to interact with Zyte Smart Proxy Manager Fetch API", long_description=long_description, - author="Scrapinghub", - author_email="info@scrapinghub.com", - url="https://github.com/scrapy-plugins/scrapy-crawlera-fetch", - packages=["crawlera_fetch"], + author="Zyte", + author_email="opensource@zyte.com", + url="https://github.com/scrapy-plugins/scrapy-zyte-proxy-fetch", + packages=["zyte_proxy_fetch"], classifiers=[ "Development Status :: 1 - Planning", "License :: OSI Approved :: BSD License", diff --git a/tests/data/__init__.py b/tests/data/__init__.py index fce26ee..bb365c5 100644 --- a/tests/data/__init__.py +++ b/tests/data/__init__.py @@ -1,6 +1,6 @@ SETTINGS = { - "CRAWLERA_FETCH_ENABLED": True, - "CRAWLERA_FETCH_URL": "https://example.org", - "CRAWLERA_FETCH_APIKEY": "secret-key", - "CRAWLERA_FETCH_APIPASS": "secret-pass", + 
"ZYTE_PROXY_FETCH_ENABLED": True, + "ZYTE_PROXY_FETCH_URL": "https://example.org", + "ZYTE_PROXY_FETCH_APIKEY": "secret-key", + "ZYTE_PROXY_FETCH_APIPASS": "secret-pass", } diff --git a/tests/data/requests.py b/tests/data/requests.py index b650070..fd91f99 100644 --- a/tests/data/requests.py +++ b/tests/data/requests.py @@ -15,7 +15,7 @@ def get_test_requests(): url="https://httpbin.org/anything", method="GET", meta={ - "crawlera_fetch": { + "zyte_proxy_fetch": { "args": { "render": "no", "region": "us", @@ -26,19 +26,19 @@ def get_test_requests(): }, ) expected1 = Request( - url=SETTINGS["CRAWLERA_FETCH_URL"], + url=SETTINGS["ZYTE_PROXY_FETCH_URL"], callback=foo_spider.foo_callback, method="POST", headers={ "Authorization": basic_auth_header( - SETTINGS["CRAWLERA_FETCH_APIKEY"], SETTINGS["CRAWLERA_FETCH_APIPASS"] + SETTINGS["ZYTE_PROXY_FETCH_APIKEY"], SETTINGS["ZYTE_PROXY_FETCH_APIPASS"] ), "Content-Type": "application/json", "Accept": "application/json", "X-Crawlera-JobId": "1/2/3", }, meta={ - "crawlera_fetch": { + "zyte_proxy_fetch": { "args": { "render": "no", "region": "us", @@ -72,22 +72,22 @@ def get_test_requests(): original2 = FormRequest( url="https://httpbin.org/post", callback=foo_spider.foo_callback, - meta={"crawlera_fetch": {"args": {"device": "desktop"}}}, + meta={"zyte_proxy_fetch": {"args": {"device": "desktop"}}}, formdata={"foo": "bar"}, ) expected2 = FormRequest( - url=SETTINGS["CRAWLERA_FETCH_URL"], + url=SETTINGS["ZYTE_PROXY_FETCH_URL"], method="POST", headers={ "Authorization": basic_auth_header( - SETTINGS["CRAWLERA_FETCH_APIKEY"], SETTINGS["CRAWLERA_FETCH_APIPASS"] + SETTINGS["ZYTE_PROXY_FETCH_APIKEY"], SETTINGS["ZYTE_PROXY_FETCH_APIPASS"] ), "Content-Type": "application/json", "Accept": "application/json", "X-Crawlera-JobId": "1/2/3", }, meta={ - "crawlera_fetch": { + "zyte_proxy_fetch": { "args": {"device": "desktop"}, "original_request": request_to_dict(original2, spider=foo_spider), "timing": {"start_ts": mocked_time()}, @@ -116,7 
+116,7 @@ def get_test_requests(): "original": Request( url="https://example.org", method="HEAD", - meta={"crawlera_fetch": {"skip": True}}, + meta={"zyte_proxy_fetch": {"skip": True}}, ), "expected": None, } diff --git a/tests/data/responses.py b/tests/data/responses.py index 3af9ba7..824fa7c 100644 --- a/tests/data/responses.py +++ b/tests/data/responses.py @@ -15,7 +15,7 @@ test_responses.append( { "original": HtmlResponse( - url=SETTINGS["CRAWLERA_FETCH_URL"], + url=SETTINGS["ZYTE_PROXY_FETCH_URL"], status=200, headers={ "Content-Type": "application/json", @@ -26,9 +26,9 @@ "Connection": "close", }, request=Request( - url=SETTINGS["CRAWLERA_FETCH_URL"], + url=SETTINGS["ZYTE_PROXY_FETCH_URL"], meta={ - "crawlera_fetch": { + "zyte_proxy_fetch": { "timing": {"start_ts": mocked_time()}, "original_request": request_to_dict( Request("https://fake.host.com"), @@ -51,7 +51,7 @@ test_responses.append( { "original": HtmlResponse( - url=SETTINGS["CRAWLERA_FETCH_URL"], + url=SETTINGS["ZYTE_PROXY_FETCH_URL"], status=200, headers={ "Content-Type": "application/json", @@ -62,9 +62,9 @@ "Connection": "close", }, request=Request( - url=SETTINGS["CRAWLERA_FETCH_URL"], + url=SETTINGS["ZYTE_PROXY_FETCH_URL"], meta={ - "crawlera_fetch": { + "zyte_proxy_fetch": { "timing": {"start_ts": mocked_time()}, "original_request": request_to_dict( Request("https://httpbin.org/get"), @@ -97,7 +97,7 @@ test_responses.append( { "original": HtmlResponse( - url=SETTINGS["CRAWLERA_FETCH_URL"], + url=SETTINGS["ZYTE_PROXY_FETCH_URL"], status=200, headers={ "Content-Type": "application/json", @@ -108,9 +108,9 @@ "Connection": "close", }, request=Request( - url=SETTINGS["CRAWLERA_FETCH_URL"], + url=SETTINGS["ZYTE_PROXY_FETCH_URL"], meta={ - "crawlera_fetch": { + "zyte_proxy_fetch": { "timing": {"start_ts": mocked_time()}, "original_request": request_to_dict( Request("https://example.org"), @@ -164,7 +164,7 @@ test_responses.append( { "original": HtmlResponse( - url=SETTINGS["CRAWLERA_FETCH_URL"], + 
url=SETTINGS["ZYTE_PROXY_FETCH_URL"], status=200, headers={ "Content-Type": "application/json", @@ -172,9 +172,9 @@ "Date": "Fri, 24 Apr 2020 18:22:10 GMT", }, request=Request( - url=SETTINGS["CRAWLERA_FETCH_URL"], + url=SETTINGS["ZYTE_PROXY_FETCH_URL"], meta={ - "crawlera_fetch": { + "zyte_proxy_fetch": { "timing": {"start_ts": mocked_time()}, "original_request": request_to_dict( Request("http://httpbin.org/ip"), diff --git a/tests/test_config.py b/tests/test_config.py index 35435f4..4f50953 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,7 +1,7 @@ from scrapy import Spider from scrapy.utils.test import get_crawler -from crawlera_fetch import CrawleraFetchMiddleware +from zyte_proxy_fetch import SmartProxyManagerFetchMiddleware from tests.data import SETTINGS @@ -11,8 +11,8 @@ class FooSpider(Spider): name = "foo" foo_spider = FooSpider() - foo_spider.crawler = get_crawler(FooSpider, settings_dict={"CRAWLERA_FETCH_ENABLED": False}) - middleware = CrawleraFetchMiddleware.from_crawler(foo_spider.crawler) + foo_spider.crawler = get_crawler(FooSpider, settings_dict={"ZYTE_PROXY_FETCH_ENABLED": False}) + middleware = SmartProxyManagerFetchMiddleware.from_crawler(foo_spider.crawler) middleware.spider_opened(foo_spider) assert not middleware.enabled @@ -20,11 +20,11 @@ class FooSpider(Spider): def test_disable_via_spider_attribute_bool(): class FooSpider(Spider): name = "foo" - crawlera_fetch_enabled = False + zyte_proxy_fetch_enabled = False foo_spider = FooSpider() foo_spider.crawler = get_crawler(spidercls=FooSpider) - middleware = CrawleraFetchMiddleware.from_crawler(foo_spider.crawler) + middleware = SmartProxyManagerFetchMiddleware.from_crawler(foo_spider.crawler) middleware.spider_opened(foo_spider) assert not middleware.enabled @@ -32,11 +32,11 @@ class FooSpider(Spider): def test_disable_via_spider_attribute_int(): class FooSpider(Spider): name = "foo" - crawlera_fetch_enabled = 0 + zyte_proxy_fetch_enabled = 0 foo_spider = FooSpider() 
foo_spider.crawler = get_crawler(spidercls=FooSpider) - middleware = CrawleraFetchMiddleware.from_crawler(foo_spider.crawler) + middleware = SmartProxyManagerFetchMiddleware.from_crawler(foo_spider.crawler) middleware.spider_opened(foo_spider) assert not middleware.enabled @@ -44,11 +44,11 @@ class FooSpider(Spider): def test_disable_via_spider_attribute_str(): class FooSpider(Spider): name = "foo" - crawlera_fetch_enabled = "False" + zyte_proxy_fetch_enabled = "False" foo_spider = FooSpider() foo_spider.crawler = get_crawler(spidercls=FooSpider) - middleware = CrawleraFetchMiddleware.from_crawler(foo_spider.crawler) + middleware = SmartProxyManagerFetchMiddleware.from_crawler(foo_spider.crawler) middleware.spider_opened(foo_spider) assert not middleware.enabled @@ -56,11 +56,11 @@ class FooSpider(Spider): def test_disable_override(): class FooSpider(Spider): name = "foo" - crawlera_fetch_enabled = False + zyte_proxy_fetch_enabled = False foo_spider = FooSpider() - foo_spider.crawler = get_crawler(FooSpider, settings_dict={"CRAWLERA_FETCH_ENABLED": True}) - middleware = CrawleraFetchMiddleware.from_crawler(foo_spider.crawler) + foo_spider.crawler = get_crawler(FooSpider, settings_dict={"ZYTE_PROXY_FETCH_ENABLED": True}) + middleware = SmartProxyManagerFetchMiddleware.from_crawler(foo_spider.crawler) middleware.spider_opened(foo_spider) assert not middleware.enabled @@ -70,8 +70,8 @@ class FooSpider(Spider): name = "foo" foo_spider = FooSpider() - foo_spider.crawler = get_crawler(settings_dict={"CRAWLERA_FETCH_ENABLED": True}) - middleware = CrawleraFetchMiddleware.from_crawler(foo_spider.crawler) + foo_spider.crawler = get_crawler(settings_dict={"ZYTE_PROXY_FETCH_ENABLED": True}) + middleware = SmartProxyManagerFetchMiddleware.from_crawler(foo_spider.crawler) middleware.spider_opened(foo_spider) assert not middleware.enabled @@ -80,24 +80,24 @@ def test_config_values(): FooSpider = type("FooSpider", (Spider,), {"name": "foo"}) foo_spider = FooSpider() 
foo_spider.crawler = get_crawler(spidercls=FooSpider, settings_dict=SETTINGS) - middleware = CrawleraFetchMiddleware.from_crawler(foo_spider.crawler) + middleware = SmartProxyManagerFetchMiddleware.from_crawler(foo_spider.crawler) middleware.spider_opened(foo_spider) - assert middleware.apikey == SETTINGS["CRAWLERA_FETCH_APIKEY"] - assert middleware.url == SETTINGS["CRAWLERA_FETCH_URL"] - assert middleware.apipass == SETTINGS["CRAWLERA_FETCH_APIPASS"] + assert middleware.apikey == SETTINGS["ZYTE_PROXY_FETCH_APIKEY"] + assert middleware.url == SETTINGS["ZYTE_PROXY_FETCH_URL"] + assert middleware.apipass == SETTINGS["ZYTE_PROXY_FETCH_APIPASS"] def test_config_without_apipass(): settings = SETTINGS.copy() - settings.pop("CRAWLERA_FETCH_APIPASS", None) + settings.pop("ZYTE_PROXY_FETCH_APIPASS", None) FooSpider = type("FooSpider", (Spider,), {"name": "foo"}) foo_spider = FooSpider() foo_spider.crawler = get_crawler(spidercls=FooSpider, settings_dict=settings) - middleware = CrawleraFetchMiddleware.from_crawler(foo_spider.crawler) + middleware = SmartProxyManagerFetchMiddleware.from_crawler(foo_spider.crawler) middleware.spider_opened(foo_spider) - assert middleware.apikey == SETTINGS["CRAWLERA_FETCH_APIKEY"] - assert middleware.url == SETTINGS["CRAWLERA_FETCH_URL"] + assert middleware.apikey == SETTINGS["ZYTE_PROXY_FETCH_APIKEY"] + assert middleware.url == SETTINGS["ZYTE_PROXY_FETCH_URL"] assert middleware.apipass == "" diff --git a/tests/test_logformatter.py b/tests/test_logformatter.py index 120430e..95ea6b5 100644 --- a/tests/test_logformatter.py +++ b/tests/test_logformatter.py @@ -5,7 +5,7 @@ from scrapy.http.response import Response from twisted.python.failure import Failure -from crawlera_fetch.logformatter import CrawleraFetchLogFormatter +from zyte_proxy_fetch.logformatter import SmartProxyManagerLogFormatter from tests.data.requests import get_test_requests from tests.utils import foo_spider, get_test_middleware @@ -14,7 +14,7 @@ 
@unittest.skipIf(scrapy_version > (2, 0, 0), "Scrapy < 2.0 only") def test_log_formatter_scrapy_1(): middleware = get_test_middleware() - logformatter = CrawleraFetchLogFormatter() + logformatter = SmartProxyManagerLogFormatter() formatter = Formatter() for case in get_test_requests(): @@ -22,8 +22,8 @@ def test_log_formatter_scrapy_1(): response = Response(original.url) processed = middleware.process_request(original, foo_spider) - crawlera_meta = original.meta.get("crawlera_fetch") or {} - if crawlera_meta.get("skip"): + zyte_proxy_meta = original.meta.get("zyte_proxy_fetch") or {} + if zyte_proxy_meta.get("skip"): assert processed is None continue @@ -41,7 +41,7 @@ def test_log_formatter_scrapy_1(): @unittest.skipIf(scrapy_version < (2, 0, 0), "Scrapy >= 2.0 only") def test_log_formatter_scrapy_2(): middleware = get_test_middleware() - logformatter = CrawleraFetchLogFormatter() + logformatter = SmartProxyManagerLogFormatter() formatter = Formatter() for case in get_test_requests(): @@ -49,8 +49,8 @@ def test_log_formatter_scrapy_2(): response = Response(original.url) processed = middleware.process_request(original, foo_spider) - crawlera_meta = original.meta.get("crawlera_fetch") or {} - if crawlera_meta.get("skip"): + zyte_proxy_meta = original.meta.get("zyte_proxy_fetch") or {} + if zyte_proxy_meta.get("skip"): assert processed is None continue diff --git a/tests/test_requests.py b/tests/test_requests.py index b36423a..877d3b7 100644 --- a/tests/test_requests.py +++ b/tests/test_requests.py @@ -5,7 +5,7 @@ from scrapy import Request -from crawlera_fetch import DownloadSlotPolicy +from zyte_proxy_fetch import DownloadSlotPolicy from tests.data.requests import get_test_requests from tests.utils import foo_spider, get_test_middleware, mocked_time @@ -24,7 +24,7 @@ def shub_jobkey_env_variable(): def test_process_request_disabled(): - middleware = get_test_middleware(settings={"CRAWLERA_FETCH_ENABLED": False}) + middleware = 
get_test_middleware(settings={"ZYTE_PROXY_FETCH_ENABLED": False}) for case in get_test_requests(): request = case["original"] with shub_jobkey_env_variable(): @@ -42,8 +42,8 @@ def test_process_request(): with shub_jobkey_env_variable(): processed = middleware.process_request(original, foo_spider) - crawlera_meta = original.meta.get("crawlera_fetch") - if crawlera_meta.get("skip"): + zyte_proxy_meta = original.meta.get("zyte_proxy_fetch") + if zyte_proxy_meta.get("skip"): assert processed is None else: assert type(processed) is type(expected) @@ -59,20 +59,20 @@ def test_process_request(): @patch("time.time", mocked_time) def test_process_request_single_download_slot(): middleware = get_test_middleware( - settings={"CRAWLERA_FETCH_DOWNLOAD_SLOT_POLICY": DownloadSlotPolicy.Single} + settings={"ZYTE_PROXY_FETCH_DOWNLOAD_SLOT_POLICY": DownloadSlotPolicy.Single} ) for case in get_test_requests(): original = case["original"] expected = case["expected"] if expected: - expected.meta["download_slot"] = "__crawlera_fetch__" + expected.meta["download_slot"] = "__zyte_proxy_fetch__" with shub_jobkey_env_variable(): processed = middleware.process_request(original, foo_spider) - crawlera_meta = original.meta.get("crawlera_fetch") - if crawlera_meta.get("skip"): + zyte_proxy_meta = original.meta.get("zyte_proxy_fetch") + if zyte_proxy_meta.get("skip"): assert processed is None else: assert type(processed) is type(expected) @@ -88,15 +88,15 @@ def test_process_request_single_download_slot(): @patch("time.time", mocked_time) def test_process_request_default_args(): middleware = get_test_middleware( - settings={"CRAWLERA_FETCH_DEFAULT_ARGS": {"foo": "bar", "answer": "42"}} + settings={"ZYTE_PROXY_FETCH_DEFAULT_ARGS": {"foo": "bar", "answer": "42"}} ) for case in get_test_requests(): original = case["original"] processed = middleware.process_request(original, foo_spider) - crawlera_meta = original.meta.get("crawlera_fetch") - if crawlera_meta.get("skip"): + zyte_proxy_meta = 
original.meta.get("zyte_proxy_fetch") + if zyte_proxy_meta.get("skip"): assert processed is None else: processed_text = processed.body.decode(processed.encoding) diff --git a/tests/test_responses.py b/tests/test_responses.py index 2ddf87b..880423b 100644 --- a/tests/test_responses.py +++ b/tests/test_responses.py @@ -6,14 +6,14 @@ from scrapy.utils.reqser import request_to_dict from testfixtures import LogCapture -from crawlera_fetch.middleware import CrawleraFetchException +from zyte_proxy_fetch.middleware import SmartProxyManagerFetchException from tests.data.responses import test_responses from tests.utils import foo_spider, get_test_middleware, mocked_time def test_process_response_disabled(): - middleware = get_test_middleware(settings={"CRAWLERA_FETCH_ENABLED": False}) + middleware = get_test_middleware(settings={"ZYTE_PROXY_FETCH_ENABLED": False}) for case in test_responses: response = case["original"] assert middleware.process_response(response.request, response, foo_spider) is response @@ -34,11 +34,11 @@ def test_process_response(): assert processed.headers == expected.headers assert processed.body == expected.body - crawlera_meta = processed.meta.get("crawlera_fetch") or {} - if crawlera_meta.get("upstream_response"): - assert crawlera_meta["upstream_response"]["body"] == json.loads(original.text) - assert crawlera_meta["upstream_response"]["headers"] == original.headers - assert crawlera_meta["upstream_response"]["status"] == original.status + zyte_proxy_meta = processed.meta.get("zyte_proxy_fetch") or {} + if zyte_proxy_meta.get("upstream_response"): + assert zyte_proxy_meta["upstream_response"]["body"] == json.loads(original.text) + assert zyte_proxy_meta["upstream_response"]["headers"] == original.headers + assert zyte_proxy_meta["upstream_response"]["status"] == original.status def test_process_response_skip(): @@ -52,7 +52,7 @@ def test_process_response_skip(): }, request=Request( url="https://example.org", - meta={"crawlera_fetch": {"skip": 
True}}, + meta={"zyte_proxy_fetch": {"skip": True}}, ), body=b"""""", ) @@ -66,11 +66,11 @@ def test_process_response_skip(): def test_process_response_error(): response_list = [ TextResponse( - url="https://crawlera.com/fake/api/endpoint", + url="https://zyte.com/fake/api/endpoint", request=Request( - url="https://crawlera.com/fake/api/endpoint", + url="https://zyte.com/fake/api/endpoint", meta={ - "crawlera_fetch": { + "zyte_proxy_fetch": { "timing": {"start_ts": mocked_time()}, "original_request": request_to_dict( Request("https://example.org"), @@ -89,11 +89,11 @@ def test_process_response_error(): }, ), TextResponse( - url="https://crawlera.com/fake/api/endpoint", + url="https://zyte.com/fake/api/endpoint", request=Request( - url="https://crawlera.com/fake/api/endpoint", + url="https://zyte.com/fake/api/endpoint", meta={ - "crawlera_fetch": { + "zyte_proxy_fetch": { "timing": {"start_ts": mocked_time()}, "original_request": request_to_dict( Request("https://example.org"), @@ -105,11 +105,11 @@ def test_process_response_error(): body=b'{"Bad": "JSON', ), TextResponse( - url="https://crawlera.com/fake/api/endpoint", + url="https://zyte.com/fake/api/endpoint", request=Request( - url="https://crawlera.com/fake/api/endpoint", + url="https://zyte.com/fake/api/endpoint", meta={ - "crawlera_fetch": { + "zyte_proxy_fetch": { "timing": {"start_ts": mocked_time()}, "original_request": request_to_dict( Request("https://example.org"), @@ -133,17 +133,17 @@ def test_process_response_error(): ), ] - middleware_raise = get_test_middleware(settings={"CRAWLERA_FETCH_RAISE_ON_ERROR": True}) + middleware_raise = get_test_middleware(settings={"ZYTE_PROXY_FETCH_RAISE_ON_ERROR": True}) for response in response_list: - with pytest.raises(CrawleraFetchException): + with pytest.raises(SmartProxyManagerFetchException): middleware_raise.process_response(response.request, response, foo_spider) - assert middleware_raise.stats.get_value("crawlera_fetch/response_error") == 3 - assert 
middleware_raise.stats.get_value("crawlera_fetch/response_error/bad_proxy_auth") == 1 - assert middleware_raise.stats.get_value("crawlera_fetch/response_error/JSONDecodeError") == 1 - assert middleware_raise.stats.get_value("crawlera_fetch/response_error/serverbusy") == 1 + assert middleware_raise.stats.get_value("zyte_proxy_fetch/response_error") == 3 + assert middleware_raise.stats.get_value("zyte_proxy_fetch/response_error/bad_proxy_auth") == 1 + assert middleware_raise.stats.get_value("zyte_proxy_fetch/response_error/JSONDecodeError") == 1 + assert middleware_raise.stats.get_value("zyte_proxy_fetch/response_error/serverbusy") == 1 - middleware_log = get_test_middleware(settings={"CRAWLERA_FETCH_RAISE_ON_ERROR": False}) + middleware_log = get_test_middleware(settings={"ZYTE_PROXY_FETCH_RAISE_ON_ERROR": False}) with LogCapture() as logs: for response in response_list: processed = middleware_log.process_response(response.request, response, foo_spider) @@ -151,23 +151,23 @@ def test_process_response_error(): logs.check_present( ( - "crawlera-fetch-middleware", + "zyte-proxy-fetch-middleware", "WARNING", "Error downloading (status: 200, X-Crawlera-Error header: bad_proxy_auth)", # noqa: E501 ), ( - "crawlera-fetch-middleware", + "zyte-proxy-fetch-middleware", "WARNING", "Error decoding (status: 200, message: Unterminated string starting at, lineno: 1, colno: 9)", # noqa: E501 ), ( - "crawlera-fetch-middleware", + "zyte-proxy-fetch-middleware", "WARNING", "Error downloading (Original status: 503, Fetch API error message: Server busy: too many outstanding requests, Request ID: unknown)", # noqa: E501 ), ) - assert middleware_log.stats.get_value("crawlera_fetch/response_error") == 3 - assert middleware_log.stats.get_value("crawlera_fetch/response_error/bad_proxy_auth") == 1 - assert middleware_log.stats.get_value("crawlera_fetch/response_error/JSONDecodeError") == 1 - assert middleware_log.stats.get_value("crawlera_fetch/response_error/serverbusy") == 1 + assert 
middleware_log.stats.get_value("zyte_proxy_fetch/response_error") == 3 + assert middleware_log.stats.get_value("zyte_proxy_fetch/response_error/bad_proxy_auth") == 1 + assert middleware_log.stats.get_value("zyte_proxy_fetch/response_error/JSONDecodeError") == 1 + assert middleware_log.stats.get_value("zyte_proxy_fetch/response_error/serverbusy") == 1 diff --git a/tests/test_stats.py b/tests/test_stats.py index a3e580b..07e2377 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -43,14 +43,14 @@ def test_stats(mocked_time): middleware.spider_closed(spider, "finished") - assert middleware.stats.get_value("crawlera_fetch/request_count") == count - assert middleware.stats.get_value("crawlera_fetch/response_count") == count - assert middleware.stats.get_value("crawlera_fetch/total_latency") == total_latency - assert middleware.stats.get_value("crawlera_fetch/avg_latency") == avg_latency - assert middleware.stats.get_value("crawlera_fetch/max_latency") == max_latency + assert middleware.stats.get_value("zyte_proxy_fetch/request_count") == count + assert middleware.stats.get_value("zyte_proxy_fetch/response_count") == count + assert middleware.stats.get_value("zyte_proxy_fetch/total_latency") == total_latency + assert middleware.stats.get_value("zyte_proxy_fetch/avg_latency") == avg_latency + assert middleware.stats.get_value("zyte_proxy_fetch/max_latency") == max_latency for status in set(status_list): - sc = middleware.stats.get_value("crawlera_fetch/response_status_count/{}".format(status)) + sc = middleware.stats.get_value("zyte_proxy_fetch/response_status_count/{}".format(status)) assert sc == status_list.count(status) for method in set(method_list): - mc = middleware.stats.get_value("crawlera_fetch/request_method_count/{}".format(method)) + mc = middleware.stats.get_value("zyte_proxy_fetch/request_method_count/{}".format(method)) assert mc == method_list.count(method) diff --git a/tests/utils.py b/tests/utils.py index 6b010c9..633f8fc 100644 --- 
a/tests/utils.py +++ b/tests/utils.py @@ -4,7 +4,7 @@ from scrapy import Spider from scrapy.utils.test import get_crawler -from crawlera_fetch.middleware import CrawleraFetchMiddleware +from zyte_proxy_fetch.middleware import SmartProxyManagerFetchMiddleware from tests.data import SETTINGS @@ -39,7 +39,7 @@ def get_test_middleware(settings=None): foo_spider.crawler = get_crawler(FooSpider, settings_dict=settings_dict) foo_spider.crawler.engine = MockEngine() - middleware = CrawleraFetchMiddleware.from_crawler(foo_spider.crawler) + middleware = SmartProxyManagerFetchMiddleware.from_crawler(foo_spider.crawler) middleware.spider_opened(foo_spider) return middleware diff --git a/tox.ini b/tox.ini index 8ffe602..a23df14 100644 --- a/tox.ini +++ b/tox.ini @@ -6,22 +6,22 @@ envlist = flake8,black,typing,py35-pinned,py36,py37,py38 deps = -rtests/requirements.txt commands = - py.test --verbose --cov=crawlera_fetch --cov-report=term-missing --cov-report=html --cov-report=xml {posargs: crawlera_fetch tests} + py.test --verbose --cov=zyte_proxy_fetch --cov-report=term-missing --cov-report=html --cov-report=xml {posargs: zyte_proxy_fetch tests} [testenv:flake8] deps = flake8>=3.7.9 basepython = python3.8 -commands = flake8 --exclude=.git,venv* crawlera_fetch tests +commands = flake8 --exclude=.git,venv* zyte_proxy_fetch tests [testenv:black] deps = black>=19.10b0 basepython = python3.8 -commands = black --check crawlera_fetch tests +commands = black --check zyte_proxy_fetch tests [testenv:typing] deps = mypy==0.770 basepython = python3.8 -commands = mypy --ignore-missing-imports --follow-imports=skip crawlera_fetch tests +commands = mypy --ignore-missing-imports --follow-imports=skip zyte_proxy_fetch tests [testenv:py35-pinned] deps = diff --git a/zyte_proxy_fetch/__init__.py b/zyte_proxy_fetch/__init__.py new file mode 100644 index 0000000..a24aca1 --- /dev/null +++ b/zyte_proxy_fetch/__init__.py @@ -0,0 +1,2 @@ +from .logformatter import SmartProxyManagerLogFormatter # noqa: 
F401 +from .middleware import SmartProxyManagerFetchMiddleware, DownloadSlotPolicy # noqa: F401 diff --git a/crawlera_fetch/logformatter.py b/zyte_proxy_fetch/logformatter.py similarity index 81% rename from crawlera_fetch/logformatter.py rename to zyte_proxy_fetch/logformatter.py index 0279996..53cdeb2 100644 --- a/crawlera_fetch/logformatter.py +++ b/zyte_proxy_fetch/logformatter.py @@ -9,9 +9,9 @@ from twisted.python.failure import Failure -class CrawleraFetchLogFormatter(LogFormatter): +class SmartProxyManagerLogFormatter(LogFormatter): """ - Since the CrawleraFetchMiddleware sets a new URL for outgoing requests, by + Since the SmartProxyManagerFetchMiddleware sets a new URL for outgoing requests, by default the URLs shown in the logs are not the original ones. By enabling this formatter, this behaviour is reverted. @@ -30,7 +30,7 @@ def _set_target_url(self, result: dict, request: Request) -> dict: def crawled(self, request: Request, response: Response, spider: Spider) -> dict: return self._set_target_url( - result=super(CrawleraFetchLogFormatter, self).crawled(request, response, spider), + result=super(SmartProxyManagerLogFormatter, self).crawled(request, response, spider), request=request, ) @@ -41,7 +41,7 @@ def spider_error( Only available in Scrapy 2.0+ """ return self._set_target_url( - result=super(CrawleraFetchLogFormatter, self).spider_error( + result=super(SmartProxyManagerLogFormatter, self).spider_error( failure, request, response, spider ), request=request, @@ -54,7 +54,7 @@ def download_error( Only available in Scrapy 2.0+ """ return self._set_target_url( - result=super(CrawleraFetchLogFormatter, self).download_error( + result=super(SmartProxyManagerLogFormatter, self).download_error( failure, request, spider, errmsg ), request=request, diff --git a/crawlera_fetch/middleware.py b/zyte_proxy_fetch/middleware.py similarity index 66% rename from crawlera_fetch/middleware.py rename to zyte_proxy_fetch/middleware.py index d468cad..2f3739b 100644 --- 
a/crawlera_fetch/middleware.py +++ b/zyte_proxy_fetch/middleware.py @@ -19,13 +19,13 @@ from w3lib.http import basic_auth_header -logger = logging.getLogger("crawlera-fetch-middleware") +logger = logging.getLogger("zyte-proxy-fetch-middleware") -MiddlewareTypeVar = TypeVar("MiddlewareTypeVar", bound="CrawleraFetchMiddleware") +MiddlewareTypeVar = TypeVar("MiddlewareTypeVar", bound="SmartProxyManagerFetchMiddleware") -META_KEY = "crawlera_fetch" +META_KEY = "zyte_proxy_fetch" class DownloadSlotPolicy(Enum): @@ -34,11 +34,11 @@ class DownloadSlotPolicy(Enum): Default = "default" -class CrawleraFetchException(Exception): +class SmartProxyManagerFetchException(Exception): pass -class CrawleraFetchMiddleware: +class SmartProxyManagerFetchMiddleware: url = "http://fetch.crawlera.com:8010/fetch/v2/" apikey = "" enabled = False @@ -58,71 +58,77 @@ def from_crawler(cls: Type[MiddlewareTypeVar], crawler: Crawler) -> MiddlewareTy return middleware def _read_settings(self, settings): - if not settings.get("CRAWLERA_FETCH_APIKEY"): + if not settings.get("ZYTE_PROXY_FETCH_APIKEY"): self.enabled = False - logger.info("Crawlera Fetch API cannot be used without an apikey") + logger.info("Zyte Smart Proxy Manager Fetch API cannot be used without an apikey") return - self.apikey = settings["CRAWLERA_FETCH_APIKEY"] - self.apipass = settings.get("CRAWLERA_FETCH_APIPASS", "") + self.apikey = settings["ZYTE_PROXY_FETCH_APIKEY"] + self.apipass = settings.get("ZYTE_PROXY_FETCH_APIPASS", "") self.auth_header = basic_auth_header(self.apikey, self.apipass) - if settings.get("CRAWLERA_FETCH_URL"): - self.url = settings["CRAWLERA_FETCH_URL"] + if settings.get("ZYTE_PROXY_FETCH_URL"): + self.url = settings["ZYTE_PROXY_FETCH_URL"] self.download_slot_policy = settings.get( - "CRAWLERA_FETCH_DOWNLOAD_SLOT_POLICY", DownloadSlotPolicy.Domain + "ZYTE_PROXY_FETCH_DOWNLOAD_SLOT_POLICY", DownloadSlotPolicy.Domain ) - self.raise_on_error = settings.getbool("CRAWLERA_FETCH_RAISE_ON_ERROR", True) + 
self.raise_on_error = settings.getbool("ZYTE_PROXY_FETCH_RAISE_ON_ERROR", True) - self.default_args = settings.getdict("CRAWLERA_FETCH_DEFAULT_ARGS", {}) + self.default_args = settings.getdict("ZYTE_PROXY_FETCH_DEFAULT_ARGS", {}) def spider_opened(self, spider): try: - spider_attr = getattr(spider, "crawlera_fetch_enabled") + spider_attr = getattr(spider, "zyte_proxy_fetch_enabled") except AttributeError: - if not spider.crawler.settings.getbool("CRAWLERA_FETCH_ENABLED"): + if not spider.crawler.settings.getbool("ZYTE_PROXY_FETCH_ENABLED"): self.enabled = False - logger.info("Crawlera Fetch disabled (CRAWLERA_FETCH_ENABLED setting)") + logger.info( + "Zyte Smart Proxy Manager Fetch disabled (ZYTE_PROXY_FETCH_ENABLED setting)" + ) return else: if not BaseSettings({"enabled": spider_attr}).getbool("enabled"): self.enabled = False - logger.info("Crawlera Fetch disabled (crawlera_fetch_enabled spider attribute)") + logger.info( + "Zyte Smart Proxy Manager Fetch disabled " + "(zyte_proxy_fetch_enabled spider attribute)" + ) return self.enabled = True self._read_settings(spider.crawler.settings) if self.enabled: logger.info( - "Using Crawlera Fetch API at %s with apikey %s***" % (self.url, self.apikey[:5]) + "Using Zyte Smart Proxy Manager Fetch API at %s with apikey %s***" + % (self.url, self.apikey[:5]) ) def spider_closed(self, spider, reason): if self.enabled: - self.stats.set_value("crawlera_fetch/total_latency", self.total_latency) - response_count = self.stats.get_value("crawlera_fetch/response_count") + self.stats.set_value("zyte_proxy_fetch/total_latency", self.total_latency) + response_count = self.stats.get_value("zyte_proxy_fetch/response_count") if response_count: avg_latency = self.total_latency / response_count - self.stats.set_value("crawlera_fetch/avg_latency", avg_latency) + self.stats.set_value("zyte_proxy_fetch/avg_latency", avg_latency) def process_request(self, request: Request, spider: Spider) -> Optional[Request]: if not self.enabled: return None 
try: - crawlera_meta = request.meta[META_KEY] + zyte_proxy_meta = request.meta[META_KEY] except KeyError: - crawlera_meta = {} + zyte_proxy_meta = {} - if crawlera_meta.get("skip") or crawlera_meta.get("original_request"): + if zyte_proxy_meta.get("skip") or zyte_proxy_meta.get("original_request"): return None self._set_download_slot(request, spider) - self.stats.inc_value("crawlera_fetch/request_count") - self.stats.inc_value("crawlera_fetch/request_method_count/{}".format(request.method)) + self.stats.inc_value("zyte_proxy_fetch/request_count") + self.stats.inc_value("zyte_proxy_fetch/request_method_count/{}".format(request.method)) shub_jobkey = os.environ.get("SHUB_JOBKEY") if shub_jobkey: @@ -134,14 +140,14 @@ def process_request(self, request: Request, spider: Spider) -> Optional[Request] if request.method != "GET": body["method"] = request.method body.update(self.default_args) - body.update(crawlera_meta.get("args") or {}) + body.update(zyte_proxy_meta.get("args") or {}) body_json = json.dumps(body) additional_meta = { "original_request": request_to_dict(request, spider=spider), "timing": {"start_ts": time.time()}, } - crawlera_meta.update(additional_meta) + zyte_proxy_meta.update(additional_meta) additional_headers = { "Content-Type": "application/json", @@ -159,7 +165,7 @@ def process_request(self, request: Request, spider: Spider) -> Optional[Request] if original_url_flag not in request.flags: request.flags.append(original_url_flag) - request.meta[META_KEY] = crawlera_meta + request.meta[META_KEY] = zyte_proxy_meta return request.replace(url=self.url, method="POST", body=body_json) def process_response(self, request: Request, response: Response, spider: Spider) -> Response: @@ -167,24 +173,24 @@ def process_response(self, request: Request, response: Response, spider: Spider) return response try: - crawlera_meta = request.meta[META_KEY] + zyte_proxy_meta = request.meta[META_KEY] except KeyError: - crawlera_meta = {} + zyte_proxy_meta = {} - if 
crawlera_meta.get("skip") or not crawlera_meta.get("original_request"): + if zyte_proxy_meta.get("skip") or not zyte_proxy_meta.get("original_request"): return response - original_request = request_from_dict(crawlera_meta["original_request"], spider=spider) + original_request = request_from_dict(zyte_proxy_meta["original_request"], spider=spider) - self.stats.inc_value("crawlera_fetch/response_count") + self.stats.inc_value("zyte_proxy_fetch/response_count") self._calculate_latency(request) - self.stats.inc_value("crawlera_fetch/api_status_count/{}".format(response.status)) + self.stats.inc_value("zyte_proxy_fetch/api_status_count/{}".format(response.status)) if response.headers.get("X-Crawlera-Error"): message = response.headers["X-Crawlera-Error"].decode("utf8") - self.stats.inc_value("crawlera_fetch/response_error") - self.stats.inc_value("crawlera_fetch/response_error/{}".format(message)) + self.stats.inc_value("zyte_proxy_fetch/response_error") + self.stats.inc_value("zyte_proxy_fetch/response_error/{}".format(message)) log_msg = "Error downloading <{} {}> (status: {}, X-Crawlera-Error header: {})" log_msg = log_msg.format( original_request.method, @@ -193,7 +199,7 @@ def process_response(self, request: Request, response: Response, spider: Spider) message, ) if self.raise_on_error: - raise CrawleraFetchException(log_msg) + raise SmartProxyManagerFetchException(log_msg) else: logger.warning(log_msg) return response @@ -201,8 +207,8 @@ def process_response(self, request: Request, response: Response, spider: Spider) try: json_response = json.loads(response.text) except json.JSONDecodeError as exc: - self.stats.inc_value("crawlera_fetch/response_error") - self.stats.inc_value("crawlera_fetch/response_error/JSONDecodeError") + self.stats.inc_value("zyte_proxy_fetch/response_error") + self.stats.inc_value("zyte_proxy_fetch/response_error/JSONDecodeError") log_msg = "Error decoding <{} {}> (status: {}, message: {}, lineno: {}, colno: {})" log_msg = log_msg.format( 
original_request.method, @@ -213,7 +219,7 @@ def process_response(self, request: Request, response: Response, spider: Spider) exc.colno, ) if self.raise_on_error: - raise CrawleraFetchException(log_msg) from exc + raise SmartProxyManagerFetchException(log_msg) from exc else: logger.warning(log_msg) return response @@ -223,8 +229,8 @@ def process_response(self, request: Request, response: Response, spider: Spider) request_id = json_response.get("id") or json_response.get("uncork_id") if server_error: message = json_response.get("body") or json_response.get("message") - self.stats.inc_value("crawlera_fetch/response_error") - self.stats.inc_value("crawlera_fetch/response_error/{}".format(server_error)) + self.stats.inc_value("zyte_proxy_fetch/response_error") + self.stats.inc_value("zyte_proxy_fetch/response_error/{}".format(server_error)) log_msg = ( "Error downloading <{} {}> (Original status: {}, " "Fetch API error message: {}, Request ID: {})" @@ -237,14 +243,14 @@ def process_response(self, request: Request, response: Response, spider: Spider) request_id or "unknown", ) if self.raise_on_error: - raise CrawleraFetchException(log_msg) + raise SmartProxyManagerFetchException(log_msg) else: logger.warning(log_msg) return response - self.stats.inc_value("crawlera_fetch/response_status_count/{}".format(original_status)) + self.stats.inc_value("zyte_proxy_fetch/response_status_count/{}".format(original_status)) - crawlera_meta["upstream_response"] = { + zyte_proxy_meta["upstream_response"] = { "status": response.status, "headers": response.headers, "body": json_response, @@ -273,7 +279,7 @@ def _set_download_slot(self, request, spider): slot = self.crawler.engine.downloader._get_slot_key(request, spider) request.meta["download_slot"] = slot elif self.download_slot_policy == DownloadSlotPolicy.Single: - request.meta["download_slot"] = "__crawlera_fetch__" + request.meta["download_slot"] = "__zyte_proxy_fetch__" # Otherwise use Scrapy default policy def 
_calculate_latency(self, request): @@ -281,5 +287,7 @@ def _calculate_latency(self, request): timing["end_ts"] = time.time() timing["latency"] = timing["end_ts"] - timing["start_ts"] self.total_latency += timing["latency"] - max_latency = max(self.stats.get_value("crawlera_fetch/max_latency", 0), timing["latency"]) - self.stats.set_value("crawlera_fetch/max_latency", max_latency) + max_latency = max( + self.stats.get_value("zyte_proxy_fetch/max_latency", 0), timing["latency"] + ) + self.stats.set_value("zyte_proxy_fetch/max_latency", max_latency)