From a475f2cf6cb2f9973fd72620d69082c6fd0a03a4 Mon Sep 17 00:00:00 2001 From: Veniamin Gvozdikov Date: Tue, 23 Mar 2021 11:03:00 +0300 Subject: [PATCH 1/5] Rebranding Crawlera -> Zyte --- LICENSE | 2 +- README.md | 50 +++++++++--------- crawlera_fetch/__init__.py | 4 +- crawlera_fetch/logformatter.py | 10 ++-- crawlera_fetch/middleware.py | 94 +++++++++++++++++----------------- setup.py | 10 ++-- tests/data/__init__.py | 8 +-- tests/data/requests.py | 18 +++---- tests/data/responses.py | 24 ++++----- tests/test_config.py | 26 +++++----- tests/test_logformatter.py | 14 ++--- tests/test_requests.py | 18 +++---- tests/test_responses.py | 42 +++++++-------- tests/test_stats.py | 14 ++--- tests/utils.py | 4 +- 15 files changed, 169 insertions(+), 169 deletions(-) diff --git a/LICENSE b/LICENSE index 8cf9e15..2e17b1a 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright 2020 Scrapinghub +Copyright (c) 2021 Zyte Group Ltd Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/README.md b/README.md index e744965..5fcc974 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,11 @@ -# Scrapy Middleware for Crawlera Simple Fetch API -[![actions](https://github.com/scrapy-plugins/scrapy-crawlera-fetch/workflows/Build/badge.svg)](https://github.com/scrapy-plugins/scrapy-crawlera-fetch/actions) -[![codecov](https://codecov.io/gh/scrapy-plugins/scrapy-crawlera-fetch/branch/master/graph/badge.svg)](https://codecov.io/gh/scrapy-plugins/scrapy-crawlera-fetch) +# Scrapy Middleware for Zyte Smart Proxy Manager Simple Fetch API +[![actions](https://github.com/scrapy-plugins/scrapy-zyte-proxy-fetch/workflows/Build/badge.svg)](https://github.com/scrapy-plugins/scrapy-zyte-proxy-fetch/actions) +[![codecov](https://codecov.io/gh/scrapy-plugins/scrapy-zyte-proxy-fetch/branch/master/graph/badge.svg)](https://codecov.io/gh/scrapy-plugins/scrapy-zyte-proxy-fetch) This package provides a Scrapy [Downloader Middleware](https://docs.scrapy.org/en/latest/topics/downloader-middleware.html) to transparently interact with the -[Crawlera Fetch API](https://doc.scrapinghub.com/crawlera-fetch-api.html). +[Zyte Smart Proxy Manager Fetch API](https://docs.zyte.com/smart-proxy-manager/fetch-api.html). ## Requirements @@ -18,64 +18,64 @@ to transparently interact with the Not yet available on PyPI. However, it can be installed directly from GitHub: -`pip install git+ssh://git@github.com/scrapy-plugins/scrapy-crawlera-fetch.git` +`pip install git+ssh://git@github.com/scrapy-plugins/scrapy-zyte-proxy-fetch.git` or -`pip install git+https://github.com/scrapy-plugins/scrapy-crawlera-fetch.git` +`pip install git+https://github.com/scrapy-plugins/scrapy-zyte-proxy-fetch.git` ## Configuration -Enable the `CrawleraFetchMiddleware` via the +Enable the `SmartProxyManagerFetchMiddleware` via the [`DOWNLOADER_MIDDLEWARES`](https://docs.scrapy.org/en/latest/topics/settings.html#downloader-middlewares) setting: ``` DOWNLOADER_MIDDLEWARES = { - "crawlera_fetch.CrawleraFetchMiddleware": 585, + "crawlera_fetch.SmartProxyManagerFetchMiddleware": 585, } ``` Please note that the middleware needs to be placed before the built-in `HttpCompressionMiddleware` middleware (which has a priority of 590), otherwise incoming responses will be compressed and the -Crawlera middleware won't be able to handle them. +Smart Proxy Manager middleware won't be able to handle them. ### Settings -* `CRAWLERA_FETCH_ENABLED` (type `bool`, default `False`). 
Whether or not the middleware will be enabled,
-  i.e. requests should be downloaded using the Crawlera Fetch API
+* `ZYTE_PROXY_FETCH_ENABLED` (type `bool`, default `False`). Whether or not the middleware will be enabled,
+  i.e. requests should be downloaded using the Smart Proxy Manager Fetch API
 
-* `CRAWLERA_FETCH_APIKEY` (type `str`). API key to be used to authenticate against the Crawlera endpoint
+* `ZYTE_PROXY_FETCH_APIKEY` (type `str`). API key to be used to authenticate against the Smart Proxy Manager endpoint
   (mandatory if enabled)
 
-* `CRAWLERA_FETCH_URL` (type `str`, default `"http://fetch.crawlera.com:8010/fetch/v2/"`).
-  The endpoint of a specific Crawlera instance
+* `ZYTE_PROXY_FETCH_URL` (type `str`, default `"http://fetch.crawlera.com:8010/fetch/v2/"`).
+  The endpoint of a specific Smart Proxy Manager instance
 
-* `CRAWLERA_FETCH_RAISE_ON_ERROR` (type `bool`, default `True`). Whether or not the middleware will
+* `ZYTE_PROXY_FETCH_RAISE_ON_ERROR` (type `bool`, default `True`). Whether or not the middleware will
   raise an exception if an error occurs while downloading or decoding a request. If `False`, a warning
   will be logged and the raw upstream response will be returned upon encountering an error.
 
-* `CRAWLERA_FETCH_DOWNLOAD_SLOT_POLICY` (type `enum.Enum` - `crawlera_fetch.DownloadSlotPolicy`,
+* `ZYTE_PROXY_FETCH_DOWNLOAD_SLOT_POLICY` (type `enum.Enum` - `crawlera_fetch.DownloadSlotPolicy`,
   default `DownloadSlotPolicy.Domain`). Possible values are `DownloadSlotPolicy.Domain`,
   `DownloadSlotPolicy.Single`, `DownloadSlotPolicy.Default` (Scrapy default). If set to
   `DownloadSlotPolicy.Domain`, please
   consider setting `SCHEDULER_PRIORITY_QUEUE="scrapy.pqueues.DownloaderAwarePriorityQueue"`
   to make better usage of concurrency options and avoid delays.
 
-* `CRAWLERA_FETCH_DEFAULT_ARGS` (type `dict`, default `{}`)
-  Default values to be sent to the Crawlera Fetch API. For instance, set to `{"device": "mobile"}`
+* `ZYTE_PROXY_FETCH_DEFAULT_ARGS` (type `dict`, default `{}`)
+  Default values to be sent to the Smart Proxy Manager Fetch API. For instance, set to `{"device": "mobile"}`
   to render all requests with a mobile profile.
 
 ### Log formatter
 
 Since the URL for outgoing requests is modified by the middleware, by default the logs will show
-the URL for the Crawlera endpoint. To revert this behaviour you can enable the provided
+the URL for the Smart Proxy Manager endpoint. To revert this behaviour you can enable the provided
 log formatter by overriding the
 [`LOG_FORMATTER`](https://docs.scrapy.org/en/latest/topics/settings.html#log-formatter)
 setting:
 
 ```
-LOG_FORMATTER = "crawlera_fetch.CrawleraFetchLogFormatter"
+LOG_FORMATTER = "crawlera_fetch.SmartProxyManagerLogFormatter"
 ```
 
 Note that the ability to override the error messages for spider and download errors was added
@@ -86,7 +86,7 @@ to the `Request.flags` attribute, which is shown in the logs by default.
 ## Usage
 
 If the middleware is enabled, by default all requests will be redirected to the specified
-Crawlera Fetch endpoint, and modified to comply with the format expected by the Crawlera Fetch API.
+Smart Proxy Manager Fetch endpoint, and modified to comply with the format expected by the Smart Proxy Manager Fetch API.
 The three basic processed arguments are `method`, `url` and `body`. 
For instance, the following request:
 
 ```python
@@ -97,7 +97,7 @@ Request(url="https://httpbin.org/post", method="POST", body="foo=bar")
 will be converted to:
 
 ```python
-Request(url="<CRAWLERA_FETCH_URL>", method="POST",
+Request(url="<ZYTE_PROXY_FETCH_URL>", method="POST",
        body='{"url": "https://httpbin.org/post", "method": "POST", "body": "foo=bar"}',
        headers={"Authorization": "Basic <derived from the apikey>", "Content-Type": "application/json",
@@ -122,14 +122,14 @@ is translated into the following body:
 ```
 
 Arguments set for a specific request through the `crawlera_fetch.args` key override those
-set with the `CRAWLERA_FETCH_DEFAULT_ARGS` setting.
+set with the `ZYTE_PROXY_FETCH_DEFAULT_ARGS` setting.
 
-### Accessing original request and raw Crawlera response
+### Accessing original request and raw Zyte Smart Proxy Manager response
 
 The `url`, `method`, `headers` and `body` attributes of the original request are available under
 the `crawlera_fetch.original_request` `Response.meta` key.
 
-The `status`, `headers` and `body` attributes of the upstream Crawlera response are available under
+The `status`, `headers` and `body` attributes of the upstream Smart Proxy Manager response are available under
 the `crawlera_fetch.upstream_response` `Response.meta` key.
 
 ### Skipping requests
diff --git a/crawlera_fetch/__init__.py b/crawlera_fetch/__init__.py
index f04b93b..a24aca1 100644
--- a/crawlera_fetch/__init__.py
+++ b/crawlera_fetch/__init__.py
@@ -1,2 +1,2 @@
-from .logformatter import CrawleraFetchLogFormatter  # noqa: F401
-from .middleware import CrawleraFetchMiddleware, DownloadSlotPolicy  # noqa: F401
+from .logformatter import SmartProxyManagerLogFormatter  # noqa: F401
+from .middleware import SmartProxyManagerFetchMiddleware, DownloadSlotPolicy  # noqa: F401
diff --git a/crawlera_fetch/logformatter.py b/crawlera_fetch/logformatter.py
index 0279996..53cdeb2 100644
--- a/crawlera_fetch/logformatter.py
+++ b/crawlera_fetch/logformatter.py
@@ -9,9 +9,9 @@ from twisted.python.failure import Failure
 
 
-class CrawleraFetchLogFormatter(LogFormatter):
+class SmartProxyManagerLogFormatter(LogFormatter):
     """
-    Since the CrawleraFetchMiddleware sets a new URL for outgoing requests, by
+    Since the SmartProxyManagerFetchMiddleware sets a new URL for outgoing requests, by
     default the URLs shown in the logs are not the original ones. By enabling this
     formatter, this behaviour is reverted. 
@@ -30,7 +30,7 @@ def _set_target_url(self, result: dict, request: Request) -> dict: def crawled(self, request: Request, response: Response, spider: Spider) -> dict: return self._set_target_url( - result=super(CrawleraFetchLogFormatter, self).crawled(request, response, spider), + result=super(SmartProxyManagerLogFormatter, self).crawled(request, response, spider), request=request, ) @@ -41,7 +41,7 @@ def spider_error( Only available in Scrapy 2.0+ """ return self._set_target_url( - result=super(CrawleraFetchLogFormatter, self).spider_error( + result=super(SmartProxyManagerLogFormatter, self).spider_error( failure, request, response, spider ), request=request, @@ -54,7 +54,7 @@ def download_error( Only available in Scrapy 2.0+ """ return self._set_target_url( - result=super(CrawleraFetchLogFormatter, self).download_error( + result=super(SmartProxyManagerLogFormatter, self).download_error( failure, request, spider, errmsg ), request=request, diff --git a/crawlera_fetch/middleware.py b/crawlera_fetch/middleware.py index 181ae4f..f23eb63 100644 --- a/crawlera_fetch/middleware.py +++ b/crawlera_fetch/middleware.py @@ -21,10 +21,10 @@ logger = logging.getLogger("crawlera-fetch-middleware") -MiddlewareTypeVar = TypeVar("MiddlewareTypeVar", bound="CrawleraFetchMiddleware") +MiddlewareTypeVar = TypeVar("MiddlewareTypeVar", bound="SmartProxyManagerFetchMiddleware") -META_KEY = "crawlera_fetch" +META_KEY = "zyte_proxy_fetch" class DownloadSlotPolicy(Enum): @@ -33,35 +33,35 @@ class DownloadSlotPolicy(Enum): Default = "default" -class CrawleraFetchException(Exception): +class SmartProxyManagerFetchException(Exception): pass -class CrawleraFetchMiddleware: +class SmartProxyManagerFetchMiddleware: url = "http://fetch.crawlera.com:8010/fetch/v2/" apikey = "" def __init__(self, crawler: Crawler) -> None: - if not crawler.settings.getbool("CRAWLERA_FETCH_ENABLED"): + if not crawler.settings.getbool("ZYTE_PROXY_FETCH_ENABLED"): raise NotConfigured() - elif crawler.settings.get("CRAWLERA_FETCH_APIKEY") is None: - raise NotConfigured("Crawlera Fetch API cannot be used without an apikey") - elif crawler.settings.get("CRAWLERA_FETCH_APIKEY"): - self.apikey = crawler.settings["CRAWLERA_FETCH_APIKEY"] - self.apipass = crawler.settings.get("CRAWLERA_FETCH_APIPASS", "") + elif crawler.settings.get("ZYTE_PROXY_FETCH_APIKEY") is None: + raise NotConfigured("Zyte Smart Proxy Manager Fetch API cannot be used without an apikey") + elif crawler.settings.get("ZYTE_PROXY_FETCH_APIKEY"): + self.apikey = crawler.settings["ZYTE_PROXY_FETCH_APIKEY"] + self.apipass = crawler.settings.get("ZYTE_PROXY_FETCH_APIPASS", "") self.auth_header = basic_auth_header(self.apikey, self.apipass) - if crawler.settings.get("CRAWLERA_FETCH_URL"): - self.url = crawler.settings["CRAWLERA_FETCH_URL"] + if crawler.settings.get("ZYTE_PROXY_FETCH_URL"): + self.url = crawler.settings["ZYTE_PROXY_FETCH_URL"] self.download_slot_policy = crawler.settings.get( - "CRAWLERA_FETCH_DOWNLOAD_SLOT_POLICY", DownloadSlotPolicy.Domain + "ZYTE_PROXY_FETCH_DOWNLOAD_SLOT_POLICY", DownloadSlotPolicy.Domain ) - self.raise_on_error = crawler.settings.getbool("CRAWLERA_FETCH_RAISE_ON_ERROR", True) + self.raise_on_error = crawler.settings.getbool("ZYTE_PROXY_FETCH_RAISE_ON_ERROR", True) - self.default_args = crawler.settings.getdict("CRAWLERA_FETCH_DEFAULT_ARGS", {}) + self.default_args = crawler.settings.getdict("ZYTE_PROXY_FETCH_DEFAULT_ARGS", {}) crawler.signals.connect(self.spider_closed, signal=scrapy.signals.spider_closed) @@ -70,7 +70,7 @@ def __init__(self, 
crawler: Crawler) -> None: self.total_latency = 0 logger.info( - "Using Crawlera Fetch API at %s with apikey %s***" % (self.url, self.apikey[:5]) + "Using Zyte Smart Proxy Manager Fetch API at %s with apikey %s***" % (self.url, self.apikey[:5]) ) @classmethod @@ -78,25 +78,25 @@ def from_crawler(cls: Type[MiddlewareTypeVar], crawler: Crawler) -> MiddlewareTy return cls(crawler) def spider_closed(self, spider, reason): - self.stats.set_value("crawlera_fetch/total_latency", self.total_latency) - response_count = self.stats.get_value("crawlera_fetch/response_count") + self.stats.set_value("zyte_proxy_fetch/total_latency", self.total_latency) + response_count = self.stats.get_value("zyte_proxy_fetch/response_count") if response_count: avg_latency = self.total_latency / response_count - self.stats.set_value("crawlera_fetch/avg_latency", avg_latency) + self.stats.set_value("zyte_proxy_fetch/avg_latency", avg_latency) def process_request(self, request: Request, spider: Spider) -> Optional[Request]: try: - crawlera_meta = request.meta[META_KEY] + zyte_proxy_meta = request.meta[META_KEY] except KeyError: - crawlera_meta = {} + zyte_proxy_meta = {} - if crawlera_meta.get("skip") or crawlera_meta.get("original_request"): + if zyte_proxy_meta.get("skip") or zyte_proxy_meta.get("original_request"): return None self._set_download_slot(request, spider) - self.stats.inc_value("crawlera_fetch/request_count") - self.stats.inc_value("crawlera_fetch/request_method_count/{}".format(request.method)) + self.stats.inc_value("zyte_proxy_fetch/request_count") + self.stats.inc_value("zyte_proxy_fetch/request_method_count/{}".format(request.method)) shub_jobkey = os.environ.get("SHUB_JOBKEY") if shub_jobkey: @@ -108,14 +108,14 @@ def process_request(self, request: Request, spider: Spider) -> Optional[Request] if request.method != "GET": body["method"] = request.method body.update(self.default_args) - body.update(crawlera_meta.get("args") or {}) + body.update(zyte_proxy_meta.get("args") or {}) body_json = json.dumps(body) additional_meta = { "original_request": request_to_dict(request, spider=spider), "timing": {"start_ts": time.time()}, } - crawlera_meta.update(additional_meta) + zyte_proxy_meta.update(additional_meta) additional_headers = { "Content-Type": "application/json", @@ -133,29 +133,29 @@ def process_request(self, request: Request, spider: Spider) -> Optional[Request] if original_url_flag not in request.flags: request.flags.append(original_url_flag) - request.meta[META_KEY] = crawlera_meta + request.meta[META_KEY] = zyte_proxy_meta return request.replace(url=self.url, method="POST", body=body_json) def process_response(self, request: Request, response: Response, spider: Spider) -> Response: try: - crawlera_meta = request.meta[META_KEY] + zyte_proxy_meta = request.meta[META_KEY] except KeyError: - crawlera_meta = {} + zyte_proxy_meta = {} - if crawlera_meta.get("skip") or not crawlera_meta.get("original_request"): + if zyte_proxy_meta.get("skip") or not zyte_proxy_meta.get("original_request"): return response - original_request = request_from_dict(crawlera_meta["original_request"], spider=spider) + original_request = request_from_dict(zyte_proxy_meta["original_request"], spider=spider) - self.stats.inc_value("crawlera_fetch/response_count") + self.stats.inc_value("zyte_proxy_fetch/response_count") self._calculate_latency(request) - self.stats.inc_value("crawlera_fetch/api_status_count/{}".format(response.status)) + self.stats.inc_value("zyte_proxy_fetch/api_status_count/{}".format(response.status)) if 
response.headers.get("X-Crawlera-Error"): message = response.headers["X-Crawlera-Error"].decode("utf8") - self.stats.inc_value("crawlera_fetch/response_error") - self.stats.inc_value("crawlera_fetch/response_error/{}".format(message)) + self.stats.inc_value("zyte_proxy_fetch/response_error") + self.stats.inc_value("zyte_proxy_fetch/response_error/{}".format(message)) log_msg = "Error downloading <{} {}> (status: {}, X-Crawlera-Error header: {})" log_msg = log_msg.format( original_request.method, @@ -164,7 +164,7 @@ def process_response(self, request: Request, response: Response, spider: Spider) message, ) if self.raise_on_error: - raise CrawleraFetchException(log_msg) + raise SmartProxyManagerFetchException(log_msg) else: logger.warning(log_msg) return response @@ -172,8 +172,8 @@ def process_response(self, request: Request, response: Response, spider: Spider) try: json_response = json.loads(response.text) except json.JSONDecodeError as exc: - self.stats.inc_value("crawlera_fetch/response_error") - self.stats.inc_value("crawlera_fetch/response_error/JSONDecodeError") + self.stats.inc_value("zyte_proxy_fetch/response_error") + self.stats.inc_value("zyte_proxy_fetch/response_error/JSONDecodeError") log_msg = "Error decoding <{} {}> (status: {}, message: {}, lineno: {}, colno: {})" log_msg = log_msg.format( original_request.method, @@ -184,7 +184,7 @@ def process_response(self, request: Request, response: Response, spider: Spider) exc.colno, ) if self.raise_on_error: - raise CrawleraFetchException(log_msg) from exc + raise SmartProxyManagerFetchException(log_msg) from exc else: logger.warning(log_msg) return response @@ -194,8 +194,8 @@ def process_response(self, request: Request, response: Response, spider: Spider) request_id = json_response.get("id") or json_response.get("uncork_id") if server_error: message = json_response.get("body") or json_response.get("message") - self.stats.inc_value("crawlera_fetch/response_error") - self.stats.inc_value("crawlera_fetch/response_error/{}".format(server_error)) + self.stats.inc_value("zyte_proxy_fetch/response_error") + self.stats.inc_value("zyte_proxy_fetch/response_error/{}".format(server_error)) log_msg = ( "Error downloading <{} {}> (Original status: {}, " "Fetch API error message: {}, Request ID: {})" @@ -208,14 +208,14 @@ def process_response(self, request: Request, response: Response, spider: Spider) request_id or "unknown", ) if self.raise_on_error: - raise CrawleraFetchException(log_msg) + raise SmartProxyManagerFetchException(log_msg) else: logger.warning(log_msg) return response - self.stats.inc_value("crawlera_fetch/response_status_count/{}".format(original_status)) + self.stats.inc_value("zyte_proxy_fetch/response_status_count/{}".format(original_status)) - crawlera_meta["upstream_response"] = { + zyte_proxy_meta["upstream_response"] = { "status": response.status, "headers": response.headers, "body": json_response, @@ -244,7 +244,7 @@ def _set_download_slot(self, request, spider): slot = self.crawler.engine.downloader._get_slot_key(request, spider) request.meta["download_slot"] = slot elif self.download_slot_policy == DownloadSlotPolicy.Single: - request.meta["download_slot"] = "__crawlera_fetch__" + request.meta["download_slot"] = "__zyte_proxy_fetch__" # Otherwise use Scrapy default policy def _calculate_latency(self, request): @@ -252,5 +252,5 @@ def _calculate_latency(self, request): timing["end_ts"] = time.time() timing["latency"] = timing["end_ts"] - timing["start_ts"] self.total_latency += timing["latency"] - max_latency = 
max(self.stats.get_value("crawlera_fetch/max_latency", 0), timing["latency"]) - self.stats.set_value("crawlera_fetch/max_latency", max_latency) + max_latency = max(self.stats.get_value("zyte_proxy_fetch/max_latency", 0), timing["latency"]) + self.stats.set_value("zyte_proxy_fetch/max_latency", max_latency) diff --git a/setup.py b/setup.py index 1bbd1b3..de8a293 100644 --- a/setup.py +++ b/setup.py @@ -6,14 +6,14 @@ setuptools.setup( - name="scrapy-crawlera-fetch", + name="scrapy-zyte-proxy-fetch", version="0.0.1", license="BSD", - description="Scrapy downloader middleware to interact with Crawlera Simple Fetch API", + description="Scrapy downloader middleware to interact with Zyte Smart Proxy Manager Fetch API", long_description=long_description, - author="Scrapinghub", - author_email="info@scrapinghub.com", - url="https://github.com/scrapy-plugins/scrapy-crawlera-fetch", + author="Zyte", + author_email="opensource@zyte.com", + url="https://github.com/scrapy-plugins/scrapy-zyte-proxy-fetch", packages=["crawlera_fetch"], classifiers=[ "Development Status :: 1 - Planning", diff --git a/tests/data/__init__.py b/tests/data/__init__.py index fce26ee..bb365c5 100644 --- a/tests/data/__init__.py +++ b/tests/data/__init__.py @@ -1,6 +1,6 @@ SETTINGS = { - "CRAWLERA_FETCH_ENABLED": True, - "CRAWLERA_FETCH_URL": "https://example.org", - "CRAWLERA_FETCH_APIKEY": "secret-key", - "CRAWLERA_FETCH_APIPASS": "secret-pass", + "ZYTE_PROXY_FETCH_ENABLED": True, + "ZYTE_PROXY_FETCH_URL": "https://example.org", + "ZYTE_PROXY_FETCH_APIKEY": "secret-key", + "ZYTE_PROXY_FETCH_APIPASS": "secret-pass", } diff --git a/tests/data/requests.py b/tests/data/requests.py index b650070..fd91f99 100644 --- a/tests/data/requests.py +++ b/tests/data/requests.py @@ -15,7 +15,7 @@ def get_test_requests(): url="https://httpbin.org/anything", method="GET", meta={ - "crawlera_fetch": { + "zyte_proxy_fetch": { "args": { "render": "no", "region": "us", @@ -26,19 +26,19 @@ def get_test_requests(): }, ) expected1 = Request( - url=SETTINGS["CRAWLERA_FETCH_URL"], + url=SETTINGS["ZYTE_PROXY_FETCH_URL"], callback=foo_spider.foo_callback, method="POST", headers={ "Authorization": basic_auth_header( - SETTINGS["CRAWLERA_FETCH_APIKEY"], SETTINGS["CRAWLERA_FETCH_APIPASS"] + SETTINGS["ZYTE_PROXY_FETCH_APIKEY"], SETTINGS["ZYTE_PROXY_FETCH_APIPASS"] ), "Content-Type": "application/json", "Accept": "application/json", "X-Crawlera-JobId": "1/2/3", }, meta={ - "crawlera_fetch": { + "zyte_proxy_fetch": { "args": { "render": "no", "region": "us", @@ -72,22 +72,22 @@ def get_test_requests(): original2 = FormRequest( url="https://httpbin.org/post", callback=foo_spider.foo_callback, - meta={"crawlera_fetch": {"args": {"device": "desktop"}}}, + meta={"zyte_proxy_fetch": {"args": {"device": "desktop"}}}, formdata={"foo": "bar"}, ) expected2 = FormRequest( - url=SETTINGS["CRAWLERA_FETCH_URL"], + url=SETTINGS["ZYTE_PROXY_FETCH_URL"], method="POST", headers={ "Authorization": basic_auth_header( - SETTINGS["CRAWLERA_FETCH_APIKEY"], SETTINGS["CRAWLERA_FETCH_APIPASS"] + SETTINGS["ZYTE_PROXY_FETCH_APIKEY"], SETTINGS["ZYTE_PROXY_FETCH_APIPASS"] ), "Content-Type": "application/json", "Accept": "application/json", "X-Crawlera-JobId": "1/2/3", }, meta={ - "crawlera_fetch": { + "zyte_proxy_fetch": { "args": {"device": "desktop"}, "original_request": request_to_dict(original2, spider=foo_spider), "timing": {"start_ts": mocked_time()}, @@ -116,7 +116,7 @@ def get_test_requests(): "original": Request( url="https://example.org", method="HEAD", - meta={"crawlera_fetch": 
{"skip": True}}, + meta={"zyte_proxy_fetch": {"skip": True}}, ), "expected": None, } diff --git a/tests/data/responses.py b/tests/data/responses.py index 3af9ba7..824fa7c 100644 --- a/tests/data/responses.py +++ b/tests/data/responses.py @@ -15,7 +15,7 @@ test_responses.append( { "original": HtmlResponse( - url=SETTINGS["CRAWLERA_FETCH_URL"], + url=SETTINGS["ZYTE_PROXY_FETCH_URL"], status=200, headers={ "Content-Type": "application/json", @@ -26,9 +26,9 @@ "Connection": "close", }, request=Request( - url=SETTINGS["CRAWLERA_FETCH_URL"], + url=SETTINGS["ZYTE_PROXY_FETCH_URL"], meta={ - "crawlera_fetch": { + "zyte_proxy_fetch": { "timing": {"start_ts": mocked_time()}, "original_request": request_to_dict( Request("https://fake.host.com"), @@ -51,7 +51,7 @@ test_responses.append( { "original": HtmlResponse( - url=SETTINGS["CRAWLERA_FETCH_URL"], + url=SETTINGS["ZYTE_PROXY_FETCH_URL"], status=200, headers={ "Content-Type": "application/json", @@ -62,9 +62,9 @@ "Connection": "close", }, request=Request( - url=SETTINGS["CRAWLERA_FETCH_URL"], + url=SETTINGS["ZYTE_PROXY_FETCH_URL"], meta={ - "crawlera_fetch": { + "zyte_proxy_fetch": { "timing": {"start_ts": mocked_time()}, "original_request": request_to_dict( Request("https://httpbin.org/get"), @@ -97,7 +97,7 @@ test_responses.append( { "original": HtmlResponse( - url=SETTINGS["CRAWLERA_FETCH_URL"], + url=SETTINGS["ZYTE_PROXY_FETCH_URL"], status=200, headers={ "Content-Type": "application/json", @@ -108,9 +108,9 @@ "Connection": "close", }, request=Request( - url=SETTINGS["CRAWLERA_FETCH_URL"], + url=SETTINGS["ZYTE_PROXY_FETCH_URL"], meta={ - "crawlera_fetch": { + "zyte_proxy_fetch": { "timing": {"start_ts": mocked_time()}, "original_request": request_to_dict( Request("https://example.org"), @@ -164,7 +164,7 @@ test_responses.append( { "original": HtmlResponse( - url=SETTINGS["CRAWLERA_FETCH_URL"], + url=SETTINGS["ZYTE_PROXY_FETCH_URL"], status=200, headers={ "Content-Type": "application/json", @@ -172,9 +172,9 @@ "Date": "Fri, 24 Apr 2020 18:22:10 GMT", }, request=Request( - url=SETTINGS["CRAWLERA_FETCH_URL"], + url=SETTINGS["ZYTE_PROXY_FETCH_URL"], meta={ - "crawlera_fetch": { + "zyte_proxy_fetch": { "timing": {"start_ts": mocked_time()}, "original_request": request_to_dict( Request("http://httpbin.org/ip"), diff --git a/tests/test_config.py b/tests/test_config.py index ec4ccc1..a31a900 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -2,38 +2,38 @@ from scrapy.exceptions import NotConfigured from scrapy.utils.test import get_crawler -from crawlera_fetch import CrawleraFetchMiddleware +from crawlera_fetch import SmartProxyManagerFetchMiddleware from tests.data import SETTINGS def test_not_enabled(): with pytest.raises(NotConfigured): - crawler = get_crawler(settings_dict={"CRAWLERA_FETCH_ENABLED": False}) - CrawleraFetchMiddleware.from_crawler(crawler) + crawler = get_crawler(settings_dict={"ZYTE_PROXY_FETCH_ENABLED": False}) + SmartProxyManagerFetchMiddleware.from_crawler(crawler) def test_no_apikey(): with pytest.raises(NotConfigured): - crawler = get_crawler(settings_dict={"CRAWLERA_FETCH_ENABLED": True}) - CrawleraFetchMiddleware.from_crawler(crawler) + crawler = get_crawler(settings_dict={"ZYTE_PROXY_FETCH_ENABLED": True}) + SmartProxyManagerFetchMiddleware.from_crawler(crawler) def test_config_values(): crawler = get_crawler(settings_dict=SETTINGS) - middleware = CrawleraFetchMiddleware.from_crawler(crawler) + middleware = SmartProxyManagerFetchMiddleware.from_crawler(crawler) - assert middleware.apikey == 
SETTINGS["CRAWLERA_FETCH_APIKEY"] - assert middleware.url == SETTINGS["CRAWLERA_FETCH_URL"] - assert middleware.apipass == SETTINGS["CRAWLERA_FETCH_APIPASS"] + assert middleware.apikey == SETTINGS["ZYTE_PROXY_FETCH_APIKEY"] + assert middleware.url == SETTINGS["ZYTE_PROXY_FETCH_URL"] + assert middleware.apipass == SETTINGS["ZYTE_PROXY_FETCH_APIPASS"] def test_config_without_apipass(): s = SETTINGS.copy() - s.pop("CRAWLERA_FETCH_APIPASS", None) + s.pop("ZYTE_PROXY_FETCH_APIPASS", None) crawler = get_crawler(settings_dict=s) - middleware = CrawleraFetchMiddleware.from_crawler(crawler) + middleware = SmartProxyManagerFetchMiddleware.from_crawler(crawler) - assert middleware.apikey == SETTINGS["CRAWLERA_FETCH_APIKEY"] - assert middleware.url == SETTINGS["CRAWLERA_FETCH_URL"] + assert middleware.apikey == SETTINGS["ZYTE_PROXY_FETCH_APIKEY"] + assert middleware.url == SETTINGS["ZYTE_PROXY_FETCH_URL"] assert middleware.apipass == "" diff --git a/tests/test_logformatter.py b/tests/test_logformatter.py index 120430e..2d3d895 100644 --- a/tests/test_logformatter.py +++ b/tests/test_logformatter.py @@ -5,7 +5,7 @@ from scrapy.http.response import Response from twisted.python.failure import Failure -from crawlera_fetch.logformatter import CrawleraFetchLogFormatter +from crawlera_fetch.logformatter import SmartProxyManagerLogFormatter from tests.data.requests import get_test_requests from tests.utils import foo_spider, get_test_middleware @@ -14,7 +14,7 @@ @unittest.skipIf(scrapy_version > (2, 0, 0), "Scrapy < 2.0 only") def test_log_formatter_scrapy_1(): middleware = get_test_middleware() - logformatter = CrawleraFetchLogFormatter() + logformatter = SmartProxyManagerLogFormatter() formatter = Formatter() for case in get_test_requests(): @@ -22,8 +22,8 @@ def test_log_formatter_scrapy_1(): response = Response(original.url) processed = middleware.process_request(original, foo_spider) - crawlera_meta = original.meta.get("crawlera_fetch") or {} - if crawlera_meta.get("skip"): + zyte_proxy_meta = original.meta.get("zyte_proxy_fetch") or {} + if zyte_proxy_meta.get("skip"): assert processed is None continue @@ -41,7 +41,7 @@ def test_log_formatter_scrapy_1(): @unittest.skipIf(scrapy_version < (2, 0, 0), "Scrapy >= 2.0 only") def test_log_formatter_scrapy_2(): middleware = get_test_middleware() - logformatter = CrawleraFetchLogFormatter() + logformatter = SmartProxyManagerLogFormatter() formatter = Formatter() for case in get_test_requests(): @@ -49,8 +49,8 @@ def test_log_formatter_scrapy_2(): response = Response(original.url) processed = middleware.process_request(original, foo_spider) - crawlera_meta = original.meta.get("crawlera_fetch") or {} - if crawlera_meta.get("skip"): + zyte_proxy_meta = original.meta.get("zyte_proxy_fetch") or {} + if zyte_proxy_meta.get("skip"): assert processed is None continue diff --git a/tests/test_requests.py b/tests/test_requests.py index 65e2c1d..f1d54ac 100644 --- a/tests/test_requests.py +++ b/tests/test_requests.py @@ -34,8 +34,8 @@ def test_process_request(): with shub_jobkey_env_variable(): processed = middleware.process_request(original, foo_spider) - crawlera_meta = original.meta.get("crawlera_fetch") - if crawlera_meta.get("skip"): + zyte_proxy_meta = original.meta.get("zyte_proxy_fetch") + if zyte_proxy_meta.get("skip"): assert processed is None else: assert type(processed) is type(expected) @@ -51,20 +51,20 @@ def test_process_request(): @patch("time.time", mocked_time) def test_process_request_single_download_slot(): middleware = get_test_middleware( - 
settings={"CRAWLERA_FETCH_DOWNLOAD_SLOT_POLICY": DownloadSlotPolicy.Single} + settings={"ZYTE_PROXY_FETCH_DOWNLOAD_SLOT_POLICY": DownloadSlotPolicy.Single} ) for case in get_test_requests(): original = case["original"] expected = case["expected"] if expected: - expected.meta["download_slot"] = "__crawlera_fetch__" + expected.meta["download_slot"] = "__zyte_proxy_fetch__" with shub_jobkey_env_variable(): processed = middleware.process_request(original, foo_spider) - crawlera_meta = original.meta.get("crawlera_fetch") - if crawlera_meta.get("skip"): + zyte_proxy_meta = original.meta.get("zyte_proxy_fetch") + if zyte_proxy_meta.get("skip"): assert processed is None else: assert type(processed) is type(expected) @@ -80,15 +80,15 @@ def test_process_request_single_download_slot(): @patch("time.time", mocked_time) def test_process_request_default_args(): middleware = get_test_middleware( - settings={"CRAWLERA_FETCH_DEFAULT_ARGS": {"foo": "bar", "answer": "42"}} + settings={"ZYTE_PROXY_FETCH_DEFAULT_ARGS": {"foo": "bar", "answer": "42"}} ) for case in get_test_requests(): original = case["original"] processed = middleware.process_request(original, foo_spider) - crawlera_meta = original.meta.get("crawlera_fetch") - if crawlera_meta.get("skip"): + zyte_proxy_meta = original.meta.get("zyte_proxy_fetch") + if zyte_proxy_meta.get("skip"): assert processed is None else: processed_text = processed.body.decode(processed.encoding) diff --git a/tests/test_responses.py b/tests/test_responses.py index 4153bc9..2529ab0 100644 --- a/tests/test_responses.py +++ b/tests/test_responses.py @@ -6,7 +6,7 @@ from scrapy.utils.reqser import request_to_dict from testfixtures import LogCapture -from crawlera_fetch.middleware import CrawleraFetchException +from crawlera_fetch.middleware import SmartProxyManagerFetchException from tests.data.responses import test_responses from tests.utils import foo_spider, get_test_middleware, mocked_time @@ -27,11 +27,11 @@ def test_process_response(): assert processed.headers == expected.headers assert processed.body == expected.body - crawlera_meta = processed.meta.get("crawlera_fetch") or {} - if crawlera_meta.get("upstream_response"): - assert crawlera_meta["upstream_response"]["body"] == json.loads(original.text) - assert crawlera_meta["upstream_response"]["headers"] == original.headers - assert crawlera_meta["upstream_response"]["status"] == original.status + zyte_proxy_meta = processed.meta.get("zyte_proxy_fetch") or {} + if zyte_proxy_meta.get("upstream_response"): + assert zyte_proxy_meta["upstream_response"]["body"] == json.loads(original.text) + assert zyte_proxy_meta["upstream_response"]["headers"] == original.headers + assert zyte_proxy_meta["upstream_response"]["status"] == original.status def test_process_response_skip(): @@ -45,7 +45,7 @@ def test_process_response_skip(): }, request=Request( url="https://example.org", - meta={"crawlera_fetch": {"skip": True}}, + meta={"zyte_proxy_fetch": {"skip": True}}, ), body=b"""""", ) @@ -63,7 +63,7 @@ def test_process_response_error(): request=Request( url="https://crawlera.com/fake/api/endpoint", meta={ - "crawlera_fetch": { + "zyte_proxy_fetch": { "timing": {"start_ts": mocked_time()}, "original_request": request_to_dict( Request("https://example.org"), @@ -86,7 +86,7 @@ def test_process_response_error(): request=Request( url="https://crawlera.com/fake/api/endpoint", meta={ - "crawlera_fetch": { + "zyte_proxy_fetch": { "timing": {"start_ts": mocked_time()}, "original_request": request_to_dict( Request("https://example.org"), @@ 
-102,7 +102,7 @@ def test_process_response_error(): request=Request( url="https://crawlera.com/fake/api/endpoint", meta={ - "crawlera_fetch": { + "zyte_proxy_fetch": { "timing": {"start_ts": mocked_time()}, "original_request": request_to_dict( Request("https://example.org"), @@ -126,17 +126,17 @@ def test_process_response_error(): ), ] - middleware_raise = get_test_middleware(settings={"CRAWLERA_FETCH_RAISE_ON_ERROR": True}) + middleware_raise = get_test_middleware(settings={"ZYTE_PROXY_FETCH_RAISE_ON_ERROR": True}) for response in response_list: - with pytest.raises(CrawleraFetchException): + with pytest.raises(SmartProxyManagerFetchException): middleware_raise.process_response(response.request, response, foo_spider) - assert middleware_raise.stats.get_value("crawlera_fetch/response_error") == 3 - assert middleware_raise.stats.get_value("crawlera_fetch/response_error/bad_proxy_auth") == 1 - assert middleware_raise.stats.get_value("crawlera_fetch/response_error/JSONDecodeError") == 1 - assert middleware_raise.stats.get_value("crawlera_fetch/response_error/serverbusy") == 1 + assert middleware_raise.stats.get_value("zyte_proxy_fetch/response_error") == 3 + assert middleware_raise.stats.get_value("zyte_proxy_fetch/response_error/bad_proxy_auth") == 1 + assert middleware_raise.stats.get_value("zyte_proxy_fetch/response_error/JSONDecodeError") == 1 + assert middleware_raise.stats.get_value("zyte_proxy_fetch/response_error/serverbusy") == 1 - middleware_log = get_test_middleware(settings={"CRAWLERA_FETCH_RAISE_ON_ERROR": False}) + middleware_log = get_test_middleware(settings={"ZYTE_PROXY_FETCH_RAISE_ON_ERROR": False}) with LogCapture() as logs: for response in response_list: processed = middleware_log.process_response(response.request, response, foo_spider) @@ -160,7 +160,7 @@ def test_process_response_error(): ), ) - assert middleware_log.stats.get_value("crawlera_fetch/response_error") == 3 - assert middleware_log.stats.get_value("crawlera_fetch/response_error/bad_proxy_auth") == 1 - assert middleware_log.stats.get_value("crawlera_fetch/response_error/JSONDecodeError") == 1 - assert middleware_log.stats.get_value("crawlera_fetch/response_error/serverbusy") == 1 + assert middleware_log.stats.get_value("zyte_proxy_fetch/response_error") == 3 + assert middleware_log.stats.get_value("zyte_proxy_fetch/response_error/bad_proxy_auth") == 1 + assert middleware_log.stats.get_value("zyte_proxy_fetch/response_error/JSONDecodeError") == 1 + assert middleware_log.stats.get_value("zyte_proxy_fetch/response_error/serverbusy") == 1 diff --git a/tests/test_stats.py b/tests/test_stats.py index a3e580b..07e2377 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -43,14 +43,14 @@ def test_stats(mocked_time): middleware.spider_closed(spider, "finished") - assert middleware.stats.get_value("crawlera_fetch/request_count") == count - assert middleware.stats.get_value("crawlera_fetch/response_count") == count - assert middleware.stats.get_value("crawlera_fetch/total_latency") == total_latency - assert middleware.stats.get_value("crawlera_fetch/avg_latency") == avg_latency - assert middleware.stats.get_value("crawlera_fetch/max_latency") == max_latency + assert middleware.stats.get_value("zyte_proxy_fetch/request_count") == count + assert middleware.stats.get_value("zyte_proxy_fetch/response_count") == count + assert middleware.stats.get_value("zyte_proxy_fetch/total_latency") == total_latency + assert middleware.stats.get_value("zyte_proxy_fetch/avg_latency") == avg_latency + assert 
middleware.stats.get_value("zyte_proxy_fetch/max_latency") == max_latency
 
     for status in set(status_list):
-        sc = middleware.stats.get_value("crawlera_fetch/response_status_count/{}".format(status))
+        sc = middleware.stats.get_value("zyte_proxy_fetch/response_status_count/{}".format(status))
         assert sc == status_list.count(status)
 
     for method in set(method_list):
-        mc = middleware.stats.get_value("crawlera_fetch/request_method_count/{}".format(method))
+        mc = middleware.stats.get_value("zyte_proxy_fetch/request_method_count/{}".format(method))
         assert mc == method_list.count(method)
diff --git a/tests/utils.py b/tests/utils.py
index 71a794c..ff2f74b 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -4,7 +4,7 @@
 from scrapy import Spider
 from scrapy.utils.test import get_crawler
 
-from crawlera_fetch.middleware import CrawleraFetchMiddleware
+from crawlera_fetch.middleware import SmartProxyManagerFetchMiddleware
 
 from tests.data import SETTINGS
 
@@ -36,7 +36,7 @@ def get_test_middleware(settings=None):
     settings_dict.update(settings or {})
     crawler = get_crawler(FooSpider, settings_dict=settings_dict)
     crawler.engine = MockEngine()
-    middleware = CrawleraFetchMiddleware.from_crawler(crawler)
+    middleware = SmartProxyManagerFetchMiddleware.from_crawler(crawler)
 
     return middleware
 
From 6070abb858c94f7baee3b2e378cd80d927727c51 Mon Sep 17 00:00:00 2001
From: Veniamin Gvozdikov
Date: Tue, 23 Mar 2021 11:21:41 +0300
Subject: [PATCH 2/5] Renamed crawlera_fetch -> zyte_proxy_fetch

---
 README.md                  | 20 +++++++++----------
 setup.py                   |  2 +-
 tests/test_config.py       |  2 +-
 tests/test_logformatter.py |  2 +-
 tests/test_requests.py     |  2 +-
 tests/test_responses.py    |  2 +-
 tests/utils.py             |  2 +-
 tox.ini                    |  8 ++++----
 .../__init__.py            |  0
 .../logformatter.py        |  0
 .../middleware.py          | 13 +++++++++---
 11 files changed, 30 insertions(+), 23 deletions(-)
 rename {crawlera_fetch => zyte_proxy_fetch}/__init__.py (100%)
 rename {crawlera_fetch => zyte_proxy_fetch}/logformatter.py (100%)
 rename {crawlera_fetch => zyte_proxy_fetch}/middleware.py (96%)

diff --git a/README.md b/README.md
index 5fcc974..72b769b 100644
--- a/README.md
+++ b/README.md
@@ -33,7 +33,7 @@ setting:
 
 ```
 DOWNLOADER_MIDDLEWARES = {
-    "crawlera_fetch.SmartProxyManagerFetchMiddleware": 585,
+    "zyte_proxy_fetch.SmartProxyManagerFetchMiddleware": 585,
 }
 ```
 
@@ -56,7 +56,7 @@ Smart Proxy Manager middleware won't be able to handle them.
   raise an exception if an error occurs while downloading or decoding a request. If `False`, a warning
   will be logged and the raw upstream response will be returned upon encountering an error.
 
-* `ZYTE_PROXY_FETCH_DOWNLOAD_SLOT_POLICY` (type `enum.Enum` - `crawlera_fetch.DownloadSlotPolicy`,
+* `ZYTE_PROXY_FETCH_DOWNLOAD_SLOT_POLICY` (type `enum.Enum` - `zyte_proxy_fetch.DownloadSlotPolicy`,
   default `DownloadSlotPolicy.Domain`). Possible values are `DownloadSlotPolicy.Domain`,
   `DownloadSlotPolicy.Single`, `DownloadSlotPolicy.Default` (Scrapy default). If set to
   `DownloadSlotPolicy.Domain`, please
@@ -75,7 +75,7 @@ log formatter by overriding the [`LOG_FORMATTER`](https://docs.scrapy.org/en/lat
 setting:
 
 ```
-LOG_FORMATTER = "crawlera_fetch.SmartProxyManagerLogFormatter"
+LOG_FORMATTER = "zyte_proxy_fetch.SmartProxyManagerLogFormatter"
 ```
 
 Note that the ability to override the error messages for spider and download errors was added
@@ -106,12 +106,12 @@ Request(url="<ZYTE_PROXY_FETCH_URL>", method="POST",
 
 ### Additional arguments
 
-Additional arguments could be specified under the `crawlera_fetch.args` `Request.meta` key. For instance:
+Additional arguments could be specified under the `zyte_proxy_fetch.args` `Request.meta` key. 
For instance: +Additional arguments could be specified under the `zyte_proxy_fetch.args` `Request.meta` key. For instance: ```python Request( url="https://example.org", - meta={"crawlera_fetch": {"args": {"region": "us", "device": "mobile"}}}, + meta={"zyte_proxy_fetch": {"args": {"region": "us", "device": "mobile"}}}, ) ``` @@ -121,26 +121,26 @@ is translated into the following body: '{"url": "https://example.org", "method": "GET", "body": "", "region": "us", "device": "mobile"}' ``` -Arguments set for a specific request through the `crawlera_fetch.args` key override those +Arguments set for a specific request through the `zyte_proxy_fetch.args` key override those set with the `ZYTE_PROXY_FETCH_DEFAULT_ARGS` setting. ### Accessing original request and raw Zyte Smart Proxy Manager response The `url`, `method`, `headers` and `body` attributes of the original request are available under -the `crawlera_fetch.original_request` `Response.meta` key. +the `zyte_proxy_fetch.original_request` `Response.meta` key. The `status`, `headers` and `body` attributes of the upstream Smart Proxy Manager response are available under -the `crawlera_fetch.upstream_response` `Response.meta` key. +the `zyte_proxy_fetch.upstream_response` `Response.meta` key. ### Skipping requests -You can instruct the middleware to skip a specific request by setting the `crawlera_fetch.skip` +You can instruct the middleware to skip a specific request by setting the `zyte_proxy_fetch.skip` [Request.meta](https://docs.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.meta) key: ```python Request( url="https://example.org", - meta={"crawlera_fetch": {"skip": True}}, + meta={"zyte_proxy_fetch": {"skip": True}}, ) ``` diff --git a/setup.py b/setup.py index de8a293..260052e 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ author="Zyte", author_email="opensource@zyte.com", url="https://github.com/scrapy-plugins/scrapy-zyte-proxy-fetch", - packages=["crawlera_fetch"], + packages=["zyte_proxy_fetch"], classifiers=[ "Development Status :: 1 - Planning", "License :: OSI Approved :: BSD License", diff --git a/tests/test_config.py b/tests/test_config.py index a31a900..a87a465 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -2,7 +2,7 @@ from scrapy.exceptions import NotConfigured from scrapy.utils.test import get_crawler -from crawlera_fetch import SmartProxyManagerFetchMiddleware +from zyte_proxy_fetch import SmartProxyManagerFetchMiddleware from tests.data import SETTINGS diff --git a/tests/test_logformatter.py b/tests/test_logformatter.py index 2d3d895..95ea6b5 100644 --- a/tests/test_logformatter.py +++ b/tests/test_logformatter.py @@ -5,7 +5,7 @@ from scrapy.http.response import Response from twisted.python.failure import Failure -from crawlera_fetch.logformatter import SmartProxyManagerLogFormatter +from zyte_proxy_fetch.logformatter import SmartProxyManagerLogFormatter from tests.data.requests import get_test_requests from tests.utils import foo_spider, get_test_middleware diff --git a/tests/test_requests.py b/tests/test_requests.py index f1d54ac..dcd4d6a 100644 --- a/tests/test_requests.py +++ b/tests/test_requests.py @@ -5,7 +5,7 @@ from scrapy import Request -from crawlera_fetch import DownloadSlotPolicy +from zyte_proxy_fetch import DownloadSlotPolicy from tests.data.requests import get_test_requests from tests.utils import foo_spider, get_test_middleware, mocked_time diff --git a/tests/test_responses.py b/tests/test_responses.py index 2529ab0..45e8fb8 100644 --- a/tests/test_responses.py 
+++ b/tests/test_responses.py @@ -6,7 +6,7 @@ from scrapy.utils.reqser import request_to_dict from testfixtures import LogCapture -from crawlera_fetch.middleware import SmartProxyManagerFetchException +from zyte_proxy_fetch.middleware import SmartProxyManagerFetchException from tests.data.responses import test_responses from tests.utils import foo_spider, get_test_middleware, mocked_time diff --git a/tests/utils.py b/tests/utils.py index ff2f74b..fb4c033 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -4,7 +4,7 @@ from scrapy import Spider from scrapy.utils.test import get_crawler -from crawlera_fetch.middleware import SmartProxyManagerFetchMiddleware +from zyte_proxy_fetch.middleware import SmartProxyManagerFetchMiddleware from tests.data import SETTINGS diff --git a/tox.ini b/tox.ini index 8ffe602..a23df14 100644 --- a/tox.ini +++ b/tox.ini @@ -6,22 +6,22 @@ envlist = flake8,black,typing,py35-pinned,py36,py37,py38 deps = -rtests/requirements.txt commands = - py.test --verbose --cov=crawlera_fetch --cov-report=term-missing --cov-report=html --cov-report=xml {posargs: crawlera_fetch tests} + py.test --verbose --cov=zyte_proxy_fetch --cov-report=term-missing --cov-report=html --cov-report=xml {posargs: zyte_proxy_fetch tests} [testenv:flake8] deps = flake8>=3.7.9 basepython = python3.8 -commands = flake8 --exclude=.git,venv* crawlera_fetch tests +commands = flake8 --exclude=.git,venv* zyte_proxy_fetch tests [testenv:black] deps = black>=19.10b0 basepython = python3.8 -commands = black --check crawlera_fetch tests +commands = black --check zyte_proxy_fetch tests [testenv:typing] deps = mypy==0.770 basepython = python3.8 -commands = mypy --ignore-missing-imports --follow-imports=skip crawlera_fetch tests +commands = mypy --ignore-missing-imports --follow-imports=skip zyte_proxy_fetch tests [testenv:py35-pinned] deps = diff --git a/crawlera_fetch/__init__.py b/zyte_proxy_fetch/__init__.py similarity index 100% rename from crawlera_fetch/__init__.py rename to zyte_proxy_fetch/__init__.py diff --git a/crawlera_fetch/logformatter.py b/zyte_proxy_fetch/logformatter.py similarity index 100% rename from crawlera_fetch/logformatter.py rename to zyte_proxy_fetch/logformatter.py diff --git a/crawlera_fetch/middleware.py b/zyte_proxy_fetch/middleware.py similarity index 96% rename from crawlera_fetch/middleware.py rename to zyte_proxy_fetch/middleware.py index f23eb63..3804d42 100644 --- a/crawlera_fetch/middleware.py +++ b/zyte_proxy_fetch/middleware.py @@ -46,7 +46,10 @@ def __init__(self, crawler: Crawler) -> None: if not crawler.settings.getbool("ZYTE_PROXY_FETCH_ENABLED"): raise NotConfigured() elif crawler.settings.get("ZYTE_PROXY_FETCH_APIKEY") is None: - raise NotConfigured("Zyte Smart Proxy Manager Fetch API cannot be used without an apikey") + raise NotConfigured( + "Zyte Smart Proxy Manager Fetch API cannot " + "be used without an apikey" + ) elif crawler.settings.get("ZYTE_PROXY_FETCH_APIKEY"): self.apikey = crawler.settings["ZYTE_PROXY_FETCH_APIKEY"] self.apipass = crawler.settings.get("ZYTE_PROXY_FETCH_APIPASS", "") @@ -70,7 +73,8 @@ def __init__(self, crawler: Crawler) -> None: self.total_latency = 0 logger.info( - "Using Zyte Smart Proxy Manager Fetch API at %s with apikey %s***" % (self.url, self.apikey[:5]) + "Using Zyte Smart Proxy Manager Fetch API at %s with apikey %s***" % + (self.url, self.apikey[:5]) ) @classmethod @@ -252,5 +256,8 @@ def _calculate_latency(self, request): timing["end_ts"] = time.time() timing["latency"] = timing["end_ts"] - timing["start_ts"] 
self.total_latency += timing["latency"] - max_latency = max(self.stats.get_value("zyte_proxy_fetch/max_latency", 0), timing["latency"]) + max_latency = max( + self.stats.get_value("zyte_proxy_fetch/max_latency", 0), + timing["latency"] + ) self.stats.set_value("zyte_proxy_fetch/max_latency", max_latency) From 2e7b704184106e76c08ecc7e909ff09dafd78410 Mon Sep 17 00:00:00 2001 From: Veniamin Gvozdikov Date: Tue, 23 Mar 2021 11:41:32 +0300 Subject: [PATCH 3/5] Fix formatting --- zyte_proxy_fetch/middleware.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/zyte_proxy_fetch/middleware.py b/zyte_proxy_fetch/middleware.py index 3804d42..87342e9 100644 --- a/zyte_proxy_fetch/middleware.py +++ b/zyte_proxy_fetch/middleware.py @@ -47,8 +47,7 @@ def __init__(self, crawler: Crawler) -> None: raise NotConfigured() elif crawler.settings.get("ZYTE_PROXY_FETCH_APIKEY") is None: raise NotConfigured( - "Zyte Smart Proxy Manager Fetch API cannot " - "be used without an apikey" + "Zyte Smart Proxy Manager Fetch API cannot be used without an apikey" ) elif crawler.settings.get("ZYTE_PROXY_FETCH_APIKEY"): self.apikey = crawler.settings["ZYTE_PROXY_FETCH_APIKEY"] @@ -73,8 +72,8 @@ def __init__(self, crawler: Crawler) -> None: self.total_latency = 0 logger.info( - "Using Zyte Smart Proxy Manager Fetch API at %s with apikey %s***" % - (self.url, self.apikey[:5]) + "Using Zyte Smart Proxy Manager Fetch API at %s with apikey %s***" + % (self.url, self.apikey[:5]) ) @classmethod @@ -257,7 +256,6 @@ def _calculate_latency(self, request): timing["latency"] = timing["end_ts"] - timing["start_ts"] self.total_latency += timing["latency"] max_latency = max( - self.stats.get_value("zyte_proxy_fetch/max_latency", 0), - timing["latency"] + self.stats.get_value("zyte_proxy_fetch/max_latency", 0), timing["latency"] ) self.stats.set_value("zyte_proxy_fetch/max_latency", max_latency) From a51f7a3a2917631dd10be162474cebcbdb68ce28 Mon Sep 17 00:00:00 2001 From: Veniamin Gvozdikov Date: Fri, 26 Mar 2021 19:45:51 +0300 Subject: [PATCH 4/5] Fix black checks --- zyte_proxy_fetch/middleware.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/zyte_proxy_fetch/middleware.py b/zyte_proxy_fetch/middleware.py index 8375b55..2f3739b 100644 --- a/zyte_proxy_fetch/middleware.py +++ b/zyte_proxy_fetch/middleware.py @@ -84,19 +84,25 @@ def spider_opened(self, spider): except AttributeError: if not spider.crawler.settings.getbool("ZYTE_PROXY_FETCH_ENABLED"): self.enabled = False - logger.info("Zyte Smart Proxy Manager Fetch disabled (ZYTE_PROXY_FETCH_ENABLED setting)") + logger.info( + "Zyte Smart Proxy Manager Fetch disabled (ZYTE_PROXY_FETCH_ENABLED setting)" + ) return else: if not BaseSettings({"enabled": spider_attr}).getbool("enabled"): self.enabled = False - logger.info("Zyte Smart Proxy Manager Fetch disabled (zyte_proxy_fetch_enabled spider attribute)") + logger.info( + "Zyte Smart Proxy Manager Fetch disabled " + "(zyte_proxy_fetch_enabled spider attribute)" + ) return self.enabled = True self._read_settings(spider.crawler.settings) if self.enabled: logger.info( - "Using Zyte Smart Proxy Manager Fetch API at %s with apikey %s***" % (self.url, self.apikey[:5]) + "Using Zyte Smart Proxy Manager Fetch API at %s with apikey %s***" + % (self.url, self.apikey[:5]) ) def spider_closed(self, spider, reason): From 8ef46d2ffcb73baaebe2fd9931e7b234cc84558e Mon Sep 17 00:00:00 2001 From: Veniamin Gvozdikov Date: Mon, 29 Mar 2021 17:15:44 +0300 Subject: [PATCH 5/5] Updated fake placeholder 
--- tests/test_responses.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_responses.py b/tests/test_responses.py index fdafa13..880423b 100644 --- a/tests/test_responses.py +++ b/tests/test_responses.py @@ -66,9 +66,9 @@ def test_process_response_skip(): def test_process_response_error(): response_list = [ TextResponse( - url="https://crawlera.com/fake/api/endpoint", + url="https://zyte.com/fake/api/endpoint", request=Request( - url="https://crawlera.com/fake/api/endpoint", + url="https://zyte.com/fake/api/endpoint", meta={ "zyte_proxy_fetch": { "timing": {"start_ts": mocked_time()}, @@ -89,9 +89,9 @@ def test_process_response_error(): }, ), TextResponse( - url="https://crawlera.com/fake/api/endpoint", + url="https://zyte.com/fake/api/endpoint", request=Request( - url="https://crawlera.com/fake/api/endpoint", + url="https://zyte.com/fake/api/endpoint", meta={ "zyte_proxy_fetch": { "timing": {"start_ts": mocked_time()}, @@ -105,9 +105,9 @@ def test_process_response_error(): body=b'{"Bad": "JSON', ), TextResponse( - url="https://crawlera.com/fake/api/endpoint", + url="https://zyte.com/fake/api/endpoint", request=Request( - url="https://crawlera.com/fake/api/endpoint", + url="https://zyte.com/fake/api/endpoint", meta={ "zyte_proxy_fetch": { "timing": {"start_ts": mocked_time()},
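
With the series applied end to end, the plugin is imported as `zyte_proxy_fetch`, configured through the `ZYTE_PROXY_FETCH_*` settings, and exchanges data with spiders through the `zyte_proxy_fetch` meta key. The following is a minimal sketch, not part of the patches themselves, showing the renamed pieces working together; the spider, the target URL and the apikey value are placeholders:

```python
# settings.py -- sketch only; the apikey is a placeholder, and
# ZYTE_PROXY_FETCH_URL falls back to the built-in default endpoint when unset.
DOWNLOADER_MIDDLEWARES = {
    # must run before the built-in HttpCompressionMiddleware (priority 590)
    "zyte_proxy_fetch.SmartProxyManagerFetchMiddleware": 585,
}
LOG_FORMATTER = "zyte_proxy_fetch.SmartProxyManagerLogFormatter"
ZYTE_PROXY_FETCH_ENABLED = True
ZYTE_PROXY_FETCH_APIKEY = "<apikey>"
ZYTE_PROXY_FETCH_DEFAULT_ARGS = {"device": "desktop"}
```

A hypothetical spider can then rely on the renamed meta keys:

```python
# example_spider.py -- illustrative only
import scrapy


class ExampleSpider(scrapy.Spider):
    name = "example"

    def start_requests(self):
        # per-request "args" override ZYTE_PROXY_FETCH_DEFAULT_ARGS;
        # {"skip": True} instead would bypass the middleware entirely
        yield scrapy.Request(
            "https://example.org",
            meta={"zyte_proxy_fetch": {"args": {"region": "us"}}},
        )

    def parse(self, response):
        # populated by SmartProxyManagerFetchMiddleware in
        # process_request / process_response
        original = response.meta["zyte_proxy_fetch"]["original_request"]
        upstream = response.meta["zyte_proxy_fetch"]["upstream_response"]
        self.logger.info(
            "fetched %s (upstream status %s)", original["url"], upstream["status"]
        )
```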