Skip to content

Commit 21b397c

Browse files
authored
Add a provider for RequestUrl (#76)
1 parent e4589e6 commit 21b397c

File tree

3 files changed

+77
-6
lines changed

3 files changed

+77
-6
lines changed

scrapy_poet/middleware.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,12 @@
33
are executed.
44
"""
55
import logging
6-
from typing import Optional, Type, TypeVar
6+
from typing import Generator, Optional, Type, TypeVar
77

88
from scrapy import Spider, signals
99
from scrapy.crawler import Crawler
1010
from scrapy.http import Request, Response
11-
from twisted.internet.defer import inlineCallbacks
11+
from twisted.internet.defer import Deferred, inlineCallbacks
1212

1313
from scrapy.utils.misc import create_instance, load_object
1414

@@ -17,6 +17,7 @@
1717
HttpClientProvider,
1818
HttpResponseProvider,
1919
PageParamsProvider,
20+
RequestUrlProvider,
2021
)
2122
from .overrides import OverridesRegistry
2223
from .injection import Injector
@@ -29,6 +30,7 @@
2930
HttpResponseProvider: 500,
3031
HttpClientProvider: 600,
3132
PageParamsProvider: 700,
33+
RequestUrlProvider: 800,
3234
}
3335

3436
InjectionMiddlewareTV = TypeVar("InjectionMiddlewareTV", bound="InjectionMiddleware")
@@ -78,11 +80,12 @@ def process_request(self, request: Request, spider: Spider) -> Optional[DummyRes
7880
return None
7981

8082
logger.debug(f"Using DummyResponse instead of downloading {request}")
83+
self.crawler.stats.inc_value("scrapy_poet/dummy_response_count")
8184
return DummyResponse(url=request.url, request=request)
8285

8386
@inlineCallbacks
8487
def process_response(self, request: Request, response: Response,
85-
spider: Spider) -> Response:
88+
spider: Spider) -> Generator[Deferred[object], object, Response]:
8689
"""This method fills ``request.cb_kwargs`` with instances for
8790
the required Page Objects found in the callback signature.
8891
@@ -98,7 +101,7 @@ def process_response(self, request: Request, response: Response,
98101
# Find out the dependencies
99102
final_kwargs = yield from self.injector.build_callback_dependencies(
100103
request,
101-
response
104+
response,
102105
)
103106
# Fill the callback arguments with the created instances
104107
for arg, value in final_kwargs.items():

scrapy_poet/page_input_providers.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
from scrapy_poet.utils import scrapy_response_to_http_response
2222
from scrapy_poet.injection_errors import MalformedProvidedClassesError
2323
from scrapy_poet.downloader import create_scrapy_downloader
24-
from web_poet import HttpClient, HttpResponse, HttpResponseHeaders, PageParams
24+
from web_poet import HttpClient, HttpResponse, HttpResponseHeaders, PageParams, RequestUrl
2525

2626

2727
class PageObjectInputProvider:
@@ -223,3 +223,14 @@ def __call__(self, to_provide: Set[Callable], request: Request):
223223
``scrapy.http.Response`` instance.
224224
"""
225225
return [PageParams(request.meta.get("page_params", {}))]
226+
227+
228+
class RequestUrlProvider(PageObjectInputProvider):
    """Provider that supplies ``web_poet.page_inputs.RequestUrl`` objects.

    The URL is taken directly from the Scrapy ``Request`` handed to the
    provider when it is called.
    """

    name = "request_url"
    provided_classes = {RequestUrl}

    def __call__(self, to_provide: Set[Callable], request: Request):
        """Return a one-element list with the ``RequestUrl`` for ``request``.

        :param to_provide: the classes requested from this provider
            (not inspected here; a ``RequestUrl`` is always produced).
        :param request: the Scrapy ``Request`` whose ``url`` is wrapped.
        """
        request_url = RequestUrl(url=request.url)
        return [request_url]

tests/test_middleware.py

Lines changed: 58 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
PageObjectInputProvider
2626
)
2727
from web_poet import default_registry
28-
from web_poet.page_inputs import HttpResponse
28+
from web_poet.page_inputs import HttpResponse, RequestUrl
2929
from scrapy_poet import DummyResponse
3030
from tests.utils import (HtmlResource,
3131
crawl_items,
@@ -317,13 +317,70 @@ def test_skip_downloads(settings):
317317
assert isinstance(item['response'], Response) is True
318318
assert isinstance(item['response'], DummyResponse) is False
319319
assert crawler.stats.get_stats().get('downloader/request_count', 0) == 1
320+
assert crawler.stats.get_stats().get('scrapy_poet/dummy_response_count', 0) == 0
320321
assert crawler.stats.get_stats().get('downloader/response_count', 0) == 1
321322

322323
item, url, crawler = yield crawl_single_item(
323324
SkipDownloadSpider, ProductHtml, settings)
324325
assert isinstance(item['response'], Response) is True
325326
assert isinstance(item['response'], DummyResponse) is True
326327
assert crawler.stats.get_stats().get('downloader/request_count', 0) == 0
328+
assert crawler.stats.get_stats().get('scrapy_poet/dummy_response_count', 0) == 1
329+
assert crawler.stats.get_stats().get('downloader/response_count', 0) == 1
330+
331+
332+
class RequestUrlSpider(scrapy.Spider):
    """Test spider whose callback asks for a ``RequestUrl`` directly.

    The callback annotates ``response`` as ``DummyResponse`` and returns both
    injected arguments so the test can inspect them.
    """

    # Set by the test harness before crawling.
    url = None

    def start_requests(self):
        """Yield the single request for ``self.url``, handled by ``parse``."""
        request = Request(url=self.url, callback=self.parse)
        yield request

    def parse(self, response: DummyResponse, url: RequestUrl):
        """Return the received response and the injected URL as an item."""
        return dict(response=response, url=url)
343+
344+
345+
@inlineCallbacks
def test_skip_download_request_url(settings):
    """Crawl with ``RequestUrlSpider`` and check the injected ``RequestUrl``.

    Asserts that the callback received a ``DummyResponse``, that the injected
    URL matches the crawled one, and that the downloader and scrapy-poet
    stats counters hold the expected values.
    """
    item, expected_url, crawler = yield crawl_single_item(
        RequestUrlSpider, ProductHtml, settings)

    received_response = item['response']
    assert isinstance(received_response, Response) is True
    assert isinstance(received_response, DummyResponse) is True

    injected_url = item['url']
    assert isinstance(injected_url, RequestUrl)
    assert str(injected_url) == expected_url

    stats = crawler.stats.get_stats()
    assert stats.get('downloader/request_count', 0) == 0
    assert stats.get('scrapy_poet/dummy_response_count', 0) == 1
    assert stats.get('downloader/response_count', 0) == 1
356+
357+
358+
@attr.s(auto_attribs=True)
class RequestUrlPage(ItemPage):
    """Page object whose only declared input is a ``RequestUrl``."""

    # The single attrs field of this page object.
    url: RequestUrl

    def to_item(self):
        """Return an item dict exposing the stored URL under ``'url'``."""
        return dict(url=self.url)
364+
365+
366+
class RequestUrlPageSpider(scrapy.Spider):
    """Test spider whose callback asks for a ``RequestUrlPage`` page object.

    The callback annotates ``response`` as ``DummyResponse`` and returns the
    item built by the injected page object.
    """

    # Set by the test harness before crawling.
    url = None

    def start_requests(self):
        """Yield the single request for ``self.url``, handled by ``parse``."""
        request = Request(url=self.url, callback=self.parse)
        yield request

    def parse(self, response: DummyResponse, page: RequestUrlPage):
        """Return the item produced by the injected page object."""
        item = page.to_item()
        return item
374+
375+
376+
@inlineCallbacks
def test_skip_download_request_url_page(settings):
    """Crawl with ``RequestUrlPageSpider`` and check the resulting item.

    Asserts that the item has ``'url'`` as its only key, that the value
    matches the crawled URL, and that the downloader and scrapy-poet stats
    counters hold the expected values.
    """
    item, expected_url, crawler = yield crawl_single_item(
        RequestUrlPageSpider, ProductHtml, settings)

    assert tuple(item.keys()) == ('url',)
    assert str(item['url']) == expected_url

    stats = crawler.stats.get_stats()
    assert stats.get('downloader/request_count', 0) == 0
    assert stats.get('scrapy_poet/dummy_response_count', 0) == 1
    assert stats.get('downloader/response_count', 0) == 1
328385

329386

0 commit comments

Comments
 (0)