11 changes: 7 additions & 4 deletions scrapy_poet/middleware.py
@@ -3,12 +3,12 @@
 are executed.
 """
 import logging
-from typing import Optional, Type, TypeVar
+from typing import Generator, Optional, Type, TypeVar

 from scrapy import Spider, signals
 from scrapy.crawler import Crawler
 from scrapy.http import Request, Response
-from twisted.internet.defer import inlineCallbacks
+from twisted.internet.defer import Deferred, inlineCallbacks

 from scrapy.utils.misc import create_instance, load_object
@@ -17,6 +17,7 @@
     HttpClientProvider,
     HttpResponseProvider,
     PageParamsProvider,
+    RequestUrlProvider,
 )
 from .overrides import OverridesRegistry
 from .injection import Injector
@@ -29,6 +30,7 @@
     HttpResponseProvider: 500,
     HttpClientProvider: 600,
     PageParamsProvider: 700,
+    RequestUrlProvider: 800,
 }

 InjectionMiddlewareTV = TypeVar("InjectionMiddlewareTV", bound="InjectionMiddleware")
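
The mapping above is the middleware's default provider configuration: each provider class is registered under an integer priority. As a sketch of how a project could extend it, assuming the ``SCRAPY_POET_PROVIDERS`` setting merges user providers into these defaults (an assumption based on scrapy-poet's documented settings; the imported module and class below are hypothetical):

# settings.py sketch. SCRAPY_POET_PROVIDERS and its merge-with-defaults
# behaviour are assumptions; my_project.providers.RequestMethodProvider
# is a made-up provider (a sample definition appears further down).
from my_project.providers import RequestMethodProvider

SCRAPY_POET_PROVIDERS = {
    # Priorities follow the same scale as the defaults in the diff
    # (HttpResponseProvider: 500 ... RequestUrlProvider: 800).
    RequestMethodProvider: 900,
}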
@@ -78,11 +80,12 @@ def process_request(self, request: Request, spider: Spider) -> Optional[DummyResponse]:
             return None

         logger.debug(f"Using DummyResponse instead of downloading {request}")
+        self.crawler.stats.inc_value("scrapy_poet/dummy_response_count")
         return DummyResponse(url=request.url, request=request)

     @inlineCallbacks
     def process_response(self, request: Request, response: Response,
-                         spider: Spider) -> Response:
+                         spider: Spider) -> Generator[Deferred[object], object, Response]:
         """This method fills ``request.cb_kwargs`` with instances for
         the required Page Objects found in the callback signature.

@@ -98,7 +101,7 @@ def process_response(self, request: Request, response: Response,
         # Find out the dependencies
         final_kwargs = yield from self.injector.build_callback_dependencies(
             request,
-            response
+            response,
         )
         # Fill the callback arguments with the created instances
         for arg, value in final_kwargs.items():
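
The new return annotation on ``process_response`` reflects what ``inlineCallbacks`` actually decorates: a generator that yields ``Deferred`` objects and whose ``return`` value becomes the final result. A minimal standalone sketch of the same typing pattern, not part of the diff:

from typing import Generator

from twisted.internet.defer import Deferred, inlineCallbacks, succeed


@inlineCallbacks
def double(value: int) -> Generator[Deferred, object, int]:
    # Each ``yield`` hands a Deferred to the reactor and resumes with its
    # result; the generator's return value becomes the result of the
    # Deferred that inlineCallbacks returns to the caller.
    result = yield succeed(value)
    return result * 2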
13 changes: 12 additions & 1 deletion scrapy_poet/page_input_providers.py
@@ -21,7 +21,7 @@
 from scrapy_poet.utils import scrapy_response_to_http_response
 from scrapy_poet.injection_errors import MalformedProvidedClassesError
 from scrapy_poet.downloader import create_scrapy_downloader
-from web_poet import HttpClient, HttpResponse, HttpResponseHeaders, PageParams
+from web_poet import HttpClient, HttpResponse, HttpResponseHeaders, PageParams, RequestUrl


 class PageObjectInputProvider:
@@ -223,3 +223,14 @@ def __call__(self, to_provide: Set[Callable], request: Request):
         ``scrapy.http.Response`` instance.
         """
         return [PageParams(request.meta.get("page_params", {}))]
+
+
+class RequestUrlProvider(PageObjectInputProvider):
+    """This class provides ``web_poet.page_inputs.RequestUrl`` instances."""
+
+    provided_classes = {RequestUrl}
+    name = "request_url"
+
+    def __call__(self, to_provide: Set[Callable], request: Request):
+        """Builds a ``RequestUrl`` instance from a Scrapy ``Request``."""
+        return [RequestUrl(url=request.url)]
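
``RequestUrlProvider`` doubles as a template for other lightweight providers: declare ``provided_classes``, pick a unique ``name``, and build instances from the ``Request``. A hypothetical provider in the same mold (``RequestMethod`` and ``RequestMethodProvider`` are made-up names, not part of web-poet or scrapy-poet):

from typing import Callable, Set

from scrapy import Request

from scrapy_poet.page_input_providers import PageObjectInputProvider


class RequestMethod(str):
    """Hypothetical page input: the outgoing request's HTTP method."""


class RequestMethodProvider(PageObjectInputProvider):
    """Provides ``RequestMethod`` instances, e.g. ``RequestMethod("GET")``."""

    provided_classes = {RequestMethod}
    name = "request_method"

    def __call__(self, to_provide: Set[Callable], request: Request):
        # Like RequestUrlProvider, no downloaded response is needed, so this
        # also works in callbacks that receive a DummyResponse.
        return [RequestMethod(request.method)]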
59 changes: 58 additions & 1 deletion tests/test_middleware.py
@@ -25,7 +25,7 @@
     PageObjectInputProvider
 )
 from web_poet import default_registry
-from web_poet.page_inputs import HttpResponse
+from web_poet.page_inputs import HttpResponse, RequestUrl
 from scrapy_poet import DummyResponse
 from tests.utils import (HtmlResource,
                          crawl_items,
@@ -317,13 +317,70 @@ def test_skip_downloads(settings):
     assert isinstance(item['response'], Response) is True
     assert isinstance(item['response'], DummyResponse) is False
     assert crawler.stats.get_stats().get('downloader/request_count', 0) == 1
+    assert crawler.stats.get_stats().get('scrapy_poet/dummy_response_count', 0) == 0
     assert crawler.stats.get_stats().get('downloader/response_count', 0) == 1

     item, url, crawler = yield crawl_single_item(
         SkipDownloadSpider, ProductHtml, settings)
     assert isinstance(item['response'], Response) is True
     assert isinstance(item['response'], DummyResponse) is True
     assert crawler.stats.get_stats().get('downloader/request_count', 0) == 0
+    assert crawler.stats.get_stats().get('scrapy_poet/dummy_response_count', 0) == 1
     assert crawler.stats.get_stats().get('downloader/response_count', 0) == 1
+
+
+class RequestUrlSpider(scrapy.Spider):
+    url = None
+
+    def start_requests(self):
+        yield Request(url=self.url, callback=self.parse)
+
+    def parse(self, response: DummyResponse, url: RequestUrl):
+        return {
+            'response': response,
+            'url': url,
+        }
+
+
+@inlineCallbacks
+def test_skip_download_request_url(settings):
+    item, url, crawler = yield crawl_single_item(
+        RequestUrlSpider, ProductHtml, settings)
+    assert isinstance(item['response'], Response) is True
+    assert isinstance(item['response'], DummyResponse) is True
+    assert isinstance(item['url'], RequestUrl)
+    assert str(item['url']) == url
+    assert crawler.stats.get_stats().get('downloader/request_count', 0) == 0
+    assert crawler.stats.get_stats().get('scrapy_poet/dummy_response_count', 0) == 1
+    assert crawler.stats.get_stats().get('downloader/response_count', 0) == 1
+
+
+@attr.s(auto_attribs=True)
+class RequestUrlPage(ItemPage):
+    url: RequestUrl
+
+    def to_item(self):
+        return {'url': self.url}
+
+
+class RequestUrlPageSpider(scrapy.Spider):
+    url = None
+
+    def start_requests(self):
+        yield Request(url=self.url, callback=self.parse)
+
+    def parse(self, response: DummyResponse, page: RequestUrlPage):
+        return page.to_item()
+
+
+@inlineCallbacks
+def test_skip_download_request_url_page(settings):
+    item, url, crawler = yield crawl_single_item(
+        RequestUrlPageSpider, ProductHtml, settings)
+    assert tuple(item.keys()) == ('url',)
+    assert str(item['url']) == url
+    assert crawler.stats.get_stats().get('downloader/request_count', 0) == 0
+    assert crawler.stats.get_stats().get('scrapy_poet/dummy_response_count', 0) == 1
+    assert crawler.stats.get_stats().get('downloader/response_count', 0) == 1


Expand Down