Skip to content
35 changes: 20 additions & 15 deletions scrapy_poet/middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,16 @@

from .api import DummyResponse
from .overrides import OverridesRegistry
from .page_input_providers import HttpResponseProvider
from .page_input_providers import HttpResponseProvider, RequestUrlProvider
from .injection import Injector


logger = logging.getLogger(__name__)


# Providers enabled out of the box, mapped to their priority value.
# (The pasted diff left both the pre- and post-change lines for
# HttpResponseProvider, which is a syntax error; this is the post-diff state.)
DEFAULT_PROVIDERS = {
    HttpResponseProvider: 500,
    RequestUrlProvider: 600,
}

InjectionMiddlewareTV = TypeVar("InjectionMiddlewareTV", bound="InjectionMiddleware")
Expand Down Expand Up @@ -54,6 +55,21 @@ def from_crawler(cls: Type[InjectionMiddlewareTV], crawler: Crawler) -> Injectio
def spider_closed(self, spider: Spider) -> None:
    """Shut down the injector when the spider finishes.

    Delegates to ``Injector.close()`` so any resources the injector holds
    (presumably its cache — see the cache-closing test elsewhere in this
    diff; confirm against ``Injector``) are released.
    """
    self.injector.close()

@inlineCallbacks
def _inject_cb_kwargs(self, request: Request, response: Optional[Response] = None):
    """Resolve the callback dependencies for *request* and merge them into
    ``request.cb_kwargs``, never overwriting user-supplied entries.
    """
    # Ask the injector to build every dependency the callback declares.
    dependencies = yield from self.injector.build_callback_dependencies(
        request,
        response=response,
    )
    for name, instance in dependencies.items():
        # setdefault keeps anything the user already placed in cb_kwargs,
        # so explicit callback arguments take precedence over injected ones.
        request.cb_kwargs.setdefault(name, instance)
    # TODO: check if all arguments are fulfilled somehow?

@inlineCallbacks
def process_request(self, request: Request, spider: Spider) -> Optional[DummyResponse]:
"""This method checks if the request is really needed and if its
download could be skipped by trying to infer if a ``Response``
Expand All @@ -70,7 +86,7 @@ def process_request(self, request: Request, spider: Spider) -> Optional[DummyRes
"""
if self.injector.is_scrapy_response_required(request):
return None

yield from self._inject_cb_kwargs(request)
logger.debug(f"Using DummyResponse instead of downloading {request}")
return DummyResponse(url=request.url, request=request)

Expand All @@ -89,16 +105,5 @@ def process_response(self, request: Request, response: Response,
and an injectable attribute,
the user-defined ``cb_kwargs`` takes precedence.
"""
# Find out the dependencies
final_kwargs = yield from self.injector.build_callback_dependencies(
request,
response
)
# Fill the callback arguments with the created instances
for arg, value in final_kwargs.items():
# Precedence of user callback arguments
if arg not in request.cb_kwargs:
request.cb_kwargs[arg] = value
# TODO: check if all arguments are fulfilled somehow?

yield from self._inject_cb_kwargs(request, response)
return response
13 changes: 12 additions & 1 deletion scrapy_poet/page_input_providers.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from scrapy.utils.request import request_fingerprint

from scrapy_poet.injection_errors import MalformedProvidedClassesError
from web_poet import HttpResponse, HttpResponseHeaders
from web_poet import HttpResponse, HttpResponseHeaders, RequestUrl


class PageObjectInputProvider:
Expand Down Expand Up @@ -197,3 +197,14 @@ def deserialize(self, data: Any) -> Sequence[Any]:
)
for response_data in data
]


class RequestUrlProvider(PageObjectInputProvider):
    """Provides ``web_poet.page_inputs.RequestUrl`` instances, built from
    the URL of the Scrapy ``Request`` being processed.
    """

    provided_classes = {RequestUrl}
    name = "request_url"

    def __call__(self, to_provide: Set[Callable], request: Request):
        """Return a single ``RequestUrl`` wrapping ``request.url``."""
        url = RequestUrl(url=request.url)
        return [url]
4 changes: 2 additions & 2 deletions tests/test_injection.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,11 +266,11 @@ def callback(response: DummyResponse,

class Html(Injectable):
    """Static HTML fixture used by the injection tests.

    The pasted diff left both the removed ``html`` attribute and its
    replacement ``text``; only the post-diff ``text`` version is kept here,
    and ``selector`` parses that attribute.
    """

    url = "http://example.com"
    text = """<html><body>Price: <span class="price">22</span>€</body></html>"""

    @property
    def selector(self):
        # Fresh Selector per access; cheap enough for test fixtures.
        return parsel.Selector(self.text)


class EurDollarRate(Injectable):
Expand Down
27 changes: 26 additions & 1 deletion tests/test_middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
PageObjectInputProvider
)
from web_poet import default_registry
from web_poet.page_inputs import HttpResponse
from web_poet.page_inputs import HttpResponse, RequestUrl
from scrapy_poet import DummyResponse
from tests.utils import (HtmlResource,
crawl_items,
Expand Down Expand Up @@ -310,6 +310,19 @@ def parse(self, response: DummyResponse):
}


class RequestUrlSpider(scrapy.Spider):
    """Test spider whose callback declares a ``RequestUrl`` dependency
    alongside a ``DummyResponse``, so the item exposes both for inspection.
    """

    url = None  # set by the test harness before crawling

    def start_requests(self):
        yield Request(url=self.url, callback=self.parse)

    def parse(self, response: DummyResponse, *, url: RequestUrl):
        # Same keys as a literal {'response': ..., 'url': ...} dict.
        return dict(response=response, url=url)


@inlineCallbacks
def test_skip_downloads(settings):
item, url, crawler = yield crawl_single_item(
Expand All @@ -327,6 +340,18 @@ def test_skip_downloads(settings):
assert crawler.stats.get_stats().get('downloader/response_count', 0) == 1


@inlineCallbacks
def test_skip_download_request_url(settings):
    """A callback annotated with ``DummyResponse`` + ``RequestUrl`` must skip
    the real download while still receiving the request URL.

    Idiom fix: ``isinstance(...) is True`` is redundant — ``isinstance``
    already returns a bool, so assert it directly.
    """
    item, url, crawler = yield crawl_single_item(
        RequestUrlSpider, ProductHtml, settings)
    assert isinstance(item['response'], Response)
    assert isinstance(item['response'], DummyResponse)
    assert isinstance(item['url'], RequestUrl)
    assert str(item['url']) == url
    # No request should reach the downloader; the one recorded response
    # mirrors the expectation in test_skip_downloads.
    assert crawler.stats.get_stats().get('downloader/request_count', 0) == 0
    assert crawler.stats.get_stats().get('downloader/response_count', 0) == 1


@mock.patch("scrapy_poet.injection.SqlitedictCache", spec=SqlitedictCache)
def test_cache_closed_on_spider_close(mock_sqlitedictcache, settings):
def get_middleware(settings):
Expand Down