2 changes: 1 addition & 1 deletion docs/conf.py
@@ -61,7 +61,7 @@
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None
language = 'en'

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
41 changes: 23 additions & 18 deletions scrapy_poet/middleware.py
@@ -3,7 +3,7 @@
are executed.
"""
import logging
from typing import Optional, Type, TypeVar
from typing import Generator, Optional, Type, TypeVar

from scrapy import Spider, signals
from scrapy.crawler import Crawler
@@ -14,15 +14,16 @@

from .api import DummyResponse
from .overrides import OverridesRegistry
from .page_input_providers import HttpResponseProvider
from .page_input_providers import HttpResponseProvider, RequestUrlProvider
from .injection import Injector


logger = logging.getLogger(__name__)


DEFAULT_PROVIDERS = {
HttpResponseProvider: 500
HttpResponseProvider: 500,
RequestUrlProvider: 600,
}

InjectionMiddlewareTV = TypeVar("InjectionMiddlewareTV", bound="InjectionMiddleware")
@@ -54,7 +55,22 @@ def from_crawler(cls: Type[InjectionMiddlewareTV], crawler: Crawler) -> Injectio
def spider_closed(self, spider: Spider) -> None:
self.injector.close()

def process_request(self, request: Request, spider: Spider) -> Optional[DummyResponse]:
@inlineCallbacks
def _inject_cb_kwargs(self, request: Request, response: Optional[Response] = None) -> Generator[None, None, None]:
# Find out the dependencies
final_kwargs = yield from self.injector.build_callback_dependencies(
request,
response=response,
)
# Fill the callback arguments with the created instances
for arg, value in final_kwargs.items():
# Precedence of user callback arguments
if arg not in request.cb_kwargs:
request.cb_kwargs[arg] = value
# TODO: check if all arguments are fulfilled somehow?

@inlineCallbacks
def process_request(self, request: Request, spider: Spider) -> Generator[None, None, Optional[DummyResponse]]:
"""This method checks if the request is really needed and if its
download could be skipped by trying to infer if a ``Response``
is going to be used by the callback or a Page Input.
@@ -70,13 +86,13 @@ def process_request(self, request: Request, spider: Spider) -> Optional[DummyRes
"""
if self.injector.is_scrapy_response_required(request):
return None

yield from self._inject_cb_kwargs(request)
logger.debug(f"Using DummyResponse instead of downloading {request}")
return DummyResponse(url=request.url, request=request)

@inlineCallbacks
def process_response(self, request: Request, response: Response,
spider: Spider) -> Response:
spider: Spider) -> Generator[None, None, Response]:
"""This method fills ``request.cb_kwargs`` with instances for
the required Page Objects found in the callback signature.

@@ -89,16 +105,5 @@ def process_response(self, request: Request, response: Response,
and an injectable attribute,
the user-defined ``cb_kwargs`` takes precedence.
"""
# Find out the dependencies
final_kwargs = yield from self.injector.build_callback_dependencies(
request,
response
)
# Fill the callback arguments with the created instances
for arg, value in final_kwargs.items():
# Precedence of user callback arguments
if arg not in request.cb_kwargs:
request.cb_kwargs[arg] = value
# TODO: check if all arguments are fulfilled somehow?

yield from self._inject_cb_kwargs(request, response)
return response
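
For context, a minimal sketch of how the new flow behaves end to end (hypothetical spider name and URL; it mirrors ``RequestUrlSpider`` in the tests below): when a callback only needs a ``RequestUrl`` and annotates its response as ``DummyResponse``, ``process_request`` now injects the URL into ``cb_kwargs`` and returns a ``DummyResponse``, so the download is skipped entirely.

# Hypothetical spider; names and URL are illustrative only.
import scrapy
from scrapy import Request
from scrapy_poet import DummyResponse
from web_poet.page_inputs import RequestUrl


class UrlOnlySpider(scrapy.Spider):
    name = "url_only"

    def start_requests(self):
        yield Request("http://example.com", callback=self.parse)

    # The DummyResponse annotation signals that the response body is not needed;
    # RequestUrl can be built from the Request alone, so no download happens.
    def parse(self, response: DummyResponse, *, url: RequestUrl):
        yield {"url": str(url)}

Since ``RequestUrl`` is built from the outgoing ``Request`` alone, no downloader round trip is required, which is what ``test_skip_download_request_url`` below verifies via the downloader stats.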
13 changes: 12 additions & 1 deletion scrapy_poet/page_input_providers.py
@@ -19,7 +19,7 @@
from scrapy.utils.request import request_fingerprint

from scrapy_poet.injection_errors import MalformedProvidedClassesError
from web_poet import HttpResponse, HttpResponseHeaders
from web_poet import HttpResponse, HttpResponseHeaders, RequestUrl


class PageObjectInputProvider:
@@ -197,3 +197,14 @@ def deserialize(self, data: Any) -> Sequence[Any]:
)
for response_data in data
]


class RequestUrlProvider(PageObjectInputProvider):
"""This class provides ``web_poet.page_inputs.RequestUrl`` instances."""

provided_classes = {RequestUrl}
name = "request_url"

def __call__(self, to_provide: Set[Callable], request: Request):
"""Builds a ``RequestUrl`` instance using a Scrapy ``Request``"""
return [RequestUrl(url=request.url)]
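
For illustration, a page object can now declare ``RequestUrl`` as a dependency and be built without downloading the response. A minimal sketch (hypothetical class name, assuming web_poet's attrs-based constructor injection):

# Hypothetical page object; assumes attrs-style field injection as used by web_poet.
import attr
from web_poet.pages import ItemPage
from web_poet.page_inputs import RequestUrl


@attr.define
class RequestUrlPage(ItemPage):
    url: RequestUrl  # supplied by RequestUrlProvider; no response download needed

    def to_item(self) -> dict:
        return {"url": str(self.url)}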
4 changes: 2 additions & 2 deletions tests/test_injection.py
@@ -266,11 +266,11 @@ def callback(response: DummyResponse,

class Html(Injectable):
url = "http://example.com"
html = """<html><body>Price: <span class="price">22</span>€</body></html>"""
text = """<html><body>Price: <span class="price">22</span>€</body></html>"""

@property
def selector(self):
return parsel.Selector(self.html)
return parsel.Selector(self.text)


class EurDollarRate(Injectable):
27 changes: 26 additions & 1 deletion tests/test_middleware.py
@@ -25,7 +25,7 @@
PageObjectInputProvider
)
from web_poet import default_registry
from web_poet.page_inputs import HttpResponse
from web_poet.page_inputs import HttpResponse, RequestUrl
from scrapy_poet import DummyResponse
from tests.utils import (HtmlResource,
crawl_items,
@@ -310,6 +310,19 @@ def parse(self, response: DummyResponse):
}


class RequestUrlSpider(scrapy.Spider):
url = None

def start_requests(self):
yield Request(url=self.url, callback=self.parse)

def parse(self, response: DummyResponse, *, url: RequestUrl):
return {
'response': response,
'url': url,
}


@inlineCallbacks
def test_skip_downloads(settings):
item, url, crawler = yield crawl_single_item(
@@ -327,6 +340,18 @@ def test_skip_downloads(settings):
assert crawler.stats.get_stats().get('downloader/response_count', 0) == 1


@inlineCallbacks
def test_skip_download_request_url(settings):
item, url, crawler = yield crawl_single_item(
RequestUrlSpider, ProductHtml, settings)
assert isinstance(item['response'], Response) is True
assert isinstance(item['response'], DummyResponse) is True
assert isinstance(item['url'], RequestUrl)
assert str(item['url']) == url
assert crawler.stats.get_stats().get('downloader/request_count', 0) == 0
assert crawler.stats.get_stats().get('downloader/response_count', 0) == 1


@mock.patch("scrapy_poet.injection.SqlitedictCache", spec=SqlitedictCache)
def test_cache_closed_on_spider_close(mock_sqlitedictcache, settings):
def get_middleware(settings):