2 changes: 1 addition & 1 deletion docs/conf.py
@@ -61,7 +61,7 @@
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None
language = 'en'

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
12 changes: 7 additions & 5 deletions scrapy_poet/middleware.py
@@ -3,7 +3,7 @@
are executed.
"""
import logging
from typing import Optional, Type, TypeVar
from typing import Generator, Optional, Type, TypeVar

from scrapy import Spider, signals
from scrapy.crawler import Crawler
@@ -14,15 +14,16 @@

from .api import DummyResponse
from .overrides import OverridesRegistry
from .page_input_providers import HttpResponseProvider
from .page_input_providers import HttpResponseProvider, RequestUrlProvider
from .injection import Injector


logger = logging.getLogger(__name__)


DEFAULT_PROVIDERS = {
HttpResponseProvider: 500
HttpResponseProvider: 500,
RequestUrlProvider: 600,
}

InjectionMiddlewareTV = TypeVar("InjectionMiddlewareTV", bound="InjectionMiddleware")
@@ -72,11 +73,12 @@ def process_request(self, request: Request, spider: Spider) -> Optional[DummyRes
return None

logger.debug(f"Using DummyResponse instead of downloading {request}")
self.crawler.stats.inc_value("downloader/request_count/skipped")
return DummyResponse(url=request.url, request=request)

@inlineCallbacks
def process_response(self, request: Request, response: Response,
spider: Spider) -> Response:
spider: Spider) -> Generator[None, None, Response]:
"""This method fills ``request.cb_kwargs`` with instances for
the required Page Objects found in the callback signature.

@@ -92,7 +94,7 @@ def process_response(self, request: Request, response: Response,
# Find out the dependencies
final_kwargs = yield from self.injector.build_callback_dependencies(
request,
response
response,
)
# Fill the callback arguments with the created instances
for arg, value in final_kwargs.items():
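Side note on the ``DEFAULT_PROVIDERS`` change above: the dict maps provider classes to priority numbers, in the spirit of Scrapy middleware orders, and these defaults are merged with project configuration when the injector builds its provider registry. A minimal sketch of registering an extra provider from a project's ``settings.py``, assuming the ``SCRAPY_POET_PROVIDERS`` setting and a made-up ``MyProvider`` class (neither appears in this diff):

# settings.py (hypothetical sketch)
from myproject.providers import MyProvider  # made-up provider class

SCRAPY_POET_PROVIDERS = {
    MyProvider: 700,  # priority; the built-in providers in this PR use 500 and 600
}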
13 changes: 12 additions & 1 deletion scrapy_poet/page_input_providers.py
@@ -19,7 +19,7 @@
from scrapy.utils.request import request_fingerprint

from scrapy_poet.injection_errors import MalformedProvidedClassesError
from web_poet import HttpResponse, HttpResponseHeaders
from web_poet import HttpResponse, HttpResponseHeaders, RequestUrl


class PageObjectInputProvider:
@@ -197,3 +197,14 @@ def deserialize(self, data: Any) -> Sequence[Any]:
)
for response_data in data
]


class RequestUrlProvider(PageObjectInputProvider):
"""This class provides ``web_poet.page_inputs.RequestUrl`` instances."""

provided_classes = {RequestUrl}
name = "request_url"

def __call__(self, to_provide: Set[Callable], request: Request):
"""Builds a ``RequestUrl`` instance using a Scrapy ``Request``"""
return [RequestUrl(url=request.url)]
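
As a quick illustration of what this new provider enables (the tests further down exercise the same idea through the middleware): a Page Object can declare ``RequestUrl`` as its only dependency, in which case scrapy-poet can build it without downloading the page at all. A hedged sketch, with a made-up page object name and item field:

import attr
from web_poet import ItemPage
from web_poet.page_inputs import RequestUrl


@attr.s(auto_attribs=True)
class LinkOnlyPage(ItemPage):
    # Depends only on RequestUrl, so no HttpResponse has to be fetched.
    url: RequestUrl

    def to_item(self):
        # RequestUrl stringifies to the raw request URL.
        return {"url": str(self.url)}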
4 changes: 2 additions & 2 deletions tests/test_injection.py
@@ -266,11 +266,11 @@ def callback(response: DummyResponse,

class Html(Injectable):
url = "http://example.com"
html = """<html><body>Price: <span class="price">22</span>€</body></html>"""
text = """<html><body>Price: <span class="price">22</span>€</body></html>"""

@property
def selector(self):
return parsel.Selector(self.html)
return parsel.Selector(self.text)


class EurDollarRate(Injectable):
59 changes: 58 additions & 1 deletion tests/test_middleware.py
@@ -25,7 +25,7 @@
PageObjectInputProvider
)
from web_poet import default_registry
from web_poet.page_inputs import HttpResponse
from web_poet.page_inputs import HttpResponse, RequestUrl
from scrapy_poet import DummyResponse
from tests.utils import (HtmlResource,
crawl_items,
@@ -317,13 +317,70 @@ def test_skip_downloads(settings):
assert isinstance(item['response'], Response) is True
assert isinstance(item['response'], DummyResponse) is False
assert crawler.stats.get_stats().get('downloader/request_count', 0) == 1
assert crawler.stats.get_stats().get('downloader/request_count/skipped', 0) == 0
assert crawler.stats.get_stats().get('downloader/response_count', 0) == 1

item, url, crawler = yield crawl_single_item(
SkipDownloadSpider, ProductHtml, settings)
assert isinstance(item['response'], Response) is True
assert isinstance(item['response'], DummyResponse) is True
assert crawler.stats.get_stats().get('downloader/request_count', 0) == 0
assert crawler.stats.get_stats().get('downloader/request_count/skipped', 0) == 1
assert crawler.stats.get_stats().get('downloader/response_count', 0) == 1


class RequestUrlSpider(scrapy.Spider):
url = None

def start_requests(self):
yield Request(url=self.url, callback=self.parse)

def parse(self, response: DummyResponse, url: RequestUrl):
return {
'response': response,
'url': url,
}


@inlineCallbacks
def test_skip_download_request_url(settings):
item, url, crawler = yield crawl_single_item(
RequestUrlSpider, ProductHtml, settings)
assert isinstance(item['response'], Response) is True
assert isinstance(item['response'], DummyResponse) is True
assert isinstance(item['url'], RequestUrl)
assert str(item['url']) == url
assert crawler.stats.get_stats().get('downloader/request_count', 0) == 0
assert crawler.stats.get_stats().get('downloader/request_count/skipped', 0) == 1
assert crawler.stats.get_stats().get('downloader/response_count', 0) == 1


@attr.s(auto_attribs=True)
class RequestUrlPage(ItemPage):
url: RequestUrl

def to_item(self):
return {'url': self.url}


class RequestUrlPageSpider(scrapy.Spider):
url = None

def start_requests(self):
yield Request(url=self.url, callback=self.parse)

def parse(self, response: DummyResponse, page: RequestUrlPage):
return page.to_item()


@inlineCallbacks
def test_skip_download_request_url_page(settings):
item, url, crawler = yield crawl_single_item(
RequestUrlPageSpider, ProductHtml, settings)
assert tuple(item.keys()) == ('url',)
assert str(item['url']) == url
assert crawler.stats.get_stats().get('downloader/request_count', 0) == 0
assert crawler.stats.get_stats().get('downloader/request_count/skipped', 0) == 1
assert crawler.stats.get_stats().get('downloader/response_count', 0) == 1

