diff --git a/docs/api_reference.rst b/docs/api_reference.rst
index ce58b44e..e6f54966 100644
--- a/docs/api_reference.rst
+++ b/docs/api_reference.rst
@@ -11,6 +11,14 @@ API
    :members:
    :no-special-members:
 
+.. autoclass:: scrapy_poet.PickFields
+   :members:
+   :no-special-members:
+
+.. autoclass:: scrapy_poet.NotPickFields
+   :members:
+   :no-special-members:
+
 Injection Middleware
 ====================
 
diff --git a/scrapy_poet/__init__.py b/scrapy_poet/__init__.py
index 5a8d717f..51ba228b 100644
--- a/scrapy_poet/__init__.py
+++ b/scrapy_poet/__init__.py
@@ -1,4 +1,4 @@
-from .api import DummyResponse, callback_for
+from .api import DummyResponse, NotPickFields, PickFields, callback_for
 from .downloadermiddlewares import InjectionMiddleware
 from .page_input_providers import (
     CacheDataProviderMixin,
diff --git a/scrapy_poet/api.py b/scrapy_poet/api.py
index ca2e6db0..8c6a76f2 100644
--- a/scrapy_poet/api.py
+++ b/scrapy_poet/api.py
@@ -130,3 +130,36 @@ def parse(*args, item: page_or_item_cls, **kwargs):  # type:ignore
 
     setattr(parse, _CALLBACK_FOR_MARKER, True)
     return parse
+
+
+class _FieldController:
+    def __init__(self, *args: str):
+        self.fields = tuple(args)
+
+
+class PickFields(_FieldController):
+    """Used alongside :class:`typing.Annotated` to declare an **inclusion**
+    list of the fields to populate in an item.
+
+    It accepts an arbitrary number of field names as strings.
+
+    .. code-block:: python
+
+        Annotated[BigItem, PickFields("x", "y")]
+    """
+
+    pass
+
+
+class NotPickFields(_FieldController):
+    """Used alongside :class:`typing.Annotated` to declare an **exclusion**
+    list of fields: the named fields are *not* populated in the item.
+
+    It accepts an arbitrary number of field names as strings.
+
+    .. code-block:: python
+
+        Annotated[BigItem, NotPickFields("x", "y")]
+    """
+
+    pass
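Between the API additions above and the injection changes below, here is how the
two annotations are meant to be used from a spider callback. This is an
illustrative sketch, not part of the patch: ``MyItem`` and ``MySpider`` are
hypothetical, a page object returning ``MyItem`` is assumed to be registered via
``@handle_urls``, and ``Annotated`` comes from ``typing_extensions`` on
Python < 3.9.

.. code-block:: python

    from typing import Optional

    import attrs
    import scrapy
    from typing_extensions import Annotated

    from scrapy_poet import NotPickFields, PickFields


    @attrs.define
    class MyItem:
        name: Optional[str] = None
        price: Optional[str] = None
        description: Optional[str] = None


    class MySpider(scrapy.Spider):
        name = "example"

        # Only "name" and "price" are populated; the page object's
        # .to_item() is never called.
        def parse_picked(
            self, response, item: Annotated[MyItem, PickFields("name", "price")]
        ):
            yield item

        # All fields except "description" are populated.
        def parse_unpicked(
            self, response, item: Annotated[MyItem, NotPickFields("description")]
        ):
            yield item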
diff --git a/scrapy_poet/injection.py b/scrapy_poet/injection.py
index 9fbf4038..46e2508a 100644
--- a/scrapy_poet/injection.py
+++ b/scrapy_poet/injection.py
@@ -26,6 +26,7 @@
     UndeclaredProvidedTypeError,
 )
 from scrapy_poet.page_input_providers import PageObjectInputProvider
+from scrapy_poet.utils import _normalize_annotated_cls
 
 from .utils import create_registry_instance, get_scrapy_data_path
 
@@ -155,10 +156,18 @@ def build_instances(self, request: Request, response: Response, plan: andi.Plan)
         )
         # All the remaining dependencies are internal so they can be built just
         # following the andi plan.
-        for cls, kwargs_spec in plan.dependencies:
+        for raw_cls, kwargs_spec in plan.dependencies:
+            # Handle both typing.Annotated[cls, PickFields(...)] and plain cls.
+            cls = _normalize_annotated_cls(raw_cls)
+
             if cls not in instances.keys():
                 instances[cls] = cls(**kwargs_spec.kwargs(instances))
 
+            # andi may still look the dependency up under its full
+            # typing.Annotated[cls, PickFields(...)] key, so alias the instance.
+            if raw_cls not in instances.keys():
+                instances[raw_cls] = instances[cls]
+
         return instances
 
     @inlineCallbacks
@@ -226,7 +235,11 @@ def build_instances_from_providers(
                     raise
 
             objs_by_type: Dict[Callable, Any] = {type(obj): obj for obj in objs}
-            extra_classes = objs_by_type.keys() - provided_classes
+            extra_classes = objs_by_type.keys() - (
+                # Ensure that cls from typing.Annotated[cls, PickFields(...)]
+                # is what is used in the comparison.
+                {_normalize_annotated_cls(p) for p in provided_classes}
+            )
             if extra_classes:
                 raise UndeclaredProvidedTypeError(
                     f"{provider} has returned instances of types {extra_classes} "
diff --git a/scrapy_poet/page_input_providers.py b/scrapy_poet/page_input_providers.py
index b8a165b5..661f90b8 100644
--- a/scrapy_poet/page_input_providers.py
+++ b/scrapy_poet/page_input_providers.py
@@ -35,17 +35,21 @@
     HttpClient,
     HttpResponse,
     HttpResponseHeaders,
+    ItemPage,
     PageParams,
     RequestUrl,
     ResponseUrl,
 )
+from web_poet.fields import item_from_fields_sync
 from web_poet.pages import is_injectable
+from web_poet.utils import ensure_awaitable
 
 from scrapy_poet.downloader import create_scrapy_downloader
 from scrapy_poet.injection_errors import (
     MalformedProvidedClassesError,
     ProviderDependencyDeadlockError,
 )
+from scrapy_poet.utils import _derive_fields, _normalize_annotated_cls
 
 
 class PageObjectInputProvider:
@@ -301,6 +305,7 @@
         """If the item is in any of the ``to_return`` in the rules, then it can
         definitely provide by using the corresponding page object in ``use``.
         """
+        cls = _normalize_annotated_cls(cls)
         return isclass(cls) and self.registry.search(to_return=cls)
 
     def update_cache(self, request: Request, mapping: Dict[Type, Any]) -> None:
@@ -318,7 +323,9 @@
         response: Response,
     ) -> List[Any]:
         results = []
-        for cls in to_provide:
+        for raw_item_cls in to_provide:
+            cls = _normalize_annotated_cls(raw_item_cls)
+
             item = self.get_from_cache(request, cls)
             if item:
                 results.append(item)
@@ -349,10 +356,28 @@
                 )
 
             page_object = po_instances[page_object_cls]
-            item = await page_object.to_item()
+            item = await self._produce_item(raw_item_cls, page_object)
             self.update_cache(request, po_instances)
             self.update_cache(request, {type(item): item})
 
             results.append(item)
 
         return results
+
+    async def _produce_item(self, cls_or_annotated: Any, page_object: ItemPage) -> Any:
+        field_names = _derive_fields(cls_or_annotated, page_object)
+        if field_names:
+            item_dict = item_from_fields_sync(
+                page_object, item_cls=dict, skip_nonitem_fields=False
+            )
+            item_cls = _normalize_annotated_cls(cls_or_annotated)
+            item = item_cls(
+                **{
+                    name: await ensure_awaitable(item_dict[name])
+                    for name in item_dict
+                    if name in field_names
+                }
+            )
+            return item
+
+        return await page_object.to_item()
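For reference, the normalization both of these call sites rely on behaves as
below (an illustrative, REPL-style sketch, not part of the patch; ``MyItem`` is
hypothetical):

.. code-block:: python

    from typing_extensions import Annotated

    from scrapy_poet import PickFields
    from scrapy_poet.utils import _normalize_annotated_cls


    class MyItem:
        pass


    # An annotated alias is unwrapped down to the underlying item class...
    assert _normalize_annotated_cls(Annotated[MyItem, PickFields("x")]) is MyItem
    # ...while a plain (non-annotated) class passes through unchanged.
    assert _normalize_annotated_cls(MyItem) is MyItem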
diff --git a/scrapy_poet/utils/__init__.py b/scrapy_poet/utils/__init__.py
index 88a2617c..dc7a7a55 100644
--- a/scrapy_poet/utils/__init__.py
+++ b/scrapy_poet/utils/__init__.py
@@ -1,11 +1,19 @@
 import os
-from typing import Type
+from typing import Any, Collection, Optional, Type
 from warnings import warn
 
+try:
+    # typing.Annotated can't be used with isinstance(); _AnnotatedAlias is
+    # the runtime type of Annotated[...] instances.
+    from typing import _AnnotatedAlias as Annotated  # Python 3.9+
+except ImportError:
+    from typing_extensions import _AnnotatedAlias as Annotated
+
 from scrapy.crawler import Crawler
 from scrapy.http import Request, Response
 from scrapy.utils.project import inside_project, project_data_dir
-from web_poet import HttpRequest, HttpResponse, HttpResponseHeaders
+from web_poet import HttpRequest, HttpResponse, HttpResponseHeaders, ItemPage
+from web_poet.fields import get_fields_dict
+
+from scrapy_poet.api import NotPickFields, PickFields
 
 
 def get_scrapy_data_path(createdir: bool = True, default_dir: str = ".scrapy") -> str:
@@ -48,6 +56,57 @@ def scrapy_response_to_http_response(response: Response) -> HttpResponse:
     )
 
 
+def get_origin(cls: Any) -> Any:
+    """Offers backward compatibility for Python 3.7, since ``typing.get_origin(tp)``
+    is only available starting from Python 3.8.
+
+    Moreover, ``typing_extensions.get_origin`` doesn't work well with
+    ``typing_extensions.Annotated``.
+    """
+    return getattr(cls, "__origin__", ())
+
+
+def _normalize_annotated_cls(cls: Any) -> Any:
+    """Returns the type ``T`` in ``typing.Annotated[T, x]``, if applicable.
+
+    See: https://peps.python.org/pep-0593/
+    """
+    if isinstance(cls, Annotated):
+        cls = get_origin(cls)
+    return cls
+
+
+def _derive_fields(annotation: Any, page_obj: ItemPage) -> Optional[Collection[str]]:
+    """Returns a collection of field names to extract from the page object,
+    based on the annotation metadata attached to its item:
+
+    - ``typing.Annotated[T, PickFields(x, ...)]``
+    - ``typing.Annotated[T, NotPickFields(x, ...)]``
+    """
+
+    if not isinstance(annotation, Annotated):
+        return None
+
+    def _use_fields(_fields, values):
+        if _fields:
+            raise ValueError("PickFields and NotPickFields should not be used together")
+        return values
+
+    fields = []
+
+    for metadata in annotation.__metadata__:
+        if isinstance(metadata, PickFields):
+            fields = _use_fields(fields, metadata.fields)
+
+        if isinstance(metadata, NotPickFields):
+            if metadata.fields:
+                fields = _use_fields(
+                    fields, get_fields_dict(page_obj).keys() - set(metadata.fields)
+                )
+
+    return fields
+
+
 def create_registry_instance(cls: Type, crawler: Crawler):
     if "SCRAPY_POET_OVERRIDES" in crawler.settings:
         msg = (
diff --git a/setup.py b/setup.py
index e5a0cde7..7cf4216e 100755
--- a/setup.py
+++ b/setup.py
@@ -25,6 +25,7 @@
         "twisted >= 18.9.0",
         "url-matcher >= 0.2.0",
         "web-poet >= 0.7.0",
+        "typing_extensions >= 4.4.0; python_version<'3.9'",
     ],
     classifiers=[
         "Development Status :: 3 - Alpha",
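Before moving on to the tests, the contract of ``_derive_fields()`` can be
summarized as follows (an illustrative sketch, not part of the patch; ``page``
is assumed to be an instance of a page object whose item exposes the fields
``x``, ``y`` and ``z``, like the ``BigPage`` fixture added below, and
``BigItem`` is that fixture's item class):

.. code-block:: python

    from typing_extensions import Annotated

    from scrapy_poet import NotPickFields, PickFields
    from scrapy_poet.utils import _derive_fields

    # PickFields: the inclusion list is returned as-is.
    assert _derive_fields(Annotated[BigItem, PickFields("x")], page) == ("x",)

    # NotPickFields: all of the page object's fields minus the exclusions.
    assert _derive_fields(Annotated[BigItem, NotPickFields("x")], page) == {"y", "z"}

    # No Annotated metadata at all: None, i.e. fall back to .to_item().
    assert _derive_fields(BigItem, page) is None

    # Mixing both annotations is rejected.
    try:
        _derive_fields(Annotated[BigItem, PickFields("x"), NotPickFields("y")], page)
    except ValueError:
        pass  # "PickFields and NotPickFields should not be used together"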
diff --git a/tests/test_web_poet_rules.py b/tests/test_web_poet_rules.py
index 8665b567..5d938c03 100644
--- a/tests/test_web_poet_rules.py
+++ b/tests/test_web_poet_rules.py
@@ -8,12 +8,13 @@
 import socket
 import warnings
 from collections import defaultdict
-from typing import Any, Dict, List, Tuple, Type
+from typing import Any, Dict, List, Optional, Tuple, Type
 
 import attrs
 import pytest
 import scrapy
 from pytest_twisted import inlineCallbacks
+from typing_extensions import Annotated
 from url_matcher import Patterns
 from url_matcher.util import get_domain
 from web_poet import (
@@ -28,7 +29,7 @@
 )
 from web_poet.pages import ItemT
 
-from scrapy_poet import callback_for
+from scrapy_poet import NotPickFields, PickFields, callback_for
 from scrapy_poet.downloadermiddlewares import DEFAULT_PROVIDERS
 from scrapy_poet.utils.mockserver import get_ephemeral_port
 from scrapy_poet.utils.testing import (
@@ -51,7 +52,6 @@ def rules_settings() -> dict:
 
 def spider_for(injectable: Type):
     class InjectableSpider(scrapy.Spider):
-        url = None
 
         custom_settings = {
             "SCRAPY_POET_PROVIDERS": DEFAULT_PROVIDERS,
@@ -100,19 +100,22 @@ async def to_item(self) -> ItemT:
 
 
 @inlineCallbacks
-def crawl_item_and_deps(PageObject) -> Tuple[Any, Any]:
+def crawl_item_and_deps(
+    page_object: Optional[Type[ItemPage]], spider: Optional[Type[scrapy.Spider]] = None
+) -> Tuple[Any, Any]:
     """Helper function to easily return the item and injected dependencies from
     a simulated Scrapy callback which asks for either of these dependencies:
 
     - page object
     - item class
     """
+    spider = spider or spider_for(page_object)
     item, _, crawler = yield crawl_single_item(
-        spider_for(PageObject), ProductHtml, rules_settings(), port=PORT
+        spider, ProductHtml, rules_settings(), port=PORT
     )
     return item, crawler.spider.collected_response_deps
 
 
-def assert_deps(deps: List[Dict[str, Any]], expected: Dict[str, Any], size: int = 1):
+def assert_deps(deps: List[Dict[str, Any]], expected: Dict[str, Any]):
     """Helper for easily checking the instances of the ``deps`` returned by
     ``crawl_item_and_deps()``.
 
@@ -120,8 +123,7 @@ def assert_deps(deps: List[Dict[str, Any]], expected: Dict[str, Any], size: int = 1):
     that is passed to the spider callback. Currently, either "page" or "item"
     are supported as keys since ``scrapy_poet.callback`` is used.
     """
-    assert len(deps) == size
-    if size == 0:
+    if not deps:
         return
 
     # Only checks the first element for now since it's used alongside crawling
@@ -1387,6 +1389,227 @@ def test_page_object_with_item_dependency_deadlock_2_d(caplog) -> None:
     assert "ProviderDependencyDeadlockError" in caplog.text
 
 
+@attrs.define
+class BigItem:
+    x: Optional[str] = None
+    y: Optional[str] = None
+    z: Optional[str] = None
+
+
+@handle_urls(URL)
+@attrs.define
+class BigPage(PageObjectCounterMixin, ItemPage[BigItem]):
+    @field
+    def x(self) -> str:
+        return "x"
+
+    @field
+    def y(self) -> str:
+        return "y"
+
+    @field
+    def z(self) -> str:
+        return "z"
+
+
+class BaseSpiderForTest(scrapy.Spider):
+    name = "testing_spider"
+    url = None
+    custom_settings = {"SCRAPY_POET_PROVIDERS": DEFAULT_PROVIDERS}
+
+    def start_requests(self):
+        yield scrapy.Request(self.url, capture_exceptions(self.parse_item))
+
+
+@inlineCallbacks
+def test_pick_fields() -> None:
+    """Spider callbacks annotated with ``PickFields`` should only populate the
+    requested fields and completely avoid calling ``.to_item()``.
+    """
+
+    class Spider(BaseSpiderForTest):
+        def parse_item(self, response, item: Annotated[BigItem, PickFields("x", "y")]):
+            yield item
+
+    PageObjectCounterMixin.clear()
+    item, deps = yield crawl_item_and_deps(None, Spider)
+    assert item == BigItem(x="x", y="y")
+    assert_deps(deps, {"item": BigItem})
+    PageObjectCounterMixin.assert_instance_count(1, BigPage)
+    assert BigPage.to_item_call_count == 0
+
+
+@inlineCallbacks
+def test_pick_fields_with_other_metadata() -> None:
+    """Any other annotations inside ``Annotated`` are ignored by the
+    ``ItemProvider``.
+
+    Reference: https://peps.python.org/pep-0593/#consuming-annotations
+    """
+
+    class Spider(BaseSpiderForTest):
+        def parse_item(
+            self,
+            response,
+            item: Annotated[BigItem, PickFields("x", "y"), None, "random"],
+        ):
+            yield item
+
+    PageObjectCounterMixin.clear()
+    item, deps = yield crawl_item_and_deps(None, Spider)
+    assert item == BigItem(x="x", y="y")
+    assert_deps(deps, {"item": BigItem})
+    PageObjectCounterMixin.assert_instance_count(1, BigPage)
+    assert BigPage.to_item_call_count == 0
+
+
+@inlineCallbacks
+def test_pick_fields_empty() -> None:
+    """Same as ``test_pick_fields()``, but with no fields declared inside
+    ``PickFields()``.
+
+    In this case, the annotation is ignored.
+    """
+
+    class Spider(BaseSpiderForTest):
+        def parse_item(self, response, item: Annotated[BigItem, PickFields()]):
+            yield item
+
+    PageObjectCounterMixin.clear()
+    item, deps = yield crawl_item_and_deps(None, Spider)
+    assert item == BigItem(x="x", y="y", z="z")
+    assert_deps(deps, {"item": BigItem})
+    PageObjectCounterMixin.assert_instance_count(1, BigPage)
+    assert BigPage.to_item_call_count == 1
+
+
+@inlineCallbacks
+def test_pick_fields_not_available() -> None:
+    """When a field specified in ``PickFields()`` is not supported by the page
+    object, it is simply ignored.
+    """
+
+    class Spider(BaseSpiderForTest):
+        def parse_item(self, response, item: Annotated[BigItem, PickFields("x", "na")]):
+            yield item
+
+    PageObjectCounterMixin.clear()
+    item, deps = yield crawl_item_and_deps(None, Spider)
+    assert item == BigItem(x="x")
+    assert_deps(deps, {"item": BigItem})
+    PageObjectCounterMixin.assert_instance_count(1, BigPage)
+    assert BigPage.to_item_call_count == 0
+
+
+@inlineCallbacks
+def test_not_pick_fields() -> None:
+    """Spider callbacks annotated with ``NotPickFields`` should NOT populate the
+    indicated fields and completely avoid calling ``.to_item()``.
+    """
+
+    class Spider(BaseSpiderForTest):
+        def parse_item(
+            self, response, item: Annotated[BigItem, NotPickFields("x", "y")]
+        ):
+            yield item
+
+    PageObjectCounterMixin.clear()
+    item, deps = yield crawl_item_and_deps(None, Spider)
+    assert item == BigItem(z="z")
+    assert_deps(deps, {"item": BigItem})
+    PageObjectCounterMixin.assert_instance_count(1, BigPage)
+    assert BigPage.to_item_call_count == 0
+
+
+@inlineCallbacks
+def test_not_pick_fields_with_other_metadata() -> None:
+    """Any other annotations inside ``Annotated`` are ignored by the
+    ``ItemProvider``.
+
+    Reference: https://peps.python.org/pep-0593/#consuming-annotations
+    """
+
+    class Spider(BaseSpiderForTest):
+        def parse_item(
+            self,
+            response,
+            item: Annotated[BigItem, NotPickFields("x", "y"), None, "random"],
+        ):
+            yield item
+
+    PageObjectCounterMixin.clear()
+    item, deps = yield crawl_item_and_deps(None, Spider)
+    assert item == BigItem(z="z")
+    assert_deps(deps, {"item": BigItem})
+    PageObjectCounterMixin.assert_instance_count(1, BigPage)
+    assert BigPage.to_item_call_count == 0
+
+
+@inlineCallbacks
+def test_not_pick_fields_empty() -> None:
+    """Same as ``test_not_pick_fields()``, but with no fields declared inside
+    ``NotPickFields()``.
+
+    In this case, the annotation is ignored.
+    """
+
+    class Spider(BaseSpiderForTest):
+        def parse_item(self, response, item: Annotated[BigItem, NotPickFields()]):
+            yield item
+
+    PageObjectCounterMixin.clear()
+    item, deps = yield crawl_item_and_deps(None, Spider)
+    assert item == BigItem(x="x", y="y", z="z")
+    assert_deps(deps, {"item": BigItem})
+    PageObjectCounterMixin.assert_instance_count(1, BigPage)
+    assert BigPage.to_item_call_count == 1
+
+
+@inlineCallbacks
+def test_not_pick_fields_not_available() -> None:
+    """When a field specified in ``NotPickFields()`` is not supported by the
+    page object, it is simply ignored.
+    """
+
+    class Spider(BaseSpiderForTest):
+        def parse_item(
+            self, response, item: Annotated[BigItem, NotPickFields("z", "na")]
+        ):
+            yield item
+
+    PageObjectCounterMixin.clear()
+    item, deps = yield crawl_item_and_deps(None, Spider)
+    assert item == BigItem(x="x", y="y")
+    assert_deps(deps, {"item": BigItem})
+    PageObjectCounterMixin.assert_instance_count(1, BigPage)
+    assert BigPage.to_item_call_count == 0
+
+
+@inlineCallbacks
+def test_conflict_pick_fields(caplog) -> None:
+    """Using both ``PickFields()`` and ``NotPickFields()`` in the same
+    annotation should raise a ValueError.
+    """
+
+    class Spider(BaseSpiderForTest):
+        def parse_item(
+            self,
+            response,
+            item: Annotated[BigItem, PickFields("x"), NotPickFields("y")],
+        ):
+            yield item
+
+    PageObjectCounterMixin.clear()
+    item, deps = yield crawl_item_and_deps(None, Spider)
+    assert (
+        "ValueError: PickFields and NotPickFields should not be used together"
+    ) in caplog.text
+    assert item is None
+    assert_deps(deps, {})
+    PageObjectCounterMixin.assert_instance_count(1, BigPage)
+    assert BigPage.to_item_call_count == 0
+
+
 def test_created_apply_rules() -> None:
     """Checks if the ``ApplyRules`` were created properly by ``@handle_urls`` in
     ``tests/po_lib/__init__.py``.
@@ -1476,4 +1699,5 @@ def test_created_apply_rules() -> None:
         ApplyRule(URL, use=EggDeadlockPage, to_return=EggItem),
         ApplyRule(URL, use=Chicken2DeadlockPage, to_return=Chicken2Item),
         ApplyRule(URL, use=Egg2DeadlockPage, to_return=Egg2Item),
+        ApplyRule(URL, use=BigPage, to_return=BigItem),
     ]
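To summarize the behavior pinned down by the tests above (an illustrative
mapping, reusing the ``BigItem``/``BigPage`` fixtures; each annotation is shown
with the item the ``ItemProvider`` ends up producing):

.. code-block:: python

    from typing_extensions import Annotated

    from scrapy_poet import NotPickFields, PickFields

    Annotated[BigItem, PickFields("x", "y")]     # -> BigItem(x="x", y="y")
    Annotated[BigItem, PickFields()]             # -> full item, via .to_item()
    Annotated[BigItem, PickFields("x", "na")]    # -> BigItem(x="x"); unknown names dropped
    Annotated[BigItem, NotPickFields("x", "y")]  # -> BigItem(z="z")
    Annotated[BigItem, NotPickFields()]          # -> full item, via .to_item()

    # Mixing both raises ValueError at dependency-resolution time:
    Annotated[BigItem, PickFields("x"), NotPickFields("y")]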