Merged

80 commits
a638aec
initial integration of to_return from web_poet
BurnzZ Oct 12, 2022
ee30808
fix tests regarding expectations for param in rule
BurnzZ Oct 13, 2022
0452173
warn the user when the same URL pattern is present in the rule
BurnzZ Oct 13, 2022
e51a63d
add test case for when 'instead_of' and 'to_return' are both present
BurnzZ Oct 19, 2022
6c55de0
simplify tests and assert injected dependencies in the callback
BurnzZ Oct 31, 2022
3117530
add test case focusing on URL presence in the rules
BurnzZ Nov 1, 2022
3a69c83
properly test UndeclaredProvidedTypeError
BurnzZ Nov 1, 2022
a38cb06
refactor solution to resolve item dependencies using providers
BurnzZ Nov 3, 2022
4134457
fix typing for callback_for()
BurnzZ Nov 3, 2022
213549a
Merge branch 'master' of ssh://github.com/scrapinghub/scrapy-poet int…
BurnzZ Nov 22, 2022
9a00b63
move test utilies into scrapy_poet/utils/
BurnzZ Nov 23, 2022
49136cb
create recursive dependency resolution
BurnzZ Nov 24, 2022
a2260d7
add more test cases
BurnzZ Nov 29, 2022
9816f42
update ItemProvider to dynamically handle its dependency signature
BurnzZ Nov 30, 2022
86b7a97
code cleanup and fix some tests
BurnzZ Nov 30, 2022
7b8c7f2
Merge branch 'master' of ssh://github.com/scrapinghub/scrapy-poet int…
BurnzZ Nov 30, 2022
20f51a6
detect and raise errors on deadlocks
BurnzZ Nov 30, 2022
4b60fa9
fix failing injector test
BurnzZ Nov 30, 2022
caa1be6
ensure that provider dependencies are cached
BurnzZ Nov 30, 2022
ae05e90
modify deadlock detection to a simple try-except
BurnzZ Dec 1, 2022
d6a33a4
fix failing test_injection.py tests
BurnzZ Dec 1, 2022
a4cff73
ensure that .to_item() methods are only called once
BurnzZ Dec 1, 2022
6bc839f
add a test with a deeper dependency tree
BurnzZ Dec 1, 2022
4aedf16
test duplicate dependencies
BurnzZ Dec 1, 2022
56028d7
fix missing tests and imports
BurnzZ Dec 1, 2022
41ff13e
deprecate passing tuples in SCRAPY_POET_OVERRIDES and the Registry wi…
BurnzZ Dec 2, 2022
2ec6414
refactor Injector to simplify recursive dependency resolution of items
BurnzZ Dec 5, 2022
f3fb32d
polish code and tests
BurnzZ Dec 6, 2022
544236f
fix failing mypy and polish code
BurnzZ Dec 6, 2022
29f40ab
update CHANGELOG with new item class support
BurnzZ Dec 6, 2022
66f0c90
fix typo in CHANGELOG
BurnzZ Dec 6, 2022
2697ab0
improve test_web_poet_rules.py
BurnzZ Dec 6, 2022
35b0c8d
polishing comments and typing
BurnzZ Dec 9, 2022
d2beaf8
mention backward incompatible changes in CHANGELOG
BurnzZ Dec 12, 2022
d046903
Merge branch 'master' of ssh://github.com/scrapinghub/scrapy-poet int…
BurnzZ Dec 16, 2022
8f1450a
deprecate some settings, modules, and parameters to be overrides-agno…
BurnzZ Dec 16, 2022
6f0d36e
update documentation in line with the new Item Return functionality
BurnzZ Dec 16, 2022
77cf77c
update tutorial with more explanation on how Item Return works
BurnzZ Dec 16, 2022
efbdb66
update CHANGELOG to mention other backward incompatible changes
BurnzZ Dec 21, 2022
9b4cd48
add and improve docstrings, typing, and warning msgs
BurnzZ Dec 21, 2022
5d2f0f9
move some functions to new scrapy_poet.utils.testing module
BurnzZ Dec 21, 2022
58577a8
Merge branch 'master' of ssh://github.com/scrapinghub/scrapy-poet int…
BurnzZ Dec 21, 2022
afc04e9
Apply improvements from code review
BurnzZ Dec 21, 2022
4141239
prioritize newer settings than deprecated ones
BurnzZ Dec 21, 2022
dae69d8
simplify to_return doc example
BurnzZ Dec 22, 2022
ccfa9ea
fix and improve docs
BurnzZ Dec 23, 2022
e9bb33d
use DummyResponse on some examples
BurnzZ Dec 23, 2022
3667cc3
remove obsolete test
BurnzZ Dec 23, 2022
22c959d
Polish CHANGELOG from review
BurnzZ Jan 3, 2023
545e8f1
Merge branch 'master' of ssh://github.com/scrapinghub/scrapy-poet int…
BurnzZ Jan 3, 2023
83e0e84
fix missing imports in tests
BurnzZ Jan 3, 2023
47f213c
rename 'item type' → 'item class'
BurnzZ Jan 3, 2023
914a334
Merge branch 'master' of ssh://github.com/scrapinghub/scrapy-poet int…
BurnzZ Jan 4, 2023
6af1061
Fix conflicts; Merge branch 'new-web-poet' of ssh://github.com/scrapi…
BurnzZ Jan 4, 2023
190e3a6
use web-poet's _create_deprecated_class
BurnzZ Jan 6, 2023
2611199
remove incorrect line in CHANGELOG
BurnzZ Jan 6, 2023
7bd6783
remove scrapy-poet registry in lieu of web-poet's registry
BurnzZ Jan 10, 2023
3c6fdae
avoid using RulesRegistry.search() since it's slow
BurnzZ Jan 10, 2023
ef01f11
add test to check higher priority of PO subclass
BurnzZ Jan 10, 2023
f41b5c2
Merge pull request #103 from scrapinghub/to-return-override-docs
BurnzZ Jan 10, 2023
c658317
use RulesRegistry.search() again after optimizing it
BurnzZ Jan 10, 2023
3e852d7
fix doc grammar
BurnzZ Jan 11, 2023
4d25d8c
mark tests as xfail if it raises UndeclaredProvidedTypeError
BurnzZ Jan 13, 2023
e184c6f
better tests for clashing rules due to independent page objects with …
BurnzZ Jan 13, 2023
bf9b7bf
fix misleading class names
BurnzZ Jan 13, 2023
33a0391
add more tests on deadlock detection
BurnzZ Jan 13, 2023
141c495
use new web-poet==0.7.0
BurnzZ Jan 18, 2023
3d464e6
Merge branch 'master' of ssh://github.com/scrapinghub/scrapy-poet int…
BurnzZ Jan 18, 2023
8c410fe
fixed merge conflicts in CHANGELOG
BurnzZ Jan 18, 2023
00d5dd6
improve docs on settings
BurnzZ Jan 19, 2023
fd31c93
Merge branch 'master' into new-web-poet
BurnzZ Jan 19, 2023
199c46b
fix conflict in code
BurnzZ Jan 19, 2023
7c1f5f1
add test for checking deprecated SCRAPY_POET_OVERRIDES
BurnzZ Jan 19, 2023
44c6e60
add test when requesting an item but no page object
BurnzZ Jan 19, 2023
4791576
issue a warning when can't provide a page object or item for a given URL
BurnzZ Jan 19, 2023
e3b7a8e
remove support for custom registry via SCRAPY_POET_OVERRIDES_REGISTRY
BurnzZ Jan 19, 2023
0915b00
re-organize CHANGELOG
BurnzZ Jan 19, 2023
a46b1e2
fix some docs and comments for clarity
BurnzZ Jan 30, 2023
774619c
Merge branch 'master' of ssh://github.com/scrapinghub/scrapy-poet int…
BurnzZ Jan 30, 2023
140239a
bump tool versions to fix CI failure
kmike Jan 30, 2023
96 changes: 95 additions & 1 deletion CHANGELOG.rst
@@ -5,7 +5,101 @@ Changelog
TBR
---

* Official support for Python 3.11
This release enables scrapy-poet to fully support item classes as dependencies
in page objects and spider callbacks. The following is now possible:

.. code-block:: python

import attrs
import scrapy
from web_poet import WebPage, handle_urls, field
from scrapy_poet import DummyResponse

@attrs.define
class Image:
url: str

@handle_urls("example.com")
class ProductImagePage(WebPage[Image]):
@field
def url(self) -> str:
            return self.css("#product img ::attr(src)").get("")

@attrs.define
class Product:
name: str
image: Image

@handle_urls("example.com")
@attrs.define
class ProductPage(WebPage[Product]):
# ✨ NEW: Notice that the page object can ask for items as dependencies.
# An instance of ``Image`` is injected behind the scenes by calling the
# ``.to_item()`` method of ``ProductImagePage``.
image_item: Image

@field
def name(self) -> str:
return self.css("h1.name ::text").get("")

@field
def image(self) -> Image:
return self.image_item

class MySpider(scrapy.Spider):
name = "myspider"

def start_requests(self):
yield scrapy.Request(
"https://example.com/products/some-product", self.parse
)

# ✨ NEW: Notice that we're directly using the item here and not the
# page object.
def parse(self, response: DummyResponse, item: Product):
return item

In line with this, the following changes were made:

* Added a new ``scrapy_poet.page_input_providers.ItemProvider`` which makes
the usage above possible.
* Multiple changes to the ``scrapy_poet.PageObjectInputProvider`` base class
which are backward incompatible:

  * It now accepts an instance of ``scrapy_poet.injection.Injector`` in its
    constructor instead of ``scrapy.crawler.Crawler``, although the crawler
    remains accessible via the ``Injector.crawler`` attribute (see the
    provider sketch after this list).
* ``is_provided()`` is now an instance method instead of a class
method.

* ``scrapy_poet.callback_for`` now accepts an item class alongside the usual
  page object classes, so it no longer raises a ``TypeError`` when its
  argument is not a subclass of ``web_poet.ItemPage`` (see the
  ``callback_for`` sketch after this list).
* ``scrapy_poet.overrides.OverridesRegistry`` has been overhauled:

  * It now subclasses ``web_poet.RulesRegistry``, giving direct access to its
    registry methods.
* It now allows retrieval of rules based on the returned item class.
* ``OverridesRegistry`` (alongside ``SCRAPY_POET_OVERRIDES``) won't
accept tuples as rules anymore. Only ``web_poet.ApplyRule``
instances are allowed.

* As a result, the following type aliases have been removed:
``scrapy_poet.overrides.RuleAsTuple`` and
    ``scrapy_poet.overrides.RuleFromUser``.
* These changes are backward incompatible.

* New exception: ``scrapy_poet.injection_errors.ProviderDependencyDeadlockError``.
  This is raised when it's not possible to create the dependencies due to a
  deadlock in their sub-dependencies, e.g. a circular dependency between page
  objects (see the deadlock sketch after this list).
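
To make the provider constructor change concrete, here is a minimal sketch of
a custom provider under the new API (``MyCustomType`` and ``MyProvider`` are
hypothetical names used only for illustration):

.. code-block:: python

    from scrapy_poet.page_input_providers import PageObjectInputProvider


    class MyCustomType:
        pass


    class MyProvider(PageObjectInputProvider):
        provided_classes = {MyCustomType}

        def __init__(self, injector):
            # The provider now receives the Injector instead of the Crawler.
            super().__init__(injector)
            # The crawler (settings, stats, etc.) remains reachable:
            self.crawler = injector.crawler

        def __call__(self, to_provide):
            # Build and return one instance per requested class.
            return [MyCustomType()]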
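
A sketch of ``callback_for`` with an item class, reusing the ``Product`` item
from the example at the top (the spider and URL are made up):

.. code-block:: python

    import scrapy

    from scrapy_poet import callback_for


    class ProductSpider(scrapy.Spider):
        name = "product_spider"
        # The generated callback receives the already-built Product item
        # and yields it; no page object or .to_item() call in the spider.
        parse_product = callback_for(Product)

        def start_requests(self):
            yield scrapy.Request(
                "https://example.com/products/some-product", self.parse_product
            )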
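
And a minimal sketch of the kind of cycle that triggers the deadlock error,
mirroring the chicken-and-egg example from its docstring (both page objects
are hypothetical):

.. code-block:: python

    import attrs
    from web_poet import ItemPage


    @attrs.define
    class ChickenPage(ItemPage):
        # Requires EggPage, which in turn requires ChickenPage: neither
        # can be built first, so the injector raises
        # ProviderDependencyDeadlockError instead of recursing forever.
        egg_page: "EggPage"


    @attrs.define
    class EggPage(ItemPage):
        chicken_page: ChickenPage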

Other changes:

* Moved some of the utility functions from the test module into
``scrapy_poet.utils.testing``.
* Official support for Python 3.11

0.6.0 (2022-11-24)
------------------
31 changes: 6 additions & 25 deletions docs/intro/basic-tutorial.rst
@@ -432,11 +432,13 @@ are used for the domain

.. code-block:: python

from web_poet import ApplyRule

"SCRAPY_POET_OVERRIDES": [
("toscrape.com", BTSBookListPage, BookListPage),
("toscrape.com", BTSBookPage, BookPage),
("bookpage.com", BPBookListPage, BookListPage),
("bookpage.com", BPBookPage, BookPage)
ApplyRule("toscrape.com", use=BTSBookListPage, instead_of=BookListPage),
ApplyRule("toscrape.com", use=BTSBookPage, instead_of=BookPage),
ApplyRule("bookpage.com", use=BPBookListPage, instead_of=BookListPage),
ApplyRule("bookpage.com", use=BPBookPage, instead_of=BookPage)
]

The spider is now ready to extract books from both sites 😀.
@@ -452,27 +454,6 @@ for a particular domain, but more complex URL patterns are also possible.
For example, the pattern ``books.toscrape.com/catalogue/category/``
is accepted and would restrict the override to category pages only.
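
A rule with such a pattern could look like this (a sketch reusing the page
objects defined earlier in this tutorial):

.. code-block:: python

    ApplyRule(
        "books.toscrape.com/catalogue/category/",
        use=BTSBookListPage,
        instead_of=BookListPage,
    )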

It is even possible to configure more complex patterns by using the
:py:class:`web_poet.rules.ApplyRule` class instead of a triplet in
the configuration. Another way of declaring the earlier config
for ``SCRAPY_POET_OVERRIDES`` would be the following:

.. code-block:: python

from url_matcher import Patterns
from web_poet import ApplyRule


SCRAPY_POET_OVERRIDES = [
ApplyRule(for_patterns=Patterns(["toscrape.com"]), use=BTSBookListPage, instead_of=BookListPage),
ApplyRule(for_patterns=Patterns(["toscrape.com"]), use=BTSBookPage, instead_of=BookPage),
ApplyRule(for_patterns=Patterns(["bookpage.com"]), use=BPBookListPage, instead_of=BookListPage),
ApplyRule(for_patterns=Patterns(["bookpage.com"]), use=BPBookPage, instead_of=BookPage),
]

As you can see, this could get verbose. The earlier tuple config simply offers
a shortcut to be more concise.

.. note::

Also see the `url-matcher <https://url-matcher.readthedocs.io/en/stable/>`_
4 changes: 2 additions & 2 deletions docs/overrides.rst
@@ -13,9 +13,9 @@ page.
Some real-world examples on this topic can be found in:

- `Example 1 <https://github.com/scrapinghub/scrapy-poet/blob/master/example/example/spiders/books_04_overrides_01.py>`_:
rules using tuples
shorter example
- `Example 2 <https://github.com/scrapinghub/scrapy-poet/blob/master/example/example/spiders/books_04_overrides_02.py>`_:
rules using tuples and :py:class:`web_poet.ApplyRule`
longer example
- `Example 3 <https://github.com/scrapinghub/scrapy-poet/blob/master/example/example/spiders/books_04_overrides_03.py>`_:
rules using :py:func:`web_poet.handle_urls` decorator and retrieving them
via :py:meth:`web_poet.rules.RulesRegistry.get_rules`
6 changes: 3 additions & 3 deletions example/example/spiders/books_04_overrides_01.py
@@ -6,7 +6,7 @@
The default configured PO logic contains the logic for books.toscrape.com
"""
import scrapy
from web_poet import WebPage
from web_poet import ApplyRule, WebPage

from scrapy_poet import callback_for

@@ -51,8 +51,8 @@ class BooksSpider(scrapy.Spider):
    # Configuring different page objects for the bookpage.com domain
custom_settings = {
"SCRAPY_POET_OVERRIDES": [
("bookpage.com", BPBookListPage, BookListPage),
("bookpage.com", BPBookPage, BookPage),
ApplyRule("bookpage.com", use=BPBookListPage, instead_of=BookListPage),
ApplyRule("bookpage.com", use=BPBookPage, instead_of=BookPage),
]
}

18 changes: 4 additions & 14 deletions example/example/spiders/books_04_overrides_02.py
@@ -7,7 +7,6 @@
at all is applied.
"""
import scrapy
from url_matcher import Patterns
from web_poet import WebPage
from web_poet.rules import ApplyRule

@@ -63,19 +62,10 @@ class BooksSpider(scrapy.Spider):
    # Configuring different page objects for different domains
custom_settings = {
"SCRAPY_POET_OVERRIDES": [
("toscrape.com", BTSBookListPage, BookListPage),
("toscrape.com", BTSBookPage, BookPage),
# We could also use the long-form version if we want to.
ApplyRule(
for_patterns=Patterns(["bookpage.com"]),
use=BPBookListPage,
instead_of=BookListPage,
),
ApplyRule(
for_patterns=Patterns(["bookpage.com"]),
use=BPBookPage,
instead_of=BookPage,
),
ApplyRule("toscrape.com", use=BTSBookListPage, instead_of=BookListPage),
ApplyRule("toscrape.com", use=BTSBookPage, instead_of=BookPage),
ApplyRule("bookpage.com", use=BPBookListPage, instead_of=BookListPage),
ApplyRule("bookpage.com", use=BPBookPage, instead_of=BookPage),
]
}

1 change: 1 addition & 0 deletions pyproject.toml
@@ -9,6 +9,7 @@ multi_line_output = 3
module = [
"tests.test_cache.*",
"tests.test_downloader.*",
"tests.test_web_poet_rules.*",
"tests.test_scrapy_dependencies.*",
]
# Ignore this type of error since mypy expects an Iterable return
29 changes: 17 additions & 12 deletions scrapy_poet/api.py
@@ -29,8 +29,9 @@ def __init__(self, url: str, request=Optional[Request]):
super().__init__(url=url, request=request)


def callback_for(page_cls: Type[ItemPage]) -> Callable:
"""Create a callback for an :class:`web_poet.pages.ItemPage` subclass.
def callback_for(page_or_item_cls: Type) -> Callable:
"""Create a callback for an :class:`web_poet.pages.ItemPage` subclass or an
item class.

The generated callback returns the output of the
``ItemPage.to_item()`` method, i.e. extracts a single item
@@ -104,24 +105,28 @@ def parse(self, response):
disk queues, because in this case Scrapy is able to serialize
your request object.
"""
if not issubclass(page_cls, ItemPage):
raise TypeError(f"{page_cls.__name__} should be a subclass of ItemPage.")

# When the callback is used as an instance method of the spider, it expects
# to receive 'self' as its first argument. When used as a simple inline
# function, it expects to receive a response as its first argument.
#
# To avoid a TypeError, we need to receive a list of unnamed arguments and
# a dict of named arguments after our injectable.
def parse(*args, page: page_cls, **kwargs): # type: ignore
yield page.to_item() # type: ignore
if issubclass(page_or_item_cls, ItemPage):

def parse(*args, page: page_or_item_cls, **kwargs): # type: ignore
yield page.to_item() # type: ignore

async def async_parse(*args, page: page_or_item_cls, **kwargs): # type: ignore
yield await page.to_item() # type: ignore

if iscoroutinefunction(page_or_item_cls.to_item):
setattr(async_parse, _CALLBACK_FOR_MARKER, True)
return async_parse

async def async_parse(*args, page: page_cls, **kwargs): # type: ignore
yield await page.to_item() # type: ignore
else:

if iscoroutinefunction(page_cls.to_item):
setattr(async_parse, _CALLBACK_FOR_MARKER, True)
return async_parse
def parse(*args, item: page_or_item_cls, **kwargs): # type:ignore
yield item

setattr(parse, _CALLBACK_FOR_MARKER, True)
return parse
2 changes: 2 additions & 0 deletions scrapy_poet/downloadermiddlewares.py
@@ -17,6 +17,7 @@
from .page_input_providers import (
HttpClientProvider,
HttpResponseProvider,
ItemProvider,
PageParamsProvider,
RequestUrlProvider,
ResponseUrlProvider,
@@ -31,6 +32,7 @@
PageParamsProvider: 700,
RequestUrlProvider: 800,
ResponseUrlProvider: 900,
ItemProvider: 1000,
}

InjectionMiddlewareTV = TypeVar("InjectionMiddlewareTV", bound="InjectionMiddleware")
2 changes: 1 addition & 1 deletion scrapy_poet/injection.py
@@ -57,7 +57,7 @@ def load_providers(self, default_providers: Optional[Mapping] = None): # noqa:
}
provider_classes = build_component_list(providers_dict)
logger.info(f"Loading providers:\n {pprint.pformat(provider_classes)}")
self.providers = [load_object(cls)(self.crawler) for cls in provider_classes]
self.providers = [load_object(cls)(self) for cls in provider_classes]
check_all_providers_are_callable(self.providers)
# Caching whether each provider requires the scrapy response
self.is_provider_requiring_scrapy_response = {
12 changes: 12 additions & 0 deletions scrapy_poet/injection_errors.py
@@ -12,3 +12,15 @@ class UndeclaredProvidedTypeError(InjectionError):

class MalformedProvidedClassesError(InjectionError):
pass


class ProviderDependencyDeadlockError(InjectionError):
"""This is raised when it's not possible to create the dependencies due to
deadlock.

For example:
- Page object named "ChickenPage" require "EggPage" as a dependency.
- Page object named "EggPage" require "ChickenPage" as a dependency.
"""

pass