From b4ac78962ff3471f24e35e0457a337c3140e01c9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Iv=C3=A1n=20de=20Prado?=
 <ivanprado@users.noreply.github.com>
Date: Wed, 8 Dec 2021 11:51:05 +0100
Subject: [PATCH 01/19] url-matcher integration with scrapy-poet

---
 CHANGELOG.rst                                 |  2 +
 docs/conf.py                                  |  3 +-
 docs/intro/tutorial.rst                       | 44 ++++++----
 docs/overrides.rst                            | 85 +++++++++++++++++--
 .../example/spiders/books_04_overrides_01.py  | 14 ++-
 .../example/spiders/books_04_overrides_02.py  | 20 ++---
 scrapy_poet/injection.py                      |  8 +-
 scrapy_poet/middleware.py                     |  4 +-
 scrapy_poet/overrides.py                      | 74 +++++++++++++++-
 scrapy_poet/utils.py                          | 15 ----
 setup.py                                      |  3 +-
 tests/conftest.py                             |  2 -
 tests/test_injection.py                       | 19 +++--
 tests/test_middleware.py                      | 12 ++-
 tests/utils.py                                |  8 +-
 tox.ini                                       |  2 +-
 16 files changed, 224 insertions(+), 91 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index e48b029a..0943b9e9 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -6,6 +6,8 @@ TBR:
 ------------------
 
 * Cache mechanism using SCRAPY_POET_CACHE setting
+* New and richer SCRAPY_POET_OVERRIDES registry that uses the
+  url-matcher patterns to configure the overrides
 
 0.2.1 (2021-06-11)
 ------------------
diff --git a/docs/conf.py b/docs/conf.py
index 027e717f..e13717de 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -188,7 +188,8 @@
 intersphinx_mapping = {
     'python': ('https://docs.python.org/3', None, ),
     'scrapy': ('https://docs.scrapy.org/en/latest', None, ),
-    'web_poet': ('https://web-poet.readthedocs.io/en/stable/', None),
+    'web-poet': ('https://web-poet.readthedocs.io/en/stable/', None),
+    'url-matcher': ('https://url-matcher.readthedocs.io/en/stable/', None),
 }
 
 autodoc_default_options = {
diff --git a/docs/intro/tutorial.rst b/docs/intro/tutorial.rst
index 8911cfaa..b04a84d0 100644
--- a/docs/intro/tutorial.rst
+++ b/docs/intro/tutorial.rst
@@ -348,12 +348,10 @@ be done by configuring ``SCRAPY_POET_OVERRIDES`` into ``settings.py``:
 
 .. code-block:: python
 
-    SCRAPY_POET_OVERRIDES = {
-        "toscrape.com": {
-            BookListPage: BTSBookListPage,
-            BookPage: BTSBookPage
-        }
-    }
+        "SCRAPY_POET_OVERRIDES": [
+            ("toscrape.com", BTSBookListPage, BookListPage),
+            ("toscrape.com", BTSBookPage, BookPage)
+        ]
 
 The spider is back to life!
 ``SCRAPY_POET_OVERRIDES`` contain rules that overrides the Page Objects
@@ -381,7 +379,7 @@ to implement new ones:
     class BPBookListPage(WebPage):
 
         def book_urls(self):
-            return self.css('.article-info a::attr(href)').getall()
+            return self.css('article.post h4 a::attr(href)').getall()
 
 
     class BPBookPage(ItemWebPage):
@@ -389,7 +387,7 @@ to implement new ones:
         def to_item(self):
             return {
                 'url': self.url,
-                'name': self.css(".book-data h4::text").get().strip(),
+                'name': self.css("body div > h1::text").get().strip(),
             }
 
 The last step is configuring the overrides so that these new Page Objects
@@ -399,16 +397,12 @@ are used for the domain
 
 .. code-block:: python
 
-    SCRAPY_POET_OVERRIDES = {
-        "toscrape.com": {
-            BookListPage: BTSBookListPage,
-            BookPage: BTSBookPage
-        },
-        "bookpage.com": {
-            BookListPage: BPBookListPage,
-            BookPage: BPBookPage
-        }
-    }
+        "SCRAPY_POET_OVERRIDES": [
+            ("toscrape.com", BTSBookListPage, BookListPage),
+            ("toscrape.com", BTSBookPage, BookPage),
+            ("bookpage.com", BPBookListPage, BookListPage),
+            ("bookpage.com", BPBookPage, BookPage)
+        ]
 
 The spider is now ready to extract books from both sites 😀.
 The full example
@@ -418,6 +412,20 @@ On a surface, it looks just like a different way to organize Scrapy spider
 code - and indeed, it *is* just a different way to organize the code,
 but it opens some cool possibilities.
 
+.. note::
+
+    In the examples above we have been configuring the overrides
+    for a particular domain, but more complex URL patterns are also possible.
+    For example, the pattern ``books.toscrape.com/cataloge/category/``
+    is accepted and it would restrict the override only to category pages.
+
+    It is even possible to configure more complex patterns by
+    using the ``OverrideRule`` class instead of a triplet in
+    the configuration.
+
+    Also see the `url-matcher <https://url-matcher.readthedocs.io/en/stable/>`_
+    documentation for more information about the patterns syntax.
+
 Next steps
 ==========
 
diff --git a/docs/overrides.rst b/docs/overrides.rst
index 5d115757..8a07bbbe 100644
--- a/docs/overrides.rst
+++ b/docs/overrides.rst
@@ -47,11 +47,9 @@ And then override it for a particular domain using ``settings.py``:
 
 .. code-block:: python
 
-    SCRAPY_POET_OVERRIDES = {
-        "example.com": {
-            BookPage: ISBNBookPage
-        }
-    }
+    SCRAPY_POET_OVERRIDES = [
+        ("example.com", ISBNBookPage, BookPage)
+    ]
 
 This new Page Objects gets the original ``BookPage`` as dependency and enrich
 the obtained item with the ISBN from the page HTML.
@@ -79,13 +77,82 @@ the obtained item with the ISBN from the page HTML.
                 return item
 
 
+Overrides rules
+===============
+
+The default way of configuring the override rules is using triplets
+of the form (``url pattern``, ``override_type``, ``overridden_type``). But
+more complex rules can be introduced if the class ``OverrideRule``
+is used. The following example configures an override that
+is only applied for book pages from ``books.toscrape.com``:
+
+.. code-block:: python
+
+
+    SCRAPY_POET_OVERRIDES = [
+        OverrideRule(
+            for_patterns=Patterns(
+                include=["books.toscrape.com/catalogue/*index.html|"],
+                exclude=["/catalogue/category/"]),
+            use=MyBookPage,
+            instead_of=BookPage
+        )
+    ]
+
+Note how category pages are excludes by using a ``exclude`` pattern.
+You can find more information about the patterns syntax in the
+`url-matcher <https://url-matcher.readthedocs.io/en/stable/>`_
+documentation.
+
+
+Decorate Page Objects with the rules
+====================================
+
+Having the rules along with the Page Objects is a good idea,
+as you can identify with a single sight what the Page Object is doing
+along with where it is applied. This can be done by decorating the
+Page Objects with ``handle_urls`` and then
+configure the overrides automatically with the help of the function
+``find_page_object_overrides``.
+
+Let's see an example:
+
+.. code-block:: python
+
+        @handle_urls("toscrape.com", BookPage)
+        class BTSBookPage(BookPage):
+
+        def to_item(self):
+            return {
+                'url': self.url,
+                'name': self.css("title::text").get(),
+            }
+
+The ``handle_urls`` decorator in this case is indicating that
+the class ``BSTBookPage`` should be used instead of ``BookPage``
+for the domain ``toscrape.com``.
+
+In order to configure the scrapy-poet overrides automatically
+using these annotations,
+you can use the function ``find_page_object_overrides``.
+For example:
+
+.. code-block:: python
+
+    SCRAPY_POET_OVERRIDES = find_page_object_overrides("my_page_objects_module")
+
+The function will collect all the ``handle_urls`` annotations from the
+``my_page_objects_module`` and submodules, and will convert them
+to rules ready to be used with ``SCRAPY_POET_OVERRIDES``.
+
 Overrides registry
 ==================
 
-The overrides registry is responsible for informing whether there exists an
-override for a particular type for a given response. The default overrides
-registry keeps a map of overrides for each domain and read this configuration
-from settings ``SCRAPY_POET_OVERRIDES`` as has been seen in the :ref:`intro-tutorial`
+The overrides registry is responsible of informing whether there exists an
+override for a particular type for a given request. The default overrides
+registry allows to configure these rules using patterns that follows the
+`url-matcher <https://url-matcher.readthedocs.io/en/stable/>`_ syntax. These rules can be configured using the
+``SCRAPY_POET_OVERRIDES`` setting, as it has been seen in the :ref:`intro-tutorial`
 example.
 
 But the registry implementation can be changed at convenience. A different
diff --git a/example/example/spiders/books_04_overrides_01.py b/example/example/spiders/books_04_overrides_01.py
index 266f019d..ab266c08 100644
--- a/example/example/spiders/books_04_overrides_01.py
+++ b/example/example/spiders/books_04_overrides_01.py
@@ -28,7 +28,7 @@ def to_item(self):
 class BPBookListPage(WebPage):
     """Logic to extract listings from pages like https://bookpage.com/reviews"""
     def book_urls(self):
-        return self.css('.article-info a::attr(href)').getall()
+        return self.css('article.post h4 a::attr(href)').getall()
 
 
 class BPBookPage(ItemWebPage):
@@ -36,7 +36,7 @@ class BPBookPage(ItemWebPage):
     def to_item(self):
         return {
             'url': self.url,
-            'name': self.css(".book-data h4::text").get().strip(),
+            'name': self.css("body div > h1::text").get().strip(),
         }
 
 
@@ -45,12 +45,10 @@ class BooksSpider(scrapy.Spider):
     start_urls = ['http://books.toscrape.com/', 'https://bookpage.com/reviews']
     # Configuring different page objects pages from the bookpage.com domain
     custom_settings = {
-        "SCRAPY_POET_OVERRIDES": {
-            "bookpage.com": {
-                BookListPage: BPBookListPage,
-                BookPage: BPBookPage
-            }
-        }
+        "SCRAPY_POET_OVERRIDES": [
+            ("bookpage.com", BPBookListPage, BookListPage),
+            ("bookpage.com", BPBookPage, BookPage)
+        ]
     }
 
     def parse(self, response, page: BookListPage):
diff --git a/example/example/spiders/books_04_overrides_02.py b/example/example/spiders/books_04_overrides_02.py
index 9e6e8c2a..177656e5 100644
--- a/example/example/spiders/books_04_overrides_02.py
+++ b/example/example/spiders/books_04_overrides_02.py
@@ -41,7 +41,7 @@ def to_item(self):
 class BPBookListPage(BookListPage):
     """Logic to extract listings from pages like https://bookpage.com/reviews"""
     def book_urls(self):
-        return self.css('.article-info a::attr(href)').getall()
+        return self.css('article.post h4 a::attr(href)').getall()
 
 
 class BPBookPage(BookPage):
@@ -49,7 +49,7 @@ class BPBookPage(BookPage):
     def to_item(self):
         return {
             'url': self.url,
-            'name': self.css(".book-data h4::text").get().strip(),
+            'name': self.css("body div > h1::text").get().strip(),
         }
 
 
@@ -58,16 +58,12 @@ class BooksSpider(scrapy.Spider):
     start_urls = ['http://books.toscrape.com/', 'https://bookpage.com/reviews']
     # Configuring different page objects pages for different domains
     custom_settings = {
-        "SCRAPY_POET_OVERRIDES": {
-            "toscrape.com": {
-                BookListPage: BTSBookListPage,
-                BookPage: BTSBookPage
-            },
-            "bookpage.com": {
-                BookListPage: BPBookListPage,
-                BookPage: BPBookPage
-            },
-        }
+        "SCRAPY_POET_OVERRIDES": [
+            ("toscrape.com", BTSBookListPage, BookListPage),
+            ("toscrape.com", BTSBookPage, BookPage),
+            ("bookpage.com", BPBookListPage, BookListPage),
+            ("bookpage.com", BPBookPage, BookPage)
+        ]
     }
 
     def parse(self, response, page: BookListPage):
diff --git a/scrapy_poet/injection.py b/scrapy_poet/injection.py
index dd18cbd4..adfc4292 100644
--- a/scrapy_poet/injection.py
+++ b/scrapy_poet/injection.py
@@ -15,14 +15,14 @@
 from scrapy.statscollectors import StatsCollector
 from scrapy.utils.conf import build_component_list
 from scrapy.utils.defer import maybeDeferred_coro
-from scrapy.utils.misc import load_object
+from scrapy.utils.misc import load_object, create_instance
 
 from scrapy_poet.cache import SqlitedictCache
 from scrapy_poet.injection_errors import (UndeclaredProvidedTypeError,
                                           NonCallableProviderError,
                                           InjectionError)
 from scrapy_poet.overrides import OverridesRegistryBase, \
-    PerDomainOverridesRegistry
+    OverridesRegistry
 from scrapy_poet.page_input_providers import PageObjectInputProvider
 from scrapy_poet.api import _CALLBACK_FOR_MARKER, DummyResponse
 from web_poet.pages import is_injectable
@@ -43,7 +43,7 @@ def __init__(self,
                  overrides_registry: Optional[OverridesRegistryBase] = None):
         self.crawler = crawler
         self.spider = crawler.spider
-        self.overrides_registry = overrides_registry or PerDomainOverridesRegistry()
+        self.overrides_registry = overrides_registry or OverridesRegistry()
         self.load_providers(default_providers)
         self.init_cache()
 
@@ -348,6 +348,8 @@ class MySpider(Spider):
     spider = MySpider()
     spider.settings = settings
     crawler.spider = spider
+    if not overrides_registry:
+        overrides_registry = create_instance(OverridesRegistry, settings, crawler)
     return Injector(crawler, overrides_registry=overrides_registry)
 
 
diff --git a/scrapy_poet/middleware.py b/scrapy_poet/middleware.py
index c2584c62..7b96b735 100644
--- a/scrapy_poet/middleware.py
+++ b/scrapy_poet/middleware.py
@@ -11,7 +11,7 @@
 
 from scrapy.utils.misc import create_instance, load_object
 from . import api
-from .overrides import PerDomainOverridesRegistry
+from .overrides import OverridesRegistry
 from .page_input_providers import ResponseDataProvider
 from .injection import Injector
 
@@ -35,7 +35,7 @@ def __init__(self, crawler: Crawler):
         self.crawler = crawler
         settings = self.crawler.settings
         registry_cls = load_object(settings.get("SCRAPY_POET_OVERRIDES_REGISTRY",
-                                                PerDomainOverridesRegistry))
+                                                OverridesRegistry))
         self.overrides_registry = create_instance(registry_cls, settings, crawler)
         self.injector = Injector(crawler,
                                  default_providers=DEFAULT_PROVIDERS,
diff --git a/scrapy_poet/overrides.py b/scrapy_poet/overrides.py
index dc3b9cf8..55347ac2 100644
--- a/scrapy_poet/overrides.py
+++ b/scrapy_poet/overrides.py
@@ -1,9 +1,14 @@
+from collections import defaultdict
+
 from abc import ABC, abstractmethod
-from typing import Dict, Mapping, Callable
+from typing import Dict, Mapping, Callable, Iterable, Union, Tuple, Optional, List
 
 from scrapy import Request
 from scrapy.crawler import Crawler
-from scrapy_poet.utils import get_domain
+from url_matcher import Patterns, URLMatcher
+
+from url_matcher.util import get_domain
+from web_poet.overrides import OverrideRule
 
 
 class OverridesRegistryBase(ABC):
@@ -42,3 +47,68 @@ def overrides_for(self, request: Request) -> Mapping[Callable, Callable]:
         return self.get(get_domain(request.url), {})
 
 
+RuleAsTuple = Union[Tuple[str, Callable, Callable], List]
+
+class OverridesRegistry(OverridesRegistryBase):
+    """
+    Overrides registry that reads the overrides
+    from the option ``SCRAPY_POET_OVERRIDES`` in the spider settings. It
+    is a list and each rule can be a tuple or an instance of the class ``OverrideRule``.
+
+    If a tuple is provided, the first element is the pattern to match the URL,
+    the second element is the type to be used instead of the type in
+    the third element. Another way to see it:
+    for the URLs that match the pattern ``tuple[0]`` use ``tuple[1]`` instead of ``tuple[2]``.
+
+    Example of overrides configuration:
+
+    .. code-block:: python
+
+
+        SCRAPY_POET_OVERRIDES = [
+            ("books.toscrape.com", ISBNBookPage, BookPage),
+            OverrideRule(for_patterns=Patterns(["books.toscrape.com"]),
+                         use=MyBookListPage,
+                         instead_of=BookListPage,
+                         ),
+        ]
+
+    It can be handy to compile the list of rules automatically
+    from a module using the method ``find_page_object_overrides``. For example:
+
+    .. code-block:: python
+
+        SCRAPY_POET_OVERRIDES = find_page_object_overrides("my_page_objects_module")
+
+    It finds all the rules annotated using the decorator ``handle_urls`` inside the module ``my_page_objects_module`` and
+    its submodules.
+    """
+
+    @classmethod
+    def from_crawler(cls, crawler: Crawler):
+        return cls(crawler.settings.getlist("SCRAPY_POET_OVERRIDES", []))
+
+    def __init__(self, rules: Optional[Iterable[Union[RuleAsTuple, OverrideRule]]] = None):
+        self.rules: List[OverrideRule] = []
+        self.matcher: Dict[Callable, URLMatcher] = defaultdict(URLMatcher)
+        for rule in rules or []:
+            self.add_rule(rule)
+
+    def add_rule(self, rule: Union[RuleAsTuple, OverrideRule]):
+        if isinstance(rule, (tuple, list)):
+            if len(rule) != 3:
+                raise ValueError(f"Invalid overrides rule: {rule}. Rules as tuples must have three elements: "
+                                 f"the pattern, the type to override and the new type to use instead.")
+            pattern, use, instead_of = rule
+            rule = OverrideRule(for_patterns=Patterns([pattern]), use=use, instead_of=instead_of)
+        self.rules.append(rule)
+        print(rule)
+        self.matcher[rule.instead_of].add_or_update(len(self.rules) - 1, rule.for_patterns)
+
+    def overrides_for(self, request: Request) -> Mapping[Callable, Callable]:
+        overrides = {}
+        for instead_of, matcher in self.matcher.items():
+            rule_id = matcher.match(request.url)
+            if rule_id is not None:
+                overrides[instead_of] = self.rules[rule_id].use
+        return overrides
diff --git a/scrapy_poet/utils.py b/scrapy_poet/utils.py
index 965b5f9c..94a4bde0 100644
--- a/scrapy_poet/utils.py
+++ b/scrapy_poet/utils.py
@@ -1,21 +1,6 @@
 import os
 
 from scrapy.utils.project import project_data_dir, inside_project
-from tldextract import tldextract
-
-
-def get_domain(url):
-    """
-    Return the domain without any subdomain
-
-    >>> get_domain("http://blog.example.com")
-    'example.com'
-    >>> get_domain("http://www.example.com")
-    'example.com'
-    >>> get_domain("http://deeper.blog.example.co.uk")
-    'example.co.uk'
-    """
-    return ".".join(tldextract.extract(url)[-2:])
 
 
 def get_scrapy_data_path(createdir=True):
diff --git a/setup.py b/setup.py
index d8b34efd..6404cdc0 100755
--- a/setup.py
+++ b/setup.py
@@ -14,7 +14,8 @@
         'andi >= 0.4.1',
         'attrs',
         'parsel',
-        'web-poet',
+        'web-poet @ git+https://git@github.com/scrapinghub/web-poet@handle_urls#egg=web-poet',
+        'url-matcher',
         'tldextract',
         'sqlitedict',
     ],
diff --git a/tests/conftest.py b/tests/conftest.py
index 6082152f..209ac514 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,8 +1,6 @@
 import pytest
 from scrapy.settings import Settings
 
-from scrapy_poet.page_input_providers import ResponseDataProvider
-
 
 @pytest.fixture()
 def settings(request):
diff --git a/tests/test_injection.py b/tests/test_injection.py
index cabe5bf4..95a38d6d 100644
--- a/tests/test_injection.py
+++ b/tests/test_injection.py
@@ -7,7 +7,9 @@
 
 from scrapy import Request
 from scrapy.http import Response
-from scrapy_poet.utils import get_domain
+from url_matcher import Patterns
+
+from url_matcher.util import get_domain
 
 from scrapy_poet import ResponseDataProvider, PageObjectInputProvider, \
     DummyResponse
@@ -15,9 +17,10 @@
     get_injector_for_testing, get_response_for_testing
 from scrapy_poet.injection_errors import NonCallableProviderError, \
     InjectionError, UndeclaredProvidedTypeError
-from scrapy_poet.overrides import PerDomainOverridesRegistry
+from scrapy_poet.overrides import OverridesRegistry
 from web_poet import Injectable, ItemPage
 from web_poet.mixins import ResponseShortcutsMixin
+from web_poet.overrides import OverrideRule
 
 
 def get_provider(classes, content=None):
@@ -301,13 +304,11 @@ def test_overrides(self, providers, override_should_happen):
         domain = "example.com" if override_should_happen else "other-example.com"
         # The request domain is example.com, so overrides shouldn't be applied
         # when we configure them for domain other-example.com
-        overrides = {
-            domain: {
-                PricePO: PriceInDollarsPO,
-                EurDollarRate: OtherEurDollarRate
-            }
-        }
-        registry = PerDomainOverridesRegistry(overrides)
+        overrides = [
+            (domain, PriceInDollarsPO, PricePO),
+            OverrideRule(Patterns([domain]), use=OtherEurDollarRate, instead_of=EurDollarRate)
+        ]
+        registry = OverridesRegistry(overrides)
         injector = get_injector_for_testing(providers,
                                             overrides_registry=registry)
 
diff --git a/tests/test_middleware.py b/tests/test_middleware.py
index a7a4ab46..bdd2e8e9 100644
--- a/tests/test_middleware.py
+++ b/tests/test_middleware.py
@@ -13,7 +13,9 @@
 import attr
 
 from scrapy_poet import callback_for
-from scrapy_poet.utils import get_domain
+from url_matcher.util import get_domain
+
+from tests.mockserver import get_ephemeral_port
 from web_poet.pages import WebPage, ItemPage, ItemWebPage
 from scrapy_poet.page_input_providers import (
     PageObjectInputProvider
@@ -103,10 +105,12 @@ def test_basic_case(settings):
 def test_overrides(settings):
     host = socket.gethostbyname(socket.gethostname())
     domain = get_domain(host)
-    settings["SCRAPY_POET_OVERRIDES"] = {
-        domain: {BreadcrumbsExtraction: OverridenBreadcrumbsExtraction}}
+    port = get_ephemeral_port()
+    settings["SCRAPY_POET_OVERRIDES"] = [
+        (f"{domain}:{port}", OverridenBreadcrumbsExtraction, BreadcrumbsExtraction)
+    ]
     item, url, _ = yield crawl_single_item(spider_for(ProductPage),
-                                           ProductHtml, settings)
+                                           ProductHtml, settings, port=port)
     assert item == {
         'url': url,
         'name': 'Chocolate',
diff --git a/tests/utils.py b/tests/utils.py
index 7dd46b7d..55b26f5b 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -24,26 +24,26 @@ def render_GET(self, request):
 
 
 @inlineCallbacks
-def crawl_items(spider_cls, resource_cls, settings, spider_kwargs=None):
+def crawl_items(spider_cls, resource_cls, settings, spider_kwargs=None, port=None):
     """Use spider_cls to crawl resource_cls. URL of the resource is passed
     to the spider as ``url`` argument.
     Return ``(items, resource_url, crawler)`` tuple.
     """
     spider_kwargs = {} if spider_kwargs is None else spider_kwargs
     crawler = make_crawler(spider_cls, settings)
-    with MockServer(resource_cls) as s:
+    with MockServer(resource_cls, port=port) as s:
         root_url = s.root_url
         yield crawler.crawl(url=root_url, **spider_kwargs)
     return crawler.spider.collected_items, s.root_url, crawler
 
 
 @inlineCallbacks
-def crawl_single_item(spider_cls, resource_cls, settings, spider_kwargs=None):
+def crawl_single_item(spider_cls, resource_cls, settings, spider_kwargs=None, port=None):
     """Run a spider where a single item is expected. Use in combination with
     ``capture_capture_exceptions`` and ``CollectorPipeline``
     """
     items, url, crawler = yield crawl_items(spider_cls, resource_cls, settings,
-                                            spider_kwargs=spider_kwargs)
+                                            spider_kwargs=spider_kwargs, port=port)
     assert len(items) == 1
     resp = items[0]
     if 'exception' in resp:
diff --git a/tox.ini b/tox.ini
index 50d2b8d0..b6f46a45 100644
--- a/tox.ini
+++ b/tox.ini
@@ -7,7 +7,7 @@ deps =
     pytest-cov
     scrapy >= 2.1.0
     pytest-twisted
-    web-poet
+    web-poet @ git+https://git@github.com/scrapinghub/web-poet@handle_urls#egg=web-poet
 
 commands =
     py.test \

From 35e7876c15432d4008eed8831fd32d7043590e97 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Iv=C3=A1n=20de=20Prado?=
 <ivanprado@users.noreply.github.com>
Date: Thu, 9 Dec 2021 10:51:09 +0100
Subject: [PATCH 02/19] Remove a print line

---
 scrapy_poet/overrides.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/scrapy_poet/overrides.py b/scrapy_poet/overrides.py
index 55347ac2..5f76f377 100644
--- a/scrapy_poet/overrides.py
+++ b/scrapy_poet/overrides.py
@@ -102,7 +102,6 @@ def add_rule(self, rule: Union[RuleAsTuple, OverrideRule]):
             pattern, use, instead_of = rule
             rule = OverrideRule(for_patterns=Patterns([pattern]), use=use, instead_of=instead_of)
         self.rules.append(rule)
-        print(rule)
         self.matcher[rule.instead_of].add_or_update(len(self.rules) - 1, rule.for_patterns)
 
     def overrides_for(self, request: Request) -> Mapping[Callable, Callable]:

From 327139e4ff43eb71665312df537d9166db57c180 Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal <kevinoxy@gmail.com>
Date: Tue, 21 Dec 2021 19:19:49 +0800
Subject: [PATCH 03/19] improve docs and example code

---
 CHANGELOG.rst                                 |  8 ++-
 docs/intro/tutorial.rst                       | 56 ++++++++++++-------
 docs/overrides.rst                            | 30 ++++++----
 .../example/spiders/books_04_overrides_02.py  |  9 ++-
 4 files changed, 67 insertions(+), 36 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index fd3363fb..6b681816 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -6,9 +6,13 @@ Changelog
 TBR:
 ------------------
 
-* Cache mechanism using SCRAPY_POET_CACHE setting
-* New and richer SCRAPY_POET_OVERRIDES registry that uses the
+* Cache mechanism using ``SCRAPY_POET_CACHE`` setting
+* New and richer ``SCRAPY_POET_OVERRIDES`` registry that uses the
   url-matcher patterns to configure the overrides
+
+  * This results in a **backward incompatible** change since the rules
+    follow a different structure.
+
 * removed support for Python 3.6
 * added support for Python 3.10
 
diff --git a/docs/intro/tutorial.rst b/docs/intro/tutorial.rst
index a2aab34a..33af5f4c 100644
--- a/docs/intro/tutorial.rst
+++ b/docs/intro/tutorial.rst
@@ -348,10 +348,10 @@ be done by configuring ``SCRAPY_POET_OVERRIDES`` into ``settings.py``:
 
 .. code-block:: python
 
-        "SCRAPY_POET_OVERRIDES": [
-            ("toscrape.com", BTSBookListPage, BookListPage),
-            ("toscrape.com", BTSBookPage, BookPage)
-        ]
+    "SCRAPY_POET_OVERRIDES": [
+        ("toscrape.com", BTSBookListPage, BookListPage),
+        ("toscrape.com", BTSBookPage, BookPage)
+    ]
 
 The spider is back to life!
 ``SCRAPY_POET_OVERRIDES`` contain rules that overrides the Page Objects
@@ -397,34 +397,48 @@ are used for the domain
 
 .. code-block:: python
 
-        "SCRAPY_POET_OVERRIDES": [
-            ("toscrape.com", BTSBookListPage, BookListPage),
-            ("toscrape.com", BTSBookPage, BookPage),
-            ("bookpage.com", BPBookListPage, BookListPage),
-            ("bookpage.com", BPBookPage, BookPage)
-        ]
+    "SCRAPY_POET_OVERRIDES": [
+        ("toscrape.com", BTSBookListPage, BookListPage),
+        ("toscrape.com", BTSBookPage, BookPage),
+        ("bookpage.com", BPBookListPage, BookListPage),
+        ("bookpage.com", BPBookPage, BookPage)
+    ]
 
 The spider is now ready to extract books from both sites 😀.
 The full example
 `can be seen here <https://github.com/scrapinghub/scrapy-poet/tree/master/example/example/spiders/books_04_overrides_02.py>`_
 
-On a surface, it looks just like a different way to organize Scrapy spider
+On the surface, it looks just like a different way to organize Scrapy spider
 code - and indeed, it *is* just a different way to organize the code,
 but it opens some cool possibilities.
 
-.. note::
+In the examples above we have been configuring the overrides
+for a particular domain, but more complex URL patterns are also possible.
+For example, the pattern ``books.toscrape.com/catalogue/category/``
+is accepted and it would restrict the override only to category pages.
+
+It is even possible to configure more complex patterns by
+using the ``OverrideRule`` class instead of a triplet in
+the configuration. Another way of declaring the earlier config
+for ``SCRAPY_POET_OVERRIDES`` would be the following:
+
+.. code-block:: python
+
+    from url_matcher import Patterns
+    from web_poet.overrides import OverrideRule
 
-    In the examples above we have been configuring the overrides
-    for a particular domain, but more complex URL patterns are also possible.
-    For example, the pattern ``books.toscrape.com/cataloge/category/``
-    is accepted and it would restrict the override only to category pages.
+    SCRAPY_POET_OVERRIDES = [
+        OverrideRule(for_patterns=Patterns(["toscrape.com"]), use=BTSBookListPage, instead_of=BookListPage),
+        OverrideRule(for_patterns=Patterns(["toscrape.com"]), use=BTSBookPage, instead_of=BookPage),
+        OverrideRule(for_patterns=Patterns(["bookpage.com"]), use=BPBookListPage, instead_of=BookListPage),
+        OverrideRule(for_patterns=Patterns(["bookpage.com"]), use=BPBookPage, instead_of=BookPage),
+    ]
 
-    It is even possible to configure more complex patterns by
-    using the ``OverrideRule`` class instead of a triplet in
-    the configuration.
+As you can see, this could get verbose. The earlier tuple config simply offers
+a shortcut to be more concise.
 
-    Also see the `url-matcher <https://url-matcher.readthedocs.io/en/stable/>`_
-    documentation for more information about the patterns syntax.
+Also see the `url-matcher <https://url-matcher.readthedocs.io/en/stable/>`_
+documentation for more information about the patterns syntax.
 
 Next steps
 ==========
diff --git a/docs/overrides.rst b/docs/overrides.rst
index 9a23060b..9bf89d04 100644
--- a/docs/overrides.rst
+++ b/docs/overrides.rst
@@ -100,7 +100,7 @@ is only applied for book pages from ``books.toscrape.com``:
         )
     ]
 
-Note how category pages are excludes by using a ``exclude`` pattern.
+Note how category pages are excluded by using an ``exclude`` pattern.
 You can find more information about the patterns syntax in the
 `url-matcher <https://url-matcher.readthedocs.io/en/stable/>`_
 documentation.
@@ -120,38 +120,46 @@ Let's see an example:
 
 .. code-block:: python
 
-        @handle_urls("toscrape.com", BookPage)
-        class BTSBookPage(BookPage):
+    from web_poet import handle_urls
 
-        def to_item(self):
-            return {
-                'url': self.url,
-                'name': self.css("title::text").get(),
-            }
+    @handle_urls("toscrape.com", BookPage)
+    class BTSBookPage(BookPage):
+
+    def to_item(self):
+        return {
+            'url': self.url,
+            'name': self.css("title::text").get(),
+        }
 
 The ``handle_urls`` decorator in this case is indicating that
 the class ``BSTBookPage`` should be used instead of ``BookPage``
 for the domain ``toscrape.com``.
 
-In order to configure the scrapy-poet overrides automatically
+In order to configure the ``scrapy-poet`` overrides automatically
 using these annotations,
 you can use the function ``find_page_object_overrides``.
 For example:
 
 .. code-block:: python
 
+    from web_poet import find_page_object_overrides
+
     SCRAPY_POET_OVERRIDES = find_page_object_overrides("my_page_objects_module")
 
 The function will collect all the ``handle_urls`` annotations from the
 ``my_page_objects_module`` and submodules, and will convert them
 to rules ready to be used with ``SCRAPY_POET_OVERRIDES``.
 
+For more info and advanced features, of ``web-poet``'s ``handle_urls``
+and ``find_page_object_overrides``, kindly read the `web-poet <https://web-poet.readthedocs.io>`_
+documentatino regarding Overrides.
+
 Overrides registry
 ==================
 
-The overrides registry is responsible of informing whether there exists an
+The overrides registry is responsible for informing whether there exists an
 override for a particular type for a given request. The default overrides
-registry allows to configure these rules using patterns that follows the
+registry allows to configure these rules using patterns that follow the
 `url-matcher <https://url-matcher.readthedocs.io/en/stable/>`_ syntax. These rules can be configured using the
 ``SCRAPY_POET_OVERRIDES`` setting, as it has been seen in the :ref:`intro-tutorial`
 example.
diff --git a/example/example/spiders/books_04_overrides_02.py b/example/example/spiders/books_04_overrides_02.py
index 177656e5..b4c366a7 100644
--- a/example/example/spiders/books_04_overrides_02.py
+++ b/example/example/spiders/books_04_overrides_02.py
@@ -8,6 +8,9 @@
 """
 import scrapy
 from web_poet import ItemWebPage, WebPage
+from web_poet.overrides import OverrideRule
+from url_matcher import Patterns
+
 from scrapy_poet import callback_for
 
 
@@ -61,8 +64,10 @@ class BooksSpider(scrapy.Spider):
         "SCRAPY_POET_OVERRIDES": [
             ("toscrape.com", BTSBookListPage, BookListPage),
             ("toscrape.com", BTSBookPage, BookPage),
-            ("bookpage.com", BPBookListPage, BookListPage),
-            ("bookpage.com", BPBookPage, BookPage)
+
+            # We could also use the long-form version if we want to.
+            OverrideRule(for_patterns=Patterns(["bookpage.com"]), use=BPBookListPage, instead_of=BookListPage),
+            OverrideRule(for_patterns=Patterns(["bookpage.com"]), use=BPBookPage, instead_of=BookPage),
         ]
     }
 

From d85766e2ade22b3acbd211106fe2bc130c977706 Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal <kevinoxy@gmail.com>
Date: Tue, 21 Dec 2021 19:31:16 +0800
Subject: [PATCH 04/19] deprecate PerDomainOverridesRegistry in lieu of
 OverridesRegistry

---
 CHANGELOG.rst            | 12 ++++++++----
 docs/settings.rst        |  4 ++--
 scrapy_poet/overrides.py | 24 ------------------------
 3 files changed, 10 insertions(+), 30 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 6b681816..b4513b1b 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -7,11 +7,15 @@ TBR:
 ------------------
 
 * Cache mechanism using ``SCRAPY_POET_CACHE`` setting
-* New and richer ``SCRAPY_POET_OVERRIDES`` registry that uses the
-  url-matcher patterns to configure the overrides
 
-  * This results in a **backward incompatible** change since the rules
-    follow a different structure.
+* We also have these **backward incompatible** changes since the
+  rules follow a different structure:
+
+    * Deprecated ``PerDomainOverridesRegistry`` in lieu of the newer
+      ``OverridesRegistry`` which provides a wide variety of features
+      for better URL matching.
+    * This resuls in a newer ``SCRAPY_POET_OVERRIDES`` which follows
+      a different format.
 
 * removed support for Python 3.6
 * added support for Python 3.10
diff --git a/docs/settings.rst b/docs/settings.rst
index c13a9580..2dbdec30 100644
--- a/docs/settings.rst
+++ b/docs/settings.rst
@@ -25,7 +25,7 @@ Default: ``None``
 
 Mapping of overrides for each domain. The format of the such ``dict`` mapping
 depends on the currently set Registry. The default is currently 
-:class:`~.PerDomainOverridesRegistry`. This can be overriden by the setting below:
+:class:`~.OverridesRegistry`. This can be overriden by the setting below:
 ``SCRAPY_POET_OVERRIDES_REGISTRY``.
 
 There are sections dedicated for this at :ref:`intro-tutorial` and :ref:`overrides`.
@@ -36,7 +36,7 @@ SCRAPY_POET_OVERRIDES_REGISTRY
 
 Defaut: ``None``
 
-Sets an alternative Registry to replace the default :class:`~.PerDomainOverridesRegistry`.
+Sets an alternative Registry to replace the default :class:`~.OverridesRegistry`.
 To use this, set a ``str`` which denotes the absolute object path of the new
 Registry.
 
diff --git a/scrapy_poet/overrides.py b/scrapy_poet/overrides.py
index 5f76f377..805a358e 100644
--- a/scrapy_poet/overrides.py
+++ b/scrapy_poet/overrides.py
@@ -23,30 +23,6 @@ def overrides_for(self, request: Request) -> Mapping[Callable, Callable]:
         pass
 
 
-class PerDomainOverridesRegistry(Dict[str, Dict[Callable, Callable]], OverridesRegistryBase):
-    """
-    Simple dictionary based registry that reads the overrides
-    from the option ``SCRAPY_POET_OVERRIDES`` in the spider settings
-
-    Example of overrides configuration:
-
-    .. code-block:: python
-
-        SCRAPY_POET_OVERRIDES = {
-            "example.com": {
-                BookPage: ISBNBookPage
-            }
-        }
-    """
-
-    @classmethod
-    def from_crawler(cls, crawler: Crawler):
-        return cls(crawler.settings.getdict("SCRAPY_POET_OVERRIDES", {}))
-
-    def overrides_for(self, request: Request) -> Mapping[Callable, Callable]:
-        return self.get(get_domain(request.url), {})
-
-
 RuleAsTuple = Union[Tuple[str, Callable, Callable], List]
 
 class OverridesRegistry(OverridesRegistryBase):

From 670715a992928798ce036e3d018125e8fed3351d Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal <kevinoxy@gmail.com>
Date: Tue, 21 Dec 2021 20:09:10 +0800
Subject: [PATCH 05/19] improve readability of OverridesRegistry's docs

---
 scrapy_poet/overrides.py | 51 +++++++++++++++++++++++++---------------
 1 file changed, 32 insertions(+), 19 deletions(-)

diff --git a/scrapy_poet/overrides.py b/scrapy_poet/overrides.py
index 805a358e..0b83e51a 100644
--- a/scrapy_poet/overrides.py
+++ b/scrapy_poet/overrides.py
@@ -1,16 +1,18 @@
-from collections import defaultdict
-
+import logging
 from abc import ABC, abstractmethod
+from collections import defaultdict
 from typing import Dict, Mapping, Callable, Iterable, Union, Tuple, Optional, List
 
 from scrapy import Request
 from scrapy.crawler import Crawler
 from url_matcher import Patterns, URLMatcher
-
 from url_matcher.util import get_domain
 from web_poet.overrides import OverrideRule
 
 
+logger = logging.getLogger(__name__)
+
+
 class OverridesRegistryBase(ABC):
 
     @abstractmethod
@@ -27,37 +29,47 @@ def overrides_for(self, request: Request) -> Mapping[Callable, Callable]:
 
 class OverridesRegistry(OverridesRegistryBase):
     """
-    Overrides registry that reads the overrides
-    from the option ``SCRAPY_POET_OVERRIDES`` in the spider settings. It
-    is a list and each rule can be a tuple or an instance of the class ``OverrideRule``.
+    Overrides registry that reads the overrides from the ``SCRAPY_POET_OVERRIDES``
+    in the spider settings. It is a list and each rule can be a tuple or an
+    instance of the class ``OverrideRule``.
 
-    If a tuple is provided, the first element is the pattern to match the URL,
-    the second element is the type to be used instead of the type in
-    the third element. Another way to see it:
-    for the URLs that match the pattern ``tuple[0]`` use ``tuple[1]`` instead of ``tuple[2]``.
+    If a tuple is provided:
+
+        - the **first** element is the pattern to match the URL,
+        - the **second** element is the type to be used instead of the type in
+          the **third** element.
+
+    Another way to see it: for the URLs that match the pattern ``tuple[0]``, use
+    ``tuple[1]`` instead of ``tuple[2]``.
 
     Example of overrides configuration:
 
     .. code-block:: python
 
-
         SCRAPY_POET_OVERRIDES = [
             ("books.toscrape.com", ISBNBookPage, BookPage),
-            OverrideRule(for_patterns=Patterns(["books.toscrape.com"]),
-                         use=MyBookListPage,
-                         instead_of=BookListPage,
-                         ),
+            OverrideRule(
+                for_patterns=Patterns(["books.toscrape.com"]),
+                use=MyBookListPage,
+                instead_of=BookListPage,
+            ),
         ]
 
-    It can be handy to compile the list of rules automatically
-    from a module using the method ``find_page_object_overrides``. For example:
+    It can be handy to compile the list of rules automatically from a module
+    using the utility function ``find_page_object_overrides()`` from ``web-poet``.
+    For example:
 
     .. code-block:: python
 
+        from web_poet import find_page_object_overrides
+
         SCRAPY_POET_OVERRIDES = find_page_object_overrides("my_page_objects_module")
 
-    It finds all the rules annotated using the decorator ``handle_urls`` inside the module ``my_page_objects_module`` and
-    its submodules.
+    It finds all the rules annotated using ``web-poet``'s ``@handle_urls``
+    decorator inside the ``my_page_objects_module`` module and all of its
+    submodules.
+
+    More info on this at `web-poet <https://web-poet.readthedocs.io>`_.
     """
 
     @classmethod
@@ -69,6 +81,7 @@ def __init__(self, rules: Optional[Iterable[Union[RuleAsTuple, OverrideRule]]] =
         self.matcher: Dict[Callable, URLMatcher] = defaultdict(URLMatcher)
         for rule in rules or []:
             self.add_rule(rule)
+        logger.debug(f"List of parsed OverrideRules:\n{self.rules}")
 
     def add_rule(self, rule: Union[RuleAsTuple, OverrideRule]):
         if isinstance(rule, (tuple, list)):

From 706e4ac6c5e3c1c533eeb66a81d413cc72adc518 Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal <kevinoxy@gmail.com>
Date: Tue, 21 Dec 2021 20:25:22 +0800
Subject: [PATCH 06/19] improve type annotations and errors in
 OverridesRegistry

---
 scrapy_poet/overrides.py | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/scrapy_poet/overrides.py b/scrapy_poet/overrides.py
index 0b83e51a..67b81dd4 100644
--- a/scrapy_poet/overrides.py
+++ b/scrapy_poet/overrides.py
@@ -12,9 +12,11 @@
 
 logger = logging.getLogger(__name__)
 
+RuleAsTuple = Union[Tuple[str, Callable, Callable], List]
+RuleFromUser = Union[RuleAsTuple, OverrideRule]
 
-class OverridesRegistryBase(ABC):
 
+class OverridesRegistryBase(ABC):
     @abstractmethod
     def overrides_for(self, request: Request) -> Mapping[Callable, Callable]:
         """
@@ -25,8 +27,6 @@ def overrides_for(self, request: Request) -> Mapping[Callable, Callable]:
         pass
 
 
-RuleAsTuple = Union[Tuple[str, Callable, Callable], List]
-
 class OverridesRegistry(OverridesRegistryBase):
     """
     Overrides registry that reads the overrides from the ``SCRAPY_POET_OVERRIDES``
@@ -73,28 +73,35 @@ class OverridesRegistry(OverridesRegistryBase):
     """
 
     @classmethod
-    def from_crawler(cls, crawler: Crawler):
+    def from_crawler(cls, crawler: Crawler) -> Crawler:
         return cls(crawler.settings.getlist("SCRAPY_POET_OVERRIDES", []))
 
-    def __init__(self, rules: Optional[Iterable[Union[RuleAsTuple, OverrideRule]]] = None):
+    def __init__(self, rules: Optional[Iterable[RuleFromUser]] = None) -> None:
         self.rules: List[OverrideRule] = []
         self.matcher: Dict[Callable, URLMatcher] = defaultdict(URLMatcher)
         for rule in rules or []:
             self.add_rule(rule)
         logger.debug(f"List of parsed OverrideRules:\n{self.rules}")
 
-    def add_rule(self, rule: Union[RuleAsTuple, OverrideRule]):
+    def add_rule(self, rule: RuleFromUser) -> None:
         if isinstance(rule, (tuple, list)):
             if len(rule) != 3:
-                raise ValueError(f"Invalid overrides rule: {rule}. Rules as tuples must have three elements: "
-                                 f"the pattern, the type to override and the new type to use instead.")
+                raise ValueError(
+                    f"Invalid overrides rule: {rule}. Rules as tuples must have "
+                    f"3 elements: (1) the pattern, (2) the PO class used as a "
+                    f"replacement and (3) the PO class to be replaced."
+                )
             pattern, use, instead_of = rule
-            rule = OverrideRule(for_patterns=Patterns([pattern]), use=use, instead_of=instead_of)
+            rule = OverrideRule(
+                for_patterns=Patterns([pattern]), use=use, instead_of=instead_of
+            )
         self.rules.append(rule)
-        self.matcher[rule.instead_of].add_or_update(len(self.rules) - 1, rule.for_patterns)
+        self.matcher[rule.instead_of].add_or_update(
+            len(self.rules) - 1, rule.for_patterns
+        )
 
     def overrides_for(self, request: Request) -> Mapping[Callable, Callable]:
-        overrides = {}
+        overrides: Dict[Callable, Callable] = {}
         for instead_of, matcher in self.matcher.items():
             rule_id = matcher.match(request.url)
             if rule_id is not None:

From bf4e61b52792eb8915e1b0a5f5e9aae3aa4e9d98 Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal <kevinoxy@gmail.com>
Date: Tue, 21 Dec 2021 20:40:55 +0800
Subject: [PATCH 07/19] improve test coverage

---
 scrapy_poet/cache.py |  4 ++--
 scrapy_poet/utils.py |  3 ++-
 tests/test_utils.py  | 21 +++++++++++++++++++++
 3 files changed, 25 insertions(+), 3 deletions(-)
 create mode 100644 tests/test_utils.py

diff --git a/scrapy_poet/cache.py b/scrapy_poet/cache.py
index 07b4ee5e..d1a9ef47 100644
--- a/scrapy_poet/cache.py
+++ b/scrapy_poet/cache.py
@@ -54,14 +54,14 @@ def decode(self, obj: Any) -> Any:
         return pickle.loads(data)
 
     def __str__(self) -> str:
-        return (
+        return (  #pragma: no cover
             f"SqlitedictCache <{self.db.filename} | "
             f"compressed: {self.compressed} | "
             f"{len(self.db)} records>"
         )
 
     def __repr__(self) -> str:
-        return f"SqlitedictCache({self.path!r}, compressed={self.compressed})"
+        return f"SqlitedictCache({self.path!r}, compressed={self.compressed})"  #pragma: no cover
 
     def __getitem__(self, fingerprint: str) -> Any:
         return self.db[fingerprint]
diff --git a/scrapy_poet/utils.py b/scrapy_poet/utils.py
index 83d1e2ca..7564b27c 100644
--- a/scrapy_poet/utils.py
+++ b/scrapy_poet/utils.py
@@ -1,9 +1,10 @@
 import os
+from pathlib import PosixPath
 
 from scrapy.utils.project import project_data_dir, inside_project
 
 
-def get_scrapy_data_path(createdir: bool = True, default_dir: str = ".scrapy") -> str:
+def get_scrapy_data_path(createdir: bool = True, default_dir: str = ".scrapy") -> PosixPath:
     """Return a path to a folder where Scrapy is storing data.
     Usually that's a .scrapy folder inside the project.
     """
diff --git a/tests/test_utils.py b/tests/test_utils.py
new file mode 100644
index 00000000..05e55542
--- /dev/null
+++ b/tests/test_utils.py
@@ -0,0 +1,21 @@
+from unittest import mock
+from pathlib import PosixPath
+
+from scrapy_poet.utils import get_scrapy_data_path
+
+
+@mock.patch("scrapy_poet.utils.os.makedirs")
+@mock.patch("scrapy_poet.utils.inside_project")
+def test_get_scrapy_data_path(mock_inside_project, mock_makedirs, tmp_path):
+    mock_inside_project.return_value = False
+
+    path = tmp_path / "test_dir"
+    result = get_scrapy_data_path(createdir=True, default_dir=path)
+
+    assert isinstance(result, PosixPath)
+    assert str(result)  # should be non-empty
+
+    mock_inside_project.assert_called_once()
+
+    mock_makedirs.assert_called_once()
+    mock_makedirs.assert_called_with(path, exist_ok=True)

From c865c60f1cea0fba17186fd230391be0b0bb1577 Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal <kevinoxy@gmail.com>
Date: Thu, 23 Dec 2021 10:25:21 +0800
Subject: [PATCH 08/19] update docs in-line with recent web-poet refactoring

---
 docs/overrides.rst       | 25 +++++++++++++------------
 scrapy_poet/overrides.py | 18 +++++++++++++-----
 2 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/docs/overrides.rst b/docs/overrides.rst
index 9bf89d04..3c16f8cd 100644
--- a/docs/overrides.rst
+++ b/docs/overrides.rst
@@ -112,9 +112,7 @@ Decorate Page Objects with the rules
 Having the rules along with the Page Objects is a good idea,
 as you can identify with a single sight what the Page Object is doing
 along with where it is applied. This can be done by decorating the
-Page Objects with ``handle_urls`` and then
-configure the overrides automatically with the help of the function
-``find_page_object_overrides``.
+Page Objects with ``@handle_urls`` provided by ``web-poet``.
 
 Let's see an example:
 
@@ -131,28 +129,31 @@ Let's see an example:
             'name': self.css("title::text").get(),
         }
 
-The ``handle_urls`` decorator in this case is indicating that
+The ``@handle_urls`` decorator in this case is indicating that
 the class ``BSTBookPage`` should be used instead of ``BookPage``
 for the domain ``toscrape.com``.
 
 In order to configure the ``scrapy-poet`` overrides automatically
-using these annotations,
-you can use the function ``find_page_object_overrides``.
+using these annotations, you can directly interact with ``web-poet``'s
+default registry.
+
 For example:
 
 .. code-block:: python
 
-    from web_poet import find_page_object_overrides
+    from web_poet import default_registry
 
-    SCRAPY_POET_OVERRIDES = find_page_object_overrides("my_page_objects_module")
+    SCRAPY_POET_OVERRIDES = default_registry.get_overrides_from_module("my_page_objects_module")
 
-The function will collect all the ``handle_urls`` annotations from the
+The function will collect all the ``@handle_urls`` annotations from the
 ``my_page_objects_module`` and submodules, and will convert them
 to rules ready to be used with ``SCRAPY_POET_OVERRIDES``.
 
-For more info and advanced features, of ``web-poet``'s ``handle_urls``
-and ``find_page_object_overrides``, kindly read the `web-poet <https://web-poet.readthedocs.io>`_
-documentatino regarding Overrides.
+.. note::
+
+    For more info and advanced features of ``web-poet``'s ``@handle_urls``
+    and its registry, kindly read the `web-poet <https://web-poet.readthedocs.io>`_
+    documentation regarding Overrides.
 
 Overrides registry
 ==================
diff --git a/scrapy_poet/overrides.py b/scrapy_poet/overrides.py
index 67b81dd4..d792fe91 100644
--- a/scrapy_poet/overrides.py
+++ b/scrapy_poet/overrides.py
@@ -55,20 +55,28 @@ class OverridesRegistry(OverridesRegistryBase):
             ),
         ]
 
-    It can be handy to compile the list of rules automatically from a module
-    using the utility function ``find_page_object_overrides()`` from ``web-poet``.
-    For example:
+    Now, if you've used ``web-poet``'s built-in functionality to directly create
+    the override rules in the Page Object via the ``@handle_urls`` annotation,
+    you can quickly import them via:
 
     .. code-block:: python
 
-        from web_poet import find_page_object_overrides
+        from web_poet import default_registry
 
-        SCRAPY_POET_OVERRIDES = find_page_object_overrides("my_page_objects_module")
+        SCRAPY_POET_OVERRIDES = default_registry.get_overrides_from_module("my_page_objects_module")
 
     It finds all the rules annotated using ``web-poet``'s ``@handle_urls``
     decorator inside the ``my_page_objects_module`` module and all of its
     submodules.
 
+    However, for most cases, you'd most likely going to simply retrieve all of
+    the override rules that were ever declared on a given registry. Thus, you
+    could simply do:
+
+    .. code-block:: python
+
+        SCRAPY_POET_OVERRIDES = default_registry.get_overrides()
+
     More info on this at `web-poet <https://web-poet.readthedocs.io>`_.
     """
 

From 63029dc1e29d7338ce92cc3fe41cfc956fa61f38 Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal <kevinoxy@gmail.com>
Date: Thu, 23 Dec 2021 11:40:14 +0800
Subject: [PATCH 09/19] add integration tests for web-poet

---
 tests/po_lib/__init__.py | 27 +++++++++++++++++++++++++++
 tests/test_middleware.py | 24 ++++++++++++++++++++++++
 2 files changed, 51 insertions(+)
 create mode 100644 tests/po_lib/__init__.py

diff --git a/tests/po_lib/__init__.py b/tests/po_lib/__init__.py
new file mode 100644
index 00000000..535de873
--- /dev/null
+++ b/tests/po_lib/__init__.py
@@ -0,0 +1,27 @@
+"""
+This package is just for overrides testing purposes.
+"""
+import socket
+from typing import Dict, Any, Callable
+
+from url_matcher import Patterns
+from url_matcher.util import get_domain
+from web_poet import handle_urls, ItemWebPage
+
+from tests.mockserver import get_ephemeral_port
+
+
+# Need to define it here since it's always changing
+DOMAIN = get_domain(socket.gethostbyname(socket.gethostname()))
+PORT = get_ephemeral_port()
+
+
+class POOverriden(ItemWebPage):
+    def to_item(self):
+        return {"msg": "PO that will be replace"}
+
+
+@handle_urls(f"{DOMAIN}:{PORT}", POOverriden)
+class POIntegration(ItemWebPage):
+    def to_item(self):
+        return {"msg": "PO replacement"}
diff --git a/tests/test_middleware.py b/tests/test_middleware.py
index 1434d895..ec650e09 100644
--- a/tests/test_middleware.py
+++ b/tests/test_middleware.py
@@ -24,6 +24,7 @@
 from scrapy_poet.page_input_providers import (
     PageObjectInputProvider
 )
+from web_poet import default_registry
 from web_poet.page_inputs import ResponseData
 from scrapy_poet import DummyResponse
 from tests.utils import (HtmlResource,
@@ -350,3 +351,26 @@ def get_middleware(settings):
         mock.call('/tmp/cache', compressed=True),
         mock.call().close()
     ]
+
+
+@inlineCallbacks
+def test_web_poet_integration(settings):
+    """This tests scrapy-poet's integration with web-poet most especially when
+    populating override settings via:
+
+        from web_poet import default_registry
+
+        SCRAPY_POET_OVERRIDES = default_registry.get_overrides()
+    """
+
+    # Only import them in this test scope since they need to be synced with
+    # the URL of the Page Object annotated with @handle_urls.
+    from tests.po_lib import DOMAIN, PORT, POOverriden
+
+    # Override rules are defined in `tests/po_lib/__init__.py`.
+    settings["SCRAPY_POET_OVERRIDES"] = default_registry.get_overrides()
+
+    item, url, _ = yield crawl_single_item(
+        spider_for(POOverriden), ProductHtml, settings, port=PORT
+    )
+    assert item == {"msg": "PO replacement"}

From 5305da47e0258d1eb6348d693f79a17ec9efcc41 Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal <kevinoxy@gmail.com>
Date: Wed, 5 Jan 2022 11:59:58 +0800
Subject: [PATCH 10/19] fix and improve docs

---
 CHANGELOG.rst           |  3 +--
 docs/intro/tutorial.rst | 35 +++++++++++++++++++++++++++--------
 docs/overrides.rst      | 30 ++++++++++++++++++------------
 3 files changed, 46 insertions(+), 22 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index b4513b1b..ef7a3f7e 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -14,8 +14,7 @@ TBR:
     * Deprecated ``PerDomainOverridesRegistry`` in lieu of the newer
       ``OverridesRegistry`` which provides a wide variety of features
       for better URL matching.
-    * This resuls in a newer ``SCRAPY_POET_OVERRIDES`` which follows
-      a different format.
+    * This results in a newer format in the ``SCRAPY_POET_OVERRIDES`` setting.
 
 * removed support for Python 3.6
 * added support for Python 3.10
diff --git a/docs/intro/tutorial.rst b/docs/intro/tutorial.rst
index 33af5f4c..c799af69 100644
--- a/docs/intro/tutorial.rst
+++ b/docs/intro/tutorial.rst
@@ -9,7 +9,7 @@ system. If that’s not the case, see :ref:`intro-install`.
 
 .. note::
 
-    This tutorial can be followed without reading `web-poet docs`_, but
+    This tutorial can be followed without reading `web-poet`_ docs, but
     for a better understanding it is highly recommended to check them first.
 
 
@@ -26,7 +26,7 @@ This tutorial will walk you through these tasks:
 If you're not already familiar with Scrapy, and want to learn it quickly,
 the `Scrapy Tutorial`_ is a good resource.
 
-.. _web-poet docs: https://web-poet.readthedocs.io/en/stable/
+.. _web-poet: https://web-poet.readthedocs.io/en/stable/
 
 Creating a spider
 =================
@@ -125,7 +125,7 @@ To use ``scrapy-poet``, enable its downloader middleware in ``settings.py``:
 ``BookPage`` class we created previously can be used without ``scrapy-poet``,
 and even without Scrapy (note that imports were from ``web_poet`` so far).
 
-``scrapy-poet`` makes it easy to use ``web-poet`` Page Objects
+``scrapy-poet`` makes it easy to use `web-poet`_ Page Objects
 (such as BookPage) in Scrapy spiders.
 
 Changing spider
@@ -427,7 +427,7 @@ for ``SCRAPY_POET_OVERRIDES`` would be the following:
     from url_matcher import Patterns
     from web_poet.overrides import OverrideRule
 
-    SCRAPY_POET_PROVIDERS = [
+    SCRAPY_POET_OVERRIDES = [
         OverrideRule(for_patterns=Patterns(["toscrape.com"]), use=BTSBookListPage, instead_of=BookListPage),
         OverrideRule(for_patterns=Patterns(["toscrape.com"]), use=BTSBookPage, instead_of=BookPage),
         OverrideRule(for_patterns=Patterns(["bookpage.com"]), use=BPBookListPage, instead_of=BookListPage),
@@ -437,8 +437,27 @@ for ``SCRAPY_POET_OVERRIDES`` would be the following:
 As you can see, this could get verbose. The earlier tuple config simply offers
 a shortcut to be more concise.
 
-Also see the `url-matcher <https://url-matcher.readthedocs.io/en/stable/>`_
-documentation for more information about the patterns syntax.
+.. note::
+
+    Also see the `url-matcher <https://url-matcher.readthedocs.io/en/stable/>`_
+    documentation for more information about the patterns syntax.
+
+Manually defining overrides like this would be inconvenient, especially
+for larger projects. Fortunately, `web-poet`_ provides the
+``@web_poet.handle_urls`` decorator for annotating Page Objects, which defines
+and stores the ``OverrideRule`` for you. All of the override rules can then be
+simply read as:
+
+.. code:: python
+
+    from web_poet import default_registry
+
+    SCRAPY_POET_OVERRIDES = default_registry.get_overrides()
+
+For more info on this, you can refer to these docs:
+
+    * :ref:`overrides` section
+    * external `web-poet`_ docs
 
 Next steps
 ==========
@@ -446,7 +465,7 @@ Next steps
 Now that you know how ``scrapy-poet`` is supposed to work, what about trying to
 apply it to an existing or new Scrapy project?
 
-Also, please check :ref:`overrides`, :ref:`providers` and refer to spiders in the "example"
-folder: https://github.com/scrapinghub/scrapy-poet/tree/master/example/example/spiders
+Also, please check the :ref:`overrides` and :ref:`providers` sections as well as
+refer to spiders in the "example" folder: https://github.com/scrapinghub/scrapy-poet/tree/master/example/example/spiders
 
 .. _Scrapy Tutorial: https://docs.scrapy.org/en/latest/intro/tutorial.html
diff --git a/docs/overrides.rst b/docs/overrides.rst
index 3c16f8cd..59cd6b34 100644
--- a/docs/overrides.rst
+++ b/docs/overrides.rst
@@ -89,7 +89,6 @@ is only applied for book pages from ``books.toscrape.com``:
 
 .. code-block:: python
 
-
     SCRAPY_POET_OVERRIDES = [
         OverrideRule(
             for_patterns=Patterns(
@@ -112,7 +111,7 @@ Decorate Page Objects with the rules
 Having the rules along with the Page Objects is a good idea,
 as you can identify with a single sight what the Page Object is doing
 along with where it is applied. This can be done by decorating the
-Page Objects with ``@handle_urls`` provided by ``web-poet``.
+Page Objects with ``@handle_urls`` provided by `web-poet`_.
 
 Let's see an example:
 
@@ -123,18 +122,18 @@ Let's see an example:
     @handle_urls("toscrape.com", BookPage)
     class BTSBookPage(BookPage):
 
-    def to_item(self):
-        return {
-            'url': self.url,
-            'name': self.css("title::text").get(),
-        }
+        def to_item(self):
+            return {
+                'url': self.url,
+                'name': self.css("title::text").get(),
+            }
 
 The ``@handle_urls`` decorator in this case is indicating that
 the class ``BSTBookPage`` should be used instead of ``BookPage``
 for the domain ``toscrape.com``.
 
 In order to configure the ``scrapy-poet`` overrides automatically
-using these annotations, you can directly interact with ``web-poet``'s
+using these annotations, you can directly interact with `web-poet`_'s
 default registry.
 
 For example:
@@ -143,15 +142,22 @@ For example:
 
     from web_poet import default_registry
 
+    # To get all of the Override Rules that were declared via annotations.
+    SCRAPY_POET_OVERRIDES = default_registry.get_overrides()
+
+    # Or, you could even extract the rules on a specific subpackage or module.
     SCRAPY_POET_OVERRIDES = default_registry.get_overrides_from_module("my_page_objects_module")
 
-The function will collect all the ``@handle_urls`` annotations from the
-``my_page_objects_module`` and submodules, and will convert them
-to rules ready to be used with ``SCRAPY_POET_OVERRIDES``.
+The ``get_overrides()`` and ``get_overrides_from_module()`` methods of the
+``default_registry`` above returns ``List[OverrideRule]`` that were declared
+using `web-poet`_'s ``@handle_urls()`` annotation. This is much more convenient
+that manually defining all of the ``OverrideRule``. Take note that since
+``SCRAPY_POET_OVERRIDES`` is structured as ``List[OverrideRule]``, you can easily
+modify it later on if needed.
 
 .. note::
 
-    For more info and advanced features of ``web-poet``'s ``@handle_urls``
+    For more info and advanced features of `web-poet`_'s ``@handle_urls``
     and its registry, kindly read the `web-poet <https://web-poet.readthedocs.io>`_
     documentation regarding Overrides.
 

From 2d0c3bc5f8f21af766dd2332d9987a8e24a41654 Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal <kevinoxy@gmail.com>
Date: Fri, 7 Jan 2022 20:36:08 +0800
Subject: [PATCH 11/19] update docs to reflect new changes from web-poet

---
 docs/intro/tutorial.rst  |  7 ++++++-
 docs/overrides.rst       | 13 ++++++++++---
 scrapy_poet/overrides.py | 13 ++++++++++---
 3 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/docs/intro/tutorial.rst b/docs/intro/tutorial.rst
index c799af69..136aa7ba 100644
--- a/docs/intro/tutorial.rst
+++ b/docs/intro/tutorial.rst
@@ -450,7 +450,12 @@ simply read as:
 
 .. code:: python
 
-    from web_poet import default_registry
+    from web_poet import default_registry, consume_modules
+
+    # The consume_modules() must be called first if you need to load
+    # rules from other packages. Otherwise, it can be omitted.
+    # More info about this caveat on web-poet docs.
+    consume_modules("external_package_A.po", "another_ext_package.lib")
 
     SCRAPY_POET_OVERRIDES = default_registry.get_overrides()
 
diff --git a/docs/overrides.rst b/docs/overrides.rst
index 59cd6b34..690d3c56 100644
--- a/docs/overrides.rst
+++ b/docs/overrides.rst
@@ -140,15 +140,22 @@ For example:
 
 .. code-block:: python
 
-    from web_poet import default_registry
+    from web_poet import default_registry, consume_modules
+
+    # The consume_modules() must be called first if you need to load
+    # rules from other packages. Otherwise, it can be omitted.
+    # More info about this caveat on web-poet docs.
+    consume_modules("external_package_A.po", "another_ext_package.lib")
 
     # To get all of the Override Rules that were declared via annotations.
     SCRAPY_POET_OVERRIDES = default_registry.get_overrides()
 
     # Or, you could even extract the rules on a specific subpackage or module.
-    SCRAPY_POET_OVERRIDES = default_registry.get_overrides_from_module("my_page_objects_module")
+    SCRAPY_POET_OVERRIDES = default_registry.get_overrides_from(
+        "external_page_objects_package", "another_page_object_package.module_1"
+    )
 
-The ``get_overrides()`` and ``get_overrides_from_module()`` methods of the
+The ``get_overrides()`` and ``get_overrides_from()`` methods of the
 ``default_registry`` above returns ``List[OverrideRule]`` that were declared
 using `web-poet`_'s ``@handle_urls()`` annotation. This is much more convenient
 that manually defining all of the ``OverrideRule``. Take note that since
diff --git a/scrapy_poet/overrides.py b/scrapy_poet/overrides.py
index d792fe91..732b89db 100644
--- a/scrapy_poet/overrides.py
+++ b/scrapy_poet/overrides.py
@@ -63,18 +63,25 @@ class OverridesRegistry(OverridesRegistryBase):
 
         from web_poet import default_registry
 
-        SCRAPY_POET_OVERRIDES = default_registry.get_overrides_from_module("my_page_objects_module")
+        SCRAPY_POET_OVERRIDES = default_registry.get_overrides_from("my_page_objects_module")
 
     It finds all the rules annotated using ``web-poet``'s ``@handle_urls``
     decorator inside the ``my_page_objects_module`` module and all of its
     submodules.
 
     However, for most cases, you'd most likely going to simply retrieve all of
-    the override rules that were ever declared on a given registry. Thus, you
-    could simply do:
+    the override rules that were ever declared on a given registry. Though make
+    sure to call ``consume_module()`` beforehand:
 
     .. code-block:: python
 
+        from web_poet import default_registry, consume_modules
+
+        # The consume_modules() must be called first if you need to load
+        # rules from other packages. Otherwise, it can be omitted.
+        # More info about this caveat on web-poet docs.
+        consume_modules("external_package_A.po", "another_ext_package.lib")
+
         SCRAPY_POET_OVERRIDES = default_registry.get_overrides()
 
     More info on this at `web-poet <https://web-poet.readthedocs.io>`_.

From ce2392324e5eb126290f333b557918bf863bb569 Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal <kevinoxy@gmail.com>
Date: Wed, 12 Jan 2022 15:29:39 +0800
Subject: [PATCH 12/19] update docs with respect to new Override Rules
 interface from web-poet

---
 docs/intro/tutorial.rst  |  6 +++++-
 docs/overrides.rst       | 20 ++++++++++++--------
 scrapy_poet/overrides.py |  8 ++++++--
 3 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/docs/intro/tutorial.rst b/docs/intro/tutorial.rst
index 136aa7ba..acf20cd7 100644
--- a/docs/intro/tutorial.rst
+++ b/docs/intro/tutorial.rst
@@ -456,9 +456,13 @@ simply read as:
     # rules from other packages. Otherwise, it can be omitted.
     # More info about this caveat on web-poet docs.
     consume_modules("external_package_A.po", "another_ext_package.lib")
-
     SCRAPY_POET_OVERRIDES = default_registry.get_overrides()
 
+    # The two lines above could be mixed together via this shortcut:
+    SCRAPY_POET_OVERRIDES = default_registry.get_overrides(
+        consume=["external_package_A.po", "another_ext_package.lib"]
+    )
+
 For more info on this, you can refer to these docs:
 
     * :ref:`overrides` section
diff --git a/docs/overrides.rst b/docs/overrides.rst
index 690d3c56..1cbf4b19 100644
--- a/docs/overrides.rst
+++ b/docs/overrides.rst
@@ -150,17 +150,21 @@ For example:
     # To get all of the Override Rules that were declared via annotations.
     SCRAPY_POET_OVERRIDES = default_registry.get_overrides()
 
+    # The two lines above could be mixed together via this shortcut:
+    SCRAPY_POET_OVERRIDES = default_registry.get_overrides(
+        consume=["external_package_A.po", "another_ext_package.lib"]
+    )
+
     # Or, you could even extract the rules on a specific subpackage or module.
-    SCRAPY_POET_OVERRIDES = default_registry.get_overrides_from(
-        "external_page_objects_package", "another_page_object_package.module_1"
+    SCRAPY_POET_OVERRIDES = default_registry.get_overrides(
+        filters=["external_page_objects_package", "another_page_object_package.module_1"]
     )
 
-The ``get_overrides()`` and ``get_overrides_from()`` methods of the
-``default_registry`` above returns ``List[OverrideRule]`` that were declared
-using `web-poet`_'s ``@handle_urls()`` annotation. This is much more convenient
-that manually defining all of the ``OverrideRule``. Take note that since
-``SCRAPY_POET_OVERRIDES`` is structured as ``List[OverrideRule]``, you can easily
-modify it later on if needed.
+The ``get_overrides()`` method of the ``default_registry`` above returns
+``List[OverrideRule]`` that were declared using `web-poet`_'s ``@handle_urls()``
+annotation. This is much more convenient that manually defining all of the 
+`OverrideRule``. Take note that since ``SCRAPY_POET_OVERRIDES`` is structured as
+``List[OverrideRule]``, you can easily modify it later on if needed.
 
 .. note::
 
diff --git a/scrapy_poet/overrides.py b/scrapy_poet/overrides.py
index 732b89db..eaa04c81 100644
--- a/scrapy_poet/overrides.py
+++ b/scrapy_poet/overrides.py
@@ -63,7 +63,7 @@ class OverridesRegistry(OverridesRegistryBase):
 
         from web_poet import default_registry
 
-        SCRAPY_POET_OVERRIDES = default_registry.get_overrides_from("my_page_objects_module")
+        SCRAPY_POET_OVERRIDES = default_registry.get_overrides(filters="my_page_objects_module")
 
     It finds all the rules annotated using ``web-poet``'s ``@handle_urls``
     decorator inside the ``my_page_objects_module`` module and all of its
@@ -81,9 +81,13 @@ class OverridesRegistry(OverridesRegistryBase):
         # rules from other packages. Otherwise, it can be omitted.
         # More info about this caveat on web-poet docs.
         consume_modules("external_package_A.po", "another_ext_package.lib")
-
         SCRAPY_POET_OVERRIDES = default_registry.get_overrides()
 
+        # The two lines above could be mixed together via this shortcut:
+        SCRAPY_POET_OVERRIDES = default_registry.get_overrides(
+            consume=["external_package_A.po", "another_ext_package.lib"]
+        )
+
     More info on this at `web-poet <https://web-poet.readthedocs.io>`_.
     """
 

From 0c94cf6d258642c654624192e7bb86308f97ff3e Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal <kevinoxy@gmail.com>
Date: Thu, 13 Jan 2022 19:00:20 +0800
Subject: [PATCH 13/19] update docs to reflect web-poet's new 'registry_pool'

---
 docs/intro/tutorial.rst  |  4 ++--
 docs/overrides.rst       | 34 +++++++++++++++++++++++++++++++---
 tests/test_middleware.py |  5 ++++-
 3 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/docs/intro/tutorial.rst b/docs/intro/tutorial.rst
index acf20cd7..ad735380 100644
--- a/docs/intro/tutorial.rst
+++ b/docs/intro/tutorial.rst
@@ -455,12 +455,12 @@ simply read as:
     # The consume_modules() must be called first if you need to load
     # rules from other packages. Otherwise, it can be omitted.
     # More info about this caveat on web-poet docs.
-    consume_modules("external_package_A.po", "another_ext_package.lib")
+    consume_modules("external_package_A", "another_ext_package.lib")
     SCRAPY_POET_OVERRIDES = default_registry.get_overrides()
 
     # The two lines above could be mixed together via this shortcut:
     SCRAPY_POET_OVERRIDES = default_registry.get_overrides(
-        consume=["external_package_A.po", "another_ext_package.lib"]
+        consume=["external_package_A", "another_ext_package.lib"]
     )
 
 For more info on this, you can refer to these docs:
diff --git a/docs/overrides.rst b/docs/overrides.rst
index 1cbf4b19..deed2a79 100644
--- a/docs/overrides.rst
+++ b/docs/overrides.rst
@@ -134,7 +134,7 @@ for the domain ``toscrape.com``.
 
 In order to configure the ``scrapy-poet`` overrides automatically
 using these annotations, you can directly interact with `web-poet`_'s
-default registry.
+``default_registry``.
 
 For example:
 
@@ -145,14 +145,14 @@ For example:
     # The consume_modules() must be called first if you need to load
     # rules from other packages. Otherwise, it can be omitted.
     # More info about this caveat on web-poet docs.
-    consume_modules("external_package_A.po", "another_ext_package.lib")
+    consume_modules("external_package_A", "another_ext_package.lib")
 
     # To get all of the Override Rules that were declared via annotations.
     SCRAPY_POET_OVERRIDES = default_registry.get_overrides()
 
     # The two lines above could be mixed together via this shortcut:
     SCRAPY_POET_OVERRIDES = default_registry.get_overrides(
-        consume=["external_package_A.po", "another_ext_package.lib"]
+        consume=["external_package_A", "another_ext_package.lib"]
     )
 
     # Or, you could even extract the rules on a specific subpackage or module.
@@ -172,6 +172,34 @@ annotation. This is much more convenient that manually defining all of the
     and its registry, kindly read the `web-poet <https://web-poet.readthedocs.io>`_
     documentation regarding Overrides.
 
+In case the external packages you're using does not use `web-poet`_'s
+``default_registry``, you can find and collect custom registries via `web-poet`_'s
+``registry_pool``:
+
+.. code-block:: python
+
+    from web_poet import registry_pool, consume_modules
+
+    # Ensures that the external dependencies are properly imported so that the
+    # Registry and its accompanying rules can be discovered.
+    consume_modules("external_package_A", "another_ext_package_B.lib")
+
+    print(registry_pool)
+    # {
+    #     'default': <web_poet.overrides.PageObjectRegistry object at 0x7f47d654d8b0>,
+    #     'custom_reg' = <external_package_A.PageObjectRegistry object at 0x7f47d654382a>,
+    #     'another_custom_reg' = <another_ext_package_B.lib.PageObjectRegistry object at 0xd93746549dea>,
+    # }
+
+    SCRAPY_POET_OVERRIDES = [
+        rule
+        for _, registry in registry_pool.items()
+        for rule in registry.get_overrides()
+    ]
+
+    # Converting it to a set also ensures that there are no duplicate OverrideRules.
+    SCRAPY_POET_OVERRIDES = set(SCRAPY_POET_OVERRIDES)
+
 Overrides registry
 ==================
 
diff --git a/tests/test_middleware.py b/tests/test_middleware.py
index ec650e09..388fb839 100644
--- a/tests/test_middleware.py
+++ b/tests/test_middleware.py
@@ -368,7 +368,10 @@ def test_web_poet_integration(settings):
     from tests.po_lib import DOMAIN, PORT, POOverriden
 
     # Override rules are defined in `tests/po_lib/__init__.py`.
-    settings["SCRAPY_POET_OVERRIDES"] = default_registry.get_overrides()
+    rules = default_registry.get_overrides()
+
+    # Converting it to a set removes potential duplicate OverrideRules
+    settings["SCRAPY_POET_OVERRIDES"] = set(rules)
 
     item, url, _ = yield crawl_single_item(
         spider_for(POOverriden), ProductHtml, settings, port=PORT

From 1f52f3bb49d78bd82e6983a2b952db2599cf898e Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal <kevinoxy@gmail.com>
Date: Wed, 2 Mar 2022 12:03:27 +0800
Subject: [PATCH 14/19] update docs with web-poet's new MVP version and POP
 definition

---
 docs/overrides.rst       | 44 +++++++++++-----------------------------
 scrapy_poet/overrides.py | 36 ++++++++++++++++----------------
 2 files changed, 31 insertions(+), 49 deletions(-)

diff --git a/docs/overrides.rst b/docs/overrides.rst
index deed2a79..f29305f4 100644
--- a/docs/overrides.rst
+++ b/docs/overrides.rst
@@ -155,50 +155,31 @@ For example:
         consume=["external_package_A", "another_ext_package.lib"]
     )
 
-    # Or, you could even extract the rules on a specific subpackage or module.
-    SCRAPY_POET_OVERRIDES = default_registry.get_overrides(
-        filters=["external_page_objects_package", "another_page_object_package.module_1"]
-    )
-
 The ``get_overrides()`` method of the ``default_registry`` above returns
 ``List[OverrideRule]`` that were declared using `web-poet`_'s ``@handle_urls()``
 annotation. This is much more convenient that manually defining all of the 
 `OverrideRule``. Take note that since ``SCRAPY_POET_OVERRIDES`` is structured as
 ``List[OverrideRule]``, you can easily modify it later on if needed.
 
-.. note::
+.. tip::
 
-    For more info and advanced features of `web-poet`_'s ``@handle_urls``
-    and its registry, kindly read the `web-poet <https://web-poet.readthedocs.io>`_
-    documentation regarding Overrides.
+    If you're using External Packages which conform to the **POP**
+    standards as described in **web-poet's** `Page Object Projects (POP)
+    <https://web-poet.readthedocs.io/en/stable/intro/pop.html>`_ section,
+    then retrieving the rules should be as easy as:
 
-In case the external packages you're using does not use `web-poet`_'s
-``default_registry``, you can find and collect custom registries via `web-poet`_'s
-``registry_pool``:
-
-.. code-block:: python
+    .. code-block:: python
 
-    from web_poet import registry_pool, consume_modules
+        import external_package_A, another_ext_package
 
-    # Ensures that the external dependencies are properly imported so that the
-    # Registry and its accompanying rules can be discovered.
-    consume_modules("external_package_A", "another_ext_package_B.lib")
+        SCRAPY_POET_OVERRIDES = external_package_A.RULES + another_ext_package.RULES
 
-    print(registry_pool)
-    # {
-    #     'default': <web_poet.overrides.PageObjectRegistry object at 0x7f47d654d8b0>,
-    #     'custom_reg' = <external_package_A.PageObjectRegistry object at 0x7f47d654382a>,
-    #     'another_custom_reg' = <another_ext_package_B.lib.PageObjectRegistry object at 0xd93746549dea>,
-    # }
+.. note::
 
-    SCRAPY_POET_OVERRIDES = [
-        rule
-        for _, registry in registry_pool.items()
-        for rule in registry.get_overrides()
-    ]
+    For more info and advanced features of `web-poet`_'s ``@handle_urls``
+    and its registry, kindly read the `web-poet <https://web-poet.readthedocs.io>`_
+    documentation regarding Overrides.
 
-    # Converting it to a set also ensures that there are no duplicate OverrideRules.
-    SCRAPY_POET_OVERRIDES = set(SCRAPY_POET_OVERRIDES)
 
 Overrides registry
 ==================
@@ -217,4 +198,3 @@ must be a subclass of ``scrapy_poet.overrides.OverridesRegistryBase``
 and must implement the method ``overrides_for``. As other Scrapy components,
 it can be initialized from the ``from_crawler`` class method if implemented.
 This might be handy to be able to access settings, stats, request meta, etc.
-
diff --git a/scrapy_poet/overrides.py b/scrapy_poet/overrides.py
index eaa04c81..a623c064 100644
--- a/scrapy_poet/overrides.py
+++ b/scrapy_poet/overrides.py
@@ -57,21 +57,9 @@ class OverridesRegistry(OverridesRegistryBase):
 
     Now, if you've used ``web-poet``'s built-in functionality to directly create
     the override rules in the Page Object via the ``@handle_urls`` annotation,
-    you can quickly import them via:
-
-    .. code-block:: python
-
-        from web_poet import default_registry
-
-        SCRAPY_POET_OVERRIDES = default_registry.get_overrides(filters="my_page_objects_module")
-
-    It finds all the rules annotated using ``web-poet``'s ``@handle_urls``
-    decorator inside the ``my_page_objects_module`` module and all of its
-    submodules.
-
-    However, for most cases, you'd most likely going to simply retrieve all of
-    the override rules that were ever declared on a given registry. Though make
-    sure to call ``consume_module()`` beforehand:
+    you can quickly import them via the following code below. It finds all the
+    rules annotated using ``web-poet``'s ``@handle_urls`` decorator that were
+    registered into ``web_poet.default_registry``.
 
     .. code-block:: python
 
@@ -88,8 +76,22 @@ class OverridesRegistry(OverridesRegistryBase):
             consume=["external_package_A.po", "another_ext_package.lib"]
         )
 
-    More info on this at `web-poet <https://web-poet.readthedocs.io>`_.
-    """
+    Make sure to call ``consume_module()`` beforehand. More info on this at
+    `web-poet <https://web-poet.readthedocs.io>`_.
+
+    .. tip::
+
+        If you're using External Packages which conform to the **POP**
+        standards as described in **web-poet's** `Page Object Projects (POP)
+        <https://web-poet.readthedocs.io/en/stable/intro/pop.html>`_ section,
+        then retrieving the rules should be as easy as:
+
+        .. code-block:: python
+
+            import external_package_A, another_ext_package
+
+            SCRAPY_POET_OVERRIDES = external_package_A.RULES + another_ext_package.RULES
+        """
 
     @classmethod
     def from_crawler(cls, crawler: Crawler) -> Crawler:

From 10ba139a38bd4e9c14e0e26e309e48bc924c54ce Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal <kevinoxy@gmail.com>
Date: Fri, 25 Mar 2022 21:31:49 +0800
Subject: [PATCH 15/19] slight doc improvements

---
 CHANGELOG.rst            |  2 +-
 docs/intro/tutorial.rst  |  4 ++--
 docs/overrides.rst       |  7 +++++--
 scrapy_poet/overrides.py | 10 ++++++++--
 4 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 3de3ad17..ae8cde19 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -6,7 +6,7 @@ TBR
 ---
 
 * We have these **backward incompatible** changes since the
-  ``OverrideRule`` follow a different structure:
+  ``web_poet.OverrideRule`` follow a different structure:
 
     * Deprecated ``PerDomainOverridesRegistry`` in lieu of the newer
       ``OverridesRegistry`` which provides a wide variety of features
diff --git a/docs/intro/tutorial.rst b/docs/intro/tutorial.rst
index ad735380..d8e82dbe 100644
--- a/docs/intro/tutorial.rst
+++ b/docs/intro/tutorial.rst
@@ -126,7 +126,7 @@ To use ``scrapy-poet``, enable its downloader middleware in ``settings.py``:
 and even without Scrapy (note that imports were from ``web_poet`` so far).
 
 ``scrapy-poet`` makes it easy to use `web-poet`_ Page Objects
-(such as BookPage) in Scrapy spiders.
+(such as ``BookPage``) in Scrapy spiders.
 
 Changing spider
 ===============
@@ -418,7 +418,7 @@ For example, the pattern ``books.toscrape.com/cataloge/category/``
 is accepted and it would restrict the override only to category pages.
 
 It is even possible to configure more complex patterns by
-using the ``OverrideRule`` class instead of a triplet in
+using the ``web_poet.OverrideRule`` class instead of a triplet in
 the configuration. Another way of declaring the earlier config
 for ``SCRAPY_POET_OVERRIDES`` would be the following:
 
diff --git a/docs/overrides.rst b/docs/overrides.rst
index f29305f4..31d87c35 100644
--- a/docs/overrides.rst
+++ b/docs/overrides.rst
@@ -158,7 +158,7 @@ For example:
 The ``get_overrides()`` method of the ``default_registry`` above returns
 ``List[OverrideRule]`` that were declared using `web-poet`_'s ``@handle_urls()``
 annotation. This is much more convenient that manually defining all of the 
-`OverrideRule``. Take note that since ``SCRAPY_POET_OVERRIDES`` is structured as
+``OverrideRule``. Take note that since ``SCRAPY_POET_OVERRIDES`` is structured as
 ``List[OverrideRule]``, you can easily modify it later on if needed.
 
 .. tip::
@@ -172,7 +172,10 @@ annotation. This is much more convenient that manually defining all of the
 
         import external_package_A, another_ext_package
 
-        SCRAPY_POET_OVERRIDES = external_package_A.RULES + another_ext_package.RULES
+        SCRAPY_POET_OVERRIDES = (
+            external_package_A.REGISTRY.get_overrides()
+            + another_ext_package.REGISTRY.get_overrides()
+        )
 
 .. note::
 
diff --git a/scrapy_poet/overrides.py b/scrapy_poet/overrides.py
index a623c064..fa254c52 100644
--- a/scrapy_poet/overrides.py
+++ b/scrapy_poet/overrides.py
@@ -47,7 +47,10 @@ class OverridesRegistry(OverridesRegistryBase):
     .. code-block:: python
 
         SCRAPY_POET_OVERRIDES = [
+            # Option 1
             ("books.toscrape.com", ISBNBookPage, BookPage),
+
+            # Option 2
             OverrideRule(
                 for_patterns=Patterns(["books.toscrape.com"]),
                 use=MyBookListPage,
@@ -90,8 +93,11 @@ class OverridesRegistry(OverridesRegistryBase):
 
             import external_package_A, another_ext_package
 
-            SCRAPY_POET_OVERRIDES = external_package_A.RULES + another_ext_package.RULES
-        """
+            SCRAPY_POET_OVERRIDES = (
+                external_package_A.REGISTRY.get_overrides()
+                + another_ext_package.REGISTRY.get_overrides()
+            )
+    """
 
     @classmethod
     def from_crawler(cls, crawler: Crawler) -> Crawler:

From da93452cdbbfaf8c1281ad23321a2b7e90f1cf2c Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal <kevinoxy@gmail.com>
Date: Mon, 2 May 2022 19:24:58 +0800
Subject: [PATCH 16/19] improve docs after web-poet PR#27 has been merged

---
 docs/conf.py             |  2 +-
 docs/intro/tutorial.rst  | 28 ++++++++--------
 docs/overrides.rst       | 70 ++++++++++++++++++----------------------
 scrapy_poet/overrides.py | 49 +++++++++++-----------------
 4 files changed, 63 insertions(+), 86 deletions(-)

diff --git a/docs/conf.py b/docs/conf.py
index 9aa8ca4d..2e205d04 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -188,7 +188,7 @@
 intersphinx_mapping = {
     'python': ('https://docs.python.org/3', None, ),
     'scrapy': ('https://docs.scrapy.org/en/latest', None, ),
-    'web-poet': ('https://web-poet.readthedocs.io/en/stable/', None),
+    'web-poet': ('https://web-poet.readthedocs.io/en/latest/', None),
     'url-matcher': ('https://url-matcher.readthedocs.io/en/stable/', None),
 }
 
diff --git a/docs/intro/tutorial.rst b/docs/intro/tutorial.rst
index d8e82dbe..5481c895 100644
--- a/docs/intro/tutorial.rst
+++ b/docs/intro/tutorial.rst
@@ -417,15 +417,16 @@ for a particular domain, but more complex URL patterns are also possible.
 For example, the pattern ``books.toscrape.com/cataloge/category/``
 is accepted and it would restrict the override only to category pages.
 
-It is even possible to configure more complex patterns by
-using the ``web_poet.OverrideRule`` class instead of a triplet in
+It is even possible to configure more complex patterns by using the
+:py:class:`web_poet.overrides.OverrideRule` class instead of a triplet in
 the configuration. Another way of declaring the earlier config
 for ``SCRAPY_POET_OVERRIDES`` would be the following:
 
 .. code-block:: python
 
     from url_matcher import Patterns
-    from web_poet.overrides import OverrideRule
+    from web_poet import OverrideRule
+
 
     SCRAPY_POET_OVERRIDES = [
         OverrideRule(for_patterns=Patterns(["toscrape.com"]), use=BTSBookListPage, instead_of=BookListPage),
@@ -443,30 +444,27 @@ a shortcut to be more concise.
     documentation for more information about the patterns syntax.
 
 Manually defining overrides like this would be inconvenient, most
-especially for larger projects. Fortunately, `web-poet`_ has a cool feature
-to annotate Page Objects like ``@web_poet.handle_urls`` that would define and
-store the ``OverrideRule`` for you. All of the Override rules could then be
-simply read as:
+especially for larger projects. Fortunately, `web-poet`_ has a cool feature to
+annotate Page Objects like :py:func:`web_poet.handle_urls` that would define
+and store the :py:class:`web_poet.overrides.OverrideRule` for you. All of the
+:py:class:`web_poet.overrides.OverrideRule` rules could then be simply read as:
 
 .. code:: python
 
     from web_poet import default_registry, consume_modules
 
-    # The consume_modules() must be called first if you need to load
+    # The consume_modules() must be called first if you need to properly import
     # rules from other packages. Otherwise, it can be omitted.
     # More info about this caveat on web-poet docs.
     consume_modules("external_package_A", "another_ext_package.lib")
     SCRAPY_POET_OVERRIDES = default_registry.get_overrides()
 
-    # The two lines above could be mixed together via this shortcut:
-    SCRAPY_POET_OVERRIDES = default_registry.get_overrides(
-        consume=["external_package_A", "another_ext_package.lib"]
-    )
-
 For more info on this, you can refer to these docs:
 
-    * :ref:`overrides` section
-    * external `web-poet`_ docs
+    * ``scrapy-poet``'s :ref:`overrides` Tutorial section.
+    * External `web-poet`_ docs.
+
+        * Specifically, the :external:ref:`intro-overrides` Tutorial section.
 
 Next steps
 ==========
diff --git a/docs/overrides.rst b/docs/overrides.rst
index 31d87c35..3e9c7e4d 100644
--- a/docs/overrides.rst
+++ b/docs/overrides.rst
@@ -51,7 +51,7 @@ And then override it for a particular domain using ``settings.py``:
         ("example.com", ISBNBookPage, BookPage)
     ]
 
-This new Page Objects gets the original ``BookPage`` as dependency and enrich
+This new Page Object gets the original ``BookPage`` as a dependency and enriches
 the obtained item with the ISBN from the page HTML.
 
 .. note::
@@ -82,13 +82,16 @@ Overrides rules
 ===============
 
 The default way of configuring the override rules is using triplets
-of the form (``url pattern``, ``override_type``, ``overridden_type``). But
-more complex rules can be introduced if the class ``OverrideRule``
-is used. The following example configures an override that
-is only applied for book pages from ``books.toscrape.com``:
+of the form (``url pattern``, ``override_type``, ``overridden_type``). But more
+complex rules can be introduced if the class :py:class:`web_poet.overrides.OverrideRule`
+is used. The following example configures an override that is only applied for
+book pages from ``books.toscrape.com``:
 
 .. code-block:: python
 
+    from web_poet import OverrideRule
+
+
     SCRAPY_POET_OVERRIDES = [
         OverrideRule(
             for_patterns=Patterns(
@@ -111,7 +114,12 @@ Decorate Page Objects with the rules
 Having the rules along with the Page Objects is a good idea,
 as you can identify with a single sight what the Page Object is doing
 along with where it is applied. This can be done by decorating the
-Page Objects with ``@handle_urls`` provided by `web-poet`_.
+Page Objects with :py:func:`web_poet.handle_urls` provided by `web-poet`_.
+
+.. tip::
+    Make sure to read the :external:ref:`intro-overrides` Tutorial section of
+    `web-poet`_ to learn about its other functionality that is not covered
+    in this section.
 
 Let's see an example:
 
@@ -119,6 +127,7 @@ Let's see an example:
 
     from web_poet import handle_urls
 
+
     @handle_urls("toscrape.com", BookPage)
     class BTSBookPage(BookPage):
 
@@ -128,13 +137,13 @@ Let's see an example:
                 'name': self.css("title::text").get(),
             }
 
-The ``@handle_urls`` decorator in this case is indicating that
+The :py:func:`web_poet.handle_urls` decorator in this case is indicating that
 the class ``BSTBookPage`` should be used instead of ``BookPage``
 for the domain ``toscrape.com``.
 
 In order to configure the ``scrapy-poet`` overrides automatically
 using these annotations, you can directly interact with `web-poet`_'s
-``default_registry``.
+``default_registry`` (an instance of :py:class:`web_poet.overrides.PageObjectRegistry`).
 
 For example:
 
@@ -142,7 +151,7 @@ For example:
 
     from web_poet import default_registry, consume_modules
 
-    # The consume_modules() must be called first if you need to load
+    # The consume_modules() must be called first if you need to properly import
     # rules from other packages. Otherwise, it can be omitted.
     # More info about this caveat on web-poet docs.
     consume_modules("external_package_A", "another_ext_package.lib")
@@ -150,38 +159,20 @@ For example:
     # To get all of the Override Rules that were declared via annotations.
     SCRAPY_POET_OVERRIDES = default_registry.get_overrides()
 
-    # The two lines above could be mixed together via this shortcut:
-    SCRAPY_POET_OVERRIDES = default_registry.get_overrides(
-        consume=["external_package_A", "another_ext_package.lib"]
-    )
+The :py:meth:`web_poet.overrides.PageObjectRegistry.get_overrides` method of the
+``default_registry`` above returns ``List[OverrideRule]`` that were declared
+using `web-poet`_'s :py:func:`web_poet.handle_urls` annotation. This is much
+more convenient than manually defining all of the :py:class:`web_poet.overrides.OverrideRule`.
 
-The ``get_overrides()`` method of the ``default_registry`` above returns
-``List[OverrideRule]`` that were declared using `web-poet`_'s ``@handle_urls()``
-annotation. This is much more convenient that manually defining all of the 
-``OverrideRule``. Take note that since ``SCRAPY_POET_OVERRIDES`` is structured as
+Take note that since ``SCRAPY_POET_OVERRIDES`` is structured as
 ``List[OverrideRule]``, you can easily modify it later on if needed.
 
-.. tip::
-
-    If you're using External Packages which conform to the **POP**
-    standards as described in **web-poet's** `Page Object Projects (POP)
-    <https://web-poet.readthedocs.io/en/stable/intro/pop.html>`_ section,
-    then retrieving the rules should be as easy as:
-
-    .. code-block:: python
-
-        import external_package_A, another_ext_package
-
-        SCRAPY_POET_OVERRIDES = (
-            external_package_A.REGISTRY.get_overrides()
-            + another_ext_package.REGISTRY.get_overrides()
-        )
-
 .. note::
 
-    For more info and advanced features of `web-poet`_'s ``@handle_urls``
+    For more info and advanced features of `web-poet`_'s :py:func:`web_poet.handle_urls`
     and its registry, kindly read the `web-poet <https://web-poet.readthedocs.io>`_
-    documentation regarding Overrides.
+    documentation, specifically its :external:ref:`intro-overrides` tutorial
+    section.
 
 
 Overrides registry
@@ -197,7 +188,8 @@ example.
 But the registry implementation can be changed at convenience. A different
 registry implementation can be configured using the property
 ``SCRAPY_POET_OVERRIDES_REGISTRY`` in ``settings.py``. The new registry
-must be a subclass of ``scrapy_poet.overrides.OverridesRegistryBase``
-and must implement the method ``overrides_for``. As other Scrapy components,
-it can be initialized from the ``from_crawler`` class method if implemented.
-This might be handy to be able to access settings, stats, request meta, etc.
+must be a subclass of :class:`scrapy_poet.overrides.OverridesRegistryBase` and
+must implement the method :meth:`scrapy_poet.overrides.OverridesRegistryBase.overrides_for`.
+Like other Scrapy components, it can be initialized from the ``from_crawler`` class
+method if implemented. This might be handy to be able to access settings, stats,
+request meta, etc.
diff --git a/scrapy_poet/overrides.py b/scrapy_poet/overrides.py
index fa254c52..a5e330d3 100644
--- a/scrapy_poet/overrides.py
+++ b/scrapy_poet/overrides.py
@@ -31,7 +31,7 @@ class OverridesRegistry(OverridesRegistryBase):
     """
     Overrides registry that reads the overrides from the ``SCRAPY_POET_OVERRIDES``
     in the spider settings. It is a list and each rule can be a tuple or an
-    instance of the class ``OverrideRule``.
+    instance of the class :py:class:`web_poet.overrides.OverrideRule`.
 
     If a tuple is provided:
 
@@ -46,6 +46,10 @@ class OverridesRegistry(OverridesRegistryBase):
 
     .. code-block:: python
 
+        from url_matcher import Patterns
+        from web_poet.overrides import OverrideRule
+
+
         SCRAPY_POET_OVERRIDES = [
             # Option 1
             ("books.toscrape.com", ISBNBookPage, BookPage),
@@ -58,45 +62,28 @@ class OverridesRegistry(OverridesRegistryBase):
             ),
         ]
 
-    Now, if you've used ``web-poet``'s built-in functionality to directly create
-    the override rules in the Page Object via the ``@handle_urls`` annotation,
-    you can quickly import them via the following code below. It finds all the
-    rules annotated using ``web-poet``'s ``@handle_urls`` decorator that were
-    registered into ``web_poet.default_registry``.
+    .. _web-poet: https://web-poet.readthedocs.io
+
+    Now, if you've used web-poet_'s built-in functionality to directly create
+    the :py:class:`web_poet.overrides.OverrideRule` in the Page Object via the
+    :py:func:`web_poet.handle_urls` annotation, you can quickly import them via
+    the code below. It finds all the rules annotated using web-poet_'s
+    :py:func:`web_poet.handle_urls` as a decorator that were registered into
+    ``web_poet.default_registry`` (an instance of
+    :py:class:`web_poet.overrides.PageObjectRegistry`).
 
     .. code-block:: python
 
         from web_poet import default_registry, consume_modules
 
-        # The consume_modules() must be called first if you need to load
-        # rules from other packages. Otherwise, it can be omitted.
+        # The consume_modules() must be called first if you need to properly
+        # import rules from other packages. Otherwise, it can be omitted.
         # More info about this caveat on web-poet docs.
         consume_modules("external_package_A.po", "another_ext_package.lib")
         SCRAPY_POET_OVERRIDES = default_registry.get_overrides()
 
-        # The two lines above could be mixed together via this shortcut:
-        SCRAPY_POET_OVERRIDES = default_registry.get_overrides(
-            consume=["external_package_A.po", "another_ext_package.lib"]
-        )
-
-    Make sure to call ``consume_module()`` beforehand. More info on this at
-    `web-poet <https://web-poet.readthedocs.io>`_.
-
-    .. tip::
-
-        If you're using External Packages which conform to the **POP**
-        standards as described in **web-poet's** `Page Object Projects (POP)
-        <https://web-poet.readthedocs.io/en/stable/intro/pop.html>`_ section,
-        then retrieving the rules should be as easy as:
-
-        .. code-block:: python
-
-            import external_package_A, another_ext_package
-
-            SCRAPY_POET_OVERRIDES = (
-                external_package_A.REGISTRY.get_overrides()
-                + another_ext_package.REGISTRY.get_overrides()
-            )
+    Make sure to call :py:func:`web_poet.overrides.consume_modules` beforehand.
+    More info on this at web-poet_.
     """
 
     @classmethod

From dd2a302dbcbfd4cd4463bd76438e93b90a9bc2b3 Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal <kevinoxy@gmail.com>
Date: Mon, 16 May 2022 13:44:30 +0800
Subject: [PATCH 17/19] update imports after web_poet refactoring

---
 tests/po_lib/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/po_lib/__init__.py b/tests/po_lib/__init__.py
index 535de873..287bd7ea 100644
--- a/tests/po_lib/__init__.py
+++ b/tests/po_lib/__init__.py
@@ -21,7 +21,7 @@ def to_item(self):
         return {"msg": "PO that will be replace"}
 
 
-@handle_urls(f"{DOMAIN}:{PORT}", POOverriden)
+@handle_urls(f"{DOMAIN}:{PORT}", overrides=POOverriden)
 class POIntegration(ItemWebPage):
     def to_item(self):
         return {"msg": "PO replacement"}

From 05881057934c126a26a1232f981fd9928ac85f91 Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal <kevinoxy@gmail.com>
Date: Thu, 19 May 2022 13:12:32 +0800
Subject: [PATCH 18/19] fix return type annotation of get_scrapy_data_path()

---
 scrapy_poet/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scrapy_poet/utils.py b/scrapy_poet/utils.py
index 7564b27c..80a7d715 100644
--- a/scrapy_poet/utils.py
+++ b/scrapy_poet/utils.py
@@ -1,11 +1,11 @@
 import os
-from pathlib import PosixPath
 
 from scrapy.utils.project import project_data_dir, inside_project
 
 
-def get_scrapy_data_path(createdir: bool = True, default_dir: str = ".scrapy") -> PosixPath:
+def get_scrapy_data_path(createdir: bool = True, default_dir: str = ".scrapy") -> str:
     """Return a path to a folder where Scrapy is storing data.
+
     Usually that's a .scrapy folder inside the project.
     """
     # This code is extracted from scrapy.utils.project.data_path function,

From 0bc51b8670d75a88c03edc033d3d7b8b48b0cd20 Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal <kevinoxy@gmail.com>
Date: Thu, 19 May 2022 13:43:44 +0800
Subject: [PATCH 19/19] add override examples using @handle_urls

---
 docs/overrides.rst                            | 12 +++
 .../example/spiders/books_04_overrides_03.py  | 76 +++++++++++++++++++
 2 files changed, 88 insertions(+)
 create mode 100644 example/example/spiders/books_04_overrides_03.py

diff --git a/docs/overrides.rst b/docs/overrides.rst
index b0aabd6d..3ceb3d39 100644
--- a/docs/overrides.rst
+++ b/docs/overrides.rst
@@ -8,6 +8,18 @@ on the request URL domain. Please have a look to :ref:`intro-tutorial` to
 learn the basics about overrides before digging deeper in the content of this
 page.
 
+.. tip::
+
+    Some real-world examples on this topic can be found in:
+
+    - `Example 1 <https://github.com/scrapinghub/scrapy-poet/blob/master/example/example/spiders/books_04_overrides_01.py>`_:
+      rules using tuples
+    - `Example 2 <https://github.com/scrapinghub/scrapy-poet/blob/master/example/example/spiders/books_04_overrides_02.py>`_:
+      rules using tuples and :py:class:`web_poet.overrides.OverrideRule`
+    - `Example 3 <https://github.com/scrapinghub/scrapy-poet/blob/master/example/example/spiders/books_04_overrides_03.py>`_:
+      rules using :py:func:`web_poet.handle_urls` decorator and retrieving them
+      via :py:meth:`web_poet.overrides.PageObjectRegistry.get_overrides`
+
 Page Objects refinement
 =======================
 
diff --git a/example/example/spiders/books_04_overrides_03.py b/example/example/spiders/books_04_overrides_03.py
new file mode 100644
index 00000000..f25fff07
--- /dev/null
+++ b/example/example/spiders/books_04_overrides_03.py
@@ -0,0 +1,76 @@
+"""
+Scrapy spider which uses Page Objects both for crawling and extraction,
+and uses overrides to support two different sites without changing
+the crawling logic (the spider is exactly the same)
+
+No configured default logic: if used for an unregistered domain, no logic
+at all is applied.
+
+This example is quite similar to books_04_overrides_02.py where the only
+difference is that this example is using the ``@handle_urls`` decorator to
+store the rules in web-poet's registry.
+"""
+import scrapy
+from web_poet import ItemWebPage, WebPage, handle_urls, default_registry
+from web_poet.overrides import OverrideRule
+from url_matcher import Patterns
+
+from scrapy_poet import callback_for
+
+
+class BookListPage(WebPage):
+    """Base listing page object; by default it reports no book URLs."""
+    def book_urls(self):
+        return []
+
+
+class BookPage(ItemWebPage):
+    """Base book page object; by default ``to_item`` returns ``None`` (no item)."""
+    def to_item(self):
+        return None
+
+
+@handle_urls("toscrape.com", overrides=BookListPage)
+class BTSBookListPage(BookListPage):
+    """Overrides BookListPage for toscrape.com listings like https://books.toscrape.com"""
+    def book_urls(self):
+        return self.css('.image_container a::attr(href)').getall()
+
+
+@handle_urls("toscrape.com", overrides=BookPage)
+class BTSBookPage(BookPage):
+    """Overrides BookPage for toscrape.com pages like https://books.toscrape.com/catalogue/soumission_998/index.html"""
+    def to_item(self):
+        return {
+            'url': self.url,
+            'name': self.css("title::text").get(),  # text of the page's <title>
+        }
+
+
+@handle_urls("bookpage.com", overrides=BookListPage)
+class BPBookListPage(BookListPage):
+    """Overrides BookListPage for bookpage.com listings like https://bookpage.com/reviews"""
+    def book_urls(self):
+        return self.css('article.post h4 a::attr(href)').getall()
+
+
+@handle_urls("bookpage.com", overrides=BookPage)
+class BPBookPage(BookPage):
+    """Overrides BookPage for bookpage.com pages like https://bookpage.com/reviews/25879-laird-hunt-zorrie-fiction"""
+    def to_item(self):
+        return {
+            'url': self.url,
+            'name': self.css("body div > h1::text").get(default="").strip(),  # "" if no <h1> matches
+        }
+
+
+class BooksSpider(scrapy.Spider):
+    name = 'books_04_overrides_03'
+    start_urls = ['http://books.toscrape.com/', 'https://bookpage.com/reviews']
+    # Use the override rules registered above with @handle_urls for each domain
+    custom_settings = {
+        "SCRAPY_POET_OVERRIDES": default_registry.get_overrides()
+    }
+
+    def parse(self, response, page: BookListPage):
+        yield from response.follow_all(page.book_urls(), callback_for(BookPage))