scrapinghub · ivanprado · Nov 29, 2021 · Nov 29, 2021 · Nov 30, 2021 · Nov 30, 2021
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -5,10 +5,11 @@ Changelog
 TBR
 ------------------
 
+* ``handle_urls`` decorator and ``find_page_object_overrides`` function added.
+* new CLI tool for displaying all available Page Objects: ``web_poet <path>``
 * removed support for Python 3.6
 * added support for Python 3.10
 
-
 0.1.1 (2021-06-02)
 ------------------
 

diff --git a/docs/api_reference.rst b/docs/api_reference.rst
@@ -1,3 +1,5 @@
+.. _`api-reference`:
+
 =============
 API Reference
 =============
@@ -45,3 +47,15 @@ Mixins
 .. autoclass:: web_poet.mixins.ResponseShortcutsMixin
    :members:
    :no-special-members:
+
+
+.. _`api-overrides`:
+
+Overrides
+=========
+
+.. autofunction:: web_poet.handle_urls
+
+.. automodule:: web_poet.overrides
+   :members:
+   :exclude-members: handle_urls
diff --git a/docs/conf.py b/docs/conf.py
@@ -192,4 +192,5 @@
 intersphinx_mapping = {
     'python': ('https://docs.python.org/3', None, ),
     'scrapy': ('https://docs.scrapy.org/en/latest', None, ),
+    'url-matcher': ('https://url-matcher.readthedocs.io/en/stable/', None, ),
 }
diff --git a/docs/index.rst b/docs/index.rst
@@ -33,6 +33,7 @@ and the motivation behind ``web-poet``, start with :ref:`from-ground-up`.
 
    intro/tutorial
    intro/from-ground-up
+   intro/overrides
 
 .. toctree::
    :caption: Reference

diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst
@@ -0,0 +1,122 @@
+.. _`intro-overrides`:
+
+Overrides
+=========
+
+Overrides contain mapping rules that associate URLs with the Page Object
+that should be used for them. The URL matching rules are handled by another
+library called `url-matcher <https://url-matcher.readthedocs.io>`_.
+
+Using such matching rules establishes the core concept of Overrides wherein
+it's able to use specific Page Objects in lieu of the original one.
+
+This enables ``web-poet`` to be used effectively by other frameworks like 
+`scrapy-poet <https://scrapy-poet.readthedocs.io>`_.
+
+Example Use Case
+----------------
+
+Let's explore an example use case for the Overrides concept.
+
+Suppose we're using Page Objects for our broadcrawl project which explores
+eCommerce websites to discover product pages. It wouldn't be entirely possible
+for us to create parsers for all websites since we don't know which sites we're
+going to crawl beforehand.
+
+However, we could at least create a generic Page Object to support parsing of
+some fields in well-known locations of product information like ``<title>``.
+This enables our broadcrawler to at least parse some useful information. Let's
+call this Page Object ``GenericProductPage``.
+
+Assuming that one of our project requirements is to fully support parsing of the
+`top 3 eCommerce websites`, then we'd need to create a Page Object for each one
+to parse more specific fields.
+
+Here's where the Overrides concept comes in:
+
+    1. The ``GenericProductPage`` is used to parse all eCommerce product pages
+       `by default`.
+    2. Whenever one of our declared URL rules matches with a given page URL,
+       then the Page Object associated with that rule `overrides (or replaces)`
+       the default ``GenericProductPage``.
+
+This enables us to fine-tune our parsing logic `(which is abstracted away for
+each Page Object)` depending on the page we're parsing.
+
+Let's see this in action by creating Page Objects below.
+
+
+Creating Overrides
+------------------
+
+Let's take a look at how the following code is structured:
+
+.. code-block:: python
+
+    from web_poet import handle_urls
+    from web_poet.pages import ItemWebPage
+
+    class GenericProductPage(ItemWebPage):
+        def to_item(self):
+            return {"product title": self.css("title::text").get()}
+
+    @handle_urls("example.com", overrides=GenericProductPage)
+    class ExampleProductPage(ItemWebPage):
+        def to_item(self):
+            ...  # more specific parsing
+
+    @handle_urls("anotherexample.com", overrides=GenericProductPage, exclude="/digital-goods/")
+    class AnotherExampleProductPage(ItemWebPage):
+        def to_item(self):
+            ...  # more specific parsing
+
+    @handle_urls(["dualexample.com", "dualexample.net"], overrides=GenericProductPage)
+    class DualExampleProductPage(ItemWebPage):
+        def to_item(self):
+            ...  # more specific parsing
+
+The code above declares that:
+
+    - For sites that match the ``example.com`` pattern, ``ExampleProductPage``
+      would be used instead of ``GenericProductPage``.
+    - The same is true for ``DualExampleProductPage`` where it is used
+      instead of ``GenericProductPage`` for two websites: ``dualexample.com`` and
+      ``dualexample.net``.
+    - However, ``AnotherExampleProductPage`` is only used instead of ``GenericProductPage``
+      when we're parsing pages from ``anotherexample.com`` that don't contain
+      ``/digital-goods/`` in their URL path.
+
+The override mechanism that ``web-poet`` offers can be customized further.
+You can read about the specific parameters and alternative ways
+to organize the rules in the :ref:`Overrides API section <api-overrides>`.
+
+
+Viewing all available Overrides
+-------------------------------
+
+A convenience function is available to discover and retrieve all rules from your
+project. Make sure to check out the :ref:`Overrides API section <api-overrides>`
+to see the other functionalities of ``find_page_object_overrides``.
+
+.. code-block:: python
+
+    from web_poet import find_page_object_overrides
+
+    rules = find_page_object_overrides("my_project.page_objects")
+
+    print(len(rules))  # 3
+
+    print(rules[0])  # OverrideRule(for_patterns=Patterns(include=['example.com'], exclude=[], priority=500), use=<class 'my_project.page_objects.ExampleProductPage'>, instead_of=<class 'my_project.page_objects.GenericProductPage'>, meta={})
+
+
+A handy CLI tool is also available at your disposal to quickly see the available
+Override rules in a given module in your project. For example, invoking something
+like ``web_poet my_project.page_objects`` would produce the following:
+
+.. code-block::
+
+    Use this                                              instead of                                  for the URL patterns                    except for the patterns      with priority  meta
+    ----------------------------------------------------  ------------------------------------------  --------------------------------------  -------------------------  ---------------  ------
+    my_project.page_objects.ExampleProductPage            my_project.page_objects.GenericProductPage  ['example.com']                         []                                     500  {}
+    my_project.page_objects.AnotherExampleProductPage     my_project.page_objects.GenericProductPage  ['anotherexample.com']                  ['/digital-goods/']                    500  {}
+    my_project.page_objects.DualExampleProductPage        my_project.page_objects.GenericProductPage  ['dualexample.com', 'dualexample.net']  []                                     500  {}
diff --git a/docs/intro/tutorial.rst b/docs/intro/tutorial.rst
@@ -131,4 +131,4 @@ As you can see, it's possible to use web-poet with built-in libraries such as
 `scrapy-poet <https://scrapy-poet.readthedocs.io>`_.
 
 If you want to understand the idea behind web-poet better,
-check the :ref:`from-ground-up` tutorial.
+check the :ref:`from-ground-up` tutorial.
diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -5,4 +5,4 @@ sphinxcontrib-devhelp==1.0.2
 sphinxcontrib-htmlhelp==2.0.0
 sphinxcontrib-jsmath==1.0.1
 sphinxcontrib-qthelp==1.0.3
-sphinxcontrib-serializinghtml==1.1.5
+sphinxcontrib-serializinghtml==1.1.5
diff --git a/setup.py b/setup.py
@@ -14,6 +14,7 @@
     author='Scrapinghub',
     author_email='[email protected]',
     url='https://github.com/scrapinghub/web-poet',
+    entry_points={'console_scripts': ['web_poet = web_poet.__main__:main']},
     packages=find_packages(
         exclude=(
             'tests',
@@ -22,6 +23,8 @@
     install_requires=(
         'attrs',
         'parsel',
+        'url-matcher',
+        'tabulate',
     ),
     classifiers=(
         'Development Status :: 2 - Pre-Alpha',

diff --git a/tests/po_lib/__init__.py b/tests/po_lib/__init__.py
@@ -0,0 +1,43 @@
+"""
+This package is just for overrides testing purposes.
+"""
+from typing import Dict, Any, Callable
+
+from url_matcher import Patterns
+
+from web_poet import handle_urls, PageObjectRegistry
+
+
+class POBase:
+    expected_overrides: Callable
+    expected_patterns: Patterns
+    expected_meta: Dict[str, Any]
+
+
+class POTopLevelOverriden1:
+    ...
+
+
+class POTopLevelOverriden2:
+    ...
+
+
+secondary_registry = PageObjectRegistry(name="secondary")
+
+
+# This first annotation is ignored. A single annotation per registry is allowed
+@handle_urls("example.com", POTopLevelOverriden1)
+@handle_urls("example.com", POTopLevelOverriden1, exclude="/*.jpg|", priority=300)
+class POTopLevel1(POBase):
+    expected_overrides = POTopLevelOverriden1
+    expected_patterns = Patterns(["example.com"], ["/*.jpg|"], priority=300)
+    expected_meta = {}  # type: ignore
+
+
+# The second annotation is for a different registry
+@handle_urls("example.com", POTopLevelOverriden2)
+@secondary_registry.handle_urls("example.org", POTopLevelOverriden2)
+class POTopLevel2(POBase):
+    expected_overrides = POTopLevelOverriden2
+    expected_patterns = Patterns(["example.com"])
+    expected_meta = {}  # type: ignore
diff --git a/tests/po_lib/a_module.py b/tests/po_lib/a_module.py
@@ -0,0 +1,16 @@
+from url_matcher import Patterns
+
+from tests.po_lib import POBase
+from web_poet import handle_urls
+
+
+class POModuleOverriden:
+    ...
+
+
+@handle_urls("example.com", overrides=POModuleOverriden, extra_arg="foo")
+class POModule(POBase):
+    expected_overrides = POModuleOverriden
+    expected_patterns = Patterns(["example.com"])
+    expected_meta = {"extra_arg": "foo"}  # type: ignore
+
diff --git a/tests/po_lib/an_empty_module.py b/tests/po_lib/an_empty_module.py
diff --git a/tests/po_lib/an_empty_package/__init__.py b/tests/po_lib/an_empty_package/__init__.py
diff --git a/tests/po_lib/nested_package/__init__.py b/tests/po_lib/nested_package/__init__.py
@@ -0,0 +1,15 @@
+from url_matcher import Patterns
+
+from tests.po_lib import POBase
+from web_poet import handle_urls
+
+
+class PONestedPkgOverriden:
+    ...
+
+
+@handle_urls(include=["example.com", "example.org"], exclude=["/*.jpg|"], overrides=PONestedPkgOverriden)
+class PONestedPkg(POBase):
+    expected_overrides = PONestedPkgOverriden
+    expected_patterns = Patterns(["example.com", "example.org"], ["/*.jpg|"])
+    expected_meta = {}  # type: ignore
diff --git a/tests/po_lib/nested_package/a_nested_module.py b/tests/po_lib/nested_package/a_nested_module.py
@@ -0,0 +1,21 @@
+from url_matcher import Patterns
+
+from tests.po_lib import POBase, secondary_registry
+from web_poet import handle_urls
+
+
+class PONestedModuleOverriden:
+    ...
+
+
+class PONestedModuleOverridenSecondary:
+    ...
+
+
+@handle_urls(include=["example.com", "example.org"], exclude=["/*.jpg|"], overrides=PONestedModuleOverriden)
+@secondary_registry.handle_urls("example.com", PONestedModuleOverridenSecondary)
+class PONestedModule(POBase):
+    expected_overrides = PONestedModuleOverriden
+    expected_patterns = Patterns(include=["example.com", "example.org"], exclude=["/*.jpg|"])
+    expected_meta = {}  # type: ignore
+
diff --git a/tests/test_overrides.py b/tests/test_overrides.py
@@ -0,0 +1,78 @@
+import pytest
+from url_matcher import Patterns
+
+from tests.po_lib import POTopLevel1, POTopLevel2, POTopLevelOverriden2
+from tests.po_lib.a_module import POModule
+from tests.po_lib.nested_package import PONestedPkg
+from tests.po_lib.nested_package.a_nested_module import (
+    PONestedModule,
+    PONestedModuleOverridenSecondary,
+)
+from web_poet.overrides import find_page_object_overrides, PageObjectRegistry
+
+
+POS = {POTopLevel1, POTopLevel2, POModule, PONestedPkg, PONestedModule}
+
+
+def test_list_page_objects_from_pkg():
+    """Tests that metadata is extracted properly from the po_lib package"""
+    rules = find_page_object_overrides("tests.po_lib")
+    assert {po.use for po in rules} == POS
+
+    for rule in rules:
+        assert rule.instead_of == rule.use.expected_overrides, rule.use
+        assert rule.for_patterns == rule.use.expected_patterns, rule.use
+        assert rule.meta == rule.use.expected_meta, rule.use
+
+
+def test_list_page_objects_from_module():
+    rules = find_page_object_overrides("tests.po_lib.a_module")
+    assert len(rules) == 1
+    rule = rules[0]
+    assert rule.use == POModule
+    assert rule.for_patterns == POModule.expected_patterns
+    assert rule.instead_of == POModule.expected_overrides
+
+
+def test_list_page_objects_from_empty_module():
+    rules = find_page_object_overrides("tests.po_lib.an_empty_module")
+    assert len(rules) == 0
+
+
+def test_list_page_objects_from_empty_pkg():
+    rules = find_page_object_overrides("tests.po_lib.an_empty_package")
+    assert len(rules) == 0
+
+
+def test_list_page_objects_from_unknown_module():
+    with pytest.raises(ImportError):
+        find_page_object_overrides("tests.po_lib.unknown_module")
+
+
+def test_list_page_objects_from_imported_registry():
+    rules = find_page_object_overrides("tests.po_lib", registry_name="secondary")
+    assert len(rules) == 2
+    rule_for = {po.use: po for po in rules}
+
+    potop2 = rule_for[POTopLevel2]
+    assert potop2.for_patterns == Patterns(["example.org"])
+    assert potop2.instead_of == POTopLevelOverriden2
+
+    pones = rule_for[PONestedModule]
+    assert pones.for_patterns == Patterns(["example.com"])
+    assert pones.instead_of == PONestedModuleOverridenSecondary
+
+
+def test_list_page_objects_from_non_existing_registry():
+    assert find_page_object_overrides("tests.po_lib", registry_name="not-exist") == []
+
+
+def test_cmd():
+    from web_poet.__main__ import main
+
+    assert main(["tests.po_lib"]) is None
+
+
+def test_registry_repr():
+    registry = PageObjectRegistry(name="test")
+    assert "name='test'" in str(registry)
diff --git a/tox.ini b/tox.ini
@@ -15,6 +15,7 @@ commands =
 [testenv:mypy]
 deps =
     mypy
+    types-tabulate
 
 commands = mypy --ignore-missing-imports web_poet tests
 

diff --git a/web_poet/__init__.py b/web_poet/__init__.py
@@ -1,2 +1,3 @@
 from .pages import WebPage, ItemPage, ItemWebPage, Injectable
-from .page_inputs import ResponseData
+from .page_inputs import ResponseData
+from .overrides import handle_urls, find_page_object_overrides, PageObjectRegistry
-Original file line number
+Diff line change
@@ Expand Up / @@ -15,6 +15,7 @@ commands = @@
     [testenv:mypy]
     deps =
         mypy
+        types-tabulate
     commands = mypy --ignore-missing-imports web_poet tests
@@ Expand Down @@