From b4ac78962ff3471f24e35e0457a337c3140e01c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Iv=C3=A1n=20de=20Prado?= Date: Wed, 8 Dec 2021 11:51:05 +0100 Subject: [PATCH 01/19] url-matcher integration with scrapy-poet --- CHANGELOG.rst | 2 + docs/conf.py | 3 +- docs/intro/tutorial.rst | 44 ++++++---- docs/overrides.rst | 85 +++++++++++++++++-- .../example/spiders/books_04_overrides_01.py | 14 ++- .../example/spiders/books_04_overrides_02.py | 20 ++--- scrapy_poet/injection.py | 8 +- scrapy_poet/middleware.py | 4 +- scrapy_poet/overrides.py | 74 +++++++++++++++- scrapy_poet/utils.py | 15 ---- setup.py | 3 +- tests/conftest.py | 2 - tests/test_injection.py | 19 +++-- tests/test_middleware.py | 12 ++- tests/utils.py | 8 +- tox.ini | 2 +- 16 files changed, 224 insertions(+), 91 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index e48b029a..0943b9e9 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,6 +6,8 @@ TBR: ------------------ * Cache mechanism using SCRAPY_POET_CACHE setting +* New and richer SCRAPY_POET_OVERRIDES registry that uses the + url-matcher patterns to configure the overrides 0.2.1 (2021-06-11) ------------------ diff --git a/docs/conf.py b/docs/conf.py index 027e717f..e13717de 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -188,7 +188,8 @@ intersphinx_mapping = { 'python': ('https://docs.python.org/3', None, ), 'scrapy': ('https://docs.scrapy.org/en/latest', None, ), - 'web_poet': ('https://web-poet.readthedocs.io/en/stable/', None), + 'web-poet': ('https://web-poet.readthedocs.io/en/stable/', None), + 'url-matcher': ('https://url-matcher.readthedocs.io/en/stable/', None), } autodoc_default_options = { diff --git a/docs/intro/tutorial.rst b/docs/intro/tutorial.rst index 8911cfaa..b04a84d0 100644 --- a/docs/intro/tutorial.rst +++ b/docs/intro/tutorial.rst @@ -348,12 +348,10 @@ be done by configuring ``SCRAPY_POET_OVERRIDES`` into ``settings.py``: .. 
code-block:: python - SCRAPY_POET_OVERRIDES = { - "toscrape.com": { - BookListPage: BTSBookListPage, - BookPage: BTSBookPage - } - } + "SCRAPY_POET_OVERRIDES": [ + ("toscrape.com", BTSBookListPage, BookListPage), + ("toscrape.com", BTSBookPage, BookPage) + ] The spider is back to life! ``SCRAPY_POET_OVERRIDES`` contain rules that overrides the Page Objects @@ -381,7 +379,7 @@ to implement new ones: class BPBookListPage(WebPage): def book_urls(self): - return self.css('.article-info a::attr(href)').getall() + return self.css('article.post h4 a::attr(href)').getall() class BPBookPage(ItemWebPage): @@ -389,7 +387,7 @@ to implement new ones: def to_item(self): return { 'url': self.url, - 'name': self.css(".book-data h4::text").get().strip(), + 'name': self.css("body div > h1::text").get().strip(), } The last step is configuring the overrides so that these new Page Objects @@ -399,16 +397,12 @@ are used for the domain .. code-block:: python - SCRAPY_POET_OVERRIDES = { - "toscrape.com": { - BookListPage: BTSBookListPage, - BookPage: BTSBookPage - }, - "bookpage.com": { - BookListPage: BPBookListPage, - BookPage: BPBookPage - } - } + "SCRAPY_POET_OVERRIDES": [ + ("toscrape.com", BTSBookListPage, BookListPage), + ("toscrape.com", BTSBookPage, BookPage), + ("bookpage.com", BPBookListPage, BookListPage), + ("bookpage.com", BPBookPage, BookPage) + ] The spider is now ready to extract books from both sites ๐Ÿ˜€. The full example @@ -418,6 +412,20 @@ On a surface, it looks just like a different way to organize Scrapy spider code - and indeed, it *is* just a different way to organize the code, but it opens some cool possibilities. +.. note:: + + In the examples above we have been configuring the overrides + for a particular domain, but more complex URL patterns are also possible. + For example, the pattern ``books.toscrape.com/cataloge/category/`` + is accepted and it would restrict the override only to category pages. 
+ + It is even possible to configure more complex patterns by + using the ``OverrideRule`` class instead of a triplet in + the configuration. + + Also see the `url-matcher `_ + documentation for more information about the patterns syntax. + Next steps ========== diff --git a/docs/overrides.rst b/docs/overrides.rst index 5d115757..8a07bbbe 100644 --- a/docs/overrides.rst +++ b/docs/overrides.rst @@ -47,11 +47,9 @@ And then override it for a particular domain using ``settings.py``: .. code-block:: python - SCRAPY_POET_OVERRIDES = { - "example.com": { - BookPage: ISBNBookPage - } - } + SCRAPY_POET_OVERRIDES = [ + ("example.com", ISBNBookPage, BookPage) + ] This new Page Objects gets the original ``BookPage`` as dependency and enrich the obtained item with the ISBN from the page HTML. @@ -79,13 +77,82 @@ the obtained item with the ISBN from the page HTML. return item +Overrides rules +=============== + +The default way of configuring the override rules is using triplets +of the form (``url pattern``, ``override_type``, ``overridden_type``). But +more complex rules can be introduced if the class ``OverrideRule`` +is used. The following example configures an override that +is only applied for book pages from ``books.toscrape.com``: + +.. code-block:: python + + + SCRAPY_POET_OVERRIDES = [ + OverrideRule( + for_patterns=Patterns( + include=["books.toscrape.com/cataloge/*index.html|"], + exclude=["/catalogue/category/"]), + use=MyBookPage, + instead_of=BookPage + ) + ] + +Note how category pages are excludes by using a ``exclude`` pattern. +You can find more information about the patterns syntax in the +`url-matcher `_ +documentation. + + +Decorate Page Objects with the rules +==================================== + +Having the rules along with the Page Objects is a good idea, +as you can identify with a single sight what the Page Object is doing +along with where it is applied. 
This can be done by decorating the +Page Objects with ``handle_urls`` and then +configure the overrides automatically with the help of the function +``find_page_object_overrides``. + +Let's see an example: + +.. code-block:: python + + @handle_urls("toscrape.com", BookPage) + class BTSBookPage(BookPage): + + def to_item(self): + return { + 'url': self.url, + 'name': self.css("title::text").get(), + } + +The ``handle_urls`` decorator in this case is indicating that +the class ``BSTBookPage`` should be used instead of ``BookPage`` +for the domain ``toscrape.com``. + +In order to configure the scrapy-poet overrides automatically +using these annotations, +you can use the function ``find_page_object_overrides``. +For example: + +.. code-block:: python + + SCRAPY_POET_OVERRIDES = find_page_object_overrides("my_page_objects_module") + +The function will collect all the ``handle_urls`` annotations from the +``my_page_objects_module`` and submodules, and will convert them +to rules ready to be used with ``SCRAPY_POET_OVERRIDES``. + Overrides registry ================== -The overrides registry is responsible for informing whether there exists an -override for a particular type for a given response. The default overrides -registry keeps a map of overrides for each domain and read this configuration -from settings ``SCRAPY_POET_OVERRIDES`` as has been seen in the :ref:`intro-tutorial` +The overrides registry is responsible of informing whether there exists an +override for a particular type for a given request. The default overrides +registry allows to configure these rules using patterns that follows the +`url-matcher `_ syntax. These rules can be configured using the +``SCRAPY_POET_OVERRIDES`` setting, as it has been seen in the :ref:`intro-tutorial` example. But the registry implementation can be changed at convenience. 
A different diff --git a/example/example/spiders/books_04_overrides_01.py b/example/example/spiders/books_04_overrides_01.py index 266f019d..ab266c08 100644 --- a/example/example/spiders/books_04_overrides_01.py +++ b/example/example/spiders/books_04_overrides_01.py @@ -28,7 +28,7 @@ def to_item(self): class BPBookListPage(WebPage): """Logic to extract listings from pages like https://bookpage.com/reviews""" def book_urls(self): - return self.css('.article-info a::attr(href)').getall() + return self.css('article.post h4 a::attr(href)').getall() class BPBookPage(ItemWebPage): @@ -36,7 +36,7 @@ class BPBookPage(ItemWebPage): def to_item(self): return { 'url': self.url, - 'name': self.css(".book-data h4::text").get().strip(), + 'name': self.css("body div > h1::text").get().strip(), } @@ -45,12 +45,10 @@ class BooksSpider(scrapy.Spider): start_urls = ['http://books.toscrape.com/', 'https://bookpage.com/reviews'] # Configuring different page objects pages from the bookpage.com domain custom_settings = { - "SCRAPY_POET_OVERRIDES": { - "bookpage.com": { - BookListPage: BPBookListPage, - BookPage: BPBookPage - } - } + "SCRAPY_POET_OVERRIDES": [ + ("bookpage.com", BPBookListPage, BookListPage), + ("bookpage.com", BPBookPage, BookPage) + ] } def parse(self, response, page: BookListPage): diff --git a/example/example/spiders/books_04_overrides_02.py b/example/example/spiders/books_04_overrides_02.py index 9e6e8c2a..177656e5 100644 --- a/example/example/spiders/books_04_overrides_02.py +++ b/example/example/spiders/books_04_overrides_02.py @@ -41,7 +41,7 @@ def to_item(self): class BPBookListPage(BookListPage): """Logic to extract listings from pages like https://bookpage.com/reviews""" def book_urls(self): - return self.css('.article-info a::attr(href)').getall() + return self.css('article.post h4 a::attr(href)').getall() class BPBookPage(BookPage): @@ -49,7 +49,7 @@ class BPBookPage(BookPage): def to_item(self): return { 'url': self.url, - 'name': self.css(".book-data 
h4::text").get().strip(), + 'name': self.css("body div > h1::text").get().strip(), } @@ -58,16 +58,12 @@ class BooksSpider(scrapy.Spider): start_urls = ['http://books.toscrape.com/', 'https://bookpage.com/reviews'] # Configuring different page objects pages for different domains custom_settings = { - "SCRAPY_POET_OVERRIDES": { - "toscrape.com": { - BookListPage: BTSBookListPage, - BookPage: BTSBookPage - }, - "bookpage.com": { - BookListPage: BPBookListPage, - BookPage: BPBookPage - }, - } + "SCRAPY_POET_OVERRIDES": [ + ("toscrape.com", BTSBookListPage, BookListPage), + ("toscrape.com", BTSBookPage, BookPage), + ("bookpage.com", BPBookListPage, BookListPage), + ("bookpage.com", BPBookPage, BookPage) + ] } def parse(self, response, page: BookListPage): diff --git a/scrapy_poet/injection.py b/scrapy_poet/injection.py index dd18cbd4..adfc4292 100644 --- a/scrapy_poet/injection.py +++ b/scrapy_poet/injection.py @@ -15,14 +15,14 @@ from scrapy.statscollectors import StatsCollector from scrapy.utils.conf import build_component_list from scrapy.utils.defer import maybeDeferred_coro -from scrapy.utils.misc import load_object +from scrapy.utils.misc import load_object, create_instance from scrapy_poet.cache import SqlitedictCache from scrapy_poet.injection_errors import (UndeclaredProvidedTypeError, NonCallableProviderError, InjectionError) from scrapy_poet.overrides import OverridesRegistryBase, \ - PerDomainOverridesRegistry + OverridesRegistry from scrapy_poet.page_input_providers import PageObjectInputProvider from scrapy_poet.api import _CALLBACK_FOR_MARKER, DummyResponse from web_poet.pages import is_injectable @@ -43,7 +43,7 @@ def __init__(self, overrides_registry: Optional[OverridesRegistryBase] = None): self.crawler = crawler self.spider = crawler.spider - self.overrides_registry = overrides_registry or PerDomainOverridesRegistry() + self.overrides_registry = overrides_registry or OverridesRegistry() self.load_providers(default_providers) self.init_cache() @@ 
-348,6 +348,8 @@ class MySpider(Spider): spider = MySpider() spider.settings = settings crawler.spider = spider + if not overrides_registry: + overrides_registry = create_instance(OverridesRegistry, settings, crawler) return Injector(crawler, overrides_registry=overrides_registry) diff --git a/scrapy_poet/middleware.py b/scrapy_poet/middleware.py index c2584c62..7b96b735 100644 --- a/scrapy_poet/middleware.py +++ b/scrapy_poet/middleware.py @@ -11,7 +11,7 @@ from scrapy.utils.misc import create_instance, load_object from . import api -from .overrides import PerDomainOverridesRegistry +from .overrides import OverridesRegistry from .page_input_providers import ResponseDataProvider from .injection import Injector @@ -35,7 +35,7 @@ def __init__(self, crawler: Crawler): self.crawler = crawler settings = self.crawler.settings registry_cls = load_object(settings.get("SCRAPY_POET_OVERRIDES_REGISTRY", - PerDomainOverridesRegistry)) + OverridesRegistry)) self.overrides_registry = create_instance(registry_cls, settings, crawler) self.injector = Injector(crawler, default_providers=DEFAULT_PROVIDERS, diff --git a/scrapy_poet/overrides.py b/scrapy_poet/overrides.py index dc3b9cf8..55347ac2 100644 --- a/scrapy_poet/overrides.py +++ b/scrapy_poet/overrides.py @@ -1,9 +1,14 @@ +from collections import defaultdict + from abc import ABC, abstractmethod -from typing import Dict, Mapping, Callable +from typing import Dict, Mapping, Callable, Iterable, Union, Tuple, Optional, List from scrapy import Request from scrapy.crawler import Crawler -from scrapy_poet.utils import get_domain +from url_matcher import Patterns, URLMatcher + +from url_matcher.util import get_domain +from web_poet.overrides import OverrideRule class OverridesRegistryBase(ABC): @@ -42,3 +47,68 @@ def overrides_for(self, request: Request) -> Mapping[Callable, Callable]: return self.get(get_domain(request.url), {}) +RuleAsTuple = Union[Tuple[str, Callable, Callable], List] + +class 
OverridesRegistry(OverridesRegistryBase): + """ + Overrides registry that reads the overrides + from the option ``SCRAPY_POET_OVERRIDES`` in the spider settings. It + is a list and each rule can be a tuple or an instance of the class ``OverrideRule``. + + If a tuple is provided, the first element is the pattern to match the URL, + the second element is the type to be used instead of the type in + the third element. Another way to see it: + for the URLs that match the pattern ``tuple[0]`` use ``tuple[1]`` instead of ``tuple[2]``. + + Example of overrides configuration: + + .. code-block:: python + + + SCRAPY_POET_OVERRIDES = [ + ("books.toscrape.com", ISBNBookPage, BookPage), + OverrideRule(for_patterns=Patterns(["books.toscrape.com"]), + use=MyBookListPage, + instead_of=BookListPage, + ), + ] + + It can be handy to compile the list of rules automatically + from a module using the method ``find_page_object_overrides``. For example: + + .. code-block:: python + + SCRAPY_POET_OVERRIDES = find_page_object_overrides("my_page_objects_module") + + It finds all the rules annotated using the decorator ``handle_urls`` inside the module ``my_page_objects_module`` and + its submodules. + """ + + @classmethod + def from_crawler(cls, crawler: Crawler): + return cls(crawler.settings.getlist("SCRAPY_POET_OVERRIDES", [])) + + def __init__(self, rules: Optional[Iterable[Union[RuleAsTuple, OverrideRule]]] = None): + self.rules: List[OverrideRule] = [] + self.matcher: Dict[Callable, URLMatcher] = defaultdict(URLMatcher) + for rule in rules or []: + self.add_rule(rule) + + def add_rule(self, rule: Union[RuleAsTuple, OverrideRule]): + if isinstance(rule, (tuple, list)): + if len(rule) != 3: + raise ValueError(f"Invalid overrides rule: {rule}. 
Rules as tuples must have three elements: " + f"the pattern, the type to override and the new type to use instead.") + pattern, use, instead_of = rule + rule = OverrideRule(for_patterns=Patterns([pattern]), use=use, instead_of=instead_of) + self.rules.append(rule) + print(rule) + self.matcher[rule.instead_of].add_or_update(len(self.rules) - 1, rule.for_patterns) + + def overrides_for(self, request: Request) -> Mapping[Callable, Callable]: + overrides = {} + for instead_of, matcher in self.matcher.items(): + rule_id = matcher.match(request.url) + if rule_id is not None: + overrides[instead_of] = self.rules[rule_id].use + return overrides diff --git a/scrapy_poet/utils.py b/scrapy_poet/utils.py index 965b5f9c..94a4bde0 100644 --- a/scrapy_poet/utils.py +++ b/scrapy_poet/utils.py @@ -1,21 +1,6 @@ import os from scrapy.utils.project import project_data_dir, inside_project -from tldextract import tldextract - - -def get_domain(url): - """ - Return the domain without any subdomain - - >>> get_domain("http://blog.example.com") - 'example.com' - >>> get_domain("http://www.example.com") - 'example.com' - >>> get_domain("http://deeper.blog.example.co.uk") - 'example.co.uk' - """ - return ".".join(tldextract.extract(url)[-2:]) def get_scrapy_data_path(createdir=True): diff --git a/setup.py b/setup.py index d8b34efd..6404cdc0 100755 --- a/setup.py +++ b/setup.py @@ -14,7 +14,8 @@ 'andi >= 0.4.1', 'attrs', 'parsel', - 'web-poet', + 'web-poet @ git+https://git@github.com/scrapinghub/web-poet@handle_urls#egg=web-poet', + 'url-matcher', 'tldextract', 'sqlitedict', ], diff --git a/tests/conftest.py b/tests/conftest.py index 6082152f..209ac514 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,8 +1,6 @@ import pytest from scrapy.settings import Settings -from scrapy_poet.page_input_providers import ResponseDataProvider - @pytest.fixture() def settings(request): diff --git a/tests/test_injection.py b/tests/test_injection.py index cabe5bf4..95a38d6d 100644 --- 
a/tests/test_injection.py +++ b/tests/test_injection.py @@ -7,7 +7,9 @@ from scrapy import Request from scrapy.http import Response -from scrapy_poet.utils import get_domain +from url_matcher import Patterns + +from url_matcher.util import get_domain from scrapy_poet import ResponseDataProvider, PageObjectInputProvider, \ DummyResponse @@ -15,9 +17,10 @@ get_injector_for_testing, get_response_for_testing from scrapy_poet.injection_errors import NonCallableProviderError, \ InjectionError, UndeclaredProvidedTypeError -from scrapy_poet.overrides import PerDomainOverridesRegistry +from scrapy_poet.overrides import OverridesRegistry from web_poet import Injectable, ItemPage from web_poet.mixins import ResponseShortcutsMixin +from web_poet.overrides import OverrideRule def get_provider(classes, content=None): @@ -301,13 +304,11 @@ def test_overrides(self, providers, override_should_happen): domain = "example.com" if override_should_happen else "other-example.com" # The request domain is example.com, so overrides shouldn't be applied # when we configure them for domain other-example.com - overrides = { - domain: { - PricePO: PriceInDollarsPO, - EurDollarRate: OtherEurDollarRate - } - } - registry = PerDomainOverridesRegistry(overrides) + overrides = [ + (domain, PriceInDollarsPO, PricePO), + OverrideRule(Patterns([domain]), use=OtherEurDollarRate, instead_of=EurDollarRate) + ] + registry = OverridesRegistry(overrides) injector = get_injector_for_testing(providers, overrides_registry=registry) diff --git a/tests/test_middleware.py b/tests/test_middleware.py index a7a4ab46..bdd2e8e9 100644 --- a/tests/test_middleware.py +++ b/tests/test_middleware.py @@ -13,7 +13,9 @@ import attr from scrapy_poet import callback_for -from scrapy_poet.utils import get_domain +from url_matcher.util import get_domain + +from tests.mockserver import get_ephemeral_port from web_poet.pages import WebPage, ItemPage, ItemWebPage from scrapy_poet.page_input_providers import ( PageObjectInputProvider 
@@ -103,10 +105,12 @@ def test_basic_case(settings): def test_overrides(settings): host = socket.gethostbyname(socket.gethostname()) domain = get_domain(host) - settings["SCRAPY_POET_OVERRIDES"] = { - domain: {BreadcrumbsExtraction: OverridenBreadcrumbsExtraction}} + port = get_ephemeral_port() + settings["SCRAPY_POET_OVERRIDES"] = [ + (f"{domain}:{port}", OverridenBreadcrumbsExtraction, BreadcrumbsExtraction) + ] item, url, _ = yield crawl_single_item(spider_for(ProductPage), - ProductHtml, settings) + ProductHtml, settings, port=port) assert item == { 'url': url, 'name': 'Chocolate', diff --git a/tests/utils.py b/tests/utils.py index 7dd46b7d..55b26f5b 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -24,26 +24,26 @@ def render_GET(self, request): @inlineCallbacks -def crawl_items(spider_cls, resource_cls, settings, spider_kwargs=None): +def crawl_items(spider_cls, resource_cls, settings, spider_kwargs=None, port=None): """Use spider_cls to crawl resource_cls. URL of the resource is passed to the spider as ``url`` argument. Return ``(items, resource_url, crawler)`` tuple. """ spider_kwargs = {} if spider_kwargs is None else spider_kwargs crawler = make_crawler(spider_cls, settings) - with MockServer(resource_cls) as s: + with MockServer(resource_cls, port=port) as s: root_url = s.root_url yield crawler.crawl(url=root_url, **spider_kwargs) return crawler.spider.collected_items, s.root_url, crawler @inlineCallbacks -def crawl_single_item(spider_cls, resource_cls, settings, spider_kwargs=None): +def crawl_single_item(spider_cls, resource_cls, settings, spider_kwargs=None, port=None): """Run a spider where a single item is expected. 
Use in combination with ``capture_capture_exceptions`` and ``CollectorPipeline`` """ items, url, crawler = yield crawl_items(spider_cls, resource_cls, settings, - spider_kwargs=spider_kwargs) + spider_kwargs=spider_kwargs, port=port) assert len(items) == 1 resp = items[0] if 'exception' in resp: diff --git a/tox.ini b/tox.ini index 50d2b8d0..b6f46a45 100644 --- a/tox.ini +++ b/tox.ini @@ -7,7 +7,7 @@ deps = pytest-cov scrapy >= 2.1.0 pytest-twisted - web-poet + web-poet @ git+https://git@github.com/scrapinghub/web-poet@handle_urls#egg=web-poet commands = py.test \ From 35e7876c15432d4008eed8831fd32d7043590e97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Iv=C3=A1n=20de=20Prado?= Date: Thu, 9 Dec 2021 10:51:09 +0100 Subject: [PATCH 02/19] Remove a print line --- scrapy_poet/overrides.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scrapy_poet/overrides.py b/scrapy_poet/overrides.py index 55347ac2..5f76f377 100644 --- a/scrapy_poet/overrides.py +++ b/scrapy_poet/overrides.py @@ -102,7 +102,6 @@ def add_rule(self, rule: Union[RuleAsTuple, OverrideRule]): pattern, use, instead_of = rule rule = OverrideRule(for_patterns=Patterns([pattern]), use=use, instead_of=instead_of) self.rules.append(rule) - print(rule) self.matcher[rule.instead_of].add_or_update(len(self.rules) - 1, rule.for_patterns) def overrides_for(self, request: Request) -> Mapping[Callable, Callable]: From 327139e4ff43eb71665312df537d9166db57c180 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 21 Dec 2021 19:19:49 +0800 Subject: [PATCH 03/19] improve docs and example code --- CHANGELOG.rst | 8 ++- docs/intro/tutorial.rst | 56 ++++++++++++------- docs/overrides.rst | 30 ++++++---- .../example/spiders/books_04_overrides_02.py | 9 ++- 4 files changed, 67 insertions(+), 36 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index fd3363fb..6b681816 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,9 +6,13 @@ Changelog TBR: ------------------ -* Cache mechanism using SCRAPY_POET_CACHE setting 
-* New and richer SCRAPY_POET_OVERRIDES registry that uses the +* Cache mechanism using ``SCRAPY_POET_CACHE`` setting +* New and richer ``SCRAPY_POET_OVERRIDES`` registry that uses the url-matcher patterns to configure the overrides + + * This results in a **backward incompatible** change since the rules + follow a different structure. + * removed support for Python 3.6 * added support for Python 3.10 diff --git a/docs/intro/tutorial.rst b/docs/intro/tutorial.rst index a2aab34a..33af5f4c 100644 --- a/docs/intro/tutorial.rst +++ b/docs/intro/tutorial.rst @@ -348,10 +348,10 @@ be done by configuring ``SCRAPY_POET_OVERRIDES`` into ``settings.py``: .. code-block:: python - "SCRAPY_POET_OVERRIDES": [ - ("toscrape.com", BTSBookListPage, BookListPage), - ("toscrape.com", BTSBookPage, BookPage) - ] + SCRAPY_POET_OVERRIDES = [ + ("toscrape.com", BTSBookListPage, BookListPage), + ("toscrape.com", BTSBookPage, BookPage) + ] The spider is back to life! ``SCRAPY_POET_OVERRIDES`` contain rules that overrides the Page Objects @@ -397,34 +397,48 @@ are used for the domain .. code-block:: python - "SCRAPY_POET_OVERRIDES": [ - ("toscrape.com", BTSBookListPage, BookListPage), - ("toscrape.com", BTSBookPage, BookPage), - ("bookpage.com", BPBookListPage, BookListPage), - ("bookpage.com", BPBookPage, BookPage) - ] + SCRAPY_POET_OVERRIDES = [ + ("toscrape.com", BTSBookListPage, BookListPage), + ("toscrape.com", BTSBookPage, BookPage), + ("bookpage.com", BPBookListPage, BookListPage), + ("bookpage.com", BPBookPage, BookPage) + ] The spider is now ready to extract books from both sites 😀. The full example `can be seen here `_ -On a surface, it looks just like a different way to organize Scrapy spider +On the surface, it looks just like a different way to organize Scrapy spider code - and indeed, it *is* just a different way to organize the code, but it opens some cool possibilities. -.. 
note:: +In the examples above we have been configuring the overrides +for a particular domain, but more complex URL patterns are also possible. +For example, the pattern ``books.toscrape.com/catalogue/category/`` +is accepted and it would restrict the override only to category pages. + +It is even possible to configure more complex patterns by +using the ``OverrideRule`` class instead of a triplet in +the configuration. Another way of declaring the earlier config +for ``SCRAPY_POET_OVERRIDES`` would be the following: + +.. code-block:: python + + from url_matcher import Patterns + from web_poet.overrides import OverrideRule - In the examples above we have been configuring the overrides - for a particular domain, but more complex URL patterns are also possible. - For example, the pattern ``books.toscrape.com/cataloge/category/`` - is accepted and it would restrict the override only to category pages. + SCRAPY_POET_OVERRIDES = [ + OverrideRule(for_patterns=Patterns(["toscrape.com"]), use=BTSBookListPage, instead_of=BookListPage), + OverrideRule(for_patterns=Patterns(["toscrape.com"]), use=BTSBookPage, instead_of=BookPage), + OverrideRule(for_patterns=Patterns(["bookpage.com"]), use=BPBookListPage, instead_of=BookListPage), + OverrideRule(for_patterns=Patterns(["bookpage.com"]), use=BPBookPage, instead_of=BookPage), + ] - It is even possible to configure more complex patterns by - using the ``OverrideRule`` class instead of a triplet in - the configuration. +As you can see, this could get verbose. The earlier tuple config simply offers +a shortcut to be more concise. - Also see the `url-matcher `_ - documentation for more information about the patterns syntax. +Also see the `url-matcher `_ +documentation for more information about the patterns syntax. 
Next steps ========== diff --git a/docs/overrides.rst b/docs/overrides.rst index 9a23060b..9bf89d04 100644 --- a/docs/overrides.rst +++ b/docs/overrides.rst @@ -100,7 +100,7 @@ is only applied for book pages from ``books.toscrape.com``: ) ] -Note how category pages are excludes by using a ``exclude`` pattern. +Note how category pages are excluded by using an ``exclude`` pattern. You can find more information about the patterns syntax in the `url-matcher `_ documentation. @@ -120,38 +120,46 @@ Let's see an example: .. code-block:: python - @handle_urls("toscrape.com", BookPage) - class BTSBookPage(BookPage): + from web_poet import handle_urls - def to_item(self): - return { - 'url': self.url, - 'name': self.css("title::text").get(), - } + @handle_urls("toscrape.com", BookPage) + class BTSBookPage(BookPage): + + def to_item(self): + return { + 'url': self.url, + 'name': self.css("title::text").get(), + } The ``handle_urls`` decorator in this case is indicating that the class ``BSTBookPage`` should be used instead of ``BookPage`` for the domain ``toscrape.com``. -In order to configure the scrapy-poet overrides automatically +In order to configure the ``scrapy-poet`` overrides automatically using these annotations, you can use the function ``find_page_object_overrides``. For example: .. code-block:: python + from web_poet import find_page_object_overrides + SCRAPY_POET_OVERRIDES = find_page_object_overrides("my_page_objects_module") The function will collect all the ``handle_urls`` annotations from the ``my_page_objects_module`` and submodules, and will convert them to rules ready to be used with ``SCRAPY_POET_OVERRIDES``. +For more info and advanced features of ``web-poet``'s ``handle_urls`` +and ``find_page_object_overrides``, kindly read the `web-poet `_ +documentation regarding Overrides. 
+ Overrides registry ================== -The overrides registry is responsible of informing whether there exists an +The overrides registry is responsible for informing whether there exists an override for a particular type for a given request. The default overrides -registry allows to configure these rules using patterns that follows the +registry allows to configure these rules using patterns that follow the `url-matcher `_ syntax. These rules can be configured using the ``SCRAPY_POET_OVERRIDES`` setting, as it has been seen in the :ref:`intro-tutorial` example. diff --git a/example/example/spiders/books_04_overrides_02.py b/example/example/spiders/books_04_overrides_02.py index 177656e5..b4c366a7 100644 --- a/example/example/spiders/books_04_overrides_02.py +++ b/example/example/spiders/books_04_overrides_02.py @@ -8,6 +8,9 @@ """ import scrapy from web_poet import ItemWebPage, WebPage +from web_poet.overrides import OverrideRule +from url_matcher import Patterns + from scrapy_poet import callback_for @@ -61,8 +64,10 @@ class BooksSpider(scrapy.Spider): "SCRAPY_POET_OVERRIDES": [ ("toscrape.com", BTSBookListPage, BookListPage), ("toscrape.com", BTSBookPage, BookPage), - ("bookpage.com", BPBookListPage, BookListPage), - ("bookpage.com", BPBookPage, BookPage) + + # We could also use the long-form version if we want to. 
+ OverrideRule(for_patterns=Patterns(["bookpage.com"]), use=BPBookListPage, instead_of=BookListPage), + OverrideRule(for_patterns=Patterns(["bookpage.com"]), use=BPBookPage, instead_of=BookPage), ] } From d85766e2ade22b3acbd211106fe2bc130c977706 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 21 Dec 2021 19:31:16 +0800 Subject: [PATCH 04/19] deprecate PerDomainOverridesRegistry in lieu of OverridesRegistry --- CHANGELOG.rst | 12 ++++++++---- docs/settings.rst | 4 ++-- scrapy_poet/overrides.py | 24 ------------------------ 3 files changed, 10 insertions(+), 30 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 6b681816..b4513b1b 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,11 +7,15 @@ TBR: ------------------ * Cache mechanism using ``SCRAPY_POET_CACHE`` setting -* New and richer ``SCRAPY_POET_OVERRIDES`` registry that uses the - url-matcher patterns to configure the overrides - * This results in a **backward incompatible** change since the rules - follow a different structure. +* We also have these **backward incompatible** changes since the + rules follow a different structure: + + * Deprecated ``PerDomainOverridesRegistry`` in lieu of the newer + ``OverridesRegistry`` which provides a wide variety of features + for better URL matching. + * This results in a newer ``SCRAPY_POET_OVERRIDES`` which follows + a different format. * removed support for Python 3.6 * added support for Python 3.10 diff --git a/docs/settings.rst b/docs/settings.rst index c13a9580..2dbdec30 100644 --- a/docs/settings.rst +++ b/docs/settings.rst @@ -25,7 +25,7 @@ Default: ``None`` Mapping of overrides for each domain. The format of the such ``dict`` mapping depends on the currently set Registry. The default is currently -:class:`~.PerDomainOverridesRegistry`. This can be overriden by the setting below: +:class:`~.OverridesRegistry`. This can be overridden by the setting below: ``SCRAPY_POET_OVERRIDES_REGISTRY``. 
There are sections dedicated for this at :ref:`intro-tutorial` and :ref:`overrides`. @@ -36,7 +36,7 @@ SCRAPY_POET_OVERRIDES_REGISTRY Defaut: ``None`` -Sets an alternative Registry to replace the default :class:`~.PerDomainOverridesRegistry`. +Sets an alternative Registry to replace the default :class:`~.OverridesRegistry`. To use this, set a ``str`` which denotes the absolute object path of the new Registry. diff --git a/scrapy_poet/overrides.py b/scrapy_poet/overrides.py index 5f76f377..805a358e 100644 --- a/scrapy_poet/overrides.py +++ b/scrapy_poet/overrides.py @@ -23,30 +23,6 @@ def overrides_for(self, request: Request) -> Mapping[Callable, Callable]: pass -class PerDomainOverridesRegistry(Dict[str, Dict[Callable, Callable]], OverridesRegistryBase): - """ - Simple dictionary based registry that reads the overrides - from the option ``SCRAPY_POET_OVERRIDES`` in the spider settings - - Example of overrides configuration: - - .. code-block:: python - - SCRAPY_POET_OVERRIDES = { - "example.com": { - BookPage: ISBNBookPage - } - } - """ - - @classmethod - def from_crawler(cls, crawler: Crawler): - return cls(crawler.settings.getdict("SCRAPY_POET_OVERRIDES", {})) - - def overrides_for(self, request: Request) -> Mapping[Callable, Callable]: - return self.get(get_domain(request.url), {}) - - RuleAsTuple = Union[Tuple[str, Callable, Callable], List] class OverridesRegistry(OverridesRegistryBase): From 670715a992928798ce036e3d018125e8fed3351d Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 21 Dec 2021 20:09:10 +0800 Subject: [PATCH 05/19] improve readability of OverridesRegistry's docs --- scrapy_poet/overrides.py | 51 +++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 19 deletions(-) diff --git a/scrapy_poet/overrides.py b/scrapy_poet/overrides.py index 805a358e..0b83e51a 100644 --- a/scrapy_poet/overrides.py +++ b/scrapy_poet/overrides.py @@ -1,16 +1,18 @@ -from collections import defaultdict - +import logging from abc import ABC, 
abstractmethod +from collections import defaultdict from typing import Dict, Mapping, Callable, Iterable, Union, Tuple, Optional, List from scrapy import Request from scrapy.crawler import Crawler from url_matcher import Patterns, URLMatcher - from url_matcher.util import get_domain from web_poet.overrides import OverrideRule +logger = logging.getLogger(__name__) + + class OverridesRegistryBase(ABC): @abstractmethod @@ -27,37 +29,47 @@ def overrides_for(self, request: Request) -> Mapping[Callable, Callable]: class OverridesRegistry(OverridesRegistryBase): """ - Overrides registry that reads the overrides - from the option ``SCRAPY_POET_OVERRIDES`` in the spider settings. It - is a list and each rule can be a tuple or an instance of the class ``OverrideRule``. + Overrides registry that reads the overrides from the ``SCRAPY_POET_OVERRIDES`` + in the spider settings. It is a list and each rule can be a tuple or an + instance of the class ``OverrideRule``. - If a tuple is provided, the first element is the pattern to match the URL, - the second element is the type to be used instead of the type in - the third element. Another way to see it: - for the URLs that match the pattern ``tuple[0]`` use ``tuple[1]`` instead of ``tuple[2]``. + If a tuple is provided: + + - the **first** element is the pattern to match the URL, + - the **second** element is the type to be used instead of the type in + the **third** element. + + Another way to see it for the URLs that match the pattern ``tuple[0]`` use + ``tuple[1]`` instead of ``tuple[2]``. Example of overrides configuration: .. 
code-block:: python - SCRAPY_POET_OVERRIDES = [ ("books.toscrape.com", ISBNBookPage, BookPage), - OverrideRule(for_patterns=Patterns(["books.toscrape.com"]), - use=MyBookListPage, - instead_of=BookListPage, - ), + OverrideRule( + for_patterns=Patterns(["books.toscrape.com"]), + use=MyBookListPage, + instead_of=BookListPage, + ), ] - It can be handy to compile the list of rules automatically - from a module using the method ``find_page_object_overrides``. For example: + It can be handy to compile the list of rules automatically from a module + using the utility function ``find_page_object_overrides()`` from ``web-poet``. + For example: .. code-block:: python + from web_poet import find_page_object_overrides + SCRAPY_POET_OVERRIDES = find_page_object_overrides("my_page_objects_module") - It finds all the rules annotated using the decorator ``handle_urls`` inside the module ``my_page_objects_module`` and - its submodules. + It finds all the rules annotated using ``web-poet``'s ``@handle_urls`` + decorator inside the ``my_page_objects_module`` module and all of its + submodules. + + More info on this at `web-poet `_. 
""" @classmethod @@ -69,6 +81,7 @@ def __init__(self, rules: Optional[Iterable[Union[RuleAsTuple, OverrideRule]]] = self.matcher: Dict[Callable, URLMatcher] = defaultdict(URLMatcher) for rule in rules or []: self.add_rule(rule) + logger.debug(f"List of parsed OverrideRules:\n{self.rules}") def add_rule(self, rule: Union[RuleAsTuple, OverrideRule]): if isinstance(rule, (tuple, list)): From 706e4ac6c5e3c1c533eeb66a81d413cc72adc518 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 21 Dec 2021 20:25:22 +0800 Subject: [PATCH 06/19] improve type annotations and errors in OverridesRegistry --- scrapy_poet/overrides.py | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/scrapy_poet/overrides.py b/scrapy_poet/overrides.py index 0b83e51a..67b81dd4 100644 --- a/scrapy_poet/overrides.py +++ b/scrapy_poet/overrides.py @@ -12,9 +12,11 @@ logger = logging.getLogger(__name__) +RuleAsTuple = Union[Tuple[str, Callable, Callable], List] +RuleFromUser = Union[RuleAsTuple, OverrideRule] -class OverridesRegistryBase(ABC): +class OverridesRegistryBase(ABC): @abstractmethod def overrides_for(self, request: Request) -> Mapping[Callable, Callable]: """ @@ -25,8 +27,6 @@ def overrides_for(self, request: Request) -> Mapping[Callable, Callable]: pass -RuleAsTuple = Union[Tuple[str, Callable, Callable], List] - class OverridesRegistry(OverridesRegistryBase): """ Overrides registry that reads the overrides from the ``SCRAPY_POET_OVERRIDES`` @@ -73,28 +73,35 @@ class OverridesRegistry(OverridesRegistryBase): """ @classmethod - def from_crawler(cls, crawler: Crawler): + def from_crawler(cls, crawler: Crawler) -> Crawler: return cls(crawler.settings.getlist("SCRAPY_POET_OVERRIDES", [])) - def __init__(self, rules: Optional[Iterable[Union[RuleAsTuple, OverrideRule]]] = None): + def __init__(self, rules: Optional[Iterable[RuleFromUser]] = None) -> None: self.rules: List[OverrideRule] = [] self.matcher: Dict[Callable, URLMatcher] = 
defaultdict(URLMatcher) for rule in rules or []: self.add_rule(rule) logger.debug(f"List of parsed OverrideRules:\n{self.rules}") - def add_rule(self, rule: Union[RuleAsTuple, OverrideRule]): + def add_rule(self, rule: RuleFromUser) -> None: if isinstance(rule, (tuple, list)): if len(rule) != 3: - raise ValueError(f"Invalid overrides rule: {rule}. Rules as tuples must have three elements: " - f"the pattern, the type to override and the new type to use instead.") + raise ValueError( + f"Invalid overrides rule: {rule}. Rules as tuples must have " + f"3 elements: (1) the pattern, (2) the PO class used as a " + f"replacement and (3) the PO class to be replaced." + ) pattern, use, instead_of = rule - rule = OverrideRule(for_patterns=Patterns([pattern]), use=use, instead_of=instead_of) + rule = OverrideRule( + for_patterns=Patterns([pattern]), use=use, instead_of=instead_of + ) self.rules.append(rule) - self.matcher[rule.instead_of].add_or_update(len(self.rules) - 1, rule.for_patterns) + self.matcher[rule.instead_of].add_or_update( + len(self.rules) - 1, rule.for_patterns + ) def overrides_for(self, request: Request) -> Mapping[Callable, Callable]: - overrides = {} + overrides: Dict[Callable, Callable] = {} for instead_of, matcher in self.matcher.items(): rule_id = matcher.match(request.url) if rule_id is not None: From bf4e61b52792eb8915e1b0a5f5e9aae3aa4e9d98 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 21 Dec 2021 20:40:55 +0800 Subject: [PATCH 07/19] improve test coverage --- scrapy_poet/cache.py | 4 ++-- scrapy_poet/utils.py | 3 ++- tests/test_utils.py | 21 +++++++++++++++++++++ 3 files changed, 25 insertions(+), 3 deletions(-) create mode 100644 tests/test_utils.py diff --git a/scrapy_poet/cache.py b/scrapy_poet/cache.py index 07b4ee5e..d1a9ef47 100644 --- a/scrapy_poet/cache.py +++ b/scrapy_poet/cache.py @@ -54,14 +54,14 @@ def decode(self, obj: Any) -> Any: return pickle.loads(data) def __str__(self) -> str: - return ( + return ( #pragma: no cover 
f"SqlitedictCache <{self.db.filename} | " f"compressed: {self.compressed} | " f"{len(self.db)} records>" ) def __repr__(self) -> str: - return f"SqlitedictCache({self.path!r}, compressed={self.compressed})" + return f"SqlitedictCache({self.path!r}, compressed={self.compressed})" #pragma: no cover def __getitem__(self, fingerprint: str) -> Any: return self.db[fingerprint] diff --git a/scrapy_poet/utils.py b/scrapy_poet/utils.py index 83d1e2ca..7564b27c 100644 --- a/scrapy_poet/utils.py +++ b/scrapy_poet/utils.py @@ -1,9 +1,10 @@ import os +from pathlib import PosixPath from scrapy.utils.project import project_data_dir, inside_project -def get_scrapy_data_path(createdir: bool = True, default_dir: str = ".scrapy") -> str: +def get_scrapy_data_path(createdir: bool = True, default_dir: str = ".scrapy") -> PosixPath: """Return a path to a folder where Scrapy is storing data. Usually that's a .scrapy folder inside the project. """ diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 00000000..05e55542 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,21 @@ +from unittest import mock +from pathlib import PosixPath + +from scrapy_poet.utils import get_scrapy_data_path + + +@mock.patch("scrapy_poet.utils.os.makedirs") +@mock.patch("scrapy_poet.utils.inside_project") +def test_get_scrapy_data_path(mock_inside_project, mock_makedirs, tmp_path): + mock_inside_project.return_value = False + + path = tmp_path / "test_dir" + result = get_scrapy_data_path(createdir=True, default_dir=path) + + assert isinstance(result, PosixPath) + assert str(result) # should be non-empty + + mock_inside_project.assert_called_once() + + mock_makedirs.assert_called_once() + mock_makedirs.assert_called_with(path, exist_ok=True) From c865c60f1cea0fba17186fd230391be0b0bb1577 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Thu, 23 Dec 2021 10:25:21 +0800 Subject: [PATCH 08/19] update docs in-line with recent web-poet refactoring --- docs/overrides.rst | 25 
+++++++++++++------------ scrapy_poet/overrides.py | 18 +++++++++++++----- 2 files changed, 26 insertions(+), 17 deletions(-) diff --git a/docs/overrides.rst b/docs/overrides.rst index 9bf89d04..3c16f8cd 100644 --- a/docs/overrides.rst +++ b/docs/overrides.rst @@ -112,9 +112,7 @@ Decorate Page Objects with the rules Having the rules along with the Page Objects is a good idea, as you can identify with a single sight what the Page Object is doing along with where it is applied. This can be done by decorating the -Page Objects with ``handle_urls`` and then -configure the overrides automatically with the help of the function -``find_page_object_overrides``. +Page Objects with ``@handle_urls`` provided by ``web-poet``. Let's see an example: @@ -131,28 +129,31 @@ Let's see an example: 'name': self.css("title::text").get(), } -The ``handle_urls`` decorator in this case is indicating that +The ``@handle_urls`` decorator in this case is indicating that the class ``BSTBookPage`` should be used instead of ``BookPage`` for the domain ``toscrape.com``. In order to configure the ``scrapy-poet`` overrides automatically -using these annotations, -you can use the function ``find_page_object_overrides``. +using these annotations, you can directly interact with ``web-poet``'s +default registry. + For example: .. code-block:: python - from web_poet import find_page_object_overrides + from web_poet import default_registry - SCRAPY_POET_OVERRIDES = find_page_object_overrides("my_page_objects_module") + SCRAPY_POET_OVERRIDES = default_registry.get_overrides_from_module("my_page_objects_module") -The function will collect all the ``handle_urls`` annotations from the +The function will collect all the ``@handle_urls`` annotations from the ``my_page_objects_module`` and submodules, and will convert them to rules ready to be used with ``SCRAPY_POET_OVERRIDES``. 
-For more info and advanced features, of ``web-poet``'s ``handle_urls`` -and ``find_page_object_overrides``, kindly read the `web-poet `_ -documentatino regarding Overrides. +.. note:: + + For more info and advanced features of ``web-poet``'s ``@handle_urls`` + and its registry, kindly read the `web-poet `_ + documentation regarding Overrides. Overrides registry ================== diff --git a/scrapy_poet/overrides.py b/scrapy_poet/overrides.py index 67b81dd4..d792fe91 100644 --- a/scrapy_poet/overrides.py +++ b/scrapy_poet/overrides.py @@ -55,20 +55,28 @@ class OverridesRegistry(OverridesRegistryBase): ), ] - It can be handy to compile the list of rules automatically from a module - using the utility function ``find_page_object_overrides()`` from ``web-poet``. - For example: + Now, if you've used ``web-poet``'s built-in functionality to directly create + the override rules in the Page Object via the ``@handle_urls`` annotation, + you can quickly import them via: .. code-block:: python - from web_poet import find_page_object_overrides + from web_poet import default_registry - SCRAPY_POET_OVERRIDES = find_page_object_overrides("my_page_objects_module") + SCRAPY_POET_OVERRIDES = default_registry.get_overrides_from_module("my_page_objects_module") It finds all the rules annotated using ``web-poet``'s ``@handle_urls`` decorator inside the ``my_page_objects_module`` module and all of its submodules. + However, for most cases, you'd most likely going to simply retrieve all of + the override rules that were ever declared on a given registry. Thus, you + could simply do: + + .. code-block:: python + + SCRAPY_POET_OVERRIDES = default_registry.get_overrides() + More info on this at `web-poet `_. 
""" From 63029dc1e29d7338ce92cc3fe41cfc956fa61f38 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Thu, 23 Dec 2021 11:40:14 +0800 Subject: [PATCH 09/19] add integration tests for web-poet --- tests/po_lib/__init__.py | 27 +++++++++++++++++++++++++++ tests/test_middleware.py | 24 ++++++++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100644 tests/po_lib/__init__.py diff --git a/tests/po_lib/__init__.py b/tests/po_lib/__init__.py new file mode 100644 index 00000000..535de873 --- /dev/null +++ b/tests/po_lib/__init__.py @@ -0,0 +1,27 @@ +""" +This package is just for overrides testing purposes. +""" +import socket +from typing import Dict, Any, Callable + +from url_matcher import Patterns +from url_matcher.util import get_domain +from web_poet import handle_urls, ItemWebPage + +from tests.mockserver import get_ephemeral_port + + +# Need to define it here since it's always changing +DOMAIN = get_domain(socket.gethostbyname(socket.gethostname())) +PORT = get_ephemeral_port() + + +class POOverriden(ItemWebPage): + def to_item(self): + return {"msg": "PO that will be replace"} + + +@handle_urls(f"{DOMAIN}:{PORT}", POOverriden) +class POIntegration(ItemWebPage): + def to_item(self): + return {"msg": "PO replacement"} diff --git a/tests/test_middleware.py b/tests/test_middleware.py index 1434d895..ec650e09 100644 --- a/tests/test_middleware.py +++ b/tests/test_middleware.py @@ -24,6 +24,7 @@ from scrapy_poet.page_input_providers import ( PageObjectInputProvider ) +from web_poet import default_registry from web_poet.page_inputs import ResponseData from scrapy_poet import DummyResponse from tests.utils import (HtmlResource, @@ -350,3 +351,26 @@ def get_middleware(settings): mock.call('/tmp/cache', compressed=True), mock.call().close() ] + + +@inlineCallbacks +def test_web_poet_integration(settings): + """This tests scrapy-poet's integration with web-poet most especially when + populating override settings via: + + from web_poet import default_registry 
+ + SCRAPY_POET_OVERRIDES = default_registry.get_overrides() + """ + + # Only import them in this test scope since they need to be synced with + # the URL of the Page Object annotated with @handle_urls. + from tests.po_lib import DOMAIN, PORT, POOverriden + + # Override rules are defined in `tests/po_lib/__init__.py`. + settings["SCRAPY_POET_OVERRIDES"] = default_registry.get_overrides() + + item, url, _ = yield crawl_single_item( + spider_for(POOverriden), ProductHtml, settings, port=PORT + ) + assert item == {"msg": "PO replacement"} From 5305da47e0258d1eb6348d693f79a17ec9efcc41 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Wed, 5 Jan 2022 11:59:58 +0800 Subject: [PATCH 10/19] fix and improve docs --- CHANGELOG.rst | 3 +-- docs/intro/tutorial.rst | 35 +++++++++++++++++++++++++++-------- docs/overrides.rst | 30 ++++++++++++++++++------------ 3 files changed, 46 insertions(+), 22 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index b4513b1b..ef7a3f7e 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -14,8 +14,7 @@ TBR: * Deprecated ``PerDomainOverridesRegistry`` in lieu of the newer ``OverridesRegistry`` which provides a wide variety of features for better URL matching. - * This resuls in a newer ``SCRAPY_POET_OVERRIDES`` which follows - a different format. + * This resuls in a newer format in the ``SCRAPY_POET_OVERRIDES`` setting. * removed support for Python 3.6 * added support for Python 3.10 diff --git a/docs/intro/tutorial.rst b/docs/intro/tutorial.rst index 33af5f4c..c799af69 100644 --- a/docs/intro/tutorial.rst +++ b/docs/intro/tutorial.rst @@ -9,7 +9,7 @@ system. If thatโ€™s not the case, see :ref:`intro-install`. .. note:: - This tutorial can be followed without reading `web-poet docs`_, but + This tutorial can be followed without reading `web-poet`_ docs, but for a better understanding it is highly recommended to check them first. 
@@ -26,7 +26,7 @@ This tutorial will walk you through these tasks: If you're not already familiar with Scrapy, and want to learn it quickly, the `Scrapy Tutorial`_ is a good resource. -.. _web-poet docs: https://web-poet.readthedocs.io/en/stable/ +.. _web-poet: https://web-poet.readthedocs.io/en/stable/ Creating a spider ================= @@ -125,7 +125,7 @@ To use ``scrapy-poet``, enable its downloader middleware in ``settings.py``: ``BookPage`` class we created previously can be used without ``scrapy-poet``, and even without Scrapy (note that imports were from ``web_poet`` so far). -``scrapy-poet`` makes it easy to use ``web-poet`` Page Objects +``scrapy-poet`` makes it easy to use `web-poet`_ Page Objects (such as BookPage) in Scrapy spiders. Changing spider @@ -427,7 +427,7 @@ for ``SCRAPY_POET_OVERRIDES`` would be the following: from url_matcher import Patterns from web_poet.overrides import OverrideRule - SCRAPY_POET_PROVIDERS = [ + SCRAPY_POET_OVERRIDES = [ OverrideRule(for_patterns=Patterns(["toscrape.com"]), use=BTSBookListPage, instead_of=BookListPage), OverrideRule(for_patterns=Patterns(["toscrape.com"]), use=BTSBookPage, instead_of=BookPage), OverrideRule(for_patterns=Patterns(["bookpage.com"]), use=BPBookListPage, instead_of=BookListPage), @@ -437,8 +437,27 @@ for ``SCRAPY_POET_OVERRIDES`` would be the following: As you can see, this could get verbose. The earlier tuple config simply offers a shortcut to be more concise. -Also see the `url-matcher `_ -documentation for more information about the patterns syntax. +.. note:: + + Also see the `url-matcher `_ + documentation for more information about the patterns syntax. + +Manually defining overrides like this would be inconvenient, most +especially for larger projects. Fortunately, `web-poet`_ has a cool feature +to annotate Page Objects like ``@web_poet.handle_urls`` that would define and +store the ``OverrideRule`` for you. All of the Override rules could then be +simply read as: + +.. 
code:: python + + from web_poet import default_registry + + SCRAPY_POET_OVERRIDES = default_registry.get_overrides() + +For more info on this, you can refer to these docs: + + * :ref:`overrides` section + * external `web-poet`_ docs Next steps ========== @@ -446,7 +465,7 @@ Next steps Now that you know how ``scrapy-poet`` is supposed to work, what about trying to apply it to an existing or new Scrapy project? -Also, please check :ref:`overrides`, :ref:`providers` and refer to spiders in the "example" -folder: https://github.com/scrapinghub/scrapy-poet/tree/master/example/example/spiders +Also, please check the :ref:`overrides` and :ref:`providers` sections as well as +refer to spiders in the "example" folder: https://github.com/scrapinghub/scrapy-poet/tree/master/example/example/spiders .. _Scrapy Tutorial: https://docs.scrapy.org/en/latest/intro/tutorial.html diff --git a/docs/overrides.rst b/docs/overrides.rst index 3c16f8cd..59cd6b34 100644 --- a/docs/overrides.rst +++ b/docs/overrides.rst @@ -89,7 +89,6 @@ is only applied for book pages from ``books.toscrape.com``: .. code-block:: python - SCRAPY_POET_OVERRIDES = [ OverrideRule( for_patterns=Patterns( @@ -112,7 +111,7 @@ Decorate Page Objects with the rules Having the rules along with the Page Objects is a good idea, as you can identify with a single sight what the Page Object is doing along with where it is applied. This can be done by decorating the -Page Objects with ``@handle_urls`` provided by ``web-poet``. +Page Objects with ``@handle_urls`` provided by `web-poet`_. 
Let's see an example: @@ -123,18 +122,18 @@ Let's see an example: @handle_urls("toscrape.com", BookPage) class BTSBookPage(BookPage): - def to_item(self): - return { - 'url': self.url, - 'name': self.css("title::text").get(), - } + def to_item(self): + return { + 'url': self.url, + 'name': self.css("title::text").get(), + } The ``@handle_urls`` decorator in this case is indicating that the class ``BSTBookPage`` should be used instead of ``BookPage`` for the domain ``toscrape.com``. In order to configure the ``scrapy-poet`` overrides automatically -using these annotations, you can directly interact with ``web-poet``'s +using these annotations, you can directly interact with `web-poet`_'s default registry. For example: @@ -143,15 +142,22 @@ For example: from web_poet import default_registry + # To get all of the Override Rules that were declared via annotations. + SCRAPY_POET_OVERRIDES = default_registry.get_overrides() + + # Or, you could even extract the rules on a specific subpackage or module. SCRAPY_POET_OVERRIDES = default_registry.get_overrides_from_module("my_page_objects_module") -The function will collect all the ``@handle_urls`` annotations from the -``my_page_objects_module`` and submodules, and will convert them -to rules ready to be used with ``SCRAPY_POET_OVERRIDES``. +The ``get_overrides()`` and ``get_overrides_from_module()`` methods of the +``default_registry`` above returns ``List[OverrideRule]`` that were declared +using `web-poet`_'s ``@handle_urls()`` annotation. This is much more convenient +that manually defining all of the ``OverrideRule``. Take note that since +``SCRAPY_POET_OVERRIDES`` is structured as ``List[OverrideRule]``, you can easily +modify it later on if needed. .. note:: - For more info and advanced features of ``web-poet``'s ``@handle_urls`` + For more info and advanced features of `web-poet`_'s ``@handle_urls`` and its registry, kindly read the `web-poet `_ documentation regarding Overrides. 
From 2d0c3bc5f8f21af766dd2332d9987a8e24a41654 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 7 Jan 2022 20:36:08 +0800 Subject: [PATCH 11/19] update docs to reflect new changes from web-poet --- docs/intro/tutorial.rst | 7 ++++++- docs/overrides.rst | 13 ++++++++++--- scrapy_poet/overrides.py | 13 ++++++++++--- 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/docs/intro/tutorial.rst b/docs/intro/tutorial.rst index c799af69..136aa7ba 100644 --- a/docs/intro/tutorial.rst +++ b/docs/intro/tutorial.rst @@ -450,7 +450,12 @@ simply read as: .. code:: python - from web_poet import default_registry + from web_poet import default_registry, consume_modules + + # The consume_modules() must be called first if you need to load + # rules from other packages. Otherwise, it can be omitted. + # More info about this caveat on web-poet docs. + consume_modules("external_package_A.po", "another_ext_package.lib") SCRAPY_POET_OVERRIDES = default_registry.get_overrides() diff --git a/docs/overrides.rst b/docs/overrides.rst index 59cd6b34..690d3c56 100644 --- a/docs/overrides.rst +++ b/docs/overrides.rst @@ -140,15 +140,22 @@ For example: .. code-block:: python - from web_poet import default_registry + from web_poet import default_registry, consume_modules + + # The consume_modules() must be called first if you need to load + # rules from other packages. Otherwise, it can be omitted. + # More info about this caveat on web-poet docs. + consume_modules("external_package_A.po", "another_ext_package.lib") # To get all of the Override Rules that were declared via annotations. SCRAPY_POET_OVERRIDES = default_registry.get_overrides() # Or, you could even extract the rules on a specific subpackage or module. 
- SCRAPY_POET_OVERRIDES = default_registry.get_overrides_from_module("my_page_objects_module") + SCRAPY_POET_OVERRIDES = default_registry.get_overrides_from( + "external_page_objects_package", "another_page_object_package.module_1" + ) -The ``get_overrides()`` and ``get_overrides_from_module()`` methods of the +The ``get_overrides()`` and ``get_overrides_from()`` methods of the ``default_registry`` above returns ``List[OverrideRule]`` that were declared using `web-poet`_'s ``@handle_urls()`` annotation. This is much more convenient that manually defining all of the ``OverrideRule``. Take note that since diff --git a/scrapy_poet/overrides.py b/scrapy_poet/overrides.py index d792fe91..732b89db 100644 --- a/scrapy_poet/overrides.py +++ b/scrapy_poet/overrides.py @@ -63,18 +63,25 @@ class OverridesRegistry(OverridesRegistryBase): from web_poet import default_registry - SCRAPY_POET_OVERRIDES = default_registry.get_overrides_from_module("my_page_objects_module") + SCRAPY_POET_OVERRIDES = default_registry.get_overrides_from("my_page_objects_module") It finds all the rules annotated using ``web-poet``'s ``@handle_urls`` decorator inside the ``my_page_objects_module`` module and all of its submodules. However, for most cases, you'd most likely going to simply retrieve all of - the override rules that were ever declared on a given registry. Thus, you - could simply do: + the override rules that were ever declared on a given registry. Though make + sure to call ``consume_module()`` beforehand: .. code-block:: python + from web_poet import default_registry, consume_modules + + # The consume_modules() must be called first if you need to load + # rules from other packages. Otherwise, it can be omitted. + # More info about this caveat on web-poet docs. + consume_modules("external_package_A.po", "another_ext_package.lib") + SCRAPY_POET_OVERRIDES = default_registry.get_overrides() More info on this at `web-poet `_. 
From ce2392324e5eb126290f333b557918bf863bb569 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Wed, 12 Jan 2022 15:29:39 +0800 Subject: [PATCH 12/19] update docs with respect to new Override Rules interface from web-poet --- docs/intro/tutorial.rst | 6 +++++- docs/overrides.rst | 20 ++++++++++++-------- scrapy_poet/overrides.py | 8 ++++++-- 3 files changed, 23 insertions(+), 11 deletions(-) diff --git a/docs/intro/tutorial.rst b/docs/intro/tutorial.rst index 136aa7ba..acf20cd7 100644 --- a/docs/intro/tutorial.rst +++ b/docs/intro/tutorial.rst @@ -456,9 +456,13 @@ simply read as: # rules from other packages. Otherwise, it can be omitted. # More info about this caveat on web-poet docs. consume_modules("external_package_A.po", "another_ext_package.lib") - SCRAPY_POET_OVERRIDES = default_registry.get_overrides() + # The two lines above could be mixed together via this shortcut: + SCRAPY_POET_OVERRIDES = default_registry.get_overrides( + consume=["external_package_A.po", "another_ext_package.lib"] + ) + For more info on this, you can refer to these docs: * :ref:`overrides` section diff --git a/docs/overrides.rst b/docs/overrides.rst index 690d3c56..1cbf4b19 100644 --- a/docs/overrides.rst +++ b/docs/overrides.rst @@ -150,17 +150,21 @@ For example: # To get all of the Override Rules that were declared via annotations. SCRAPY_POET_OVERRIDES = default_registry.get_overrides() + # The two lines above could be mixed together via this shortcut: + SCRAPY_POET_OVERRIDES = default_registry.get_overrides( + consume=["external_package_A.po", "another_ext_package.lib"] + ) + # Or, you could even extract the rules on a specific subpackage or module. 
- SCRAPY_POET_OVERRIDES = default_registry.get_overrides_from( - "external_page_objects_package", "another_page_object_package.module_1" + SCRAPY_POET_OVERRIDES = default_registry.get_overrides( + filters=["external_page_objects_package", "another_page_object_package.module_1"] ) -The ``get_overrides()`` and ``get_overrides_from()`` methods of the -``default_registry`` above returns ``List[OverrideRule]`` that were declared -using `web-poet`_'s ``@handle_urls()`` annotation. This is much more convenient -that manually defining all of the ``OverrideRule``. Take note that since -``SCRAPY_POET_OVERRIDES`` is structured as ``List[OverrideRule]``, you can easily -modify it later on if needed. +The ``get_overrides()`` method of the ``default_registry`` above returns +``List[OverrideRule]`` that were declared using `web-poet`_'s ``@handle_urls()`` +annotation. This is much more convenient that manually defining all of the +`OverrideRule``. Take note that since ``SCRAPY_POET_OVERRIDES`` is structured as +``List[OverrideRule]``, you can easily modify it later on if needed. .. note:: diff --git a/scrapy_poet/overrides.py b/scrapy_poet/overrides.py index 732b89db..eaa04c81 100644 --- a/scrapy_poet/overrides.py +++ b/scrapy_poet/overrides.py @@ -63,7 +63,7 @@ class OverridesRegistry(OverridesRegistryBase): from web_poet import default_registry - SCRAPY_POET_OVERRIDES = default_registry.get_overrides_from("my_page_objects_module") + SCRAPY_POET_OVERRIDES = default_registry.get_overrides(filters="my_page_objects_module") It finds all the rules annotated using ``web-poet``'s ``@handle_urls`` decorator inside the ``my_page_objects_module`` module and all of its @@ -81,9 +81,13 @@ class OverridesRegistry(OverridesRegistryBase): # rules from other packages. Otherwise, it can be omitted. # More info about this caveat on web-poet docs. 
consume_modules("external_package_A.po", "another_ext_package.lib") - SCRAPY_POET_OVERRIDES = default_registry.get_overrides() + # The two lines above could be mixed together via this shortcut: + SCRAPY_POET_OVERRIDES = default_registry.get_overrides( + consume=["external_package_A.po", "another_ext_package.lib"] + ) + More info on this at `web-poet `_. """ From 0c94cf6d258642c654624192e7bb86308f97ff3e Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Thu, 13 Jan 2022 19:00:20 +0800 Subject: [PATCH 13/19] update docs to reflect web-poet's new 'registry_pool' --- docs/intro/tutorial.rst | 4 ++-- docs/overrides.rst | 34 +++++++++++++++++++++++++++++++--- tests/test_middleware.py | 5 ++++- 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/docs/intro/tutorial.rst b/docs/intro/tutorial.rst index acf20cd7..ad735380 100644 --- a/docs/intro/tutorial.rst +++ b/docs/intro/tutorial.rst @@ -455,12 +455,12 @@ simply read as: # The consume_modules() must be called first if you need to load # rules from other packages. Otherwise, it can be omitted. # More info about this caveat on web-poet docs. - consume_modules("external_package_A.po", "another_ext_package.lib") + consume_modules("external_package_A", "another_ext_package.lib") SCRAPY_POET_OVERRIDES = default_registry.get_overrides() # The two lines above could be mixed together via this shortcut: SCRAPY_POET_OVERRIDES = default_registry.get_overrides( - consume=["external_package_A.po", "another_ext_package.lib"] + consume=["external_package_A", "another_ext_package.lib"] ) For more info on this, you can refer to these docs: diff --git a/docs/overrides.rst b/docs/overrides.rst index 1cbf4b19..deed2a79 100644 --- a/docs/overrides.rst +++ b/docs/overrides.rst @@ -134,7 +134,7 @@ for the domain ``toscrape.com``. In order to configure the ``scrapy-poet`` overrides automatically using these annotations, you can directly interact with `web-poet`_'s -default registry. +``default_registry``. 
For example: @@ -145,14 +145,14 @@ For example: # The consume_modules() must be called first if you need to load # rules from other packages. Otherwise, it can be omitted. # More info about this caveat on web-poet docs. - consume_modules("external_package_A.po", "another_ext_package.lib") + consume_modules("external_package_A", "another_ext_package.lib") # To get all of the Override Rules that were declared via annotations. SCRAPY_POET_OVERRIDES = default_registry.get_overrides() # The two lines above could be mixed together via this shortcut: SCRAPY_POET_OVERRIDES = default_registry.get_overrides( - consume=["external_package_A.po", "another_ext_package.lib"] + consume=["external_package_A", "another_ext_package.lib"] ) # Or, you could even extract the rules on a specific subpackage or module. @@ -172,6 +172,34 @@ annotation. This is much more convenient that manually defining all of the and its registry, kindly read the `web-poet `_ documentation regarding Overrides. +In case the external packages you're using does not use `web-poet`_'s +``default_registry``, you can find and collect custom registries via `web-poet`_'s +``registry_pool``: + +.. code-block:: python + + from web_poet import registry_pool, consume_modules + + # Ensures that the external dependencies are properly imported so that the + # Registry and its accompanying rules can be discovered. + consume_modules("external_package_A", "another_ext_package_B.lib") + + print(registry_pool) + # { + # 'default': , + # 'custom_reg' = , + # 'another_custom_reg' = , + # } + + SCRAPY_POET_OVERRIDES = [ + rule + for _, registry in registry_pool.items() + for rule in registry.get_overrides() + ] + + # Converting it to a set also ensures that there are no duplicate OverrideRules. 
+ SCRAPY_POET_OVERRIDES = set(SCRAPY_POET_OVERRIDES) + Overrides registry ================== diff --git a/tests/test_middleware.py b/tests/test_middleware.py index ec650e09..388fb839 100644 --- a/tests/test_middleware.py +++ b/tests/test_middleware.py @@ -368,7 +368,10 @@ def test_web_poet_integration(settings): from tests.po_lib import DOMAIN, PORT, POOverriden # Override rules are defined in `tests/po_lib/__init__.py`. - settings["SCRAPY_POET_OVERRIDES"] = default_registry.get_overrides() + rules = default_registry.get_overrides() + + # Converting it to a set removes potential duplicate OverrideRules + settings["SCRAPY_POET_OVERRIDES"] = set(rules) item, url, _ = yield crawl_single_item( spider_for(POOverriden), ProductHtml, settings, port=PORT From 1f52f3bb49d78bd82e6983a2b952db2599cf898e Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Wed, 2 Mar 2022 12:03:27 +0800 Subject: [PATCH 14/19] update docs with web-poet's new MVP version and POP definition --- docs/overrides.rst | 44 +++++++++++----------------------------- scrapy_poet/overrides.py | 36 ++++++++++++++++---------------- 2 files changed, 31 insertions(+), 49 deletions(-) diff --git a/docs/overrides.rst b/docs/overrides.rst index deed2a79..f29305f4 100644 --- a/docs/overrides.rst +++ b/docs/overrides.rst @@ -155,50 +155,31 @@ For example: consume=["external_package_A", "another_ext_package.lib"] ) - # Or, you could even extract the rules on a specific subpackage or module. - SCRAPY_POET_OVERRIDES = default_registry.get_overrides( - filters=["external_page_objects_package", "another_page_object_package.module_1"] - ) - The ``get_overrides()`` method of the ``default_registry`` above returns ``List[OverrideRule]`` that were declared using `web-poet`_'s ``@handle_urls()`` annotation. This is much more convenient that manually defining all of the `OverrideRule``. Take note that since ``SCRAPY_POET_OVERRIDES`` is structured as ``List[OverrideRule]``, you can easily modify it later on if needed. -.. 
note:: +.. tip:: - For more info and advanced features of `web-poet`_'s ``@handle_urls`` - and its registry, kindly read the `web-poet `_ - documentation regarding Overrides. + If you're using External Packages which conform to the **POP** + standards as described in **web-poet's** `Page Object Projects (POP) + `_ section, + then retrieving the rules should be as easy as: -In case the external packages you're using does not use `web-poet`_'s -``default_registry``, you can find and collect custom registries via `web-poet`_'s -``registry_pool``: - -.. code-block:: python + .. code-block:: python - from web_poet import registry_pool, consume_modules + import external_package_A, another_ext_package - # Ensures that the external dependencies are properly imported so that the - # Registry and its accompanying rules can be discovered. - consume_modules("external_package_A", "another_ext_package_B.lib") + SCRAPY_POET_OVERRIDES = external_package_A.RULES + another_ext_package.RULES - print(registry_pool) - # { - # 'default': , - # 'custom_reg' = , - # 'another_custom_reg' = , - # } +.. note:: - SCRAPY_POET_OVERRIDES = [ - rule - for _, registry in registry_pool.items() - for rule in registry.get_overrides() - ] + For more info and advanced features of `web-poet`_'s ``@handle_urls`` + and its registry, kindly read the `web-poet `_ + documentation regarding Overrides. - # Converting it to a set also ensures that there are no duplicate OverrideRules. - SCRAPY_POET_OVERRIDES = set(SCRAPY_POET_OVERRIDES) Overrides registry ================== @@ -217,4 +198,3 @@ must be a subclass of ``scrapy_poet.overrides.OverridesRegistryBase`` and must implement the method ``overrides_for``. As other Scrapy components, it can be initialized from the ``from_crawler`` class method if implemented. This might be handy to be able to access settings, stats, request meta, etc. 
- diff --git a/scrapy_poet/overrides.py b/scrapy_poet/overrides.py index eaa04c81..a623c064 100644 --- a/scrapy_poet/overrides.py +++ b/scrapy_poet/overrides.py @@ -57,21 +57,9 @@ class OverridesRegistry(OverridesRegistryBase): Now, if you've used ``web-poet``'s built-in functionality to directly create the override rules in the Page Object via the ``@handle_urls`` annotation, - you can quickly import them via: - - .. code-block:: python - - from web_poet import default_registry - - SCRAPY_POET_OVERRIDES = default_registry.get_overrides(filters="my_page_objects_module") - - It finds all the rules annotated using ``web-poet``'s ``@handle_urls`` - decorator inside the ``my_page_objects_module`` module and all of its - submodules. - - However, for most cases, you'd most likely going to simply retrieve all of - the override rules that were ever declared on a given registry. Though make - sure to call ``consume_module()`` beforehand: + you can quickly import them via the following code below. It finds all the + rules annotated using ``web-poet``'s ``@handle_urls`` decorator that were + registered into ``web_poet.default_registry``. .. code-block:: python @@ -88,8 +76,22 @@ class OverridesRegistry(OverridesRegistryBase): consume=["external_package_A.po", "another_ext_package.lib"] ) - More info on this at `web-poet `_. - """ + Make sure to call ``consume_module()`` beforehand. More info on this at + `web-poet `_. + + .. tip:: + + If you're using External Packages which conform to the **POP** + standards as described in **web-poet's** `Page Object Projects (POP) + `_ section, + then retrieving the rules should be as easy as: + + .. 
code-block:: python + + import external_package_A, another_ext_package + + SCRAPY_POET_OVERRIDES = external_package_A.RULES + another_ext_package.RULES + """ @classmethod def from_crawler(cls, crawler: Crawler) -> Crawler: From 10ba139a38bd4e9c14e0e26e309e48bc924c54ce Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 25 Mar 2022 21:31:49 +0800 Subject: [PATCH 15/19] slight doc improvements --- CHANGELOG.rst | 2 +- docs/intro/tutorial.rst | 4 ++-- docs/overrides.rst | 7 +++++-- scrapy_poet/overrides.py | 10 ++++++++-- 4 files changed, 16 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 3de3ad17..ae8cde19 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,7 +6,7 @@ TBR --- * We have these **backward incompatible** changes since the - ``OverrideRule`` follow a different structure: + ``web_poet.OverrideRule`` follow a different structure: * Deprecated ``PerDomainOverridesRegistry`` in lieu of the newer ``OverridesRegistry`` which provides a wide variety of features diff --git a/docs/intro/tutorial.rst b/docs/intro/tutorial.rst index ad735380..d8e82dbe 100644 --- a/docs/intro/tutorial.rst +++ b/docs/intro/tutorial.rst @@ -126,7 +126,7 @@ To use ``scrapy-poet``, enable its downloader middleware in ``settings.py``: and even without Scrapy (note that imports were from ``web_poet`` so far). ``scrapy-poet`` makes it easy to use `web-poet`_ Page Objects -(such as BookPage) in Scrapy spiders. +(such as ``BookPage``) in Scrapy spiders. Changing spider =============== @@ -418,7 +418,7 @@ For example, the pattern ``books.toscrape.com/cataloge/category/`` is accepted and it would restrict the override only to category pages. It is even possible to configure more complex patterns by -using the ``OverrideRule`` class instead of a triplet in +using the ``web_poet.OverrideRule`` class instead of a triplet in the configuration. 
Another way of declaring the earlier config for ``SCRAPY_POET_OVERRIDES`` would be the following: diff --git a/docs/overrides.rst b/docs/overrides.rst index f29305f4..31d87c35 100644 --- a/docs/overrides.rst +++ b/docs/overrides.rst @@ -158,7 +158,7 @@ For example: The ``get_overrides()`` method of the ``default_registry`` above returns ``List[OverrideRule]`` that were declared using `web-poet`_'s ``@handle_urls()`` annotation. This is much more convenient that manually defining all of the -`OverrideRule``. Take note that since ``SCRAPY_POET_OVERRIDES`` is structured as +``OverrideRule``. Take note that since ``SCRAPY_POET_OVERRIDES`` is structured as ``List[OverrideRule]``, you can easily modify it later on if needed. .. tip:: @@ -172,7 +172,10 @@ annotation. This is much more convenient that manually defining all of the import external_package_A, another_ext_package - SCRAPY_POET_OVERRIDES = external_package_A.RULES + another_ext_package.RULES + SCRAPY_POET_OVERRIDES = ( + external_package_A.REGISTRY.get_overrides() + + another_ext_package.REGISTRY.get_overrides() + ) .. note:: diff --git a/scrapy_poet/overrides.py b/scrapy_poet/overrides.py index a623c064..fa254c52 100644 --- a/scrapy_poet/overrides.py +++ b/scrapy_poet/overrides.py @@ -47,7 +47,10 @@ class OverridesRegistry(OverridesRegistryBase): .. 
code-block:: python SCRAPY_POET_OVERRIDES = [ + # Option 1 ("books.toscrape.com", ISBNBookPage, BookPage), + + # Option 2 OverrideRule( for_patterns=Patterns(["books.toscrape.com"]), use=MyBookListPage, @@ -90,8 +93,11 @@ class OverridesRegistry(OverridesRegistryBase): import external_package_A, another_ext_package - SCRAPY_POET_OVERRIDES = external_package_A.RULES + another_ext_package.RULES - """ + SCRAPY_POET_OVERRIDES = ( + external_package_A.REGISTRY.get_overrides() + + another_ext_package.REGISTRY.get_overrides() + ) + """ @classmethod def from_crawler(cls, crawler: Crawler) -> Crawler: From da93452cdbbfaf8c1281ad23321a2b7e90f1cf2c Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 2 May 2022 19:24:58 +0800 Subject: [PATCH 16/19] improve docs after web-poet PR#27 has been merged --- docs/conf.py | 2 +- docs/intro/tutorial.rst | 28 ++++++++-------- docs/overrides.rst | 70 ++++++++++++++++++---------------------- scrapy_poet/overrides.py | 49 +++++++++++----------------- 4 files changed, 63 insertions(+), 86 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 9aa8ca4d..2e205d04 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -188,7 +188,7 @@ intersphinx_mapping = { 'python': ('https://docs.python.org/3', None, ), 'scrapy': ('https://docs.scrapy.org/en/latest', None, ), - 'web-poet': ('https://web-poet.readthedocs.io/en/stable/', None), + 'web-poet': ('https://web-poet.readthedocs.io/en/latest/', None), 'url-matcher': ('https://url-matcher.readthedocs.io/en/stable/', None), } diff --git a/docs/intro/tutorial.rst b/docs/intro/tutorial.rst index d8e82dbe..5481c895 100644 --- a/docs/intro/tutorial.rst +++ b/docs/intro/tutorial.rst @@ -417,15 +417,16 @@ for a particular domain, but more complex URL patterns are also possible. For example, the pattern ``books.toscrape.com/cataloge/category/`` is accepted and it would restrict the override only to category pages. 
-It is even possible to configure more complex patterns by -using the ``web_poet.OverrideRule`` class instead of a triplet in +It is even possible to configure more complex patterns by using the +:py:class:`web_poet.overrides.OverrideRule` class instead of a triplet in the configuration. Another way of declaring the earlier config for ``SCRAPY_POET_OVERRIDES`` would be the following: .. code-block:: python from url_matcher import Patterns - from web_poet.overrides import OverrideRule + from web_poet import OverrideRule + SCRAPY_POET_OVERRIDES = [ OverrideRule(for_patterns=Patterns(["toscrape.com"]), use=BTSBookListPage, instead_of=BookListPage), @@ -443,30 +444,27 @@ a shortcut to be more concise. documentation for more information about the patterns syntax. Manually defining overrides like this would be inconvenient, most -especially for larger projects. Fortunately, `web-poet`_ has a cool feature -to annotate Page Objects like ``@web_poet.handle_urls`` that would define and -store the ``OverrideRule`` for you. All of the Override rules could then be -simply read as: +especially for larger projects. Fortunately, `web-poet`_ has a cool feature to +annotate Page Objects like :py:func:`web_poet.handle_urls` that would define +and store the :py:class:`web_poet.overrides.OverrideRule` for you. All of the +:py:class:`web_poet.overrides.OverrideRule` rules could then be simply read as: .. code:: python from web_poet import default_registry, consume_modules - # The consume_modules() must be called first if you need to load + # The consume_modules() must be called first if you need to properly import # rules from other packages. Otherwise, it can be omitted. # More info about this caveat on web-poet docs. 
consume_modules("external_package_A", "another_ext_package.lib") SCRAPY_POET_OVERRIDES = default_registry.get_overrides() - # The two lines above could be mixed together via this shortcut: - SCRAPY_POET_OVERRIDES = default_registry.get_overrides( - consume=["external_package_A", "another_ext_package.lib"] - ) - For more info on this, you can refer to these docs: - * :ref:`overrides` section - * external `web-poet`_ docs + * ``scrapy-poet``'s :ref:`overrides` Tutorial section. + * External `web-poet`_ docs. + + * Specifically, the :external:ref:`intro-overrides` Tutorial section. Next steps ========== diff --git a/docs/overrides.rst b/docs/overrides.rst index 31d87c35..3e9c7e4d 100644 --- a/docs/overrides.rst +++ b/docs/overrides.rst @@ -51,7 +51,7 @@ And then override it for a particular domain using ``settings.py``: ("example.com", ISBNBookPage, BookPage) ] -This new Page Objects gets the original ``BookPage`` as dependency and enrich +This new Page Object gets the original ``BookPage`` as dependency and enrich the obtained item with the ISBN from the page HTML. .. note:: @@ -82,13 +82,16 @@ Overrides rules =============== The default way of configuring the override rules is using triplets -of the form (``url pattern``, ``override_type``, ``overridden_type``). But -more complex rules can be introduced if the class ``OverrideRule`` -is used. The following example configures an override that -is only applied for book pages from ``books.toscrape.com``: +of the form (``url pattern``, ``override_type``, ``overridden_type``). But more +complex rules can be introduced if the class :py:class:`web_poet.overrides.OverrideRule` +is used. The following example configures an override that is only applied for +book pages from ``books.toscrape.com``: .. 
code-block:: python + from web_poet import OverrideRule + + SCRAPY_POET_OVERRIDES = [ OverrideRule( for_patterns=Patterns( @@ -111,7 +114,12 @@ Decorate Page Objects with the rules Having the rules along with the Page Objects is a good idea, as you can identify with a single sight what the Page Object is doing along with where it is applied. This can be done by decorating the -Page Objects with ``@handle_urls`` provided by `web-poet`_. +Page Objects with :py:func:`web_poet.handle_urls` provided by `web-poet`_. + +.. tip:: + Make sure to read the :external:ref:`intro-overrides` Tutorial section of + `web-poet`_ to learn all of its other functionalities that is not covered + in this section. Let's see an example: @@ -119,6 +127,7 @@ Let's see an example: from web_poet import handle_urls + @handle_urls("toscrape.com", BookPage) class BTSBookPage(BookPage): @@ -128,13 +137,13 @@ Let's see an example: 'name': self.css("title::text").get(), } -The ``@handle_urls`` decorator in this case is indicating that +The :py:func:`web_poet.handle_urls` decorator in this case is indicating that the class ``BSTBookPage`` should be used instead of ``BookPage`` for the domain ``toscrape.com``. In order to configure the ``scrapy-poet`` overrides automatically using these annotations, you can directly interact with `web-poet`_'s -``default_registry``. +``default_registry`` (an instance of :py:class:`web_poet.overrides.PageObjectRegistry`). For example: @@ -142,7 +151,7 @@ For example: from web_poet import default_registry, consume_modules - # The consume_modules() must be called first if you need to load + # The consume_modules() must be called first if you need to properly import # rules from other packages. Otherwise, it can be omitted. # More info about this caveat on web-poet docs. consume_modules("external_package_A", "another_ext_package.lib") @@ -150,38 +159,20 @@ For example: # To get all of the Override Rules that were declared via annotations. 
SCRAPY_POET_OVERRIDES = default_registry.get_overrides() - # The two lines above could be mixed together via this shortcut: - SCRAPY_POET_OVERRIDES = default_registry.get_overrides( - consume=["external_package_A", "another_ext_package.lib"] - ) +The :py:meth:`web_poet.overrides.PageObjectRegistry.get_overrides` method of the +``default_registry`` above returns ``List[OverrideRule]`` that were declared +using `web-poet`_'s :py:func:`web_poet.handle_urls` annotation. This is much +more convenient than manually defining all of the :py:class:`web_poet.overrides.OverrideRule`. -The ``get_overrides()`` method of the ``default_registry`` above returns -``List[OverrideRule]`` that were declared using `web-poet`_'s ``@handle_urls()`` -annotation. This is much more convenient that manually defining all of the -``OverrideRule``. Take note that since ``SCRAPY_POET_OVERRIDES`` is structured as +Take note that since ``SCRAPY_POET_OVERRIDES`` is structured as ``List[OverrideRule]``, you can easily modify it later on if needed. -.. tip:: - - If you're using External Packages which conform to the **POP** - standards as described in **web-poet's** `Page Object Projects (POP) - `_ section, - then retrieving the rules should be as easy as: - - .. code-block:: python - - import external_package_A, another_ext_package - - SCRAPY_POET_OVERRIDES = ( - external_package_A.REGISTRY.get_overrides() - + another_ext_package.REGISTRY.get_overrides() - ) - .. note:: - For more info and advanced features of `web-poet`_'s ``@handle_urls`` + For more info and advanced features of `web-poet`_'s :py:func:`web_poet.handle_urls` and its registry, kindly read the `web-poet `_ - documentation regarding Overrides. + documentation, specifically its :external:ref:`intro-overrides` tutorial + section. Overrides registry @@ -197,7 +188,8 @@ example. But the registry implementation can be changed at convenience. 
A different registry implementation can be configured using the property ``SCRAPY_POET_OVERRIDES_REGISTRY`` in ``settings.py``. The new registry -must be a subclass of ``scrapy_poet.overrides.OverridesRegistryBase`` -and must implement the method ``overrides_for``. As other Scrapy components, -it can be initialized from the ``from_crawler`` class method if implemented. -This might be handy to be able to access settings, stats, request meta, etc. +must be a subclass of :class:`scrapy_poet.overrides.OverridesRegistryBase` and +must implement the method :meth:`scrapy_poet.overrides.OverridesRegistryBase.overrides_for`. +As other Scrapy components, it can be initialized from the ``from_crawler`` class +method if implemented. This might be handy to be able to access settings, stats, +request meta, etc. diff --git a/scrapy_poet/overrides.py b/scrapy_poet/overrides.py index fa254c52..a5e330d3 100644 --- a/scrapy_poet/overrides.py +++ b/scrapy_poet/overrides.py @@ -31,7 +31,7 @@ class OverridesRegistry(OverridesRegistryBase): """ Overrides registry that reads the overrides from the ``SCRAPY_POET_OVERRIDES`` in the spider settings. It is a list and each rule can be a tuple or an - instance of the class ``OverrideRule``. + instance of the class :py:class:`web_poet.overrides.OverrideRule`. If a tuple is provided: @@ -46,6 +46,10 @@ class OverridesRegistry(OverridesRegistryBase): .. code-block:: python + from url_matcher import Patterns + from scrapy_poet.overrides import OverrideRule + + SCRAPY_POET_OVERRIDES = [ # Option 1 ("books.toscrape.com", ISBNBookPage, BookPage), @@ -58,45 +62,28 @@ class OverridesRegistry(OverridesRegistryBase): ), ] - Now, if you've used ``web-poet``'s built-in functionality to directly create - the override rules in the Page Object via the ``@handle_urls`` annotation, - you can quickly import them via the following code below. 
It finds all the - rules annotated using ``web-poet``'s ``@handle_urls`` decorator that were - registered into ``web_poet.default_registry``. + .. _web-poet: https://web-poet.readthedocs.io + + Now, if you've used web-poet_'s built-in functionality to directly create + the :py:class:`web_poet.overrides.OverrideRule` in the Page Object via the + :py:func:`web_poet.handle_urls` annotation, you can quickly import them via + the following code below. It finds all the rules annotated using web-poet_'s + :py:func:`web_poet.handle_urls` as a decorator that were registered into + ``web_poet.default_registry`` (an instance of + :py:class:`web_poet.overrides.PageObjectRegistry`). .. code-block:: python from web_poet import default_registry, consume_modules - # The consume_modules() must be called first if you need to load - # rules from other packages. Otherwise, it can be omitted. + # The consume_modules() must be called first if you need to properly + # import rules from other packages. Otherwise, it can be omitted. # More info about this caveat on web-poet docs. consume_modules("external_package_A.po", "another_ext_package.lib") SCRAPY_POET_OVERRIDES = default_registry.get_overrides() - # The two lines above could be mixed together via this shortcut: - SCRAPY_POET_OVERRIDES = default_registry.get_overrides( - consume=["external_package_A.po", "another_ext_package.lib"] - ) - - Make sure to call ``consume_module()`` beforehand. More info on this at - `web-poet `_. - - .. tip:: - - If you're using External Packages which conform to the **POP** - standards as described in **web-poet's** `Page Object Projects (POP) - `_ section, - then retrieving the rules should be as easy as: - - .. code-block:: python - - import external_package_A, another_ext_package - - SCRAPY_POET_OVERRIDES = ( - external_package_A.REGISTRY.get_overrides() - + another_ext_package.REGISTRY.get_overrides() - ) + Make sure to call :py:func:`web_poet.overrides.consume_modules` beforehand. 
+ More info on this at web-poet_. """ @classmethod From dd2a302dbcbfd4cd4463bd76438e93b90a9bc2b3 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 16 May 2022 13:44:30 +0800 Subject: [PATCH 17/19] update imports after web_poet refactoring --- tests/po_lib/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/po_lib/__init__.py b/tests/po_lib/__init__.py index 535de873..287bd7ea 100644 --- a/tests/po_lib/__init__.py +++ b/tests/po_lib/__init__.py @@ -21,7 +21,7 @@ def to_item(self): return {"msg": "PO that will be replace"} -@handle_urls(f"{DOMAIN}:{PORT}", POOverriden) +@handle_urls(f"{DOMAIN}:{PORT}", overrides=POOverriden) class POIntegration(ItemWebPage): def to_item(self): return {"msg": "PO replacement"} From 05881057934c126a26a1232f981fd9928ac85f91 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Thu, 19 May 2022 13:12:32 +0800 Subject: [PATCH 18/19] fix return type annotation of get_scrapy_data_path() --- scrapy_poet/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scrapy_poet/utils.py b/scrapy_poet/utils.py index 7564b27c..80a7d715 100644 --- a/scrapy_poet/utils.py +++ b/scrapy_poet/utils.py @@ -1,11 +1,11 @@ import os -from pathlib import PosixPath from scrapy.utils.project import project_data_dir, inside_project -def get_scrapy_data_path(createdir: bool = True, default_dir: str = ".scrapy") -> PosixPath: +def get_scrapy_data_path(createdir: bool = True, default_dir: str = ".scrapy") -> str: """Return a path to a folder where Scrapy is storing data. + Usually that's a .scrapy folder inside the project. 
""" # This code is extracted from scrapy.utils.project.data_path function, From 0bc51b8670d75a88c03edc033d3d7b8b48b0cd20 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Thu, 19 May 2022 13:43:44 +0800 Subject: [PATCH 19/19] add override examples using @handle_urls --- docs/overrides.rst | 12 +++ .../example/spiders/books_04_overrides_03.py | 76 +++++++++++++++++++ 2 files changed, 88 insertions(+) create mode 100644 example/example/spiders/books_04_overrides_03.py diff --git a/docs/overrides.rst b/docs/overrides.rst index b0aabd6d..3ceb3d39 100644 --- a/docs/overrides.rst +++ b/docs/overrides.rst @@ -8,6 +8,18 @@ on the request URL domain. Please have a look to :ref:`intro-tutorial` to learn the basics about overrides before digging deeper in the content of this page. +.. tip:: + + Some real-world examples on this topic can be found in: + + - `Example 1 `_: + rules using tuples + - `Example 2 `_: + rules using tuples and :py:class:`web_poet.overrides.OverrideRule` + - `Example 3 `_: + rules using :py:func:`web_poet.handle_urls` decorator and retrieving them + via :py:meth:`web_poet.overrides.PageObjectRegistry.get_overrides` + Page Objects refinement ======================= diff --git a/example/example/spiders/books_04_overrides_03.py b/example/example/spiders/books_04_overrides_03.py new file mode 100644 index 00000000..f25fff07 --- /dev/null +++ b/example/example/spiders/books_04_overrides_03.py @@ -0,0 +1,76 @@ +""" +Scrapy spider which uses Page Objects both for crawling and extraction, +and uses overrides to support two different sites without changing +the crawling logic (the spider is exactly the same) + +No configured default logic: if used for an unregistered domain, no logic +at all is applied. + +This example is quite similar to books_04_overrides_02.py where the only +difference is that this example is using the ``@handle_urls`` decorator to +store the rules in web-poet's registry. 
+""" +import scrapy +from web_poet import ItemWebPage, WebPage, handle_urls, default_registry +from web_poet.overrides import OverrideRule +from url_matcher import Patterns + +from scrapy_poet import callback_for + + +class BookListPage(WebPage): + + def book_urls(self): + return [] + + +class BookPage(ItemWebPage): + + def to_item(self): + return None + + +@handle_urls("toscrape.com", overrides=BookListPage) +class BTSBookListPage(BookListPage): + """Logic to extract listings from pages like https://books.toscrape.com""" + def book_urls(self): + return self.css('.image_container a::attr(href)').getall() + + +@handle_urls("toscrape.com", overrides=BookPage) +class BTSBookPage(BookPage): + """Logic to extract book info from pages like https://books.toscrape.com/catalogue/soumission_998/index.html""" + def to_item(self): + return { + 'url': self.url, + 'name': self.css("title::text").get(), + } + + +@handle_urls("bookpage.com", overrides=BookListPage) +class BPBookListPage(BookListPage): + """Logic to extract listings from pages like https://bookpage.com/reviews""" + def book_urls(self): + return self.css('article.post h4 a::attr(href)').getall() + + +@handle_urls("bookpage.com", overrides=BookPage) +class BPBookPage(BookPage): + """Logic to extract from pages like https://bookpage.com/reviews/25879-laird-hunt-zorrie-fiction""" + def to_item(self): + return { + 'url': self.url, + 'name': self.css("body div > h1::text").get().strip(), + } + + +class BooksSpider(scrapy.Spider): + name = 'books_04_overrides_03' + start_urls = ['http://books.toscrape.com/', 'https://bookpage.com/reviews'] + # Configuring different page objects pages for different domains + custom_settings = { + "SCRAPY_POET_OVERRIDES": default_registry.get_overrides() + } + + def parse(self, response, page: BookListPage): + yield from response.follow_all(page.book_urls(), callback_for(BookPage))