add override examples using @handle_urls

BurnzZ · BurnzZ · commit 4374ad7e7f7e · 2022-05-19T13:39:14.000+08:00
diff --git a/docs/overrides.rst b/docs/overrides.rst
@@ -8,6 +8,18 @@ on the request URL domain. Please have a look to :ref:`intro-tutorial` to
 learn the basics about overrides before digging deeper in the content of this
 page.
 
+.. tip::
+
+    Some real-world examples on this topic can be found in:
+
+    - `Example 1 <https://github.com/scrapinghub/scrapy-poet/blob/master/example/example/spiders/books_04_overrides_01.py>`_:
+      rules using tuples
+    - `Example 2 <https://github.com/scrapinghub/scrapy-poet/blob/master/example/example/spiders/books_04_overrides_02.py>`_:
+      rules using tuples and :py:class:`web_poet.overrides.OverrideRule`
+    - `Example 3 <https://github.com/scrapinghub/scrapy-poet/blob/master/example/example/spiders/books_04_overrides_03.py>`_:
+      rules using :py:func:`web_poet.handle_urls` decorator and retrieving them
+      via :py:meth:`web_poet.overrides.PageObjectRegistry.get_overrides`
+
 Page Objects refinement
 =======================
 
diff --git a/example/example/spiders/books_04_overrides_03.py b/example/example/spiders/books_04_overrides_03.py
@@ -0,0 +1,76 @@
+"""
+Scrapy spider which uses Page Objects both for crawling and extraction,
+and uses overrides to support two different sites without changing
+the crawling logic (the spider is exactly the same)
+
+No configured default logic: if used for an unregistered domain, no logic
+at all is applied.
+
+This example is nearly identical to books_04_overrides_02.py; the only
+difference is that it uses the ``@handle_urls`` decorator to store the
+rules in web-poet's registry.
+"""
+import scrapy
+from web_poet import ItemWebPage, WebPage, handle_urls, default_registry
+from web_poet.overrides import OverrideRule
+from url_matcher import Patterns
+
+from scrapy_poet import callback_for
+
+
+class BookListPage(WebPage):
+    """Base listing page used when no override applies: it reports no book URLs."""
+    def book_urls(self):
+        return []
+
+
+class BookPage(ItemWebPage):
+    """Base book page used when no override applies: ``to_item`` returns ``None``."""
+    def to_item(self):
+        return None
+
+
+@handle_urls("toscrape.com", overrides=BookListPage)  # rule: use this class instead of BookListPage
+class BTSBookListPage(BookListPage):
+    """Logic to extract listings from pages like https://books.toscrape.com"""
+    def book_urls(self):
+        return self.css('.image_container a::attr(href)').getall()  # hrefs of links in .image_container
+
+
+@handle_urls("toscrape.com", overrides=BookPage)  # rule: use this class instead of BookPage
+class BTSBookPage(BookPage):
+    """Logic to extract book info from pages like https://books.toscrape.com/catalogue/soumission_998/index.html"""
+    def to_item(self):
+        return {
+            'url': self.url,
+            'name': self.css("title::text").get(),  # text of the <title> element
+        }
+
+
+@handle_urls("bookpage.com", overrides=BPBookListPage)
+class BPBookListPage(BookListPage):
+    """Logic to extract listings from pages like https://bookpage.com/reviews"""
+    def book_urls(self):
+        return self.css('article.post h4 a::attr(href)').getall()
+
+
+@handle_urls("bookpage.com", overrides=BPBookPage)
+class BPBookPage(BookPage):
+    """Logic to extract from pages like https://bookpage.com/reviews/25879-laird-hunt-zorrie-fiction"""
+    def to_item(self):
+        return {
+            'url': self.url,
+            'name': self.css("body div > h1::text").get().strip(),
+        }
+
+
+class BooksSpider(scrapy.Spider):
+    name = 'books_04_overrides_02'
+    start_urls = ['http://books.toscrape.com/', 'https://bookpage.com/reviews']
+    # Configuring different page objects pages for different domains
+    custom_settings = {
+        "SCRAPY_POET_OVERRIDES": default_registry.get_overrides()
+    }
+
+    def parse(self, response, page: BookListPage):
+        yield from response.follow_all(page.book_urls(), callback_for(BookPage))