Skip to content

Commit 4374ad7

Browse files
committed
add override examples using @handle_urls
1 parent 0588105 commit 4374ad7

File tree

2 files changed

+88
-0
lines changed

2 files changed

+88
-0
lines changed

docs/overrides.rst

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,18 @@ on the request URL domain. Please have a look at :ref:`intro-tutorial` to
88
learn the basics about overrides before digging deeper in the content of this
99
page.
1010

.. tip::

    Some real-world examples on this topic can be found in:

    - `Example 1 <https://github.com/scrapinghub/scrapy-poet/blob/master/example/example/spiders/books_04_overrides_01.py>`_:
      rules using tuples
    - `Example 2 <https://github.com/scrapinghub/scrapy-poet/blob/master/example/example/spiders/books_04_overrides_02.py>`_:
      rules using tuples and :py:class:`web_poet.overrides.OverrideRule`
    - `Example 3 <https://github.com/scrapinghub/scrapy-poet/blob/master/example/example/spiders/books_04_overrides_03.py>`_:
      rules using the :py:func:`web_poet.handle_urls` decorator and retrieving them
      via :py:meth:`web_poet.overrides.PageObjectRegistry.get_overrides`
1123
Page Objects refinement
1224
=======================
1325

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
"""
Scrapy spider which uses Page Objects both for crawling and extraction,
and uses overrides to support two different sites without changing
the crawling logic (the spider is exactly the same)

No configured default logic: if used for an unregistered domain, no logic
at all is applied.

This example is quite similar to books_04_overrides_02.py where the only
difference is that this example is using the ``@handle_urls`` decorator to
store the rules in web-poet's registry.
"""
13+
import scrapy
14+
from web_poet import ItemWebPage, WebPage, handle_urls, default_registry
15+
from web_poet.overrides import OverrideRule
16+
from url_matcher import Patterns
17+
18+
from scrapy_poet import callback_for
19+
20+
21+
class BookListPage(WebPage):
    """Generic Page Object for a book listing page.

    Acts as the no-op default; site-specific subclasses registered with
    ``@handle_urls`` supply the real extraction logic.
    """

    def book_urls(self):
        """Return the book URLs found on the page (none by default)."""
        return list()
25+
26+
27+
class BookPage(ItemWebPage):
    """Generic Page Object for a book detail page.

    Extracts nothing by default; site-specific subclasses registered with
    ``@handle_urls`` provide the real logic.
    """

    def to_item(self):
        """Return the extracted item (``None`` by default)."""
        return
31+
32+
33+
@handle_urls("toscrape.com", overrides=BookListPage)
class BTSBookListPage(BookListPage):
    """Logic to extract listings from pages like https://books.toscrape.com"""

    def book_urls(self):
        """Return the detail-page URL of every book on the listing."""
        anchors = self.css('.image_container a::attr(href)')
        return anchors.getall()
38+
39+
40+
@handle_urls("toscrape.com", overrides=BookPage)
class BTSBookPage(BookPage):
    """Logic to extract book info from pages like https://books.toscrape.com/catalogue/soumission_998/index.html"""

    def to_item(self):
        """Return a dict holding the page URL and the book name."""
        item = {'url': self.url}
        item['name'] = self.css("title::text").get()
        return item
48+
49+
50+
# FIX: the decorator previously used ``overrides=BPBookListPage`` — a
# reference to the class being defined, which raises NameError when the
# decorator arguments are evaluated at import time. The rule must override
# the generic BookListPage for the bookpage.com domain (mirroring the
# toscrape.com registration above).
@handle_urls("bookpage.com", overrides=BookListPage)
class BPBookListPage(BookListPage):
    """Logic to extract listings from pages like https://bookpage.com/reviews"""

    def book_urls(self):
        """Return the review-page URL of every book on the listing."""
        return self.css('article.post h4 a::attr(href)').getall()
55+
56+
57+
# FIX: the decorator previously used ``overrides=BPBookPage`` — a reference
# to the class being defined, which raises NameError at import time. The
# rule must override the generic BookPage for the bookpage.com domain.
@handle_urls("bookpage.com", overrides=BookPage)
class BPBookPage(BookPage):
    """Logic to extract from pages like https://bookpage.com/reviews/25879-laird-hunt-zorrie-fiction"""

    def to_item(self):
        """Return a dict with the page URL and the (stripped) book name."""
        name = self.css("body div > h1::text").get()
        return {
            'url': self.url,
            # Guard: .get() returns None when the heading is missing, and
            # calling .strip() on it would raise AttributeError.
            'name': name.strip() if name is not None else None,
        }
65+
66+
67+
class BooksSpider(scrapy.Spider):
    """Spider that crawls two book sites with the exact same crawling logic.

    Per-domain extraction is delegated to Page Objects; the override rules
    are pulled from web-poet's registry, which was populated by the
    ``@handle_urls`` decorators above.
    """

    # FIX: the name was copy-pasted from the books_04_overrides_02 example;
    # this file is the third override example (see the module docstring).
    name = 'books_04_overrides_03'
    start_urls = ['http://books.toscrape.com/', 'https://bookpage.com/reviews']
    # Configuring different page objects pages for different domains
    custom_settings = {
        "SCRAPY_POET_OVERRIDES": default_registry.get_overrides()
    }

    def parse(self, response, page: BookListPage):
        # callback_for(BookPage) builds a callback that yields
        # BookPage.to_item() for each followed response.
        yield from response.follow_all(page.book_urls(), callback_for(BookPage))

0 commit comments

Comments
 (0)