diff --git a/.flake8 b/.flake8
new file mode 100644
index 00000000..4caf755f
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,36 @@
+[flake8]
+ignore =
+    # Refers to the max line length. Let's suppress the error and simply
+    # let black take care of how it wants to format the lines.
+    E501,
+
+    # Refers to "line break before binary operator".
+    # Similar to above, let black take care of the formatting.
+    W503,
+
+    # Refers to "unnecessary dict call - rewrite as a literal".
+    C408,
+
+    # To be addressed:
+    D100,  # Missing docstring in public module
+    D101,  # Missing docstring in public class
+    D102,  # Missing docstring in public method
+    D103,  # Missing docstring in public function
+    D104,  # Missing docstring in public package
+    D105,  # Missing docstring in magic method
+    D107,  # Missing docstring in __init__
+    D200,  # One-line docstring should fit on one line with quotes
+    D202,  # No blank lines allowed after function docstring
+    D205,  # 1 blank line required between summary line and description
+    D209,  # Multi-line docstring closing quotes should be on a separate line
+    D400,  # First line should end with a period
+    D401,  # First line should be in imperative mood
+    D402   # First line should not be the function's "signature"
+
+per-file-ignores =
+    # F401: Ignore "imported but unused" errors in __init__ files, as those
+    # imports are there to expose submodule functions so they can be imported
+    # directly from that module
+    # F403: Ignore * imports in these files
+    scrapy_poet/__init__.py:F401,F403
+    scrapy_poet/page_inputs/__init__.py:F401,F403
diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs
new file mode 100644
index 00000000..55e70186
--- /dev/null
+++ b/.git-blame-ignore-revs
@@ -0,0 +1,5 @@
+# Contains commits to be ignored by git blame due to linting-only changes
+
+# https://github.com/scrapinghub/scrapy-poet/pull/68
+58c903617911b3209ad68bfefe3fa1a86be629f4
+7249a133722d1115111a8bbb3b02080a892483f2
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 556f0b50..71102a57 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -57,7 +57,7 @@ jobs:
       fail-fast: false
       matrix:
         python-version: ['3.10']
-        tox-job: ["mypy", "docs"]
+        tox-job: ["mypy", "docs", "linters"]
 
     steps:
     - uses: actions/checkout@v2
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 00000000..ab9089cd
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,22 @@
+repos:
+  - hooks:
+      - id: black
+        language_version: python3
+    repo: https://github.com/ambv/black
+    rev: 22.3.0
+  - hooks:
+      - id: isort
+        language_version: python3
+    repo: https://github.com/PyCQA/isort
+    rev: 5.10.1
+  - hooks:
+      - id: flake8
+        language_version: python3
+        additional_dependencies:
+          - flake8-bugbear
+          - flake8-comprehensions
+          - flake8-debugger
+          - flake8-docstrings
+          - flake8-string-format
+    repo: https://github.com/pycqa/flake8
+    rev: 4.0.1
diff --git a/README.rst b/README.rst
index d305724b..6f76f5b5 100644
--- a/README.rst
+++ b/README.rst
@@ -60,3 +60,21 @@ Add the following inside Scrapy's ``settings.py`` file:
     DOWNLOADER_MIDDLEWARES = {
         "scrapy_poet.InjectionMiddleware": 543,
     }
+
+Developing
+==========
+
+Set up your local Python environment via:
+
+1. ``pip install -r requirements-dev.txt``
+2. ``pre-commit install``
+
+Now every time you perform a ``git commit``, these tools will run against the
+staged files:
+
+* ``black``
+* ``isort``
+* ``flake8``
+
+You can also directly invoke ``pre-commit run --all-files`` or ``tox -e linters``
+to run them without performing a commit.
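For context on two of the checks suppressed in the ``.flake8`` file above: C408 (from flake8-comprehensions) flags ``dict()``/``list()``/``tuple()`` calls that could be written as literals, and W503 flags a line break before a binary operator, which is the wrapping style black produces for long expressions. The snippet below is a small illustration of what these suppressed checks refer to; it is hypothetical and not part of this diff::

    # With C408 suppressed, both spellings are accepted; flake8-comprehensions
    # would otherwise ask for the literal form.
    settings_as_call = dict(ROBOTSTXT_OBEY=True)
    settings_as_literal = {"ROBOTSTXT_OBEY": True}

    # W503 fires on a line break before a binary operator, the style black
    # uses when a long expression has to be wrapped, so it is suppressed too.
    total_keys = (
        len(settings_as_call)
        + len(settings_as_literal)
    )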
diff --git a/docs/conf.py b/docs/conf.py index 84466f9d..b21b3c19 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -12,19 +12,20 @@ # import os import sys -sys.path.insert(0, os.path.abspath('../')) + +sys.path.insert(0, os.path.abspath("../")) # -- Project information ----------------------------------------------------- -project = u'scrapy-poet' -copyright = u'2022, Zyte' -author = u'Zyte' +project = "scrapy-poet" +copyright = "2022, Zyte" +author = "Zyte" # The short X.Y version -version = u'' +version = "" # The full version, including alpha/beta/rc tags -release = u'0.3.0' +release = "0.3.0" # -- General configuration --------------------------------------------------- @@ -37,24 +38,24 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.intersphinx', - 'sphinx.ext.ifconfig', - 'sphinx.ext.viewcode', - 'sphinx.ext.githubpages', + "sphinx.ext.autodoc", + "sphinx.ext.intersphinx", + "sphinx.ext.ifconfig", + "sphinx.ext.viewcode", + "sphinx.ext.githubpages", ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] -source_suffix = '.rst' +source_suffix = ".rst" # The master toctree document. -master_doc = 'index' +master_doc = "index" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. @@ -66,7 +67,7 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns = [u'_build', 'Thumbs.db', '.DS_Store'] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] # The name of the Pygments (syntax highlighting) style to use. pygments_style = None @@ -77,12 +78,13 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'sphinx_rtd_theme' +html_theme = "sphinx_rtd_theme" # Add any paths that contain custom themes here, relative to this directory. # Add path to the RTD explicitly to robustify builds (otherwise might # fail in a clean Debian build env) -import sphinx_rtd_theme +import sphinx_rtd_theme # noqa: E402 + html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] # Theme options are theme-specific and customize the look and feel of a theme @@ -110,7 +112,7 @@ # -- Options for HTMLHelp output --------------------------------------------- # Output file base name for HTML help builder. -htmlhelp_basename = 'scrapy-poetdoc' +htmlhelp_basename = "scrapy-poetdoc" # -- Options for LaTeX output ------------------------------------------------ @@ -119,15 +121,12 @@ # The paper size ('letterpaper' or 'a4paper'). # # 'papersize': 'letterpaper', - # The font size ('10pt', '11pt' or '12pt'). # # 'pointsize': '10pt', - # Additional stuff for the LaTeX preamble. # # 'preamble': '', - # Latex figure (float) alignment # # 'figure_align': 'htbp', @@ -137,8 +136,13 @@ # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, 'scrapy-poet.tex', u'scrapy-poet Documentation', - u'Scrapinghub', 'manual'), + ( + master_doc, + "scrapy-poet.tex", + "scrapy-poet Documentation", + "Scrapinghub", + "manual", + ), ] @@ -146,10 +150,7 @@ # One entry per manual page. 
List of tuples # (source start file, name, description, authors, manual section). -man_pages = [ - (master_doc, 'scrapy-poet', u'scrapy-poet Documentation', - [author], 1) -] +man_pages = [(master_doc, "scrapy-poet", "scrapy-poet Documentation", [author], 1)] # -- Options for Texinfo output ---------------------------------------------- @@ -158,9 +159,15 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'scrapy-poet', u'scrapy-poet Documentation', - author, 'scrapy-poet', 'One line description of project.', - 'Miscellaneous'), + ( + master_doc, + "scrapy-poet", + "scrapy-poet Documentation", + author, + "scrapy-poet", + "One line description of project.", + "Miscellaneous", + ), ] @@ -179,21 +186,27 @@ # epub_uid = '' # A list of files that should not be packed into the epub file. -epub_exclude_files = ['search.html'] +epub_exclude_files = ["search.html"] # -- Extension configuration ------------------------------------------------- # -- Options for intersphinx extension --------------------------------------- intersphinx_mapping = { - 'python': ('https://docs.python.org/3', None, ), - 'scrapy': ('https://docs.scrapy.org/en/latest', None, ), - 'web-poet': ('https://web-poet.readthedocs.io/en/latest/', None), - 'url-matcher': ('https://url-matcher.readthedocs.io/en/stable/', None), + "python": ( + "https://docs.python.org/3", + None, + ), + "scrapy": ( + "https://docs.scrapy.org/en/latest", + None, + ), + "web-poet": ("https://web-poet.readthedocs.io/en/latest/", None), + "url-matcher": ("https://url-matcher.readthedocs.io/en/stable/", None), } autodoc_default_options = { - 'special-members': '__init__,__call__', + "special-members": "__init__,__call__", # 'undoc-members': True, - 'exclude-members': '__weakref__' + "exclude-members": "__weakref__", } diff --git a/docs/index.rst b/docs/index.rst index 1271bbe5..59450a82 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -25,9 +25,9 @@ To get started, see :ref:`intro-install` and :ref:`intro-tutorial`. :ref:`license` is BSD 3-clause. -.. _`AutoExtract`: https://scrapinghub.com/autoextract -.. _`Splash`: https://scrapinghub.com/splash -.. _`web-poet`: https://github.com/scrapinghub/web-poet +.. _AutoExtract: https://scrapinghub.com/autoextract +.. _Splash: https://scrapinghub.com/splash +.. _web-poet: https://github.com/scrapinghub/web-poet .. _docs: https://web-poet.readthedocs.io/en/stable/ .. toctree:: diff --git a/docs/intro/advanced-tutorial.rst b/docs/intro/advanced-tutorial.rst index a1fd2fa5..a2cffbf5 100644 --- a/docs/intro/advanced-tutorial.rst +++ b/docs/intro/advanced-tutorial.rst @@ -1,4 +1,4 @@ -.. _`intro-advanced-tutorial`: +.. _intro-advanced-tutorial: ================= Advanced Tutorial @@ -15,7 +15,7 @@ These are mainly achieved by **scrapy-poet** implementing **providers** for them * :class:`scrapy_poet.page_input_providers.HttpClientProvider` * :class:`scrapy_poet.page_input_providers.PageParamsProvider` -.. _`intro-additional-requests`: +.. _intro-additional-requests: Additional Requests =================== diff --git a/docs/intro/basic-tutorial.rst b/docs/intro/basic-tutorial.rst index 6a37c548..342cf385 100644 --- a/docs/intro/basic-tutorial.rst +++ b/docs/intro/basic-tutorial.rst @@ -1,4 +1,4 @@ -.. _`intro-basic-tutorial`: +.. 
_intro-basic-tutorial: ============== Basic Tutorial diff --git a/docs/intro/install.rst b/docs/intro/install.rst index f3d6187e..9c6f5e7e 100644 --- a/docs/intro/install.rst +++ b/docs/intro/install.rst @@ -1,4 +1,4 @@ -.. _`intro-install`: +.. _intro-install: ============ Installation diff --git a/docs/license.rst b/docs/license.rst index e6a41ca8..e647e180 100644 --- a/docs/license.rst +++ b/docs/license.rst @@ -1,4 +1,4 @@ -.. _`license`: +.. _license: ======= License diff --git a/docs/overrides.rst b/docs/overrides.rst index 3ceb3d39..e278693d 100644 --- a/docs/overrides.rst +++ b/docs/overrides.rst @@ -1,4 +1,4 @@ -.. _`overrides`: +.. _overrides: ========= Overrides diff --git a/docs/providers.rst b/docs/providers.rst index 939807f6..4b5918e9 100644 --- a/docs/providers.rst +++ b/docs/providers.rst @@ -1,4 +1,4 @@ -.. _`providers`: +.. _providers: ========= Providers diff --git a/docs/settings.rst b/docs/settings.rst index 2dbdec30..3b9c7ddd 100644 --- a/docs/settings.rst +++ b/docs/settings.rst @@ -1,4 +1,4 @@ -.. _`settings`: +.. _settings: Settings ======== diff --git a/example/example/autoextract.py b/example/example/autoextract.py index 7b6c0503..49cff130 100644 --- a/example/example/autoextract.py +++ b/example/example/autoextract.py @@ -2,20 +2,21 @@ Example of how to create a PageObject with a very different input data, which even requires an API request. """ -from typing import Dict, Any +from typing import Any, Dict import attr +from scrapy import Request from twisted.internet.defer import inlineCallbacks from twisted.internet.threads import deferToThread +from web_poet import ItemPage -from scrapy import Request from scrapy_poet.page_input_providers import PageObjectInputProvider -from web_poet import ItemPage @attr.s(auto_attribs=True) class AutoextractProductResponse: - """ Input data """ + """Input data""" + data: Dict[str, Any] @@ -24,7 +25,7 @@ class AutoextractProductProvider(PageObjectInputProvider): @inlineCallbacks def __call__(self, to_provide, request: Request): - data = (yield get_autoextract_product(request.url)) + data = yield get_autoextract_product(request.url) return [AutoextractProductResponse(data=data)] @@ -33,19 +34,21 @@ def get_autoextract_product(url): # fixme: use async # fixme: rate limits? 
from autoextract.sync import request_batch - resp = yield deferToThread(request_batch, urls=[url], page_type='product') + + resp = yield deferToThread(request_batch, urls=[url], page_type="product") return resp[0] @attr.s(auto_attribs=True) class ProductPage(ItemPage): - """ Generic product page """ + """Generic product page""" + autoextract_resp: AutoextractProductResponse @property def url(self): - return self.autoextract_resp.data['product']['url'] + return self.autoextract_resp.data["product"]["url"] def to_item(self): - product = self.autoextract_resp.data['product'] + product = self.autoextract_resp.data["product"] return product diff --git a/example/example/settings.py b/example/example/settings.py index 4e319585..863cb471 100644 --- a/example/example/settings.py +++ b/example/example/settings.py @@ -8,10 +8,10 @@ # https://docs.scrapy.org/en/latest/topics/spider-middleware.html from example.autoextract import AutoextractProductProvider -BOT_NAME = 'example' +BOT_NAME = "example" -SPIDER_MODULES = ['example.spiders'] -NEWSPIDER_MODULE = 'example.spiders' +SPIDER_MODULES = ["example.spiders"] +NEWSPIDER_MODULE = "example.spiders" SCRAPY_POET_PROVIDERS = {AutoextractProductProvider: 500} @@ -19,6 +19,5 @@ ROBOTSTXT_OBEY = True DOWNLOADER_MIDDLEWARES = { - 'scrapy_poet.InjectionMiddleware': 543, + "scrapy_poet.InjectionMiddleware": 543, } - diff --git a/example/example/spiders/books_01.py b/example/example/spiders/books_01.py index c9d68804..a7d65039 100644 --- a/example/example/spiders/books_01.py +++ b/example/example/spiders/books_01.py @@ -5,15 +5,15 @@ class BooksSpider(scrapy.Spider): - name = 'books_01' - start_urls = ['http://books.toscrape.com/'] + name = "books_01" + start_urls = ["http://books.toscrape.com/"] def parse(self, response): - for url in response.css('.image_container a::attr(href)').getall(): + for url in response.css(".image_container a::attr(href)").getall(): yield response.follow(url, self.parse_book) def parse_book(self, response): yield { - 'url': response.url, - 'name': response.css("title::text").get(), + "url": response.url, + "name": response.css("title::text").get(), } diff --git a/example/example/spiders/books_02.py b/example/example/spiders/books_02.py index fa5b89d9..a1f52c34 100644 --- a/example/example/spiders/books_02.py +++ b/example/example/spiders/books_02.py @@ -9,17 +9,17 @@ class BookPage(ItemWebPage): def to_item(self): return { - 'url': self.url, - 'name': self.css("title::text").get(), + "url": self.url, + "name": self.css("title::text").get(), } class BooksSpider(scrapy.Spider): - name = 'books_02' - start_urls = ['http://books.toscrape.com/'] + name = "books_02" + start_urls = ["http://books.toscrape.com/"] def parse(self, response): - for url in response.css('.image_container a::attr(href)').getall(): + for url in response.css(".image_container a::attr(href)").getall(): yield response.follow(url, self.parse_book) def parse_book(self, response, book_page: BookPage): diff --git a/example/example/spiders/books_02_1.py b/example/example/spiders/books_02_1.py index 33d3cb59..42eac332 100644 --- a/example/example/spiders/books_02_1.py +++ b/example/example/spiders/books_02_1.py @@ -5,22 +5,23 @@ """ import scrapy from web_poet import ItemWebPage + from scrapy_poet import callback_for class BookPage(ItemWebPage): def to_item(self): return { - 'url': self.url, - 'name': self.css("title::text").get(), + "url": self.url, + "name": self.css("title::text").get(), } class BooksSpider(scrapy.Spider): - name = 'books_02_1' - start_urls = 
['http://books.toscrape.com/'] + name = "books_02_1" + start_urls = ["http://books.toscrape.com/"] parse_book = callback_for(BookPage) def parse(self, response): - for url in response.css('.image_container a::attr(href)').getall(): + for url in response.css(".image_container a::attr(href)").getall(): yield response.follow(url, self.parse_book) diff --git a/example/example/spiders/books_02_2.py b/example/example/spiders/books_02_2.py index e720e12a..b9eccc41 100644 --- a/example/example/spiders/books_02_2.py +++ b/example/example/spiders/books_02_2.py @@ -12,21 +12,22 @@ """ import scrapy from web_poet import ItemWebPage + from scrapy_poet import callback_for class BookPage(ItemWebPage): def to_item(self): return { - 'url': self.url, - 'name': self.css("title::text").get(), + "url": self.url, + "name": self.css("title::text").get(), } class BooksSpider(scrapy.Spider): - name = 'books_02_2' - start_urls = ['http://books.toscrape.com/'] + name = "books_02_2" + start_urls = ["http://books.toscrape.com/"] def parse(self, response): - for url in response.css('.image_container a::attr(href)').getall(): + for url in response.css(".image_container a::attr(href)").getall(): yield response.follow(url, callback_for(BookPage)) diff --git a/example/example/spiders/books_02_3.py b/example/example/spiders/books_02_3.py index eb952078..14cf53a3 100644 --- a/example/example/spiders/books_02_3.py +++ b/example/example/spiders/books_02_3.py @@ -14,15 +14,15 @@ class BookPage(ItemWebPage): def to_item(self): return { - 'url': self.url, - 'name': self.css("title::text").get(), + "url": self.url, + "name": self.css("title::text").get(), } class BooksSpider(scrapy.Spider): - name = 'books_02_3' - start_urls = ['http://books.toscrape.com/'] + name = "books_02_3" + start_urls = ["http://books.toscrape.com/"] def parse(self, response): - for url in response.css('.image_container a::attr(href)').getall(): + for url in response.css(".image_container a::attr(href)").getall(): yield response.follow(url, BookPage) diff --git a/example/example/spiders/books_03.py b/example/example/spiders/books_03.py index 503ed8cd..61efb4f7 100644 --- a/example/example/spiders/books_03.py +++ b/example/example/spiders/books_03.py @@ -2,15 +2,15 @@ Scrapy spider which uses AutoExtract API, to extract books as products. 
""" import scrapy -from scrapy_poet import callback_for - from example.autoextract import ProductPage +from scrapy_poet import callback_for + class BooksSpider(scrapy.Spider): - name = 'books_03' - start_urls = ['http://books.toscrape.com/'] + name = "books_03" + start_urls = ["http://books.toscrape.com/"] def parse(self, response): - for url in response.css('.image_container a::attr(href)').getall(): + for url in response.css(".image_container a::attr(href)").getall(): yield response.follow(url, callback_for(ProductPage)) diff --git a/example/example/spiders/books_04.py b/example/example/spiders/books_04.py index da118089..2b94dbd7 100644 --- a/example/example/spiders/books_04.py +++ b/example/example/spiders/books_04.py @@ -3,25 +3,26 @@ """ import scrapy from web_poet import ItemWebPage, WebPage + from scrapy_poet import callback_for class BookListPage(WebPage): def book_urls(self): - return self.css('.image_container a::attr(href)').getall() + return self.css(".image_container a::attr(href)").getall() class BookPage(ItemWebPage): def to_item(self): return { - 'url': self.url, - 'name': self.css("title::text").get(), + "url": self.url, + "name": self.css("title::text").get(), } class BooksSpider(scrapy.Spider): - name = 'books_04' - start_urls = ['http://books.toscrape.com/'] + name = "books_04" + start_urls = ["http://books.toscrape.com/"] def parse(self, response, page: BookListPage): for url in page.book_urls(): diff --git a/example/example/spiders/books_04_overrides_01.py b/example/example/spiders/books_04_overrides_01.py index ab266c08..268c6e6d 100644 --- a/example/example/spiders/books_04_overrides_01.py +++ b/example/example/spiders/books_04_overrides_01.py @@ -7,47 +7,52 @@ """ import scrapy from web_poet import ItemWebPage, WebPage + from scrapy_poet import callback_for class BookListPage(WebPage): """Logic to extract listings from pages like https://books.toscrape.com""" + def book_urls(self): - return self.css('.image_container a::attr(href)').getall() + return self.css(".image_container a::attr(href)").getall() class BookPage(ItemWebPage): """Logic to extract book info from pages like https://books.toscrape.com/catalogue/soumission_998/index.html""" + def to_item(self): return { - 'url': self.url, - 'name': self.css("title::text").get(), + "url": self.url, + "name": self.css("title::text").get(), } class BPBookListPage(WebPage): """Logic to extract listings from pages like https://bookpage.com/reviews""" + def book_urls(self): - return self.css('article.post h4 a::attr(href)').getall() + return self.css("article.post h4 a::attr(href)").getall() class BPBookPage(ItemWebPage): """Logic to extract from pages like https://bookpage.com/reviews/25879-laird-hunt-zorrie-fiction""" + def to_item(self): return { - 'url': self.url, - 'name': self.css("body div > h1::text").get().strip(), + "url": self.url, + "name": self.css("body div > h1::text").get().strip(), } class BooksSpider(scrapy.Spider): - name = 'books_04_overrides_01' - start_urls = ['http://books.toscrape.com/', 'https://bookpage.com/reviews'] + name = "books_04_overrides_01" + start_urls = ["http://books.toscrape.com/", "https://bookpage.com/reviews"] # Configuring different page objects pages from the bookpage.com domain custom_settings = { "SCRAPY_POET_OVERRIDES": [ ("bookpage.com", BPBookListPage, BookListPage), - ("bookpage.com", BPBookPage, BookPage) + ("bookpage.com", BPBookPage, BookPage), ] } diff --git a/example/example/spiders/books_04_overrides_02.py b/example/example/spiders/books_04_overrides_02.py index 
b4c366a7..f707c2b2 100644 --- a/example/example/spiders/books_04_overrides_02.py +++ b/example/example/spiders/books_04_overrides_02.py @@ -7,67 +7,76 @@ at all is applied. """ import scrapy +from url_matcher import Patterns from web_poet import ItemWebPage, WebPage from web_poet.overrides import OverrideRule -from url_matcher import Patterns from scrapy_poet import callback_for class BookListPage(WebPage): - def book_urls(self): return [] class BookPage(ItemWebPage): - def to_item(self): return None class BTSBookListPage(BookListPage): """Logic to extract listings from pages like https://books.toscrape.com""" + def book_urls(self): - return self.css('.image_container a::attr(href)').getall() + return self.css(".image_container a::attr(href)").getall() class BTSBookPage(BookPage): """Logic to extract book info from pages like https://books.toscrape.com/catalogue/soumission_998/index.html""" + def to_item(self): return { - 'url': self.url, - 'name': self.css("title::text").get(), + "url": self.url, + "name": self.css("title::text").get(), } class BPBookListPage(BookListPage): """Logic to extract listings from pages like https://bookpage.com/reviews""" + def book_urls(self): - return self.css('article.post h4 a::attr(href)').getall() + return self.css("article.post h4 a::attr(href)").getall() class BPBookPage(BookPage): """Logic to extract from pages like https://bookpage.com/reviews/25879-laird-hunt-zorrie-fiction""" + def to_item(self): return { - 'url': self.url, - 'name': self.css("body div > h1::text").get().strip(), + "url": self.url, + "name": self.css("body div > h1::text").get().strip(), } class BooksSpider(scrapy.Spider): - name = 'books_04_overrides_02' - start_urls = ['http://books.toscrape.com/', 'https://bookpage.com/reviews'] + name = "books_04_overrides_02" + start_urls = ["http://books.toscrape.com/", "https://bookpage.com/reviews"] # Configuring different page objects pages for different domains custom_settings = { "SCRAPY_POET_OVERRIDES": [ ("toscrape.com", BTSBookListPage, BookListPage), ("toscrape.com", BTSBookPage, BookPage), - # We could also use the long-form version if we want to. - OverrideRule(for_patterns=Patterns(["bookpage.com"]), use=BPBookListPage, instead_of=BookListPage), - OverrideRule(for_patterns=Patterns(["bookpage.com"]), use=BPBookPage, instead_of=BookPage), + OverrideRule( + for_patterns=Patterns(["bookpage.com"]), + use=BPBookListPage, + instead_of=BookListPage, + ), + OverrideRule( + for_patterns=Patterns(["bookpage.com"]), + use=BPBookPage, + instead_of=BookPage, + ), ] } diff --git a/example/example/spiders/books_04_overrides_03.py b/example/example/spiders/books_04_overrides_03.py index f25fff07..525c75e6 100644 --- a/example/example/spiders/books_04_overrides_03.py +++ b/example/example/spiders/books_04_overrides_03.py @@ -11,21 +11,17 @@ store the rules in web-poet's registry. 
""" import scrapy -from web_poet import ItemWebPage, WebPage, handle_urls, default_registry -from web_poet.overrides import OverrideRule -from url_matcher import Patterns +from web_poet import ItemWebPage, WebPage, default_registry, handle_urls from scrapy_poet import callback_for class BookListPage(WebPage): - def book_urls(self): return [] class BookPage(ItemWebPage): - def to_item(self): return None @@ -33,44 +29,46 @@ def to_item(self): @handle_urls("toscrape.com", overrides=BookListPage) class BTSBookListPage(BookListPage): """Logic to extract listings from pages like https://books.toscrape.com""" + def book_urls(self): - return self.css('.image_container a::attr(href)').getall() + return self.css(".image_container a::attr(href)").getall() @handle_urls("toscrape.com", overrides=BookPage) class BTSBookPage(BookPage): """Logic to extract book info from pages like https://books.toscrape.com/catalogue/soumission_998/index.html""" + def to_item(self): return { - 'url': self.url, - 'name': self.css("title::text").get(), + "url": self.url, + "name": self.css("title::text").get(), } @handle_urls("bookpage.com", overrides=BookListPage) class BPBookListPage(BookListPage): """Logic to extract listings from pages like https://bookpage.com/reviews""" + def book_urls(self): - return self.css('article.post h4 a::attr(href)').getall() + return self.css("article.post h4 a::attr(href)").getall() @handle_urls("bookpage.com", overrides=BookPage) class BPBookPage(BookPage): """Logic to extract from pages like https://bookpage.com/reviews/25879-laird-hunt-zorrie-fiction""" + def to_item(self): return { - 'url': self.url, - 'name': self.css("body div > h1::text").get().strip(), + "url": self.url, + "name": self.css("body div > h1::text").get().strip(), } class BooksSpider(scrapy.Spider): - name = 'books_04_overrides_03' - start_urls = ['http://books.toscrape.com/', 'https://bookpage.com/reviews'] + name = "books_04_overrides_03" + start_urls = ["http://books.toscrape.com/", "https://bookpage.com/reviews"] # Configuring different page objects pages for different domains - custom_settings = { - "SCRAPY_POET_OVERRIDES": default_registry.get_overrides() - } + custom_settings = {"SCRAPY_POET_OVERRIDES": default_registry.get_overrides()} def parse(self, response, page: BookListPage): yield from response.follow_all(page.book_urls(), callback_for(BookPage)) diff --git a/example/example/spiders/books_05.py b/example/example/spiders/books_05.py index cd3c5440..c1aa5de3 100644 --- a/example/example/spiders/books_05.py +++ b/example/example/spiders/books_05.py @@ -3,14 +3,13 @@ You can mix various page types freely. 
""" import scrapy - -from web_poet import WebPage from example.autoextract import ProductPage +from web_poet import WebPage class BookListPage(WebPage): def product_urls(self): - return self.css('.image_container a::attr(href)').getall() + return self.css(".image_container a::attr(href)").getall() class BookPage(ProductPage): @@ -18,14 +17,14 @@ def to_item(self): # post-processing example: return only 2 fields book = super().to_item() return { - 'url': book['url'], - 'name': book['name'], + "url": book["url"], + "name": book["name"], } class BooksSpider(scrapy.Spider): - name = 'books_05' - start_urls = ['http://books.toscrape.com/'] + name = "books_05" + start_urls = ["http://books.toscrape.com/"] def parse(self, response, page: BookListPage): for url in page.product_urls(): @@ -34,5 +33,5 @@ def parse(self, response, page: BookListPage): def parse_book(self, response, page: BookPage): # you can also post-process data in a spider book = page.to_item() - book['title'] = book.pop('name') + book["title"] = book.pop("name") yield book diff --git a/example/example/spiders/books_05_1.py b/example/example/spiders/books_05_1.py index d7ac6b22..ef53beeb 100644 --- a/example/example/spiders/books_05_1.py +++ b/example/example/spiders/books_05_1.py @@ -12,15 +12,15 @@ """ import scrapy - +from example.autoextract import ProductPage from web_poet import WebPage + from scrapy_poet import DummyResponse -from example.autoextract import ProductPage class BookListPage(WebPage): def product_urls(self): - return self.css('.image_container a::attr(href)').getall() + return self.css(".image_container a::attr(href)").getall() class BookPage(ProductPage): @@ -28,14 +28,14 @@ def to_item(self): # post-processing example: return only 2 fields book = super().to_item() return { - 'url': book['url'], - 'name': book['name'], + "url": book["url"], + "name": book["name"], } class BooksSpider(scrapy.Spider): - name = 'books_05_1' - start_urls = ['http://books.toscrape.com/'] + name = "books_05_1" + start_urls = ["http://books.toscrape.com/"] def parse(self, response, page: BookListPage): for url in page.product_urls(): @@ -45,5 +45,5 @@ def parse(self, response, page: BookListPage): def parse_book(self, response: DummyResponse, page: BookPage): # you can also post-process data in a spider book = page.to_item() - book['title'] = book.pop('name') + book["title"] = book.pop("name") yield book diff --git a/example/example/spiders/books_06.py b/example/example/spiders/books_06.py index 27e2eb44..4ab91897 100644 --- a/example/example/spiders/books_06.py +++ b/example/example/spiders/books_06.py @@ -10,25 +10,24 @@ Scrapy > 2.0 required """ -import scrapy import attr - -from web_poet import WebPage, ItemWebPage, Injectable +import scrapy +from web_poet import Injectable, ItemWebPage, WebPage class ListingsExtractor(WebPage): def urls(self): - return self.css('.image_container a::attr(href)').getall() + return self.css(".image_container a::attr(href)").getall() class PaginationExtractor(WebPage): def urls(self): - return self.css('.pager a::attr(href)').getall() + return self.css(".pager a::attr(href)").getall() class BreadcrumbsExtractor(WebPage): def urls(self): - return self.css('.breadcrumb a::attr(href)').getall() + return self.css(".breadcrumb a::attr(href)").getall() @attr.s(auto_attribs=True) @@ -42,21 +41,21 @@ class BookPage(ItemWebPage): breadcrumbs: BreadcrumbsExtractor def recently_viewed_urls(self): - return self.css('.image_container a::attr(href)').getall() + return self.css(".image_container 
a::attr(href)").getall() def to_item(self): return { - 'url': self.url, - 'name': self.css("title::text").get(), + "url": self.url, + "name": self.css("title::text").get(), } class BooksSpider(scrapy.Spider): - name = 'books_06' - start_urls = ['http://books.toscrape.com/'] + name = "books_06" + start_urls = ["http://books.toscrape.com/"] def parse(self, response, page: ListingsPage): - """ Callback for Listings pages """ + """Callback for Listings pages""" yield from response.follow_all(page.book_list.urls(), self.parse_book) yield from response.follow_all(page.pagination.urls(), self.parse, priority=+10) @@ -64,4 +63,3 @@ def parse_book(self, response, page: BookPage): yield from response.follow_all(page.recently_viewed_urls(), self.parse_book) yield from response.follow_all(page.breadcrumbs.urls(), self.parse) yield page.to_item() - diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..cec60096 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,6 @@ +[tool.black] +line-length = 88 + +[tool.isort] +profile = "black" +multi_line_output = 3 diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 00000000..51f1982a --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,2 @@ +pre-commit +pytest diff --git a/scrapy_poet/__init__.py b/scrapy_poet/__init__.py index 973ef9d4..27398f5a 100644 --- a/scrapy_poet/__init__.py +++ b/scrapy_poet/__init__.py @@ -1,7 +1,7 @@ +from .api import DummyResponse, callback_for from .middleware import InjectionMiddleware -from .api import callback_for, DummyResponse from .page_input_providers import ( - PageObjectInputProvider, CacheDataProviderMixin, HttpResponseProvider, + PageObjectInputProvider, ) diff --git a/scrapy_poet/api.py b/scrapy_poet/api.py index d09259b5..3b8d5000 100644 --- a/scrapy_poet/api.py +++ b/scrapy_poet/api.py @@ -1,12 +1,10 @@ -from typing import Callable, Optional, Type from inspect import iscoroutinefunction +from typing import Callable, Optional, Type from scrapy.http import Request, Response - from web_poet.pages import ItemPage - -_CALLBACK_FOR_MARKER = '__scrapy_poet_callback' +_CALLBACK_FOR_MARKER = "__scrapy_poet_callback" class DummyResponse(Response): @@ -107,12 +105,12 @@ def parse(self, response): your request object. """ if not issubclass(page_cls, ItemPage): - raise TypeError( - f'{page_cls.__name__} should be a subclass of ItemPage.') + raise TypeError(f"{page_cls.__name__} should be a subclass of ItemPage.") - if getattr(page_cls.to_item, '__isabstractmethod__', False): + if getattr(page_cls.to_item, "__isabstractmethod__", False): raise NotImplementedError( - f'{page_cls.__name__} should implement to_item method.') + f"{page_cls.__name__} should implement to_item method." + ) # When the callback is used as an instance method of the spider, it expects # to receive 'self' as its first argument. 
When used as a simple inline diff --git a/scrapy_poet/cache.py b/scrapy_poet/cache.py index d1a9ef47..dcae642b 100644 --- a/scrapy_poet/cache.py +++ b/scrapy_poet/cache.py @@ -28,7 +28,7 @@ class SqlitedictCache(_Cache): def __init__(self, path: str, *, compressed=True): self.path = path self.compressed = compressed - tablename = 'responses_gzip' if compressed else 'responses' + tablename = "responses_gzip" if compressed else "responses" self.db = sqlitedict.SqliteDict( path, tablename=tablename, @@ -54,14 +54,14 @@ def decode(self, obj: Any) -> Any: return pickle.loads(data) def __str__(self) -> str: - return ( #pragma: no cover + return ( # pragma: no cover f"SqlitedictCache <{self.db.filename} | " f"compressed: {self.compressed} | " f"{len(self.db)} records>" ) def __repr__(self) -> str: - return f"SqlitedictCache({self.path!r}, compressed={self.compressed})" #pragma: no cover + return f"SqlitedictCache({self.path!r}, compressed={self.compressed})" # pragma: no cover def __getitem__(self, fingerprint: str) -> Any: return self.db[fingerprint] diff --git a/scrapy_poet/downloader.py b/scrapy_poet/downloader.py index b6ae534f..a20c60f2 100644 --- a/scrapy_poet/downloader.py +++ b/scrapy_poet/downloader.py @@ -3,10 +3,7 @@ import scrapy from scrapy.utils.defer import maybe_deferred_to_future from web_poet import HttpRequest -from web_poet.exceptions import ( - HttpError, - HttpRequestError, -) +from web_poet.exceptions import HttpError, HttpRequestError from scrapy_poet.utils import ( http_request_to_scrapy_request, diff --git a/scrapy_poet/injection.py b/scrapy_poet/injection.py index 034ee23a..d44d445e 100644 --- a/scrapy_poet/injection.py +++ b/scrapy_poet/injection.py @@ -2,10 +2,7 @@ import logging import os import pprint -from typing import Dict, Callable, Any, List, Set, Mapping, Optional - -from .utils import get_scrapy_data_path -from twisted.internet.defer import inlineCallbacks +from typing import Any, Callable, Dict, List, Mapping, Optional, Set import andi from scrapy import Request, Spider @@ -15,18 +12,21 @@ from scrapy.statscollectors import StatsCollector from scrapy.utils.conf import build_component_list from scrapy.utils.defer import maybeDeferred_coro -from scrapy.utils.misc import load_object, create_instance +from scrapy.utils.misc import create_instance, load_object +from twisted.internet.defer import inlineCallbacks +from web_poet.pages import is_injectable +from scrapy_poet.api import _CALLBACK_FOR_MARKER, DummyResponse from scrapy_poet.cache import SqlitedictCache -from scrapy_poet.injection_errors import (UndeclaredProvidedTypeError, - NonCallableProviderError, - InjectionError) -from scrapy_poet.overrides import OverridesRegistryBase, \ - OverridesRegistry +from scrapy_poet.injection_errors import ( + InjectionError, + NonCallableProviderError, + UndeclaredProvidedTypeError, +) +from scrapy_poet.overrides import OverridesRegistry, OverridesRegistryBase from scrapy_poet.page_input_providers import PageObjectInputProvider -from scrapy_poet.api import _CALLBACK_FOR_MARKER, DummyResponse -from web_poet.pages import is_injectable +from .utils import get_scrapy_data_path logger = logging.getLogger(__name__) @@ -36,26 +36,28 @@ class Injector: Keep all the logic required to do dependency injection in Scrapy callbacks. Initializes the providers from the spider settings at initialization. 
""" - def __init__(self, - crawler: Crawler, - *, - default_providers: Optional[Mapping] = None, - overrides_registry: Optional[OverridesRegistryBase] = None): + + def __init__( + self, + crawler: Crawler, + *, + default_providers: Optional[Mapping] = None, + overrides_registry: Optional[OverridesRegistryBase] = None, + ): self.crawler = crawler self.spider = crawler.spider self.overrides_registry = overrides_registry or OverridesRegistry() self.load_providers(default_providers) self.init_cache() - def load_providers(self, default_providers: Optional[Mapping] = None): - providers_dict = {**(default_providers or {}), - **self.spider.settings.getdict("SCRAPY_POET_PROVIDERS")} + def load_providers(self, default_providers: Optional[Mapping] = None): # noqa: D102 + providers_dict = { + **(default_providers or {}), + **self.spider.settings.getdict("SCRAPY_POET_PROVIDERS"), + } provider_classes = build_component_list(providers_dict) logger.info(f"Loading providers:\n {pprint.pformat(provider_classes)}") - self.providers = [ - load_object(cls)(self.crawler) - for cls in provider_classes - ] + self.providers = [load_object(cls)(self.crawler) for cls in provider_classes] check_all_providers_are_callable(self.providers) # Caching whether each provider requires the scrapy response self.is_provider_requiring_scrapy_response = { @@ -63,27 +65,34 @@ def load_providers(self, default_providers: Optional[Mapping] = None): for provider in self.providers } # Caching the function for faster execution - self.is_class_provided_by_any_provider = \ - is_class_provided_by_any_provider_fn(self.providers) + self.is_class_provided_by_any_provider = is_class_provided_by_any_provider_fn( + self.providers + ) - def close(self) -> None: + def close(self) -> None: # noqa: D102 if self.cache: self.cache.close() - def init_cache(self): + def init_cache(self): # noqa: D102 self.cache = None - cache_filename = self.spider.settings.get('SCRAPY_POET_CACHE') + cache_filename = self.spider.settings.get("SCRAPY_POET_CACHE") if cache_filename and isinstance(cache_filename, bool): - cache_filename = os.path.join(get_scrapy_data_path(createdir=True), "scrapy-poet-cache.sqlite3") + cache_filename = os.path.join( + get_scrapy_data_path(createdir=True), "scrapy-poet-cache.sqlite3" + ) if cache_filename: - compressed = self.spider.settings.getbool('SCRAPY_POET_CACHE_GZIP', True) - self.caching_errors = self.spider.settings.getbool('SCRAPY_POET_CACHE_ERRORS', False) + compressed = self.spider.settings.getbool("SCRAPY_POET_CACHE_GZIP", True) + self.caching_errors = self.spider.settings.getbool( + "SCRAPY_POET_CACHE_ERRORS", False + ) self.cache = SqlitedictCache(cache_filename, compressed=compressed) - logger.info(f"Cache enabled. File: '{cache_filename}'. Compressed: {compressed}. Caching errors: {self.caching_errors}") + logger.info( + f"Cache enabled. File: '{cache_filename}'. Compressed: {compressed}. 
Caching errors: {self.caching_errors}" + ) - def available_dependencies_for_providers(self, - request: Request, - response: Response): + def available_dependencies_for_providers( + self, request: Request, response: Response + ): # noqa: D102 deps = { Crawler: self.crawler, Spider: self.spider, @@ -95,8 +104,9 @@ def available_dependencies_for_providers(self, assert deps.keys() == SCRAPY_PROVIDED_CLASSES return deps - def discover_callback_providers(self, request: Request - ) -> Set[PageObjectInputProvider]: + def discover_callback_providers( + self, request: Request + ) -> Set[PageObjectInputProvider]: """Discover the providers that are required to fulfil the callback dependencies""" plan = self.build_plan(request) result = set() @@ -128,12 +138,11 @@ def build_plan(self, request: Request) -> andi.Plan: callback, is_injectable=is_injectable, externally_provided=self.is_class_provided_by_any_provider, - overrides=self.overrides_registry.overrides_for(request).get + overrides=self.overrides_registry.overrides_for(request).get, ) @inlineCallbacks - def build_instances( - self, request: Request, response: Response, plan: andi.Plan): + def build_instances(self, request: Request, response: Response, plan: andi.Plan): """Build the instances dict from a plan including external dependencies.""" # First we build the external dependencies using the providers instances = yield from self.build_instances_from_providers( @@ -149,15 +158,18 @@ def build_instances( @inlineCallbacks def build_instances_from_providers( - self, request: Request, response: Response, plan: andi.Plan): + self, request: Request, response: Response, plan: andi.Plan + ): """Build dependencies handled by registered providers""" instances: Dict[Callable, Any] = {} scrapy_provided_dependencies = self.available_dependencies_for_providers( - request, response) + request, response + ) dependencies_set = {cls for cls, _ in plan.dependencies} for provider in self.providers: - provided_classes = {cls for cls in dependencies_set if - provider.is_provided(cls)} + provided_classes = { + cls for cls in dependencies_set if provider.is_provided(cls) + } provided_classes -= instances.keys() # ignore already provided types if not provided_classes: continue @@ -166,8 +178,10 @@ def build_instances_from_providers( cache_hit = False if self.cache and provider.has_cache_support: if not provider.name: - raise NotImplementedError(f"The provider {type(provider)} must have a `name` defined if" - f" you want to use the cache. It must be unique across the providers.") + raise NotImplementedError( + f"The provider {type(provider)} must have a `name` defined if" + f" you want to use the cache. It must be unique across the providers." 
+ ) # Return the data if it is already in the cache fingerprint = f"{provider.name}_{provider.fingerprint(set(provided_classes), request)}" try: @@ -191,10 +205,16 @@ def build_instances_from_providers( try: # Invoke the provider to get the data - objs = yield maybeDeferred_coro(provider, set(provided_classes), **kwargs) + objs = yield maybeDeferred_coro( + provider, set(provided_classes), **kwargs + ) except Exception as e: - if self.cache and self.caching_errors and provider.has_cache_support: + if ( + self.cache + and self.caching_errors + and provider.has_cache_support + ): # Save errors in the cache self.cache[fingerprint] = e self.crawler.stats.inc_value("scrapy-poet/cache/firsthand") @@ -238,8 +258,9 @@ def check_all_providers_are_callable(providers): ) -def is_class_provided_by_any_provider_fn(providers: List[PageObjectInputProvider] - ) -> Callable[[Callable], bool]: +def is_class_provided_by_any_provider_fn( + providers: List[PageObjectInputProvider], +) -> Callable[[Callable], bool]: """ Return a function of type ``Callable[[Type], bool]`` that return True if the given type is provided by any of the registered providers. @@ -249,7 +270,9 @@ def is_class_provided_by_any_provider_fn(providers: List[PageObjectInputProvider joined together for efficiency. """ sets_of_types: Set[Callable] = set() # caching all sets found - individual_is_callable: List[Callable[[Callable], bool]] = [sets_of_types.__contains__] + individual_is_callable: List[Callable[[Callable], bool]] = [ + sets_of_types.__contains__ + ] for provider in providers: provided_classes = provider.provided_classes @@ -261,7 +284,8 @@ def is_class_provided_by_any_provider_fn(providers: List[PageObjectInputProvider raise InjectionError( f"Unexpected type '{type(provided_classes)}' for " f"'{type(provider)}.provided_classes'. Expected either 'set' " - f"or 'callable'") + f"or 'callable'" + ) def is_provided_fn(type: Callable) -> bool: for is_provided in individual_is_callable: @@ -275,7 +299,7 @@ def is_provided_fn(type: Callable) -> bool: def get_callback(request, spider): """Get ``request.callback`` of a :class:`scrapy.Request`""" if request.callback is None: - return getattr(spider, 'parse') + return getattr(spider, "parse") # noqa: B009 return request.callback @@ -292,7 +316,7 @@ def is_callback_requiring_scrapy_response(callback: Callable): signature = inspect.signature(callback) first_parameter_key = next(iter(signature.parameters)) first_parameter = signature.parameters[first_parameter_key] - if str(first_parameter).startswith('*'): + if str(first_parameter).startswith("*"): # Parse method is probably using *args and **kwargs annotation. # Let's assume response is going to be used. return True @@ -334,19 +358,21 @@ def is_provider_requiring_scrapy_response(provider): def get_injector_for_testing( - providers: Mapping, - additional_settings: Dict = None, - overrides_registry: Optional[OverridesRegistryBase] = None + providers: Mapping, + additional_settings: Dict = None, + overrides_registry: Optional[OverridesRegistryBase] = None, ) -> Injector: """ Return an :class:`Injector` using a fake crawler. Useful for testing providers """ + class MySpider(Spider): name = "my_spider" - settings = Settings({**(additional_settings or {}), - "SCRAPY_POET_PROVIDERS": providers}) + settings = Settings( + {**(additional_settings or {}), "SCRAPY_POET_PROVIDERS": providers} + ) crawler = Crawler(MySpider) crawler.settings = settings spider = MySpider() @@ -375,7 +401,9 @@ def get_response_for_testing(callback: Callable) -> Response:
The best chocolate ever
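As a companion to the testing helpers introduced in ``scrapy_poet/injection.py`` above (``get_injector_for_testing`` and ``get_response_for_testing``), here is a minimal, hypothetical pytest-style sketch of how a custom provider could be unit-tested without running a crawl; the ``FakeData`` and ``FakeDataProvider`` names are illustrative and not part of this diff::

    import attr

    from scrapy_poet.injection import get_injector_for_testing
    from scrapy_poet.page_input_providers import PageObjectInputProvider


    @attr.s(auto_attribs=True)
    class FakeData:
        value: str


    class FakeDataProvider(PageObjectInputProvider):
        # The set of classes this provider can build, as consumed by
        # Injector.is_class_provided_by_any_provider.
        provided_classes = {FakeData}

        def __call__(self, to_provide):
            return [FakeData(value="example")]


    def test_provider_is_registered():
        # get_injector_for_testing() wires the provider into a fake crawler
        # through the SCRAPY_POET_PROVIDERS setting.
        injector = get_injector_for_testing({FakeDataProvider: 500})
        assert injector.is_class_provided_by_any_provider(FakeData)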