From 869f1eebba0f6996732f670286067a1f3dadb907 Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal
Date: Tue, 29 Mar 2022 14:09:28 +0800
Subject: [PATCH 01/10] add pre-commit hook configuration for linters

---
 .flake8                    | 20 ++++++++++++++++++++
 .github/workflows/test.yml |  2 +-
 .pre-commit-config.yaml    | 22 ++++++++++++++++++++++
 README.rst                 | 18 ++++++++++++++++++
 pyproject.toml             |  6 ++++++
 requirements-dev.txt       |  2 ++
 tox.ini                    |  4 ++++
 7 files changed, 73 insertions(+), 1 deletion(-)
 create mode 100644 .flake8
 create mode 100644 .pre-commit-config.yaml
 create mode 100644 pyproject.toml
 create mode 100644 requirements-dev.txt

diff --git a/.flake8 b/.flake8
new file mode 100644
index 00000000..d48f0645
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,20 @@
+[flake8]
+ignore =
+    # Refers to the max line length. Let's suppress the error and simply
+    # let black take care of how it wants to format the lines.
+    E501,
+
+    # Refers to "line break before binary operator".
+    # Similar to above, let black take care of the formatting.
+    W503,
+
+    # Refers to "nnecessary dict call - rewrite as a literal".
+    C408
+
+per-file-ignores =
+    # Ignore "imported but unused" errors in __init__ files, as those imports are there
+    # to expose submodule functions so they can be imported directly from that module
+    scrapy_poet/__init__.py:F401
+
+    # Ignore * imports in these files
+    scrapy_poet/__init__.py:F403
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 556f0b50..71102a57 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -57,7 +57,7 @@ jobs:
       fail-fast: false
       matrix:
         python-version: ['3.10']
-        tox-job: ["mypy", "docs"]
+        tox-job: ["mypy", "docs", "linters"]

     steps:
     - uses: actions/checkout@v2
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 00000000..ab9089cd
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,22 @@
+repos:
+  - hooks:
+      - id: black
+        language_version: python3
+    repo: https://github.com/ambv/black
+    rev: 22.3.0
+  - hooks:
+      - id: isort
+        language_version: python3
+    repo: https://github.com/PyCQA/isort
+    rev: 5.10.1
+  - hooks:
+      - id: flake8
+        language_version: python3
+        additional_dependencies:
+          - flake8-bugbear
+          - flake8-comprehensions
+          - flake8-debugger
+          - flake8-docstrings
+          - flake8-string-format
+    repo: https://github.com/pycqa/flake8
+    rev: 4.0.1
diff --git a/README.rst b/README.rst
index d305724b..6f76f5b5 100644
--- a/README.rst
+++ b/README.rst
@@ -60,3 +60,21 @@ Add the following inside Scrapy's ``settings.py`` file:
     DOWNLOADER_MIDDLEWARES = {
         "scrapy_poet.InjectionMiddleware": 543,
     }
+
+Developing
+==========
+
+Set up your local Python environment via:
+
+1. ``pip install -r requirements-dev.txt``
+2. ``pre-commit install``
+
+Now every time you perform a ``git commit``, these tools will run against the
+staged files:
+
+* ``black``
+* ``isort``
+* ``flake8``
+
+You can also directly invoke ``pre-commit run --all-files`` or ``tox -e linters``
+to run them without performing a commit.
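
The ``per-file-ignores`` entries added above exist because ``scrapy_poet/__init__.py`` only re-exports names from its submodules, so flake8 would otherwise flag every import in it as unused (F401) or as a star import (F403). A minimal sketch of that re-export pattern, mirroring the imports that ``scrapy_poet/__init__.py`` carries later in this series (illustration only, not part of the patch)::

    # scrapy_poet/__init__.py -- intentional re-exports that form the public API.
    # Without the per-file ignore, flake8 reports each import below as F401
    # ("imported but unused") because nothing in this file references the names.
    from .api import DummyResponse, callback_for
    from .middleware import InjectionMiddleware
    from .page_input_providers import HttpResponseProvider, PageObjectInputProvider

    # Callers can then import straight from the package:
    #     from scrapy_poet import DummyResponse, callback_for
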
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..c7d708d9
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,6 @@
+[tool.black]
+line-length = 120
+
+[tool.isort]
+profile = "black"
+multi_line_output = 3
diff --git a/requirements-dev.txt b/requirements-dev.txt
new file mode 100644
index 00000000..51f1982a
--- /dev/null
+++ b/requirements-dev.txt
@@ -0,0 +1,2 @@
+pre-commit
+pytest
diff --git a/tox.ini b/tox.ini
index 62cb3fd7..b8c15f79 100644
--- a/tox.ini
+++ b/tox.ini
@@ -53,3 +53,7 @@ changedir = {[docs]changedir}
 deps = {[docs]deps}
 commands =
     sphinx-build -W -b html . {envtmpdir}/html
+
+[testenv:linters]
+deps = -rrequirements-dev.txt
+commands = pre-commit run --all-files --show-diff-on-failure

From 753c876e6fb4ef1c6b3dd4bb7b86540a9d7ba2b8 Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal
Date: Mon, 20 Jun 2022 12:53:59 +0800
Subject: [PATCH 02/10] update flake8 config

---
 .flake8 | 31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/.flake8 b/.flake8
index d48f0645..664832fa 100644
--- a/.flake8
+++ b/.flake8
@@ -8,13 +8,28 @@ ignore =
     # Similar to above, let black take care of the formatting.
     W503,

-    # Refers to "nnecessary dict call - rewrite as a literal".
-    C408
+    # Refers to "unnecessary dict call - rewrite as a literal".
+    C408,

+    # To be addressed:
+    D100,  # Missing docstring in public module
+    D101,  # Missing docstring in public class
+    D103,  # Missing docstring in public function
+    D104,  # Missing docstring in public package
+    D105,  # Missing docstring in magic method
+    D107,  # Missing docstring in __init__
+    D200,  # One-line docstring should fit on one line with quotes
+    D202,  # No blank lines allowed after function docstring
+    D205,  # 1 blank line required between summary line and description
+    D209,  # Multi-line docstring closing quotes should be on a separate line
+    D400,  # First line should end with a period
+    D401,  # First line should be in imperative mood
+    D402   # First line should not be the function's "signature"
+
 per-file-ignores =
-    # Ignore "imported but unused" errors in __init__ files, as those imports are there
-    # to expose submodule functions so they can be imported directly from that module
-    scrapy_poet/__init__.py:F401
+    # F401: Ignore "imported but unused" errors in __init__ files, as those
+    # imports are there to expose submodule functions so they can be imported
+    # directly from that module
+    # F403: Ignore * imports in these files
-
-    # Ignore * imports in these files
-    scrapy_poet/__init__.py:F403
+    web_poet/__init__.py:F401,F403
+    web_poet/page_inputs/__init__.py:F401,F403

From 58c903617911b3209ad68bfefe3fa1a86be629f4 Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal
Date: Mon, 20 Jun 2022 12:55:38 +0800
Subject: [PATCH 03/10] run black

---
 docs/conf.py                                  | 79 +++---
 example/example/autoextract.py                | 21 +-
 example/example/settings.py                   |  9 +-
 example/example/spiders/books_01.py           | 10 +-
 example/example/spiders/books_02.py           | 10 +-
 example/example/spiders/books_02_1.py         | 11 +-
 example/example/spiders/books_02_2.py         | 11 +-
 example/example/spiders/books_02_3.py         | 10 +-
 example/example/spiders/books_03.py           | 10 +-
 example/example/spiders/books_04.py           | 11 +-
 .../example/spiders/books_04_overrides_01.py  | 23 +-
 .../example/spiders/books_04_overrides_02.py  | 25 +-
 .../example/spiders/books_04_overrides_03.py  | 30 +--
 example/example/spiders/books_05.py           | 15 +-
 example/example/spiders/books_05_1.py         | 16 +-
 example/example/spiders/books_06.py           | 24 +-
 scrapy_poet/__init__.py
| 4 +- scrapy_poet/api.py | 14 +- scrapy_poet/cache.py | 10 +- scrapy_poet/downloader.py | 8 +- scrapy_poet/injection.py | 117 +++++---- scrapy_poet/injection_errors.py | 2 +- scrapy_poet/middleware.py | 23 +- scrapy_poet/overrides.py | 11 +- scrapy_poet/page_input_providers.py | 27 ++- scrapy_poet/utils.py | 4 +- setup.py | 54 ++--- tests/conftest.py | 8 +- tests/mockserver.py | 24 +- tests/po_lib/__init__.py | 5 +- tests/test_callback_for.py | 69 +++--- tests/test_downloader.py | 108 ++++----- tests/test_injection.py | 134 +++++------ tests/test_middleware.py | 224 ++++++++---------- tests/test_page_input_providers.py | 1 - tests/test_providers.py | 37 ++- tests/test_response_required_logic.py | 49 ++-- tests/test_scrapy_dependencies.py | 15 +- tests/test_utils.py | 2 +- tests/utils.py | 33 +-- 40 files changed, 609 insertions(+), 689 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 84466f9d..97c01d71 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -12,19 +12,20 @@ # import os import sys -sys.path.insert(0, os.path.abspath('../')) + +sys.path.insert(0, os.path.abspath("../")) # -- Project information ----------------------------------------------------- -project = u'scrapy-poet' -copyright = u'2022, Zyte' -author = u'Zyte' +project = "scrapy-poet" +copyright = "2022, Zyte" +author = "Zyte" # The short X.Y version -version = u'' +version = "" # The full version, including alpha/beta/rc tags -release = u'0.3.0' +release = "0.3.0" # -- General configuration --------------------------------------------------- @@ -37,24 +38,24 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.intersphinx', - 'sphinx.ext.ifconfig', - 'sphinx.ext.viewcode', - 'sphinx.ext.githubpages', + "sphinx.ext.autodoc", + "sphinx.ext.intersphinx", + "sphinx.ext.ifconfig", + "sphinx.ext.viewcode", + "sphinx.ext.githubpages", ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] -source_suffix = '.rst' +source_suffix = ".rst" # The master toctree document. -master_doc = 'index' +master_doc = "index" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. @@ -66,7 +67,7 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns = [u'_build', 'Thumbs.db', '.DS_Store'] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] # The name of the Pygments (syntax highlighting) style to use. pygments_style = None @@ -77,12 +78,13 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'sphinx_rtd_theme' +html_theme = "sphinx_rtd_theme" # Add any paths that contain custom themes here, relative to this directory. # Add path to the RTD explicitly to robustify builds (otherwise might # fail in a clean Debian build env) import sphinx_rtd_theme + html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] # Theme options are theme-specific and customize the look and feel of a theme @@ -110,7 +112,7 @@ # -- Options for HTMLHelp output --------------------------------------------- # Output file base name for HTML help builder. 
-htmlhelp_basename = 'scrapy-poetdoc' +htmlhelp_basename = "scrapy-poetdoc" # -- Options for LaTeX output ------------------------------------------------ @@ -119,15 +121,12 @@ # The paper size ('letterpaper' or 'a4paper'). # # 'papersize': 'letterpaper', - # The font size ('10pt', '11pt' or '12pt'). # # 'pointsize': '10pt', - # Additional stuff for the LaTeX preamble. # # 'preamble': '', - # Latex figure (float) alignment # # 'figure_align': 'htbp', @@ -137,8 +136,7 @@ # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, 'scrapy-poet.tex', u'scrapy-poet Documentation', - u'Scrapinghub', 'manual'), + (master_doc, "scrapy-poet.tex", "scrapy-poet Documentation", "Scrapinghub", "manual"), ] @@ -146,10 +144,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [ - (master_doc, 'scrapy-poet', u'scrapy-poet Documentation', - [author], 1) -] +man_pages = [(master_doc, "scrapy-poet", "scrapy-poet Documentation", [author], 1)] # -- Options for Texinfo output ---------------------------------------------- @@ -158,9 +153,15 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'scrapy-poet', u'scrapy-poet Documentation', - author, 'scrapy-poet', 'One line description of project.', - 'Miscellaneous'), + ( + master_doc, + "scrapy-poet", + "scrapy-poet Documentation", + author, + "scrapy-poet", + "One line description of project.", + "Miscellaneous", + ), ] @@ -179,21 +180,27 @@ # epub_uid = '' # A list of files that should not be packed into the epub file. -epub_exclude_files = ['search.html'] +epub_exclude_files = ["search.html"] # -- Extension configuration ------------------------------------------------- # -- Options for intersphinx extension --------------------------------------- intersphinx_mapping = { - 'python': ('https://docs.python.org/3', None, ), - 'scrapy': ('https://docs.scrapy.org/en/latest', None, ), - 'web-poet': ('https://web-poet.readthedocs.io/en/latest/', None), - 'url-matcher': ('https://url-matcher.readthedocs.io/en/stable/', None), + "python": ( + "https://docs.python.org/3", + None, + ), + "scrapy": ( + "https://docs.scrapy.org/en/latest", + None, + ), + "web-poet": ("https://web-poet.readthedocs.io/en/latest/", None), + "url-matcher": ("https://url-matcher.readthedocs.io/en/stable/", None), } autodoc_default_options = { - 'special-members': '__init__,__call__', + "special-members": "__init__,__call__", # 'undoc-members': True, - 'exclude-members': '__weakref__' + "exclude-members": "__weakref__", } diff --git a/example/example/autoextract.py b/example/example/autoextract.py index 7b6c0503..49cff130 100644 --- a/example/example/autoextract.py +++ b/example/example/autoextract.py @@ -2,20 +2,21 @@ Example of how to create a PageObject with a very different input data, which even requires an API request. 
""" -from typing import Dict, Any +from typing import Any, Dict import attr +from scrapy import Request from twisted.internet.defer import inlineCallbacks from twisted.internet.threads import deferToThread +from web_poet import ItemPage -from scrapy import Request from scrapy_poet.page_input_providers import PageObjectInputProvider -from web_poet import ItemPage @attr.s(auto_attribs=True) class AutoextractProductResponse: - """ Input data """ + """Input data""" + data: Dict[str, Any] @@ -24,7 +25,7 @@ class AutoextractProductProvider(PageObjectInputProvider): @inlineCallbacks def __call__(self, to_provide, request: Request): - data = (yield get_autoextract_product(request.url)) + data = yield get_autoextract_product(request.url) return [AutoextractProductResponse(data=data)] @@ -33,19 +34,21 @@ def get_autoextract_product(url): # fixme: use async # fixme: rate limits? from autoextract.sync import request_batch - resp = yield deferToThread(request_batch, urls=[url], page_type='product') + + resp = yield deferToThread(request_batch, urls=[url], page_type="product") return resp[0] @attr.s(auto_attribs=True) class ProductPage(ItemPage): - """ Generic product page """ + """Generic product page""" + autoextract_resp: AutoextractProductResponse @property def url(self): - return self.autoextract_resp.data['product']['url'] + return self.autoextract_resp.data["product"]["url"] def to_item(self): - product = self.autoextract_resp.data['product'] + product = self.autoextract_resp.data["product"] return product diff --git a/example/example/settings.py b/example/example/settings.py index 4e319585..863cb471 100644 --- a/example/example/settings.py +++ b/example/example/settings.py @@ -8,10 +8,10 @@ # https://docs.scrapy.org/en/latest/topics/spider-middleware.html from example.autoextract import AutoextractProductProvider -BOT_NAME = 'example' +BOT_NAME = "example" -SPIDER_MODULES = ['example.spiders'] -NEWSPIDER_MODULE = 'example.spiders' +SPIDER_MODULES = ["example.spiders"] +NEWSPIDER_MODULE = "example.spiders" SCRAPY_POET_PROVIDERS = {AutoextractProductProvider: 500} @@ -19,6 +19,5 @@ ROBOTSTXT_OBEY = True DOWNLOADER_MIDDLEWARES = { - 'scrapy_poet.InjectionMiddleware': 543, + "scrapy_poet.InjectionMiddleware": 543, } - diff --git a/example/example/spiders/books_01.py b/example/example/spiders/books_01.py index c9d68804..a7d65039 100644 --- a/example/example/spiders/books_01.py +++ b/example/example/spiders/books_01.py @@ -5,15 +5,15 @@ class BooksSpider(scrapy.Spider): - name = 'books_01' - start_urls = ['http://books.toscrape.com/'] + name = "books_01" + start_urls = ["http://books.toscrape.com/"] def parse(self, response): - for url in response.css('.image_container a::attr(href)').getall(): + for url in response.css(".image_container a::attr(href)").getall(): yield response.follow(url, self.parse_book) def parse_book(self, response): yield { - 'url': response.url, - 'name': response.css("title::text").get(), + "url": response.url, + "name": response.css("title::text").get(), } diff --git a/example/example/spiders/books_02.py b/example/example/spiders/books_02.py index fa5b89d9..a1f52c34 100644 --- a/example/example/spiders/books_02.py +++ b/example/example/spiders/books_02.py @@ -9,17 +9,17 @@ class BookPage(ItemWebPage): def to_item(self): return { - 'url': self.url, - 'name': self.css("title::text").get(), + "url": self.url, + "name": self.css("title::text").get(), } class BooksSpider(scrapy.Spider): - name = 'books_02' - start_urls = ['http://books.toscrape.com/'] + name = "books_02" + 
start_urls = ["http://books.toscrape.com/"] def parse(self, response): - for url in response.css('.image_container a::attr(href)').getall(): + for url in response.css(".image_container a::attr(href)").getall(): yield response.follow(url, self.parse_book) def parse_book(self, response, book_page: BookPage): diff --git a/example/example/spiders/books_02_1.py b/example/example/spiders/books_02_1.py index 33d3cb59..42eac332 100644 --- a/example/example/spiders/books_02_1.py +++ b/example/example/spiders/books_02_1.py @@ -5,22 +5,23 @@ """ import scrapy from web_poet import ItemWebPage + from scrapy_poet import callback_for class BookPage(ItemWebPage): def to_item(self): return { - 'url': self.url, - 'name': self.css("title::text").get(), + "url": self.url, + "name": self.css("title::text").get(), } class BooksSpider(scrapy.Spider): - name = 'books_02_1' - start_urls = ['http://books.toscrape.com/'] + name = "books_02_1" + start_urls = ["http://books.toscrape.com/"] parse_book = callback_for(BookPage) def parse(self, response): - for url in response.css('.image_container a::attr(href)').getall(): + for url in response.css(".image_container a::attr(href)").getall(): yield response.follow(url, self.parse_book) diff --git a/example/example/spiders/books_02_2.py b/example/example/spiders/books_02_2.py index e720e12a..b9eccc41 100644 --- a/example/example/spiders/books_02_2.py +++ b/example/example/spiders/books_02_2.py @@ -12,21 +12,22 @@ """ import scrapy from web_poet import ItemWebPage + from scrapy_poet import callback_for class BookPage(ItemWebPage): def to_item(self): return { - 'url': self.url, - 'name': self.css("title::text").get(), + "url": self.url, + "name": self.css("title::text").get(), } class BooksSpider(scrapy.Spider): - name = 'books_02_2' - start_urls = ['http://books.toscrape.com/'] + name = "books_02_2" + start_urls = ["http://books.toscrape.com/"] def parse(self, response): - for url in response.css('.image_container a::attr(href)').getall(): + for url in response.css(".image_container a::attr(href)").getall(): yield response.follow(url, callback_for(BookPage)) diff --git a/example/example/spiders/books_02_3.py b/example/example/spiders/books_02_3.py index eb952078..14cf53a3 100644 --- a/example/example/spiders/books_02_3.py +++ b/example/example/spiders/books_02_3.py @@ -14,15 +14,15 @@ class BookPage(ItemWebPage): def to_item(self): return { - 'url': self.url, - 'name': self.css("title::text").get(), + "url": self.url, + "name": self.css("title::text").get(), } class BooksSpider(scrapy.Spider): - name = 'books_02_3' - start_urls = ['http://books.toscrape.com/'] + name = "books_02_3" + start_urls = ["http://books.toscrape.com/"] def parse(self, response): - for url in response.css('.image_container a::attr(href)').getall(): + for url in response.css(".image_container a::attr(href)").getall(): yield response.follow(url, BookPage) diff --git a/example/example/spiders/books_03.py b/example/example/spiders/books_03.py index 503ed8cd..61efb4f7 100644 --- a/example/example/spiders/books_03.py +++ b/example/example/spiders/books_03.py @@ -2,15 +2,15 @@ Scrapy spider which uses AutoExtract API, to extract books as products. 
""" import scrapy -from scrapy_poet import callback_for - from example.autoextract import ProductPage +from scrapy_poet import callback_for + class BooksSpider(scrapy.Spider): - name = 'books_03' - start_urls = ['http://books.toscrape.com/'] + name = "books_03" + start_urls = ["http://books.toscrape.com/"] def parse(self, response): - for url in response.css('.image_container a::attr(href)').getall(): + for url in response.css(".image_container a::attr(href)").getall(): yield response.follow(url, callback_for(ProductPage)) diff --git a/example/example/spiders/books_04.py b/example/example/spiders/books_04.py index da118089..2b94dbd7 100644 --- a/example/example/spiders/books_04.py +++ b/example/example/spiders/books_04.py @@ -3,25 +3,26 @@ """ import scrapy from web_poet import ItemWebPage, WebPage + from scrapy_poet import callback_for class BookListPage(WebPage): def book_urls(self): - return self.css('.image_container a::attr(href)').getall() + return self.css(".image_container a::attr(href)").getall() class BookPage(ItemWebPage): def to_item(self): return { - 'url': self.url, - 'name': self.css("title::text").get(), + "url": self.url, + "name": self.css("title::text").get(), } class BooksSpider(scrapy.Spider): - name = 'books_04' - start_urls = ['http://books.toscrape.com/'] + name = "books_04" + start_urls = ["http://books.toscrape.com/"] def parse(self, response, page: BookListPage): for url in page.book_urls(): diff --git a/example/example/spiders/books_04_overrides_01.py b/example/example/spiders/books_04_overrides_01.py index ab266c08..268c6e6d 100644 --- a/example/example/spiders/books_04_overrides_01.py +++ b/example/example/spiders/books_04_overrides_01.py @@ -7,47 +7,52 @@ """ import scrapy from web_poet import ItemWebPage, WebPage + from scrapy_poet import callback_for class BookListPage(WebPage): """Logic to extract listings from pages like https://books.toscrape.com""" + def book_urls(self): - return self.css('.image_container a::attr(href)').getall() + return self.css(".image_container a::attr(href)").getall() class BookPage(ItemWebPage): """Logic to extract book info from pages like https://books.toscrape.com/catalogue/soumission_998/index.html""" + def to_item(self): return { - 'url': self.url, - 'name': self.css("title::text").get(), + "url": self.url, + "name": self.css("title::text").get(), } class BPBookListPage(WebPage): """Logic to extract listings from pages like https://bookpage.com/reviews""" + def book_urls(self): - return self.css('article.post h4 a::attr(href)').getall() + return self.css("article.post h4 a::attr(href)").getall() class BPBookPage(ItemWebPage): """Logic to extract from pages like https://bookpage.com/reviews/25879-laird-hunt-zorrie-fiction""" + def to_item(self): return { - 'url': self.url, - 'name': self.css("body div > h1::text").get().strip(), + "url": self.url, + "name": self.css("body div > h1::text").get().strip(), } class BooksSpider(scrapy.Spider): - name = 'books_04_overrides_01' - start_urls = ['http://books.toscrape.com/', 'https://bookpage.com/reviews'] + name = "books_04_overrides_01" + start_urls = ["http://books.toscrape.com/", "https://bookpage.com/reviews"] # Configuring different page objects pages from the bookpage.com domain custom_settings = { "SCRAPY_POET_OVERRIDES": [ ("bookpage.com", BPBookListPage, BookListPage), - ("bookpage.com", BPBookPage, BookPage) + ("bookpage.com", BPBookPage, BookPage), ] } diff --git a/example/example/spiders/books_04_overrides_02.py b/example/example/spiders/books_04_overrides_02.py index 
b4c366a7..dd576270 100644 --- a/example/example/spiders/books_04_overrides_02.py +++ b/example/example/spiders/books_04_overrides_02.py @@ -7,64 +7,65 @@ at all is applied. """ import scrapy +from url_matcher import Patterns from web_poet import ItemWebPage, WebPage from web_poet.overrides import OverrideRule -from url_matcher import Patterns from scrapy_poet import callback_for class BookListPage(WebPage): - def book_urls(self): return [] class BookPage(ItemWebPage): - def to_item(self): return None class BTSBookListPage(BookListPage): """Logic to extract listings from pages like https://books.toscrape.com""" + def book_urls(self): - return self.css('.image_container a::attr(href)').getall() + return self.css(".image_container a::attr(href)").getall() class BTSBookPage(BookPage): """Logic to extract book info from pages like https://books.toscrape.com/catalogue/soumission_998/index.html""" + def to_item(self): return { - 'url': self.url, - 'name': self.css("title::text").get(), + "url": self.url, + "name": self.css("title::text").get(), } class BPBookListPage(BookListPage): """Logic to extract listings from pages like https://bookpage.com/reviews""" + def book_urls(self): - return self.css('article.post h4 a::attr(href)').getall() + return self.css("article.post h4 a::attr(href)").getall() class BPBookPage(BookPage): """Logic to extract from pages like https://bookpage.com/reviews/25879-laird-hunt-zorrie-fiction""" + def to_item(self): return { - 'url': self.url, - 'name': self.css("body div > h1::text").get().strip(), + "url": self.url, + "name": self.css("body div > h1::text").get().strip(), } class BooksSpider(scrapy.Spider): - name = 'books_04_overrides_02' - start_urls = ['http://books.toscrape.com/', 'https://bookpage.com/reviews'] + name = "books_04_overrides_02" + start_urls = ["http://books.toscrape.com/", "https://bookpage.com/reviews"] # Configuring different page objects pages for different domains custom_settings = { "SCRAPY_POET_OVERRIDES": [ ("toscrape.com", BTSBookListPage, BookListPage), ("toscrape.com", BTSBookPage, BookPage), - # We could also use the long-form version if we want to. OverrideRule(for_patterns=Patterns(["bookpage.com"]), use=BPBookListPage, instead_of=BookListPage), OverrideRule(for_patterns=Patterns(["bookpage.com"]), use=BPBookPage, instead_of=BookPage), diff --git a/example/example/spiders/books_04_overrides_03.py b/example/example/spiders/books_04_overrides_03.py index f25fff07..e3bf936d 100644 --- a/example/example/spiders/books_04_overrides_03.py +++ b/example/example/spiders/books_04_overrides_03.py @@ -11,21 +11,19 @@ store the rules in web-poet's registry. 
""" import scrapy -from web_poet import ItemWebPage, WebPage, handle_urls, default_registry -from web_poet.overrides import OverrideRule from url_matcher import Patterns +from web_poet import ItemWebPage, WebPage, default_registry, handle_urls +from web_poet.overrides import OverrideRule from scrapy_poet import callback_for class BookListPage(WebPage): - def book_urls(self): return [] class BookPage(ItemWebPage): - def to_item(self): return None @@ -33,44 +31,46 @@ def to_item(self): @handle_urls("toscrape.com", overrides=BookListPage) class BTSBookListPage(BookListPage): """Logic to extract listings from pages like https://books.toscrape.com""" + def book_urls(self): - return self.css('.image_container a::attr(href)').getall() + return self.css(".image_container a::attr(href)").getall() @handle_urls("toscrape.com", overrides=BookPage) class BTSBookPage(BookPage): """Logic to extract book info from pages like https://books.toscrape.com/catalogue/soumission_998/index.html""" + def to_item(self): return { - 'url': self.url, - 'name': self.css("title::text").get(), + "url": self.url, + "name": self.css("title::text").get(), } @handle_urls("bookpage.com", overrides=BookListPage) class BPBookListPage(BookListPage): """Logic to extract listings from pages like https://bookpage.com/reviews""" + def book_urls(self): - return self.css('article.post h4 a::attr(href)').getall() + return self.css("article.post h4 a::attr(href)").getall() @handle_urls("bookpage.com", overrides=BookPage) class BPBookPage(BookPage): """Logic to extract from pages like https://bookpage.com/reviews/25879-laird-hunt-zorrie-fiction""" + def to_item(self): return { - 'url': self.url, - 'name': self.css("body div > h1::text").get().strip(), + "url": self.url, + "name": self.css("body div > h1::text").get().strip(), } class BooksSpider(scrapy.Spider): - name = 'books_04_overrides_03' - start_urls = ['http://books.toscrape.com/', 'https://bookpage.com/reviews'] + name = "books_04_overrides_03" + start_urls = ["http://books.toscrape.com/", "https://bookpage.com/reviews"] # Configuring different page objects pages for different domains - custom_settings = { - "SCRAPY_POET_OVERRIDES": default_registry.get_overrides() - } + custom_settings = {"SCRAPY_POET_OVERRIDES": default_registry.get_overrides()} def parse(self, response, page: BookListPage): yield from response.follow_all(page.book_urls(), callback_for(BookPage)) diff --git a/example/example/spiders/books_05.py b/example/example/spiders/books_05.py index cd3c5440..c1aa5de3 100644 --- a/example/example/spiders/books_05.py +++ b/example/example/spiders/books_05.py @@ -3,14 +3,13 @@ You can mix various page types freely. 
""" import scrapy - -from web_poet import WebPage from example.autoextract import ProductPage +from web_poet import WebPage class BookListPage(WebPage): def product_urls(self): - return self.css('.image_container a::attr(href)').getall() + return self.css(".image_container a::attr(href)").getall() class BookPage(ProductPage): @@ -18,14 +17,14 @@ def to_item(self): # post-processing example: return only 2 fields book = super().to_item() return { - 'url': book['url'], - 'name': book['name'], + "url": book["url"], + "name": book["name"], } class BooksSpider(scrapy.Spider): - name = 'books_05' - start_urls = ['http://books.toscrape.com/'] + name = "books_05" + start_urls = ["http://books.toscrape.com/"] def parse(self, response, page: BookListPage): for url in page.product_urls(): @@ -34,5 +33,5 @@ def parse(self, response, page: BookListPage): def parse_book(self, response, page: BookPage): # you can also post-process data in a spider book = page.to_item() - book['title'] = book.pop('name') + book["title"] = book.pop("name") yield book diff --git a/example/example/spiders/books_05_1.py b/example/example/spiders/books_05_1.py index d7ac6b22..ef53beeb 100644 --- a/example/example/spiders/books_05_1.py +++ b/example/example/spiders/books_05_1.py @@ -12,15 +12,15 @@ """ import scrapy - +from example.autoextract import ProductPage from web_poet import WebPage + from scrapy_poet import DummyResponse -from example.autoextract import ProductPage class BookListPage(WebPage): def product_urls(self): - return self.css('.image_container a::attr(href)').getall() + return self.css(".image_container a::attr(href)").getall() class BookPage(ProductPage): @@ -28,14 +28,14 @@ def to_item(self): # post-processing example: return only 2 fields book = super().to_item() return { - 'url': book['url'], - 'name': book['name'], + "url": book["url"], + "name": book["name"], } class BooksSpider(scrapy.Spider): - name = 'books_05_1' - start_urls = ['http://books.toscrape.com/'] + name = "books_05_1" + start_urls = ["http://books.toscrape.com/"] def parse(self, response, page: BookListPage): for url in page.product_urls(): @@ -45,5 +45,5 @@ def parse(self, response, page: BookListPage): def parse_book(self, response: DummyResponse, page: BookPage): # you can also post-process data in a spider book = page.to_item() - book['title'] = book.pop('name') + book["title"] = book.pop("name") yield book diff --git a/example/example/spiders/books_06.py b/example/example/spiders/books_06.py index 27e2eb44..4ab91897 100644 --- a/example/example/spiders/books_06.py +++ b/example/example/spiders/books_06.py @@ -10,25 +10,24 @@ Scrapy > 2.0 required """ -import scrapy import attr - -from web_poet import WebPage, ItemWebPage, Injectable +import scrapy +from web_poet import Injectable, ItemWebPage, WebPage class ListingsExtractor(WebPage): def urls(self): - return self.css('.image_container a::attr(href)').getall() + return self.css(".image_container a::attr(href)").getall() class PaginationExtractor(WebPage): def urls(self): - return self.css('.pager a::attr(href)').getall() + return self.css(".pager a::attr(href)").getall() class BreadcrumbsExtractor(WebPage): def urls(self): - return self.css('.breadcrumb a::attr(href)').getall() + return self.css(".breadcrumb a::attr(href)").getall() @attr.s(auto_attribs=True) @@ -42,21 +41,21 @@ class BookPage(ItemWebPage): breadcrumbs: BreadcrumbsExtractor def recently_viewed_urls(self): - return self.css('.image_container a::attr(href)').getall() + return self.css(".image_container 
a::attr(href)").getall() def to_item(self): return { - 'url': self.url, - 'name': self.css("title::text").get(), + "url": self.url, + "name": self.css("title::text").get(), } class BooksSpider(scrapy.Spider): - name = 'books_06' - start_urls = ['http://books.toscrape.com/'] + name = "books_06" + start_urls = ["http://books.toscrape.com/"] def parse(self, response, page: ListingsPage): - """ Callback for Listings pages """ + """Callback for Listings pages""" yield from response.follow_all(page.book_list.urls(), self.parse_book) yield from response.follow_all(page.pagination.urls(), self.parse, priority=+10) @@ -64,4 +63,3 @@ def parse_book(self, response, page: BookPage): yield from response.follow_all(page.recently_viewed_urls(), self.parse_book) yield from response.follow_all(page.breadcrumbs.urls(), self.parse) yield page.to_item() - diff --git a/scrapy_poet/__init__.py b/scrapy_poet/__init__.py index 973ef9d4..27398f5a 100644 --- a/scrapy_poet/__init__.py +++ b/scrapy_poet/__init__.py @@ -1,7 +1,7 @@ +from .api import DummyResponse, callback_for from .middleware import InjectionMiddleware -from .api import callback_for, DummyResponse from .page_input_providers import ( - PageObjectInputProvider, CacheDataProviderMixin, HttpResponseProvider, + PageObjectInputProvider, ) diff --git a/scrapy_poet/api.py b/scrapy_poet/api.py index d09259b5..a19ab437 100644 --- a/scrapy_poet/api.py +++ b/scrapy_poet/api.py @@ -1,12 +1,10 @@ -from typing import Callable, Optional, Type from inspect import iscoroutinefunction +from typing import Callable, Optional, Type from scrapy.http import Request, Response - from web_poet.pages import ItemPage - -_CALLBACK_FOR_MARKER = '__scrapy_poet_callback' +_CALLBACK_FOR_MARKER = "__scrapy_poet_callback" class DummyResponse(Response): @@ -107,12 +105,10 @@ def parse(self, response): your request object. """ if not issubclass(page_cls, ItemPage): - raise TypeError( - f'{page_cls.__name__} should be a subclass of ItemPage.') + raise TypeError(f"{page_cls.__name__} should be a subclass of ItemPage.") - if getattr(page_cls.to_item, '__isabstractmethod__', False): - raise NotImplementedError( - f'{page_cls.__name__} should implement to_item method.') + if getattr(page_cls.to_item, "__isabstractmethod__", False): + raise NotImplementedError(f"{page_cls.__name__} should implement to_item method.") # When the callback is used as an instance method of the spider, it expects # to receive 'self' as its first argument. 
When used as a simple inline diff --git a/scrapy_poet/cache.py b/scrapy_poet/cache.py index d1a9ef47..418443e6 100644 --- a/scrapy_poet/cache.py +++ b/scrapy_poet/cache.py @@ -28,7 +28,7 @@ class SqlitedictCache(_Cache): def __init__(self, path: str, *, compressed=True): self.path = path self.compressed = compressed - tablename = 'responses_gzip' if compressed else 'responses' + tablename = "responses_gzip" if compressed else "responses" self.db = sqlitedict.SqliteDict( path, tablename=tablename, @@ -54,14 +54,12 @@ def decode(self, obj: Any) -> Any: return pickle.loads(data) def __str__(self) -> str: - return ( #pragma: no cover - f"SqlitedictCache <{self.db.filename} | " - f"compressed: {self.compressed} | " - f"{len(self.db)} records>" + return ( # pragma: no cover + f"SqlitedictCache <{self.db.filename} | " f"compressed: {self.compressed} | " f"{len(self.db)} records>" ) def __repr__(self) -> str: - return f"SqlitedictCache({self.path!r}, compressed={self.compressed})" #pragma: no cover + return f"SqlitedictCache({self.path!r}, compressed={self.compressed})" # pragma: no cover def __getitem__(self, fingerprint: str) -> Any: return self.db[fingerprint] diff --git a/scrapy_poet/downloader.py b/scrapy_poet/downloader.py index b6ae534f..92e4f90b 100644 --- a/scrapy_poet/downloader.py +++ b/scrapy_poet/downloader.py @@ -3,10 +3,7 @@ import scrapy from scrapy.utils.defer import maybe_deferred_to_future from web_poet import HttpRequest -from web_poet.exceptions import ( - HttpError, - HttpRequestError, -) +from web_poet.exceptions import HttpError, HttpRequestError from scrapy_poet.utils import ( http_request_to_scrapy_request, @@ -20,8 +17,7 @@ def create_scrapy_downloader(download_func): async def scrapy_downloader(request: HttpRequest): if not isinstance(request, HttpRequest): raise TypeError( - f"The request should be 'web_poet.HttpRequest' but received " - f"one of type: '{type(request)}'." + f"The request should be 'web_poet.HttpRequest' but received " f"one of type: '{type(request)}'." 
) scrapy_request = http_request_to_scrapy_request(request) diff --git a/scrapy_poet/injection.py b/scrapy_poet/injection.py index 034ee23a..7e52d29e 100644 --- a/scrapy_poet/injection.py +++ b/scrapy_poet/injection.py @@ -2,10 +2,7 @@ import logging import os import pprint -from typing import Dict, Callable, Any, List, Set, Mapping, Optional - -from .utils import get_scrapy_data_path -from twisted.internet.defer import inlineCallbacks +from typing import Any, Callable, Dict, List, Mapping, Optional, Set import andi from scrapy import Request, Spider @@ -15,18 +12,21 @@ from scrapy.statscollectors import StatsCollector from scrapy.utils.conf import build_component_list from scrapy.utils.defer import maybeDeferred_coro -from scrapy.utils.misc import load_object, create_instance +from scrapy.utils.misc import create_instance, load_object +from twisted.internet.defer import inlineCallbacks +from web_poet.pages import is_injectable +from scrapy_poet.api import _CALLBACK_FOR_MARKER, DummyResponse from scrapy_poet.cache import SqlitedictCache -from scrapy_poet.injection_errors import (UndeclaredProvidedTypeError, - NonCallableProviderError, - InjectionError) -from scrapy_poet.overrides import OverridesRegistryBase, \ - OverridesRegistry +from scrapy_poet.injection_errors import ( + InjectionError, + NonCallableProviderError, + UndeclaredProvidedTypeError, +) +from scrapy_poet.overrides import OverridesRegistry, OverridesRegistryBase from scrapy_poet.page_input_providers import PageObjectInputProvider -from scrapy_poet.api import _CALLBACK_FOR_MARKER, DummyResponse -from web_poet.pages import is_injectable +from .utils import get_scrapy_data_path logger = logging.getLogger(__name__) @@ -36,11 +36,14 @@ class Injector: Keep all the logic required to do dependency injection in Scrapy callbacks. Initializes the providers from the spider settings at initialization. 
""" - def __init__(self, - crawler: Crawler, - *, - default_providers: Optional[Mapping] = None, - overrides_registry: Optional[OverridesRegistryBase] = None): + + def __init__( + self, + crawler: Crawler, + *, + default_providers: Optional[Mapping] = None, + overrides_registry: Optional[OverridesRegistryBase] = None, + ): self.crawler = crawler self.spider = crawler.spider self.overrides_registry = overrides_registry or OverridesRegistry() @@ -48,23 +51,17 @@ def __init__(self, self.init_cache() def load_providers(self, default_providers: Optional[Mapping] = None): - providers_dict = {**(default_providers or {}), - **self.spider.settings.getdict("SCRAPY_POET_PROVIDERS")} + providers_dict = {**(default_providers or {}), **self.spider.settings.getdict("SCRAPY_POET_PROVIDERS")} provider_classes = build_component_list(providers_dict) logger.info(f"Loading providers:\n {pprint.pformat(provider_classes)}") - self.providers = [ - load_object(cls)(self.crawler) - for cls in provider_classes - ] + self.providers = [load_object(cls)(self.crawler) for cls in provider_classes] check_all_providers_are_callable(self.providers) # Caching whether each provider requires the scrapy response self.is_provider_requiring_scrapy_response = { - provider: is_provider_requiring_scrapy_response(provider) - for provider in self.providers + provider: is_provider_requiring_scrapy_response(provider) for provider in self.providers } # Caching the function for faster execution - self.is_class_provided_by_any_provider = \ - is_class_provided_by_any_provider_fn(self.providers) + self.is_class_provided_by_any_provider = is_class_provided_by_any_provider_fn(self.providers) def close(self) -> None: if self.cache: @@ -72,18 +69,18 @@ def close(self) -> None: def init_cache(self): self.cache = None - cache_filename = self.spider.settings.get('SCRAPY_POET_CACHE') + cache_filename = self.spider.settings.get("SCRAPY_POET_CACHE") if cache_filename and isinstance(cache_filename, bool): cache_filename = os.path.join(get_scrapy_data_path(createdir=True), "scrapy-poet-cache.sqlite3") if cache_filename: - compressed = self.spider.settings.getbool('SCRAPY_POET_CACHE_GZIP', True) - self.caching_errors = self.spider.settings.getbool('SCRAPY_POET_CACHE_ERRORS', False) + compressed = self.spider.settings.getbool("SCRAPY_POET_CACHE_GZIP", True) + self.caching_errors = self.spider.settings.getbool("SCRAPY_POET_CACHE_ERRORS", False) self.cache = SqlitedictCache(cache_filename, compressed=compressed) - logger.info(f"Cache enabled. File: '{cache_filename}'. Compressed: {compressed}. Caching errors: {self.caching_errors}") + logger.info( + f"Cache enabled. File: '{cache_filename}'. Compressed: {compressed}. 
Caching errors: {self.caching_errors}" + ) - def available_dependencies_for_providers(self, - request: Request, - response: Response): + def available_dependencies_for_providers(self, request: Request, response: Response): deps = { Crawler: self.crawler, Spider: self.spider, @@ -95,8 +92,7 @@ def available_dependencies_for_providers(self, assert deps.keys() == SCRAPY_PROVIDED_CLASSES return deps - def discover_callback_providers(self, request: Request - ) -> Set[PageObjectInputProvider]: + def discover_callback_providers(self, request: Request) -> Set[PageObjectInputProvider]: """Discover the providers that are required to fulfil the callback dependencies""" plan = self.build_plan(request) result = set() @@ -128,17 +124,14 @@ def build_plan(self, request: Request) -> andi.Plan: callback, is_injectable=is_injectable, externally_provided=self.is_class_provided_by_any_provider, - overrides=self.overrides_registry.overrides_for(request).get + overrides=self.overrides_registry.overrides_for(request).get, ) @inlineCallbacks - def build_instances( - self, request: Request, response: Response, plan: andi.Plan): + def build_instances(self, request: Request, response: Response, plan: andi.Plan): """Build the instances dict from a plan including external dependencies.""" # First we build the external dependencies using the providers - instances = yield from self.build_instances_from_providers( - request, response, plan - ) + instances = yield from self.build_instances_from_providers(request, response, plan) # All the remaining dependencies are internal so they can be built just # following the andi plan. for cls, kwargs_spec in plan.dependencies: @@ -148,16 +141,13 @@ def build_instances( return instances @inlineCallbacks - def build_instances_from_providers( - self, request: Request, response: Response, plan: andi.Plan): + def build_instances_from_providers(self, request: Request, response: Response, plan: andi.Plan): """Build dependencies handled by registered providers""" instances: Dict[Callable, Any] = {} - scrapy_provided_dependencies = self.available_dependencies_for_providers( - request, response) + scrapy_provided_dependencies = self.available_dependencies_for_providers(request, response) dependencies_set = {cls for cls, _ in plan.dependencies} for provider in self.providers: - provided_classes = {cls for cls in dependencies_set if - provider.is_provided(cls)} + provided_classes = {cls for cls in dependencies_set if provider.is_provided(cls)} provided_classes -= instances.keys() # ignore already provided types if not provided_classes: continue @@ -166,8 +156,10 @@ def build_instances_from_providers( cache_hit = False if self.cache and provider.has_cache_support: if not provider.name: - raise NotImplementedError(f"The provider {type(provider)} must have a `name` defined if" - f" you want to use the cache. It must be unique across the providers.") + raise NotImplementedError( + f"The provider {type(provider)} must have a `name` defined if" + f" you want to use the cache. It must be unique across the providers." + ) # Return the data if it is already in the cache fingerprint = f"{provider.name}_{provider.fingerprint(set(provided_classes), request)}" try: @@ -233,13 +225,11 @@ def check_all_providers_are_callable(providers): for provider in providers: if not callable(provider): raise NonCallableProviderError( - f"The provider {type(provider)} is not callable. " - f"It must implement '__call__' method" + f"The provider {type(provider)} is not callable. 
" f"It must implement '__call__' method" ) -def is_class_provided_by_any_provider_fn(providers: List[PageObjectInputProvider] - ) -> Callable[[Callable], bool]: +def is_class_provided_by_any_provider_fn(providers: List[PageObjectInputProvider]) -> Callable[[Callable], bool]: """ Return a function of type ``Callable[[Type], bool]`` that return True if the given type is provided by any of the registered providers. @@ -261,7 +251,8 @@ def is_class_provided_by_any_provider_fn(providers: List[PageObjectInputProvider raise InjectionError( f"Unexpected type '{type(provided_classes)}' for " f"'{type(provider)}.provided_classes'. Expected either 'set' " - f"or 'callable'") + f"or 'callable'" + ) def is_provided_fn(type: Callable) -> bool: for is_provided in individual_is_callable: @@ -275,7 +266,7 @@ def is_provided_fn(type: Callable) -> bool: def get_callback(request, spider): """Get ``request.callback`` of a :class:`scrapy.Request`""" if request.callback is None: - return getattr(spider, 'parse') + return getattr(spider, "parse") return request.callback @@ -292,7 +283,7 @@ def is_callback_requiring_scrapy_response(callback: Callable): signature = inspect.signature(callback) first_parameter_key = next(iter(signature.parameters)) first_parameter = signature.parameters[first_parameter_key] - if str(first_parameter).startswith('*'): + if str(first_parameter).startswith("*"): # Parse method is probably using *args and **kwargs annotation. # Let's assume response is going to be used. return True @@ -334,19 +325,17 @@ def is_provider_requiring_scrapy_response(provider): def get_injector_for_testing( - providers: Mapping, - additional_settings: Dict = None, - overrides_registry: Optional[OverridesRegistryBase] = None + providers: Mapping, additional_settings: Dict = None, overrides_registry: Optional[OverridesRegistryBase] = None ) -> Injector: """ Return an :class:`Injector` using a fake crawler. Useful for testing providers """ + class MySpider(Spider): name = "my_spider" - settings = Settings({**(additional_settings or {}), - "SCRAPY_POET_PROVIDERS": providers}) + settings = Settings({**(additional_settings or {}), "SCRAPY_POET_PROVIDERS": providers}) crawler = Crawler(MySpider) crawler.settings = settings spider = MySpider() @@ -375,7 +364,9 @@ def get_response_for_testing(callback: Callable) -> Response:

The best chocolate ever

- """.encode("utf-8") + """.encode( + "utf-8" + ) request = Request(url, callback=callback) response = Response(url, 200, None, html, request=request) return response diff --git a/scrapy_poet/injection_errors.py b/scrapy_poet/injection_errors.py index 973650f1..53169e33 100644 --- a/scrapy_poet/injection_errors.py +++ b/scrapy_poet/injection_errors.py @@ -11,4 +11,4 @@ class UndeclaredProvidedTypeError(InjectionError): class MalformedProvidedClassesError(InjectionError): - pass \ No newline at end of file + pass diff --git a/scrapy_poet/middleware.py b/scrapy_poet/middleware.py index 0543ee3a..8c2bc15a 100644 --- a/scrapy_poet/middleware.py +++ b/scrapy_poet/middleware.py @@ -8,20 +8,18 @@ from scrapy import Spider, signals from scrapy.crawler import Crawler from scrapy.http import Request, Response -from twisted.internet.defer import Deferred, inlineCallbacks - from scrapy.utils.misc import create_instance, load_object +from twisted.internet.defer import Deferred, inlineCallbacks from .api import DummyResponse +from .injection import Injector +from .overrides import OverridesRegistry from .page_input_providers import ( HttpClientProvider, HttpResponseProvider, PageParamsProvider, RequestUrlProvider, ) -from .overrides import OverridesRegistry -from .injection import Injector - logger = logging.getLogger(__name__) @@ -42,16 +40,16 @@ class InjectionMiddleware: * check if request downloads could be skipped * inject dependencies before request callbacks are executed """ + def __init__(self, crawler: Crawler) -> None: """Initialize the middleware""" self.crawler = crawler settings = self.crawler.settings - registry_cls = load_object(settings.get("SCRAPY_POET_OVERRIDES_REGISTRY", - OverridesRegistry)) + registry_cls = load_object(settings.get("SCRAPY_POET_OVERRIDES_REGISTRY", OverridesRegistry)) self.overrides_registry = create_instance(registry_cls, settings, crawler) - self.injector = Injector(crawler, - default_providers=DEFAULT_PROVIDERS, - overrides_registry=self.overrides_registry) + self.injector = Injector( + crawler, default_providers=DEFAULT_PROVIDERS, overrides_registry=self.overrides_registry + ) @classmethod def from_crawler(cls: Type[InjectionMiddlewareTV], crawler: Crawler) -> InjectionMiddlewareTV: @@ -84,8 +82,9 @@ def process_request(self, request: Request, spider: Spider) -> Optional[DummyRes return DummyResponse(url=request.url, request=request) @inlineCallbacks - def process_response(self, request: Request, response: Response, - spider: Spider) -> Generator[Deferred[object], object, Response]: + def process_response( + self, request: Request, response: Response, spider: Spider + ) -> Generator[Deferred[object], object, Response]: """This method fills ``request.cb_kwargs`` with instances for the required Page Objects found in the callback signature. 
diff --git a/scrapy_poet/overrides.py b/scrapy_poet/overrides.py index a5e330d3..8b9c18fc 100644 --- a/scrapy_poet/overrides.py +++ b/scrapy_poet/overrides.py @@ -1,7 +1,7 @@ import logging from abc import ABC, abstractmethod from collections import defaultdict -from typing import Dict, Mapping, Callable, Iterable, Union, Tuple, Optional, List +from typing import Callable, Dict, Iterable, List, Mapping, Optional, Tuple, Union from scrapy import Request from scrapy.crawler import Crawler @@ -9,7 +9,6 @@ from url_matcher.util import get_domain from web_poet.overrides import OverrideRule - logger = logging.getLogger(__name__) RuleAsTuple = Union[Tuple[str, Callable, Callable], List] @@ -106,13 +105,9 @@ def add_rule(self, rule: RuleFromUser) -> None: f"replacement and (3) the PO class to be replaced." ) pattern, use, instead_of = rule - rule = OverrideRule( - for_patterns=Patterns([pattern]), use=use, instead_of=instead_of - ) + rule = OverrideRule(for_patterns=Patterns([pattern]), use=use, instead_of=instead_of) self.rules.append(rule) - self.matcher[rule.instead_of].add_or_update( - len(self.rules) - 1, rule.for_patterns - ) + self.matcher[rule.instead_of].add_or_update(len(self.rules) - 1, rule.for_patterns) def overrides_for(self, request: Request) -> Mapping[Callable, Callable]: overrides: Dict[Callable, Callable] = {} diff --git a/scrapy_poet/page_input_providers.py b/scrapy_poet/page_input_providers.py index 8a3e2ca0..3516b98c 100644 --- a/scrapy_poet/page_input_providers.py +++ b/scrapy_poet/page_input_providers.py @@ -10,18 +10,24 @@ """ import abc import json -from typing import Set, Union, Callable, ClassVar, Any, Sequence +from typing import Any, Callable, ClassVar, Sequence, Set, Union import attr from scrapy import Request -from scrapy.http import Response from scrapy.crawler import Crawler +from scrapy.http import Response from scrapy.utils.request import request_fingerprint +from web_poet import ( + HttpClient, + HttpResponse, + HttpResponseHeaders, + PageParams, + RequestUrl, +) -from scrapy_poet.utils import scrapy_response_to_http_response -from scrapy_poet.injection_errors import MalformedProvidedClassesError from scrapy_poet.downloader import create_scrapy_downloader -from web_poet import HttpClient, HttpResponse, HttpResponseHeaders, PageParams, RequestUrl +from scrapy_poet.injection_errors import MalformedProvidedClassesError +from scrapy_poet.utils import scrapy_response_to_http_response class PageObjectInputProvider: @@ -104,7 +110,8 @@ def is_provided(cls, type_: Callable): else: raise MalformedProvidedClassesError( f"Unexpected type '{type_}' for 'provided_classes' attribute of" - f"'{cls}.'. Expected either 'set' or 'callable'") + f"'{cls}.'. Expected either 'set' or 'callable'" + ) def __init__(self, crawler: Crawler): """Initializes the provider. 
Invoked only at spider start up.""" @@ -174,11 +181,7 @@ def __call__(self, to_provide: Set[Callable], response: Response): def fingerprint(self, to_provide: Set[Callable], request: Request) -> str: request_keys = {"url", "method", "body"} - request_data = { - k: str(v) - for k, v in request.to_dict().items() - if k in request_keys - } + request_data = {k: str(v) for k, v in request.to_dict().items() if k in request_keys} fp_data = { "SCRAPY_FINGERPRINT": request_fingerprint(request), **request_data, @@ -203,6 +206,7 @@ def deserialize(self, data: Any) -> Sequence[Any]: class HttpClientProvider(PageObjectInputProvider): """This class provides ``web_poet.requests.HttpClient`` instances.""" + provided_classes = {HttpClient} def __call__(self, to_provide: Set[Callable], crawler: Crawler): @@ -215,6 +219,7 @@ def __call__(self, to_provide: Set[Callable], crawler: Crawler): class PageParamsProvider(PageObjectInputProvider): """This class provides ``web_poet.page_inputs.PageParams`` instances.""" + provided_classes = {PageParams} def __call__(self, to_provide: Set[Callable], request: Request): diff --git a/scrapy_poet/utils.py b/scrapy_poet/utils.py index 3b79a560..027df39f 100644 --- a/scrapy_poet/utils.py +++ b/scrapy_poet/utils.py @@ -1,9 +1,9 @@ import os import attr -from web_poet import HttpRequest, HttpResponse, HttpResponseHeaders from scrapy.http import Request, Response -from scrapy.utils.project import project_data_dir, inside_project +from scrapy.utils.project import inside_project, project_data_dir +from web_poet import HttpRequest, HttpResponse, HttpResponseHeaders def get_scrapy_data_path(createdir: bool = True, default_dir: str = ".scrapy") -> str: diff --git a/setup.py b/setup.py index 930ebaf2..ddca9dfc 100755 --- a/setup.py +++ b/setup.py @@ -1,35 +1,35 @@ -from setuptools import setup, find_packages +from setuptools import find_packages, setup setup( - name='scrapy-poet', - version='0.3.0', - description='Page Object pattern for Scrapy', - long_description=open('README.rst').read(), + name="scrapy-poet", + version="0.3.0", + description="Page Object pattern for Scrapy", + long_description=open("README.rst").read(), long_description_content_type="text/x-rst", - author='Mikhail Korobov', - author_email='kmike84@gmail.com', - url='https://github.com/scrapinghub/scrapy-poet', - packages=find_packages(exclude=['tests', 'example']), + author="Mikhail Korobov", + author_email="kmike84@gmail.com", + url="https://github.com/scrapinghub/scrapy-poet", + packages=find_packages(exclude=["tests", "example"]), install_requires=[ - 'andi >= 0.4.1', - 'attrs >= 21.3.0', - 'parsel >= 1.5.0', - 'scrapy >= 2.6.0', - 'sqlitedict >= 1.5.0', - 'url-matcher >= 0.2.0', - 'web-poet >= 0.2.0', + "andi >= 0.4.1", + "attrs >= 21.3.0", + "parsel >= 1.5.0", + "scrapy >= 2.6.0", + "sqlitedict >= 1.5.0", + "url-matcher >= 0.2.0", + "web-poet >= 0.2.0", ], classifiers=[ - 'Development Status :: 3 - Alpha', - 'Intended Audience :: Developers', - 'License :: OSI Approved :: BSD License', - 'Natural Language :: English', - 'Operating System :: OS Independent', - 'Framework :: Scrapy', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10', + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "License :: OSI Approved :: BSD License", + "Natural Language :: English", + "Operating System :: OS Independent", + "Framework :: Scrapy", + 
"Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", ], ) diff --git a/tests/conftest.py b/tests/conftest.py index 209ac514..0d9d5504 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,14 +4,14 @@ @pytest.fixture() def settings(request): - """ Default scrapy-poet settings """ + """Default scrapy-poet settings""" s = dict( # collect scraped items to .collected_items attribute ITEM_PIPELINES={ - 'tests.utils.CollectorPipeline': 100, + "tests.utils.CollectorPipeline": 100, }, DOWNLOADER_MIDDLEWARES={ - 'scrapy_poet.InjectionMiddleware': 543, - } + "scrapy_poet.InjectionMiddleware": 543, + }, ) return Settings(s) diff --git a/tests/mockserver.py b/tests/mockserver.py index 9f782e7a..cce02ec5 100644 --- a/tests/mockserver.py +++ b/tests/mockserver.py @@ -2,8 +2,8 @@ import socket import sys import time -from subprocess import Popen, PIPE from importlib import import_module +from subprocess import PIPE, Popen from twisted.internet import reactor from twisted.web.server import Site @@ -15,19 +15,18 @@ def get_ephemeral_port(): return s.getsockname()[1] -class MockServer(): +class MockServer: def __init__(self, resource, port=None): - self.resource = '{}.{}'.format(resource.__module__, resource.__name__) + self.resource = "{}.{}".format(resource.__module__, resource.__name__) self.proc = None host = socket.gethostbyname(socket.gethostname()) self.port = port or get_ephemeral_port() - self.root_url = 'http://%s:%d' % (host, self.port) + self.root_url = "http://%s:%d" % (host, self.port) def __enter__(self): self.proc = Popen( - [sys.executable, '-u', '-m', 'tests.mockserver', - self.resource, '--port', str(self.port)], - stdout=PIPE) + [sys.executable, "-u", "-m", "tests.mockserver", self.resource, "--port", str(self.port)], stdout=PIPE + ) self.proc.stdout.readline() return self @@ -39,18 +38,17 @@ def __exit__(self, exc_type, exc_value, traceback): def main(): parser = argparse.ArgumentParser() - parser.add_argument('resource') - parser.add_argument('--port', type=int) + parser.add_argument("resource") + parser.add_argument("--port", type=int) args = parser.parse_args() - module_name, name = args.resource.rsplit('.', 1) - sys.path.append('.') + module_name, name = args.resource.rsplit(".", 1) + sys.path.append(".") resource = getattr(import_module(module_name), name)() http_port = reactor.listenTCP(args.port, Site(resource)) def print_listening(): host = http_port.getHost() - print('Mock server {} running at http://{}:{}'.format( - resource, host.host, host.port)) + print("Mock server {} running at http://{}:{}".format(resource, host.host, host.port)) reactor.callWhenRunning(print_listening) reactor.run() diff --git a/tests/po_lib/__init__.py b/tests/po_lib/__init__.py index 287bd7ea..1b25b7d8 100644 --- a/tests/po_lib/__init__.py +++ b/tests/po_lib/__init__.py @@ -2,15 +2,14 @@ This package is just for overrides testing purposes. 
""" import socket -from typing import Dict, Any, Callable +from typing import Any, Callable, Dict from url_matcher import Patterns from url_matcher.util import get_domain -from web_poet import handle_urls, ItemWebPage +from web_poet import ItemWebPage, handle_urls from tests.mockserver import get_ephemeral_port - # Need to define it here since it's always changing DOMAIN = get_domain(socket.gethostbyname(socket.gethostname())) PORT = get_ephemeral_port() diff --git a/tests/test_callback_for.py b/tests/test_callback_for.py index a16990b5..61805b4a 100644 --- a/tests/test_callback_for.py +++ b/tests/test_callback_for.py @@ -1,41 +1,36 @@ -import scrapy import pytest +import scrapy from pytest_twisted import ensureDeferred - from web_poet.pages import ItemPage, ItemWebPage -from scrapy_poet import ( - callback_for, - DummyResponse, -) +from scrapy_poet import DummyResponse, callback_for -class FakeItemPage(ItemPage): +class FakeItemPage(ItemPage): def to_item(self): - return 'fake item page' + return "fake item page" -class FakeItemPageAsync(ItemPage): +class FakeItemPageAsync(ItemPage): async def to_item(self): - return 'fake item page' + return "fake item page" class FakeItemWebPage(ItemWebPage): - def to_item(self): - return 'fake item web page' + return "fake item web page" class MySpider(scrapy.Spider): - name = 'my_spider' + name = "my_spider" parse_item = callback_for(FakeItemPage) parse_web = callback_for(FakeItemWebPage) class MySpiderAsync(scrapy.Spider): - name = 'my_spider_async' + name = "my_spider_async" parse_item = callback_for(FakeItemPageAsync) @@ -45,9 +40,9 @@ def test_callback_for(): assert callable(cb) fake_page = FakeItemPage() - response = DummyResponse('http://example.com/') + response = DummyResponse("http://example.com/") result = cb(response=response, page=fake_page) - assert list(result) == ['fake item page'] + assert list(result) == ["fake item page"] @ensureDeferred @@ -56,30 +51,30 @@ async def test_callback_for_async(): assert callable(cb) fake_page = FakeItemPageAsync() - response = DummyResponse('http://example.com/') + response = DummyResponse("http://example.com/") result = cb(response=response, page=fake_page) - assert await result.__anext__() == 'fake item page' + assert await result.__anext__() == "fake item page" with pytest.raises(StopAsyncIteration): assert await result.__anext__() def test_callback_for_instance_method(): spider = MySpider() - response = DummyResponse('http://example.com/') + response = DummyResponse("http://example.com/") fake_page = FakeItemPage() result = spider.parse_item(response, page=fake_page) - assert list(result) == ['fake item page'] + assert list(result) == ["fake item page"] @ensureDeferred async def test_callback_for_instance_method_async(): spider = MySpiderAsync() - response = DummyResponse('http://example.com/') + response = DummyResponse("http://example.com/") fake_page = FakeItemPageAsync() result = spider.parse_item(response, page=fake_page) - assert await result.__anext__() == 'fake item page' + assert await result.__anext__() == "fake item page" with pytest.raises(StopAsyncIteration): assert await result.__anext__() @@ -87,38 +82,38 @@ async def test_callback_for_instance_method_async(): def test_default_callback(): """Sample request not specifying callback.""" spider = MySpider() - request = scrapy.Request('http://example.com/') + request = scrapy.Request("http://example.com/") request_dict = request.to_dict(spider=spider) assert isinstance(request_dict, dict) - assert request_dict['url'] == 
'http://example.com/' - assert request_dict['callback'] is None + assert request_dict["url"] == "http://example.com/" + assert request_dict["callback"] is None def test_instance_method_callback(): """Sample request specifying spider's instance method callback.""" spider = MySpider() - request = scrapy.Request('http://example.com/', callback=spider.parse_item) + request = scrapy.Request("http://example.com/", callback=spider.parse_item) request_dict = request.to_dict(spider=spider) assert isinstance(request_dict, dict) - assert request_dict['url'] == 'http://example.com/' - assert request_dict['callback'] == 'parse_item' + assert request_dict["url"] == "http://example.com/" + assert request_dict["callback"] == "parse_item" - request = scrapy.Request('http://example.com/', callback=spider.parse_web) + request = scrapy.Request("http://example.com/", callback=spider.parse_web) request_dict = request.to_dict(spider=spider) assert isinstance(request_dict, dict) - assert request_dict['url'] == 'http://example.com/' - assert request_dict['callback'] == 'parse_web' + assert request_dict["url"] == "http://example.com/" + assert request_dict["callback"] == "parse_web" def test_inline_callback(): """Sample request with inline callback.""" spider = MySpider() cb = callback_for(FakeItemPage) - request = scrapy.Request('http://example.com/', callback=cb) + request = scrapy.Request("http://example.com/", callback=cb) with pytest.raises(ValueError) as exc: request.to_dict(spider=spider) - msg = f'Function {cb} is not an instance method in: {spider}' + msg = f"Function {cb} is not an instance method in: {spider}" assert str(exc.value) == msg @@ -126,11 +121,11 @@ def test_inline_callback_async(): """Sample request with inline callback using async callback_for.""" spider = MySpiderAsync() cb = callback_for(FakeItemPageAsync) - request = scrapy.Request('http://example.com/', callback=cb) + request = scrapy.Request("http://example.com/", callback=cb) with pytest.raises(ValueError) as exc: request.to_dict(spider=spider) - msg = f'Function {cb} is not an instance method in: {spider}' + msg = f"Function {cb} is not an instance method in: {spider}" assert str(exc.value) == msg @@ -143,7 +138,7 @@ class MyClass(object): with pytest.raises(TypeError) as exc: callback_for(MyClass) - msg = 'MyClass should be a subclass of ItemPage.' + msg = "MyClass should be a subclass of ItemPage." assert str(exc.value) == msg @@ -156,5 +151,5 @@ class MyClass(ItemPage): with pytest.raises(NotImplementedError) as exc: callback_for(MyClass) - msg = 'MyClass should implement to_item method.' + msg = "MyClass should implement to_item method." 
assert str(exc.value) == msg diff --git a/tests/test_downloader.py b/tests/test_downloader.py index bd74a791..5632d48a 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -1,7 +1,7 @@ -import attr from functools import partial from unittest import mock +import attr import pytest import scrapy import twisted @@ -9,7 +9,6 @@ from pytest_twisted import ensureDeferred, inlineCallbacks from scrapy import Request, Spider from scrapy.exceptions import IgnoreRequest -from tests.utils import AsyncMock from twisted.internet import reactor from twisted.internet.task import deferLater from twisted.web.resource import Resource @@ -21,7 +20,11 @@ from scrapy_poet.downloader import create_scrapy_downloader from scrapy_poet.utils import http_request_to_scrapy_request from tests.utils import ( - crawl_single_item, make_crawler, HtmlResource, MockServer + AsyncMock, + HtmlResource, + MockServer, + crawl_single_item, + make_crawler, ) @@ -55,9 +58,7 @@ def fake_http_response(): async def test_scrapy_poet_downloader(fake_http_response): req = web_poet.HttpRequest("https://example.com") - with mock.patch( - "scrapy_poet.downloader.maybe_deferred_to_future", new_callable=AsyncMock - ) as mock_dtf: + with mock.patch("scrapy_poet.downloader.maybe_deferred_to_future", new_callable=AsyncMock) as mock_dtf: mock_dtf.return_value = fake_http_response @@ -82,9 +83,7 @@ async def test_scrapy_poet_downloader_ignored_request(): standard on additional request error handling.""" req = web_poet.HttpRequest("https://example.com") - with mock.patch( - "scrapy_poet.downloader.maybe_deferred_to_future", new_callable=AsyncMock - ) as mock_dtf: + with mock.patch("scrapy_poet.downloader.maybe_deferred_to_future", new_callable=AsyncMock) as mock_dtf: mock_dtf.side_effect = scrapy.exceptions.IgnoreRequest mock_downloader = mock.MagicMock(return_value=AsyncMock) scrapy_downloader = create_scrapy_downloader(mock_downloader) @@ -97,9 +96,7 @@ async def test_scrapy_poet_downloader_ignored_request(): async def test_scrapy_poet_downloader_twisted_error(): req = web_poet.HttpRequest("https://example.com") - with mock.patch( - "scrapy_poet.downloader.maybe_deferred_to_future", new_callable=AsyncMock - ) as mock_dtf: + with mock.patch("scrapy_poet.downloader.maybe_deferred_to_future", new_callable=AsyncMock) as mock_dtf: mock_dtf.side_effect = twisted.internet.error.TimeoutError mock_downloader = mock.MagicMock(return_value=AsyncMock) scrapy_downloader = create_scrapy_downloader(mock_downloader) @@ -112,9 +109,7 @@ async def test_scrapy_poet_downloader_twisted_error(): async def test_scrapy_poet_downloader_head_redirect(fake_http_response): req = web_poet.HttpRequest("https://example.com", method="HEAD") - with mock.patch( - "scrapy_poet.downloader.maybe_deferred_to_future", new_callable=AsyncMock - ) as mock_dtf: + with mock.patch("scrapy_poet.downloader.maybe_deferred_to_future", new_callable=AsyncMock) as mock_dtf: mock_dtf.return_value = fake_http_response mock_downloader = mock.MagicMock(return_value=AsyncMock) scrapy_downloader = create_scrapy_downloader(mock_downloader) @@ -158,17 +153,17 @@ class ItemPage(ItemWebPage): async def to_item(self): response = await self.http_client.request( server.root_url, - body=b'bar', + body=b"bar", ) - return {'foo': response.body.decode()} + return {"foo": response.body.decode()} class TestSpider(Spider): - name = 'test_spider' + name = "test_spider" start_urls = [server.root_url] custom_settings = { - 'DOWNLOADER_MIDDLEWARES': { - 'scrapy_poet.InjectionMiddleware': 543, + 
"DOWNLOADER_MIDDLEWARES": { + "scrapy_poet.InjectionMiddleware": 543, }, } @@ -179,7 +174,7 @@ async def parse(self, response, page: ItemPage): crawler = make_crawler(TestSpider, {}) yield crawler.crawl() - assert items == [{'foo': 'bar'}] + assert items == [{"foo": "bar"}] class StatusResource(LeafResource): @@ -204,18 +199,18 @@ async def to_item(self): try: await self.http_client.request( server.root_url, - body=b'400', + body=b"400", ) except HttpResponseError: - return {'foo': 'bar'} + return {"foo": "bar"} class TestSpider(Spider): - name = 'test_spider' + name = "test_spider" start_urls = [server.root_url] custom_settings = { - 'DOWNLOADER_MIDDLEWARES': { - 'scrapy_poet.InjectionMiddleware': 543, + "DOWNLOADER_MIDDLEWARES": { + "scrapy_poet.InjectionMiddleware": 543, }, } @@ -226,7 +221,7 @@ async def parse(self, response, page: ItemPage): crawler = make_crawler(TestSpider, {}) yield crawler.crawl() - assert items == [{'foo': 'bar'}] + assert items == [{"foo": "bar"}] class DelayedResource(LeafResource): @@ -250,11 +245,10 @@ def _delayedRender(self, request, seconds): def test_additional_requests_connection_issue(): items = [] - with mock.patch('scrapy_poet.downloader.http_request_to_scrapy_request') \ - as mock_http_request_to_scrapy_request: + with mock.patch("scrapy_poet.downloader.http_request_to_scrapy_request") as mock_http_request_to_scrapy_request: mock_http_request_to_scrapy_request.side_effect = partial( http_request_to_scrapy_request, - meta={'download_timeout': 0.001}, + meta={"download_timeout": 0.001}, ) with MockServer(DelayedResource) as server: @@ -270,15 +264,15 @@ async def to_item(self): body=b"0.002", ) except HttpRequestError: - return {'foo': 'bar'} + return {"foo": "bar"} class TestSpider(Spider): - name = 'test_spider' + name = "test_spider" start_urls = [server.root_url] custom_settings = { - 'DOWNLOADER_MIDDLEWARES': { - 'scrapy_poet.InjectionMiddleware': 543, + "DOWNLOADER_MIDDLEWARES": { + "scrapy_poet.InjectionMiddleware": 543, }, } @@ -289,7 +283,7 @@ async def parse(self, response, page: ItemPage): crawler = make_crawler(TestSpider, {}) yield crawler.crawl() - assert items == [{'foo': 'bar'}] + assert items == [{"foo": "bar"}] @inlineCallbacks @@ -306,25 +300,25 @@ async def to_item(self): try: await self.http_client.request( server.root_url, - body=b'ignore', + body=b"ignore", ) except HttpError as e: - return {'exc': e.__class__} + return {"exc": e.__class__} class TestDownloaderMiddleware: def process_response(self, request, response, spider): - if b'ignore' in response.body: + if b"ignore" in response.body: raise IgnoreRequest return response class TestSpider(Spider): - name = 'test_spider' + name = "test_spider" start_urls = [server.root_url] custom_settings = { - 'DOWNLOADER_MIDDLEWARES': { + "DOWNLOADER_MIDDLEWARES": { TestDownloaderMiddleware: 1, - 'scrapy_poet.InjectionMiddleware': 543, + "scrapy_poet.InjectionMiddleware": 543, }, } @@ -335,7 +329,7 @@ async def parse(self, response, page: ItemPage): crawler = make_crawler(TestSpider, {}) yield crawler.crawl() - assert items == [{'exc': HttpError}] + assert items == [{"exc": HttpError}] @pytest.mark.xfail( @@ -363,25 +357,25 @@ async def to_item(self): try: await self.http_client.request( server.root_url, - body=b'raise', + body=b"raise", ) except HttpError as e: - return {'exc': e.__class__} + return {"exc": e.__class__} class TestDownloaderMiddleware: def process_response(self, request, response, spider): - if b'raise' in response.body: + if b"raise" in response.body: raise RuntimeError 
return response class TestSpider(Spider): - name = 'test_spider' + name = "test_spider" start_urls = [server.root_url] custom_settings = { - 'DOWNLOADER_MIDDLEWARES': { + "DOWNLOADER_MIDDLEWARES": { TestDownloaderMiddleware: 1, - 'scrapy_poet.InjectionMiddleware': 543, + "scrapy_poet.InjectionMiddleware": 543, }, } @@ -392,7 +386,7 @@ async def parse(self, response, page: ItemPage): crawler = make_crawler(TestSpider, {}) yield crawler.crawl() - assert items == [{'exc': HttpError}] + assert items == [{"exc": HttpError}] @inlineCallbacks @@ -416,26 +410,26 @@ class ItemPage(ItemWebPage): async def to_item(self): response1 = await self.http_client.request( server.root_url, - body=b'a', + body=b"a", ) response2 = await self.http_client.request( server.root_url, - body=b'a', + body=b"a", ) return {response1.body.decode(): response2.body.decode()} class TestSpider(Spider): - name = 'test_spider' + name = "test_spider" custom_settings = { - 'DOWNLOADER_MIDDLEWARES': { - 'scrapy_poet.InjectionMiddleware': 543, + "DOWNLOADER_MIDDLEWARES": { + "scrapy_poet.InjectionMiddleware": 543, }, } def start_requests(self): - yield Request(server.root_url, body=b'a') - yield Request(server.root_url, body=b'a') + yield Request(server.root_url, body=b"a") + yield Request(server.root_url, body=b"a") async def parse(self, response, page: ItemPage): item = await page.to_item() @@ -444,4 +438,4 @@ async def parse(self, response, page: ItemPage): crawler = make_crawler(TestSpider, {}) yield crawler.crawl() - assert items == [{'a': 'a'}] + assert items == [{"a": "a"}] diff --git a/tests/test_injection.py b/tests/test_injection.py index 6871b1b6..47c89999 100644 --- a/tests/test_injection.py +++ b/tests/test_injection.py @@ -1,28 +1,37 @@ -from typing import Any, Sequence, Set, Callable +import weakref +from typing import Any, Callable, Sequence, Set import attr +import parsel import pytest from pytest_twisted import inlineCallbacks -import weakref - -import parsel from scrapy import Request from scrapy.http import Response from url_matcher import Patterns - from url_matcher.util import get_domain - -from scrapy_poet import CacheDataProviderMixin, HttpResponseProvider, PageObjectInputProvider, \ - DummyResponse -from scrapy_poet.injection import check_all_providers_are_callable, is_class_provided_by_any_provider_fn, \ - get_injector_for_testing, get_response_for_testing -from scrapy_poet.injection_errors import NonCallableProviderError, \ - InjectionError, UndeclaredProvidedTypeError -from scrapy_poet.overrides import OverridesRegistry from web_poet import Injectable, ItemPage from web_poet.mixins import ResponseShortcutsMixin from web_poet.overrides import OverrideRule +from scrapy_poet import ( + CacheDataProviderMixin, + DummyResponse, + HttpResponseProvider, + PageObjectInputProvider, +) +from scrapy_poet.injection import ( + check_all_providers_are_callable, + get_injector_for_testing, + get_response_for_testing, + is_class_provided_by_any_provider_fn, +) +from scrapy_poet.injection_errors import ( + InjectionError, + NonCallableProviderError, + UndeclaredProvidedTypeError, +) +from scrapy_poet.overrides import OverridesRegistry + def get_provider(classes, content=None): class Provider(PageObjectInputProvider): @@ -76,10 +85,7 @@ def get_providers_for_testing(): prov1 = get_provider_requiring_response({ClsReqResponse}) prov2 = get_provider({Cls1, Cls2}) # Duplicating them because they should work even in this situation - return {prov1: 1, - prov2: 2, - prov1: 3, - prov2: 4} + return {prov1: 1, prov2: 2, prov1: 3, 
prov2: 4} @pytest.fixture @@ -98,7 +104,6 @@ class WrapCls(Injectable): class TestInjector: - def test_constructor(self): injector = get_injector_for_testing(get_providers_for_testing()) assert injector.is_class_provided_by_any_provider(ClsReqResponse) @@ -106,8 +111,7 @@ def test_constructor(self): assert not injector.is_class_provided_by_any_provider(ClsNoProvided) for provider in injector.providers: - assert (injector.is_provider_requiring_scrapy_response[provider] == - provider.require_response) + assert injector.is_provider_requiring_scrapy_response[provider] == provider.require_response # Asserting that we are not leaking providers references weak_ref = weakref.ref(injector.providers[0]) @@ -117,6 +121,7 @@ def test_constructor(self): def test_non_callable_provider_error(self): """Checks that a exception is raised when a provider is not callable""" + class NonCallableProvider(PageObjectInputProvider): pass @@ -151,7 +156,6 @@ def callback_3(a: ClsNoProvided, b: WrapCls): assert set(map(type, discover_fn(callback_3))) == {providers_list[0]} def test_is_scrapy_response_required(self, injector): - def callback_no_1(response: DummyResponse, a: Cls1): pass @@ -172,12 +176,7 @@ def callback_yes_2(response: DummyResponse, a: ClsReqResponse): @inlineCallbacks def test_build_instances_methods(self, injector): - - def callback(response: DummyResponse, - a: Cls1, - b: Cls2, - c: WrapCls, - d: ClsNoProviderRequired): + def callback(response: DummyResponse, a: Cls1, b: Cls2, c: WrapCls, d: ClsNoProviderRequired): pass response = get_response_for_testing(callback) @@ -189,11 +188,10 @@ def callback(response: DummyResponse, Cls2: Cls2(), WrapCls: WrapCls(ClsReqResponse()), ClsReqResponse: ClsReqResponse(), - ClsNoProviderRequired: ClsNoProviderRequired() + ClsNoProviderRequired: ClsNoProviderRequired(), } - instances = yield from injector.build_instances_from_providers( - request, response, plan) + instances = yield from injector.build_instances_from_providers(request, response, plan) assert instances == { Cls1: Cls1(), Cls2: Cls2(), @@ -202,7 +200,6 @@ def callback(response: DummyResponse, @inlineCallbacks def test_build_instances_from_providers_unexpected_return(self): - class WrongProvider(get_provider({Cls1})): def __call__(self, to_provide): return super().__call__(to_provide) + [Cls2()] @@ -215,21 +212,22 @@ def callback(response: DummyResponse, a: Cls1): response = get_response_for_testing(callback) plan = injector.build_plan(response.request) with pytest.raises(UndeclaredProvidedTypeError) as exinf: - yield from injector.build_instances_from_providers( - response.request, response, plan) + yield from injector.build_instances_from_providers(response.request, response, plan) assert "Provider" in str(exinf.value) assert "Cls2" in str(exinf.value) assert "Cls1" in str(exinf.value) - @pytest.mark.parametrize("str_list", [ - ["1", "2", "3"], - ["3", "2", "1"], - ["1", "3", "2"], - ]) + @pytest.mark.parametrize( + "str_list", + [ + ["1", "2", "3"], + ["3", "2", "1"], + ["1", "3", "2"], + ], + ) @inlineCallbacks - def test_build_instances_from_providers_respect_priorities( - self, str_list): + def test_build_instances_from_providers_respect_priorities(self, str_list): providers = {get_provider({str}, text): int(text) for text in str_list} injector = get_injector_for_testing(providers) @@ -238,30 +236,19 @@ def callback(response: DummyResponse, arg: str): response = get_response_for_testing(callback) plan = injector.build_plan(response.request) - instances = yield from 
injector.build_instances_from_providers( - response.request, response, plan) + instances = yield from injector.build_instances_from_providers(response.request, response, plan) assert instances[str] == min(str_list) @inlineCallbacks def test_build_callback_dependencies(self, injector): - def callback(response: DummyResponse, - a: Cls1, - b: Cls2, - c: WrapCls, - d: ClsNoProviderRequired): + def callback(response: DummyResponse, a: Cls1, b: Cls2, c: WrapCls, d: ClsNoProviderRequired): pass response = get_response_for_testing(callback) - kwargs = yield from injector.build_callback_dependencies( - response.request, response) + kwargs = yield from injector.build_callback_dependencies(response.request, response) kwargs_types = {key: type(value) for key, value in kwargs.items()} - assert kwargs_types == { - "a": Cls1, - "b": Cls2, - "c": WrapCls, - "d": ClsNoProviderRequired - } + assert kwargs_types == {"a": Cls1, "b": Cls2, "c": WrapCls, "d": ClsNoProviderRequired} class Html(Injectable): @@ -302,7 +289,6 @@ def to_item(self): class TestInjectorOverrides: - @pytest.mark.parametrize("override_should_happen", [True, False]) @inlineCallbacks def test_overrides(self, providers, override_should_happen): @@ -311,18 +297,16 @@ def test_overrides(self, providers, override_should_happen): # when we configure them for domain other-example.com overrides = [ (domain, PriceInDollarsPO, PricePO), - OverrideRule(Patterns([domain]), use=OtherEurDollarRate, instead_of=EurDollarRate) + OverrideRule(Patterns([domain]), use=OtherEurDollarRate, instead_of=EurDollarRate), ] registry = OverridesRegistry(overrides) - injector = get_injector_for_testing(providers, - overrides_registry=registry) + injector = get_injector_for_testing(providers, overrides_registry=registry) def callback(response: DummyResponse, price_po: PricePO, rate_po: EurDollarRate): pass response = get_response_for_testing(callback) - kwargs = yield from injector.build_callback_dependencies( - response.request, response) + kwargs = yield from injector.build_callback_dependencies(response.request, response) kwargs_types = {key: type(value) for key, value in kwargs.items()} price_po = kwargs["price_po"] item = price_po.to_item() @@ -347,17 +331,18 @@ def test_load_provider_classes(): def test_check_all_providers_are_callable(): check_all_providers_are_callable([HttpResponseProvider(None)]) with pytest.raises(NonCallableProviderError) as exinf: - check_all_providers_are_callable([PageObjectInputProvider(None), - HttpResponseProvider(None)]) + check_all_providers_are_callable([PageObjectInputProvider(None), HttpResponseProvider(None)]) assert "PageObjectInputProvider" in str(exinf.value) assert "not callable" in str(exinf.value) + def test_is_class_provided_by_any_provider_fn(): - providers = [get_provider({str}), - get_provider(lambda x: issubclass(x, InjectionError)), - get_provider(frozenset({int, float})), - ] + providers = [ + get_provider({str}), + get_provider(lambda x: issubclass(x, InjectionError)), + get_provider(frozenset({int, float})), + ] is_provided = is_class_provided_by_any_provider_fn(providers) is_provided_empty = is_class_provided_by_any_provider_fn([]) @@ -425,8 +410,7 @@ def validate_instances(instances): if cache.exists(): print(f"Cache file {cache} already exists. Weird. 
Deleting") cache.unlink() - settings = {"SCRAPY_POET_CACHE": cache, - "SCRAPY_POET_CACHE_ERRORS": cache_errors} + settings = {"SCRAPY_POET_CACHE": cache, "SCRAPY_POET_CACHE_ERRORS": cache_errors} injector = get_injector_for_testing(providers, settings) assert cache.exists() @@ -435,8 +419,7 @@ def callback(response: DummyResponse, arg_str: str, arg_int: int, arg_float: flo response = get_response_for_testing(callback) plan = injector.build_plan(response.request) - instances = yield from injector.build_instances_from_providers( - response.request, response, plan) + instances = yield from injector.build_instances_from_providers(response.request, response, plan) validate_instances(instances) @@ -446,8 +429,7 @@ def callback(response: DummyResponse, arg_str: str, arg_int: int, arg_float: flo response.request = Request.replace(response.request, url="http://willfail.page") with pytest.raises(ValueError): plan = injector.build_plan(response.request) - instances = yield from injector.build_instances_from_providers( - response.request, response, plan) + instances = yield from injector.build_instances_from_providers(response.request, response, plan) # Different providers. They return a different result, but the cache data should prevail. providers = { @@ -458,8 +440,7 @@ def callback(response: DummyResponse, arg_str: str, arg_int: int, arg_float: flo response = get_response_for_testing(callback) plan = injector.build_plan(response.request) - instances = yield from injector.build_instances_from_providers( - response.request, response, plan) + instances = yield from injector.build_instances_from_providers(response.request, response, plan) validate_instances(instances) @@ -468,5 +449,4 @@ def callback(response: DummyResponse, arg_str: str, arg_int: int, arg_float: flo response.request = Request.replace(response.request, url="http://willfail.page") with pytest.raises(Error): plan = injector.build_plan(response.request) - instances = yield from injector.build_instances_from_providers( - response.request, response, plan) + instances = yield from injector.build_instances_from_providers(response.request, response, plan) diff --git a/tests/test_middleware.py b/tests/test_middleware.py index 12f6ea1f..c7d16452 100644 --- a/tests/test_middleware.py +++ b/tests/test_middleware.py @@ -1,36 +1,26 @@ import socket - -from scrapy.utils.log import configure_logging -from twisted.internet.threads import deferToThread -from typing import Optional, Union, Type +from typing import Optional, Type, Union from unittest import mock +import attr import pytest import scrapy +from pytest_twisted import inlineCallbacks from scrapy import Request, Spider from scrapy.http import Response +from scrapy.utils.log import configure_logging from scrapy.utils.test import get_crawler -from pytest_twisted import inlineCallbacks - -import attr - -from scrapy_poet import callback_for +from twisted.internet.threads import deferToThread from url_matcher.util import get_domain - -from tests.mockserver import get_ephemeral_port -from scrapy_poet import InjectionMiddleware -from web_poet.pages import WebPage, ItemPage, ItemWebPage -from scrapy_poet.cache import SqlitedictCache -from scrapy_poet.page_input_providers import ( - PageObjectInputProvider -) from web_poet import default_registry from web_poet.page_inputs import HttpResponse, RequestUrl -from scrapy_poet import DummyResponse -from tests.utils import (HtmlResource, - crawl_items, - capture_exceptions, - crawl_single_item) +from web_poet.pages import ItemPage, ItemWebPage, WebPage + 
+from scrapy_poet import DummyResponse, InjectionMiddleware, callback_for +from scrapy_poet.cache import SqlitedictCache +from scrapy_poet.page_input_providers import PageObjectInputProvider +from tests.mockserver import get_ephemeral_port +from tests.utils import HtmlResource, capture_exceptions, crawl_items, crawl_single_item class ProductHtml(HtmlResource): @@ -48,7 +38,6 @@ class ProductHtml(HtmlResource): def spider_for(injectable: Type): - class InjectableSpider(scrapy.Spider): url = None @@ -69,8 +58,7 @@ def start_requests(self): @attr.s(auto_attribs=True) class BreadcrumbsExtraction(WebPage): def get(self): - return {a.css('::text').get(): a.attrib['href'] - for a in self.css(".breadcrumbs a")} + return {a.css("::text").get(): a.attrib["href"] for a in self.css(".breadcrumbs a")} @attr.s(auto_attribs=True) @@ -79,11 +67,11 @@ class ProductPage(ItemWebPage): def to_item(self): return { - 'url': self.url, - 'name': self.css(".name::text").get(), - 'price': self.xpath('//*[@class="price"]/text()').get(), - 'description': self.css(".description::text").get(), - 'category': " / ".join(self.breadcrumbs.get().keys()), + "url": self.url, + "name": self.css(".name::text").get(), + "price": self.xpath('//*[@class="price"]/text()').get(), + "description": self.css(".description::text").get(), + "category": " / ".join(self.breadcrumbs.get().keys()), } @@ -95,14 +83,13 @@ def get(self): @inlineCallbacks def test_basic_case(settings): - item, url, _ = yield crawl_single_item(spider_for(ProductPage), - ProductHtml, settings) + item, url, _ = yield crawl_single_item(spider_for(ProductPage), ProductHtml, settings) assert item == { - 'url': url, - 'name': 'Chocolate', - 'price': '22€', - 'description': 'The best chocolate ever', - 'category': 'Food / Sweets', + "url": url, + "name": "Chocolate", + "price": "22€", + "description": "The best chocolate ever", + "category": "Food / Sweets", } @@ -111,17 +98,14 @@ def test_overrides(settings): host = socket.gethostbyname(socket.gethostname()) domain = get_domain(host) port = get_ephemeral_port() - settings["SCRAPY_POET_OVERRIDES"] = [ - (f"{domain}:{port}", OverridenBreadcrumbsExtraction, BreadcrumbsExtraction) - ] - item, url, _ = yield crawl_single_item(spider_for(ProductPage), - ProductHtml, settings, port=port) + settings["SCRAPY_POET_OVERRIDES"] = [(f"{domain}:{port}", OverridenBreadcrumbsExtraction, BreadcrumbsExtraction)] + item, url, _ = yield crawl_single_item(spider_for(ProductPage), ProductHtml, settings, port=port) assert item == { - 'url': url, - 'name': 'Chocolate', - 'price': '22€', - 'description': 'The best chocolate ever', - 'category': 'overriden_breadcrumb', + "url": url, + "name": "Chocolate", + "price": "22€", + "description": "The best chocolate ever", + "category": "overriden_breadcrumb", } @@ -142,15 +126,14 @@ def to_item(self): @inlineCallbacks def test_optional_and_unions(settings): - item, _, _ = yield crawl_single_item(spider_for(OptionalAndUnionPage), - ProductHtml, settings) - assert item['breadcrumbs'].response is item['response'] - assert item['opt_check_1'] is item['breadcrumbs'] - assert item['opt_check_2'] is None - assert item['union_check_1'] is item['breadcrumbs'] - assert item['union_check_2'] is item['breadcrumbs'].response - assert item['union_check_3'] is None - assert item['union_check_5'] is item['breadcrumbs'] + item, _, _ = yield crawl_single_item(spider_for(OptionalAndUnionPage), ProductHtml, settings) + assert item["breadcrumbs"].response is item["response"] + assert item["opt_check_1"] is 
item["breadcrumbs"] + assert item["opt_check_2"] is None + assert item["union_check_1"] is item["breadcrumbs"] + assert item["union_check_2"] is item["breadcrumbs"].response + assert item["union_check_3"] is None + assert item["union_check_5"] is item["breadcrumbs"] @attr.s(auto_attribs=True) @@ -214,6 +197,7 @@ class ProvidedWithDeferredPage(ItemWebPage): def to_item(self): return attr.asdict(self, recurse=False) + @attr.s(auto_attribs=True) class ProvidedWithFuturesPage(ProvidedWithDeferredPage): provided: ProvidedWithFutures @@ -222,10 +206,9 @@ class ProvidedWithFuturesPage(ProvidedWithDeferredPage): @pytest.mark.parametrize("type_", [ProvidedWithDeferredPage, ProvidedWithFuturesPage]) @inlineCallbacks def test_providers(settings, type_): - item, _, _ = yield crawl_single_item(spider_for(type_), - ProductHtml, settings) - assert item['provided'].msg == "Provided 5!" - assert item['provided'].response is None + item, _, _ = yield crawl_single_item(spider_for(type_), ProductHtml, settings) + assert item["provided"].msg == "Provided 5!" + assert item["provided"].response is None @inlineCallbacks @@ -234,41 +217,40 @@ def test_providers_returning_wrong_classes(settings): returns instances of classes that they're not supposed to provide. """ with pytest.raises(AssertionError): - yield crawl_single_item( - spider_for(ExtraClassData), ProductHtml, settings - ) + yield crawl_single_item(spider_for(ExtraClassData), ProductHtml, settings) class MultiArgsCallbackSpider(scrapy.Spider): url = None - custom_settings = { - "SCRAPY_POET_PROVIDERS": { - WithDeferredProvider: 1 - } - } + custom_settings = {"SCRAPY_POET_PROVIDERS": {WithDeferredProvider: 1}} def start_requests(self): yield Request(self.url, self.parse, cb_kwargs=dict(cb_arg="arg!")) - def parse(self, response, product: ProductPage, provided: ProvidedWithDeferred, - cb_arg: Optional[str], non_cb_arg: Optional[str]): + def parse( + self, + response, + product: ProductPage, + provided: ProvidedWithDeferred, + cb_arg: Optional[str], + non_cb_arg: Optional[str], + ): yield { - 'product': product, - 'provided': provided, - 'cb_arg': cb_arg, - 'non_cb_arg': non_cb_arg, + "product": product, + "provided": provided, + "cb_arg": cb_arg, + "non_cb_arg": non_cb_arg, } @inlineCallbacks def test_multi_args_callbacks(settings): - item, _, _ = yield crawl_single_item(MultiArgsCallbackSpider, ProductHtml, - settings) - assert type(item['product']) == ProductPage - assert type(item['provided']) == ProvidedWithDeferred - assert item['cb_arg'] == "arg!" - assert item['non_cb_arg'] is None + item, _, _ = yield crawl_single_item(MultiArgsCallbackSpider, ProductHtml, settings) + assert type(item["product"]) == ProductPage + assert type(item["provided"]) == ProvidedWithDeferred + assert item["cb_arg"] == "arg!" 
+ assert item["non_cb_arg"] is None @attr.s(auto_attribs=True) @@ -279,8 +261,7 @@ class UnressolvableProductPage(ProductPage): @inlineCallbacks def test_injection_failure(settings): configure_logging(settings) - items, url, crawler = yield crawl_items( - spider_for(UnressolvableProductPage), ProductHtml, settings) + items, url, crawler = yield crawl_items(spider_for(UnressolvableProductPage), ProductHtml, settings) assert items == [] @@ -293,7 +274,7 @@ def start_requests(self): def parse(self, response): return { - 'response': response, + "response": response, } @@ -306,27 +287,25 @@ def start_requests(self): def parse(self, response: DummyResponse): return { - 'response': response, + "response": response, } @inlineCallbacks def test_skip_downloads(settings): - item, url, crawler = yield crawl_single_item( - MySpider, ProductHtml, settings) - assert isinstance(item['response'], Response) is True - assert isinstance(item['response'], DummyResponse) is False - assert crawler.stats.get_stats().get('downloader/request_count', 0) == 1 - assert crawler.stats.get_stats().get('scrapy_poet/dummy_response_count', 0) == 0 - assert crawler.stats.get_stats().get('downloader/response_count', 0) == 1 - - item, url, crawler = yield crawl_single_item( - SkipDownloadSpider, ProductHtml, settings) - assert isinstance(item['response'], Response) is True - assert isinstance(item['response'], DummyResponse) is True - assert crawler.stats.get_stats().get('downloader/request_count', 0) == 0 - assert crawler.stats.get_stats().get('scrapy_poet/dummy_response_count', 0) == 1 - assert crawler.stats.get_stats().get('downloader/response_count', 0) == 1 + item, url, crawler = yield crawl_single_item(MySpider, ProductHtml, settings) + assert isinstance(item["response"], Response) is True + assert isinstance(item["response"], DummyResponse) is False + assert crawler.stats.get_stats().get("downloader/request_count", 0) == 1 + assert crawler.stats.get_stats().get("scrapy_poet/dummy_response_count", 0) == 0 + assert crawler.stats.get_stats().get("downloader/response_count", 0) == 1 + + item, url, crawler = yield crawl_single_item(SkipDownloadSpider, ProductHtml, settings) + assert isinstance(item["response"], Response) is True + assert isinstance(item["response"], DummyResponse) is True + assert crawler.stats.get_stats().get("downloader/request_count", 0) == 0 + assert crawler.stats.get_stats().get("scrapy_poet/dummy_response_count", 0) == 1 + assert crawler.stats.get_stats().get("downloader/response_count", 0) == 1 class RequestUrlSpider(scrapy.Spider): @@ -337,22 +316,21 @@ def start_requests(self): def parse(self, response: DummyResponse, url: RequestUrl): return { - 'response': response, - 'url': url, + "response": response, + "url": url, } @inlineCallbacks def test_skip_download_request_url(settings): - item, url, crawler = yield crawl_single_item( - RequestUrlSpider, ProductHtml, settings) - assert isinstance(item['response'], Response) is True - assert isinstance(item['response'], DummyResponse) is True - assert isinstance(item['url'], RequestUrl) - assert str(item['url']) == url - assert crawler.stats.get_stats().get('downloader/request_count', 0) == 0 - assert crawler.stats.get_stats().get('scrapy_poet/dummy_response_count', 0) == 1 - assert crawler.stats.get_stats().get('downloader/response_count', 0) == 1 + item, url, crawler = yield crawl_single_item(RequestUrlSpider, ProductHtml, settings) + assert isinstance(item["response"], Response) is True + assert isinstance(item["response"], DummyResponse) is True + 
assert isinstance(item["url"], RequestUrl) + assert str(item["url"]) == url + assert crawler.stats.get_stats().get("downloader/request_count", 0) == 0 + assert crawler.stats.get_stats().get("scrapy_poet/dummy_response_count", 0) == 1 + assert crawler.stats.get_stats().get("downloader/response_count", 0) == 1 @attr.s(auto_attribs=True) @@ -360,7 +338,7 @@ class RequestUrlPage(ItemPage): url: RequestUrl def to_item(self): - return {'url': self.url} + return {"url": self.url} class RequestUrlPageSpider(scrapy.Spider): @@ -375,20 +353,19 @@ def parse(self, response: DummyResponse, page: RequestUrlPage): @inlineCallbacks def test_skip_download_request_url_page(settings): - item, url, crawler = yield crawl_single_item( - RequestUrlPageSpider, ProductHtml, settings) - assert tuple(item.keys()) == ('url',) - assert str(item['url']) == url - assert crawler.stats.get_stats().get('downloader/request_count', 0) == 0 - assert crawler.stats.get_stats().get('scrapy_poet/dummy_response_count', 0) == 1 - assert crawler.stats.get_stats().get('downloader/response_count', 0) == 1 + item, url, crawler = yield crawl_single_item(RequestUrlPageSpider, ProductHtml, settings) + assert tuple(item.keys()) == ("url",) + assert str(item["url"]) == url + assert crawler.stats.get_stats().get("downloader/request_count", 0) == 0 + assert crawler.stats.get_stats().get("scrapy_poet/dummy_response_count", 0) == 1 + assert crawler.stats.get_stats().get("downloader/response_count", 0) == 1 @mock.patch("scrapy_poet.injection.SqlitedictCache", spec=SqlitedictCache) def test_cache_closed_on_spider_close(mock_sqlitedictcache, settings): def get_middleware(settings): crawler = get_crawler(Spider, settings) - crawler.spider = crawler._create_spider('example.com') + crawler.spider = crawler._create_spider("example.com") return InjectionMiddleware(crawler) mock_sqlitedictcache_instance = mock_sqlitedictcache.return_value = mock.Mock() @@ -404,10 +381,7 @@ def get_middleware(settings): spider = has_cache_middleware.crawler.spider has_cache_middleware.spider_closed(spider) - assert mock_sqlitedictcache.mock_calls == [ - mock.call('/tmp/cache', compressed=True), - mock.call().close() - ] + assert mock_sqlitedictcache.mock_calls == [mock.call("/tmp/cache", compressed=True), mock.call().close()] @inlineCallbacks @@ -430,7 +404,5 @@ def test_web_poet_integration(settings): # Converting it to a set removes potential duplicate OverrideRules settings["SCRAPY_POET_OVERRIDES"] = set(rules) - item, url, _ = yield crawl_single_item( - spider_for(POOverriden), ProductHtml, settings, port=PORT - ) + item, url, _ = yield crawl_single_item(spider_for(POOverriden), ProductHtml, settings, port=PORT) assert item == {"msg": "PO replacement"} diff --git a/tests/test_page_input_providers.py b/tests/test_page_input_providers.py index 5e3e67ff..1dd3387e 100644 --- a/tests/test_page_input_providers.py +++ b/tests/test_page_input_providers.py @@ -5,7 +5,6 @@ class TestProvider: - def test_is_provided_on_malformed_provided_classes(self): class Provider(PageObjectInputProvider): provided_classes = [str] diff --git a/tests/test_providers.py b/tests/test_providers.py index 5907810a..5e992b67 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -1,26 +1,26 @@ -from typing import Any, List, Set, Callable, Sequence +import json +from typing import Any, Callable, List, Sequence, Set from unittest import mock import attr -import json import pytest -from pytest_twisted import ensureDeferred, inlineCallbacks -from scrapy_poet import HttpResponseProvider 
-from twisted.python.failure import Failure - import scrapy +from pytest_twisted import ensureDeferred, inlineCallbacks from scrapy import Request, Spider from scrapy.crawler import Crawler from scrapy.settings import Settings from scrapy.utils.test import get_crawler +from twisted.python.failure import Failure +from web_poet import HttpClient, HttpResponse + +from scrapy_poet import HttpResponseProvider from scrapy_poet.page_input_providers import ( CacheDataProviderMixin, HttpClientProvider, PageObjectInputProvider, PageParamsProvider, ) -from tests.utils import crawl_single_item, HtmlResource, AsyncMock -from web_poet import HttpResponse, HttpClient +from tests.utils import AsyncMock, HtmlResource, crawl_single_item class ProductHtml(HtmlResource): @@ -69,9 +69,7 @@ def __init__(self, crawler: Crawler): assert isinstance(crawler, Crawler) super().__init__(crawler) - def __call__( - self, to_provide, response: scrapy.http.Response, spider: scrapy.Spider - ): + def __call__(self, to_provide, response: scrapy.http.Response, spider: scrapy.Spider): assert isinstance(spider, scrapy.Spider) ret: List[Any] = [] if Price in to_provide: @@ -162,9 +160,7 @@ class NameFirstMultiProviderSpider(PriceFirstMultiProviderSpider): def test_name_first_spider(settings, tmp_path): cache = tmp_path / "cache.sqlite3" settings.set("SCRAPY_POET_CACHE", str(cache)) - item, _, _ = yield crawl_single_item( - NameFirstMultiProviderSpider, ProductHtml, settings - ) + item, _, _ = yield crawl_single_item(NameFirstMultiProviderSpider, ProductHtml, settings) assert cache.exists() assert item == { Price: Price("22€"), @@ -175,9 +171,7 @@ def test_name_first_spider(settings, tmp_path): # Let's see that the cache is working. We use a different and wrong resource, # but it should be ignored by the cached version used - item, _, _ = yield crawl_single_item( - NameFirstMultiProviderSpider, NonProductHtml, settings - ) + item, _, _ = yield crawl_single_item(NameFirstMultiProviderSpider, NonProductHtml, settings) assert item == { Price: Price("22€"), Name: Name("Chocolate"), @@ -188,9 +182,7 @@ def test_name_first_spider(settings, tmp_path): @inlineCallbacks def test_price_first_spider(settings): - item, _, _ = yield crawl_single_item( - PriceFirstMultiProviderSpider, ProductHtml, settings - ) + item, _, _ = yield crawl_single_item(PriceFirstMultiProviderSpider, ProductHtml, settings) assert item == { Price: Price("22€"), Name: Name("Chocolate"), @@ -214,15 +206,14 @@ async def test_http_client_provider(settings): crawler = get_crawler(Spider, settings) crawler.engine = AsyncMock() - with mock.patch( - "scrapy_poet.page_input_providers.create_scrapy_downloader" - ) as mock_factory: + with mock.patch("scrapy_poet.page_input_providers.create_scrapy_downloader") as mock_factory: provider = HttpClientProvider(crawler) results = provider(set(), crawler) assert isinstance(results[0], HttpClient) results[0]._request_downloader == mock_factory.return_value + def test_page_params_provider(settings): crawler = get_crawler(Spider, settings) provider = PageParamsProvider(crawler) diff --git a/tests/test_response_required_logic.py b/tests/test_response_required_logic.py index 846d8099..874a988a 100644 --- a/tests/test_response_required_logic.py +++ b/tests/test_response_required_logic.py @@ -1,24 +1,23 @@ -import attr from typing import Any, Dict -from pytest_twisted import inlineCallbacks - +import attr import scrapy +from pytest_twisted import inlineCallbacks from scrapy.crawler import Crawler -from scrapy.http import TextResponse, 
HtmlResponse +from scrapy.http import HtmlResponse, TextResponse from scrapy.settings import Settings -from scrapy_poet.injection import Injector, get_callback, \ - is_callback_requiring_scrapy_response, is_provider_requiring_scrapy_response +from web_poet import ItemPage, WebPage +from scrapy_poet import DummyResponse, callback_for +from scrapy_poet.injection import ( + Injector, + get_callback, + is_callback_requiring_scrapy_response, + is_provider_requiring_scrapy_response, +) from scrapy_poet.page_input_providers import ( - PageObjectInputProvider, HttpResponseProvider, -) -from web_poet import ItemPage, WebPage - -from scrapy_poet import ( - callback_for, - DummyResponse, + PageObjectInputProvider, ) @@ -40,9 +39,9 @@ class DummyProductProvider(PageObjectInputProvider): def __call__(self, to_provide, request: scrapy.Request): data = { - 'product': { - 'url': request.url, - 'name': 'Sample', + "product": { + "url": request.url, + "name": "Sample", }, } return [DummyProductResponse(data=data)] @@ -54,9 +53,9 @@ class FakeProductProvider(PageObjectInputProvider): def __call__(self, to_provide): data = { - 'product': { - 'url': 'http://example.com/sample', - 'name': 'Sample', + "product": { + "url": "http://example.com/sample", + "name": "Sample", }, } return [FakeProductResponse(data=data)] @@ -71,7 +70,6 @@ def __call__(self, to_provide, response: TextResponse): class StringProductProvider(HttpResponseProvider): - def __call__(self, to_provide, response: str): return super().__call__(to_provide, response) @@ -83,10 +81,10 @@ class DummyProductPage(ItemPage): @property def url(self): - return self.response.data['product']['url'] + return self.response.data["product"]["url"] def to_item(self): - product = self.response.data['product'] + product = self.response.data["product"] return product @@ -97,22 +95,21 @@ class FakeProductPage(ItemPage): @property def url(self): - return self.response.data['product']['url'] + return self.response.data["product"]["url"] def to_item(self): - product = self.response.data['product'] + product = self.response.data["product"] return product class BookPage(WebPage): - def to_item(self): pass class MySpider(scrapy.Spider): - name = 'foo' + name = "foo" custom_settings = { "SCRAPY_POET_PROVIDERS": { HttpResponseProvider: 1, diff --git a/tests/test_scrapy_dependencies.py b/tests/test_scrapy_dependencies.py index 3ccd34c9..6e5db371 100644 --- a/tests/test_scrapy_dependencies.py +++ b/tests/test_scrapy_dependencies.py @@ -7,11 +7,10 @@ from scrapy_poet.injection import SCRAPY_PROVIDED_CLASSES from scrapy_poet.page_input_providers import ( - PageObjectInputProvider, HttpResponseProvider, + PageObjectInputProvider, ) - -from tests.utils import crawl_items, crawl_single_item, HtmlResource +from tests.utils import HtmlResource, crawl_items, crawl_single_item class ProductHtml(HtmlResource): @@ -30,7 +29,7 @@ class ProductHtml(HtmlResource): @inlineCallbacks -@pytest.mark.parametrize('scrapy_class', SCRAPY_PROVIDED_CLASSES) +@pytest.mark.parametrize("scrapy_class", SCRAPY_PROVIDED_CLASSES) def test_scrapy_dependencies_on_providers(scrapy_class, settings): """Scrapy dependencies should be injected into Providers.""" @@ -72,13 +71,12 @@ def start_requests(self): def parse(self, response, page: Page): return page.to_item() - item, url, crawler = yield crawl_single_item( - MySpider, ProductHtml, settings) + item, url, crawler = yield crawl_single_item(MySpider, ProductHtml, settings) assert item["scrapy_class"] == scrapy_class.__name__ @inlineCallbacks 
-@pytest.mark.parametrize('scrapy_class', SCRAPY_PROVIDED_CLASSES) +@pytest.mark.parametrize("scrapy_class", SCRAPY_PROVIDED_CLASSES) def test_scrapy_dependencies_on_page_objects(scrapy_class, settings): """Scrapy dependencies should not be injected into Page Objects.""" @@ -103,6 +101,5 @@ def start_requests(self): def parse(self, response, page: Page): return page.to_item() - items, url, crawler = yield crawl_items( - MySpider, ProductHtml, settings) + items, url, crawler = yield crawl_items(MySpider, ProductHtml, settings) assert not items diff --git a/tests/test_utils.py b/tests/test_utils.py index 3083579b..9bc905a6 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,5 +1,5 @@ -from unittest import mock from pathlib import PosixPath +from unittest import mock import pytest from scrapy.http import Request, Response, TextResponse diff --git a/tests/utils.py b/tests/utils.py index 05186b5b..0863a1b5 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -2,22 +2,23 @@ from unittest import mock from pytest_twisted import inlineCallbacks +from scrapy.crawler import Crawler from scrapy.exceptions import CloseSpider +from scrapy.utils.python import to_bytes from twisted.web.resource import Resource -from scrapy.crawler import Crawler + from tests.mockserver import MockServer -from scrapy.utils.python import to_bytes class HtmlResource(Resource): isLeaf = True - content_type = 'text/html' - html = '' + content_type = "text/html" + html = "" extra_headers: Dict[str, str] = {} status_code = 200 def render_GET(self, request): - request.setHeader(b'content-type', to_bytes(self.content_type)) + request.setHeader(b"content-type", to_bytes(self.content_type)) for name, value in self.extra_headers.items(): request.setHeader(to_bytes(name), to_bytes(value)) request.setResponseCode(self.status_code) @@ -43,19 +44,20 @@ def crawl_single_item(spider_cls, resource_cls, settings, spider_kwargs=None, po """Run a spider where a single item is expected. Use in combination with ``capture_capture_exceptions`` and ``CollectorPipeline`` """ - items, url, crawler = yield crawl_items(spider_cls, resource_cls, settings, - spider_kwargs=spider_kwargs, port=port) + items, url, crawler = yield crawl_items(spider_cls, resource_cls, settings, spider_kwargs=spider_kwargs, port=port) assert len(items) == 1 resp = items[0] - if 'exception' in resp: - raise resp['exception'] + if "exception" in resp: + raise resp["exception"] return resp, url, crawler def make_crawler(spider_cls, settings): - if not getattr(spider_cls, 'name', None): + if not getattr(spider_cls, "name", None): + class Spider(spider_cls): - name = 'test_spider' + name = "test_spider" + Spider.__name__ = spider_cls.__name__ Spider.__module__ = spider_cls.__module__ spider_cls = Spider @@ -63,7 +65,6 @@ class Spider(spider_cls): class CollectorPipeline: - def open_spider(self, spider): spider.collected_items = [] @@ -73,15 +74,17 @@ def process_item(self, item, spider): def capture_exceptions(callback): - """ Wrapper for Scrapy callbacks that captures exceptions within + """Wrapper for Scrapy callbacks that captures exceptions within the provided callback and yields it under `exception` property. Also - spider is closed on the first exception. 
""" + spider is closed on the first exception.""" + def parse(*args, **kwargs): try: yield from callback(*args, **kwargs) except Exception as e: - yield {'exception': e} + yield {"exception": e} raise CloseSpider("Exception in callback detected") + # Mimic type annotations parse.__annotations__ = callback.__annotations__ return parse From e4a1cfb7d1e7e55de39039ee3925be073ae6d412 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 20 Jun 2022 13:15:53 +0800 Subject: [PATCH 04/10] fix flake8 errors --- .flake8 | 5 +++-- docs/conf.py | 2 +- example/example/spiders/books_04_overrides_03.py | 2 -- scrapy_poet/injection.py | 10 +++++----- scrapy_poet/overrides.py | 1 - scrapy_poet/page_input_providers.py | 1 - scrapy_poet/utils.py | 1 - tests/mockserver.py | 4 ++-- tests/po_lib/__init__.py | 2 -- tests/test_downloader.py | 8 +------- tests/test_injection.py | 2 +- tests/test_middleware.py | 4 ++-- tests/test_providers.py | 3 +-- 13 files changed, 16 insertions(+), 29 deletions(-) diff --git a/.flake8 b/.flake8 index 664832fa..4caf755f 100644 --- a/.flake8 +++ b/.flake8 @@ -14,6 +14,7 @@ ignore = # To be addressed: D100, # Missing docstring in public module D101, # Missing docstring in public class + D102, # Missing docstring in public method D103, # Missing docstring in public function D104, # Missing docstring in public package D105, # Missing docstring in magic method @@ -31,5 +32,5 @@ per-file-ignores = # imports are there to expose submodule functions so they can be imported # directly from that module # F403: Ignore * imports in these files - web_poet/__init__.py:F401,F403 - web_poet/page_inputs/__init__.py:F401,F403 + scrapy_poet/__init__.py:F401,F403 + scrapy_poet/page_inputs/__init__.py:F401,F403 diff --git a/docs/conf.py b/docs/conf.py index 97c01d71..2db3894d 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -83,7 +83,7 @@ # Add any paths that contain custom themes here, relative to this directory. # Add path to the RTD explicitly to robustify builds (otherwise might # fail in a clean Debian build env) -import sphinx_rtd_theme +import sphinx_rtd_theme # noqa: E402 html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] diff --git a/example/example/spiders/books_04_overrides_03.py b/example/example/spiders/books_04_overrides_03.py index e3bf936d..525c75e6 100644 --- a/example/example/spiders/books_04_overrides_03.py +++ b/example/example/spiders/books_04_overrides_03.py @@ -11,9 +11,7 @@ store the rules in web-poet's registry. 
""" import scrapy -from url_matcher import Patterns from web_poet import ItemWebPage, WebPage, default_registry, handle_urls -from web_poet.overrides import OverrideRule from scrapy_poet import callback_for diff --git a/scrapy_poet/injection.py b/scrapy_poet/injection.py index 7e52d29e..71d125c6 100644 --- a/scrapy_poet/injection.py +++ b/scrapy_poet/injection.py @@ -50,7 +50,7 @@ def __init__( self.load_providers(default_providers) self.init_cache() - def load_providers(self, default_providers: Optional[Mapping] = None): + def load_providers(self, default_providers: Optional[Mapping] = None): # noqa: D102 providers_dict = {**(default_providers or {}), **self.spider.settings.getdict("SCRAPY_POET_PROVIDERS")} provider_classes = build_component_list(providers_dict) logger.info(f"Loading providers:\n {pprint.pformat(provider_classes)}") @@ -63,11 +63,11 @@ def load_providers(self, default_providers: Optional[Mapping] = None): # Caching the function for faster execution self.is_class_provided_by_any_provider = is_class_provided_by_any_provider_fn(self.providers) - def close(self) -> None: + def close(self) -> None: # noqa: D102 if self.cache: self.cache.close() - def init_cache(self): + def init_cache(self): # noqa: D102 self.cache = None cache_filename = self.spider.settings.get("SCRAPY_POET_CACHE") if cache_filename and isinstance(cache_filename, bool): @@ -80,7 +80,7 @@ def init_cache(self): f"Cache enabled. File: '{cache_filename}'. Compressed: {compressed}. Caching errors: {self.caching_errors}" ) - def available_dependencies_for_providers(self, request: Request, response: Response): + def available_dependencies_for_providers(self, request: Request, response: Response): # noqa: D102 deps = { Crawler: self.crawler, Spider: self.spider, @@ -266,7 +266,7 @@ def is_provided_fn(type: Callable) -> bool: def get_callback(request, spider): """Get ``request.callback`` of a :class:`scrapy.Request`""" if request.callback is None: - return getattr(spider, "parse") + return getattr(spider, "parse") # noqa: B009 return request.callback diff --git a/scrapy_poet/overrides.py b/scrapy_poet/overrides.py index 8b9c18fc..05caef65 100644 --- a/scrapy_poet/overrides.py +++ b/scrapy_poet/overrides.py @@ -6,7 +6,6 @@ from scrapy import Request from scrapy.crawler import Crawler from url_matcher import Patterns, URLMatcher -from url_matcher.util import get_domain from web_poet.overrides import OverrideRule logger = logging.getLogger(__name__) diff --git a/scrapy_poet/page_input_providers.py b/scrapy_poet/page_input_providers.py index 3516b98c..340e6235 100644 --- a/scrapy_poet/page_input_providers.py +++ b/scrapy_poet/page_input_providers.py @@ -27,7 +27,6 @@ from scrapy_poet.downloader import create_scrapy_downloader from scrapy_poet.injection_errors import MalformedProvidedClassesError -from scrapy_poet.utils import scrapy_response_to_http_response class PageObjectInputProvider: diff --git a/scrapy_poet/utils.py b/scrapy_poet/utils.py index 027df39f..5b38c534 100644 --- a/scrapy_poet/utils.py +++ b/scrapy_poet/utils.py @@ -1,6 +1,5 @@ import os -import attr from scrapy.http import Request, Response from scrapy.utils.project import inside_project, project_data_dir from web_poet import HttpRequest, HttpResponse, HttpResponseHeaders diff --git a/tests/mockserver.py b/tests/mockserver.py index cce02ec5..13ac1e3e 100644 --- a/tests/mockserver.py +++ b/tests/mockserver.py @@ -17,7 +17,7 @@ def get_ephemeral_port(): class MockServer: def __init__(self, resource, port=None): - self.resource = 
"{}.{}".format(resource.__module__, resource.__name__) + self.resource = "{0}.{1}".format(resource.__module__, resource.__name__) self.proc = None host = socket.gethostbyname(socket.gethostname()) self.port = port or get_ephemeral_port() @@ -48,7 +48,7 @@ def main(): def print_listening(): host = http_port.getHost() - print("Mock server {} running at http://{}:{}".format(resource, host.host, host.port)) + print("Mock server {0} running at http://{1}:{2}".format(resource, host.host, host.port)) reactor.callWhenRunning(print_listening) reactor.run() diff --git a/tests/po_lib/__init__.py b/tests/po_lib/__init__.py index 1b25b7d8..e3db57e3 100644 --- a/tests/po_lib/__init__.py +++ b/tests/po_lib/__init__.py @@ -2,9 +2,7 @@ This package is just for overrides testing purposes. """ import socket -from typing import Any, Callable, Dict -from url_matcher import Patterns from url_matcher.util import get_domain from web_poet import ItemWebPage, handle_urls diff --git a/tests/test_downloader.py b/tests/test_downloader.py index 5632d48a..ba39bb3f 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -19,13 +19,7 @@ from scrapy_poet.downloader import create_scrapy_downloader from scrapy_poet.utils import http_request_to_scrapy_request -from tests.utils import ( - AsyncMock, - HtmlResource, - MockServer, - crawl_single_item, - make_crawler, -) +from tests.utils import AsyncMock, MockServer, make_crawler @pytest.fixture diff --git a/tests/test_injection.py b/tests/test_injection.py index 47c89999..8e0f248b 100644 --- a/tests/test_injection.py +++ b/tests/test_injection.py @@ -85,7 +85,7 @@ def get_providers_for_testing(): prov1 = get_provider_requiring_response({ClsReqResponse}) prov2 = get_provider({Cls1, Cls2}) # Duplicating them because they should work even in this situation - return {prov1: 1, prov2: 2, prov1: 3, prov2: 4} + return {prov1: 1, prov2: 2, prov1: 3, prov2: 4} # noqa: F602 @pytest.fixture diff --git a/tests/test_middleware.py b/tests/test_middleware.py index c7d16452..cbaf4311 100644 --- a/tests/test_middleware.py +++ b/tests/test_middleware.py @@ -368,7 +368,7 @@ def get_middleware(settings): crawler.spider = crawler._create_spider("example.com") return InjectionMiddleware(crawler) - mock_sqlitedictcache_instance = mock_sqlitedictcache.return_value = mock.Mock() + mock_sqlitedictcache.return_value = mock.Mock() # no cache no_cache_middleware = get_middleware(settings) @@ -396,7 +396,7 @@ def test_web_poet_integration(settings): # Only import them in this test scope since they need to be synced with # the URL of the Page Object annotated with @handle_urls. - from tests.po_lib import DOMAIN, PORT, POOverriden + from tests.po_lib import PORT, POOverriden # Override rules are defined in `tests/po_lib/__init__.py`. 
rules = default_registry.get_overrides() diff --git a/tests/test_providers.py b/tests/test_providers.py index 5e992b67..3c412776 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -3,7 +3,6 @@ from unittest import mock import attr -import pytest import scrapy from pytest_twisted import ensureDeferred, inlineCallbacks from scrapy import Request, Spider @@ -211,7 +210,7 @@ async def test_http_client_provider(settings): results = provider(set(), crawler) assert isinstance(results[0], HttpClient) - results[0]._request_downloader == mock_factory.return_value + assert results[0]._request_downloader == mock_factory.return_value def test_page_params_provider(settings): From 1939a00e58b63be612677ac0b97e9afda037fc74 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 20 Jun 2022 13:17:29 +0800 Subject: [PATCH 05/10] add .git-blame-ignore-revs --- .git-blame-ignore-revs | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .git-blame-ignore-revs diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 00000000..e45154ab --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,5 @@ +# Contains commits to be ignored due to linting + +# https://github.com/scrapinghub/scrapy-poet/pull/68 +58c903617911b3209ad68bfefe3fa1a86be629f4 +e4a1cfb7d1e7e55de39039ee3925be073ae6d412 From 705916b17cceacdb4877301e211fbb82e20b9007 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 20 Jun 2022 13:22:18 +0800 Subject: [PATCH 06/10] remove '' prefix in the docs' anchor tags --- docs/index.rst | 6 +++--- docs/intro/advanced-tutorial.rst | 4 ++-- docs/intro/basic-tutorial.rst | 2 +- docs/intro/install.rst | 2 +- docs/license.rst | 2 +- docs/overrides.rst | 2 +- docs/providers.rst | 2 +- docs/settings.rst | 2 +- 8 files changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index 1271bbe5..59450a82 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -25,9 +25,9 @@ To get started, see :ref:`intro-install` and :ref:`intro-tutorial`. :ref:`license` is BSD 3-clause. -.. _`AutoExtract`: https://scrapinghub.com/autoextract -.. _`Splash`: https://scrapinghub.com/splash -.. _`web-poet`: https://github.com/scrapinghub/web-poet +.. _AutoExtract: https://scrapinghub.com/autoextract +.. _Splash: https://scrapinghub.com/splash +.. _web-poet: https://github.com/scrapinghub/web-poet .. _docs: https://web-poet.readthedocs.io/en/stable/ .. toctree:: diff --git a/docs/intro/advanced-tutorial.rst b/docs/intro/advanced-tutorial.rst index a1fd2fa5..a2cffbf5 100644 --- a/docs/intro/advanced-tutorial.rst +++ b/docs/intro/advanced-tutorial.rst @@ -1,4 +1,4 @@ -.. _`intro-advanced-tutorial`: +.. _intro-advanced-tutorial: ================= Advanced Tutorial @@ -15,7 +15,7 @@ These are mainly achieved by **scrapy-poet** implementing **providers** for them * :class:`scrapy_poet.page_input_providers.HttpClientProvider` * :class:`scrapy_poet.page_input_providers.PageParamsProvider` -.. _`intro-additional-requests`: +.. _intro-additional-requests: Additional Requests =================== diff --git a/docs/intro/basic-tutorial.rst b/docs/intro/basic-tutorial.rst index 6a37c548..342cf385 100644 --- a/docs/intro/basic-tutorial.rst +++ b/docs/intro/basic-tutorial.rst @@ -1,4 +1,4 @@ -.. _`intro-basic-tutorial`: +.. _intro-basic-tutorial: ============== Basic Tutorial diff --git a/docs/intro/install.rst b/docs/intro/install.rst index f3d6187e..9c6f5e7e 100644 --- a/docs/intro/install.rst +++ b/docs/intro/install.rst @@ -1,4 +1,4 @@ -.. _`intro-install`: +.. 
_intro-install: ============ Installation diff --git a/docs/license.rst b/docs/license.rst index e6a41ca8..e647e180 100644 --- a/docs/license.rst +++ b/docs/license.rst @@ -1,4 +1,4 @@ -.. _`license`: +.. _license: ======= License diff --git a/docs/overrides.rst b/docs/overrides.rst index 3ceb3d39..e278693d 100644 --- a/docs/overrides.rst +++ b/docs/overrides.rst @@ -1,4 +1,4 @@ -.. _`overrides`: +.. _overrides: ========= Overrides diff --git a/docs/providers.rst b/docs/providers.rst index 939807f6..4b5918e9 100644 --- a/docs/providers.rst +++ b/docs/providers.rst @@ -1,4 +1,4 @@ -.. _`providers`: +.. _providers: ========= Providers diff --git a/docs/settings.rst b/docs/settings.rst index 2dbdec30..3b9c7ddd 100644 --- a/docs/settings.rst +++ b/docs/settings.rst @@ -1,4 +1,4 @@ -.. _`settings`: +.. _settings: Settings ======== From 877307e887c241558814fba1494b4c1ac87fc05b Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 20 Jun 2022 13:40:03 +0800 Subject: [PATCH 07/10] remove flake8 commit hash in .git-blame-ignore-revs --- .git-blame-ignore-revs | 1 - 1 file changed, 1 deletion(-) diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index e45154ab..046b3bfb 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -2,4 +2,3 @@ # https://github.com/scrapinghub/scrapy-poet/pull/68 58c903617911b3209ad68bfefe3fa1a86be629f4 -e4a1cfb7d1e7e55de39039ee3925be073ae6d412 From 1548c1d1262e54035ca83949f80208e11e4c045a Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 20 Jun 2022 19:54:35 +0800 Subject: [PATCH 08/10] update black config to use line length of 88 instead of 120 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c7d708d9..cec60096 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [tool.black] -line-length = 120 +line-length = 88 [tool.isort] profile = "black" From 7249a133722d1115111a8bbb3b02080a892483f2 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 20 Jun 2022 19:54:46 +0800 Subject: [PATCH 09/10] run black with line length of 88 instead of 120 --- docs/conf.py | 8 +- .../example/spiders/books_04_overrides_02.py | 12 ++- scrapy_poet/api.py | 4 +- scrapy_poet/cache.py | 4 +- scrapy_poet/downloader.py | 3 +- scrapy_poet/injection.py | 73 +++++++++++---- scrapy_poet/middleware.py | 16 +++- scrapy_poet/overrides.py | 8 +- scrapy_poet/page_input_providers.py | 4 +- tests/mockserver.py | 17 +++- tests/test_downloader.py | 20 +++-- tests/test_injection.py | 89 +++++++++++++++---- tests/test_middleware.py | 45 +++++++--- tests/test_providers.py | 29 ++++-- tests/test_utils.py | 8 +- tests/utils.py | 8 +- 16 files changed, 270 insertions(+), 78 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 2db3894d..b21b3c19 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -136,7 +136,13 @@ # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). 
latex_documents = [ - (master_doc, "scrapy-poet.tex", "scrapy-poet Documentation", "Scrapinghub", "manual"), + ( + master_doc, + "scrapy-poet.tex", + "scrapy-poet Documentation", + "Scrapinghub", + "manual", + ), ] diff --git a/example/example/spiders/books_04_overrides_02.py b/example/example/spiders/books_04_overrides_02.py index dd576270..f707c2b2 100644 --- a/example/example/spiders/books_04_overrides_02.py +++ b/example/example/spiders/books_04_overrides_02.py @@ -67,8 +67,16 @@ class BooksSpider(scrapy.Spider): ("toscrape.com", BTSBookListPage, BookListPage), ("toscrape.com", BTSBookPage, BookPage), # We could also use the long-form version if we want to. - OverrideRule(for_patterns=Patterns(["bookpage.com"]), use=BPBookListPage, instead_of=BookListPage), - OverrideRule(for_patterns=Patterns(["bookpage.com"]), use=BPBookPage, instead_of=BookPage), + OverrideRule( + for_patterns=Patterns(["bookpage.com"]), + use=BPBookListPage, + instead_of=BookListPage, + ), + OverrideRule( + for_patterns=Patterns(["bookpage.com"]), + use=BPBookPage, + instead_of=BookPage, + ), ] } diff --git a/scrapy_poet/api.py b/scrapy_poet/api.py index a19ab437..3b8d5000 100644 --- a/scrapy_poet/api.py +++ b/scrapy_poet/api.py @@ -108,7 +108,9 @@ def parse(self, response): raise TypeError(f"{page_cls.__name__} should be a subclass of ItemPage.") if getattr(page_cls.to_item, "__isabstractmethod__", False): - raise NotImplementedError(f"{page_cls.__name__} should implement to_item method.") + raise NotImplementedError( + f"{page_cls.__name__} should implement to_item method." + ) # When the callback is used as an instance method of the spider, it expects # to receive 'self' as its first argument. When used as a simple inline diff --git a/scrapy_poet/cache.py b/scrapy_poet/cache.py index 418443e6..dcae642b 100644 --- a/scrapy_poet/cache.py +++ b/scrapy_poet/cache.py @@ -55,7 +55,9 @@ def decode(self, obj: Any) -> Any: def __str__(self) -> str: return ( # pragma: no cover - f"SqlitedictCache <{self.db.filename} | " f"compressed: {self.compressed} | " f"{len(self.db)} records>" + f"SqlitedictCache <{self.db.filename} | " + f"compressed: {self.compressed} | " + f"{len(self.db)} records>" ) def __repr__(self) -> str: diff --git a/scrapy_poet/downloader.py b/scrapy_poet/downloader.py index 92e4f90b..a20c60f2 100644 --- a/scrapy_poet/downloader.py +++ b/scrapy_poet/downloader.py @@ -17,7 +17,8 @@ def create_scrapy_downloader(download_func): async def scrapy_downloader(request: HttpRequest): if not isinstance(request, HttpRequest): raise TypeError( - f"The request should be 'web_poet.HttpRequest' but received " f"one of type: '{type(request)}'." + f"The request should be 'web_poet.HttpRequest' but received " + f"one of type: '{type(request)}'." 
) scrapy_request = http_request_to_scrapy_request(request) diff --git a/scrapy_poet/injection.py b/scrapy_poet/injection.py index 71d125c6..d44d445e 100644 --- a/scrapy_poet/injection.py +++ b/scrapy_poet/injection.py @@ -51,17 +51,23 @@ def __init__( self.init_cache() def load_providers(self, default_providers: Optional[Mapping] = None): # noqa: D102 - providers_dict = {**(default_providers or {}), **self.spider.settings.getdict("SCRAPY_POET_PROVIDERS")} + providers_dict = { + **(default_providers or {}), + **self.spider.settings.getdict("SCRAPY_POET_PROVIDERS"), + } provider_classes = build_component_list(providers_dict) logger.info(f"Loading providers:\n {pprint.pformat(provider_classes)}") self.providers = [load_object(cls)(self.crawler) for cls in provider_classes] check_all_providers_are_callable(self.providers) # Caching whether each provider requires the scrapy response self.is_provider_requiring_scrapy_response = { - provider: is_provider_requiring_scrapy_response(provider) for provider in self.providers + provider: is_provider_requiring_scrapy_response(provider) + for provider in self.providers } # Caching the function for faster execution - self.is_class_provided_by_any_provider = is_class_provided_by_any_provider_fn(self.providers) + self.is_class_provided_by_any_provider = is_class_provided_by_any_provider_fn( + self.providers + ) def close(self) -> None: # noqa: D102 if self.cache: @@ -71,16 +77,22 @@ def init_cache(self): # noqa: D102 self.cache = None cache_filename = self.spider.settings.get("SCRAPY_POET_CACHE") if cache_filename and isinstance(cache_filename, bool): - cache_filename = os.path.join(get_scrapy_data_path(createdir=True), "scrapy-poet-cache.sqlite3") + cache_filename = os.path.join( + get_scrapy_data_path(createdir=True), "scrapy-poet-cache.sqlite3" + ) if cache_filename: compressed = self.spider.settings.getbool("SCRAPY_POET_CACHE_GZIP", True) - self.caching_errors = self.spider.settings.getbool("SCRAPY_POET_CACHE_ERRORS", False) + self.caching_errors = self.spider.settings.getbool( + "SCRAPY_POET_CACHE_ERRORS", False + ) self.cache = SqlitedictCache(cache_filename, compressed=compressed) logger.info( f"Cache enabled. File: '{cache_filename}'. Compressed: {compressed}. 
Caching errors: {self.caching_errors}" ) - def available_dependencies_for_providers(self, request: Request, response: Response): # noqa: D102 + def available_dependencies_for_providers( + self, request: Request, response: Response + ): # noqa: D102 deps = { Crawler: self.crawler, Spider: self.spider, @@ -92,7 +104,9 @@ def available_dependencies_for_providers(self, request: Request, response: Respo assert deps.keys() == SCRAPY_PROVIDED_CLASSES return deps - def discover_callback_providers(self, request: Request) -> Set[PageObjectInputProvider]: + def discover_callback_providers( + self, request: Request + ) -> Set[PageObjectInputProvider]: """Discover the providers that are required to fulfil the callback dependencies""" plan = self.build_plan(request) result = set() @@ -131,7 +145,9 @@ def build_plan(self, request: Request) -> andi.Plan: def build_instances(self, request: Request, response: Response, plan: andi.Plan): """Build the instances dict from a plan including external dependencies.""" # First we build the external dependencies using the providers - instances = yield from self.build_instances_from_providers(request, response, plan) + instances = yield from self.build_instances_from_providers( + request, response, plan + ) # All the remaining dependencies are internal so they can be built just # following the andi plan. for cls, kwargs_spec in plan.dependencies: @@ -141,13 +157,19 @@ def build_instances(self, request: Request, response: Response, plan: andi.Plan) return instances @inlineCallbacks - def build_instances_from_providers(self, request: Request, response: Response, plan: andi.Plan): + def build_instances_from_providers( + self, request: Request, response: Response, plan: andi.Plan + ): """Build dependencies handled by registered providers""" instances: Dict[Callable, Any] = {} - scrapy_provided_dependencies = self.available_dependencies_for_providers(request, response) + scrapy_provided_dependencies = self.available_dependencies_for_providers( + request, response + ) dependencies_set = {cls for cls, _ in plan.dependencies} for provider in self.providers: - provided_classes = {cls for cls in dependencies_set if provider.is_provided(cls)} + provided_classes = { + cls for cls in dependencies_set if provider.is_provided(cls) + } provided_classes -= instances.keys() # ignore already provided types if not provided_classes: continue @@ -183,10 +205,16 @@ def build_instances_from_providers(self, request: Request, response: Response, p try: # Invoke the provider to get the data - objs = yield maybeDeferred_coro(provider, set(provided_classes), **kwargs) + objs = yield maybeDeferred_coro( + provider, set(provided_classes), **kwargs + ) except Exception as e: - if self.cache and self.caching_errors and provider.has_cache_support: + if ( + self.cache + and self.caching_errors + and provider.has_cache_support + ): # Save errors in the cache self.cache[fingerprint] = e self.crawler.stats.inc_value("scrapy-poet/cache/firsthand") @@ -225,11 +253,14 @@ def check_all_providers_are_callable(providers): for provider in providers: if not callable(provider): raise NonCallableProviderError( - f"The provider {type(provider)} is not callable. " f"It must implement '__call__' method" + f"The provider {type(provider)} is not callable. 
" + f"It must implement '__call__' method" ) -def is_class_provided_by_any_provider_fn(providers: List[PageObjectInputProvider]) -> Callable[[Callable], bool]: +def is_class_provided_by_any_provider_fn( + providers: List[PageObjectInputProvider], +) -> Callable[[Callable], bool]: """ Return a function of type ``Callable[[Type], bool]`` that return True if the given type is provided by any of the registered providers. @@ -239,7 +270,9 @@ def is_class_provided_by_any_provider_fn(providers: List[PageObjectInputProvider joined together for efficiency. """ sets_of_types: Set[Callable] = set() # caching all sets found - individual_is_callable: List[Callable[[Callable], bool]] = [sets_of_types.__contains__] + individual_is_callable: List[Callable[[Callable], bool]] = [ + sets_of_types.__contains__ + ] for provider in providers: provided_classes = provider.provided_classes @@ -325,7 +358,9 @@ def is_provider_requiring_scrapy_response(provider): def get_injector_for_testing( - providers: Mapping, additional_settings: Dict = None, overrides_registry: Optional[OverridesRegistryBase] = None + providers: Mapping, + additional_settings: Dict = None, + overrides_registry: Optional[OverridesRegistryBase] = None, ) -> Injector: """ Return an :class:`Injector` using a fake crawler. @@ -335,7 +370,9 @@ def get_injector_for_testing( class MySpider(Spider): name = "my_spider" - settings = Settings({**(additional_settings or {}), "SCRAPY_POET_PROVIDERS": providers}) + settings = Settings( + {**(additional_settings or {}), "SCRAPY_POET_PROVIDERS": providers} + ) crawler = Crawler(MySpider) crawler.settings = settings spider = MySpider() diff --git a/scrapy_poet/middleware.py b/scrapy_poet/middleware.py index 8c2bc15a..a35cb3c4 100644 --- a/scrapy_poet/middleware.py +++ b/scrapy_poet/middleware.py @@ -45,14 +45,20 @@ def __init__(self, crawler: Crawler) -> None: """Initialize the middleware""" self.crawler = crawler settings = self.crawler.settings - registry_cls = load_object(settings.get("SCRAPY_POET_OVERRIDES_REGISTRY", OverridesRegistry)) + registry_cls = load_object( + settings.get("SCRAPY_POET_OVERRIDES_REGISTRY", OverridesRegistry) + ) self.overrides_registry = create_instance(registry_cls, settings, crawler) self.injector = Injector( - crawler, default_providers=DEFAULT_PROVIDERS, overrides_registry=self.overrides_registry + crawler, + default_providers=DEFAULT_PROVIDERS, + overrides_registry=self.overrides_registry, ) @classmethod - def from_crawler(cls: Type[InjectionMiddlewareTV], crawler: Crawler) -> InjectionMiddlewareTV: + def from_crawler( + cls: Type[InjectionMiddlewareTV], crawler: Crawler + ) -> InjectionMiddlewareTV: o = cls(crawler) crawler.signals.connect(o.spider_closed, signal=signals.spider_closed) return o @@ -60,7 +66,9 @@ def from_crawler(cls: Type[InjectionMiddlewareTV], crawler: Crawler) -> Injectio def spider_closed(self, spider: Spider) -> None: self.injector.close() - def process_request(self, request: Request, spider: Spider) -> Optional[DummyResponse]: + def process_request( + self, request: Request, spider: Spider + ) -> Optional[DummyResponse]: """This method checks if the request is really needed and if its download could be skipped by trying to infer if a ``Response`` is going to be used by the callback or a Page Input. 
diff --git a/scrapy_poet/overrides.py b/scrapy_poet/overrides.py index 05caef65..a404ca00 100644 --- a/scrapy_poet/overrides.py +++ b/scrapy_poet/overrides.py @@ -104,9 +104,13 @@ def add_rule(self, rule: RuleFromUser) -> None: f"replacement and (3) the PO class to be replaced." ) pattern, use, instead_of = rule - rule = OverrideRule(for_patterns=Patterns([pattern]), use=use, instead_of=instead_of) + rule = OverrideRule( + for_patterns=Patterns([pattern]), use=use, instead_of=instead_of + ) self.rules.append(rule) - self.matcher[rule.instead_of].add_or_update(len(self.rules) - 1, rule.for_patterns) + self.matcher[rule.instead_of].add_or_update( + len(self.rules) - 1, rule.for_patterns + ) def overrides_for(self, request: Request) -> Mapping[Callable, Callable]: overrides: Dict[Callable, Callable] = {} diff --git a/scrapy_poet/page_input_providers.py b/scrapy_poet/page_input_providers.py index 340e6235..145ec9c4 100644 --- a/scrapy_poet/page_input_providers.py +++ b/scrapy_poet/page_input_providers.py @@ -180,7 +180,9 @@ def __call__(self, to_provide: Set[Callable], response: Response): def fingerprint(self, to_provide: Set[Callable], request: Request) -> str: request_keys = {"url", "method", "body"} - request_data = {k: str(v) for k, v in request.to_dict().items() if k in request_keys} + request_data = { + k: str(v) for k, v in request.to_dict().items() if k in request_keys + } fp_data = { "SCRAPY_FINGERPRINT": request_fingerprint(request), **request_data, diff --git a/tests/mockserver.py b/tests/mockserver.py index 13ac1e3e..ec037bd0 100644 --- a/tests/mockserver.py +++ b/tests/mockserver.py @@ -25,7 +25,16 @@ def __init__(self, resource, port=None): def __enter__(self): self.proc = Popen( - [sys.executable, "-u", "-m", "tests.mockserver", self.resource, "--port", str(self.port)], stdout=PIPE + [ + sys.executable, + "-u", + "-m", + "tests.mockserver", + self.resource, + "--port", + str(self.port), + ], + stdout=PIPE, ) self.proc.stdout.readline() return self @@ -48,7 +57,11 @@ def main(): def print_listening(): host = http_port.getHost() - print("Mock server {0} running at http://{1}:{2}".format(resource, host.host, host.port)) + print( + "Mock server {0} running at http://{1}:{2}".format( + resource, host.host, host.port + ) + ) reactor.callWhenRunning(print_listening) reactor.run() diff --git a/tests/test_downloader.py b/tests/test_downloader.py index ba39bb3f..9471bf5a 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -52,7 +52,9 @@ def fake_http_response(): async def test_scrapy_poet_downloader(fake_http_response): req = web_poet.HttpRequest("https://example.com") - with mock.patch("scrapy_poet.downloader.maybe_deferred_to_future", new_callable=AsyncMock) as mock_dtf: + with mock.patch( + "scrapy_poet.downloader.maybe_deferred_to_future", new_callable=AsyncMock + ) as mock_dtf: mock_dtf.return_value = fake_http_response @@ -77,7 +79,9 @@ async def test_scrapy_poet_downloader_ignored_request(): standard on additional request error handling.""" req = web_poet.HttpRequest("https://example.com") - with mock.patch("scrapy_poet.downloader.maybe_deferred_to_future", new_callable=AsyncMock) as mock_dtf: + with mock.patch( + "scrapy_poet.downloader.maybe_deferred_to_future", new_callable=AsyncMock + ) as mock_dtf: mock_dtf.side_effect = scrapy.exceptions.IgnoreRequest mock_downloader = mock.MagicMock(return_value=AsyncMock) scrapy_downloader = create_scrapy_downloader(mock_downloader) @@ -90,7 +94,9 @@ async def test_scrapy_poet_downloader_ignored_request(): async def 
test_scrapy_poet_downloader_twisted_error(): req = web_poet.HttpRequest("https://example.com") - with mock.patch("scrapy_poet.downloader.maybe_deferred_to_future", new_callable=AsyncMock) as mock_dtf: + with mock.patch( + "scrapy_poet.downloader.maybe_deferred_to_future", new_callable=AsyncMock + ) as mock_dtf: mock_dtf.side_effect = twisted.internet.error.TimeoutError mock_downloader = mock.MagicMock(return_value=AsyncMock) scrapy_downloader = create_scrapy_downloader(mock_downloader) @@ -103,7 +109,9 @@ async def test_scrapy_poet_downloader_twisted_error(): async def test_scrapy_poet_downloader_head_redirect(fake_http_response): req = web_poet.HttpRequest("https://example.com", method="HEAD") - with mock.patch("scrapy_poet.downloader.maybe_deferred_to_future", new_callable=AsyncMock) as mock_dtf: + with mock.patch( + "scrapy_poet.downloader.maybe_deferred_to_future", new_callable=AsyncMock + ) as mock_dtf: mock_dtf.return_value = fake_http_response mock_downloader = mock.MagicMock(return_value=AsyncMock) scrapy_downloader = create_scrapy_downloader(mock_downloader) @@ -239,7 +247,9 @@ def _delayedRender(self, request, seconds): def test_additional_requests_connection_issue(): items = [] - with mock.patch("scrapy_poet.downloader.http_request_to_scrapy_request") as mock_http_request_to_scrapy_request: + with mock.patch( + "scrapy_poet.downloader.http_request_to_scrapy_request" + ) as mock_http_request_to_scrapy_request: mock_http_request_to_scrapy_request.side_effect = partial( http_request_to_scrapy_request, meta={"download_timeout": 0.001}, diff --git a/tests/test_injection.py b/tests/test_injection.py index 8e0f248b..e2925de2 100644 --- a/tests/test_injection.py +++ b/tests/test_injection.py @@ -111,7 +111,10 @@ def test_constructor(self): assert not injector.is_class_provided_by_any_provider(ClsNoProvided) for provider in injector.providers: - assert injector.is_provider_requiring_scrapy_response[provider] == provider.require_response + assert ( + injector.is_provider_requiring_scrapy_response[provider] + == provider.require_response + ) # Asserting that we are not leaking providers references weak_ref = weakref.ref(injector.providers[0]) @@ -176,7 +179,13 @@ def callback_yes_2(response: DummyResponse, a: ClsReqResponse): @inlineCallbacks def test_build_instances_methods(self, injector): - def callback(response: DummyResponse, a: Cls1, b: Cls2, c: WrapCls, d: ClsNoProviderRequired): + def callback( + response: DummyResponse, + a: Cls1, + b: Cls2, + c: WrapCls, + d: ClsNoProviderRequired, + ): pass response = get_response_for_testing(callback) @@ -191,7 +200,9 @@ def callback(response: DummyResponse, a: Cls1, b: Cls2, c: WrapCls, d: ClsNoProv ClsNoProviderRequired: ClsNoProviderRequired(), } - instances = yield from injector.build_instances_from_providers(request, response, plan) + instances = yield from injector.build_instances_from_providers( + request, response, plan + ) assert instances == { Cls1: Cls1(), Cls2: Cls2(), @@ -212,7 +223,9 @@ def callback(response: DummyResponse, a: Cls1): response = get_response_for_testing(callback) plan = injector.build_plan(response.request) with pytest.raises(UndeclaredProvidedTypeError) as exinf: - yield from injector.build_instances_from_providers(response.request, response, plan) + yield from injector.build_instances_from_providers( + response.request, response, plan + ) assert "Provider" in str(exinf.value) assert "Cls2" in str(exinf.value) @@ -236,19 +249,34 @@ def callback(response: DummyResponse, arg: str): response = 
get_response_for_testing(callback) plan = injector.build_plan(response.request) - instances = yield from injector.build_instances_from_providers(response.request, response, plan) + instances = yield from injector.build_instances_from_providers( + response.request, response, plan + ) assert instances[str] == min(str_list) @inlineCallbacks def test_build_callback_dependencies(self, injector): - def callback(response: DummyResponse, a: Cls1, b: Cls2, c: WrapCls, d: ClsNoProviderRequired): + def callback( + response: DummyResponse, + a: Cls1, + b: Cls2, + c: WrapCls, + d: ClsNoProviderRequired, + ): pass response = get_response_for_testing(callback) - kwargs = yield from injector.build_callback_dependencies(response.request, response) + kwargs = yield from injector.build_callback_dependencies( + response.request, response + ) kwargs_types = {key: type(value) for key, value in kwargs.items()} - assert kwargs_types == {"a": Cls1, "b": Cls2, "c": WrapCls, "d": ClsNoProviderRequired} + assert kwargs_types == { + "a": Cls1, + "b": Cls2, + "c": WrapCls, + "d": ClsNoProviderRequired, + } class Html(Injectable): @@ -297,22 +325,31 @@ def test_overrides(self, providers, override_should_happen): # when we configure them for domain other-example.com overrides = [ (domain, PriceInDollarsPO, PricePO), - OverrideRule(Patterns([domain]), use=OtherEurDollarRate, instead_of=EurDollarRate), + OverrideRule( + Patterns([domain]), use=OtherEurDollarRate, instead_of=EurDollarRate + ), ] registry = OverridesRegistry(overrides) injector = get_injector_for_testing(providers, overrides_registry=registry) - def callback(response: DummyResponse, price_po: PricePO, rate_po: EurDollarRate): + def callback( + response: DummyResponse, price_po: PricePO, rate_po: EurDollarRate + ): pass response = get_response_for_testing(callback) - kwargs = yield from injector.build_callback_dependencies(response.request, response) + kwargs = yield from injector.build_callback_dependencies( + response.request, response + ) kwargs_types = {key: type(value) for key, value in kwargs.items()} price_po = kwargs["price_po"] item = price_po.to_item() if override_should_happen: - assert kwargs_types == {"price_po": PriceInDollarsPO, "rate_po": OtherEurDollarRate} + assert kwargs_types == { + "price_po": PriceInDollarsPO, + "rate_po": OtherEurDollarRate, + } # Note that OtherEurDollarRate don't have effect inside PriceInDollarsPO # because composability of overrides is forbidden assert item == {"price": 22 * 1.1, "currency": "$"} @@ -322,8 +359,12 @@ def callback(response: DummyResponse, price_po: PricePO, rate_po: EurDollarRate) def test_load_provider_classes(): - provider_as_string = f"{HttpResponseProvider.__module__}.{HttpResponseProvider.__name__}" - injector = get_injector_for_testing({provider_as_string: 2, HttpResponseProvider: 1}) + provider_as_string = ( + f"{HttpResponseProvider.__module__}.{HttpResponseProvider.__name__}" + ) + injector = get_injector_for_testing( + {provider_as_string: 2, HttpResponseProvider: 1} + ) assert all(type(prov) == HttpResponseProvider for prov in injector.providers) assert len(injector.providers) == 2 @@ -331,7 +372,9 @@ def test_load_provider_classes(): def test_check_all_providers_are_callable(): check_all_providers_are_callable([HttpResponseProvider(None)]) with pytest.raises(NonCallableProviderError) as exinf: - check_all_providers_are_callable([PageObjectInputProvider(None), HttpResponseProvider(None)]) + check_all_providers_are_callable( + [PageObjectInputProvider(None), HttpResponseProvider(None)] + ) 
assert "PageObjectInputProvider" in str(exinf.value) assert "not callable" in str(exinf.value) @@ -419,7 +462,9 @@ def callback(response: DummyResponse, arg_str: str, arg_int: int, arg_float: flo response = get_response_for_testing(callback) plan = injector.build_plan(response.request) - instances = yield from injector.build_instances_from_providers(response.request, response, plan) + instances = yield from injector.build_instances_from_providers( + response.request, response, plan + ) validate_instances(instances) @@ -429,7 +474,9 @@ def callback(response: DummyResponse, arg_str: str, arg_int: int, arg_float: flo response.request = Request.replace(response.request, url="http://willfail.page") with pytest.raises(ValueError): plan = injector.build_plan(response.request) - instances = yield from injector.build_instances_from_providers(response.request, response, plan) + instances = yield from injector.build_instances_from_providers( + response.request, response, plan + ) # Different providers. They return a different result, but the cache data should prevail. providers = { @@ -440,7 +487,9 @@ def callback(response: DummyResponse, arg_str: str, arg_int: int, arg_float: flo response = get_response_for_testing(callback) plan = injector.build_plan(response.request) - instances = yield from injector.build_instances_from_providers(response.request, response, plan) + instances = yield from injector.build_instances_from_providers( + response.request, response, plan + ) validate_instances(instances) @@ -449,4 +498,6 @@ def callback(response: DummyResponse, arg_str: str, arg_int: int, arg_float: flo response.request = Request.replace(response.request, url="http://willfail.page") with pytest.raises(Error): plan = injector.build_plan(response.request) - instances = yield from injector.build_instances_from_providers(response.request, response, plan) + instances = yield from injector.build_instances_from_providers( + response.request, response, plan + ) diff --git a/tests/test_middleware.py b/tests/test_middleware.py index cbaf4311..b8fe03ae 100644 --- a/tests/test_middleware.py +++ b/tests/test_middleware.py @@ -58,7 +58,9 @@ def start_requests(self): @attr.s(auto_attribs=True) class BreadcrumbsExtraction(WebPage): def get(self): - return {a.css("::text").get(): a.attrib["href"] for a in self.css(".breadcrumbs a")} + return { + a.css("::text").get(): a.attrib["href"] for a in self.css(".breadcrumbs a") + } @attr.s(auto_attribs=True) @@ -83,7 +85,9 @@ def get(self): @inlineCallbacks def test_basic_case(settings): - item, url, _ = yield crawl_single_item(spider_for(ProductPage), ProductHtml, settings) + item, url, _ = yield crawl_single_item( + spider_for(ProductPage), ProductHtml, settings + ) assert item == { "url": url, "name": "Chocolate", @@ -98,8 +102,12 @@ def test_overrides(settings): host = socket.gethostbyname(socket.gethostname()) domain = get_domain(host) port = get_ephemeral_port() - settings["SCRAPY_POET_OVERRIDES"] = [(f"{domain}:{port}", OverridenBreadcrumbsExtraction, BreadcrumbsExtraction)] - item, url, _ = yield crawl_single_item(spider_for(ProductPage), ProductHtml, settings, port=port) + settings["SCRAPY_POET_OVERRIDES"] = [ + (f"{domain}:{port}", OverridenBreadcrumbsExtraction, BreadcrumbsExtraction) + ] + item, url, _ = yield crawl_single_item( + spider_for(ProductPage), ProductHtml, settings, port=port + ) assert item == { "url": url, "name": "Chocolate", @@ -126,7 +134,9 @@ def to_item(self): @inlineCallbacks def test_optional_and_unions(settings): - item, _, _ = yield 
crawl_single_item(spider_for(OptionalAndUnionPage), ProductHtml, settings) + item, _, _ = yield crawl_single_item( + spider_for(OptionalAndUnionPage), ProductHtml, settings + ) assert item["breadcrumbs"].response is item["response"] assert item["opt_check_1"] is item["breadcrumbs"] assert item["opt_check_2"] is None @@ -261,7 +271,9 @@ class UnressolvableProductPage(ProductPage): @inlineCallbacks def test_injection_failure(settings): configure_logging(settings) - items, url, crawler = yield crawl_items(spider_for(UnressolvableProductPage), ProductHtml, settings) + items, url, crawler = yield crawl_items( + spider_for(UnressolvableProductPage), ProductHtml, settings + ) assert items == [] @@ -300,7 +312,9 @@ def test_skip_downloads(settings): assert crawler.stats.get_stats().get("scrapy_poet/dummy_response_count", 0) == 0 assert crawler.stats.get_stats().get("downloader/response_count", 0) == 1 - item, url, crawler = yield crawl_single_item(SkipDownloadSpider, ProductHtml, settings) + item, url, crawler = yield crawl_single_item( + SkipDownloadSpider, ProductHtml, settings + ) assert isinstance(item["response"], Response) is True assert isinstance(item["response"], DummyResponse) is True assert crawler.stats.get_stats().get("downloader/request_count", 0) == 0 @@ -323,7 +337,9 @@ def parse(self, response: DummyResponse, url: RequestUrl): @inlineCallbacks def test_skip_download_request_url(settings): - item, url, crawler = yield crawl_single_item(RequestUrlSpider, ProductHtml, settings) + item, url, crawler = yield crawl_single_item( + RequestUrlSpider, ProductHtml, settings + ) assert isinstance(item["response"], Response) is True assert isinstance(item["response"], DummyResponse) is True assert isinstance(item["url"], RequestUrl) @@ -353,7 +369,9 @@ def parse(self, response: DummyResponse, page: RequestUrlPage): @inlineCallbacks def test_skip_download_request_url_page(settings): - item, url, crawler = yield crawl_single_item(RequestUrlPageSpider, ProductHtml, settings) + item, url, crawler = yield crawl_single_item( + RequestUrlPageSpider, ProductHtml, settings + ) assert tuple(item.keys()) == ("url",) assert str(item["url"]) == url assert crawler.stats.get_stats().get("downloader/request_count", 0) == 0 @@ -381,7 +399,10 @@ def get_middleware(settings): spider = has_cache_middleware.crawler.spider has_cache_middleware.spider_closed(spider) - assert mock_sqlitedictcache.mock_calls == [mock.call("/tmp/cache", compressed=True), mock.call().close()] + assert mock_sqlitedictcache.mock_calls == [ + mock.call("/tmp/cache", compressed=True), + mock.call().close(), + ] @inlineCallbacks @@ -404,5 +425,7 @@ def test_web_poet_integration(settings): # Converting it to a set removes potential duplicate OverrideRules settings["SCRAPY_POET_OVERRIDES"] = set(rules) - item, url, _ = yield crawl_single_item(spider_for(POOverriden), ProductHtml, settings, port=PORT) + item, url, _ = yield crawl_single_item( + spider_for(POOverriden), ProductHtml, settings, port=PORT + ) assert item == {"msg": "PO replacement"} diff --git a/tests/test_providers.py b/tests/test_providers.py index 3c412776..87db3fde 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -68,7 +68,9 @@ def __init__(self, crawler: Crawler): assert isinstance(crawler, Crawler) super().__init__(crawler) - def __call__(self, to_provide, response: scrapy.http.Response, spider: scrapy.Spider): + def __call__( + self, to_provide, response: scrapy.http.Response, spider: scrapy.Spider + ): assert isinstance(spider, scrapy.Spider) ret: 
List[Any] = [] if Price in to_provide: @@ -135,7 +137,14 @@ def start_requests(self): def errback(self, failure: Failure): yield {"exception": failure.value} - def parse(self, response, price: Price, name: Name, html: Html, response_data: HttpResponse): + def parse( + self, + response, + price: Price, + name: Name, + html: Html, + response_data: HttpResponse, + ): yield { Price: price, Name: name, @@ -159,7 +168,9 @@ class NameFirstMultiProviderSpider(PriceFirstMultiProviderSpider): def test_name_first_spider(settings, tmp_path): cache = tmp_path / "cache.sqlite3" settings.set("SCRAPY_POET_CACHE", str(cache)) - item, _, _ = yield crawl_single_item(NameFirstMultiProviderSpider, ProductHtml, settings) + item, _, _ = yield crawl_single_item( + NameFirstMultiProviderSpider, ProductHtml, settings + ) assert cache.exists() assert item == { Price: Price("22€"), @@ -170,7 +181,9 @@ def test_name_first_spider(settings, tmp_path): # Let's see that the cache is working. We use a different and wrong resource, # but it should be ignored by the cached version used - item, _, _ = yield crawl_single_item(NameFirstMultiProviderSpider, NonProductHtml, settings) + item, _, _ = yield crawl_single_item( + NameFirstMultiProviderSpider, NonProductHtml, settings + ) assert item == { Price: Price("22€"), Name: Name("Chocolate"), @@ -181,7 +194,9 @@ def test_name_first_spider(settings, tmp_path): @inlineCallbacks def test_price_first_spider(settings): - item, _, _ = yield crawl_single_item(PriceFirstMultiProviderSpider, ProductHtml, settings) + item, _, _ = yield crawl_single_item( + PriceFirstMultiProviderSpider, ProductHtml, settings + ) assert item == { Price: Price("22€"), Name: Name("Chocolate"), @@ -205,7 +220,9 @@ async def test_http_client_provider(settings): crawler = get_crawler(Spider, settings) crawler.engine = AsyncMock() - with mock.patch("scrapy_poet.page_input_providers.create_scrapy_downloader") as mock_factory: + with mock.patch( + "scrapy_poet.page_input_providers.create_scrapy_downloader" + ) as mock_factory: provider = HttpClientProvider(crawler) results = provider(set(), crawler) assert isinstance(results[0], HttpClient) diff --git a/tests/test_utils.py b/tests/test_utils.py index 9bc905a6..5f24afb6 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -153,11 +153,15 @@ def test_http_request_to_scrapy_request(http_request, kwargs, scrapy_request): ), ( TextResponse("https://example.com", body="a", encoding="ascii"), - HttpResponse("https://example.com", body=b"a", status=200, encoding="ascii"), + HttpResponse( + "https://example.com", body=b"a", status=200, encoding="ascii" + ), ), ( TextResponse("https://example.com", body="a", encoding="utf-8"), - HttpResponse("https://example.com", body=b"a", status=200, encoding="utf-8"), + HttpResponse( + "https://example.com", body=b"a", status=200, encoding="utf-8" + ), ), ), ) diff --git a/tests/utils.py b/tests/utils.py index 0863a1b5..dd4f6751 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -40,11 +40,15 @@ def crawl_items(spider_cls, resource_cls, settings, spider_kwargs=None, port=Non @inlineCallbacks -def crawl_single_item(spider_cls, resource_cls, settings, spider_kwargs=None, port=None): +def crawl_single_item( + spider_cls, resource_cls, settings, spider_kwargs=None, port=None +): """Run a spider where a single item is expected. 
Use in combination with ``capture_capture_exceptions`` and ``CollectorPipeline`` """ - items, url, crawler = yield crawl_items(spider_cls, resource_cls, settings, spider_kwargs=spider_kwargs, port=port) + items, url, crawler = yield crawl_items( + spider_cls, resource_cls, settings, spider_kwargs=spider_kwargs, port=port + ) assert len(items) == 1 resp = items[0] if "exception" in resp: From 558495124258b8b11872f72f824c297d9c816f50 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 20 Jun 2022 19:55:09 +0800 Subject: [PATCH 10/10] update .git-blame-ignore-revs --- .git-blame-ignore-revs | 1 + 1 file changed, 1 insertion(+) diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index 046b3bfb..55e70186 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -2,3 +2,4 @@ # https://github.com/scrapinghub/scrapy-poet/pull/68 58c903617911b3209ad68bfefe3fa1a86be629f4 +7249a133722d1115111a8bbb3b02080a892483f2
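The ``.git-blame-ignore-revs`` file maintained across patches 05, 07 and 10 is a plain list of commit hashes that ``git blame`` can be told to skip, so the bulk-formatting commits do not show up as the last author of every reformatted line. A minimal usage sketch — assuming git >= 2.23, run from the repository root, with ``scrapy_poet/injection.py`` chosen only as an example target::

    # One-off: skip the listed formatting commits for a single blame
    git blame --ignore-revs-file=.git-blame-ignore-revs scrapy_poet/injection.py

    # Or configure it once per clone so plain `git blame` always skips them
    git config blame.ignoreRevsFile .git-blame-ignore-revs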