36 changes: 36 additions & 0 deletions .flake8
@@ -0,0 +1,36 @@
[flake8]
ignore =
# Refers to the max line length. Let's suppress the error and simply
# let black take care of how it wants to format the lines.
E501,

# Refers to "line break before binary operator".
# Similar to above, let black take care of the formatting.
W503,

# Refers to "necessary dict call - rewrite as a literal".
C408,

# To be addressed:
D100, # Missing docstring in public module
D101, # Missing docstring in public class
D102, # Missing docstring in public method
D103, # Missing docstring in public function
D104, # Missing docstring in public package
D105, # Missing docstring in magic method
D107, # Missing docstring in __init__
D200, # One-line docstring should fit on one line with quotes
D202, # No blank lines allowed after function docstring
D205, # 1 blank line required between summary line and description
D209, # Multi-line docstring closing quotes should be on a separate line
D400, # First line should end with a period
D401, # First line should be in imperative mood
D402 # First line should not be the function's "signature"

per-file-ignores =
# F401: Ignore "imported but unused" errors in __init__ files, as those
# imports are there to expose submodule functions so they can be imported
# directly from that module
# F403: Ignore * imports in these files
scrapy_poet/__init__.py:F401,F403
scrapy_poet/page_inputs/__init__.py:F401,F403
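For context on the per-file-ignores block above: those __init__.py files re-export names so users get a flat import path, which is exactly the pattern F401 flags. A minimal sketch of such a re-export (the middleware name appears in the README below; the exact module path is an assumption):

    # scrapy_poet/__init__.py
    # Re-exported so users can write `from scrapy_poet import InjectionMiddleware`;
    # without the per-file-ignores entry, flake8 would report this line as F401.
    from .downloadermiddlewares import InjectionMiddleware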
5 changes: 5 additions & 0 deletions .git-blame-ignore-revs
@@ -0,0 +1,5 @@
# Contains commits for git blame to ignore, as they only apply linting/formatting

# https://github.com/scrapinghub/scrapy-poet/pull/68
58c903617911b3209ad68bfefe3fa1a86be629f4
e4a1cfb7d1e7e55de39039ee3925be073ae6d412
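Note that git only consults this file when told to; enabling it is a one-time local setting:

    git config blame.ignoreRevsFile .git-blame-ignore-revs

After that, git blame attributes lines to the commits that last meaningfully changed them, skipping the pure-reformatting commits listed above.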
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
@@ -57,7 +57,7 @@ jobs:
fail-fast: false
matrix:
python-version: ['3.10']
tox-job: ["mypy", "docs"]
tox-job: ["mypy", "docs", "linters"]

steps:
- uses: actions/checkout@v2
22 changes: 22 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,22 @@
repos:
- hooks:
- id: black
language_version: python3
repo: https://github.com/ambv/black
rev: 22.3.0
- hooks:
- id: isort
language_version: python3
repo: https://github.com/PyCQA/isort
rev: 5.10.1
- hooks:
- id: flake8
language_version: python3
additional_dependencies:
- flake8-bugbear
- flake8-comprehensions
- flake8-debugger
- flake8-docstrings
- flake8-string-format
repo: https://github.com/pycqa/flake8
rev: 4.0.1
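The rev keys pin each hook to a released tag, so all contributors run the same tool versions; running pre-commit autoupdate bumps the pins when newer releases are wanted.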
18 changes: 18 additions & 0 deletions README.rst
@@ -60,3 +60,21 @@ Add the following inside Scrapy's ``settings.py`` file:
DOWNLOADER_MIDDLEWARES = {
"scrapy_poet.InjectionMiddleware": 543,
}

Developing
==========

Set up your local Python environment via:

1. ``pip install -r requirements-dev.txt``
2. ``pre-commit install``

Now every time you perform a ``git commit``, these tools will run against the
staged files:

* ``black``
* ``isort``
* ``flake8``

You can also directly invoke ``pre-commit run --all-files`` or ``tox -e linters``
to run them without performing a commit.
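Note that if a hook reformats a file during a commit, the commit is aborted; ``git add`` the modified files and commit again.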
81 changes: 44 additions & 37 deletions docs/conf.py
@@ -12,19 +12,20 @@
#
import os
import sys
sys.path.insert(0, os.path.abspath('../'))

sys.path.insert(0, os.path.abspath("../"))


# -- Project information -----------------------------------------------------

project = u'scrapy-poet'
copyright = u'2022, Zyte'
author = u'Zyte'
project = "scrapy-poet"
copyright = "2022, Zyte"
author = "Zyte"

# The short X.Y version
version = u''
version = ""
# The full version, including alpha/beta/rc tags
release = u'0.3.0'
release = "0.3.0"


# -- General configuration ---------------------------------------------------
@@ -37,24 +38,24 @@
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.intersphinx',
'sphinx.ext.ifconfig',
'sphinx.ext.viewcode',
'sphinx.ext.githubpages',
"sphinx.ext.autodoc",
"sphinx.ext.intersphinx",
"sphinx.ext.ifconfig",
"sphinx.ext.viewcode",
"sphinx.ext.githubpages",
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
templates_path = ["_templates"]

# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
# source_suffix = ['.rst', '.md']
source_suffix = '.rst'
source_suffix = ".rst"

# The master toctree document.
master_doc = 'index'
master_doc = "index"

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
@@ -66,7 +67,7 @@
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = [u'_build', 'Thumbs.db', '.DS_Store']
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = None
@@ -77,12 +78,13 @@
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'
html_theme = "sphinx_rtd_theme"

# Add any paths that contain custom themes here, relative to this directory.
# Add path to the RTD explicitly to robustify builds (otherwise might
# fail in a clean Debian build env)
import sphinx_rtd_theme
import sphinx_rtd_theme # noqa: E402

html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]

# Theme options are theme-specific and customize the look and feel of a theme
@@ -110,7 +112,7 @@
# -- Options for HTMLHelp output ---------------------------------------------

# Output file base name for HTML help builder.
htmlhelp_basename = 'scrapy-poetdoc'
htmlhelp_basename = "scrapy-poetdoc"


# -- Options for LaTeX output ------------------------------------------------
@@ -119,15 +121,12 @@
# The paper size ('letterpaper' or 'a4paper').
#
# 'papersize': 'letterpaper',

# The font size ('10pt', '11pt' or '12pt').
#
# 'pointsize': '10pt',

# Additional stuff for the LaTeX preamble.
#
# 'preamble': '',

# Latex figure (float) alignment
#
# 'figure_align': 'htbp',
@@ -137,19 +136,15 @@
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
(master_doc, 'scrapy-poet.tex', u'scrapy-poet Documentation',
u'Scrapinghub', 'manual'),
(master_doc, "scrapy-poet.tex", "scrapy-poet Documentation", "Scrapinghub", "manual"),
]


# -- Options for manual page output ------------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
(master_doc, 'scrapy-poet', u'scrapy-poet Documentation',
[author], 1)
]
man_pages = [(master_doc, "scrapy-poet", "scrapy-poet Documentation", [author], 1)]


# -- Options for Texinfo output ----------------------------------------------
@@ -158,9 +153,15 @@
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
(master_doc, 'scrapy-poet', u'scrapy-poet Documentation',
author, 'scrapy-poet', 'One line description of project.',
'Miscellaneous'),
(
master_doc,
"scrapy-poet",
"scrapy-poet Documentation",
author,
"scrapy-poet",
"One line description of project.",
"Miscellaneous",
),
]


@@ -179,21 +180,27 @@
# epub_uid = ''

# A list of files that should not be packed into the epub file.
epub_exclude_files = ['search.html']
epub_exclude_files = ["search.html"]


# -- Extension configuration -------------------------------------------------

# -- Options for intersphinx extension ---------------------------------------
intersphinx_mapping = {
'python': ('https://docs.python.org/3', None, ),
'scrapy': ('https://docs.scrapy.org/en/latest', None, ),
'web-poet': ('https://web-poet.readthedocs.io/en/latest/', None),
'url-matcher': ('https://url-matcher.readthedocs.io/en/stable/', None),
"python": (
"https://docs.python.org/3",
None,
),
"scrapy": (
"https://docs.scrapy.org/en/latest",
None,
),
"web-poet": ("https://web-poet.readthedocs.io/en/latest/", None),
"url-matcher": ("https://url-matcher.readthedocs.io/en/stable/", None),
}

autodoc_default_options = {
'special-members': '__init__,__call__',
"special-members": "__init__,__call__",
# 'undoc-members': True,
'exclude-members': '__weakref__'
"exclude-members": "__weakref__",
}
21 changes: 12 additions & 9 deletions example/example/autoextract.py
@@ -2,20 +2,21 @@
Example of how to create a PageObject with a very different input data,
which even requires an API request.
"""
from typing import Dict, Any
from typing import Any, Dict

import attr
from scrapy import Request
from twisted.internet.defer import inlineCallbacks
from twisted.internet.threads import deferToThread
from web_poet import ItemPage

from scrapy import Request
from scrapy_poet.page_input_providers import PageObjectInputProvider
from web_poet import ItemPage


@attr.s(auto_attribs=True)
class AutoextractProductResponse:
""" Input data """
"""Input data"""

data: Dict[str, Any]


@@ -24,7 +25,7 @@ class AutoextractProductProvider(PageObjectInputProvider):

@inlineCallbacks
def __call__(self, to_provide, request: Request):
data = (yield get_autoextract_product(request.url))
data = yield get_autoextract_product(request.url)
return [AutoextractProductResponse(data=data)]


@@ -33,19 +34,21 @@ def get_autoextract_product(url):
# fixme: use async
# fixme: rate limits?
from autoextract.sync import request_batch
resp = yield deferToThread(request_batch, urls=[url], page_type='product')

resp = yield deferToThread(request_batch, urls=[url], page_type="product")
return resp[0]


@attr.s(auto_attribs=True)
class ProductPage(ItemPage):
""" Generic product page """
"""Generic product page"""

autoextract_resp: AutoextractProductResponse

@property
def url(self):
return self.autoextract_resp.data['product']['url']
return self.autoextract_resp.data["product"]["url"]

def to_item(self):
product = self.autoextract_resp.data['product']
product = self.autoextract_resp.data["product"]
return product
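For context, a page object like this is consumed by annotating a spider callback: scrapy-poet's InjectionMiddleware builds the annotated dependencies before the callback runs. A minimal sketch, assuming the provider and middleware registered in example/settings.py (the spider name and URL are hypothetical):

    import scrapy

    from example.autoextract import ProductPage


    class ProductSpider(scrapy.Spider):
        name = "products"
        start_urls = ["http://example.com/product"]  # hypothetical URL

        def parse(self, response, page: ProductPage):
            # The middleware sees the ProductPage annotation, resolves its
            # AutoextractProductResponse dependency through
            # AutoextractProductProvider, and injects the built page object.
            yield page.to_item()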
9 changes: 4 additions & 5 deletions example/example/settings.py
@@ -8,17 +8,16 @@
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from example.autoextract import AutoextractProductProvider

BOT_NAME = 'example'
BOT_NAME = "example"

SPIDER_MODULES = ['example.spiders']
NEWSPIDER_MODULE = 'example.spiders'
SPIDER_MODULES = ["example.spiders"]
NEWSPIDER_MODULE = "example.spiders"

SCRAPY_POET_PROVIDERS = {AutoextractProductProvider: 500}

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

DOWNLOADER_MIDDLEWARES = {
'scrapy_poet.InjectionMiddleware': 543,
"scrapy_poet.InjectionMiddleware": 543,
}

10 changes: 5 additions & 5 deletions example/example/spiders/books_01.py
@@ -5,15 +5,15 @@


class BooksSpider(scrapy.Spider):
name = 'books_01'
start_urls = ['http://books.toscrape.com/']
name = "books_01"
start_urls = ["http://books.toscrape.com/"]

def parse(self, response):
for url in response.css('.image_container a::attr(href)').getall():
for url in response.css(".image_container a::attr(href)").getall():
yield response.follow(url, self.parse_book)

def parse_book(self, response):
yield {
'url': response.url,
'name': response.css("title::text").get(),
"url": response.url,
"name": response.css("title::text").get(),
}