36 changes: 36 additions & 0 deletions .flake8
@@ -0,0 +1,36 @@
[flake8]
ignore =
# Refers to the max line length. Let's suppress the error and simply
# let black take care of how it wants to format the lines.
E501,

# Refers to "line break before binary operator".
# Similar to above, let black take care of the formatting.
W503,

# Refers to "necessary dict call - rewrite as a literal".
C408,

# To be addressed:
D100, # Missing docstring in public module
D101, # Missing docstring in public class
D102, # Missing docstring in public method
D103, # Missing docstring in public function
D104, # Missing docstring in public package
D105, # Missing docstring in magic method
D107, # Missing docstring in __init__
D200, # One-line docstring should fit on one line with quotes
D202, # No blank lines allowed after function docstring
D205, # 1 blank line required between summary line and description
D209, # Multi-line docstring closing quotes should be on a separate line
D400, # First line should end with a period
D401, # First line should be in imperative mood
D402 # First line should not be the function's "signature"

per-file-ignores =
# F401: Ignore "imported but unused" errors in __init__ files, as those
# imports are there to expose submodule functions so they can be imported
# directly from that module
# F403: Ignore * imports in these files
scrapy_poet/__init__.py:F401,F403
scrapy_poet/page_inputs/__init__.py:F401,F403
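For context on the per-file-ignores block above: those __init__.py files re-export names so users get a flat import path, which is exactly the pattern F401 flags. A minimal sketch of such a re-export (the middleware name appears in the README below; the exact module path is an assumption):

    # scrapy_poet/__init__.py
    # Re-exported so users can write `from scrapy_poet import InjectionMiddleware`;
    # without the per-file-ignores entry, flake8 would report this line as F401.
    from .downloadermiddlewares import InjectionMiddleware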
5 changes: 5 additions & 0 deletions .git-blame-ignore-revs
@@ -0,0 +1,5 @@
# Contains commits for git blame to ignore, as they only apply linting/formatting

# https://github.com/scrapinghub/scrapy-poet/pull/68
58c903617911b3209ad68bfefe3fa1a86be629f4
e4a1cfb7d1e7e55de39039ee3925be073ae6d412
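Note that git only consults this file when told to; enabling it is a one-time local setting:

    git config blame.ignoreRevsFile .git-blame-ignore-revs

After that, git blame attributes lines to the commits that last meaningfully changed them, skipping the pure-reformatting commits listed above.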
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
@@ -57,7 +57,7 @@ jobs:
fail-fast: false
matrix:
python-version: ['3.10']
tox-job: ["mypy", "docs"]
tox-job: ["mypy", "docs", "linters"]

steps:
- uses: actions/checkout@v2
22 changes: 22 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,22 @@
repos:
- hooks:
- id: black
language_version: python3
repo: https://github.com/ambv/black
rev: 22.3.0
- hooks:
- id: isort
language_version: python3
repo: https://github.com/PyCQA/isort
rev: 5.10.1
- hooks:
- id: flake8
language_version: python3
additional_dependencies:
- flake8-bugbear
- flake8-comprehensions
- flake8-debugger
- flake8-docstrings
- flake8-string-format
repo: https://github.com/pycqa/flake8
rev: 4.0.1
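The rev keys pin each hook to a released tag, so all contributors run the same tool versions; running pre-commit autoupdate bumps the pins when newer releases are wanted.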
18 changes: 18 additions & 0 deletions README.rst
@@ -60,3 +60,21 @@ Add the following inside Scrapy's ``settings.py`` file:
DOWNLOADER_MIDDLEWARES = {
"scrapy_poet.InjectionMiddleware": 543,
}

Developing
==========

Set up your local Python environment via:

1. ``pip install -r requirements-dev.txt``
2. ``pre-commit install``

Now every time you perform a ``git commit``, these tools will run against the
staged files:

* ``black``
* ``isort``
* ``flake8``

You can also directly invoke ``pre-commit run --all-files`` or ``tox -e linters``
to run them without performing a commit.
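Note that if a hook reformats a file during a commit, the commit is aborted; ``git add`` the modified files and commit again.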
81 changes: 44 additions & 37 deletions docs/conf.py
@@ -12,19 +12,20 @@
#
import os
import sys
sys.path.insert(0, os.path.abspath('../'))

sys.path.insert(0, os.path.abspath("../"))


# -- Project information -----------------------------------------------------

project = u'scrapy-poet'
copyright = u'2022, Zyte'
author = u'Zyte'
project = "scrapy-poet"
copyright = "2022, Zyte"
author = "Zyte"

# The short X.Y version
version = u''
version = ""
# The full version, including alpha/beta/rc tags
release = u'0.3.0'
release = "0.3.0"


# -- General configuration ---------------------------------------------------
@@ -37,24 +38,24 @@
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.intersphinx',
'sphinx.ext.ifconfig',
'sphinx.ext.viewcode',
'sphinx.ext.githubpages',
"sphinx.ext.autodoc",
"sphinx.ext.intersphinx",
"sphinx.ext.ifconfig",
"sphinx.ext.viewcode",
"sphinx.ext.githubpages",
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
templates_path = ["_templates"]

# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
# source_suffix = ['.rst', '.md']
source_suffix = '.rst'
source_suffix = ".rst"

# The master toctree document.
master_doc = 'index'
master_doc = "index"

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
@@ -66,7 +67,7 @@
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = [u'_build', 'Thumbs.db', '.DS_Store']
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = None
@@ -77,12 +78,13 @@
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'
html_theme = "sphinx_rtd_theme"

# Add any paths that contain custom themes here, relative to this directory.
# Add path to the RTD explicitly to robustify builds (otherwise might
# fail in a clean Debian build env)
import sphinx_rtd_theme
import sphinx_rtd_theme # noqa: E402

html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]

# Theme options are theme-specific and customize the look and feel of a theme
@@ -110,7 +112,7 @@
# -- Options for HTMLHelp output ---------------------------------------------

# Output file base name for HTML help builder.
htmlhelp_basename = 'scrapy-poetdoc'
htmlhelp_basename = "scrapy-poetdoc"


# -- Options for LaTeX output ------------------------------------------------
@@ -119,15 +121,12 @@
# The paper size ('letterpaper' or 'a4paper').
#
# 'papersize': 'letterpaper',

# The font size ('10pt', '11pt' or '12pt').
#
# 'pointsize': '10pt',

# Additional stuff for the LaTeX preamble.
#
# 'preamble': '',

# Latex figure (float) alignment
#
# 'figure_align': 'htbp',
@@ -137,19 +136,15 @@
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
(master_doc, 'scrapy-poet.tex', u'scrapy-poet Documentation',
u'Scrapinghub', 'manual'),
(master_doc, "scrapy-poet.tex", "scrapy-poet Documentation", "Scrapinghub", "manual"),
]


# -- Options for manual page output ------------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
(master_doc, 'scrapy-poet', u'scrapy-poet Documentation',
[author], 1)
]
man_pages = [(master_doc, "scrapy-poet", "scrapy-poet Documentation", [author], 1)]


# -- Options for Texinfo output ----------------------------------------------
@@ -158,9 +153,15 @@
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
(master_doc, 'scrapy-poet', u'scrapy-poet Documentation',
author, 'scrapy-poet', 'One line description of project.',
'Miscellaneous'),
(
master_doc,
"scrapy-poet",
"scrapy-poet Documentation",
author,
"scrapy-poet",
"One line description of project.",
"Miscellaneous",
),
]


@@ -179,21 +180,27 @@
# epub_uid = ''

# A list of files that should not be packed into the epub file.
epub_exclude_files = ['search.html']
epub_exclude_files = ["search.html"]


# -- Extension configuration -------------------------------------------------

# -- Options for intersphinx extension ---------------------------------------
intersphinx_mapping = {
'python': ('https://docs.python.org/3', None, ),
'scrapy': ('https://docs.scrapy.org/en/latest', None, ),
'web-poet': ('https://web-poet.readthedocs.io/en/latest/', None),
'url-matcher': ('https://url-matcher.readthedocs.io/en/stable/', None),
"python": (
"https://docs.python.org/3",
None,
),
"scrapy": (
"https://docs.scrapy.org/en/latest",
None,
),
"web-poet": ("https://web-poet.readthedocs.io/en/latest/", None),
"url-matcher": ("https://url-matcher.readthedocs.io/en/stable/", None),
}

autodoc_default_options = {
'special-members': '__init__,__call__',
"special-members": "__init__,__call__",
# 'undoc-members': True,
'exclude-members': '__weakref__'
"exclude-members": "__weakref__",
}
21 changes: 12 additions & 9 deletions example/example/autoextract.py
@@ -2,20 +2,21 @@
Example of how to create a PageObject with a very different input data,
which even requires an API request.
"""
from typing import Dict, Any
from typing import Any, Dict

import attr
from scrapy import Request
from twisted.internet.defer import inlineCallbacks
from twisted.internet.threads import deferToThread
from web_poet import ItemPage

from scrapy import Request
from scrapy_poet.page_input_providers import PageObjectInputProvider
from web_poet import ItemPage


@attr.s(auto_attribs=True)
class AutoextractProductResponse:
""" Input data """
"""Input data"""

data: Dict[str, Any]


@@ -24,7 +25,7 @@ class AutoextractProductProvider(PageObjectInputProvider):

@inlineCallbacks
def __call__(self, to_provide, request: Request):
data = (yield get_autoextract_product(request.url))
data = yield get_autoextract_product(request.url)
return [AutoextractProductResponse(data=data)]


@@ -33,19 +34,21 @@ def get_autoextract_product(url):
# fixme: use async
# fixme: rate limits?
from autoextract.sync import request_batch
resp = yield deferToThread(request_batch, urls=[url], page_type='product')

resp = yield deferToThread(request_batch, urls=[url], page_type="product")
return resp[0]


@attr.s(auto_attribs=True)
class ProductPage(ItemPage):
""" Generic product page """
"""Generic product page"""

autoextract_resp: AutoextractProductResponse

@property
def url(self):
return self.autoextract_resp.data['product']['url']
return self.autoextract_resp.data["product"]["url"]

def to_item(self):
product = self.autoextract_resp.data['product']
product = self.autoextract_resp.data["product"]
return product
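For context, a page object like this is consumed by annotating a spider callback: scrapy-poet's InjectionMiddleware builds the annotated dependencies before the callback runs. A minimal sketch, assuming the provider and middleware registered in example/settings.py (the spider name and URL are hypothetical):

    import scrapy

    from example.autoextract import ProductPage


    class ProductSpider(scrapy.Spider):
        name = "products"
        start_urls = ["http://example.com/product"]  # hypothetical URL

        def parse(self, response, page: ProductPage):
            # The middleware sees the ProductPage annotation, resolves its
            # AutoextractProductResponse dependency through
            # AutoextractProductProvider, and injects the built page object.
            yield page.to_item()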
9 changes: 4 additions & 5 deletions example/example/settings.py
@@ -8,17 +8,16 @@
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from example.autoextract import AutoextractProductProvider

BOT_NAME = 'example'
BOT_NAME = "example"

SPIDER_MODULES = ['example.spiders']
NEWSPIDER_MODULE = 'example.spiders'
SPIDER_MODULES = ["example.spiders"]
NEWSPIDER_MODULE = "example.spiders"

SCRAPY_POET_PROVIDERS = {AutoextractProductProvider: 500}

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

DOWNLOADER_MIDDLEWARES = {
'scrapy_poet.InjectionMiddleware': 543,
"scrapy_poet.InjectionMiddleware": 543,
}

10 changes: 5 additions & 5 deletions example/example/spiders/books_01.py
@@ -5,15 +5,15 @@


class BooksSpider(scrapy.Spider):
name = 'books_01'
start_urls = ['http://books.toscrape.com/']
name = "books_01"
start_urls = ["http://books.toscrape.com/"]

def parse(self, response):
for url in response.css('.image_container a::attr(href)').getall():
for url in response.css(".image_container a::attr(href)").getall():
yield response.follow(url, self.parse_book)

def parse_book(self, response):
yield {
'url': response.url,
'name': response.css("title::text").get(),
"url": response.url,
"name": response.css("title::text").get(),
}