diff --git a/.flake8 b/.flake8 deleted file mode 100644 index 1d317e49..00000000 --- a/.flake8 +++ /dev/null @@ -1,53 +0,0 @@ -[flake8] -ignore = - # Refers to the max-line length. Let's suppress the error and simply - # let black take care on how it wants to format the lines. - E501, - - # Refers to "line break before binary operator". - # Similar to above, let black take care of the formatting. - W503, - - # Refers to "necessary dict call - rewrite as a literal". - C408, - - # To be addressed: - # Missing docstring in public module - D100, - # Missing docstring in public class - D101, - # Missing docstring in public method - D102, - # Missing docstring in public function - D103, - # Missing docstring in public package - D104, - # Missing docstring in magic method - D105, - # Missing docstring in __init__ - D107, - # One-line docstring should fit on one line with quotes - D200, - # No blank lines allowed after function docstring - D202, - # 1 blank line required between summary line and description - D205, - # Multi-line docstring closing quotes should be on a separate line - D209, - # First line should end with a period - D400, - # First line should be in imperative mood - D401, - # First line should not be the function's "signature" - D402 - -per-file-ignores = - # F401: Ignore "imported but unused" errors in __init__ files, as those - # imports are there to expose submodule functions so they can be imported - # directly from that module - # F403: Ignore * imports in these files - scrapy_poet/__init__.py:F401,F403 - scrapy_poet/page_inputs/__init__.py:F401,F403 - - # false positive in one case; other cases are fixed - scrapy_poet/injection.py:B028 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5aa77fbf..616676b8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,22 +1,7 @@ repos: - - hooks: - - id: black - language_version: python3 - repo: https://github.com/ambv/black - rev: 24.10.0 - - hooks: - - id: isort - language_version: python3 - repo: https://github.com/PyCQA/isort - rev: 5.13.2 - - hooks: - - id: flake8 - language_version: python3 - additional_dependencies: - - flake8-bugbear - - flake8-comprehensions - - flake8-debugger - - flake8-docstrings - - flake8-string-format - repo: https://github.com/pycqa/flake8 - rev: 7.1.1 +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.13.1 + hooks: + - id: ruff-check + args: [ --fix ] + - id: ruff-format diff --git a/docs/conf.py b/docs/conf.py index 1f3571b0..a834283f 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -10,13 +10,12 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. 
# -import os import pkgutil import sys from datetime import datetime from pathlib import Path -sys.path.insert(0, os.path.abspath("../")) +sys.path.insert(0, str(Path(__file__).parent.parent)) def get_copyright(attribution, *, first_year): @@ -29,7 +28,7 @@ def get_copyright(attribution, *, first_year): def get_version_and_release(): try: - import scrapy_poet # noqa: F401 + import scrapy_poet # noqa: F401,PLC0415 except ImportError: return "", "" version_bytes = pkgutil.get_data("scrapy_poet", "VERSION") or b"" @@ -42,7 +41,7 @@ def get_version_and_release(): # -- Project information ----------------------------------------------------- project = "scrapy-poet" -copyright = get_copyright("Zyte Group Ltd", first_year=2019) +project_copyright = get_copyright("Zyte Group Ltd", first_year=2019) author = "Zyte" version, release = get_version_and_release() diff --git a/example/example/autoextract.py b/example/example/autoextract.py index 816056cd..7ba8917e 100644 --- a/example/example/autoextract.py +++ b/example/example/autoextract.py @@ -3,7 +3,7 @@ which even requires an API request. """ -from typing import Any, Dict +from typing import Any import attr from scrapy import Request @@ -18,7 +18,7 @@ class AutoextractProductResponse: """Input data""" - data: Dict[str, Any] + data: dict[str, Any] class AutoextractProductProvider(PageObjectInputProvider): @@ -51,5 +51,4 @@ def url(self): return self.autoextract_resp.data["product"]["url"] def to_item(self): - product = self.autoextract_resp.data["product"] - return product + return self.autoextract_resp.data["product"] diff --git a/example/example/spiders/books_03.py b/example/example/spiders/books_03.py index 274eb211..a7d388a6 100644 --- a/example/example/spiders/books_03.py +++ b/example/example/spiders/books_03.py @@ -3,8 +3,8 @@ """ import scrapy -from example.autoextract import ProductPage +from example.autoextract import ProductPage from scrapy_poet import callback_for diff --git a/example/example/spiders/books_05.py b/example/example/spiders/books_05.py index 347a98a3..f474eb13 100644 --- a/example/example/spiders/books_05.py +++ b/example/example/spiders/books_05.py @@ -4,9 +4,10 @@ """ import scrapy -from example.autoextract import ProductPage from web_poet import WebPage +from example.autoextract import ProductPage + class BookListPage(WebPage): def product_urls(self): diff --git a/example/example/spiders/books_05_1.py b/example/example/spiders/books_05_1.py index 575cd057..39e8a303 100644 --- a/example/example/spiders/books_05_1.py +++ b/example/example/spiders/books_05_1.py @@ -12,9 +12,9 @@ """ import scrapy -from example.autoextract import ProductPage from web_poet import WebPage +from example.autoextract import ProductPage from scrapy_poet import DummyResponse diff --git a/pyproject.toml b/pyproject.toml index 8d360e16..a95d21b2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,3 @@ -[tool.black] -line-length = 88 - [tool.bumpversion] current_version = "0.26.0" commit = true @@ -34,12 +31,6 @@ exclude_also = [ "@(abc\\.)?abstractmethod", ] -[tool.isort] -profile = "black" -multi_line_output = 3 -# scrapy_poet/__init__.py: Automatic sorting causes circular dependencies. -skip = ["scrapy_poet/__init__.py"] - [[tool.mypy.overrides]] module = [ "tests.test_cache.*", @@ -51,3 +42,154 @@ module = [ # when test cases are decorated with @inlineCallbacks. However, the # tests doesn't return anything at all. 
disable_error_code = "misc" + +[tool.ruff.lint] +extend-select = [ + # flake8-builtins + "A", + # flake8-async + "ASYNC", + # flake8-bugbear + "B", + # flake8-comprehensions + "C4", + # flake8-commas + "COM", + # pydocstyle + "D", + # flake8-future-annotations + "FA", + # flynt + "FLY", + # refurb + "FURB", + # isort + "I", + # flake8-implicit-str-concat + "ISC", + # flake8-logging + "LOG", + # Perflint + "PERF", + # pygrep-hooks + "PGH", + # flake8-pie + "PIE", + # pylint + "PL", + # flake8-pytest-style + "PT", + # flake8-use-pathlib + "PTH", + # flake8-pyi + "PYI", + # flake8-quotes + "Q", + # flake8-return + "RET", + # flake8-raise + "RSE", + # Ruff-specific rules + "RUF", + # flake8-bandit + "S", + # flake8-simplify + "SIM", + # flake8-slots + "SLOT", + # flake8-debugger + "T10", + # flake8-type-checking + "TC", + # flake8-tidy-imports + "TID", + # pyupgrade + "UP", + # pycodestyle warnings + "W", + # flake8-2020 + "YTT", +] +ignore = [ + # Trailing comma missing + "COM812", + # Missing docstring in public module + "D100", + # Missing docstring in public class + "D101", + # Missing docstring in public method + "D102", + # Missing docstring in public function + "D103", + # Missing docstring in public package + "D104", + # Missing docstring in magic method + "D105", + # Missing docstring in __init__ + "D107", + # One-line docstring should fit on one line with quotes + "D200", + # No blank lines allowed after function docstring + "D202", + # 1 blank line required between summary line and description + "D205", + # Multi-line docstring closing quotes should be on a separate line + "D209", + # First line should end with a period + "D400", + # First line should be in imperative mood; try rephrasing + "D401", + # First line should not be the function's "signature" + "D402", + # Too many return statements + "PLR0911", + # Too many branches + "PLR0912", + # Too many arguments in function definition + "PLR0913", + # Too many statements + "PLR0915", + # Magic value used in comparison + "PLR2004", + # String contains ambiguous {}. + "RUF001", + # Docstring contains ambiguous {}. + "RUF002", + # Comment contains ambiguous {}. + "RUF003", + # Mutable class attributes should be annotated with `typing.ClassVar` + "RUF012", + # Use of `assert` detected + "S101", + # Yoda condition detected + "SIM300", + # Add `from __future__ import annotations` to simplify + # (It's harder to keep annotations resolvable at the runtime with it.) + "FA100", +] + +[tool.ruff.lint.flake8-tidy-imports] +banned-module-level-imports = [ + "twisted.internet.reactor", +] + +[tool.ruff.lint.isort] +split-on-trailing-comma = false + +[tool.ruff.lint.per-file-ignores] +"example/*" = ["PLC0415"] +# scrapy_poet/__init__.py: Automatic import sorting causes circular dependencies. 
+"scrapy_poet/__init__.py" = ["F401", "I"] +"scrapy_poet/page_inputs/__init__.py" = ["F401"] +"tests/*" = ["SLOT000", "S"] + +# we need to use typing.Set[] over modern alternatives with web-poet<0.19.0 && Python<3.11 +# see https://github.com/scrapinghub/web-poet/pull/219 +"scrapy_poet/page_input_providers.py" = ["UP006", "UP035"] +"tests/test_downloader.py" =["UP006", "UP035"] +"tests/test_providers.py" =["UP006", "UP035"] +"tests/test_request_fingerprinter.py" =["UP006", "UP035"] +"tests/test_web_poet_rules.py" =["UP006", "UP035"] + +[tool.ruff.lint.pydocstyle] +convention = "pep257" diff --git a/scrapy_poet/_addon.py b/scrapy_poet/_addon.py index b75de290..2f07ec0e 100644 --- a/scrapy_poet/_addon.py +++ b/scrapy_poet/_addon.py @@ -22,15 +22,15 @@ def _replace_builtin( f"{builtin_cls} entry with {new_cls}. Add {new_cls} manually to " f"silence this warning." ) - return None + return if new_cls in setting_value: - return None + return for cls_or_path in setting_value: if isinstance(cls_or_path, str): _cls = load_object(cls_or_path) if _cls == new_cls: - return None + return builtin_entry: object = None for _setting_value in (setting_value, settings[f"{setting}_BASE"]): @@ -54,7 +54,7 @@ def _replace_builtin( f"missing built-in entry {builtin_cls}. Cannot replace it with {new_cls}. " f"Add {new_cls} manually to silence this warning." ) - return None + return if pos is None: logger.warning( diff --git a/scrapy_poet/_request_fingerprinter.py b/scrapy_poet/_request_fingerprinter.py index ba24b41d..056d8a26 100644 --- a/scrapy_poet/_request_fingerprinter.py +++ b/scrapy_poet/_request_fingerprinter.py @@ -1,8 +1,10 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + try: - from scrapy.utils.request import RequestFingerprinter # NOQA + from scrapy.utils.request import RequestFingerprinter # noqa: F401 except ImportError: - from typing import TYPE_CHECKING - if not TYPE_CHECKING: ScrapyPoetRequestFingerprinter = None else: @@ -10,15 +12,17 @@ import json from functools import cached_property from logging import getLogger - from typing import Annotated, Callable, Dict, List, Optional, get_args, get_origin + from typing import Annotated, Callable, get_args, get_origin from weakref import WeakKeyDictionary from andi import CustomBuilder - from scrapy import Request - from scrapy.crawler import Crawler from scrapy.settings.default_settings import REQUEST_FINGERPRINTER_CLASS from scrapy.utils.misc import load_object + if TYPE_CHECKING: + from scrapy import Request + from scrapy.crawler import Crawler + try: from scrapy.utils.misc import build_from_crawler except ImportError: # Scrapy < 2.12 @@ -52,14 +56,12 @@ def build_from_crawler( def _serialize_dep(cls): if isinstance(cls, CustomBuilder): cls = cls.result_class_or_fn - else: - if get_origin(cls) is Annotated: - annotated, *annotations = get_args(cls) - return f"{_serialize_dep(annotated)}{repr(annotations)}" + elif get_origin(cls) is Annotated: + annotated, *annotations = get_args(cls) + return f"{_serialize_dep(annotated)}{annotations!r}" return get_fq_class_name(cls) class ScrapyPoetRequestFingerprinter: - IGNORED_UNANNOTATED_DEPS = { # These dependencies are tools for page objects that should have no # bearing on the request itself. 
@@ -89,10 +91,8 @@ def __init__(self, crawler: Crawler) -> None: ), crawler, ) - self._callback_cache: Dict[Callable, Optional[bytes]] = {} - self._request_cache: "WeakKeyDictionary[Request, bytes]" = ( - WeakKeyDictionary() - ) + self._callback_cache: dict[Callable, bytes | None] = {} + self._request_cache: WeakKeyDictionary[Request, bytes] = WeakKeyDictionary() self._crawler: Crawler = crawler self._saw_unserializable_page_params = False @@ -107,7 +107,7 @@ def _injector(self): "been configured in the DOWNLOADER_MIDDLEWARES setting?" ) - def _get_deps(self, request: Request) -> Optional[List[str]]: + def _get_deps(self, request: Request) -> list[str] | None: """Return a JSON-serializable structure that uniquely identifies the dependencies requested by the request, or None if dependency injection is not required.""" @@ -117,7 +117,7 @@ def _get_deps(self, request: Request) -> Optional[List[str]]: return None return sorted([_serialize_dep(cls) for cls in deps]) - def get_deps_key(self, request: Request) -> Optional[bytes]: + def get_deps_key(self, request: Request) -> bytes | None: """Return a JSON array as bytes that uniquely identifies the dependencies requested through scrapy-poet injection that could impact the request, or None if there are no such dependencies.""" @@ -134,7 +134,7 @@ def get_deps_key(self, request: Request) -> Optional[bytes]: self._callback_cache[callback] = deps_key return self._callback_cache[callback] - def serialize_page_params(self, request: Request) -> Optional[bytes]: + def serialize_page_params(self, request: Request) -> bytes | None: """Return a JSON object as bytes that represents the page params, or None if there are no page params or they are not JSON-serializable.""" @@ -175,5 +175,5 @@ def fingerprint(self, request: Request) -> bytes: if serialized_page_params is not None: fingerprint += serialized_page_params - self._request_cache[request] = hashlib.sha1(fingerprint).digest() + self._request_cache[request] = hashlib.sha1(fingerprint).digest() # noqa: S324 return self._request_cache[request] diff --git a/scrapy_poet/api.py b/scrapy_poet/api.py index 6efbf113..facc7fa6 100644 --- a/scrapy_poet/api.py +++ b/scrapy_poet/api.py @@ -1,5 +1,5 @@ from inspect import iscoroutinefunction -from typing import Callable, Optional, Type +from typing import Callable, Optional from scrapy.http import Request, Response from web_poet.pages import ItemPage @@ -31,7 +31,7 @@ def __init__(self, url: str, request: Optional[Request] = None): super().__init__(url=url, request=request) -def callback_for(page_or_item_cls: Type) -> Callable: +def callback_for(page_or_item_cls: type) -> Callable: """Create a callback for an :class:`web_poet.ItemPage ` subclass or an item class. @@ -116,11 +116,11 @@ def parse(self, response): # a dict of named arguments after our injectable. 
if issubclass(page_or_item_cls, ItemPage): - def parse(*args, page: page_or_item_cls, **kwargs): # type: ignore - yield page.to_item() # type: ignore + def parse(*args, page: page_or_item_cls, **kwargs): # type: ignore[valid-type] + yield page.to_item() # type: ignore[attr-defined] - async def async_parse(*args, page: page_or_item_cls, **kwargs): # type: ignore - yield await page.to_item() # type: ignore + async def async_parse(*args, page: page_or_item_cls, **kwargs): # type: ignore[valid-type] + yield await page.to_item() # type: ignore[attr-defined] if iscoroutinefunction(page_or_item_cls.to_item): setattr(async_parse, _CALLBACK_FOR_MARKER, True) @@ -128,7 +128,7 @@ async def async_parse(*args, page: page_or_item_cls, **kwargs): # type: ignore else: - def parse(*args, item: page_or_item_cls, **kwargs): # type:ignore + def parse(*args, item: page_or_item_cls, **kwargs): # type:ignore[valid-type,misc] yield item setattr(parse, _CALLBACK_FOR_MARKER, True) diff --git a/scrapy_poet/cache.py b/scrapy_poet/cache.py index f97852e3..9d0105fb 100644 --- a/scrapy_poet/cache.py +++ b/scrapy_poet/cache.py @@ -1,11 +1,15 @@ +from __future__ import annotations + import abc -import os import pickle from pathlib import Path -from typing import Any, Union +from typing import TYPE_CHECKING, Any from web_poet.serialization.api import SerializedData, SerializedDataFileStorage +if TYPE_CHECKING: + import os + class _Cache(abc.ABC): @abc.abstractmethod @@ -26,20 +30,18 @@ class SerializedDataCache(_Cache): `web_poet.serialization.SerializedDataFileStorage` """ - def __init__(self, directory: Union[str, os.PathLike]) -> None: + def __init__(self, directory: str | os.PathLike) -> None: self.directory = Path(directory) def __getitem__(self, fingerprint: str) -> SerializedData: storage = SerializedDataFileStorage(self._get_directory_path(fingerprint)) try: serialized_data = storage.read() - except FileNotFoundError: - raise KeyError(f"Fingerprint '{fingerprint}' not found in cache") + except FileNotFoundError as ex: + raise KeyError(f"Fingerprint '{fingerprint}' not found in cache") from ex return serialized_data - def __setitem__( - self, fingerprint: str, value: Union[SerializedData, Exception] - ) -> None: + def __setitem__(self, fingerprint: str, value: SerializedData | Exception) -> None: if isinstance(value, Exception): self.write_exception(fingerprint, value) else: diff --git a/scrapy_poet/commands.py b/scrapy_poet/commands.py index d11d3244..6a241d89 100644 --- a/scrapy_poet/commands.py +++ b/scrapy_poet/commands.py @@ -2,7 +2,7 @@ import logging import sys from pathlib import Path -from typing import Optional, Type +from typing import Optional import andi import scrapy @@ -49,7 +49,7 @@ def build_instances_from_providers( for cls, value in instances.items(): metadata = getattr(cls, "__metadata__", None) if metadata: - value = AnnotatedInstance(value, metadata) + value = AnnotatedInstance(value, metadata) # noqa: PLW2901 saved_dependencies.append(value) return instances @@ -65,10 +65,10 @@ def __init__(self, crawler: Crawler) -> None: def spider_for( - injectable: Type[ItemPage], + injectable: type[ItemPage], url: str, - base_spider: Optional[Type[scrapy.Spider]] = None, -) -> Type[scrapy.Spider]: + base_spider: Optional[type[scrapy.Spider]] = None, +) -> type[scrapy.Spider]: if base_spider is None: base_spider = scrapy.Spider @@ -81,7 +81,7 @@ def __init__(self, name=None, **kwargs): self.start_requests = lambda: [scrapy.Request(url, self.cb, meta=meta)] async def cb(self, response: DummyResponse, page: 
injectable): # type: ignore[valid-type] - global frozen_time + global frozen_time # noqa: PLW0603 frozen_time = datetime.datetime.now(datetime.timezone.utc).replace( microsecond=0 ) @@ -107,7 +107,7 @@ def short_desc(self): def run(self, args, opts): if len(args) < 2: - raise UsageError() + raise UsageError type_name = args[0] url = args[1] @@ -119,9 +119,9 @@ def run(self, args, opts): if not issubclass(cls, ItemPage): raise UsageError(f"Error: {type_name} is not a descendant of ItemPage") - self.settings["DOWNLOADER_MIDDLEWARES"][ - "scrapy_poet.InjectionMiddleware" - ] = None + self.settings["DOWNLOADER_MIDDLEWARES"]["scrapy_poet.InjectionMiddleware"] = ( + None + ) self.settings["DOWNLOADER_MIDDLEWARES"][ "scrapy_poet.downloadermiddlewares.InjectionMiddleware" ] = None diff --git a/scrapy_poet/downloadermiddlewares.py b/scrapy_poet/downloadermiddlewares.py index 25488de0..d5c1d2f6 100644 --- a/scrapy_poet/downloadermiddlewares.py +++ b/scrapy_poet/downloadermiddlewares.py @@ -3,15 +3,14 @@ are executed. """ +from __future__ import annotations + import inspect import logging import warnings -from typing import Generator, Optional, Type, TypeVar, Union +from typing import TYPE_CHECKING -from scrapy import Spider -from scrapy.crawler import Crawler from scrapy.downloadermiddlewares.stats import DownloaderStats -from scrapy.http import Request, Response from twisted.internet.defer import Deferred, inlineCallbacks from web_poet import RulesRegistry from web_poet.exceptions import Retry @@ -29,13 +28,23 @@ ) from .utils import create_registry_instance, is_min_scrapy_version +if TYPE_CHECKING: + from collections.abc import Generator + + from scrapy import Spider + from scrapy.crawler import Crawler + from scrapy.http import Request, Response + + # typing.Self requires Python 3.11 + from typing_extensions import Self + logger = logging.getLogger(__name__) class DownloaderStatsMiddleware(DownloaderStats): def process_response( self, request: Request, response: Response, spider: Spider - ) -> Union[Request, Response]: + ) -> Request | Response: if isinstance(response, DummyResponse): return response return super().process_response(request, response, spider) @@ -51,8 +60,6 @@ def process_response( StatsProvider: 1000, } -InjectionMiddlewareTV = TypeVar("InjectionMiddlewareTV", bound="InjectionMiddleware") - class InjectionMiddleware: """This is a Downloader Middleware that's supposed to: @@ -72,15 +79,10 @@ def __init__(self, crawler: Crawler) -> None: ) @classmethod - def from_crawler( - cls: Type[InjectionMiddlewareTV], crawler: Crawler - ) -> InjectionMiddlewareTV: - o = cls(crawler) - return o - - def process_request( - self, request: Request, spider: Spider - ) -> Optional[DummyResponse]: + def from_crawler(cls, crawler: Crawler) -> Self: + return cls(crawler) + + def process_request(self, request: Request, spider: Spider) -> DummyResponse | None: """This method checks if the request is really needed and if its download could be skipped by trying to infer if a :class:`scrapy.http.Response` is going to be used by the callback or a Page Input. @@ -125,15 +127,12 @@ def _skip_dependency_creation(self, request: Request, spider: Spider) -> bool: return False # Skip if providers are needed. 
- if self.injector.discover_callback_providers(request): - return True - - return False + return bool(self.injector.discover_callback_providers(request)) @inlineCallbacks def process_response( self, request: Request, response: Response, spider: Spider - ) -> Generator[Deferred, object, Union[Response, Request]]: + ) -> Generator[Deferred, object, Response | Request]: """This method fills :attr:`scrapy.Request.cb_kwargs ` with instances for the required Page Objects found in the callback signature. @@ -164,7 +163,9 @@ def process_response( except Retry as exception: # Needed for Twisted < 21.2.0. See the discussion thread linked below: # https://github.com/scrapinghub/scrapy-poet/pull/129#discussion_r1102693967 - from scrapy.downloadermiddlewares.retry import get_retry_request + from scrapy.downloadermiddlewares.retry import ( # noqa: PLC0415 + get_retry_request, + ) reason = str(exception) or "page_object_retry" new_request_or_none = get_retry_request( diff --git a/scrapy_poet/injection.py b/scrapy_poet/injection.py index 769f5ac3..74e3c242 100644 --- a/scrapy_poet/injection.py +++ b/scrapy_poet/injection.py @@ -1,22 +1,11 @@ import functools import inspect import logging -import os import pprint import warnings -from typing import ( - Any, - Callable, - Dict, - Iterable, - List, - Mapping, - Optional, - Set, - Type, - cast, - get_type_hints, -) +from collections.abc import Iterable, Mapping +from pathlib import Path +from typing import Any, Callable, Optional, cast, get_type_hints from weakref import WeakKeyDictionary import andi @@ -62,8 +51,6 @@ class DynamicDeps(dict): values with keys being dependency types. """ - pass - class Injector: """ @@ -84,7 +71,7 @@ def __init__( self.load_providers(default_providers) self.init_cache() - def load_providers(self, default_providers: Optional[Mapping] = None): # noqa: D102 + def load_providers(self, default_providers: Optional[Mapping] = None): providers_dict = { **(default_providers or {}), **self.crawler.settings.getdict("SCRAPY_POET_PROVIDERS"), @@ -103,14 +90,14 @@ def load_providers(self, default_providers: Optional[Mapping] = None): # noqa: self.providers ) - def init_cache(self): # noqa: D102 + def init_cache(self): self.cache = {} cache_path = self.crawler.settings.get("SCRAPY_POET_CACHE") # SCRAPY_POET_CACHE: True if cache_path and isinstance(cache_path, bool): - cache_path = os.path.join( - get_scrapy_data_path(createdir=True), "scrapy-poet-cache" + cache_path = str( + Path(get_scrapy_data_path(createdir=True), "scrapy-poet-cache") ) # SCRAPY_POET_CACHE: @@ -126,11 +113,11 @@ def init_cache(self): # noqa: D102 # This is different from the cache above as it only stores instances as long # as the request exists. This is useful for latter providers to re-use the # already built instances by earlier providers. - self.weak_cache: WeakKeyDictionary[Request, Dict] = WeakKeyDictionary() + self.weak_cache: WeakKeyDictionary[Request, dict] = WeakKeyDictionary() def available_dependencies_for_providers( self, request: Request, response: Response - ): # noqa: D102 + ): deps = { Crawler: self.crawler, Spider: self.spider, @@ -144,7 +131,7 @@ def available_dependencies_for_providers( def discover_callback_providers( self, request: Request - ) -> Set[PageObjectInputProvider]: + ) -> set[PageObjectInputProvider]: """Discover the providers that are required to fulfil the callback dependencies""" plan = self.build_plan(request) result = set() @@ -193,18 +180,18 @@ def _get_custom_builder( on the registry and also supports filling :class:`.DynamicDeps`. 
""" - @functools.lru_cache(maxsize=None) # to minimize the registry queries + @functools.cache # to minimize the registry queries def mapping_fn(dep_cls: Callable) -> Optional[Callable]: # building DynamicDeps if dep_cls is DynamicDeps: dynamic_types = request.meta.get("inject", []) if not dynamic_types: - return lambda: {} + return dict return self._get_dynamic_deps_factory(dynamic_types) # building items from pages - page_object_cls: Optional[Type[ItemPage]] = self.registry.page_cls_for_item( - request.url, cast(type, dep_cls) + page_object_cls: Optional[type[ItemPage]] = self.registry.page_cls_for_item( + request.url, cast("type", dep_cls) ) if not page_object_cls: return None @@ -236,7 +223,7 @@ def _get_dynamic_deps_factory_text( @staticmethod def _get_dynamic_deps_factory( - dynamic_types: List[type], + dynamic_types: list[type], ) -> Callable[..., DynamicDeps]: """Return a function that creates a :class:`.DynamicDeps` instance from its args. @@ -245,15 +232,15 @@ def _get_dynamic_deps_factory( corresponding args. It has correct type hints so that it can be used as an ``andi`` custom builder. """ - type_names: List[str] = [] + type_names: list[str] = [] for type_ in dynamic_types: - type_ = cast(type, strip_annotated(type_)) - if not isinstance(type_, type): + type_stripped = cast("type", strip_annotated(type_)) + if not isinstance(type_stripped, type): raise TypeError(f"Expected a dynamic dependency type, got {type_!r}") - type_names.append(type_.__name__) + type_names.append(type_stripped.__name__) txt = Injector._get_dynamic_deps_factory_text(type_names) - ns: Dict[str, Any] = {} - exec(txt, globals(), ns) + ns: dict[str, Any] = {} + exec(txt, globals(), ns) # noqa: S102 return ns["__create_fn__"](*dynamic_types) @inlineCallbacks @@ -274,8 +261,8 @@ def build_instances( # following the andi plan. assert self.crawler.stats for cls, kwargs_spec in plan.dependencies: - if cls not in instances.keys(): - result_cls: type = cast(type, cls) + if cls not in instances: + result_cls: type = cast("type", cls) if isinstance(cls, andi.CustomBuilder): result_cls = cls.result_class_or_fn instances[result_cls] = yield deferred_from_coro( @@ -297,12 +284,12 @@ def build_instances_from_providers( ): """Build dependencies handled by registered providers""" assert self.crawler.stats - instances: Dict[Callable, Any] = {} + instances: dict[Callable, Any] = {} scrapy_provided_dependencies = self.available_dependencies_for_providers( request, response ) dependencies_set = {cls for cls, _ in plan.dependencies} - objs: List[Any] + objs: list[Any] for provider in self.providers: provided_classes = { cls for cls in dependencies_set if provider.is_provided(cls) @@ -360,11 +347,11 @@ def build_instances_from_providers( self.crawler.stats.inc_value("poet/cache/firsthand") raise - objs_by_type: Dict[Callable, Any] = {} + objs_by_type: dict[Callable, Any] = {} for obj in objs: if isinstance(obj, AnnotatedInstance): cls = obj.get_annotated_cls() - obj = obj.result + obj = obj.result # noqa: PLW2901 else: cls = type(obj) objs_by_type[cls] = obj @@ -411,7 +398,7 @@ def check_all_providers_are_callable(providers): def is_class_provided_by_any_provider_fn( - providers: List[PageObjectInputProvider], + providers: list[PageObjectInputProvider], ) -> Callable[[Callable], bool]: """ Return a function of type ``Callable[[Type], bool]`` that return @@ -419,15 +406,12 @@ def is_class_provided_by_any_provider_fn( The ``is_provided`` method from each provider is used. 
""" - callables: List[Callable[[Callable], bool]] = [] - for provider in providers: - callables.append(provider.is_provided) + callables: list[Callable[[Callable], bool]] = [ + provider.is_provided for provider in providers + ] def is_provided_fn(type_: Callable) -> bool: - for is_provided in callables: - if is_provided(type_): - return True - return False + return any(is_provided(type_) for is_provided in callables) return is_provided_fn @@ -480,7 +464,8 @@ def is_callback_requiring_scrapy_response( "annotated with scrapy_poet.DummyResponse (or its subclasses), " "we're assuming this isn't intended and would simply ignore " "this annotation.\n\n" - "See the Pitfalls doc for more info." + "See the Pitfalls doc for more info.", + stacklevel=1, ) return True @@ -519,7 +504,7 @@ def is_provider_requiring_scrapy_response(provider): def get_injector_for_testing( providers: Mapping, - additional_settings: Optional[Dict] = None, + additional_settings: Optional[dict] = None, registry: Optional[RulesRegistry] = None, ) -> Injector: """ @@ -542,7 +527,7 @@ class MySpider(Spider): def get_response_for_testing( - callback: Callable, meta: Optional[Dict[str, Any]] = None + callback: Callable, meta: Optional[dict[str, Any]] = None ) -> Response: """ Return a :class:`scrapy.http.Response` with fake content with the configured @@ -561,9 +546,6 @@ def get_response_for_testing(
            <p class="description">The best chocolate ever</p>
        </body>
    </html>
- """.encode( - "utf-8" - ) + """.encode() request = Request(url, callback=callback, meta=meta) - response = Response(url, 200, None, html, request=request) - return response + return Response(url, 200, None, html, request=request) diff --git a/scrapy_poet/injection_errors.py b/scrapy_poet/injection_errors.py index 01e4e926..697ed692 100644 --- a/scrapy_poet/injection_errors.py +++ b/scrapy_poet/injection_errors.py @@ -22,5 +22,3 @@ class ProviderDependencyDeadlockError(InjectionError): - Page object named "ChickenPage" require "EggPage" as a dependency. - Page object named "EggPage" require "ChickenPage" as a dependency. """ - - pass diff --git a/scrapy_poet/page_input_providers.py b/scrapy_poet/page_input_providers.py index 9d404250..077340c8 100644 --- a/scrapy_poet/page_input_providers.py +++ b/scrapy_poet/page_input_providers.py @@ -9,7 +9,7 @@ for example, from scrapy-playwright or from an API for automatic extraction. """ -from typing import Any, Callable, ClassVar, FrozenSet, Set, Union +from typing import Any, Callable, ClassVar, Set, Union from scrapy import Request from scrapy.crawler import Crawler @@ -95,7 +95,7 @@ def __call__(self, to_provide, response: Response): is provided by this provider. """ - provided_classes: Union[Set[Callable], Callable[[Callable], bool]] + provided_classes: Union[set[Callable], Callable[[Callable], bool]] name: ClassVar[str] = "" # It must be a unique name. Used by the cache mechanism def is_provided(self, type_: Callable) -> bool: @@ -103,15 +103,14 @@ def is_provided(self, type_: Callable) -> bool: Return ``True`` if the given type is provided by this provider based on the value of the attribute ``provided_classes`` """ - if isinstance(self.provided_classes, (Set, FrozenSet)): + if isinstance(self.provided_classes, (set, frozenset)): return type_ in self.provided_classes - elif callable(self.provided_classes): + if callable(self.provided_classes): return self.provided_classes(type_) - else: - raise MalformedProvidedClassesError( - f"Unexpected type {type_!r} for 'provided_classes' attribute of" - f"{self!r}. Expected either 'set' or 'callable'" - ) + raise MalformedProvidedClassesError( + f"Unexpected type {type_!r} for 'provided_classes' attribute of" + f"{self!r}. Expected either 'set' or 'callable'" + ) # FIXME: Can't import the Injector as class annotation due to circular dep. 
def __init__(self, injector): @@ -243,10 +242,10 @@ def __init__(self, stats): self._stats = stats self._prefix = "poet/stats/" - def set(self, key: str, value: Any) -> None: # noqa: D102 + def set(self, key: str, value: Any) -> None: self._stats.set_value(f"{self._prefix}{key}", value) - def inc(self, key: str, value: StatNum = 1) -> None: # noqa: D102 + def inc(self, key: str, value: StatNum = 1) -> None: self._stats.inc_value(f"{self._prefix}{key}", value) diff --git a/scrapy_poet/spidermiddlewares.py b/scrapy_poet/spidermiddlewares.py index 8f7a1fda..d70b44c2 100644 --- a/scrapy_poet/spidermiddlewares.py +++ b/scrapy_poet/spidermiddlewares.py @@ -1,9 +1,13 @@ -from typing import List, Optional +from __future__ import annotations + +from typing import TYPE_CHECKING -from scrapy import Spider -from scrapy.http import Request, Response from web_poet.exceptions import Retry +if TYPE_CHECKING: + from scrapy import Spider + from scrapy.http import Request, Response + class RetryMiddleware: """Captures :exc:`web_poet.exceptions.Retry` exceptions from spider @@ -14,10 +18,12 @@ def process_spider_exception( response: Response, exception: BaseException, spider: Spider, - ) -> Optional[List[Request]]: + ) -> list[Request] | None: # Needed for Twisted < 21.2.0. See the discussion thread linked below: # https://github.com/scrapinghub/scrapy-poet/pull/129#discussion_r1102693967 - from scrapy.downloadermiddlewares.retry import get_retry_request + from scrapy.downloadermiddlewares.retry import ( # noqa: PLC0415 + get_retry_request, + ) if not isinstance(exception, Retry): return None diff --git a/scrapy_poet/utils/__init__.py b/scrapy_poet/utils/__init__.py index fa7c0372..99273f32 100644 --- a/scrapy_poet/utils/__init__.py +++ b/scrapy_poet/utils/__init__.py @@ -1,6 +1,5 @@ -import os from functools import lru_cache -from typing import Type +from pathlib import Path from packaging.version import Version from scrapy import __version__ as SCRAPY_VERSION @@ -31,7 +30,7 @@ def get_scrapy_data_path(createdir: bool = True, default_dir: str = ".scrapy") - # which does too many things. 
path = project_data_dir() if inside_project() else default_dir if createdir: - os.makedirs(path, exist_ok=True) + Path(path).mkdir(exist_ok=True, parents=True) return path @@ -84,13 +83,13 @@ def http_response_to_scrapy_response(response: HttpResponse) -> HtmlResponse: ) -def create_registry_instance(cls: Type, crawler: Crawler): +def create_registry_instance(cls: type, crawler: Crawler): for module in crawler.settings.getlist("SCRAPY_POET_DISCOVER", []): consume_modules(module) rules = crawler.settings.getlist("SCRAPY_POET_RULES", default_registry.get_rules()) return cls(rules=rules) -@lru_cache() +@lru_cache def is_min_scrapy_version(version: str) -> bool: return Version(SCRAPY_VERSION) >= Version(version) diff --git a/scrapy_poet/utils/mockserver.py b/scrapy_poet/utils/mockserver.py index 8d4203e3..24a363d3 100644 --- a/scrapy_poet/utils/mockserver.py +++ b/scrapy_poet/utils/mockserver.py @@ -16,15 +16,15 @@ def get_ephemeral_port(): class MockServer: def __init__(self, resource, port=None, pythonpath=None): - self.resource = "{0}.{1}".format(resource.__module__, resource.__name__) + self.resource = f"{resource.__module__}.{resource.__name__}" self.proc = None host = socket.gethostbyname(socket.gethostname()) self.port = port or get_ephemeral_port() - self.root_url = "http://%s:%d" % (host, self.port) + self.root_url = f"http://{host}:{self.port}" self.pythonpath = pythonpath or "" def __enter__(self): - self.proc = Popen( + self.proc = Popen( # noqa: S603 [ sys.executable, "-u", @@ -60,11 +60,7 @@ def main(): def print_listening(): host = http_port.getHost() - print( - "Mock server {0} running at http://{1}:{2}".format( - resource, host.host, host.port - ) - ) + print(f"Mock server {resource} running at http://{host.host}:{host.port}") reactor.callWhenRunning(print_listening) reactor.run() diff --git a/scrapy_poet/utils/testing.py b/scrapy_poet/utils/testing.py index e0cab1de..ac894ac7 100644 --- a/scrapy_poet/utils/testing.py +++ b/scrapy_poet/utils/testing.py @@ -1,6 +1,6 @@ +import contextlib import json from inspect import isasyncgenfunction -from typing import Dict from warnings import warn from scrapy import Spider, signals @@ -22,7 +22,7 @@ class HtmlResource(Resource): isLeaf = True content_type = "text/html" html = "" - extra_headers: Dict[str, str] = {} + extra_headers: dict[str, str] = {} status_code = 200 def render_GET(self, request): @@ -106,7 +106,6 @@ def render_GET(self, request): class ProductHtml(HtmlResource): - html = """