import datetime
from pathlib import Path
from typing import Type

import andi
import scrapy
import time_machine
from scrapy import Request
from scrapy.commands import ScrapyCommand
from scrapy.crawler import Crawler
from scrapy.exceptions import UsageError
from scrapy.http import Response
from scrapy.utils.misc import load_object
from twisted.internet.defer import inlineCallbacks
from web_poet import ItemPage
from web_poet.testing import Fixture

from scrapy_poet import callback_for
from scrapy_poet.downloadermiddlewares import DEFAULT_PROVIDERS, InjectionMiddleware
from scrapy_poet.injection import Injector

# Module-level stores filled during the crawl: the dependency instances built
# for the page object and the items scraped with it.
saved_dependencies = []
saved_items = []
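

# An Injector that also records every dependency instance built by the
# providers, so they can later be written to the fixture as inputs.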
class SavingInjector(Injector):
    @inlineCallbacks
    def build_instances_from_providers(
        self, request: Request, response: Response, plan: andi.Plan
    ):
        instances = yield super().build_instances_from_providers(
            request, response, plan
        )
        saved_dependencies.extend(instances.values())
        return instances
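

# An item pipeline that records every scraped item; the first recorded item
# becomes the expected output stored in the fixture.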
class SavingPipeline:
    def process_item(self, item, spider):
        saved_items.append(item)
        return item
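

# An InjectionMiddleware that replaces the regular Injector with the saving
# one defined above.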
class SavingInjectionMiddleware(InjectionMiddleware):
    def __init__(self, crawler: Crawler) -> None:
        super().__init__(crawler)
        self.injector = SavingInjector(
            crawler,
            default_providers=DEFAULT_PROVIDERS,
            overrides_registry=self.overrides_registry,
        )
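

# Builds a minimal spider that requests a single URL and handles the response
# with a callback_for() callback for the given page object class.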
def spider_for(injectable: Type[ItemPage]) -> Type[scrapy.Spider]:
    class InjectableSpider(scrapy.Spider):
        name = "injectable"
        url = None

        def start_requests(self):
            yield scrapy.Request(self.url, self.cb)

        cb = callback_for(injectable)

    return InjectableSpider
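

# The command itself: crawl the given URL with the given page object class,
# then save the recorded dependencies and item as a web-poet test fixture.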
class SaveFixtureCommand(ScrapyCommand):
    def syntax(self):
        return "<page object class> <URL>"

    def short_desc(self):
        return "Generate a web-poet test for the provided page object and URL"

    def run(self, args, opts):
        if len(args) != 2:
            raise UsageError()
        type_name = args[0]
        url = args[1]

        cls = load_object(type_name)
        if not issubclass(cls, ItemPage):
            raise UsageError(f"Error: {type_name} is not a descendant of ItemPage")

        spider_cls = spider_for(cls)
        # Wire the saving pipeline and middleware into the crawl settings.
        self.settings["ITEM_PIPELINES"][SavingPipeline] = 100
        self.settings["DOWNLOADER_MIDDLEWARES"][SavingInjectionMiddleware] = 543
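
        # Freeze the clock for the whole crawl so that time-dependent page
        # object output stays reproducible when the fixture is replayed.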
        frozen_time = datetime.datetime.now(datetime.timezone.utc)
        with time_machine.travel(frozen_time):
            self.crawler_process.crawl(spider_cls, url=url)
            self.crawler_process.start()
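
        # The crawl has finished; assemble the fixture from the recorded data.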
        deps = saved_dependencies
        item = saved_items[0]
        meta = {
            "frozen_time": frozen_time.isoformat(),
        }
        basedir = Path(self.settings.get("SCRAPY_POET_TESTS_DIR", "fixtures"))
        fixture = Fixture.save(basedir / type_name, inputs=deps, item=item, meta=meta)
        print(f"\nThe test fixture has been written to {fixture.path}.")
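

# A rough usage sketch, assuming this module is exposed through Scrapy's
# COMMANDS_MODULE setting and the command is registered as "savefixture";
# the page object path and URL below are illustrative, not taken from the
# code above:
#
#   scrapy savefixture myproject.pages.BookPage 'http://example.com/book/1'
#
# The saved fixture can then be replayed with pytest, which web-poet's
# testing framework integrates with:
#
#   pytest fixtures/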