-
Notifications
You must be signed in to change notification settings - Fork 28
startproject and override command line tool for Page Objects development #57
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 10 commits
9aa1c85
7f9150c
3368bfe
3d63b4f
2fefb0e
79ee103
c0ef97f
c485fbb
1ea8b3a
e47664c
28b16e1
8e09e1f
133af64
cbaa801
161c5f9
c91df4b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,7 @@ | ||
| include CHANGES.rst | ||
| include CHANGELOG.rst | ||
| include LICENSE | ||
| include README.rst | ||
|
|
||
| recursive-include tests * | ||
| recursive-include scrapy_poet/templates *.template | ||
|
|
||
| global-exclude __pycache__ *.py[cod] |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,334 @@ | ||
| import tempfile | ||
|
|
||
| import os | ||
| from importlib import resources | ||
|
|
||
| from pathlib import Path | ||
| from scrapy.utils.misc import load_object | ||
| from twisted.python.failure import Failure | ||
|
|
||
| from typing import Type, Callable, Tuple | ||
|
|
||
| from dataclasses import dataclass | ||
|
|
||
| from scrapy import Request | ||
| from scrapy.commands import ScrapyCommand | ||
| from scrapy.exceptions import UsageError | ||
| from scrapy.utils.conf import build_component_list, closest_scrapy_cfg | ||
| from scrapy.utils.spider import DefaultSpider | ||
| from url_matcher.util import get_domain | ||
| from w3lib.url import is_url | ||
|
|
||
| from scrapy_poet import templates, DummyResponse | ||
| from scrapy_poet.po_tester import POTester | ||
| from web_poet import ResponseData | ||
| from web_poet.overrides import find_page_object_overrides | ||
|
|
||
|
|
||
class OverrideCommand(ScrapyCommand):
    """Scrapy command that scaffolds a Page Object override for a URL.

    Given the import path of a base Page Object and a URL, the command:

    1. validates the required settings and creates the Page Object and
       test package folders (``process_options``);
    2. generates the Page Object source from a template;
    3. crawls the URL once to record a fixture, and generates a test
       module that uses it (``run``).
    """

    requires_project = False

    def syntax(self):
        # ``parse_args`` requires exactly two positional arguments.
        return "[options] <page_object> <url>"

    def short_desc(self):
        return "Generate a Page Object override, its test and a fixture for a URL"

    def long_desc(self):
        return (
            "Generate the code of a Page Object that overrides the given "
            "Page Object for the domain of the given URL, record a fixture "
            "for that URL and generate a test case that uses the fixture. "
            "Requires the PO_PACKAGE, PO_TESTS_PACKAGE and "
            "SCRAPY_POET_OVERRIDES settings."
        )

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)

    @staticmethod
    def _git_add(path):
        """Best-effort ``git add`` of *path*.

        The path is passed as a single argv entry (no shell involved), so
        paths containing spaces or shell metacharacters are handled
        correctly. Failures are ignored: staging is only a convenience.
        """
        import subprocess

        try:
            subprocess.run(["git", "add", str(Path(path).absolute())], check=False)
        except OSError:
            # git is not installed or not executable; nothing to stage.
            pass

    def process_options(self, args, opts):
        """Validate settings, create the package folders and generate the
        Page Object code.

        Sets ``self.context`` and ``self.po_path``, which ``run`` reads.

        Raises:
            ValueError: if ``PO_PACKAGE``, ``PO_TESTS_PACKAGE`` or
                ``SCRAPY_POET_OVERRIDES`` is missing from the settings.
        """
        super().process_options(args, opts)
        page_object, url = parse_args(args)

        self.ensure_injection_middleware()
        for key in ("PO_PACKAGE", "PO_TESTS_PACKAGE"):
            if key not in self.settings:
                raise ValueError(
                    f"{key} is not defined in settings and is required to run this command. "
                    "Please, configure the packages where you want to store the autogenerated "
                    "Page Objects and their tests."
                )

        if "SCRAPY_POET_OVERRIDES" not in self.settings:
            raise ValueError(
                "SCRAPY_POET_OVERRIDES is not defined in settings and it is required to run this command. "
                "A typical configuration is the following:\n"
                "\n"
                "    SCRAPY_POET_OVERRIDES = find_page_object_overrides(PO_PACKAGE)"
            )

        # Relative package folders are resolved against the folder holding
        # the closest scrapy.cfg, or the current directory when none is found.
        root_path = Path(".").absolute()
        scrapy_cfg_route = closest_scrapy_cfg()
        if scrapy_cfg_route:
            root_path = Path(scrapy_cfg_route).parent.absolute()

        po_module = self.settings.get("PO_PACKAGE")
        test_module = self.settings.get("PO_TESTS_PACKAGE")
        po_path = path_join(root_path, Path(po_module.replace(".", "/")))
        test_path = path_join(root_path, Path(test_module.replace(".", "/")))

        # Creating folders and files
        templates_path = po_path / "templates"
        for path in (templates_path, test_path):
            path.mkdir(parents=True, exist_ok=True)
        init_path = templates_path / "__init__.py"
        if not init_path.exists():
            init_path.write_text("")
            self._git_add(init_path)

        context = OverrideContext(
            url=url,
            page_object=page_object,
            po_module=po_module,
            test_module=test_module,
            po_path=po_path,
            test_path=test_path,
        )
        # NOTE(review): a reviewer questioned where these attributes should
        # be initialised (their comment is truncated). They are assigned
        # here and read later by ``run``.
        self.context = context
        self.po_path = generate_po_code(context)

        # Configuring the new Page Object so that it is used to record the fixtures
        override_rules_for_new_po = find_page_object_overrides(
            f"{context.po_module}.{context.variables_for_template()['po_submodule']}"
        )
        self.settings["SCRAPY_POET_OVERRIDES"] = (
            self.settings.get("SCRAPY_POET_OVERRIDES", []) + override_rules_for_new_po
        )

    def ensure_injection_middleware(self):
        """Ensures that the InjectionMiddleware is configured"""
        middlewares = build_component_list(
            self.settings.getwithbase("DOWNLOADER_MIDDLEWARES")
        )
        injection_mdlw = "scrapy_poet.InjectionMiddleware"
        if injection_mdlw not in middlewares:
            self.settings.set(
                "DOWNLOADER_MIDDLEWARES",
                {
                    **self.settings.getdict("DOWNLOADER_MIDDLEWARES", {}),
                    injection_mdlw: 543,
                },
            )

    def run(self, args, opts):
        """Crawl the URL once to record a fixture, then generate the test.

        Raises the first failure collected by the request errback, if any.
        """
        page_object, url = parse_args(args)

        errors = []

        # The callback body is intentionally empty; presumably the injection
        # middleware builds ``po`` from the annotation, which is what fills
        # the cache used as fixture -- TODO confirm.
        def callback(response: DummyResponse, po: page_object):
            ...

        def errback(failure: Failure):
            errors.append(failure)

        request = Request(url, callback=callback, errback=errback, dont_filter=True)

        with tempfile.TemporaryDirectory() as tmpdirname:
            tmppath = Path(tmpdirname)
            po_tester = POTester(url, page_object, self.context.test_path)
            fixture_path = po_tester.fixture_path

            if fixture_path.exists():
                print(f"The fixture {fixture_path} for the URL {url} already exists. Updating it...")
            else:
                print(f"Creating fixture {fixture_path} for the URL {url}...")

            # Record into a temporary location so that an existing fixture
            # is only replaced after a successful crawl.
            cache_path = tmppath / fixture_path.name
            self.settings.set("SCRAPY_POET_CACHE", str(cache_path.absolute()))

            spidercls = DefaultSpider
            self.crawler_process.crawl(spidercls, start_requests=lambda: [request])
            self.crawler_process.start()

            if errors:
                print(f"An error occurred while fetching the resources for the page {url}")
                raise errors[0]

            # Moving the data to the fixtures folder
            # NOTE(review): Path.replace may fail if the temporary directory
            # and the fixtures folder are on different filesystems -- verify.
            cache_path.replace(fixture_path)
            self._git_add(fixture_path)
            print("Fixture saved successfully")

        self.po_test_path = generate_test(self.context)
        print()
        print("Finished!")
        print()
        print(f" - You can now add your extraction code to the Page Object at {self.po_path}")
        print(f" - It is handy to debug it locally using the test case at {self.po_test_path}")
        print(" - And remember to invoke this very same command whenever you want fresh data ")
        print(" to update the fixture or if you changed the dependencies on your Page Object. It is safe! :-)")
|
|
||
|
|
||
|
|
||
def parse_args(args) -> Tuple[Callable, str]:
    """Parse the positional command-line arguments.

    Args:
        args: positional arguments; must be exactly
            ``[<page object import path>, <url>]``.

    Returns:
        A ``(page_object, url)`` tuple, where ``page_object`` is the object
        loaded from the import path with ``load_object``.

    Raises:
        UsageError: if the number of arguments is not two or the second
            argument is not a URL according to ``is_url``.
    """
    if len(args) != 2 or not is_url(args[1]):
        # A message tells the user what is wrong instead of failing silently.
        raise UsageError(
            "Expected exactly two arguments: the import path of the Page Object "
            "to override and the URL to fetch."
        )
    page_object_path, url = args
    return load_object(page_object_path), url
|
|
||
def domain_in_snake_case(text: str):
    """Return *text* with every ``.`` and ``-`` replaced by ``_``."""
    separators_to_underscore = str.maketrans(".-", "__")
    return text.translate(separators_to_underscore)
|
|
||
|
|
||
def to_camel_case(snake_str):
    """Convert a ``snake_case`` string to ``CamelCase``.

    Each underscore-separated part is title-cased with ``str.title`` and
    the parts are concatenated without a separator.
    """
    # Based on https://stackoverflow.com/a/19053800/3887420
    return "".join(map(str.title, snake_str.split("_")))
|
|
||
|
|
||
@dataclass
class OverrideContext:
    """Everything needed to generate a Page Object override and its test."""

    # URL the override is generated for.
    url: str
    # Base Page Object class being overridden.
    page_object: Type
    # Dotted name of the package where Page Objects are stored.
    po_module: str
    # Dotted name of the package where tests are stored.
    test_module: str
    # Filesystem folder matching ``po_module``.
    po_path: Path
    # Filesystem folder matching ``test_module``.
    test_path: Path

    def variables_for_template(self):
        """Return the variables available to templates as a dict.

        Names are derived from the domain of ``url`` (in snake case) and
        from the module and name of the base Page Object.
        """
        base_cls = self.page_object
        domain = get_domain(self.url)
        snake_domain = domain_in_snake_case(domain)
        po_submodule = f"{snake_domain}_{base_cls.__name__}"
        return {
            "url": self.url,
            "domain": domain,
            "base_po_module": base_cls.__module__,
            "base_po_name": base_cls.__name__,
            "po_name": f"{to_camel_case(snake_domain)}{base_cls.__name__}",
            "po_module": self.po_module,
            "po_submodule": po_submodule,
            "test_module": self.test_module,
            "test_submodule": f"test_{po_submodule}",
            "po_path": str(self.po_path),
            "test_path": str(self.test_path),
        }
|
|
||
|
|
||
def load_template(module, page_object, po_path: Path, suffix=""):
    """Load the most specific template possible from the given module.

    Lookup order:

    1. ``<PageObjectName><suffix>.template`` in ``module``;
    2. ``default<suffix>.template`` in ``module``;
    3. the default template shipped in the ``templates`` package, which is
       also copied into ``po_path / "templates"`` so it can be customized.

    Args:
        module: dotted name of the package holding the templates.
        page_object: class whose ``__name__`` selects the specific template.
        po_path: folder of the Page Objects package; the fallback template
            is written under its ``templates`` subfolder.
        suffix: text inserted before ``.template`` in the file names.

    Returns:
        A ``(file_name, text)`` tuple. Lines starting with ``##`` are
        removed from the text and leading whitespace is stripped.

    Raises:
        ModuleNotFoundError: if ``module`` cannot be imported.
    """
    import subprocess

    def git_add(path: Path) -> None:
        # Best-effort staging. The path is a single argv entry, so spaces
        # and shell metacharacters in it are safe.
        try:
            subprocess.run(["git", "add", str(path.absolute())], check=False)
        except OSError:
            # git is not installed or not executable; nothing to stage.
            pass

    specific_template = f"{page_object.__name__}{suffix}.template"
    try:
        file, text = specific_template, resources.read_text(module, specific_template)
        print(f"Template '{file}' from module '{module}' loaded.")
    except ModuleNotFoundError as e:
        raise ModuleNotFoundError(
            f"Module {module} is not found. Did you forget to create it?"
        ) from e
    except FileNotFoundError:
        default_template = f"default{suffix}.template"
        try:
            file, text = default_template, resources.read_text(module, default_template)
            print(
                f"Template '{file}' from module '{module}' loaded.\n"
                f"If you want a custom template for '{page_object.__name__}' instead the default one then\n"
                f"create your custom one in the module '{module}' with the name '{specific_template}'"
            )
        except FileNotFoundError:
            templates_path = po_path / "templates"
            templates_path.mkdir(parents=True, exist_ok=True)
            init_path = templates_path / "__init__.py"
            if not init_path.exists():
                init_path.touch()
                git_add(init_path)
            template_path = templates_path / default_template
            file = default_template

            print(
                f"Neither '{specific_template}' nor '{default_template}' template was found in the module '{module}'. "
                f"Creating a default one at {template_path}."
            )
            print("You can edit it to customize the generated code for your Page Objects.")
            print(f"What is more, if you want an specific template only for the Page Objects that overrides {page_object.__name__} "
                  f"you can create one template in the module '{module}' with the name '{specific_template}'. "
                  f"Use the default one as reference.")

            # Seed the project with the packaged default template.
            text = resources.read_text(templates, default_template)
            template_path.write_text(text)
            git_add(template_path)

    # Remove template comment lines ("##") and leading blank lines.
    text = "\n".join(
        [line for line in text.splitlines() if not line.strip().startswith("##")]
    ).lstrip()
    return file, text
|
|
||
|
|
||
def template_for(context: OverrideContext, *, prefix=""):
    """Render the template selected for ``context``.

    Args:
        context: source of the template variables and template location.
        prefix: forwarded to ``load_template`` as its ``suffix`` argument
            (e.g. ``"_test"`` selects the ``*_test.template`` files).

    Returns:
        The template text formatted with ``context.variables_for_template()``.

    Raises:
        ValueError: if the template references a parameter that is not
            among the available variables.
    """
    file, template = load_template(
        f"{context.po_module}.templates", context.page_object, context.po_path, prefix
    )
    data = context.variables_for_template()
    try:
        return template.format(**data)
    except KeyError as e:
        # Chain explicitly so the offending placeholder stays in the traceback.
        raise ValueError(
            f"Unknown parameter {e} on template '{file}'. Available parameters: {list(data.keys())}"
        ) from e
|
|
||
def generate_po_code(context: OverrideContext) -> Path:
    """Write the Page Object module for ``context`` unless it already exists.

    The file is named ``<snake_case_domain>_<PageObjectName>.py`` inside
    ``context.po_path``. When it already exists nothing is modified and
    the rendered sample code is printed instead.

    Returns:
        The path of the Page Object module.
    """
    snake_domain = domain_in_snake_case(get_domain(context.url))
    base_name = context.page_object.__name__
    package_dir = context.po_path
    package_dir.mkdir(parents=True, exist_ok=True)
    po_file_path = package_dir / f"{snake_domain}_{base_name}.py"
    sample = template_for(context)

    if not po_file_path.exists():
        print(f"Saving a Page Object code template to {po_file_path} ...")
        po_file_path.write_text(sample)
        os.system(f'git add "{po_file_path}"')
        print("Done!")
        print(f"Open {po_file_path} and complete the code with your custom extraction")
        return po_file_path

    rule = "-" * 20
    report = (
        f"Page Object code already exists in path {po_file_path}. Not modifying anything.",
        "Use the following code example to add a new Page Object to the file if required:",
        "",
        rule + " Page Object sample code " + rule,
        "",
        sample,
        "",
        rule + " end sample code " + rule,
    )
    for line in report:
        print(line)
    return po_file_path
|
|
||
|
|
||
def generate_test(context: OverrideContext) -> Path:
    """Write the test module for ``context`` unless it already exists.

    The file is named ``test_<snake_case_domain>_<PageObjectName>.py``
    inside ``context.test_path``. When it already exists nothing is
    modified and the rendered sample code is printed instead.

    Returns:
        The path of the test module.
    """
    snake_domain = domain_in_snake_case(get_domain(context.url))
    sample = template_for(context, prefix="_test")
    base_name = context.page_object.__name__
    test_path = context.test_path / f"test_{snake_domain}_{base_name}.py"

    if not test_path.exists():
        print(f"Saving a test template to {test_path} ...")
        test_path.write_text(sample)
        os.system(f'git add "{test_path}"')
        print("Done!")
        print(f"Open {test_path} and complete the code with sensible assert statements")
        return test_path

    rule = "-" * 20
    report = (
        f"Test already exists in path {test_path}. Not modifying anything. ",
        "Use the following code example to add new tests using the captured content:",
        "",
        rule + " sample code " + rule,
        "",
        sample,
        "",
        rule + " end sample code " + rule,
    )
    for line in report:
        print(line)
    return test_path
|
|
||
|
|
||
def path_join(root: Path, path: Path):
    """Resolve *path* against *root*.

    A falsy *path* or *root* returns *path* unchanged. A relative *path*
    is appended to *root*; an absolute one is returned via ``absolute()``.
    """
    if not (path and root):
        return path
    return path.absolute() if path.is_absolute() else root / path
|
|
||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.