scrapinghub · ivanprado · Dec 9, 2021 · Dec 9, 2021 · Dec 9, 2021 · Dec 10, 2021
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1,5 +1,7 @@
-include CHANGES.rst
+include CHANGELOG.rst
 include LICENSE
 include README.rst
 
-recursive-include tests *
+recursive-include scrapy_poet/templates *.template
+
+global-exclude __pycache__ *.py[cod]
diff --git a/example/example/po/__init__.py b/example/example/po/__init__.py
diff --git a/example/example/settings.py b/example/example/settings.py
@@ -8,6 +8,8 @@
 #     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
 from example.autoextract import AutoextractProductProvider
 
+from web_poet.overrides import find_page_object_overrides
+
 BOT_NAME = 'example'
 
 SPIDER_MODULES = ['example.spiders']
@@ -22,3 +24,6 @@
    'scrapy_poet.InjectionMiddleware': 543,
 }
 
+PO_PACKAGE = "example.po"
+PO_TESTS_PACKAGE = "tests.po"
+SCRAPY_POET_OVERRIDES = find_page_object_overrides(PO_PACKAGE)
diff --git a/scrapy_poet/commands/__init__.py b/scrapy_poet/commands/__init__.py
diff --git a/scrapy_poet/commands/override.py b/scrapy_poet/commands/override.py
@@ -0,0 +1,334 @@
+import tempfile
+
+import os
+from importlib import resources
+
+from pathlib import Path
+from scrapy.utils.misc import load_object
+from twisted.python.failure import Failure
+
+from typing import Type, Callable, Tuple
+
+from dataclasses import dataclass
+
+from scrapy import Request
+from scrapy.commands import ScrapyCommand
+from scrapy.exceptions import UsageError
+from scrapy.utils.conf import build_component_list, closest_scrapy_cfg
+from scrapy.utils.spider import DefaultSpider
+from url_matcher.util import get_domain
+from w3lib.url import is_url
+
+from scrapy_poet import templates, DummyResponse
+from scrapy_poet.po_tester import POTester
+from web_poet import ResponseData
+from web_poet.overrides import find_page_object_overrides
+
+
+class OverrideCommand(ScrapyCommand):
+    """Scrapy command that scaffolds a Page Object override for a given URL.
+
+    ``process_options`` validates the settings, creates the Page Object and
+    test folders and writes the Page Object code from a template. ``run``
+    then crawls the URL with the new Page Object configured as an override,
+    stores the recorded data as a fixture and generates a test module.
+    """
+
+    requires_project = False
+
+    def syntax(self):
+        # ``parse_args`` requires exactly two positional arguments, so the
+        # usage line has to advertise both of them.
+        return "[options] <page_object> <url>"
+
+    def short_desc(self):
+        return "TODO"
+
+    def long_desc(self):
+        return (
+            "TODO"
+        )
+
+    def add_options(self, parser):
+        ScrapyCommand.add_options(self, parser)
+
+    @staticmethod
+    def _git_add(path):
+        """Best-effort ``git add`` of ``path``.
+
+        The command is passed as an argument list (no shell involved), so
+        paths containing spaces or shell metacharacters are handled safely.
+        """
+        import subprocess
+
+        try:
+            subprocess.run(["git", "add", str(path)], check=False)
+        except OSError:
+            # git is not available: staging is only a convenience, so the
+            # command carries on without it.
+            pass
+
+    def process_options(self, args, opts):
+        """Validate the settings, create the folders and generate the PO code.
+
+        Stores the resulting ``OverrideContext`` in ``self.context`` and the
+        path of the Page Object file in ``self.po_path`` for ``run``.
+
+        Raises:
+            UsageError: (from ``parse_args``) if the arguments are malformed.
+            ValueError: if a required setting is missing.
+        """
+        super().process_options(args, opts)
+        page_object, url = parse_args(args)
+
+        self.ensure_injection_middleware()
+        for key in ("PO_PACKAGE", "PO_TESTS_PACKAGE"):
+            if key not in self.settings:
+                raise ValueError(f"{key} is not defined in settings and is required to run this command. "
+                                 f"Please, configure the packages where you want to store the autogenerated "
+                                 f"Page Objects and their tests. ")
+
+        if "SCRAPY_POET_OVERRIDES" not in self.settings:
+            raise ValueError(f"SCRAPY_POET_OVERRIDES is not defined in settings and it is required to run this command. "
+                             f"A typical configuration is the following:\n"
+                             f"\n"
+                             f"  SCRAPY_POET_OVERRIDES = find_page_object_overrides(PO_PACKAGE)")
+
+        # Relative package folders are resolved against the folder holding
+        # scrapy.cfg when there is one, otherwise against the current folder.
+        root_path = Path(".").absolute()
+        scrapy_cfg_route = closest_scrapy_cfg()
+        if scrapy_cfg_route:
+            root_path = Path(scrapy_cfg_route).parent.absolute()
+
+        po_module = self.settings.get("PO_PACKAGE")
+        test_module = self.settings.get("PO_TESTS_PACKAGE")
+        po_path = path_join(root_path, Path(po_module.replace(".", "/")))
+        test_path = path_join(root_path, Path(test_module.replace(".", "/")))
+
+        # Creating folders and files
+        for path in (po_path / "templates", test_path):
+            path.mkdir(parents=True, exist_ok=True)
+        init_path = po_path / "templates" / "__init__.py"
+        if not init_path.exists():
+            init_path.write_text("")
+            self._git_add(init_path.absolute())
+
+        context = OverrideContext(
+            url=url,
+            page_object=page_object,
+            po_module=po_module,
+            test_module=test_module,
+            po_path=po_path,
+            test_path=test_path,
+        )
+        self.context = context
+        self.po_path = generate_po_code(context)
+
+        # Configuring the new Page Object so that it is used to record the fixtures
+        override_rules_for_new_po = find_page_object_overrides(
+            f"{context.po_module}.{context.variables_for_template()['po_submodule']}")
+        self.settings["SCRAPY_POET_OVERRIDES"] = (self.settings.get("SCRAPY_POET_OVERRIDES", []) +
+                                                  override_rules_for_new_po)
+
+    def ensure_injection_middleware(self):
+        """Ensures that the InjectionMiddleware is configured"""
+        middlewares = build_component_list(
+            self.settings.getwithbase('DOWNLOADER_MIDDLEWARES'))
+        injection_mdlw = "scrapy_poet.InjectionMiddleware"
+        if injection_mdlw not in middlewares:
+            self.settings.set('DOWNLOADER_MIDDLEWARES', {
+                **self.settings.getdict('DOWNLOADER_MIDDLEWARES', {}),
+                injection_mdlw: 543})
+
+    def run(self, args, opts):
+        """Crawl the URL, save the recorded data as a fixture and generate the test.
+
+        The crawl writes its cache into a temporary folder; on success the
+        cache is moved to the fixture path and added to git.
+
+        Raises:
+            The original exception of the first failed request, if any.
+        """
+        page_object, url = parse_args(args)
+
+        errors = []
+
+        # The ``po`` annotation is the Page Object class given on the command
+        # line; the callback itself does nothing with the injected values.
+        def callback(response: DummyResponse, po: page_object):
+            ...
+
+        def errback(failure: Failure):
+            errors.append(failure)
+
+        request = Request(url, callback=callback, errback=errback, dont_filter=True)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            tmppath = Path(tmpdirname)
+            po_tester = POTester(url, page_object, self.context.test_path)
+            fixture_path = po_tester.fixture_path
+
+            if fixture_path.exists():
+                print(f"The fixture {fixture_path} for the URL {url} already exists. Updating it...")
+            else:
+                print(f"Creating fixture {fixture_path} for the URL {url}...")
+
+            cache_path = tmppath / fixture_path.name
+            self.settings.set("SCRAPY_POET_CACHE", str(cache_path.absolute()))
+
+            spidercls = DefaultSpider
+            self.crawler_process.crawl(spidercls, start_requests=lambda: [request])
+            self.crawler_process.start()
+
+            if errors:
+                print(f"An error occurred while fetching the resources for the page {url}")
+                # A twisted ``Failure`` is not an exception instance, so it
+                # cannot be used in a ``raise`` statement directly.
+                errors[0].raiseException()
+
+            # Moving the data to the fixtures folder
+            # NOTE(review): ``Path.replace`` cannot cross filesystems; confirm the
+            # temporary folder and the fixtures folder share one.
+            cache_path.replace(fixture_path)
+            self._git_add(fixture_path.absolute())
+            print("Fixture saved successfully")
+
+            self.po_test_path = generate_test(self.context)
+            print()
+            print("Finished!")
+            print()
+            print(f" - You can now add your extraction code to the Page Object at {self.po_path}")
+            print(f" - It is handy to debug it locally using the test case at {self.po_test_path}")
+            print(f" - And remember to invoke this very same command whenever you want fresh data ")
+            print(f"   to update the fixture or if you changed the dependencies on your Page Object. It is safe! :-)")
+
+
+
+def parse_args(args) -> Tuple[Callable, str]:
+    """Turn the positional arguments into a ``(page_object, url)`` pair.
+
+    Exactly two arguments are expected: the import path of the Page Object
+    (resolved with ``load_object``) and a URL.
+
+    Raises:
+        UsageError: if there are not exactly two arguments or the second one
+            is not a URL.
+    """
+    well_formed = len(args) == 2 and is_url(args[1])
+    if not well_formed:
+        raise UsageError()
+    po_import_path, url = args
+    return load_object(po_import_path), url
+
+def domain_in_snake_case(text: str):
+    """Return ``text`` with every ``.`` and ``-`` replaced by ``_``."""
+    separators_to_underscore = str.maketrans(".-", "__")
+    return text.translate(separators_to_underscore)
+
+
+def to_camel_case(snake_str):
+    """Convert a snake_case string to CamelCase.
+
+    Each ``_``-separated component is title-cased and the components are
+    concatenated without a separator.
+    """
+    # Based on https://stackoverflow.com/a/19053800/3887420
+    return "".join(map(str.title, snake_str.split("_")))
+
+
+@dataclass
+class OverrideContext:
+    """Data needed to render the Page Object and test templates."""
+
+    # URL the Page Object is being generated for.
+    url: str
+    # Base Page Object class that the generated one builds on.
+    page_object: Type
+    # Dotted package name where the generated Page Objects are stored.
+    po_module: str
+    # Dotted package name where the generated tests are stored.
+    test_module: str
+    # Filesystem folders matching ``po_module`` and ``test_module``.
+    po_path: Path
+    test_path: Path
+
+    def variables_for_template(self):
+        """Return the dict of placeholders available to the templates."""
+        domain = get_domain(self.url)
+        snake_domain = domain_in_snake_case(domain)
+        base_module = self.page_object.__module__
+        base_name = self.page_object.__name__
+        submodule = f"{snake_domain}_{base_name}"
+        return {
+            "url": self.url,
+            "domain": domain,
+            "base_po_module": base_module,
+            "base_po_name": base_name,
+            "po_name": f"{to_camel_case(snake_domain)}{base_name}",
+            "po_module": self.po_module,
+            "po_submodule": submodule,
+            "test_module": self.test_module,
+            "test_submodule": f"test_{submodule}",
+            "po_path": str(self.po_path),
+            "test_path": str(self.test_path),
+        }
+
+
+def _git_add(path: Path) -> None:
+    """Best-effort ``git add`` of ``path`` without going through a shell."""
+    import subprocess
+
+    try:
+        subprocess.run(["git", "add", str(path)], check=False)
+    except OSError:
+        # git is not installed; staging the file is only a convenience.
+        pass
+
+
+def load_template(module, page_object, po_path: Path, suffix=""):
+    """Load the most specific template available in ``module``.
+
+    Lookup order: ``<PageObjectName><suffix>.template`` first, then
+    ``default<suffix>.template``. If neither exists, the default template
+    shipped in ``scrapy_poet.templates`` is copied into
+    ``po_path / "templates"`` (and added to git) and its content is used.
+
+    Returns:
+        A ``(file_name, text)`` tuple. Lines starting with ``##`` are
+        dropped from ``text`` and its leading whitespace is stripped.
+
+    Raises:
+        ModuleNotFoundError: if ``module`` cannot be imported.
+    """
+    specific_template = f"{page_object.__name__}{suffix}.template"
+    try:
+        file, text = specific_template, resources.read_text(module, specific_template)
+        print(f"Template '{file}' from module '{module}' loaded.")
+    except ModuleNotFoundError as err:
+        # Keep the original error as the cause so the failing import is visible.
+        raise ModuleNotFoundError(
+            f"Module {module} is not found. Did you forgot to create it?"
+        ) from err
+    except FileNotFoundError:
+        default_template = f"default{suffix}.template"
+        try:
+            file, text = default_template, resources.read_text(module, default_template)
+            print(
+                f"Template '{file}' from module '{module}' loaded.\n"
+                f"If you want a custom template for '{page_object.__name__}' instead the default one then\n"
+                f"create your custom one in the module '{module}' with the name '{specific_template}'"
+            )
+        except FileNotFoundError:
+            # No template in the project yet: seed it with the bundled default.
+            templates_path = po_path / "templates"
+            templates_path.mkdir(parents=True, exist_ok=True)
+            init_path = templates_path / "__init__.py"
+            if not init_path.exists():
+                init_path.touch()
+                _git_add(init_path)
+            template_path = templates_path / default_template
+            file = default_template
+
+            print(
+                f"Neither '{specific_template}' nor '{default_template}' template was found in the module '{module}'. "
+                f"Creating a default one at {template_path}."
+            )
+            print("You can edit it to customize the generated code for your Page Objects.")
+            print(f"What is more, if you want an specific template only for the Page Objects that overrides {page_object.__name__} "
+                  f"you can create one template in the module '{module}' with the name '{specific_template}'. "
+                  f"Use the default one as reference.")
+
+            text = resources.read_text(templates, default_template)
+            template_path.write_text(text)
+            _git_add(template_path.absolute())
+
+    # Remove template comments (lines starting with ``##``) and any leading
+    # whitespace. Joining the split lines also drops the final newline.
+    text = "\n".join(
+        line for line in text.splitlines() if not line.strip().startswith("##")
+    ).lstrip()
+    return file, text
+
+
+def template_for(context: OverrideContext, *, prefix=""):
+    """Render the template selected for ``context`` with its variables.
+
+    The template is looked up in the ``templates`` subpackage of
+    ``context.po_module``. Despite its name, ``prefix`` is forwarded to
+    ``load_template`` as the file-name *suffix* (e.g. ``"_test"`` selects
+    ``<Name>_test.template``).
+
+    Raises:
+        ValueError: if the template uses a placeholder that is not one of
+            the variables provided by ``context.variables_for_template()``.
+    """
+    file, template = load_template(
+        f"{context.po_module}.templates", context.page_object, context.po_path, prefix
+    )
+    data = context.variables_for_template()
+    try:
+        return template.format(**data)
+    except KeyError as e:
+        # Chain the KeyError so the original traceback is not lost.
+        raise ValueError(
+            f"Unknown parameter {e} on template '{file}'. Available parameters: {list(data.keys())}"
+        ) from e
+
+def generate_po_code(context: OverrideContext) -> Path:
+    """Write the Page Object module for ``context`` unless it already exists.
+
+    The file is named ``<domain>_<PageObjectName>.py`` inside
+    ``context.po_path``. When the file is already there it is left alone and
+    the rendered template is printed as sample code instead.
+
+    Returns the path of the Page Object file.
+    """
+    snake_domain = domain_in_snake_case(get_domain(context.url))
+    base_name = context.page_object.__name__
+    target_dir = context.po_path
+    target_dir.mkdir(parents=True, exist_ok=True)
+    target = target_dir / f"{snake_domain}_{base_name}.py"
+    rendered = template_for(context)
+
+    if not target.exists():
+        print(f"Saving a Page Object code template to {target} ...")
+        target.write_text(rendered)
+        os.system(f'git add "{target}"')
+        print("Done!")
+        print(f"Open {target} and complete the code with your custom extraction")
+        return target
+
+    # The file is never overwritten; the sample is only shown on screen.
+    rule = "-" * 20
+    print(
+        f"Page Object code already exists in path {target}. Not modifying anything."
+    )
+    print("Use the following code example to add a new Page Object to the file if required:")
+    print()
+    print(f"{rule} Page Object sample code {rule}")
+    print()
+    print(rendered)
+    print()
+    print(f"{rule} end sample code {rule}")
+    return target
+
+
+def generate_test(context: OverrideContext) -> Path:
+    """Write the test module for ``context`` unless it already exists.
+
+    The file is named ``test_<domain>_<PageObjectName>.py`` inside
+    ``context.test_path`` and is rendered from the ``_test`` template. When
+    the file is already there it is left alone and the rendered template is
+    printed as sample code instead.
+
+    Returns the path of the test file.
+    """
+    snake_domain = domain_in_snake_case(get_domain(context.url))
+    rendered = template_for(context, prefix="_test")
+    base_name = context.page_object.__name__
+    target = context.test_path / f"test_{snake_domain}_{base_name}.py"
+
+    if not target.exists():
+        print(f"Saving a test template to {target} ...")
+        target.write_text(rendered)
+        os.system(f'git add "{target}"')
+        print("Done!")
+        print(f"Open {target} and complete the code with sensible assert statements")
+        return target
+
+    # The file is never overwritten; the sample is only shown on screen.
+    rule = "-" * 20
+    print(f"Test already exists in path {target}. Not modifying anything. ")
+    print(
+        "Use the following code example to add new tests using the captured content:"
+    )
+    print()
+    print(f"{rule} sample code {rule}")
+    print()
+    print(rendered)
+    print()
+    print(f"{rule} end sample code {rule}")
+    return target
+
+
+def path_join(root: Path, path: Path):
+    """Resolve ``path`` against ``root``.
+
+    A relative ``path`` is appended to ``root``; an absolute one is returned
+    on its own. If either argument is falsy (e.g. ``None``), ``path`` is
+    returned untouched.
+    """
+    if path and root:
+        return path.absolute() if path.is_absolute() else root / path
+    return path
+
+
+