Skip to content
Open
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
include CHANGES.rst
include CHANGELOG.rst
include LICENSE
include README.rst

recursive-include tests *
recursive-include scrapy_poet/templates *.template

global-exclude __pycache__ *.py[cod]
Empty file added example/example/po/__init__.py
Empty file.
5 changes: 5 additions & 0 deletions example/example/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from example.autoextract import AutoextractProductProvider

from web_poet.overrides import find_page_object_overrides

BOT_NAME = 'example'

SPIDER_MODULES = ['example.spiders']
Expand All @@ -22,3 +24,6 @@
'scrapy_poet.InjectionMiddleware': 543,
}

PO_PACKAGE = "example.po"
PO_TESTS_PACKAGE = "tests.po"
SCRAPY_POET_OVERRIDES = find_page_object_overrides(PO_PACKAGE)
Empty file.
334 changes: 334 additions & 0 deletions scrapy_poet/commands/override.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,334 @@
import tempfile

import os
from importlib import resources

from pathlib import Path
from scrapy.utils.misc import load_object
from twisted.python.failure import Failure

from typing import Type, Callable, Tuple

from dataclasses import dataclass

from scrapy import Request
from scrapy.commands import ScrapyCommand
from scrapy.exceptions import UsageError
from scrapy.utils.conf import build_component_list, closest_scrapy_cfg
from scrapy.utils.spider import DefaultSpider
from url_matcher.util import get_domain
from w3lib.url import is_url

from scrapy_poet import templates, DummyResponse
from scrapy_poet.po_tester import POTester
from web_poet import ResponseData
from web_poet.overrides import find_page_object_overrides


class OverrideCommand(ScrapyCommand):
    """Scrapy command that scaffolds a Page Object override for a URL.

    Given the import path of a base Page Object class and a URL, the command:

    1. generates (or shows sample code for) a Page Object module,
    2. crawls the URL and stores the recorded data as a fixture,
    3. generates (or shows sample code for) a test module.

    The packages where the generated code is stored are read from the
    ``PO_PACKAGE`` and ``PO_TESTS_PACKAGE`` settings.
    """

    requires_project = False

    # Populated by ``process_options`` (context, po_path) and ``run``
    # (po_test_path). Declared here so the attributes and their types are
    # visible before any method assigns them, as requested in review.
    context: "OverrideContext"
    po_path: Path
    po_test_path: Path

    def syntax(self):
        """Return the usage string; both positional arguments are required."""
        return "[options] <page_object> <url>"

    def short_desc(self):
        """Return the one-line description of the command."""
        return "Generate a Page Object override, a fixture and a test for a URL"

    def long_desc(self):
        """Return the extended description of the command."""
        return (
            "Generate the code of a Page Object that overrides the given Page "
            "Object for the domain of the given URL, record a fixture with the "
            "data fetched for the URL and generate a test that uses it."
        )

    def add_options(self, parser):
        """Register the default Scrapy command options (no extra ones)."""
        ScrapyCommand.add_options(self, parser)

    def process_options(self, args, opts):
        """Validate settings, create folders and generate the Page Object.

        Raises:
            UsageError: (from ``parse_args``) if the arguments are invalid.
            ValueError: if ``PO_PACKAGE``, ``PO_TESTS_PACKAGE`` or
                ``SCRAPY_POET_OVERRIDES`` is missing from the settings.
        """
        super().process_options(args, opts)
        page_object, url = parse_args(args)

        self.ensure_injection_middleware()
        for key in ("PO_PACKAGE", "PO_TESTS_PACKAGE"):
            if key not in self.settings:
                raise ValueError(
                    f"{key} is not defined in settings and is required to run this command. "
                    "Please, configure the packages where you want to store the autogenerated "
                    "Page Objects and their tests. "
                )

        if "SCRAPY_POET_OVERRIDES" not in self.settings:
            raise ValueError(
                "SCRAPY_POET_OVERRIDES is not defined in settings and it is required to run this command. "
                "A typical configuration is the following:\n"
                "\n"
                " SCRAPY_POET_OVERRIDES = find_page_object_overrides(PO_PACKAGE)"
            )

        # Relative package folders are resolved against the directory that
        # holds the closest scrapy.cfg, or the current directory otherwise.
        root_path = Path(".").absolute()
        scrapy_cfg_route = closest_scrapy_cfg()
        if scrapy_cfg_route:
            root_path = Path(scrapy_cfg_route).parent.absolute()

        po_module = self.settings.get("PO_PACKAGE")
        po_folder = po_module.replace(".", "/")
        po_path = Path(po_folder)
        test_module = self.settings.get("PO_TESTS_PACKAGE")
        po_tests_folder = test_module.replace(".", "/")
        test_path = Path(po_tests_folder)

        po_path = path_join(root_path, po_path)
        test_path = path_join(root_path, test_path)

        # Creating folders and files
        for path in (po_path / "templates", test_path):
            path.mkdir(parents=True, exist_ok=True)
        init_path = po_path / "templates" / "__init__.py"
        if not init_path.exists():
            init_path.write_text("")
            # The path is quoted so that folders containing spaces work.
            os.system(f'git add "{init_path.absolute()}"')

        context = OverrideContext(
            url=url,
            page_object=page_object,
            po_module=po_module,
            test_module=test_module,
            po_path=po_path,
            test_path=test_path,
        )
        self.context = context
        self.po_path = generate_po_code(context)

        # Configuring the new Page Object so that it is used to record the fixtures
        override_rules_for_new_po = find_page_object_overrides(
            f"{context.po_module}.{context.variables_for_template()['po_submodule']}")
        self.settings["SCRAPY_POET_OVERRIDES"] = (self.settings.get("SCRAPY_POET_OVERRIDES", []) +
                                                  override_rules_for_new_po)

    def ensure_injection_middleware(self):
        """Ensures that the InjectionMiddleware is configured"""
        middlewares = build_component_list(
            self.settings.getwithbase('DOWNLOADER_MIDDLEWARES'))
        injection_mdlw = "scrapy_poet.InjectionMiddleware"
        if injection_mdlw not in middlewares:
            self.settings.set('DOWNLOADER_MIDDLEWARES', {
                **self.settings.getdict('DOWNLOADER_MIDDLEWARES', {}),
                injection_mdlw: 543})

    def run(self, args, opts):
        """Crawl the URL, store the recorded fixture and generate the test.

        Raises the first failure collected by the request errback, if any.
        """
        page_object, url = parse_args(args)

        errors = []

        # The ``po`` annotation is the Page Object class given on the command
        # line, so it is only known at run time.
        def callback(response: DummyResponse, po: page_object):
            ...

        def errback(failure: Failure):
            errors.append(failure)

        request = Request(url, callback=callback, errback=errback, dont_filter=True)

        with tempfile.TemporaryDirectory() as tmpdirname:
            tmppath = Path(tmpdirname)
            po_tester = POTester(url, page_object, self.context.test_path)
            fixture_path = po_tester.fixture_path

            if fixture_path.exists():
                print(f"The fixture {fixture_path} for the URL {url} already exists. Updating it...")
            else:
                print(f"Creating fixture {fixture_path} for the URL {url}...")

            # Data is recorded in a temporary location first, so an existing
            # fixture is only replaced after a crawl without errors.
            cache_path = tmppath / fixture_path.name
            self.settings.set("SCRAPY_POET_CACHE", str(cache_path.absolute()))

            spidercls = DefaultSpider
            self.crawler_process.crawl(spidercls, start_requests=lambda: [request])
            self.crawler_process.start()

            if errors:
                print(f"An error occurred while fetching the resources for the page {url}")
                raise errors[0]

            # Moving the data to the fixtures folder
            # NOTE(review): Path.replace may fail if the temporary directory
            # and the fixtures folder are on different filesystems - confirm.
            cache_path.replace(fixture_path)
            os.system(f'git add "{fixture_path.absolute()}"')
            print("Fixture saved successfully")

        self.po_test_path = generate_test(self.context)
        # NOTE(review): a reviewer asked why print is used here instead of
        # logging; the output is kept on stdout pending that decision.
        print()
        print("Finished!")
        print()
        print(f" - You can now add your extraction code to the Page Object at {self.po_path}")
        print(f" - It is handy to debug it locally using the test case at {self.po_test_path}")
        print(" - And remember to invoke this very same command whenever you want fresh data ")
        print(" to update the fixture or if you changed the dependencies on your Page Object. It is safe! :-)")



def parse_args(args) -> Tuple[Callable, str]:
    """Validate the positional arguments and resolve the Page Object.

    Exactly two arguments are expected: the import path of the Page Object
    and a URL. The Page Object is loaded with ``load_object``.

    Returns:
        A ``(page_object, url)`` tuple.

    Raises:
        UsageError: if there are not exactly two arguments or the second one
            is not a URL.
    """
    if len(args) == 2 and is_url(args[1]):
        object_path, url = args[0], args[1]
        return load_object(object_path), url
    raise UsageError()

def domain_in_snake_case(text: str):
    """Return ``text`` with every dot and hyphen replaced by an underscore."""
    return text.translate(str.maketrans(".-", "__"))


def to_camel_case(snake_str):
    """Convert a ``snake_case`` string into ``CamelCase``.

    The string is split on underscores, each piece is title-cased with
    ``str.title`` and the pieces are concatenated.
    """
    # Based on https://stackoverflow.com/a/19053800/3887420
    return "".join(map(str.title, snake_str.split("_")))


@dataclass
class OverrideContext:
    """Data needed to generate the Page Object and test code for a URL."""

    url: str
    page_object: Type
    po_module: str
    test_module: str
    po_path: Path
    test_path: Path

    def variables_for_template(self):
        """Return the dict of variables available to the code templates."""
        domain = get_domain(self.url)
        snake_domain = domain_in_snake_case(domain)
        base_cls = self.page_object
        # The generated submodule is named after the domain and the base class.
        po_submodule = f"{snake_domain}_{base_cls.__name__}"
        return {
            "url": self.url,
            "domain": domain,
            "base_po_module": base_cls.__module__,
            "base_po_name": base_cls.__name__,
            "po_name": f"{to_camel_case(snake_domain)}{base_cls.__name__}",
            "po_module": self.po_module,
            "po_submodule": po_submodule,
            "test_module": self.test_module,
            "test_submodule": f"test_{po_submodule}",
            "po_path": str(self.po_path),
            "test_path": str(self.test_path),
        }


def load_template(module, page_object, po_path: Path, suffix=""):
    """Load the most specific template possible from the given module.

    Lookup order inside ``module``:

    1. ``<page_object name><suffix>.template``
    2. ``default<suffix>.template``

    If neither exists, the default template bundled in the ``templates``
    package is copied to ``po_path / "templates"`` (creating the folder and
    its ``__init__.py`` if needed) and that text is used.

    Args:
        module: dotted name of the package holding the templates.
        page_object: class whose ``__name__`` selects the specific template.
        po_path: folder of the Page Objects package.
        suffix: text inserted before ``.template`` in the file names.

    Returns:
        A ``(file_name, text)`` tuple, where ``text`` has the lines starting
        with ``##`` removed and leading whitespace stripped.

    Raises:
        ModuleNotFoundError: if ``module`` cannot be imported.
    """
    specific_template = f"{page_object.__name__}{suffix}.template"
    try:
        file, text = specific_template, resources.read_text(module, specific_template)
        print(f"Template '{file}' from module '{module}' loaded.")
    except ModuleNotFoundError as e:
        raise ModuleNotFoundError(
            f"Module {module} is not found. Did you forgot to create it?"
        ) from e
    except FileNotFoundError:
        default_template = f"default{suffix}.template"
        try:
            file, text = default_template, resources.read_text(module, default_template)
            print(
                f"Template '{file}' from module '{module}' loaded.\n"
                f"If you want a custom template for '{page_object.__name__}' instead the default one then\n"
                f"create your custom one in the module '{module}' with the name '{specific_template}'"
            )
        except FileNotFoundError:
            templates_path = po_path / "templates"
            templates_path.mkdir(parents=True, exist_ok=True)
            init_path = templates_path / "__init__.py"
            if not init_path.exists():
                init_path.touch()
                # The path is quoted so that folders containing spaces work.
                os.system(f'git add "{init_path}"')
            template_path = templates_path / default_template
            file = default_template

            print(
                f"Neither '{specific_template}' nor '{default_template}' template was found in the module '{module}'. "
                f"Creating a default one at {template_path}."
            )
            print("You can edit it to customize the generated code for your Page Objects.")
            print(f"What is more, if you want an specific template only for the Page Objects that overrides {page_object.__name__} "
                  f"you can create one template in the module '{module}' with the name '{specific_template}'. "
                  f"Use the default one as reference.")

            # Fall back to the template bundled with the library and save a
            # copy in the project so that it can be customized.
            text = resources.read_text(templates, default_template)
            template_path.write_text(text)
            os.system(f'git add "{template_path.absolute()}"')

    # Remove template comment lines (those starting with "##") and any
    # leading whitespace left at the beginning of the text.
    text = "\n".join(
        [line for line in text.splitlines() if not line.strip().startswith("##")]
    ).lstrip()
    return file, text


def template_for(context: OverrideContext, *, prefix=""):
    """Render the template selected for ``context`` with its variables.

    The template is looked up in the ``templates`` subpackage of the Page
    Objects package; ``prefix`` is forwarded to ``load_template`` as its
    ``suffix`` argument.

    Raises:
        ValueError: if the template references a variable that is not
            provided by ``context.variables_for_template()``.
    """
    templates_module = f"{context.po_module}.templates"
    file, template = load_template(
        templates_module, context.page_object, context.po_path, prefix
    )
    data = context.variables_for_template()
    try:
        rendered = template.format(**data)
    except KeyError as e:
        raise ValueError(
            f"Unknown parameter {e} on template '{file}'. Available parameters: {list(data.keys())}"
        )
    return rendered

def generate_po_code(context: OverrideContext) -> Path:
    """Write the Page Object module for ``context`` unless it already exists.

    When the file is already present it is left untouched and the rendered
    sample code is printed instead. A newly written file is added to git.

    Returns:
        The path of the Page Object file.
    """
    snake_domain = domain_in_snake_case(get_domain(context.url))
    base_name = context.page_object.__name__
    target_dir = context.po_path
    target_dir.mkdir(parents=True, exist_ok=True)
    po_file_path = target_dir / f"{snake_domain}_{base_name}.py"
    sample_code = template_for(context)
    rule = "-" * 20
    if not po_file_path.exists():
        print(f"Saving a Page Object code template to {po_file_path} ...")
        po_file_path.write_text(sample_code)
        os.system(f'git add "{po_file_path}"')
        print("Done!")
    else:
        print(
            f"Page Object code already exists in path {po_file_path}. Not modifying anything."
        )
        print("Use the following code example to add a new Page Object to the file if required:")
        print()
        print(rule + " Page Object sample code " + rule)
        print()
        print(sample_code)
        print()
        print(rule + " end sample code " + rule)
    print(f"Open {po_file_path} and complete the code with your custom extraction")
    return po_file_path


def generate_test(context: OverrideContext) -> Path:
    """Write the test module for ``context`` unless it already exists.

    When the file is already present it is left untouched and the rendered
    sample code is printed instead. A newly written file is added to git.

    Returns:
        The path of the test file.
    """
    snake_domain = domain_in_snake_case(get_domain(context.url))
    sample_code = template_for(context, prefix="_test")
    base_name = context.page_object.__name__
    test_path = context.test_path / f"test_{snake_domain}_{base_name}.py"
    rule = "-" * 20
    if not test_path.exists():
        print(f"Saving a test template to {test_path} ...")
        test_path.write_text(sample_code)
        os.system(f'git add "{test_path}"')
        print("Done!")
    else:
        print(f"Test already exists in path {test_path}. Not modifying anything. ")
        print(
            "Use the following code example to add new tests using the captured content:"
        )
        print()
        print(rule + " sample code " + rule)
        print()
        print(sample_code)
        print()
        print(rule + " end sample code " + rule)
    print(f"Open {test_path} and complete the code with sensible assert statements")
    return test_path


def path_join(root: Path, path: Path):
    """Resolve ``path`` against ``root`` unless it is already absolute.

    If either argument is falsy (e.g. ``None``), ``path`` is returned as is.
    """
    if path and root:
        return path.absolute() if path.is_absolute() else root / path
    return path



Loading