diff --git a/.travis.yml b/.travis.yml
index 9c0be0a..5cdd67a 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,24 +1,30 @@
 language: python
-python: 3.5
 sudo: false
-env:
-  matrix:
-    - TOXENV=py27
-    - TOXENV=py35
+matrix:
+  include:
+    - python: 3.6
+      env:
+        - TOX_ENV=py36
+    - python: 3.7
+      env:
+        - TOX_ENV=py37
+    - python: 3.8
+      env:
+        - TOX_ENV=py38
 addons:
   apt:
     packages:
-    - libdb-dev
+      - libdb-dev
 install: pip install -U tox codecov
-script: tox
+script: tox -e $TOX_ENV
 after_success:
-- codecov
+  - codecov
 
 deploy:
   provider: pypi
@@ -29,4 +35,4 @@ deploy:
   on:
     tags: true
     repo: scrapy-plugins/scrapy-deltafetch
-  condition: $TOXENV = py35
+  condition: $TOX_ENV = py38
diff --git a/requirements.txt b/requirements.txt
index 2c309f4..fc0ee8c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,2 @@
-scrapy>=1.1.0
+scrapy>=2.3.0
 bsddb3
diff --git a/scrapy_deltafetch/__init__.py b/scrapy_deltafetch/__init__.py
index b6996d7..a8882b3 100644
--- a/scrapy_deltafetch/__init__.py
+++ b/scrapy_deltafetch/__init__.py
@@ -1,4 +1,3 @@
-from .middleware import DeltaFetch
-
+from .middleware import DeltaFetch  # noqa
 
 __version__ = "1.2.1"
""" - def __init__(self, dir, reset=False, stats=None): - dbmodule = None - try: - dbmodule = __import__('bsddb3').db - except ImportError: - raise NotConfigured('bsddb3 is required') - self.dbmodule = dbmodule + def __init__(self, dir: str, reset: bool = False, stats: StatsCollector = None): self.dir = dir self.reset = reset self.stats = stats @classmethod - def from_crawler(cls, crawler): + def from_crawler(cls, crawler: Crawler): s = crawler.settings - if not s.getbool('DELTAFETCH_ENABLED'): + if not s.getbool("DELTAFETCH_ENABLED"): raise NotConfigured - dir = data_path(s.get('DELTAFETCH_DIR', 'deltafetch')) - reset = s.getbool('DELTAFETCH_RESET') + dir = data_path(s.get("DELTAFETCH_DIR", "deltafetch")) + reset = s.getbool("DELTAFETCH_RESET") o = cls(dir, reset, crawler.stats) crawler.signals.connect(o.spider_opened, signal=signals.spider_opened) crawler.signals.connect(o.spider_closed, signal=signals.spider_closed) return o - def spider_opened(self, spider): - if not os.path.exists(self.dir): + def spider_opened(self, spider: Spider) -> None: + if not os.path.isdir(self.dir): os.makedirs(self.dir) - dbpath = os.path.join(self.dir, '%s.db' % spider.name) - reset = self.reset or getattr(spider, 'deltafetch_reset', False) - flag = self.dbmodule.DB_TRUNCATE if reset else self.dbmodule.DB_CREATE + dbpath = os.path.join(self.dir, f"{spider.name}.db") + reset = self.reset or getattr(spider, "deltafetch_reset", False) + flag = bsddb3.db.DB_TRUNCATE if reset else bsddb3.db.DB_CREATE + try: - self.db = self.dbmodule.DB() - self.db.open(filename=dbpath, - dbtype=self.dbmodule.DB_HASH, - flags=flag) - except Exception: - logger.warning("Failed to open DeltaFetch database at %s, " - "trying to recreate it" % dbpath) - if os.path.exists(dbpath): + self.db = bsddb3.db.DB() + self.db.open(filename=dbpath, dbtype=bsddb3.db.DB_HASH, flags=flag) + except bsddb3.db.DBError: + logger.warning( + f"Failed to open DeltaFetch database at {dbpath}, trying to recreate it" + ) + if os.path.isfile(dbpath): os.remove(dbpath) - self.db = self.dbmodule.DB() - self.db.open(filename=dbpath, - dbtype=self.dbmodule.DB_HASH, - flags=self.dbmodule.DB_CREATE) + self.db = bsddb3.db.DB() + self.db.open( + filename=dbpath, dbtype=bsddb3.db.DB_HASH, flags=bsddb3.db.DB_CREATE, + ) - def spider_closed(self, spider): + def spider_closed(self, _spider: Spider) -> None: self.db.close() - def process_spider_output(self, response, result, spider): + def process_spider_output( + self, response: Response, result: Iterable, spider: Spider + ): for r in result: if isinstance(r, Request): key = self._get_key(r) if key in self.db: - logger.info("Ignoring already visited: %s" % r) + logger.info(f"Ignoring already visited: {r}") if self.stats: - self.stats.inc_value('deltafetch/skipped', spider=spider) + self.stats.inc_value("deltafetch/skipped", spider=spider) continue - elif isinstance(r, (BaseItem, dict)): + elif isinstance(r, (Item, dict)): key = self._get_key(response.request) self.db[key] = str(time.time()) if self.stats: - self.stats.inc_value('deltafetch/stored', spider=spider) + self.stats.inc_value("deltafetch/stored", spider=spider) yield r - def _get_key(self, request): - key = request.meta.get('deltafetch_key') or request_fingerprint(request) + def _get_key(self, request: Request) -> bytes: + key = request.meta.get("deltafetch_key") or request_fingerprint(request) # request_fingerprint() returns `hashlib.sha1().hexdigest()`, is a string return to_bytes(key) diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 
diff --git a/setup.cfg b/setup.cfg
deleted file mode 100644
index 3c6e79c..0000000
--- a/setup.cfg
+++ /dev/null
@@ -1,2 +0,0 @@
-[bdist_wheel]
-universal=1
diff --git a/setup.py b/setup.py
index 9ba1ae9..1f5113d 100644
--- a/setup.py
+++ b/setup.py
@@ -1,24 +1,34 @@
-from setuptools import setup
+from setuptools import find_packages, setup
+
+with open(
+    "README.rst",
+) as fh:
+    long_description = fh.read()
 
 setup(
-    name='scrapy-deltafetch',
-    version='1.2.1',
-    license='BSD',
-    description='Scrapy middleware to ignore previously crawled pages',
-    author='Scrapinghub',
-    author_email='info@scrapinghub.com',
-    url='http://github.com/scrapy-plugins/scrapy-deltafetch',
-    packages=['scrapy_deltafetch'],
-    platforms=['Any'],
+    name="scrapy-deltafetch",
+    version="1.2.1",
+    description="Scrapy middleware to ignore previously crawled pages",
+    long_description=long_description,
+    long_description_content_type="text/x-rst",
+    author="Scrapinghub",
+    author_email="info@scrapinghub.com",
+    maintainer="Rabin Adhikari",
+    maintainer_email="rabin.adk1@gmail.com",
+    url="http://github.com/scrapy-plugins/scrapy-deltafetch",
+    packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
+    license="BSD",
     classifiers=[
-        'Development Status :: 4 - Beta',
-        'License :: OSI Approved :: BSD License',
-        'Operating System :: OS Independent',
-        'Programming Language :: Python',
-        'Programming Language :: Python :: 2',
-        'Programming Language :: Python :: 2.7',
-        'Programming Language :: Python :: 3',
-        'Programming Language :: Python :: 3.5',
+        "Development Status :: 4 - Beta",
+        "License :: OSI Approved :: BSD License",
+        "Operating System :: OS Independent",
+        "Programming Language :: Python",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.6",
+        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
     ],
-    install_requires=['Scrapy>=1.1.0', 'bsddb3']
+    install_requires=["Scrapy>=2.3.0", "bsddb3"],
+    python_requires=">=3.6",
+    zip_safe=True,
 )
diff --git a/tests/test_deltafetch.py b/tests/test_deltafetch.py
index d3c3289..23e5442 100644
--- a/tests/test_deltafetch.py
+++ b/tests/test_deltafetch.py
@@ -1,49 +1,47 @@
-from unittest import TestCase, skipIf
-
 import os
-import mock
 import tempfile
-from scrapy import Request
-from scrapy.item import BaseItem
-from scrapy.spiders import Spider
-from scrapy.settings import Settings
+from importlib import import_module
+from unittest import TestCase, mock, skipIf
+
 from scrapy.exceptions import NotConfigured
-from scrapy.utils.request import request_fingerprint
-from scrapy.utils.python import to_bytes
+from scrapy.http import Request
+from scrapy.item import Item
+from scrapy.settings import Settings
+from scrapy.spiders import Spider
 from scrapy.statscollectors import StatsCollector
+from scrapy.utils.python import to_bytes
+from scrapy.utils.request import request_fingerprint
 from scrapy.utils.test import get_crawler
-
 from scrapy_deltafetch.middleware import DeltaFetch
-
 dbmodule = None
 try:
-    dbmodule = __import__('bsddb3')
-except ImportError:
+    dbmodule = import_module("bsddb3").db
+except ModuleNotFoundError:
     pass
 
 
-@skipIf(not dbmodule, "bsddb3 is not found on the system")
+@skipIf(dbmodule is None, "bsddb3 is not found on the system")
 class DeltaFetchTestCase(TestCase):
 
     mwcls = DeltaFetch
 
     def setUp(self):
-        self.spider_name = 'df_tests'
+        self.spider_name = "df_tests"
         self.spider = Spider(self.spider_name)
         # DeltaFetch creates .db files named after the spider's name
         self.temp_dir = tempfile.gettempdir()
-        self.db_path = os.path.join(self.temp_dir, '%s.db' % self.spider.name)
+        self.db_path = os.path.join(self.temp_dir, f"{self.spider.name}.db")
         crawler = get_crawler(Spider)
         self.stats = StatsCollector(crawler)
 
     def test_init(self):
         # path format is any, the folder is not created
-        instance = self.mwcls('/any/dir', True, stats=self.stats)
+        instance = self.mwcls("/any/dir", True, stats=self.stats)
         assert isinstance(instance, self.mwcls)
-        self.assertEqual(instance.dir, '/any/dir')
+        self.assertEqual(instance.dir, "/any/dir")
         self.assertEqual(self.stats.get_stats(), {})
         self.assertEqual(instance.reset, True)
 
@@ -52,107 +50,114 @@ def test_init_from_crawler(self):
         # void settings
         crawler.settings = Settings({})
         self.assertRaises(NotConfigured, self.mwcls.from_crawler, crawler)
-        with mock.patch('scrapy.utils.project.project_data_dir') as data_dir, \
-             mock.patch('scrapy.utils.project.inside_project') as in_project:
+        with mock.patch(
+            "scrapy.utils.project.project_data_dir"
+        ) as data_dir, mock.patch("scrapy.utils.project.inside_project") as in_project:
             data_dir.return_value = self.temp_dir
             in_project.return_value = True
 
             # simple project_data_dir mock with based settings
-            crawler.settings = Settings({'DELTAFETCH_ENABLED': True})
+            crawler.settings = Settings({"DELTAFETCH_ENABLED": True})
             instance = self.mwcls.from_crawler(crawler)
             assert isinstance(instance, self.mwcls)
-            self.assertEqual(
-                instance.dir, os.path.join(self.temp_dir, 'deltafetch'))
+            self.assertEqual(instance.dir, os.path.join(self.temp_dir, "deltafetch"))
             self.assertEqual(instance.reset, False)
 
             # project_data_dir mock with advanced settings
-            crawler.settings = Settings({'DELTAFETCH_ENABLED': True,
-                                         'DELTAFETCH_DIR': 'other',
-                                         'DELTAFETCH_RESET': True})
+            crawler.settings = Settings(
+                {
+                    "DELTAFETCH_ENABLED": True,
+                    "DELTAFETCH_DIR": "other",
+                    "DELTAFETCH_RESET": True,
+                }
+            )
             instance = self.mwcls.from_crawler(crawler)
             assert isinstance(instance, self.mwcls)
-            self.assertEqual(
-                instance.dir, os.path.join(self.temp_dir, 'other'))
+            self.assertEqual(instance.dir, os.path.join(self.temp_dir, "other"))
             self.assertEqual(instance.reset, True)
 
     def test_spider_opened_new(self):
         """Middleware should create a .db file if not found."""
-        if os.path.exists(self.db_path):
+        if os.path.isfile(self.db_path):
             os.remove(self.db_path)
         mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
-        assert not hasattr(self.mwcls, 'db')
+        assert not hasattr(self.mwcls, "db")
         mw.spider_opened(self.spider)
         assert os.path.isdir(self.temp_dir)
-        assert os.path.exists(self.db_path)
-        assert hasattr(mw, 'db')
-        assert isinstance(mw.db, type(dbmodule.db.DB()))
+        assert os.path.isfile(self.db_path)
+        assert hasattr(mw, "db")
+        assert isinstance(mw.db, type(dbmodule.DB()))
         assert mw.db.items() == []
-        assert mw.db.get_type() == dbmodule.db.DB_HASH
-        assert mw.db.get_open_flags() == dbmodule.db.DB_CREATE
+        assert mw.db.get_type() == dbmodule.DB_HASH
+        assert mw.db.get_open_flags() == dbmodule.DB_CREATE
 
     def test_spider_opened_existing(self):
         """Middleware should open and use existing and valid .db files."""
         self._create_test_db()
         mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
-        assert not hasattr(self.mwcls, 'db')
+        assert not hasattr(self.mwcls, "db")
         mw.spider_opened(self.spider)
-        assert hasattr(mw, 'db')
-        assert isinstance(mw.db, type(dbmodule.db.DB()))
-        assert mw.db.items() == [(b'test_key_1', b'test_v_1'),
-                                 (b'test_key_2', b'test_v_2')]
-        assert mw.db.get_type() == dbmodule.db.DB_HASH
-        assert mw.db.get_open_flags() == dbmodule.db.DB_CREATE
+        assert hasattr(mw, "db")
+        assert isinstance(mw.db, type(dbmodule.DB()))
+        assert mw.db.items() == [
+            (b"test_key_1", b"test_v_1"),
+            (b"test_key_2", b"test_v_2"),
+        ]
+        assert mw.db.get_type() == dbmodule.DB_HASH
+        assert mw.db.get_open_flags() == dbmodule.DB_CREATE
 
     def test_spider_opened_corrupt_dbfile(self):
         """Middleware should create a new .db if it cannot open it."""
         # create an invalid .db file
         with open(self.db_path, "wb") as dbfile:
-            dbfile.write(b'bad')
+            dbfile.write(b"bad")
         mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
-        assert not hasattr(self.mwcls, 'db')
+        assert not hasattr(self.mwcls, "db")
         # file corruption is only detected when opening spider
         mw.spider_opened(self.spider)
         assert os.path.isdir(self.temp_dir)
-        assert os.path.exists(self.db_path)
-        assert hasattr(mw, 'db')
-        assert isinstance(mw.db, type(dbmodule.db.DB()))
+        assert os.path.isfile(self.db_path)
+        assert hasattr(mw, "db")
+        assert isinstance(mw.db, type(dbmodule.DB()))
         # and db should be empty (it was re-created)
         assert mw.db.items() == []
-        assert mw.db.get_type() == dbmodule.db.DB_HASH
-        assert mw.db.get_open_flags() == dbmodule.db.DB_CREATE
+        assert mw.db.get_type() == dbmodule.DB_HASH
+        assert mw.db.get_open_flags() == dbmodule.DB_CREATE
 
     def test_spider_opened_existing_spider_reset(self):
         self._create_test_db()
         mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
-        assert not hasattr(self.mwcls, 'db')
+        assert not hasattr(self.mwcls, "db")
         self.spider.deltafetch_reset = True
         mw.spider_opened(self.spider)
-        assert mw.db.get_open_flags() == dbmodule.db.DB_TRUNCATE
+        assert mw.db.get_open_flags() == dbmodule.DB_TRUNCATE
 
     def test_spider_opened_reset_non_existing_db(self):
         mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats)
-        assert not hasattr(self.mwcls, 'db')
+        assert not hasattr(self.mwcls, "db")
         self.spider.deltafetch_reset = True
         mw.spider_opened(self.spider)
         assert mw.db.fd()
         # there's different logic for different bdb versions:
         # it can fail when opening a non-existing db with truncate flag,
         # then it should be caught and retried with rm & create flag
-        assert (mw.db.get_open_flags() == dbmodule.db.DB_CREATE or
-                mw.db.get_open_flags() == dbmodule.db.DB_TRUNCATE)
+        assert (
+            mw.db.get_open_flags() == dbmodule.DB_CREATE
+            or mw.db.get_open_flags() == dbmodule.DB_TRUNCATE
+        )
 
     def test_spider_opened_recreate(self):
         self._create_test_db()
         mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats)
-        assert not hasattr(self.mwcls, 'db')
+        assert not hasattr(self.mwcls, "db")
         mw.spider_opened(self.spider)
-        assert hasattr(mw, 'db')
-        assert isinstance(mw.db, type(dbmodule.db.DB()))
+        assert hasattr(mw, "db")
+        assert isinstance(mw.db, type(dbmodule.DB()))
         assert mw.db.items() == []
-        assert mw.db.get_type() == dbmodule.db.DB_HASH
-        assert mw.db.get_open_flags() == dbmodule.db.DB_TRUNCATE
+        assert mw.db.get_type() == dbmodule.DB_HASH
+        assert mw.db.get_open_flags() == dbmodule.DB_TRUNCATE
 
     def test_spider_closed(self):
         self._create_test_db()
@@ -160,90 +165,84 @@ def test_spider_closed(self):
         mw.spider_opened(self.spider)
         assert mw.db.fd()
         mw.spider_closed(self.spider)
-        self.assertRaises(dbmodule.db.DBError, mw.db.fd)
+        self.assertRaises(dbmodule.DBError, mw.db.fd)
 
     def test_process_spider_output(self):
         self._create_test_db()
         mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
         mw.spider_opened(self.spider)
         response = mock.Mock()
-        response.request = Request('http://url',
-                                   meta={'deltafetch_key': 'key'})
+        response.request = Request("http://url", meta={"deltafetch_key": "key"})
         result = []
-        self.assertEqual(list(mw.process_spider_output(
-            response, result, self.spider)), [])
+        self.assertEqual(
+            list(mw.process_spider_output(response, result, self.spider)), []
+        )
         result = [
             # same URL but with new key --> it should be processed
-            Request('http://url', meta={'deltafetch_key': 'key1'}),
-
+            Request("http://url", meta={"deltafetch_key": "key1"}),
             # 'test_key_1' is already in the test db --> it should be skipped
-            Request('http://url1', meta={'deltafetch_key': 'test_key_1'})
+            Request("http://url1", meta={"deltafetch_key": "test_key_1"}),
         ]
         # so only the 1 request should go through
-        self.assertEqual(list(mw.process_spider_output(
-            response, result, self.spider)), [result[0]])
+        self.assertEqual(
+            list(mw.process_spider_output(response, result, self.spider)), [result[0]]
+        )
 
         # the skipped "http://url1" should be counted in stats
-        self.assertEqual(self.stats.get_stats(), {'deltafetch/skipped': 1})
+        self.assertEqual(self.stats.get_stats(), {"deltafetch/skipped": 1})
 
         # b'key' should not be in the db yet as no item was collected yet
-        self.assertEqual(set(mw.db.keys()),
-                         set([b'test_key_1',
-                              b'test_key_2']))
+        self.assertEqual(set(mw.db.keys()), set([b"test_key_1", b"test_key_2"]))
 
         # if the spider returns items, the request's key is added in db
-        result = [BaseItem(), "not a base item"]
-        self.assertEqual(list(mw.process_spider_output(
-            response, result, self.spider)), result)
-        self.assertEqual(set(mw.db.keys()),
-                         set([b'key',
-                              b'test_key_1',
-                              b'test_key_2']))
-        assert mw.db[b'key']
+        result = [Item(), "not an item"]
+        self.assertEqual(
+            list(mw.process_spider_output(response, result, self.spider)), result
+        )
+        self.assertEqual(set(mw.db.keys()), set([b"key", b"test_key_1", b"test_key_2"]))
+        assert mw.db[b"key"]
Request("http://url", meta={"deltafetch_key": "key"}) result = [] - self.assertEqual(list(mw.process_spider_output( - response, result, self.spider)), []) + self.assertEqual( + list(mw.process_spider_output(response, result, self.spider)), [] + ) result = [ # same URL but with new key --> it should be processed - Request('http://url', meta={'deltafetch_key': 'key1'}), - + Request("http://url", meta={"deltafetch_key": "key1"}), # 'test_key_1' is already in the test db --> it should be skipped - Request('http://url1', meta={'deltafetch_key': 'test_key_1'}) + Request("http://url1", meta={"deltafetch_key": "test_key_1"}), ] # so only the 1 request should go through - self.assertEqual(list(mw.process_spider_output( - response, result, self.spider)), [result[0]]) + self.assertEqual( + list(mw.process_spider_output(response, result, self.spider)), [result[0]] + ) # the skipped "http://url1" should be counted in stats - self.assertEqual(self.stats.get_stats(), {'deltafetch/skipped': 1}) + self.assertEqual(self.stats.get_stats(), {"deltafetch/skipped": 1}) # b'key' should not be in the db yet as no item was collected yet - self.assertEqual(set(mw.db.keys()), - set([b'test_key_1', - b'test_key_2'])) + self.assertEqual(set(mw.db.keys()), set([b"test_key_1", b"test_key_2"])) # if the spider returns items, the request's key is added in db - result = [BaseItem(), "not a base item"] - self.assertEqual(list(mw.process_spider_output( - response, result, self.spider)), result) - self.assertEqual(set(mw.db.keys()), - set([b'key', - b'test_key_1', - b'test_key_2'])) - assert mw.db[b'key'] + result = [Item(), "not an item"] + self.assertEqual( + list(mw.process_spider_output(response, result, self.spider)), result + ) + self.assertEqual(set(mw.db.keys()), set([b"key", b"test_key_1", b"test_key_2"])) + assert mw.db[b"key"] def test_process_spider_output_dict(self): self._create_test_db() mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats) mw.spider_opened(self.spider) response = mock.Mock() - response.request = Request('http://url', - meta={'deltafetch_key': 'key'}) + response.request = Request("http://url", meta={"deltafetch_key": "key"}) result = [{"somekey": "somevalue"}] - self.assertEqual(list(mw.process_spider_output( - response, result, self.spider)), result) - self.assertEqual(set(mw.db.keys()), - set([b'key', - b'test_key_1', - b'test_key_2'])) - assert mw.db[b'key'] + self.assertEqual( + list(mw.process_spider_output(response, result, self.spider)), result + ) + self.assertEqual(set(mw.db.keys()), set([b"key", b"test_key_1", b"test_key_2"])) + assert mw.db[b"key"] def test_process_spider_output_stats(self): self._create_test_db() mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats) mw.spider_opened(self.spider) response = mock.Mock() - response.request = Request('http://url', - meta={'deltafetch_key': 'key'}) + response.request = Request("http://url", meta={"deltafetch_key": "key"}) result = [] - self.assertEqual(list(mw.process_spider_output( - response, result, self.spider)), []) + self.assertEqual( + list(mw.process_spider_output(response, result, self.spider)), [] + ) self.assertEqual(self.stats.get_stats(), {}) result = [ - Request('http://url', meta={'deltafetch_key': 'key'}), - Request('http://url1', meta={'deltafetch_key': 'test_key_1'}) + Request("http://url", meta={"deltafetch_key": "key"}), + Request("http://url1", meta={"deltafetch_key": "test_key_1"}), ] - self.assertEqual(list(mw.process_spider_output( - response, result, self.spider)), [result[0]]) - 
 
     def test_init_from_crawler_legacy(self):
         # test with subclass not handling passed stats
         class LegacyDeltaFetchSubClass(self.mwcls):
-
             def __init__(self, dir, reset=False, *args, **kwargs):
                 super(LegacyDeltaFetchSubClass, self).__init__(dir=dir, reset=reset)
                 self.something = True
@@ -253,34 +252,36 @@ def __init__(self, dir, reset=False, *args, **kwargs):
         crawler.settings = Settings({})
         self.assertRaises(NotConfigured, self.mwcls.from_crawler, crawler)
-        with mock.patch('scrapy.utils.project.project_data_dir') as data_dir, \
-             mock.patch('scrapy.utils.project.inside_project') as in_project:
+        with mock.patch(
+            "scrapy.utils.project.project_data_dir"
+        ) as data_dir, mock.patch("scrapy.utils.project.inside_project") as in_project:
             data_dir.return_value = self.temp_dir
             in_project.return_value = True
 
             # simple project_data_dir mock with based settings
-            crawler.settings = Settings({'DELTAFETCH_ENABLED': True})
+            crawler.settings = Settings({"DELTAFETCH_ENABLED": True})
             instance = LegacyDeltaFetchSubClass.from_crawler(crawler)
             assert isinstance(instance, self.mwcls)
-            self.assertEqual(
-                instance.dir, os.path.join(self.temp_dir, 'deltafetch'))
+            self.assertEqual(instance.dir, os.path.join(self.temp_dir, "deltafetch"))
             self.assertEqual(instance.reset, False)
 
             # project_data_dir mock with advanced settings
-            crawler.settings = Settings({'DELTAFETCH_ENABLED': True,
-                                         'DELTAFETCH_DIR': 'other',
-                                         'DELTAFETCH_RESET': True})
+            crawler.settings = Settings(
+                {
+                    "DELTAFETCH_ENABLED": True,
+                    "DELTAFETCH_DIR": "other",
+                    "DELTAFETCH_RESET": True,
+                }
+            )
             instance = LegacyDeltaFetchSubClass.from_crawler(crawler)
             assert isinstance(instance, self.mwcls)
-            self.assertEqual(
-                instance.dir, os.path.join(self.temp_dir, 'other'))
+            self.assertEqual(instance.dir, os.path.join(self.temp_dir, "other"))
             self.assertEqual(instance.reset, True)
 
     def test_process_spider_output_stats_legacy(self):
         # testing the subclass not handling stats works at runtime
         # (i.e. that trying to update stats does not trigger exception)
         class LegacyDeltaFetchSubClass(self.mwcls):
-
             def __init__(self, dir, reset=False, *args, **kwargs):
                 super(LegacyDeltaFetchSubClass, self).__init__(dir=dir, reset=reset)
                 self.something = True
@@ -289,44 +290,49 @@ def __init__(self, dir, reset=False, *args, **kwargs):
         mw = LegacyDeltaFetchSubClass(self.temp_dir, reset=False)
         mw.spider_opened(self.spider)
         response = mock.Mock()
-        response.request = Request('http://url',
-                                   meta={'deltafetch_key': 'key'})
+        response.request = Request("http://url", meta={"deltafetch_key": "key"})
         result = []
-        self.assertEqual(list(mw.process_spider_output(
-            response, result, self.spider)), [])
+        self.assertEqual(
+            list(mw.process_spider_output(response, result, self.spider)), []
+        )
         self.assertEqual(self.stats.get_stats(), {})
         result = [
-            Request('http://url', meta={'deltafetch_key': 'key'}),
-            Request('http://url1', meta={'deltafetch_key': 'test_key_1'})
+            Request("http://url", meta={"deltafetch_key": "key"}),
+            Request("http://url1", meta={"deltafetch_key": "test_key_1"}),
        ]
 
         # stats should not be updated
-        self.assertEqual(list(mw.process_spider_output(
-            response, result, self.spider)), [result[0]])
-        self.assertEqual(self.stats.get_value('deltafetch/skipped'), None)
+        self.assertEqual(
+            list(mw.process_spider_output(response, result, self.spider)), [result[0]]
+        )
+        self.assertEqual(self.stats.get_value("deltafetch/skipped"), None)
 
-        result = [BaseItem(), "not a base item"]
-        self.assertEqual(list(mw.process_spider_output(
-            response, result, self.spider)), result)
-        self.assertEqual(self.stats.get_value('deltafetch/stored'), None)
+        result = [Item(), "not an item"]
+        self.assertEqual(
+            list(mw.process_spider_output(response, result, self.spider)), result
+        )
+        self.assertEqual(self.stats.get_value("deltafetch/stored"), None)
 
     def test_get_key(self):
         mw = self.mwcls(self.temp_dir, reset=True)
-        test_req1 = Request('http://url1')
-        self.assertEqual(mw._get_key(test_req1),
-                         to_bytes(request_fingerprint(test_req1)))
-        test_req2 = Request('http://url2', meta={'deltafetch_key': b'dfkey1'})
-        self.assertEqual(mw._get_key(test_req2), b'dfkey1')
-
-        test_req3 = Request('http://url2', meta={'deltafetch_key': u'dfkey1'})
+        test_req1 = Request("http://url1")
+        self.assertEqual(
+            mw._get_key(test_req1), to_bytes(request_fingerprint(test_req1))
+        )
+        test_req2 = Request("http://url2", meta={"deltafetch_key": b"dfkey1"})
+        self.assertEqual(mw._get_key(test_req2), b"dfkey1")
+
+        test_req3 = Request("http://url2", meta={"deltafetch_key": "dfkey1"})
         # key will be converted to bytes
-        self.assertEqual(mw._get_key(test_req3), b'dfkey1')
+        self.assertEqual(mw._get_key(test_req3), b"dfkey1")
 
     def _create_test_db(self):
-        db = dbmodule.db.DB()
+        # cannot use a "with" block here: the bsddb3 DB object has no __enter__/__exit__
+        db = dbmodule.DB()
         # truncate test db if there were failed tests
-        db.open(self.db_path, dbmodule.db.DB_HASH,
-                dbmodule.db.DB_CREATE | dbmodule.db.DB_TRUNCATE)
-        db[b'test_key_1'] = b'test_v_1'
-        db[b'test_key_2'] = b'test_v_2'
+        db.open(
+            self.db_path, dbmodule.DB_HASH, dbmodule.DB_CREATE | dbmodule.DB_TRUNCATE,
+        )
+        db[b"test_key_1"] = b"test_v_1"
+        db[b"test_key_2"] = b"test_v_2"
         db.close()
diff --git a/tox.ini b/tox.ini
index 00190cf..74f42cd 100644
--- a/tox.ini
+++ b/tox.ini
@@ -3,9 +3,6 @@
 # test suite on all supported python versions. To use it, "pip install tox"
 # and then run "tox" from this directory.
 
-[tox]
-envlist = py27, py35
-
 [testenv]
 setenv =
     BERKELEYDB_DIR = /usr
@@ -13,7 +10,6 @@ setenv =
 deps =
     -rrequirements.txt
     coverage
-    mock
     nose
 commands =
     nosetests --with-doctest --with-coverage --cover-package=scrapy_deltafetch tests
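Taken together, the changes keep the two per-spider hooks that the middleware and tests exercise: a request can override its deduplication key through request.meta["deltafetch_key"] (see _get_key()), and a spider can force a fresh database with a deltafetch_reset attribute (see spider_opened()). A hypothetical spider using both hooks could look like the sketch below; the spider name, URL, and CSS selectors are invented purely for illustration.

import scrapy


class ExampleSpider(scrapy.Spider):
    name = "example"                    # hypothetical spider, not part of this patch
    start_urls = ["http://example.com/catalog"]
    deltafetch_reset = False            # True would truncate the stored keys on start

    def parse(self, response):
        for href in response.css("a.product::attr(href)").getall():
            # use a stable, page-independent key instead of the request fingerprint
            yield scrapy.Request(
                response.urljoin(href),
                callback=self.parse_product,
                meta={"deltafetch_key": href},
            )

    def parse_product(self, response):
        # only responses that yield items get their key stored in the db,
        # so failed or item-less pages are retried on the next crawl
        yield {"url": response.url, "title": response.css("title::text").get()}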