diff --git a/.gitignore b/.gitignore
index c72797a..f73f3c4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,3 +20,4 @@ bot_spiders/
 .coverage.*
 htmlcov/
 .scrapy
+docs/_build
diff --git a/Dockerfile b/Dockerfile
index 9c61714..9dcedb4 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -44,7 +44,9 @@ RUN pip3 install .
 
 # use e.g. -v /path/to/my/arachnado/config.conf:/etc/arachnado.conf
 # docker run option to override arachnado parameters
-VOLUME /etc/arachnado.conf
+# The VOLUME is not exposed here because Docker assumes volumes are folders
+# (unless the file already exists), so it can cause problems in docker-compose
+# later (see https://github.com/docker/docker/issues/21702#issuecomment-221987049)
 
 # this folder is added to PYTHONPATH, so modules from there are available
 # for spider_packages Arachnado option
diff --git a/README.rst b/README.rst
index 3f78496..388a4f1 100644
--- a/README.rst
+++ b/README.rst
@@ -13,7 +13,7 @@ License is MIT.
 Install
 -------
 
-Arachnado requires Python 2.7.
+Arachnado requires Python 2.7 or Python 3.5.
 To install Arachnado use pip::
 
     pip install arachnado
@@ -41,13 +41,13 @@ the server::
 For available options check
 https://github.com/TeamHG-Memex/arachnado/blob/master/arachnado/config/defaults.conf.
 
-Test
------------
-To start unit tests for API:
+Tests
+-----
+
+To run the tests, make sure tox_ is installed, then
+execute the ``tox`` command from the source root.
 
-python -m tornado.test.runtests tests.test_data
-or
-python3 -m tornado.test.runtests tests.test_data
+.. _tox: https://testrun.org/tox/latest/
 
 Development
 -----------
diff --git a/arachnado/__main__.py b/arachnado/__main__.py
index 8eca1cb..6f7d864 100755
--- a/arachnado/__main__.py
+++ b/arachnado/__main__.py
@@ -73,8 +73,8 @@ def main(port, host, start_manhole, manhole_port, manhole_host, loglevel, opts):
     jobs_uri = _getval(storage_opts, 'jobs_uri_env', 'jobs_uri')
     sites_uri = _getval(storage_opts, 'sites_uri_env', 'sites_uri')
 
-    settings.update({k: v for k, v in opts['arachnado.scrapy'].items()
-                     if k.isupper()})
+    scrapy_opts = opts['arachnado.scrapy']
+    settings.update({k: v for k, v in scrapy_opts.items() if k.isupper()})
 
     settings.update({
         'MONGO_EXPORT_ENABLED': storage_opts['enabled'],
@@ -91,10 +91,12 @@ def main(port, host, start_manhole, manhole_port, manhole_host, loglevel, opts):
     site_checker_crawler = get_site_checker_crawler(site_storage)
     crawler_process.crawl(site_checker_crawler)
 
-    spider_packages = opts['arachnado.scrapy']['spider_packages']
+    spider_packages = scrapy_opts['spider_packages']
+    default_spider_name = scrapy_opts['default_spider_name']
     domain_crawlers = DomainCrawlers(
         crawler_process=crawler_process,
         spider_packages=_parse_spider_packages(spider_packages),
+        default_spider_name=default_spider_name,
         settings=settings
     )
     domain_crawlers.resume(job_storage)
diff --git a/arachnado/config/defaults.conf b/arachnado/config/defaults.conf
index b6c2400..b91fe69 100644
--- a/arachnado/config/defaults.conf
+++ b/arachnado/config/defaults.conf
@@ -5,7 +5,8 @@
 [arachnado]
 ; General Arachnado server options.
 
-; Event loop to use. Allowed values are "twisted", "tornado" and "auto".
+; Event loop to use. Allowed values are
+; "twisted", "tornado" and "auto".
 reactor = auto
 
 ; Host/port to listen to
@@ -31,6 +32,12 @@ DEPTH_LIMIT = 10
 ; Packages to load spiders from (separated by whitespace)
 spider_packages =
 
+; Name of the default spider. It is used for crawling if
+; no custom spider is specified or detected. It should support
+; an API similar to arachnado.spider.CrawlWebsiteSpider
+; (which is the default here).
+default_spider_name = generic
+
 [arachnado.storage]
 ; Where to store crawled items and job information.
 ; Currently only MongoDB is supported (mongodb:// URIs).
diff --git a/arachnado/crawler_process.py b/arachnado/crawler_process.py
index a3939df..8cb3ca5 100644
--- a/arachnado/crawler_process.py
+++ b/arachnado/crawler_process.py
@@ -300,7 +300,13 @@ def _downloader_stats(cls, crawler):
 
     @classmethod
     def _request_info(cls, request):
-        return {'url': request.url, 'method': request.method}
+        info = {'url': request.url, 'method': request.method}
+        if 'splash' in request.meta:
+            splash_args = request.meta['splash'].get('args', {})
+            if 'url' in splash_args:
+                info['url'] = splash_args['url']
+                info['method'] = splash_args.get('http_method', 'GET')
+        return info
 
     @classmethod
     def _slot_info(cls, key, slot):
diff --git a/arachnado/cron.py b/arachnado/cron.py
index 7f45a6a..a4b85d6 100644
--- a/arachnado/cron.py
+++ b/arachnado/cron.py
@@ -17,7 +17,7 @@ def __init__(self, domain_crawlers, site_storage):
         self.waiting_calls = {}
         self.domain_crawlers = domain_crawlers
         self.site_storage = site_storage
-        self.site_storage.subscribe(self.site_storage.available_subscriptions,
+        self.site_storage.subscribe(self.site_storage.available_events,
                                     self.rerun)
 
     def start(self):
@@ -96,10 +96,8 @@ def start_crawl(self, id_):
         args = _key_value_to_dict(site.get('args', []))
         settings = _key_value_to_dict(site.get('settings', []))
 
-        if not site.get('engine'):
-            site['engine'] = 'generic'
-
-        if site['engine'] == 'generic':
+        # checking for == 'generic' to be backwards compatible
+        if not site.get('engine') or site['engine'] == 'generic':
             url = site['url']
         else:
             url = 'spider://' + site['engine']
diff --git a/arachnado/domain_crawlers.py b/arachnado/domain_crawlers.py
index 1456e50..d8b571c 100644
--- a/arachnado/domain_crawlers.py
+++ b/arachnado/domain_crawlers.py
@@ -11,17 +11,19 @@
 import arachnado.settings
 from arachnado.crawler_process import ArachnadoCrawler
 from arachnado.spider import CrawlWebsiteSpider, ArachnadoSpider
-from arachnado.utils.spiders import get_spider_cls
+from arachnado.utils.spiders import get_spider_cls, find_spider_cls
 
 
 class DomainCrawlers(object):
     """ Helper class to create and start crawlers. """
-    def __init__(self, crawler_process, spider_packages, settings):
+    def __init__(self, crawler_process, spider_packages, default_spider_name,
+                 settings):
         self.settings = get_settings(settings)
         self.crawler_process = crawler_process
         self.spider_packages = spider_packages
+        self.default_spider_name = default_spider_name
 
     def resume(self, job_storage):
         @gen.coroutine
@@ -37,8 +39,10 @@ def _resume():
 
     def start(self, domain, args, settings, crawl_id=None):
         """ Create, start and return a crawler for a given domain.
""" - spider_cls = get_spider_cls(domain, self.spider_packages, - CrawlWebsiteSpider) + default_cls = find_spider_cls( + self.default_spider_name, + self.spider_packages + ['arachnado.spider']) + spider_cls = get_spider_cls(domain, self.spider_packages, default_cls) if not spider_cls: return diff --git a/arachnado/downloadermiddlewares/proxyfromsettings.py b/arachnado/downloadermiddlewares/proxyfromsettings.py index 410cbb1..521cd3c 100644 --- a/arachnado/downloadermiddlewares/proxyfromsettings.py +++ b/arachnado/downloadermiddlewares/proxyfromsettings.py @@ -11,6 +11,7 @@ def from_crawler(cls, crawler): def __init__(self, settings): self.proxies = {} + self.auth_encoding = settings.get('HTTPPROXY_AUTH_ENCODING') proxies = [ ('http', settings.get('HTTP_PROXY')), ('https', settings.get('HTTPS_PROXY')), diff --git a/arachnado/handlers.py b/arachnado/handlers.py index 05ec546..af5afd6 100644 --- a/arachnado/handlers.py +++ b/arachnado/handlers.py @@ -31,14 +31,19 @@ def get_application(crawler_process, domain_crawlers, debug = opts['arachnado']['debug'] handlers = [ + # UI url(r"/", Index, context, name="index"), url(r"/help", Help, context, name="help"), + + # simple API used by UI url(r"/crawler/start", StartCrawler, context, name="start"), url(r"/crawler/stop", StopCrawler, context, name="stop"), url(r"/crawler/pause", PauseCrawler, context, name="pause"), url(r"/crawler/resume", ResumeCrawler, context, name="resume"), url(r"/crawler/status", CrawlerStatus, context, name="status"), url(r"/ws-updates", Monitor, context, name="ws-updates"), + + # RPC API url(r"/ws-rpc", RpcWebsocketHandler, context, name="ws-rpc"), url(r"/rpc", RpcHttpHandler, context, name="rpc"), url(r"/ws-pages-data", PagesDataRpcWebsocketHandler, context, name="ws-pages-data"), @@ -148,6 +153,8 @@ def control_job(self, job_id): class CrawlerStatus(BaseRequestHandler): """ Status for one or more jobs. """ + # FIXME: does it work? Can we remove it? It is not used + # by Arachnado UI. def get(self): crawl_ids_arg = self.get_argument('crawl_ids', '') diff --git a/arachnado/manhole.py b/arachnado/manhole.py index 03d4ce0..f120b4f 100644 --- a/arachnado/manhole.py +++ b/arachnado/manhole.py @@ -3,13 +3,13 @@ An interactive Python interpreter available through telnet. """ from __future__ import absolute_import -from twisted.conch.manhole import ColoredManhole -from twisted.conch.insults import insults from twisted.conch.telnet import TelnetTransport, TelnetBootstrapProtocol from twisted.internet import protocol def start(port=None, host=None, telnet_vars=None): + from twisted.conch.manhole import ColoredManhole + from twisted.conch.insults import insults from twisted.internet import reactor port = int(port) if port else 6023 diff --git a/arachnado/pipelines/mongoexport.py b/arachnado/pipelines/mongoexport.py index 9a1766e..4429e45 100644 --- a/arachnado/pipelines/mongoexport.py +++ b/arachnado/pipelines/mongoexport.py @@ -85,9 +85,9 @@ def from_crawler(cls, crawler): def get_spider_urls(cls, spider): options = getattr(spider.crawler, 'start_options', None) if options and "domain" in options: - return options["domain"] + return [options["domain"]] else: - return " ".join(spider.start_urls) + return spider.start_urls @tt_coroutine def open_spider(self, spider): diff --git a/arachnado/rpc/__init__.py b/arachnado/rpc/__init__.py index 838d5d2..e1d5289 100644 --- a/arachnado/rpc/__init__.py +++ b/arachnado/rpc/__init__.py @@ -13,12 +13,22 @@ class ArachnadoRPC(object): """ Base class for all Arachnado RPC resources. 
+    Use it as a mixin for tornado.web.RequestHandler subclasses.
+
+    It provides a :meth:`handle_request` method which handles
+    Jobs, Sites and Pages RPC requests.
     """
+    rpc_objects = tuple()
+
     def initialize(self, *args, **kwargs):
+        jobs = Jobs(self, *args, **kwargs)
+        sites = Sites(self, *args, **kwargs)
+        pages = Pages(self, *args, **kwargs)
+        self.rpc_objects = [jobs, sites, pages]
+
         self.dispatcher = Dispatcher()
-        self.dispatcher.add_object(Jobs(self, *args, **kwargs))
-        self.dispatcher.add_object(Sites(self, *args, **kwargs))
-        self.dispatcher.add_object(Pages(self, *args, **kwargs))
+        for obj in self.rpc_objects:
+            self.dispatcher.add_object(obj)
 
     def handle_request(self, body):
         response = JSONRPCResponseManager.handle(body, self.dispatcher)
@@ -45,4 +55,4 @@ def post(self):
 
     def send_data(self, data):
         self.write(json_encode(data))
-        self.finish()
\ No newline at end of file
+        self.finish()
diff --git a/arachnado/rpc/data.py b/arachnado/rpc/data.py
index 42a617c..552b665 100644
--- a/arachnado/rpc/data.py
+++ b/arachnado/rpc/data.py
@@ -1,11 +1,6 @@
 import logging
-
-from arachnado.utils.misc import json_encode
-# A little monkey patching to have custom types encoded right
-# from jsonrpclib import jsonrpc
-# jsonrpc.jdumps = json_encode
-# import tornadorpc
 import json
+from collections import deque
 
 from tornado import gen
 import tornado.ioloop
 from bson.objectid import ObjectId
@@ -18,15 +13,14 @@
 from arachnado.crawler_process import agg_stats_changed, \
     CrawlerProcessSignals as CPS
 from arachnado.rpc.ws import RpcWebsocketHandler
+from arachnado.utils.misc import json_encode
 
 logger = logging.getLogger(__name__)
-# tornadorpc.config.verbose = True
-# tornadorpc.config.short_errors = True
 
 
 class DataRpcWebsocketHandler(RpcWebsocketHandler):
     """ basic class for Data API handlers"""
-    stored_data = []
+    stored_data = deque()
     delay_mode = False
     event_types = []
     data_hb = None
@@ -52,15 +46,15 @@ def init_hb(self, update_delay):
             )
             self.data_hb.start()
 
-    def add_storage(self, mongo_q, storage):
-        self.dispatcher.add_object(storage)
+    def add_storage_wrapper(self, mongo_q, storage_wrapper):
+        self.dispatcher.add_object(storage_wrapper)
         new_id = str(len(self.storages))
         self.storages[new_id] = {
-            "storage": storage,
+            "storage": storage_wrapper,
             "job_ids": set([])
         }
-        storage.handler_id = new_id
-        storage.subscribe(query=mongo_q)
+        storage_wrapper.handler_id = new_id
+        storage_wrapper.subscribe(query=mongo_q)
         return new_id
 
     def cancel_subscription(self, subscription_id):
@@ -79,8 +73,6 @@ def initialize(self, *args, **kwargs):
         self.dispatcher["cancel_subscription"] = self.cancel_subscription
 
     def on_close(self):
-        # import traceback
-        # traceback.print_stack()
         logger.info("connection closed")
         for storage in self.storages.values():
             storage["storage"]._on_close()
@@ -98,9 +90,8 @@ def on_spider_closed(self, spider):
             self.write_event("jobs:state", job)
 
     def send_updates(self):
-        logger.debug("send_updates: {}".format(len(self.stored_data)))
         while len(self.stored_data):
-            item = self.stored_data.pop(0)
+            item = self.stored_data.popleft()
             return self._send_event(item["event"], item["data"])
 
 
@@ -113,7 +104,7 @@ def subscribe_to_jobs(self, include=[], exclude=[], update_delay=0):
         mongo_q = self.create_jobs_query(include=include, exclude=exclude)
         self.init_hb(update_delay)
         return {
             "datatype": "job_subscription_id",
-            "id": self.add_storage(mongo_q, storage=self.create_jobs_storage_link())
+            "id": self.add_storage_wrapper(mongo_q, storage_wrapper=self.create_jobs_storage_link())
         }
 
     @gen.coroutine
@@ -128,8 +119,8 @@ def write_event(self, event, data, handler_id=None):
         if event == 'stats:changed':
             if len(data) > 1:
                 job_id = data[0]
-                # dumps for back compatibility
-                event_data = {"stats": json.dumps(data[1]),
+                # two fields with the same content for backwards compatibility
+                event_data = {"stats": data[1],
                               "stats_dict": data[1],
                               }
                 # same as crawl_id
@@ -146,6 +137,12 @@ def write_event(self, event, data, handler_id=None):
                 allowed = allowed or job_id in storage["job_ids"]
             if not allowed:
                 return
+        if 'stats' in event_data:
+            if not isinstance(event_data['stats'], dict):
+                try:
+                    event_data['stats'] = json.loads(event_data['stats'])
+                except Exception:
+                    logger.warning("Invalid stats field in job {}".format(event_data.get("_id", "MISSING MONGO ID")))
         if event in self.event_types and self.delay_mode:
             self.stored_data.append({"event":event, "data":event_data})
         else:
@@ -173,8 +170,6 @@ def create_jobs_storage_link(self):
         return jobs
 
     def on_close(self):
-        # import traceback
-        # traceback.print_stack()
         logger.info("connection closed")
         if self.cp:
             self.cp.signals.disconnect(self.on_stats_changed, agg_stats_changed)
@@ -206,12 +201,12 @@ def subscribe_to_pages(self, site_ids={}, update_delay=0, mode="urls"):
         }
         if mode == "urls":
             mongo_q = self.create_pages_query(site_ids=site_ids)
-            result["single_subscription_id"] = self.add_storage(mongo_q, storage=self.create_pages_storage_link())
+            result["single_subscription_id"] = self.add_storage_wrapper(mongo_q, storage_wrapper=self.create_pages_storage_link())
         elif mode == "ids":
             res = {}
             for site_id in site_ids:
                 mongo_q = self.create_pages_query(site_ids=site_ids[site_id])
-                res[site_id] = self.add_storage(mongo_q, storage=self.create_pages_storage_link())
+                res[site_id] = self.add_storage_wrapper(mongo_q, storage_wrapper=self.create_pages_storage_link())
             result["id"] = res
         return result
diff --git a/arachnado/rpc/jobs.py b/arachnado/rpc/jobs.py
index 5d48ef9..e0c2f40 100644
--- a/arachnado/rpc/jobs.py
+++ b/arachnado/rpc/jobs.py
@@ -1,15 +1,22 @@
 import logging
 
+from arachnado.storages.mongotail import MongoTailStorage
+
 
 class Jobs(object):
+    """
+    This object is exposed for RPC requests.
+    It allows subscribing to scraping job updates.
+    """
     handler_id = None
     logger = logging.getLogger(__name__)
 
     def __init__(self, handler, job_storage, **kwargs):
         self.handler = handler
-        self.storage = job_storage
+        self.storage = job_storage  # type: MongoTailStorage
 
     def subscribe(self, last_id=0, query=None, fields=None):
+        """ Subscribe for job updates. """
         self.storage.subscribe('tailed', self._publish, last_id=last_id,
                                query=query, fields=fields)
diff --git a/arachnado/rpc/pages.py b/arachnado/rpc/pages.py
index c30adda..4540200 100644
--- a/arachnado/rpc/pages.py
+++ b/arachnado/rpc/pages.py
@@ -2,6 +2,7 @@
 
 
 class Pages(object):
+    """ Pages (scraped items) object exposed via JSON-RPC """
     handler_id = None
 
     def __init__(self, handler, item_storage, **kwargs):
diff --git a/arachnado/rpc/sites.py b/arachnado/rpc/sites.py
index a547811..c475cdf 100644
--- a/arachnado/rpc/sites.py
+++ b/arachnado/rpc/sites.py
@@ -1,13 +1,16 @@
 import logging
+from functools import partial
 
+from arachnado.storages.mongotail import MongoTailStorage
 
-class Sites(object):
 
+class Sites(object):
+    """ 'Known sites' object exposed via JSON-RPC """
     logger = logging.getLogger(__name__)
 
     def __init__(self, handler, site_storage, **kwargs):
         self.handler = handler
-        self.storage = site_storage
+        self.storage = site_storage  # type: MongoTailStorage
 
     def list(self):
         return self.storage.fetch()
@@ -22,15 +25,14 @@ def delete(self, site):
         self.storage.delete(site)
 
     def subscribe(self):
-        for subscription in self.storage.available_subscriptions:
+        for event_name in self.storage.available_events:
             self.storage.subscribe(
-                subscription,
-                lambda data, subscription=subscription:
-                self._publish(data, subscription)
+                event_name,
+                partial(self._publish, event=event_name)
             )
 
     def _on_close(self):
-        self.storage.unsubscribe(self.storage.available_subscriptions)
+        self.storage.unsubscribe(self.storage.available_events)
 
-    def _publish(self, data, subscription):
-        self.handler.write_event('sites.{}'.format(subscription), data)
+    def _publish(self, event, data):
+        self.handler.write_event('sites.{}'.format(event), data)
diff --git a/arachnado/rpc/stats.py b/arachnado/rpc/stats.py
deleted file mode 100644
index 07fa5a5..0000000
--- a/arachnado/rpc/stats.py
+++ /dev/null
@@ -1,33 +0,0 @@
-import logging
-
-
-class Stats(object):
-
-    logger = logging.getLogger(__name__)
-
-    def __init__(self, handler, stats_storage, **kwargs):
-        self.handler = handler
-        self.storage = stats_storage
-
-    def list(self):
-        return self.storage.cache.values()
-
-    def post(self, site):
-        self.storage.create(site)
-
-    def patch(self, site):
-        self.storage.update(site)
-
-    def subscribe(self):
-        for subscription in self.storage.available_subscriptions:
-            self.storage.subscribe(
-                subscription,
-                lambda data, subscription=subscription:
-                self._publish(data, subscription)
-            )
-
-    def _on_close(self):
-        self.storage.unsubscribe(self.storage.available_subscriptions)
-
-    def _publish(self, data, subscription):
-        self.handler.write_event('stats.{}'.format(subscription), data)
diff --git a/arachnado/rpc/ws.py b/arachnado/rpc/ws.py
index 1772b49..106a702 100644
--- a/arachnado/rpc/ws.py
+++ b/arachnado/rpc/ws.py
@@ -46,24 +46,19 @@ def write_event(self, event, data):
 
     def open(self):
         """ Forward open event to resource objects. """
-        for resource in self._resources():
+        logger.debug("Connection opened %s", self)
+        for resource in self.rpc_objects:
             if hasattr(resource, '_on_open'):
                 resource._on_open()
         self._pinger = PeriodicCallback(lambda: self.ping(b'PING'), 1000 * 15)
         self._pinger.start()
-        logger.info("Pinger initiated")
+        logger.debug("Pinger initiated %s", self)
 
     def on_close(self):
         """ Forward on_close event to resource objects. """
-        for resource in self._resources():
+        logger.debug("Connection closed %s", self)
+        for resource in self.rpc_objects:
             if hasattr(resource, '_on_close'):
                 resource._on_close()
         self._pinger.stop()
-
-    def _resources(self):
-        for resource_name, resource in self.__dict__.items():
-            if hasattr(RequestHandler, resource_name):
-                continue
-            yield resource
-
diff --git a/arachnado/site_checker.py b/arachnado/site_checker.py
index 00630da..e450a57 100644
--- a/arachnado/site_checker.py
+++ b/arachnado/site_checker.py
@@ -165,5 +165,5 @@ def rerun_check(self, site):
     def detect_engine(self, body):
         result = self.detector.detect(body) if self.detector else None
         if result is None:
-            return 'generic', {}
+            return None, {}
         return result
diff --git a/arachnado/static/js/pages/SitesPage.jsx b/arachnado/static/js/pages/SitesPage.jsx
index 62198c6..e71bc1f 100644
--- a/arachnado/static/js/pages/SitesPage.jsx
+++ b/arachnado/static/js/pages/SitesPage.jsx
@@ -143,7 +143,8 @@ var SiteRow = React.createClass({
             args: keyValueListToDict(this.props.site.args)
         };
 
-        if(this.props.site.engine == 'generic') {
+        // checking for == 'generic' to be backwards compatible
+        if(!this.props.site.engine || this.props.site.engine == 'generic') {
             JobStore.Actions.startCrawl(this.props.site.url, options);
         } else {
             args.start_urls = [this.props.site.url];
diff --git a/arachnado/storages/memory.py b/arachnado/storages/memory.py
deleted file mode 100644
index e69de29..0000000
diff --git a/arachnado/storages/mongo.py b/arachnado/storages/mongo.py
index 68eebda..c71ed93 100644
--- a/arachnado/storages/mongo.py
+++ b/arachnado/storages/mongo.py
@@ -9,58 +9,64 @@
 
 
 class MongoStorage(object):
-
+    """
+    Utility class for working with MongoDB data.
+    It supports CRUD operations and allows subscribing to
+    created/updated/deleted events.
+    """
     def __init__(self, mongo_uri, cache=False):
         self.mongo_uri = mongo_uri
-        self.cache_flag = cache
         _, _, _, _, self.col = motor_from_uri(mongo_uri)
         self.signal_manager = SignalManager()
         # Used for unsubscribe
         # disconnect() requires reference to original callback
-        self.subscription_callbacks = {}
-        if cache:
-            self.cache = defaultdict(dict)
-        else:
-            self.cache = None
+        self._callbacks = {}
         self.fetching = False
         self.signals = {
            'created': object(),
            'updated': object(),
            'deleted': object(),
         }
+        # XXX: cache is used in arachnado.cron and arachnado.site_checker.
+        # Is it needed?
+        self.cache_flag = cache
+        if cache:
+            self.cache = defaultdict(dict)
+        else:
+            self.cache = None
 
-    def subscribe(self, subscriptions=None, callback=None):
-        if subscriptions is None:
-            subscriptions = self.available_subscriptions
-        if not isinstance(subscriptions, list):
-            subscriptions = [subscriptions]
-        for subscription in subscriptions:
-            try:
-                self.signal_manager.connect(callback,
-                                            self.signals[subscription],
-                                            weak=False)
-                self.subscription_callbacks[subscription] = callback
-            except KeyError as exc:
-                raise ValueError('Invalid subscription type: {}'.format(exc))
+    def subscribe(self, events=None, callback=None):
+        if events is None:
+            events = self.available_events
+        if not isinstance(events, list):
+            events = [events]
+        for event_name in events:
+            if event_name not in self.signals:
+                raise ValueError('Invalid event name: {}'.format(event_name))
+            self.signal_manager.connect(callback,
+                                        self.signals[event_name],
+                                        weak=False)
+            self._callbacks[event_name] = callback
 
-    def unsubscribe(self, subscriptions=None):
-        if subscriptions is None:
-            subscriptions = self.available_subscriptions
-        if not isinstance(subscriptions, list):
-            subscriptions = [subscriptions]
-        for subscription in subscriptions:
+    def unsubscribe(self, events=None):
+        if events is None:
+            events = self.available_events
+        if not isinstance(events, list):
+            events = [events]
+        for event_name in events:
             try:
                 self.signal_manager.disconnect(
-                    self.subscription_callbacks[subscription],
-                    self.signals[subscription],
+                    self._callbacks[event_name],
+                    self.signals[event_name],
                     weak=False
                 )
-                self.subscription_callbacks.pop(subscription, None)
+                self._callbacks.pop(event_name, None)
             except KeyError:
+                # FIXME: when can this happen?
                 pass
 
     @property
-    def available_subscriptions(self):
+    def available_events(self):
         return list(self.signals.keys())
 
     @coroutine
diff --git a/arachnado/storages/mongotail.py b/arachnado/storages/mongotail.py
index 61c429d..d30f68d 100644
--- a/arachnado/storages/mongotail.py
+++ b/arachnado/storages/mongotail.py
@@ -6,6 +6,9 @@
 
 
 class MongoTailStorage(MongoStorage):
+    """
+    This MongoStorage subclass allows subscribing to a MongoDB query.
+    """
     fetch_delay = 0
 
     def __init__(self, mongo_uri, *args, **kwargs):
@@ -13,18 +16,24 @@ def __init__(self, mongo_uri, *args, **kwargs):
         self.tailing = False
         self.signals['tailed'] = object()
 
-    def subscribe(self, subscriptions, callback, last_id=None, query=None,
+    def subscribe(self, events, callback, last_id=None, query=None,
                   fields=None):
-        if 'tailed' in subscriptions:
+        if 'tailed' in events:
             self.tail(query, fields, last_id)
-        super(MongoTailStorage, self).subscribe(subscriptions, callback)
+        super(MongoTailStorage, self).subscribe(events, callback)
 
-    def unsubscribe(self, subscriptions):
-        if 'tailed' in subscriptions:
+    def unsubscribe(self, events):
+        if 'tailed' in events:
             self.untail()
+        # FIXME: shouldn't it unsubscribe from other events, i.e. call super()?
 
     @coroutine
     def tail(self, query=None, fields=None, last_object_id=None):
+        """
+        Execute ``query`` periodically, fetching new results.
+        The ``self.signals['tailed']`` signal is sent for each
+        new document that appears.
+        """
         if self.tailing:
             raise RuntimeError('This storage is already tailing')
         self.tailing = True
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 0000000..e8c61ee
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,225 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+PAPER         =
+BUILDDIR      = _build
+
+# Internal variables.
+PAPEROPT_a4     = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+# the i18n builder cannot share the environment and doctrees with the others
+I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+
+.PHONY: help
+help:
+	@echo "Please use \`make <target>' where <target> is one of"
+	@echo "  html       to make standalone HTML files"
+	@echo "  dirhtml    to make HTML files named index.html in directories"
+	@echo "  singlehtml to make a single large HTML file"
+	@echo "  pickle     to make pickle files"
+	@echo "  json       to make JSON files"
+	@echo "  htmlhelp   to make HTML files and a HTML help project"
+	@echo "  qthelp     to make HTML files and a qthelp project"
+	@echo "  applehelp  to make an Apple Help Book"
+	@echo "  devhelp    to make HTML files and a Devhelp project"
+	@echo "  epub       to make an epub"
+	@echo "  epub3      to make an epub3"
+	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
+	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
+	@echo "  text       to make text files"
+	@echo "  man        to make manual pages"
+	@echo "  texinfo    to make Texinfo files"
+	@echo "  info       to make Texinfo files and run them through makeinfo"
+	@echo "  gettext    to make PO message catalogs"
+	@echo "  changes    to make an overview of all changed/added/deprecated items"
+	@echo "  xml        to make Docutils-native XML files"
+	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
+	@echo "  linkcheck  to check all external links for integrity"
+	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
+	@echo "  coverage   to run coverage check of the documentation (if enabled)"
+	@echo "  dummy      to check syntax errors of document sources"
+
+.PHONY: clean
+clean:
+	rm -rf $(BUILDDIR)/*
+
+.PHONY: html
+html:
+	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+.PHONY: dirhtml
+dirhtml:
+	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+.PHONY: singlehtml
+singlehtml:
+	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+	@echo
+	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+.PHONY: pickle
+pickle:
+	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+	@echo
+	@echo "Build finished; now you can process the pickle files."
+
+.PHONY: json
+json:
+	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+	@echo
+	@echo "Build finished; now you can process the JSON files."
+
+.PHONY: htmlhelp
+htmlhelp:
+	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+	@echo
+	@echo "Build finished; now you can run HTML Help Workshop with the" \
+	      ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+.PHONY: qthelp
+qthelp:
+	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+	@echo
+	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
+	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/Arachnado.qhcp"
+	@echo "To view the help file:"
+	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/Arachnado.qhc"
+
+.PHONY: applehelp
+applehelp:
+	$(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp
+	@echo
+	@echo "Build finished. The help book is in $(BUILDDIR)/applehelp."
+	@echo "N.B. You won't be able to view it unless you put it in" \
+	      "~/Library/Documentation/Help or install it in your application" \
+	      "bundle."
+
+.PHONY: devhelp
+devhelp:
+	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+	@echo
+	@echo "Build finished."
+	@echo "To view the help file:"
+	@echo "# mkdir -p $$HOME/.local/share/devhelp/Arachnado"
+	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/Arachnado"
+	@echo "# devhelp"
+
+.PHONY: epub
+epub:
+	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+	@echo
+	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+.PHONY: epub3
+epub3:
+	$(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3
+	@echo
+	@echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3."
+
+.PHONY: latex
+latex:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo
+	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+	@echo "Run \`make' in that directory to run these through (pdf)latex" \
+	      "(use \`make latexpdf' here to do that automatically)."
+
+.PHONY: latexpdf
+latexpdf:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo "Running LaTeX files through pdflatex..."
+	$(MAKE) -C $(BUILDDIR)/latex all-pdf
+	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+.PHONY: latexpdfja
+latexpdfja:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo "Running LaTeX files through platex and dvipdfmx..."
+	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
+	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+.PHONY: text
+text:
+	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+	@echo
+	@echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+.PHONY: man
+man:
+	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+	@echo
+	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+.PHONY: texinfo
+texinfo:
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo
+	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
+	@echo "Run \`make' in that directory to run these through makeinfo" \
+	      "(use \`make info' here to do that automatically)."
+
+.PHONY: info
+info:
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo "Running Texinfo files through makeinfo..."
+	make -C $(BUILDDIR)/texinfo info
+	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
+
+.PHONY: gettext
+gettext:
+	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
+	@echo
+	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
+
+.PHONY: changes
+changes:
+	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+	@echo
+	@echo "The overview file is in $(BUILDDIR)/changes."
+
+.PHONY: linkcheck
+linkcheck:
+	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+	@echo
+	@echo "Link check complete; look for any errors in the above output " \
+	      "or in $(BUILDDIR)/linkcheck/output.txt."
+
+.PHONY: doctest
+doctest:
+	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+	@echo "Testing of doctests in the sources finished, look at the " \
+	      "results in $(BUILDDIR)/doctest/output.txt."
+
+.PHONY: coverage
+coverage:
+	$(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage
+	@echo "Testing of coverage in the sources finished, look at the " \
+	      "results in $(BUILDDIR)/coverage/python.txt."
+
+.PHONY: xml
+xml:
+	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
+	@echo
+	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
+
+.PHONY: pseudoxml
+pseudoxml:
+	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
+	@echo
+	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
+
+.PHONY: dummy
+dummy:
+	$(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy
+	@echo
+	@echo "Build finished. Dummy builder generates no files."
diff --git a/docs/_static/img/arachnado-0.png b/docs/_static/img/arachnado-0.png
new file mode 100644
index 0000000..0a87676
Binary files /dev/null and b/docs/_static/img/arachnado-0.png differ
diff --git a/docs/_static/img/arachnado-1.png b/docs/_static/img/arachnado-1.png
new file mode 100644
index 0000000..212eb5b
Binary files /dev/null and b/docs/_static/img/arachnado-1.png differ
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 0000000..6d3c3f0
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,338 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# Arachnado documentation build configuration file, created by
+# sphinx-quickstart on Fri Jul  1 16:48:37 2016.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+# import os
+# import sys
+# sys.path.insert(0, os.path.abspath('.'))
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#
+# needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    'sphinx.ext.autodoc',
+    'sphinx.ext.viewcode',
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+#
+# source_suffix = ['.rst', '.md']
+source_suffix = '.rst'
+
+# The encoding of source files.
+#
+# source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = 'Arachnado'
+copyright = '2016, TeamHG'
+author = 'TeamHG'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = '0.2'
+# The full version, including alpha/beta/rc tags.
+release = '0.2'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#
+# today = ''
+#
+# Else, today_fmt is used as the format for a strftime call.
+#
+# today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# These patterns also affect html_static_path and html_extra_path
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+
+# The reST default role (used for this markup: `text`) to use for all
+# documents.
+#
+# default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#
+# add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#
+# add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#
+# show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+# modindex_common_prefix = []
+
+# If true, keep warnings as "system message" paragraphs in the built documents.
+# keep_warnings = False
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+todo_include_todos = False
+
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+import sphinx_rtd_theme
+html_theme = "sphinx_rtd_theme"
+html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+#
+# html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+# html_theme_path = []
+
+# The name for this set of Sphinx documents.
+# "<project> v<release> documentation" by default.
+#
+# html_title = 'Arachnado v0.2'
+
+# A shorter title for the navigation bar.  Default is the same as html_title.
+#
+# html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#
+# html_logo = None
+
+# The name of an image file (relative to this directory) to use as a favicon of
+# the docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#
+# html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Add any extra paths that contain custom files (such as robots.txt or
+# .htaccess) here, relative to this directory. These files are copied
+# directly to the root of the documentation.
+#
+# html_extra_path = []
+
+# If not None, a 'Last updated on:' timestamp is inserted at every page
+# bottom, using the given strftime format.
+# The empty string is equivalent to '%b %d, %Y'.
+#
+# html_last_updated_fmt = None
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#
+# html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#
+# html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#
+# html_additional_pages = {}
+
+# If false, no module index is generated.
+#
+# html_domain_indices = True
+
+# If false, no index is generated.
+#
+# html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#
+# html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#
+# html_show_sourcelink = True
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+#
+# html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+#
+# html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it.  The value of this option must be the
+# base URL from which the finished HTML is served.
+#
+# html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+# html_file_suffix = None
+
+# Language to be used for generating the HTML full-text search index.
+# Sphinx supports the following languages:
+#   'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja'
+#   'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh'
+#
+# html_search_language = 'en'
+
+# A dictionary with options for the search language support, empty by default.
+# 'ja' uses this config value.
+# 'zh' user can custom change `jieba` dictionary path.
+#
+# html_search_options = {'type': 'default'}
+
+# The name of a javascript file (relative to the configuration directory) that
+# implements a search results scorer. If empty, the default will be used.
+#
+# html_search_scorer = 'scorer.js'
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'Arachnadodoc'
+
+# -- Options for LaTeX output ---------------------------------------------
+
+latex_elements = {
+    # The paper size ('letterpaper' or 'a4paper').
+    #
+    # 'papersize': 'letterpaper',
+
+    # The font size ('10pt', '11pt' or '12pt').
+    #
+    # 'pointsize': '10pt',
+
+    # Additional stuff for the LaTeX preamble.
+    #
+    # 'preamble': '',
+
+    # Latex figure (float) alignment
+    #
+    # 'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+latex_documents = [
+    (master_doc, 'Arachnado.tex', 'Arachnado Documentation',
+     'TeamHG', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#
+# latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#
+# latex_use_parts = False
+
+# If true, show page references after internal links.
+#
+# latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+#
+# latex_show_urls = False
+
+# Documents to append as an appendix to all manuals.
+#
+# latex_appendices = []
+
+# If false, no module index is generated.
+#
+# latex_domain_indices = True
+
+
+# -- Options for manual page output ---------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    (master_doc, 'arachnado', 'Arachnado Documentation',
+     [author], 1)
+]
+
+# If true, show URL addresses after external links.
+#
+# man_show_urls = False
+
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+    (master_doc, 'Arachnado', 'Arachnado Documentation',
+     author, 'Arachnado', 'One line description of project.',
+     'Miscellaneous'),
+]
+
+# Documents to append as an appendix to all manuals.
+#
+# texinfo_appendices = []
+
+# If false, no module index is generated.
+#
+# texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#
+# texinfo_show_urls = 'footnote'
+
+# If true, do not generate a @detailmenu in the "Top" node's menu.
+#
+# texinfo_no_detailmenu = False
diff --git a/docs/config.rst b/docs/config.rst
new file mode 100644
index 0000000..65d3608
--- /dev/null
+++ b/docs/config.rst
@@ -0,0 +1,20 @@
+.. _config:
+
+Configuration
+=============
+
+Arachnado can be configured using a config file. Put it in one of the common
+locations:
+
+* `/etc/arachnado.conf`
+* `~/.config/arachnado.conf`
+* `~/.arachnado.conf`
+
+or pass the file name as an argument when starting the server::
+
+    arachnado --config ./my-config.conf
+
+Available options and their default values:
+
+.. literalinclude::
+    ../arachnado/config/defaults.conf
diff --git a/docs/http-api.rst b/docs/http-api.rst
new file mode 100644
index 0000000..b7d78c6
--- /dev/null
+++ b/docs/http-api.rst
@@ -0,0 +1,69 @@
+HTTP API
+========
+
+Arachnado provides an HTTP API for starting and stopping crawls.
+
+To use the HTTP API, send a POST request with a
+``Content-Type: application/json`` header; parameters should be in the
+JSON-encoded POST body.
+
+/crawler/start
+--------------
+
+Start a crawling job. Parameters::
+
+    {
+        "domain": "<domain>",
+        "args": {},
+        "settings": {}
+    }
+
+If the job is started successfully, the endpoint returns a
+``{"status": "ok", "job_id": "<id>"}`` object with the ID of the started job.
+
+In case of errors ``{"status": "error"}`` is returned.
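+
+For example, a crawl could be started with ``curl`` like this (host and port
+below are Arachnado's defaults and may differ in your setup)::
+
+    curl -X POST -H 'Content-Type: application/json' \
+         -d '{"domain": "example.com", "args": {}, "settings": {}}' \
+         http://0.0.0.0:8888/crawler/start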
+
+/crawler/stop
+-------------
+
+Stop a job. Parameters::
+
+    {"job_id": "<id>"}
+
+If the job is stopped successfully, the endpoint returns
+``{"status": "ok"}``, otherwise it returns ``{"status": "error"}``.
+
+
+/crawler/pause
+--------------
+
+Pause a job. Parameters::
+
+    {"job_id": "<id>"}
+
+If the job is paused successfully, the endpoint returns
+``{"status": "ok"}``, otherwise it returns ``{"status": "error"}``.
+
+
+/crawler/resume
+---------------
+
+Resume a paused job. Parameters::
+
+    {"job_id": "<id>"}
+
+If the job is resumed successfully, the endpoint returns
+``{"status": "ok"}``, otherwise it returns ``{"status": "error"}``.
+
+
+/crawler/status
+---------------
+
+TODO
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 0000000..12070a9
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,37 @@
+Arachnado
+=========
+
+Arachnado is a tool to crawl a specific website.
+It provides a Tornado_-based HTTP API and a web UI for a
+Scrapy_-based crawler.
+
+License is MIT.
+
+.. _Tornado: http://www.tornadoweb.org
+.. _Scrapy: http://scrapy.org/
+
+.. toctree::
+   :maxdepth: 2
+
+   intro
+   config
+   http-api
+   json-rpc-api
+
+Screenshots
+-----------
+
+.. image::
+    _static/img/arachnado-0.png
+
+
+.. image::
+    _static/img/arachnado-1.png
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
+
diff --git a/docs/intro.rst b/docs/intro.rst
new file mode 100644
index 0000000..a8534c3
--- /dev/null
+++ b/docs/intro.rst
@@ -0,0 +1,22 @@
+Getting Started
+===============
+
+Install
+-------
+
+Arachnado requires Python 2.7 or Python 3.5.
+To install Arachnado use pip::
+
+    pip install arachnado
+
+Run
+---
+
+To start Arachnado execute the ``arachnado`` command::
+
+    arachnado
+
+and then visit http://0.0.0.0:8888.
+
+Run ``arachnado --help`` to see available command-line options.
+See also: :ref:`config`.
diff --git a/docs/json-rpc-api.rst b/docs/json-rpc-api.rst
new file mode 100644
index 0000000..6da986c
--- /dev/null
+++ b/docs/json-rpc-api.rst
@@ -0,0 +1,56 @@
+JSON-RPC API
+============
+
+Arachnado provides a JSON-RPC_ API for working with jobs and crawled items
+(pages). The API works over a WebSocket transport.
+
+**FIXME**: JSON-RPC request objects are wrapped:
+``{"event": "rpc:request", "data": <request>}``.
+Responses are also wrapped:
+``{"event": "rpc:response", "data": <response>}``.
+
+
+JSON-RPC requests have the following format::
+
+    {
+        "jsonrpc": "2.0",
+
+        # pass a unique request id here; it will be included in the response
+        "id": 362810,
+
+        # command to execute
+        "method": "<command>",
+        "params": {"name": "value"},
+    }
+
+JSON-RPC responses::
+
+    {
+        "jsonrpc": "2.0",
+
+        # id of the request
+        "id": 362810,
+
+        # what the command returns
+        "result": ...
+    }
+
+Working with jobs
+-----------------
+
+The JSON-RPC API allows you to:
+
+* get information about scraping jobs;
+* start new crawls;
+* subscribe to job updates (an example request is shown below).
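+
+As an example, a subscription request for the ``subscribe_to_jobs``
+method could look like this, using the wrapping described above
+(the ``id`` value is arbitrary)::
+
+    {"event": "rpc:request",
+     "data": {"jsonrpc": "2.0", "id": 1,
+              "method": "subscribe_to_jobs",
+              "params": {"include": [], "exclude": [], "update_delay": 0}}}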
+
+.. _JSON-RPC: http://www.jsonrpc.org/specification
diff --git a/docs/make.bat b/docs/make.bat
new file mode 100644
index 0000000..03d149a
--- /dev/null
+++ b/docs/make.bat
@@ -0,0 +1,281 @@
+@ECHO OFF
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set BUILDDIR=_build
+set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
+set I18NSPHINXOPTS=%SPHINXOPTS% .
+if NOT "%PAPER%" == "" (
+	set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
+	set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
+)
+
+if "%1" == "" goto help
+
+if "%1" == "help" (
+	:help
+	echo.Please use `make ^<target^>` where ^<target^> is one of
+	echo.  html       to make standalone HTML files
+	echo.  dirhtml    to make HTML files named index.html in directories
+	echo.  singlehtml to make a single large HTML file
+	echo.  pickle     to make pickle files
+	echo.  json       to make JSON files
+	echo.  htmlhelp   to make HTML files and a HTML help project
+	echo.  qthelp     to make HTML files and a qthelp project
+	echo.  devhelp    to make HTML files and a Devhelp project
+	echo.  epub       to make an epub
+	echo.  epub3      to make an epub3
+	echo.  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter
+	echo.  text       to make text files
+	echo.  man        to make manual pages
+	echo.  texinfo    to make Texinfo files
+	echo.  gettext    to make PO message catalogs
+	echo.  changes    to make an overview over all changed/added/deprecated items
+	echo.  xml        to make Docutils-native XML files
+	echo.  pseudoxml  to make pseudoxml-XML files for display purposes
+	echo.  linkcheck  to check all external links for integrity
+	echo.  doctest    to run all doctests embedded in the documentation if enabled
+	echo.  coverage   to run coverage check of the documentation if enabled
+	echo.  dummy      to check syntax errors of document sources
+	goto end
+)
+
+if "%1" == "clean" (
+	for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
+	del /q /s %BUILDDIR%\*
+	goto end
+)
+
+
+REM Check if sphinx-build is available and fallback to Python version if any
+%SPHINXBUILD% 1>NUL 2>NUL
+if errorlevel 9009 goto sphinx_python
+goto sphinx_ok
+
+:sphinx_python
+
+set SPHINXBUILD=python -m sphinx.__init__
+%SPHINXBUILD% 2> nul
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.http://sphinx-doc.org/
+	exit /b 1
+)
+
+:sphinx_ok
+
+
+if "%1" == "html" (
+	%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The HTML pages are in %BUILDDIR%/html.
+	goto end
+)
+
+if "%1" == "dirhtml" (
+	%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
+	goto end
+)
+
+if "%1" == "singlehtml" (
+	%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
+	goto end
+)
+
+if "%1" == "pickle" (
+	%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished; now you can process the pickle files.
+	goto end
+)
+
+if "%1" == "json" (
+	%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished; now you can process the JSON files.
+	goto end
+)
+
+if "%1" == "htmlhelp" (
+	%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished; now you can run HTML Help Workshop with the ^
+.hhp project file in %BUILDDIR%/htmlhelp.
+	goto end
+)
+
+if "%1" == "qthelp" (
+	%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished; now you can run "qcollectiongenerator" with the ^
+.qhcp project file in %BUILDDIR%/qthelp, like this:
+	echo.^> qcollectiongenerator %BUILDDIR%\qthelp\Arachnado.qhcp
+	echo.To view the help file:
+	echo.^> assistant -collectionFile %BUILDDIR%\qthelp\Arachnado.qhc
+	goto end
+)
+
+if "%1" == "devhelp" (
+	%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished.
+	goto end
+)
+
+if "%1" == "epub" (
+	%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The epub file is in %BUILDDIR%/epub.
+	goto end
+)
+
+if "%1" == "epub3" (
+	%SPHINXBUILD% -b epub3 %ALLSPHINXOPTS% %BUILDDIR%/epub3
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The epub3 file is in %BUILDDIR%/epub3.
+	goto end
+)
+
+if "%1" == "latex" (
+	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
+	goto end
+)
+
+if "%1" == "latexpdf" (
+	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+	cd %BUILDDIR%/latex
+	make all-pdf
+	cd %~dp0
+	echo.
+	echo.Build finished; the PDF files are in %BUILDDIR%/latex.
+	goto end
+)
+
+if "%1" == "latexpdfja" (
+	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+	cd %BUILDDIR%/latex
+	make all-pdf-ja
+	cd %~dp0
+	echo.
+	echo.Build finished; the PDF files are in %BUILDDIR%/latex.
+	goto end
+)
+
+if "%1" == "text" (
+	%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The text files are in %BUILDDIR%/text.
+	goto end
+)
+
+if "%1" == "man" (
+	%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The manual pages are in %BUILDDIR%/man.
+	goto end
+)
+
+if "%1" == "texinfo" (
+	%SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
+	goto end
+)
+
+if "%1" == "gettext" (
+	%SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
+	goto end
+)
+
+if "%1" == "changes" (
+	%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.The overview file is in %BUILDDIR%/changes.
+	goto end
+)
+
+if "%1" == "linkcheck" (
+	%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Link check complete; look for any errors in the above output ^
+or in %BUILDDIR%/linkcheck/output.txt.
+	goto end
+)
+
+if "%1" == "doctest" (
+	%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Testing of doctests in the sources finished, look at the ^
+results in %BUILDDIR%/doctest/output.txt.
+	goto end
+)
+
+if "%1" == "coverage" (
+	%SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Testing of coverage in the sources finished, look at the ^
+results in %BUILDDIR%/coverage/python.txt.
+	goto end
+)
+
+if "%1" == "xml" (
+	%SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The XML files are in %BUILDDIR%/xml.
+	goto end
+)
+
+if "%1" == "pseudoxml" (
+	%SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml.
+	goto end
+)
+
+if "%1" == "dummy" (
+	%SPHINXBUILD% -b dummy %ALLSPHINXOPTS% %BUILDDIR%/dummy
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. Dummy builder generates no files.
+	goto end
+)
+
+:end
diff --git a/requirements.txt b/requirements.txt
index 8b5c728..5b795c2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,6 +7,6 @@ pymongo==2.8  # required for motor 0.6.2
 docopt == 0.6.2
 service_identity
 json-rpc==1.10.3
-croniter == 0.3.8
+croniter == 0.3.12
 autopager == 0.2
 autologin-middleware == 0.1.1
diff --git a/tests/items.jl b/tests/items.jl
new file mode 100644
index 0000000..f3df1f7
--- /dev/null
+++ b/tests/items.jl
@@ -0,0 +1 @@
+{"status" : 200, "body" : "", "_type" : "page", "url" : "http://example.com/index.php", "items" : [ ], "headers" : { "Cache-Control" : [ "private, no-cache=\"set-cookie\"" ], "X-Powered-By" : [ "PHP/5.5.9-1ubuntu4.14" ], "Date" : [ "Sat, 28 May 2016 17:43:05 GMT" ], "Content-Type" : [ "text/html; charset=UTF-8" ], "Expires" : [ "Sat, 28 May 2016 17:43:05 GMT" ], "Vary" : [ "Accept-Encoding" ], "Server" : [ "Apache/2.4.7 (Ubuntu)" ] }, "meta" : { "download_timeout" : 180, "depth" : 2}}
\ No newline at end of file
diff --git a/tests/test_data.py b/tests/test_data.py
index 87a1fea..40142a8 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -2,6 +2,7 @@
 import tornado
 import json
 from tornado import web, websocket
+import tornado.testing
 
 import tests.utils as u
 
@@ -11,17 +12,12 @@ class TestJobsAPI(tornado.testing.AsyncHTTPTestCase):
     jobs_uri = r"/ws-jobs-data"
 
     def setUp(self):
-        print("setUp:")
         tornado.ioloop.IOLoop.current().run_sync(u.init_db)
         super(TestJobsAPI, self).setUp()
 
     def get_app(self):
         return u.get_app(self.pages_uri, self.jobs_uri)
 
-    # @tornado.testing.gen_test
-    # def test_fail(self):
-    #     self.assertTrue(False)
-
     @tornado.testing.gen_test
     def test_jobs_no_filter(self):
         jobs_command = {
@@ -39,7 +35,6 @@ def test_jobs_no_filter(self):
         ws_client.write_message(json.dumps(jobs_command))
         response = yield ws_client.read_message()
         json_response = json.loads(response)
-        print(json_response)
         subs_id = json_response.get("data", {}).get("result").get("id", -1)
         self.assertNotEqual(subs_id, -1)
         self.execute_cancel(ws_client, subs_id, True)
@@ -62,7 +57,6 @@ def test_jobs_filter_include(self):
         ws_client.write_message(json.dumps(jobs_command))
         response = yield ws_client.read_message()
         json_response = json.loads(response)
-        print(json_response)
         subs_id = json_response.get("data", {}).get("result").get("id", -1)
         self.assertNotEqual(subs_id, -1)
         cnt = 0
@@ -70,8 +64,11 @@ def test_jobs_filter_include(self):
             response = yield ws_client.read_message()
             json_response = json.loads(response)
             if json_response is None:
-                self.assertFail()
+                self.fail()
                 break
+            else:
+                self.assertTrue('stats' in json_response["data"])
+                self.assertTrue(isinstance(json_response["data"]["stats"], dict))
             cnt += 1
         self.execute_cancel(ws_client, subs_id, True)
 
@@ -92,9 +89,18 @@ def test_pages_no_filter(self):
         ws_client.write_message(json.dumps(pages_command))
         response = yield ws_client.read_message()
         json_response = json.loads(response)
-        print(json_response)
         subs_id = json_response.get("data", {}).get("result").get("single_subscription_id", -1)
         self.assertNotEqual(subs_id, -1)
+        cnt = 0
+        while cnt < 1:
+            response = yield ws_client.read_message()
+            json_response = json.loads(response)
+            if json_response is None:
+                self.fail()
+                break
+            else:
+                self.assertTrue('url' in json_response["data"])
+            cnt += 1
         self.execute_cancel(ws_client, subs_id, True)
 
     @tornado.testing.gen_test
@@ -118,5 +124,4 @@ def execute_cancel(self, ws_client, subscription_id, expected):
         ws_client.write_message(json.dumps(jobs_command))
         response = yield ws_client.read_message()
         json_response = json.loads(response)
-        print(json_response)
         self.assertEqual(json_response.get("data", {}).get("result"), expected)
diff --git a/tests/utils.py b/tests/utils.py
index 74db7bd..2b9db2d 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -6,6 +6,7 @@
 from arachnado.storages.mongotail import MongoTailStorage
 from arachnado.utils.mongo import motor_from_uri
 
+
 def get_db_uri():
     return "mongodb://localhost:27017/arachnado-test"
 
@@ -29,21 +30,21 @@ def get_app(ws_pages_uri, ws_jobs_uri):
 
 
 @tornado.gen.coroutine
-def init_db():
-    db_uri = get_db_uri()
-    # items_uri = "{}/items".format(db_uri)
-    uri = "{}/jobs".format(db_uri)
-    in_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "jobs.jl")
-    _, _, _, _, col = motor_from_uri(uri)
-    col_cnt = yield col.count()
-    print(col_cnt)
-    col.drop()
-    col_cnt = yield col.count()
-    print(col_cnt)
-    with open(in_path, "r") as fin:
+def import_file(file_path, mongo_uri):
+    _, _, _, _, col = motor_from_uri(mongo_uri)
+    # col.drop()
+    with open(file_path, "r") as fin:
         for text_line in fin:
-            job = json.loads(text_line)
-            print(job["_id"])
-            res = yield col.insert(job)
+            record = json.loads(text_line)
+            yield col.insert(record)
+
+
+@tornado.gen.coroutine
+def init_db():
+    db_uri = get_db_uri()
+    jobs_uri = "{}/jobs".format(db_uri)
+    jobs_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "jobs.jl")
+    yield import_file(jobs_path, jobs_uri)
+    items_uri = "{}/items".format(db_uri)
+    items_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "items.jl")
+    yield import_file(items_path, items_uri)
\ No newline at end of file
diff --git a/tox.ini b/tox.ini
index 1f6d62a..588ef70 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,5 +1,5 @@
 [tox]
-envlist = py27
+envlist = py27,py35
 
 [testenv]
 deps =