diff --git a/.gitignore b/.gitignore index c72797a..f73f3c4 100644 --- a/.gitignore +++ b/.gitignore @@ -20,3 +20,4 @@ bot_spiders/ .coverage.* htmlcov/ .scrapy +docs/_build diff --git a/Dockerfile b/Dockerfile index 9dcedb4..6651a80 100644 --- a/Dockerfile +++ b/Dockerfile @@ -35,12 +35,12 @@ RUN npm install # install arachnado COPY . /app +RUN pip install --editable /app # npm install is executed again because node_modules can be overwritten # if .dockerignore is not active (may happen with docker-compose or DockerHub) RUN npm install RUN npm run build -RUN pip3 install . # use e.g. -v /path/to/my/arachnado/config.conf:/etc/arachnado.conf # docker run option to override arachnado parameters @@ -51,7 +51,7 @@ RUN pip3 install . # this folder is added to PYTHONPATH, so modules from there are available # for spider_packages Arachnado option VOLUME /python-packages -ENV PYTHONPATH $PYTHONPATH:/python-packages +ENV PYTHONPATH $PYTHONPATH:/python-packages:/app EXPOSE 8888 -ENTRYPOINT ["arachnado"] +CMD ["arachnado"] diff --git a/README.rst b/README.rst index d95fa01..388a4f1 100644 --- a/README.rst +++ b/README.rst @@ -13,7 +13,7 @@ License is MIT. Install ------- -Arachnado requires Python 2.7. +Arachnado requires Python 2.7 or Python 3.5. To install Arachnado use pip:: pip install arachnado @@ -41,6 +41,14 @@ the server:: For available options check https://github.com/TeamHG-Memex/arachnado/blob/master/arachnado/config/defaults.conf. +Tests +----- + +To run tests make sure tox_ is installed, then +execute ``tox`` command from the source root. + +.. _tox: https://testrun.org/tox/latest/ + Development ----------- diff --git a/arachnado/__main__.py b/arachnado/__main__.py index 6f7d864..3738cc3 100755 --- a/arachnado/__main__.py +++ b/arachnado/__main__.py @@ -83,8 +83,11 @@ def main(port, host, start_manhole, manhole_port, manhole_host, loglevel, opts): }) job_storage = MongoTailStorage(jobs_uri, cache=True) + job_storage.ensure_index("urls") site_storage = MongoStorage(sites_uri, cache=True) item_storage = MongoTailStorage(items_uri) + item_storage.ensure_index("url") + item_storage.ensure_index("_job_id") crawler_process = ArachnadoCrawlerProcess(settings) diff --git a/arachnado/config/defaults.conf b/arachnado/config/defaults.conf index 0357e35..b91fe69 100644 --- a/arachnado/config/defaults.conf +++ b/arachnado/config/defaults.conf @@ -5,7 +5,8 @@ [arachnado] ; General Arachnado server options. -; Event loop to use. Allowed values are "twisted", "tornado" and "auto". +; Event loop to use. Allowed values are +; "twisted", "tornado" and "auto". reactor = auto ; Host/port to listen to @@ -30,9 +31,11 @@ DEPTH_LIMIT = 10 ; Packages to load spiders from (separated by whitespace) spider_packages = -; Name of the default spider. It is used for crawling if no custom spider -; is specified or detected. It should support API similar to -; arachnado.spider.CrawlWebsiteSpider (which is the default here) + +; Name of the default spider. It is used for crawling if +; no custom spider is specified or detected. It should support +; API similar to arachnado.spider.CrawlWebsiteSpider +; (which is the default here). 
default_spider_name = generic [arachnado.storage] diff --git a/arachnado/cron.py b/arachnado/cron.py index e192c1e..a4b85d6 100644 --- a/arachnado/cron.py +++ b/arachnado/cron.py @@ -17,7 +17,7 @@ def __init__(self, domain_crawlers, site_storage): self.waiting_calls = {} self.domain_crawlers = domain_crawlers self.site_storage = site_storage - self.site_storage.subscribe(self.site_storage.available_subscriptions, + self.site_storage.subscribe(self.site_storage.available_events, self.rerun) def start(self): diff --git a/arachnado/handlers.py b/arachnado/handlers.py index 62ba6a6..af5afd6 100644 --- a/arachnado/handlers.py +++ b/arachnado/handlers.py @@ -8,6 +8,9 @@ from arachnado.utils.misc import json_encode from arachnado.monitor import Monitor from arachnado.handler_utils import ApiHandler, NoEtagsMixin + +from arachnado.rpc.data import PagesDataRpcWebsocketHandler, JobsDataRpcWebsocketHandler + from arachnado.rpc import RpcHttpHandler from arachnado.rpc.ws import RpcWebsocketHandler @@ -28,16 +31,23 @@ def get_application(crawler_process, domain_crawlers, debug = opts['arachnado']['debug'] handlers = [ + # UI url(r"/", Index, context, name="index"), url(r"/help", Help, context, name="help"), + + # simple API used by UI url(r"/crawler/start", StartCrawler, context, name="start"), url(r"/crawler/stop", StopCrawler, context, name="stop"), url(r"/crawler/pause", PauseCrawler, context, name="pause"), url(r"/crawler/resume", ResumeCrawler, context, name="resume"), url(r"/crawler/status", CrawlerStatus, context, name="status"), url(r"/ws-updates", Monitor, context, name="ws-updates"), + + # RPC API url(r"/ws-rpc", RpcWebsocketHandler, context, name="ws-rpc"), url(r"/rpc", RpcHttpHandler, context, name="rpc"), + url(r"/ws-pages-data", PagesDataRpcWebsocketHandler, context, name="ws-pages-data"), + url(r"/ws-jobs-data", JobsDataRpcWebsocketHandler, context, name="ws-jobs-data"), ] return Application( handlers=handlers, @@ -143,6 +153,8 @@ def control_job(self, job_id): class CrawlerStatus(BaseRequestHandler): """ Status for one or more jobs. """ + # FIXME: does it work? Can we remove it? It is not used + # by Arachnado UI. def get(self): crawl_ids_arg = self.get_argument('crawl_ids', '') diff --git a/arachnado/manhole.py b/arachnado/manhole.py index 03d4ce0..f120b4f 100644 --- a/arachnado/manhole.py +++ b/arachnado/manhole.py @@ -3,13 +3,13 @@ An interactive Python interpreter available through telnet. 
""" from __future__ import absolute_import -from twisted.conch.manhole import ColoredManhole -from twisted.conch.insults import insults from twisted.conch.telnet import TelnetTransport, TelnetBootstrapProtocol from twisted.internet import protocol def start(port=None, host=None, telnet_vars=None): + from twisted.conch.manhole import ColoredManhole + from twisted.conch.insults import insults from twisted.internet import reactor port = int(port) if port else 6023 diff --git a/arachnado/pipelines/mongoexport.py b/arachnado/pipelines/mongoexport.py index 17b519c..4429e45 100644 --- a/arachnado/pipelines/mongoexport.py +++ b/arachnado/pipelines/mongoexport.py @@ -81,6 +81,14 @@ def __init__(self, crawler): def from_crawler(cls, crawler): return cls(crawler) + @classmethod + def get_spider_urls(cls, spider): + options = getattr(spider.crawler, 'start_options', None) + if options and "domain" in options: + return [options["domain"]] + else: + return spider.start_urls + @tt_coroutine def open_spider(self, spider): try: @@ -94,6 +102,7 @@ def open_spider(self, spider): 'started_at': datetime.datetime.utcnow(), 'status': 'running', 'spider': spider.name, + "urls": self.get_spider_urls(spider), 'options': getattr(spider.crawler, 'start_options', {}), }, upsert=True, new=True) self.job_id = str(job['_id']) diff --git a/arachnado/rpc/__init__.py b/arachnado/rpc/__init__.py index 9975e0b..e1d5289 100644 --- a/arachnado/rpc/__init__.py +++ b/arachnado/rpc/__init__.py @@ -13,12 +13,22 @@ class ArachnadoRPC(object): """ Base class for all Arachnado RPC resources. + Use it as a mixin for tornado.web.RequestHandler subclasses. + + It provides :meth:`handle_request` method which handles + Jobs, Sites and Pages RPC requests. """ + rpc_objects = tuple() + def initialize(self, *args, **kwargs): + jobs = Jobs(self, *args, **kwargs) + sites = Sites(self, *args, **kwargs) + pages = Pages(self, *args, **kwargs) + self.rpc_objects = [jobs, sites, pages] + self.dispatcher = Dispatcher() - self.dispatcher.add_object(Jobs(self, *args, **kwargs)) - self.dispatcher.add_object(Sites(self, *args, **kwargs)) - self.dispatcher.add_object(Pages(self, *args, **kwargs)) + for obj in self.rpc_objects: + self.dispatcher.add_object(obj) def handle_request(self, body): response = JSONRPCResponseManager.handle(body, self.dispatcher) diff --git a/arachnado/rpc/data.py b/arachnado/rpc/data.py new file mode 100644 index 0000000..bd67164 --- /dev/null +++ b/arachnado/rpc/data.py @@ -0,0 +1,368 @@ +import logging +import json +import sys +from collections import deque +from tornado import gen +import tornado.ioloop +from bson.objectid import ObjectId +from bson.errors import InvalidId +from jsonrpc.dispatcher import Dispatcher + +from arachnado.rpc.jobs import Jobs +from arachnado.rpc.sites import Sites +from arachnado.rpc.pages import Pages + +from arachnado.crawler_process import agg_stats_changed, CrawlerProcessSignals as CPS +from arachnado.rpc.ws import RpcWebsocketHandler +from arachnado.utils.misc import json_encode + +logger = logging.getLogger(__name__) + + +class DataRpcWebsocketHandler(RpcWebsocketHandler): + """ basic class for Data API handlers""" + stored_data = None + delay_mode = False + heartbeat_data = None + i_args = None + i_kwargs = None + storages = None + max_msg_size = 2**20 + + def _send_event(self, data): + return super(DataRpcWebsocketHandler, self).write_event(data, max_message_size=self.max_msg_size) + + def init_heartbeat(self, update_delay): + if update_delay > 0 and not self.heartbeat_data: + 
self.delay_mode = True + self.heartbeat_data = tornado.ioloop.PeriodicCallback( + lambda: self.send_updates(), + update_delay + ) + self.heartbeat_data.start() + + def cancel_subscription(self, subscription_id): + storage = self.storages.pop(subscription_id, None) + if storage: + storage.on_close() + return True + else: + return False + + def set_max_message_size(self, max_size): + self.max_msg_size = max_size + return True + + def initialize(self, *args, **kwargs): + self.stored_data = deque() + self.storages = {} + self.i_args = args + self.i_kwargs = kwargs + self.cp = kwargs.get("crawler_process", None) + self.dispatcher = Dispatcher() + self.dispatcher["cancel_subscription"] = self.cancel_subscription + self.dispatcher["set_max_message_size"] = self.set_max_message_size + + def on_close(self): + logger.info("connection closed") + for storage in self.storages.values(): + storage.on_close() + if self.heartbeat_data: + self.heartbeat_data.stop() + super(DataRpcWebsocketHandler, self).on_close() + + def open(self): + logger.info("new connection") + super(DataRpcWebsocketHandler, self).open() + + def send_updates(self): + while len(self.stored_data): + item = self.stored_data.popleft() + self._send_event(item) + + +class JobsDataRpcWebsocketHandler(DataRpcWebsocketHandler): + mongo_id_mapping = None + job_url_mapping = None + stored_jobs_stats = None + + @gen.coroutine + def subscribe_to_jobs(self, include=None, exclude=None, update_delay=0, last_job_id=None): + self.init_heartbeat(update_delay) + stor_id, storage = self.add_storage() + jobs_storage = Jobs(self, *self.i_args, **self.i_kwargs) + jobs_storage.callback_meta = stor_id + jobs_storage.callback = self.on_jobs_tailed + storage.add_jobs_subscription(jobs_storage, include=include, exclude=exclude, last_id=last_job_id) + return {"datatype": "job_subscription_id", + "id": stor_id} + + def add_storage(self): + new_id = str(len(self.storages)) + storage = DataSubscription() + self.storages[new_id] = storage + return new_id, storage + + @gen.coroutine + def write_event(self, data, aggregate=False): + event_data = dict(data) + if 'stats' in event_data: + if not isinstance(event_data['stats'], dict): + try: + event_data['stats'] = json.loads(event_data['stats']) + except Exception as ex: + logger.warning("Invalid stats field in job {}".format(event_data.get("_id", "MISSING MONGO ID"))) + if aggregate and self.delay_mode: + item_id = event_data.get("_id", None) + if item_id: + if item_id in self.stored_jobs_stats: + self.stored_jobs_stats[item_id]["stats"].update(event_data["stats"]) + else: + item = event_data + self.stored_jobs_stats[item_id] = item + else: + logger.warning("Job data without _id field from event {}".format(event)) + else: + return self._send_event(event_data) + + def send_updates(self): + super(JobsDataRpcWebsocketHandler, self).send_updates() + for job_id in set(self.stored_jobs_stats.keys()): + item = self.stored_jobs_stats.pop(job_id, None) + if item: + self._send_event(item) + + def initialize(self, *args, **kwargs): + super(JobsDataRpcWebsocketHandler, self).initialize(*args, **kwargs) + self.dispatcher["subscribe_to_jobs"] = self.subscribe_to_jobs + self.mongo_id_mapping = {} + self.job_url_mapping = {} + self.stored_jobs_stats = {} + + def on_close(self): + logger.debug("connection closed") + if self.cp: + self.cp.signals.disconnect(self.on_stats_changed, agg_stats_changed) + self.cp.signals.disconnect(self.on_spider_closed, CPS.spider_closed) + super(JobsDataRpcWebsocketHandler, self).on_close() + + def 
open(self): + logger.debug("new connection") + super(JobsDataRpcWebsocketHandler, self).open() + if self.cp: + self.cp.signals.connect(self.on_stats_changed, agg_stats_changed) + self.cp.signals.connect(self.on_spider_closed, CPS.spider_closed) + + def on_stats_changed(self, changes, crawler): + job_id = crawler.spider.crawl_id + data = {"stats": changes} + # same as crawl_id + data["id"] = job_id + # mongo id + data["_id"] = self.mongo_id_mapping.get(job_id, "") + # job url + data["urls"] = self.job_url_mapping.get(job_id, "") + allowed = False + for storage in self.storages.values(): + allowed = allowed or job_id in storage.job_ids + if allowed: + self.write_event(data, aggregate=True) + + def on_spider_closed(self, spider): + if self.cp: + for job in self.cp.jobs: + job_id = job["id"] + allowed = False + if job_id: + for storage in self.storages.values(): + allowed = allowed or job_id in storage.job_ids + if allowed: + self.write_event(job) + + def on_jobs_tailed(self, data, callback_meta=None): + if "id" in data and callback_meta: + self.storages[callback_meta].job_ids.add(data["id"]) + self.mongo_id_mapping[data["id"]] = data.get("_id", None) + self.job_url_mapping[data["id"]] = data.get("urls", None) + self.write_event(data) + + +class PagesDataRpcWebsocketHandler(DataRpcWebsocketHandler): + """ pages API""" + + @gen.coroutine + def subscribe_to_pages(self, urls=None, url_groups=None): + result = { + "datatype": "pages_subscription_id", + "single_subscription_id": "", + "id": {}, + } + if urls: + result["single_subscription_id"] = yield self.create_subscribtion_to_urls(urls) + if url_groups: + res = {} + for group_id in url_groups: + res[group_id] = yield self.create_subscribtion_to_urls(url_groups[group_id]) + result["id"] = res + if not urls and not url_groups: + stor_id, storage = self.add_storage() + result["single_subscription_id"] = stor_id + storage.pages.subscribe(query={}) + raise gen.Return(result) + + @gen.coroutine + def create_subscribtion_to_urls(self, urls): + jobs_to_subscribe = [] + stor_id, storage = self.add_storage() + result = stor_id + for url in urls: + last_id = urls[url] + jobs = Jobs(self, *self.i_args, **self.i_kwargs) + jobs.callback_meta = { + "subscription_id":stor_id, + "last_id":last_id + } + jobs.callback = self.job_query_callback + jobs_q = self.create_jobs_query(url) + jobs_ds = yield jobs.storage.fetch(jobs_q) + job_ids =[x["_id"] for x in jobs_ds] + if job_ids: + storage.job_ids.update(job_ids) + pages_query = storage.create_pages_query(job_ids, last_id) + storage.filters.append(pages_query) + storage.jobs.append(jobs) + jobs_to_subscribe.append([jobs_q, jobs]) + else: + logger.info("No jobs found for url {}".format(url)) + storage.subscribe_to_pages() + for jobs_q, jobs in jobs_to_subscribe: + jobs.subscribe(query=jobs_q) + raise gen.Return(result) + + @gen.coroutine + def write_event(self, data, aggregate=False): + if aggregate and self.delay_mode: + self.stored_data.append(data) + else: + return self._send_event(data) + + def initialize(self, *args, **kwargs): + super(PagesDataRpcWebsocketHandler, self).initialize(*args, **kwargs) + self.dispatcher["subscribe_to_pages"] = self.subscribe_to_pages + + @gen.coroutine + def job_query_callback(self, data, callback_meta=None): + if "_id" in data and callback_meta: + storage = self.storages[callback_meta["subscription_id"]] + job_id = data["_id"] + storage.update_pages_subscription(job_id, callback_meta["last_id"]) + else: + logger.warning("Jobs callback with incomplete data") + + def 
on_pages_tailed(self, data, callback_meta=None): + self.write_event(data) + + def create_jobs_query(self, url): + if url: + return {"urls":{'$regex': url }} + else: + return {} + + def add_storage(self): + new_id = str(len(self.storages)) + pages = Pages(self, *self.i_args, **self.i_kwargs) + pages.callback = self.on_pages_tailed + self.storages[new_id] = DataSubscription(pages) + return new_id, self.storages[new_id] + + def cancel_subscription(self, subscription_id): + storage = self.storages.pop(subscription_id, None) + if storage: + storage.on_close() + return True + else: + return False + + +class DataSubscription(object): + + def __init__(self, pages_storage=None): + self.pages = pages_storage + self.jobs = [] + self.job_ids = set([]) + self.filters = [] + + def on_close(self): + for jobs in self.jobs: + jobs._on_close() + if self.pages: + self.pages._on_close() + + def subscribe_to_pages(self, require_filters=True): + if self.filters: + if len(self.filters) == 1: + self.pages.subscribe(query=self.filters[0]) + elif len(self.filters) > 1: + self.pages.subscribe(query={"$or": self.filters}) + elif not require_filters: + self.pages.subscribe(query={}) + else: + logger.warning("No subscription - empty filter list") + + def add_jobs_subscription(self, jobs_storage, include=None, exclude=None, last_id=None): + jobs_query = self.create_jobs_subscription_query(include=include, exclude=exclude, last_id=last_id) + self.jobs.append(jobs_storage) + jobs_storage.subscribe(query=jobs_query) + + def update_pages_subscription(self, job_id, last_id): + if job_id not in self.job_ids: + # stop pages subscription + self.pages.unsubscribe() + # create new pages query + pages_query = self.create_pages_query([job_id], last_id) + self.filters.append(pages_query) + # subscribe to pages + self.subscribe_to_pages() + else: + logger.debug("Already subscribed to job {}".format(job_id)) + + def create_pages_query(self, job_ids=None, last_id=None): + filters = [] + job_conditions_lst = [] + if job_ids: + for job_id in job_ids: + job_conditions_lst.append({"_job_id":{'$eq': str(job_id) }}) + if job_conditions_lst: + if len(job_conditions_lst) > 1: + filters.append({"$or": job_conditions_lst}) + else: + filters.append(job_conditions_lst[0]) + if last_id: + try: + page_id = ObjectId(last_id) + filters.append({"_id":{"$gt":page_id}}) + except InvalidId: + logger.warning("Invalid ObjectID: {}, will use job ids filter only.".format(last_id)) + items_q = {} + if len(filters) == 1: + items_q = filters[0] + elif len(filters) > 1: + items_q = {"$and": filters} + return items_q + + def create_jobs_subscription_query(self, include, exclude, last_id): + conditions = [] + if last_id: + conditions.append({"_id":{"$gt":last_id}}) + if include: + for inc_str in include: + conditions.append({"urls":{'$regex': inc_str }}) + if exclude: + for exc_str in exclude: + conditions.append({"urls":{'$regex': '^((?!' + exc_str + ').)*$'}}) + jobs_q = {} + if len(conditions) == 1: + jobs_q = conditions[0] + elif len(conditions): + jobs_q = {"$and": conditions } + return jobs_q \ No newline at end of file diff --git a/arachnado/rpc/jobs.py b/arachnado/rpc/jobs.py index 8757ee2..e59ee30 100644 --- a/arachnado/rpc/jobs.py +++ b/arachnado/rpc/jobs.py @@ -1,15 +1,23 @@ import logging +from arachnado.storages.mongotail import MongoTailStorage -class Jobs(object): +class Jobs(object): + """ + This object is exposed for RPC requests. + It allows to subscribe for scraping job updates. 
+ """ + callback_meta = None + callback = None logger = logging.getLogger(__name__) def __init__(self, handler, job_storage, **kwargs): self.handler = handler - self.storage = job_storage + self.storage = job_storage # type: MongoTailStorage def subscribe(self, last_id=0, query=None, fields=None): + """ Subscribe for job updates. """ self.storage.subscribe('tailed', self._publish, last_id=last_id, query=query, fields=fields) @@ -17,8 +25,12 @@ def _on_close(self): self.storage.unsubscribe('tailed') def _publish(self, data): - # print("=============================================================") - # print("jobs rpc :") - # print(data) + if self.callback: + _callback = self.callback + else: + _callback = self.handler.write_event if self.storage.tailing: - self.handler.write_event('jobs.tailed', data) + if self.callback_meta: + _callback(data, callback_meta=self.callback_meta) + else: + _callback(data) diff --git a/arachnado/rpc/pages.py b/arachnado/rpc/pages.py index 1f7e891..014031d 100644 --- a/arachnado/rpc/pages.py +++ b/arachnado/rpc/pages.py @@ -2,6 +2,9 @@ class Pages(object): + """ Pages (scraped items) object exposed via JSON RPC """ + handler_id = None + callback = None def __init__(self, handler, item_storage, **kwargs): self.handler = handler @@ -17,6 +20,13 @@ def subscribe(self, last_id=0, query=None, fields=None, fetch_delay=None): def _on_close(self): self.storage.unsubscribe('tailed') + def unsubscribe(self): + self.storage.unsubscribe('tailed') + def _publish(self, data): + if self.callback: + _callback = self.callback + else: + _callback = self.handler.write_event if self.storage.tailing: - self.handler.write_event('pages.tailed', data) + _callback(data) diff --git a/arachnado/rpc/sites.py b/arachnado/rpc/sites.py index a547811..c475cdf 100644 --- a/arachnado/rpc/sites.py +++ b/arachnado/rpc/sites.py @@ -1,13 +1,16 @@ import logging +from functools import partial +from arachnado.storages.mongotail import MongoTailStorage -class Sites(object): +class Sites(object): + """ 'Known sites' object exposed via JSON-RPC """ logger = logging.getLogger(__name__) def __init__(self, handler, site_storage, **kwargs): self.handler = handler - self.storage = site_storage + self.storage = site_storage # type: MongoTailStorage def list(self): return self.storage.fetch() @@ -22,15 +25,14 @@ def delete(self, site): self.storage.delete(site) def subscribe(self): - for subscription in self.storage.available_subscriptions: + for event_name in self.storage.available_events: self.storage.subscribe( - subscription, - lambda data, subscription=subscription: - self._publish(data, subscription) + event_name, + partial(self._publish, event=event_name) ) def _on_close(self): - self.storage.unsubscribe(self.storage.available_subscriptions) + self.storage.unsubscribe(self.storage.available_events) - def _publish(self, data, subscription): - self.handler.write_event('sites.{}'.format(subscription), data) + def _publish(self, event, data): + self.handler.write_event('sites.{}'.format(event), data) diff --git a/arachnado/rpc/stats.py b/arachnado/rpc/stats.py deleted file mode 100644 index 07fa5a5..0000000 --- a/arachnado/rpc/stats.py +++ /dev/null @@ -1,33 +0,0 @@ -import logging - - -class Stats(object): - - logger = logging.getLogger(__name__) - - def __init__(self, handler, stats_storage, **kwargs): - self.handler = handler - self.storage = stats_storage - - def list(self): - return self.storage.cache.values() - - def post(self, site): - self.storage.create(site) - - def patch(self, site): - 
self.storage.update(site) - - def subscribe(self): - for subscription in self.storage.available_subscriptions: - self.storage.subscribe( - subscription, - lambda data, subscription=subscription: - self._publish(data, subscription) - ) - - def _on_close(self): - self.storage.unsubscribe(self.storage.available_subscriptions) - - def _publish(self, data, subscription): - self.handler.write_event('stats.{}'.format(subscription), data) diff --git a/arachnado/rpc/ws.py b/arachnado/rpc/ws.py index 0653058..834edbf 100644 --- a/arachnado/rpc/ws.py +++ b/arachnado/rpc/ws.py @@ -1,3 +1,4 @@ +import sys import json import logging import six @@ -19,49 +20,41 @@ class RpcWebsocketHandler(ArachnadoRPC, websocket.WebSocketHandler): """ def on_message(self, message): - try: - msg = json.loads(message) - event, data = msg['event'], msg['data'] - except (TypeError, ValueError): - logger.warn('Invalid message skipped: {!r}'.format(message[:500])) - return - if event == 'rpc:request': - self.handle_request(json.dumps(data)) - else: - logger.warn('Unsupported event type: {!r}'.format(event)) + self.handle_request(message) def send_data(self, data): - self.write_event('rpc:response', data) + self.write_event(data) @gen.coroutine - def write_event(self, event, data): + def write_event(self, data, max_message_size=0): if isinstance(data, six.string_types): - data = json.loads(data) - message = json_encode({'event': event, 'data': data}) + message = data + else: + message = json_encode(data) try: - self.write_message(message) + if sys.getsizeof(message) < max_message_size or not max_message_size: + self.write_message(message) + else: + logger.info("Message size exceeded. Message wasn't sent.") except websocket.WebSocketClosedError: pass def open(self): """ Forward open event to resource objects. """ - for resource in self._resources(): + logger.debug("Connection opened %s", self) + for resource in self.rpc_objects: if hasattr(resource, '_on_open'): resource._on_open() self._pinger = PeriodicCallback(lambda: self.ping(b'PING'), 1000 * 15) self._pinger.start() + logger.debug("Pinger initiated %s", self) def on_close(self): """ Forward on_close event to resource objects. """ - for resource in self._resources(): + logger.debug("Connection closed %s", self) + for resource in self.rpc_objects: if hasattr(resource, '_on_close'): resource._on_close() self._pinger.stop() - - def _resources(self): - for resource_name, resource in self.__dict__.items(): - if hasattr(RequestHandler, resource_name): - continue - yield resource diff --git a/arachnado/storages/mongo.py b/arachnado/storages/mongo.py index 68eebda..a253365 100644 --- a/arachnado/storages/mongo.py +++ b/arachnado/storages/mongo.py @@ -9,58 +9,64 @@ class MongoStorage(object): - + """ + Utility class for working with MongoDB data. + It supports CRUD operations and allows to subscribe to + created/updated/deleted events. + """ def __init__(self, mongo_uri, cache=False): self.mongo_uri = mongo_uri - self.cache_flag = cache _, _, _, _, self.col = motor_from_uri(mongo_uri) self.signal_manager = SignalManager() # Used for unsubscribe # disconnect() requires reference to original callback - self.subscription_callbacks = {} - if cache: - self.cache = defaultdict(dict) - else: - self.cache = None + self._callbacks = {} self.fetching = False self.signals = { 'created': object(), 'updated': object(), 'deleted': object(), } + # XXX: cache is used in arachnado.cron and arachnado.site_checker. + # Is it needed? 
+ self.cache_flag = cache + if cache: + self.cache = defaultdict(dict) + else: + self.cache = None - def subscribe(self, subscriptions=None, callback=None): - if subscriptions is None: - subscriptions = self.available_subscriptions - if not isinstance(subscriptions, list): - subscriptions = [subscriptions] - for subscription in subscriptions: - try: - self.signal_manager.connect(callback, - self.signals[subscription], - weak=False) - self.subscription_callbacks[subscription] = callback - except KeyError as exc: - raise ValueError('Invalid subscription type: {}'.format(exc)) + def subscribe(self, events=None, callback=None): + if events is None: + events = self.available_events + if not isinstance(events, list): + events = [events] + for event_name in events: + if event_name not in self.signals: + raise ValueError('Invalid event name: {}'.format(event_name)) + self.signal_manager.connect(callback, + self.signals[event_name], + weak=False) + self._callbacks[event_name] = callback - def unsubscribe(self, subscriptions=None): - if subscriptions is None: - subscriptions = self.available_subscriptions - if not isinstance(subscriptions, list): - subscriptions = [subscriptions] - for subscription in subscriptions: + def unsubscribe(self, events=None): + if events is None: + events = self.available_events + if not isinstance(events, list): + events = [events] + for event_name in events: try: self.signal_manager.disconnect( - self.subscription_callbacks[subscription], - self.signals[subscription], + self._callbacks[event_name], + self.signals[event_name], weak=False ) - self.subscription_callbacks.pop(subscription, None) + self._callbacks.pop(event_name, None) except KeyError: + # FIXME: when can it happen? pass @property - def available_subscriptions(self): + def available_events(self): return list(self.signals.keys()) @coroutine @@ -91,6 +97,11 @@ def create(self, doc): self.signal_manager.send_catch_log(self.signals['created'], data=doc) raise Return(result) + @coroutine + def ensure_index(self, key_or_list): + result = yield self.col.ensure_index(key_or_list) + raise Return(result) + @coroutine def update(self, doc): doc = replace_dots(doc) diff --git a/arachnado/storages/mongotail.py b/arachnado/storages/mongotail.py index 61c429d..d30f68d 100644 --- a/arachnado/storages/mongotail.py +++ b/arachnado/storages/mongotail.py @@ -6,6 +6,9 @@ class MongoTailStorage(MongoStorage): + """ + This MongoStorage subclass allows to subscribe to a mongo query. + """ fetch_delay = 0 def __init__(self, mongo_uri, *args, **kwargs): @@ -13,18 +16,24 @@ def __init__(self, mongo_uri, *args, **kwargs): self.tailing = False self.signals['tailed'] = object() - def subscribe(self, subscriptions, callback, last_id=None, query=None, + def subscribe(self, events, callback, last_id=None, query=None, fields=None): - if 'tailed' in subscriptions: + if 'tailed' in events: self.tail(query, fields, last_id) - super(MongoTailStorage, self).subscribe(subscriptions, callback) + super(MongoTailStorage, self).subscribe(events, callback) - def unsubscribe(self, subscriptions): - if 'tailed' in subscriptions: + def unsubscribe(self, events): + if 'tailed' in events: self.untail() + # FIXME: shouldn't it unsubscribe from other events, i.e. call super()? @coroutine def tail(self, query=None, fields=None, last_object_id=None): + """ + Execute ``query`` periodically, fetching new results. + ``self.signals['tailed']`` signal with each result is sent + when a new document appears. 
+ """ if self.tailing: raise RuntimeError('This storage is already tailing') self.tailing = True diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..e8c61ee --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,225 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = _build + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . + +.PHONY: help +help: + @echo "Please use \`make ' where is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " applehelp to make an Apple Help Book" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " epub3 to make an epub3" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " xml to make Docutils-native XML files" + @echo " pseudoxml to make pseudoxml-XML files for display purposes" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + @echo " coverage to run coverage check of the documentation (if enabled)" + @echo " dummy to check syntax errors of document sources" + +.PHONY: clean +clean: + rm -rf $(BUILDDIR)/* + +.PHONY: html +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +.PHONY: dirhtml +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +.PHONY: singlehtml +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +.PHONY: pickle +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +.PHONY: json +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +.PHONY: htmlhelp +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." 
+ +.PHONY: qthelp +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/Arachnado.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/Arachnado.qhc" + +.PHONY: applehelp +applehelp: + $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp + @echo + @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." + @echo "N.B. You won't be able to view it unless you put it in" \ + "~/Library/Documentation/Help or install it in your application" \ + "bundle." + +.PHONY: devhelp +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." + @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/Arachnado" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/Arachnado" + @echo "# devhelp" + +.PHONY: epub +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +.PHONY: epub3 +epub3: + $(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3 + @echo + @echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3." + +.PHONY: latex +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +.PHONY: latexpdf +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +.PHONY: latexpdfja +latexpdfja: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through platex and dvipdfmx..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +.PHONY: text +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +.PHONY: man +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +.PHONY: texinfo +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +.PHONY: info +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +.PHONY: gettext +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." + +.PHONY: changes +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." 
+ +.PHONY: linkcheck +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +.PHONY: doctest +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." + +.PHONY: coverage +coverage: + $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage + @echo "Testing of coverage in the sources finished, look at the " \ + "results in $(BUILDDIR)/coverage/python.txt." + +.PHONY: xml +xml: + $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml + @echo + @echo "Build finished. The XML files are in $(BUILDDIR)/xml." + +.PHONY: pseudoxml +pseudoxml: + $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml + @echo + @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." + +.PHONY: dummy +dummy: + $(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy + @echo + @echo "Build finished. Dummy builder generates no files." diff --git a/docs/_static/img/arachnado-0.png b/docs/_static/img/arachnado-0.png new file mode 100644 index 0000000..0a87676 Binary files /dev/null and b/docs/_static/img/arachnado-0.png differ diff --git a/docs/_static/img/arachnado-1.png b/docs/_static/img/arachnado-1.png new file mode 100644 index 0000000..212eb5b Binary files /dev/null and b/docs/_static/img/arachnado-1.png differ diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..6d3c3f0 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,338 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# Arachnado documentation build configuration file, created by +# sphinx-quickstart on Fri Jul 1 16:48:37 2016. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.viewcode', +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The encoding of source files. +# +# source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = 'Arachnado' +copyright = '2016, TeamHG' +author = 'TeamHG' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. 
+# +# The short X.Y version. +version = '0.2' +# The full version, including alpha/beta/rc tags. +release = '0.2' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +# +# today = '' +# +# Else, today_fmt is used as the format for a strftime call. +# +# today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This patterns also effect to html_static_path and html_extra_path +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +# +# default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +# +# add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +# +# add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +# +# show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +# modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +# keep_warnings = False + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +import sphinx_rtd_theme +html_theme = "sphinx_rtd_theme" +html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +# html_theme_path = [] + +# The name for this set of Sphinx documents. +# " v documentation" by default. +# +# html_title = 'Arachnado v0.2' + +# A shorter title for the navigation bar. Default is the same as html_title. +# +# html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +# +# html_logo = None + +# The name of an image file (relative to this directory) to use as a favicon of +# the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +# +# html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Add any extra paths that contain custom files (such as robots.txt or +# .htaccess) here, relative to this directory. These files are copied +# directly to the root of the documentation. 
+# +# html_extra_path = [] + +# If not None, a 'Last updated on:' timestamp is inserted at every page +# bottom, using the given strftime format. +# The empty string is equivalent to '%b %d, %Y'. +# +# html_last_updated_fmt = None + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +# +# html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +# +# html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +# +# html_additional_pages = {} + +# If false, no module index is generated. +# +# html_domain_indices = True + +# If false, no index is generated. +# +# html_use_index = True + +# If true, the index is split into individual pages for each letter. +# +# html_split_index = False + +# If true, links to the reST sources are added to the pages. +# +# html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +# +# html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +# +# html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +# +# html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +# html_file_suffix = None + +# Language to be used for generating the HTML full-text search index. +# Sphinx supports the following languages: +# 'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja' +# 'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr', 'zh' +# +# html_search_language = 'en' + +# A dictionary with options for the search language support, empty by default. +# 'ja' uses this config value. +# 'zh' user can custom change `jieba` dictionary path. +# +# html_search_options = {'type': 'default'} + +# The name of a javascript file (relative to the configuration directory) that +# implements a search results scorer. If empty, the default will be used. +# +# html_search_scorer = 'scorer.js' + +# Output file base name for HTML help builder. +htmlhelp_basename = 'Arachnadodoc' + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'Arachnado.tex', 'Arachnado Documentation', + 'TeamHG', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +# +# latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +# +# latex_use_parts = False + +# If true, show page references after internal links. +# +# latex_show_pagerefs = False + +# If true, show URL addresses after external links. +# +# latex_show_urls = False + +# Documents to append as an appendix to all manuals. +# +# latex_appendices = [] + +# If false, no module index is generated. 
+#
+# latex_domain_indices = True
+
+
+# -- Options for manual page output ---------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    (master_doc, 'arachnado', 'Arachnado Documentation',
+     [author], 1)
+]
+
+# If true, show URL addresses after external links.
+#
+# man_show_urls = False
+
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+    (master_doc, 'Arachnado', 'Arachnado Documentation',
+     author, 'Arachnado', 'One line description of project.',
+     'Miscellaneous'),
+]
+
+# Documents to append as an appendix to all manuals.
+#
+# texinfo_appendices = []
+
+# If false, no module index is generated.
+#
+# texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#
+# texinfo_show_urls = 'footnote'
+
+# If true, do not generate a @detailmenu in the "Top" node's menu.
+#
+# texinfo_no_detailmenu = False
diff --git a/docs/config.rst b/docs/config.rst
new file mode 100644
index 0000000..65d3608
--- /dev/null
+++ b/docs/config.rst
@@ -0,0 +1,20 @@
+.. _config:
+
+Configuration
+=============
+
+Arachnado can be configured using a config file. Put it in one of the common
+locations:
+
+* `/etc/arachnado.conf`
+* `~/.config/arachnado.conf`
+* `~/.arachnado.conf`
+
+or pass the file name as an argument when starting the server::
+
+    arachnado --config ./my-config.conf
+
+Available options and their default values:
+
+.. literalinclude::
+    ../arachnado/config/defaults.conf
diff --git a/docs/http-api.rst b/docs/http-api.rst
new file mode 100644
index 0000000..b7d78c6
--- /dev/null
+++ b/docs/http-api.rst
@@ -0,0 +1,62 @@
+HTTP API
+========
+
+Arachnado provides an HTTP API for starting/stopping crawls.
+
+To use the HTTP API, send a POST request with a
+``Content-Type: application/json`` header; parameters should be in the
+JSON-encoded POST body.
+
+/crawler/start
+--------------
+
+Start a crawling job. Parameters::
+
+    {
+        "domain": "",
+        "args": {},
+        "settings": {}
+    }
+
+If the job is started successfully, the endpoint returns
+a ``{"status": "ok", "job_id": ""}`` object with the ID of the started job.
+
+In case of errors ``{"status": "error"}`` is returned.
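+
+For example, a start request could be sent with ``curl`` like this (the
+host, port and domain values below are only an illustration)::
+
+    curl -X POST http://127.0.0.1:8888/crawler/start \
+         -H 'Content-Type: application/json' \
+         -d '{"domain": "example.com", "args": {}, "settings": {}}'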
+
+/crawler/stop
+-------------
+
+Stop a job. Parameters::
+
+    {"job_id": ""}
+
+If the job is stopped successfully, the endpoint returns
+``{"status": "ok"}``, otherwise it returns ``{"status": "error"}``.
+
+
+/crawler/pause
+--------------
+
+Pause a job. Parameters::
+
+    {"job_id": ""}
+
+If the job is paused successfully, the endpoint returns
+``{"status": "ok"}``, otherwise it returns ``{"status": "error"}``.
+
+
+/crawler/resume
+---------------
+
+Resume a paused job. Parameters::
+
+    {"job_id": ""}
+
+If the job is resumed successfully, the endpoint returns
+``{"status": "ok"}``, otherwise it returns ``{"status": "error"}``.
+
+
+/crawler/status
+---------------
+
+TODO
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 0000000..12070a9
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,37 @@
+Arachnado
+=========
+
+Arachnado is a tool to crawl a specific website.
+It provides a Tornado_-based HTTP API and a web UI for a
+Scrapy_-based crawler.
+
+License is MIT.
+
+.. _Tornado: http://www.tornadoweb.org
+.. _Scrapy: http://scrapy.org/
+
+.. toctree::
+   :maxdepth: 2
+
+   intro
+   config
+   http-api
+   json-rpc-api
+
+Screenshots
+-----------
+
+.. image::
+    _static/img/arachnado-0.png
+
+
+.. image::
+    _static/img/arachnado-1.png
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
+
diff --git a/docs/intro.rst b/docs/intro.rst
new file mode 100644
index 0000000..a8534c3
--- /dev/null
+++ b/docs/intro.rst
@@ -0,0 +1,22 @@
+Getting Started
+===============
+
+Install
+-------
+
+Arachnado requires Python 2.7 or Python 3.5.
+To install Arachnado use pip::
+
+    pip install arachnado
+
+Run
+---
+
+To start Arachnado, execute the ``arachnado`` command::
+
+    arachnado
+
+and then visit http://0.0.0.0:8888.
+
+Run ``arachnado --help`` to see available command-line options.
+See also: :ref:`config`.
diff --git a/docs/json-rpc-api.rst b/docs/json-rpc-api.rst
new file mode 100644
index 0000000..e642ff8
--- /dev/null
+++ b/docs/json-rpc-api.rst
@@ -0,0 +1,227 @@
+JSON RPC API
+============
+
+Arachnado provides a JSON-RPC_ API for working with jobs and crawled items
+(pages). The API works over WebSocket transport.
+
+JSON-RPC requests have the following format::
+
+    {
+        "jsonrpc": "2.0",
+
+        # pass a unique request id here; this id will be included in the response
+        "id": 362810,
+
+        # command to execute
+        "method": "",
+        "params": {"name": "value"},
+    }
+
+JSON-RPC responses::
+
+    {
+        "jsonrpc": "2.0",
+
+        # id of the request
+        "id": 362810,
+
+        # what the command returns
+        "result": ...
+    }
+
+Working with jobs and pages
+---------------------------
+
+The JSON-RPC API allows you to
+
+* get information about scraping jobs;
+* start new crawls;
+* subscribe to crawled pages;
+* subscribe to job updates.
+
+jobs.subscribe
+    Get information about jobs and subscribe to new jobs.
+
+    Parameters:
+
+    * last_id - optional, ObjectID value of the last previously seen job;
+      when passed, only new job data is returned;
+    * query - optional, MongoDB query;
+    * fields - optional, set of fields to return.
+
+pages.subscribe
+    Get crawled pages and subscribe to new pages.
+
+    Parameters:
+
+    * last_id - optional, ObjectID value of the last previously seen page.
+      When passed, only new page data is returned;
+    * query - optional, MongoDB query;
+    * fields - optional, set of fields to return.
+
+
+New API
+=======
+
+Working with jobs
+-----------------
+
+Open a websocket connection to ``/ws-jobs-data`` in order to use
+the jobs JSON-RPC API for scraping jobs.
+
+subscribe_to_jobs
+    Get information about jobs and subscribe to new jobs.
+    Parameters:
+
+    * include - an array of regexes which should match URLs to include;
+    * exclude - an array of regexes; URLs matched by these regexes are excluded
+      from the result;
+    * update_delay - optional int, a minimum number of ms between websocket messages. If this parameter is set, Arachnado aggregates job statistics;
+    * last_job_id - optional, ObjectID value of the last previously seen job.
+      When passed, only new job data is returned.
+
+
+    The response contains the subscription ID in the ``['result']['id']`` field::
+
+        {
+            'id': '',
+            'jsonrpc': '2.0',
+            'result': {'datatype': 'job_subscription_id', 'id': '0'}
+        }
+
+    Use this ID to cancel the subscription.
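+
+    For reference, a complete ``subscribe_to_jobs`` request in the request
+    format shown above might look like this (the parameter values below are
+    only an illustration)::
+
+        {
+            "jsonrpc": "2.0",
+            "id": 362810,
+            "method": "subscribe_to_jobs",
+            "params": {
+                "include": ["example.com"],
+                "exclude": [],
+                "update_delay": 1000
+            }
+        }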
+
+    After the subscription is created, Arachnado starts sending information
+    about new jobs. Messages look like this::
+
+        {
+            '_id': '574718bba7a4edb9b026f248',
+            'finished_at': '2016-05-26 16:03:17',
+            'id': '97ca610fa8c347dbafeca9fcd02213dd',
+            'options': {
+                'args': {},
+                'crawl_id': '97ca610fa8c347dbafeca9fcd02213dd',
+                'domain': 'scrapy.org',
+                'settings': {}
+            },
+            'spider': 'generic',
+            'started_at': '2016-05-26 16:03:16',
+            'stats': {...},
+            'status': 'finished'
+        }
+
+cancel_subscription
+    Stop receiving updates about jobs. Parameters:
+
+    * subscription_id
+
+
+set_max_message_size
+    Set the maximum message size in bytes for the websocket channel.
+    Messages larger than the specified limit are dropped.
+    The default value is 2**20.
+    To disable this check, set the max size to zero.
+    Parameters:
+
+    * max_size - maximum message size in bytes.
+
+    The response contains the result (true/false) in the ``result`` field::
+
+        {
+            "id": '',
+            "result": true,
+            "jsonrpc": "2.0"
+        }
+
+
+Working with pages (crawled items)
+----------------------------------
+
+Open a websocket connection to ``/ws-pages-data`` in order to use
+the pages JSON-RPC API for crawled pages.
+
+subscribe_to_pages
+    Get crawled pages (items) for specific urls.
+    URL values are used as regexes, without any modification on the Arachnado side.
+    It allows you to get all pages, or only pages crawled since the last update.
+    The search uses job start urls, not page urls.
+    For example, if a job was started for www.mysite.com and then went to www.example.com (by a redirect, etc.),
+    all of its pages will be returned by a www.mysite.com search query.
+    To search pages by their own urls, use the pages.subscribe method described above.
+    To get only new pages, set the last seen page id (from the "id" field of the page record) for a url.
+    To get all pages, set the page id to None.
+
+    Parameters:
+
+    * urls - a dictionary that maps a url to the last seen page id for that url (or to None). Arachnado will create one subscription id for all these urls;
+    * url_groups - a dictionary that maps a group id to such a url dictionary. Arachnado will create one subscription id for each url group.
+
+    Command example::
+
+        {
+            'id': '',
+            'jsonrpc': '2.0',
+            'method': 'subscribe_to_pages',
+            'params': {
+                'urls': {'http://example.com': None},
+                'url_groups': {
+                    'gr1': {'http://example1.com': None},
+                    'gr2': {'http://example2.com': "57863974a8cb9c15e8f3d53a"}}
+            }
+        }
+
+    Response example for the above command::
+
+        {
+            "result": {
+                "datatype": "pages_subscription_id",
+                "single_subscription_id": "112",  # subscription id for http://example.com subscription
+                "id": {
+                    "gr1": "113",  # subscription id for http://example1.com subscription
+                    "gr2": "114",  # subscription id for http://example2.com subscription
+                }
+            },
+            "id": '',  # command request id
+            "jsonrpc": "2.0"
+        }
+
+    Use these IDs to cancel subscriptions.
+
+    After the subscription is created, Arachnado starts sending information
+    about crawled pages. Messages look like this::
+
+        {
+            "status": 200,
+            "items": [],
+            "_id": "57863974a8cb9c15e8f3d53a",
+            "url": "http://example.com/index.php",
+            "headers": {},
+            "_type": "page",
+            "body": ""
+        }
+
+
+cancel_subscription
+    Stop receiving updates. Parameters:
+
+    * subscription_id
+
+set_max_message_size
+    Set the maximum message size in bytes for the websocket channel.
+    Messages larger than the specified limit are dropped.
+    The default value is 2**20.
+    To disable this check, set the max size to zero.
+    Parameters:
+
+    * max_size - maximum message size in bytes.
+
+    The response contains the result (true/false) in the ``result`` field::
+
+        {
+            "id": '',
+            "result": true,
+            "jsonrpc": "2.0"
+        }
+
+
+.. 
_JSON-RPC: http://www.jsonrpc.org/specification diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..03d149a --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,281 @@ +@ECHO OFF + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set BUILDDIR=_build +set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . +set I18NSPHINXOPTS=%SPHINXOPTS% . +if NOT "%PAPER%" == "" ( + set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% + set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% +) + +if "%1" == "" goto help + +if "%1" == "help" ( + :help + echo.Please use `make ^` where ^ is one of + echo. html to make standalone HTML files + echo. dirhtml to make HTML files named index.html in directories + echo. singlehtml to make a single large HTML file + echo. pickle to make pickle files + echo. json to make JSON files + echo. htmlhelp to make HTML files and a HTML help project + echo. qthelp to make HTML files and a qthelp project + echo. devhelp to make HTML files and a Devhelp project + echo. epub to make an epub + echo. epub3 to make an epub3 + echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter + echo. text to make text files + echo. man to make manual pages + echo. texinfo to make Texinfo files + echo. gettext to make PO message catalogs + echo. changes to make an overview over all changed/added/deprecated items + echo. xml to make Docutils-native XML files + echo. pseudoxml to make pseudoxml-XML files for display purposes + echo. linkcheck to check all external links for integrity + echo. doctest to run all doctests embedded in the documentation if enabled + echo. coverage to run coverage check of the documentation if enabled + echo. dummy to check syntax errors of document sources + goto end +) + +if "%1" == "clean" ( + for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i + del /q /s %BUILDDIR%\* + goto end +) + + +REM Check if sphinx-build is available and fallback to Python version if any +%SPHINXBUILD% 1>NUL 2>NUL +if errorlevel 9009 goto sphinx_python +goto sphinx_ok + +:sphinx_python + +set SPHINXBUILD=python -m sphinx.__init__ +%SPHINXBUILD% 2> nul +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +:sphinx_ok + + +if "%1" == "html" ( + %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/html. + goto end +) + +if "%1" == "dirhtml" ( + %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. + goto end +) + +if "%1" == "singlehtml" ( + %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. + goto end +) + +if "%1" == "pickle" ( + %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the pickle files. + goto end +) + +if "%1" == "json" ( + %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json + if errorlevel 1 exit /b 1 + echo. 
+ echo.Build finished; now you can process the JSON files. + goto end +) + +if "%1" == "htmlhelp" ( + %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run HTML Help Workshop with the ^ +.hhp project file in %BUILDDIR%/htmlhelp. + goto end +) + +if "%1" == "qthelp" ( + %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run "qcollectiongenerator" with the ^ +.qhcp project file in %BUILDDIR%/qthelp, like this: + echo.^> qcollectiongenerator %BUILDDIR%\qthelp\Arachnado.qhcp + echo.To view the help file: + echo.^> assistant -collectionFile %BUILDDIR%\qthelp\Arachnado.ghc + goto end +) + +if "%1" == "devhelp" ( + %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. + goto end +) + +if "%1" == "epub" ( + %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The epub file is in %BUILDDIR%/epub. + goto end +) + +if "%1" == "epub3" ( + %SPHINXBUILD% -b epub3 %ALLSPHINXOPTS% %BUILDDIR%/epub3 + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The epub3 file is in %BUILDDIR%/epub3. + goto end +) + +if "%1" == "latex" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdf" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf + cd %~dp0 + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdfja" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf-ja + cd %~dp0 + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "text" ( + %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The text files are in %BUILDDIR%/text. + goto end +) + +if "%1" == "man" ( + %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The manual pages are in %BUILDDIR%/man. + goto end +) + +if "%1" == "texinfo" ( + %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. + goto end +) + +if "%1" == "gettext" ( + %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The message catalogs are in %BUILDDIR%/locale. + goto end +) + +if "%1" == "changes" ( + %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes + if errorlevel 1 exit /b 1 + echo. + echo.The overview file is in %BUILDDIR%/changes. + goto end +) + +if "%1" == "linkcheck" ( + %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck + if errorlevel 1 exit /b 1 + echo. + echo.Link check complete; look for any errors in the above output ^ +or in %BUILDDIR%/linkcheck/output.txt. + goto end +) + +if "%1" == "doctest" ( + %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest + if errorlevel 1 exit /b 1 + echo. + echo.Testing of doctests in the sources finished, look at the ^ +results in %BUILDDIR%/doctest/output.txt. + goto end +) + +if "%1" == "coverage" ( + %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage + if errorlevel 1 exit /b 1 + echo. 
+ echo.Testing of coverage in the sources finished, look at the ^ +results in %BUILDDIR%/coverage/python.txt. + goto end +) + +if "%1" == "xml" ( + %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The XML files are in %BUILDDIR%/xml. + goto end +) + +if "%1" == "pseudoxml" ( + %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. + goto end +) + +if "%1" == "dummy" ( + %SPHINXBUILD% -b dummy %ALLSPHINXOPTS% %BUILDDIR%/dummy + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. Dummy builder generates no files. + goto end +) + +:end diff --git a/requirements.txt b/requirements.txt index 8b5c728..5b795c2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,6 +7,6 @@ pymongo==2.8 # required for motor 0.6.2 docopt == 0.6.2 service_identity json-rpc==1.10.3 -croniter == 0.3.8 +croniter == 0.3.12 autopager == 0.2 autologin-middleware == 0.1.1 diff --git a/arachnado/storages/memory.py b/tests/__init__.py similarity index 100% rename from arachnado/storages/memory.py rename to tests/__init__.py diff --git a/tests/items.jl b/tests/items.jl new file mode 100644 index 0000000..c39786c --- /dev/null +++ b/tests/items.jl @@ -0,0 +1,2 @@ +{"_job_id": "5749d89da8cb9c1f286e3a90","status" : 200, "body" : "", "_type" : "page", "url" : "http://mysite.com/index.php", "items" : [ ], "headers" : { "Cache-Control" : [ "private, no-cache=\"set-cookie\"" ], "X-Powered-By" : [ "PHP/5.5.9-1ubuntu4.14" ], "Date" : [ "Sat, 28 May 2016 17:43:05 GMT" ], "Content-Type" : [ "text/html; charset=UTF-8" ], "Expires" : [ "Sat, 28 May 2016 17:43:05 GMT" ], "Vary" : [ "Accept-Encoding" ], "Server" : [ "Apache/2.4.7 (Ubuntu)" ] }, "meta" : { "download_timeout" : 180, "depth" : 2}} +{"_job_id": "5749d89da8cb9c1f286e3fff","status" : 200, "body" : "", "_type" : "page", "url" : "http://mysite.com", "items" : [ ], "headers" : { "Cache-Control" : [ "private, no-cache=\"set-cookie\"" ], "X-Powered-By" : [ "PHP/5.5.9-1ubuntu4.14" ], "Date" : [ "Sat, 28 May 2016 17:43:05 GMT" ], "Content-Type" : [ "text/html; charset=UTF-8" ], "Expires" : [ "Sat, 28 May 2016 17:43:05 GMT" ], "Vary" : [ "Accept-Encoding" ], "Server" : [ "Apache/2.4.7 (Ubuntu)" ] }, "meta" : { "download_timeout" : 180, "depth" : 2}} \ No newline at end of file diff --git a/tests/jobs.jl b/tests/jobs.jl new file mode 100644 index 0000000..1338406 --- /dev/null +++ b/tests/jobs.jl @@ -0,0 +1,2 @@ +{"_id": "5749d45fa8cb9c1df532a98a", "started_at": "2016-05-28 17:42:53", "urls": "http://127.0.0.1/", "spider": "generic", "status": "finished", "stats": "{\"memusage/startup\": 42012672, \"arachnado/domain\": \"127.0.0.1\", \"log_count/INFO\": 11, \"downloader/response_count\": 58, \"downloader/response_bytes\": 220480, \"finish_reason\": \"finished\", \"arachnado/start_url\": \"http://127.0.0.1/\", \"mongo_export/items_stored_count\": 58, \"scheduler/dequeued\": 58, \"request_depth_max\": 4, \"start_time\": \"2016-05-28 17:42:53\", \"request_depth_count/2\": 351, \"scheduler/enqueued/disk\": 58, \"downloader/request_bytes\": 27767, \"downloader/request_method_count/GET\": 58, \"memusage/max\": 42012672, \"dupefilter/filtered\": 1054, \"scheduler/dequeued/disk\": 58, \"downloader/response_status_count/200\": 58, \"response_received_count\": 58, \"finish_time\": \"2016-05-28 17:43:19\", \"item_scraped_count\": 58, \"scheduler/enqueued\": 58, \"request_depth_count/1\": 19, 
\"request_depth_count/0\": 1, \"request_depth_count/3\": 629, \"downloader/request_count\": 58, \"request_depth_count/4\": 112}", "id": "99f123a74e814fd09a9954265595eac0", "options": {"args": {}, "settings": {}, "crawl_id": "99f123a74e814fd09a9954265595eac0", "domain": "http://127.0.0.1/"}, "finished_at": "2016-05-28 17:43:19"} +{"_id": "5749d89da8cb9c1f286e3a90", "started_at": "2016-05-30 12:29:18", "stats_dict": {"downloader/request_method_count/GET": 107, "log_count/INFO": 18, "mongo_export/items_stored_count": 103, "request_depth_count/2": 9718, "scheduler/enqueued": 3711, "scheduler/dequeued": 107, "downloader/request_count": 107, "memusage/max": 97583104, "start_time": "2016-05-30 12:29:18", "downloader/request_bytes": 43053, "dupefilter/filtered": 7451, "scheduler/initial": 0, "downloader/response_count": 107, "arachnado/start_url": "http://example.com/", "scheduler/enqueued/disk": 3711, "request_depth_count/1": 86, "downloader/response_status_count/200": 103, "response_received_count": 103, "request_depth_count/3": 1353, "request_depth_count/0": 1, "memusage/startup": 41951232, "finish_time": "2016-05-30 12:30:27", "item_scraped_count": 103, "scheduler/remaining": 3604, "request_depth_max": 3, "scheduler/dequeued/disk": 107, "finish_reason": "stopped", "log_count/WARNING": 3, "downloader/response_bytes": 3336279, "downloader/response_status_count/301": 3, "downloader/response_status_count/302": 1}, "urls": "http://example.com/", "spider": "generic", "status": "finished", "stats": "{\"memusage/startup\": 41951232, \"scheduler/remaining\": 3604, \"scheduler/initial\": 0, \"log_count/INFO\": 18, \"downloader/response_count\": 107, \"downloader/response_bytes\": 3336279, \"finish_reason\": \"stopped\", \"arachnado/start_url\": \"http://example.com/\", \"mongo_export/items_stored_count\": 103, \"scheduler/dequeued\": 107, \"log_count/WARNING\": 3, \"request_depth_max\": 3, \"start_time\": \"2016-05-30 12:29:18\", \"request_depth_count/2\": 9718, \"scheduler/enqueued/disk\": 3711, \"downloader/request_bytes\": 43053, \"downloader/request_method_count/GET\": 107, \"memusage/max\": 97583104, \"dupefilter/filtered\": 7451, \"scheduler/dequeued/disk\": 107, \"downloader/response_status_count/200\": 103, \"response_received_count\": 103, \"finish_time\": \"2016-05-30 12:30:27\", \"item_scraped_count\": 103, \"scheduler/enqueued\": 3711, \"request_depth_count/1\": 86, \"request_depth_count/0\": 1, \"request_depth_count/3\": 1353, \"downloader/request_count\": 107, \"downloader/response_status_count/301\": 3, \"downloader/response_status_count/302\": 1}", "id": "6c8be393f12a442ca2ec94ec3f89b72e", "options": {"args": {}, "settings": {}, "domain": "http://example.com/"}, "finished_at": "2016-05-30 12:30:27"} \ No newline at end of file diff --git a/tests/test_data.py b/tests/test_data.py new file mode 100644 index 0000000..d60408f --- /dev/null +++ b/tests/test_data.py @@ -0,0 +1,168 @@ +# -*- coding: utf-8 -*- +import tornado +import json +from tornado import web, websocket +import tornado.testing +from tornado.ioloop import TimeoutError + +import tests.utils as u + + +class TestDataAPI(tornado.testing.AsyncHTTPTestCase): + pages_uri = r"/ws-pages-data" + jobs_uri = r"/ws-jobs-data" + + @classmethod + def setUpClass(cls): + u.init_db() + + @classmethod + def tearDownClass(cls): + u.clear_db() + + def get_app(self): + return u.get_app(self.pages_uri, self.jobs_uri) + + @tornado.testing.gen_test + def test_set_message_size(self): + test_command = 
self.get_command("test_set_0",'set_max_message_size', {"max_size":10000}) + ws_url = "ws://localhost:" + str(self.get_http_port()) + self.jobs_uri + ws_client = yield tornado.websocket.websocket_connect(ws_url) + ws_client.write_message(json.dumps(test_command)) + response = yield ws_client.read_message() + json_response = json.loads(response) + res = json_response.get("result", False) + self.assertTrue(res) + + @tornado.testing.gen_test + def test_jobs_no_filter(self): + jobs_command = self.get_command("test_jobs_0",'subscribe_to_jobs', {}) + yield self.execute_jobs_command(jobs_command, wait_result=True) + + @tornado.testing.gen_test + def test_jobs_filter_include(self): + jobs_command = self.get_command("test_jobs_1",'subscribe_to_jobs', {"include":["127.0.0.1"],}) + yield self.execute_jobs_command(jobs_command, wait_result=True) + + @tornado.gen.coroutine + def execute_jobs_command(self, jobs_command, wait_result=True): + ws_url = "ws://localhost:" + str(self.get_http_port()) + self.jobs_uri + ws_client = yield tornado.websocket.websocket_connect(ws_url) + ws_client.write_message(json.dumps(jobs_command)) + response = yield ws_client.read_message() + json_response = json.loads(response) + subs_id = json_response.get("result").get("id", -1) + self.assertNotEqual(subs_id, -1) + if wait_result: + response = yield ws_client.read_message() + json_response = json.loads(response) + if json_response is None: + self.fail("incorrect response") + else: + self.assertTrue('stats' in json_response) + self.assertTrue(isinstance(json_response["stats"], dict)) + yield self.execute_cancel(ws_client, subs_id, True) + + def test_jobs_filter_include_not_exists(self): + @tornado.gen.coroutine + def f(): + jobs_command = self.get_command("test_jobs_2",'subscribe_to_jobs', {"include":["notexists.com"],}) + yield self.execute_jobs_command(jobs_command, wait_result=True) + self.assertRaises(TimeoutError, self.io_loop.run_sync, f, timeout=3) + + @tornado.testing.gen_test + def test_pages_filter_url_groups(self): + url_value = 'http://example.com' + pages_command = self.get_command("test_pages_0",'subscribe_to_pages', {'url_groups': {1: {url_value: None}}}) + yield self.execute_pages_command(pages_command, wait_result=True, required_url=url_value) + + def test_pages_no_result(self): + @tornado.gen.coroutine + def f(): + url_value = 'http://mysite.com' + pages_command = self.get_command("test_pages_3",'subscribe_to_pages', {'url_groups': {1: {url_value: None}}}) + yield self.execute_pages_command(pages_command, + wait_result=True, + required_url=url_value, + max_count=0) + self.assertRaises(TimeoutError, self.io_loop.run_sync, f, timeout=3) + + def test_pages_exact_count(self): + @tornado.gen.coroutine + def f(): + url_value = 'http://example.com' + pages_command = self.get_command("test_pages_4",'subscribe_to_pages', {'url_groups': {1: {url_value: None}}}) + yield self.execute_pages_command(pages_command, + wait_result=True, + required_url=url_value, + max_count=1) + self.assertRaises(TimeoutError, self.io_loop.run_sync, f, timeout=3) + + @tornado.testing.gen_test + def test_pages_no_filter(self): + pages_command = self.get_command("test_pages_1",'subscribe_to_pages', {}) + yield self.execute_pages_command(pages_command, wait_result=True) + + @tornado.testing.gen_test + def test_pages_filter_urls(self): + url_value = 'http://example.com' + pages_command = self.get_command("test_pages_2",'subscribe_to_pages', {'urls': {url_value: None}}) + yield self.execute_pages_command(pages_command, wait_result=True, 
required_url=url_value) + + def get_command(self, id, method, params): + return { + 'id': id, + 'jsonrpc': '2.0', + 'method': method, + 'params': params + } + + @tornado.gen.coroutine + def execute_pages_command(self, pages_command, wait_result=False, required_url=None, max_count=None): + ws_url = "ws://localhost:" + str(self.get_http_port()) + self.pages_uri + ws_client = yield tornado.websocket.websocket_connect(ws_url) + ws_client.write_message(json.dumps(pages_command)) + response = yield ws_client.read_message() + json_response = json.loads(response) + subs_id = json_response.get("result").get("single_subscription_id", -1) + if not subs_id: + group_sub_ids = json_response.get("result").get("id", {}) + for group_id in group_sub_ids.keys(): + if group_sub_ids[group_id] != -1: + subs_id = group_sub_ids[group_id] + self.assertNotEqual(subs_id, -1) + if wait_result: + if max_count is None: + response = yield ws_client.read_message() + json_response = json.loads(response) + if json_response is None: + self.fail("incorrect response") + else: + cnt = 0 + while True: + response = yield ws_client.read_message() + json_response = json.loads(response) + if json_response is None: + self.fail("incorrect response") + cnt += 1 + if cnt > max_count: + self.fail("max count of pages exceeded") + yield self.execute_cancel(ws_client, subs_id, True) + + @tornado.testing.gen_test + def test_wrong_cancel(self): + ws_url = "ws://localhost:" + str(self.get_http_port()) + self.pages_uri + ws_client = yield tornado.websocket.websocket_connect(ws_url) + yield self.execute_cancel(ws_client, -1, False) + + @tornado.gen.coroutine + def execute_cancel(self, ws_client, subscription_id, expected): + cmd_id = "test_cancel" + cancel_command = self.get_command(cmd_id,'cancel_subscription', {"subscription_id": subscription_id}) + ws_client.write_message(json.dumps(cancel_command)) + while True: + response = yield ws_client.read_message() + json_response = json.loads(response) + if json_response.get("id", None) == cmd_id: + self.assertEqual(json_response.get("result"), expected) + break diff --git a/tests/utils.py b/tests/utils.py new file mode 100644 index 0000000..578c6b5 --- /dev/null +++ b/tests/utils.py @@ -0,0 +1,60 @@ +# -*- coding: utf-8 -*- +import os +import tornado +import json +from pymongo import MongoClient +from pymongo.errors import DuplicateKeyError + +from arachnado.rpc.data import PagesDataRpcWebsocketHandler, JobsDataRpcWebsocketHandler +from arachnado.storages.mongotail import MongoTailStorage +from arachnado.utils.mongo import motor_from_uri + + +def get_mongo_db(): + client = MongoClient('mongodb://localhost:27017/') + return client["arachnado-test"] + + +def get_db_uri(): + return "mongodb://localhost:27017/arachnado-test" + + +def get_app(ws_pages_uri, ws_jobs_uri): + db_uri = get_db_uri() + items_uri = "{}/items".format(db_uri) + jobs_uri = "{}/jobs".format(db_uri) + job_storage = MongoTailStorage(jobs_uri, cache=True) + item_storage = MongoTailStorage(items_uri) + context = { + 'crawler_process': None, + 'job_storage': job_storage, + 'item_storage': item_storage, + } + app = tornado.web.Application([ + (ws_pages_uri, PagesDataRpcWebsocketHandler, context), + (ws_jobs_uri, JobsDataRpcWebsocketHandler, context), + ]) + return app + + +def init_db(): + db = get_mongo_db() + collections = ["jobs", "items"] + for collection in collections: + col_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "{}.jl".format(collection)) + col = db[collection] + with open(col_path, "r") as fin: + for 
text_line in fin: + record = json.loads(text_line) + try: + col.insert(record) + except DuplicateKeyError: + pass + + +def clear_db(): + db = get_mongo_db() + collections = ["jobs", "items"] + for collection in collections: + col = db[collection] + col.drop() \ No newline at end of file diff --git a/tox.ini b/tox.ini index 1f6d62a..f143631 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py27 +envlist = py27,py35 [testenv] deps = @@ -7,6 +7,6 @@ deps = pytest-cov commands = pip install -r requirements.txt - py.test --doctest-modules --cov=arachnado {posargs:arachnado} + py.test --doctest-modules --cov=arachnado {posargs:arachnado tests}