
Commit

Merge branch 'jobs_api_py3' of https://github.com/TeamHG-Memex/arachnado into jobs_api_py3
Ubuntu committed Jul 7, 2016
2 parents 5257384 + b5a9fa0 commit b942ae8
Showing 35 changed files with 1,180 additions and 120 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -20,3 +20,4 @@ bot_spiders/
.coverage.*
htmlcov/
.scrapy
docs/_build
4 changes: 3 additions & 1 deletion Dockerfile
@@ -44,7 +44,9 @@ RUN npm run build

# use e.g. -v /path/to/my/arachnado/config.conf:/etc/arachnado.conf
# docker run option to override arachnado parameters
VOLUME /etc/arachnado.conf
# The VOLUME is not declared here because Docker assumes that volumes are folders
# (unless the file already exists), which can cause problems in docker-compose
# later (see https://github.com/docker/docker/issues/21702#issuecomment-221987049)

# this folder is added to PYTHONPATH, so modules from there are available
# for spider_packages Arachnado option
14 changes: 7 additions & 7 deletions README.rst
@@ -13,7 +13,7 @@ License is MIT.
Install
-------

Arachnado requires Python 2.7.
Arachnado requires Python 2.7 or Python 3.5.
To install Arachnado use pip::

pip install arachnado
@@ -41,13 +41,13 @@ the server::
For available options check
https://github.com/TeamHG-Memex/arachnado/blob/master/arachnado/config/defaults.conf.

Test
-----------
To start unit tests for API:
Tests
-----

To run the tests, make sure tox_ is installed, then
execute the ``tox`` command from the source root.

python -m tornado.test.runtests tests.test_data
or
python3 -m tornado.test.runtests tests.test_data
.. _tox: https://testrun.org/tox/latest/

Development
-----------
8 changes: 5 additions & 3 deletions arachnado/__main__.py
@@ -73,8 +73,8 @@ def main(port, host, start_manhole, manhole_port, manhole_host, loglevel, opts):
jobs_uri = _getval(storage_opts, 'jobs_uri_env', 'jobs_uri')
sites_uri = _getval(storage_opts, 'sites_uri_env', 'sites_uri')

settings.update({k: v for k, v in opts['arachnado.scrapy'].items()
if k.isupper()})
scrapy_opts = opts['arachnado.scrapy']
settings.update({k: v for k, v in scrapy_opts.items() if k.isupper()})

settings.update({
'MONGO_EXPORT_ENABLED': storage_opts['enabled'],
@@ -91,10 +91,12 @@ def main(port, host, start_manhole, manhole_port, manhole_host, loglevel, opts):
site_checker_crawler = get_site_checker_crawler(site_storage)
crawler_process.crawl(site_checker_crawler)

spider_packages = opts['arachnado.scrapy']['spider_packages']
spider_packages = scrapy_opts['spider_packages']
default_spider_name = scrapy_opts['default_spider_name']
domain_crawlers = DomainCrawlers(
crawler_process=crawler_process,
spider_packages=_parse_spider_packages(spider_packages),
default_spider_name=default_spider_name,
settings=settings
)
domain_crawlers.resume(job_storage)
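
To spell out the ``isupper()`` filter used above: only upper-case keys from the ``[arachnado.scrapy]`` config section are forwarded to Scrapy settings, while lower-case keys (such as ``spider_packages`` and ``default_spider_name``) stay Arachnado-level options. A minimal standalone sketch, with made-up option values:

# Hypothetical [arachnado.scrapy] section, already parsed into a dict.
scrapy_opts = {
    'DEPTH_LIMIT': 10,                  # upper-case -> forwarded to Scrapy settings
    'CONCURRENT_REQUESTS': 16,          # upper-case -> forwarded to Scrapy settings
    'spider_packages': 'my.spiders',    # lower-case -> Arachnado-level option, skipped
    'default_spider_name': 'generic',   # lower-case -> Arachnado-level option, skipped
}

settings = {}
settings.update({k: v for k, v in scrapy_opts.items() if k.isupper()})
print(settings)  # {'DEPTH_LIMIT': 10, 'CONCURRENT_REQUESTS': 16}
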
9 changes: 8 additions & 1 deletion arachnado/config/defaults.conf
@@ -5,7 +5,8 @@
[arachnado]
; General Arachnado server options.

; Event loop to use. Allowed values are "twisted", "tornado" and "auto".
; Event loop to use. Allowed values are
; "twisted", "tornado" and "auto".
reactor = auto

; Host/port to listen to
@@ -31,6 +32,12 @@ DEPTH_LIMIT = 10
; Packages to load spiders from (separated by whitespace)
spider_packages =

; Name of the default spider. It is used for crawling if
; no custom spider is specified or detected. It should support
; an API similar to arachnado.spider.CrawlWebsiteSpider
; (which is the default here).
default_spider_name = generic

[arachnado.storage]
; Where to store crawled items and job information.
; Currently only MongoDB is supported (mongodb:// URIs).
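
For illustration, a custom default spider would live in one of the ``spider_packages`` and carry a ``name`` matching ``default_spider_name``; the package, module and class names below are hypothetical, and the only assumed requirement is the API hint from the comment above (something similar to ``arachnado.spider.CrawlWebsiteSpider``):

# my_spiders/default.py -- hypothetical module listed in spider_packages
from arachnado.spider import CrawlWebsiteSpider


class MyDefaultSpider(CrawlWebsiteSpider):
    # Picked up when the config says "default_spider_name = mydefault";
    # the crawling behaviour is inherited from CrawlWebsiteSpider.
    name = 'mydefault'
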
8 changes: 7 additions & 1 deletion arachnado/crawler_process.py
@@ -300,7 +300,13 @@ def _downloader_stats(cls, crawler):

@classmethod
def _request_info(cls, request):
return {'url': request.url, 'method': request.method}
info = {'url': request.url, 'method': request.method}
if 'splash' in request.meta:
splash_args = request.meta['splash'].get('args', {})
if 'url' in splash_args:
info['url'] = splash_args['url']
info['method'] = splash_args.get('http_method', 'GET')
return info

@classmethod
def _slot_info(cls, key, slot):
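
For context, scrapy-splash keeps the target page under ``request.meta['splash']['args']``, which is why ``_request_info`` now prefers that URL and HTTP method over the Splash endpoint request itself. A rough standalone sketch with an illustrative URL:

# Roughly what a scrapy-splash request carries in its meta:
meta = {
    'splash': {
        'endpoint': 'render.html',
        'args': {'url': 'http://example.com/page', 'http_method': 'GET'},
    },
}

splash_args = meta['splash'].get('args', {})
info = {
    'url': splash_args.get('url'),                   # the rendered page, not the Splash endpoint
    'method': splash_args.get('http_method', 'GET'),
}
print(info)  # {'url': 'http://example.com/page', 'method': 'GET'}
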
8 changes: 3 additions & 5 deletions arachnado/cron.py
@@ -17,7 +17,7 @@ def __init__(self, domain_crawlers, site_storage):
self.waiting_calls = {}
self.domain_crawlers = domain_crawlers
self.site_storage = site_storage
self.site_storage.subscribe(self.site_storage.available_subscriptions,
self.site_storage.subscribe(self.site_storage.available_events,
self.rerun)

def start(self):
@@ -96,10 +96,8 @@ def start_crawl(self, id_):
args = _key_value_to_dict(site.get('args', []))
settings = _key_value_to_dict(site.get('settings', []))

if not site.get('engine'):
site['engine'] = 'generic'

if site['engine'] == 'generic':
# checking for == 'generic' to be backwards compatible
if not site.get('engine') or site['engine'] == 'generic':
url = site['url']
else:
url = 'spider://' + site['engine']
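
To make the engine handling concrete, here is a standalone sketch of the URL the scheduler would start a crawl with for two made-up site records; the ``spider://`` pseudo-URL is assumed to be resolved to a spider of that name further down the stack:

def crawl_url(site):
    # Missing or 'generic' engine: crawl the stored site URL directly.
    if not site.get('engine') or site['engine'] == 'generic':
        return site['url']
    # Any other engine name: hand over a spider:// pseudo-URL instead.
    return 'spider://' + site['engine']

print(crawl_url({'url': 'http://example.com'}))                        # http://example.com
print(crawl_url({'url': 'http://example.com', 'engine': 'myspider'}))  # spider://myspider
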
12 changes: 8 additions & 4 deletions arachnado/domain_crawlers.py
@@ -11,17 +11,19 @@
import arachnado.settings
from arachnado.crawler_process import ArachnadoCrawler
from arachnado.spider import CrawlWebsiteSpider, ArachnadoSpider
from arachnado.utils.spiders import get_spider_cls
from arachnado.utils.spiders import get_spider_cls, find_spider_cls


class DomainCrawlers(object):
"""
Helper class to create and start crawlers.
"""
def __init__(self, crawler_process, spider_packages, settings):
def __init__(self, crawler_process, spider_packages, default_spider_name,
settings):
self.settings = get_settings(settings)
self.crawler_process = crawler_process
self.spider_packages = spider_packages
self.default_spider_name = default_spider_name

def resume(self, job_storage):
@gen.coroutine
@@ -37,8 +39,10 @@ def _resume():

def start(self, domain, args, settings, crawl_id=None):
""" Create, start and return a crawler for a given domain. """
spider_cls = get_spider_cls(domain, self.spider_packages,
CrawlWebsiteSpider)
default_cls = find_spider_cls(
self.default_spider_name,
self.spider_packages + ['arachnado.spider'])
spider_cls = get_spider_cls(domain, self.spider_packages, default_cls)
if not spider_cls:
return

1 change: 1 addition & 0 deletions arachnado/downloadermiddlewares/proxyfromsettings.py
@@ -11,6 +11,7 @@ def from_crawler(cls, crawler):

def __init__(self, settings):
self.proxies = {}
self.auth_encoding = settings.get('HTTPPROXY_AUTH_ENCODING')
proxies = [
('http', settings.get('HTTP_PROXY')),
('https', settings.get('HTTPS_PROXY')),
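
For reference, a sketch of the settings this middleware reads; the proxy URL and credentials are made up, and ``latin-1`` is only assumed to be a sensible value for ``HTTPPROXY_AUTH_ENCODING``:

# Hypothetical Scrapy settings consumed by the proxy-from-settings middleware.
PROXY_SETTINGS = {
    'HTTP_PROXY': 'http://user:secret@10.0.0.1:8080',
    'HTTPS_PROXY': 'http://user:secret@10.0.0.1:8080',
    'HTTPPROXY_AUTH_ENCODING': 'latin-1',  # encoding used for the proxy auth credentials
}
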
7 changes: 7 additions & 0 deletions arachnado/handlers.py
@@ -31,14 +31,19 @@ def get_application(crawler_process, domain_crawlers,
debug = opts['arachnado']['debug']

handlers = [
# UI
url(r"/", Index, context, name="index"),
url(r"/help", Help, context, name="help"),

# simple API used by UI
url(r"/crawler/start", StartCrawler, context, name="start"),
url(r"/crawler/stop", StopCrawler, context, name="stop"),
url(r"/crawler/pause", PauseCrawler, context, name="pause"),
url(r"/crawler/resume", ResumeCrawler, context, name="resume"),
url(r"/crawler/status", CrawlerStatus, context, name="status"),
url(r"/ws-updates", Monitor, context, name="ws-updates"),

# RPC API
url(r"/ws-rpc", RpcWebsocketHandler, context, name="ws-rpc"),
url(r"/rpc", RpcHttpHandler, context, name="rpc"),
url(r"/ws-pages-data", PagesDataRpcWebsocketHandler, context, name="ws-pages-data"),
@@ -148,6 +153,8 @@ def control_job(self, job_id):

class CrawlerStatus(BaseRequestHandler):
""" Status for one or more jobs. """
# FIXME: does it work? Can we remove it? It is not used
# by Arachnado UI.
def get(self):
crawl_ids_arg = self.get_argument('crawl_ids', '')

4 changes: 2 additions & 2 deletions arachnado/manhole.py
@@ -3,13 +3,13 @@
An interactive Python interpreter available through telnet.
"""
from __future__ import absolute_import
from twisted.conch.manhole import ColoredManhole
from twisted.conch.insults import insults
from twisted.conch.telnet import TelnetTransport, TelnetBootstrapProtocol
from twisted.internet import protocol


def start(port=None, host=None, telnet_vars=None):
from twisted.conch.manhole import ColoredManhole
from twisted.conch.insults import insults
from twisted.internet import reactor

port = int(port) if port else 6023
18 changes: 14 additions & 4 deletions arachnado/rpc/__init__.py
@@ -13,12 +13,22 @@

class ArachnadoRPC(object):
""" Base class for all Arachnado RPC resources.
Use it as a mixin for tornado.web.RequestHandler subclasses.
It provides the :meth:`handle_request` method, which handles
Jobs, Sites and Pages RPC requests.
"""
rpc_objects = tuple()

def initialize(self, *args, **kwargs):
jobs = Jobs(self, *args, **kwargs)
sites = Sites(self, *args, **kwargs)
pages = Pages(self, *args, **kwargs)
self.rpc_objects = [jobs, sites, pages]

self.dispatcher = Dispatcher()
self.dispatcher.add_object(Jobs(self, *args, **kwargs))
self.dispatcher.add_object(Sites(self, *args, **kwargs))
self.dispatcher.add_object(Pages(self, *args, **kwargs))
for obj in self.rpc_objects:
self.dispatcher.add_object(obj)

def handle_request(self, body):
response = JSONRPCResponseManager.handle(body, self.dispatcher)
@@ -45,4 +55,4 @@ def post(self):

def send_data(self, data):
self.write(json_encode(data))
self.finish()
self.finish()
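
Assuming jsonrpc's ``Dispatcher.add_object`` registers public methods under a lower-cased ``classname.method`` name, a request handled by ``handle_request`` would look roughly like the sketch below; the method choice and id are illustrative:

import json

# A JSON-RPC 2.0 call to the Sites object's list() method.
request_body = json.dumps({
    'jsonrpc': '2.0',
    'method': 'sites.list',
    'params': {},
    'id': 1,
})
# ArachnadoRPC.handle_request(request_body) dispatches this through the
# Dispatcher and hands the JSON-RPC response to the handler's send_data().
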
9 changes: 8 additions & 1 deletion arachnado/rpc/jobs.py
@@ -1,15 +1,22 @@
import logging

from arachnado.storages.mongotail import MongoTailStorage


class Jobs(object):
"""
This object is exposed for RPC requests.
It allows clients to subscribe to scraping job updates.
"""
handler_id = None
logger = logging.getLogger(__name__)

def __init__(self, handler, job_storage, **kwargs):
self.handler = handler
self.storage = job_storage
self.storage = job_storage # type: MongoTailStorage

def subscribe(self, last_id=0, query=None, fields=None):
""" Subscribe for job updates. """
self.storage.subscribe('tailed', self._publish, last_id=last_id,
query=query, fields=fields)

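
As a client-side illustration (again assuming the lower-cased ``classname.method`` naming), a websocket frame subscribing to job updates could look like this; the parameter values are hypothetical and simply mirror the ``subscribe()`` signature above:

import json

subscribe_frame = json.dumps({
    'jsonrpc': '2.0',
    'method': 'jobs.subscribe',
    'params': {'last_id': 0, 'query': {}, 'fields': None},
    'id': 2,
})
# Sent over the /ws-rpc websocket, this reaches Jobs.subscribe(), which
# asks the tailed MongoDB job storage to push matching updates back.
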
1 change: 1 addition & 0 deletions arachnado/rpc/pages.py
@@ -2,6 +2,7 @@


class Pages(object):
""" Pages (scraped items) object exposed via JSON RPC """
handler_id = None

def __init__(self, handler, item_storage, **kwargs):
20 changes: 11 additions & 9 deletions arachnado/rpc/sites.py
@@ -1,13 +1,16 @@
import logging
from functools import partial

from arachnado.storages.mongotail import MongoTailStorage

class Sites(object):

class Sites(object):
""" 'Known sites' object exposed via JSON-RPC """
logger = logging.getLogger(__name__)

def __init__(self, handler, site_storage, **kwargs):
self.handler = handler
self.storage = site_storage
self.storage = site_storage # type: MongoTailStorage

def list(self):
return self.storage.fetch()
@@ -22,15 +25,14 @@ def delete(self, site):
self.storage.delete(site)

def subscribe(self):
for subscription in self.storage.available_subscriptions:
for event_name in self.storage.available_events:
self.storage.subscribe(
subscription,
lambda data, subscription=subscription:
self._publish(data, subscription)
event_name,
partial(self._publish, event=event_name)
)

def _on_close(self):
self.storage.unsubscribe(self.storage.available_subscriptions)
self.storage.unsubscribe(self.storage.available_events)

def _publish(self, data, subscription):
self.handler.write_event('sites.{}'.format(subscription), data)
def _publish(self, data, event):
self.handler.write_event('sites.{}'.format(event), data)
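
For reference, ``functools.partial`` pre-binds the event name so every storage event gets its own callback, replacing the default-argument lambda trick used before. A minimal standalone sketch of the pattern, with a made-up event name:

from functools import partial


def _publish(data, event):
    print('sites.{}'.format(event), data)

# One callback per storage event, with the event name pre-bound.
callback = partial(_publish, event='updated')
callback({'url': 'http://example.com'})  # -> sites.updated {'url': 'http://example.com'}
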
33 changes: 0 additions & 33 deletions arachnado/rpc/stats.py

This file was deleted.

15 changes: 5 additions & 10 deletions arachnado/rpc/ws.py
@@ -46,24 +46,19 @@ def write_event(self, event, data):
def open(self):
""" Forward open event to resource objects.
"""
for resource in self._resources():
logger.debug("Connection opened %s", self)
for resource in self.rpc_objects:
if hasattr(resource, '_on_open'):
resource._on_open()
self._pinger = PeriodicCallback(lambda: self.ping(b'PING'), 1000 * 15)
self._pinger.start()
logger.info("Pinger initiated")
logger.debug("Pinger initiated %s", self)

def on_close(self):
""" Forward on_close event to resource objects.
"""
for resource in self._resources():
logger.debug("Connection closed %s", self)
for resource in self.rpc_objects:
if hasattr(resource, '_on_close'):
resource._on_close()
self._pinger.stop()

def _resources(self):
for resource_name, resource in self.__dict__.items():
if hasattr(RequestHandler, resource_name):
continue
yield resource

2 changes: 1 addition & 1 deletion arachnado/site_checker.py
@@ -165,5 +165,5 @@ def rerun_check(self, site):
def detect_engine(self, body):
result = self.detector.detect(body) if self.detector else None
if result is None:
return 'generic', {}
return None, {}
return result