Skip to content

Commit

Permalink
Merge pull request TeamHG-Memex#20 from TeamHG-Memex/jobs_api_py3
Browse files Browse the repository at this point in the history
Jobs and Items new API
  • Loading branch information
kmike authored Aug 16, 2016
2 parents 17e06c8 + e30181b commit 94aa932
Show file tree
Hide file tree
Showing 35 changed files with 1,986 additions and 124 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,4 @@ bot_spiders/
.coverage.*
htmlcov/
.scrapy
docs/_build
6 changes: 3 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,12 @@ RUN npm install

# install arachnado
COPY . /app
RUN pip install --editable /app

# npm install is executed again because node_modules can be overwritten
# if .dockerignore is not active (may happen with docker-compose or DockerHub)
RUN npm install
RUN npm run build
RUN pip3 install .

# use e.g. -v /path/to/my/arachnado/config.conf:/etc/arachnado.conf
# docker run option to override arachnado parameters
Expand All @@ -51,7 +51,7 @@ RUN pip3 install .
# this folder is added to PYTHONPATH, so modules from there are available
# for spider_packages Arachnado option
VOLUME /python-packages
ENV PYTHONPATH $PYTHONPATH:/python-packages
ENV PYTHONPATH $PYTHONPATH:/python-packages:/app

EXPOSE 8888
ENTRYPOINT ["arachnado"]
CMD ["arachnado"]
10 changes: 9 additions & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ License is MIT.
Install
-------

Arachnado requires Python 2.7.
Arachnado requires Python 2.7 or Python 3.5.
To install Arachnado use pip::

pip install arachnado
Expand Down Expand Up @@ -41,6 +41,14 @@ the server::
For available options check
https://github.com/TeamHG-Memex/arachnado/blob/master/arachnado/config/defaults.conf.

Tests
-----

To run the tests, make sure tox_ is installed, then
execute the ``tox`` command from the source root.

.. _tox: https://testrun.org/tox/latest/

Development
-----------

Expand Down
3 changes: 3 additions & 0 deletions arachnado/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,11 @@ def main(port, host, start_manhole, manhole_port, manhole_host, loglevel, opts):
})

job_storage = MongoTailStorage(jobs_uri, cache=True)
job_storage.ensure_index("urls")
site_storage = MongoStorage(sites_uri, cache=True)
item_storage = MongoTailStorage(items_uri)
item_storage.ensure_index("url")
item_storage.ensure_index("_job_id")

crawler_process = ArachnadoCrawlerProcess(settings)

Expand Down
11 changes: 7 additions & 4 deletions arachnado/config/defaults.conf
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
[arachnado]
; General Arachnado server options.

; Event loop to use. Allowed values are "twisted", "tornado" and "auto".
; Event loop to use. Allowed values are
; "twisted", "tornado" and "auto".
reactor = auto

; Host/port to listen to
Expand All @@ -30,9 +31,11 @@ DEPTH_LIMIT = 10

; Packages to load spiders from (separated by whitespace)
spider_packages =
; Name of the default spider. It is used for crawling if no custom spider
; is specified or detected. It should support API similar to
; arachnado.spider.CrawlWebsiteSpider (which is the default here)

; Name of the default spider. It is used for crawling if
; no custom spider is specified or detected. It should support
; API similar to arachnado.spider.CrawlWebsiteSpider
; (which is the default here).
default_spider_name = generic

[arachnado.storage]
Expand Down
2 changes: 1 addition & 1 deletion arachnado/cron.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def __init__(self, domain_crawlers, site_storage):
self.waiting_calls = {}
self.domain_crawlers = domain_crawlers
self.site_storage = site_storage
self.site_storage.subscribe(self.site_storage.available_subscriptions,
self.site_storage.subscribe(self.site_storage.available_events,
self.rerun)

def start(self):
Expand Down
12 changes: 12 additions & 0 deletions arachnado/handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
from arachnado.utils.misc import json_encode
from arachnado.monitor import Monitor
from arachnado.handler_utils import ApiHandler, NoEtagsMixin

from arachnado.rpc.data import PagesDataRpcWebsocketHandler, JobsDataRpcWebsocketHandler

from arachnado.rpc import RpcHttpHandler
from arachnado.rpc.ws import RpcWebsocketHandler

Expand All @@ -28,16 +31,23 @@ def get_application(crawler_process, domain_crawlers,
debug = opts['arachnado']['debug']

handlers = [
# UI
url(r"/", Index, context, name="index"),
url(r"/help", Help, context, name="help"),

# simple API used by UI
url(r"/crawler/start", StartCrawler, context, name="start"),
url(r"/crawler/stop", StopCrawler, context, name="stop"),
url(r"/crawler/pause", PauseCrawler, context, name="pause"),
url(r"/crawler/resume", ResumeCrawler, context, name="resume"),
url(r"/crawler/status", CrawlerStatus, context, name="status"),
url(r"/ws-updates", Monitor, context, name="ws-updates"),

# RPC API
url(r"/ws-rpc", RpcWebsocketHandler, context, name="ws-rpc"),
url(r"/rpc", RpcHttpHandler, context, name="rpc"),
url(r"/ws-pages-data", PagesDataRpcWebsocketHandler, context, name="ws-pages-data"),
url(r"/ws-jobs-data", JobsDataRpcWebsocketHandler, context, name="ws-jobs-data"),
]
return Application(
handlers=handlers,
Expand Down Expand Up @@ -143,6 +153,8 @@ def control_job(self, job_id):

class CrawlerStatus(BaseRequestHandler):
""" Status for one or more jobs. """
# FIXME: does it work? Can we remove it? It is not used
# by Arachnado UI.
def get(self):
crawl_ids_arg = self.get_argument('crawl_ids', '')

Expand Down
4 changes: 2 additions & 2 deletions arachnado/manhole.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,13 @@
An interactive Python interpreter available through telnet.
"""
from __future__ import absolute_import
from twisted.conch.manhole import ColoredManhole
from twisted.conch.insults import insults
from twisted.conch.telnet import TelnetTransport, TelnetBootstrapProtocol
from twisted.internet import protocol


def start(port=None, host=None, telnet_vars=None):
from twisted.conch.manhole import ColoredManhole
from twisted.conch.insults import insults
from twisted.internet import reactor

port = int(port) if port else 6023
Expand Down
9 changes: 9 additions & 0 deletions arachnado/pipelines/mongoexport.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,14 @@ def __init__(self, crawler):
def from_crawler(cls, crawler):
return cls(crawler)

@classmethod
def get_spider_urls(cls, spider):
    """Return the seed URLs recorded for *spider*'s job.

    When the crawl was started with explicit start options that
    include a "domain" key, that single domain is reported;
    otherwise fall back to the spider's own ``start_urls``.
    """
    start_options = getattr(spider.crawler, 'start_options', None)
    if not start_options or "domain" not in start_options:
        return spider.start_urls
    return [start_options["domain"]]

@tt_coroutine
def open_spider(self, spider):
try:
Expand All @@ -94,6 +102,7 @@ def open_spider(self, spider):
'started_at': datetime.datetime.utcnow(),
'status': 'running',
'spider': spider.name,
"urls": self.get_spider_urls(spider),
'options': getattr(spider.crawler, 'start_options', {}),
}, upsert=True, new=True)
self.job_id = str(job['_id'])
Expand Down
16 changes: 13 additions & 3 deletions arachnado/rpc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,22 @@

class ArachnadoRPC(object):
""" Base class for all Arachnado RPC resources.
Use it as a mixin for tornado.web.RequestHandler subclasses.
It provides :meth:`handle_request` method which handles
Jobs, Sites and Pages RPC requests.
"""
rpc_objects = tuple()

def initialize(self, *args, **kwargs):
jobs = Jobs(self, *args, **kwargs)
sites = Sites(self, *args, **kwargs)
pages = Pages(self, *args, **kwargs)
self.rpc_objects = [jobs, sites, pages]

self.dispatcher = Dispatcher()
self.dispatcher.add_object(Jobs(self, *args, **kwargs))
self.dispatcher.add_object(Sites(self, *args, **kwargs))
self.dispatcher.add_object(Pages(self, *args, **kwargs))
for obj in self.rpc_objects:
self.dispatcher.add_object(obj)

def handle_request(self, body):
response = JSONRPCResponseManager.handle(body, self.dispatcher)
Expand Down
Loading

0 comments on commit 94aa932

Please sign in to comment.