Merge branch 'master' into pages_api_docs
kmike committed Jul 1, 2016
2 parents 3281e95 + 17e06c8 commit acaa65f
Showing 9 changed files with 33 additions and 15 deletions.
4 changes: 3 additions & 1 deletion Dockerfile
@@ -44,7 +44,9 @@ RUN pip3 install .

 # use e.g. -v /path/to/my/arachnado/config.conf:/etc/arachnado.conf
 # docker run option to override arachnado parameters
-VOLUME /etc/arachnado.conf
+# The VOLUME is not exposed here because Docker assumes that volumes are folders
+# (unless the file really exists), so this can cause problems in docker-compose
+# later (see https://github.com/docker/docker/issues/21702#issuecomment-221987049)

 # this folder is added to PYTHONPATH, so modules from there are available
 # for spider_packages Arachnado option
8 changes: 5 additions & 3 deletions arachnado/__main__.py
@@ -73,8 +73,8 @@ def main(port, host, start_manhole, manhole_port, manhole_host, loglevel, opts):
     jobs_uri = _getval(storage_opts, 'jobs_uri_env', 'jobs_uri')
     sites_uri = _getval(storage_opts, 'sites_uri_env', 'sites_uri')

-    settings.update({k: v for k, v in opts['arachnado.scrapy'].items()
-                     if k.isupper()})
+    scrapy_opts = opts['arachnado.scrapy']
+    settings.update({k: v for k, v in scrapy_opts.items() if k.isupper()})

     settings.update({
         'MONGO_EXPORT_ENABLED': storage_opts['enabled'],
@@ -91,10 +91,12 @@ def main(port, host, start_manhole, manhole_port, manhole_host, loglevel, opts):
     site_checker_crawler = get_site_checker_crawler(site_storage)
     crawler_process.crawl(site_checker_crawler)

-    spider_packages = opts['arachnado.scrapy']['spider_packages']
+    spider_packages = scrapy_opts['spider_packages']
+    default_spider_name = scrapy_opts['default_spider_name']
     domain_crawlers = DomainCrawlers(
         crawler_process=crawler_process,
         spider_packages=_parse_spider_packages(spider_packages),
+        default_spider_name=default_spider_name,
         settings=settings
     )
     domain_crawlers.resume(job_storage)
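The refactoring above only introduces the scrapy_opts alias for opts['arachnado.scrapy']; the behaviour is unchanged: uppercase keys are treated as Scrapy settings, while lowercase keys such as spider_packages and default_spider_name remain Arachnado-level options. A minimal standalone sketch of that filtering, with made-up option values:

scrapy_opts = {
    'spider_packages': 'mypackage.spiders',  # lowercase: consumed by Arachnado itself
    'default_spider_name': 'generic',        # lowercase: consumed by Arachnado itself
    'DEPTH_LIMIT': 10,                       # uppercase: forwarded to Scrapy settings
    'AUTOTHROTTLE_ENABLED': True,            # uppercase: forwarded to Scrapy settings
}

settings = {}
# Only uppercase keys end up in the Scrapy settings dict.
settings.update({k: v for k, v in scrapy_opts.items() if k.isupper()})
assert settings == {'DEPTH_LIMIT': 10, 'AUTOTHROTTLE_ENABLED': True}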
4 changes: 4 additions & 0 deletions arachnado/config/defaults.conf
@@ -30,6 +30,10 @@ DEPTH_LIMIT = 10

 ; Packages to load spiders from (separated by whitespace)
 spider_packages =
+; Name of the default spider. It is used for crawling if no custom spider
+; is specified or detected. It should support API similar to
+; arachnado.spider.CrawlWebsiteSpider (which is the default here)
+default_spider_name = generic

 [arachnado.storage]
 ; Where to store crawled items and job information.
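The new default_spider_name option refers to a spider by its name attribute, and the chosen spider is expected to behave like arachnado.spider.CrawlWebsiteSpider. A hypothetical replacement (module path, class name and spider name are made up for illustration) living in a package listed under spider_packages could be as small as:

# mypackage/spiders.py -- the package would be listed in spider_packages
from arachnado.spider import CrawlWebsiteSpider


class ShallowWebsiteSpider(CrawlWebsiteSpider):
    # set default_spider_name = shallow in the config to use this by default
    name = 'shallow'
    custom_settings = {'DEPTH_LIMIT': 2}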
8 changes: 7 additions & 1 deletion arachnado/crawler_process.py
@@ -300,7 +300,13 @@ def _downloader_stats(cls, crawler):

     @classmethod
     def _request_info(cls, request):
-        return {'url': request.url, 'method': request.method}
+        info = {'url': request.url, 'method': request.method}
+        if 'splash' in request.meta:
+            splash_args = request.meta['splash'].get('args', {})
+            if 'url' in splash_args:
+                info['url'] = splash_args['url']
+            info['method'] = splash_args.get('http_method', 'GET')
+        return info

     @classmethod
     def _slot_info(cls, key, slot):
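For context on the change above: with scrapy-splash, request.url points at the Splash render endpoint rather than the page being crawled, and the real target URL and HTTP method live under request.meta['splash']['args']. A standalone sketch of that recovery (the endpoint and page URLs are illustrative, and the meta layout follows scrapy-splash conventions):

import scrapy

request = scrapy.Request(
    'http://localhost:8050/render.html',              # the Splash endpoint
    meta={'splash': {'args': {'url': 'http://example.com/page',
                              'http_method': 'POST'}}},
)

info = {'url': request.url, 'method': request.method}
if 'splash' in request.meta:
    splash_args = request.meta['splash'].get('args', {})
    if 'url' in splash_args:
        info['url'] = splash_args['url']
    info['method'] = splash_args.get('http_method', 'GET')

# The reported request info now describes the rendered page, not the endpoint.
assert info == {'url': 'http://example.com/page', 'method': 'POST'}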
6 changes: 2 additions & 4 deletions arachnado/cron.py
@@ -96,10 +96,8 @@ def start_crawl(self, id_):
         args = _key_value_to_dict(site.get('args', []))
         settings = _key_value_to_dict(site.get('settings', []))

-        if not site.get('engine'):
-            site['engine'] = 'generic'
-
-        if site['engine'] == 'generic':
+        # checking for == 'generic' to be backwards compatible
+        if not site.get('engine') or site['engine'] == 'generic':
             url = site['url']
         else:
             url = 'spider://' + site['engine']
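The same fallback appears in the SitesPage UI further down: a missing engine and the legacy value 'generic' are treated identically, and any other engine selects a named spider via the spider:// scheme. A small standalone sketch of the resolution, with illustrative site dicts and engine names:

def crawl_url(site):
    # Missing engine and the legacy 'generic' value both mean:
    # crawl the site URL with the default spider.
    if not site.get('engine') or site['engine'] == 'generic':
        return site['url']
    # Any other engine selects a named spider via the spider:// scheme.
    return 'spider://' + site['engine']


assert crawl_url({'url': 'http://example.com'}) == 'http://example.com'
assert crawl_url({'url': 'http://example.com', 'engine': 'generic'}) == 'http://example.com'
assert crawl_url({'url': 'http://example.com', 'engine': 'wordpress'}) == 'spider://wordpress'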
12 changes: 8 additions & 4 deletions arachnado/domain_crawlers.py
@@ -11,17 +11,19 @@
 import arachnado.settings
 from arachnado.crawler_process import ArachnadoCrawler
 from arachnado.spider import CrawlWebsiteSpider, ArachnadoSpider
-from arachnado.utils.spiders import get_spider_cls
+from arachnado.utils.spiders import get_spider_cls, find_spider_cls


 class DomainCrawlers(object):
     """
     Helper class to create and start crawlers.
     """
-    def __init__(self, crawler_process, spider_packages, settings):
+    def __init__(self, crawler_process, spider_packages, default_spider_name,
+                 settings):
         self.settings = get_settings(settings)
         self.crawler_process = crawler_process
         self.spider_packages = spider_packages
+        self.default_spider_name = default_spider_name

     def resume(self, job_storage):
         @gen.coroutine
@@ -37,8 +39,10 @@ def _resume():

     def start(self, domain, args, settings, crawl_id=None):
         """ Create, start and return a crawler for a given domain. """
-        spider_cls = get_spider_cls(domain, self.spider_packages,
-                                    CrawlWebsiteSpider)
+        default_cls = find_spider_cls(
+            self.default_spider_name,
+            self.spider_packages + ['arachnado.spider'])
+        spider_cls = get_spider_cls(domain, self.spider_packages, default_cls)
         if not spider_cls:
             return
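The default spider is now resolved by name over self.spider_packages plus 'arachnado.spider', so the built-in CrawlWebsiteSpider is always reachable as a fallback. The actual helper lives in arachnado.utils.spiders; a rough, simplified sketch of a name-based lookup of that shape (not the real implementation, and it only inspects the top-level module of each package) might look like:

import importlib
import inspect

import scrapy


def find_spider_cls_sketch(spider_name, spider_packages):
    # Return the first scrapy.Spider subclass whose name matches, or None.
    for package in spider_packages:
        module = importlib.import_module(package)
        for _, obj in inspect.getmembers(module, inspect.isclass):
            if issubclass(obj, scrapy.Spider) and getattr(obj, 'name', None) == spider_name:
                return obj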
1 change: 1 addition & 0 deletions arachnado/downloadermiddlewares/proxyfromsettings.py
@@ -11,6 +11,7 @@ def from_crawler(cls, crawler):

     def __init__(self, settings):
         self.proxies = {}
+        self.auth_encoding = settings.get('HTTPPROXY_AUTH_ENCODING')
         proxies = [
             ('http', settings.get('HTTP_PROXY')),
             ('https', settings.get('HTTPS_PROXY')),
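The stored auth_encoding matters when proxy URLs carry credentials: the user:password pair is folded into a Proxy-Authorization header, and non-ASCII characters must be encoded consistently. A standalone sketch of that step (the credentials and encoding value are made up), roughly mirroring how Scrapy's proxy middleware builds the header:

import base64

auth_encoding = 'latin-1'            # e.g. the HTTPPROXY_AUTH_ENCODING setting
user, password = 'proxy-user', 'pässword'

# Encode credentials with the configured encoding before base64-ing them.
creds = '{}:{}'.format(user, password).encode(auth_encoding)
proxy_auth = b'Basic ' + base64.b64encode(creds)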
2 changes: 1 addition & 1 deletion arachnado/site_checker.py
@@ -165,5 +165,5 @@ def rerun_check(self, site):
     def detect_engine(self, body):
         result = self.detector.detect(body) if self.detector else None
         if result is None:
-            return 'generic', {}
+            return None, {}
         return result
3 changes: 2 additions & 1 deletion arachnado/static/js/pages/SitesPage.jsx
@@ -143,7 +143,8 @@ var SiteRow = React.createClass({
             args: keyValueListToDict(this.props.site.args)
         };

-        if(this.props.site.engine == 'generic') {
+        // checking for == 'generic' to be backwards compatible
+        if(!this.props.site.engine || this.props.site.engine == 'generic') {
             JobStore.Actions.startCrawl(this.props.site.url, options);
         } else {
             args.start_urls = [this.props.site.url];
