From e6dd06d9c882cd563c5dfbbf8055137d70c58ad6 Mon Sep 17 00:00:00 2001
From: Konstantin Lopuhin
Date: Fri, 10 Jun 2016 16:20:02 +0300
Subject: [PATCH 1/7] Request info: support splash requests

---
 arachnado/crawler_process.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/arachnado/crawler_process.py b/arachnado/crawler_process.py
index a3939df..eaf30d2 100644
--- a/arachnado/crawler_process.py
+++ b/arachnado/crawler_process.py
@@ -300,7 +300,14 @@ def _downloader_stats(cls, crawler):

     @classmethod
     def _request_info(cls, request):
-        return {'url': request.url, 'method': request.method}
+        info = {'url': request.url, 'method': request.method}
+        if 'splash' in request.meta:
+            splash_args = request.meta['splash'].get('args', {})
+            if 'url' in splash_args:
+                info['url'] = splash_args['url']
+            if 'http_method' in splash_args:
+                info['method'] = splash_args['http_method']
+        return info

     @classmethod
     def _slot_info(cls, key, slot):

From d0b52c3567b4ae75c26890430c540621c1dc6b56 Mon Sep 17 00:00:00 2001
From: Konstantin Lopuhin
Date: Fri, 10 Jun 2016 16:47:29 +0300
Subject: [PATCH 2/7] Default to GET for splash request

The method can be passed in some other way (or hardcoded in the script),
but GET makes more sense as the default than POST in a crawling context.
Thanks @kmike for the catch!
---
 arachnado/crawler_process.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arachnado/crawler_process.py b/arachnado/crawler_process.py
index eaf30d2..8cb3ca5 100644
--- a/arachnado/crawler_process.py
+++ b/arachnado/crawler_process.py
@@ -305,8 +305,7 @@ def _request_info(cls, request):
             splash_args = request.meta['splash'].get('args', {})
             if 'url' in splash_args:
                 info['url'] = splash_args['url']
-            if 'http_method' in splash_args:
-                info['method'] = splash_args['http_method']
+            info['method'] = splash_args.get('http_method', 'GET')
         return info

     @classmethod
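
Background for the two patches above: in scrapy-splash style requests, request.url
points at the Splash endpoint, while the real target URL and HTTP method travel in
request.meta['splash']['args']; that is why the job UI needs the unwrapping added
here. Below is a minimal, runnable sketch of the resulting _request_info logic,
with a namedtuple standing in for a Scrapy Request (the stand-in class, the Splash
endpoint address and the example pages are illustrative, not taken from the patch):

    from collections import namedtuple

    # Just enough of a Request-like object for the sketch.
    FakeRequest = namedtuple('FakeRequest', ['url', 'method', 'meta'])


    def request_info(request):
        """Report the target URL/method, unwrapping Splash requests."""
        info = {'url': request.url, 'method': request.method}
        if 'splash' in request.meta:
            splash_args = request.meta['splash'].get('args', {})
            if 'url' in splash_args:
                info['url'] = splash_args['url']
            # GET is a saner default than POST when http_method is not passed
            # explicitly (it may be hardcoded in a Lua script instead).
            info['method'] = splash_args.get('http_method', 'GET')
        return info


    plain = FakeRequest('http://example.com', 'GET', meta={})
    splash = FakeRequest(
        'http://splash:8050/render.html',  # what request.url points at
        'POST',                            # how Scrapy talks to Splash
        meta={'splash': {'args': {'url': 'http://example.com/page'}}})

    print(request_info(plain))   # {'url': 'http://example.com', 'method': 'GET'}
    print(request_info(splash))  # {'url': 'http://example.com/page', 'method': 'GET'}
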
From bbf3bb5b80fe8d7baed26cef662eb456eb592376 Mon Sep 17 00:00:00 2001
From: Konstantin Lopuhin
Date: Fri, 10 Jun 2016 12:52:30 +0300
Subject: [PATCH 3/7] WIP: Add an option to set default spider name

A custom generic spider has bad UX without this feature: we need to pass
the url via arguments and specify a spider name, and the url is not
displayed in the jobs list.
---
 arachnado/__main__.py          |  8 +++++---
 arachnado/config/defaults.conf |  1 +
 arachnado/domain_crawlers.py   | 12 ++++++++----
 3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/arachnado/__main__.py b/arachnado/__main__.py
index 8eca1cb..6f7d864 100755
--- a/arachnado/__main__.py
+++ b/arachnado/__main__.py
@@ -73,8 +73,8 @@ def main(port, host, start_manhole, manhole_port, manhole_host, loglevel, opts):
     jobs_uri = _getval(storage_opts, 'jobs_uri_env', 'jobs_uri')
     sites_uri = _getval(storage_opts, 'sites_uri_env', 'sites_uri')

-    settings.update({k: v for k, v in opts['arachnado.scrapy'].items()
-                     if k.isupper()})
+    scrapy_opts = opts['arachnado.scrapy']
+    settings.update({k: v for k, v in scrapy_opts.items() if k.isupper()})

     settings.update({
         'MONGO_EXPORT_ENABLED': storage_opts['enabled'],
@@ -91,10 +91,12 @@ def main(port, host, start_manhole, manhole_port, manhole_host, loglevel, opts):
     site_checker_crawler = get_site_checker_crawler(site_storage)
     crawler_process.crawl(site_checker_crawler)

-    spider_packages = opts['arachnado.scrapy']['spider_packages']
+    spider_packages = scrapy_opts['spider_packages']
+    default_spider_name = scrapy_opts['default_spider_name']
     domain_crawlers = DomainCrawlers(
         crawler_process=crawler_process,
         spider_packages=_parse_spider_packages(spider_packages),
+        default_spider_name=default_spider_name,
         settings=settings
     )
     domain_crawlers.resume(job_storage)

diff --git a/arachnado/config/defaults.conf b/arachnado/config/defaults.conf
index b6c2400..7a6e9bc 100644
--- a/arachnado/config/defaults.conf
+++ b/arachnado/config/defaults.conf
@@ -30,6 +30,7 @@ DEPTH_LIMIT = 10

 ; Packages to load spiders from (separated by whitespace)
 spider_packages =
+default_spider_name = generic

 [arachnado.storage]
 ; Where to store crawled items and job information.

diff --git a/arachnado/domain_crawlers.py b/arachnado/domain_crawlers.py
index 1456e50..d8b571c 100644
--- a/arachnado/domain_crawlers.py
+++ b/arachnado/domain_crawlers.py
@@ -11,17 +11,19 @@ import arachnado.settings
 from arachnado.crawler_process import ArachnadoCrawler
 from arachnado.spider import CrawlWebsiteSpider, ArachnadoSpider
-from arachnado.utils.spiders import get_spider_cls
+from arachnado.utils.spiders import get_spider_cls, find_spider_cls


 class DomainCrawlers(object):
     """ Helper class to create and start crawlers. """
-    def __init__(self, crawler_process, spider_packages, settings):
+    def __init__(self, crawler_process, spider_packages, default_spider_name,
+                 settings):
         self.settings = get_settings(settings)
         self.crawler_process = crawler_process
         self.spider_packages = spider_packages
+        self.default_spider_name = default_spider_name

     def resume(self, job_storage):
         @gen.coroutine
@@ -37,8 +39,10 @@ def _resume():

     def start(self, domain, args, settings, crawl_id=None):
         """ Create, start and return a crawler for a given domain. """
-        spider_cls = get_spider_cls(domain, self.spider_packages,
-                                    CrawlWebsiteSpider)
+        default_cls = find_spider_cls(
+            self.default_spider_name,
+            self.spider_packages + ['arachnado.spider'])
+        spider_cls = get_spider_cls(domain, self.spider_packages, default_cls)
         if not spider_cls:
             return
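
To make the intent of default_spider_name concrete: DomainCrawlers.start() now
resolves the default spider class by name at call time instead of hardcoding
CrawlWebsiteSpider. The sketch below imitates that lookup with toy versions of
find_spider_cls/get_spider_cls (the real helpers live in arachnado.utils.spiders
and import spiders from the configured packages; MyShopSpider and the example
URLs are made up for illustration):

    # Stand-in spider classes; the real ones come from the configured
    # spider_packages plus arachnado.spider.
    class CrawlWebsiteSpider:
        name = 'generic'

    class MyShopSpider:
        name = 'myshop'

    SPIDERS = [CrawlWebsiteSpider, MyShopSpider]


    def find_spider_cls(spider_name, spider_packages):
        """Toy lookup: the real helper imports spiders from the packages."""
        return next((cls for cls in SPIDERS if cls.name == spider_name), None)


    def get_spider_cls(url, spider_packages, default_cls):
        """Toy resolution: a 'spider://<name>' URL picks a named spider,
        any other URL falls back to the default class."""
        if url.startswith('spider://'):
            return find_spider_cls(url[len('spider://'):], spider_packages)
        return default_cls


    default_cls = find_spider_cls('generic', ['arachnado.spider'])
    print(get_spider_cls('http://example.com', [], default_cls))  # -> CrawlWebsiteSpider
    print(get_spider_cls('spider://myshop', [], default_cls))     # -> MyShopSpider

With the option left at its default value of "generic" the behaviour stays the same
as before; pointing default_spider_name at a custom spider makes plain URLs crawl
with that spider instead.
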
From f00e273262e3a2f8f4c4b800d5cdc734588edc5d Mon Sep 17 00:00:00 2001
From: Konstantin Lopuhin
Date: Fri, 10 Jun 2016 17:02:07 +0300
Subject: [PATCH 4/7] Unknown engine is now encoded as None, not "generic"

Now "generic" is always a concrete spider name, and if we cannot
determine the engine, we return None in the site_checker.
But since we could have some old sites with the "generic" engine, we
check for it too to avoid breaking them. The logic is that if the engine
is not set (or is "generic"), we resolve the spider class at runtime
using default_spider_name.
---
 arachnado/cron.py                       | 6 ++----
 arachnado/site_checker.py               | 2 +-
 arachnado/static/js/pages/SitesPage.jsx | 3 ++-
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/arachnado/cron.py b/arachnado/cron.py
index 7f45a6a..e192c1e 100644
--- a/arachnado/cron.py
+++ b/arachnado/cron.py
@@ -96,10 +96,8 @@ def start_crawl(self, id_):
         args = _key_value_to_dict(site.get('args', []))
         settings = _key_value_to_dict(site.get('settings', []))

-        if not site.get('engine'):
-            site['engine'] = 'generic'
-
-        if site['engine'] == 'generic':
+        # checking for == 'generic' to be backwards compatible
+        if not site.get('engine') or site['engine'] == 'generic':
             url = site['url']
         else:
             url = 'spider://' + site['engine']

diff --git a/arachnado/site_checker.py b/arachnado/site_checker.py
index 00630da..e450a57 100644
--- a/arachnado/site_checker.py
+++ b/arachnado/site_checker.py
@@ -165,5 +165,5 @@ def rerun_check(self, site):
     def detect_engine(self, body):
         result = self.detector.detect(body) if self.detector else None
         if result is None:
-            return 'generic', {}
+            return None, {}
         return result

diff --git a/arachnado/static/js/pages/SitesPage.jsx b/arachnado/static/js/pages/SitesPage.jsx
index 62198c6..e71bc1f 100644
--- a/arachnado/static/js/pages/SitesPage.jsx
+++ b/arachnado/static/js/pages/SitesPage.jsx
@@ -143,7 +143,8 @@ var SiteRow = React.createClass({
             args: keyValueListToDict(this.props.site.args)
         };

-        if(this.props.site.engine == 'generic') {
+        // checking for == 'generic' to be backwards compatible
+        if(!this.props.site.engine || this.props.site.engine == 'generic') {
             JobStore.Actions.startCrawl(this.props.site.url, options);
         } else {
             args.start_urls = [this.props.site.url];

From 8eb533d67f3db2be26f8af9ca05626a4a0b0cf8d Mon Sep 17 00:00:00 2001
From: Konstantin Lopuhin
Date: Tue, 14 Jun 2016 19:31:39 +0300
Subject: [PATCH 5/7] Fix ProxyFromSettingsMiddleware with auth

Setting auth_encoding is also done in HttpProxyMiddleware.__init__, and
it is required for proxies with authentication.
---
 arachnado/downloadermiddlewares/proxyfromsettings.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arachnado/downloadermiddlewares/proxyfromsettings.py b/arachnado/downloadermiddlewares/proxyfromsettings.py
index 410cbb1..521cd3c 100644
--- a/arachnado/downloadermiddlewares/proxyfromsettings.py
+++ b/arachnado/downloadermiddlewares/proxyfromsettings.py
@@ -11,6 +11,7 @@ def from_crawler(cls, crawler):

     def __init__(self, settings):
         self.proxies = {}
+        self.auth_encoding = settings.get('HTTPPROXY_AUTH_ENCODING')
         proxies = [
             ('http', settings.get('HTTP_PROXY')),
             ('https', settings.get('HTTPS_PROXY')),
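
Why the one-line auth_encoding fix above matters: the parent HttpProxyMiddleware
uses self.auth_encoding when it turns user:password credentials embedded in a
proxy URL into a Proxy-Authorization header, so a subclass that never sets the
attribute breaks on authenticated proxies. The helper below is a rough,
self-contained approximation of that step (the proxy URL, the credentials and
the 'latin-1' encoding are example values, not part of the patch):

    import base64
    from urllib.parse import unquote, urlsplit


    def proxy_auth_header(proxy_url, auth_encoding='latin-1'):
        """Roughly what the proxy middleware derives from user:pass in a proxy URL."""
        parts = urlsplit(proxy_url)
        if not parts.username:
            return None  # no credentials, no Proxy-Authorization header
        user_pass = '%s:%s' % (unquote(parts.username), unquote(parts.password or ''))
        return b'Basic ' + base64.b64encode(user_pass.encode(auth_encoding))


    # e.g. with HTTP_PROXY = 'http://user:secret@proxy.example.com:8080'
    print(proxy_auth_header('http://user:secret@proxy.example.com:8080'))
    # b'Basic dXNlcjpzZWNyZXQ='
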
From 57e2b62b98e2adfd485409d44e123c1a9d15a8e0 Mon Sep 17 00:00:00 2001
From: Konstantin Lopuhin
Date: Wed, 15 Jun 2016 15:27:22 +0300
Subject: [PATCH 6/7] Do not expose file VOLUME to avoid docker-compose problems

---
 Dockerfile | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 9c61714..9dcedb4 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -44,7 +44,9 @@ RUN pip3 install .
 # use e.g.
 #    -v /path/to/my/arachnado/config.conf:/etc/arachnado.conf
 # docker run option to override arachnado parameters
-VOLUME /etc/arachnado.conf
+# The VOLUME is not exposed here because Docker assumes that volumes are folders
+# (unless the file really exists), so this can cause problems in docker-compose
+# later (see https://github.com/docker/docker/issues/21702#issuecomment-221987049)

 # this folder is added to PYTHONPATH, so modules from there are available
 # for spider_packages Arachnado option

From 1d2222b1f9c5c9ce95bb97a7d5d0a9d8436377bb Mon Sep 17 00:00:00 2001
From: Konstantin Lopuhin
Date: Tue, 21 Jun 2016 11:17:12 +0300
Subject: [PATCH 7/7] Add a comment describing default_spider_name setting

---
 arachnado/config/defaults.conf | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arachnado/config/defaults.conf b/arachnado/config/defaults.conf
index 7a6e9bc..0357e35 100644
--- a/arachnado/config/defaults.conf
+++ b/arachnado/config/defaults.conf
@@ -30,6 +30,9 @@ DEPTH_LIMIT = 10

 ; Packages to load spiders from (separated by whitespace)
 spider_packages =
+; Name of the default spider. It is used for crawling if no custom spider
+; is specified or detected. It should support an API similar to
+; arachnado.spider.CrawlWebsiteSpider (which is the default here)
 default_spider_name = generic

 [arachnado.storage]