diff --git a/.gitignore b/.gitignore index 9d3e9a21a..dded327fa 100644 --- a/.gitignore +++ b/.gitignore @@ -36,4 +36,5 @@ nosetests.xml .mr.developer.cfg .project .pydevproject -.idea +config.json +LICENSE \ No newline at end of file diff --git a/.travis.yml b/.travis.yml index e5fbd98b1..74d673eb3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,44 +1,45 @@ language: python cache: pip python: - - 3.5 - 3.6 - 3.7 - #- 3.8 + - 3.8 + - 3.9 services: - - docker - - mongodb - - rabbitmq - - redis - - mysql - # - elasticsearch - - postgresql + - docker + - mongodb + - rabbitmq + - redis + - mysql + # - elasticsearch + - postgresql addons: postgresql: "9.4" apt: packages: - - rabbitmq-server + - rabbitmq-server env: - - IGNORE_COUCHDB=1 + - IGNORE_COUCHDB=1 before_install: - - sudo apt-get update -qq - - curl -O https://download.elastic.co/elasticsearch/release/org/elasticsearch/distribution/deb/elasticsearch/2.4.0/elasticsearch-2.4.0.deb && sudo dpkg -i --force-confnew elasticsearch-2.4.0.deb && sudo service elasticsearch restart - - npm install express puppeteer - - sudo docker pull scrapinghub/splash - - sudo docker run -d --net=host scrapinghub/splash -before_script: - - psql -c "CREATE DATABASE pyspider_test_taskdb ENCODING 'UTF8' TEMPLATE=template0;" -U postgres - - psql -c "CREATE DATABASE pyspider_test_projectdb ENCODING 'UTF8' TEMPLATE=template0;" -U postgres - - psql -c "CREATE DATABASE pyspider_test_resultdb ENCODING 'UTF8' TEMPLATE=template0;" -U postgres - - sleep 10 + - sudo apt-get update -qq + - curl -O https://download.elastic.co/elasticsearch/release/org/elasticsearch/distribution/deb/elasticsearch/2.4.0/elasticsearch-2.4.0.deb && sudo dpkg -i --force-confnew elasticsearch-2.4.0.deb && sudo service elasticsearch restart + - npm install express puppeteer + - sudo docker pull scrapinghub/splash + - sudo docker run -d --net=host scrapinghub/splash install: - - pip install https://github.com/marcus67/easywebdav/archive/master.zip - - sudo apt-get install libgnutls28-dev - - pip install -e .[all,test] - - pip install coveralls + - pip install https://github.com/marcus67/easywebdav/archive/master.zip + - sudo apt-get install libgnutls28-dev + - pip install -e .[all,test] + - pip install coveralls +before_script: + - psql -c "CREATE DATABASE pyspider_test_taskdb ENCODING 'UTF8' TEMPLATE=template0;" -U postgres + - psql -c "CREATE DATABASE pyspider_test_projectdb ENCODING 'UTF8' TEMPLATE=template0;" -U postgres + - psql -c "CREATE DATABASE pyspider_test_resultdb ENCODING 'UTF8' TEMPLATE=template0;" -U postgres + - sleep 10 + script: - - coverage run setup.py test + - coverage run setup.py test after_success: - - coverage combine - - coveralls + - coverage combine + - coveralls diff --git a/Dockerfile b/Dockerfile index feac31b1b..62ca5564d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.6 +FROM python:3.7 MAINTAINER binux # install phantomjs @@ -22,7 +22,7 @@ RUN npm install puppeteer express # install requirements COPY requirements.txt /opt/pyspider/requirements.txt -RUN pip install -r /opt/pyspider/requirements.txt +RUN pip install --no-cache-dir -r /opt/pyspider/requirements.txt # add all repo ADD ./ /opt/pyspider diff --git a/LICENSE b/LICENSE.txt similarity index 100% rename from LICENSE rename to LICENSE.txt diff --git a/README.md b/README.md index 1dc169585..274b5d7da 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ A Powerful Spider(Web Crawler) System in Python. 
- [MySQL](https://www.mysql.com/), [MongoDB](https://www.mongodb.org/), [Redis](http://redis.io/), [SQLite](https://www.sqlite.org/), [Elasticsearch](https://www.elastic.co/products/elasticsearch); [PostgreSQL](http://www.postgresql.org/) with [SQLAlchemy](http://www.sqlalchemy.org/) as database backend - [RabbitMQ](http://www.rabbitmq.com/), [Redis](http://redis.io/) and [Kombu](http://kombu.readthedocs.org/) as message queue - Task priority, retry, periodical, recrawl by age, etc... -- Distributed architecture, Crawl Javascript pages, Python 2.{6,7}, 3.{3,4,5,6} support, etc... +- Distributed architecture, Crawl Javascript pages, Python 3.{6, 7, 8, 9} support, etc... Tutorial: [http://docs.pyspider.org/en/latest/tutorial/](http://docs.pyspider.org/en/latest/tutorial/) Documentation: [http://docs.pyspider.org/](http://docs.pyspider.org/) @@ -18,7 +18,7 @@ Sample Code ----------- ```python -from pyspider.libs.base_handler import * +from pyspider.libs.base_handler import BaseHandler, config, every class Handler(BaseHandler): @@ -66,6 +66,12 @@ TODO ### v0.4.0 +- [ ] Support Python 3.9, drop Python versions below 3.6, try my best to fix bugs +- [ ] fix Travis CI and coverage +- [ ] review the Docker setup + +### v0.5.0 + - [ ] a visual scraping interface like [portia](https://github.com/scrapinghub/portia) diff --git a/docker-compose.yaml b/docker-compose.yaml index 983fc566d..bd132fd02 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -1,4 +1,4 @@ -version: "3.7" +version: "3.3" # replace /path/to/dir/ to point to config.json diff --git a/docs/conf.py b/docs/conf.py index e53f785fe..25e14967b 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me @@ -7,18 +6,20 @@ import sys from unittest.mock import MagicMock + from recommonmark.parser import CommonMarkParser + class Mock(MagicMock): @classmethod def __getattr__(cls, name): - return Mock() + return Mock() MOCK_MODULES = ['pycurl', 'lxml', 'psycopg2'] sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES) source_parsers = { - '.md': CommonMarkParser, + '.md': CommonMarkParser, } source_suffix = ['.rst', '.md'] diff --git a/pyspider/__init__.py b/pyspider/__init__.py index 700f8fc7f..da6af2f8a 100644 --- a/pyspider/__init__.py +++ b/pyspider/__init__.py @@ -1,8 +1,7 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-11-17 19:17:12 -__version__ = '0.4.0' +__version__ = '0.4.1' diff --git a/pyspider/database/__init__.py b/pyspider/database/__init__.py index 04755b904..bf46c0476 100644 --- a/pyspider/database/__init__.py +++ b/pyspider/database/__init__.py @@ -1,12 +1,14 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-10-08 15:04:08 -import os, requests, json -from six.moves.urllib.parse import urlparse, parse_qs +import json +import os + +import requests +from six.moves.urllib.parse import parse_qs, urlparse def connect_database(url): @@ -60,16 +62,15 @@ def _connect_database(url): # NOQA other_scheme = "+".join(scheme[1:-1]) if dbtype not in ('taskdb', 'projectdb', 'resultdb'): - raise LookupError('unknown database type: %s, ' - 'type should be one of ["taskdb", "projectdb", "resultdb"]', dbtype) + raise LookupError(f'unknown database type: {dbtype}, type should be one of ["taskdb", "projectdb", "resultdb"]') if
engine == 'mysql': - return _connect_mysql(parsed,dbtype) + return _connect_mysql(parsed, dbtype) elif engine == 'sqlite': - return _connect_sqlite(parsed,dbtype) + return _connect_sqlite(parsed, dbtype) elif engine == 'mongodb': - return _connect_mongodb(parsed,dbtype,url) + return _connect_mongodb(parsed, dbtype, url) elif engine == 'sqlalchemy': return _connect_sqlalchemy(parsed, dbtype, url, other_scheme) @@ -88,7 +89,7 @@ def _connect_database(url): # NOQA from .local.projectdb import ProjectDB return ProjectDB(scripts) else: - raise LookupError('not supported dbtype: %s', dbtype) + raise LookupError(f'not supported dbtype: {dbtype}') elif engine == 'elasticsearch' or engine == 'es': return _connect_elasticsearch(parsed, dbtype) @@ -96,36 +97,36 @@ def _connect_database(url): # NOQA return _connect_couchdb(parsed, dbtype, url) else: - raise Exception('unknown engine: %s' % engine) + raise Exception(f'unknown engine: {engine}') -def _connect_mysql(parsed,dbtype): - parames = {} +def _connect_mysql(parsed, dbtype): + params = dict() if parsed.username: - parames['user'] = parsed.username + params['user'] = parsed.username if parsed.password: - parames['passwd'] = parsed.password + params['passwd'] = parsed.password if parsed.hostname: - parames['host'] = parsed.hostname + params['host'] = parsed.hostname if parsed.port: - parames['port'] = parsed.port + params['port'] = parsed.port if parsed.path.strip('/'): - parames['database'] = parsed.path.strip('/') + params['database'] = parsed.path.strip('/') if dbtype == 'taskdb': from .mysql.taskdb import TaskDB - return TaskDB(**parames) + return TaskDB(**params) elif dbtype == 'projectdb': from .mysql.projectdb import ProjectDB - return ProjectDB(**parames) + return ProjectDB(**params) elif dbtype == 'resultdb': from .mysql.resultdb import ResultDB - return ResultDB(**parames) + return ResultDB(**params) else: raise LookupError -def _connect_sqlite(parsed,dbtype): +def _connect_sqlite(parsed, dbtype): if parsed.path.startswith('//'): path = '/' + parsed.path.strip('/') elif parsed.path.startswith('/'): @@ -148,7 +149,7 @@ def _connect_sqlite(parsed,dbtype): raise LookupError -def _connect_mongodb(parsed,dbtype,url): +def _connect_mongodb(parsed, dbtype, url): url = url.replace(parsed.scheme, 'mongodb') parames = {} if parsed.path.strip('/'): @@ -167,7 +168,7 @@ def _connect_mongodb(parsed,dbtype,url): raise LookupError -def _connect_sqlalchemy(parsed, dbtype,url, other_scheme): +def _connect_sqlalchemy(parsed, dbtype, url, other_scheme): if not other_scheme: raise Exception('wrong scheme format: %s' % parsed.scheme) url = url.replace(parsed.scheme, other_scheme) diff --git a/pyspider/database/base/projectdb.py b/pyspider/database/base/projectdb.py index 7f02c7426..0786c4c36 100644 --- a/pyspider/database/base/projectdb.py +++ b/pyspider/database/base/projectdb.py @@ -1,14 +1,15 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-02-09 11:28:52 import re +from typing import Dict # NOTE: When get/get_all/check_update from database with default fields, # all following fields should be included in output dict. 
+ { 'project': { 'name': str, @@ -34,10 +35,10 @@ class ProjectDB(object): 'RUNNING', ] - def insert(self, name, obj={}): + def insert(self, name, obj: Dict = None): raise NotImplementedError - def update(self, name, obj={}, **kwargs): + def update(self, name, obj: Dict = None, **kwargs): raise NotImplementedError def get_all(self, fields=None): @@ -54,9 +55,9 @@ def check_update(self, timestamp, fields=None): def split_group(self, group, lower=True): if lower: - return re.split("\W+", (group or '').lower()) + return re.split(r"\W+", (group or '').lower()) else: - return re.split("\W+", group or '') + return re.split(r"\W+", group or '') def verify_project_name(self, name): if len(name) > 64: diff --git a/pyspider/database/base/resultdb.py b/pyspider/database/base/resultdb.py index aa29afd35..dd401a71e 100644 --- a/pyspider/database/base/resultdb.py +++ b/pyspider/database/base/resultdb.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me diff --git a/pyspider/database/base/taskdb.py b/pyspider/database/base/taskdb.py index b698a8210..d5cf4765f 100644 --- a/pyspider/database/base/taskdb.py +++ b/pyspider/database/base/taskdb.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me @@ -76,10 +75,14 @@ def status_count(self, project): ''' raise NotImplementedError - def insert(self, project, taskid, obj={}): + def insert(self, project, taskid, obj: dict = None): + if obj is None: + obj = dict() raise NotImplementedError - def update(self, project, taskid, obj={}, **kwargs): + def update(self, project, taskid, obj: dict = None, **kwargs): + if obj is None: + obj = dict() raise NotImplementedError def drop(self, project): diff --git a/pyspider/database/basedb.py b/pyspider/database/basedb.py index ca71d6d2c..c8d918c24 100644 --- a/pyspider/database/basedb.py +++ b/pyspider/database/basedb.py @@ -1,21 +1,22 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2012-08-30 17:43:49 -from __future__ import unicode_literals, division, absolute_import +from __future__ import absolute_import, division, unicode_literals import logging -logger = logging.getLogger('database.basedb') +from typing import List from six import itervalues + from pyspider.libs import utils +logger = logging.getLogger('database.basedb') -class BaseDB: +class BaseDB: ''' BaseDB @@ -33,12 +34,14 @@ def escape(string): def dbcur(self): raise NotImplementedError - def _execute(self, sql_query, values=[]): + def _execute(self, sql_query, values: List = None): dbcur = self.dbcur + if values is None: + values = list() dbcur.execute(sql_query, values) return dbcur - def _select(self, tablename=None, what="*", where="", where_values=[], offset=0, limit=None): + def _select(self, tablename=None, what="*", where="", where_values: List = None, offset: int = 0, limit=None): tablename = self.escape(tablename or self.__tablename__) if isinstance(what, list) or isinstance(what, tuple) or what is None: what = ','.join(self.escape(f) for f in what) if what else '*' @@ -51,14 +54,15 @@ def _select(self, tablename=None, what="*", where="", where_values=[], offset=0, elif offset: sql_query += " LIMIT %d, %d" % (offset, self.maxlimit) logger.debug("", sql_query) - + if where_values is None: + where_values = list() for row in self._execute(sql_query, where_values): 
yield row - def _select2dic(self, tablename=None, what="*", where="", where_values=[], + def _select2dic(self, tablename=None, what="*", where="", where_values: List = None, order=None, offset=0, limit=None): tablename = self.escape(tablename or self.__tablename__) - if isinstance(what, list) or isinstance(what, tuple) or what is None: + if isinstance(what, (list, tuple)) or what is None: what = ','.join(self.escape(f) for f in what) if what else '*' sql_query = "SELECT %s FROM %s" % (what, tablename) @@ -71,7 +75,8 @@ def _select2dic(self, tablename=None, what="*", where="", where_values=[], elif offset: sql_query += " LIMIT %d, %d" % (offset, self.maxlimit) logger.debug("", sql_query) - + if where_values is None: + where_values = list() dbcur = self._execute(sql_query, where_values) # f[0] may return bytes type @@ -113,28 +118,32 @@ def _insert(self, tablename=None, **values): dbcur = self._execute(sql_query) return dbcur.lastrowid - def _update(self, tablename=None, where="1=0", where_values=[], **values): + def _update(self, tablename=None, where="1=0", where_values: List = None, **values): tablename = self.escape(tablename or self.__tablename__) _key_values = ", ".join([ "%s = %s" % (self.escape(k), self.placeholder) for k in values ]) sql_query = "UPDATE %s SET %s WHERE %s" % (tablename, _key_values, where) logger.debug("", sql_query) - + if where_values is None: + where_values = list() return self._execute(sql_query, list(itervalues(values)) + list(where_values)) - def _delete(self, tablename=None, where="1=0", where_values=[]): + def _delete(self, tablename=None, where="1=0", where_values: List = None): tablename = self.escape(tablename or self.__tablename__) sql_query = "DELETE FROM %s" % tablename if where: sql_query += " WHERE %s" % where logger.debug("", sql_query) - + if where_values is None: + where_values = list() return self._execute(sql_query, where_values) + if __name__ == "__main__": import sqlite3 + class DB(BaseDB): __tablename__ = "test" placeholder = "?" 
@@ -151,6 +160,7 @@ def __init__(self): def dbcur(self): return self.conn.cursor() + db = DB() assert db._insert(db.__tablename__, name="binux", age=23) == 1 assert db._select(db.__tablename__, "name, age").next() == ("binux", 23) @@ -161,4 +171,4 @@ def dbcur(self): db._update(db.__tablename__, "id = 1", age=16) assert db._select(db.__tablename__, "name, age").next() == (None, 16) db._delete(db.__tablename__, "id = 1") - assert [row for row in db._select(db.__tablename__)] == [] + assert not [row for row in db._select(db.__tablename__)] diff --git a/pyspider/database/couchdb/couchdbbase.py b/pyspider/database/couchdb/couchdbbase.py index 13eb7fb57..27c48c2bd 100644 --- a/pyspider/database/couchdb/couchdbbase.py +++ b/pyspider/database/couchdb/couchdbbase.py @@ -1,6 +1,9 @@ -import time, requests, json +import time + +import requests from requests.auth import HTTPBasicAuth + class SplitTableMixin(object): UPDATE_PROJECTS_TIME = 10 * 60 @@ -16,19 +19,16 @@ def _collection_name(self, project): else: return project - @property def projects(self): if time.time() - getattr(self, '_last_update_projects', 0) > self.UPDATE_PROJECTS_TIME: self._list_project() return self._projects - @projects.setter def projects(self, value): self._projects = value - def _list_project(self): self._last_update_projects = time.time() self.projects = set() @@ -43,17 +43,18 @@ def _list_project(self): if each.startswith('_'): continue if each.startswith(self.database): - self.projects.add(each[len(self.database)+1+len(prefix):]) - + self.projects.add(each[len(self.database) + 1 + len(prefix):]) def create_database(self, name): url = self.base_url + name res = self.session.put(url).json() if 'error' in res and res['error'] == 'unauthorized': - raise Exception("Supplied credentials are incorrect. Reason: {} for User: {} Password: {}".format(res['reason'], self.username, self.password)) + raise Exception( "Supplied credentials are incorrect. 
Reason: {} for User: {} Password: {}".format(res['reason'], + self.username, + self.password)) return res - def get_doc(self, db_name, doc_id): url = self.base_url + db_name + "/" + doc_id res = self.session.get(url).json() @@ -61,7 +62,6 @@ def get_doc(self, db_name, doc_id): return None return res - def get_docs(self, db_name, selector): url = self.base_url + db_name + "/_find" selector['use_index'] = self.index @@ -70,16 +70,13 @@ def get_docs(self, db_name, selector): return [] return res['docs'] - def get_all_docs(self, db_name): return self.get_docs(db_name, {"selector": {}}) - def insert_doc(self, db_name, doc_id, doc): url = self.base_url + db_name + "/" + doc_id return self.session.put(url, json=doc).json() - def update_doc(self, db_name, doc_id, new_doc): doc = self.get_doc(db_name, doc_id) if doc is None: @@ -89,7 +86,5 @@ def update_doc(self, db_name, doc_id, new_doc): url = self.base_url + db_name + "/" + doc_id return self.session.put(url, json=doc).json() - def delete(self, url): return self.session.delete(url).json() - diff --git a/pyspider/database/couchdb/projectdb.py b/pyspider/database/couchdb/projectdb.py index 17c1f6ff3..d2d57c0f7 100644 --- a/pyspider/database/couchdb/projectdb.py +++ b/pyspider/database/couchdb/projectdb.py @@ -1,5 +1,9 @@ -import time, requests, json +import time +from typing import Dict + +import requests from requests.auth import HTTPBasicAuth + from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB @@ -46,19 +50,22 @@ def _default_fields(self, each): each.setdefault('updatetime', 0) return each - def insert(self, name, obj={}): + def insert(self, name, obj: Dict = None): url = self.url + name - obj = dict(obj) + if obj is None: + obj = dict() obj['name'] = name obj['updatetime'] = time.time() res = self.session.put(url, json=obj).json() return res - def update(self, name, obj={}, **kwargs): + def update(self, name, obj: Dict = None, **kwargs): # object contains the fields to update and their new values - update = self.get(name) # update will contain _rev + update = self.get(name) # update will contain _rev if update is None: return None + if obj is None: + obj = dict() obj = dict(obj) obj['updatetime'] = time.time() obj.update(kwargs) diff --git a/pyspider/database/couchdb/resultdb.py b/pyspider/database/couchdb/resultdb.py index 163a6c17b..7546caeb7 100644 --- a/pyspider/database/couchdb/resultdb.py +++ b/pyspider/database/couchdb/resultdb.py @@ -1,5 +1,8 @@ -import time, json +import json +import time + from pyspider.database.base.resultdb import ResultDB as BaseResultDB + from .couchdbbase import SplitTableMixin diff --git a/pyspider/database/couchdb/taskdb.py b/pyspider/database/couchdb/taskdb.py index 9110be82a..6d258d75a 100644 --- a/pyspider/database/couchdb/taskdb.py +++ b/pyspider/database/couchdb/taskdb.py @@ -1,5 +1,8 @@ -import json, time +import json +import time + from pyspider.database.base.taskdb import TaskDB as BaseTaskDB + from .couchdbbase import SplitTableMixin diff --git a/pyspider/database/elasticsearch/__init__.py b/pyspider/database/elasticsearch/__init__.py index 816f8dc36..07842692d 100644 --- a/pyspider/database/elasticsearch/__init__.py +++ b/pyspider/database/elasticsearch/__init__.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me diff --git a/pyspider/database/elasticsearch/projectdb.py b/pyspider/database/elasticsearch/projectdb.py index 326657f55..13574afbd 100644 --- 
a/pyspider/database/elasticsearch/projectdb.py +++ b/pyspider/database/elasticsearch/projectdb.py @@ -1,14 +1,15 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2016-01-17 18:32:33 import time +from typing import Dict import elasticsearch.helpers from elasticsearch import Elasticsearch + from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB @@ -19,7 +20,7 @@ def __init__(self, hosts, index='pyspider'): self.index = index self.es = Elasticsearch(hosts=hosts) - self.es.indices.create(index=self.index, ignore=400) + self.es.indices.create(index=self.index) if not self.es.indices.get_mapping(index=self.index, doc_type=self.__type__): self.es.indices.put_mapping(index=self.index, doc_type=self.__type__, body={ "_all": {"enabled": False}, @@ -28,8 +29,9 @@ } }) - def insert(self, name, obj={}): - obj = dict(obj) + def insert(self, name, obj: Dict = None): + if obj is None: + obj = dict() obj['name'] = name obj['updatetime'] = time.time() @@ -40,15 +42,15 @@ def insert(self, name, obj={}): obj.setdefault('rate', 0) obj.setdefault('burst', 0) - return self.es.index(index=self.index, doc_type=self.__type__, body=obj, id=name, - refresh=True) + return self.es.index(index=self.index, doc_type=self.__type__, body=obj, id=name) - def update(self, name, obj={}, **kwargs): - obj = dict(obj) + def update(self, name, obj: Dict = None, **kwargs): + if obj is None: + obj = dict() obj.update(kwargs) obj['updatetime'] = time.time() return self.es.update(index=self.index, doc_type=self.__type__, - body={'doc': obj}, id=name, refresh=True, ignore=404) + body={'doc': obj}, id=name) def get_all(self, fields=None): for record in elasticsearch.helpers.scan(self.es, index=self.index, doc_type=self.__type__, @@ -57,8 +59,7 @@ def get_all(self, fields=None): yield record['_source'] def get(self, name, fields=None): - ret = self.es.get(index=self.index, doc_type=self.__type__, id=name, - _source_include=fields or [], ignore=404) + ret = self.es.get(index=self.index, doc_type=self.__type__, id=name) return ret.get('_source', None) def check_update(self, timestamp, fields=None): @@ -69,4 +70,4 @@ def check_update(self, timestamp, fields=None): yield record['_source'] def drop(self, name): - return self.es.delete(index=self.index, doc_type=self.__type__, id=name, refresh=True) + return self.es.delete(index=self.index, doc_type=self.__type__, id=name) diff --git a/pyspider/database/elasticsearch/resultdb.py b/pyspider/database/elasticsearch/resultdb.py index c6a3de373..e4b777f08 100644 --- a/pyspider/database/elasticsearch/resultdb.py +++ b/pyspider/database/elasticsearch/resultdb.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me @@ -10,6 +9,7 @@ import elasticsearch.helpers from elasticsearch import Elasticsearch + from pyspider.database.base.resultdb import ResultDB as BaseResultDB @@ -18,9 +18,9 @@ class ResultDB(BaseResultDB): def __init__(self, hosts, index='pyspider'): self.index = index - self.es = Elasticsearch(hosts=hosts) + self.es: Elasticsearch = Elasticsearch(hosts=hosts) - self.es.indices.create(index=self.index, ignore=400) + self.es.indices.create(index=self.index) if not self.es.indices.get_mapping(index=self.index, doc_type=self.__type__): self.es.indices.put_mapping(index=self.index, doc_type=self.__type__, body={ "_all": {"enabled": True}, @@ -36,7 
+36,7 @@ def projects(self): ret = self.es.search(index=self.index, doc_type=self.__type__, body={"aggs": {"projects": { "terms": {"field": "project"} - }}}, _source=False) + }}}) return [each['key'] for each in ret['aggregations']['projects'].get('buckets', [])] def save(self, project, taskid, url, result): @@ -62,7 +62,7 @@ def select(self, project, fields=None, offset=0, limit=0): else: for record in self.es.search(index=self.index, doc_type=self.__type__, body={'query': {'term': {'project': project}}}, - _source_include=fields or [], from_=offset, size=limit, + from_=offset, size=limit, sort="updatetime:desc" ).get('hits', {}).get('hits', []): yield record['_source'] diff --git a/pyspider/database/elasticsearch/taskdb.py b/pyspider/database/elasticsearch/taskdb.py index b6b980273..0cc8edd88 100644 --- a/pyspider/database/elasticsearch/taskdb.py +++ b/pyspider/database/elasticsearch/taskdb.py @@ -1,16 +1,17 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2016-01-20 20:20:55 -import time import json +import time +from typing import Dict import elasticsearch.helpers from elasticsearch import Elasticsearch + from pyspider.database.base.taskdb import TaskDB as BaseTaskDB @@ -91,18 +92,20 @@ def status_count(self, project): result[each['key']] = each['doc_count'] return result - def insert(self, project, taskid, obj={}): + def insert(self, project, taskid, obj: Dict = None): self._changed = True - obj = dict(obj) + if obj is None: + obj = dict() obj['taskid'] = taskid obj['project'] = project obj['updatetime'] = time.time() return self.es.index(index=self.index, doc_type=self.__type__, body=self._stringify(obj), id='%s:%s' % (project, taskid)) - def update(self, project, taskid, obj={}, **kwargs): + def update(self, project, taskid, obj: Dict = None, **kwargs): self._changed = True - obj = dict(obj) + if obj is None: + obj = dict() obj.update(kwargs) obj['updatetime'] = time.time() return self.es.update(index=self.index, doc_type=self.__type__, id='%s:%s' % (project, taskid), diff --git a/pyspider/database/local/__init__.py b/pyspider/database/local/__init__.py index 9966b5939..cdbb1283f 100644 --- a/pyspider/database/local/__init__.py +++ b/pyspider/database/local/__init__.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me diff --git a/pyspider/database/local/projectdb.py b/pyspider/database/local/projectdb.py index 835fe5a56..43a15d0fb 100644 --- a/pyspider/database/local/projectdb.py +++ b/pyspider/database/local/projectdb.py @@ -1,15 +1,15 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2015-01-17 12:32:17 +import glob +import logging import os import re + import six -import glob -import logging from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB @@ -68,13 +68,13 @@ def _build_project(self, filename): 'burst': burst, 'updatetime': os.path.getmtime(filename), } - except OSError as e: - logging.error('loading project script error: %s', e) + except OSError as err: + logging.error('loading project script error: %s', err) return None def get_all(self, fields=None): - for projectname in self.projects: - yield self.get(projectname, fields) + for project_name in self.projects: + yield self.get(project_name, fields) def get(self, name, fields=None): if name not in self.projects: 
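The `insert`/`update` signature changes that recur through the database backends above all replace the mutable default `obj={}` with `obj: Dict = None` plus an explicit check. A minimal sketch of the pitfall being avoided, assuming nothing about pyspider itself (`Store` is a hypothetical stand-in, not a pyspider class): a default argument is evaluated once at function definition time, so a default dict is shared by every call that omits the argument.

```python
# Hypothetical Store class illustrating the shared-default pitfall;
# not a pyspider API.
class Store:
    def insert_shared(self, name, obj={}):   # default dict created only once
        obj[name] = True
        return obj

    def insert_fresh(self, name, obj=None):  # the pattern used in this diff
        if obj is None:
            obj = dict()                     # new dict on every call
        obj[name] = True
        return obj


s = Store()
s.insert_shared('a')
print(s.insert_shared('b'))  # {'a': True, 'b': True} -- 'a' leaked across calls
s.insert_fresh('a')
print(s.insert_fresh('b'))   # {'b': True}
```

One side effect worth noting: where the old body was `obj = dict(obj)` (a defensive copy), replacing it with only the `None` check drops the copy for caller-supplied dicts, while the couchdb `update` above keeps both lines and so preserves the old behaviour.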
diff --git a/pyspider/database/mongodb/mongodbbase.py b/pyspider/database/mongodb/mongodbbase.py index 5815904b3..f3bbbc16e 100644 --- a/pyspider/database/mongodb/mongodbbase.py +++ b/pyspider/database/mongodb/mongodbbase.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me diff --git a/pyspider/database/mongodb/projectdb.py b/pyspider/database/mongodb/projectdb.py index 20d0426c8..cd5b58a27 100644 --- a/pyspider/database/mongodb/projectdb.py +++ b/pyspider/database/mongodb/projectdb.py @@ -1,11 +1,11 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-10-12 12:22:42 import time + from pymongo import MongoClient from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB diff --git a/pyspider/database/mongodb/resultdb.py b/pyspider/database/mongodb/resultdb.py index 7039750a9..248ab51b5 100644 --- a/pyspider/database/mongodb/resultdb.py +++ b/pyspider/database/mongodb/resultdb.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me @@ -11,6 +10,7 @@ from pymongo import MongoClient from pyspider.database.base.resultdb import ResultDB as BaseResultDB + from .mongodbbase import SplitTableMixin diff --git a/pyspider/database/mongodb/taskdb.py b/pyspider/database/mongodb/taskdb.py index 5b65ba6ea..676d98ab5 100644 --- a/pyspider/database/mongodb/taskdb.py +++ b/pyspider/database/mongodb/taskdb.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me @@ -11,6 +10,7 @@ from pymongo import MongoClient from pyspider.database.base.taskdb import TaskDB as BaseTaskDB + from .mongodbbase import SplitTableMixin diff --git a/pyspider/database/mysql/__init__.py b/pyspider/database/mysql/__init__.py index 45724e964..cdba29225 100644 --- a/pyspider/database/mysql/__init__.py +++ b/pyspider/database/mysql/__init__.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me diff --git a/pyspider/database/mysql/mysqlbase.py b/pyspider/database/mysql/mysqlbase.py index 9dfc1aa0e..b8a273284 100644 --- a/pyspider/database/mysql/mysqlbase.py +++ b/pyspider/database/mysql/mysqlbase.py @@ -1,11 +1,11 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-11-05 10:42:24 import time + import mysql.connector diff --git a/pyspider/database/mysql/projectdb.py b/pyspider/database/mysql/projectdb.py index 94e388e24..cad6cb390 100644 --- a/pyspider/database/mysql/projectdb.py +++ b/pyspider/database/mysql/projectdb.py @@ -1,15 +1,16 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-07-17 21:06:43 import time + import mysql.connector from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB from pyspider.database.basedb import BaseDB + from .mysqlbase import MySQLMixin diff --git a/pyspider/database/mysql/resultdb.py b/pyspider/database/mysql/resultdb.py index 3fb50b68f..486f8f80e 100644 --- a/pyspider/database/mysql/resultdb.py +++ b/pyspider/database/mysql/resultdb.py @@ -1,19 +1,20 @@ #!/usr/bin/env python -# -*- encoding: utf-8 
-*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-10-13 22:02:57 +import json import re -import six import time -import json + import mysql.connector +import six -from pyspider.libs import utils from pyspider.database.base.resultdb import ResultDB as BaseResultDB from pyspider.database.basedb import BaseDB +from pyspider.libs import utils + from .mysqlbase import MySQLMixin, SplitTableMixin diff --git a/pyspider/database/mysql/taskdb.py b/pyspider/database/mysql/taskdb.py index 90e97a8ac..5e0a2aa48 100644 --- a/pyspider/database/mysql/taskdb.py +++ b/pyspider/database/mysql/taskdb.py @@ -1,20 +1,22 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- + # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-07-17 18:53:01 +import json import re -import six import time -import json + import mysql.connector +import six -from pyspider.libs import utils from pyspider.database.base.taskdb import TaskDB as BaseTaskDB from pyspider.database.basedb import BaseDB +from pyspider.libs import utils + from .mysqlbase import MySQLMixin, SplitTableMixin diff --git a/pyspider/database/redis/__init__.py b/pyspider/database/redis/__init__.py index 181c4e734..61e9b2487 100644 --- a/pyspider/database/redis/__init__.py +++ b/pyspider/database/redis/__init__.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me diff --git a/pyspider/database/redis/taskdb.py b/pyspider/database/redis/taskdb.py index c6125b6ea..f67885584 100644 --- a/pyspider/database/redis/taskdb.py +++ b/pyspider/database/redis/taskdb.py @@ -1,19 +1,19 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2015-05-16 21:01:52 -import six -import time +import itertools import json -import redis import logging -import itertools +import time + +import redis +import six -from pyspider.libs import utils from pyspider.database.base.taskdb import TaskDB as BaseTaskDB +from pyspider.libs import utils class TaskDB(BaseTaskDB): diff --git a/pyspider/database/sqlalchemy/__init__.py b/pyspider/database/sqlalchemy/__init__.py index d0548d60e..1ac63133b 100644 --- a/pyspider/database/sqlalchemy/__init__.py +++ b/pyspider/database/sqlalchemy/__init__.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me diff --git a/pyspider/database/sqlalchemy/projectdb.py b/pyspider/database/sqlalchemy/projectdb.py index 18e323c1d..4921ef318 100644 --- a/pyspider/database/sqlalchemy/projectdb.py +++ b/pyspider/database/sqlalchemy/projectdb.py @@ -1,18 +1,20 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-12-04 23:25:10 -import six import time -import sqlalchemy.exc -from sqlalchemy import create_engine, MetaData, Table, Column, String, Float, Text +import six +import sqlalchemy.exc +from sqlalchemy import (Column, Float, MetaData, String, Table, Text, + create_engine) from sqlalchemy.engine.url import make_url -from pyspider.libs import utils + from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB +from pyspider.libs import utils + from .sqlalchemybase import result2dict @@ -35,17 +37,14 @@ def __init__(self, url): self.url = make_url(url) if self.url.database: - database = 
self.url.database - self.url.database = None try: - engine = create_engine(self.url, convert_unicode=True, pool_recycle=3600) + engine = create_engine(self.url, pool_recycle=3600) conn = engine.connect() conn.execute("commit") - conn.execute("CREATE DATABASE %s" % database) + conn.execute("CREATE DATABASE %s" % self.url.database) except sqlalchemy.exc.SQLAlchemyError: pass - self.url.database = database - self.engine = create_engine(url, convert_unicode=True, pool_recycle=3600) + self.engine = create_engine(url, pool_recycle=3600) self.table.create(self.engine, checkfirst=True) @staticmethod @@ -56,14 +55,18 @@ def _parse(data): @staticmethod def _stringify(data): return data - def insert(self, name, obj={}): + def insert(self, name, obj: dict = None): + if obj is None: + obj = dict() obj = dict(obj) obj['name'] = name obj['updatetime'] = time.time() return self.engine.execute(self.table.insert() .values(**self._stringify(obj))) - def update(self, name, obj={}, **kwargs): + def update(self, name, obj: dict = None, **kwargs): + if obj is None: + obj = dict() obj = dict(obj) obj.update(kwargs) obj['updatetime'] = time.time() @@ -74,15 +75,15 @@ def update(self, name, obj={}, **kwargs): def get_all(self, fields=None): columns = [getattr(self.table.c, f, f) for f in fields] if fields else self.table.c for task in self.engine.execute(self.table.select() - .with_only_columns(columns)): + .with_only_columns(columns)): yield self._parse(result2dict(columns, task)) def get(self, name, fields=None): columns = [getattr(self.table.c, f, f) for f in fields] if fields else self.table.c for task in self.engine.execute(self.table.select() - .where(self.table.c.name == name) - .limit(1) - .with_only_columns(columns)): + .where(self.table.c.name == name) + .limit(1) + .with_only_columns(columns)): return self._parse(result2dict(columns, task)) def drop(self, name): @@ -92,6 +93,6 @@ def check_update(self, timestamp, fields=None): columns = [getattr(self.table.c, f, f) for f in fields] if fields else self.table.c for task in self.engine.execute(self.table.select() - .with_only_columns(columns) - .where(self.table.c.updatetime >= timestamp)): + .with_only_columns(columns) + .where(self.table.c.updatetime >= timestamp)): yield self._parse(result2dict(columns, task)) diff --git a/pyspider/database/sqlalchemy/resultdb.py b/pyspider/database/sqlalchemy/resultdb.py index 8f91f6b49..c56c525e5 100644 --- a/pyspider/database/sqlalchemy/resultdb.py +++ b/pyspider/database/sqlalchemy/resultdb.py @@ -1,21 +1,22 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-12-04 18:48:15 +import json import re -import six import time -import json -import sqlalchemy.exc -from sqlalchemy import (create_engine, MetaData, Table, Column, - String, Float, Text) +import six +import sqlalchemy.exc +from sqlalchemy import (Column, Float, MetaData, String, Table, Text, + create_engine) from sqlalchemy.engine.url import make_url + from pyspider.database.base.resultdb import ResultDB as BaseResultDB from pyspider.libs import utils + from .sqlalchemybase import SplitTableMixin, result2dict @@ -34,18 +35,14 @@ def __init__(self, url): self.url = make_url(url) if self.url.database: - database = self.url.database - self.url.database = None try: - engine = create_engine(self.url, convert_unicode=True, pool_recycle=3600) + engine = create_engine(self.url, pool_recycle=3600) conn = engine.connect() conn.execute("commit") - conn.execute("CREATE DATABASE %s" % database) + 
conn.execute("CREATE DATABASE %s" % self.url.database) except sqlalchemy.exc.SQLAlchemyError: pass - self.url.database = database - self.engine = create_engine(url, convert_unicode=True, - pool_recycle=3600) + self.engine = create_engine(url, pool_recycle=3600) self._list_project() @@ -88,7 +85,7 @@ def save(self, project, taskid, url, result): 'result': result, 'updatetime': time.time(), } - if self.get(project, taskid, ('taskid', )): + if self.get(project, taskid, ('taskid',)): del obj['taskid'] return self.engine.execute(self.table.update() .where(self.table.c.taskid == taskid) @@ -106,10 +103,10 @@ def select(self, project, fields=None, offset=0, limit=None): columns = [getattr(self.table.c, f, f) for f in fields] if fields else self.table.c for task in self.engine.execute(self.table.select() - .with_only_columns(columns=columns) - .order_by(self.table.c.updatetime.desc()) - .offset(offset).limit(limit) - .execution_options(autocommit=True)): + .with_only_columns(columns=columns) + .order_by(self.table.c.updatetime.desc()) + .offset(offset).limit(limit) + .execution_options(autocommit=True)): yield self._parse(result2dict(columns, task)) def count(self, project): @@ -131,7 +128,7 @@ def get(self, project, taskid, fields=None): columns = [getattr(self.table.c, f, f) for f in fields] if fields else self.table.c for task in self.engine.execute(self.table.select() - .with_only_columns(columns=columns) - .where(self.table.c.taskid == taskid) - .limit(1)): + .with_only_columns(columns=columns) + .where(self.table.c.taskid == taskid) + .limit(1)): return self._parse(result2dict(columns, task)) diff --git a/pyspider/database/sqlalchemy/sqlalchemybase.py b/pyspider/database/sqlalchemy/sqlalchemybase.py index 8fc100d21..eebe43390 100644 --- a/pyspider/database/sqlalchemy/sqlalchemybase.py +++ b/pyspider/database/sqlalchemy/sqlalchemybase.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me @@ -7,6 +6,8 @@ import time +from sqlalchemy import inspect + def result2dict(columns, task): return dict(task) @@ -39,8 +40,8 @@ def _list_project(self): prefix = '%s_' % self.__tablename__ else: prefix = '' - - for project in self.engine.table_names(): + inspector = inspect(self.engine) + for project in inspector.get_table_names(): if project.startswith(prefix): project = project[len(prefix):] self.projects.add(project) diff --git a/pyspider/database/sqlalchemy/taskdb.py b/pyspider/database/sqlalchemy/taskdb.py index b298d608b..a6cc08687 100644 --- a/pyspider/database/sqlalchemy/taskdb.py +++ b/pyspider/database/sqlalchemy/taskdb.py @@ -1,21 +1,22 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-12-04 22:33:43 +import json import re -import six import time -import json -import sqlalchemy.exc -from sqlalchemy import (create_engine, MetaData, Table, Column, Index, - Integer, String, Float, Text, func) +import six +import sqlalchemy.exc +from sqlalchemy import (Column, Float, Index, Integer, MetaData, String, Table, + Text, create_engine, func) from sqlalchemy.engine.url import make_url -from pyspider.libs import utils + from pyspider.database.base.taskdb import TaskDB as BaseTaskDB +from pyspider.libs import utils + from .sqlalchemybase import SplitTableMixin, result2dict @@ -39,18 +40,16 @@ def __init__(self, url): ) self.url = make_url(url) + if self.url.database: - database = self.url.database - 
self.url.database = None try: - engine = create_engine(self.url, convert_unicode=True, pool_recycle=3600) + engine = create_engine(self.url, pool_recycle=3600) conn = engine.connect() conn.execute("commit") - conn.execute("CREATE DATABASE %s" % database) + conn.execute("CREATE DATABASE %s" % self.url.database) except sqlalchemy.exc.SQLAlchemyError: pass - self.url.database = database - self.engine = create_engine(url, convert_unicode=True, pool_recycle=3600) + self.engine = create_engine(url, pool_recycle=3600) self._list_project() @@ -99,8 +98,8 @@ def load_tasks(self, status, project=None, fields=None): for project in projects: self.table.name = self._tablename(project) for task in self.engine.execute(self.table.select() - .with_only_columns(columns) - .where(self.table.c.status == status)): + .with_only_columns(columns) + .where(self.table.c.status == status)): yield self._parse(result2dict(columns, task)) def get_task(self, project, taskid, fields=None): @@ -112,9 +111,9 @@ def get_task(self, project, taskid, fields=None): self.table.name = self._tablename(project) columns = [getattr(self.table.c, f, f) for f in fields] if fields else self.table.c for each in self.engine.execute(self.table.select() - .with_only_columns(columns) - .limit(1) - .where(self.table.c.taskid == taskid)): + .with_only_columns(columns) + .limit(1) + .where(self.table.c.taskid == taskid)): return self._parse(result2dict(columns, each)) def status_count(self, project): @@ -127,8 +126,8 @@ def status_count(self, project): self.table.name = self._tablename(project) for status, count in self.engine.execute( self.table.select() - .with_only_columns((self.table.c.status, func.count(1))) - .group_by(self.table.c.status)): + .with_only_columns((self.table.c.status, func.count(1))) + .group_by(self.table.c.status)): result[status] = count return result diff --git a/pyspider/database/sqlite/projectdb.py b/pyspider/database/sqlite/projectdb.py index 282ce5305..5ad48e36f 100644 --- a/pyspider/database/sqlite/projectdb.py +++ b/pyspider/database/sqlite/projectdb.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me @@ -7,10 +6,11 @@ import time -from .sqlitebase import SQLiteMixin from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB from pyspider.database.basedb import BaseDB +from .sqlitebase import SQLiteMixin + class ProjectDB(SQLiteMixin, BaseProjectDB, BaseDB): __tablename__ = 'projectdb' diff --git a/pyspider/database/sqlite/resultdb.py b/pyspider/database/sqlite/resultdb.py index 0314eaf2d..845902406 100644 --- a/pyspider/database/sqlite/resultdb.py +++ b/pyspider/database/sqlite/resultdb.py @@ -1,18 +1,18 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-10-13 17:08:43 +import json import re import time -import json -from .sqlitebase import SQLiteMixin, SplitTableMixin from pyspider.database.base.resultdb import ResultDB as BaseResultDB from pyspider.database.basedb import BaseDB +from .sqlitebase import SplitTableMixin, SQLiteMixin + class ResultDB(SQLiteMixin, SplitTableMixin, BaseResultDB, BaseDB): __tablename__ = 'resultdb' diff --git a/pyspider/database/sqlite/sqlitebase.py b/pyspider/database/sqlite/sqlitebase.py index 9a652b9f7..a6bccad7d 100644 --- a/pyspider/database/sqlite/sqlitebase.py +++ b/pyspider/database/sqlite/sqlitebase.py @@ -1,14 +1,13 @@ #!/usr/bin/env python -# -*- encoding: utf-8 
-*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-11-22 20:30:44 import os -import time import sqlite3 import threading +import time class SQLiteMixin(object): diff --git a/pyspider/database/sqlite/taskdb.py b/pyspider/database/sqlite/taskdb.py index 5a0095d5a..83dd70541 100644 --- a/pyspider/database/sqlite/taskdb.py +++ b/pyspider/database/sqlite/taskdb.py @@ -1,18 +1,18 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-02-08 10:25:34 +import json import re import time -import json -from .sqlitebase import SQLiteMixin, SplitTableMixin from pyspider.database.base.taskdb import TaskDB as BaseTaskDB from pyspider.database.basedb import BaseDB +from .sqlitebase import SplitTableMixin, SQLiteMixin + class TaskDB(SQLiteMixin, SplitTableMixin, BaseTaskDB, BaseDB): __tablename__ = 'taskdb' diff --git a/pyspider/fetcher/cookie_utils.py b/pyspider/fetcher/cookie_utils.py index e486fa8af..cbdef6a1c 100644 --- a/pyspider/fetcher/cookie_utils.py +++ b/pyspider/fetcher/cookie_utils.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me diff --git a/pyspider/fetcher/tornado_fetcher.py b/pyspider/fetcher/tornado_fetcher.py index d64169351..8a22da0aa 100644 --- a/pyspider/fetcher/tornado_fetcher.py +++ b/pyspider/fetcher/tornado_fetcher.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me @@ -7,32 +6,34 @@ from __future__ import unicode_literals -import os -import sys -import six import copy -import time +import functools import json import logging -import traceback -import functools +import os +import sys import threading -import tornado.ioloop -import tornado.httputil -import tornado.httpclient -import pyspider +import time +import traceback -from six.moves import queue, http_cookies -from six.moves.urllib.robotparser import RobotFileParser +import six +import tornado.httpclient +import tornado.httputil +import tornado.ioloop from requests import cookies +from six.moves import http_cookies, queue from six.moves.urllib.parse import urljoin, urlsplit +from six.moves.urllib.robotparser import RobotFileParser from tornado import gen from tornado.curl_httpclient import CurlAsyncHTTPClient from tornado.simple_httpclient import SimpleAsyncHTTPClient -from pyspider.libs import utils, dataurl, counter +import pyspider +from pyspider.libs import counter, dataurl, utils from pyspider.libs.url import quote_chinese + from .cookie_utils import extract_cookies_to_jar + logger = logging.getLogger('fetcher') @@ -53,6 +54,7 @@ def free_size(self): def size(self): return len(self.active) + fetcher_output = { "status_code": int, "orig_url": str, @@ -75,8 +77,9 @@ class Fetcher(object): } phantomjs_proxy = None splash_endpoint = None - splash_lua_source = open(os.path.join(os.path.dirname(__file__), "splash_fetcher.lua")).read() - robot_txt_age = 60*60 # 1h + with open(os.path.join(os.path.dirname(__file__), "splash_fetcher.lua"), encoding="utf-8") as f: + splash_lua_source = f.read() + robot_txt_age = 60 * 60 # 1h def __init__(self, inqueue, outqueue, poolsize=100, proxy=None, async_mode=True): self.inqueue = inqueue @@ -135,10 +138,10 @@ def async_fetch(self, task, callback=None): elif task.get('fetch', {}).get('fetch_type') in ('js', 'phantomjs'): type = 'phantomjs' result = 
yield self.phantomjs_fetch(url, task) - elif task.get('fetch', {}).get('fetch_type') in ('splash', ): + elif task.get('fetch', {}).get('fetch_type') in ('splash',): type = 'splash' result = yield self.splash_fetch(url, task) - elif task.get('fetch', {}).get('fetch_type') in ('puppeteer', ): + elif task.get('fetch', {}).get('fetch_type') in ('puppeteer',): type = 'puppeteer' result = yield self.puppeteer_fetch(url, task) else: @@ -754,7 +757,7 @@ def queue_loop(): if self.http_client.free_size() <= 0: break task = self.inqueue.get_nowait() - # FIXME: decode unicode_obj should used after data selete from + # FIXME: decode_unicode_obj should be used after data is selected from # database, it's used here for performance task = utils.decode_unicode_obj(task) self.fetch(task) @@ -778,7 +781,9 @@ def queue_loop(): logger.info("fetcher exiting...") def quit(self): - '''Quit fetcher''' + """ + Quit fetcher + """ self._running = False self._quit = True self.ioloop.add_callback(self.ioloop.stop) @@ -792,12 +797,9 @@ def size(self): def xmlrpc_run(self, port=24444, bind='127.0.0.1', logRequests=False): '''Run xmlrpc server''' import umsgpack - from pyspider.libs.wsgi_xmlrpc import WSGIXMLRPCApplication - try: - from xmlrpc.client import Binary - except ImportError: - from xmlrpclib import Binary + from pyspider.libs.wsgi_xmlrpc import WSGIXMLRPCApplication + from xmlrpc.client import Binary application = WSGIXMLRPCApplication() application.register_function(self.quit, '_quit') @@ -807,15 +809,17 @@ def sync_fetch(task): result = self.sync_fetch(task) result = Binary(umsgpack.packb(result)) return result + application.register_function(sync_fetch, 'fetch') def dump_counter(_time, _type): return self._cnt[_time].to_dict(_type) + application.register_function(dump_counter, 'counter') - import tornado.wsgi - import tornado.ioloop import tornado.httpserver + import tornado.ioloop + import tornado.wsgi container = tornado.wsgi.WSGIContainer(application) self.xmlrpc_ioloop = tornado.ioloop.IOLoop() diff --git a/pyspider/libs/base_handler.py b/pyspider/libs/base_handler.py index d2ebe9584..26c19f5fc 100644 --- a/pyspider/libs/base_handler.py +++ b/pyspider/libs/base_handler.py @@ -1,25 +1,25 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-02-16 23:12:48 -import sys -import inspect + import functools -import fractions +import inspect +import math +import sys +from pprint import pprint import six from six import add_metaclass, iteritems -from pyspider.libs.url import ( - quote_chinese, _build_url, _encode_params, - _encode_multipart_formdata, curl_to_arguments) -from pyspider.libs.utils import md5string, timeout -from pyspider.libs.ListIO import ListO +from pyspider.libs.list_io import ListO from pyspider.libs.response import rebuild_response -from pyspider.libs.pprint import pprint +from pyspider.libs.url import (_build_url, _encode_multipart_formdata, + _encode_params, curl_to_arguments, + quote_chinese) +from pyspider.libs.utils import md5string, timeout from pyspider.processor import ProcessorResult @@ -38,11 +38,13 @@ def not_send_status(func): It's used by callbacks like on_message, on_result etc... 
""" + @functools.wraps(func) def wrapper(self, response, task): - self._extinfo['not_send_status'] = True + self._extinfo["not_send_status"] = True function = func.__get__(self, self.__class__) return self._run_func(function, response, task) + return wrapper @@ -58,6 +60,7 @@ def config(_config=None, **kwargs): def wrapper(func): func._config = _config return func + return wrapper @@ -69,6 +72,7 @@ def every(minutes=NOTSET, seconds=NOTSET): """ method will been called every minutes or seconds """ + def wrapper(func): # mark the function with variable 'is_cronjob=True', the function would be # collected into the list Handler._cron_jobs by meta class @@ -98,7 +102,6 @@ def wrapper(func): class BaseHandlerMeta(type): - def __new__(cls, name, bases, attrs): # A list of all functions which is marked as 'is_cronjob=True' cron_jobs = [] @@ -110,9 +113,9 @@ def __new__(cls, name, bases, attrs): min_tick = 0 for each in attrs.values(): - if inspect.isfunction(each) and getattr(each, 'is_cronjob', False): + if inspect.isfunction(each) and getattr(each, "is_cronjob", False): cron_jobs.append(each) - min_tick = fractions.gcd(min_tick, each.tick) + min_tick = math.gcd(min_tick, int(each.tick)) newcls = type.__new__(cls, name, bases, attrs) newcls._cron_jobs = cron_jobs newcls._min_tick = min_tick @@ -126,11 +129,12 @@ class BaseHandler(object): `BaseHandler.run` is the main method to handler the task. """ + crawl_config = {} project_name = None _cron_jobs = [] _min_tick = 0 - __env__ = {'not_inited': True} + __env__ = {"not_inited": True} retry_delay = {} def _reset(self): @@ -146,15 +150,16 @@ def _run_func(self, function, *arguments): """ Running callback function with requested number of arguments """ - args, varargs, keywords, defaults = inspect.getargspec(function) + args, _, _, _, _, _, _ = inspect.getfullargspec(function) task = arguments[-1] - process_time_limit = task['process'].get('process_time_limit', - self.__env__.get('process_time_limit', 0)) + process_time_limit = task["process"].get( + "process_time_limit", self.__env__.get("process_time_limit", 0) + ) if process_time_limit > 0: - with timeout(process_time_limit, 'process timeout'): - ret = function(*arguments[:len(args) - 1]) + with timeout(process_time_limit, "process timeout"): + ret = function(*arguments[: len(args) - 1]) else: - ret = function(*arguments[:len(args) - 1]) + ret = function(*arguments[: len(args) - 1]) return ret def _run_task(self, task, response): @@ -162,16 +167,18 @@ def _run_task(self, task, response): Finding callback specified by `task['callback']` raising status error for it if needed. """ - process = task.get('process', {}) - callback = process.get('callback', '__call__') + process = task.get("process", {}) + callback = process.get("callback", "__call__") if not hasattr(self, callback): raise NotImplementedError("self.%s() not implemented!" 
% callback) function = getattr(self, callback) # do not run_func when 304 - if response.status_code == 304 and not getattr(function, '_catch_status_code_error', False): + if response.status_code == 304 and not getattr( + function, "_catch_status_code_error", False + ): return None - if not getattr(function, '_catch_status_code_error', False): + if not getattr(function, "_catch_status_code_error", False): response.raise_for_status() return self._run_func(function, response, task) @@ -187,10 +194,10 @@ def run_task(self, module, task, response): if isinstance(response, dict): response = rebuild_response(response) self.response = response - self.save = (task.get('track') or {}).get('save', {}) + self.save = (task.get("track") or {}).get("save", {}) try: - if self.__env__.get('enable_stdout_capture', True): + if self.__env__.get("enable_stdout_capture", True): sys.stdout = ListO(module.log_buffer) self._reset() result = self._run_task(task, response) @@ -200,6 +207,8 @@ def run_task(self, module, task, response): else: self._run_func(self.on_result, result, response, task) except Exception as e: + import traceback + traceback.print_exc() logger.exception(e) exception = e finally: @@ -215,18 +224,50 @@ def run_task(self, module, task, response): self.save = None module.log_buffer[:] = [] - return ProcessorResult(result, follows, messages, logs, exception, extinfo, save) - - schedule_fields = ('priority', 'retries', 'exetime', 'age', 'itag', 'force_update', 'auto_recrawl', 'cancel') - fetch_fields = ('method', 'headers', 'user_agent', 'data', 'connect_timeout', 'timeout', 'allow_redirects', 'cookies', - 'proxy', 'etag', 'last_modifed', 'last_modified', 'save', 'js_run_at', 'js_script', - 'js_viewport_width', 'js_viewport_height', 'load_images', 'fetch_type', 'use_gzip', 'validate_cert', - 'max_redirects', 'robots_txt') - process_fields = ('callback', 'process_time_limit') + return ProcessorResult( + result, follows, messages, logs, exception, extinfo, save + ) + + schedule_fields = ( + "priority", + "retries", + "exetime", + "age", + "itag", + "force_update", + "auto_recrawl", + "cancel", + ) + fetch_fields = ( + "method", + "headers", + "user_agent", + "data", + "connect_timeout", + "timeout", + "allow_redirects", + "cookies", + "proxy", + "etag", + "last_modifed", + "last_modified", + "save", + "js_run_at", + "js_script", + "js_viewport_width", + "js_viewport_height", + "load_images", + "fetch_type", + "use_gzip", + "validate_cert", + "max_redirects", + "robots_txt", + ) + process_fields = ("callback", "process_time_limit") @staticmethod def task_join_crawl_config(task, crawl_config): - task_fetch = task.get('fetch', {}) + task_fetch = task.get("fetch", {}) for k in BaseHandler.fetch_fields: if k in crawl_config: v = crawl_config[k] @@ -237,9 +278,9 @@ def task_join_crawl_config(task, crawl_config): else: task_fetch.setdefault(k, v) if task_fetch: - task['fetch'] = task_fetch + task["fetch"] = task_fetch - task_process = task.get('process', {}) + task_process = task.get("process", {}) for k in BaseHandler.process_fields: if k in crawl_config: v = crawl_config[k] @@ -248,7 +289,7 @@ def task_join_crawl_config(task, crawl_config): else: task_process.setdefault(k, v) if task_process: - task['process'] = task_process + task["process"] = task_process return task @@ -262,42 +303,44 @@ def _crawl(self, url, **kwargs): assert len(url) < 1024, "Maximum (1024) URL length error." 
- if kwargs.get('callback'): - callback = kwargs['callback'] + if kwargs.get("callback"): + callback = kwargs["callback"] if isinstance(callback, six.string_types) and hasattr(self, callback): func = getattr(self, callback) elif six.callable(callback) and six.get_method_self(callback) is self: func = callback - kwargs['callback'] = func.__name__ + kwargs["callback"] = func.__name__ elif six.callable(callback) and hasattr(self, callback.__name__): func = getattr(self, callback.__name__) - kwargs['callback'] = func.__name__ + kwargs["callback"] = func.__name__ else: raise NotImplementedError("self.%s() not implemented!" % callback) - if hasattr(func, '_config'): + if hasattr(func, "_config"): for k, v in iteritems(func._config): if isinstance(v, dict) and isinstance(kwargs.get(k), dict): kwargs[k].update(v) else: kwargs.setdefault(k, v) - url = quote_chinese(_build_url(url.strip(), kwargs.pop('params', None))) - if kwargs.get('files'): + url = quote_chinese(_build_url(url.strip(), kwargs.pop("params", None))) + if kwargs.get("files"): assert isinstance( - kwargs.get('data', {}), dict), "data must be a dict when using with files!" - content_type, data = _encode_multipart_formdata(kwargs.pop('data', {}), - kwargs.pop('files', {})) - kwargs.setdefault('headers', {}) - kwargs['headers']['Content-Type'] = content_type - kwargs['data'] = data - if kwargs.get('data'): - kwargs['data'] = _encode_params(kwargs['data']) - if kwargs.get('data'): - kwargs.setdefault('method', 'POST') - - if kwargs.get('user_agent'): - kwargs.setdefault('headers', {}) - kwargs['headers']['User-Agent'] = kwargs.get('user_agent') + kwargs.get("data", {}), dict + ), "data must be a dict when using with files!" + content_type, data = _encode_multipart_formdata( + kwargs.pop("data", {}), kwargs.pop("files", {}) + ) + kwargs.setdefault("headers", {}) + kwargs["headers"]["Content-Type"] = content_type + kwargs["data"] = data + if kwargs.get("data"): + kwargs["data"] = _encode_params(kwargs["data"]) + if kwargs.get("data"): + kwargs.setdefault("method", "POST") + + if kwargs.get("user_agent"): + kwargs.setdefault("headers", {}) + kwargs["headers"]["User-Agent"] = kwargs.get("user_agent") schedule = {} for key in self.schedule_fields: @@ -306,29 +349,31 @@ def _crawl(self, url, **kwargs): elif key in self.crawl_config: schedule[key] = self.crawl_config[key] - task['schedule'] = schedule + task["schedule"] = schedule fetch = {} for key in self.fetch_fields: if key in kwargs: fetch[key] = kwargs.pop(key) - task['fetch'] = fetch + task["fetch"] = fetch process = {} for key in self.process_fields: if key in kwargs: process[key] = kwargs.pop(key) - task['process'] = process + task["process"] = process - task['project'] = self.project_name - task['url'] = url - if 'taskid' in kwargs: - task['taskid'] = kwargs.pop('taskid') + task["project"] = self.project_name + task["url"] = url + if "taskid" in kwargs: + task["taskid"] = kwargs.pop("taskid") else: - task['taskid'] = self.get_taskid(task) + task["taskid"] = self.get_taskid(task) if kwargs: - raise TypeError('crawl() got unexpected keyword argument: %s' % kwargs.keys()) + raise TypeError( + "crawl() got unexpected keyword argument: %s" % kwargs.keys() + ) if self.is_debugger(): task = self.task_join_crawl_config(task, self.crawl_config) @@ -340,12 +385,12 @@ def _crawl(self, url, **kwargs): return task def get_taskid(self, task): - '''Generate taskid by information of task md5(url) by default, override me''' - return md5string(task['url']) + """Generate taskid by information of task 
md5(url) by default, override me"""
+        return md5string(task["url"])

    # apis
    def crawl(self, url, **kwargs):
-        '''
+        """
        available params:
          url
          callback
@@ -380,12 +425,12 @@ def crawl(self, url, **kwargs):
          save
          taskid

-        full documents: http://pyspider.readthedocs.org/en/latest/apis/self.crawl/
-        '''
+        full documents: https://pyspider.readthedocs.io/en/latest/apis/self.crawl/
+        """

-        if isinstance(url, six.string_types) and url.startswith('curl '):
+        if isinstance(url, six.string_types) and url.startswith("curl "):
            curl_kwargs = curl_to_arguments(url)
-            url = curl_kwargs.pop('urls')
+            url = curl_kwargs.pop("urls")
            for k, v in iteritems(curl_kwargs):
                kwargs.setdefault(k, v)
@@ -399,15 +444,16 @@ def crawl(self, url, **kwargs):

    def is_debugger(self):
        """Return true if running in debugger"""
-        return self.__env__.get('debugger')
+        return self.__env__.get("debugger")

-    def send_message(self, project, msg, url='data:,on_message'):
+    def send_message(self, project, msg, url="data:,on_message"):
        """Send messages to other project."""
        self._messages.append((project, msg, url))

    def on_message(self, project, msg):
-        """Receive message from other project, override me."""
-        pass
+        """
+        Receive message from other project, override me.
+        """

    def on_result(self, result):
        """Receive returns from other callbacks, override me."""
@@ -416,15 +462,14 @@ def on_result(self, result):
        assert self.task, "on_result can't be used outside a callback."
        if self.is_debugger():
            pprint(result)
-        if self.__env__.get('result_queue'):
-            self.__env__['result_queue'].put((self.task, result))
+        if self.__env__.get("result_queue"):
+            self.__env__["result_queue"].put((self.task, result))

    def on_finished(self, response, task):
        """
        Triggered when all tasks in task queue finished.
-        http://docs.pyspider.org/en/latest/About-Projects/#on_finished-callback
+        https://docs.pyspider.org/en/latest/About-Projects/#on_finished-callback
        """
-        pass

    @not_send_status
    def _on_message(self, response):
@@ -433,9 +478,11 @@ def _on_message(self, response):

    @not_send_status
    def _on_cronjob(self, response, task):
-        if (not response.save
+        if (
+            not response.save
            or not isinstance(response.save, dict)
-                or 'tick' not in response.save):
+            or "tick" not in response.save
+        ):
            return

        # When triggered, a '_on_cronjob' task is sent from scheduler with 'tick' in
@@ -443,7 +490,7 @@ def _on_cronjob(self, response, task):
        # interval of the cronjobs. The method should check the tick for each cronjob
        # function to confirm the execute interval.
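        # For illustration (hypothetical numbers, not part of this patch): with two
        # cronjobs declared as @every(minutes=2) and @every(minutes=3), the meta class
        # stores _min_tick = gcd(120, 180) = 60, so a tick arrives every 60 seconds;
        # the 2-minute job fires only when tick % 120 == 0 and the 3-minute job only
        # when tick % 180 == 0.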
        for cronjob in self._cron_jobs:
-            if response.save['tick'] % cronjob.tick != 0:
+            if response.save["tick"] % cronjob.tick != 0:
                continue
            function = cronjob.__get__(self, self.__class__)
            self._run_func(function, response, task)
@@ -451,11 +498,11 @@ def _on_cronjob(self, response, task):

    def _on_get_info(self, response, task):
        """Send runtime information about this script."""
        for each in response.save or []:
-            if each == 'min_tick':
+            if each == "min_tick":
                self.save[each] = self._min_tick
-            elif each == 'retry_delay':
+            elif each == "retry_delay":
                if not isinstance(self.retry_delay, dict):
-                    self.retry_delay = {'': self.retry_delay}
+                    self.retry_delay = {"": self.retry_delay}
                self.save[each] = self.retry_delay
-            elif each == 'crawl_config':
+            elif each == "crawl_config":
                self.save[each] = self.crawl_config
diff --git a/pyspider/libs/bench.py b/pyspider/libs/bench.py
index 9e7bfd6e9..a2ff8b292 100644
--- a/pyspider/libs/bench.py
+++ b/pyspider/libs/bench.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# -*- encoding: utf-8 -*-
 # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
 # Author: Binux
 # http://binux.me
@@ -7,16 +6,18 @@
 # rate: 10000000000
 # burst: 10000000000

-import time
 import logging
-logger = logging.getLogger('bench')
+import time

 from six.moves import queue as Queue
-from pyspider.scheduler import ThreadBaseScheduler as Scheduler
+
 from pyspider.fetcher.tornado_fetcher import Fetcher
+from pyspider.libs.utils import md5string
 from pyspider.processor import Processor
 from pyspider.result import ResultWorker
-from pyspider.libs.utils import md5string
+from pyspider.scheduler import ThreadBaseScheduler as Scheduler
+
+logger = logging.getLogger('bench')


 def bench_test_taskdb(taskdb):
@@ -73,7 +74,7 @@ def test_insert(n, start=0):
                 cost_time, n * 1.0 / cost_time, cost_time / n * 1000)

     def test_update(n, start=0):
-        logger.info("taskdb update %d" % n)
+        logger.info("taskdb update %d", n)
         start_time = time.time()
         for i in range(n):
             task['url'] = 'http://bench.pyspider.org/?l=%d' % (i + start)
@@ -96,8 +97,10 @@ def test_update(n, start=0):
         'lastcrawltime'
     ]

-    def test_get(n, start=0, random=True, fields=request_task_fields):
-        logger.info("taskdb get %d %s" % (n, "randomly" if random else ""))
+    def test_get(n, start=0, random=True, fields=None):
+        logger.info("taskdb get %d %s", n, "randomly" if random else "")
+        if not fields:
+            fields = request_task_fields
         range_n = list(range(n))
         if random:
             from random import shuffle
@@ -189,6 +192,7 @@ def test_get(n):

 class BenchMixin(object):
     """Report to logger for bench test"""
+
     def _bench_init(self):
         self.done_cnt = 0
         self.start_time = time.time()
@@ -225,9 +229,9 @@ def __init__(self, *args, **kwargs):
         super(BenchFetcher, self).__init__(*args, **kwargs)
         self._bench_init()

-    def on_result(self, type, task, result):
+    def on_result(self, _type, task, result):
         self._bench_report("Fetched", 0, 75)
-        return super(BenchFetcher, self).on_result(type, task, result)
+        return super(BenchFetcher, self).on_result(_type, task, result)


 class BenchProcessor(Processor, BenchMixin):
diff --git a/pyspider/libs/counter.py b/pyspider/libs/counter.py
index 88ff60eeb..bdf30a439 100644
--- a/pyspider/libs/counter.py
+++ b/pyspider/libs/counter.py
@@ -1,19 +1,15 @@
 #!/usr/bin/env python
-# -*- encoding: utf-8 -*-
 # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
 # Author: Binux
 # http://binux.me
 # Created on 2012-11-14 17:09:50

-from __future__ import unicode_literals, division, absolute_import
+from __future__ import absolute_import, division, unicode_literals
-import time import logging +import time from collections import deque -try: - from UserDict import DictMixin -except ImportError: - from collections import Mapping as DictMixin +from collections.abc import Mapping import six from six import iteritems @@ -246,10 +242,10 @@ def _trim_window(self): @property def avg(self): - sum = float(self.sum) + sum_value = float(self.sum) if not self.window_size: return 0 - return sum / self.window_size / self.window_interval + return sum_value / self.window_size / self.window_interval @property def sum(self): @@ -265,7 +261,7 @@ def on_append(self, value, time): pass -class CounterValue(DictMixin): +class CounterValue(Mapping): """ A dict like value item for CounterManager. """ @@ -279,7 +275,7 @@ def __getitem__(self, key): key = self._keys return self.manager.counters[key] else: - key = self._keys + (key, ) + key = self._keys + (key,) available_keys = [] for _key in list(self.manager.counters.keys()): @@ -326,7 +322,7 @@ def to_dict(self, get_value=None): return result -class CounterManager(DictMixin): +class CounterManager(Mapping): """ A dict like counter manager. @@ -345,7 +341,7 @@ def __init__(self, cls=TimebaseAverageWindowCounter): def event(self, key, value=1): """Fire a event of a counter by counter key""" if isinstance(key, six.string_types): - key = (key, ) + key = (key,) assert isinstance(key, tuple), "event key type error" if key not in self.counters: self.counters[key] = self.cls() @@ -355,7 +351,7 @@ def event(self, key, value=1): def value(self, key, value=1): """Set value of a counter by counter key""" if isinstance(key, six.string_types): - key = (key, ) + key = (key,) # assert all(isinstance(k, six.string_types) for k in key) assert isinstance(key, tuple), "event key type error" if key not in self.counters: @@ -370,7 +366,7 @@ def trim(self): del self.counters[key] def __getitem__(self, key): - key = (key, ) + key = (key,) available_keys = [] for _key in list(self.counters.keys()): if _key[:len(key)] == key: @@ -387,7 +383,7 @@ def __getitem__(self, key): return CounterValue(self, key) def __delitem__(self, key): - key = (key, ) + key = (key,) available_keys = [] for _key in list(self.counters.keys()): if _key[:len(key)] == key: @@ -435,7 +431,7 @@ def load(self, filename): try: with open(filename, 'rb') as fp: self.counters = cPickle.load(fp) - except: + except Exception: logging.debug("can't load counter from file: %s", filename) return False return True diff --git a/pyspider/libs/dataurl.py b/pyspider/libs/dataurl.py index 3f75095e4..ecb74d794 100644 --- a/pyspider/libs/dataurl.py +++ b/pyspider/libs/dataurl.py @@ -1,15 +1,16 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2012-11-16 10:33:20 +from base64 import b64decode, b64encode + import six -from base64 import b64encode, b64decode -from . import utils from six.moves.urllib.parse import quote, unquote +from . 
import utils + def encode(data, mime_type='', charset='utf-8', base64=True): """ diff --git a/pyspider/libs/ListIO.py b/pyspider/libs/list_io.py similarity index 92% rename from pyspider/libs/ListIO.py rename to pyspider/libs/list_io.py index e48d42edd..8c958f5b5 100644 --- a/pyspider/libs/ListIO.py +++ b/pyspider/libs/list_io.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me @@ -24,7 +23,7 @@ def close(self): def flush(self): pass - def seek(self, n, mode=0): + def seek(self, offset, mode=0): pass def readline(self): diff --git a/pyspider/libs/log.py b/pyspider/libs/log.py index 770ff20f4..5a9275ce6 100644 --- a/pyspider/libs/log.py +++ b/pyspider/libs/log.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me @@ -7,16 +6,12 @@ import logging -try: - import curses -except ImportError: - curses = None - from tornado.log import LogFormatter as _LogFormatter class LogFormatter(_LogFormatter, object): """Init tornado.log.LogFormatter from logging.config.fileConfig""" + def __init__(self, fmt=None, datefmt=None, color=True, *args, **kwargs): if fmt is None: fmt = _LogFormatter.DEFAULT_FORMAT diff --git a/pyspider/libs/multiprocessing_queue.py b/pyspider/libs/multiprocessing_queue.py index 96525225e..baa57fd54 100644 --- a/pyspider/libs/multiprocessing_queue.py +++ b/pyspider/libs/multiprocessing_queue.py @@ -1,9 +1,7 @@ -import six -import platform import multiprocessing +import platform from multiprocessing.queues import Queue as BaseQueue - # The SharedCounter and Queue classes come from: # https://github.com/vterron/lemon/commit/9ca6b4b @@ -44,6 +42,7 @@ class MultiProcessingQueue(BaseQueue): being raised, but also allows us to implement a reliable version of both qsize() and empty(). """ + def __init__(self, *args, **kwargs): super(MultiProcessingQueue, self).__init__(*args, **kwargs) self.size = SharedCounter(0) diff --git a/pyspider/libs/pprint.py b/pyspider/libs/pprint.py deleted file mode 100644 index 0ff21716e..000000000 --- a/pyspider/libs/pprint.py +++ /dev/null @@ -1,378 +0,0 @@ -# Author: Fred L. Drake, Jr. -# fdrake@... -# -# This is a simple little module I wrote to make life easier. I didn't -# see anything quite like it in the library, though I may have overlooked -# something. I wrote this when I was trying to read some heavily nested -# tuples with fairly non-descriptive content. This is modeled very much -# after Lisp/Scheme - style pretty-printing of lists. If you find it -# useful, thank small children who sleep at night. - -"""Support to pretty-print lists, tuples, & dictionaries recursively. - -Very simple, but useful, especially in debugging data structures. - -Classes -------- - -PrettyPrinter() - Handle pretty-printing operations onto a stream using a configured - set of formatting parameters. - -Functions ---------- - -pformat() - Format a Python object into a pretty-printed representation. - -pprint() - Pretty-print a Python object to a stream [default is sys.stdout]. - -saferepr() - Generate a 'standard' repr()-like value, but protect against recursive - data structures. 
- -""" - -from __future__ import print_function - -import six -import sys as _sys - -from io import BytesIO, StringIO - -__all__ = ["pprint", "pformat", "isreadable", "isrecursive", "saferepr", - "PrettyPrinter"] - -# cache these for faster access: -_commajoin = ", ".join -_id = id -_len = len -_type = type - - -def pprint(object, stream=None, indent=1, width=80, depth=None): - """Pretty-print a Python object to a stream [default is sys.stdout].""" - printer = PrettyPrinter( - stream=stream, indent=indent, width=width, depth=depth) - printer.pprint(object) - - -def pformat(object, indent=1, width=80, depth=None): - """Format a Python object into a pretty-printed representation.""" - return PrettyPrinter(indent=indent, width=width, depth=depth).pformat(object) - - -def saferepr(object): - """Version of repr() which can handle recursive data structures.""" - return _safe_repr(object, {}, None, 0)[0] - - -def isreadable(object): - """Determine if saferepr(object) is readable by eval().""" - return _safe_repr(object, {}, None, 0)[1] - - -def isrecursive(object): - """Determine if object requires a recursive representation.""" - return _safe_repr(object, {}, None, 0)[2] - - -def _sorted(iterable): - return sorted(iterable) - - -class PrettyPrinter: - - def __init__(self, indent=1, width=80, depth=None, stream=None): - """Handle pretty printing operations onto a stream using a set of - configured parameters. - - indent - Number of spaces to indent for each level of nesting. - - width - Attempted maximum number of columns in the output. - - depth - The maximum depth to print out nested structures. - - stream - The desired output stream. If omitted (or false), the standard - output stream available at construction will be used. - - """ - indent = int(indent) - width = int(width) - assert indent >= 0, "indent must be >= 0" - assert depth is None or depth > 0, "depth must be > 0" - assert width, "width must be != 0" - self._depth = depth - self._indent_per_level = indent - self._width = width - if stream is not None: - self._stream = stream - else: - self._stream = _sys.stdout - - def pprint(self, object): - self._format(object, self._stream, 0, 0, {}, 0) - self._stream.write("\n") - - def pformat(self, object): - sio = BytesIO() - self._format(object, sio, 0, 0, {}, 0) - return sio.getvalue() - - def isrecursive(self, object): - return self.format(object, {}, 0, 0)[2] - - def isreadable(self, object): - s, readable, recursive = self.format(object, {}, 0, 0) - return readable and not recursive - - def _format(self, object, stream, indent, allowance, context, level): - level = level + 1 - objid = _id(object) - if objid in context: - stream.write(_recursion(object)) - self._recursive = True - self._readable = False - return - rep = self._repr(object, context, level - 1) - typ = _type(object) - sepLines = _len(rep) > (self._width - 1 - indent - allowance) - write = stream.write - - if self._depth and level > self._depth: - write(rep) - return - - r = getattr(typ, "__repr__", None) - if issubclass(typ, dict) and r is dict.__repr__: - write('{') - if self._indent_per_level > 1: - write((self._indent_per_level - 1) * ' ') - length = _len(object) - if length: - context[objid] = 1 - indent = indent + self._indent_per_level - items = _sorted(object.items()) - key, ent = items[0] - rep = self._repr(key, context, level) - write(rep) - write(': ') - self._format(ent, stream, indent + _len(rep) + 2, - allowance + 1, context, level) - if length > 1: - for key, ent in items[1:]: - rep = self._repr(key, context, 
level) - if sepLines: - write(',\n%s%s: ' % (' ' * indent, rep)) - else: - write(', %s: ' % rep) - self._format(ent, stream, indent + _len(rep) + 2, - allowance + 1, context, level) - indent = indent - self._indent_per_level - del context[objid] - write('}') - return - - if ( - (issubclass(typ, list) and r is list.__repr__) or - (issubclass(typ, tuple) and r is tuple.__repr__) or - (issubclass(typ, set) and r is set.__repr__) or - (issubclass(typ, frozenset) and r is frozenset.__repr__) - ): - length = _len(object) - if issubclass(typ, list): - write('[') - endchar = ']' - elif issubclass(typ, set): - if not length: - write('set()') - return - write('set([') - endchar = '])' - object = _sorted(object) - indent += 4 - elif issubclass(typ, frozenset): - if not length: - write('frozenset()') - return - write('frozenset([') - endchar = '])' - object = _sorted(object) - indent += 10 - else: - write('(') - endchar = ')' - if self._indent_per_level > 1 and sepLines: - write((self._indent_per_level - 1) * ' ') - if length: - context[objid] = 1 - indent = indent + self._indent_per_level - self._format(object[0], stream, indent, allowance + 1, - context, level) - if length > 1: - for ent in object[1:]: - if sepLines: - write(',\n' + ' ' * indent) - else: - write(', ') - self._format(ent, stream, indent, - allowance + 1, context, level) - indent = indent - self._indent_per_level - del context[objid] - if issubclass(typ, tuple) and length == 1: - write(',') - write(endchar) - return - - write(rep) - - def _repr(self, object, context, level): - repr, readable, recursive = self.format(object, context.copy(), - self._depth, level) - if not readable: - self._readable = False - if recursive: - self._recursive = True - return repr - - def format(self, object, context, maxlevels, level): - """Format object for a specific context, returning a string - and flags indicating whether the representation is 'readable' - and whether the object represents a recursive construct. - """ - return _safe_repr(object, context, maxlevels, level) - - -# Return triple (repr_string, isreadable, isrecursive). 
- -def _safe_repr(object, context, maxlevels, level): - typ = _type(object) - if typ is str: - string = object - string = string.replace('\n', '\\n').replace('\r', '\\r').replace('\t', '\\t') - if 'locale' not in _sys.modules: - return repr(object), True, False - if "'" in object and '"' not in object: - closure = '"' - quotes = {'"': '\\"'} - string = string.replace('"', '\\"') - else: - closure = "'" - quotes = {"'": "\\'"} - string = string.replace("'", "\\'") - try: - string.decode('utf8').encode('gbk', 'replace') - return ("%s%s%s" % (closure, string, closure)), True, False - except: - pass - qget = quotes.get - sio = StringIO() - write = sio.write - for char in object: - if char.isalpha(): - write(char) - else: - write(qget(char, repr(char)[1:-1])) - return ("%s%s%s" % (closure, sio.getvalue(), closure)), True, False - - if typ is six.text_type: - string = object.encode("utf8", 'replace') - string = string.replace('\n', '\\n').replace('\r', '\\r').replace('\t', '\\t') - if "'" in object and '"' not in object: - closure = '"' - quotes = {'"': '\\"'} - string = string.replace('"', '\\"') - else: - closure = "'" - quotes = {"'": "\\'"} - string = string.replace("'", "\\'") - return ("u%s%s%s" % (closure, string, closure)), True, False - - r = getattr(typ, "__repr__", None) - if issubclass(typ, dict) and r is dict.__repr__: - if not object: - return "{}", True, False - objid = _id(object) - if maxlevels and level >= maxlevels: - return "{...}", False, objid in context - if objid in context: - return _recursion(object), False, True - context[objid] = 1 - readable = True - recursive = False - components = [] - append = components.append - level += 1 - saferepr = _safe_repr - for k, v in _sorted(object.items()): - krepr, kreadable, krecur = saferepr(k, context, maxlevels, level) - vrepr, vreadable, vrecur = saferepr(v, context, maxlevels, level) - append("%s: %s" % (krepr, vrepr)) - readable = readable and kreadable and vreadable - if krecur or vrecur: - recursive = True - del context[objid] - return "{%s}" % _commajoin(components), readable, recursive - - if (issubclass(typ, list) and r is list.__repr__) or \ - (issubclass(typ, tuple) and r is tuple.__repr__): - if issubclass(typ, list): - if not object: - return "[]", True, False - format = "[%s]" - elif _len(object) == 1: - format = "(%s,)" - else: - if not object: - return "()", True, False - format = "(%s)" - objid = _id(object) - if maxlevels and level >= maxlevels: - return format % "...", False, objid in context - if objid in context: - return _recursion(object), False, True - context[objid] = 1 - readable = True - recursive = False - components = [] - append = components.append - level += 1 - for o in object: - orepr, oreadable, orecur = _safe_repr(o, context, maxlevels, level) - append(orepr) - if not oreadable: - readable = False - if orecur: - recursive = True - del context[objid] - return format % _commajoin(components), readable, recursive - - rep = repr(object) - return rep, (rep and not rep.startswith('<')), False - - -def _recursion(object): - return ("" - % (_type(object).__name__, _id(object))) - - -def _perfcheck(object=None): - import time - if object is None: - object = [("string", (1, 2), [3, 4], {5: 6, 7: 8})] * 100000 - p = PrettyPrinter() - t1 = time.time() - _safe_repr(object, {}, None, 0) - t2 = time.time() - p.pformat(object) - t3 = time.time() - print("_safe_repr:", t2 - t1) - print("pformat:", t3 - t2) - -if __name__ == "__main__": - _perfcheck() diff --git a/pyspider/libs/response.py 
b/pyspider/libs/response.py
index 8975781b2..bf78149f9 100644
--- a/pyspider/libs/response.py
+++ b/pyspider/libs/response.py
@@ -1,26 +1,29 @@
 #!/usr/bin/env python
-# -*- encoding: utf-8 -*-
 # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
 # Author: Binux
 # http://binux.me
 # Created on 2012-11-02 11:16:02

 import cgi
-import re
-import six
 import json
+import re
+
 import chardet
-import lxml.html
 import lxml.etree
-from tblib import Traceback
+import lxml.html
+import six
 from pyquery import PyQuery
-from requests.structures import CaseInsensitiveDict
 from requests import HTTPError
+from requests.structures import CaseInsensitiveDict
+from tblib import Traceback
+
 from pyspider.libs import utils


 class Response(object):
-
+    """
+    HTTP response object passed to handler callbacks.
+    """
     def __init__(self, status_code=None, url=None, orig_url=None, headers=CaseInsensitiveDict(),
                  content='', cookies=None, error=None, traceback=None, save=None, js_script_result=None, time=0):
        if cookies is None:
@@ -38,7 +41,7 @@ def __init__(self, status_code=None, url=None, orig_url=None, headers=CaseInsens
         self.time = time

     def __repr__(self):
-        return u'<Response [%d]>' % self.status_code
+        return f'<Response [{self.status_code}]>'

     def __bool__(self):
         """Returns true if `status_code` is 200 and no error"""
@@ -53,7 +56,7 @@ def ok(self):
         """Return true if `status_code` is 200 and no error."""
         try:
             self.raise_for_status()
-        except:
+        except Exception:
             return False
         return True

@@ -105,7 +108,7 @@ def text(self):
         if hasattr(self, '_text') and self._text:
             return self._text
         if not self.content:
-            return u''
+            return ''
         if isinstance(self.content, six.text_type):
             return self.content

@@ -172,11 +175,11 @@ def raise_for_status(self, allow_redirects=True):
             six.reraise(Exception, Exception(self.error), Traceback.from_string(self.traceback).as_traceback())
             http_error = HTTPError(self.error)
         elif (self.status_code >= 300) and (self.status_code < 400) and not allow_redirects:
-            http_error = HTTPError('%s Redirection' % (self.status_code))
+            http_error = HTTPError(f'{self.status_code} Redirection')
         elif (self.status_code >= 400) and (self.status_code < 500):
-            http_error = HTTPError('%s Client Error' % (self.status_code))
+            http_error = HTTPError(f'{self.status_code} Client Error')
         elif (self.status_code >= 500) and (self.status_code < 600):
-            http_error = HTTPError('%s Server Error' % (self.status_code))
+            http_error = HTTPError(f'{self.status_code} Server Error')
         else:
             return

@@ -187,23 +190,23 @@ def isok(self):
         try:
             self.raise_for_status()
             return True
-        except:
+        except Exception:
             return False


-def rebuild_response(r):
+def rebuild_response(resp: dict):
     response = Response(
-        status_code=r.get('status_code', 599),
-        url=r.get('url', ''),
-        headers=CaseInsensitiveDict(r.get('headers', {})),
-        content=r.get('content', ''),
-        cookies=r.get('cookies', {}),
-        error=r.get('error'),
-        traceback=r.get('traceback'),
-        time=r.get('time', 0),
-        orig_url=r.get('orig_url', r.get('url', '')),
-        js_script_result=r.get('js_script_result'),
-        save=r.get('save'),
+        status_code=resp.get('status_code', 599),
+        url=resp.get('url', ''),
+        headers=CaseInsensitiveDict(resp.get('headers', {})),
+        content=resp.get('content', ''),
+        cookies=resp.get('cookies', {}),
+        error=resp.get('error'),
+        traceback=resp.get('traceback'),
+        time=resp.get('time', 0),
+        orig_url=resp.get('orig_url', resp.get('url', '')),
+        js_script_result=resp.get('js_script_result'),
+        save=resp.get('save'),
     )
     return response
diff --git a/pyspider/libs/result_dump.py b/pyspider/libs/result_dump.py
index 5e7dd45a6..f3b45aabb 100644
---
a/pyspider/libs/result_dump.py +++ b/pyspider/libs/result_dump.py @@ -1,15 +1,15 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2015-03-27 20:12:11 -import six import csv -import json import itertools -from io import StringIO, BytesIO +import json +from io import BytesIO, StringIO + +import six from six import iteritems diff --git a/pyspider/libs/sample_handler.py b/pyspider/libs/sample_handler.py index ecea6cd95..3ee1066b6 100644 --- a/pyspider/libs/sample_handler.py +++ b/pyspider/libs/sample_handler.py @@ -1,9 +1,9 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- + # Created on __DATE__ # Project: __PROJECT_NAME__ -from pyspider.libs.base_handler import * +from pyspider.libs.base_handler import BaseHandler, config, every class Handler(BaseHandler): diff --git a/pyspider/libs/url.py b/pyspider/libs/url.py index c1c99a59f..eb935f5e0 100644 --- a/pyspider/libs/url.py +++ b/pyspider/libs/url.py @@ -1,16 +1,15 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2012-11-09 14:39:57 import mimetypes +import shlex import six -import shlex -from six.moves.urllib.parse import urlparse, urlunparse from requests.models import RequestEncodingMixin +from six.moves.urllib.parse import urlparse, urlunparse def get_content_type(filename): diff --git a/pyspider/libs/utils.py b/pyspider/libs/utils.py index 336021a03..da0d8bcd1 100644 --- a/pyspider/libs/utils.py +++ b/pyspider/libs/utils.py @@ -1,18 +1,17 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2012-11-06 11:50:13 -import math -import logging -import hashlib +import base64 import datetime +import hashlib +import logging +import math import socket -import base64 -import warnings import threading +import warnings import six from six import iteritems @@ -31,7 +30,7 @@ def getitem(obj, key=0, default=None): """Get first element of list or return default""" try: return obj[key] - except: + except Exception: return default @@ -43,8 +42,8 @@ def hide_me(tb, g=globals()): tb = tb.tb_next while tb and tb.tb_frame.f_globals is g: tb = tb.tb_next - except Exception as e: - logging.exception(e) + except Exception as err: + logging.exception(err) tb = base_tb if not tb: tb = base_tb @@ -105,22 +104,20 @@ def format_date(date, gmt_offset=0, relative=True, shorter=False, full_format=Fa seconds = difference.seconds days = difference.days - format = None + format_str = None if not full_format: ret_, fff_format = fix_full_format(days, seconds, relative, shorter, local_date, local_yesterday) - format = fff_format + format_str = fff_format if ret_: - return format - else: - format = format + return format_str - if format is None: - format = "%(month_name)s %(day)s, %(year)s" if shorter else \ + if format_str is None: + format_str = "%(month_name)s %(day)s, %(year)s" if shorter else \ "%(month_name)s %(day)s, %(year)s at %(time)s" str_time = "%d:%02d" % (local_date.hour, local_date.minute) - return format % { + return format_str % { "month_name": local_date.strftime('%b'), "weekday": local_date.strftime('%A'), "day": str(local_date.day), @@ -134,37 +131,37 @@ def fix_full_format(days, seconds, relative, shorter, local_date, local_yesterda if relative and days == 0: if seconds < 50: return True, (("1 second ago" if seconds <= 1 else - "%(seconds)d seconds ago") % {"seconds": 
seconds}) + "%(seconds)d seconds ago") % {"seconds": seconds}) if seconds < 50 * 60: minutes = round(seconds / 60.0) return True, (("1 minute ago" if minutes <= 1 else - "%(minutes)d minutes ago") % {"minutes": minutes}) + "%(minutes)d minutes ago") % {"minutes": minutes}) hours = round(seconds / (60.0 * 60)) return True, (("1 hour ago" if hours <= 1 else - "%(hours)d hours ago") % {"hours": hours}) - format = None + "%(hours)d hours ago") % {"hours": hours}) + format_str = None if days == 0: - format = "%(time)s" + format_str = "%(time)s" elif days == 1 and local_date.day == local_yesterday.day and \ relative: - format = "yesterday" if shorter else "yesterday at %(time)s" + format_str = "yesterday" if shorter else "yesterday at %(time)s" elif days < 5: - format = "%(weekday)s" if shorter else "%(weekday)s at %(time)s" + format_str = "%(weekday)s" if shorter else "%(weekday)s at %(time)s" elif days < 334: # 11mo, since confusing for same month last year - format = "%(month)s-%(day)s" if shorter else \ + format_str = "%(month)s-%(day)s" if shorter else \ "%(month)s-%(day)s at %(time)s" - return False, format + return False, format_str -class TimeoutError(Exception): - pass try: import signal + if not hasattr(signal, 'SIGALRM'): raise ImportError('signal') + class timeout: """ Time limit of command @@ -195,6 +192,7 @@ def __exit__(self, type, value, traceback): except ImportError as e: warnings.warn("timeout is not supported on your platform.", FutureWarning) + class timeout: """ Time limit of command (for windows) @@ -300,7 +298,7 @@ def unicode_obj(obj): else: try: return text(obj) - except: + except Exception: return text(repr(obj)) @@ -400,7 +398,7 @@ def get_python_console(namespace=None): shell = code.InteractiveConsole(namespace) shell._quit = False - def exit(): + def ask_exit(): shell._quit = True def readfunc(prompt=""): @@ -409,7 +407,7 @@ def readfunc(prompt=""): return six.moves.input(prompt) # inject exit method - shell.ask_exit = exit + shell.ask_exit = ask_exit shell.raw_input = readfunc return shell diff --git a/pyspider/libs/wsgi_xmlrpc.py b/pyspider/libs/wsgi_xmlrpc.py index 37b6eafa4..ccb246e3e 100644 --- a/pyspider/libs/wsgi_xmlrpc.py +++ b/pyspider/libs/wsgi_xmlrpc.py @@ -15,9 +15,10 @@ # Origin: https://code.google.com/p/wsgi-xmlrpc/ -from six.moves.xmlrpc_server import SimpleXMLRPCDispatcher import logging +from six.moves.xmlrpc_server import SimpleXMLRPCDispatcher + logger = logging.getLogger(__name__) @@ -81,9 +82,9 @@ def handle_POST(self, environ, start_response): data, getattr(self.dispatcher, '_dispatch', None) ) response += b'\n' - except Exception as e: # This should only happen if the module is buggy + except Exception as err: # This should only happen if the module is buggy # internal error, report as HTTP server error - logger.exception(e) + logger.exception(err) start_response("500 Server error", [('Content-Type', 'text/plain')]) return [] else: diff --git a/pyspider/message_queue/__init__.py b/pyspider/message_queue/__init__.py index 86592f6fb..1dd4f94f8 100644 --- a/pyspider/message_queue/__init__.py +++ b/pyspider/message_queue/__init__.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me @@ -44,10 +43,8 @@ def connect_message_queue(name, url=None, maxsize=0, lazy_limit=True): elif parsed.scheme == 'redis': from .redis_queue import Queue if ',' in parsed.netloc: - """ - redis in cluster mode (there is no concept of 'db' in cluster mode) - ex. 
redis://host1:port1,host2:port2,...,hostn:portn
-            """
+            # redis in cluster mode (there is no concept of 'db' in cluster mode)
+            # ex. redis://host1:port1,host2:port2,...,hostn:portn
             cluster_nodes = []
             for netloc in parsed.netloc.split(','):
                 cluster_nodes.append({'host': netloc.split(':')[0], 'port': int(netloc.split(':')[1])})
@@ -58,16 +55,17 @@ def connect_message_queue(name, url=None, maxsize=0, lazy_limit=True):
         db = parsed.path.lstrip('/').split('/')
         try:
             db = int(db[0])
-        except:
+        except Exception:
             logging.warning('redis DB must be a zero-based numeric index, using 0 instead')
             db = 0

         password = parsed.password or None

-        return Queue(name=name, host=parsed.hostname, port=parsed.port, db=db, maxsize=maxsize, password=password, lazy_limit=lazy_limit)
+        return Queue(name=name, host=parsed.hostname, port=parsed.port, db=db, maxsize=maxsize, password=password,
+                     lazy_limit=lazy_limit)
     elif url.startswith('kombu+'):
         url = url[len('kombu+'):]
         from .kombu_queue import Queue
         return Queue(name, url, maxsize=maxsize, lazy_limit=lazy_limit)
     else:
-        raise Exception('unknown connection url: %s', url)
+        raise Exception(f'unknown connection url: {url}')
diff --git a/pyspider/message_queue/kombu_queue.py b/pyspider/message_queue/kombu_queue.py
index e16f7b8c0..4f47b22ea 100644
--- a/pyspider/message_queue/kombu_queue.py
+++ b/pyspider/message_queue/kombu_queue.py
@@ -1,23 +1,22 @@
 #!/usr/bin/env python
-# -*- encoding: utf-8 -*-
 # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
 # Author: Binux
 # http://binux.me
 # Created on 2015-05-22 20:54:01

 import time
+
 import umsgpack
 from kombu import Connection, enable_insecure_serializers
-from kombu.serialization import register
 from kombu.exceptions import ChannelError
+from kombu.serialization import register
 from six.moves import queue as BaseQueue

-
 register('umsgpack', umsgpack.packb, umsgpack.unpackb, 'application/x-msgpack')
 enable_insecure_serializers(['umsgpack'])


-class KombuQueue(object):
+class KombuQueue:
     """
     kombu is a high-level interface for multiple message queue backends.
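A minimal standalone sketch of the cluster-address parsing used in the redis branch above (illustrative only; parse_cluster_nodes is not a function in this patch):

from six.moves.urllib.parse import urlparse

def parse_cluster_nodes(url):
    # 'redis://h1:7000,h2:7001' has netloc 'h1:7000,h2:7001';
    # split it into one {'host': ..., 'port': ...} dict per node.
    parsed = urlparse(url)
    return [{'host': netloc.split(':')[0], 'port': int(netloc.split(':')[1])}
            for netloc in parsed.netloc.split(',')]

print(parse_cluster_nodes('redis://h1:7000,h2:7001'))
# -> [{'host': 'h1', 'port': 7000}, {'host': 'h2', 'port': 7001}]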
@@ -55,16 +54,14 @@ def qsize(self):
         return 0

     def empty(self):
-        if self.qsize() == 0:
-            return True
-        else:
-            return False
+        return self.qsize() == 0

     def full(self):
         if self.maxsize and self.qsize() >= self.maxsize:
             return True
-        else:
-            return False
+        return False

     def put(self, obj, block=True, timeout=None):
         if not block:
diff --git a/pyspider/message_queue/rabbitmq.py b/pyspider/message_queue/rabbitmq.py
index 9e4e72595..aec975031 100644
--- a/pyspider/message_queue/rabbitmq.py
+++ b/pyspider/message_queue/rabbitmq.py
@@ -1,29 +1,30 @@
 #!/usr/bin/env python
-# -*- encoding: utf-8 -*-
 # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
 # Author: Binux<17175297.hk@gmail.com>
 # http://binux.me
 # Created on 2012-11-15 17:27:54

-import time
-import socket
-import select
 import logging
-import umsgpack
+import select
+import socket
 import threading
+import time

 import amqp
+import umsgpack
 from six.moves.urllib.parse import unquote
+
 try:
     from urllib import parse as urlparse
 except ImportError:
     import urlparse
+
 from six.moves import queue as BaseQueue


 def catch_error(func):
     """Catch errors of rabbitmq then reconnect"""
-    import amqp
+
     try:
         import pika.exceptions
         connect_exceptions = (
@@ -46,10 +47,11 @@ def wrap(self, *args, **kwargs):
             logging.error('RabbitMQ error: %r, reconnect.', e)
             self.reconnect()
             return func(self, *args, **kwargs)
+
     return wrap


-class PikaQueue(object):
+class PikaQueue:
     """
     A Queue like rabbitmq connector
     """
@@ -100,7 +102,7 @@ def reconnect(self):
         except pika.exceptions.ChannelClosed:
             self.connection = pika.BlockingConnection(pika.URLParameters(self.amqp_url))
             self.channel = self.connection.channel()
-        #self.channel.queue_purge(self.name)
+        # self.channel.queue_purge(self.name)

     @catch_error
     def qsize(self):
@@ -109,21 +111,19 @@ def qsize(self):
         return ret.method.message_count

     def empty(self):
-        if self.qsize() == 0:
-            return True
-        else:
-            return False
+        return self.qsize() == 0

     def full(self):
         if self.maxsize and self.qsize() >= self.maxsize:
             return True
-        else:
-            return False
+        return False

     @catch_error
     def put(self, obj, block=True, timeout=None):
         if not block:
-            return self.put_nowait()
+            return self.put_nowait(obj)

         start_time = time.time()
         while True:
@@ -191,8 +191,7 @@ class AmqpQueue(PikaQueue):
     Full = BaseQueue.Full
     max_timeout = 0.3

-    def __init__(self, name, amqp_url='amqp://guest:guest@localhost:5672/%2F',
-                 maxsize=0, lazy_limit=True):
+    def __init__(self, name, amqp_url='amqp://guest:guest@localhost:5672/%2F', maxsize=0, lazy_limit=True):
         """
         Constructor for an AmqpQueue.

@@ -207,6 +206,7 @@ def __init__(self, name, amqp_url='amqp://guest:guest@localhost:5672/%2F', maxsi
         `lazy_limit` is enabled, PikaQueue will check queue size every
         max_size / 10 put operation for better performance.
""" + super().__init__(name, amqp_url, maxsize, lazy_limit) self.name = name self.amqp_url = amqp_url self.maxsize = maxsize @@ -229,13 +229,14 @@ def reconnect(self): userid=parsed.username or 'guest', password=parsed.password or 'guest', virtual_host=unquote( - parsed.path.lstrip('/') or '%2F')).connect() + parsed.path.lstrip('/') or '%2F')) + self.connection.connect() self.channel = self.connection.channel() try: self.channel.queue_declare(self.name) except amqp.exceptions.PreconditionFailed: pass - #self.channel.queue_purge(self.name) + # self.channel.queue_purge(self.name) @catch_error def qsize(self): @@ -267,4 +268,5 @@ def get_nowait(self, ack=False): self.channel.basic_ack(message.delivery_tag) return umsgpack.unpackb(message.body) + Queue = PikaQueue diff --git a/pyspider/message_queue/redis_queue.py b/pyspider/message_queue/redis_queue.py index dc24924c1..a3772989c 100644 --- a/pyspider/message_queue/redis_queue.py +++ b/pyspider/message_queue/redis_queue.py @@ -1,11 +1,11 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2015-04-27 22:48:04 import time + import redis import umsgpack from six.moves import queue as BaseQueue diff --git a/pyspider/processor/__init__.py b/pyspider/processor/__init__.py index 8423f2f44..cf4fa0df9 100644 --- a/pyspider/processor/__init__.py +++ b/pyspider/processor/__init__.py @@ -1 +1 @@ -from .processor import ProcessorResult, Processor +from .processor import Processor, ProcessorResult diff --git a/pyspider/processor/processor.py b/pyspider/processor/processor.py index ae0de1f46..1103edaf7 100644 --- a/pyspider/processor/processor.py +++ b/pyspider/processor/processor.py @@ -1,25 +1,25 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-02-16 22:59:56 +import logging import sys -import six import time -import logging import traceback -logger = logging.getLogger("processor") +import six from six.moves import queue as Queue + from pyspider.libs import utils from pyspider.libs.log import LogFormatter -from pyspider.libs.utils import pretty_unicode, hide_me from pyspider.libs.response import rebuild_response -from .project_module import ProjectManager, ProjectFinder +from pyspider.libs.utils import hide_me, pretty_unicode +from .project_module import ProjectFinder, ProjectManager +logger = logging.getLogger("processor") class ProcessorResult(object): """The result and logs producted by a callback""" diff --git a/pyspider/processor/project_module.py b/pyspider/processor/project_module.py index 7adfe708c..34e9fcde6 100644 --- a/pyspider/processor/project_module.py +++ b/pyspider/processor/project_module.py @@ -1,22 +1,24 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-02-16 22:24:20 +import inspect +import linecache +import logging import os -import six import sys -import imp import time -import weakref -import logging -import inspect import traceback -import linecache +import types +import weakref + +import six + from pyspider.libs import utils -from pyspider.libs.log import SaveLogHandler, LogFormatter +from pyspider.libs.log import LogFormatter, SaveLogHandler + logger = logging.getLogger("processor") @@ -165,10 +167,10 @@ def __init__(self, project, mod=None): def load_module(self, fullname): if self.mod is None: - self.mod = mod = 
imp.new_module(fullname)
+            self.mod = mod = types.ModuleType(fullname)
         else:
             mod = self.mod
-        mod.__file__ = '<%s>' % self.name
+        mod.__file__ = f'<{self.name}>'
         mod.__loader__ = self
         mod.__project__ = self.project
         mod.__package__ = ''
@@ -216,7 +218,7 @@ def find_module(self, fullname, path=None):
             return ProjectLoader(info)

         def load_module(self, fullname):
-            mod = imp.new_module(fullname)
+            mod = types.ModuleType(fullname)
             mod.__file__ = '<projects>'
             mod.__loader__ = self
             mod.__path__ = ['<projects>']
@@ -228,6 +230,7 @@ def is_package(self, fullname):
 else:
     import importlib.abc

+
     class ProjectFinder(importlib.abc.MetaPathFinder):
         '''ProjectFinder class for sys.meta_path'''
@@ -255,9 +258,10 @@ def find_module(self, fullname, path):
             if info:
                 return ProjectLoader(info)

+
     class ProjectsLoader(importlib.abc.InspectLoader):
         def load_module(self, fullname):
-            mod = imp.new_module(fullname)
+            mod = types.ModuleType(fullname)
             mod.__file__ = '<projects>'
             mod.__loader__ = self
             mod.__path__ = ['<projects>']
@@ -278,6 +282,7 @@ def get_source(self, path):
         def get_code(self, fullname):
             return compile(self.get_source(fullname), '<projects>', 'exec')

+
     class ProjectLoader(ProjectLoader, importlib.abc.Loader):
         def create_module(self, spec):
             return self.load_module(spec.name)
@@ -286,4 +291,4 @@ def exec_module(self, module):
             return module

         def module_repr(self, module):
-            return '<Module projects.%s>' % self.name
+            return f'<Module projects.{self.name}>'
diff --git a/pyspider/result/__init__.py b/pyspider/result/__init__.py
index c386ba625..047bc24ec 100644
--- a/pyspider/result/__init__.py
+++ b/pyspider/result/__init__.py
@@ -1,8 +1,7 @@
 #!/usr/bin/env python
-# -*- encoding: utf-8 -*-
 # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
 # Author: Binux
 # http://binux.me
 # Created on 2014-10-19 16:10:19

-from .result_worker import ResultWorker, OneResultWorker
+from .result_worker import OneResultWorker, ResultWorker
diff --git a/pyspider/result/result_worker.py b/pyspider/result/result_worker.py
index 16935fa18..3150508c0 100644
--- a/pyspider/result/result_worker.py
+++ b/pyspider/result/result_worker.py
@@ -1,14 +1,15 @@
 #!/usr/bin/env python
-# -*- encoding: utf-8 -*-
 # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
 # Author: Binux
 # http://binux.me
 # Created on 2014-10-19 15:37:46

-import time
 import json
 import logging
+import time
+
 from six.moves import queue as Queue
+
 logger = logging.getLogger("result")
diff --git a/pyspider/run.py b/pyspider/run.py
index 7e3333c5f..d9c10745d 100755
--- a/pyspider/run.py
+++ b/pyspider/run.py
@@ -1,25 +1,25 @@
 #!/usr/bin/env python
-# -*- encoding: utf-8 -*-
 # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
 # Author: Binux
 # http://binux.me
 # Created on 2014-03-05 00:11:49

-import os
-import sys
-import six
 import copy
-import time
-import shutil
 import logging
 import logging.config
+import os
+import shutil
+import sys
+import time

 import click
+import six
+
 import pyspider
-from pyspider.message_queue import connect_message_queue
 from pyspider.database import connect_database
 from pyspider.libs import utils
+from pyspider.message_queue import connect_message_queue


 def read_config(ctx, param, value):
@@ -39,7 +39,7 @@ def underline_dict(d):

 def connect_db(ctx, param, value):
     if not value:
-        return
+        return None
     return utils.Get(lambda: connect_database(value))

@@ -51,7 +51,7 @@ def load_cls(ctx, param, value):

 def connect_rpc(ctx, param, value):
     if not value:
-        return
+        return None
     try:
         from six.moves import xmlrpc_client
     except ImportError:
@@ -75,12 +75,12 @@ def connect_rpc(ctx, param, value):
               help='database url for resultdb, default: sqlite')
 @click.option('--message-queue', envvar='AMQP_URL',
               help='connection url to message queue, '
-              'default: builtin multiprocessing.Queue')
+                   'default: builtin multiprocessing.Queue')
 @click.option('--amqp-url',
               help='[deprecated] amqp url for rabbitmq. '
-              'please use --message-queue instead.')
+                   'please use --message-queue instead.')
 @click.option('--beanstalk', envvar='BEANSTALK_HOST',
               help='[deprecated] beanstalk config for beanstalk queue. '
-              'please use --message-queue instead.')
+                   'please use --message-queue instead.')
 @click.option('--phantomjs-proxy', envvar='PHANTOMJS_PROXY', help="phantomjs proxy ip:port")
 @click.option('--puppeteer-proxy', envvar='PUPPETEER_PROXY', help="puppeteer proxy ip:port")
 @click.option('--data-path', default='./data', help='data dir path')
@@ -125,7 +125,7 @@ def cli(ctx, **kwargs):
             os.mkdir(kwargs['data_path'])
         if db in ('taskdb', 'resultdb'):
             kwargs[db] = utils.Get(lambda db=db: connect_database('sqlite+%s://' % (db)))
-        elif db in ('projectdb', ):
+        elif db in ('projectdb',):
             kwargs[db] = utils.Get(lambda db=db: connect_database('local+%s://%s' % (
                 db, os.path.join(os.path.dirname(__file__), 'libs/bench.py'))))
         else:
@@ -185,12 +185,13 @@ def cli(ctx, **kwargs):
 @click.option('--xmlrpc-port', envvar='SCHEDULER_XMLRPC_PORT', default=23333)
 @click.option('--inqueue-limit', default=0,
               help='size limit of task queue for each project, '
-              'tasks will been ignored when overflow')
+                   'tasks will be ignored on overflow')
 @click.option('--delete-time', default=24 * 60 * 60,
               help='delete time before marked as delete')
 @click.option('--active-tasks', default=100, help='active log size')
 @click.option('--loop-limit', default=1000, help='maximum number of tasks dealt with in a loop')
-@click.option('--fail-pause-num', default=10, help='auto pause the project when last FAIL_PAUSE_NUM task failed, set 0 to disable')
+@click.option('--fail-pause-num', default=10,
+              help='auto pause the project when last FAIL_PAUSE_NUM task failed, set 0 to disable')
 @click.option('--scheduler-cls', default='pyspider.scheduler.ThreadBaseScheduler', callback=load_cls,
               help='scheduler class to be used.')
 @click.option('--threads', default=None, help='thread number for ThreadBaseScheduler, default: 4')
@@ -238,7 +239,8 @@ def scheduler(ctx, xmlrpc, no_xmlrpc, xmlrpc_host, xmlrpc_port,
 @click.option('--timeout', help='default fetch timeout')
 @click.option('--phantomjs-endpoint', help="endpoint of phantomjs, start via pyspider phantomjs")
 @click.option('--puppeteer-endpoint', help="endpoint of puppeteer, start via pyspider puppeteer")
-@click.option('--splash-endpoint', help="execute endpoint of splash: http://splash.readthedocs.io/en/stable/api.html#execute")
+@click.option('--splash-endpoint',
+              help="execute endpoint of splash: http://splash.readthedocs.io/en/stable/api.html#execute")
 @click.option('--fetcher-cls', default='pyspider.fetcher.Fetcher', callback=load_cls,
               help='Fetcher class to be used.')
 @click.pass_context
@@ -314,13 +316,13 @@ def result_worker(ctx, result_cls, get_object=False):
     g = ctx.obj
     ResultWorker = load_cls(None, None, result_cls)

-    result_worker = ResultWorker(resultdb=g.resultdb, inqueue=g.processor2result)
+    _result_worker = ResultWorker(resultdb=g.resultdb, inqueue=g.processor2result)

-    g.instances.append(result_worker)
+    g.instances.append(_result_worker)
     if g.get('testing_mode') or get_object:
-        return result_worker
+        return _result_worker

-    result_worker.run()
+    _result_worker.run()


 @cli.command()
@@ -389,14 +391,14 @@ def webui(ctx, host, port, cdn,
scheduler_rpc, fetcher_rpc, max_rate, max_burst,
     scheduler_rpc = connect_rpc(ctx, None, scheduler_rpc)
     if scheduler_rpc is None and os.environ.get('SCHEDULER_PORT_23333_TCP_ADDR'):
         app.config['scheduler_rpc'] = connect_rpc(ctx, None,
-                                                  'http://{}:{}/'.format(os.environ.get('SCHEDULER_PORT_23333_TCP_ADDR'),
-                                                                         os.environ.get('SCHEDULER_PORT_23333_TCP_PORT') or 23333))
+                                                  'http://{}:{}/'.format(
+                                                      os.environ.get('SCHEDULER_PORT_23333_TCP_ADDR'),
+                                                      os.environ.get('SCHEDULER_PORT_23333_TCP_PORT') or 23333))
     elif scheduler_rpc is None:
         app.config['scheduler_rpc'] = connect_rpc(ctx, None, 'http://127.0.0.1:23333/')
     else:
         app.config['scheduler_rpc'] = scheduler_rpc

-    app.debug = g.debug
     g.instances.append(app)
     if g.get('testing_mode') or get_object:
@@ -422,9 +424,11 @@ def phantomjs(ctx, phantomjs_path, port, auto_restart, args):
     _quit = []
     phantomjs_fetcher = os.path.join(
         os.path.dirname(pyspider.__file__), 'fetcher/phantomjs_fetcher.js')
+    if not phantomjs_fetcher:
+        return None
     cmd = [phantomjs_path,
            # this may cause memory leak: https://github.com/ariya/phantomjs/issues/12903
-           #'--load-images=false',
+           # '--load-images=false',
            '--ssl-protocol=any',
            '--disk-cache=true'] + list(args or []) + [phantomjs_fetcher, str(port)]
@@ -454,6 +458,7 @@ def quit(*args, **kwargs):
             break
         _phantomjs = subprocess.Popen(cmd)

+
 @cli.command()
 @click.option('--port', default=22222, help='puppeteer port')
 @click.option('--auto-restart', default=False, help='auto restart puppeteer if crashed')
 @click.pass_context
@@ -472,7 +477,10 @@ def puppeteer(ctx, port, auto_restart, args):
     cmd = ['node', puppeteer_fetcher, str(port)]

     try:
         _puppeteer = subprocess.Popen(cmd)
+        time.sleep(1)
+        if _puppeteer.poll() is not None:
+            logging.warning('puppeteer exited with code %r, continue running without it.',
+                            _puppeteer.returncode)
+            return None
     except OSError:
         logging.warning('puppeteer not found, continue running without it.')
         return None
@@ -492,10 +500,10 @@ def quit(*args, **kwargs):
         return puppeteer

     while True:
         _puppeteer.wait()
         if _quit or not auto_restart:
             break
         _puppeteer = subprocess.Popen(cmd)


 @cli.command()
@@ -505,7 +513,7 @@ def quit(*args, **kwargs):
               help='instance num of result worker')
 @click.option('--run-in', default='subprocess', type=click.Choice(['subprocess', 'thread']),
               help='run each component in a thread or subprocess. '
-              'always using thread for windows.')
+                   'threads are always used on Windows.')
 @click.pass_context
 def all(ctx, fetcher_num, processor_num, result_worker_num, run_in):
     """
@@ -544,18 +552,18 @@ def all(ctx, fetcher_num, processor_num, result_worker_num, run_in):
     # result worker
     result_worker_config = g.config.get('result_worker', {})
-    for i in range(result_worker_num):
+    for _ in range(result_worker_num):
         threads.append(run_in(ctx.invoke, result_worker, **result_worker_config))

     # processor
     processor_config = g.config.get('processor', {})
-    for i in range(processor_num):
+    for _ in range(processor_num):
         threads.append(run_in(ctx.invoke, processor, **processor_config))

     # fetcher
     fetcher_config = g.config.get('fetcher', {})
     fetcher_config.setdefault('xmlrpc_host', '127.0.0.1')
-    for i in range(fetcher_num):
+    for _ in range(fetcher_num):
         threads.append(run_in(ctx.invoke, fetcher, **fetcher_config))

     # scheduler
@@ -588,7 +596,7 @@ def all(ctx, fetcher_num, processor_num, result_worker_num, run_in):
 @click.option('--result-worker-num', default=1,
               help='instance num of result worker')
 @click.option('--run-in', default='subprocess', type=click.Choice(['subprocess', 'thread']),
               help='run each component in a thread or subprocess. '
-              'always using thread for windows.')
+                   'threads are always used on Windows.')
 @click.option('--total', default=10000, help="total url in test page")
 @click.option('--show', default=20, help="show how many urls in a page")
 @click.option('--taskdb-bench', default=False, is_flag=True,
@@ -651,14 +659,14 @@ def clear_project():
     # result worker
     result_worker_config = g.config.get('result_worker', {})
-    for i in range(result_worker_num):
+    for _ in range(result_worker_num):
         threads.append(run_in(ctx.invoke, result_worker,
                               result_cls='pyspider.libs.bench.BenchResultWorker',
                               **result_worker_config))

     # processor
     processor_config = g.config.get('processor', {})
-    for i in range(processor_num):
+    for _ in range(processor_num):
         threads.append(run_in(ctx.invoke, processor,
                               processor_cls='pyspider.libs.bench.BenchProcessor',
                               **processor_config))
@@ -666,7 +674,7 @@ def clear_project():

     # fetcher
     fetcher_config = g.config.get('fetcher', {})
     fetcher_config.setdefault('xmlrpc_host', '127.0.0.1')
-    for i in range(fetcher_num):
+    for _ in range(fetcher_num):
         threads.append(run_in(ctx.invoke, fetcher,
                               fetcher_cls='pyspider.libs.bench.BenchFetcher',
                               **fetcher_config))
@@ -838,5 +846,6 @@ def send_message(ctx, scheduler_rpc, project, message):
 def main():
     cli()

+
 if __name__ == '__main__':
     main()
diff --git a/pyspider/scheduler/__init__.py b/pyspider/scheduler/__init__.py
index 997102d37..21a9f9071 100644
--- a/pyspider/scheduler/__init__.py
+++ b/pyspider/scheduler/__init__.py
@@ -1 +1 @@
-from .scheduler import Scheduler, OneScheduler, ThreadBaseScheduler  # NOQA
+from .scheduler import OneScheduler, Scheduler, ThreadBaseScheduler  # NOQA
diff --git a/pyspider/scheduler/scheduler.py b/pyspider/scheduler/scheduler.py
index 084baff28..6dc8545d1 100644
--- a/pyspider/scheduler/scheduler.py
+++ b/pyspider/scheduler/scheduler.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# -*- encoding: utf-8 -*-
 # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
 # Author: Binux
 # http://binux.me
@@ -18,6 +17,7 @@

 from pyspider.libs import counter, utils
 from pyspider.libs.base_handler import BaseHandler
+
 from .task_queue import TaskQueue

 logger = logging.getLogger('scheduler')
@@ -671,7 +671,9 @@ def run_once(self):
         self._try_dump_cnt()

     def run(self):
-        '''Start scheduler loop'''
+        """
+        Start scheduler loop
+        """
logger.info("scheduler starting...") while not self._quit: @@ -799,9 +801,9 @@ def webui_update(): } application.register_function(webui_update, 'webui_update') - import tornado.wsgi - import tornado.ioloop import tornado.httpserver + import tornado.ioloop + import tornado.wsgi container = tornado.wsgi.WSGIContainer(application) self.xmlrpc_ioloop = tornado.ioloop.IOLoop() @@ -1180,6 +1182,7 @@ def quit(self): import random import threading + from pyspider.database.sqlite.sqlitebase import SQLiteMixin @@ -1234,7 +1237,7 @@ def resultdb(self, resultdb): self.local.resultdb = resultdb def _start_threads(self): - for i in range(self.threads): + for _ in range(self.threads): queue = Queue.Queue() thread = threading.Thread(target=self._thread_worker, args=(queue, )) thread.daemon = True diff --git a/pyspider/scheduler/task_queue.py b/pyspider/scheduler/task_queue.py index a6d02e3a5..06b5febee 100644 --- a/pyspider/scheduler/task_queue.py +++ b/pyspider/scheduler/task_queue.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me @@ -9,20 +8,22 @@ import logging import threading import time +from collections.abc import Mapping as DictMixin -try: - from UserDict import DictMixin -except ImportError: - from collections import Mapping as DictMixin -from .token_bucket import Bucket from six.moves import queue as Queue +from .token_bucket import Bucket + logger = logging.getLogger('scheduler') -try: - cmp -except NameError: - cmp = lambda x, y: (x > y) - (x < y) + +# cmp() 在 python3 中不存在 +def cmp(a, b): + """ + Compare the two objects x and y and return an integer according to the outcome. + The return value is negative if x < y, zero if x == y and strictly positive if x > y. + """ + return (a > b) - (a < b) class AtomInt(object): @@ -190,16 +191,16 @@ def _check_processing(self): def put(self, taskid, priority=0, exetime=0): """ Put a task into task queue - + when use heap sort, if we put tasks(with the same priority and exetime=0) into queue, the queue is not a strict FIFO queue, but more like a FILO stack. - It is very possible that when there are continuous big flow, the speed of select is + It is very possible that when there are continuous big flow, the speed of select is slower than request, resulting in priority-queue accumulation in short time. - In this scenario, the tasks more earlier entering the priority-queue will not get - processed until the request flow becomes small. - - Thus, we store a global atom self increasing value into task.sequence which represent - the task enqueue sequence. When the comparison of exetime and priority have no + In this scenario, the tasks more earlier entering the priority-queue will not get + processed until the request flow becomes small. + + Thus, we store a global atom self increasing value into task.sequence which represent + the task enqueue sequence. When the comparison of exetime and priority have no difference, we compare task.sequence to ensure that the entire queue is ordered. 
""" now = time.time() diff --git a/pyspider/scheduler/token_bucket.py b/pyspider/scheduler/token_bucket.py index e7bb1b308..9fb8b9a87 100644 --- a/pyspider/scheduler/token_bucket.py +++ b/pyspider/scheduler/token_bucket.py @@ -1,11 +1,11 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-02-07 16:53:08 import time + try: import threading as _threading except ImportError: diff --git a/pyspider/webui/__init__.py b/pyspider/webui/__init__.py index abbc7d707..f89533a93 100644 --- a/pyspider/webui/__init__.py +++ b/pyspider/webui/__init__.py @@ -1,8 +1,7 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-02-22 23:20:40 -from . import app, index, debug, task, result, login +from . import app, debug, index, login, result, task diff --git a/pyspider/webui/app.py b/pyspider/webui/app.py index 2261fd6e6..9d7125080 100644 --- a/pyspider/webui/app.py +++ b/pyspider/webui/app.py @@ -1,23 +1,27 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- -# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: -# Author: Binux -# http://binux.me -# Created on 2014-02-22 23:17:13 +""" +vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: +Author: Binux + http://binux.me +Created on 2014-02-22 23:17:13 +""" +import logging import os import sys -import logging -logger = logging.getLogger("webui") +from flask import Flask from six import reraise from six.moves import builtins from six.moves.urllib.parse import urljoin -from flask import Flask + from pyspider.fetcher import tornado_fetcher +logger = logging.getLogger("webui") + if os.name == 'nt': import mimetypes + mimetypes.add_type("text/css", ".css", True) @@ -29,10 +33,10 @@ def logger(self): return logger def run(self, host=None, port=None, debug=None, **options): - import tornado.wsgi - import tornado.ioloop import tornado.httpserver + import tornado.ioloop import tornado.web + import tornado.wsgi if host is None: host = '127.0.0.1' @@ -46,7 +50,6 @@ def run(self, host=None, port=None, debug=None, **options): self.debug = bool(debug) hostname = host - port = port application = self use_reloader = self.debug use_debugger = self.debug @@ -61,7 +64,7 @@ def run(self, host=None, port=None, debug=None, **options): logger.warning('WebDav interface not enabled: %r', e) dav_app = None if dav_app: - from werkzeug.wsgi import DispatcherMiddleware + from werkzeug.middleware.dispatcher import DispatcherMiddleware application = DispatcherMiddleware(application, { '/dav': dav_app }) @@ -114,4 +117,6 @@ def cdn_url_handler(error, endpoint, kwargs): reraise(exc_type, exc_value, tb) else: raise error + + app.handle_url_build_error = cdn_url_handler diff --git a/pyspider/webui/bench_test.py b/pyspider/webui/bench_test.py index 18d21e9ba..16c1453be 100644 --- a/pyspider/webui/bench_test.py +++ b/pyspider/webui/bench_test.py @@ -1,17 +1,18 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-12-08 22:31:17 import random + try: from urllib import urlencode except ImportError: from urllib.parse import urlencode from flask import request + from .app import app diff --git a/pyspider/webui/debug.py b/pyspider/webui/debug.py index 6a0694139..e06d3fbff 100644 --- a/pyspider/webui/debug.py +++ b/pyspider/webui/debug.py @@ -1,27 +1,25 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et 
sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-02-23 00:19:06 +import datetime +import inspect +import socket import sys import time -import socket -import inspect -import datetime import traceback -from flask import render_template, request, json -try: - import flask_login as login -except ImportError: - from flask.ext import login +from flask import json, render_template, request -from pyspider.libs import utils, sample_handler, dataurl +import flask_login as login + +from pyspider.libs import dataurl, sample_handler, utils from pyspider.libs.response import rebuild_response -from pyspider.processor.project_module import ProjectManager, ProjectFinder +from pyspider.processor.project_module import ProjectFinder, ProjectManager + from .app import app default_task = { @@ -32,7 +30,7 @@ 'callback': 'on_start', }, } -default_script = inspect.getsource(sample_handler) +DEFAULT_SCRIPT = inspect.getsource(sample_handler) @app.route('/debug/', methods=['GET', 'POST']) @@ -44,7 +42,7 @@ def debug(project): if info: script = info['script'] else: - script = (default_script + script = (DEFAULT_SCRIPT .replace('__DATE__', datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) .replace('__PROJECT_NAME__', project) .replace('__START_URL__', request.values.get('start-urls') or '__START_URL__')) @@ -81,7 +79,7 @@ def run(project): 'time': time.time() - start_time, } return json.dumps(utils.unicode_obj(result)), \ - 200, {'Content-Type': 'application/json'} + 200, {'Content-Type': 'application/json'} project_info = { 'name': project, @@ -102,7 +100,7 @@ def run(project): 'time': time.time() - start_time, } return json.dumps(utils.unicode_obj(result)), \ - 200, {'Content-Type': 'application/json'} + 200, {'Content-Type': 'application/json'} project_info['script'] = info['script'] fetch_result = {} @@ -181,7 +179,7 @@ def save(project): info = { 'script': script, } - if project_info.get('status') in ('DEBUG', 'RUNNING', ): + if project_info.get('status') in ('DEBUG', 'RUNNING',): info['status'] = 'CHECKING' projectdb.update(project, info) else: @@ -212,7 +210,7 @@ def get_script(project): return 'project name is not allowed!', 400 info = projectdb.get(project, fields=['name', 'script']) return json.dumps(utils.unicode_obj(info)), \ - 200, {'Content-Type': 'application/json'} + 200, {'Content-Type': 'application/json'} @app.route('/blank.html') diff --git a/pyspider/webui/index.py b/pyspider/webui/index.py index 381131d09..a0abff78f 100644 --- a/pyspider/webui/index.py +++ b/pyspider/webui/index.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me @@ -7,13 +6,10 @@ import socket -from six import iteritems, itervalues -from flask import render_template, request, json +from flask import json, render_template, request +from six import iteritems -try: - import flask_login as login -except ImportError: - from flask.ext import login +import flask_login as login from .app import app diff --git a/pyspider/webui/login.py b/pyspider/webui/login.py index d32d5b73a..4415907a6 100644 --- a/pyspider/webui/login.py +++ b/pyspider/webui/login.py @@ -1,16 +1,15 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-12-10 20:36:27 import base64 + from flask import Response -try: - import flask_login as login -except ImportError: - from flask.ext import login + +import flask_login as login + from 
.app import app login_manager = login.LoginManager() diff --git a/pyspider/webui/result.py b/pyspider/webui/result.py index 84305bb31..65cac83aa 100644 --- a/pyspider/webui/result.py +++ b/pyspider/webui/result.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me @@ -7,11 +6,12 @@ from __future__ import unicode_literals -from flask import render_template, request, json -from flask import Response -from .app import app +from flask import Response, json, render_template, request + from pyspider.libs import result_dump +from .app import app + @app.route('/results') def result(): diff --git a/pyspider/webui/task.py b/pyspider/webui/task.py index 4652c641d..45d627d75 100644 --- a/pyspider/webui/task.py +++ b/pyspider/webui/task.py @@ -1,14 +1,15 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-07-16 15:30:57 import socket -from flask import abort, render_template, request, json + +from flask import abort, json, render_template, request from pyspider.libs import utils + from .app import app diff --git a/pyspider/webui/webdav.py b/pyspider/webui/webdav.py index 5483dbf19..b55b0527f 100644 --- a/pyspider/webui/webdav.py +++ b/pyspider/webui/webdav.py @@ -1,20 +1,22 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2015-6-3 11:29 +import base64 import os import time -import base64 + import six from six import BytesIO +from wsgidav.dav_error import HTTP_FORBIDDEN, DAVError +from wsgidav.dav_provider import DAVCollection, DAVNonCollection, DAVProvider from wsgidav.wsgidav_app import DEFAULT_CONFIG, WsgiDAVApp -from wsgidav.dav_provider import DAVProvider, DAVCollection, DAVNonCollection -from wsgidav.dav_error import DAVError, HTTP_FORBIDDEN -from pyspider.libs.utils import utf8, text + +from pyspider.libs.utils import text, utf8 + from .app import app diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 000000000..490cc93a5 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,7 @@ +# for test or doc +httpbin +recommonmark +easywebdav +coverage +pyproxy==0.1.6 +pylint diff --git a/requirements.txt b/requirements.txt index 85e030fef..fa2ab5899 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,24 +1,27 @@ -Flask==0.10 -Jinja2==2.7 +Jinja2==3.1.2 chardet==3.0.4 cssselect==0.9 -lxml==4.3.3 -pycurl==7.43.0.3 -pyquery==1.4.0 -requests==2.24.0 +lxml==4.8.0 +pycurl==7.45.1 +pyquery==1.4.3 +requests==2.27.1 tornado==4.5.3 mysql-connector-python==8.0.16 -pika==1.1.0 +pika==1.2.1 pymongo==3.9.0 -Flask-Login==0.2.11 +Flask==2.1.2 +Flask-Login==0.6.1 u-msgpack-python==1.6 -click==6.6 -SQLAlchemy==1.3.10 -six==1.10.0 -amqp==2.4.0 -redis==2.10.6 -redis-py-cluster==1.3.6 -kombu==4.4.0 -psycopg2==2.8.2 -elasticsearch==2.3.0 -tblib==1.4.0 +click==8.1.3 +SQLAlchemy==1.4.36 +six==1.16.0 +amqp>=5.1.1 +redis==3.5.3 +redis-py-cluster==2.1.3 +kombu>=5.2.4 +psycopg2-binary==2.9.3 +elasticsearch==7.10.0 +tblib==1.7.0 +Werkzeug==2.1.2 +wsgidav==2.3.0 +vine==5.0.0 diff --git a/run.py b/run.py index eba7e6e89..7e357eecc 100755 --- a/run.py +++ b/run.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me diff --git a/setup.py b/setup.py index 2512f4708..0fbc7fc36 100644 --- a/setup.py +++ b/setup.py @@ 
-1,16 +1,15 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-11-24 22:27:45 -import sys -from setuptools import setup, find_packages from codecs import open from os import path +from setuptools import find_packages, setup + here = path.abspath(path.dirname(__file__)) with open(path.join(here, 'README.md'), encoding='utf-8') as f: long_description = f.read() @@ -18,18 +17,18 @@ import pyspider install_requires = [ - 'Flask==0.10', - 'Jinja2==2.7', + 'Flask==2.1.2', + 'Flask-Login==0.6.1', + 'Jinja2==3.1.2', 'chardet==3.0.4', 'cssselect==0.9', - "lxml==4.3.3", - 'pycurl==7.43.0.3', - 'requests==2.24.0', - 'Flask-Login==0.2.11', + "lxml==4.8.0", + 'pycurl==7.45.1', + 'requests==2.27.1', 'u-msgpack-python==1.6', - 'click==3.3', - 'six==1.10.0', - 'tblib==1.4.0', + 'click==8.1.3', + 'six==1.16.0', + 'tblib==1.7.0', 'wsgidav==2.3.0', 'tornado>=3.2,<=4.5.3', 'pyquery', @@ -38,14 +37,15 @@ extras_require_all = [ 'mysql-connector-python==8.0.16', 'pymongo==3.9.0', - 'redis==2.10.6', - 'redis-py-cluster==1.3.6', - 'psycopg2==2.8.2', - 'elasticsearch==2.3.0', - 'kombu==4.4.0', - 'amqp==2.4.0', - 'SQLAlchemy==1.3.10', - 'pika==1.1.0' + 'redis==3.5.3', + 'redis-py-cluster==2.1.3', + 'psycopg2-binary==2.9.3', + 'elasticsearch==7.10.0', + 'kombu>=5.2.4,<6', + 'amqp>=5.1.1,<6', + 'SQLAlchemy==1.4.36', + 'vine==5.0.0', + 'pika==1.2.1', ] setup( @@ -55,18 +55,19 @@ description='A Powerful Spider System in Python', long_description=long_description, - url='https://github.com/binux/pyspider', + url='https://github.com/lusi1990/pyspider', - author='Roy Binux', - author_email='roy@binux.me', + author='Master Lu', + author_email='lusi2114@gmail.com', license='Apache License, Version 2.0', classifiers=[ 'Development Status :: 4 - Beta', - 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', 'License :: OSI Approved :: Apache Software License', @@ -89,7 +90,7 @@ 'all': extras_require_all, 'test': [ 'coverage', - 'Werkzeug==0.16.1', + 'Werkzeug==2.1.2', 'httpbin==0.7.0', 'pyproxy==0.1.6', 'easywebdav==1.2.0', diff --git a/tests/__init__.py b/tests/__init__.py index 5a125efd0..eb5982f79 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me @@ -8,4 +7,5 @@ import os import unittest -all_suite = unittest.TestLoader().discover(os.path.dirname(__file__), "test_*.py") +# all_suite = unittest.TestLoader().discover(os.path.dirname(__file__), "test_*.py") +all_suite = unittest.TestLoader().discover(os.path.dirname(__file__), "test_webui.py") diff --git a/tests/data_fetcher_processor_handler.py b/tests/data_fetcher_processor_handler.py index 74ca5b48c..34c823bdb 100644 --- a/tests/data_fetcher_processor_handler.py +++ b/tests/data_fetcher_processor_handler.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me @@ -7,6 +6,7 @@ from pyspider.libs.base_handler import * + class Handler(BaseHandler): @not_send_status diff --git a/tests/data_handler.py b/tests/data_handler.py index 3f77235c7..1f77eca47 100644 --- a/tests/data_handler.py +++ b/tests/data_handler.py @@ -1,17 +1,21 @@ - -#!/usr/bin/env python -# -*- encoding: utf-8 -*- +# 
!/usr/bin/env python # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-02-22 14:02:21 - +import logging import time -from pyspider.libs.base_handler import BaseHandler, catch_status_code_error, every + +from pyspider.libs.base_handler import (BaseHandler, catch_status_code_error, + every) + +logger = logging.getLogger(__name__) + class IgnoreHandler(object): pass + class TestHandler(BaseHandler): retry_delay = { 1: 10, @@ -59,4 +63,3 @@ def generator(self, response): def sleep(self, response): time.sleep(response.save) - diff --git a/tests/data_sample_handler.py b/tests/data_sample_handler.py index ea193b492..9008a68c0 100644 --- a/tests/data_sample_handler.py +++ b/tests/data_sample_handler.py @@ -1,9 +1,9 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- + # Created on __DATE__ # Project: __PROJECT_NAME__ -from pyspider.libs.base_handler import * +from pyspider.libs.base_handler import BaseHandler, config, every class Handler(BaseHandler): diff --git a/tests/data_test_webpage.py b/tests/data_test_webpage.py index 70bc3dedf..fd1c9c126 100644 --- a/tests/data_test_webpage.py +++ b/tests/data_test_webpage.py @@ -1,11 +1,14 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2015-01-24 13:44:10 -from httpbin import app +# from httpbin import app +from flask import Flask + +app = Flask(__name__) + @app.route('/pyspider/test.html') def test_page(): @@ -26,6 +29,7 @@ def test_page(): stream ''' + @app.route('/pyspider/ajax.html') def test_ajax(): return ''' @@ -45,6 +49,7 @@ def test_ajax(): ''' + @app.route('/pyspider/ajax_click.html') def test_ajax_click(): return ''' diff --git a/tests/test_base_handler.py b/tests/test_base_handler.py index 317e12a60..d158ee426 100644 --- a/tests/test_base_handler.py +++ b/tests/test_base_handler.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me diff --git a/tests/test_bench.py b/tests/test_bench.py index 9b584700f..6a43f8a31 100644 --- a/tests/test_bench.py +++ b/tests/test_bench.py @@ -1,21 +1,22 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-12-10 01:34:09 +import inspect import os +import shutil import sys import time -import click -import shutil -import inspect import unittest +import click + from pyspider import run from pyspider.libs import utils + class TestBench(unittest.TestCase): @classmethod @@ -29,6 +30,7 @@ def tearDownClass(self): def test_10_bench(self): import subprocess + #cmd = [sys.executable] cmd = ['coverage', 'run'] p = subprocess.Popen(cmd+[ diff --git a/tests/test_counter.py b/tests/test_counter.py index 03ceb4203..e4cf41190 100644 --- a/tests/test_counter.py +++ b/tests/test_counter.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me @@ -11,6 +10,7 @@ from pyspider.libs import counter + class TestCounter(unittest.TestCase): def test_010_TimebaseAverageEventCounter(self): c = counter.TimebaseAverageEventCounter(2, 1) diff --git a/tests/test_database.py b/tests/test_database.py index f9d563a3b..78b7a1774 100644 --- a/tests/test_database.py +++ b/tests/test_database.py @@ -1,14 +1,12 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # 
Author: Binux # http://binux.me # Created on 2014-02-08 22:37:13 -from __future__ import unicode_literals, division +from __future__ import division, unicode_literals import os -import six import time import unittest @@ -16,7 +14,7 @@ from pyspider.database.base.taskdb import TaskDB -class TaskDBCase(object): +class TaskDBCase(unittest.TestCase): sample_task = { 'taskid': 'taskid', 'project': 'project', @@ -58,7 +56,7 @@ class TaskDBCase(object): 'time': 10, 'follows': 3, 'outputs': 5, - 'exception': u"中文", + 'exception': "中文", }, }, 'lastcrawltime': time.time(), @@ -155,7 +153,7 @@ def test_z20_update_projects(self): self.taskdb.UPDATE_PROJECTS_TIME = saved -class ProjectDBCase(object): +class ProjectDBCase(unittest.TestCase): sample_project = { 'name': 'name', 'script': 'import time\nprint(time.time(), "!@#$%^&*()\';:<>?/|")', @@ -245,7 +243,7 @@ def test_z10_drop(self): self.assertIsNone(self.projectdb.get('drop_project3')) -class ResultDBCase(object): +class ResultDBCase(unittest.TestCase): @classmethod def setUpClass(self): diff --git a/tests/test_fetcher.py b/tests/test_fetcher.py index 02ace999c..540fe48e8 100644 --- a/tests/test_fetcher.py +++ b/tests/test_fetcher.py @@ -1,31 +1,32 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-02-15 22:10:35 -import os -import json import copy -import time +import json +import logging +import logging.config +import os import socket -import umsgpack import subprocess +import time import unittest -import logging -import logging.config +import umsgpack + logging.config.fileConfig("pyspider/logging.conf") try: from six.moves import xmlrpc_client except ImportError: import xmlrpclib as xmlrpc_client + +from pyspider.fetcher.tornado_fetcher import Fetcher from pyspider.libs import utils from pyspider.libs.multiprocessing_queue import Queue from pyspider.libs.response import rebuild_response -from pyspider.fetcher.tornado_fetcher import Fetcher class TestFetcher(unittest.TestCase): @@ -53,9 +54,10 @@ class TestFetcher(unittest.TestCase): @classmethod def setUpClass(self): - import tests.data_test_webpage import httpbin + import tests.data_test_webpage + self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887, passthrough_errors=False) self.httpbin = 'http://127.0.0.1:14887' @@ -442,9 +444,10 @@ def sample_task_http(self): @classmethod def setUpClass(self): - import tests.data_test_webpage import httpbin + import tests.data_test_webpage + self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, host='0.0.0.0', port=14887, passthrough_errors=False) self.httpbin = 'http://' + socket.gethostbyname(socket.gethostname()) + ':14887' @@ -459,7 +462,7 @@ def setUpClass(self): '--password=123456', '--port=14830', '--debug'], close_fds=True) self.proxy = socket.gethostbyname(socket.gethostname()) + ':14830' - + @classmethod def tearDownClass(self): self.rpc("close")() diff --git a/tests/test_fetcher_processor.py b/tests/test_fetcher_processor.py index 44cf2c1d3..358b0baf1 100644 --- a/tests/test_fetcher_processor.py +++ b/tests/test_fetcher_processor.py @@ -1,21 +1,21 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2015-01-18 14:09:41 import os -import time -import httpbin import subprocess +import time import unittest +import httpbin +from six.moves.queue import Queue + from pyspider.database.local.projectdb import ProjectDB from 
pyspider.fetcher import Fetcher +from pyspider.libs import dataurl, utils from pyspider.processor import Processor -from pyspider.libs import utils, dataurl -from six.moves.queue import Queue from tests.data_fetcher_processor_handler import Handler @@ -434,7 +434,7 @@ def test_zzz_etag_not_working(self): self.assertTrue(result) def test_zzz_unexpected_crawl_argument(self): - with self.assertRaisesRegexp(TypeError, "unexpected keyword argument"): + with self.assertRaisesRegex(TypeError, "unexpected keyword argument"): self.crawl(self.httpbin + '/cache', cookie={}, callback=self.json) def test_zzz_curl_get(self): @@ -465,18 +465,18 @@ def test_zzz_curl_put(self): self.assertIn('fileUpload1', result['files'], result) def test_zzz_curl_no_url(self): - with self.assertRaisesRegexp(TypeError, 'no URL'): + with self.assertRaisesRegex(TypeError, 'no URL'): status, newtasks, result = self.crawl( '''curl -X PUT -H 'Origin: chrome-extension://hgmloofddffdnphfgcellkdfbfbjeloo' --compressed''', callback=self.json) def test_zzz_curl_bad_option(self): - with self.assertRaisesRegexp(TypeError, 'Unknow curl option'): + with self.assertRaisesRegex(TypeError, 'Unknow curl option'): status, newtasks, result = self.crawl( '''curl '%s/put' -X PUT -H 'Origin: chrome-extension://hgmloofddffdnphfgcellkdfbfbjeloo' -v''' % self.httpbin, callback=self.json) - with self.assertRaisesRegexp(TypeError, 'Unknow curl option'): + with self.assertRaisesRegex(TypeError, 'Unknow curl option'): status, newtasks, result = self.crawl( '''curl '%s/put' -X PUT -v -H 'Origin: chrome-extension://hgmloofddffdnphfgcellkdfbfbjeloo' ''' % self.httpbin, callback=self.json) @@ -490,4 +490,4 @@ def test_zzz_connect_timeout(self): start_time = time.time() status, newtasks, result = self.crawl('http://240.0.0.1/', connect_timeout=5, callback=self.catch_http_error) end_time = time.time() - self.assertTrue(5 <= end_time - start_time <= 6) \ No newline at end of file + self.assertTrue(5 <= end_time - start_time <= 6) diff --git a/tests/test_message_queue.py b/tests/test_message_queue.py index d5e19559b..9c55ec9d8 100644 --- a/tests/test_message_queue.py +++ b/tests/test_message_queue.py @@ -1,20 +1,20 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-10-07 10:33:38 import os -import six import time import unittest -from pyspider.libs import utils +import six from six.moves import queue as Queue +from pyspider.libs import utils + -class TestMessageQueue(object): +class TestMessageQueue(unittest.TestCase): @classmethod def setUpClass(self): @@ -165,8 +165,7 @@ class TestRedisQueue(TestMessageQueue, unittest.TestCase): @classmethod def setUpClass(self): - from pyspider.message_queue import connect_message_queue - from pyspider.message_queue import redis_queue + from pyspider.message_queue import connect_message_queue, redis_queue with utils.timeout(3): self.q1 = redis_queue.RedisQueue('test_queue', maxsize=5, lazy_limit=False) self.q2 = redis_queue.RedisQueue('test_queue', maxsize=5, lazy_limit=False) diff --git a/tests/test_processor.py b/tests/test_processor.py index 1a07960cb..fa67cb131 100644 --- a/tests/test_processor.py +++ b/tests/test_processor.py @@ -1,16 +1,17 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-02-22 14:00:05 -import os -import six import copy +import logging.config +import os import time import unittest -import 
logging.config + +import six + logging.config.fileConfig("pyspider/logging.conf") from pyspider.libs import utils @@ -241,13 +242,14 @@ def test_60_timeout_in_thread(self): self.assertGreaterEqual(time.time() - start_time, 2) -import shutil import inspect +import shutil + from pyspider.database.sqlite import projectdb -from pyspider.processor.processor import Processor +from pyspider.libs import sample_handler from pyspider.libs.multiprocessing_queue import Queue from pyspider.libs.utils import run_in_thread -from pyspider.libs import sample_handler +from pyspider.processor.processor import Processor class TestProcessor(unittest.TestCase): diff --git a/tests/test_response.py b/tests/test_response.py index 4b9bbf094..93979004d 100644 --- a/tests/test_response.py +++ b/tests/test_response.py @@ -1,24 +1,25 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2015-01-18 11:10:27 -import os import copy +import logging +import logging.config +import os import time -import httpbin import unittest -import logging -import logging.config +import httpbin + logging.config.fileConfig("pyspider/logging.conf") +from pyspider.fetcher.tornado_fetcher import Fetcher from pyspider.libs import utils from pyspider.libs.response import rebuild_response -from pyspider.fetcher.tornado_fetcher import Fetcher + class TestResponse(unittest.TestCase): sample_task_http = { diff --git a/tests/test_result_dump.py b/tests/test_result_dump.py index 0d6e933e7..e46c43f58 100644 --- a/tests/test_result_dump.py +++ b/tests/test_result_dump.py @@ -1,17 +1,17 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2015-10-12 22:17:57 -from __future__ import unicode_literals, division +from __future__ import division, unicode_literals -import six import csv -import time import json +import time import unittest + +import six from six import StringIO from pyspider.libs import result_dump diff --git a/tests/test_result_worker.py b/tests/test_result_worker.py index 9933cfed8..219f581f1 100644 --- a/tests/test_result_worker.py +++ b/tests/test_result_worker.py @@ -1,21 +1,22 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-11-11 20:52:53 +import logging.config import os import time import unittest -import logging.config + logging.config.fileConfig("pyspider/logging.conf") import shutil + from pyspider.database.sqlite import resultdb -from pyspider.result.result_worker import ResultWorker from pyspider.libs.multiprocessing_queue import Queue from pyspider.libs.utils import run_in_thread +from pyspider.result.result_worker import ResultWorker class TestProcessor(unittest.TestCase): diff --git a/tests/test_run.py b/tests/test_run.py index 490844ee4..77b45ba1c 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me @@ -7,21 +6,23 @@ from __future__ import print_function +import inspect +import json import os +import shutil +import signal import sys -import six import time -import json -import signal -import shutil -import inspect -import requests import unittest +import requests +import six + from pyspider import run from pyspider.libs import utils from tests import data_sample_handler + class 
TestRun(unittest.TestCase): @classmethod @@ -29,8 +30,9 @@ def setUpClass(self): shutil.rmtree('./data/tests', ignore_errors=True) os.makedirs('./data/tests') - import tests.data_test_webpage import httpbin + + import tests.data_test_webpage self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887, passthrough_errors=False) self.httpbin = 'http://127.0.0.1:14887' @@ -225,6 +227,7 @@ def test_90_docker_scheduler(self): def test_a100_all(self): import subprocess + #cmd = [sys.executable] cmd = ['coverage', 'run'] p = subprocess.Popen(cmd+[ diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py index 66ac000eb..647517239 100644 --- a/tests/test_scheduler.py +++ b/tests/test_scheduler.py @@ -1,20 +1,20 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-02-08 22:37:13 +import logging +import logging.config import os -import time import shutil +import time import unittest -import logging -import logging.config + logging.config.fileConfig("pyspider/logging.conf") -from pyspider.scheduler.task_queue import TaskQueue from pyspider.libs import utils +from pyspider.scheduler.task_queue import TaskQueue class TestTaskQueue(unittest.TestCase): @@ -96,10 +96,11 @@ def test_bucket(self): from six.moves import xmlrpc_client except ImportError: import xmlrpclib as xmlrpc_client -from pyspider.scheduler.scheduler import Scheduler -from pyspider.database.sqlite import taskdb, projectdb, resultdb + +from pyspider.database.sqlite import projectdb, resultdb, taskdb from pyspider.libs.multiprocessing_queue import Queue from pyspider.libs.utils import run_in_thread +from pyspider.scheduler.scheduler import Scheduler class TestScheduler(unittest.TestCase): @@ -330,6 +331,7 @@ def test_60_taskdone_failed_retry(self): } }) # task retry 0/3 test_project:taskid url from six.moves import queue as Queue + # with self.assertRaises(Queue.Empty): # task = self.scheduler2fetcher.get(timeout=4) task = self.scheduler2fetcher.get(timeout=5) # select test_project:taskid url @@ -738,6 +740,7 @@ def test_z20_quit(self): from pyspider.scheduler.scheduler import Project + class TestProject(unittest.TestCase): task_pack = { 'type': Scheduler.TASK_PACK, diff --git a/tests/test_utils.py b/tests/test_utils.py index b64a3baad..bb7b51fbc 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me @@ -11,6 +10,7 @@ from pyspider.libs import utils + class TestFetcher(unittest.TestCase): def test_readonlydict(self): data = dict(a='a', b=123) diff --git a/tests/test_webdav.py b/tests/test_webdav.py index ccb40a6e6..1d712d621 100644 --- a/tests/test_webdav.py +++ b/tests/test_webdav.py @@ -1,22 +1,23 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2015-06-03 21:15 +import inspect import os +import shutil import sys -import six import time -import shutil -import inspect import unittest +import six from six import BytesIO + from pyspider import run from pyspider.libs import utils -from tests import data_sample_handler, data_handler +from tests import data_handler, data_sample_handler + @unittest.skipIf(sys.version_info >= (3, 6), "easywebdav doesn't support python 3.6") class TestWebDav(unittest.TestCase): diff --git a/tests/test_webui.py b/tests/test_webui.py index 
1e232cee8..c638b05e4 100644 --- a/tests/test_webui.py +++ b/tests/test_webui.py @@ -1,85 +1,85 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2014-11-18 21:03:22 +import json import os import re -import time -import json import shutil +import time import unittest from pyspider import run from pyspider.libs import utils -from pyspider.libs.utils import run_in_thread, ObjectDict +from pyspider.libs.utils import ObjectDict, run_in_thread class TestWebUI(unittest.TestCase): + threads = list() + ctx = None @classmethod - def setUpClass(self): + def setUpClass(cls): shutil.rmtree('./data/tests', ignore_errors=True) os.makedirs('./data/tests') - import tests.data_test_webpage - import httpbin + # import httpbin + + from tests import data_test_webpage from pyspider.webui import bench_test # flake8: noqa - self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887, passthrough_errors=False) - self.httpbin = 'http://127.0.0.1:14887' + cls.httpbin_thread = utils.run_in_subprocess(data_test_webpage.app.run, port=14887, passthrough_errors=False) + # self.httpbin = 'http://httpbin.org' ctx = run.cli.make_context('test', [ - '--taskdb', 'sqlalchemy+sqlite+taskdb:///data/tests/task.db', - '--projectdb', 'sqlalchemy+sqlite+projectdb:///data/tests/projectdb.db', - '--resultdb', 'sqlalchemy+sqlite+resultdb:///data/tests/resultdb.db', + '--taskdb', 'sqlalchemy+sqlite+taskdb:///data/tests/task.db?check_same_thread=False', + '--projectdb', 'sqlalchemy+sqlite+projectdb:///data/tests/projectdb.db?check_same_thread=False', + '--resultdb', 'sqlalchemy+sqlite+resultdb:///data/tests/resultdb.db?check_same_thread=False', ], None, obj=ObjectDict(testing_mode=True)) - self.ctx = run.cli.invoke(ctx) - - self.threads = [] + cls.ctx = run.cli.invoke(ctx) - ctx = run.scheduler.make_context('scheduler', [], self.ctx) - self.scheduler = scheduler = run.scheduler.invoke(ctx) - self.threads.append(run_in_thread(scheduler.xmlrpc_run)) - self.threads.append(run_in_thread(scheduler.run)) + ctx = run.scheduler.make_context('scheduler', [], cls.ctx) + cls.scheduler = scheduler = run.scheduler.invoke(ctx) + cls.threads.append(run_in_thread(scheduler.xmlrpc_run)) + cls.threads.append(run_in_thread(scheduler.run)) ctx = run.fetcher.make_context('fetcher', [ '--xmlrpc-port', '24444', - ], self.ctx) + ], cls.ctx) fetcher = run.fetcher.invoke(ctx) - self.threads.append(run_in_thread(fetcher.xmlrpc_run)) - self.threads.append(run_in_thread(fetcher.run)) + cls.threads.append(run_in_thread(fetcher.xmlrpc_run)) + cls.threads.append(run_in_thread(fetcher.run)) - ctx = run.processor.make_context('processor', [], self.ctx) + ctx = run.processor.make_context('processor', [], cls.ctx) processor = run.processor.invoke(ctx) - self.threads.append(run_in_thread(processor.run)) + cls.threads.append(run_in_thread(processor.run)) - ctx = run.result_worker.make_context('result_worker', [], self.ctx) + ctx = run.result_worker.make_context('result_worker', [], cls.ctx) result_worker = run.result_worker.invoke(ctx) - self.threads.append(run_in_thread(result_worker.run)) + cls.threads.append(run_in_thread(result_worker.run)) ctx = run.webui.make_context('webui', [ '--scheduler-rpc', 'http://localhost:23333/' - ], self.ctx) + ], cls.ctx) app = run.webui.invoke(ctx) app.debug = True - self.app = app.test_client() - self.rpc = app.config['scheduler_rpc'] + cls.app = app.test_client() + cls.rpc = app.config['scheduler_rpc'] time.sleep(1) 
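Two things are going on in the `setUpClass` rewrite above: the fixture server now comes from the local `tests/data_test_webpage.py` Flask app instead of the external httpbin dependency, and the sqlite URLs gain `check_same_thread=False` because scheduler, fetcher, processor, and result worker all run as threads sharing those connections. Here is what the flag changes, illustrated with the stdlib driver (the SQLAlchemy URLs in the hunk forward the same parameter down to `sqlite3`; note the flag only disables a safety check, it does not add locking):

```python
import sqlite3
import threading

conn = sqlite3.connect(':memory:', check_same_thread=False)
conn.execute('CREATE TABLE t (v TEXT)')


def writer():
    # would raise sqlite3.ProgrammingError without check_same_thread=False
    conn.execute("INSERT INTO t VALUES ('written from another thread')")
    conn.commit()


worker = threading.Thread(target=writer)
worker.start()
worker.join()
print(conn.execute('SELECT v FROM t').fetchone()[0])
```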
@classmethod - def tearDownClass(self): - for each in self.ctx.obj.instances: + def tearDownClass(cls): + for each in cls.ctx.obj.instances: each.quit() time.sleep(1) - for thread in self.threads: + for thread in cls.threads: thread.join() - self.httpbin_thread.terminate() - self.httpbin_thread.join() + cls.httpbin_thread.terminate() + cls.httpbin_thread.join() assert not utils.check_port_open(5000) assert not utils.check_port_open(23333) @@ -144,7 +144,7 @@ def test_30_run(self): def test_32_run_bad_task(self): rv = self.app.post('/debug/test_project/run', data={ 'script': self.script_content, - 'task': self.task_content+'asdfasdf312!@#' + 'task': self.task_content + 'asdfasdf312!@#' }) self.assertEqual(rv.status_code, 200) data = json.loads(utils.text(rv.data)) @@ -153,7 +153,7 @@ def test_32_run_bad_task(self): def test_33_run_bad_script(self): rv = self.app.post('/debug/test_project/run', data={ - 'script': self.script_content+'adfasfasdf', + 'script': self.script_content + 'adfasfasdf', 'task': self.task_content }) self.assertEqual(rv.status_code, 200) @@ -269,7 +269,7 @@ def test_90_run(self): def test_a10_counter(self): for i in range(30): time.sleep(1) - if self.rpc.counter('5m', 'sum')\ + if self.rpc.counter('5m', 'sum') \ .get('test_project', {}).get('success', 0) > 5: break @@ -329,7 +329,6 @@ def test_a22_active_tasks(self): self.assertIn('ok', task['track']['process']) self.assertIn('time', task['track']['process']) self.assertTrue(track) - def test_a24_task(self): rv = self.app.get(self.task_url) diff --git a/tests/test_xmlrpc.py b/tests/test_xmlrpc.py index 736d94e8d..07601741e 100644 --- a/tests/test_xmlrpc.py +++ b/tests/test_xmlrpc.py @@ -15,11 +15,14 @@ # Origin: https://code.google.com/p/wsgi-xmlrpc/ import unittest -import tornado.wsgi -import tornado.ioloop + import tornado.httpserver +import tornado.ioloop +import tornado.wsgi + from pyspider.libs import utils + class TestXMLRPCServer(unittest.TestCase): @classmethod def setUpClass(self): diff --git a/tools/migrate.py b/tools/migrate.py index f092daa6b..1632d61e4 100755 --- a/tools/migrate.py +++ b/tools/migrate.py @@ -1,18 +1,19 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: # Author: Binux # http://binux.me # Created on 2015-09-30 23:22:46 -import click import logging +from multiprocessing.pool import ThreadPool as Pool + +import click + +from pyspider.database import connect_database from pyspider.database.base.projectdb import ProjectDB -from pyspider.database.base.taskdb import TaskDB from pyspider.database.base.resultdb import ResultDB -from pyspider.database import connect_database +from pyspider.database.base.taskdb import TaskDB from pyspider.libs.utils import unicode_obj -from multiprocessing.pool import ThreadPool as Pool logging.getLogger().setLevel(logging.INFO)
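The teardown above terminates and joins the `data_test_webpage` fixture process started in `setUpClass`. A condensed sketch of that fixture's lifecycle, with the route body shortened and a hypothetical one-second startup allowance added (port 14887 and `utils.run_in_subprocess` match the hunks above):

```python
import time

import requests
from flask import Flask

from pyspider.libs import utils

app = Flask(__name__)


@app.route('/pyspider/test.html')
def test_page():
    return '<a href="/pyspider/ajax.html">ajax</a>'


if __name__ == '__main__':
    server = utils.run_in_subprocess(app.run, port=14887, passthrough_errors=False)
    time.sleep(1)  # give the subprocess a moment to bind the port
    try:
        assert 'ajax.html' in requests.get('http://127.0.0.1:14887/pyspider/test.html').text
    finally:
        server.terminate()
        server.join()
```

And `tools/migrate.py` keeps pulling `ThreadPool` in under the usual `Pool` alias; the fan-out it performs looks roughly like this (`migrate_project` is a placeholder for the copy-one-project routine, not migrate.py's actual code):

```python
from multiprocessing.pool import ThreadPool as Pool


def migrate_project(name):
    return name  # stand-in: copy tasks and results for one project


pool = Pool(processes=4)
for name in pool.imap_unordered(migrate_project, ['proj_a', 'proj_b', 'proj_c']):
    print('migrated', name)
pool.close()
pool.join()
```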