Skip to content
This repository was archived by the owner on Apr 22, 2022. It is now read-only.

Commit a707c29

Browse files
committed Nov 23, 2018
Fix: Fix the error when save config to local file
Print the git version at launch
1 parent f9a20cf commit a707c29

File tree

3 files changed

+40
-9
lines changed

3 files changed

+40
-9
lines changed
 

‎config.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import json
33
import logging
44
from dateutil import parser
5+
from util import encode_json
56

67

78
class Nested:
@@ -77,7 +78,6 @@ def __str__(self):
7778

7879

7980
def read_config(file_name: str = 'config.json'):
80-
8181
if os.path.isfile(file_name):
8282
with open(file_name, 'r', encoding='utf-8') as f:
8383
data = json.load(f)
@@ -96,7 +96,8 @@ def read_config(file_name: str = 'config.json'):
9696
Config = Nested(data)
9797
else:
9898
with open(file_name, 'w', encoding='utf-8') as f:
99-
json.dump(Config.data, f, ensure_ascii=False, indent=2)
99+
Config.dict['log']['level'] = logging.getLevelName(Config.dict['log']['level'])
100+
json.dump(Config.dict, f, default=encode_json, ensure_ascii=False, indent=2)
100101

101102

102103
read_config()

‎main.py

+17-7
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from datetime import datetime
88
from log import Log
99
from persistence import RedisSet, MongoDB, test_redis, test_mongodb
10+
from util import git_date
1011
import config
1112

1213
from multiprocessing import Pool
@@ -34,21 +35,25 @@ def main():
3435
help='Start a spider to crawl data by date or by district')
3536
parser.add_argument('-d', '--downloader', nargs='?', choices=['read', 'download'], const='download',
3637
help='Start a downloader')
38+
parser.add_argument('--clean', action='store_true',
39+
help='Delete all data in Redis before read, only useful for -d read')
3740
parser.add_argument('-c', '--config', nargs='?', help='Specify the filename of config')
3841
args = parser.parse_args()
3942

43+
logging.info('Version: {}.'.format(git_date()))
44+
4045
# Specify the filename of config
4146
if args.config is not None:
42-
logging.info('Config read from {0}.'.format(args.config))
47+
logging.info('Config: {0}.'.format(args.config))
4348
config.read_config(args.config)
4449
else:
45-
logging.info('Config read from config.json.')
50+
logging.info('Config: config.json.')
4651

4752
if args.spider is None:
4853
if args.downloader is None:
4954

5055
# Run multiprocess
51-
logging.info('Multiprocess mode on.')
56+
logging.info('Multiprocess Mode: On.')
5257
test_redis()
5358
test_mongodb()
5459
logging.info(test_proxy())
@@ -68,12 +73,12 @@ def main():
6873

6974
# Run single instance of downloader
7075
test_redis()
71-
test_mongodb()
72-
logging.info(test_proxy())
7376

7477
if args.downloader == 'read':
75-
read_content_list()
78+
read_content_list(args.clean)
7679
elif args.downloader == 'download':
80+
test_mongodb()
81+
logging.info(test_proxy())
7782
download()
7883

7984
if args.spider is not None:
@@ -236,7 +241,7 @@ def crawl_by_district():
236241
data_file.close()
237242

238243

239-
def read_content_list():
244+
def read_content_list(clean: bool = False):
240245
logger = Log.create_logger('downloader')
241246
logger.info('Downloader reading contents from local files.')
242247
total = 0
@@ -245,6 +250,10 @@ def read_content_list():
245250
pattern = re.compile(r"{'id': '(.+?)',")
246251
database = RedisSet('spider')
247252

253+
if clean:
254+
database.flush_all()
255+
logger.info('All DocID in redis has been deleted.')
256+
248257
for data_file_name in os.listdir(data_dir):
249258
total_per_file = 0
250259
available_per_file = 0
@@ -265,6 +274,7 @@ def read_content_list():
265274
available += available_per_file
266275

267276
logger.info('Data retrieved from local file: {} total, {} available.'.format(total, available))
277+
logger.info('Total {} items in redis database.'.format(database.count()))
268278
return total, available
269279

270280

‎util.py

+20
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,14 @@
11
import json
22
from datetime import datetime
3+
from decimal import Decimal
4+
from datetime import date
35
from dateutil import parser
46
import logging
57
import os
68
from math import ceil
79
import re
10+
import sys
11+
import subprocess
812

913
from persistence import RedisSet, test_redis
1014

@@ -34,6 +38,22 @@ def object_hook(obj):
3438
return obj
3539

3640

41+
def encode_json(obj):
    """Convert values the ``json`` module cannot encode natively.

    Meant to be passed as the ``default=`` callback of ``json.dump`` /
    ``json.dumps``.

    Args:
        obj: The value the JSON encoder could not serialize on its own.

    Returns:
        A ``float`` for ``Decimal`` input, or a ``YYYY-MM-DD`` string for
        ``date`` / ``datetime`` input (the time part is dropped).

    Raises:
        TypeError: If ``obj`` is of any other type.
    """
    if isinstance(obj, Decimal):
        return float(obj)
    # datetime is a subclass of date, so this single check covers both types.
    if isinstance(obj, date):
        return obj.strftime("%Y-%m-%d")
    raise TypeError(
        "Object of type {} is not JSON serializable".format(type(obj).__name__)
    )
49+
50+
51+
def git_date():
    """Return the author date and subject of the latest git commit.

    Runs ``git log -1`` in the current working directory and returns its
    output as ``"<ISO-8601 author date> <subject>"``.

    Returns:
        The description of the last commit, or an empty string when it
        cannot be determined (git is not installed, the directory is not a
        repository, or the repository has no commits).
    """
    try:
        # No shell is involved, so the format needs no quoting; passing it
        # bare avoids having to slice literal quote characters off the output.
        out = subprocess.check_output(
            ["git", "log", "-1", "--format=%aI %s"],
            stderr=subprocess.DEVNULL,
        )
    except (OSError, subprocess.CalledProcessError):
        # Version reporting is informational only; never fail because of it.
        return ''
    return out.decode(sys.getdefaultencoding(), errors='replace').strip()
55+
56+
3757
def merge_doc_and_split(number: int = 1):
3858
logging.basicConfig(format='[%(levelname)s] %(asctime)s %(message)s', datefmt='%Y-%m-%d %H:%M:%S',
3959
level=logging.INFO)

0 commit comments

Comments
 (0)
This repository has been archived.