7
7
from datetime import datetime
8
8
from log import Log
9
9
from persistence import RedisSet , MongoDB , test_redis , test_mongodb
10
+ from util import git_date
10
11
import config
11
12
12
13
from multiprocessing import Pool
@@ -34,21 +35,25 @@ def main():
34
35
help = 'Start a spider to crawl data by date or by district' )
35
36
parser .add_argument ('-d' , '--downloader' , nargs = '?' , choices = ['read' , 'download' ], const = 'download' ,
36
37
help = 'Start a downloader' )
38
+ parser .add_argument ('--clean' , action = 'store_true' ,
39
+ help = 'Delete all data in Redis before read, only useful for -d read' )
37
40
parser .add_argument ('-c' , '--config' , nargs = '?' , help = 'Specify the filename of config' )
38
41
args = parser .parse_args ()
39
42
43
+ logging .info ('Version: {}.' .format (git_date ()))
44
+
40
45
# Load the config file given via -c/--config; otherwise only log that config.json is in use
41
46
if args .config is not None :
42
- logging .info ('Config read from {0}.' .format (args .config ))
47
+ logging .info ('Config: {0}.' .format (args .config ))
43
48
config .read_config (args .config )
44
49
else :
45
- logging .info ('Config read from config.json.' )
50
+ logging .info ('Config: config.json.' )
46
51
47
52
if args .spider is None :
48
53
if args .downloader is None :
49
54
50
55
# Neither --spider nor --downloader was given: run in multiprocess mode
51
- logging .info ('Multiprocess mode on .' )
56
+ logging .info ('Multiprocess Mode: On .' )
52
57
test_redis ()
53
58
test_mongodb ()
54
59
logging .info (test_proxy ())
@@ -68,12 +73,12 @@ def main():
68
73
69
74
# Run a single downloader instance; Redis is checked first, while MongoDB and the proxy are checked only for 'download'
70
75
test_redis ()
71
- test_mongodb ()
72
- logging .info (test_proxy ())
73
76
74
77
if args .downloader == 'read' :
75
- read_content_list ()
78
+ read_content_list (args . clean )
76
79
elif args .downloader == 'download' :
80
+ test_mongodb ()
81
+ logging .info (test_proxy ())
77
82
download ()
78
83
79
84
if args .spider is not None :
@@ -236,7 +241,7 @@ def crawl_by_district():
236
241
data_file .close ()
237
242
238
243
239
- def read_content_list ():
244
+ def read_content_list (clean : bool = False ):
240
245
logger = Log .create_logger ('downloader' )
241
246
logger .info ('Downloader reading contents from local files.' )
242
247
total = 0
@@ -245,6 +250,10 @@ def read_content_list():
245
250
pattern = re .compile (r"{'id': '(.+?)'," )
246
251
database = RedisSet ('spider' )
247
252
253
+ if clean :
254
+ database .flush_all ()
255
+ logger .info ('All DocID in redis has been deleted.' )
256
+
248
257
for data_file_name in os .listdir (data_dir ):
249
258
total_per_file = 0
250
259
available_per_file = 0
@@ -265,6 +274,7 @@ def read_content_list():
265
274
available += available_per_file
266
275
267
276
logger .info ('Data retrieved from local file: {} total, {} available.' .format (total , available ))
277
+ logger .info ('Total {} items in redis database.' .format (database .count ()))
268
278
return total , available
269
279
270
280
0 commit comments