66import urllib .request , urllib .parse , http .client
77from scrapy .utils .project import get_project_settings
88settings = get_project_settings ()
9+ from scrapyd .config import Config
10+ scrapyd_config = Config ()
11+ scrapyd_port = scrapyd_config .getint ('http_port' , 6800 )
912from dynamic_scraper .models import Scraper
1013
1114class TaskUtils (object ):
12-
15+
1316 conf = {
1417 "MAX_SPIDER_RUNS_PER_TASK" : 10 ,
1518 "MAX_CHECKER_RUNS_PER_TASK" : 25 ,
1619 }
17-
20+
21+
1822 def _run_spider (self , ** kwargs ):
1923 param_dict = {
2024 'project' : 'default' ,
@@ -25,36 +29,36 @@ def _run_spider(self, **kwargs):
2529 }
2630 params = urllib .parse .urlencode (param_dict )
2731 headers = {"Content-type" : "application/x-www-form-urlencoded" , "Accept" : "text/plain" }
28- conn = http .client .HTTPConnection ("localhost:6800" )
32+ conn = http .client .HTTPConnection ("localhost:{}" . format ( scrapyd_port ) )
2933 conn .request ("POST" , "/schedule.json" , params , headers )
3034 conn .getresponse ()
31-
32-
35+
36+
3337 def _pending_jobs (self , spider ):
3438 # Ommit scheduling new jobs if there are still pending jobs for same spider
35- resp = urllib .request .urlopen ('http://localhost:6800 /listjobs.json?project=default' )
39+ resp = urllib .request .urlopen ('http://localhost:{} /listjobs.json?project=default' . format ( scrapyd_port ) )
3640 data = json .loads (resp .read ().decode ('utf-8' ))
3741 if 'pending' in data :
3842 for item in data ['pending' ]:
3943 if item ['spider' ] == spider :
4044 return True
4145 return False
42-
43-
46+
47+
4448 def run_spiders (self , ref_obj_class , scraper_field_name , runtime_field_name , spider_name , * args , ** kwargs ):
4549 filter_kwargs = {
4650 scraper_field_name + '__status' : 'A' ,
4751 runtime_field_name + '__next_action_time__lt' : datetime .datetime .now (),
4852 }
4953 for key in kwargs :
5054 filter_kwargs [key ] = kwargs [key ]
51-
55+
5256 max = settings .get ('DSCRAPER_MAX_SPIDER_RUNS_PER_TASK' , self .conf ['MAX_SPIDER_RUNS_PER_TASK' ])
5357 ref_obj_list = ref_obj_class .objects .filter (* args , ** filter_kwargs ).order_by (runtime_field_name + '__next_action_time' )[:max ]
5458 if not self ._pending_jobs (spider_name ):
5559 for ref_object in ref_obj_list :
5660 self ._run_spider (id = ref_object .pk , spider = spider_name , run_type = 'TASK' , do_action = 'yes' )
57-
61+
5862
5963 def run_checkers (self , ref_obj_class , scraper_field_path , runtime_field_name , checker_name , * args , ** kwargs ):
6064 filter_kwargs = {
@@ -63,10 +67,9 @@ def run_checkers(self, ref_obj_class, scraper_field_path, runtime_field_name, ch
6367 }
6468 for key in kwargs :
6569 filter_kwargs [key ] = kwargs [key ]
66-
70+
6771 max = settings .get ('DSCRAPER_MAX_CHECKER_RUNS_PER_TASK' , self .conf ['MAX_CHECKER_RUNS_PER_TASK' ])
6872 ref_obj_list = ref_obj_class .objects .filter (* args , ** filter_kwargs ).order_by (runtime_field_name + '__next_action_time' )[:max ]
6973 if not self ._pending_jobs (checker_name ):
7074 for ref_object in ref_obj_list :
7175 self ._run_spider (id = ref_object .pk , spider = checker_name , run_type = 'TASK' , do_action = 'yes' )
72-
0 commit comments