Skip to content

Commit 60bfc44

Browse files
committed
get scrapyd port from config
1 parent a5c38a0 commit 60bfc44

1 file changed

Lines changed: 15 additions & 12 deletions

File tree

dynamic_scraper/utils/task_utils.py

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,19 @@
66
import urllib.request, urllib.parse, http.client
77
from scrapy.utils.project import get_project_settings
88
settings = get_project_settings()
9+
from scrapyd.config import Config
10+
scrapyd_config = Config()
11+
scrapyd_port = scrapyd_config.getint('http_port', 6800)
912
from dynamic_scraper.models import Scraper
1013

1114
class TaskUtils(object):
12-
15+
1316
conf = {
1417
"MAX_SPIDER_RUNS_PER_TASK": 10,
1518
"MAX_CHECKER_RUNS_PER_TASK": 25,
1619
}
17-
20+
21+
1822
def _run_spider(self, **kwargs):
1923
param_dict = {
2024
'project': 'default',
@@ -25,36 +29,36 @@ def _run_spider(self, **kwargs):
2529
}
2630
params = urllib.parse.urlencode(param_dict)
2731
headers = {"Content-type": "application/x-www-form-urlencoded", "Accept": "text/plain"}
28-
conn = http.client.HTTPConnection("localhost:6800")
32+
conn = http.client.HTTPConnection("localhost:{}".format(scrapyd_port))
2933
conn.request("POST", "/schedule.json", params, headers)
3034
conn.getresponse()
31-
32-
35+
36+
3337
def _pending_jobs(self, spider):
3438
# Ommit scheduling new jobs if there are still pending jobs for same spider
35-
resp = urllib.request.urlopen('http://localhost:6800/listjobs.json?project=default')
39+
resp = urllib.request.urlopen('http://localhost:{}/listjobs.json?project=default'.format(scrapyd_port))
3640
data = json.loads(resp.read().decode('utf-8'))
3741
if 'pending' in data:
3842
for item in data['pending']:
3943
if item['spider'] == spider:
4044
return True
4145
return False
42-
43-
46+
47+
4448
def run_spiders(self, ref_obj_class, scraper_field_name, runtime_field_name, spider_name, *args, **kwargs):
4549
filter_kwargs = {
4650
scraper_field_name + '__status': 'A',
4751
runtime_field_name + '__next_action_time__lt': datetime.datetime.now(),
4852
}
4953
for key in kwargs:
5054
filter_kwargs[key] = kwargs[key]
51-
55+
5256
max = settings.get('DSCRAPER_MAX_SPIDER_RUNS_PER_TASK', self.conf['MAX_SPIDER_RUNS_PER_TASK'])
5357
ref_obj_list = ref_obj_class.objects.filter(*args, **filter_kwargs).order_by(runtime_field_name + '__next_action_time')[:max]
5458
if not self._pending_jobs(spider_name):
5559
for ref_object in ref_obj_list:
5660
self._run_spider(id=ref_object.pk, spider=spider_name, run_type='TASK', do_action='yes')
57-
61+
5862

5963
def run_checkers(self, ref_obj_class, scraper_field_path, runtime_field_name, checker_name, *args, **kwargs):
6064
filter_kwargs = {
@@ -63,10 +67,9 @@ def run_checkers(self, ref_obj_class, scraper_field_path, runtime_field_name, ch
6367
}
6468
for key in kwargs:
6569
filter_kwargs[key] = kwargs[key]
66-
70+
6771
max = settings.get('DSCRAPER_MAX_CHECKER_RUNS_PER_TASK', self.conf['MAX_CHECKER_RUNS_PER_TASK'])
6872
ref_obj_list = ref_obj_class.objects.filter(*args, **filter_kwargs).order_by(runtime_field_name + '__next_action_time')[:max]
6973
if not self._pending_jobs(checker_name):
7074
for ref_object in ref_obj_list:
7175
self._run_spider(id=ref_object.pk, spider=checker_name, run_type='TASK', do_action='yes')
72-

0 commit comments

Comments
 (0)