-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathEntrezDownloader.py
executable file
·157 lines (131 loc) · 5.61 KB
/
EntrezDownloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
####################################################################################################
#
# EntrezDownloader
# Author: Leon Kuchenbecker
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
####################################################################################################
import requests
import io
import threading
import time
from concurrent.futures import ThreadPoolExecutor
from concurrent import futures
class RequestLimiter:
    """Limit the rate at which new requests are started.

    A single instance is meant to be shared by all worker threads; every
    thread calls wait() right before issuing a request.
    """

    def __init__(self, min_wait = 0.4):
        """The RequestLimiter class provides functionality to limit the rate at which new requests are made.

        min_wait: Minimum number of seconds between two consecutive requests.
        """
        self.lock = threading.Lock()
        # Monotonic-clock timestamp of the most recent request, None until
        # wait() has been called for the first time.
        self.last_request = None
        self.min_wait = min_wait

    def wait(self):
        """Block until at least min_wait seconds have passed since the previous invocation. Thread safe.

        The first invocation returns immediately. The lock is held while
        sleeping on purpose: concurrent callers queue up behind each other
        and are released one at a time, min_wait seconds apart.
        """
        with self.lock:
            if self.last_request is not None:
                # time.monotonic() is used instead of time.time() so that
                # adjustments of the system clock can neither skip nor
                # lengthen the pause.
                remaining = self.min_wait - (time.monotonic() - self.last_request)
                if remaining > 0:
                    time.sleep(remaining)
            self.last_request = time.monotonic()
class ResultCollector:
    """Thread-safe sink where worker threads deliver their results and failed IDs."""

    def __init__(self, pbar = None):
        """The ResultCollector class provides functionality for threads to deliver their results.

        pbar: Optional progress bar object exposing update(n); None (or False)
              disables progress reporting.
        """
        self.pbar = pbar
        self.results = []
        self.failed = []
        self.lock = threading.Lock()

    def _advance(self, count):
        """Advance the progress bar by count, if a progress bar was provided."""
        # Compare against None/False explicitly instead of relying on the
        # truthiness of the bar: a progress bar object may evaluate as false
        # while still being a live bar (e.g. a tqdm bar with total=0).
        if self.pbar is None or self.pbar is False:
            return
        self.pbar.update(count)

    def add_results(self, results):
        """Adds results to the collector. If a progress bar was provided, it updates the progress bar.

        results: Any iterable of result items; it is consumed exactly once.
        """
        # Materialise first: a generator has no len(), and failing after the
        # items were already appended would make the caller retry and append
        # the same items again.
        results = list(results)
        with self.lock:
            self.results.extend(results)
            # Updating under the lock serialises progress updates together
            # with the list mutation.
            self._advance(len(results))

    def add_failed(self, ids):
        """Adds failed IDs to the collector. If a progress bar was provided, it updates the progress bar.

        ids: Any iterable of IDs whose download failed; it is consumed exactly once.
        """
        ids = list(ids)
        with self.lock:
            self.failed.extend(ids)
            self._advance(len(ids))
class EntrezDownloader:
    """Parallel, rate-limited batch downloads via the NCBI Entrez efetch interface."""

    def __init__(self, num_threads = 30, batch_size = 10, email = None, api_key = None, pbar = False, timeout = 120):
        """The EntrezDownloader class enables parallel downloads via the NCBI Entrez interface

        num_threads: Maximum number of worker threads used per efetch() call.
        batch_size:  Number of IDs sent with a single efetch request.
        email:       Contact address sent along with every request (optional).
        api_key:     NCBI API key sent along with every request (optional).
        pbar:        If true, efetch() shows a tqdm progress bar.
        timeout:     Timeout in seconds handed to requests for every HTTP
                     request; None waits indefinitely.
        """
        self.baseurl = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils'
        self.num_threads = num_threads
        self.batch_size = batch_size
        self.email = email
        self.api_key = api_key
        # NOTE(review): the pause is longer WITH an API key (0.5 s) than
        # without one (0.35 s). API keys normally allow a higher request rate,
        # so these two values look swapped or overly conservative - confirm
        # against the NCBI usage guidelines before changing them.
        self.request_limiter = RequestLimiter(min_wait=0.35 if not api_key else 0.5)
        # Serialises error output from the worker threads.
        self.print_lock = threading.Lock()
        self.pbar = pbar
        self.timeout = timeout

    def _efetch_batch(self, db, ids, result_collector, result_func, **kwargs):
        """Fetch one batch of IDs and deliver the outcome to result_collector.

        The request is attempted up to three times. On success the return
        value of result_func(response text) is added to the collector; if all
        attempts fail, the IDs of the batch are reported as failed and the
        last error is printed.

        db:               Entrez database name.
        ids:              IDs of this batch; items are converted with str().
        result_collector: Object providing add_results() and add_failed().
        result_func:      Callable applied to the response text; must return an iterable.
        kwargs:           Additional POST parameters for efetch.
        """
        post_data = {
            'tool' : 'EntrezDownloader',
            'email' : self.email,
            'api_key' : self.api_key,
            # str() allows integer IDs, which a plain join would reject.
            'id' : ','.join(str(entry) for entry in ids),
            'db' : db,
            'retmode' : 'xml',
        }
        post_data.update(kwargs)
        # Credentials configured on the downloader take precedence over
        # same-named entries in kwargs.
        if self.email:
            post_data['email'] = self.email
        if self.api_key:
            post_data['api_key'] = self.api_key

        error = None
        for _ in range(3): # Retry three times
            try:
                self.request_limiter.wait()
                # Without a timeout a stalled connection would block this
                # worker (and therefore efetch()) forever.
                response = requests.post(f'{self.baseurl}/efetch.cgi', data=post_data, timeout=self.timeout)
                if response.status_code == 200:
                    # Materialise before delivering so a lazy iterable cannot
                    # fail half-way through delivery and be added twice on retry.
                    results = list(result_func(response.text))
                    result_collector.add_results(results)
                    error = None
                    break
                error = f'[STATUS {response.status_code}] An error occurred, you may see a response text here: {response.text}'
            except Exception as e:
                # Deliberately broad: this is the worker-thread boundary, any
                # failure (network, parsing in result_func) counts as a failed
                # attempt and is retried.
                error = f'[UNKNOWN ERROR] {e}'

        if error:
            result_collector.add_failed(ids)
            with self.print_lock:
                print(error)

    def efetch(self, db, ids, result_func = lambda x : [x], **kwargs):
        """Interface to the efetch database.

        db:          Entrez database name.
        ids:         Iterable of IDs to download; split into batches of batch_size.
        result_func: A function to be applied to the response. Must return an iterable.
        kwargs:      Additional POST parameters passed to every efetch request.

        Returns a tuple (results, failed): the collected items returned by
        result_func and the IDs of all batches that could not be downloaded.
        """
        # Accept any iterable of IDs, not only sequences supporting len()/slicing.
        ids = list(ids)

        if self.pbar:
            from tqdm import tqdm
            results = ResultCollector(pbar = tqdm(total=len(ids), unit = 'records'))
        else:
            results = ResultCollector()

        try:
            # Leaving the with-block waits for all submitted batches and
            # releases the worker threads.
            with ThreadPoolExecutor(max_workers=self.num_threads) as executor:
                for start in range(0, len(ids), self.batch_size):
                    executor.submit(self._efetch_batch,
                                    db = db,
                                    ids = ids[start:start + self.batch_size],
                                    result_collector = results,
                                    result_func = result_func,
                                    **kwargs)
        finally:
            # Close the progress bar if it was enabled. Checked against None
            # because a progress bar object may evaluate as false (e.g. a tqdm
            # bar with total=0) and would otherwise never be closed.
            if results.pbar is not None:
                results.pbar.close()

        return results.results, results.failed