Skip to content

Commit 4c9dced

Browse files
committed
update: Rewrite update script
The new update script uses futures to dynamically schedule many small tasks across a constant number of threads, instead of statically assigning a single long-running task to each thread. This results in better CPU saturation. Database handles are no longer shared between threads; instead, the main thread commits the results produced by the other threads into the database. This trades locking on database access for serialization costs — since multiprocessing is used, values returned from futures are pickled (although in practice that depends on the ProcessPool configuration).
1 parent 3717f52 commit 4c9dced

File tree

2 files changed

+451
-17
lines changed

2 files changed

+451
-17
lines changed

elixir/data.py

Lines changed: 20 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020

2121
import berkeleydb
2222
import re
23-
from . import lib
23+
from .lib import autoBytes
2424
import os
2525
import os.path
2626
import errno
@@ -72,6 +72,14 @@ def iter(self, dummy=False):
7272
if dummy:
7373
yield maxId, None, None, None
7474

75+
def exists(self, idx, line_num):
76+
entries = deflist_regex.findall(self.data)
77+
for id, _, line, _ in entries:
78+
if id == idx and int(line) == line_num:
79+
return True
80+
81+
return False
82+
7583
def append(self, id, type, line, family):
7684
if type not in defTypeD:
7785
return
@@ -159,26 +167,32 @@ def __init__(self, filename, readonly, contentType, shared=False):
159167
self.ctype = contentType
160168

161169
def exists(self, key):
162-
key = lib.autoBytes(key)
170+
key = autoBytes(key)
163171
return self.db.exists(key)
164172

165173
def get(self, key):
166-
key = lib.autoBytes(key)
174+
key = autoBytes(key)
167175
p = self.db.get(key)
168-
return self.ctype(p) if p is not None else None
176+
if p is None:
177+
return None
178+
p = self.ctype(p)
179+
return p
169180

170181
def get_keys(self):
171182
return self.db.keys()
172183

173184
def put(self, key, val, sync=False):
174-
key = lib.autoBytes(key)
175-
val = lib.autoBytes(val)
185+
key = autoBytes(key)
186+
val = autoBytes(val)
176187
if type(val) is not bytes:
177188
val = val.pack()
178189
self.db.put(key, val)
179190
if sync:
180191
self.db.sync()
181192

193+
def sync(self):
194+
self.db.sync()
195+
182196
def close(self):
183197
self.db.close()
184198

@@ -201,13 +215,6 @@ def __init__(self, dir, readonly=True, dtscomp=False, shared=False):
201215
# Map serial number to filename
202216
self.vers = BsdDB(dir + '/versions.db', ro, PathList, shared=shared)
203217
self.defs = BsdDB(dir + '/definitions.db', ro, DefList, shared=shared)
204-
self.defs_cache = {}
205-
NOOP = lambda x: x
206-
self.defs_cache['C'] = BsdDB(dir + '/definitions-cache-C.db', ro, NOOP, shared=shared)
207-
self.defs_cache['K'] = BsdDB(dir + '/definitions-cache-K.db', ro, NOOP, shared=shared)
208-
self.defs_cache['D'] = BsdDB(dir + '/definitions-cache-D.db', ro, NOOP, shared=shared)
209-
self.defs_cache['M'] = BsdDB(dir + '/definitions-cache-M.db', ro, NOOP, shared=shared)
210-
assert sorted(self.defs_cache.keys()) == sorted(lib.CACHED_DEFINITIONS_FAMILIES)
211218
self.refs = BsdDB(dir + '/references.db', ro, RefList, shared=shared)
212219
self.docs = BsdDB(dir + '/doccomments.db', ro, RefList, shared=shared)
213220
self.dtscomp = dtscomp
@@ -223,10 +230,6 @@ def close(self):
223230
self.file.close()
224231
self.vers.close()
225232
self.defs.close()
226-
self.defs_cache['C'].close()
227-
self.defs_cache['K'].close()
228-
self.defs_cache['D'].close()
229-
self.defs_cache['M'].close()
230233
self.refs.close()
231234
self.docs.close()
232235
if self.dtscomp:

0 commit comments

Comments
 (0)