Changes from all commits (29 commits)
a6ad3d4
script.sh: short-circuit find-file-doc-comments.pl using a heuristic
tleb Jun 18, 2024
eb36850
script.sh: do not sort ctags output
fstachura Oct 11, 2024
2b17fbc
update: Rewrite update script
fstachura Dec 21, 2024
265fcdb
update: Remove old update script
fstachura Dec 29, 2024
3de8215
update: Fix versions database issues
fstachura Jul 2, 2025
d72291d
utils/index: fix to use reworked update script
tleb Jun 18, 2025
5aa8192
update: Add references only if definition visible in version
fstachura Jul 2, 2025
1e6624d
add db cache
fstachura Jul 5, 2025
8b339a4
do not close db between tags
fstachura Jul 5, 2025
6376e49
attempt to improve reflist and deflist performance
fstachura Jul 5, 2025
19e81c6
print slow docs/comps docs
fstachura Jul 5, 2025
a284c92
generate defs caches at the end of update
fstachura Jul 5, 2025
58fca89
check def existence in get_refs
fstachura Jul 5, 2025
00ba004
optimize find_compatible_dts for gigantic amd driver files
fstachura Jul 5, 2025
d4b79b0
query deflist once, cprofile add refs
fstachura Jul 5, 2025
82fa442
deflist to dict
fstachura Jul 5, 2025
c4103a2
fix database segfaults
fstachura Jul 5, 2025
b7c0fc0
catch sigint
fstachura Jul 6, 2025
3531c7c
add def in ver cache
fstachura Jul 6, 2025
8c712e4
move more work to get_refs
fstachura Jul 6, 2025
1fa6949
optimize write-only reflist case
fstachura Jul 6, 2025
8f65990
refs append only opt pt 2
fstachura Jul 6, 2025
f90a8af
add number of blobs in logs
fstachura Jul 6, 2025
33d4a0c
dont filter ref_idxes by file family
fstachura Jul 6, 2025
d65d4e0
put in cachedDB only if modified
fstachura Jul 6, 2025
a2dbb8f
def sync improvement
fstachura Jul 6, 2025
13814bd
def pack/unpack optimization part 2
fstachura Jul 6, 2025
99e7c93
optimize deflist for append
fstachura Jul 7, 2025
c789e01
put in evict only if necessary
fstachura Jul 7, 2025
290 changes: 241 additions & 49 deletions elixir/data.py
@@ -18,13 +18,21 @@
# You should have received a copy of the GNU Affero General Public License
# along with Elixir. If not, see <http://www.gnu.org/licenses/>.

from collections import OrderedDict
import berkeleydb
import re
import time
from . import lib
from .lib import autoBytes
import os
import os.path
import errno

# Cache size used by the update script for the largest databases. Tuple of (gigabytes, bytes).
# https://docs.oracle.com/database/bdb181/html/api_reference/C/dbset_cachesize.html
# https://docs.oracle.com/database/bdb181/html/programmer_reference/general_am_conf.html#am_conf_cachesize
CACHESIZE = (2,0)

deflist_regex = re.compile(rb'(\d*)(\w)(\d*)(\w),?')
deflist_macro_regex = re.compile(r'\dM\d+(\w)')

@@ -59,43 +67,86 @@ class DefList:
def __init__(self, data=b'#'):
self.data, self.families = data.split(b'#')

self.modified = False
self.entries = None
self.to_append = []

def populate_entries(self):
# Lazily parse the packed bytes into (id, type, line, family) tuples,
# merge any buffered appends, and keep the result sorted by id.
entries_modified = False
if self.entries is None:
self.entries = [
(int(d[0]), d[1], int(d[2]), d[3])
for d in deflist_regex.findall(self.data)
]
entries_modified = True

if len(self.to_append) != 0:
self.entries += self.to_append
self.to_append = []
entries_modified = True

if entries_modified:
self.entries.sort(key=lambda x:int(x[0]))

def iter(self, dummy=False):
# Get all element in a list of sublists and sort them
entries = deflist_regex.findall(self.data)
entries.sort(key=lambda x:int(x[0]))
for id, type, line, family in entries:
id = int(id)
type = defTypeR [type.decode()]
line = int(line)
family = family.decode()
yield id, type, line, family
if self.entries is None:
self.populate_entries()

for id, type, line, family in self.entries:
yield id, defTypeR[type.decode()], int(line), family.decode()
if dummy:
yield maxId, None, None, None

def append(self, id, type, line, family):
def exists(self, idx: int, line_num: int):
if self.entries is None:
self.populate_entries()

for id, _, line, _ in self.entries:
if id == idx and int(line) == line_num:
return True

return False

def append(self, id: int, type, line: int, family: str):
if type not in defTypeD:
return
p = str(id) + defTypeD[type] + str(line) + family
if self.data != b'':
p = ',' + p
self.data += p.encode()

self.modified = True
if self.entries is None:
self.to_append.append((id, defTypeD[type].encode(), line, family.encode()))
else:
self.entries.append((id, defTypeD[type].encode(), line, family.encode()))

self.add_family(family)

def pack(self):
return self.data + b'#' + self.families
def pack(self) -> bytes:
if self.entries is None:
to_append = b",".join([
str(arg[0]).encode() + arg[1] + str(arg[2]).encode() + arg[3]
for arg in self.to_append
])
self.to_append = []
if self.data != b'' and to_append != b'':
self.data += b','  # keep records comma-separated, as append() did before
self.data += to_append
return self.data + b'#' + self.families
else:
self.data = b",".join([
str(arg[0]).encode() + arg[1] + str(arg[2]).encode() + arg[3]
for arg in self.entries
])
return self.data + b'#' + self.families

def add_family(self, family):
family = family.encode()
def add_family(self, family: str):
# Encode first so the membership test compares bytes with bytes;
# comparing the str argument against split(b',') would never match.
family = family.encode()
if not family in self.families.split(b','):
if self.families != b'':
family = b',' + family
self.families += family

def get_families(self):
return self.families.decode().split(',')
return [f.decode() for f in self.families.split(b',')]

def get_macros(self):
return deflist_macro_regex.findall(self.data.decode()) or ''
macros = deflist_macro_regex.findall(self.data.decode())
# Also report macros still buffered in to_append: entry[1] holds the
# packed type letter, entry[3] the encoded family the regex captures.
macros += [entry[3].decode() for entry in self.to_append if entry[1] == b'M']
return macros or ''
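
A minimal usage sketch of the packed DefList format (not part of the diff; it assumes defTypeD maps the ctags 'macro' kind to the letter M and defTypeR inverts it, as the regexes above suggest):

    d = DefList()                    # fresh entry: data=b'', families=b''
    d.append(42, 'macro', 7, 'C')    # buffered in to_append, modified=True
    packed = d.pack()                # b'42M7C#C'
    for id, type, line, family in DefList(packed).iter():
        print(id, type, line, family)    # 42 macro 7 C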

class PathList:
'''Stores associations between a blob ID and a file path.
@@ -124,76 +175,217 @@ class RefList:
and the corresponding family.'''
def __init__(self, data=b''):
self.data = data
self.entries = None
self.to_append = []
self.sorted = False
self.modified = False

def decode_entry(self, k):
return (int(k[0].decode()), k[1].decode(), k[2].decode())

def populate_entries(self):
self.entries = [self.decode_entry(x.split(b':')) for x in self.data.split(b'\n')[:-1]]
self.entries += self.to_append
self.to_append = []
self.entries.sort(key=lambda x:int(x[0]))

def iter(self, dummy=False):
# Split all elements in a list of sublists and sort them
entries = [x.split(b':') for x in self.data.split(b'\n')[:-1]]
entries.sort(key=lambda x:int(x[0]))
for b, c, d in entries:
b = int(b.decode())
c = c.decode()
d = d.decode()
if self.entries is None:
self.populate_entries()

for b, c, d in self.entries:
yield b, c, d
if dummy:
yield maxId, None, None

def append(self, id, lines, family):
p = str(id) + ':' + lines + ':' + family + '\n'
self.data += p.encode()
self.modified = True
if self.entries is not None:
self.entries.append((id, lines, family))
else:
self.to_append.append((id, lines, family))

def pack(self):
return self.data
if self.entries is not None:
assert len(self.to_append) == 0
result = "".join([str(id) + ":" + lines + ":" + family + "\n" for id, lines, family in self.entries])
return result.encode()
if len(self.to_append) != 0:
result = "".join([str(id) + ":" + lines + ":" + family + "\n" for id, lines, family in self.to_append])
self.data += result.encode()
self.to_append = []
# With nothing buffered, self.data is already the packed form.
return self.data
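
For comparison, a sketch of the RefList wire format under the same assumptions: one id:lines:family record per newline-terminated row, where lines is itself a comma-separated list of line numbers:

    r = RefList()
    r.append(3, '10,14', 'C')    # blob 3 references the symbol on lines 10 and 14
    r.append(5, '7', 'K')
    packed = r.pack()            # b'3:10,14:C\n5:7:K\n'
    for id, lines, family in RefList(packed).iter():
        print(id, lines, family)     # (3, '10,14', 'C') then (5, '7', 'K')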

class BsdDB:
def __init__(self, filename, readonly, contentType, shared=False):
def __init__(self, filename, readonly, contentType, shared=False, cachesize=None):
self.filename = filename
self.db = berkeleydb.db.DB()
flags = berkeleydb.db.DB_THREAD if shared else 0
self.flags = berkeleydb.db.DB_THREAD if shared else 0

if readonly:
flags |= berkeleydb.db.DB_RDONLY
self.db.open(filename, flags=flags)
self.readonly = readonly
if self.readonly:
self.flags |= berkeleydb.db.DB_RDONLY
else:
flags |= berkeleydb.db.DB_CREATE
self.db.open(filename, flags=flags, mode=0o644, dbtype=berkeleydb.db.DB_BTREE)
self.flags |= berkeleydb.db.DB_CREATE

if cachesize is not None:
self.db.set_cachesize(cachesize[0], cachesize[1])

self.open()
self.ctype = contentType

def open(self):
if self.readonly:
self.db.open(self.filename, flags=self.flags)
else:
self.db.open(self.filename, flags=self.flags, mode=0o644, dbtype=berkeleydb.db.DB_BTREE)

def exists(self, key):
key = lib.autoBytes(key)
key = autoBytes(key)
return self.db.exists(key)

def get(self, key):
key = lib.autoBytes(key)
key = autoBytes(key)
p = self.db.get(key)
return self.ctype(p) if p is not None else None
if p is None:
return None
p = self.ctype(p)
return p

def get_keys(self):
return self.db.keys()

def put(self, key, val, sync=False):
key = lib.autoBytes(key)
val = lib.autoBytes(val)
key = autoBytes(key)
val = autoBytes(val)
if type(val) is not bytes:
val = val.pack()
self.db.put(key, val)
if sync:
self.db.sync()

def sync(self):
self.db.sync()

def close(self):
self.db.close()

def __len__(self):
return self.db.stat()["nkeys"]
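
A hedged example of opening a single store with the larger cache (the file name is illustrative). Berkeley DB requires set_cachesize() to run before DB.open(), which is why __init__ above configures the cache before calling self.open():

    defs = BsdDB('definitions.db', False, DefList, cachesize=CACHESIZE)
    defs.put('some_ident', DefList())   # str keys pass through autoBytes();
                                        # non-bytes values are pack()ed
    defs.sync()
    defs.close()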

class CachedBsdDB:
def __init__(self, filename, readonly, contentType, cachesize):
self.filename = filename
self.db = None
self.readonly = readonly

self.cachesize = cachesize
self.cache = OrderedDict()

self.open()

self.ctype = contentType

def open(self):
if self.db is None:
self.db = berkeleydb.db.DB()

flags = 0

if self.readonly:
flags |= berkeleydb.db.DB_RDONLY
self.db.open(self.filename, flags=flags)
else:
flags |= berkeleydb.db.DB_CREATE
self.db.open(self.filename, flags=flags, mode=0o644, dbtype=berkeleydb.db.DB_BTREE)

def exists(self, key):
if key in self.cache:
return True

return self.db.exists(autoBytes(key))

def get(self, key):
if key in self.cache:
self.cache.move_to_end(key)
return self.cache[key]

p = self.db.get(autoBytes(key))
if p is None:
return None
p = self.ctype(p)

self.cache[key] = p
self.cache.move_to_end(key)
if len(self.cache) > self.cachesize:
old_k, old_v = self.cache.popitem(last=False)
if old_v.modified:
self.put_raw(old_k, old_v)

return p

def get_keys(self):
self.sync()
return self.db.keys()

def put(self, key, val):
if self.readonly:
raise Exception("database is readonly")

self.cache[key] = val
self.cache.move_to_end(key)
if len(self.cache) > self.cachesize:
old_k, old_v = self.cache.popitem(last=False)
if old_v.modified:
self.put_raw(old_k, old_v)

def put_raw(self, key, val, sync=False):
if self.readonly:
raise Exception("database is readonly")

key = autoBytes(key)
val = autoBytes(val)
if type(val) is not bytes:
val = val.pack()
self.db.put(key, val)
if sync:
self.db.sync()

def sync(self):
start = time.time()
flushed = 0
if not self.readonly:
for k, v in self.cache.items():
if v.modified:
v.modified = False
self.put_raw(k, v)
flushed += 1

print("synced", flushed, "/", len(self.cache), time.time()-start)
self.db.sync()

def close(self):
self.sync()
self.db.close()
self.db = None

def __len__(self):
return self.db.stat()["nkeys"]
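
CachedBsdDB is an LRU write-back cache in front of the Berkeley DB file: get() and put() keep up to cachesize parsed objects in an OrderedDict, evicting the least recently used entry and writing it to disk only if its modified flag is set. A usage sketch, with an illustrative key and cache size:

    refs = CachedBsdDB('references.db', False, RefList, cachesize=10000)
    rl = refs.get('some_ident') or RefList()
    rl.append(3, '10', 'C')        # append() sets rl.modified = True
    refs.put('some_ident', rl)     # stays in memory until eviction or sync
    refs.sync()                    # flushes every modified cached entry
    refs.close()                   # sync() plus close of the underlying DB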

class DB:
def __init__(self, dir, readonly=True, dtscomp=False, shared=False):
def __init__(self, dir, readonly=True, dtscomp=False, shared=False, update_cache=None):
if os.path.isdir(dir):
self.dir = dir
else:
raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), dir)

ro = readonly

if update_cache:
db_cls = lambda dir, ro, ctype: CachedBsdDB(dir, ro, ctype, cachesize=update_cache)
else:
db_cls = lambda dir, ro, ctype: BsdDB(dir, ro, ctype, shared=shared)

self.vars = BsdDB(dir + '/variables.db', ro, lambda x: int(x.decode()), shared=shared)
# Key-value store of basic information
self.blob = BsdDB(dir + '/blobs.db', ro, lambda x: int(x.decode()), shared=shared)
@@ -203,20 +395,20 @@ def __init__(self, dir, readonly=True, dtscomp=False, shared=False):
self.file = BsdDB(dir + '/filenames.db', ro, lambda x: x.decode(), shared=shared)
# Map serial number to filename
self.vers = BsdDB(dir + '/versions.db', ro, PathList, shared=shared)
self.defs = BsdDB(dir + '/definitions.db', ro, DefList, shared=shared)
self.defs = db_cls(dir + '/definitions.db', ro, DefList)
self.defs_cache = {}
NOOP = lambda x: x
self.defs_cache['C'] = BsdDB(dir + '/definitions-cache-C.db', ro, NOOP, shared=shared)
self.defs_cache['K'] = BsdDB(dir + '/definitions-cache-K.db', ro, NOOP, shared=shared)
self.defs_cache['D'] = BsdDB(dir + '/definitions-cache-D.db', ro, NOOP, shared=shared)
self.defs_cache['M'] = BsdDB(dir + '/definitions-cache-M.db', ro, NOOP, shared=shared)
assert sorted(self.defs_cache.keys()) == sorted(lib.CACHED_DEFINITIONS_FAMILIES)
self.refs = BsdDB(dir + '/references.db', ro, RefList, shared=shared)
self.docs = BsdDB(dir + '/doccomments.db', ro, RefList, shared=shared)
self.refs = db_cls(dir + '/references.db', ro, RefList)
self.docs = db_cls(dir + '/doccomments.db', ro, RefList)
self.dtscomp = dtscomp
if dtscomp:
self.comps = BsdDB(dir + '/compatibledts.db', ro, RefList, shared=shared)
self.comps_docs = BsdDB(dir + '/compatibledts_docs.db', ro, RefList, shared=shared)
self.comps = db_cls(dir + '/compatibledts.db', ro, RefList)
self.comps_docs = db_cls(dir + '/compatibledts_docs.db', ro, RefList)
# Use a RefList in case there are multiple doc comments for an identifier

def close(self):
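
Finally, a sketch of how the reworked update script might construct the database handle (the path, key, and cache size are illustrative, not taken from the diff): passing update_cache routes the large per-identifier stores (defs, refs, docs and, with dtscomp, comps/comps_docs) through CachedBsdDB, while the small bookkeeping stores stay plain BsdDB:

    db = DB('/srv/elixir-data/linux/data', readonly=False, dtscomp=True,
            update_cache=10000)
    db.defs.put('mutex_lock', DefList())   # hypothetical key, goes through the cache
    db.close()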