Skip to content

Commit aa8208b

Browse files
authored
allow updates for batch_bulk_create decor 🎀 (#1015)
1 parent 7644a0b commit aa8208b

2 files changed

Lines changed: 71 additions & 9 deletions

File tree

src/simmate/database/utils.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,30 @@
2222
APPS_TO_MIGRATE = list(apps.app_configs.keys())
2323

2424

25-
def batch_bulk_create(batch_size: int = 1000):
25+
def batch_bulk_create(
26+
batch_size: int = 1000,
27+
update_conflicts: bool = False,
28+
unique_fields: list[str] = None,
29+
update_fields: list[str] = None,
30+
):
2631
"""
2732
Decorator for the `load_source_data` classmethod on DatabaseTables.
2833
Expects the wrapped method to be a generator that yields database objects.
2934
This handles creating the objects in batches using `bulk_create`.
35+
36+
By default, conflicts are ignored (insert-only). To enable upsert behavior,
37+
set `update_conflicts=True` and provide `unique_fields` and `update_fields`.
3038
"""
3139

40+
if update_conflicts:
41+
bulk_create_kwargs = dict(
42+
update_conflicts=True,
43+
unique_fields=unique_fields,
44+
update_fields=update_fields,
45+
)
46+
else:
47+
bulk_create_kwargs = dict(ignore_conflicts=True)
48+
3249
def decorator(func):
3350
@wraps(func)
3451
def wrapper(cls, *args, **kwargs):
@@ -41,15 +58,15 @@ def wrapper(cls, *args, **kwargs):
4158
cls.objects.bulk_create(
4259
db_objs,
4360
batch_size=batch_size,
44-
ignore_conflicts=True,
61+
**bulk_create_kwargs,
4562
)
4663
db_objs = [] # reset for next batch
4764
# save any remaining
4865
if db_objs:
4966
cls.objects.bulk_create(
5067
db_objs,
5168
batch_size=batch_size,
52-
ignore_conflicts=True,
69+
**bulk_create_kwargs,
5370
)
5471

5572
return wrapper

src/simmate/toolkit/dataframes/molecule.py

Lines changed: 51 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,34 @@ def filter_from_ids(self, ids: list[int]):
303303

304304
@cached_property
305305
def substructure_library(self):
306+
"""
307+
Builds an RDKit SubstructLibrary for fast substructure searching.
308+
309+
This is the recommended approach over `_custom_substructure_filter`
310+
because it handles fingerprint pre-screening and substructure matching
311+
internally with multi-threaded C++ code.
312+
313+
Build times (per 1M molecules):
314+
- CachedTrustedSmilesMolHolder: ~0.5s per 1M
315+
- PatternHolder (fingerprints): ~10min per 1M
316+
317+
Query times (~15M molecule catalog, SMARTS query):
318+
- 1 core: ~20s
319+
- 2 cores: ~12s
320+
- 4 cores: ~7s
321+
- 8 cores: ~5s
322+
- 20 cores: ~4s
323+
At 1M molecules: ~3.5s on 1 core, ~0.5s on all cores (20).
324+
325+
Memory usage:
326+
- SMILES (mollib): ~1.3 GB / 10M, ~2 GB / 16M
327+
- Fingerprints (fpslib): ~4.4 GB / 10M, ~7.1 GB / 16M
328+
- Total for 16M: ~9.1 GB
329+
330+
References:
331+
- https://www.rdkit.org/docs/source/rdkit.Chem.rdSubstructLibrary.html
332+
- https://rdkit.blogspot.com/2018/02/introducing-substructlibrary.html
333+
"""
306334
logging.info("Generating substructure library...")
307335
if (
308336
"smiles" not in self.df.columns
@@ -346,8 +374,28 @@ def substructure_library(self):
346374
# -------------------------------------------------------------------------
347375

348376
def _custom_substructure_filter(self, query: Molecule):
349-
# this is a unwrapped version of rdkit's substruc lib. I keep it here becuase
350-
# it helps to know what is happening behind the scenes
377+
"""
378+
An unwrapped version of RDKit's SubstructLibrary. Kept here to show
379+
what is happening behind the scenes in the `substructure_library`
380+
property + `GetMatches` call.
381+
382+
Two-phase approach:
383+
1. Fingerprint pre-screen with AllProbeBitsMatch (fast C++ bit check)
384+
2. Exact substructure match only on candidates (using sanitize=False
385+
trusted SMILES trick for speed)
386+
387+
For production use, prefer `substructure_library` which does both phases
388+
internally in multi-threaded C++ and is significantly faster.
389+
390+
Key speed notes:
391+
- Molecule.from_smiles() is too slow for the inner loop; use
392+
AllChem.MolFromSmiles(smi, sanitize=False) + UpdatePropertyCache()
393+
instead. See: https://rdkit.blogspot.com/2016/09/avoiding-unnecessary-work-and.html
394+
- Fingerprint generation (PatternFingerprint.featurize_many with
395+
parallel=True): ~30min for 16M molecules.
396+
- Property featurization (PropertyGrabber.featurize_many): ~17s
397+
parallel vs ~60s serial for standard descriptors.
398+
"""
351399

352400
from rdkit.Chem import AllChem, DataStructs
353401

@@ -367,12 +415,9 @@ def _custom_substructure_filter(self, query: Molecule):
367415
hit_ids = []
368416
q = query.rdkit_molecule
369417
for i in candidate_ids:
370-
# a faster way to load trusted smiles based on CachedTrustedSmilesMolHolder
371-
# https://rdkit.blogspot.com/2016/09/avoiding-unnecessary-work-and.html
372-
# mol = Molecule.from_smiles(df[i]["smiles"][0]) # too slow
373418
mol = AllChem.MolFromSmiles(self.df[i]["smiles"][0], sanitize=False)
374419
mol.UpdatePropertyCache()
375-
# Chem.FastFindRings(mol)
420+
# Chem.FastFindRings(mol) # not needed for most SMARTS
376421
if mol.HasSubstructMatch(q):
377422
hit_ids.append(i)
378423

0 commit comments

Comments
 (0)