allow updates for batch_bulk_create decor 🎀 (#1015)

jacksund · web-flow · commit aa8208bd65a5 · 2026-04-06T16:30:30.000-05:00
diff --git a/src/simmate/database/utils.py b/src/simmate/database/utils.py
@@ -22,13 +22,30 @@
 APPS_TO_MIGRATE = list(apps.app_configs.keys())
 
 
-def batch_bulk_create(batch_size: int = 1000):
+def batch_bulk_create(
+    batch_size: int = 1000,
+    update_conflicts: bool = False,
+    unique_fields: list[str] = None,
+    update_fields: list[str] = None,
+):
     """
     Decorator for the `load_source_data` classmethod on DatabaseTables.
     Expects the wrapped method to be a generator that yields database objects.
     This handles creating the objects in batches using `bulk_create`.
+
+    By default, conflicts are ignored (insert-only) and `unique_fields` / `update_fields`
+    are unused. For upserts, set `update_conflicts=True` and provide both of them.
     """
 
+    if update_conflicts:
+        bulk_create_kwargs = dict(
+            update_conflicts=True,
+            unique_fields=unique_fields,
+            update_fields=update_fields,
+        )
+    else:
+        bulk_create_kwargs = dict(ignore_conflicts=True)
+
     def decorator(func):
         @wraps(func)
         def wrapper(cls, *args, **kwargs):
@@ -41,15 +58,15 @@ def wrapper(cls, *args, **kwargs):
                     cls.objects.bulk_create(
                         db_objs,
                         batch_size=batch_size,
-                        ignore_conflicts=True,
+                        **bulk_create_kwargs,
                     )
                     db_objs = []  # reset for next batch
             # save any remaining
             if db_objs:
                 cls.objects.bulk_create(
                     db_objs,
                     batch_size=batch_size,
-                    ignore_conflicts=True,
+                    **bulk_create_kwargs,
                 )
 
         return wrapper
diff --git a/src/simmate/toolkit/dataframes/molecule.py b/src/simmate/toolkit/dataframes/molecule.py
@@ -303,6 +303,34 @@ def filter_from_ids(self, ids: list[int]):
 
     @cached_property
     def substructure_library(self):
+        """
+        Builds an RDKit SubstructLibrary for fast substructure searching.
+
+        This is the recommended approach over `_custom_substructure_filter`
+        because it handles fingerprint pre-screening and substructure matching
+        internally with multi-threaded C++ code.
+
+        Build times (per 1M molecules):
+            - CachedTrustedSmilesMolHolder: ~0.5s
+            - PatternHolder (fingerprints): ~10min
+
+        Query times (~15M molecule catalog, SMARTS query):
+            -  1 core: ~20s
+            -  2 cores: ~12s
+            -  4 cores: ~7s
+            -  8 cores: ~5s
+            - 20 cores: ~4s
+            At 1M molecules: ~3.5s on 1 core, ~0.5s on all cores (20).
+
+        Memory usage:
+            - SMILES (mollib):       ~1.3 GB / 10M, ~2 GB / 16M
+            - Fingerprints (fpslib): ~4.4 GB / 10M, ~7.1 GB / 16M
+            - Total for 16M: ~9.1 GB (SMILES + fingerprints)
+
+        References:
+            - https://www.rdkit.org/docs/source/rdkit.Chem.rdSubstructLibrary.html
+            - https://rdkit.blogspot.com/2018/02/introducing-substructlibrary.html
+        """
         logging.info("Generating substructure library...")
         if (
             "smiles" not in self.df.columns
@@ -346,8 +374,28 @@ def substructure_library(self):
     # -------------------------------------------------------------------------
 
     def _custom_substructure_filter(self, query: Molecule):
-        # this is a unwrapped version of rdkit's substruc lib. I keep it here becuase
-        # it helps to know what is happening behind the scenes
+        """
+        An unwrapped version of RDKit's SubstructLibrary. Kept here to show
+        what is happening behind the scenes in the `substructure_library`
+        property + `GetMatches` call.
+
+        Two-phase approach:
+            1. Fingerprint pre-screen with AllProbeBitsMatch (fast C++ bit check)
+            2. Exact substructure match only on candidates (using sanitize=False
+               trusted SMILES trick for speed)
+
+        For production use, prefer `substructure_library` which does both phases
+        internally in multi-threaded C++ and is significantly faster.
+
+        Key speed notes:
+            - Molecule.from_smiles() is too slow for the inner loop; use
+              AllChem.MolFromSmiles(smi, sanitize=False) + UpdatePropertyCache()
+              instead. See: https://rdkit.blogspot.com/2016/09/avoiding-unnecessary-work-and.html
+            - Fingerprint generation (PatternFingerprint.featurize_many with
+              parallel=True): ~30min for 16M molecules.
+            - Property featurization (PropertyGrabber.featurize_many): ~17s
+              parallel vs ~60s serial for standard descriptors.
+        """
 
         from rdkit.Chem import AllChem, DataStructs
 
@@ -367,12 +415,9 @@ def _custom_substructure_filter(self, query: Molecule):
         hit_ids = []
         q = query.rdkit_molecule
         for i in candidate_ids:
-            # a faster way to load trusted smiles based on CachedTrustedSmilesMolHolder
-            # https://rdkit.blogspot.com/2016/09/avoiding-unnecessary-work-and.html
-            # mol = Molecule.from_smiles(df[i]["smiles"][0]) # too slow
             mol = AllChem.MolFromSmiles(self.df[i]["smiles"][0], sanitize=False)
             mol.UpdatePropertyCache()
-            # Chem.FastFindRings(mol)
+            # Chem.FastFindRings(mol)  # not needed for most SMARTS
             if mol.HasSubstructMatch(q):
                 hit_ids.append(i)