@@ -303,6 +303,34 @@ def filter_from_ids(self, ids: list[int]):
303303
304304 @cached_property
305305 def substructure_library (self ):
306+ """
307+ Builds an RDKit SubstructLibrary for fast substructure searching.
308+
309+ This is the recommended approach over `_custom_substructure_filter`
310+ because it handles fingerprint pre-screening and substructure matching
311+ internally with multi-threaded C++ code.
312+
313+ Build times (per 1M molecules):
314+ - CachedTrustedSmilesMolHolder: ~0.5s
315+ - PatternHolder (fingerprints): ~10min
316+
317+ Query times (~15M molecule catalog, SMARTS query):
318+ - 1 core: ~20s
319+ - 2 cores: ~12s
320+ - 4 cores: ~7s
321+ - 8 cores: ~5s
322+ - 20 cores: ~4s
323+ At 1M molecules: ~3.5s on 1 core, ~0.5s on all cores (20).
324+
325+ Memory usage:
326+ - SMILES (mollib): ~1.3 GB / 10M, ~2 GB / 16M
327+ - Fingerprints (fpslib): ~4.4 GB / 10M, ~7.1 GB / 16M
328+ - Total (SMILES + fingerprints) for 16M: ~9.1 GB
329+
330+ References:
331+ - https://www.rdkit.org/docs/source/rdkit.Chem.rdSubstructLibrary.html
332+ - https://rdkit.blogspot.com/2018/02/introducing-substructlibrary.html
333+ """
306334 logging .info ("Generating substructure library..." )
307335 if (
308336 "smiles" not in self .df .columns
@@ -346,8 +374,28 @@ def substructure_library(self):
346374 # -------------------------------------------------------------------------
347375
348376 def _custom_substructure_filter (self , query : Molecule ):
349- # this is a unwrapped version of rdkit's substruc lib. I keep it here becuase
350- # it helps to know what is happening behind the scenes
377+ """
378+ An unwrapped version of RDKit's SubstructLibrary. Kept here to show
379+ what is happening behind the scenes in the `substructure_library`
380+ property + `GetMatches` call.
381+
382+ Two-phase approach:
383+ 1. Fingerprint pre-screen with AllProbeBitsMatch (fast C++ bit check)
384+ 2. Exact substructure match only on candidates (using sanitize=False
385+ trusted SMILES trick for speed)
386+
387+ For production use, prefer `substructure_library` which does both phases
388+ internally in multi-threaded C++ and is significantly faster.
389+
390+ Key speed notes:
391+ - Molecule.from_smiles() is too slow for the inner loop; use
392+ AllChem.MolFromSmiles(smi, sanitize=False) + UpdatePropertyCache()
393+ instead. See: https://rdkit.blogspot.com/2016/09/avoiding-unnecessary-work-and.html
394+ - Fingerprint generation (PatternFingerprint.featurize_many with
395+ parallel=True): ~30min for 16M molecules.
396+ - Property featurization (PropertyGrabber.featurize_many): ~17s
397+ parallel vs ~60s serial for standard descriptors.
398+ """
351399
352400 from rdkit .Chem import AllChem , DataStructs
353401
@@ -367,12 +415,9 @@ def _custom_substructure_filter(self, query: Molecule):
367415 hit_ids = []
368416 q = query .rdkit_molecule
369417 for i in candidate_ids :
370- # a faster way to load trusted smiles based on CachedTrustedSmilesMolHolder
371- # https://rdkit.blogspot.com/2016/09/avoiding-unnecessary-work-and.html
372- # mol = Molecule.from_smiles(df[i]["smiles"][0]) # too slow
373418 mol = AllChem .MolFromSmiles (self .df [i ]["smiles" ][0 ], sanitize = False )
374419 mol .UpdatePropertyCache ()
375- # Chem.FastFindRings(mol)
420+ # Chem.FastFindRings(mol) # not needed for most SMARTS
376421 if mol .HasSubstructMatch (q ):
377422 hit_ids .append (i )
378423
0 commit comments