diff --git a/pyproject.toml b/pyproject.toml index b63a1725..e535eee4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -93,4 +93,9 @@ replace = 'version = "{new_version}"' [[tool.bumpversion.files]] filename = "src/poli/__init__.py" search = '__version__ = "{current_version}"' -replace = '__version__ = "{new_version}"' \ No newline at end of file +replace = '__version__ = "{new_version}"' + +[[tool.bumpversion.files]] +filename = "setup.cfg" +search = 'version = "{current_version}"' +replace = 'version = "{new_version}"' \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index 55c8c5ed..253dc225 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = poli -version = "1.0.0.dev0" +version = "1.0.0.dev2" author_email = miguel.gonzalez-duque@bio.ku.dk description = Protein Objectives Library long_description = file: README.md diff --git a/src/poli/core/util/chemistry/smiles_alphabet.py b/src/poli/core/util/chemistry/smiles_alphabet.py new file mode 100644 index 00000000..ddff9e1c --- /dev/null +++ b/src/poli/core/util/chemistry/smiles_alphabet.py @@ -0,0 +1,2349 @@ +CID_SMILES_COUNTS = { + "": 44420414080, + "C": 2501779041, + "=": 923142831, + "(": 660472511, + ")": 660472511, + "O": 350079998, + "N": 325800358, + "1": 256135776, + "2": 208175970, + "3": 137611186, + "4": 78117092, + "F": 53547041, + "S": 44262279, + "5": 37744044, + "[C@H]": 29592982, + "[C@@H]": 29562650, + "Cl": 27583359, + "/": 27499221, + ".": 26383866, + "6": 19040680, + "7": 11946940, + "\\": 10219927, + "Br": 9309871, + "8": 8971834, + "[N+]": 7559684, + "9": 7089046, + "[O-]": 7077949, + "[C@@]": 3671846, + "[C@]": 3281384, + "[2H]": 3152821, + "P": 3123822, + "I": 2807425, + "[Si]": 2335956, + "[C-]": 1553009, + "B": 989657, + "[NH+]": 678291, + "[N-]": 666314, + "[Na+]": 451479, + "[B]": 410832, + "[HH]": 404315, + "[Cl-]": 363185, + "[Y]": 279710, + "[CH-]": 254408, + "[P+]": 214734, + "[Ir]": 206755, + "[NH2+]": 197772, + "[CH]": 197689, + "[CH3-]": 
169626, + "[CH2-]": 158704, + "[K+]": 155674, + "[S+]": 136431, + "[Br-]": 135664, + "[Se]": 123182, + "[I-]": 116496, + "[B-]": 110600, + "[NH3+]": 107587, + "[3H]": 103034, + "[S-]": 99616, + "[Li+]": 99273, + "[Pt]": 93992, + "[C]": 93594, + "[O+]": 91731, + "[Sn]": 91285, + "[Pt+2]": 88838, + "[W]": 86357, + "[SiH]": 77344, + "[NH-]": 72058, + "[V]": 71935, + "[OH-]": 71660, + "[PH+]": 61439, + "[H-]": 55447, + "[SiH2]": 55377, + "[S@]": 52361, + "[S@@]": 51342, + "[Na]": 49490, + "[Ge]": 46495, + "[O-2]": 45698, + "%10": 44860, + "[Mg+2]": 43788, + "[K]": 41663, + "[Al]": 40513, + "[SiH3]": 39129, + "[Zr+2]": 37152, + "[CH2]": 35935, + "[NH4+]": 35325, + "[Zn+2]": 34238, + "%11": 33370, + "[Fe]": 32730, + "[Pd]": 31923, + "[Fe+2]": 30052, + "[OH+]": 29120, + "%12": 28668, + "[F-]": 28111, + "[Cu+2]": 28110, + "[P@]": 28050, + "[Co]": 28009, + "[Ca+2]": 27718, + "[Ti]": 27663, + "[P@@]": 27320, + "[Ir+3]": 26574, + "[As]": 26180, + "[Pd+2]": 26042, + "[Zn]": 25837, + "[C+]": 25663, + "[Zr]": 24917, + "[P-]": 24617, + "[Rb]": 24595, + "[Cu]": 24259, + "[U]": 23906, + "[13C]": 22163, + "%13": 22084, + "[Ni]": 21920, + "[Ru]": 19831, + "[H+]": 19611, + "[Ni+2]": 19363, + "[Te]": 18845, + "[NH2-]": 18549, + "[CH+]": 18135, + "[Ac]": 17967, + "[OH3+]": 17045, + "[O]": 16785, + "%14": 16552, + "[Cr]": 16517, + "[I+]": 16410, + "[13CH2]": 15876, + "[18F]": 15627, + "[Zr+4]": 15416, + "[OH2+]": 15244, + "[Mo]": 15048, + "[2HH]": 14854, + "[Mn]": 14791, + "[Ti+4]": 14733, + "[Tb]": 14689, + "[13CH]": 14239, + "[Li]": 14180, + "%15": 13950, + "[Cu+]": 13930, + "[Rb+]": 13858, + "[Cs+]": 13650, + "[Rf]": 12826, + "%16": 12596, + "[Sb]": 12251, + "[Co+2]": 12210, + "[Ar]": 12144, + "[Ag+]": 11556, + "[Au+]": 11441, + "[W+2]": 11335, + "%17": 11326, + "[Rh]": 11324, + "[Ru+2]": 11121, + "[Hg]": 11008, + "[Re]": 10156, + "[Ba+2]": 10114, + "[Ti+2]": 9859, + "[Pb]": 9607, + "[Ag]": 9569, + "[SH+]": 9510, + "[Al+3]": 9481, + "[CH2+]": 8633, + "[N@+]": 8469, + "%18": 8130, + 
"[Pt+4]": 8096, + "[N@@+]": 8082, + "[Mn+2]": 8058, + "[Hf]": 7877, + "[H]": 7476, + "[13CH3]": 7452, + "[Au]": 7412, + "[Bi]": 7351, + "[Fe+3]": 7344, + "[Y+3]": 7204, + "[U+2]": 7195, + "[Sn+2]": 6888, + "[Ga]": 6612, + "%19": 6572, + "[In]": 6540, + "[3HH]": 6484, + "[SH2+]": 6332, + "[Hf+4]": 6303, + "[Os]": 6239, + "[Zn+]": 6227, + "[V+2]": 5854, + "[Ca]": 5822, + "[N]": 5747, + "[Mg]": 5700, + "[Co+3]": 5643, + "%20": 5590, + "[SH-]": 5483, + "[Ti+3]": 5392, + "[15N]": 5013, + "[Si-]": 4981, + "[Zr+3]": 4687, + "[Gd+3]": 4659, + "%21": 4504, + "[15NH]": 4478, + "[Tl]": 4388, + "[Ru+]": 4386, + "[AlH3]": 4331, + "[CH3+]": 4257, + "[Cr+3]": 4073, + "[Cd]": 4020, + "[14C]": 4002, + "[BH3-]": 3970, + "%22": 3952, + "[Ru+3]": 3833, + "[S-2]": 3711, + "%23": 3674, + "[Gd]": 3671, + "[Sr+2]": 3519, + "[Se-]": 3519, + "[Cd+2]": 3504, + "[AlH2]": 3476, + "%24": 3398, + "%25": 3332, + "[Pd+]": 3231, + "[PH3+]": 3215, + "[AlH]": 3154, + "[Rh+2]": 3115, + "[PH2+]": 3112, + "[Sn+]": 3085, + "%26": 3044, + "[Au+3]": 2955, + "[SeH]": 2924, + "[Mn+3]": 2888, + "[Nb]": 2735, + "[Sb-]": 2680, + "[Ta]": 2673, + "[Hg+]": 2671, + "[14CH]": 2660, + "[Pr]": 2632, + "[13C@@H]": 2616, + "[Be+2]": 2612, + "[2H-]": 2605, + "[BH-]": 2592, + "[BH2-]": 2546, + "[La]": 2503, + "[Eu]": 2478, + "[Ce]": 2475, + "[11CH3]": 2422, + "[13C@H]": 2406, + "[Ti+]": 2314, + "[Os+2]": 2290, + "[IH+]": 2284, + "[Ni+3]": 2262, + "[NH]": 2232, + "[ClH+]": 2175, + "[Pb+2]": 2172, + "[Al+]": 2135, + "[125I]": 2111, + "[15NH2]": 2071, + "[Mo+2]": 2053, + "[La+3]": 1977, + "[Ba]": 1952, + "[Al-]": 1949, + "[Ce+3]": 1948, + "[18O]": 1943, + "[Si@@]": 1937, + "[Pt+]": 1928, + "[Si@]": 1906, + "[Sm]": 1904, + "[Cr+2]": 1879, + "[Eu+3]": 1875, + "[Nd]": 1839, + "[Rh+3]": 1825, + "[Hf+2]": 1805, + "[123I]": 1744, + "[Bi+3]": 1724, + "[Se+]": 1722, + "[Nd+3]": 1716, + "[3H-]": 1664, + "[PH-]": 1639, + "[GeH]": 1616, + "[Sn+4]": 1581, + "[Sc]": 1577, + "[Fm]": 1549, + "[SnH]": 1535, + "[Mo+4]": 1519, + "[Fe+4]": 
1492, + "[Yb]": 1487, + "[14CH2]": 1486, + "[Dy+3]": 1481, + "[As+]": 1421, + "[10B]": 1405, + "[Sb+3]": 1372, + "[GeH2]": 1324, + "[AsH]": 1289, + "[SiH4]": 1287, + "[Hg+2]": 1275, + "[V+4]": 1267, + "[Si+]": 1249, + "[In+3]": 1221, + "[Al+2]": 1220, + "[Os+]": 1207, + "[Ga+3]": 1196, + "[Lu]": 1161, + "[BH4-]": 1153, + "[Tb+3]": 1145, + "[Fe+]": 1118, + "[Sr]": 1107, + "[Sn+3]": 1100, + "[131I]": 1082, + "%27": 1066, + "[Yb+3]": 1033, + "[Sm+3]": 1019, + "[SiH-]": 1010, + "[Tc]": 1002, + "[Sc+3]": 979, + "[Ru+4]": 952, + "[99Tc]": 942, + "[Po]": 941, + "[11C]": 919, + "[14CH3]": 902, + "[Ru+6]": 884, + "[Cs]": 868, + "[Er]": 863, + "[SH3+]": 863, + "[Ir+2]": 861, + "[Tl+]": 852, + "[Dy]": 829, + "[GeH3]": 829, + "[99Tc+4]": 823, + "[Er+3]": 823, + "[Se-2]": 822, + "[PH4+]": 807, + "[Ho]": 802, + "[Si+4]": 778, + "[No]": 773, + "[Pr+3]": 760, + "[Th]": 758, + "[BiH2]": 741, + "[S]": 741, + "[Nb+5]": 728, + "[11B]": 727, + "%28": 726, + "[Si-2]": 718, + "[Ta+5]": 707, + "[Fe+6]": 689, + "[P]": 677, + "[Te+]": 674, + "[18OH]": 672, + "[SnH2]": 671, + "[Ce+4]": 664, + "[BH]": 662, + "[Fe+5]": 643, + "[As-]": 637, + "[1H]": 636, + "[Lu+3]": 626, + "[19F]": 622, + "[Os+4]": 618, + "[U+4]": 618, + "[IH2+]": 598, + "[Tm]": 586, + "[Cr+6]": 563, + "[Mo+3]": 548, + "%29": 532, + "[Th+4]": 526, + "[Ho+3]": 521, + "[PH2-]": 514, + "[124I]": 509, + "[At]": 506, + "[Cm]": 499, + "[SbH]": 493, + "[Co+]": 492, + "[GaH]": 487, + "[Bi+2]": 485, + "[SnH3]": 480, + "[AsH2]": 474, + "[15N+]": 471, + "[Cr+]": 470, + "[13C@@]": 461, + "[Cl+]": 458, + "[GaH3]": 457, + "[Be]": 453, + "[Np]": 448, + "[Eu+2]": 446, + "[Sg]": 444, + "[GeH4]": 440, + "[14C@@H]": 438, + "%30": 438, + "[Lr]": 431, + "[Tc+4]": 430, + "[13C@]": 412, + "[76Br]": 409, + "[Cr+4]": 409, + "[Xe]": 408, + "[ClH2+]": 407, + "%31": 406, + "[U+3]": 405, + "[10BH2]": 402, + "[TeH]": 401, + "[14C@H]": 400, + "[Ru+5]": 395, + "[Yb+2]": 393, + "[Tm+3]": 392, + "%32": 392, + "[B@@-]": 392, + "[B@-]": 387, + "[BiH]": 378, + 
"[RuH]": 377, + "[RuH+2]": 367, + "[Pu]": 366, + "[Nb+2]": 365, + "[Sb+]": 359, + "[P-3]": 331, + "[PH]": 329, + "[Sb+5]": 321, + "[Ga-]": 311, + "%37": 310, + "[12C]": 295, + "[Gd+2]": 288, + "[Ta+2]": 286, + "[17O]": 284, + "%33": 284, + "[Si+2]": 282, + "[211At]": 278, + "[IH]": 276, + "[Pm]": 275, + "[B+]": 275, + "[Cf]": 273, + "[64Cu]": 273, + "[Zn-2]": 273, + "[Li-]": 267, + "[Cr+5]": 266, + "[Bi+]": 265, + "[11CH2]": 260, + "[Sn-]": 259, + "[Si@@H]": 258, + "[GaH2]": 258, + "[He]": 258, + "%36": 256, + "[PbH]": 250, + "[99Tc+3]": 247, + "[Am]": 241, + "%35": 240, + "[P-2]": 235, + "[SbH2]": 233, + "%34": 232, + "[Fe-]": 231, + "[SeH2]": 228, + "[Si@H]": 218, + "[Ni+]": 216, + "%38": 216, + "[InH2]": 214, + "[18F-]": 212, + "[Es]": 212, + "%40": 210, + "[Ge+4]": 209, + "[Ra]": 204, + "[Ir-3]": 203, + "%39": 202, + "[Pa]": 198, + "[NiH2]": 193, + "[OH]": 192, + "[Mn+]": 192, + "%41": 192, + "[16OH]": 188, + "%43": 184, + "[Ir+]": 182, + "%42": 182, + "[Bk]": 180, + "[Th+2]": 179, + "[Rh+]": 178, + "[35S]": 177, + "[Tc+3]": 176, + "[3H+]": 173, + "[32P]": 171, + "[64Cu+2]": 170, + "[Os+6]": 167, + "[AsH3]": 163, + "[16O]": 160, + "[PH-2]": 158, + "[Zn-]": 157, + "[InH]": 151, + "[Mg+]": 150, + "[Ne]": 150, + "%44": 148, + "[SbH3]": 143, + "[68Ga+3]": 142, + "[111In+3]": 140, + "[Ru+8]": 135, + "[Tl+3]": 134, + "[PbH2]": 132, + "%45": 132, + "[As+3]": 131, + "[TlH]": 129, + "[XeH]": 127, + "[13CH4]": 127, + "[18O-]": 121, + "[17OH]": 118, + "[13C-]": 118, + "[Nb+3]": 118, + "[Db]": 117, + "[InH3]": 117, + "[Ru-]": 117, + "[77Br]": 116, + "[12CH]": 116, + "[W+]": 115, + "[Mt]": 112, + "[11CH]": 112, + "[Si+3]": 110, + "[ClH]": 107, + "[PoH]": 105, + "[Tc+5]": 103, + "[68Ga]": 100, + "[In+]": 100, + "[Pt-2]": 98, + "[RuH+3]": 96, + "[Cu-]": 96, + "[SiH+]": 96, + "[14N]": 94, + "[177Lu]": 91, + "[BiH3]": 90, + "[IrH+2]": 89, + "[Kr]": 88, + "[NiH]": 88, + "[Mo+]": 87, + "[Os+5]": 86, + "[AlH2-]": 85, + "[TlH2]": 81, + "[Au-]": 81, + "[OsH]": 80, + "[RuH2]": 80, + 
"[RuH+]": 80, + "[PdH2]": 79, + "[Tc+2]": 78, + "[111In]": 77, + "[18FH]": 76, + "[Hs]": 72, + "[Bh]": 72, + "%46": 72, + "%47": 72, + "[12CH3]": 72, + "[Rn]": 69, + "[Cl]": 69, + "[SiH2-]": 69, + "[12CH2]": 68, + "[AlH2+]": 68, + "[99Tc+5]": 64, + "[Os+7]": 63, + "[35Cl]": 63, + "[Pd-2]": 62, + "[AlH-]": 62, + "%48": 62, + "[Tc+6]": 62, + "[90Y]": 61, + "[Ge-2]": 59, + "[P@@H]": 58, + "%49": 58, + "[PtH]": 57, + "[Co-2]": 57, + "[P@H]": 57, + "[Pd-]": 56, + "[Cu-2]": 55, + "[Ir-2]": 55, + "[177Lu+3]": 55, + "[10BH]": 55, + "[In-]": 55, + "[37Cl]": 55, + "[99Tc+7]": 54, + "[2H+]": 52, + "[Tc+7]": 52, + "[16N]": 51, + "[Tb+4]": 51, + "[34S]": 51, + "[AlH4-]": 50, + "[Ru-2]": 49, + "%51": 48, + "%53": 48, + "[TeH2]": 47, + "[10B-]": 47, + "[Ca+]": 46, + "[Tc+]": 46, + "%55": 46, + "[188Re]": 45, + "[IrH2]": 44, + "%50": 44, + "%52": 44, + "%54": 44, + "[AlH+]": 43, + "[Md]": 43, + "[Nb-]": 43, + "%56": 42, + "[CH3]": 41, + "[CuH]": 41, + "[15NH3]": 40, + "[10BH-]": 40, + "[Br]": 40, + "[14NH]": 38, + "[127I]": 38, + "[Ge@]": 38, + "[14C@@]": 38, + "[80Br]": 37, + "[SiH2+]": 37, + "[125I-]": 36, + "[RhH+2]": 35, + "[Ti-2]": 35, + "[B@@H-]": 35, + "[67Ga+3]": 35, + "[PbH2+2]": 35, + "[B@H-]": 35, + "%57": 34, + "[15N-]": 34, + "[67Cu]": 34, + "[Ge@@]": 33, + "[SiH3-]": 33, + "[13NH2]": 33, + "[12C@@H]": 33, + "[I]": 32, + "[SeH-]": 31, + "[ReH]": 31, + "[68GaH3]": 30, + "[PdH+]": 30, + "[15O]": 30, + "%58": 30, + "%60": 30, + "%62": 30, + "[225Ac]": 30, + "[WH2]": 29, + "[AsH2+]": 29, + "[Pt-]": 29, + "[75Se]": 29, + "[7Li+]": 28, + "%59": 28, + "%61": 28, + "[Hg-2]": 28, + "[PoH2]": 27, + "[RuH2+2]": 27, + "[11C-]": 27, + "[186Re]": 27, + "[ZrH2]": 26, + "[RhH]": 26, + "[14NH2]": 26, + "[6Li+]": 26, + "%64": 26, + "[OsH2]": 25, + "[Hg-]": 25, + "[14C-]": 25, + "[W-]": 25, + "[15NH3+]": 25, + "[Ni-2]": 24, + "[SeH+]": 24, + "[SnH+3]": 24, + "[As+5]": 24, + "[15NH+]": 24, + "[Ni-]": 24, + "%63": 24, + "[SnH4]": 23, + "[Al-3]": 23, + "[AsH3+]": 23, + "[90Y+3]": 23, + 
"[123I-]": 23, + "[MoH2]": 23, + "[Te+4]": 23, + "[WH]": 23, + "[Rh-]": 23, + "[113In+3]": 23, + "[Os-2]": 22, + "[PtH+]": 22, + "[BH2]": 22, + "[FeH2]": 22, + "[ClH2+2]": 22, + "[AsH+]": 22, + "[99Tc+6]": 22, + "[Zr-2]": 21, + "[GeH3-]": 21, + "[PH2]": 21, + "[Cl+3]": 21, + "[16NH]": 21, + "[CuH+]": 21, + "[RuH3]": 21, + "[AlH3-]": 21, + "[33P]": 21, + "[PbH3]": 21, + "[98Tc+5]": 21, + "[GeH2-]": 20, + "%65": 20, + "[BrH]": 19, + "[Te@@]": 19, + "[ReH4]": 19, + "[18OH2]": 19, + "[AtH]": 19, + "[99Tc+2]": 19, + "[15NH4+]": 19, + "[14C@]": 19, + "[Se+4]": 19, + "[89Zr]": 18, + "[IrH]": 18, + "[67Cu+2]": 18, + "[Al-2]": 18, + "[33S]": 18, + "[SeH2+]": 17, + "[Os+8]": 17, + "[HgH]": 17, + "[117Sn+4]": 17, + "[82Br]": 17, + "[125Te]": 17, + "[CoH2]": 17, + "[Rh-3]": 16, + "[35SH]": 16, + "[AlH+2]": 16, + "[16NH2]": 16, + "[SnH+]": 16, + "[67Ga]": 16, + "%66": 16, + "%67": 16, + "[RhH3]": 15, + "[TaH2]": 15, + "[RhH2]": 15, + "[4He]": 15, + "[16N+]": 15, + "[79Br]": 15, + "[13CH3-]": 15, + "[SiH2-2]": 15, + "[12C@@]": 14, + "[153Sm+3]": 14, + "[IrH3]": 14, + "[PdH]": 14, + "[FeH]": 14, + "[13N]": 14, + "[AsH-]": 14, + "[36Cl]": 14, + "[Fr]": 14, + "[18O-2]": 14, + "[PtH2]": 13, + "[89Zr+4]": 13, + "[60Co+3]": 13, + "[11CH4]": 13, + "[TaH3]": 13, + "[124I-]": 13, + "[13CH2-]": 13, + "[YH]": 13, + "[13CH-]": 13, + "[TlH3]": 13, + "[11CH3-]": 13, + "[166Ho+3]": 13, + "[Ge-]": 13, + "[153Sm]": 13, + "[Pr+]": 13, + "[25Mg+2]": 13, + "[ZnH+]": 12, + "[Te@]": 12, + "[1H-]": 12, + "[99Mo]": 12, + "[ZrH]": 12, + "[Bi-]": 12, + "[201Hg]": 12, + "[121Sb]": 12, + "[13NH]": 12, + "[12C@H]": 12, + "[13O]": 12, + "[131I-]": 12, + "[75Br]": 12, + "[InH4-]": 12, + "[Re-]": 12, + "[SiH-2]": 12, + "[12CH4]": 12, + "[15C]": 11, + "[Ta-]": 11, + "[64Zn+2]": 11, + "[121I]": 11, + "[153Gd+3]": 11, + "*": 10, + "[119Sn]": 10, + "[Re-2]": 10, + "[197Hg]": 10, + "[TaH]": 10, + "[15NH2+]": 10, + "[10BH3]": 10, + "[64Zn]": 10, + "[Cd-2]": 10, + "[19C]": 10, + "[CoH+]": 10, + "[14O]": 10, + 
"[Nb-2]": 9, + "[SbH3+]": 9, + "[241Am]": 9, + "[76As]": 9, + "[CoH+2]": 9, + "[SnH2+2]": 9, + "[14CH4]": 9, + "[SiH3+]": 9, + "[Ru-3]": 9, + "[14N+]": 9, + "[NH2]": 9, + "[17F]": 9, + "[Se@@]": 8, + "[Mn-2]": 8, + "[ReH7]": 8, + "[19BH2]": 8, + "[TiH2]": 8, + "[VH]": 8, + "[17O-]": 8, + "[123IH]": 8, + "[CuH2]": 8, + "[77Se]": 8, + "[SH2]": 8, + "[57Fe+2]": 8, + "[IrH4]": 7, + "[MnH+]": 7, + "[Se@]": 7, + "[131IH]": 7, + "[15OH]": 7, + "[SbH2+]": 7, + "[RuH4]": 7, + "[32S]": 7, + "[249Cf]": 7, + "[203Hg]": 7, + "[59Fe+2]": 7, + "[11C@@H]": 7, + "[11C@H]": 7, + "[122I]": 7, + "[81Br]": 7, + "[13CH3+]": 7, + "[14NH3]": 7, + "[166Ho]": 7, + "[153Gd]": 7, + "[Ba+]": 7, + "[PbH4]": 7, + "[18CH2]": 7, + "[18OH-]": 7, + "[20CH3]": 6, + "[MnH2]": 6, + "[99Tc+]": 6, + "[149Pm]": 6, + "[68GaH]": 6, + "[AuH]": 6, + "[BiH2+]": 6, + "[129I]": 6, + "[Ag-]": 6, + "[17OH2]": 6, + "[AlH2-2]": 6, + "[SbH-]": 6, + "[Rh-2]": 6, + "[203Hg+2]": 6, + "[197Hg+]": 6, + "[Fe-3]": 6, + "[SbH+]": 6, + "[74Se]": 6, + "[62Cu+2]": 6, + "[7Li]": 6, + "[TiH]": 6, + "[98Tc]": 6, + "[59Fe+3]": 6, + "[51Cr]": 6, + "[13OH]": 6, + "[14CH3-]": 6, + "[11BH3]": 6, + "[Fe-2]": 6, + "[28Si]": 6, + "[Cl@-]": 6, + "[Zr-]": 5, + "[F]": 5, + "[AlH6-3]": 5, + "[V+]": 5, + "[51Cr+3]": 5, + "[57Co]": 5, + "[195Pt]": 5, + "[203Pb]": 5, + "[MoH]": 5, + "[127I-]": 5, + "[111InH3]": 5, + "[ZnH2]": 5, + "[9C]": 5, + "[YH2]": 5, + "[SH]": 5, + "[74As]": 5, + "[SeH3+]": 5, + "[201Tl]": 5, + "[213Bi+3]": 5, + "[TeH2+]": 5, + "[197Au]": 5, + "[Ge@@H]": 5, + "[12BH2]": 5, + "[13C+]": 5, + "[239Pu]": 5, + "[87Sr+2]": 5, + "[SnH3-]": 5, + "[TeH3]": 5, + "[195Pt+2]": 5, + "[19FH]": 5, + "[198Au]": 5, + "[12C@]": 5, + "[11B-]": 5, + "[Zr+]": 5, + "[ZrH2+2]": 5, + "[99Ru+2]": 5, + "[67GaH3]": 4, + "[6Li]": 4, + "[17NH]": 4, + "[MnH]": 4, + "[213BiH]": 4, + "[GaH4-]": 4, + "[ThH2]": 4, + "[TiH+]": 4, + "[18O+]": 4, + "[8B]": 4, + "[OsH6]": 4, + "[NbH3]": 4, + "[1H+]": 4, + "[Cr-2]": 4, + "[252Cf]": 4, + "[18C]": 4, + "[66Ga+3]": 
4, + "[Be+]": 4, + "[86Y]": 4, + "[89Y]": 4, + "[24Na+]": 4, + "[17CH]": 4, + "[77As]": 4, + "[31P]": 4, + "[32P+]": 4, + "[115In]": 4, + "[Os-]": 4, + "[ClH3+2]": 4, + "[KrH]": 4, + "[13CH+]": 4, + "[Ta-2]": 4, + "[ClH+2]": 4, + "[57Co+2]": 4, + "[ClH-]": 4, + "[SbH4]": 4, + "[125IH]": 4, + "[19O]": 4, + "[57Fe]": 4, + "[Ta+]": 4, + "[203Hg+]": 4, + "[44Ca+2]": 4, + "[237Np]": 4, + "[Fe-4]": 4, + "[85Sr+2]": 4, + "[90Sr+2]": 4, + "[201Tl+]": 4, + "[18CH]": 4, + "[CoH]": 4, + "[ZrH3]": 4, + "[GaH-]": 4, + "[181Ta+2]": 4, + "[13N+]": 4, + "[58Fe+2]": 4, + "[18N]": 4, + "[WH4]": 3, + "[Cd-]": 3, + "[169Yb+3]": 3, + "[152Sm+3]": 3, + "[10C]": 3, + "[20C]": 3, + "[47Ca+2]": 3, + "[NbH2]": 3, + "[111InH2]": 3, + "[SbH4+]": 3, + "[ReH2]": 3, + "[CrH2]": 3, + "[OsH3]": 3, + "[62Cu]": 3, + "[238Pu]": 3, + "[19CH2]": 3, + "[Ni-3]": 3, + "[227Th+4]": 3, + "[220Rn]": 3, + "[10CH2]": 3, + "[10CH3]": 3, + "[36S]": 3, + "[35P]": 3, + "[133I]": 3, + "[75Ge]": 3, + "[213Bi]": 3, + "[211Pb]": 3, + "[82Se]": 3, + "[73Ge]": 3, + "[27Al]": 3, + "[8BH2]": 3, + "[212Pb+2]": 3, + "[94Tc]": 3, + "[Cl+2]": 3, + "[70Zn]": 3, + "[12C-]": 3, + "[72Zn]": 3, + "[35Cl-]": 3, + "[238U]": 3, + "[95Tc+4]": 3, + "[204Hg+]": 3, + "[34SH]": 3, + "[68Cu]": 3, + "[249Bk]": 3, + "[253Es]": 3, + "[Os-3]": 3, + "[59Fe]": 3, + "[Zr-3]": 3, + "[169Yb]": 3, + "[32Cl]": 3, + "[239Np]": 3, + "[89Sr+2]": 3, + "[1HH]": 3, + "[Mo-2]": 3, + "[45Ca+2]": 3, + "[BH2+]": 3, + "[60Co]": 3, + "[103Pd]": 3, + "[9CH]": 3, + "[15CH]": 3, + "[44Sc]": 3, + "[137Cs+]": 3, + "[165Dy]": 3, + "[106Ru]": 3, + "[65Zn+2]": 3, + "[55Fe+3]": 3, + "[11CH-]": 3, + "[159Gd+3]": 3, + "[105Rh+3]": 3, + "[165Dy+3]": 3, + "[227Th]": 3, + "[Ru-4]": 3, + "[209BiH3]": 3, + "[164Dy+3]": 3, + "[BiH2+2]": 3, + "[Bi-2]": 3, + "[195Pt+4]": 3, + "[12B]": 3, + "[111In-]": 3, + "[Te-]": 2, + "[32PH]": 2, + "[31PH]": 2, + "[CrH+2]": 2, + "[SnH-]": 2, + "[OsH-]": 2, + "[22CH3-]": 2, + "[ReH3]": 2, + "[MoH5]": 2, + "[15NH2-]": 2, + "[Cr-3]": 2, + 
"[144Ce+3]": 2, + "[237Pu]": 2, + "[244Cm]": 2, + "[235U+2]": 2, + "[72As]": 2, + "[WH3]": 2, + "[Co-]": 2, + "[GeH5-]": 2, + "[82Br-]": 2, + "[113In]": 2, + "[FeH3]": 2, + "[17C-]": 2, + "[GeH6-2]": 2, + "[70As]": 2, + "[75SeH]": 2, + "[210Pb]": 2, + "[40K]": 2, + "[214Pb]": 2, + "[7Be]": 2, + "[212Pb]": 2, + "[127Te]": 2, + "[205Pb]": 2, + "[209Pb]": 2, + "[123Te]": 2, + "[202Pb]": 2, + "[68Ge]": 2, + "[201Pb]": 2, + "[200Pb]": 2, + "[198Pb]": 2, + "[71As]": 2, + "[66Ga]": 2, + "[73Se]": 2, + "[195Pb]": 2, + "[199Pb]": 2, + "[SnH3+]": 2, + "[ClH4+3]": 2, + "[57Co+3]": 2, + "[HgH+]": 2, + "[63Cu]": 2, + "[AsH2-]": 2, + "[16O-]": 2, + "[17O+]": 2, + "[16OH-]": 2, + "[81BrH]": 2, + "[SnH2+]": 2, + "[70Ge]": 2, + "[120I]": 2, + "[109Cd+2]": 2, + "[SeH5]": 2, + "[98Tc+7]": 2, + "[208Bi]": 2, + "[152Gd]": 2, + "[199Tl+]": 2, + "[60Co+2]": 2, + "[13NH2-]": 2, + "[235Np]": 2, + "[236Np]": 2, + "[82Sr+2]": 2, + "[16OH2]": 2, + "[AsH4+]": 2, + "[SnH2-]": 2, + "[128I]": 2, + "[100Tc+]": 2, + "[89Y+3]": 2, + "[62Zn]": 2, + "[29Si]": 2, + "[Ti-]": 2, + "%68": 2, + "[CeH]": 2, + "[191Pt+2]": 2, + "[101Tc]": 2, + "[149Tb]": 2, + "[81Rb+]": 2, + "[203Tl+]": 2, + "[144Ce]": 2, + "[CoH3]": 2, + "[18C-]": 2, + "[14CH-]": 2, + "[AlH5-2]": 2, + "[15CH2]": 2, + "[11CH3+]": 2, + "[FeH6]": 2, + "[16C]": 2, + "[OsH4]": 2, + "[AgH]": 2, + "[253Cf]": 2, + "[187Re]": 2, + "[159Dy]": 2, + "[61Cu]": 2, + "[91Y]": 2, + "[231Pa]": 2, + "[20OH]": 2, + "[79BrH]": 2, + "[14CH2-]": 2, + "[132Cs+]": 2, + "[Mn-]": 2, + "[191Pt+4]": 2, + "[193Pt+4]": 2, + "[90Tc]": 2, + "[114In+3]": 2, + "[147Pm]": 2, + "[32PH3]": 2, + "[234U]": 2, + "[210PoH2]": 2, + "[210BiH3]": 2, + "[35SH2]": 2, + "[235U]": 2, + "[82Rb+]": 2, + "[65Zn]": 2, + "[212PbH2]": 2, + "[Se+6]": 2, + "[43Ca+2]": 2, + "[10CH4]": 2, + "[203PbH]": 2, + "[Cl@@-]": 2, + "[37Cl-]": 2, + "[TiH4]": 2, + "[129Xe]": 2, + "[9CH4]": 2, + "[RuH6]": 2, + "[PtH3]": 2, + "[31PH3]": 2, + "[194Pb]": 2, + "[196Pb]": 2, + "[197Pb]": 2, + "[213Pb]": 2, + 
"[42K]": 2, + "[22NH]": 2, + "[20CH]": 2, + "[76Br-]": 2, + "[Cr-]": 2, + "[Zr-4]": 2, + "[148Sm]": 2, + "[147Sm]": 2, + "[TiH+3]": 2, + "[MoH4]": 2, + "[85Rb+]": 2, + "[58Co+2]": 2, + "[69Ga+3]": 2, + "[22Na+]": 2, + "[43K+]": 2, + "[129I-]": 2, + "[13CH2+]": 2, + "[YbH2]": 2, + "[52Fe+3]": 2, + "[42K+]": 2, + "[187Os]": 2, + "[149Sm]": 2, + "[193Pt+2]": 2, + "[19B]": 2, + "[InH-]": 2, + "[131Cs+]": 2, + "[86Rb+]": 2, + "[134Cs+]": 2, + "[125Te+4]": 2, + "[PtH2+2]": 1, + "[38PH]": 1, + "[33PH]": 1, + "[40PH]": 1, + "[20CH2]": 1, + "[BiH5]": 1, + "[77Kr]": 1, + "[103Cd]": 1, + "[FeH4-3]": 1, + "[228Th+4]": 1, + "[62Ni]": 1, + "[LaH3]": 1, + "[SmH3]": 1, + "[EuH3]": 1, + "[64Ni]": 1, + "[129IH]": 1, + "[157Dy]": 1, + "[111IH]": 1, + "[230Ra]": 1, + "[144Pr+3]": 1, + "[167Dy]": 1, + "[154Gd]": 1, + "[95Ru]": 1, + "[210At]": 1, + "[124Te]": 1, + "[66Zn]": 1, + "[68Zn]": 1, + "[217Bi]": 1, + "[AsH4]": 1, + "[22CH3]": 1, + "[232Th]": 1, + "[SnH+2]": 1, + "[P@@H+]": 1, + "[233Ra]": 1, + "[24Mg]": 1, + "[200Hg]": 1, + "[53Ni]": 1, + "[131Xe]": 1, + "[174Hf+4]": 1, + "[174Hf]": 1, + "[76Se]": 1, + "[168Tm]": 1, + "[239Th]": 1, + "[186Lu]": 1, + "[13NH4+]": 1, + "[17C]": 1, + "[31S]": 1, + "[126I]": 1, + "[36SH]": 1, + "[30S]": 1, + "[32SH]": 1, + "[SbH5]": 1, + "[224Ra]": 1, + "[22Na]": 1, + "[210Po]": 1, + "[210Bi]": 1, + "[214Bi]": 1, + "[228Ra]": 1, + "[218Po]": 1, + "[127Sb]": 1, + "[136Cs]": 1, + "[125Sb]": 1, + "[134Cs]": 1, + "[140Ba]": 1, + "[45Ca]": 1, + "[206Pb]": 1, + "[207Pb]": 1, + "[24Na]": 1, + "[86Rb]": 1, + "[212Bi]": 1, + "[208Pb]": 1, + "[124Sb]": 1, + "[204Pb]": 1, + "[44K]": 1, + "[129Te]": 1, + "[113Sn]": 1, + "[204Tl]": 1, + "[87Sr]": 1, + "[208Tl]": 1, + "[87Rb]": 1, + "[47Ca]": 1, + "[135Cs]": 1, + "[216Po]": 1, + "[137Ba]": 1, + "[207Bi]": 1, + "[79Se]": 1, + "[223Ra]": 1, + "[86Sr]": 1, + "[122Sb]": 1, + "[26Al]": 1, + "[32Si]": 1, + "[126Sn]": 1, + "[225Ra]": 1, + "[114In]": 1, + "[72Ga]": 1, + "[132Te]": 1, + "[10Be]": 1, + "[125Sn]": 1, + 
"[73As]": 1, + "[206Bi]": 1, + "[117Sn]": 1, + "[40Ca]": 1, + "[41Ca]": 1, + "[89Rb]": 1, + "[116In]": 1, + "[129Sb]": 1, + "[91Sr]": 1, + "[71Ge]": 1, + "[139Ba]": 1, + "[69Ga]": 1, + "[120Sb]": 1, + "[121Sn]": 1, + "[123Sn]": 1, + "[131Te]": 1, + "[77Ge]": 1, + "[135Ba]": 1, + "[82Sr]": 1, + "[43K]": 1, + "[131Ba]": 1, + "[92Sr]": 1, + "[88Rb]": 1, + "[129Cs]": 1, + "[144Cs]": 1, + "[127Cs]": 1, + "[200Tl]": 1, + "[202Tl]": 1, + "[141Ba]": 1, + "[117Sb]": 1, + "[116Sb]": 1, + "[78As]": 1, + "[131Sb]": 1, + "[126Sb]": 1, + "[128Sb]": 1, + "[130Sb]": 1, + "[67Ge]": 1, + "[78Ge]": 1, + "[66Ge]": 1, + "[223Fr]": 1, + "[132Cs]": 1, + "[125Cs]": 1, + "[138Cs]": 1, + "[133Te]": 1, + "[84Rb]": 1, + "[83Rb]": 1, + "[81Rb]": 1, + "[142Ba]": 1, + "[200Bi]": 1, + "[115Sb]": 1, + "[194Tl]": 1, + "[70Se]": 1, + "[112In]": 1, + "[118Sb]": 1, + "[70Ga]": 1, + "[27Mg]": 1, + "[202Bi]": 1, + "[83Se]": 1, + "[9Li]": 1, + "[69As]": 1, + "[79Rb]": 1, + "[81Sr]": 1, + "[83Sr]": 1, + "[78Se]": 1, + "[109In]": 1, + "[29Al]": 1, + "[118Sn]": 1, + "[117In]": 1, + "[119Sb]": 1, + "[114Sn]": 1, + "[138Ba]": 1, + "[69Ge]": 1, + "[73Ga]": 1, + "[74Ge]": 1, + "[206Tl]": 1, + "[199Tl]": 1, + "[130Cs]": 1, + "[28Mg]": 1, + "[116Te]": 1, + "[112Sn]": 1, + "[126Ba]": 1, + "[211Bi]": 1, + "[81Se]": 1, + "[127Sn]": 1, + "[143Cs]": 1, + "[134Te]": 1, + "[80Sr]": 1, + "[45K]": 1, + "[215Po]": 1, + "[207Po]": 1, + "[111Sn]": 1, + "[211Po]": 1, + "[128Ba]": 1, + "[198Tl]": 1, + "[227Ra]": 1, + "[220Ra]": 1, + "[128Sn]": 1, + "[203Po]": 1, + "[205Po]": 1, + "[65Ga]": 1, + "[197Tl]": 1, + "[88Sr]": 1, + "[110In]": 1, + "[31Si]": 1, + "[201Bi]": 1, + "[121Te]": 1, + "[205Bi]": 1, + "[203Bi]": 1, + "[195Tl]": 1, + "[209Tl]": 1, + "[110Sn]": 1, + "[222Fr]": 1, + "[207At]": 1, + "[119In]": 1, + "[As@@]": 1, + "[65Cu+]": 1, + "[85Br]": 1, + "[59Co]": 1, + "[122Xe]": 1, + "[54Cr]": 1, + "[3He]": 1, + "[Co-3]": 1, + "[55Fe+2]": 1, + "[201TlH3]": 1, + "[25Mg]": 1, + "[51V]": 1, + "[93Nb]": 1, + "[95Mo]": 1, + 
"[45Sc]": 1, + "[123Sb]": 1, + "[139La]": 1, + "[9Be]": 1, + "[99Y+3]": 1, + "[99Y]": 1, + "[156Ho]": 1, + "[34S+]": 1, + "[38Cl]": 1, + "[14NH4+]": 1, + "[132Xe]": 1, + "[83Kr]": 1, + "[70Zn+2]": 1, + "[137Ba+2]": 1, + "[36Ar]": 1, + "[38Ar]": 1, + "[21Ne]": 1, + "[126Xe]": 1, + "[136Xe]": 1, + "[128Xe]": 1, + "[134Xe]": 1, + "[84Kr]": 1, + "[86Kr]": 1, + "[78Kr]": 1, + "[80Kr]": 1, + "[82Kr]": 1, + "[SiH4-]": 1, + "[98Tc+4]": 1, + "[76Ge]": 1, + "[108Cd]": 1, + "[116Cd]": 1, + "[130Xe]": 1, + "[94Mo]": 1, + "[124Sn]": 1, + "[186Os]": 1, + "[188Os]": 1, + "[190Os]": 1, + "[192Os]": 1, + "[106Pd]": 1, + "[110Pd]": 1, + "[120Te]": 1, + "[132Ba]": 1, + "[134Ba]": 1, + "[136Ba]": 1, + "[136Ce]": 1, + "[138Ce]": 1, + "[156Dy]": 1, + "[158Dy]": 1, + "[160Dy]": 1, + "[163Dy]": 1, + "[162Er]": 1, + "[164Er]": 1, + "[167Er]": 1, + "[176Hf]": 1, + "[26Mg]": 1, + "[144Nd]": 1, + "[150Nd]": 1, + "[41K]": 1, + "[46Ti]": 1, + "[48Ti]": 1, + "[49Ti]": 1, + "[50Ti]": 1, + "[170Yb]": 1, + "[173Yb]": 1, + "[91Zr]": 1, + "[92Zr]": 1, + "[96Zr]": 1, + "[34S-]": 1, + "[CuH2-]": 1, + "[22Ne]": 1, + "[205Bi+3]": 1, + "[63Ni+2]": 1, + "[EuH2]": 1, + "[67Zn]": 1, + "[144Ce+4]": 1, + "[210Tl]": 1, + "[42Ca]": 1, + "[54Fe]": 1, + "[193Ir]": 1, + "[92Nb]": 1, + "[141Cs]": 1, + "[52Cr]": 1, + "[35ClH]": 1, + "[46Ca]": 1, + "[139Cs]": 1, + "[65Cu]": 1, + "[71Ga]": 1, + "[60Ni]": 1, + "[16NH3]": 1, + "[148Nd]": 1, + "[72Ge]": 1, + "[161Dy]": 1, + "[49Ca]": 1, + "[43Ca]": 1, + "[48Ca]": 1, + "[44Ca]": 1, + "[120Xe]": 1, + "[80Rb]": 1, + "[180Re]": 1, + "[146Sm]": 1, + "[19Ne]": 1, + "[74Kr]": 1, + "[134La]": 1, + "[76Kr]": 1, + "[121Xe]": 1, + "[220Fr]": 1, + "[223Ac]": 1, + "[218At]": 1, + "[37Ar]": 1, + "[198Bi]": 1, + "[209Po]": 1, + "[208Po]": 1, + "[206Po]": 1, + "[204Po]": 1, + "[202Po]": 1, + "[201Po]": 1, + "[200Po]": 1, + "[199Po]": 1, + "[198Po]": 1, + "[197Po]": 1, + "[196Po]": 1, + "[195Po]": 1, + "[194Po]": 1, + "[193Po]": 1, + "[192Po]": 1, + "[191Po]": 1, + "[190Po]": 1, + 
"[217Po]": 1, + "[BiH4-]": 1, + "[89Sr]": 1, + "[SeH3]": 1, + "[TeH3+]": 1, + "[BH4+]": 1, + "[BH3+]": 1, + "[AsH3-]": 1, + "[82Rb]": 1, + "[85Sr]": 1, + "[90Sr]": 1, + "[137Cs]": 1, + "[133Ba]": 1, + "[131Cs]": 1, + "[XeH2]": 1, + "[74Br-]": 1, + "[133I-]": 1, + "[100Tc+4]": 1, + "[100Tc]": 1, + "[36Cl-]": 1, + "[104Rh]": 1, + "[152Sm]": 1, + "[226Ra]": 1, + "[104Pd]": 1, + "[148Gd]": 1, + "[157Lu]": 1, + "[33SH2]": 1, + "[121I-]": 1, + "[ScH3]": 1, + "[TeH4]": 1, + "[222Ra]": 1, + "[62Zn+2]": 1, + "[32ClH]": 1, + "[33ClH]": 1, + "[78BrH]": 1, + "[22CH4]": 1, + "[NiH+]": 1, + "[22CH2]": 1, + "[10CH]": 1, + "[15C-]": 1, + "[15CH3]": 1, + "[16CH3]": 1, + "[157Gd+3]": 1, + "[205Tl]": 1, + "[203Tl]": 1, + "[56Mn]": 1, + "[234Pa]": 1, + "[41Ar]": 1, + "[147Nd]": 1, + "[187W]": 1, + "[151Sm]": 1, + "[59Ni]": 1, + "[233Pa]": 1, + "[52Mn]": 1, + "[94Nb]": 1, + "[219Rn]": 1, + "[236Pu]": 1, + "[13NH3]": 1, + "[93Zr]": 1, + "[51Cr+6]": 1, + "[123Xe]": 1, + "[160Tb]": 1, + "[170Tm]": 1, + "[182Ta]": 1, + "[175Yb]": 1, + "[93Mo]": 1, + "[143Ce]": 1, + "[191Os]": 1, + "[126IH]": 1, + "[48V]": 1, + "[113Cd]": 1, + "[47Sc]": 1, + "[181Hf]": 1, + "[185W]": 1, + "[143Pr]": 1, + "[191Pt]": 1, + "[181W]": 1, + "[33PH3]": 1, + "[97Ru]": 1, + "[97Tc]": 1, + "[111Ag]": 1, + "[169Er]": 1, + "[107Pd]": 1, + "[103Ru+2]": 1, + "[34SH2]": 1, + "[137Ce]": 1, + "[242Am]": 1, + "[117SnH2]": 1, + "[57Ni]": 1, + "[239U]": 1, + "[60Cu]": 1, + "[250Cf]": 1, + "[193Au]": 1, + "[69Zn]": 1, + "[55Co]": 1, + "[139Ce]": 1, + "[127Xe]": 1, + "[159Gd]": 1, + "[56Co]": 1, + "[177Hf]": 1, + "[244Pu]": 1, + "[38ClH]": 1, + "[142Pr]": 1, + "[199Hg]": 1, + "[179Hf]": 1, + "[178Hf]": 1, + "[237U]": 1, + "[156Eu]": 1, + "[157Eu]": 1, + "[105Ru]": 1, + "[171Tm]": 1, + "[199Au]": 1, + "[155Sm]": 1, + "[80BrH]": 1, + "[108Ag]": 1, + "[128IH]": 1, + "[48Sc]": 1, + "[45Ti]": 1, + "[176Lu]": 1, + "[121SnH2]": 1, + "[148Pm]": 1, + "[96Tc]": 1, + "[133IH]": 1, + "[143Pm]": 1, + "[105Rh]": 1, + "[130IH]": 1, + 
"[134IH]": 1, + "[71Zn]": 1, + "[105Ag]": 1, + "[97Zr]": 1, + "[235Pu]": 1, + "[231Th]": 1, + "[109Pd]": 1, + "[93Y]": 1, + "[190Ir]": 1, + "[135Xe]": 1, + "[53Mn]": 1, + "[134Ce]": 1, + "[234Np]": 1, + "[240Am]": 1, + "[246Cf]": 1, + "[240Cm]": 1, + "[241Cm]": 1, + "[226Th]": 1, + "[39ClH]": 1, + "[229Th]": 1, + "[245Cm]": 1, + "[240U]": 1, + "[240Np]": 1, + "[249Cm]": 1, + "[243Pu]": 1, + "[145Pm]": 1, + "[199Pt]": 1, + "[246Bk]": 1, + "[193Pt]": 1, + "[230U]": 1, + "[250Cm]": 1, + "[44Ti]": 1, + "[175Hf]": 1, + "[254Fm]": 1, + "[255Fm]": 1, + "[257Fm]": 1, + "[92Y]": 1, + "[188Ir]": 1, + "[171Lu]": 1, + "[257Md]": 1, + "[247Bk]": 1, + "[121IH]": 1, + "[250Bk]": 1, + "[179Lu]": 1, + "[224Ac]": 1, + "[195Hg]": 1, + "[244Am]": 1, + "[246Pu]": 1, + "[194Au]": 1, + "[252Fm]": 1, + "[173Hf]": 1, + "[246Cm]": 1, + "[135Ce]": 1, + "[49Cr]": 1, + "[248Cf]": 1, + "[247Cm]": 1, + "[248Cm]": 1, + "[174Ta]": 1, + "[176Ta]": 1, + "[154Tb]": 1, + "[172Ta]": 1, + "[177Ta]": 1, + "[175Ta]": 1, + "[180Ta]": 1, + "[158Tb]": 1, + "[115Ag]": 1, + "[189Os]": 1, + "[251Cf]": 1, + "[145Pr]": 1, + "[147Pr]": 1, + "[76BrH]": 1, + "[102Rh]": 1, + "[238Np]": 1, + "[185Os]": 1, + "[246Am]": 1, + "[233Np]": 1, + "[166Dy]": 1, + "[254Es]": 1, + "[244Cf]": 1, + "[193Os]": 1, + "[245Am]": 1, + "[245Bk]": 1, + "[239Am]": 1, + "[238Am]": 1, + "[97Nb]": 1, + "[245Pu]": 1, + "[254Cf]": 1, + "[188W]": 1, + "[250Es]": 1, + "[251Es]": 1, + "[237Am]": 1, + "[182Hf]": 1, + "[258Md]": 1, + "[232Np]": 1, + "[238Cm]": 1, + "[60Fe]": 1, + "[109Pd+2]": 1, + "[234Pu]": 1, + "[141Ce+3]": 1, + "[136Nd]": 1, + "[136Pr]": 1, + "[173Ta]": 1, + "[110Ru]": 1, + "[147Tb]": 1, + "[253Fm]": 1, + "[139Nd]": 1, + "[178Re]": 1, + "[177Re]": 1, + "[200Au]": 1, + "[182Re]": 1, + "[156Tb]": 1, + "[155Tb]": 1, + "[157Tb]": 1, + "[161Tb]": 1, + "[161Ho]": 1, + "[167Tm]": 1, + "[173Lu]": 1, + "[179Ta]": 1, + "[171Er]": 1, + "[49Sc]": 1, + "[49V]": 1, + "[51Mn]": 1, + "[90Nb]": 1, + "[88Nb]": 1, + "[88Zr]": 1, + "[36SH2]": 1, + 
"[174Yb]": 1, + "[178Lu]": 1, + "[179W]": 1, + "[83BrH]": 1, + "[107Cd]": 1, + "[75BrH]": 1, + "[62Co]": 1, + "[48Cr]": 1, + "[63Zn]": 1, + "[102Ag]": 1, + "[154Sm]": 1, + "[168Er]": 1, + "[65Ni]": 1, + "[137La]": 1, + "[187Ir]": 1, + "[144Pm]": 1, + "[146Pm]": 1, + "[160Gd]": 1, + "[166Yb]": 1, + "[162Dy]": 1, + "[47V]": 1, + "[141Nd]": 1, + "[141Sm]": 1, + "[166Er]": 1, + "[150Sm]": 1, + "[146Eu]": 1, + "[149Eu]": 1, + "[174Lu]": 1, + "[17NH3]": 1, + "[102Ru]": 1, + "[170Hf]": 1, + "[188Pt]": 1, + "[61Ni]": 1, + "[56Ni]": 1, + "[149Gd]": 1, + "[151Gd]": 1, + "[141Pm]": 1, + "[147Gd]": 1, + "[146Gd]": 1, + "[161Er]": 1, + "[103Ag]": 1, + "[145Eu]": 1, + "[153Tb]": 1, + "[155Dy]": 1, + "[184Re]": 1, + "[180Os]": 1, + "[182Os]": 1, + "[186Pt]": 1, + "[181Os]": 1, + "[181Re]": 1, + "[151Tb]": 1, + "[178Ta]": 1, + "[178W]": 1, + "[189Pt]": 1, + "[194Hg]": 1, + "[145Sm]": 1, + "[150Tb]": 1, + "[132La]": 1, + "[158Gd]": 1, + "[104Ag]": 1, + "[193Hg]": 1, + "[94Ru]": 1, + "[137Pr]": 1, + "[155Ho]": 1, + "[117Cd]": 1, + "[99Ru]": 1, + "[146Nd]": 1, + "[218Rn]": 1, + "[95Y]": 1, + "[79Kr]": 1, + "[120IH]": 1, + "[138Pr]": 1, + "[100Pd]": 1, + "[166Tm]": 1, + "[90Mo]": 1, + "[151Nd]": 1, + "[231U]": 1, + "[138Nd]": 1, + "[89Nb]": 1, + "[98Nb]": 1, + "[162Ho]": 1, + "[142Sm]": 1, + "[186Ta]": 1, + "[104Tc]": 1, + "[184Ta]": 1, + "[185Ta]": 1, + "[170Er]": 1, + "[107Rh]": 1, + "[131La]": 1, + "[169Lu]": 1, + "[74BrH]": 1, + "[150Pm]": 1, + "[172Tm]": 1, + "[197Pt]": 1, + "[230Pu]": 1, + "[170Lu]": 1, + "[86Zr]": 1, + "[176W]": 1, + "[177W]": 1, + "[101Pd]": 1, + "[105Pd]": 1, + "[108Pd]": 1, + "[149Nd]": 1, + "[164Ho]": 1, + "[159Ho]": 1, + "[167Ho]": 1, + "[176Yb]": 1, + "[156Sm]": 1, + "[77BrH]": 1, + "[189Re]": 1, + "[99Rh]": 1, + "[100Rh]": 1, + "[151Pm]": 1, + "[232Pa]": 1, + "[228Pa]": 1, + "[230Pa]": 1, + "[66Ni]": 1, + "[194Os]": 1, + "[135La]": 1, + "[138La]": 1, + "[141La]": 1, + "[142La]": 1, + "[195Ir]": 1, + "[96Nb]": 1, + "[157Ho]": 1, + "[183Hf]": 1, + 
"[162Tm]": 1, + "[172Er]": 1, + "[148Eu]": 1, + "[150Eu]": 1, + "[15CH4]": 1, + "[89Kr]": 1, + "[143La]": 1, + "[58Ni]": 1, + "[61Co]": 1, + "[158Eu]": 1, + "[165Er]": 1, + "[167Yb]": 1, + "[173Tm]": 1, + "[175Tm]": 1, + "[172Hf]": 1, + "[172Lu]": 1, + "[93Tc]": 1, + "[177Yb]": 1, + "[124IH]": 1, + "[194Ir]": 1, + "[147Eu]": 1, + "[101Mo]": 1, + "[180Hf]": 1, + "[189Ir]": 1, + "[87Y]": 1, + "[43Sc]": 1, + "[195Au]": 1, + "[112Ag]": 1, + "[84BrH]": 1, + "[106Ag]": 1, + "[109Ag]": 1, + "[101Rh]": 1, + "[162Yb]": 1, + "[228Rn]": 1, + "[139Pr]": 1, + "[94Y]": 1, + "[201Au]": 1, + "[40PH3]": 1, + "[110Ag+]": 1, + "[104Cd]": 1, + "[133Ba+2]": 1, + "[226Ac]": 1, + "[145Gd]": 1, + "[186Ir]": 1, + "[184Ir]": 1, + "[224Rn]": 1, + "[185Ir]": 1, + "[182Ir]": 1, + "[184Hf]": 1, + "[200Pt]": 1, + "[227Pa]": 1, + "[178Yb]": 1, + "[72Br-]": 1, + "[72BrH]": 1, + "[248Am]": 1, + "[238Th]": 1, + "[161Gd]": 1, + "[9C-]": 1, + "[66Cu]": 1, + "[Hf+]": 1, + "[16O+]": 1, + "[127Cs+]": 1, + "[135Cs+]": 1, + "[45K+]": 1, + "[125Cs+]": 1, + "[88Rb+]": 1, + "[130Cs+]": 1, + "[138Cs+]": 1, + "[79Rb+]": 1, + "[129Cs+]": 1, + "[83Rb+]": 1, + "[89Rb+]": 1, + "[FeH6-4]": 1, + "[BH+]": 1, + "[17FH]": 1, + "[71Se]": 1, + "[157Sm]": 1, + "[148Tb]": 1, + "[164Dy]": 1, + "[15OH2]": 1, + "[15O+]": 1, + "[39K]": 1, + "[40Ar]": 1, + "[50Cr+3]": 1, + "[50Cr]": 1, + "[52Ti]": 1, + "[103Pd+2]": 1, + "[130Ba]": 1, + "[142Pm]": 1, + "[Po@]": 1, + "[AuH3]": 1, + "[72Se]": 1, + "[95Tc]": 1, + "[121Sn+2]": 1, + "[211Rn]": 1, + "[38SH2]": 1, + "[127IH]": 1, + "[SrH2]": 1, + "[55Fe]": 1, + "[233U]": 1, + "[230Th]": 1, + "[228Th]": 1, + "[222Rn]": 1, + "[192Ir]": 1, + "[133Xe]": 1, + "[81Kr]": 1, + "[95Zr]": 1, + "[240Pu]": 1, + "[54Mn]": 1, + "[103Ru]": 1, + "[95Nb]": 1, + "[109Cd]": 1, + "[141Ce]": 1, + "[85Kr]": 1, + "[110Ag]": 1, + "[58Co]": 1, + "[241Pu]": 1, + "[234Th]": 1, + "[140La]": 1, + "[63Ni]": 1, + "[152Eu]": 1, + "[132IH]": 1, + "[226Rn]": 1, + "[154Eu]": 1, + "[36ClH]": 1, + "[228Ac]": 1, + 
"[155Eu]": 1, + "[106Rh]": 1, + "[243Am]": 1, + "[227Ac]": 1, + "[243Cm]": 1, + "[236U]": 1, + "[144Pr]": 1, + "[232U]": 1, + "[32SH2]": 1, + "[88Y]": 1, + "[82BrH]": 1, + "[135IH]": 1, + "[242Cm]": 1, + "[115Cd]": 1, + "[242Pu]": 1, + "[46Sc]": 1, + "[41Ca+2]": 1, + "[62Cu+]": 1, + "[64Cu+]": 1, + "[61Cu+]": 1, + "[37SH2]": 1, + "[30Si]": 1, + "[28Al]": 1, + "[19OH2]": 1, + "[8He]": 1, + "[6He]": 1, + "[153Pm]": 1, + "[209Bi]": 1, + "[66Zn+2]": 1, + "[191Ir]": 1, + "[UH3]": 1, + "[S@@H]": 1, + "[RuH-]": 1, + "[ClH3+3]": 1, + "[NiH2+2]": 1, + "[122Te]": 1, + "[156Gd]": 1, + "[112Cd]": 1, + "[116Sn]": 1, + "[120Sn]": 1, + "[Sn@@]": 1, + "[118Pd+2]": 1, + "[118Pd]": 1, + "[PtH4]": 1, + "[VH2]": 1, + "[84Sr]": 1, + "[UH2]": 1, + "[198Hg]": 1, + "[52Mn+2]": 1, + "[64Ga]": 1, + "[181Ta]": 1, + "[BiH+]": 1, + "[12NH3]": 1, + "[62Ga]": 1, + "[39Ar]": 1, + "[144Sm]": 1, + "[58Fe]": 1, + "[153Eu]": 1, + "[85Rb]": 1, + "[171Yb]": 1, + "[172Yb]": 1, + "[114Cd]": 1, + "[51Fe]": 1, + "[142Ce]": 1, + "[207Tl]": 1, + "[92Mo]": 1, + "[115Sn]": 1, + "[140Ce]": 1, + "[202Hg]": 1, + "[180W]": 1, + "[182W]": 1, + "[183W]": 1, + "[184W]": 1, + "[96Mo]": 1, + "[47Ti]": 1, + "[111Cd]": 1, + "[143Nd]": 1, + "[145Nd]": 1, + "[126Te]": 1, + "[128Te]": 1, + "[130Te]": 1, + "[185Re]": 1, + "[97Mo]": 1, + "[98Mo]": 1, + "[183Re]": 1, + "[52V]": 1, + "[80Se]": 1, + "[87Kr]": 1, + "[137Xe]": 1, + "[196Au]": 1, + "[146Ce]": 1, + "[88Kr]": 1, + "[51Ti]": 1, + "[138Xe]": 1, + "[37S]": 1, + "[38S]": 1, + "[30PH3]": 1, + "[19CH3]": 1, + "[14OH2]": 1, + "[17NH4+]": 1, + "[32PH2]": 1, + "[13B]": 1, + "[216Bi]": 1, + "[117Sn+2]": 1, + "[P@H+]": 1, + "[MoH3]": 1, + "[186W]": 1, + "[FeH4]": 1, + "[HgH2]": 1, + "[86Tc]": 1, + "[141Pr+3]": 1, + "[141Pr]": 1, + "[Ge@H]": 1, + "[204Hg]": 1, + "[ThH4]": 1, + "[WH6]": 1, + "[218AtH]": 1, + "[67Zn+2]": 1, + "[65Cu+2]": 1, + "[110Te]": 1, + "[58Fe+3]": 1, + "[142Nd]": 1, + "[38K]": 1, + "[198Au+3]": 1, + "[122IH]": 1, + "[38PH3]": 1, + "[130I-]": 1, + "[40K+]": 
1, + "[38K+]": 1, + "[28Mg+2]": 1, + "[58Co+3]": 1, + "[229Rn]": 1, + "[218Pb]": 1, + "[167Tm+3]": 1, + "[100Tc+5]": 1, + "[110Cd]": 1, + "[16CH2]": 1, + "[100Mo]": 1, + "[208Tl+]": 1, + "[18CH3]": 1, + "[135I]": 1, + "[13OH2]": 1, + "[192Bi]": 1, + "[194Bi]": 1, + "[196Bi]": 1, + "[161Tb+3]": 1, + "[197Hg+2]": 1, + "[191Os+4]": 1, + "[170Tm+3]": 1, + "[233U+4]": 1, + "[106Cd]": 1, + "[122Sn]": 1, + "[132I-]": 1, + "[83Sr+2]": 1, + "[169Er+3]": 1, + "[122I-]": 1, + "[120I-]": 1, + "[92Sr+2]": 1, + "[126I-]": 1, + "[126Sb+3]": 1, + "[127Sb+3]": 1, + "[136Eu+3]": 1, + "[136Eu]": 1, + "[125Sn+4]": 1, + "[175Yb+3]": 1, + "[94Tc+7]": 1, + "[86Y+3]": 1, + "[135I-]": 1, + "[15O-2]": 1, + "[151Eu+3]": 1, + "[28SiH3]": 1, + "[35S-]": 1, + "[15NH-]": 1, + "[91Y+3]": 1, + "[106Ru+3]": 1, + "[80Br-]": 1, + "[CeH3]": 1, + "[37ClH]": 1, + "[21NH3]": 1, + "[18OH3+]": 1, + "[17B]": 1, + "[RuH5]": 1, + "[35S-2]": 1, + "[17OH-]": 1, + "[212Ra]": 1, + "[75Br-]": 1, + "[79Br-]": 1, + "[113Ag]": 1, + "[23Na]": 1, + "[34Cl-]": 1, + "[34ClH]": 1, + "[38Cl-]": 1, + "[56Fe]": 1, + "[77Br-]": 1, + "[90Zr+4]": 1, + "[90Zr]": 1, + "[102Pd]": 1, + "[154Eu+3]": 1, + "[57Mn]": 1, + "[165Tm]": 1, + "[23Na+]": 1, + "[27Al+3]": 1, + "[31P-3]": 1, + "[34S-2]": 1, + "[39K+]": 1, + "[59Co+3]": 1, + "[60Ni+2]": 1, + "[75As+3]": 1, + "[75As+5]": 1, + "[82Se-2]": 1, + "[82Se+4]": 1, + "[82Se+6]": 1, + "[88Sr+2]": 1, + "[111Cd+2]": 1, + "[107Ag]": 1, + "[12CH-]": 1, + "[12N+]": 1, + "[Se@@H]": 1, + "[BiH4]": 1, + "[203Pb+2]": 1, + "[68Ge+4]": 1, + "[44Sc+3]": 1, + "[89Zr+3]": 1, + "[55Mn]": 1, + "[75As]": 1, + "[152Dy]": 1, + "[217At]": 1, + "[AsH5]": 1, + "[TeH+]": 1, + "[151Eu]": 1, + "[103Rh]": 1, + "[124Xe]": 1, + "[152Tb]": 1, + "[20Ne]": 1, + "[52Fe]": 1, + "[94Zr+4]": 1, + "[94Zr]": 1, + "[149Pr]": 1, + "[53Cr+6]": 1, + "[53Cr]": 1, + "[81Br-]": 1, + "[112Pd]": 1, + "[125Xe]": 1, + "[155Gd]": 1, + "[157Gd]": 1, + "[168Yb]": 1, + "[184Os]": 1, + "[166Tb]": 1, + "[221Fr]": 1, +} + 
def _pad(
    texts: list[list[str]], max_sequence_length: int, padding_token: str = ""
) -> list[list[str]]:
    """Right-pad every token list in ``texts`` to ``max_sequence_length``.

    Parameters
    ----------
    texts : list[list[str]]
        Token lists to pad. An empty batch is returned unchanged.
    max_sequence_length : int
        Target length of every padded token list.
    padding_token : str, optional
        Token appended to the shorter lists (default ``""``).

    Returns
    -------
    list[list[str]]
        New lists, each of length ``max_sequence_length``.

    Raises
    ------
    ValueError
        If any token list is longer than ``max_sequence_length``.
    """
    # ``default=0`` keeps an empty batch from crashing ``max``.
    longest = max(map(len, texts), default=0)
    if max_sequence_length < longest:
        raise ValueError(
            f"max_sequence_length ({max_sequence_length}) must be at least the length of the longest text ({longest})"
        )
    return [
        text + [padding_token] * (max_sequence_length - len(text)) for text in texts
    ]


class AbstractTokenizer:
    """Base class that turns text into a rectangular array of tokens.

    Subclasses implement ``_tokenize``; ``tokenize`` takes care of batching
    and padding.
    """

    def __init__(
        self, max_sequence_length: int | float | None = None, padding_token: str = ""
    ) -> None:
        """Store the padding configuration.

        Parameters
        ----------
        max_sequence_length : int | float | None, optional
            Length every tokenized sequence is padded to. ``None`` or
            ``np.inf`` means "pad to the longest sequence of each call".
        padding_token : str, optional
            Token used for padding (default ``""``).

        Raises
        ------
        ValueError
            If ``max_sequence_length`` is a float other than ``np.inf``.
        """
        # Raise explicitly instead of asserting: asserts vanish under ``-O``.
        if isinstance(max_sequence_length, float) and max_sequence_length != np.inf:
            raise ValueError("max_sequence_length must be np.inf if float")

        self.max_sequence_length = max_sequence_length
        self.padding_token = padding_token

    def _tokenize(self, texts: str | list[str]) -> list[str] | list[list[str]]:
        """Split ``texts`` into tokens; must be overridden by subclasses."""
        raise NotImplementedError

    def tokenize(self, texts: str | list[str]) -> np.ndarray:
        """Tokenize ``texts`` and pad the result into a 2D array.

        Parameters
        ----------
        texts : str | list[str]
            A single text or a batch of texts.

        Returns
        -------
        np.ndarray
            Array with one row per text and one column per token position.

        Raises
        ------
        ValueError
            If a tokenized text is longer than ``max_sequence_length``.
        """
        raw = self._tokenize(texts)

        if raw and isinstance(raw[0], list):
            # ``_tokenize`` produced one token list per text.
            batch = raw
        elif raw or isinstance(texts, str):
            # A flat token list (possibly empty, e.g. for "") is one sequence.
            batch = [raw]
        else:
            # Empty batch: nothing to tokenize.
            batch = []

        unbounded = self.max_sequence_length is None or np.isinf(
            self.max_sequence_length
        )
        if unbounded:
            target_length = max(map(len, batch), default=0)
        else:
            target_length = self.max_sequence_length

        padded = _pad(batch, target_length, padding_token=self.padding_token)

        if not padded or target_length == 0:
            # np.array would infer a float dtype (and a 1D shape for an empty
            # batch) here; build the empty string array explicitly instead.
            return np.empty((len(padded), target_length), dtype=str)

        return np.array(padded)
class SMILESTokenizer(AbstractTokenizer):
    """Regex-based tokenizer for SMILES strings."""

    def __init__(
        self, max_sequence_length: int | float | None = None, padding_token: str = ""
    ) -> None:
        """Tokenizer for SMILES strings, based on DeepChem's basic SMILES tokenizer.

        Parameters
        ----------
        max_sequence_length : int | float | None, optional
            Forwarded to ``AbstractTokenizer``.
        padding_token : str, optional
            Token used to pad shorter sequences (default ``""``).
        """
        # DeepChem's SMILES tokenizer
        SMI_REGEX_PATTERN = r"""(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|\#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"""

        self.REGEX_FOR_SMILES = re.compile(SMI_REGEX_PATTERN)
        super().__init__(max_sequence_length, padding_token)

    def _tokenize(self, texts: str | list[str]) -> list[str] | list[list[str]]:
        """Split SMILES strings into tokens padded to a common length.

        Parameters
        ----------
        texts : str | list[str]
            One SMILES string or a batch of them.

        Returns
        -------
        list[str] | list[list[str]]
            A flat token list when exactly one string was tokenized,
            otherwise one token list per string, all of equal length.
        """
        smiles = [texts] if isinstance(texts, str) else texts

        # ``findall`` already returns a list of the matched tokens; any text
        # the pattern does not match is skipped.
        tokens = [self.REGEX_FOR_SMILES.findall(smile) for smile in smiles]

        # Pad with the configured token (not a hard-coded "") so a custom
        # ``padding_token`` is respected; ``default=0`` tolerates an empty batch.
        longest = max(map(len, tokens), default=0)
        tokens = [
            sequence + [self.padding_token] * (longest - len(sequence))
            for sequence in tokens
        ]

        if len(tokens) == 1:
            return tokens[0]

        return tokens
def test_character_tokenizer_on_single_sequence():
    """A single string is split into its individual characters."""
    residue = AMINO_ACIDS[0]
    sequence = residue * 10

    tokens = CharacterTokenizer(max_sequence_length=10).tokenize(sequence)

    expected = np.array(list(residue * 10))
    assert (tokens == expected).all()


def test_character_tokenizer_on_multiple_sequences_of_same_length():
    """A batch of equally long strings is split row by row."""
    residue = AMINO_ACIDS[0]
    batch = [residue * 10 for _ in range(5)]

    tokens = CharacterTokenizer(max_sequence_length=10).tokenize(batch)

    expected = np.array([list(residue * 10) for _ in range(5)])
    assert (tokens == expected).all()


@pytest.mark.parametrize("max_sequence_length", [np.inf, 15, None])
def test_character_tokenizer_on_multiple_sequences_of_varying_length(
    max_sequence_length,
):
    """Shorter strings are padded up to the longest one (15 characters)."""
    residue = AMINO_ACIDS[0]
    batch = [residue * length for length in (10, 5, 15, 3, 8)]

    tokenizer = CharacterTokenizer(max_sequence_length=max_sequence_length)
    tokens = tokenizer.tokenize(batch)

    assert tokens.shape == (5, 15)
    # The first string has 10 characters, so position 11 holds padding.
    assert tokens[0, 11] == tokenizer.padding_token


def test_character_tokenizer_outputs_error_on_wrong_max_sequence():
    """A string longer than ``max_sequence_length`` raises ``ValueError``."""
    residue = AMINO_ACIDS[0]
    batch = [residue * length for length in (10, 5, 15, 3, 8)]

    tokenizer = CharacterTokenizer(max_sequence_length=10)

    with pytest.raises(ValueError):
        _ = tokenizer.tokenize(batch)
# (input string, expected tokens) pairs for the single-sequence SMILES test.
_SMILES_CASES = [
    ("CCO", ["C", "C", "O"]),
    ("C1=CC=CC=C1", ["C", "1", "=", "C", "C", "=", "C", "C", "=", "C", "1"]),
    # TODO: add more
]

# (input string, expected tokens) pairs for the single-sequence SELFIES test.
_SELFIES_CASES = [
    ("[C][C][O]", ["[C]", "[C]", "[O]"]),
    (
        "[C][=C][C][=C][C][=C][Ring1][=Branch1]",
        ["[C]", "[=C]", "[C]", "[=C]", "[C]", "[=C]", "[Ring1]", "[=Branch1]"],
    ),
]


@pytest.mark.parametrize("smile, expected_tokens", _SMILES_CASES)
def test_smiles_tokenizer_on_single_sequence(smile: str, expected_tokens: list[str]):
    """One SMILES string is split into the expected tokens."""
    result = SMILESTokenizer(max_sequence_length=np.inf).tokenize(smile)

    expected = np.array(expected_tokens)
    assert (result == expected).all()


@pytest.mark.parametrize("selfie, expected_tokens", _SELFIES_CASES)
def test_selfies_tokenizer_on_single_sequence(selfie: str, expected_tokens: list[str]):
    """One SELFIES string is split into the expected tokens."""
    result = SELFIESTokenizer(max_sequence_length=np.inf).tokenize(selfie)

    expected = np.array(expected_tokens)
    assert (result == expected).all()


def test_smiles_on_sequences_of_varying_length():
    """A SMILES batch is padded to its longest member (11 tokens)."""
    batch = ["CCO", "C1=CC=CC=C1", "CC(C)C"] + ["CC(C)(C)C"] * 2

    tokenizer = SMILESTokenizer(max_sequence_length=np.inf)
    result = tokenizer.tokenize(batch)

    assert result.shape == (5, 11)
    pad = tokenizer.padding_token
    # "CCO" has 3 tokens and "CC(C)(C)C" has 9, so these cells are padding.
    assert result[0, 3] == pad
    assert result[4, 9] == pad


def test_selfies_on_sequences_of_varying_length():
    """A SELFIES batch is padded to its longest member (8 tokens)."""
    batch = [
        "[C][C][O]",
        "[C][=C][C][=C][C][=C][Ring1][=Branch1]",
    ]

    tokenizer = SELFIESTokenizer(max_sequence_length=np.inf)
    result = tokenizer.tokenize(batch)

    assert result.shape == (2, 8)
    # The first molecule has only 3 tokens, so position 4 is padding.
    assert result[0, 4] == tokenizer.padding_token