diff --git a/ml_peg/analysis/conformers/solv_mpconf196/analyse_solv_mpconf196.py b/ml_peg/analysis/conformers/solv_mpconf196/analyse_solv_mpconf196.py new file mode 100644 index 00000000..29998060 --- /dev/null +++ b/ml_peg/analysis/conformers/solv_mpconf196/analyse_solv_mpconf196.py @@ -0,0 +1,164 @@ +""" +Analyse the solvMPCONF196 dataset of solvated biomolecule conformers. + +J. Comput. Chem. 2024, 45(7), 419. +https://doi.org/10.1002/jcc.27248. +""" + +from __future__ import annotations + +from pathlib import Path + +from ase import units +from ase.io import read, write +import pytest + +from ml_peg.analysis.utils.decorators import build_table, plot_parity +from ml_peg.analysis.utils.utils import build_d3_name_map, load_metrics_config, mae +from ml_peg.app import APP_ROOT +from ml_peg.calcs import CALCS_ROOT +from ml_peg.models.get_models import load_models +from ml_peg.models.models import current_models + +MODELS = load_models(current_models) +D3_MODEL_NAMES = build_d3_name_map(MODELS) + +KCAL_TO_EV = units.kcal / units.mol +EV_TO_KCAL = 1 / KCAL_TO_EV +CALC_PATH = CALCS_ROOT / "conformers" / "solv_mpconf196" / "outputs" +OUT_PATH = APP_ROOT / "data" / "conformers" / "solv_mpconf196" + +METRICS_CONFIG_PATH = Path(__file__).with_name("metrics.yml") +DEFAULT_THRESHOLDS, DEFAULT_TOOLTIPS, DEFAULT_WEIGHTS = load_metrics_config( + METRICS_CONFIG_PATH +) +MOLECULES = [ + "FGG", + "GFA", + "GGF", + "WG", + "WGG", + "CAMVES", + "CHPSAR", + "COHVAW", + "GS464992", + "GS557577", + "POXTRD", + "SANGLI", + "YIVNOG", +] + + +def labels() -> list: + """ + Get list of system names. + + Returns + ------- + list + List of all system names. + """ + for model_name in MODELS: + labels_list = [ + path.stem for path in sorted((CALC_PATH / model_name).glob("*.xyz")) + ] + break + return labels_list + + +@pytest.fixture +@plot_parity( + filename=OUT_PATH / "figure_solv_mpconf196.json", + title="Energies", + x_label="Predicted energy / eV", + y_label="Reference energy / eV", + hoverdata={ + "Labels": labels(), + }, +) +def conformer_energies() -> dict[str, list]: + """ + Get conformer energies for all systems. + + Returns + ------- + dict[str, list] + Dictionary of all reference and predicted energies. + """ + results = {"ref": []} | {mlip: [] for mlip in MODELS} + ref_stored = False + + for model_name in MODELS: + for label in labels(): + atoms = read(CALC_PATH / model_name / f"{label}.xyz") + results[model_name].append(atoms.info["model_rel_energy"]) + if not ref_stored: + results["ref"].append(atoms.info["ref_rel_energy"]) + + # Write structures for app + structs_dir = OUT_PATH / model_name + structs_dir.mkdir(parents=True, exist_ok=True) + write(structs_dir / f"{label}.xyz", atoms) + ref_stored = True + return results + + +@pytest.fixture +def get_mae(conformer_energies) -> dict[str, float]: + """ + Get mean absolute error for conformer energies. + + Parameters + ---------- + conformer_energies + Dictionary of reference and predicted conformer energies. + + Returns + ------- + dict[str, float] + Dictionary of predicted conformer energies errors for all models. + """ + results = {} + for model_name in MODELS: + results[model_name] = mae( + conformer_energies["ref"], conformer_energies[model_name] + ) + return results + + +@pytest.fixture +@build_table( + filename=OUT_PATH / "solv_mpconf196_metrics_table.json", + metric_tooltips=DEFAULT_TOOLTIPS, + thresholds=DEFAULT_THRESHOLDS, + mlip_name_map=D3_MODEL_NAMES, +) +def metrics(get_mae: dict[str, float]) -> dict[str, dict]: + """ + Get all metrics. + + Parameters + ---------- + get_mae + Mean absolute errors for all models. + + Returns + ------- + dict[str, dict] + Metric names and values for all models. + """ + return { + "MAE": get_mae, + } + + +def test_solv_mpconf196(metrics: dict[str, dict]) -> None: + """ + Run solvMPCONF196 test. + + Parameters + ---------- + metrics + All new benchmark metric names and dictionary of values for each model. + """ + return diff --git a/ml_peg/analysis/conformers/solv_mpconf196/metrics.yml b/ml_peg/analysis/conformers/solv_mpconf196/metrics.yml new file mode 100644 index 00000000..4eb3346e --- /dev/null +++ b/ml_peg/analysis/conformers/solv_mpconf196/metrics.yml @@ -0,0 +1,7 @@ +metrics: + MAE: + good: 0.0 + bad: 1.0 + unit: eV + tooltip: Mean Absolute Error for all systems + level_of_theory: CCSD(T) diff --git a/ml_peg/app/conformers/solv_mpconf196/app_solv_mpconf196.py b/ml_peg/app/conformers/solv_mpconf196/app_solv_mpconf196.py new file mode 100644 index 00000000..e387d3e2 --- /dev/null +++ b/ml_peg/app/conformers/solv_mpconf196/app_solv_mpconf196.py @@ -0,0 +1,90 @@ +"""Run solvMPCONF196 app.""" + +from __future__ import annotations + +from dash import Dash +from dash.html import Div + +from ml_peg.app import APP_ROOT +from ml_peg.app.base_app import BaseApp +from ml_peg.app.utils.build_callbacks import ( + plot_from_table_column, + struct_from_scatter, +) +from ml_peg.app.utils.load import read_plot +from ml_peg.models.get_models import get_model_names +from ml_peg.models.models import current_models + +MODELS = get_model_names(current_models) +BENCHMARK_NAME = "solvMPCONF196" +DOCS_URL = ( + "https://ddmms.github.io/ml-peg/user_guide/benchmarks/molecular.html#solvmpconf196" +) +DATA_PATH = APP_ROOT / "data" / "conformers" / "solv_mpconf196" + + +class SolvMPCONF196App(BaseApp): + """SolvMPCONF196 benchmark app layout and callbacks.""" + + def register_callbacks(self) -> None: + """Register callbacks to app.""" + scatter = read_plot( + DATA_PATH / "figure_solv_mpconf196.json", + id=f"{BENCHMARK_NAME}-figure", + ) + + model_dir = DATA_PATH / MODELS[0] + if model_dir.exists(): + labels = sorted([f.stem for f in model_dir.glob("*.xyz")]) + structs = [ + f"assets/conformers/solv_mpconf196/{MODELS[0]}/{label}.xyz" + for label in labels + ] + else: + structs = [] + + plot_from_table_column( + table_id=self.table_id, + plot_id=f"{BENCHMARK_NAME}-figure-placeholder", + column_to_plot={"MAE": scatter}, + ) + + struct_from_scatter( + scatter_id=f"{BENCHMARK_NAME}-figure", + struct_id=f"{BENCHMARK_NAME}-struct-placeholder", + structs=structs, + mode="struct", + ) + + +def get_app() -> SolvMPCONF196App: + """ + Get solvMPCONF196 benchmark app layout and callback registration. + + Returns + ------- + SolvMPCONF196App + Benchmark layout and callback registration. + """ + return SolvMPCONF196App( + name=BENCHMARK_NAME, + description=( + "Performance in predicting solvent-stabilized conformer energies for " + "the solvMPCONF196 dataset (13 biomolecular fragments with explicit " + "solvation). Reference data from CCSD(T) calculations." + ), + docs_url=DOCS_URL, + table_path=DATA_PATH / "solv_mpconf196_metrics_table.json", + extra_components=[ + Div(id=f"{BENCHMARK_NAME}-figure-placeholder"), + Div(id=f"{BENCHMARK_NAME}-struct-placeholder"), + ], + ) + + +if __name__ == "__main__": + full_app = Dash(__name__, assets_folder=DATA_PATH.parent.parent) + benchmark_app = get_app() + full_app.layout = benchmark_app.layout + benchmark_app.register_callbacks() + full_app.run(port=8069, debug=True) diff --git a/ml_peg/calcs/conformers/solv_mpconf196/.dvc/.gitignore b/ml_peg/calcs/conformers/solv_mpconf196/.dvc/.gitignore new file mode 100644 index 00000000..528f30c7 --- /dev/null +++ b/ml_peg/calcs/conformers/solv_mpconf196/.dvc/.gitignore @@ -0,0 +1,3 @@ +/config.local +/tmp +/cache diff --git a/ml_peg/calcs/conformers/solv_mpconf196/.dvc/config b/ml_peg/calcs/conformers/solv_mpconf196/.dvc/config new file mode 100644 index 00000000..e69de29b diff --git a/ml_peg/calcs/conformers/solv_mpconf196/.dvcignore b/ml_peg/calcs/conformers/solv_mpconf196/.dvcignore new file mode 100644 index 00000000..51973055 --- /dev/null +++ b/ml_peg/calcs/conformers/solv_mpconf196/.dvcignore @@ -0,0 +1,3 @@ +# Add patterns of files dvc should ignore, which could improve +# the performance. Learn more at +# https://dvc.org/doc/user-guide/dvcignore diff --git a/ml_peg/calcs/conformers/solv_mpconf196/calc_solv_mpconf196.py b/ml_peg/calcs/conformers/solv_mpconf196/calc_solv_mpconf196.py new file mode 100644 index 00000000..c087bc81 --- /dev/null +++ b/ml_peg/calcs/conformers/solv_mpconf196/calc_solv_mpconf196.py @@ -0,0 +1,191 @@ +""" +Calculate the solvMPCONF196 dataset of solvated biomolecule conformers. + +J. Comput. Chem. 2024, 45(7), 419. +https://doi.org/10.1002/jcc.27248. +""" + +from __future__ import annotations + +from pathlib import Path + +from ase import units +from ase.io import read, write +import mlipx +from mlipx.abc import NodeWithCalculator +import numpy as np +import pandas as pd +from tqdm import tqdm +import zntrack + +from ml_peg.calcs.utils.utils import chdir, download_s3_data +from ml_peg.models.get_models import load_models +from ml_peg.models.models import current_models + +MODELS = load_models(current_models) + +KCAL_TO_EV = units.kcal / units.mol +EV_TO_KCAL = 1 / KCAL_TO_EV + +OUT_PATH = Path(__file__).parent / "outputs" + +MOLECULES = [ + "FGG", + "GFA", + "GGF", + "WG", + "WGG", + "CAMVES", + "CHPSAR", + "COHVAW", + "GS464992", + "GS557577", + "POXTRD", + "SANGLI", + "YIVNOG", +] + + +class SolvMPCONF196Benchmark(zntrack.Node): + """Benchmark the solvMPCONF196 dataset.""" + + model: NodeWithCalculator = zntrack.deps() + model_name: str = zntrack.params() + + @staticmethod + def get_atoms(atoms_path): + """ + Read atoms object and add charge and spin. + + Parameters + ---------- + atoms_path + Path to atoms object. + + Returns + ------- + atoms + ASE atoms object. + """ + atoms = read(atoms_path) + atoms.info["charge"] = 0 + atoms.info["spin"] = 1 + return atoms + + def get_ref_energies(self, data_path): + """ + Get reference conformer energies. + + Parameters + ---------- + data_path + Path to the structure. + """ + df = pd.read_excel( + data_path / "Energies_CCSD(T).xlsx", + sheet_name="Total PNO-CCS(T) Energies solvM", + header=1, + ) + self.ref_energies = {} + for row in df.iterrows(): + label = row[1][0] + e_ref = float(row[1][1]) * units.Hartree + self.ref_energies[label] = e_ref + + def run(self): + """Run new benchmark.""" + data_path = ( + download_s3_data( + filename="SolvMPCONF196.zip", + key="inputs/conformers/SolvMPCONF196/SolvMPCONF196.zip", + ) + / "SolvMPCONF196" + ) + self.get_ref_energies(data_path) + # Read in data and attach calculator + calc = self.model.get_calculator() + # Add D3 calculator for this test + calc = self.model.add_d3_calculator(calc) + for molecule in tqdm(MOLECULES): + model_abs_energies = [] + ref_abs_energies = [] + current_molecule_labels = [] + for label, e_ref in self.ref_energies.items(): + molecule_label = label.split("_")[0] + conformer_label = label.split("_")[1] + if label[-1].isnumeric(): + xyz_label = f"{molecule_label}{conformer_label}" + else: + xyz_label = f"{molecule_label}_{conformer_label}" + if molecule != molecule_label: + continue + atoms = self.get_atoms( + data_path + / "solvMPCONF196_geometries/solvMPCONF196" + / xyz_label + / "struc.xyz" + ) + atoms.translate(-atoms.get_center_of_mass()) + atoms.calc = calc + model_abs_energies.append(atoms.get_potential_energy()) + ref_abs_energies.append(e_ref) + current_molecule_labels.append(label) + + for label, e_model in zip( + current_molecule_labels, model_abs_energies, strict=False + ): + molecule_label = label.split("_")[0] + conformer_label = label.split("_")[1] + if label[-1].isnumeric(): + xyz_label = f"{molecule_label}{conformer_label}" + else: + xyz_label = f"{molecule_label}_{conformer_label}" + if molecule != molecule_label: + continue + atoms = self.get_atoms( + data_path + / "solvMPCONF196_geometries/solvMPCONF196" + / xyz_label + / "struc.xyz" + ) + atoms.translate(-atoms.get_center_of_mass()) + atoms.info["ref_rel_energy"] = self.ref_energies[label] - np.mean( + ref_abs_energies + ) + atoms.info["model_rel_energy"] = e_model - np.mean(model_abs_energies) + + write_dir = OUT_PATH / self.model_name + write_dir.mkdir(parents=True, exist_ok=True) + write(write_dir / f"{label}.xyz", atoms) + + +def build_project(repro: bool = False) -> None: + """ + Build mlipx project. + + Parameters + ---------- + repro + Whether to call dvc repro -f after building. + """ + project = mlipx.Project() + benchmark_node_dict = {} + + for model_name, model in MODELS.items(): + with project.group(model_name): + benchmark = SolvMPCONF196Benchmark( + model=model, + model_name=model_name, + ) + benchmark_node_dict[model_name] = benchmark + + if repro: + with chdir(Path(__file__).parent): + project.repro(build=True, force=True) + else: + project.build() + + +def test_solv_mpconf196(): + """Run solvMPCONF196 benchmark via pytest.""" + build_project(repro=True)