import re
from collections import defaultdict

# A parenthesized group with no nested parentheses, optionally followed by
# an integer multiplier, e.g. "(OH)2" or "(OH)".
_GROUP_PATTERN = re.compile(r"\(([A-Za-z0-9]+)\)(\d*)")
# An element symbol (capital letter plus optional lowercase letters),
# optionally followed by an integer count, e.g. "Fe", "Cl3".
_ELEMENT_PATTERN = re.compile(r"([A-Z][a-z]*)(\d*)")


def _expand_formula(formula):
    """Expand parenthesized groups in a chemical formula.

    Each group such as ``(OH)2`` is replaced by its contents repeated
    by the multiplier (``OHOH``). A group without a multiplier is
    treated as having a multiplier of 1. Nested groups are resolved
    from the innermost outwards by repeating the substitution until the
    formula stops changing.

    Parameters
    ----------
    formula : str
        The chemical formula, possibly containing parenthesized groups.

    Returns
    -------
    str
        The formula with every expandable group written out.

    Raises
    ------
    ValueError
        If both "(" and ")" are still present once no further group can
        be expanded (e.g. an empty group "()" or a group containing
        characters other than letters and digits).
    """

    def _repeat_group(match):
        # An absent multiplier captures as "", which means "once".
        return match.group(1) * int(match.group(2) or 1)

    # Iterate to a fixed point rather than "while parentheses remain":
    # the latter never terminates when a group cannot be matched.
    while True:
        expanded = _GROUP_PATTERN.sub(_repeat_group, formula)
        if expanded == formula:
            break
        formula = expanded

    if "(" in formula and ")" in formula:
        raise ValueError(
            f"Cannot expand parenthesized group in formula: {formula!r}"
        )
    return formula


def to_hill_notation(formula):
    """Convert a chemical formula to Hill notation.

    The process is the following:
    1. Expand parenthesized groups, and parse the expanded formula
       into a dictionary of elements and their counts.
       e.g., "H2O" -> {"H": 2, "O": 1}.
    2. Apply Hill notation:
        - If carbon (C) is present, it comes first, followed by
          hydrogen (H) if present, then all remaining elements in
          alphabetical order.
        - If carbon is absent, all elements, including hydrogen,
          are listed in alphabetical order.
    3. Format the elements with their counts, omitting counts of 1.

    Parameters
    ----------
    formula : str
        The chemical formula of the material.

    Returns
    -------
    str
        The formula formatted in Hill notation,
        with elements separated by spaces (e.g., "C6 H12 O6").

    Raises
    ------
    ValueError
        If the formula contains a parenthesized group that cannot be
        expanded.

    Examples
    --------
    >>> to_hill_notation("CH3COOH")
    'C2 H4 O2'
    >>> to_hill_notation("HCl")
    'Cl H'
    >>> to_hill_notation("Mg(OH)2")
    'H2 Mg O2'
    """
    element_counts = defaultdict(int)
    for element, count in _ELEMENT_PATTERN.findall(_expand_formula(formula)):
        element_counts[element] += int(count) if count else 1

    # Hydrogen is promoted to the front only alongside carbon; without
    # carbon it is sorted alphabetically like every other element.
    if "C" in element_counts:
        leading = [symbol for symbol in ("C", "H") if symbol in element_counts]
    else:
        leading = []
    trailing = sorted(
        symbol for symbol in element_counts if symbol not in leading
    )

    hill_parts = []
    for symbol in leading + trailing:
        count = element_counts[symbol]
        hill_parts.append(f"{symbol}{count if count > 1 else ''}")
    return " ".join(hill_parts)