Skip to content

Commit 9fc5520

Browse files
authored
Merge pull request #16 from wladerer/feat/lxml-fast-xml-parsing
Switch XML parsing to lxml and pass explicit tags to iterparse to reduce callback overhead.
2 parents 087a3f9 + 99e7847 commit 9fc5520

4 files changed

Lines changed: 143 additions & 22 deletions

File tree

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ dependencies = [
7474
"uncertainties>=3.1",
7575
"plotly>=6.0",
7676
"joblib>=1.3.2",
77+
"lxml>=4.9",
7778
"bibtexparser",
7879
"tabulate>=0.9.0",
7980
"tqdm>=4.67.3",

src/pymatgen/io/vasp/outputs.py

Lines changed: 36 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,10 @@
1616
from io import BytesIO
1717
from pathlib import Path
1818
from typing import TYPE_CHECKING, Any, cast
19-
from xml.etree import ElementTree as ET
2019

2120
import numpy as np
2221
import orjson
22+
from lxml import etree as ET
2323
from monty.dev import requires
2424
from monty.io import reverse_readfile, zopen
2525
from monty.json import MSONable, jsanitize
@@ -55,8 +55,7 @@
5555
from typing import Literal, Self, TypeAlias
5656

5757
# Avoid name conflict with pymatgen.core.Element
58-
from xml.etree.ElementTree import Element as XML_Element
59-
58+
from lxml.etree import _Element as XML_Element
6059
from numpy.typing import NDArray
6160

6261
from pymatgen.util.typing import Kpoint, PathLike
@@ -228,10 +227,8 @@ class BandgapProps(MSONable):
228227

229228
class Vasprun(MSONable):
230229
"""
231-
Vastly improved cElementTree-based parser for vasprun.xml files. Uses
232-
iterparse to support incremental parsing of large files.
233-
Speedup over Dom is at least 2x for smallish files (~1 Mb) to orders of
234-
magnitude for larger files (~10 Mb).
230+
Parser for vasprun.xml files. Uses lxml's iterparse restricted to an explicit list of tags to
231+
reduce callback overhead. Speedup over DOM parsing is 2-3x or more for files of 10-430 MB.
235232
236233
**VASP results**
237234
@@ -356,30 +353,31 @@ def __init__(
356353
self.separate_spins = separate_spins
357354
self.exception_on_bad_xml = exception_on_bad_xml
358355

359-
with zopen(filename, mode="rt", encoding="utf-8") as file:
360-
if ionic_step_skip or ionic_step_offset:
356+
if ionic_step_skip or ionic_step_offset:
357+
with zopen(filename, mode="rb") as file:
361358
# Remove parts of the xml file and parse the string
362-
content: str = file.read() # type:ignore[assignment]
363-
steps: list[str] = content.split("<calculation>")
359+
content: bytes = file.read()
360+
steps: list[bytes] = content.split(b"<calculation>")
364361

365362
# The text before the first <calculation> is the preamble!
366-
preamble: str = steps.pop(0)
363+
preamble: bytes = steps.pop(0)
367364
self.nionic_steps: int = len(steps)
368365
new_steps = steps[ionic_step_offset :: int(ionic_step_skip or 1)]
369366

370367
# Add the trailing information in the last step from the run
371-
to_parse: str = "<calculation>".join(new_steps)
368+
to_parse: bytes = b"<calculation>".join(new_steps)
372369
if steps[-1] != new_steps[-1]:
373-
to_parse = f"{preamble}<calculation>{to_parse}{steps[-1].split('</calculation>')[-1]}"
370+
to_parse = preamble + b"<calculation>" + to_parse + steps[-1].split(b"</calculation>")[-1]
374371
else:
375-
to_parse = f"{preamble}<calculation>{to_parse}"
372+
to_parse = preamble + b"<calculation>" + to_parse
376373
self._parse(
377-
BytesIO(to_parse.encode("utf-8")),
374+
BytesIO(to_parse),
378375
parse_dos=parse_dos,
379376
parse_eigen=parse_eigen,
380377
parse_projected_eigen=parse_projected_eigen,
381378
)
382-
else:
379+
else:
380+
with zopen(filename, mode="rb") as file:
383381
self._parse(
384382
file,
385383
parse_dos=parse_dos,
@@ -454,7 +452,25 @@ def _parse(
454452
# whether they are nested within another block. This is why we
455453
# must read both start and end tags and have flags to tell us
456454
# when we have entered or left a block. (2024-01-26)
457-
for event, elem in ET.iterparse(stream, events=["start", "end"]):
455+
_TAGS = [
456+
"atominfo",
457+
"calculation",
458+
"dielectricfunction",
459+
"dos",
460+
"dynmat",
461+
"eigenvalues",
462+
"eigenvalues_kpoints_opt",
463+
"energy",
464+
"generator",
465+
"incar",
466+
"kpoints",
467+
"parameters",
468+
"projected",
469+
"projected_kpoints_opt",
470+
"structure",
471+
"varray",
472+
]
473+
for event, elem in ET.iterparse(stream, events=["start", "end"], tag=_TAGS):
458474
tag = elem.tag
459475
if event == "start":
460476
# The start event tells us when we have entered blocks
@@ -603,7 +619,7 @@ def _parse(
603619
if "kinetic" in d:
604620
md_data[-1]["energy"] = {i.attrib["name"]: float(i.text) for i in elem.findall("i")} # type:ignore[arg-type]
605621

606-
except ET.ParseError:
622+
except ET.XMLSyntaxError:
607623
if self.exception_on_bad_xml:
608624
raise
609625
warnings.warn(
@@ -1843,7 +1859,7 @@ def __init__(
18431859
self.occu_tol = occu_tol
18441860
self.separate_spins = separate_spins
18451861

1846-
with zopen(filename, mode="rt", encoding="utf-8") as file:
1862+
with zopen(filename, mode="rb") as file:
18471863
self.efermi = None
18481864
parsed_header = False
18491865
in_kpoints_opt = False

tests/io/vasp/test_outputs.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,12 @@
55
import json
66
import logging
77
import os
8-
import xml
98
from pathlib import Path
109
from shutil import copyfile, copyfileobj
1110

1211
import numpy as np
1312
import pytest
13+
from lxml import etree as ET
1414
from monty.io import zopen
1515
from monty.shutil import decompress_file
1616
from numpy.testing import assert_allclose
@@ -142,7 +142,7 @@ def test_vasprun_with_more_than_two_unlabelled_dielectric_functions(self):
142142
assert "unlabelled" in vr.dielectric_data
143143

144144
def test_bad_vasprun(self):
145-
with pytest.raises(xml.etree.ElementTree.ParseError):
145+
with pytest.raises(ET.XMLSyntaxError):
146146
Vasprun(f"{VASP_OUT_DIR}/vasprun.bad.xml.gz")
147147

148148
with pytest.warns(

0 commit comments

Comments
 (0)