|
16 | 16 | from io import BytesIO |
17 | 17 | from pathlib import Path |
18 | 18 | from typing import TYPE_CHECKING, Any, cast |
19 | | -from xml.etree import ElementTree as ET |
20 | 19 |
|
21 | 20 | import numpy as np |
22 | 21 | import orjson |
| 22 | +from lxml import etree as ET |
23 | 23 | from monty.dev import requires |
24 | 24 | from monty.io import reverse_readfile, zopen |
25 | 25 | from monty.json import MSONable, jsanitize |
|
55 | 55 | from typing import Literal, Self, TypeAlias |
56 | 56 |
|
57 | 57 | # Avoid name conflict with pymatgen.core.Element |
58 | | - from xml.etree.ElementTree import Element as XML_Element |
59 | | - |
| 58 | + from lxml.etree import _Element as XML_Element |
60 | 59 | from numpy.typing import NDArray |
61 | 60 |
|
62 | 61 | from pymatgen.util.typing import Kpoint, PathLike |
@@ -228,10 +227,8 @@ class BandgapProps(MSONable): |
228 | 227 |
|
229 | 228 | class Vasprun(MSONable): |
230 | 229 | """ |
231 | | - Vastly improved cElementTree-based parser for vasprun.xml files. Uses |
232 | | - iterparse to support incremental parsing of large files. |
233 | | - Speedup over Dom is at least 2x for smallish files (~1 Mb) to orders of |
234 | | - magnitude for larger files (~10 Mb). |
| 230 | + Parser for vasprun.xml files. Uses lxml's iterparse with an explicit tag filter to reduce |
| 231 | + callback overhead. Speedup over DOM is at least 2-3x for 10-430 MB files. |
235 | 232 |
|
236 | 233 | **VASP results** |
237 | 234 |
|
@@ -356,30 +353,31 @@ def __init__( |
356 | 353 | self.separate_spins = separate_spins |
357 | 354 | self.exception_on_bad_xml = exception_on_bad_xml |
358 | 355 |
|
359 | | - with zopen(filename, mode="rt", encoding="utf-8") as file: |
360 | | - if ionic_step_skip or ionic_step_offset: |
| 356 | + if ionic_step_skip or ionic_step_offset: |
| 357 | + with zopen(filename, mode="rb") as file: |
361 | 358 | # Remove parts of the XML file and parse the remaining bytes |
362 | | - content: str = file.read() # type:ignore[assignment] |
363 | | - steps: list[str] = content.split("<calculation>") |
| 359 | + content: bytes = file.read() |
| 360 | + steps: list[bytes] = content.split(b"<calculation>") |
364 | 361 |
|
365 | 362 | # The text before the first <calculation> is the preamble! |
366 | | - preamble: str = steps.pop(0) |
| 363 | + preamble: bytes = steps.pop(0) |
367 | 364 | self.nionic_steps: int = len(steps) |
368 | 365 | new_steps = steps[ionic_step_offset :: int(ionic_step_skip or 1)] |
369 | 366 |
|
370 | 367 | # Add the trailing information in the last step from the run |
371 | | - to_parse: str = "<calculation>".join(new_steps) |
| 368 | + to_parse: bytes = b"<calculation>".join(new_steps) |
372 | 369 | if steps[-1] != new_steps[-1]: |
373 | | - to_parse = f"{preamble}<calculation>{to_parse}{steps[-1].split('</calculation>')[-1]}" |
| 370 | + to_parse = preamble + b"<calculation>" + to_parse + steps[-1].split(b"</calculation>")[-1] |
374 | 371 | else: |
375 | | - to_parse = f"{preamble}<calculation>{to_parse}" |
| 372 | + to_parse = preamble + b"<calculation>" + to_parse |
376 | 373 | self._parse( |
377 | | - BytesIO(to_parse.encode("utf-8")), |
| 374 | + BytesIO(to_parse), |
378 | 375 | parse_dos=parse_dos, |
379 | 376 | parse_eigen=parse_eigen, |
380 | 377 | parse_projected_eigen=parse_projected_eigen, |
381 | 378 | ) |
382 | | - else: |
| 379 | + else: |
| 380 | + with zopen(filename, mode="rb") as file: |
383 | 381 | self._parse( |
384 | 382 | file, |
385 | 383 | parse_dos=parse_dos, |
@@ -454,7 +452,25 @@ def _parse( |
454 | 452 | # whether they are nested within another block. This is why we |
455 | 453 | # must read both start and end tags and have flags to tell us |
456 | 454 | # when we have entered or left a block. (2024-01-26) |
457 | | - for event, elem in ET.iterparse(stream, events=["start", "end"]): |
| 455 | + _TAGS = [ |
| 456 | + "atominfo", |
| 457 | + "calculation", |
| 458 | + "dielectricfunction", |
| 459 | + "dos", |
| 460 | + "dynmat", |
| 461 | + "eigenvalues", |
| 462 | + "eigenvalues_kpoints_opt", |
| 463 | + "energy", |
| 464 | + "generator", |
| 465 | + "incar", |
| 466 | + "kpoints", |
| 467 | + "parameters", |
| 468 | + "projected", |
| 469 | + "projected_kpoints_opt", |
| 470 | + "structure", |
| 471 | + "varray", |
| 472 | + ] |
| 473 | + for event, elem in ET.iterparse(stream, events=["start", "end"], tag=_TAGS): |
458 | 474 | tag = elem.tag |
459 | 475 | if event == "start": |
460 | 476 | # The start event tells us when we have entered blocks |
@@ -603,7 +619,7 @@ def _parse( |
603 | 619 | if "kinetic" in d: |
604 | 620 | md_data[-1]["energy"] = {i.attrib["name"]: float(i.text) for i in elem.findall("i")} # type:ignore[arg-type] |
605 | 621 |
|
606 | | - except ET.ParseError: |
| 622 | + except ET.XMLSyntaxError: |
607 | 623 | if self.exception_on_bad_xml: |
608 | 624 | raise |
609 | 625 | warnings.warn( |
@@ -1843,7 +1859,7 @@ def __init__( |
1843 | 1859 | self.occu_tol = occu_tol |
1844 | 1860 | self.separate_spins = separate_spins |
1845 | 1861 |
|
1846 | | - with zopen(filename, mode="rt", encoding="utf-8") as file: |
| 1862 | + with zopen(filename, mode="rb") as file: |
1847 | 1863 | self.efermi = None |
1848 | 1864 | parsed_header = False |
1849 | 1865 | in_kpoints_opt = False |
|
0 commit comments