Skip to content

Commit e3a69c6

Browse files
authored
Merge pull request #4 from SCDH/file_reading
2 parents b19646c + bf31f25 commit e3a69c6

5 files changed

Lines changed: 58 additions & 2 deletions

File tree

LICENSE

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
Copyright (c) 2026 Mirko Westermeier
1+
Copyright (c) 2026 Mirko Westermeier, Katharina Dietz
22

33
Permission is hereby granted, free of charge, to any person obtaining
44
a copy of this software and associated documentation files (the

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ Please note that this project has a [Code of Conduct](CODE_OF_CONDUCT.md).
7575

7676
## Copyright and License
7777

78-
Copyright (c) 2026 [Mirko Westermeier][gh-memowe] (SCDH, University of Münster)
78+
Copyright (c) 2026 [Mirko Westermeier][gh-memowe], [Katharina Dietz][gh-kdietzm] (SCDH, University of Münster)
7979

8080
Released under the [MIT License](LICENSE).
8181

@@ -90,3 +90,4 @@ Released under the [MIT License](LICENSE).
9090
[gh-issues]: https://github.com/SCDH/pygexml/issues
9191
[gh-prs]: https://github.com/SCDH/pygexml/pulls
9292
[gh-memowe]: https://github.com/memowe
93+
[gh-kdietzm]: https://github.com/KDietzM

pygexml/page.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from pathlib import Path
12
from re import Pattern, compile
23
from warnings import warn
34
from dataclasses import dataclass
@@ -179,6 +180,12 @@ def from_xml_string(cls, xml_str: str) -> "Page":
179180
raise PageXMLError("Page: no page element found")
180181
return cls.from_xml(page_element)
181182

183+
@classmethod
184+
def from_xml_file(cls, file: Path | str, encoding: str = "utf-8") -> "Page":
185+
path = Path(file)
186+
xml_string = path.read_text(encoding=encoding)
187+
return Page.from_xml_string(xml_string)
188+
182189
def lookup_region(self, id: ID) -> TextRegion | None:
183190
return self.regions.get(id)
184191

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ license = "MIT"
1111
requires-python = ">=3.12"
1212
authors = [
1313
{ name = "Mirko Westermeier", email = "mirko.westermeier@uni-muenster.de" },
14+
{ name = "Katharina Dietz", email = "katharina.dietz@uni-muenster.de" },
1415
]
1516
dependencies = [
1617
"lxml",

test/test_page.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
1+
from pathlib import Path
2+
13
import pytest
24
from hypothesis import given, assume
35
import hypothesis.strategies as st
6+
47
from lxml import etree
58

69
from pygexml.strategies import *
@@ -349,6 +352,50 @@ def test_page_from_string() -> None:
349352
}
350353

351354

355+
def test_from_xml_file_example(tmp_path: Path) -> None:
    """Write a small PAGE XML document to disk and load it via from_xml_file."""
    xml_source = """<?xml version='1.0' encoding='utf-8'?>
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd">
<Metadata>
<Creator>God</Creator>
<Created>Sonntag</Created>
</Metadata>
<Page imageFilename="a.jpg" imageWidth="4217" imageHeight="1742">
<TextRegion id="b">
<Coords points="1,2 3,4"/>
<TextLine id="c" index="0" custom="heights_v2:[91.0,32.1]">
<Coords points="5,6 7,8"/>
<Baseline points="2008,360 2208,352"/>
<TextEquiv conf="0.932">
<Unicode>d</Unicode>
</TextEquiv>
</TextLine>
</TextRegion>
</Page>
</PcGts>
"""

    target = tmp_path / "test.xml"
    target.write_text(xml_source, encoding="utf-8")

    page = Page.from_xml_file(target)
    assert page.image_filename == "a.jpg"

    # The single region "b" holds exactly one text line "c" with text "d".
    expected_line = TextLine(id="c", coords=Coords.parse("5,6 7,8"), text="d")
    expected_regions = {
        "b": TextRegion(
            id="b",
            coords=Coords.parse("1,2 3,4"),
            textlines={"c": expected_line},
        )
    }
    assert page.regions == expected_regions

    # Loading from the file must agree with parsing the same text directly.
    assert page == Page.from_xml_string(xml_source)
390+
391+
392+
def test_from_missing_xml_file(tmp_path: Path) -> None:
    """Loading a path that does not exist raises FileNotFoundError."""
    absent = tmp_path / "does_not_exist.xml"
    # Guard the precondition: nothing has created this file.
    assert not absent.exists()
    with pytest.raises(FileNotFoundError):
        Page.from_xml_file(absent)
397+
398+
352399
@given(st_text_regions, st_pages())
353400
def test_page_region_lookup(region: TextRegion, page: Page) -> None:
354401
assume(region.id not in page.regions)

0 commit comments

Comments
 (0)