|
| 1 | +from pathlib import Path |
| 2 | + |
1 | 3 | import pytest |
2 | 4 | from hypothesis import given, assume |
3 | 5 | import hypothesis.strategies as st |
| 6 | + |
4 | 7 | from lxml import etree |
5 | 8 |
|
6 | 9 | from pygexml.strategies import * |
@@ -349,6 +352,50 @@ def test_page_from_string() -> None: |
349 | 352 | } |
350 | 353 |
|
351 | 354 |
|
def test_from_xml_file_example(tmp_path: Path) -> None:
    """Load a small PAGE XML document from disk via ``Page.from_xml_file``.

    The document is written to a file below ``tmp_path``, loaded back, and
    the loaded page is compared with hand-built model objects as well as
    with the result of ``Page.from_xml_string`` on the very same text.
    """
    page_xml = """<?xml version='1.0' encoding='utf-8'?>
    <PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd">
        <Metadata>
            <Creator>God</Creator>
            <Created>Sonntag</Created>
        </Metadata>
        <Page imageFilename="a.jpg" imageWidth="4217" imageHeight="1742">
            <TextRegion id="b">
                <Coords points="1,2 3,4"/>
                <TextLine id="c" index="0" custom="heights_v2:[91.0,32.1]">
                    <Coords points="5,6 7,8"/>
                    <Baseline points="2008,360 2208,352"/>
                    <TextEquiv conf="0.932">
                        <Unicode>d</Unicode>
                    </TextEquiv>
                </TextLine>
            </TextRegion>
        </Page>
    </PcGts>
    """

    xml_path = tmp_path / "test.xml"
    xml_path.write_text(page_xml, encoding="utf-8")

    page = Page.from_xml_file(xml_path)
    assert page.image_filename == "a.jpg"

    # The expected objects are built from id, coords and text only; the
    # document's Metadata, Baseline, index, custom and conf values do not
    # appear in them.
    expected_line = TextLine(id="c", coords=Coords.parse("5,6 7,8"), text="d")
    expected_region = TextRegion(
        id="b",
        coords=Coords.parse("1,2 3,4"),
        textlines={"c": expected_line},
    )
    assert page.regions == {"b": expected_region}

    # Loading from a file must agree with parsing the same text directly.
    assert page == Page.from_xml_string(page_xml)
| 390 | + |
| 391 | + |
def test_from_missing_xml_file(tmp_path: Path) -> None:
    """Check that ``Page.from_xml_file`` raises ``FileNotFoundError`` for an absent path."""
    absent_path = tmp_path.joinpath("does_not_exist.xml")
    # Precondition: make sure the path really is absent before relying on it.
    assert not absent_path.exists()
    with pytest.raises(FileNotFoundError):
        Page.from_xml_file(absent_path)
| 397 | + |
| 398 | + |
352 | 399 | @given(st_text_regions, st_pages()) |
353 | 400 | def test_page_region_lookup(region: TextRegion, page: Page) -> None: |
354 | 401 | assume(region.id not in page.regions) |
|
0 commit comments