From 4d5888af1117177d9808f4cf3a0fff80b2573ae3 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Oct 2020 00:09:48 +0200 Subject: [PATCH 01/10] ocrd.processor.base: add property zip_input_files --- ocrd/ocrd/processor/base.py | 55 ++++++++++++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 4 deletions(-) diff --git a/ocrd/ocrd/processor/base.py b/ocrd/ocrd/processor/base.py index 58ff423ef8..be3f6b6b92 100644 --- a/ocrd/ocrd/processor/base.py +++ b/ocrd/ocrd/processor/base.py @@ -6,7 +6,7 @@ import os import json -from ocrd_utils import VERSION as OCRD_VERSION, MIMETYPE_PAGE +from ocrd_utils import VERSION as OCRD_VERSION, MIMETYPE_PAGE, getLogger from ocrd_validators import ParameterValidator from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType @@ -15,9 +15,11 @@ class Processor(): """ - A processor runs an algorithm based on the workspace, the mets.xml in the - workspace (and the input files defined therein) as well as optional - parameter. + A processor is an OCR-D compliant command-line-interface for executing + a single workflow step on the workspace (represented by local METS). It + reads input files for all or requested physical pages of the input fileGrp(s), + and writes output files for them into the output fileGrp(s). It may take + a number of optional or mandatory parameters. """ def __init__( @@ -123,3 +125,48 @@ def input_files(self): self.input_file_grp )) return ret + + @property + def zip_input_files(self, require_first=True): + """ + List tuples of input files (for multiple input file groups). + + Processors that expect/need multiple input file groups, + cannot use ``input_files``. They must align (zip) input files + across pages. This includes the case where not all pages + are equally present in all file groups. + + This function does not make much sense for non-PAGE fileGrps, + so it uses a fixed MIME type filter for PAGE-XML. + + Args: + require_first (bool): If true, then skip a page entirely + whenever it is not available in the first input fileGrp. + """ + + LOG = getLogger('ocrd.processor.base') + ifgs = self.input_file_grp.split(",") + # Iterating over all files repeatedly may seem inefficient at first sight, + # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering + # can actually be much more costly than traversing the ltree. + # This might depend on the number of pages vs number of fileGrps. + + pages = dict() + for i, ifg in enumerate(ifgs): + for file_ in self.workspace.mets.find_all_files( + pageId=self.page_id, fileGrp=ifg, mimetype=MIMETYPE_PAGE): + if not file_.pageId: + continue + LOG.debug("adding page %s to input file group %s", file_.pageId, ifg) + ift = pages.setdefault(file_.pageId, [None]*len(ifgs)) + ift[i] = file_ + ifts = list() + for page, ifiles in pages.items(): + for i, ifg in enumerate(ifgs): + if not ifiles[i]: + # other fallback options? + LOG.error('found no page %s in file group %s', + page, ifg) + if ifiles[0] or not require_first: + ifts.append(tuple(ifiles)) + return ifts From 20f9f6b6824109c6a86690a20472e6a51c17f609 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 26 Oct 2020 13:04:48 +0100 Subject: [PATCH 02/10] Processor.zip_input_files must be method not property --- ocrd/ocrd/processor/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ocrd/ocrd/processor/base.py b/ocrd/ocrd/processor/base.py index be3f6b6b92..07660025a2 100644 --- a/ocrd/ocrd/processor/base.py +++ b/ocrd/ocrd/processor/base.py @@ -126,7 +126,6 @@ def input_files(self): )) return ret - @property def zip_input_files(self, require_first=True): """ List tuples of input files (for multiple input file groups). From 5f6b5aff5872329507789e1fb95eed1d600101bf Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 26 Oct 2020 13:05:00 +0100 Subject: [PATCH 03/10] test Processor.zip_input_files --- tests/processor/test_processor.py | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 6cd1dff840..1dec53b0c0 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -2,10 +2,10 @@ from tempfile import TemporaryDirectory from os.path import join -from tests.base import TestCase, assets, main # pylint: disable=import-error, no-name-in-module +from tests.base import CapturingTestCase as TestCase, assets, main # pylint: disable=import-error, no-name-in-module from tests.data import DummyProcessor, DummyProcessorWithRequiredParameters, IncompleteProcessor, DUMMY_TOOL -from ocrd_utils import MIMETYPE_PAGE +from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging from ocrd.resolver import Resolver from ocrd.processor.base import Processor, run_processor, run_cli @@ -89,5 +89,32 @@ def test_run_cli(self): resolver=Resolver(), ) + def test_zip_input_files(self): + class ZipTestProcessor(Processor): pass + with pushd_popd(tempdir=True) as tempdir: + ws = self.resolver.workspace_from_nothing(directory=tempdir) + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId='phys_0001') + ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0001') + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar3', pageId='phys_0002') + ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar4', pageId='phys_0002') + proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2') + tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files()] + assert ('foobar1', 'foobar2') in tuples + assert ('foobar3', 'foobar4') in tuples + + def test_zip_input_files_require_first(self): + initLogging() + class ZipTestProcessor(Processor): pass + self.capture_out_err() + with pushd_popd(tempdir=True) as tempdir: + ws = self.resolver.workspace_from_nothing(directory=tempdir) + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId=None) + ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0001') + proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2') + assert [(one, two.ID) for one, two in proc.zip_input_files(require_first=False)] == [(None, 'foobar2')] + r = self.capture_out_err() + assert 'ERROR ocrd.processor.base - found no page phys_0001 in file group GRP1' in r.err + + if __name__ == "__main__": main(__file__) From 102fd2db6e738890ce572504e7496cdd7ca2e6ba Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 26 Oct 2020 17:27:04 +0100 Subject: [PATCH 04/10] Processor.zip_input_files: allow overriding mimetype --- ocrd/ocrd/processor/base.py | 4 ++-- tests/processor/test_processor.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ocrd/ocrd/processor/base.py b/ocrd/ocrd/processor/base.py index 07660025a2..77a438bb21 100644 --- a/ocrd/ocrd/processor/base.py +++ b/ocrd/ocrd/processor/base.py @@ -126,7 +126,7 @@ def input_files(self): )) return ret - def zip_input_files(self, require_first=True): + def zip_input_files(self, require_first=True, mimetype=MIMETYPE_PAGE): """ List tuples of input files (for multiple input file groups). @@ -153,7 +153,7 @@ def zip_input_files(self, require_first=True): pages = dict() for i, ifg in enumerate(ifgs): for file_ in self.workspace.mets.find_all_files( - pageId=self.page_id, fileGrp=ifg, mimetype=MIMETYPE_PAGE): + pageId=self.page_id, fileGrp=ifg, mimetype=mimetype): if not file_.pageId: continue LOG.debug("adding page %s to input file group %s", file_.pageId, ifg) diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 1dec53b0c0..5d9554c277 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -94,11 +94,11 @@ class ZipTestProcessor(Processor): pass with pushd_popd(tempdir=True) as tempdir: ws = self.resolver.workspace_from_nothing(directory=tempdir) ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId='phys_0001') - ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0001') + ws.add_file('GRP2', mimetype='application/alto+xml', ID='foobar2', pageId='phys_0001') ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar3', pageId='phys_0002') ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar4', pageId='phys_0002') proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2') - tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files()] + tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files(mimetype=r'//application/(vnd.prima.page|alto)\+xml')] assert ('foobar1', 'foobar2') in tuples assert ('foobar3', 'foobar4') in tuples From 397d1b6876c3013dc3338727e9f02b58b5250ac4 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 2 Nov 2020 17:16:01 +0100 Subject: [PATCH 05/10] =?UTF-8?q?processor.zip=5Finput=5Ffiles:=20PAGE/ima?= =?UTF-8?q?ge-fallback=20and=20multi-match=20error=20handling=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - allow different kinds of error handling for the multi-match case (single page, multiple files): `on_error` - `skip`: None - `first`: first match - `last`: last match - `abort`: raise an exception - if `mimetype` is inactive, support default OCR-D PAGE-image fallback semantics; - if no PAGE but multiple other files, then delegate to general error handling above - if multiple PAGE files, then raise an exception --- ocrd/ocrd/processor/base.py | 82 +++++++++++++++++++++++++++++++++---- 1 file changed, 73 insertions(+), 9 deletions(-) diff --git a/ocrd/ocrd/processor/base.py b/ocrd/ocrd/processor/base.py index 77a438bb21..e62c81fe4f 100644 --- a/ocrd/ocrd/processor/base.py +++ b/ocrd/ocrd/processor/base.py @@ -126,21 +126,43 @@ def input_files(self): )) return ret - def zip_input_files(self, require_first=True, mimetype=MIMETYPE_PAGE): + def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): """ List tuples of input files (for multiple input file groups). Processors that expect/need multiple input file groups, cannot use ``input_files``. They must align (zip) input files across pages. This includes the case where not all pages - are equally present in all file groups. - - This function does not make much sense for non-PAGE fileGrps, - so it uses a fixed MIME type filter for PAGE-XML. + are equally present in all file groups. It also requires + making a consistent selection if there are multiple files + per page. + + Following the OCR-D functional model, this function tries to + find a single PAGE file per page, or fall back to a single + image file per page. In either case, multiple matches per page + are an error (see error handling below). + This default behaviour can be changed by using a fixed MIME + type filter via ``mimetype``. But still, multiple matching + files per page are an error. + + Single-page multiple-file errors are handled according to + ``on_error``: + - if ``skip``, then the page for the respective fileGrp will be + silently skipped (as if there was no match at all) + - if ``first``, then the first matching file for the page will be + silently selected (as if the first was the only match) + - if ``last``, then the last matching file for the page will be + silently selected (as if the last was the only match) + - if ``abort``, then an exception will be raised. + Multiple matches for PAGE-XML will always raise an exception. Args: require_first (bool): If true, then skip a page entirely whenever it is not available in the first input fileGrp. + + mimetype (str): If not None, filter by the specified MIME + type (literal or regex prefixed by ``//``. + Otherwise prefer PAGE or image. """ LOG = getLogger('ocrd.processor.base') @@ -152,13 +174,55 @@ def zip_input_files(self, require_first=True, mimetype=MIMETYPE_PAGE): pages = dict() for i, ifg in enumerate(ifgs): - for file_ in self.workspace.mets.find_all_files( - pageId=self.page_id, fileGrp=ifg, mimetype=mimetype): + for file_ in sorted(self.workspace.mets.find_all_files( + pageId=self.page_id, fileGrp=ifg, mimetype=mimetype), + # sort by MIME type so PAGE comes before images + key=lambda file_: file_.mimetype): if not file_.pageId: continue - LOG.debug("adding page %s to input file group %s", file_.pageId, ifg) ift = pages.setdefault(file_.pageId, [None]*len(ifgs)) - ift[i] = file_ + if ift[i]: + LOG.debug("another page %s in input file group %s", file_.pageId, ifg) + # fileGrp has multiple files for this page ID + if mimetype: + # filter was active, this must not happen + if on_error == 'skip': + ift[i] = None + elif on_error == 'first': + pass # keep first match + elif on_error == 'last': + ift[i] = file_ + elif on_error == 'abort': + raise ValueError( + "Multiple '%s' matches for page '%s' in fileGrp '%s'." % ( + mimetype, file_.pageId, ifg)) + else: + raise Exception("Unknown 'on_error' strategy '%s'" % on_error) + elif (ift[i].mimetype == MIMETYPE_PAGE and + file_.mimetype != MIMETYPE_PAGE): + pass # keep PAGE match + elif (ift[i].mimetype == MIMETYPE_PAGE and + file_.mimetype == MIMETYPE_PAGE): + raise ValueError( + "Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % ( + file_.pageId, ifg)) + else: + # filter was inactive but no PAGE is in control, this must not happen + if on_error == 'skip': + ift[i] = None + elif on_error == 'first': + pass # keep first match + elif on_error == 'last': + ift[i] = file_ + elif on_error == 'abort': + raise ValueError( + "No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." % ( + file_.pageId, ifg)) + else: + raise Exception("Unknown 'on_error' strategy '%s'" % on_error) + else: + LOG.debug("adding page %s to input file group %s", file_.pageId, ifg) + ift[i] = file_ ifts = list() for page, ifiles in pages.items(): for i, ifg in enumerate(ifgs): From 034967435ca31138e624e7acc0b476c997df0003 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 2 Nov 2020 17:24:17 +0100 Subject: [PATCH 06/10] extend tests for processor.zip_input_files --- tests/processor/test_processor.py | 37 ++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 5d9554c277..7dad1f1032 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -12,6 +12,7 @@ class TestProcessor(TestCase): def setUp(self): + initLogging() self.resolver = Resolver() self.workspace = self.resolver.workspace_from_url(assets.url_of('SBB0000F29300010000/data/mets.xml')) @@ -98,12 +99,46 @@ class ZipTestProcessor(Processor): pass ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar3', pageId='phys_0002') ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar4', pageId='phys_0002') proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2') + tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files()] + assert ('foobar1', 'foobar2') in tuples + assert ('foobar3', 'foobar4') in tuples + tuples = [(one.ID, two) for one, two in proc.zip_input_files(mimetype=MIMETYPE_PAGE)] + assert ('foobar1', None) in tuples tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files(mimetype=r'//application/(vnd.prima.page|alto)\+xml')] assert ('foobar1', 'foobar2') in tuples assert ('foobar3', 'foobar4') in tuples + def test_zip_input_files_multi_mixed(self): + class ZipTestProcessor(Processor): pass + with pushd_popd(tempdir=True) as tempdir: + ws = self.resolver.workspace_from_nothing(directory=tempdir) + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId='phys_0001') + ws.add_file('GRP1', mimetype='image/png', ID='foobar1img1', pageId='phys_0001') + ws.add_file('GRP1', mimetype='image/png', ID='foobar1img2', pageId='phys_0001') + ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0001') + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar3', pageId='phys_0002') + ws.add_file('GRP2', mimetype='image/tiff', ID='foobar4', pageId='phys_0002') + proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2') + tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files()] + assert ('foobar1', 'foobar2') in tuples + assert ('foobar3', 'foobar4') in tuples + tuples = [(one.ID, two) for one, two in proc.zip_input_files(mimetype=MIMETYPE_PAGE)] + assert ('foobar3', None) in tuples + ws.add_file('GRP2', mimetype='image/tiff', ID='foobar4dup', pageId='phys_0002') + proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2') + tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files(on_error='first')] + assert ('foobar1', 'foobar2') in tuples + assert ('foobar3', 'foobar4') in tuples + tuples = [(one.ID, two) for one, two in proc.zip_input_files(on_error='skip')] + assert ('foobar3', None) in tuples + with self.assertRaisesRegex(Exception, "No PAGE-XML for page .* in fileGrp .* but multiple matches."): + tuples = proc.zip_input_files(on_error='abort') + ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2dup', pageId='phys_0001') + proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2') + with self.assertRaisesRegex(Exception, "Multiple PAGE-XML matches for page"): + tuples = proc.zip_input_files() + def test_zip_input_files_require_first(self): - initLogging() class ZipTestProcessor(Processor): pass self.capture_out_err() with pushd_popd(tempdir=True) as tempdir: From a560c69d6dda9ba7398f058cf15b3af1b3945ee4 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 2 Nov 2020 17:25:57 +0100 Subject: [PATCH 07/10] Workspace.resolve_image_{exif,as_pil}: properly raise on empty file path --- ocrd/ocrd/workspace.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ocrd/ocrd/workspace.py b/ocrd/ocrd/workspace.py index c06f7061fb..41f4349285 100644 --- a/ocrd/ocrd/workspace.py +++ b/ocrd/ocrd/workspace.py @@ -266,6 +266,9 @@ def resolve_image_exif(self, image_url): Return :class:`OcrdExif` """ + if not image_url: + # avoid "finding" just any file + raise Exception("Cannot resolve empty image path") f = next(self.mets.find_files(url=image_url), OcrdFile(None, url=image_url)) image_filename = self.download_file(f).local_filename ocrd_exif = exif_from_filename(image_filename) @@ -286,6 +289,9 @@ def _resolve_image_as_pil(self, image_url, coords=None): Image or region in image as PIL.Image """ + if not image_url: + # avoid "finding" just any file + raise Exception("Cannot resolve empty image path") log = getLogger('ocrd.workspace._resolve_image_as_pil') f = next(self.mets.find_files(url=image_url), OcrdFile(None, url=image_url)) image_filename = self.download_file(f).local_filename From ddaa28a2c7682514019771238bf225d0e24245d3 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 2 Nov 2020 18:32:00 +0100 Subject: [PATCH 08/10] test Processor.zip_input_files with run-time page_id selection, too --- tests/processor/test_processor.py | 71 ++++++++++++++++++------------- 1 file changed, 42 insertions(+), 29 deletions(-) diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 7dad1f1032..5b4a1f8b28 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -5,13 +5,14 @@ from tests.base import CapturingTestCase as TestCase, assets, main # pylint: disable=import-error, no-name-in-module from tests.data import DummyProcessor, DummyProcessorWithRequiredParameters, IncompleteProcessor, DUMMY_TOOL -from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging +from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging, disableLogging from ocrd.resolver import Resolver from ocrd.processor.base import Processor, run_processor, run_cli class TestProcessor(TestCase): def setUp(self): + disableLogging() initLogging() self.resolver = Resolver() self.workspace = self.resolver.workspace_from_url(assets.url_of('SBB0000F29300010000/data/mets.xml')) @@ -98,15 +99,17 @@ class ZipTestProcessor(Processor): pass ws.add_file('GRP2', mimetype='application/alto+xml', ID='foobar2', pageId='phys_0001') ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar3', pageId='phys_0002') ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar4', pageId='phys_0002') - proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2') - tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files()] - assert ('foobar1', 'foobar2') in tuples - assert ('foobar3', 'foobar4') in tuples - tuples = [(one.ID, two) for one, two in proc.zip_input_files(mimetype=MIMETYPE_PAGE)] - assert ('foobar1', None) in tuples - tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files(mimetype=r'//application/(vnd.prima.page|alto)\+xml')] - assert ('foobar1', 'foobar2') in tuples - assert ('foobar3', 'foobar4') in tuples + for page_id in [None, 'phys_0001,phys_0002']: + with self.subTest(page_id=page_id): + proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id) + tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files()] + assert ('foobar1', 'foobar2') in tuples + assert ('foobar3', 'foobar4') in tuples + tuples = [(one.ID, two) for one, two in proc.zip_input_files(mimetype=MIMETYPE_PAGE)] + assert ('foobar1', None) in tuples + tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files(mimetype=r'//application/(vnd.prima.page|alto)\+xml')] + assert ('foobar1', 'foobar2') in tuples + assert ('foobar3', 'foobar4') in tuples def test_zip_input_files_multi_mixed(self): class ZipTestProcessor(Processor): pass @@ -118,25 +121,33 @@ class ZipTestProcessor(Processor): pass ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0001') ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar3', pageId='phys_0002') ws.add_file('GRP2', mimetype='image/tiff', ID='foobar4', pageId='phys_0002') - proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2') - tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files()] - assert ('foobar1', 'foobar2') in tuples - assert ('foobar3', 'foobar4') in tuples - tuples = [(one.ID, two) for one, two in proc.zip_input_files(mimetype=MIMETYPE_PAGE)] - assert ('foobar3', None) in tuples + for page_id in [None, 'phys_0001,phys_0002']: + with self.subTest(page_id=page_id): + proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id) + print("unfiltered") + tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files()] + assert ('foobar1', 'foobar2') in tuples + assert ('foobar3', 'foobar4') in tuples + print("PAGE-filtered") + tuples = [(one.ID, two) for one, two in proc.zip_input_files(mimetype=MIMETYPE_PAGE)] + assert ('foobar3', None) in tuples ws.add_file('GRP2', mimetype='image/tiff', ID='foobar4dup', pageId='phys_0002') - proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2') - tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files(on_error='first')] - assert ('foobar1', 'foobar2') in tuples - assert ('foobar3', 'foobar4') in tuples - tuples = [(one.ID, two) for one, two in proc.zip_input_files(on_error='skip')] - assert ('foobar3', None) in tuples - with self.assertRaisesRegex(Exception, "No PAGE-XML for page .* in fileGrp .* but multiple matches."): - tuples = proc.zip_input_files(on_error='abort') + for page_id in [None, 'phys_0001,phys_0002']: + with self.subTest(page_id=page_id): + proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id) + tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files(on_error='first')] + assert ('foobar1', 'foobar2') in tuples + assert ('foobar3', 'foobar4') in tuples + tuples = [(one.ID, two) for one, two in proc.zip_input_files(on_error='skip')] + assert ('foobar3', None) in tuples + with self.assertRaisesRegex(Exception, "No PAGE-XML for page .* in fileGrp .* but multiple matches."): + tuples = proc.zip_input_files(on_error='abort') ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2dup', pageId='phys_0001') - proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2') - with self.assertRaisesRegex(Exception, "Multiple PAGE-XML matches for page"): - tuples = proc.zip_input_files() + for page_id in [None, 'phys_0001,phys_0002']: + with self.subTest(page_id=page_id): + proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id) + with self.assertRaisesRegex(Exception, "Multiple PAGE-XML matches for page"): + tuples = proc.zip_input_files() def test_zip_input_files_require_first(self): class ZipTestProcessor(Processor): pass @@ -145,8 +156,10 @@ class ZipTestProcessor(Processor): pass ws = self.resolver.workspace_from_nothing(directory=tempdir) ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId=None) ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0001') - proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2') - assert [(one, two.ID) for one, two in proc.zip_input_files(require_first=False)] == [(None, 'foobar2')] + for page_id in [None, 'phys_0001,phys_0002']: + with self.subTest(page_id=page_id): + proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id) + assert [(one, two.ID) for one, two in proc.zip_input_files(require_first=False)] == [(None, 'foobar2')] r = self.capture_out_err() assert 'ERROR ocrd.processor.base - found no page phys_0001 in file group GRP1' in r.err From 8f8cbb9a7e2643bc5dcc31c380c1f3053ba7ca03 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 2 Nov 2020 19:14:01 +0100 Subject: [PATCH 09/10] :fire: `Processor.{zip_,}input_files`: require input_file_grp (raise an exception if no input fileGrp was set) --- ocrd/ocrd/processor/base.py | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/ocrd/ocrd/processor/base.py b/ocrd/ocrd/processor/base.py index e62c81fe4f..71f9378c2a 100644 --- a/ocrd/ocrd/processor/base.py +++ b/ocrd/ocrd/processor/base.py @@ -102,29 +102,24 @@ def add_metadata(self, pcgts): @property def input_files(self): """ - List the input files. + List the input files (for single input file groups). - - If there's a PAGE-XML for the page, take it (and forget about all + For each physical page: + - If there is a single PAGE-XML for the page, take it (and forget about all other files for that page) - - Else if there's only one image, take it (and forget about all other + - Else if there is a single image file, take it (and forget about all other files for that page) - Otherwise raise an error (complaining that only PAGE-XML warrants - having multiple images for a single page) (https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593) """ - ret = self.workspace.mets.find_all_files( - fileGrp=self.input_file_grp, pageId=self.page_id, mimetype=MIMETYPE_PAGE) - if ret: - return ret - ret = self.workspace.mets.find_all_files( - fileGrp=self.input_file_grp, pageId=self.page_id, mimetype="//image/.*") - if self.page_id and len(ret) > 1: - raise ValueError("No PAGE-XML %s in fileGrp '%s' but multiple images." % ( - "for page '%s'" % self.page_id if self.page_id else '', - self.input_file_grp - )) - return ret + if not self.input_file_grp: + raise ValueError("Processor is missing input fileGrp") + ret = self.zip_input_files(mimetype=None, on_error='abort') + if not ret: + return [] + assert len(ret[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps' + return [tuples[0] for tuples in ret] def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): """ @@ -164,6 +159,8 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): type (literal or regex prefixed by ``//``. Otherwise prefer PAGE or image. """ + if not self.input_file_grp: + raise ValueError("Processor is missing input fileGrp") LOG = getLogger('ocrd.processor.base') ifgs = self.input_file_grp.split(",") From 625547cd3153435444cf68fb6ec37afff22ef4a5 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 2 Nov 2020 19:15:37 +0100 Subject: [PATCH 10/10] adapt/extend tests for Processor.input_files (empty input_file_grp) --- tests/processor/test_processor.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 5b4a1f8b28..7ca6aaa0ef 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -30,9 +30,19 @@ def test_no_mets_url(self): with self.assertRaisesRegex(Exception, 'pass mets_url to create a workspace'): run_processor(DummyProcessor, resolver=self.resolver) + def test_no_input_file_grp(self): + processor = run_processor(DummyProcessor, + resolver=self.resolver, + mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml')) + with self.assertRaisesRegex(Exception, 'Processor is missing input fileGrp'): + _ = processor.input_files + def test_with_mets_url_input_files(self): - processor = run_processor(DummyProcessor, resolver=self.resolver, mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml')) - self.assertEqual(len(processor.input_files), 20) + processor = run_processor(DummyProcessor, + input_file_grp='OCR-D-SEG-PAGE', + resolver=self.resolver, + mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml')) + self.assertEqual(len(processor.input_files), 2) self.assertTrue(all([f.mimetype == MIMETYPE_PAGE for f in processor.input_files])) def test_parameter(self): @@ -44,10 +54,11 @@ def test_parameter(self): processor = run_processor( DummyProcessor, parameter=json.load(f), + input_file_grp="OCR-D-IMG", resolver=self.resolver, mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml') ) - self.assertEqual(len(processor.input_files), 20) + self.assertEqual(len(processor.input_files), 3) def test_verify(self): proc = DummyProcessor(self.workspace)