From 4d5888af1117177d9808f4cf3a0fff80b2573ae3 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky <sachunsky@informatik.uni-leipzig.de>
Date: Sat, 24 Oct 2020 00:09:48 +0200
Subject: [PATCH 01/10] ocrd.processor.base: add property zip_input_files

---
 ocrd/ocrd/processor/base.py | 55 ++++++++++++++++++++++++++++++++++---
 1 file changed, 51 insertions(+), 4 deletions(-)

diff --git a/ocrd/ocrd/processor/base.py b/ocrd/ocrd/processor/base.py
index 58ff423ef8..be3f6b6b92 100644
--- a/ocrd/ocrd/processor/base.py
+++ b/ocrd/ocrd/processor/base.py
@@ -6,7 +6,7 @@
 
 import os
 import json
-from ocrd_utils import VERSION as OCRD_VERSION, MIMETYPE_PAGE
+from ocrd_utils import VERSION as OCRD_VERSION, MIMETYPE_PAGE, getLogger
 from ocrd_validators import ParameterValidator
 from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType
 
@@ -15,9 +15,11 @@
 
 class Processor():
     """
-    A processor runs an algorithm based on the workspace, the mets.xml in the
-    workspace (and the input files defined therein) as well as optional
-    parameter.
+    A processor is an OCR-D compliant command-line-interface for executing
+    a single workflow step on the workspace (represented by local METS). It
+    reads input files for all or requested physical pages of the input fileGrp(s),
+    and writes output files for them into the output fileGrp(s). It may take 
+    a number of optional or mandatory parameters.
     """
 
     def __init__(
@@ -123,3 +125,48 @@ def input_files(self):
                 self.input_file_grp
                 ))
         return ret
+
+    @property
+    def zip_input_files(self, require_first=True):
+        """
+        List tuples of input files (for multiple input file groups).
+
+        Processors that expect/need multiple input file groups,
+        cannot use ``input_files``. They must align (zip) input files
+        across pages. This includes the case where not all pages
+        are equally present in all file groups.
+
+        This function does not make much sense for non-PAGE fileGrps,
+        so it uses a fixed MIME type filter for PAGE-XML.
+
+        Args:
+             require_first (bool): If true, then skip a page entirely
+             whenever it is not available in the first input fileGrp.
+        """
+
+        LOG = getLogger('ocrd.processor.base')
+        ifgs = self.input_file_grp.split(",")
+        # Iterating over all files repeatedly may seem inefficient at first sight,
+        # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
+        # can actually be much more costly than traversing the ltree.
+        # This might depend on the number of pages vs number of fileGrps.
+
+        pages = dict()
+        for i, ifg in enumerate(ifgs):
+            for file_ in self.workspace.mets.find_all_files(
+                    pageId=self.page_id, fileGrp=ifg, mimetype=MIMETYPE_PAGE):
+                if not file_.pageId:
+                    continue
+                LOG.debug("adding page %s to input file group %s", file_.pageId, ifg)
+                ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
+                ift[i] = file_
+        ifts = list()
+        for page, ifiles in pages.items():
+            for i, ifg in enumerate(ifgs):
+                if not ifiles[i]:
+                    # other fallback options?
+                    LOG.error('found no page %s in file group %s',
+                              page, ifg)
+            if ifiles[0] or not require_first:
+                ifts.append(tuple(ifiles))
+        return ifts

From 20f9f6b6824109c6a86690a20472e6a51c17f609 Mon Sep 17 00:00:00 2001
From: Konstantin Baierer <unixprog@gmail.com>
Date: Mon, 26 Oct 2020 13:04:48 +0100
Subject: [PATCH 02/10] Processor.zip_input_files must be method not property

---
 ocrd/ocrd/processor/base.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ocrd/ocrd/processor/base.py b/ocrd/ocrd/processor/base.py
index be3f6b6b92..07660025a2 100644
--- a/ocrd/ocrd/processor/base.py
+++ b/ocrd/ocrd/processor/base.py
@@ -126,7 +126,6 @@ def input_files(self):
                 ))
         return ret
 
-    @property
     def zip_input_files(self, require_first=True):
         """
         List tuples of input files (for multiple input file groups).

From 5f6b5aff5872329507789e1fb95eed1d600101bf Mon Sep 17 00:00:00 2001
From: Konstantin Baierer <unixprog@gmail.com>
Date: Mon, 26 Oct 2020 13:05:00 +0100
Subject: [PATCH 03/10] test Processor.zip_input_files

---
 tests/processor/test_processor.py | 31 +++++++++++++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py
index 6cd1dff840..1dec53b0c0 100644
--- a/tests/processor/test_processor.py
+++ b/tests/processor/test_processor.py
@@ -2,10 +2,10 @@
 
 from tempfile import TemporaryDirectory
 from os.path import join
-from tests.base import TestCase, assets, main # pylint: disable=import-error, no-name-in-module
+from tests.base import CapturingTestCase as TestCase, assets, main # pylint: disable=import-error, no-name-in-module
 from tests.data import DummyProcessor, DummyProcessorWithRequiredParameters, IncompleteProcessor, DUMMY_TOOL
 
-from ocrd_utils import MIMETYPE_PAGE
+from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging
 from ocrd.resolver import Resolver
 from ocrd.processor.base import Processor, run_processor, run_cli
 
@@ -89,5 +89,32 @@ def test_run_cli(self):
                 resolver=Resolver(),
             )
 
+    def test_zip_input_files(self):
+        class ZipTestProcessor(Processor): pass
+        with pushd_popd(tempdir=True) as tempdir:
+            ws = self.resolver.workspace_from_nothing(directory=tempdir)
+            ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId='phys_0001')
+            ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0001')
+            ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar3', pageId='phys_0002')
+            ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar4', pageId='phys_0002')
+            proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2')
+            tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files()]
+            assert ('foobar1', 'foobar2') in tuples
+            assert ('foobar3', 'foobar4') in tuples
+
+    def test_zip_input_files_require_first(self):
+        initLogging()
+        class ZipTestProcessor(Processor): pass
+        self.capture_out_err()
+        with pushd_popd(tempdir=True) as tempdir:
+            ws = self.resolver.workspace_from_nothing(directory=tempdir)
+            ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId=None)
+            ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0001')
+            proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2')
+            assert [(one, two.ID) for one, two in proc.zip_input_files(require_first=False)] == [(None, 'foobar2')]
+        r = self.capture_out_err()
+        assert 'ERROR ocrd.processor.base - found no page phys_0001 in file group GRP1' in r.err
+
+
 if __name__ == "__main__":
     main(__file__)

From 102fd2db6e738890ce572504e7496cdd7ca2e6ba Mon Sep 17 00:00:00 2001
From: Konstantin Baierer <unixprog@gmail.com>
Date: Mon, 26 Oct 2020 17:27:04 +0100
Subject: [PATCH 04/10] Processor.zip_input_files: allow overriding mimetype

---
 ocrd/ocrd/processor/base.py       | 4 ++--
 tests/processor/test_processor.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/ocrd/ocrd/processor/base.py b/ocrd/ocrd/processor/base.py
index 07660025a2..77a438bb21 100644
--- a/ocrd/ocrd/processor/base.py
+++ b/ocrd/ocrd/processor/base.py
@@ -126,7 +126,7 @@ def input_files(self):
                 ))
         return ret
 
-    def zip_input_files(self, require_first=True):
+    def zip_input_files(self, require_first=True, mimetype=MIMETYPE_PAGE):
         """
         List tuples of input files (for multiple input file groups).
 
@@ -153,7 +153,7 @@ def zip_input_files(self, require_first=True):
         pages = dict()
         for i, ifg in enumerate(ifgs):
             for file_ in self.workspace.mets.find_all_files(
-                    pageId=self.page_id, fileGrp=ifg, mimetype=MIMETYPE_PAGE):
+                    pageId=self.page_id, fileGrp=ifg, mimetype=mimetype):
                 if not file_.pageId:
                     continue
                 LOG.debug("adding page %s to input file group %s", file_.pageId, ifg)
diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py
index 1dec53b0c0..5d9554c277 100644
--- a/tests/processor/test_processor.py
+++ b/tests/processor/test_processor.py
@@ -94,11 +94,11 @@ class ZipTestProcessor(Processor): pass
         with pushd_popd(tempdir=True) as tempdir:
             ws = self.resolver.workspace_from_nothing(directory=tempdir)
             ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId='phys_0001')
-            ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0001')
+            ws.add_file('GRP2', mimetype='application/alto+xml', ID='foobar2', pageId='phys_0001')
             ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar3', pageId='phys_0002')
             ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar4', pageId='phys_0002')
             proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2')
-            tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files()]
+            tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files(mimetype=r'//application/(vnd.prima.page|alto)\+xml')]
             assert ('foobar1', 'foobar2') in tuples
             assert ('foobar3', 'foobar4') in tuples
 

From 397d1b6876c3013dc3338727e9f02b58b5250ac4 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky <sachunsky@informatik.uni-leipzig.de>
Date: Mon, 2 Nov 2020 17:16:01 +0100
Subject: [PATCH 05/10] =?UTF-8?q?processor.zip=5Finput=5Ffiles:=20PAGE/ima?=
 =?UTF-8?q?ge-fallback=20and=20multi-match=20error=20handling=E2=80=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- allow different kinds of error handling
  for the multi-match case (single page,
  multiple files): `on_error`
  - `skip`: None
  - `first`: first match
  - `last`: last match
  - `abort`: raise an exception
- if `mimetype` is inactive, support default OCR-D
  PAGE-image fallback semantics;
  - if no PAGE but multiple other files,
    then delegate to general error handling above
  - if multiple PAGE files, then raise an exception
---
 ocrd/ocrd/processor/base.py | 82 +++++++++++++++++++++++++++++++++----
 1 file changed, 73 insertions(+), 9 deletions(-)

diff --git a/ocrd/ocrd/processor/base.py b/ocrd/ocrd/processor/base.py
index 77a438bb21..e62c81fe4f 100644
--- a/ocrd/ocrd/processor/base.py
+++ b/ocrd/ocrd/processor/base.py
@@ -126,21 +126,43 @@ def input_files(self):
                 ))
         return ret
 
-    def zip_input_files(self, require_first=True, mimetype=MIMETYPE_PAGE):
+    def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
         """
         List tuples of input files (for multiple input file groups).
 
         Processors that expect/need multiple input file groups,
         cannot use ``input_files``. They must align (zip) input files
         across pages. This includes the case where not all pages
-        are equally present in all file groups.
-
-        This function does not make much sense for non-PAGE fileGrps,
-        so it uses a fixed MIME type filter for PAGE-XML.
+        are equally present in all file groups. It also requires
+        making a consistent selection if there are multiple files
+        per page.
+
+        Following the OCR-D functional model, this function tries to
+        find a single PAGE file per page, or fall back to a single
+        image file per page. In either case, multiple matches per page
+        are an error (see error handling below).
+        This default behaviour can be changed by using a fixed MIME
+        type filter via ``mimetype``. But still, multiple matching
+        files per page are an error.
+
+        Single-page multiple-file errors are handled according to
+        ``on_error``:
+        - if ``skip``, then the page for the respective fileGrp will be
+          silently skipped (as if there was no match at all)
+        - if ``first``, then the first matching file for the page will be
+          silently selected (as if the first was the only match)
+        - if ``last``, then the last matching file for the page will be
+          silently selected (as if the last was the only match)
+        - if ``abort``, then an exception will be raised.
+        Multiple matches for PAGE-XML will always raise an exception.
 
         Args:
              require_first (bool): If true, then skip a page entirely
              whenever it is not available in the first input fileGrp.
+
+             mimetype (str): If not None, filter by the specified MIME
+             type (literal or regex prefixed by ``//``.
+             Otherwise prefer PAGE or image.
         """
 
         LOG = getLogger('ocrd.processor.base')
@@ -152,13 +174,55 @@ def zip_input_files(self, require_first=True, mimetype=MIMETYPE_PAGE):
 
         pages = dict()
         for i, ifg in enumerate(ifgs):
-            for file_ in self.workspace.mets.find_all_files(
-                    pageId=self.page_id, fileGrp=ifg, mimetype=mimetype):
+            for file_ in sorted(self.workspace.mets.find_all_files(
+                    pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
+                                # sort by MIME type so PAGE comes before images
+                                key=lambda file_: file_.mimetype):
                 if not file_.pageId:
                     continue
-                LOG.debug("adding page %s to input file group %s", file_.pageId, ifg)
                 ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
-                ift[i] = file_
+                if ift[i]:
+                    LOG.debug("another page %s in input file group %s", file_.pageId, ifg)
+                    # fileGrp has multiple files for this page ID
+                    if mimetype:
+                        # filter was active, this must not happen
+                        if on_error == 'skip':
+                            ift[i] = None
+                        elif on_error == 'first':
+                            pass # keep first match
+                        elif on_error == 'last':
+                            ift[i] = file_
+                        elif on_error == 'abort':
+                            raise ValueError(
+                                "Multiple '%s' matches for page '%s' in fileGrp '%s'." % (
+                                    mimetype, file_.pageId, ifg))
+                        else:
+                            raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
+                    elif (ift[i].mimetype == MIMETYPE_PAGE and
+                          file_.mimetype != MIMETYPE_PAGE):
+                        pass # keep PAGE match
+                    elif (ift[i].mimetype == MIMETYPE_PAGE and
+                          file_.mimetype == MIMETYPE_PAGE):
+                            raise ValueError(
+                                "Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % (
+                                    file_.pageId, ifg))
+                    else:
+                        # filter was inactive but no PAGE is in control, this must not happen
+                        if on_error == 'skip':
+                            ift[i] = None
+                        elif on_error == 'first':
+                            pass # keep first match
+                        elif on_error == 'last':
+                            ift[i] = file_
+                        elif on_error == 'abort':
+                            raise ValueError(
+                                "No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." % (
+                                    file_.pageId, ifg))
+                        else:
+                            raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
+                else:
+                    LOG.debug("adding page %s to input file group %s", file_.pageId, ifg)
+                    ift[i] = file_
         ifts = list()
         for page, ifiles in pages.items():
             for i, ifg in enumerate(ifgs):

From 034967435ca31138e624e7acc0b476c997df0003 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky <sachunsky@informatik.uni-leipzig.de>
Date: Mon, 2 Nov 2020 17:24:17 +0100
Subject: [PATCH 06/10] extend tests for processor.zip_input_files

---
 tests/processor/test_processor.py | 37 ++++++++++++++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py
index 5d9554c277..7dad1f1032 100644
--- a/tests/processor/test_processor.py
+++ b/tests/processor/test_processor.py
@@ -12,6 +12,7 @@
 class TestProcessor(TestCase):
 
     def setUp(self):
+        initLogging()
         self.resolver = Resolver()
         self.workspace = self.resolver.workspace_from_url(assets.url_of('SBB0000F29300010000/data/mets.xml'))
 
@@ -98,12 +99,46 @@ class ZipTestProcessor(Processor): pass
             ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar3', pageId='phys_0002')
             ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar4', pageId='phys_0002')
             proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2')
+            tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files()]
+            assert ('foobar1', 'foobar2') in tuples
+            assert ('foobar3', 'foobar4') in tuples
+            tuples = [(one.ID, two) for one, two in proc.zip_input_files(mimetype=MIMETYPE_PAGE)]
+            assert ('foobar1', None) in tuples
             tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files(mimetype=r'//application/(vnd.prima.page|alto)\+xml')]
             assert ('foobar1', 'foobar2') in tuples
             assert ('foobar3', 'foobar4') in tuples
 
+    def test_zip_input_files_multi_mixed(self):
+        class ZipTestProcessor(Processor): pass
+        with pushd_popd(tempdir=True) as tempdir:
+            ws = self.resolver.workspace_from_nothing(directory=tempdir)
+            ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId='phys_0001')
+            ws.add_file('GRP1', mimetype='image/png', ID='foobar1img1', pageId='phys_0001')
+            ws.add_file('GRP1', mimetype='image/png', ID='foobar1img2', pageId='phys_0001')
+            ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0001')
+            ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar3', pageId='phys_0002')
+            ws.add_file('GRP2', mimetype='image/tiff', ID='foobar4', pageId='phys_0002')
+            proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2')
+            tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files()]
+            assert ('foobar1', 'foobar2') in tuples
+            assert ('foobar3', 'foobar4') in tuples
+            tuples = [(one.ID, two) for one, two in proc.zip_input_files(mimetype=MIMETYPE_PAGE)]
+            assert ('foobar3', None) in tuples
+            ws.add_file('GRP2', mimetype='image/tiff', ID='foobar4dup', pageId='phys_0002')
+            proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2')
+            tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files(on_error='first')]
+            assert ('foobar1', 'foobar2') in tuples
+            assert ('foobar3', 'foobar4') in tuples
+            tuples = [(one.ID, two) for one, two in proc.zip_input_files(on_error='skip')]
+            assert ('foobar3', None) in tuples
+            with self.assertRaisesRegex(Exception, "No PAGE-XML for page .* in fileGrp .* but multiple matches."):
+                tuples = proc.zip_input_files(on_error='abort')
+            ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2dup', pageId='phys_0001')
+            proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2')
+            with self.assertRaisesRegex(Exception, "Multiple PAGE-XML matches for page"):
+                tuples = proc.zip_input_files()
+
     def test_zip_input_files_require_first(self):
-        initLogging()
         class ZipTestProcessor(Processor): pass
         self.capture_out_err()
         with pushd_popd(tempdir=True) as tempdir:

From a560c69d6dda9ba7398f058cf15b3af1b3945ee4 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky <sachunsky@informatik.uni-leipzig.de>
Date: Mon, 2 Nov 2020 17:25:57 +0100
Subject: [PATCH 07/10] Workspace.resolve_image_{exif,as_pil}: properly raise
 on empty file path

---
 ocrd/ocrd/workspace.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/ocrd/ocrd/workspace.py b/ocrd/ocrd/workspace.py
index c06f7061fb..41f4349285 100644
--- a/ocrd/ocrd/workspace.py
+++ b/ocrd/ocrd/workspace.py
@@ -266,6 +266,9 @@ def resolve_image_exif(self, image_url):
         Return
             :class:`OcrdExif`
         """
+        if not image_url:
+            # avoid "finding" just any file
+            raise Exception("Cannot resolve empty image path")
         f = next(self.mets.find_files(url=image_url), OcrdFile(None, url=image_url))
         image_filename = self.download_file(f).local_filename
         ocrd_exif = exif_from_filename(image_filename)
@@ -286,6 +289,9 @@ def _resolve_image_as_pil(self, image_url, coords=None):
             Image or region in image as PIL.Image
 
         """
+        if not image_url:
+            # avoid "finding" just any file
+            raise Exception("Cannot resolve empty image path")
         log = getLogger('ocrd.workspace._resolve_image_as_pil')
         f = next(self.mets.find_files(url=image_url), OcrdFile(None, url=image_url))
         image_filename = self.download_file(f).local_filename

From ddaa28a2c7682514019771238bf225d0e24245d3 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky <sachunsky@informatik.uni-leipzig.de>
Date: Mon, 2 Nov 2020 18:32:00 +0100
Subject: [PATCH 08/10] test Processor.zip_input_files with run-time page_id
 selection, too

---
 tests/processor/test_processor.py | 71 ++++++++++++++++++-------------
 1 file changed, 42 insertions(+), 29 deletions(-)

diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py
index 7dad1f1032..5b4a1f8b28 100644
--- a/tests/processor/test_processor.py
+++ b/tests/processor/test_processor.py
@@ -5,13 +5,14 @@
 from tests.base import CapturingTestCase as TestCase, assets, main # pylint: disable=import-error, no-name-in-module
 from tests.data import DummyProcessor, DummyProcessorWithRequiredParameters, IncompleteProcessor, DUMMY_TOOL
 
-from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging
+from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging, disableLogging
 from ocrd.resolver import Resolver
 from ocrd.processor.base import Processor, run_processor, run_cli
 
 class TestProcessor(TestCase):
 
     def setUp(self):
+        disableLogging()
         initLogging()
         self.resolver = Resolver()
         self.workspace = self.resolver.workspace_from_url(assets.url_of('SBB0000F29300010000/data/mets.xml'))
@@ -98,15 +99,17 @@ class ZipTestProcessor(Processor): pass
             ws.add_file('GRP2', mimetype='application/alto+xml', ID='foobar2', pageId='phys_0001')
             ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar3', pageId='phys_0002')
             ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar4', pageId='phys_0002')
-            proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2')
-            tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files()]
-            assert ('foobar1', 'foobar2') in tuples
-            assert ('foobar3', 'foobar4') in tuples
-            tuples = [(one.ID, two) for one, two in proc.zip_input_files(mimetype=MIMETYPE_PAGE)]
-            assert ('foobar1', None) in tuples
-            tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files(mimetype=r'//application/(vnd.prima.page|alto)\+xml')]
-            assert ('foobar1', 'foobar2') in tuples
-            assert ('foobar3', 'foobar4') in tuples
+            for page_id in [None, 'phys_0001,phys_0002']:
+                with self.subTest(page_id=page_id):
+                    proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
+                    tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files()]
+                    assert ('foobar1', 'foobar2') in tuples
+                    assert ('foobar3', 'foobar4') in tuples
+                    tuples = [(one.ID, two) for one, two in proc.zip_input_files(mimetype=MIMETYPE_PAGE)]
+                    assert ('foobar1', None) in tuples
+                    tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files(mimetype=r'//application/(vnd.prima.page|alto)\+xml')]
+                    assert ('foobar1', 'foobar2') in tuples
+                    assert ('foobar3', 'foobar4') in tuples
 
     def test_zip_input_files_multi_mixed(self):
         class ZipTestProcessor(Processor): pass
@@ -118,25 +121,33 @@ class ZipTestProcessor(Processor): pass
             ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0001')
             ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar3', pageId='phys_0002')
             ws.add_file('GRP2', mimetype='image/tiff', ID='foobar4', pageId='phys_0002')
-            proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2')
-            tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files()]
-            assert ('foobar1', 'foobar2') in tuples
-            assert ('foobar3', 'foobar4') in tuples
-            tuples = [(one.ID, two) for one, two in proc.zip_input_files(mimetype=MIMETYPE_PAGE)]
-            assert ('foobar3', None) in tuples
+            for page_id in [None, 'phys_0001,phys_0002']:
+                with self.subTest(page_id=page_id):
+                    proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
+                    print("unfiltered")
+                    tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files()]
+                    assert ('foobar1', 'foobar2') in tuples
+                    assert ('foobar3', 'foobar4') in tuples
+                    print("PAGE-filtered")
+                    tuples = [(one.ID, two) for one, two in proc.zip_input_files(mimetype=MIMETYPE_PAGE)]
+                    assert ('foobar3', None) in tuples
             ws.add_file('GRP2', mimetype='image/tiff', ID='foobar4dup', pageId='phys_0002')
-            proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2')
-            tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files(on_error='first')]
-            assert ('foobar1', 'foobar2') in tuples
-            assert ('foobar3', 'foobar4') in tuples
-            tuples = [(one.ID, two) for one, two in proc.zip_input_files(on_error='skip')]
-            assert ('foobar3', None) in tuples
-            with self.assertRaisesRegex(Exception, "No PAGE-XML for page .* in fileGrp .* but multiple matches."):
-                tuples = proc.zip_input_files(on_error='abort')
+            for page_id in [None, 'phys_0001,phys_0002']:
+                with self.subTest(page_id=page_id):
+                    proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
+                    tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files(on_error='first')]
+                    assert ('foobar1', 'foobar2') in tuples
+                    assert ('foobar3', 'foobar4') in tuples
+                    tuples = [(one.ID, two) for one, two in proc.zip_input_files(on_error='skip')]
+                    assert ('foobar3', None) in tuples
+                    with self.assertRaisesRegex(Exception, "No PAGE-XML for page .* in fileGrp .* but multiple matches."):
+                        tuples = proc.zip_input_files(on_error='abort')
             ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2dup', pageId='phys_0001')
-            proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2')
-            with self.assertRaisesRegex(Exception, "Multiple PAGE-XML matches for page"):
-                tuples = proc.zip_input_files()
+            for page_id in [None, 'phys_0001,phys_0002']:
+                with self.subTest(page_id=page_id):
+                    proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
+                    with self.assertRaisesRegex(Exception, "Multiple PAGE-XML matches for page"):
+                        tuples = proc.zip_input_files()
 
     def test_zip_input_files_require_first(self):
         class ZipTestProcessor(Processor): pass
@@ -145,8 +156,10 @@ class ZipTestProcessor(Processor): pass
             ws = self.resolver.workspace_from_nothing(directory=tempdir)
             ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId=None)
             ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0001')
-            proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2')
-            assert [(one, two.ID) for one, two in proc.zip_input_files(require_first=False)] == [(None, 'foobar2')]
+            for page_id in [None, 'phys_0001,phys_0002']:
+                with self.subTest(page_id=page_id):
+                    proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
+                    assert [(one, two.ID) for one, two in proc.zip_input_files(require_first=False)] == [(None, 'foobar2')]
         r = self.capture_out_err()
         assert 'ERROR ocrd.processor.base - found no page phys_0001 in file group GRP1' in r.err
 

From 8f8cbb9a7e2643bc5dcc31c380c1f3053ba7ca03 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky <sachunsky@informatik.uni-leipzig.de>
Date: Mon, 2 Nov 2020 19:14:01 +0100
Subject: [PATCH 09/10] :fire: `Processor.{zip_,}input_files`: require
 input_file_grp

(raise an exception if no input fileGrp was set)
---
 ocrd/ocrd/processor/base.py | 29 +++++++++++++----------------
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/ocrd/ocrd/processor/base.py b/ocrd/ocrd/processor/base.py
index e62c81fe4f..71f9378c2a 100644
--- a/ocrd/ocrd/processor/base.py
+++ b/ocrd/ocrd/processor/base.py
@@ -102,29 +102,24 @@ def add_metadata(self, pcgts):
     @property
     def input_files(self):
         """
-        List the input files.
+        List the input files (for single input file groups).
 
-        - If there's a PAGE-XML for the page, take it (and forget about all
+        For each physical page:
+        - If there is a single PAGE-XML for the page, take it (and forget about all
           other files for that page)
-        - Else if there's only one image, take it (and forget about all other
+        - Else if there is a single image file, take it (and forget about all other
           files for that page)
         - Otherwise raise an error (complaining that only PAGE-XML warrants
-
           having multiple images for a single page)
         (https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593)
         """
-        ret = self.workspace.mets.find_all_files(
-            fileGrp=self.input_file_grp, pageId=self.page_id, mimetype=MIMETYPE_PAGE)
-        if ret:
-            return ret
-        ret = self.workspace.mets.find_all_files(
-            fileGrp=self.input_file_grp, pageId=self.page_id, mimetype="//image/.*")
-        if self.page_id and len(ret) > 1:
-            raise ValueError("No PAGE-XML %s in fileGrp '%s' but multiple images." % (
-                "for page '%s'" % self.page_id if self.page_id else '',
-                self.input_file_grp
-                ))
-        return ret
+        if not self.input_file_grp:
+            raise ValueError("Processor is missing input fileGrp")
+        ret = self.zip_input_files(mimetype=None, on_error='abort')
+        if not ret:
+            return []
+        assert len(ret[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
+        return [tuples[0] for tuples in ret]
 
     def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
         """
@@ -164,6 +159,8 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
              type (literal or regex prefixed by ``//``.
              Otherwise prefer PAGE or image.
         """
+        if not self.input_file_grp:
+            raise ValueError("Processor is missing input fileGrp")
 
         LOG = getLogger('ocrd.processor.base')
         ifgs = self.input_file_grp.split(",")

From 625547cd3153435444cf68fb6ec37afff22ef4a5 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky <sachunsky@informatik.uni-leipzig.de>
Date: Mon, 2 Nov 2020 19:15:37 +0100
Subject: [PATCH 10/10] adapt/extend tests for Processor.input_files (empty
 input_file_grp)

---
 tests/processor/test_processor.py | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py
index 5b4a1f8b28..7ca6aaa0ef 100644
--- a/tests/processor/test_processor.py
+++ b/tests/processor/test_processor.py
@@ -30,9 +30,19 @@ def test_no_mets_url(self):
         with self.assertRaisesRegex(Exception, 'pass mets_url to create a workspace'):
             run_processor(DummyProcessor, resolver=self.resolver)
 
+    def test_no_input_file_grp(self):
+        processor = run_processor(DummyProcessor,
+                                  resolver=self.resolver,
+                                  mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'))
+        with self.assertRaisesRegex(Exception, 'Processor is missing input fileGrp'):
+            _ = processor.input_files
+
     def test_with_mets_url_input_files(self):
-        processor = run_processor(DummyProcessor, resolver=self.resolver, mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'))
-        self.assertEqual(len(processor.input_files), 20)
+        processor = run_processor(DummyProcessor,
+                                  input_file_grp='OCR-D-SEG-PAGE',
+                                  resolver=self.resolver,
+                                  mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'))
+        self.assertEqual(len(processor.input_files), 2)
         self.assertTrue(all([f.mimetype == MIMETYPE_PAGE for f in processor.input_files]))
 
     def test_parameter(self):
@@ -44,10 +54,11 @@ def test_parameter(self):
                 processor = run_processor(
                     DummyProcessor,
                     parameter=json.load(f),
+                    input_file_grp="OCR-D-IMG",
                     resolver=self.resolver,
                     mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml')
                 )
-            self.assertEqual(len(processor.input_files), 20)
+            self.assertEqual(len(processor.input_files), 3)
 
     def test_verify(self):
         proc = DummyProcessor(self.workspace)