Merge pull request #635 from bertsky/processor.zip_input_files

kba · web-flow · commit b4c826662959 · 2020-11-03T15:09:36.000+01:00
ocrd.processor.base: add property zip_input_files
diff --git a/ocrd/ocrd/processor/base.py b/ocrd/ocrd/processor/base.py
@@ -6,7 +6,7 @@
 
 import os
 import json
-from ocrd_utils import VERSION as OCRD_VERSION, MIMETYPE_PAGE
+from ocrd_utils import VERSION as OCRD_VERSION, MIMETYPE_PAGE, getLogger
 from ocrd_validators import ParameterValidator
 from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType
 
@@ -15,9 +15,11 @@
 
 class Processor():
     """
-    A processor runs an algorithm based on the workspace, the mets.xml in the
-    workspace (and the input files defined therein) as well as optional
-    parameter.
+    A processor is an OCR-D compliant command-line-interface for executing
+    a single workflow step on the workspace (represented by local METS). It
+    reads input files for all or requested physical pages of the input fileGrp(s),
+    and writes output files for them into the output fileGrp(s). It may take 
+    a number of optional or mandatory parameters.
     """
 
     def __init__(
@@ -108,26 +110,131 @@ def add_metadata(self, pcgts):
     @property
     def input_files(self):
         """
-        List the input files.
+        List the input files (for single input file groups).
 
-        - If there's a PAGE-XML for the page, take it (and forget about all
+        For each physical page:
+        - If there is a single PAGE-XML for the page, take it (and forget about all
           other files for that page)
-        - Else if there's only one image, take it (and forget about all other
+        - Else if there is a single image file, take it (and forget about all other
           files for that page)
         - Otherwise raise an error (complaining that only PAGE-XML warrants
-
           having multiple images for a single page)
         (https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593)
         """
-        ret = self.workspace.mets.find_all_files(
-            fileGrp=self.input_file_grp, pageId=self.page_id, mimetype=MIMETYPE_PAGE)
-        if ret:
-            return ret
-        ret = self.workspace.mets.find_all_files(
-            fileGrp=self.input_file_grp, pageId=self.page_id, mimetype="//image/.*")
-        if self.page_id and len(ret) > 1:
-            raise ValueError("No PAGE-XML %s in fileGrp '%s' but multiple images." % (
-                "for page '%s'" % self.page_id if self.page_id else '',
-                self.input_file_grp
-                ))
-        return ret
+        if not self.input_file_grp:
+            raise ValueError("Processor is missing input fileGrp")
+        ret = self.zip_input_files(mimetype=None, on_error='abort')
+        if not ret:
+            return []
+        assert len(ret[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
+        return [tuples[0] for tuples in ret]
+
+    def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
+        """
+        List tuples of input files (for multiple input file groups).
+
+        Processors that expect/need multiple input file groups,
+        cannot use ``input_files``. They must align (zip) input files
+        across pages. This includes the case where not all pages
+        are equally present in all file groups. It also requires
+        making a consistent selection if there are multiple files
+        per page.
+
+        Following the OCR-D functional model, this function tries to
+        find a single PAGE file per page, or fall back to a single
+        image file per page. In either case, multiple matches per page
+        are an error (see error handling below).
+        This default behaviour can be changed by using a fixed MIME
+        type filter via ``mimetype``. But still, multiple matching
+        files per page are an error.
+
+        Single-page multiple-file errors are handled according to
+        ``on_error``:
+        - if ``skip``, then the page for the respective fileGrp will be
+          silently skipped (as if there was no match at all)
+        - if ``first``, then the first matching file for the page will be
+          silently selected (as if the first was the only match)
+        - if ``last``, then the last matching file for the page will be
+          silently selected (as if the last was the only match)
+        - if ``abort``, then an exception will be raised.
+        Multiple matches for PAGE-XML will always raise an exception.
+
+        Args:
+             require_first (bool): If true, then skip a page entirely
+             whenever it is not available in the first input fileGrp.
+
+             mimetype (str): If not None, filter by the specified MIME
+             type (literal or regex prefixed by ``//``.
+             Otherwise prefer PAGE or image.
+        """
+        if not self.input_file_grp:
+            raise ValueError("Processor is missing input fileGrp")
+
+        LOG = getLogger('ocrd.processor.base')
+        ifgs = self.input_file_grp.split(",")
+        # Iterating over all files repeatedly may seem inefficient at first sight,
+        # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
+        # can actually be much more costly than traversing the ltree.
+        # This might depend on the number of pages vs number of fileGrps.
+
+        pages = dict()
+        for i, ifg in enumerate(ifgs):
+            for file_ in sorted(self.workspace.mets.find_all_files(
+                    pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
+                                # sort by MIME type so PAGE comes before images
+                                key=lambda file_: file_.mimetype):
+                if not file_.pageId:
+                    continue
+                ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
+                if ift[i]:
+                    LOG.debug("another page %s in input file group %s", file_.pageId, ifg)
+                    # fileGrp has multiple files for this page ID
+                    if mimetype:
+                        # filter was active, this must not happen
+                        if on_error == 'skip':
+                            ift[i] = None
+                        elif on_error == 'first':
+                            pass # keep first match
+                        elif on_error == 'last':
+                            ift[i] = file_
+                        elif on_error == 'abort':
+                            raise ValueError(
+                                "Multiple '%s' matches for page '%s' in fileGrp '%s'." % (
+                                    mimetype, file_.pageId, ifg))
+                        else:
+                            raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
+                    elif (ift[i].mimetype == MIMETYPE_PAGE and
+                          file_.mimetype != MIMETYPE_PAGE):
+                        pass # keep PAGE match
+                    elif (ift[i].mimetype == MIMETYPE_PAGE and
+                          file_.mimetype == MIMETYPE_PAGE):
+                            raise ValueError(
+                                "Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % (
+                                    file_.pageId, ifg))
+                    else:
+                        # filter was inactive but no PAGE is in control, this must not happen
+                        if on_error == 'skip':
+                            ift[i] = None
+                        elif on_error == 'first':
+                            pass # keep first match
+                        elif on_error == 'last':
+                            ift[i] = file_
+                        elif on_error == 'abort':
+                            raise ValueError(
+                                "No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." % (
+                                    file_.pageId, ifg))
+                        else:
+                            raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
+                else:
+                    LOG.debug("adding page %s to input file group %s", file_.pageId, ifg)
+                    ift[i] = file_
+        ifts = list()
+        for page, ifiles in pages.items():
+            for i, ifg in enumerate(ifgs):
+                if not ifiles[i]:
+                    # other fallback options?
+                    LOG.error('found no page %s in file group %s',
+                              page, ifg)
+            if ifiles[0] or not require_first:
+                ifts.append(tuple(ifiles))
+        return ifts
diff --git a/ocrd/ocrd/workspace.py b/ocrd/ocrd/workspace.py
@@ -266,6 +266,9 @@ def resolve_image_exif(self, image_url):
         Return
             :class:`OcrdExif`
         """
+        if not image_url:
+            # avoid "finding" just any file
+            raise Exception("Cannot resolve empty image path")
         f = next(self.mets.find_files(url=image_url), OcrdFile(None, url=image_url))
         image_filename = self.download_file(f).local_filename
         ocrd_exif = exif_from_filename(image_filename)
@@ -286,6 +289,9 @@ def _resolve_image_as_pil(self, image_url, coords=None):
             Image or region in image as PIL.Image
 
         """
+        if not image_url:
+            # avoid "finding" just any file
+            raise Exception("Cannot resolve empty image path")
         log = getLogger('ocrd.workspace._resolve_image_as_pil')
         f = next(self.mets.find_files(url=image_url), OcrdFile(None, url=image_url))
         image_filename = self.download_file(f).local_filename
diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py
@@ -2,16 +2,18 @@
 
 from tempfile import TemporaryDirectory
 from os.path import join
-from tests.base import TestCase, assets, main # pylint: disable=import-error, no-name-in-module
+from tests.base import CapturingTestCase as TestCase, assets, main # pylint: disable=import-error, no-name-in-module
 from tests.data import DummyProcessor, DummyProcessorWithRequiredParameters, IncompleteProcessor, DUMMY_TOOL
 
-from ocrd_utils import MIMETYPE_PAGE
+from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging, disableLogging
 from ocrd.resolver import Resolver
 from ocrd.processor.base import Processor, run_processor, run_cli
 
 class TestProcessor(TestCase):
 
     def setUp(self):
+        disableLogging()
+        initLogging()
         self.resolver = Resolver()
         self.workspace = self.resolver.workspace_from_url(assets.url_of('SBB0000F29300010000/data/mets.xml'))
 
@@ -28,9 +30,19 @@ def test_no_mets_url(self):
         with self.assertRaisesRegex(Exception, 'pass mets_url to create a workspace'):
             run_processor(DummyProcessor, resolver=self.resolver)
 
+    def test_no_input_file_grp(self):
+        processor = run_processor(DummyProcessor,
+                                  resolver=self.resolver,
+                                  mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'))
+        with self.assertRaisesRegex(Exception, 'Processor is missing input fileGrp'):
+            _ = processor.input_files
+
     def test_with_mets_url_input_files(self):
-        processor = run_processor(DummyProcessor, resolver=self.resolver, mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'))
-        self.assertEqual(len(processor.input_files), 20)
+        processor = run_processor(DummyProcessor,
+                                  input_file_grp='OCR-D-SEG-PAGE',
+                                  resolver=self.resolver,
+                                  mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'))
+        self.assertEqual(len(processor.input_files), 2)
         self.assertTrue(all([f.mimetype == MIMETYPE_PAGE for f in processor.input_files]))
 
     def test_parameter(self):
@@ -42,10 +54,11 @@ def test_parameter(self):
                 processor = run_processor(
                     DummyProcessor,
                     parameter=json.load(f),
+                    input_file_grp="OCR-D-IMG",
                     resolver=self.resolver,
                     mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml')
                 )
-            self.assertEqual(len(processor.input_files), 20)
+            self.assertEqual(len(processor.input_files), 3)
 
     def test_verify(self):
         proc = DummyProcessor(self.workspace)
@@ -89,5 +102,78 @@ def test_run_cli(self):
                 resolver=Resolver(),
             )
 
+    def test_zip_input_files(self):
+        class ZipTestProcessor(Processor): pass
+        with pushd_popd(tempdir=True) as tempdir:
+            ws = self.resolver.workspace_from_nothing(directory=tempdir)
+            ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId='phys_0001')
+            ws.add_file('GRP2', mimetype='application/alto+xml', ID='foobar2', pageId='phys_0001')
+            ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar3', pageId='phys_0002')
+            ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar4', pageId='phys_0002')
+            for page_id in [None, 'phys_0001,phys_0002']:
+                with self.subTest(page_id=page_id):
+                    proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
+                    tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files()]
+                    assert ('foobar1', 'foobar2') in tuples
+                    assert ('foobar3', 'foobar4') in tuples
+                    tuples = [(one.ID, two) for one, two in proc.zip_input_files(mimetype=MIMETYPE_PAGE)]
+                    assert ('foobar1', None) in tuples
+                    tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files(mimetype=r'//application/(vnd.prima.page|alto)\+xml')]
+                    assert ('foobar1', 'foobar2') in tuples
+                    assert ('foobar3', 'foobar4') in tuples
+
+    def test_zip_input_files_multi_mixed(self):
+        class ZipTestProcessor(Processor): pass
+        with pushd_popd(tempdir=True) as tempdir:
+            ws = self.resolver.workspace_from_nothing(directory=tempdir)
+            ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId='phys_0001')
+            ws.add_file('GRP1', mimetype='image/png', ID='foobar1img1', pageId='phys_0001')
+            ws.add_file('GRP1', mimetype='image/png', ID='foobar1img2', pageId='phys_0001')
+            ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0001')
+            ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar3', pageId='phys_0002')
+            ws.add_file('GRP2', mimetype='image/tiff', ID='foobar4', pageId='phys_0002')
+            for page_id in [None, 'phys_0001,phys_0002']:
+                with self.subTest(page_id=page_id):
+                    proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
+                    print("unfiltered")
+                    tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files()]
+                    assert ('foobar1', 'foobar2') in tuples
+                    assert ('foobar3', 'foobar4') in tuples
+                    print("PAGE-filtered")
+                    tuples = [(one.ID, two) for one, two in proc.zip_input_files(mimetype=MIMETYPE_PAGE)]
+                    assert ('foobar3', None) in tuples
+            ws.add_file('GRP2', mimetype='image/tiff', ID='foobar4dup', pageId='phys_0002')
+            for page_id in [None, 'phys_0001,phys_0002']:
+                with self.subTest(page_id=page_id):
+                    proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
+                    tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files(on_error='first')]
+                    assert ('foobar1', 'foobar2') in tuples
+                    assert ('foobar3', 'foobar4') in tuples
+                    tuples = [(one.ID, two) for one, two in proc.zip_input_files(on_error='skip')]
+                    assert ('foobar3', None) in tuples
+                    with self.assertRaisesRegex(Exception, "No PAGE-XML for page .* in fileGrp .* but multiple matches."):
+                        tuples = proc.zip_input_files(on_error='abort')
+            ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2dup', pageId='phys_0001')
+            for page_id in [None, 'phys_0001,phys_0002']:
+                with self.subTest(page_id=page_id):
+                    proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
+                    with self.assertRaisesRegex(Exception, "Multiple PAGE-XML matches for page"):
+                        tuples = proc.zip_input_files()
+
+    def test_zip_input_files_require_first(self):
+        class ZipTestProcessor(Processor): pass
+        self.capture_out_err()
+        with pushd_popd(tempdir=True) as tempdir:
+            ws = self.resolver.workspace_from_nothing(directory=tempdir)
+            ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId=None)
+            ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0001')
+            for page_id in [None, 'phys_0001,phys_0002']:
+                with self.subTest(page_id=page_id):
+                    proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
+                    assert [(one, two.ID) for one, two in proc.zip_input_files(require_first=False)] == [(None, 'foobar2')]
+        r = self.capture_out_err()
+        assert 'ERROR ocrd.processor.base - found no page phys_0001 in file group GRP1' in r.err
+
+
 if __name__ == "__main__":
     main(__file__)