diff --git a/ocrd/ocrd/processor/base.py b/ocrd/ocrd/processor/base.py index 58ff423ef8..71f9378c2a 100644 --- a/ocrd/ocrd/processor/base.py +++ b/ocrd/ocrd/processor/base.py @@ -6,7 +6,7 @@ import os import json -from ocrd_utils import VERSION as OCRD_VERSION, MIMETYPE_PAGE +from ocrd_utils import VERSION as OCRD_VERSION, MIMETYPE_PAGE, getLogger from ocrd_validators import ParameterValidator from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType @@ -15,9 +15,11 @@ class Processor(): """ - A processor runs an algorithm based on the workspace, the mets.xml in the - workspace (and the input files defined therein) as well as optional - parameter. + A processor is an OCR-D compliant command-line-interface for executing + a single workflow step on the workspace (represented by local METS). It + reads input files for all or requested physical pages of the input fileGrp(s), + and writes output files for them into the output fileGrp(s). It may take + a number of optional or mandatory parameters. """ def __init__( @@ -100,26 +102,131 @@ def add_metadata(self, pcgts): @property def input_files(self): """ - List the input files. + List the input files (for single input file groups). - - If there's a PAGE-XML for the page, take it (and forget about all + For each physical page: + - If there is a single PAGE-XML for the page, take it (and forget about all other files for that page) - - Else if there's only one image, take it (and forget about all other + - Else if there is a single image file, take it (and forget about all other files for that page) - Otherwise raise an error (complaining that only PAGE-XML warrants - having multiple images for a single page) (https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593) """ - ret = self.workspace.mets.find_all_files( - fileGrp=self.input_file_grp, pageId=self.page_id, mimetype=MIMETYPE_PAGE) - if ret: - return ret - ret = self.workspace.mets.find_all_files( - fileGrp=self.input_file_grp, pageId=self.page_id, mimetype="//image/.*") - if self.page_id and len(ret) > 1: - raise ValueError("No PAGE-XML %s in fileGrp '%s' but multiple images." % ( - "for page '%s'" % self.page_id if self.page_id else '', - self.input_file_grp - )) - return ret + if not self.input_file_grp: + raise ValueError("Processor is missing input fileGrp") + ret = self.zip_input_files(mimetype=None, on_error='abort') + if not ret: + return [] + assert len(ret[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps' + return [tuples[0] for tuples in ret] + + def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): + """ + List tuples of input files (for multiple input file groups). + + Processors that expect/need multiple input file groups, + cannot use ``input_files``. They must align (zip) input files + across pages. This includes the case where not all pages + are equally present in all file groups. It also requires + making a consistent selection if there are multiple files + per page. + + Following the OCR-D functional model, this function tries to + find a single PAGE file per page, or fall back to a single + image file per page. In either case, multiple matches per page + are an error (see error handling below). + This default behaviour can be changed by using a fixed MIME + type filter via ``mimetype``. But still, multiple matching + files per page are an error. + + Single-page multiple-file errors are handled according to + ``on_error``: + - if ``skip``, then the page for the respective fileGrp will be + silently skipped (as if there was no match at all) + - if ``first``, then the first matching file for the page will be + silently selected (as if the first was the only match) + - if ``last``, then the last matching file for the page will be + silently selected (as if the last was the only match) + - if ``abort``, then an exception will be raised. + Multiple matches for PAGE-XML will always raise an exception. + + Args: + require_first (bool): If true, then skip a page entirely + whenever it is not available in the first input fileGrp. + + mimetype (str): If not None, filter by the specified MIME + type (literal or regex prefixed by ``//``. + Otherwise prefer PAGE or image. + """ + if not self.input_file_grp: + raise ValueError("Processor is missing input fileGrp") + + LOG = getLogger('ocrd.processor.base') + ifgs = self.input_file_grp.split(",") + # Iterating over all files repeatedly may seem inefficient at first sight, + # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering + # can actually be much more costly than traversing the ltree. + # This might depend on the number of pages vs number of fileGrps. + + pages = dict() + for i, ifg in enumerate(ifgs): + for file_ in sorted(self.workspace.mets.find_all_files( + pageId=self.page_id, fileGrp=ifg, mimetype=mimetype), + # sort by MIME type so PAGE comes before images + key=lambda file_: file_.mimetype): + if not file_.pageId: + continue + ift = pages.setdefault(file_.pageId, [None]*len(ifgs)) + if ift[i]: + LOG.debug("another page %s in input file group %s", file_.pageId, ifg) + # fileGrp has multiple files for this page ID + if mimetype: + # filter was active, this must not happen + if on_error == 'skip': + ift[i] = None + elif on_error == 'first': + pass # keep first match + elif on_error == 'last': + ift[i] = file_ + elif on_error == 'abort': + raise ValueError( + "Multiple '%s' matches for page '%s' in fileGrp '%s'." % ( + mimetype, file_.pageId, ifg)) + else: + raise Exception("Unknown 'on_error' strategy '%s'" % on_error) + elif (ift[i].mimetype == MIMETYPE_PAGE and + file_.mimetype != MIMETYPE_PAGE): + pass # keep PAGE match + elif (ift[i].mimetype == MIMETYPE_PAGE and + file_.mimetype == MIMETYPE_PAGE): + raise ValueError( + "Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % ( + file_.pageId, ifg)) + else: + # filter was inactive but no PAGE is in control, this must not happen + if on_error == 'skip': + ift[i] = None + elif on_error == 'first': + pass # keep first match + elif on_error == 'last': + ift[i] = file_ + elif on_error == 'abort': + raise ValueError( + "No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." % ( + file_.pageId, ifg)) + else: + raise Exception("Unknown 'on_error' strategy '%s'" % on_error) + else: + LOG.debug("adding page %s to input file group %s", file_.pageId, ifg) + ift[i] = file_ + ifts = list() + for page, ifiles in pages.items(): + for i, ifg in enumerate(ifgs): + if not ifiles[i]: + # other fallback options? + LOG.error('found no page %s in file group %s', + page, ifg) + if ifiles[0] or not require_first: + ifts.append(tuple(ifiles)) + return ifts diff --git a/ocrd/ocrd/workspace.py b/ocrd/ocrd/workspace.py index c06f7061fb..41f4349285 100644 --- a/ocrd/ocrd/workspace.py +++ b/ocrd/ocrd/workspace.py @@ -266,6 +266,9 @@ def resolve_image_exif(self, image_url): Return :class:`OcrdExif` """ + if not image_url: + # avoid "finding" just any file + raise Exception("Cannot resolve empty image path") f = next(self.mets.find_files(url=image_url), OcrdFile(None, url=image_url)) image_filename = self.download_file(f).local_filename ocrd_exif = exif_from_filename(image_filename) @@ -286,6 +289,9 @@ def _resolve_image_as_pil(self, image_url, coords=None): Image or region in image as PIL.Image """ + if not image_url: + # avoid "finding" just any file + raise Exception("Cannot resolve empty image path") log = getLogger('ocrd.workspace._resolve_image_as_pil') f = next(self.mets.find_files(url=image_url), OcrdFile(None, url=image_url)) image_filename = self.download_file(f).local_filename diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 6cd1dff840..7ca6aaa0ef 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -2,16 +2,18 @@ from tempfile import TemporaryDirectory from os.path import join -from tests.base import TestCase, assets, main # pylint: disable=import-error, no-name-in-module +from tests.base import CapturingTestCase as TestCase, assets, main # pylint: disable=import-error, no-name-in-module from tests.data import DummyProcessor, DummyProcessorWithRequiredParameters, IncompleteProcessor, DUMMY_TOOL -from ocrd_utils import MIMETYPE_PAGE +from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging, disableLogging from ocrd.resolver import Resolver from ocrd.processor.base import Processor, run_processor, run_cli class TestProcessor(TestCase): def setUp(self): + disableLogging() + initLogging() self.resolver = Resolver() self.workspace = self.resolver.workspace_from_url(assets.url_of('SBB0000F29300010000/data/mets.xml')) @@ -28,9 +30,19 @@ def test_no_mets_url(self): with self.assertRaisesRegex(Exception, 'pass mets_url to create a workspace'): run_processor(DummyProcessor, resolver=self.resolver) + def test_no_input_file_grp(self): + processor = run_processor(DummyProcessor, + resolver=self.resolver, + mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml')) + with self.assertRaisesRegex(Exception, 'Processor is missing input fileGrp'): + _ = processor.input_files + def test_with_mets_url_input_files(self): - processor = run_processor(DummyProcessor, resolver=self.resolver, mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml')) - self.assertEqual(len(processor.input_files), 20) + processor = run_processor(DummyProcessor, + input_file_grp='OCR-D-SEG-PAGE', + resolver=self.resolver, + mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml')) + self.assertEqual(len(processor.input_files), 2) self.assertTrue(all([f.mimetype == MIMETYPE_PAGE for f in processor.input_files])) def test_parameter(self): @@ -42,10 +54,11 @@ def test_parameter(self): processor = run_processor( DummyProcessor, parameter=json.load(f), + input_file_grp="OCR-D-IMG", resolver=self.resolver, mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml') ) - self.assertEqual(len(processor.input_files), 20) + self.assertEqual(len(processor.input_files), 3) def test_verify(self): proc = DummyProcessor(self.workspace) @@ -89,5 +102,78 @@ def test_run_cli(self): resolver=Resolver(), ) + def test_zip_input_files(self): + class ZipTestProcessor(Processor): pass + with pushd_popd(tempdir=True) as tempdir: + ws = self.resolver.workspace_from_nothing(directory=tempdir) + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId='phys_0001') + ws.add_file('GRP2', mimetype='application/alto+xml', ID='foobar2', pageId='phys_0001') + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar3', pageId='phys_0002') + ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar4', pageId='phys_0002') + for page_id in [None, 'phys_0001,phys_0002']: + with self.subTest(page_id=page_id): + proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id) + tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files()] + assert ('foobar1', 'foobar2') in tuples + assert ('foobar3', 'foobar4') in tuples + tuples = [(one.ID, two) for one, two in proc.zip_input_files(mimetype=MIMETYPE_PAGE)] + assert ('foobar1', None) in tuples + tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files(mimetype=r'//application/(vnd.prima.page|alto)\+xml')] + assert ('foobar1', 'foobar2') in tuples + assert ('foobar3', 'foobar4') in tuples + + def test_zip_input_files_multi_mixed(self): + class ZipTestProcessor(Processor): pass + with pushd_popd(tempdir=True) as tempdir: + ws = self.resolver.workspace_from_nothing(directory=tempdir) + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId='phys_0001') + ws.add_file('GRP1', mimetype='image/png', ID='foobar1img1', pageId='phys_0001') + ws.add_file('GRP1', mimetype='image/png', ID='foobar1img2', pageId='phys_0001') + ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0001') + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar3', pageId='phys_0002') + ws.add_file('GRP2', mimetype='image/tiff', ID='foobar4', pageId='phys_0002') + for page_id in [None, 'phys_0001,phys_0002']: + with self.subTest(page_id=page_id): + proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id) + print("unfiltered") + tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files()] + assert ('foobar1', 'foobar2') in tuples + assert ('foobar3', 'foobar4') in tuples + print("PAGE-filtered") + tuples = [(one.ID, two) for one, two in proc.zip_input_files(mimetype=MIMETYPE_PAGE)] + assert ('foobar3', None) in tuples + ws.add_file('GRP2', mimetype='image/tiff', ID='foobar4dup', pageId='phys_0002') + for page_id in [None, 'phys_0001,phys_0002']: + with self.subTest(page_id=page_id): + proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id) + tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files(on_error='first')] + assert ('foobar1', 'foobar2') in tuples + assert ('foobar3', 'foobar4') in tuples + tuples = [(one.ID, two) for one, two in proc.zip_input_files(on_error='skip')] + assert ('foobar3', None) in tuples + with self.assertRaisesRegex(Exception, "No PAGE-XML for page .* in fileGrp .* but multiple matches."): + tuples = proc.zip_input_files(on_error='abort') + ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2dup', pageId='phys_0001') + for page_id in [None, 'phys_0001,phys_0002']: + with self.subTest(page_id=page_id): + proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id) + with self.assertRaisesRegex(Exception, "Multiple PAGE-XML matches for page"): + tuples = proc.zip_input_files() + + def test_zip_input_files_require_first(self): + class ZipTestProcessor(Processor): pass + self.capture_out_err() + with pushd_popd(tempdir=True) as tempdir: + ws = self.resolver.workspace_from_nothing(directory=tempdir) + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId=None) + ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0001') + for page_id in [None, 'phys_0001,phys_0002']: + with self.subTest(page_id=page_id): + proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id) + assert [(one, two.ID) for one, two in proc.zip_input_files(require_first=False)] == [(None, 'foobar2')] + r = self.capture_out_err() + assert 'ERROR ocrd.processor.base - found no page phys_0001 in file group GRP1' in r.err + + if __name__ == "__main__": main(__file__)