Skip to content

Commit b4c8266

Browse files
authored
Merge pull request #635 from bertsky/processor.zip_input_files
ocrd.processor.base: add property zip_input_files
2 parents c8136bf + 625547c commit b4c8266

File tree

3 files changed

+224
-25
lines changed

3 files changed

+224
-25
lines changed

ocrd/ocrd/processor/base.py

Lines changed: 127 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
import os
88
import json
9-
from ocrd_utils import VERSION as OCRD_VERSION, MIMETYPE_PAGE
9+
from ocrd_utils import VERSION as OCRD_VERSION, MIMETYPE_PAGE, getLogger
1010
from ocrd_validators import ParameterValidator
1111
from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType
1212

@@ -15,9 +15,11 @@
1515

1616
class Processor():
1717
"""
18-
A processor runs an algorithm based on the workspace, the mets.xml in the
19-
workspace (and the input files defined therein) as well as optional
20-
parameter.
18+
A processor is an OCR-D compliant command-line interface for executing
19+
a single workflow step on the workspace (represented by local METS). It
20+
reads input files for all or requested physical pages of the input fileGrp(s),
21+
and writes output files for them into the output fileGrp(s). It may take
22+
a number of optional or mandatory parameters.
2123
"""
2224

2325
def __init__(
@@ -108,26 +110,131 @@ def add_metadata(self, pcgts):
108110
@property
109111
def input_files(self):
110112
"""
111-
List the input files.
113+
List the input files (for single input file groups).
112114
113-
- If there's a PAGE-XML for the page, take it (and forget about all
115+
For each physical page:
116+
- If there is a single PAGE-XML for the page, take it (and forget about all
114117
other files for that page)
115-
- Else if there's only one image, take it (and forget about all other
118+
- Else if there is a single image file, take it (and forget about all other
116119
files for that page)
117120
- Otherwise raise an error (complaining that only PAGE-XML warrants
118-
119121
having multiple images for a single page)
120122
(https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593)
121123
"""
122-
ret = self.workspace.mets.find_all_files(
123-
fileGrp=self.input_file_grp, pageId=self.page_id, mimetype=MIMETYPE_PAGE)
124-
if ret:
125-
return ret
126-
ret = self.workspace.mets.find_all_files(
127-
fileGrp=self.input_file_grp, pageId=self.page_id, mimetype="//image/.*")
128-
if self.page_id and len(ret) > 1:
129-
raise ValueError("No PAGE-XML %s in fileGrp '%s' but multiple images." % (
130-
"for page '%s'" % self.page_id if self.page_id else '',
131-
self.input_file_grp
132-
))
133-
return ret
124+
if not self.input_file_grp:
125+
raise ValueError("Processor is missing input fileGrp")
126+
ret = self.zip_input_files(mimetype=None, on_error='abort')
127+
if not ret:
128+
return []
129+
assert len(ret[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
130+
return [tuples[0] for tuples in ret]
131+
132+
def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
133+
"""
134+
List tuples of input files (for multiple input file groups).
135+
136+
Processors that expect/need multiple input file groups
137+
cannot use ``input_files``. They must align (zip) input files
138+
across pages. This includes the case where not all pages
139+
are equally present in all file groups. It also requires
140+
making a consistent selection if there are multiple files
141+
per page.
142+
143+
Following the OCR-D functional model, this function tries to
144+
find a single PAGE file per page, or fall back to a single
145+
image file per page. In either case, multiple matches per page
146+
are an error (see error handling below).
147+
This default behaviour can be changed by using a fixed MIME
148+
type filter via ``mimetype``. But still, multiple matching
149+
files per page are an error.
150+
151+
Single-page multiple-file errors are handled according to
152+
``on_error``:
153+
- if ``skip``, then the page for the respective fileGrp will be
154+
silently skipped (as if there was no match at all)
155+
- if ``first``, then the first matching file for the page will be
156+
silently selected (as if the first was the only match)
157+
- if ``last``, then the last matching file for the page will be
158+
silently selected (as if the last was the only match)
159+
- if ``abort``, then an exception will be raised.
160+
Without a ``mimetype`` filter, multiple matches for PAGE-XML will always raise an exception.
161+
162+
Args:
163+
require_first (bool): If true, then skip a page entirely
164+
whenever it is not available in the first input fileGrp.
165+
166+
mimetype (str): If not None, filter by the specified MIME
167+
type (literal or regex prefixed by ``//``).
168+
Otherwise prefer PAGE or image.
169+
"""
170+
if not self.input_file_grp:
171+
raise ValueError("Processor is missing input fileGrp")
172+
173+
LOG = getLogger('ocrd.processor.base')
174+
ifgs = self.input_file_grp.split(",")
175+
# Iterating over all files repeatedly may seem inefficient at first sight,
176+
# but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
177+
# can actually be much more costly than traversing the ltree.
178+
# This might depend on the number of pages vs number of fileGrps.
179+
180+
pages = dict()
181+
for i, ifg in enumerate(ifgs):
182+
for file_ in sorted(self.workspace.mets.find_all_files(
183+
pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
184+
# sort by MIME type so PAGE comes before images
185+
key=lambda file_: file_.mimetype):
186+
if not file_.pageId:
187+
continue
188+
ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
189+
if ift[i]:
190+
LOG.debug("another page %s in input file group %s", file_.pageId, ifg)
191+
# fileGrp has multiple files for this page ID
192+
if mimetype:
193+
# filter was active, this must not happen
194+
if on_error == 'skip':
195+
ift[i] = None
196+
elif on_error == 'first':
197+
pass # keep first match
198+
elif on_error == 'last':
199+
ift[i] = file_
200+
elif on_error == 'abort':
201+
raise ValueError(
202+
"Multiple '%s' matches for page '%s' in fileGrp '%s'." % (
203+
mimetype, file_.pageId, ifg))
204+
else:
205+
raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
206+
elif (ift[i].mimetype == MIMETYPE_PAGE and
207+
file_.mimetype != MIMETYPE_PAGE):
208+
pass # keep PAGE match
209+
elif (ift[i].mimetype == MIMETYPE_PAGE and
210+
file_.mimetype == MIMETYPE_PAGE):
211+
raise ValueError(
212+
"Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % (
213+
file_.pageId, ifg))
214+
else:
215+
# filter was inactive but no PAGE is in control, this must not happen
216+
if on_error == 'skip':
217+
ift[i] = None
218+
elif on_error == 'first':
219+
pass # keep first match
220+
elif on_error == 'last':
221+
ift[i] = file_
222+
elif on_error == 'abort':
223+
raise ValueError(
224+
"No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." % (
225+
file_.pageId, ifg))
226+
else:
227+
raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
228+
else:
229+
LOG.debug("adding page %s to input file group %s", file_.pageId, ifg)
230+
ift[i] = file_
231+
ifts = list()
232+
for page, ifiles in pages.items():
233+
for i, ifg in enumerate(ifgs):
234+
if not ifiles[i]:
235+
# other fallback options?
236+
LOG.error('found no page %s in file group %s',
237+
page, ifg)
238+
if ifiles[0] or not require_first:
239+
ifts.append(tuple(ifiles))
240+
return ifts

ocrd/ocrd/workspace.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,9 @@ def resolve_image_exif(self, image_url):
266266
Return
267267
:class:`OcrdExif`
268268
"""
269+
if not image_url:
270+
# avoid "finding" just any file
271+
raise Exception("Cannot resolve empty image path")
269272
f = next(self.mets.find_files(url=image_url), OcrdFile(None, url=image_url))
270273
image_filename = self.download_file(f).local_filename
271274
ocrd_exif = exif_from_filename(image_filename)
@@ -286,6 +289,9 @@ def _resolve_image_as_pil(self, image_url, coords=None):
286289
Image or region in image as PIL.Image
287290
288291
"""
292+
if not image_url:
293+
# avoid "finding" just any file
294+
raise Exception("Cannot resolve empty image path")
289295
log = getLogger('ocrd.workspace._resolve_image_as_pil')
290296
f = next(self.mets.find_files(url=image_url), OcrdFile(None, url=image_url))
291297
image_filename = self.download_file(f).local_filename

tests/processor/test_processor.py

Lines changed: 91 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,18 @@
22

33
from tempfile import TemporaryDirectory
44
from os.path import join
5-
from tests.base import TestCase, assets, main # pylint: disable=import-error, no-name-in-module
5+
from tests.base import CapturingTestCase as TestCase, assets, main # pylint: disable=import-error, no-name-in-module
66
from tests.data import DummyProcessor, DummyProcessorWithRequiredParameters, IncompleteProcessor, DUMMY_TOOL
77

8-
from ocrd_utils import MIMETYPE_PAGE
8+
from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging, disableLogging
99
from ocrd.resolver import Resolver
1010
from ocrd.processor.base import Processor, run_processor, run_cli
1111

1212
class TestProcessor(TestCase):
1313

1414
def setUp(self):
15+
disableLogging()
16+
initLogging()
1517
self.resolver = Resolver()
1618
self.workspace = self.resolver.workspace_from_url(assets.url_of('SBB0000F29300010000/data/mets.xml'))
1719

@@ -28,9 +30,19 @@ def test_no_mets_url(self):
2830
with self.assertRaisesRegex(Exception, 'pass mets_url to create a workspace'):
2931
run_processor(DummyProcessor, resolver=self.resolver)
3032

33+
def test_no_input_file_grp(self):
34+
processor = run_processor(DummyProcessor,
35+
resolver=self.resolver,
36+
mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'))
37+
with self.assertRaisesRegex(Exception, 'Processor is missing input fileGrp'):
38+
_ = processor.input_files
39+
3140
def test_with_mets_url_input_files(self):
32-
processor = run_processor(DummyProcessor, resolver=self.resolver, mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'))
33-
self.assertEqual(len(processor.input_files), 20)
41+
processor = run_processor(DummyProcessor,
42+
input_file_grp='OCR-D-SEG-PAGE',
43+
resolver=self.resolver,
44+
mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'))
45+
self.assertEqual(len(processor.input_files), 2)
3446
self.assertTrue(all([f.mimetype == MIMETYPE_PAGE for f in processor.input_files]))
3547

3648
def test_parameter(self):
@@ -42,10 +54,11 @@ def test_parameter(self):
4254
processor = run_processor(
4355
DummyProcessor,
4456
parameter=json.load(f),
57+
input_file_grp="OCR-D-IMG",
4558
resolver=self.resolver,
4659
mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml')
4760
)
48-
self.assertEqual(len(processor.input_files), 20)
61+
self.assertEqual(len(processor.input_files), 3)
4962

5063
def test_verify(self):
5164
proc = DummyProcessor(self.workspace)
@@ -89,5 +102,78 @@ def test_run_cli(self):
89102
resolver=Resolver(),
90103
)
91104

105+
def test_zip_input_files(self):
106+
class ZipTestProcessor(Processor): pass
107+
with pushd_popd(tempdir=True) as tempdir:
108+
ws = self.resolver.workspace_from_nothing(directory=tempdir)
109+
ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId='phys_0001')
110+
ws.add_file('GRP2', mimetype='application/alto+xml', ID='foobar2', pageId='phys_0001')
111+
ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar3', pageId='phys_0002')
112+
ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar4', pageId='phys_0002')
113+
for page_id in [None, 'phys_0001,phys_0002']:
114+
with self.subTest(page_id=page_id):
115+
proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
116+
tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files()]
117+
assert ('foobar1', 'foobar2') in tuples
118+
assert ('foobar3', 'foobar4') in tuples
119+
tuples = [(one.ID, two) for one, two in proc.zip_input_files(mimetype=MIMETYPE_PAGE)]
120+
assert ('foobar1', None) in tuples
121+
tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files(mimetype=r'//application/(vnd.prima.page|alto)\+xml')]
122+
assert ('foobar1', 'foobar2') in tuples
123+
assert ('foobar3', 'foobar4') in tuples
124+
125+
def test_zip_input_files_multi_mixed(self):
126+
class ZipTestProcessor(Processor): pass
127+
with pushd_popd(tempdir=True) as tempdir:
128+
ws = self.resolver.workspace_from_nothing(directory=tempdir)
129+
ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId='phys_0001')
130+
ws.add_file('GRP1', mimetype='image/png', ID='foobar1img1', pageId='phys_0001')
131+
ws.add_file('GRP1', mimetype='image/png', ID='foobar1img2', pageId='phys_0001')
132+
ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0001')
133+
ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar3', pageId='phys_0002')
134+
ws.add_file('GRP2', mimetype='image/tiff', ID='foobar4', pageId='phys_0002')
135+
for page_id in [None, 'phys_0001,phys_0002']:
136+
with self.subTest(page_id=page_id):
137+
proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
138+
print("unfiltered")
139+
tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files()]
140+
assert ('foobar1', 'foobar2') in tuples
141+
assert ('foobar3', 'foobar4') in tuples
142+
print("PAGE-filtered")
143+
tuples = [(one.ID, two) for one, two in proc.zip_input_files(mimetype=MIMETYPE_PAGE)]
144+
assert ('foobar3', None) in tuples
145+
ws.add_file('GRP2', mimetype='image/tiff', ID='foobar4dup', pageId='phys_0002')
146+
for page_id in [None, 'phys_0001,phys_0002']:
147+
with self.subTest(page_id=page_id):
148+
proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
149+
tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files(on_error='first')]
150+
assert ('foobar1', 'foobar2') in tuples
151+
assert ('foobar3', 'foobar4') in tuples
152+
tuples = [(one.ID, two) for one, two in proc.zip_input_files(on_error='skip')]
153+
assert ('foobar3', None) in tuples
154+
with self.assertRaisesRegex(Exception, "No PAGE-XML for page .* in fileGrp .* but multiple matches."):
155+
tuples = proc.zip_input_files(on_error='abort')
156+
ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2dup', pageId='phys_0001')
157+
for page_id in [None, 'phys_0001,phys_0002']:
158+
with self.subTest(page_id=page_id):
159+
proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
160+
with self.assertRaisesRegex(Exception, "Multiple PAGE-XML matches for page"):
161+
tuples = proc.zip_input_files()
162+
163+
def test_zip_input_files_require_first(self):
164+
class ZipTestProcessor(Processor): pass
165+
self.capture_out_err()
166+
with pushd_popd(tempdir=True) as tempdir:
167+
ws = self.resolver.workspace_from_nothing(directory=tempdir)
168+
ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId=None)
169+
ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0001')
170+
for page_id in [None, 'phys_0001,phys_0002']:
171+
with self.subTest(page_id=page_id):
172+
proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
173+
assert [(one, two.ID) for one, two in proc.zip_input_files(require_first=False)] == [(None, 'foobar2')]
174+
r = self.capture_out_err()
175+
assert 'ERROR ocrd.processor.base - found no page phys_0001 in file group GRP1' in r.err
176+
177+
92178
if __name__ == "__main__":
93179
main(__file__)

0 commit comments

Comments
 (0)