22
33from tempfile import TemporaryDirectory
44from os .path import join
5- from tests .base import TestCase , assets , main # pylint: disable=import-error, no-name-in-module
5+ from tests .base import CapturingTestCase as TestCase , assets , main # pylint: disable=import-error, no-name-in-module
66from tests .data import DummyProcessor , DummyProcessorWithRequiredParameters , IncompleteProcessor , DUMMY_TOOL
77
8- from ocrd_utils import MIMETYPE_PAGE
8+ from ocrd_utils import MIMETYPE_PAGE , pushd_popd , initLogging , disableLogging
99from ocrd .resolver import Resolver
1010from ocrd .processor .base import Processor , run_processor , run_cli
1111
1212class TestProcessor (TestCase ):
1313
1414 def setUp (self ):
15+ disableLogging ()
16+ initLogging ()
1517 self .resolver = Resolver ()
1618 self .workspace = self .resolver .workspace_from_url (assets .url_of ('SBB0000F29300010000/data/mets.xml' ))
1719
@@ -28,9 +30,19 @@ def test_no_mets_url(self):
2830 with self .assertRaisesRegex (Exception , 'pass mets_url to create a workspace' ):
2931 run_processor (DummyProcessor , resolver = self .resolver )
3032
33+ def test_no_input_file_grp (self ):
34+ processor = run_processor (DummyProcessor ,
35+ resolver = self .resolver ,
36+ mets_url = assets .url_of ('SBB0000F29300010000/data/mets.xml' ))
37+ with self .assertRaisesRegex (Exception , 'Processor is missing input fileGrp' ):
38+ _ = processor .input_files
39+
3140 def test_with_mets_url_input_files (self ):
32- processor = run_processor (DummyProcessor , resolver = self .resolver , mets_url = assets .url_of ('SBB0000F29300010000/data/mets.xml' ))
33- self .assertEqual (len (processor .input_files ), 20 )
41+ processor = run_processor (DummyProcessor ,
42+ input_file_grp = 'OCR-D-SEG-PAGE' ,
43+ resolver = self .resolver ,
44+ mets_url = assets .url_of ('SBB0000F29300010000/data/mets.xml' ))
45+ self .assertEqual (len (processor .input_files ), 2 )
3446 self .assertTrue (all ([f .mimetype == MIMETYPE_PAGE for f in processor .input_files ]))
3547
3648 def test_parameter (self ):
@@ -42,10 +54,11 @@ def test_parameter(self):
4254 processor = run_processor (
4355 DummyProcessor ,
4456 parameter = json .load (f ),
57+ input_file_grp = "OCR-D-IMG" ,
4558 resolver = self .resolver ,
4659 mets_url = assets .url_of ('SBB0000F29300010000/data/mets.xml' )
4760 )
48- self .assertEqual (len (processor .input_files ), 20 )
61+ self .assertEqual (len (processor .input_files ), 3 )
4962
5063 def test_verify (self ):
5164 proc = DummyProcessor (self .workspace )
@@ -89,5 +102,78 @@ def test_run_cli(self):
89102 resolver = Resolver (),
90103 )
91104
105+ def test_zip_input_files (self ):
106+ class ZipTestProcessor (Processor ): pass
107+ with pushd_popd (tempdir = True ) as tempdir :
108+ ws = self .resolver .workspace_from_nothing (directory = tempdir )
109+ ws .add_file ('GRP1' , mimetype = MIMETYPE_PAGE , ID = 'foobar1' , pageId = 'phys_0001' )
110+ ws .add_file ('GRP2' , mimetype = 'application/alto+xml' , ID = 'foobar2' , pageId = 'phys_0001' )
111+ ws .add_file ('GRP1' , mimetype = MIMETYPE_PAGE , ID = 'foobar3' , pageId = 'phys_0002' )
112+ ws .add_file ('GRP2' , mimetype = MIMETYPE_PAGE , ID = 'foobar4' , pageId = 'phys_0002' )
113+ for page_id in [None , 'phys_0001,phys_0002' ]:
114+ with self .subTest (page_id = page_id ):
115+ proc = ZipTestProcessor (workspace = ws , input_file_grp = 'GRP1,GRP2' , page_id = page_id )
116+ tuples = [(one .ID , two .ID ) for one , two in proc .zip_input_files ()]
117+ assert ('foobar1' , 'foobar2' ) in tuples
118+ assert ('foobar3' , 'foobar4' ) in tuples
119+ tuples = [(one .ID , two ) for one , two in proc .zip_input_files (mimetype = MIMETYPE_PAGE )]
120+ assert ('foobar1' , None ) in tuples
121+ tuples = [(one .ID , two .ID ) for one , two in proc .zip_input_files (mimetype = r'//application/(vnd.prima.page|alto)\+xml' )]
122+ assert ('foobar1' , 'foobar2' ) in tuples
123+ assert ('foobar3' , 'foobar4' ) in tuples
124+
125+ def test_zip_input_files_multi_mixed (self ):
126+ class ZipTestProcessor (Processor ): pass
127+ with pushd_popd (tempdir = True ) as tempdir :
128+ ws = self .resolver .workspace_from_nothing (directory = tempdir )
129+ ws .add_file ('GRP1' , mimetype = MIMETYPE_PAGE , ID = 'foobar1' , pageId = 'phys_0001' )
130+ ws .add_file ('GRP1' , mimetype = 'image/png' , ID = 'foobar1img1' , pageId = 'phys_0001' )
131+ ws .add_file ('GRP1' , mimetype = 'image/png' , ID = 'foobar1img2' , pageId = 'phys_0001' )
132+ ws .add_file ('GRP2' , mimetype = MIMETYPE_PAGE , ID = 'foobar2' , pageId = 'phys_0001' )
133+ ws .add_file ('GRP1' , mimetype = MIMETYPE_PAGE , ID = 'foobar3' , pageId = 'phys_0002' )
134+ ws .add_file ('GRP2' , mimetype = 'image/tiff' , ID = 'foobar4' , pageId = 'phys_0002' )
135+ for page_id in [None , 'phys_0001,phys_0002' ]:
136+ with self .subTest (page_id = page_id ):
137+ proc = ZipTestProcessor (workspace = ws , input_file_grp = 'GRP1,GRP2' , page_id = page_id )
138+ print ("unfiltered" )
139+ tuples = [(one .ID , two .ID ) for one , two in proc .zip_input_files ()]
140+ assert ('foobar1' , 'foobar2' ) in tuples
141+ assert ('foobar3' , 'foobar4' ) in tuples
142+ print ("PAGE-filtered" )
143+ tuples = [(one .ID , two ) for one , two in proc .zip_input_files (mimetype = MIMETYPE_PAGE )]
144+ assert ('foobar3' , None ) in tuples
145+ ws .add_file ('GRP2' , mimetype = 'image/tiff' , ID = 'foobar4dup' , pageId = 'phys_0002' )
146+ for page_id in [None , 'phys_0001,phys_0002' ]:
147+ with self .subTest (page_id = page_id ):
148+ proc = ZipTestProcessor (workspace = ws , input_file_grp = 'GRP1,GRP2' , page_id = page_id )
149+ tuples = [(one .ID , two .ID ) for one , two in proc .zip_input_files (on_error = 'first' )]
150+ assert ('foobar1' , 'foobar2' ) in tuples
151+ assert ('foobar3' , 'foobar4' ) in tuples
152+ tuples = [(one .ID , two ) for one , two in proc .zip_input_files (on_error = 'skip' )]
153+ assert ('foobar3' , None ) in tuples
154+ with self .assertRaisesRegex (Exception , "No PAGE-XML for page .* in fileGrp .* but multiple matches." ):
155+ tuples = proc .zip_input_files (on_error = 'abort' )
156+ ws .add_file ('GRP2' , mimetype = MIMETYPE_PAGE , ID = 'foobar2dup' , pageId = 'phys_0001' )
157+ for page_id in [None , 'phys_0001,phys_0002' ]:
158+ with self .subTest (page_id = page_id ):
159+ proc = ZipTestProcessor (workspace = ws , input_file_grp = 'GRP1,GRP2' , page_id = page_id )
160+ with self .assertRaisesRegex (Exception , "Multiple PAGE-XML matches for page" ):
161+ tuples = proc .zip_input_files ()
162+
163+ def test_zip_input_files_require_first (self ):
164+ class ZipTestProcessor (Processor ): pass
165+ self .capture_out_err ()
166+ with pushd_popd (tempdir = True ) as tempdir :
167+ ws = self .resolver .workspace_from_nothing (directory = tempdir )
168+ ws .add_file ('GRP1' , mimetype = MIMETYPE_PAGE , ID = 'foobar1' , pageId = None )
169+ ws .add_file ('GRP2' , mimetype = MIMETYPE_PAGE , ID = 'foobar2' , pageId = 'phys_0001' )
170+ for page_id in [None , 'phys_0001,phys_0002' ]:
171+ with self .subTest (page_id = page_id ):
172+ proc = ZipTestProcessor (workspace = ws , input_file_grp = 'GRP1,GRP2' , page_id = page_id )
173+ assert [(one , two .ID ) for one , two in proc .zip_input_files (require_first = False )] == [(None , 'foobar2' )]
174+ r = self .capture_out_err ()
175+ assert 'ERROR ocrd.processor.base - found no page phys_0001 in file group GRP1' in r .err
176+
177+
if __name__ == "__main__":
    # Executed as a script: pass this file's path to the runner imported from tests.base.
    main(__file__)
0 commit comments