diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8fc87a8225..768019e2d7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,18 @@ Versioned according to [Semantic Versioning](http://semver.org/).
 
 ## Unreleased
 
+Changed:
+
+  * `ocrd_utils.make_file_id`: combine with output fileGrp if input has pageId, but don't extract numbers, #744
+  * `OcrdMets.add_file`: `mets:fileGrp/@USE` must be valid `xs:ID`, #746
+
+Added:
+
+  * `ocrd ocrd-tool`: wrap `list-resources` and `show-resource` from `Processor`
+  * bashlib `ocrd__parse_argv`: add `--list-resources` and `--show-resource`, #751
+  * `ocrd bashlib`: wrap `input-files` from `Processor` and `make_file_id`
+  * bashlib `ocrd__wrap`: offer `ocrd__files` and `ocrd__input_file`, #571
+
 ## [2.28.0] - 2021-11-30
 
 Added:
diff --git a/ocrd/bashlib/src/dumpjson.bash b/ocrd/bashlib/src/dumpjson.bash
index 34e06be6d5..633be7f69a 100644
--- a/ocrd/bashlib/src/dumpjson.bash
+++ b/ocrd/bashlib/src/dumpjson.bash
@@ -13,3 +13,16 @@ ocrd__dumpjson () {
     ocrd ocrd-tool "$OCRD_TOOL_JSON" tool "$OCRD_TOOL_NAME" dump
 }
 
+##
+## Output file resource content.
+##
+ocrd__show_resource () {
+    ocrd ocrd-tool "$OCRD_TOOL_JSON" tool "$OCRD_TOOL_NAME" show-resource "$1"
+}
+
+##
+## Output file resource names.
+##
+ocrd__list_resources () {
+    ocrd ocrd-tool "$OCRD_TOOL_JSON" tool "$OCRD_TOOL_NAME" list-resources
+}
diff --git a/ocrd/bashlib/src/parse_argv.bash b/ocrd/bashlib/src/parse_argv.bash
index 542372166b..8d68a0717a 100644
--- a/ocrd/bashlib/src/parse_argv.bash
+++ b/ocrd/bashlib/src/parse_argv.bash
@@ -34,6 +34,8 @@ ocrd__parse_argv () {
             -l|--log-level) ocrd__argv[log_level]=$2 ; shift ;;
             -h|--help|--usage) ocrd__usage; exit ;;
             -J|--dump-json) ocrd__dumpjson; exit ;;
+            -C|--show-resource) ocrd__show_resource "$2"; exit ;;
+            -L|--list-resources) ocrd__list_resources; exit ;;
             -p|--parameter) __parameters+=(-p "$2") ; shift ;;
             -P|--parameter-override) __parameter_overrides+=(-P "$2" "$3") ; shift ; shift ;;
             -g|--page-id) ocrd__argv[page_id]=$2 ; shift ;;
diff --git a/ocrd/bashlib/src/wrap.bash b/ocrd/bashlib/src/wrap.bash
index aaa2cee99f..e46f5b90d3 100644
--- a/ocrd/bashlib/src/wrap.bash
+++ b/ocrd/bashlib/src/wrap.bash
@@ -27,4 +27,20 @@ ocrd__wrap () {
 
     ocrd__parse_argv "$@"
 
+    i=0
+    declare -ag ocrd__files
+    while read -r line; do
+        eval declare -Ag "ocrd__file$i=( $line )"
+        eval "ocrd__files[$i]=ocrd__file$i"
+        let ++i
+    done < <(ocrd bashlib input-files \
+        -m "${ocrd__argv[mets_file]}" \
+        -I "${ocrd__argv[input_file_grp]}" \
+        -O "${ocrd__argv[output_file_grp]}" \
+        ${ocrd__argv[page_id]:+-g} ${ocrd__argv[page_id]:-})
+}
+
+# usage: pageId=$(ocrd__input_file 3 pageId)
+ocrd__input_file() {
+    eval echo "\${${ocrd__files[$1]}[$2]}"
 }
diff --git a/ocrd/ocrd/cli/bashlib.py b/ocrd/ocrd/cli/bashlib.py
index 92f5722c03..ed3d8c3344 100644
--- a/ocrd/ocrd/cli/bashlib.py
+++ b/ocrd/ocrd/cli/bashlib.py
@@ -8,6 +8,7 @@
 """
 from __future__ import print_function
 import sys
+from os.path import isfile
 import click
 
 from ocrd.constants import BASHLIB_FILENAME
@@ -15,6 +16,19 @@
 import ocrd_utils.constants
 import ocrd_models.constants
 import ocrd_validators.constants
+from ocrd.decorators import (
+    parameter_option,
+    parameter_override_option,
+    ocrd_loglevel
+)
+from ocrd_utils import (
+    is_local_filename,
+    get_local_filename,
+    initLogging,
+    make_file_id
+)
+from ocrd.resolver import Resolver
+from ocrd.processor import Processor
 
 # ----------------------------------------------------------------------
 # ocrd bashlib
@@ -61,3 +75,44 @@ def bashlib_constants(name):
             print("[%s]=%s" % (key, val[key]), end=' ')
     else:
         print(val)
+
+@bashlib_cli.command('input-files')
+@click.option('-m', '--mets', help="METS to process", default="mets.xml")
+@click.option('-w', '--working-dir', help="Working Directory")
+@click.option('-I', '--input-file-grp', help='File group(s) used as input.', default='INPUT')
+@click.option('-O', '--output-file-grp', help='File group(s) used as output.', default='OUTPUT')
+# repeat some other processor options for convenience (will be ignored here)
+@click.option('-g', '--page-id', help="ID(s) of the pages to process")
+@click.option('--overwrite', is_flag=True, default=False, help="Remove output pages/images if they already exist")
+@parameter_option
+@parameter_override_option
+@ocrd_loglevel
+def bashlib_input_files(**kwargs):
+    """
+    List input files for processing
+
+    Instantiate a processor and workspace from the given processing options.
+    Then loop through the input files of the input fileGrp, and for each one,
+    print its `url`, `ID`, `mimetype` and `pageId`, as well as its recommended
+    `outputFileId` (from ``make_file_id``).
+
+    (The printing format is one associative array initializer per line.)
+    """
+    initLogging()
+    mets = kwargs.pop('mets')
+    working_dir = kwargs.pop('working_dir')
+    if is_local_filename(mets) and not isfile(get_local_filename(mets)):
+        msg = "File does not exist: %s" % mets
+        raise Exception(msg)
+    resolver = Resolver()
+    workspace = resolver.workspace_from_url(mets, working_dir)
+    processor = Processor(workspace,
+                          ocrd_tool=None,
+                          page_id=kwargs['page_id'],
+                          input_file_grp=kwargs['input_file_grp'],
+                          output_file_grp=kwargs['output_file_grp'])
+    for input_file in processor.input_files:
+        for field in ['url', 'ID', 'mimetype', 'pageId']:
+            # make this bash-friendly (show initialization for associative array)
+            print("[%s]='%s'" % (field, getattr(input_file, field)), end=' ')
+        print("[outputFileId]='%s'" % make_file_id(input_file, kwargs['output_file_grp']))
diff --git a/ocrd/ocrd/cli/ocrd_tool.py b/ocrd/ocrd/cli/ocrd_tool.py
index 8bf6bc3aea..8ac03d6730 100644
--- a/ocrd/ocrd/cli/ocrd_tool.py
+++ b/ocrd/ocrd/cli/ocrd_tool.py
@@ -13,12 +13,12 @@
 import click
 
 from ocrd.decorators import parameter_option, parameter_override_option
-from ocrd.processor import generate_processor_help
+from ocrd.processor import Processor
 from ocrd_utils import (
-        set_json_key_value_overrides,
-        VERSION as OCRD_VERSION,
-        parse_json_string_with_comments as loads
-        )
+    set_json_key_value_overrides,
+    VERSION as OCRD_VERSION,
+    parse_json_string_with_comments as loads
+)
 from ocrd_validators import ParameterValidator, OcrdToolValidator
 
 class OcrdToolCtx():
@@ -93,10 +93,24 @@ def ocrd_tool_tool(ctx, tool_name):
 def ocrd_tool_tool_description(ctx):
     print(ctx.json['tools'][ctx.tool_name]['description'])
 
+@ocrd_tool_tool.command('list-resources', help="List tool's file resources")
+@pass_ocrd_tool
+def ocrd_tool_tool_list_resources(ctx):
+    Processor(None, ocrd_tool=ctx.json['tools'][ctx.tool_name],
+              list_resources=True)
+
+@ocrd_tool_tool.command('show-resource', help="Dump a tool's file resource")
+@click.argument('res_name')
+@pass_ocrd_tool
+def ocrd_tool_tool_show_resource(ctx, res_name):
+    Processor(None, ocrd_tool=ctx.json['tools'][ctx.tool_name],
+              show_resource=res_name)
+
 @ocrd_tool_tool.command('help', help="Generate help for processors")
 @pass_ocrd_tool
 def ocrd_tool_tool_params_help(ctx):
-    print(generate_processor_help(ctx.json['tools'][ctx.tool_name]))
+    Processor(None, ocrd_tool=ctx.json['tools'][ctx.tool_name],
+              show_help=True)
 
 # ----------------------------------------------------------------------
 # ocrd ocrd-tool tool categories
diff --git a/ocrd/ocrd/lib.bash b/ocrd/ocrd/lib.bash
index c8573b53bd..8fbb37d9cf 100644
--- a/ocrd/ocrd/lib.bash
+++ b/ocrd/ocrd/lib.bash
@@ -72,6 +72,20 @@ ocrd__dumpjson () {
     ocrd ocrd-tool "$OCRD_TOOL_JSON" tool "$OCRD_TOOL_NAME" dump
 }
 
+##
+## Output file resource content.
+##
+ocrd__show_resource () {
+    ocrd ocrd-tool "$OCRD_TOOL_JSON" tool "$OCRD_TOOL_NAME" show-resource "$1"
+}
+
+##
+## Output file resource names.
+##
+ocrd__list_resources () {
+    ocrd ocrd-tool "$OCRD_TOOL_JSON" tool "$OCRD_TOOL_NAME" list-resources
+}
+
 # END-INCLUDE
 # BEGIN-INCLUDE ./src/usage.bash
 ## ### `ocrd__usage`
@@ -122,6 +136,8 @@ ocrd__parse_argv () {
             -l|--log-level) ocrd__argv[log_level]=$2 ; shift ;;
             -h|--help|--usage) ocrd__usage; exit ;;
             -J|--dump-json) ocrd__dumpjson; exit ;;
+            -C|--show-resource) ocrd__show_resource "$2"; exit ;;
+            -L|--list-resources) ocrd__list_resources; exit ;;
             -p|--parameter) __parameters+=(-p "$2") ; shift ;;
             -P|--parameter-override) __parameter_overrides+=(-P "$2" "$3") ; shift ; shift ;;
             -g|--page-id) ocrd__argv[page_id]=$2 ; shift ;;
@@ -209,6 +225,22 @@ ocrd__wrap () {
 
     ocrd__parse_argv "$@"
 
+    i=0
+    declare -ag ocrd__files
+    while read -r line; do
+        eval declare -Ag "ocrd__file$i=( $line )"
+        eval "ocrd__files[$i]=ocrd__file$i"
+        let ++i
+    done < <(ocrd bashlib input-files \
+        -m "${ocrd__argv[mets_file]}" \
+        -I "${ocrd__argv[input_file_grp]}" \
+        -O "${ocrd__argv[output_file_grp]}" \
+        ${ocrd__argv[page_id]:+-g} ${ocrd__argv[page_id]:-})
+}
+
+# usage: pageId=$(ocrd__input_file 3 pageId)
+ocrd__input_file() {
+    eval echo "\${${ocrd__files[$1]}[$2]}"
 }
 
 # END-INCLUDE
diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py
index 5a6dbabbd2..8d7fbb91fd 100644
--- a/ocrd_models/ocrd_models/ocrd_mets.py
+++ b/ocrd_models/ocrd_models/ocrd_mets.py
@@ -296,7 +296,9 @@ def add_file(self, fileGrp, mimetype=None, url=None, ID=None, pageId=None, force
         if not fileGrp:
             raise ValueError("Must set fileGrp of the mets:file")
         if not REGEX_FILE_ID.fullmatch(ID):
-            raise ValueError("Invalid syntax for mets:file/@ID %s" % ID)
+            raise ValueError("Invalid syntax for mets:file/@ID %s (not an xs:ID)" % ID)
+        if not REGEX_FILE_ID.fullmatch(fileGrp):
+            raise ValueError("Invalid syntax for mets:fileGrp/@USE %s (not an xs:ID)" % fileGrp)
         el_fileGrp = self._tree.getroot().find(".//mets:fileGrp[@USE='%s']" % (fileGrp), NS)
         if el_fileGrp is None:
             el_fileGrp = self.add_file_group(fileGrp)
diff --git a/ocrd_utils/ocrd_utils/str.py b/ocrd_utils/ocrd_utils/str.py
index d519a5df1f..0a10e43f64 100644
--- a/ocrd_utils/ocrd_utils/str.py
+++ b/ocrd_utils/ocrd_utils/str.py
@@ -62,24 +62,25 @@ def make_file_id(ocrd_file, output_file_grp):
     Derive a new file ID for an output file from an existing input file ``ocrd_file``
     and the name of the output file's ``fileGrp/@USE``, ``output_file_grp``.
     If ``ocrd_file``'s ID contains the input file's fileGrp name, then replace it by ``output_file_grp``.
+    Else if ``ocrd_file``'s ID contains the input file's pageId, then merely prepend ``output_file_grp``.
     Otherwise use ``output_file_grp`` together with the position of ``ocrd_file`` within the input fileGrp
-    (as a fallback counter). Increment counter until there is no more ID conflict.
+    (as a fallback counter), and increment counter until there is no more ID conflict.
     """
     ret = ocrd_file.ID.replace(ocrd_file.fileGrp, output_file_grp)
     if ret == ocrd_file.ID:
-        m = re.match(r'.*?(\d{3,}).*', ocrd_file.pageId or '')
-        if m:
-            n = int(m.group(1))
+        if ocrd_file.pageId and ocrd_file.pageId in ocrd_file.ID:
+            # still sufficiently unique
+            ret = output_file_grp + '_' + ocrd_file.ID
         else:
             ids = [f.ID for f in ocrd_file.mets.find_files(fileGrp=ocrd_file.fileGrp, mimetype=ocrd_file.mimetype)]
             try:
                 n = ids.index(ocrd_file.ID) + 1
             except ValueError:
                 n = len(ids)
-        ret = concat_padded(output_file_grp, n)
-        while next(ocrd_file.mets.find_files(ID=ret), None):
-            n += 1
             ret = concat_padded(output_file_grp, n)
+            while next(ocrd_file.mets.find_files(ID=ret), None):
+                n += 1
+                ret = concat_padded(output_file_grp, n)
     if not REGEX_FILE_ID.fullmatch(ret):
         ret = ret.replace(':', '_')
         ret = re.sub(r'^([^a-zA-Z_])', r'id_\1', ret)
diff --git a/ocrd_utils/setup.py b/ocrd_utils/setup.py
index b398d6286d..92c8de5ccc 100644
--- a/ocrd_utils/setup.py
+++ b/ocrd_utils/setup.py
@@ -5,7 +5,7 @@
 
 setup(
     name='ocrd_utils',
-    version='2.28.0',
+    version='2.29.0',
     description='OCR-D framework - shared code, helpers, constants',
     long_description=open('README.md').read(),
     long_description_content_type='text/markdown',
diff --git a/repo/spec b/repo/spec
index be209674d7..2eb3940500 160000
--- a/repo/spec
+++ b/repo/spec
@@ -1 +1 @@
-Subproject commit be209674d7e980b97c196ec6f2f667d59b8a3ef3
+Subproject commit 2eb3940500ee39e02c09cc926bd5dc8ac76818f7
diff --git a/tests/model/test_ocrd_mets.py b/tests/model/test_ocrd_mets.py
index 986cf4e282..9e9183a01a 100644
--- a/tests/model/test_ocrd_mets.py
+++ b/tests/model/test_ocrd_mets.py
@@ -255,5 +255,11 @@ def test_merge(self):
         self.mets.merge(other_mets, fileGrp_mapping={'OCR-D-IMG': 'FOO'})
         assert len(self.mets.file_groups) == 18
 
+    def test_invalid_filegrp(self):
+        """https://github.com/OCR-D/core/issues/746"""
+        mets = OcrdMets(content="")
+        with self.assertRaisesRegex(ValueError, "Invalid syntax for mets:fileGrp/@USE"):
+            mets.add_file('1:! bad filegrp', ID="foo123", pageId="foobar")
+
 if __name__ == '__main__':
     main(__file__)
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 7326c1e80f..e1da91a586 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -289,9 +289,19 @@ def test_make_file_id_570(self):
     def test_make_file_id_605(self):
         """https://github.com/OCR-D/core/pull/605"""
         mets = OcrdMets.empty_mets()
-        f = mets.add_file('1:!GRP', ID='FOO_0001', pageId='phys0001')
-        f = mets.add_file('2:!GRP', ID='FOO_0002', pageId='phys0002')
-        self.assertEqual(make_file_id(f, '2:!GRP'), 'id_2_GRP_0002')
+        f = mets.add_file('GRP1', ID='FOO_0001', pageId='phys0001')
+        f = mets.add_file('GRP2', ID='FOO_0002', pageId='phys0002')
+        self.assertEqual(make_file_id(f, 'GRP2'), 'GRP2_0001')
+
+    def test_make_file_id_744(self):
+        """
+        https://github.com/OCR-D/core/pull/744
+        > Often file IDs have two numbers, one of which will clash. In that case only the numerical fallback works.
+        """
+        mets = OcrdMets.empty_mets()
+        f = mets.add_file('GRP2', ID='img1796-97_00000024_img', pageId='phys0024')
+        f = mets.add_file('GRP2', ID='img1796-97_00000025_img', pageId='phys0025')
+        self.assertEqual(make_file_id(f, 'GRP2'), 'GRP2_0002')
 
     def test_generate_range(self):
         assert generate_range('PHYS_0001', 'PHYS_0005') == ['PHYS_0001', 'PHYS_0002', 'PHYS_0003', 'PHYS_0004', 'PHYS_0005']