From 2d947444564f8e1670964e1334f9de8ae7aef021 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 12 Nov 2021 18:32:56 +0100 Subject: [PATCH 01/15] make_file_id: no page_id number extraction In make_file_id, if the input file's ID does not contain the input fileGrp, then do not attempt to extract the numerical part of the pageId (which might still clash); but before fallback to purely numerical ID, additionally check if the ID does already contain the pageId: in that case, only append the output fileGrp to that ID. --- ocrd_utils/ocrd_utils/str.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/ocrd_utils/ocrd_utils/str.py b/ocrd_utils/ocrd_utils/str.py index d519a5df1f..e62def29a2 100644 --- a/ocrd_utils/ocrd_utils/str.py +++ b/ocrd_utils/ocrd_utils/str.py @@ -62,24 +62,25 @@ def make_file_id(ocrd_file, output_file_grp): Derive a new file ID for an output file from an existing input file ``ocrd_file`` and the name of the output file's ``fileGrp/@USE``, ``output_file_grp``. If ``ocrd_file``'s ID contains the input file's fileGrp name, then replace it by ``output_file_grp``. + Else if ``ocrd_file``'s ID contains the input file's pageId, then merely append ``output_file_grp``. Otherwise use ``output_file_grp`` together with the position of ``ocrd_file`` within the input fileGrp - (as a fallback counter). Increment counter until there is no more ID conflict. + (as a fallback counter), and increment counter until there is no more ID conflict. 
""" ret = ocrd_file.ID.replace(ocrd_file.fileGrp, output_file_grp) if ret == ocrd_file.ID: - m = re.match(r'.*?(\d{3,}).*', ocrd_file.pageId or '') - if m: - n = int(m.group(1)) + if ocrd_file.pageId in ocrd_file.ID: + # still sufficiently unique + ret = ocrd_file.ID + '_' + output_file_grp else: ids = [f.ID for f in ocrd_file.mets.find_files(fileGrp=ocrd_file.fileGrp, mimetype=ocrd_file.mimetype)] try: n = ids.index(ocrd_file.ID) + 1 except ValueError: n = len(ids) - ret = concat_padded(output_file_grp, n) - while next(ocrd_file.mets.find_files(ID=ret), None): - n += 1 ret = concat_padded(output_file_grp, n) + while next(ocrd_file.mets.find_files(ID=ret), None): + n += 1 + ret = concat_padded(output_file_grp, n) if not REGEX_FILE_ID.fullmatch(ret): ret = ret.replace(':', '_') ret = re.sub(r'^([^a-zA-Z_])', r'id_\1', ret) From f96d3fd40a4189b7aefb66c54eb87a0f2a6412ea Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 12 Nov 2021 18:50:53 +0100 Subject: [PATCH 02/15] make_file_id: be robust against missing pageId --- ocrd_utils/ocrd_utils/str.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_utils/ocrd_utils/str.py b/ocrd_utils/ocrd_utils/str.py index e62def29a2..e27027ee17 100644 --- a/ocrd_utils/ocrd_utils/str.py +++ b/ocrd_utils/ocrd_utils/str.py @@ -68,7 +68,7 @@ def make_file_id(ocrd_file, output_file_grp): """ ret = ocrd_file.ID.replace(ocrd_file.fileGrp, output_file_grp) if ret == ocrd_file.ID: - if ocrd_file.pageId in ocrd_file.ID: + if ocrd_file.pageId and ocrd_file.pageId in ocrd_file.ID: # still sufficiently unique ret = ocrd_file.ID + '_' + output_file_grp else: From db5d36be82a8b44a52b74b047c51afaa3b95386e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 12 Nov 2021 19:32:46 +0100 Subject: [PATCH 03/15] update test_make_file_id_605 (no numerical pageId extraction any more) --- tests/test_utils.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index 7326c1e80f..21db84a280 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -289,7 +289,7 @@ def test_make_file_id_570(self): def test_make_file_id_605(self): """https://github.com/OCR-D/core/pull/605""" mets = OcrdMets.empty_mets() - f = mets.add_file('1:!GRP', ID='FOO_0001', pageId='phys0001') + f = mets.add_file('2:!GRP', ID='FOO_0001', pageId='phys0001') f = mets.add_file('2:!GRP', ID='FOO_0002', pageId='phys0002') self.assertEqual(make_file_id(f, '2:!GRP'), 'id_2_GRP_0002') From f03977f05e1f1ba90d6786bdf414a549fa1ecfc2 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 2 Dec 2021 00:45:59 +0100 Subject: [PATCH 04/15] cli.ocrd-tool: delegate list-resources and show-resource to processor --- ocrd/ocrd/cli/ocrd_tool.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/ocrd/ocrd/cli/ocrd_tool.py b/ocrd/ocrd/cli/ocrd_tool.py index 8bf6bc3aea..8ac03d6730 100644 --- a/ocrd/ocrd/cli/ocrd_tool.py +++ b/ocrd/ocrd/cli/ocrd_tool.py @@ -13,12 +13,12 @@ import click from ocrd.decorators import parameter_option, parameter_override_option -from ocrd.processor import generate_processor_help +from ocrd.processor import Processor from ocrd_utils import ( - set_json_key_value_overrides, - VERSION as OCRD_VERSION, - parse_json_string_with_comments as loads - ) + set_json_key_value_overrides, + VERSION as OCRD_VERSION, + parse_json_string_with_comments as loads +) from ocrd_validators import ParameterValidator, OcrdToolValidator class OcrdToolCtx(): @@ -93,10 +93,24 @@ def ocrd_tool_tool(ctx, tool_name): def ocrd_tool_tool_description(ctx): print(ctx.json['tools'][ctx.tool_name]['description']) +@ocrd_tool_tool.command('list-resources', help="List tool's file resources") +@pass_ocrd_tool +def ocrd_tool_tool_list_resources(ctx): + Processor(None, ocrd_tool=ctx.json['tools'][ctx.tool_name], + list_resources=True) + 
+@ocrd_tool_tool.command('show-resource', help="Dump a tool's file resource") +@click.argument('res_name') +@pass_ocrd_tool +def ocrd_tool_tool_show_resource(ctx, res_name): + Processor(None, ocrd_tool=ctx.json['tools'][ctx.tool_name], + show_resource=res_name) + @ocrd_tool_tool.command('help', help="Generate help for processors") @pass_ocrd_tool def ocrd_tool_tool_params_help(ctx): - print(generate_processor_help(ctx.json['tools'][ctx.tool_name])) + Processor(None, ocrd_tool=ctx.json['tools'][ctx.tool_name], + show_help=True) # ---------------------------------------------------------------------- # ocrd ocrd-tool tool categories From 41e154d5403df32f0a344d752fc90b9fe165e775 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 2 Dec 2021 00:52:06 +0100 Subject: [PATCH 05/15] bashlib: add --list-resources and --show-resource --- ocrd/bashlib/src/dumpjson.bash | 13 +++++++++++++ ocrd/bashlib/src/parse_argv.bash | 2 ++ ocrd/ocrd/lib.bash | 16 ++++++++++++++++ 3 files changed, 31 insertions(+) diff --git a/ocrd/bashlib/src/dumpjson.bash b/ocrd/bashlib/src/dumpjson.bash index 34e06be6d5..633be7f69a 100644 --- a/ocrd/bashlib/src/dumpjson.bash +++ b/ocrd/bashlib/src/dumpjson.bash @@ -13,3 +13,16 @@ ocrd__dumpjson () { ocrd ocrd-tool "$OCRD_TOOL_JSON" tool "$OCRD_TOOL_NAME" dump } +## +## Output file resource content. +## +ocrd__show_resource () { + ocrd ocrd-tool "$OCRD_TOOL_JSON" tool "$OCRD_TOOL_NAME" show-resource "$1" +} + +## +## Output file resources names. 
+## +ocrd__list_resources () { + ocrd ocrd-tool "$OCRD_TOOL_JSON" tool "$OCRD_TOOL_NAME" list-resources +} diff --git a/ocrd/bashlib/src/parse_argv.bash b/ocrd/bashlib/src/parse_argv.bash index 542372166b..8d68a0717a 100644 --- a/ocrd/bashlib/src/parse_argv.bash +++ b/ocrd/bashlib/src/parse_argv.bash @@ -34,6 +34,8 @@ ocrd__parse_argv () { -l|--log-level) ocrd__argv[log_level]=$2 ; shift ;; -h|--help|--usage) ocrd__usage; exit ;; -J|--dump-json) ocrd__dumpjson; exit ;; + -C|--show-resource) ocrd__show_resource "$2"; exit ;; + -L|--list-resources) ocrd__list_resources; exit ;; -p|--parameter) __parameters+=(-p "$2") ; shift ;; -P|--parameter-override) __parameter_overrides+=(-P "$2" "$3") ; shift ; shift ;; -g|--page-id) ocrd__argv[page_id]=$2 ; shift ;; diff --git a/ocrd/ocrd/lib.bash b/ocrd/ocrd/lib.bash index c8573b53bd..5c1074d87c 100644 --- a/ocrd/ocrd/lib.bash +++ b/ocrd/ocrd/lib.bash @@ -72,6 +72,20 @@ ocrd__dumpjson () { ocrd ocrd-tool "$OCRD_TOOL_JSON" tool "$OCRD_TOOL_NAME" dump } +## +## Output file resource content. +## +ocrd__show_resource () { + ocrd ocrd-tool "$OCRD_TOOL_JSON" tool "$OCRD_TOOL_NAME" show-resource "$1" +} + +## +## Output file resources names. 
+## +ocrd__list_resources () { + ocrd ocrd-tool "$OCRD_TOOL_JSON" tool "$OCRD_TOOL_NAME" list-resources +} + # END-INCLUDE # BEGIN-INCLUDE ./src/usage.bash ## ### `ocrd__usage` @@ -122,6 +136,8 @@ ocrd__parse_argv () { -l|--log-level) ocrd__argv[log_level]=$2 ; shift ;; -h|--help|--usage) ocrd__usage; exit ;; -J|--dump-json) ocrd__dumpjson; exit ;; + -C|--show-resource) ocrd__show_resource "$2"; exit ;; + -L|--list-resources) ocrd__list_resources; exit ;; -p|--parameter) __parameters+=(-p "$2") ; shift ;; -P|--parameter-override) __parameter_overrides+=(-P "$2" "$3") ; shift ; shift ;; -g|--page-id) ocrd__argv[page_id]=$2 ; shift ;; From 6da3db5e8587fafd8afa5458007b0b7f571a81ef Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 2 Dec 2021 08:36:11 +0100 Subject: [PATCH 06/15] cli.bashlib: add input-files (delegating to Processor) --- ocrd/ocrd/cli/bashlib.py | 47 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/ocrd/ocrd/cli/bashlib.py b/ocrd/ocrd/cli/bashlib.py index 92f5722c03..003c876d04 100644 --- a/ocrd/ocrd/cli/bashlib.py +++ b/ocrd/ocrd/cli/bashlib.py @@ -8,6 +8,7 @@ """ from __future__ import print_function import sys +from os.path import isfile import click from ocrd.constants import BASHLIB_FILENAME @@ -15,6 +16,18 @@ import ocrd_utils.constants import ocrd_models.constants import ocrd_validators.constants +from ocrd.decorators import ( + parameter_option, + parameter_override_option, + ocrd_loglevel +) +from ocrd_utils import ( + is_local_filename, + get_local_filename, + initLogging +) +from ocrd.resolver import Resolver +from ocrd.processor import Processor # ---------------------------------------------------------------------- # ocrd bashlib @@ -61,3 +74,37 @@ def bashlib_constants(name): print("[%s]=%s" % (key, val[key]), end=' ') else: print(val) + +@bashlib_cli.command('input-files') +@click.option('-m', '--mets', help="METS to process", default="mets.xml") +@click.option('-w', '--working-dir', 
help="Working Directory") +@click.option('-I', '--input-file-grp', help='File group(s) used as input.', default='INPUT') +# repeat some other processor options for convenience (will be ignored here) +@click.option('-O', '--output-file-grp', help='File group(s) used as output.', default='OUTPUT') +@click.option('-g', '--page-id', help="ID(s) of the pages to process") +@click.option('--overwrite', is_flag=True, default=False, help="Remove output pages/images if they already exist") +@parameter_option +@parameter_override_option +@ocrd_loglevel +def bashlib_input_files(**kwargs): + """ + List input files for processing + """ + initLogging() + mets = kwargs.pop('mets') + working_dir = kwargs.pop('working_dir') + if is_local_filename(mets) and not isfile(get_local_filename(mets)): + msg = "File does not exist: %s" % mets + raise Exception(msg) + resolver = Resolver() + workspace = resolver.workspace_from_url(mets, working_dir) + processor = Processor(workspace, + ocrd_tool=None, + page_id=kwargs['page_id'], + input_file_grp=kwargs['input_file_grp'], + output_file_grp=kwargs['output_file_grp']) + for input_file in processor.input_files: + for field in ['url', 'ID', 'mimetype', 'pageId']: + # make this bash-friendly (show initialization for associative array) + print("[%s]='%s'" % (field, getattr(input_file, field)), end=' ') + print() From 851ab5b98b5d749c1cd8e58cc4ac392ebedc7e79 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 2 Dec 2021 19:02:47 +0100 Subject: [PATCH 07/15] cli.bashlib.input-files: also output make_file_id for each input file --- ocrd/ocrd/cli/bashlib.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/ocrd/ocrd/cli/bashlib.py b/ocrd/ocrd/cli/bashlib.py index 003c876d04..f075ac6487 100644 --- a/ocrd/ocrd/cli/bashlib.py +++ b/ocrd/ocrd/cli/bashlib.py @@ -24,7 +24,8 @@ from ocrd_utils import ( is_local_filename, get_local_filename, - initLogging + initLogging, + make_file_id ) from ocrd.resolver import Resolver 
from ocrd.processor import Processor @@ -89,6 +90,13 @@ def bashlib_constants(name): def bashlib_input_files(**kwargs): """ List input files for processing + + Instantiate a processor and workspace from the given processing options. + Then loop through the input files of the input fileGrp, and for each one, + print its `url`, `ID`, `mimetype` and `pageId`, as well as its recommended + `outputFileId` (from ``make_file_id``). + + (The printing format is one associative array initializer per line.) """ initLogging() mets = kwargs.pop('mets') @@ -107,4 +115,4 @@ def bashlib_input_files(**kwargs): for field in ['url', 'ID', 'mimetype', 'pageId']: # make this bash-friendly (show initialization for associative array) print("[%s]='%s'" % (field, getattr(input_file, field)), end=' ') - print() + print("[outputFileId]='%s'" % make_file_id(input_file, kwargs['output_file_grp'])) From 5e5a1d8577b086cd1d5104b261a68efbdcfd370f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 2 Dec 2021 21:01:20 +0100 Subject: [PATCH 08/15] bashlib ocrd__wrap: use cli.bashlib input-files --- ocrd/bashlib/src/wrap.bash | 16 ++++++++++++++++ ocrd/ocrd/cli/bashlib.py | 2 +- ocrd/ocrd/lib.bash | 16 ++++++++++++++++ 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/ocrd/bashlib/src/wrap.bash b/ocrd/bashlib/src/wrap.bash index aaa2cee99f..e46f5b90d3 100644 --- a/ocrd/bashlib/src/wrap.bash +++ b/ocrd/bashlib/src/wrap.bash @@ -27,4 +27,20 @@ ocrd__wrap () { ocrd__parse_argv "$@" + i=0 + declare -ag ocrd__files + while read line; do + eval declare -Ag "ocrd__file$i=( $line )" + eval "ocrd__files[$i]=ocrd__file$i" + let ++i + done < <(ocrd bashlib input-files \ + -m "${ocrd__argv[mets_file]}" \ + -I "${ocrd__argv[input_file_grp]}" \ + -O "${ocrd__argv[output_file_grp]}" \ + ${ocrd__argv[page_id]:+-g} ${ocrd__argv[page_id]:-}) +} + +# usage: pageId=$(ocrd__input_file 3 pageId) +ocrd__input_file() { + eval echo "\${${ocrd__files[$1]}[$2]}" } diff --git a/ocrd/ocrd/cli/bashlib.py 
b/ocrd/ocrd/cli/bashlib.py index f075ac6487..ed3d8c3344 100644 --- a/ocrd/ocrd/cli/bashlib.py +++ b/ocrd/ocrd/cli/bashlib.py @@ -80,8 +80,8 @@ def bashlib_constants(name): @click.option('-m', '--mets', help="METS to process", default="mets.xml") @click.option('-w', '--working-dir', help="Working Directory") @click.option('-I', '--input-file-grp', help='File group(s) used as input.', default='INPUT') -# repeat some other processor options for convenience (will be ignored here) @click.option('-O', '--output-file-grp', help='File group(s) used as output.', default='OUTPUT') +# repeat some other processor options for convenience (will be ignored here) @click.option('-g', '--page-id', help="ID(s) of the pages to process") @click.option('--overwrite', is_flag=True, default=False, help="Remove output pages/images if they already exist") @parameter_option diff --git a/ocrd/ocrd/lib.bash b/ocrd/ocrd/lib.bash index 5c1074d87c..8fbb37d9cf 100644 --- a/ocrd/ocrd/lib.bash +++ b/ocrd/ocrd/lib.bash @@ -225,6 +225,22 @@ ocrd__wrap () { ocrd__parse_argv "$@" + i=0 + declare -ag ocrd__files + while read line; do + eval declare -Ag "ocrd__file$i=( $line )" + eval "ocrd__files[$i]=ocrd__file$i" + let ++i + done < <(ocrd bashlib input-files \ + -m "${ocrd__argv[mets_file]}" \ + -I "${ocrd__argv[input_file_grp]}" \ + -O "${ocrd__argv[output_file_grp]}" \ + ${ocrd__argv[page_id]:+-g} ${ocrd__argv[page_id]:-}) +} + +# usage: pageId=$(ocrd__input_file 3 pageId) +ocrd__input_file() { + eval echo "\${${ocrd__files[$1]}[$2]}" } # END-INCLUDE From d135c2f1f686350d3aad31cc950bc543754240df Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 2 Dec 2021 21:08:39 +0100 Subject: [PATCH 09/15] :package: v2.29.0 --- CHANGELOG.md | 7 +++++++ ocrd_utils/setup.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8fc87a8225..9c930133ba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,13 @@ Versioned according to [Semantic 
Versioning](http://semver.org/). ## Unreleased +Added: + + * `ocrd ocrd-tool`: wrap `list-resources` and `show-resource` from `Processor` + * bashlib `ocrd__parse_argv`: add `--list-resources` and `--show-resource`, #751 + * `ocrd bashlib`: wrap `input-files` from `Processor` and `make_file_id` + * bashlib `ocrd__wrap`: offer `ocrd__files` and `ocrd__input_file`, #571 + ## [2.28.0] - 2021-11-30 Added: diff --git a/ocrd_utils/setup.py b/ocrd_utils/setup.py index b398d6286d..92c8de5ccc 100644 --- a/ocrd_utils/setup.py +++ b/ocrd_utils/setup.py @@ -5,7 +5,7 @@ setup( name='ocrd_utils', - version='2.28.0', + version='2.29.0', description='OCR-D framework - shared code, helpers, constants', long_description=open('README.md').read(), long_description_content_type='text/markdown', From cdca67ef04fb21cf591573d4ec8dbe7bd6861947 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Mon, 6 Dec 2021 14:49:19 +0100 Subject: [PATCH 10/15] make_file_id: in pageId fallback, use grp+id instead of id+grp Co-authored-by: Konstantin Baierer --- ocrd_utils/ocrd_utils/str.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_utils/ocrd_utils/str.py b/ocrd_utils/ocrd_utils/str.py index e27027ee17..0a10e43f64 100644 --- a/ocrd_utils/ocrd_utils/str.py +++ b/ocrd_utils/ocrd_utils/str.py @@ -70,7 +70,7 @@ def make_file_id(ocrd_file, output_file_grp): if ret == ocrd_file.ID: if ocrd_file.pageId and ocrd_file.pageId in ocrd_file.ID: # still sufficiently unique - ret = ocrd_file.ID + '_' + output_file_grp + ret = output_file_grp + '_' + ocrd_file.ID else: ids = [f.ID for f in ocrd_file.mets.find_files(fileGrp=ocrd_file.fileGrp, mimetype=ocrd_file.mimetype)] try: From 26d3bc5c994144a13b56bf3ed98cba490771a7a3 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 7 Dec 2021 18:48:31 +0100 Subject: [PATCH 11/15] ocrd_utils.make_file_id: add test for multiple numbers in ID --- tests/test_utils.py | 10 ++++++++++ 1 file changed, 
10 insertions(+) diff --git a/tests/test_utils.py b/tests/test_utils.py index 21db84a280..2e848e61e1 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -293,6 +293,16 @@ def test_make_file_id_605(self): f = mets.add_file('2:!GRP', ID='FOO_0002', pageId='phys0002') self.assertEqual(make_file_id(f, '2:!GRP'), 'id_2_GRP_0002') + def test_make_file_id_744(self): + """ + https://github.com/OCR-D/core/pull/744 + > Often file IDs have two numbers, one of which will clash. In that case only the numerical fallback works. + """ + mets = OcrdMets.empty_mets() + f = mets.add_file('2:!GRP', ID='img1796-97_00000024_img', pageId='phys0024') + f = mets.add_file('2:!GRP', ID='img1796-97_00000025_img', pageId='phys0025') + self.assertEqual(make_file_id(f, '2:!GRP'), 'id_2_GRP_0025') + def test_generate_range(self): assert generate_range('PHYS_0001', 'PHYS_0005') == ['PHYS_0001', 'PHYS_0002', 'PHYS_0003', 'PHYS_0004', 'PHYS_0005'] with self.assertRaisesRegex(ValueError, 'Unable to generate range'): From 1ec9e6b8b6246ab4ab8976f816c423d3528fef3f Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 7 Dec 2021 18:51:11 +0100 Subject: [PATCH 12/15] :memo: changelog --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9c930133ba..1fbefa5da0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,10 @@ Versioned according to [Semantic Versioning](http://semver.org/). 
## Unreleased +Changed: + + * `ocrd_utils.make_file_id`: only increment number in ID if it is the page ID, #744 + Added: * `ocrd ocrd-tool`: wrap `list-resources` and `show-resource` from `Processor` From 93b4d14c0988dc66a4f3564f250af01a4bd12c79 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 7 Dec 2021 19:16:12 +0100 Subject: [PATCH 13/15] Update spec to 3.15.0 --- repo/spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repo/spec b/repo/spec index be209674d7..2eb3940500 160000 --- a/repo/spec +++ b/repo/spec @@ -1 +1 @@ -Subproject commit be209674d7e980b97c196ec6f2f667d59b8a3ef3 +Subproject commit 2eb3940500ee39e02c09cc926bd5dc8ac76818f7 From f291b381a2762dac471eae8e6a0be1ee030eceb0 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 7 Dec 2021 19:16:58 +0100 Subject: [PATCH 14/15] OcrdMets.add_file: Enforce mets:fileGrp/@USE to be valid xs:ID, #746 --- CHANGELOG.md | 4 ++++ ocrd_models/ocrd_models/ocrd_mets.py | 4 +++- tests/model/test_ocrd_mets.py | 6 ++++++ tests/test_utils.py | 6 +++--- 4 files changed, 16 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8fc87a8225..c5fafedff9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,10 @@ Versioned according to [Semantic Versioning](http://semver.org/). 
## Unreleased +Changed: + + * `OcrdMets.add_file`: `mets:fileGrp/@USE` must be valid `xs:ID`, #746 + ## [2.28.0] - 2021-11-30 Added: diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index 5a6dbabbd2..8d7fbb91fd 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -296,7 +296,9 @@ def add_file(self, fileGrp, mimetype=None, url=None, ID=None, pageId=None, force if not fileGrp: raise ValueError("Must set fileGrp of the mets:file") if not REGEX_FILE_ID.fullmatch(ID): - raise ValueError("Invalid syntax for mets:file/@ID %s" % ID) + raise ValueError("Invalid syntax for mets:file/@ID %s (not an xs:ID)" % ID) + if not REGEX_FILE_ID.fullmatch(fileGrp): + raise ValueError("Invalid syntax for mets:fileGrp/@USE %s (not an xs:ID)" % ID) el_fileGrp = self._tree.getroot().find(".//mets:fileGrp[@USE='%s']" % (fileGrp), NS) if el_fileGrp is None: el_fileGrp = self.add_file_group(fileGrp) diff --git a/tests/model/test_ocrd_mets.py b/tests/model/test_ocrd_mets.py index 986cf4e282..9e9183a01a 100644 --- a/tests/model/test_ocrd_mets.py +++ b/tests/model/test_ocrd_mets.py @@ -255,5 +255,11 @@ def test_merge(self): self.mets.merge(other_mets, fileGrp_mapping={'OCR-D-IMG': 'FOO'}) assert len(self.mets.file_groups) == 18 + def test_invalid_filegrp(self): + """https://github.com/OCR-D/core/issues/746""" + mets = OcrdMets(content="") + with self.assertRaisesRegex(ValueError, "Invalid syntax for mets:fileGrp/@USE"): + mets.add_file('1:! 
bad filegrp', ID="foo123", pageId="foobar") + if __name__ == '__main__': main(__file__) diff --git a/tests/test_utils.py b/tests/test_utils.py index 7326c1e80f..04d09f8f59 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -289,9 +289,9 @@ def test_make_file_id_570(self): def test_make_file_id_605(self): """https://github.com/OCR-D/core/pull/605""" mets = OcrdMets.empty_mets() - f = mets.add_file('1:!GRP', ID='FOO_0001', pageId='phys0001') - f = mets.add_file('2:!GRP', ID='FOO_0002', pageId='phys0002') - self.assertEqual(make_file_id(f, '2:!GRP'), 'id_2_GRP_0002') + f = mets.add_file('GRP1', ID='FOO_0001', pageId='phys0001') + f = mets.add_file('GRP2', ID='FOO_0002', pageId='phys0002') + self.assertEqual(make_file_id(f, 'GRP2'), 'GRP2_0002') def test_generate_range(self): assert generate_range('PHYS_0001', 'PHYS_0005') == ['PHYS_0001', 'PHYS_0002', 'PHYS_0003', 'PHYS_0004', 'PHYS_0005'] From 8e809d04f09910009a3b7514d714f55ac0ab3332 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 8 Dec 2021 11:33:52 +0100 Subject: [PATCH 15/15] Update CHANGELOG.md Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f1b26efdfb..768019e2d7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,7 @@ Versioned according to [Semantic Versioning](http://semver.org/). Changed: - * `ocrd_utils.make_file_id`: only increment number in ID if it is the page ID, #744 + * `ocrd_utils.make_file_id`: combine with output fileGrp if input has pageId, but don't extract numbers, #744 * `OcrdMets.add_file`: `mets:fileGrp/@USE` must be valid `xs:ID`, #746 Added: