From 7e7752ae9a82756efb73110c0e0b73203d77835c Mon Sep 17 00:00:00 2001 From: simleo Date: Wed, 31 May 2023 17:33:28 +0200 Subject: [PATCH 1/9] add option to remap data entity names to the original ones --- src/runcrate/cli.py | 9 ++++-- src/runcrate/convert.py | 22 ++++++++++---- tests/test_cwlprov_crate_builder.py | 46 +++++++++++++++++++++++++++++ 3 files changed, 70 insertions(+), 7 deletions(-) diff --git a/src/runcrate/cli.py b/src/runcrate/cli.py index e8010d2..4907ad1 100644 --- a/src/runcrate/cli.py +++ b/src/runcrate/cli.py @@ -56,7 +56,12 @@ def cli(): type=click.Path(exists=True, dir_okay=False, readable=True, path_type=Path), help="path to a README file (should be README.md in Markdown format)", ) -def convert(root, output, license, workflow_name, readme): +@click.option( + "--remap-names", + help="remap file/dir names to the original ones (MAY LEAD TO CLASHES!)", + is_flag=True +) +def convert(root, output, license, workflow_name, readme, remap_names): """\ Convert a CWLProv RO bundle into a Workflow Run RO-Crate. @@ -64,7 +69,7 @@ def convert(root, output, license, workflow_name, readme): """ if not output: output = Path(f"{root.name}.crate.zip") - builder = ProvCrateBuilder(root, workflow_name, license, readme) + builder = ProvCrateBuilder(root, workflow_name, license, readme, remap_names=remap_names) crate = builder.build() if output.suffix == ".zip": crate.write_zip(output) diff --git a/src/runcrate/convert.py b/src/runcrate/convert.py index a37ec6b..36ee319 100644 --- a/src/runcrate/convert.py +++ b/src/runcrate/convert.py @@ -192,7 +192,8 @@ def get_workflow(wf_path): class ProvCrateBuilder: - def __init__(self, root, workflow_name=None, license=None, readme=None): + def __init__(self, root, workflow_name=None, license=None, readme=None, + remap_names=False): self.root = Path(root) self.workflow_name = workflow_name self.license = license @@ -213,6 +214,7 @@ def __init__(self, root, workflow_name=None, license=None, readme=None): # map source files to destination files self.file_map = {} self.manifest = self._get_manifest() + self.remap_names = remap_names @staticmethod def _get_step_maps(cwl_defs): @@ -583,14 +585,19 @@ def convert_param(self, prov_param, crate, convert_secondary=True, parent=None): return action_p if "wf4ever:File" in type_names: hash_ = self.hashes[prov_param.id.localpart] - dest = Path(parent.id if parent else "") / hash_ + if self.remap_names: + basename = getattr(prov_param, "basename", hash_) + else: + basename = hash_ + dest = Path(parent.id if parent else "") / basename action_p = crate.dereference(dest.as_posix()) if not action_p: source = self.manifest[hash_] action_p = crate.add_file(source, dest, properties={ "sha1": hash_, }) - self._set_alternate_name(prov_param, action_p, parent=parent) + if not self.remap_names: + self._set_alternate_name(prov_param, action_p, parent=parent) try: source_k = str(source.resolve(strict=False)) except RuntimeError: @@ -599,11 +606,16 @@ def convert_param(self, prov_param, crate, convert_secondary=True, parent=None): return action_p if "ro:Folder" in type_names: hash_ = self.hashes[prov_param.id.localpart] - dest = Path(parent.id if parent else "") / hash_ + if self.remap_names: + basename = getattr(prov_param, "basename", hash_) + else: + basename = hash_ + dest = Path(parent.id if parent else "") / basename action_p = crate.dereference(dest.as_posix()) if not action_p: action_p = crate.add_directory(dest_path=dest) - self._set_alternate_name(prov_param, action_p, parent=parent) + if not self.remap_names: + self._set_alternate_name(prov_param, action_p, parent=parent) for child in self.get_dict(prov_param).values(): part = self.convert_param(child, crate, parent=action_p) action_p.append_to("hasPart", part) diff --git a/tests/test_cwlprov_crate_builder.py b/tests/test_cwlprov_crate_builder.py index 22123c2..895ca63 100644 --- a/tests/test_cwlprov_crate_builder.py +++ b/tests/test_cwlprov_crate_builder.py @@ -1153,3 +1153,49 @@ def test_revsort_inline(data_dir, tmpdir, cwl_version): (reverse_sort_param.id, reverse_param.id), } assert set(_connected(workflow)) == {(sort_out_param.id, out_file_param.id)} + + +def test_remap_names(data_dir, tmpdir): + root = data_dir / "grepucase-run-1" + output = tmpdir / "grepucase-run-1-crate" + license = "Apache-2.0" + builder = ProvCrateBuilder(root, license=license, remap_names=True) + crate = builder.build() + crate.write(output) + crate = ROCrate(output) + workflow = crate.mainEntity + action_map = {_["instrument"].id: _ for _ in crate.contextual_entities + if "CreateAction" in _.type} + assert len(action_map) == 3 + wf_action = action_map["packed.cwl"] + assert wf_action["instrument"] is workflow + wf_objects = wf_action["object"] + wf_results = wf_action["result"] + assert len(wf_objects) == 2 + assert len(wf_results) == 1 + wf_objects_map = {_.id: _ for _ in wf_objects} + wf_input_dir = wf_objects_map.get("grepucase_in/") + assert wf_input_dir + wf_output_dir = wf_results[0] + assert wf_output_dir.id == "ucase_out/" + assert set(_.id for _ in wf_input_dir["hasPart"]) == { + "grepucase_in/bar", "grepucase_in/foo" + } + assert set(_.id for _ in wf_output_dir["hasPart"]) == { + "ucase_out/bar.out/", "ucase_out/foo.out/" + } + for d in wf_output_dir["hasPart"]: + if d.id == "ucase_out/bar.out/": + assert d["hasPart"][0].id == "ucase_out/bar.out/bar.out.out" + else: + assert d["hasPart"][0].id == "ucase_out/foo.out/foo.out.out" + greptool_action = action_map["packed.cwl#greptool.cwl"] + greptool_objects = greptool_action["object"] + greptool_results = greptool_action["result"] + assert len(greptool_objects) == 2 + assert len(greptool_results) == 1 + greptool_objects_map = {_.id: _ for _ in greptool_objects} + greptool_input_dir = greptool_objects_map.get("grepucase_in/") + assert greptool_input_dir is wf_input_dir + greptool_output_dir = greptool_results[0] + assert greptool_output_dir.id == "grep_out/" From 689d75b35fa57b158a6b84c00b25dd325867b27c Mon Sep 17 00:00:00 2001 From: simleo Date: Thu, 1 Jun 2023 12:01:20 +0200 Subject: [PATCH 2/9] finish test_remap_names --- tests/test_cwlprov_crate_builder.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/test_cwlprov_crate_builder.py b/tests/test_cwlprov_crate_builder.py index 895ca63..8e07076 100644 --- a/tests/test_cwlprov_crate_builder.py +++ b/tests/test_cwlprov_crate_builder.py @@ -1199,3 +1199,23 @@ def test_remap_names(data_dir, tmpdir): assert greptool_input_dir is wf_input_dir greptool_output_dir = greptool_results[0] assert greptool_output_dir.id == "grep_out/" + ucasetool_action = action_map["packed.cwl#ucasetool.cwl"] + ucasetool_objects = ucasetool_action["object"] + ucasetool_results = ucasetool_action["result"] + assert len(ucasetool_objects) == 1 + assert len(ucasetool_results) == 1 + ucasetool_input_dir = ucasetool_objects[0] + assert ucasetool_input_dir is greptool_output_dir + ucasetool_output_dir = ucasetool_results[0] + assert ucasetool_output_dir is wf_output_dir + for e in crate.data_entities: + assert "alternateName" not in e + for p in ( + "grepucase_in/bar", + "grepucase_in/foo", + "grep_out/bar.out", + "grep_out/foo.out", + "ucase_out/bar.out/bar.out.out", + "ucase_out/foo.out/foo.out.out", + ): + assert (output / p).is_file() From 57e7958b44e355558bbe61fdcd62502f08a09161 Mon Sep 17 00:00:00 2001 From: simleo Date: Thu, 1 Jun 2023 12:16:55 +0200 Subject: [PATCH 3/9] add test_cli_convert_remap_names --- tests/test_cli.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/test_cli.py b/tests/test_cli.py index 6ae91a3..f638a39 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -94,6 +94,17 @@ def test_cli_convert_readme(data_dir, tmpdir): assert crate.get(readme.name) +def test_cli_convert_remap_names(data_dir, tmpdir): + root = data_dir / "grepucase-run-1" + crate_dir = tmpdir / "grepucase-run-1-crate" + runner = CliRunner() + args = ["convert", str(root), "-o", str(crate_dir), "--remap-names"] + assert runner.invoke(cli, args).exit_code == 0 + crate = ROCrate(crate_dir) + assert crate.get("grepucase_in/") + assert (crate_dir / "grepucase_in").is_dir() + + def test_cli_report_provenance_minimal(data_dir, caplog): crate_dir = data_dir / "revsort-provenance-crate-minimal" runner = CliRunner() From fc20b6199f0856c97b2789292c98263dd7af66ca Mon Sep 17 00:00:00 2001 From: simleo Date: Fri, 14 Jul 2023 17:09:01 +0200 Subject: [PATCH 4/9] names remapping: avoid clashes by using different dirs --- src/runcrate/cli.py | 2 +- src/runcrate/convert.py | 28 ++++++++----- tests/test_cli.py | 4 +- tests/test_cwlprov_crate_builder.py | 61 ++++++++++++++++++++--------- 4 files changed, 64 insertions(+), 31 deletions(-) diff --git a/src/runcrate/cli.py b/src/runcrate/cli.py index 4907ad1..319af8b 100644 --- a/src/runcrate/cli.py +++ b/src/runcrate/cli.py @@ -58,7 +58,7 @@ def cli(): ) @click.option( "--remap-names", - help="remap file/dir names to the original ones (MAY LEAD TO CLASHES!)", + help="remap file/dir names to the original ones", is_flag=True ) def convert(root, output, license, workflow_name, readme, remap_names): diff --git a/src/runcrate/convert.py b/src/runcrate/convert.py index 36ee319..bb8ed3a 100644 --- a/src/runcrate/convert.py +++ b/src/runcrate/convert.py @@ -55,7 +55,7 @@ "null": None, } -SCATTER_JOB_PATTERN = re.compile(r"^(.+)_\d+$") +SCATTER_JOB_PATTERN = re.compile(r"^(.+)_(\d+)$") CWLPROV_NONE = "https://w3id.org/cwl/prov#None" @@ -215,6 +215,7 @@ def __init__(self, root, workflow_name=None, license=None, readme=None, self.file_map = {} self.manifest = self._get_manifest() self.remap_names = remap_names + self.data_root = "data" @staticmethod def _get_step_maps(cwl_defs): @@ -240,11 +241,13 @@ def _get_manifest(self): def _resolve_plan(self, activity): job_qname = activity.plan() plan = activity.provenance.entity(job_qname) + scatter_id = None if not plan: m = SCATTER_JOB_PATTERN.match(str(job_qname)) if m: plan = activity.provenance.entity(m.groups()[0]) - return plan + scatter_id = m.groups()[1] + return plan, scatter_id def _get_hash(self, prov_param): k = prov_param.id.localpart @@ -463,9 +466,11 @@ def add_action(self, crate, activity, parent_instrument=None): "@type": "CreateAction", "name": activity.label, })) - plan = self._resolve_plan(activity) + plan, scatter_id = self._resolve_plan(activity) plan_tag = plan.id.localpart + dest_base = Path(self.data_root) if plan_tag == "main": + dest_base = dest_base / "main" assert str(activity.type) == "wfprov:WorkflowRun" instrument = workflow self.roc_engine_run["result"] = action @@ -480,6 +485,7 @@ def to_wf_p(k): if parts[0] == "main": parts[0] = parent_instrument_fragment plan_tag = "/".join(parts) + dest_base = dest_base / (f"{plan_tag}_{scatter_id}" if scatter_id else f"{plan_tag}") tool_name = self.step_maps[parent_instrument_fragment][plan_tag]["tool"] instrument = crate.dereference(f"{workflow.id}#{tool_name}") control_action = self.control_actions.get(plan_tag) @@ -503,12 +509,14 @@ def to_wf_p(k): action["instrument"] = instrument action["startTime"] = activity.start().time.isoformat() action["endTime"] = activity.end().time.isoformat() - action["object"] = self.add_action_params(crate, activity, to_wf_p, "usage") - action["result"] = self.add_action_params(crate, activity, to_wf_p, "generation") + action["object"] = self.add_action_params(crate, activity, to_wf_p, "usage", + dest_base / "in" if self.remap_names else "") + action["result"] = self.add_action_params(crate, activity, to_wf_p, "generation", + dest_base / "out" if self.remap_names else "") for job in activity.steps(): self.add_action(crate, job, parent_instrument=instrument) - def add_action_params(self, crate, activity, to_wf_p, ptype="usage"): + def add_action_params(self, crate, activity, to_wf_p, ptype="usage", dest_base=""): action_params = [] all_roles = set() for rel in getattr(activity, ptype)(): @@ -528,7 +536,7 @@ def add_action_params(self, crate, activity, to_wf_p, ptype="usage"): wf_p = crate.dereference(to_wf_p(k)) k = get_fragment(k) v = rel.entity() - value = self.convert_param(v, crate) + value = self.convert_param(v, crate, dest_base=dest_base) if value is None: continue # param is optional with no default and was not set if {"ro:Folder", "wf4ever:File"} & set(str(_) for _ in v.types()): @@ -565,7 +573,7 @@ def _set_alternate_name(prov_param, action_p, parent=None): if "alternateName" in parent: action_p["alternateName"] = (Path(parent["alternateName"]) / basename).as_posix() - def convert_param(self, prov_param, crate, convert_secondary=True, parent=None): + def convert_param(self, prov_param, crate, convert_secondary=True, parent=None, dest_base=""): type_names = frozenset(str(_) for _ in prov_param.types()) secondary_files = [_.generated_entity() for _ in prov_param.derivations() if str(_.type) == "cwlprov:SecondaryFile"] @@ -589,7 +597,7 @@ def convert_param(self, prov_param, crate, convert_secondary=True, parent=None): basename = getattr(prov_param, "basename", hash_) else: basename = hash_ - dest = Path(parent.id if parent else "") / basename + dest = Path(parent.id if parent else dest_base) / basename action_p = crate.dereference(dest.as_posix()) if not action_p: source = self.manifest[hash_] @@ -610,7 +618,7 @@ def convert_param(self, prov_param, crate, convert_secondary=True, parent=None): basename = getattr(prov_param, "basename", hash_) else: basename = hash_ - dest = Path(parent.id if parent else "") / basename + dest = Path(parent.id if parent else dest_base) / basename action_p = crate.dereference(dest.as_posix()) if not action_p: action_p = crate.add_directory(dest_path=dest) diff --git a/tests/test_cli.py b/tests/test_cli.py index f638a39..f3d978e 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -101,8 +101,8 @@ def test_cli_convert_remap_names(data_dir, tmpdir): args = ["convert", str(root), "-o", str(crate_dir), "--remap-names"] assert runner.invoke(cli, args).exit_code == 0 crate = ROCrate(crate_dir) - assert crate.get("grepucase_in/") - assert (crate_dir / "grepucase_in").is_dir() + assert crate.get("data/main/in/grepucase_in/") + assert (crate_dir / "data" / "main" / "in" / "grepucase_in").is_dir() def test_cli_report_provenance_minimal(data_dir, caplog): diff --git a/tests/test_cwlprov_crate_builder.py b/tests/test_cwlprov_crate_builder.py index 8e07076..5f71d7a 100644 --- a/tests/test_cwlprov_crate_builder.py +++ b/tests/test_cwlprov_crate_builder.py @@ -1174,48 +1174,73 @@ def test_remap_names(data_dir, tmpdir): assert len(wf_objects) == 2 assert len(wf_results) == 1 wf_objects_map = {_.id: _ for _ in wf_objects} - wf_input_dir = wf_objects_map.get("grepucase_in/") + wf_input_dir = wf_objects_map.get("data/main/in/grepucase_in/") assert wf_input_dir wf_output_dir = wf_results[0] - assert wf_output_dir.id == "ucase_out/" + assert wf_output_dir.id == "data/main/out/ucase_out/" assert set(_.id for _ in wf_input_dir["hasPart"]) == { - "grepucase_in/bar", "grepucase_in/foo" + "data/main/in/grepucase_in/bar", "data/main/in/grepucase_in/foo" } assert set(_.id for _ in wf_output_dir["hasPart"]) == { - "ucase_out/bar.out/", "ucase_out/foo.out/" + "data/main/out/ucase_out/bar.out/", "data/main/out/ucase_out/foo.out/" } for d in wf_output_dir["hasPart"]: - if d.id == "ucase_out/bar.out/": - assert d["hasPart"][0].id == "ucase_out/bar.out/bar.out.out" + if d.id == "data/main/out/ucase_out/bar.out/": + assert d["hasPart"][0].id == "data/main/out/ucase_out/bar.out/bar.out.out" else: - assert d["hasPart"][0].id == "ucase_out/foo.out/foo.out.out" + assert d["hasPart"][0].id == "data/main/out/ucase_out/foo.out/foo.out.out" greptool_action = action_map["packed.cwl#greptool.cwl"] greptool_objects = greptool_action["object"] greptool_results = greptool_action["result"] assert len(greptool_objects) == 2 assert len(greptool_results) == 1 greptool_objects_map = {_.id: _ for _ in greptool_objects} - greptool_input_dir = greptool_objects_map.get("grepucase_in/") - assert greptool_input_dir is wf_input_dir + greptool_input_dir = greptool_objects_map.get("data/main/grep/in/grepucase_in/") + assert greptool_input_dir + assert set(_.id for _ in greptool_input_dir["hasPart"]) == { + "data/main/grep/in/grepucase_in/bar", "data/main/grep/in/grepucase_in/foo" + } greptool_output_dir = greptool_results[0] - assert greptool_output_dir.id == "grep_out/" + assert greptool_output_dir.id == "data/main/grep/out/grep_out/" + assert set(_.id for _ in greptool_output_dir["hasPart"]) == { + "data/main/grep/out/grep_out/bar.out", "data/main/grep/out/grep_out/foo.out" + } ucasetool_action = action_map["packed.cwl#ucasetool.cwl"] ucasetool_objects = ucasetool_action["object"] ucasetool_results = ucasetool_action["result"] assert len(ucasetool_objects) == 1 assert len(ucasetool_results) == 1 ucasetool_input_dir = ucasetool_objects[0] - assert ucasetool_input_dir is greptool_output_dir + assert ucasetool_input_dir.id == "data/main/ucase/in/grep_out/" + assert set(_.id for _ in ucasetool_input_dir["hasPart"]) == { + "data/main/ucase/in/grep_out/bar.out", "data/main/ucase/in/grep_out/foo.out" + } ucasetool_output_dir = ucasetool_results[0] - assert ucasetool_output_dir is wf_output_dir + assert ucasetool_output_dir.id == "data/main/ucase/out/ucase_out/" + assert set(_.id for _ in ucasetool_output_dir["hasPart"]) == { + "data/main/ucase/out/ucase_out/bar.out/", "data/main/ucase/out/ucase_out/foo.out/" + } + + for d in ucasetool_output_dir["hasPart"]: + if d.id == "data/main/ucase/out/ucase_out/bar.out/": + assert d["hasPart"][0].id == "data/main/ucase/out/ucase_out/bar.out/bar.out.out" + else: + assert d["hasPart"][0].id == "data/main/ucase/out/ucase_out/foo.out/foo.out.out" + for e in crate.data_entities: assert "alternateName" not in e for p in ( - "grepucase_in/bar", - "grepucase_in/foo", - "grep_out/bar.out", - "grep_out/foo.out", - "ucase_out/bar.out/bar.out.out", - "ucase_out/foo.out/foo.out.out", + "data/main/in/grepucase_in/bar", + "data/main/in/grepucase_in/foo", + "data/main/out/ucase_out/bar.out/bar.out.out", + "data/main/out/ucase_out/foo.out/foo.out.out", + "data/main/grep/in/grepucase_in/bar", + "data/main/grep/in/grepucase_in/foo", + "data/main/grep/out/grep_out/bar.out", + "data/main/grep/out/grep_out/foo.out", + "data/main/ucase/in/grep_out/bar.out", + "data/main/ucase/in/grep_out/foo.out", + "data/main/ucase/out/ucase_out/bar.out/bar.out.out", + "data/main/ucase/out/ucase_out/foo.out/foo.out.out", ): assert (output / p).is_file() From 29e34d75039f8e16a85bc6caec3f8fa754d2f34e Mon Sep 17 00:00:00 2001 From: simleo Date: Mon, 17 Jul 2023 13:21:21 +0200 Subject: [PATCH 5/9] names remapping: test to check there are no clashes if same basename --- tests/test_cwlprov_crate_builder.py | 80 +++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/tests/test_cwlprov_crate_builder.py b/tests/test_cwlprov_crate_builder.py index 5f71d7a..84ee6aa 100644 --- a/tests/test_cwlprov_crate_builder.py +++ b/tests/test_cwlprov_crate_builder.py @@ -1244,3 +1244,83 @@ def test_remap_names(data_dir, tmpdir): "data/main/ucase/out/ucase_out/foo.out/foo.out.out", ): assert (output / p).is_file() + + +def test_remap_names_noclash(data_dir, tmpdir): + root = data_dir / "revsort-run-1" + output = tmpdir / "revsort-run-1-crate" + license = "Apache-2.0" + builder = ProvCrateBuilder(root, license=license, remap_names=True) + crate = builder.build() + crate.write(output) + crate = ROCrate(output) + workflow = crate.mainEntity + wf_inputs_map = {_.id: _ for _ in workflow["input"]} + wf_outputs_map = {_.id: _ for _ in workflow["output"]} + tool_map = {_.id: _ for _ in workflow["hasPart"]} + assert len(tool_map) == 2 + rev_tool = tool_map["packed.cwl#revtool.cwl"] + sort_tool = tool_map["packed.cwl#sorttool.cwl"] + rev_inputs_map = {_.id: _ for _ in rev_tool["input"]} + rev_outputs_map = {_.id: _ for _ in rev_tool["output"]} + sort_inputs_map = {_.id: _ for _ in sort_tool["input"]} + sort_outputs_map = {_.id: _ for _ in sort_tool["output"]} + action_map = {_["instrument"].id: _ for _ in crate.contextual_entities + if "CreateAction" in _.type} + assert len(action_map) == 3 + wf_action = action_map["packed.cwl"] + assert wf_action["instrument"] is workflow + wf_objects = wf_action["object"] + wf_results = wf_action["result"] + assert len(wf_objects) == 2 + assert len(wf_results) == 1 + wf_objects_map = {_.id: _ for _ in wf_objects} + wf_results_map = {_.id: _ for _ in wf_results} + wf_input_file = wf_objects_map.get("data/main/in/whale.txt") + assert wf_input_file + assert wf_input_file["exampleOfWork"] is wf_inputs_map["packed.cwl#main/input"] + wf_output_file = wf_results_map.get("data/main/out/output.txt") + assert wf_output_file + assert wf_output_file["exampleOfWork"] is wf_outputs_map["packed.cwl#main/output"] + rev_action = action_map["packed.cwl#revtool.cwl"] + rev_objects = rev_action["object"] + rev_results = rev_action["result"] + assert len(rev_objects) == 1 + assert len(rev_results) == 1 + rev_objects_map = {_.id: _ for _ in rev_objects} + rev_results_map = {_.id: _ for _ in rev_results} + rev_input_file = rev_objects_map.get("data/main/rev/in/whale.txt") + assert rev_input_file + assert rev_input_file["exampleOfWork"] is rev_inputs_map["packed.cwl#revtool.cwl/input"] + rev_output_file = rev_results_map.get("data/main/rev/out/output.txt") + assert rev_output_file + assert rev_output_file["exampleOfWork"] is rev_outputs_map["packed.cwl#revtool.cwl/output"] + sort_action = action_map["packed.cwl#sorttool.cwl"] + sort_objects = sort_action["object"] + sort_results = sort_action["result"] + assert len(sort_objects) == 2 + assert len(sort_results) == 1 + sort_objects_map = {_.id: _ for _ in sort_objects} + sort_results_map = {_.id: _ for _ in sort_results} + sort_input_file = sort_objects_map.get("data/main/sorted/in/output.txt") + assert sort_input_file + assert sort_input_file["exampleOfWork"] is sort_inputs_map["packed.cwl#sorttool.cwl/input"] + sort_output_file = sort_results_map.get("data/main/sorted/out/output.txt") + assert sort_output_file + assert sort_output_file["exampleOfWork"] is sort_outputs_map["packed.cwl#sorttool.cwl/output"] + for p in ( + "data/main/in/whale.txt", + "data/main/out/output.txt", + "data/main/rev/in/whale.txt", + "data/main/rev/out/output.txt", + "data/main/sorted/in/output.txt", + "data/main/sorted/out/output.txt", + ): + assert (output / p).is_file() + wf_in_txt = (output / "data/main/in/whale.txt").read_text() + assert (output / "data/main/rev/in/whale.txt").read_text() == wf_in_txt + wf_out_txt = (output / "data/main/out/output.txt").read_text() + assert (output / "data/main/sorted/out/output.txt").read_text() == wf_out_txt + rev_out_txt = (output / "data/main/rev/out/output.txt").read_text() + sort_in_txt = (output / "data/main/sorted/in/output.txt").read_text() + assert rev_out_txt == sort_in_txt From bc6b8bf7bd85358676b70379d9deb6cce9ab9d24 Mon Sep 17 00:00:00 2001 From: simleo Date: Mon, 17 Jul 2023 16:38:57 +0200 Subject: [PATCH 6/9] convert_param fixes --- src/runcrate/convert.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/runcrate/convert.py b/src/runcrate/convert.py index bb8ed3a..76b7d6e 100644 --- a/src/runcrate/convert.py +++ b/src/runcrate/convert.py @@ -578,7 +578,8 @@ def convert_param(self, prov_param, crate, convert_secondary=True, parent=None, secondary_files = [_.generated_entity() for _ in prov_param.derivations() if str(_.type) == "cwlprov:SecondaryFile"] if convert_secondary and secondary_files: - main_entity = self.convert_param(prov_param, crate, convert_secondary=False) + main_entity = self.convert_param(prov_param, crate, convert_secondary=False, + dest_base=dest_base) action_p = self.collections.get(main_entity.id) if not action_p: action_p = crate.add(ContextEntity(crate, properties={ @@ -586,7 +587,7 @@ def convert_param(self, prov_param, crate, convert_secondary=True, parent=None, })) action_p["mainEntity"] = main_entity action_p["hasPart"] = [main_entity] + [ - self.convert_param(_, crate) for _ in secondary_files + self.convert_param(_, crate, dest_base=dest_base) for _ in secondary_files ] crate.root_dataset.append_to("mentions", action_p) self.collections[main_entity.id] = action_p @@ -594,7 +595,7 @@ def convert_param(self, prov_param, crate, convert_secondary=True, parent=None, if "wf4ever:File" in type_names: hash_ = self.hashes[prov_param.id.localpart] if self.remap_names: - basename = getattr(prov_param, "basename", hash_) + basename = getattr(prov_param, "basename", hash_) or hash_ else: basename = hash_ dest = Path(parent.id if parent else dest_base) / basename @@ -615,7 +616,7 @@ def convert_param(self, prov_param, crate, convert_secondary=True, parent=None, if "ro:Folder" in type_names: hash_ = self.hashes[prov_param.id.localpart] if self.remap_names: - basename = getattr(prov_param, "basename", hash_) + basename = getattr(prov_param, "basename", hash_) or hash_ else: basename = hash_ dest = Path(parent.id if parent else dest_base) / basename @@ -632,12 +633,13 @@ def convert_param(self, prov_param, crate, convert_secondary=True, parent=None, return str(prov_param.value) if "prov:Dictionary" in type_names: return dict( - (k, self.convert_param(v, crate)) + (k, self.convert_param(v, crate, dest_base=dest_base)) for k, v in self.get_dict(prov_param).items() if k != "@id" ) if "prov:Collection" in type_names: - return [self.convert_param(_, crate) for _ in self.get_members(prov_param)] + return [self.convert_param(_, crate, dest_base=dest_base) + for _ in self.get_members(prov_param)] if prov_param.id.uri == CWLPROV_NONE: return None raise RuntimeError(f"No value to convert for {prov_param}") From 79ba6216074f55eb7d0c8af146d1fba2bdc2281a Mon Sep 17 00:00:00 2001 From: simleo Date: Tue, 18 Jul 2023 17:08:48 +0200 Subject: [PATCH 7/9] expand some unit tests --- tests/test_cwlprov_crate_builder.py | 41 +++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tests/test_cwlprov_crate_builder.py b/tests/test_cwlprov_crate_builder.py index 84ee6aa..fe1a5ac 100644 --- a/tests/test_cwlprov_crate_builder.py +++ b/tests/test_cwlprov_crate_builder.py @@ -1244,6 +1244,42 @@ def test_remap_names(data_dir, tmpdir): "data/main/ucase/out/ucase_out/foo.out/foo.out.out", ): assert (output / p).is_file() + for p in ( + "bar", + "foo", + "bar.out", + "foo.out", + "bar.out.out", + "foo.out.out", + ): + assert not (output / p).is_file() + wf_in_txt = ( + (output / "data/main/in/grepucase_in/bar").read_text(), + (output / "data/main/in/grepucase_in/foo").read_text(), + ) + grep_in_txt = ( + (output / "data/main/grep/in/grepucase_in/bar").read_text(), + (output / "data/main/grep/in/grepucase_in/foo").read_text(), + ) + assert grep_in_txt == wf_in_txt + grep_out_txt = ( + (output / "data/main/grep/out/grep_out/bar.out").read_text(), + (output / "data/main/grep/out/grep_out/foo.out").read_text(), + ) + ucase_in_txt = ( + (output / "data/main/ucase/in/grep_out/bar.out").read_text(), + (output / "data/main/ucase/in/grep_out/foo.out").read_text(), + ) + assert ucase_in_txt == grep_out_txt + ucase_out_txt = ( + (output / "data/main/ucase/out/ucase_out/bar.out/bar.out.out").read_text(), + (output / "data/main/ucase/out/ucase_out/foo.out/foo.out.out").read_text(), + ) + wf_out_txt = ( + (output / "data/main/out/ucase_out/bar.out/bar.out.out").read_text(), + (output / "data/main/out/ucase_out/foo.out/foo.out.out").read_text(), + ) + assert ucase_out_txt == wf_out_txt def test_remap_names_noclash(data_dir, tmpdir): @@ -1317,6 +1353,11 @@ def test_remap_names_noclash(data_dir, tmpdir): "data/main/sorted/out/output.txt", ): assert (output / p).is_file() + for p in ( + "whale.txt", + "output.txt", + ): + assert not (output / p).is_file() wf_in_txt = (output / "data/main/in/whale.txt").read_text() assert (output / "data/main/rev/in/whale.txt").read_text() == wf_in_txt wf_out_txt = (output / "data/main/out/output.txt").read_text() From eaaaad96e42bc18d5c32955236f1bd277286ff3c Mon Sep 17 00:00:00 2001 From: simleo Date: Wed, 19 Jul 2023 12:20:54 +0200 Subject: [PATCH 8/9] add test_remap_names_scatter --- tests/test_cwlprov_crate_builder.py | 62 +++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/tests/test_cwlprov_crate_builder.py b/tests/test_cwlprov_crate_builder.py index fe1a5ac..d7d58cc 100644 --- a/tests/test_cwlprov_crate_builder.py +++ b/tests/test_cwlprov_crate_builder.py @@ -1365,3 +1365,65 @@ def test_remap_names_noclash(data_dir, tmpdir): rev_out_txt = (output / "data/main/rev/out/output.txt").read_text() sort_in_txt = (output / "data/main/sorted/in/output.txt").read_text() assert rev_out_txt == sort_in_txt + + +def test_remap_names_scatter(data_dir, tmpdir): + root = data_dir / "no-output-run-1" + output = tmpdir / "no-output-run-1-crate" + license = "Apache-2.0" + builder = ProvCrateBuilder(root, license=license, remap_names=True) + crate = builder.build() + crate.write(output) + crate = ROCrate(output) + workflow = crate.mainEntity + wf_inputs_map = {_.id: _ for _ in workflow["input"]} + assert len(wf_inputs_map) == 3 + tool_map = {_.id: _ for _ in workflow["hasPart"]} + assert len(tool_map) == 2 + date_tool = tool_map["packed.cwl#date.cwl"] + step_map = {_.id: _ for _ in workflow["step"]} + assert len(step_map) == 3 + date2_step = step_map["packed.cwl#main/date2_step"] + actions = [_ for _ in crate.contextual_entities if "CreateAction" in _.type] + assert len(actions) == 5 + sel = [_ for _ in actions if _["instrument"] is workflow] + assert len(sel) == 1 + wf_action = sel[0] + assert wf_action["instrument"] is workflow + wf_objects = wf_action["object"] + assert len(wf_objects) == 3 + wf_objects_map = {_.type: _ for _ in wf_objects} + assert set(wf_objects_map) == {"PropertyValue", "Dataset", "File"} + wf_array_obj = wf_objects_map["PropertyValue"] + assert wf_array_obj["exampleOfWork"] is wf_inputs_map["packed.cwl#main/pdb_array"] + wf_array_files_map = {_.id for _ in wf_array_obj["value"]} + assert set(wf_array_files_map) == {"data/main/in/7mb7.cif", "data/main/in/7zxf.cif"} + control_actions = [_ for _ in crate.contextual_entities if "ControlAction" in _.type] + sel = [_ for _ in control_actions if _["instrument"] is date2_step] + assert len(sel) == 1 + date2_control_action = sel[0] + date2_create_actions = date2_control_action["object"] + for a in date2_create_actions: + assert a["instrument"] is date_tool + assert len(a["object"]) == 1 + date2_files = [_["object"][0] for _ in date2_create_actions] + assert set(_.id for _ in date2_files) == { + "data/main/date2_step/in/7mb7.cif", + "data/main/date2_step_2/in/7zxf.cif" + } + for p in ( + "data/main/in/7mb7.cif", + "data/main/in/7zxf.cif", + "data/main/date2_step/in/7mb7.cif", + "data/main/date2_step_2/in/7zxf.cif", + ): + assert (output / p).is_file() + for p in ( + "7mb7.cif", + "7zxf.cif", + ): + assert not (output / p).is_file() + text_7mb7 = (output / "data/main/in/7mb7.cif").read_text() + text_7zxf = (output / "data/main/in/7zxf.cif").read_text() + assert (output / "data/main/date2_step/in/7mb7.cif").read_text() == text_7mb7 + assert (output / "data/main/date2_step_2/in/7zxf.cif").read_text() == text_7zxf From 57eae278e6b49447aae2639537da135e3d990568 Mon Sep 17 00:00:00 2001 From: simleo Date: Wed, 19 Jul 2023 16:19:59 +0200 Subject: [PATCH 9/9] add a note to patch_workflow_input_collection --- src/runcrate/convert.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/runcrate/convert.py b/src/runcrate/convert.py index 76b7d6e..1e50646 100644 --- a/src/runcrate/convert.py +++ b/src/runcrate/convert.py @@ -702,6 +702,10 @@ def patch_workflow_input_collection(self, crate, wf=None): entity of the collection alone (a File). This method fixes the mapping by retrieving the correct Collection entity from the relevant tool execution. + + Note that this trick does not lead to a correct result with + remap_names on: the workflow-level parameter should be mapped to a + separate collection where files have different paths. """ if wf is None: wf = crate.mainEntity