Skip to content

Commit 675f1cc

Browse files
committed
feat: add validation for workflow versions
1 parent 3b4168c commit 675f1cc

File tree

3 files changed

+223
-52
lines changed

3 files changed

+223
-52
lines changed

catalog/output/workflows.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
"ploidy": "ANY",
2525
"taxonomyId": "11158",
2626
"trsId": "#workflow/github.com/iwc-workflows/generic-non-segmented-viral-variant-calling/main/versions/v0.1",
27-
"workflowDescription": "Variant calling and consensus sequence generation for batches of Illumina PE sequenced viruses with uncomplicated and stable genome structure (like e.g. Morbilliviruses). It can handle both ampliconic and non-ampliconic data.",
27+
"workflowDescription": "Variant calling and consensus sequence generation for batches of Illumina PE sequenced viruses with uncomplicated and stable genome structure (like e.g. Morbilliviruses).",
2828
"workflowName": "Variant calling and consensus construction from paired end short read data of non-segmented viral genomes"
2929
},
3030
{
@@ -323,7 +323,7 @@
323323
"ploidy": "ANY",
324324
"taxonomyId": "11158",
325325
"trsId": "#workflow/github.com/iwc-workflows/generic-non-segmented-viral-variant-calling/main/versions/v0.1",
326-
"workflowDescription": "Variant calling and consensus sequence generation for batches of Illumina PE sequenced viruses with uncomplicated and stable genome structure (like e.g. Morbilliviruses). It can handle both ampliconic and non-ampliconic data.",
326+
"workflowDescription": "Variant calling and consensus sequence generation for batches of Illumina PE sequenced viruses with uncomplicated and stable genome structure (like e.g. Morbilliviruses).",
327327
"workflowName": "Variant calling and consensus construction from paired end short read data of non-segmented viral genomes"
328328
},
329329
{
@@ -360,7 +360,7 @@
360360
"taxonomyId": "2",
361361
"trsId": "#workflow/github.com/iwc-workflows/amr_gene_detection/main/versions/v1.1.5",
362362
"workflowDescription": "Antimicrobial resistance gene detection from assembled bacterial genomes",
363-
"workflowName": "AMR gene detection"
363+
"workflowName": "amr_gene_detection"
364364
},
365365
{
366366
"iwcId": "lncrnas-annotation-main",

catalog/py_package/catalog_build/iwc_manifest_to_workflows_yaml.py

Lines changed: 151 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
import argparse
22
import json
33
import os
4+
import re
45
import subprocess
5-
from typing import Dict
6+
import time
7+
from typing import Dict, List
68

79
import requests
810
import yaml
@@ -33,7 +35,7 @@
3335
)
3436

3537

36-
def read_existing_yaml(workflows_path):
38+
def read_existing_yaml(workflows_path: str) -> Dict[str, Workflow]:
3739
if os.path.exists(workflows_path):
3840
with open(workflows_path) as fh:
3941
workflows = Workflows.model_validate(yaml.safe_load(fh)).workflows
@@ -44,7 +46,9 @@ def read_existing_yaml(workflows_path):
4446
return by_trs_id
4547

4648

47-
def get_workflow_categories_from_collections(collections):
49+
def get_workflow_categories_from_collections(
50+
collections: List[str],
51+
) -> List[WorkflowCategoryId]:
4852
return sorted(
4953
list(
5054
set(
@@ -57,10 +61,10 @@ def get_workflow_categories_from_collections(collections):
5761
)
5862

5963

60-
def get_input_types(workflow_definition):
64+
def get_input_types(workflow_definition: dict) -> List[WorkflowParameter]:
6165
# get all input types
6266
INPUT_TYPES = ["data_input", "data_collection_input", "parameter_input"]
63-
inputs: list[WorkflowParameter] = []
67+
inputs: List[WorkflowParameter] = []
6468
for step in workflow_definition["steps"].values():
6569
step_label = step["label"]
6670
step_type = step["type"]
@@ -94,18 +98,74 @@ def get_input_types(workflow_definition):
9498
return inputs
9599

96100

97-
def generate_current_workflows():
101+
# Definitive Dockstore answers (HTTP 200 / 404) keyed by the full TRS ID.
# The same ID can be validated more than once during a single run; caching
# avoids repeating the request and the politeness sleep for those repeats.
# Only definitive answers are stored, so transient failures are retried.
_TRS_VERSION_CACHE: Dict[str, bool] = {}

# Compiled once: splits an IWC TRS ID into the two path components after the
# organisation and the version string that follows "/versions/v".
_TRS_ID_PATTERN = re.compile(
    r"#workflow/github\.com/iwc-workflows/([^/]+)/([^/]+)/versions/v(.+)"
)


def verify_trs_version_exists(trs_id: str, skip_validation: bool = False) -> bool:
    """Check if a workflow version exists on Dockstore via TRS API.

    Args:
        trs_id: Full TRS ID including the ``/versions/v...`` suffix.
        skip_validation: When true, no lookup is performed and ``True`` is
            returned immediately.

    Returns:
        ``False`` only when Dockstore answers 404 for the version. ``True``
        when Dockstore answers 200, and also whenever the check cannot give a
        definitive answer (unparseable ID, unexpected status code, request
        error), so that workflows are never dropped on uncertainty.
    """
    if skip_validation:
        return True

    cached = _TRS_VERSION_CACHE.get(trs_id)
    if cached is not None:
        return cached

    # Parse the TRS ID to extract components
    match = _TRS_ID_PATTERN.match(trs_id)
    if not match:
        print(f"Warning: Cannot parse TRS ID for validation: {trs_id}")
        return True  # We can't look this up, but someone put it in -- don't fail

    repo, workflow_name, version = match.groups()

    # The workflow ID format for Dockstore is the full TRS ID without the version part
    workflow_id = f"#workflow/github.com/iwc-workflows/{repo}/{workflow_name}"
    # URL encode the workflow ID and version ("#" and "/" must not survive as-is)
    encoded_id = requests.utils.quote(workflow_id, safe="")
    encoded_version = requests.utils.quote(f"v{version}", safe="")

    dockstore_url = f"https://dockstore.org/api/ga4gh/trs/v2/tools/{encoded_id}/versions/{encoded_version}"

    try:
        response = requests.get(dockstore_url, timeout=10)
    except requests.RequestException as e:
        print(f"Warning: Error checking version {trs_id}: {e}")
        return True
    finally:
        # Don't slam dockstore
        time.sleep(0.1)

    if response.status_code == 200:
        _TRS_VERSION_CACHE[trs_id] = True
        return True
    if response.status_code == 404:
        _TRS_VERSION_CACHE[trs_id] = False
        return False

    print(
        f"Warning: Unexpected status {response.status_code} checking {trs_id} at Dockstore"
    )
    return True  # Don't drop workflows on weirdness
141+
142+
143+
def generate_current_workflows(skip_validation: bool = False) -> Dict[str, Workflow]:
98144
manifest_data = requests.get(URL).json()
99145
by_trs_id: Dict[str, Workflow] = {}
146+
version_warnings = []
147+
100148
for repo in manifest_data:
101149
for workflow in repo["workflows"]:
102150
if "tests" not in workflow:
103151
# probably fixed on main branch of iwc ?
104152
# this branch is pretty out of date
105153
continue
154+
155+
trs_id = (
156+
f"{workflow['trsID']}/versions/v{workflow['definition']['release']}"
157+
)
158+
159+
if not verify_trs_version_exists(trs_id, skip_validation):
160+
# This is just informational - we'll keep the workflow with whatever
161+
# version is already in workflows.yml (handled in merge_into_existing)
162+
version_warnings.append(
163+
f"Info: IWC manifest has v{workflow['definition']['release']} for {workflow['trsID']} but it's not on Dockstore yet"
164+
)
165+
106166
workflow_input = Workflow(
107167
active=False,
108-
trs_id=f"{workflow['trsID']}/versions/v{workflow['definition']['release']}",
168+
trs_id=trs_id,
109169
workflow_name=workflow["definition"]["name"],
110170
categories=get_workflow_categories_from_collections(
111171
workflow["collections"]
@@ -118,6 +178,12 @@ def generate_current_workflows():
118178
parameters=get_input_types(workflow["definition"]),
119179
)
120180
by_trs_id[workflow["trsID"]] = workflow_input
181+
182+
if version_warnings and not skip_validation:
183+
print("\nVersion status notes:")
184+
for warning in version_warnings:
185+
print(f" {warning}")
186+
121187
return by_trs_id
122188

123189

@@ -149,29 +215,82 @@ def add_missing_parameters(
149215
existing_workflow_input.parameters.append(param)
150216

151217

152-
def merge_into_existing(
    workflows_path: str, skip_validation: bool = False
) -> Dict[str, Workflow]:
    """Merge the workflows generated from the IWC manifest into the existing YAML.

    Workflows with no existing entry are taken from the manifest unchanged.
    For workflows that already have an entry, the existing entry is kept and
    only the keys listed in ``MANIFEST_SOURCE_OF_TRUTH`` are overwritten with
    the manifest values. Before that, both the manifest version and the
    existing version are checked with ``verify_trs_version_exists`` to decide
    which ``trs_id`` is carried into the merged entry.

    Args:
        workflows_path: Path of the existing workflows YAML file.
        skip_validation: Passed through to the version checks; when true every
            version is treated as valid.

    Returns:
        Merged workflows keyed by versionless TRS ID.
    """
    previous = read_existing_yaml(workflows_path)
    fresh = generate_current_workflows(skip_validation)

    result: Dict[str, Workflow] = {}
    reverted_ids = []
    kept_notes = []

    for versionless_id, incoming in fresh.items():
        prior = previous.get(versionless_id)
        if not prior:
            # Nothing to merge with: take the manifest entry as-is.
            result[versionless_id] = incoming
            continue

        # Manifest version is checked first, then the existing one.
        incoming_ok = verify_trs_version_exists(incoming.trs_id, skip_validation)
        prior_ok = verify_trs_version_exists(prior.trs_id, skip_validation)

        if prior_ok:
            if not incoming_ok:
                # IWC version not on Dockstore yet, but existing version is valid
                kept_notes.append(
                    f"Keeping {prior.trs_id} (IWC has newer unreleased version)"
                )
                incoming.trs_id = prior.trs_id
        elif incoming_ok:
            # Existing version is invalid (manually edited to bad version)
            print(
                f"Error: Invalid version {prior.trs_id} doesn't exist on Dockstore"
            )
            print(f"  -> Reverting to IWC version: {incoming.trs_id}")
            reverted_ids.append(prior.trs_id)
        else:
            # Both versions are invalid - keep what we have
            print(
                f"Error: Neither existing nor IWC version exists on Dockstore for {versionless_id}"
            )
            incoming.trs_id = prior.trs_id

        # Start from the existing entry and overwrite manifest-controlled fields.
        prior_fields = prior.model_dump()
        incoming_fields = incoming.model_dump()
        prior_fields.update(
            {key: incoming_fields[key] for key in MANIFEST_SOURCE_OF_TRUTH}
        )

        ensure_parameters_exist(incoming, prior)
        rebuilt = Workflow(**prior_fields)
        add_missing_parameters(incoming, rebuilt)
        result[versionless_id] = rebuilt

    if kept_notes and not skip_validation:
        print(
            f"\nKept {len(kept_notes)} existing versions (newer IWC versions not on Dockstore yet)"
        )
        for note in kept_notes:
            print(f"  {note}")

    if reverted_ids:
        print(f"\nFixed {len(reverted_ids)} invalid versions in workflows.yml")

    return result
171288

172289

173-
def to_workflows_yaml(workflows_path: str, exclude_other: bool):
174-
by_trs_id = merge_into_existing(workflows_path)
290+
def to_workflows_yaml(
291+
workflows_path: str, exclude_other: bool, skip_validation: bool = False
292+
):
293+
by_trs_id = merge_into_existing(workflows_path, skip_validation)
175294
# sort by trs id, should play nicer with git diffs
176295
sorted_workflows = list(dict(sorted(by_trs_id.items())).values())
177296
if exclude_other:
@@ -209,5 +328,14 @@ def to_workflows_yaml(workflows_path: str, exclude_other: bool):
209328
action="store_true",
210329
help="Exclude other items from processing.",
211330
)
331+
parser.add_argument(
332+
"--skip-validation",
333+
action="store_true",
334+
help="Skip validation of workflow versions against TRS API.",
335+
)
212336
args = parser.parse_args()
213-
to_workflows_yaml(args.workflows_path, exclude_other=args.exclude_other)
337+
to_workflows_yaml(
338+
args.workflows_path,
339+
exclude_other=args.exclude_other,
340+
skip_validation=args.skip_validation,
341+
)

0 commit comments

Comments
 (0)