diff --git a/catalog/output/workflows.json b/catalog/output/workflows.json index faf2d3c6..b36cd7df 100644 --- a/catalog/output/workflows.json +++ b/catalog/output/workflows.json @@ -24,7 +24,7 @@ "ploidy": "ANY", "taxonomyId": "11158", "trsId": "#workflow/github.com/iwc-workflows/generic-non-segmented-viral-variant-calling/main/versions/v0.1", - "workflowDescription": "Variant calling and consensus sequence generation for batches of Illumina PE sequenced viruses with uncomplicated and stable genome structure (like e.g. Morbilliviruses). It can handle both ampliconic and non-ampliconic data.", + "workflowDescription": "Variant calling and consensus sequence generation for batches of Illumina PE sequenced viruses with uncomplicated and stable genome structure (like e.g. Morbilliviruses).", "workflowName": "Variant calling and consensus construction from paired end short read data of non-segmented viral genomes" }, { @@ -323,7 +323,7 @@ "ploidy": "ANY", "taxonomyId": "11158", "trsId": "#workflow/github.com/iwc-workflows/generic-non-segmented-viral-variant-calling/main/versions/v0.1", - "workflowDescription": "Variant calling and consensus sequence generation for batches of Illumina PE sequenced viruses with uncomplicated and stable genome structure (like e.g. Morbilliviruses). It can handle both ampliconic and non-ampliconic data.", + "workflowDescription": "Variant calling and consensus sequence generation for batches of Illumina PE sequenced viruses with uncomplicated and stable genome structure (like e.g. Morbilliviruses).", "workflowName": "Variant calling and consensus construction from paired end short read data of non-segmented viral genomes" }, { @@ -360,7 +360,7 @@ "taxonomyId": "2", "trsId": "#workflow/github.com/iwc-workflows/amr_gene_detection/main/versions/v1.1.5", "workflowDescription": "Antimicrobial resistance gene detection from assembled bacterial genomes", - "workflowName": "AMR gene detection" + "workflowName": "amr_gene_detection" }, { "iwcId": "lncrnas-annotation-main", diff --git a/catalog/py_package/catalog_build/iwc_manifest_to_workflows_yaml.py b/catalog/py_package/catalog_build/iwc_manifest_to_workflows_yaml.py index 99e8f23e..d2c0430e 100644 --- a/catalog/py_package/catalog_build/iwc_manifest_to_workflows_yaml.py +++ b/catalog/py_package/catalog_build/iwc_manifest_to_workflows_yaml.py @@ -1,8 +1,10 @@ import argparse import json import os +import re import subprocess -from typing import Dict +import time +from typing import Dict, List import requests import yaml @@ -33,7 +35,7 @@ ) -def read_existing_yaml(workflows_path): +def read_existing_yaml(workflows_path: str) -> Dict[str, Workflow]: if os.path.exists(workflows_path): with open(workflows_path) as fh: workflows = Workflows.model_validate(yaml.safe_load(fh)).workflows @@ -44,7 +46,9 @@ def read_existing_yaml(workflows_path): return by_trs_id -def get_workflow_categories_from_collections(collections): +def get_workflow_categories_from_collections( + collections: List[str], +) -> List[WorkflowCategoryId]: return sorted( list( set( @@ -57,10 +61,10 @@ def get_workflow_categories_from_collections(collections): ) -def get_input_types(workflow_definition): +def get_input_types(workflow_definition: dict) -> List[WorkflowParameter]: # get all input types INPUT_TYPES = ["data_input", "data_collection_input", "parameter_input"] - inputs: list[WorkflowParameter] = [] + inputs: List[WorkflowParameter] = [] for step in workflow_definition["steps"].values(): step_label = step["label"] step_type = step["type"] @@ -94,18 +98,74 @@ def get_input_types(workflow_definition): return inputs -def generate_current_workflows(): +def verify_trs_version_exists(trs_id: str, skip_validation: bool = False) -> bool: + """Check if a workflow version exists on Dockstore via TRS API.""" + if skip_validation: + return True + + # Parse the TRS ID to extract components + match = re.match( + r"#workflow/github\.com/iwc-workflows/([^/]+)/([^/]+)/versions/v(.+)", trs_id + ) + if not match: + print(f"Warning: Cannot parse TRS ID for validation: {trs_id}") + return True # We can't look this up, but someone put it in -- don't fail + + repo, workflow_name, version = match.groups() + + # The workflow ID format for Dockstore is the full TRS ID without the version part + workflow_id = f"#workflow/github.com/iwc-workflows/{repo}/{workflow_name}" + # URL encode the workflow ID and version + encoded_id = requests.utils.quote(workflow_id, safe="") + encoded_version = requests.utils.quote(f"v{version}", safe="") + + dockstore_url = f"https://dockstore.org/api/ga4gh/trs/v2/tools/{encoded_id}/versions/{encoded_version}" + + try: + response = requests.get(dockstore_url, timeout=10) + if response.status_code == 200: + return True + elif response.status_code == 404: + return False + else: + print( + f"Warning: Unexpected status {response.status_code} checking {trs_id} at Dockstore" + ) + return True # Don't drop workflows on weirdness + except requests.RequestException as e: + print(f"Warning: Error checking version {trs_id}: {e}") + return True + finally: + # Don't slam dockstore + time.sleep(0.1) + + +def generate_current_workflows(skip_validation: bool = False) -> Dict[str, Workflow]: manifest_data = requests.get(URL).json() by_trs_id: Dict[str, Workflow] = {} + version_warnings = [] + for repo in manifest_data: for workflow in repo["workflows"]: if "tests" not in workflow: # probably fixed on main branch of iwc ? # this branch is pretty out of date continue + + trs_id = ( + f"{workflow['trsID']}/versions/v{workflow['definition']['release']}" + ) + + if not verify_trs_version_exists(trs_id, skip_validation): + # This is just informational - we'll keep the workflow with whatever + # version is already in workflows.yml (handled in merge_into_existing) + version_warnings.append( + f"Info: IWC manifest has v{workflow['definition']['release']} for {workflow['trsID']} but it's not on Dockstore yet" + ) + workflow_input = Workflow( active=False, - trs_id=f"{workflow['trsID']}/versions/v{workflow['definition']['release']}", + trs_id=trs_id, workflow_name=workflow["definition"]["name"], categories=get_workflow_categories_from_collections( workflow["collections"] @@ -118,6 +178,12 @@ def generate_current_workflows(): parameters=get_input_types(workflow["definition"]), ) by_trs_id[workflow["trsID"]] = workflow_input + + if version_warnings and not skip_validation: + print("\nVersion status notes:") + for warning in version_warnings: + print(f" {warning}") + return by_trs_id @@ -149,29 +215,82 @@ def add_missing_parameters( existing_workflow_input.parameters.append(param) -def merge_into_existing(workflows_path): +def merge_into_existing( + workflows_path: str, skip_validation: bool = False +) -> Dict[str, Workflow]: existing = read_existing_yaml(workflows_path) - current = generate_current_workflows() + current = generate_current_workflows(skip_validation) merged: Dict[str, Workflow] = {} + invalid_versions = [] + versions_kept = [] + for versionless_trs_id, current_workflow_input in current.items(): existing_workflow_input = existing.get(versionless_trs_id) - if existing_workflow_input: - # we'll keep whatever has been specified in the brc repo, - # and only update values that are in the iwc manifest - exisiting_dict = existing_workflow_input.model_dump() - new_dict = current_workflow_input.model_dump() - for key in MANIFEST_SOURCE_OF_TRUTH: - exisiting_dict[key] = new_dict[key] - ensure_parameters_exist(current_workflow_input, existing_workflow_input) - updated_existing_workflow = Workflow(**exisiting_dict) - add_missing_parameters(current_workflow_input, updated_existing_workflow) - current_workflow_input = updated_existing_workflow + if not existing_workflow_input: + merged[versionless_trs_id] = current_workflow_input + continue + + iwc_version_valid = verify_trs_version_exists( + current_workflow_input.trs_id, skip_validation + ) + existing_version_valid = verify_trs_version_exists( + existing_workflow_input.trs_id, skip_validation + ) + + # Decide which version to use + if not iwc_version_valid and existing_version_valid: + # IWC version not on Dockstore yet, but existing version is valid + versions_kept.append( + f"Keeping {existing_workflow_input.trs_id} (IWC has newer unreleased version)" + ) + current_workflow_input.trs_id = existing_workflow_input.trs_id + elif not existing_version_valid: + # Existing version is invalid (manually edited to bad version) + if iwc_version_valid: + print( + f"Error: Invalid version {existing_workflow_input.trs_id} doesn't exist on Dockstore" + ) + print(f" -> Reverting to IWC version: {current_workflow_input.trs_id}") + invalid_versions.append(existing_workflow_input.trs_id) + else: + # Both versions are invalid - this shouldn't happen often + print( + f"Error: Neither existing nor IWC version exists on Dockstore for {versionless_trs_id}" + ) + # Keep what we have + current_workflow_input.trs_id = existing_workflow_input.trs_id + + # Build the merged workflow + existing_dict = existing_workflow_input.model_dump() + new_dict = current_workflow_input.model_dump() + + # Update manifest-controlled fields + for key in MANIFEST_SOURCE_OF_TRUTH: + existing_dict[key] = new_dict[key] + + ensure_parameters_exist(current_workflow_input, existing_workflow_input) + updated_existing_workflow = Workflow(**existing_dict) + add_missing_parameters(current_workflow_input, updated_existing_workflow) + current_workflow_input = updated_existing_workflow merged[versionless_trs_id] = current_workflow_input + + if versions_kept and not skip_validation: + print( + f"\nKept {len(versions_kept)} existing versions (newer IWC versions not on Dockstore yet)" + ) + for msg in versions_kept: + print(f" {msg}") + + if invalid_versions: + print(f"\nFixed {len(invalid_versions)} invalid versions in workflows.yml") + return merged -def to_workflows_yaml(workflows_path: str, exclude_other: bool): - by_trs_id = merge_into_existing(workflows_path) +def to_workflows_yaml( + workflows_path: str, exclude_other: bool, skip_validation: bool = False +): + by_trs_id = merge_into_existing(workflows_path, skip_validation) # sort by trs id, should play nicer with git diffs sorted_workflows = list(dict(sorted(by_trs_id.items())).values()) if exclude_other: @@ -209,5 +328,14 @@ def to_workflows_yaml(workflows_path: str, exclude_other: bool): action="store_true", help="Exclude other items from processing.", ) + parser.add_argument( + "--skip-validation", + action="store_true", + help="Skip validation of workflow versions against TRS API.", + ) args = parser.parse_args() - to_workflows_yaml(args.workflows_path, exclude_other=args.exclude_other) + to_workflows_yaml( + args.workflows_path, + exclude_other=args.exclude_other, + skip_validation=args.skip_validation, + ) diff --git a/catalog/source/workflows.yml b/catalog/source/workflows.yml index eb3f6b18..120439ed 100644 --- a/catalog/source/workflows.yml +++ b/catalog/source/workflows.yml @@ -2,7 +2,7 @@ workflows: - trs_id: "#workflow/github.com/iwc-workflows/amr_gene_detection/main/versions/v1.1.5" categories: - ANNOTATION - workflow_name: AMR gene detection + workflow_name: amr_gene_detection workflow_description: Antimicrobial resistance gene detection from assembled bacterial genomes @@ -162,10 +162,54 @@ workflows: - fastqsanger.gz active: false iwc_id: bacterial-genome-assembly-main + - trs_id: "#workflow/github.com/iwc-workflows/bacterial-quality-and-contamination-control-post-assembly/main/versions/v1.0" + categories: [] + workflow_name: + Post-Assembly Quality Control and Contamination Check for Bacterial + Genomes + workflow_description: + This workflow performs quality and contamination control analysis + on assembled contigs to assess bacterial genome quality and taxonomic assignment + ploidy: ANY + parameters: + - key: Input sequence reads (forward) + type_guide: + class: File + ext: + - fastq + - fastq.gz + - fastqsanger + - fastqsanger.gz + - key: Input sequence reads (reverse) + type_guide: + class: File + ext: + - fastq + - fastq.gz + - fastqsanger + - fastqsanger.gz + - key: Fastq boolean + type_guide: + class: boolean + - key: Fasta boolean + type_guide: + class: boolean + - key: Input sequence contigs FASTA + type_guide: + class: File + ext: fasta + - key: Select a taxonomy database + type_guide: + class: text + - key: Select a NCBI taxonomy database + type_guide: + class: text + active: false + iwc_id: bacterial-quality-and-contamination-control-post-assembly-main - trs_id: "#workflow/github.com/iwc-workflows/bacterial_genome_annotation/main/versions/v1.1.11" categories: - ANNOTATION - workflow_name: Bacterial Genome Annotation + workflow_name: bacterial_genome_annotation workflow_description: Annotation of an assembled bacterial genomes to detect genes, potential plasmids, integrons and Insertion sequence (IS) elements. @@ -539,6 +583,20 @@ workflows: class: boolean active: true iwc_id: fastq-to-matrix-10x-scrna-seq-fastq-to-matrix-10x-v3 + - trs_id: "#workflow/github.com/iwc-workflows/functional-annotation-protein-sequences/main/versions/v0.1" + categories: + - ANNOTATION + workflow_name: Functional annotation of protein sequences + workflow_description: + This workflow uses eggNOG mapper and InterProScan for functional + annotation of protein sequences. + ploidy: ANY + parameters: + - key: input + type_guide: + class: File + active: false + iwc_id: functional-annotation-protein-sequences-main - trs_id: "#workflow/github.com/iwc-workflows/generic-non-segmented-viral-variant-calling/main/versions/v0.1" categories: - CONSENSUS_SEQUENCES @@ -549,23 +607,23 @@ workflows: workflow_description: Variant calling and consensus sequence generation for batches of Illumina PE sequenced viruses with uncomplicated and stable genome structure - (like e.g. Morbilliviruses). It can handle both ampliconic and non-ampliconic data. + (like e.g. Morbilliviruses). ploidy: ANY taxonomy_id: 11158 parameters: - key: Paired collection of sequencing data + variable: SANGER_READ_RUN_PAIRED type_guide: class: Collection collection_type: list:paired - variable: SANGER_READ_RUN_PAIRED - key: Reference annotation + variable: GENE_MODEL_URL type_guide: class: File - variable: GENE_MODEL_URL - key: Fasta reference genome + variable: ASSEMBLY_FASTA_URL type_guide: class: File - variable: ASSEMBLY_FASTA_URL - key: Primer scheme (optional) type_guide: class: File @@ -577,20 +635,6 @@ workflows: class: integer active: true iwc_id: generic-non-segmented-viral-variant-calling-main - - trs_id: "#workflow/github.com/iwc-workflows/functional-annotation-protein-sequences/main/versions/v0.1" - categories: - - ANNOTATION - workflow_name: Functional annotation of protein sequences - workflow_description: - This workflow uses eggNOG mapper and InterProScan for functional - annotation of protein sequences. - ploidy: ANY - parameters: - - key: input - type_guide: - class: File - active: false - iwc_id: functional-annotation-protein-sequences-main - trs_id: "#workflow/github.com/iwc-workflows/generic-variant-calling-wgs-pe/main/versions/v0.1.1" categories: - VARIANT_CALLING @@ -964,13 +1008,12 @@ workflows: class: text active: false iwc_id: pseudobulk-worflow-decoupler-edger-main - - trs_id: "#workflow/github.com/iwc-workflows/quality-and-contamination-control/main/versions/v1.1.9" - categories: - - ASSEMBLY - workflow_name: Quality and Contamination Control For Genome Assembly + - trs_id: "#workflow/github.com/iwc-workflows/quality-and-contamination-control-raw-reads/main/versions/v1.1.10" + categories: [] + workflow_name: Raw Read Quality and Contamination Control For Genome Assembly workflow_description: Short paired-end read analysis to provide quality analysis, - read cleaning and taxonomy assignation + read cleaning and taxonomy assignation directly from raw reads ploidy: ANY parameters: - key: Input sequence reads (forward) @@ -996,7 +1039,7 @@ workflows: type_guide: class: text active: false - iwc_id: quality-and-contamination-control-main + iwc_id: quality-and-contamination-control-raw-reads-main - trs_id: "#workflow/github.com/iwc-workflows/repeatmasking/main/versions/v0.1" categories: - ANNOTATION