diff --git a/.env.ci b/.env.ci new file mode 100644 index 0000000..f4157a8 --- /dev/null +++ b/.env.ci @@ -0,0 +1,21 @@ +#!/bin/bash + +# ************* BEGIN: Required CLI Settings *********************** + +# ** DELETE SAFETY CHECK ** +# We highly recommend leaving this set to True in production +# When set to True, delete operations will only be allowed on service URLs that +# are on localhost. This includes KF dataservice, Dewrangle, and FHIR service +# When this is set to False, delete operations will be allowed on any +# service the CLI interacts with + +# CI needs this to be false +DWDS_DELETE_SAFETY_CHECK=False + +# Dewrangle API variables +export DEWRANGLE_BASE_URL="https://dewrangle.com/" + +# CI overwrites this at runtime, leave commented +# export DEWRANGLE_DEV_PAT= + +# ************* END: Required CLI Settings *********************** diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6ca513a..4e28ed9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,36 +2,37 @@ name: ✅ CI on: pull_request: - types: [opened, reopened, edited, synchronize, closed] + types: [opened, reopened, synchronize, ready_for_review] concurrency: group: ci-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true env: - DEWRANGLE_DEV_PAT: ${{ secrets.DEWRANGLE_DEV_PAT }} - DEWRANGLE_BASE_URL: ${{ secrets.DEWRANGLE_BASE_URL }} - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - AWS_BUCKET_DATA_TRANSFER_TEST: ${{ secrets.AWS_BUCKET_DATA_TRANSFER_TEST }} - CAVATICA_BILLING_GROUP_ID: ${{ secrets.CAVATICA_BILLING_GROUP_ID }} + DEWRANGLE_DEV_PAT: ${{ secrets.DEWRANGLE_DEV_PAT }} + DEWRANGLE_BASE_URL: ${{ secrets.DEWRANGLE_BASE_URL }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_BUCKET_DATA_TRANSFER_TEST: ${{ secrets.AWS_BUCKET_DATA_TRANSFER_TEST }} + CAVATICA_BILLING_GROUP_ID: ${{ 
secrets.CAVATICA_BILLING_GROUP_ID }} + DWDS_DELETE_SAFETY_CHECK: ${{ secrets.DWDS_DELETE_SAFETY_CHECK }} jobs: lint: name: 🚨 Lint code - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest timeout-minutes: 10 steps: - - name: 👩‍💻 Checkout code - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - - name: 🐍 Setup Python - uses: actions/setup-python@v3 + - uses: actions/setup-python@v5 with: python-version: "3.12" + cache: pip - name: 📦 Install deps run: | + python -m pip install --upgrade pip pip install black==24.10.0 - name: 🚨 Lint code @@ -40,55 +41,58 @@ jobs: unit-test: name: ✅ Unit test - runs-on: ubuntu-20.04 - timeout-minutes: 10 + runs-on: ubuntu-latest + timeout-minutes: 15 steps: - - name: 👩‍💻 Checkout code - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - - name: 🐍 Setup Python - uses: actions/setup-python@v3 + - uses: actions/setup-python@v5 with: python-version: "3.12" + cache: pip - name: 📦 Install deps run: | - pip install --upgrade virtualenv - virtualenv -p python3 venv - source venv/bin/activate - pip install -e . - pip install .[dev] + python -m pip install --upgrade pip + pip install -e ".[dev]" - name: ✅ Test run: | - source venv/bin/activate - pytest --show-capture={no,stdout,stderr} ./tests/unit - + pytest --show-capture={no,stdout,stderr} tests/unit integration-test: name: ✅ Integration test - runs-on: ubuntu-20.04 - timeout-minutes: 10 + runs-on: ubuntu-latest + timeout-minutes: 30 + + # IMPORTANT: secrets are not exposed to fork PRs + if: ${{ github.event.pull_request.head.repo.fork == false }} + steps: - - name: 👩‍💻 Checkout code - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - - name: 🐍 Setup Python - uses: actions/setup-python@v3 + - uses: actions/setup-python@v5 with: python-version: "3.12" + cache: pip + + - name: 🐳 Sanity check Docker + run: | + docker version + docker info - name: 📦 Install deps run: | - pip install --upgrade virtualenv - virtualenv -p python3 venv - source venv/bin/activate - pip install -e . 
- pip install .[dev] + python -m pip install --upgrade pip + pip install -e ".[dev]" d3b-clients --help - name: ✅ Test + env: + # Ensure these are present in the test process + DEWRANGLE_DEV_PAT: ${{ secrets.DEWRANGLE_DEV_PAT }} + DEWRANGLE_BASE_URL: ${{ secrets.DEWRANGLE_BASE_URL }} run: | - source venv/bin/activate - pytest --show-capture={no,stdout,stderr} ./tests/integration - + # Optional: hard-fail early with a clear message instead of confusing KeyErrors + python -c "import os; assert os.getenv('DEWRANGLE_DEV_PAT') and os.getenv('DEWRANGLE_BASE_URL'), 'Missing Dewrangle secrets'" + pytest --show-capture={no,stdout,stderr} tests/integration diff --git a/.gitignore b/.gitignore index 52bcd6f..45cd9fe 100644 --- a/.gitignore +++ b/.gitignore @@ -182,7 +182,8 @@ cython_debug/ *.python-version data/ -!d3b-api-clients/tests/data +!tests/data/ +!tests/data/** pyvenv.cfg .DS_Store diff --git a/d3b_api_client_cli/cli/__init__.py b/d3b_api_client_cli/cli/__init__.py index e268080..bbf1e4c 100644 --- a/d3b_api_client_cli/cli/__init__.py +++ b/d3b_api_client_cli/cli/__init__.py @@ -4,8 +4,11 @@ All commands are initialized here """ -import click -from d3b_api_client_cli.cli.dewrangle import * +from d3b_api_client_cli.cli.dewrangle.graphql_commands import * +from d3b_api_client_cli.cli.dewrangle.ingest_commands import * +from d3b_api_client_cli.cli.dewrangle.setup_commands import * +from d3b_api_client_cli.cli.dewrangle.global_id_commands import * +from d3b_api_client_cli.cli.fhir.commands import * from d3b_api_client_cli.cli.postgres import * from d3b_api_client_cli.cli.faker import * @@ -27,9 +30,17 @@ def postgres(): @click.group() def dewrangle(): """ - Group of lower level CLI commands relating to working directly with the - Dewrangle API + Group of CLI commands relating to working with the Dewrangle """ + pass + + +@click.group() +def fhir(): + """ + Group of CLI commands relating to FHIR service operations + """ + pass @click.group() @@ -49,31 +60,37 @@ def 
main(): # Postgres API commands postgres.add_command(save_file_to_db) -# Dewrangle API commands +# CRUD Graphql commands dewrangle.add_command(upsert_organization) dewrangle.add_command(delete_organization) dewrangle.add_command(read_organizations) dewrangle.add_command(upsert_study) dewrangle.add_command(delete_study) dewrangle.add_command(read_studies) -dewrangle.add_command(upsert_credential) -dewrangle.add_command(delete_credential) -dewrangle.add_command(read_credentials) -dewrangle.add_command(upsert_volume) -dewrangle.add_command(delete_volume) -dewrangle.add_command(read_volumes) -dewrangle.add_command(list_and_hash_volume) -dewrangle.add_command(hash_volume_and_wait) -dewrangle.add_command(read_job) -dewrangle.add_command(create_billing_group) -dewrangle.add_command(delete_billing_group) -dewrangle.add_command(read_billing_groups) -dewrangle.add_command(upsert_global_descriptors) -dewrangle.add_command(download_global_descriptors) -dewrangle.add_command(upsert_and_download_global_descriptors) -dewrangle.add_command(upsert_and_download_global_descriptor) - -# Add command groups to the root CLI -main.add_command(dewrangle) -main.add_command(postgres) -main.add_command(faker) +dewrangle.add_command(get_study) +dewrangle.add_command(read_fhir_ingest_job) +dewrangle.add_command(read_fhir_servers) +dewrangle.add_command(upsert_fhir_server) +dewrangle.add_command(delete_fhir_server) + +# Ingest into Dewrangle commands +dewrangle.add_command(upload_study_file) +dewrangle.add_command(ingest_study_file) +dewrangle.add_command(ingest_study_files) + +# Setup commands +dewrangle.add_command(setup_dewrangle_org) +dewrangle.add_command(setup_dewrangle_study) +dewrangle.add_command(setup_all_studies) + +# Global ID commands +dewrangle.add_command(upsert_global_ids) +dewrangle.add_command(download_global_ids) + +# FHIR commands +fhir.add_command(build) +fhir.add_command(load_fhir) +fhir.add_command(delete_from_file) +fhir.add_command(delete_all) 
+fhir.add_command(delete_fhir_study) +fhir.add_command(total_counts) diff --git a/d3b_api_client_cli/cli/common.py b/d3b_api_client_cli/cli/common.py new file mode 100644 index 0000000..dd57c0d --- /dev/null +++ b/d3b_api_client_cli/cli/common.py @@ -0,0 +1,72 @@ +""" +Common functions for CLI commands. Mostly parameter validators +""" + +from pprint import pformat +from typing import List + +from urllib.parse import urlparse +import click + +from d3b_api_client_cli import utils +from d3b_api_client_cli.config import ( + KidsFirstFhirEntity, + ETL_STAGES, + config, +) + + +def validate_kids_first_types(types: List[str]): + """ + Validate Kids First Dataservice types + """ + endpoints = config["dataservice"]["endpoints"] + + kids_first_types = set(types) + default = set(utils.camel_to_snake(e) for e in endpoints) + + if not kids_first_types <= default: + invalid = kids_first_types - default + raise click.BadParameter( + f"Invalid Kids First Dataservice Type: {pformat(invalid)}." + f" Each type must be one of {pformat(default)}" + ) + + +def validate_url(ctx, param, url: str): + """ + Ensure url parses and has both a scheme and a network location + """ + try: + result = urlparse(url) + except Exception as exc: + raise click.BadParameter(f"{url} is not a valid URL") from exc + if not (result.scheme and result.netloc): + raise click.BadParameter(f"{url} is not a valid URL") + return url + + +def validate_kids_first_fhir_types(types: List[str]): + """ + Validate kids_first_fhir_types + """ + kids_first_fhir_types = set(types) + default = {v.value for v in KidsFirstFhirEntity} + + if not kids_first_fhir_types <= default: + invalid = kids_first_fhir_types - default + raise click.BadParameter( + f"Invalid Kids First FHIR Type: {pformat(invalid)}." + f" Each type must be one of {pformat(default)}" + ) + + +def validate_stages(stages): + """ + Validate stages CLI option + """ + if not all(s in set(ETL_STAGES) for s in stages): + raise click.BadParameter( + f"Invalid stages value {stages}. 
Must be one or more chars in:" + f" '{ETL_STAGES}'" + ) diff --git a/d3b_api_client_cli/cli/dewrangle/__init__.py b/d3b_api_client_cli/cli/dewrangle/__init__.py index 2c7febd..e69de29 100644 --- a/d3b_api_client_cli/cli/dewrangle/__init__.py +++ b/d3b_api_client_cli/cli/dewrangle/__init__.py @@ -1,13 +0,0 @@ -""" -Dewrangle CLI commands - -Functions for interacting the Dewrangle's GraphQL and REST APIs -""" - -from d3b_api_client_cli.cli.dewrangle.organization_commands import * -from d3b_api_client_cli.cli.dewrangle.study_commands import * -from d3b_api_client_cli.cli.dewrangle.credential_commands import * -from d3b_api_client_cli.cli.dewrangle.volume_commands import * -from d3b_api_client_cli.cli.dewrangle.job_commands import * -from d3b_api_client_cli.cli.dewrangle.billing_group_commands import * -from d3b_api_client_cli.cli.dewrangle.global_id_commands import * diff --git a/d3b_api_client_cli/cli/dewrangle/billing_group_commands.py b/d3b_api_client_cli/cli/dewrangle/billing_group_commands.py deleted file mode 100644 index 747ff48..0000000 --- a/d3b_api_client_cli/cli/dewrangle/billing_group_commands.py +++ /dev/null @@ -1,95 +0,0 @@ -""" -Dewrangle billing_group commands -""" - -import logging - -import click - -from d3b_api_client_cli.config import config -from d3b_api_client_cli.config.log import init_logger -from d3b_api_client_cli.dewrangle import graphql as gql_client - -logger = logging.getLogger(__name__) -DEWRANGLE_DIR = config["dewrangle"]["output_dir"] - - -@click.command() -@click.option( - "--cavatica-billing-group-id", - required=True, - help="Cavatica billing group ID", -) -@click.option( - "--organization-id", - required=True, - help="ID of the Dewrangle org this billing_group will belong to", -) -def create_billing_group(cavatica_billing_group_id, organization_id): - """ - Create billing_group in Dewrangle - """ - init_logger() - - return gql_client.create_billing_group( - organization_id, cavatica_billing_group_id - ) - - -@click.command() 
-@click.argument( - "billing_group_id", -) -@click.option( - "--disable-delete-safety-check", - is_flag=True, - help="This will allow deleting of the billing group", -) -def delete_billing_group(billing_group_id, disable_delete_safety_check): - """ - Delete billing_group in Dewrangle by ID - - \b - Arguments: - \b - billing_group_id - Dewrangle node ID - """ - init_logger() - - return gql_client.delete_billing_group( - billing_group_id, not disable_delete_safety_check - ) - - -@click.command() -@click.option( - "--output-dir", - default=DEWRANGLE_DIR, - type=click.Path(exists=False, file_okay=False, dir_okay=True), - help="The path to the data dir where billing_groups will be written", -) -def read_billing_groups(output_dir): - """ - Fetch billing_groups from Dewrangle - """ - init_logger() - - return gql_client.read_billing_groups(output_dir) - - -@click.command() -@click.argument( - "node_id", -) -def get_billing_group(node_id): - """ - Get billing_group in Dewrangle by Dewrangle GraphQL node ID - - \b - Arguments: - \b - node_id - ID of the billing_group in Dewrangle - """ - init_logger() - - return gql_client.read_billing_group(node_id) diff --git a/d3b_api_client_cli/cli/dewrangle/credential_commands.py b/d3b_api_client_cli/cli/dewrangle/credential_commands.py deleted file mode 100644 index 68ccd61..0000000 --- a/d3b_api_client_cli/cli/dewrangle/credential_commands.py +++ /dev/null @@ -1,158 +0,0 @@ -""" -Dewrangle credential commands -""" - -import logging - -import click - -from d3b_api_client_cli.config import config -from d3b_api_client_cli.config.log import init_logger -from d3b_api_client_cli.utils import read_json -from d3b_api_client_cli.dewrangle import graphql as gql_client - -logger = logging.getLogger(__name__) -DEWRANGLE_DIR = config["dewrangle"]["output_dir"] -DEFAULT_CREDENTIAL_TYPE = config["dewrangle"]["credential_type"] - - -@click.command() -@click.option( - "--name", - help="Credential name", -) -@click.option( - "--key", - help="Credential 
key", -) -@click.option( - "--secret", - help="Credential secret", -) -@click.option( - "--credential-type", - default=DEFAULT_CREDENTIAL_TYPE, - help="Credential name", -) -@click.option( - "--study-id", - help="Graphql node id of the Study this credential belongs to", -) -@click.option( - "--study-global-id", - help="Global ID of the study this credential belongs to", -) -@click.option( - "--filepath", - type=click.Path(exists=True, file_okay=True, dir_okay=False), -) -def upsert_credential( - name, - key, - secret, - credential_type, - study_id, - study_global_id, - filepath, -): - """ - Upsert credential in Dewrangle. Either provide values in a JSON file or - provide values via CLI options. If both are provided, CLI options - take precedence and will overwrite the values from the file - """ - init_logger() - - cli_opts = { - "key": key, - "secret": secret, - "name": name, - "type": credential_type, - } - input_data = {} - if filepath: - input_data.update(read_json(filepath)) - for k, value in cli_opts.items(): - if value: - input_data[k] = value - else: - input_data = cli_opts - - return gql_client.upsert_credential( - input_data, study_id=study_id, study_global_id=study_global_id - ) - - -@click.command() -@click.option( - "--node-id", - help="Credential graphql node ID", -) -@click.option( - "--credential-key", - help="Credential key", -) -@click.option( - "--study-global-id", - help="Global ID of the study this credential belongs to", -) -@click.option( - "--disable-delete-safety-check", - is_flag=True, - help="This will allow deleting of the organization", -) -def delete_credential( - node_id, credential_key, study_global_id, disable_delete_safety_check -): - """ - Delete credential using either credential key or credential graphql node - ID - """ - init_logger() - - return gql_client.delete_credential( - node_id=node_id, - credential_key=credential_key, - study_global_id=study_global_id, - delete_safety_check=not disable_delete_safety_check, - ) - - 
-@click.command() -@click.option( - "--output-dir", - default=DEWRANGLE_DIR, - type=click.Path(exists=False, file_okay=False, dir_okay=True), - help="The path to the data dir where credentials will be written", -) -@click.option( - "--study-global-id", - help="Global ID of the study to filter credentials by", -) -def read_credentials(output_dir, study_global_id): - """ - Fetch credentials from Dewrangle - """ - init_logger() - - return gql_client.read_credentials( - study_global_id, - output_dir, - ) - - -@click.command() -@click.argument( - "node_id", -) -def get_credential(node_id): - """ - Get credential in Dewrangle by Dewrangle GraphQL node ID - - \b - Arguments: - \b - node_id - ID of the credential in Dewrangle - """ - init_logger() - - return gql_client.read_credential(node_id) diff --git a/d3b_api_client_cli/cli/dewrangle/global_id_commands.py b/d3b_api_client_cli/cli/dewrangle/global_id_commands.py index 654cd90..c583d43 100644 --- a/d3b_api_client_cli/cli/dewrangle/global_id_commands.py +++ b/d3b_api_client_cli/cli/dewrangle/global_id_commands.py @@ -3,17 +3,12 @@ in Dewrangle """ -import os import logging import click -from d3b_api_client_cli.config import log, FHIR_RESOURCE_TYPES -from d3b_api_client_cli.dewrangle.global_id import GlobalIdDescriptorOptions -from d3b_api_client_cli.dewrangle.global_id import ( - upsert_global_descriptors as _upsert_global_descriptors, - download_global_descriptors as _download_global_descriptors, - upsert_and_download_global_descriptors as _upsert_and_download_global_descriptors, - upsert_and_download_global_descriptor as _upsert_and_download_global_descriptor, +from d3b_api_client_cli.dewrangle.rest import ( + request_global_ids as _upsert_global_ids, + download_global_ids as _download_global_ids, ) logger = logging.getLogger(__name__) @@ -21,188 +16,57 @@ @click.command() @click.option( - "--output-filepath", - type=click.Path(exists=False, file_okay=True, dir_okay=False), - help="If provided, download the file to 
this path. This takes " - "precedence over the --output-dir option", -) -@click.option( - "--output-dir", - default=os.getcwd(), - type=click.Path(exists=True, file_okay=False, dir_okay=True), - help="If provided, download the file with the default file name into " - "this directory", -) -@click.option( - "--download-all", - is_flag=True, - help="What descriptor(s) for each global ID to download. Either download" - " all descriptors for each global ID or just the most recent", -) -@click.option( - "--study-global-id", - help="The global ID of the study in Dewrangle. You must provide either " - "the global ID of the study OR the GraphQL ID of the study but not both", -) -@click.option( - "--study-id", - help="The GraphQL ID of the study in Dewrangle. You must provide either " - "the global ID of the study OR the GraphQL ID of the study but not both", -) -@click.option( - "--global-id", - help="Global ID associated with this descriptor." - " If this is provided, and the descriptor is new, then Dewrangle" - " will append the descriptor to this global ID's descriptor list", -) -@click.option( - "--fhir-resource-type", - type=click.Choice([rt for rt in FHIR_RESOURCE_TYPES.keys()]), - required=True, -) -@click.option( - "--descriptor", - required=True, -) -def upsert_and_download_global_descriptor( - descriptor, - fhir_resource_type, - global_id, - study_id, - study_global_id, - download_all, - output_dir, - output_filepath, -): - """ - Send request to upsert one global ID descriptor in Dewrangle and - download the resulting global ID descriptors. 
- - In order to create new global IDs provide: - descriptor, fhir-resource-type - - In order to update existing global IDs: - descriptor, fhir-resource-type, global-id - - \b - Arguments: - \b - input_filepath - Path to the file with global IDs and descriptors - """ - - log.init_logger() - - if (not study_id) and (not study_global_id): - raise click.BadParameter( - "❌ You must provide either the study's global ID in Dewrangle OR " - "the study's GraphQL ID in Dewrangle" - ) - return _upsert_and_download_global_descriptor( - descriptor, - fhir_resource_type, - global_id=global_id, - study_global_id=study_global_id, - dewrangle_study_id=study_id, - download_all=download_all, - output_dir=output_dir, - output_filepath=output_filepath, - ) - - -@click.command() -@click.option( - "--output-filepath", - type=click.Path(exists=False, file_okay=True, dir_okay=False), - help="If provided, download the file to this path. This takes " - "precedence over the --output-dir option", -) -@click.option( - "--output-dir", - default=os.getcwd(), - type=click.Path(exists=True, file_okay=False, dir_okay=True), - help="If provided, download the file with the default file name into " - "this directory", -) -@click.option( - "--download-all", - is_flag=True, - help="What descriptor(s) for each global ID to download. Either download" - " all descriptors for each global ID or just the most recent", -) -@click.option( - "--study-global-id", - help="The global ID of the study in Dewrangle. You must provide either " - "the global ID of the study OR the GraphQL ID of the study but not both", + "--dewrangle-job-id", + help="The job id returned from upserting global IDs. If provided, " + "only global IDs from the recent upsert request will be downloaded", ) -@click.option( - "--study-id", - help="The GraphQL ID of the study in Dewrangle. 
You must provide either " - "the global ID of the study OR the GraphQL ID of the study but not both", +@click.argument( + "kf_study_id", ) @click.argument( - "input_filepath", + "filepath", type=click.Path(exists=False, file_okay=True, dir_okay=False), ) -def upsert_and_download_global_descriptors( - input_filepath, - study_id, - study_global_id, - download_all, - output_dir, - output_filepath, -): +def download_global_ids(kf_study_id, filepath, dewrangle_job_id): """ - Send request to upsert global ID descriptors in Dewrangle and - download the resulting global ID descriptors. - - In order to create new global IDs provide a CSV file with the columns: - descriptor, fhirResourceType - - In order to update existing global IDs provide a CSV file with the columns: - descriptor, fhirResourceType, globalId + Download global IDs for a study in Dewrangle + + When --dewrangle-job-id is given only that job's global IDs are + downloaded; otherwise all global IDs of the study are downloaded \b Arguments: \b - input_filepath - Path to the file with global IDs and descriptors + kf_study_id - Kids First ID of the study in Dataservice + filepath - Path to the file that you want global IDs to be written to """ + from d3b_api_client_cli.config import log log.init_logger() - if (not study_id) and (not study_global_id): - raise click.BadParameter( - "❌ You must provide either the study's global ID in Dewrangle OR " - "the study's GraphQL ID in Dewrangle" - ) + # Scope of the download, used only to word the log message below + what = "all" + if dewrangle_job_id: + what = f"job {dewrangle_job_id}" - return _upsert_and_download_global_descriptors( - input_filepath, - study_global_id=study_global_id, - dewrangle_study_id=study_id, - download_all=download_all, - output_dir=output_dir, - output_filepath=output_filepath, + logger.info( + f"🛸 Downloading {what} global IDs from Dewrangle to file: " + f"{filepath} for study {kf_study_id}" ) + _download_global_ids(kf_study_id, filepath, dewrangle_job_id) @click.command() -@click.option( - "--study-global-id", - help="The global ID of the 
study in Dewrangle. You must provide either " - "the global ID of the study OR the GraphQL ID of the study but not both", -) -@click.option( - "--study-id", - help="The GraphQL ID of the study in Dewrangle. You must provide either " - "the global ID of the study OR the GraphQL ID of the study but not both", +@click.argument( + "kf_study_id", ) @click.argument( "filepath", type=click.Path(exists=False, file_okay=True, dir_okay=False), ) -def upsert_global_descriptors(filepath, study_id, study_global_id): +def upsert_global_ids(kf_study_id, filepath): """ - Upsert global ID descriptors in Dewrangle for a study. + Request global IDs to be created or updated in Dewrangle for a study. In order to create new global IDs provide a CSV file with the columns: descriptor, fhirResourceType @@ -213,73 +77,15 @@ def upsert_global_descriptors(filepath, study_id, study_global_id): \b Arguments: \b - filepath - Path to the file with global IDs and descriptors - """ - - log.init_logger() - - if (not study_id) and (not study_global_id): - raise click.BadParameter( - "❌ You must provide either the study's global ID in Dewrangle OR " - "the study's GraphQL ID in Dewrangle" - ) - - return _upsert_global_descriptors(filepath, study_global_id, study_id) - - -@click.command() -@click.option( - "--output-dir", - default=os.getcwd(), - type=click.Path(exists=True, file_okay=False, dir_okay=True), - help="If provided, download the file with the default file name into " - "this directory", -) -@click.option( - "--download-all", - is_flag=True, - help="What descriptor(s) for each global ID to download. Either download" - " all descriptors for each global ID or just the most recent", -) -@click.option( - "--job-id", help="Dewrangle job id from the upsert_global_descriptors cmd" -) -@click.option( - "--study-global-id", - help="The global ID of the study in Dewrangle. 
You must provide either " - "the global ID of the study OR the GraphQL ID of the study but not both", -) -@click.option( - "--study-id", - help="The GraphQL ID of the study in Dewrangle. You must provide either " - "the global ID of the study OR the GraphQL ID of the study but not both", -) -@click.option( - "--filepath", - type=click.Path(exists=False, file_okay=True, dir_okay=False), - help="If provided, download the file to this filepath. This takes " - "precedence over --output-dir", -) -def download_global_descriptors( - filepath, study_id, study_global_id, job_id, download_all, output_dir -): - """ - Download global ID descriptors in Dewrangle for a study. + kf_study_id - Kids First ID of the study in Dataservice + filepath - Path to the global ID request file """ + from d3b_api_client_cli.config import log log.init_logger() - if (not study_id) and (not study_global_id): - raise click.BadParameter( - "❌ You must provide either the study's global ID in Dewrangle OR " - "the study's GraphQL ID in Dewrangle" - ) - - return _download_global_descriptors( - dewrangle_study_id=study_id, - study_global_id=study_global_id, - filepath=filepath, - job_id=job_id, - download_all=download_all, - output_dir=output_dir, + logger.info( + "🛸 Requesting global IDs from Dewrangle for descriptors in " + f"{filepath} and study {kf_study_id}" ) + _upsert_global_ids(kf_study_id, filepath) diff --git a/d3b_api_client_cli/cli/dewrangle/graphql_commands.py b/d3b_api_client_cli/cli/dewrangle/graphql_commands.py new file mode 100644 index 0000000..839127f --- /dev/null +++ b/d3b_api_client_cli/cli/dewrangle/graphql_commands.py @@ -0,0 +1,291 @@ +""" +All CLI commands related to basic Dewrangle graphql operations + +- CRUD Dewrangle organization +- CRUD Dewrangle study +""" + +import logging + +import click + +from d3b_api_client_cli.config import ( + IdTypes, + DEWRANGLE_DIR, +) +from d3b_api_client_cli.utils import read_json +from d3b_api_client_cli.dewrangle import graphql as 
gql_client + +logger = logging.getLogger(__name__) + + +@click.command(help="Delete FHIR server in Dewrangle") +@click.argument( + "dewrangle_organization_id", +) +def delete_fhir_server(dewrangle_organization_id): + """ + Delete FHIR server in Dewrangle + + \b + Arguments: + \b + dewrangle_organization_id - Dewrangle ID of organization + """ + from d3b_api_client_cli.config import log + + log.init_logger() + + return gql_client.delete_fhir_server(dewrangle_organization_id) + + +@click.command(help="Upsert FHIR server in Dewrangle") +@click.option( + "--oidc-client-secret", + help="The secret for the Keycloak OIDC client that will be used to " + " authenticate with the FHIR server before releasing data to it", +) +@click.argument( + "filepath", + type=click.Path(exists=True, file_okay=True, dir_okay=False), +) +@click.argument( + "dewrangle_organization_id", +) +def upsert_fhir_server(dewrangle_organization_id, filepath, oidc_client_secret): + """ + Upsert FHIR server in Dewrangle + + \b + Arguments: + \b + dewrangle_organization_id - Dewrangle ID of organization + filepath - Path to file defining Dewrangle organization + """ + from d3b_api_client_cli.config import log + + log.init_logger() + + return gql_client.upsert_fhir_server( + dewrangle_organization_id, + read_json(filepath), + oidc_client_secret=oidc_client_secret, + ) + + +@click.command(help="Fetch FHIR servers from Dewrangle") +@click.option( + "--output-dir", + default=DEWRANGLE_DIR, + type=click.Path(exists=False, file_okay=False, dir_okay=True), + help="The path to the data dir where fhir servers will be written", +) +@click.argument( + "dewrangle_organization_id", +) +def read_fhir_servers(dewrangle_organization_id, output_dir): + """ + Fetch FHIR servers from Dewrangle + + \b + Arguments: + \b + dewrangle_organization_id - Dewrangle ID of organization + """ + from d3b_api_client_cli.config import log + + log.init_logger() + + return gql_client.read_fhir_servers(dewrangle_organization_id, output_dir) 
+ + +@click.command(help="Fetch FHIR ingest job from Dewrangle") +@click.option( + "--output-dir", + default=DEWRANGLE_DIR, + type=click.Path(exists=False, file_okay=False, dir_okay=True), + help="The path to the data dir where the FHIR ingest job will be written", +) +@click.argument( + "node_id", +) +def read_fhir_ingest_job(node_id, output_dir): + """ + Fetch FHIR ingest job from Dewrangle + + \b + Arguments: + \b + node_id - Dewrangle ID of FHIR ingest job + """ + from d3b_api_client_cli.config import log + + log.init_logger() + + return gql_client.read_fhir_ingest_job(node_id, output_dir) + + +@click.command(help="Upsert organization in Dewrangle") +@click.argument( + "filepath", + type=click.Path(exists=True, file_okay=True, dir_okay=False), +) +def upsert_organization(filepath): + """ + Upsert organization in Dewrangle + + \b + Arguments: + \b + filepath - Path to file defining Dewrangle organization + """ + from d3b_api_client_cli.config import log + + log.init_logger() + + return gql_client.upsert_organization(read_json(filepath)) + + +@click.command(help="Delete organization in Dewrangle by either ID or name") +@click.option( + "--dewrangle-org-id", + help="The Dewrangle GraphQL node ID of organization ", +) +@click.option( + "--dewrangle-org-name", + help="The Dewrangle name of organization ", +) +def delete_organization(dewrangle_org_id, dewrangle_org_name): + """ + Delete organization in Dewrangle by ID or, when no ID is given, by name + """ + from d3b_api_client_cli.config import log + + log.init_logger() + + if not (dewrangle_org_id or dewrangle_org_name): + raise click.UsageError( + "Provide either --dewrangle-org-id or --dewrangle-org-name" + ) + + if dewrangle_org_id: + kwargs = {"dewrangle_org_id": dewrangle_org_id} + else: + kwargs = {"dewrangle_org_name": dewrangle_org_name} + return gql_client.delete_organization(**kwargs) + + +@click.command(help="Fetch organizations from Dewrangle") +@click.option( + "--output-dir", + default=DEWRANGLE_DIR, + type=click.Path(exists=False, file_okay=False, dir_okay=True), + help="The path to the data 
dir where organizations will be written", +) +def read_organizations(output_dir): + """ + Fetch organizations from Dewrangle + """ + from d3b_api_client_cli.config import log + + log.init_logger() + + return gql_client.read_organizations(output_dir) + + +@click.command(help="Upsert study in Dewrangle") +@click.option( + "--kf-study-id", + help="The KF ID of the study to use as the Dewrangle global ID", +) +@click.argument( + "filepath", + type=click.Path(exists=True, file_okay=True, dir_okay=False), +) +@click.argument( + "organization_id", +) +def upsert_study(filepath, organization_id, kf_study_id): + """ + Upsert study in Dewrangle + + \b + Arguments: + \b + filepath - Path to file defining Dewrangle study + organization_id - ID of the Dewrangle org this study will belong to + """ + from d3b_api_client_cli.config import log + + log.init_logger() + + # The payload is always needed; --kf-study-id only overrides the kf_id + # stored in the file, which is removed from the payload either way + data = read_json(filepath) + file_kf_id = data.pop("kf_id", None) + kf_id = kf_study_id or file_kf_id + + return gql_client.upsert_study(data, organization_id, kf_study_id=kf_id) + + +@click.command(help="Delete study in Dewrangle") +@click.option( + "--id-type", + default=IdTypes.DEWRANGLE.value, + help=f"The type of ID. 
Must be one of: {[i.value for i in IdTypes]} ", +) +@click.argument( + "node_id", +) +def delete_study(node_id, id_type): + """ + Delete study in Dewrangle + + \b + Arguments: + \b + node_id - ID of the study in Dewrangle + """ + from d3b_api_client_cli.config import log + + log.init_logger() + + return gql_client.delete_study(node_id, id_type) + + +@click.command(help="Fetch studies from Dewrangle") +@click.option( + "--output-dir", + default=DEWRANGLE_DIR, + type=click.Path(exists=False, file_okay=False, dir_okay=True), + help="The path to the data dir where studies will be written", +) +def read_studies(output_dir): + """ + Fetch studies from Dewrangle + """ + from d3b_api_client_cli.config import log + + log.init_logger() + + return gql_client.read_studies(output_dir) + + +@click.command(help="Get study in Dewrangle by Dewrangle node ID") +@click.argument( + "node_id", +) +def get_study(node_id): + """ + Get study in Dewrangle by Dewrangle GraphQL node ID + + \b + Arguments: + \b + node_id - ID of the study in Dewrangle + """ + from d3b_api_client_cli.config import log + + log.init_logger() + + return gql_client.read_study(node_id) diff --git a/d3b_api_client_cli/cli/dewrangle/ingest_commands.py b/d3b_api_client_cli/cli/dewrangle/ingest_commands.py new file mode 100644 index 0000000..291fa9b --- /dev/null +++ b/d3b_api_client_cli/cli/dewrangle/ingest_commands.py @@ -0,0 +1,103 @@ +""" +All CLI commands related to Dewrangle ingest operations + +- Ingest study files into Dewrangle for a given study +- Upload single study file without starting an ingest job in Dewrangle +""" + +import logging +import click + +from d3b_api_client_cli.dewrangle.rest import ( + upload_study_file as _upload_study_file, +) +from d3b_api_client_cli.dewrangle import ingest + +logger = logging.getLogger(__name__) + + +@click.command(help="Upload study file to Dewrangle study") +@click.argument( + "dewrangle_study_id", +) +@click.argument( + "filepath", + type=click.Path(exists=False, 
file_okay=True, dir_okay=False), +) +def upload_study_file(dewrangle_study_id, filepath): + """ + Upload a clinical study file to Dewrangle + + \b + Arguments: + \b + dewrangle_study_id - ID of the study in Dewrangle + filepath - Path to the file to be uploaded + """ + from d3b_api_client_cli.config import log + + log.init_logger() + + logger.info( + f"🛸 Uploading study file: {filepath} to Dewrangle study:" + f" {dewrangle_study_id}" + ) + _upload_study_file(dewrangle_study_id, filepath) + + +@click.command(help="Ingest FHIR resource file into Dewrangle study") +@click.argument( + "kf_study_id", +) +@click.argument( + "filepath", + type=click.Path(exists=False, file_okay=True, dir_okay=False), +) +def ingest_study_file(kf_study_id, filepath): + """ + Ingest a FHIR resource file into Dewrangle + + ** Only for debugging purposes ** + + \b + Arguments: + \b + kf_study_id - KF ID of the study in Dataservice + filepath - Path to the file to be uploaded + """ + from d3b_api_client_cli.config import log + + log.init_logger() + + logger.info( + f"🏭 Ingesting study file: {filepath} to Dewrangle study:" + f" {kf_study_id}" + ) + ingest.upload_and_ingest_study_file(kf_study_id, filepath) + + +@click.command( + help="Ingest a study's FHIR resource files into a Dewrangle study" +) +@click.argument( + "study_data_dir_or_file", + type=click.Path(exists=True, file_okay=True, dir_okay=True), +) +def ingest_study_files(study_data_dir_or_file): + """ + Ingest a study's FHIR resource files into a Dewrangle study + + \b + Arguments: + \b + study_data_dir - Path to the dir with FHIR json files to be ingested + """ + from d3b_api_client_cli.config import log + + log.init_logger() + + logger.info( + f"🏭 Ingesting study data in: {study_data_dir_or_file}" f" to Dewrangle" + ) + + return ingest.ingest_study_files(study_data_dir_or_file) diff --git a/d3b_api_client_cli/cli/dewrangle/job_commands.py b/d3b_api_client_cli/cli/dewrangle/job_commands.py deleted file mode 100644 index 
686f35f..0000000 --- a/d3b_api_client_cli/cli/dewrangle/job_commands.py +++ /dev/null @@ -1,39 +0,0 @@ -""" -Dewrangle Job commands -""" - -import logging - -import click - -from d3b_api_client_cli.config.log import init_logger -from d3b_api_client_cli.dewrangle import graphql as gql_client -from d3b_api_client_cli.config import config - -DEWRANGLE_DIR = config["dewrangle"]["output_dir"] - -logger = logging.getLogger(__name__) - - -@click.command() -@click.argument( - "node_id", -) -@click.option( - "--output-dir", - default=DEWRANGLE_DIR, - type=click.Path(exists=False, file_okay=False, dir_okay=True), - help="The path to the data dir where volumes will be written", -) -def read_job(node_id, output_dir): - """ - Get Job in Dewrangle by Dewrangle GraphQL node ID - - \b - Arguments: - \b - node_id - ID of the volume in Dewrangle - """ - init_logger() - - return gql_client.read_job(node_id, output_dir) diff --git a/d3b_api_client_cli/cli/dewrangle/organization_commands.py b/d3b_api_client_cli/cli/dewrangle/organization_commands.py deleted file mode 100644 index 8b11624..0000000 --- a/d3b_api_client_cli/cli/dewrangle/organization_commands.py +++ /dev/null @@ -1,88 +0,0 @@ -""" -Dewrangle CLI commands - -Functions for interacting the Dewrangle's GraphQL and REST APIs -""" - -import logging - -import click - -from d3b_api_client_cli.config import config -from d3b_api_client_cli.config.log import init_logger -from d3b_api_client_cli.utils import read_json -from d3b_api_client_cli.dewrangle import graphql as gql_client - -from pprint import pprint - -logger = logging.getLogger(__name__) - - -@click.command() -@click.argument( - "filepath", - type=click.Path(exists=True, file_okay=True, dir_okay=False), -) -def upsert_organization(filepath): - """ - Upsert organization in Dewrangle. 
Used in integration testing - - \b - Arguments: - \b - filepath - Path to file defining Dewrangle organization - """ - init_logger() - - return gql_client.upsert_organization(read_json(filepath)) - - -@click.command() -@click.option( - "--disable-delete-safety-check", - is_flag=True, - help="This will allow deleting of the organization", -) -@click.option( - "--dewrangle-org-id", - help="The Dewrangle GraphQL node ID of organization", -) -@click.option( - "--dewrangle-org-name", - help="The Dewrangle name of organization", -) -def delete_organization( - dewrangle_org_id, dewrangle_org_name, disable_delete_safety_check -): - """ - Delete organization in Dewrangle by either ID or name. Used in integration - testing - """ - init_logger() - - if dewrangle_org_id: - kwargs = {"dewrangle_org_id": dewrangle_org_id} - else: - kwargs = {"dewrangle_org_name": dewrangle_org_name} - - kwargs["delete_safety_check"] = not disable_delete_safety_check - - pprint(kwargs) - - return gql_client.delete_organization(**kwargs) - - -@click.command() -@click.option( - "--output-dir", - default=config["dewrangle"]["output_dir"], - type=click.Path(exists=False, file_okay=False, dir_okay=True), - help="The path to the data dir where organizations will be written", -) -def read_organizations(output_dir): - """ - Fetch organizations from Dewrangle. 
Used in integration testing - """ - init_logger() - - return gql_client.read_organizations(output_dir) diff --git a/d3b_api_client_cli/cli/dewrangle/setup_commands.py b/d3b_api_client_cli/cli/dewrangle/setup_commands.py new file mode 100644 index 0000000..5c24f9a --- /dev/null +++ b/d3b_api_client_cli/cli/dewrangle/setup_commands.py @@ -0,0 +1,142 @@ +""" +Commands related to initializing Dewrangle with data +""" + +import logging +import click + + +from d3b_api_client_cli.config import ( + DEWRANGLE_FHIR_SERVERS_FILEPATH, +) +from d3b_api_client_cli.utils import read_json +from d3b_api_client_cli.dewrangle import setup + +logger = logging.getLogger(__name__) + + +@click.command( + help="Upsert Kids First study in Dewrangle. Attach FHIR servers to study" +) +@click.option( + "--dewrangle-fhir-servers-file", + default=DEWRANGLE_FHIR_SERVERS_FILEPATH, + type=click.Path(exists=True, file_okay=True, dir_okay=False), + help="The path to file containing Dewrangle FHIR server configuration", +) +@click.argument("kf_study_id") +@click.argument("dewrangle_org_name") +def setup_dewrangle_study( + kf_study_id, dewrangle_org_name, dewrangle_fhir_servers_file +): + """ + Upsert study in Dewrangle. Attach FHIR servers to study + + \b + Arguments: + \b + kf_study_id - ID of the study in Dataservice + dewrangle_org_name - Name of the Dewrangle organization to add study to + """ + from d3b_api_client_cli.config import log + + log.init_logger() + + logger.info("✨ Setting up study in Dewrangle") + fhir_servers = read_json(dewrangle_fhir_servers_file) + + return setup.setup_dewrangle_study( + kf_study_id, + fhir_servers, + dewrangle_org_name=dewrangle_org_name, + ) + + +@click.command( + help="Upsert organization in Dewrangle. Upsert all studies in " + "Dataservice into org. 
Attach FHIR servers to each study" +) +@click.option( + "--organization-file", + type=click.Path(exists=True, file_okay=True, dir_okay=False), + help="The path to file containing the Dewrangle organization payload", +) +@click.option( + "--with-studies", + is_flag=True, + help="Fetch all studies from Dataservice, upsert in Dewrangle, and " + "attach the FHIR servers to those studies", +) +@click.option( + "--dewrangle-fhir-servers-file", + default=DEWRANGLE_FHIR_SERVERS_FILEPATH, + type=click.Path(exists=True, file_okay=True, dir_okay=False), + help="The path to file containing Dewrangle FHIR server configuration", +) +def setup_dewrangle_org( + with_studies, dewrangle_fhir_servers_file, organization_file +): + """ + Upsert organization in Dewrangle. Upsert all studies in + Dataservice into org. Attach FHIR servers to each study + """ + from d3b_api_client_cli.config import log + + log.init_logger() + + logger.info("✨ Setting up organization in Dewrangle") + + org = None + if organization_file: + org = read_json(organization_file) + + return setup.setup_dewrangle_org( + organization_payload=org, + with_studies=with_studies, + fhir_servers_filepath=dewrangle_fhir_servers_file, + ) + + +@click.command( + help="Setup all Dataservice studies within Dewrangle organization." 
+ " Upsert all studies in Dataservice into org then attach the org's" + " FHIR servers to each study" +) +@click.option( + "--dataservice-study-ids-file", + type=click.Path(exists=True, file_okay=True, dir_okay=False), + help="The path to JSON file containing a list of study KF IDs", +) +@click.option( + "--dewrangle-fhir-servers-file", + default=DEWRANGLE_FHIR_SERVERS_FILEPATH, + type=click.Path(exists=True, file_okay=True, dir_okay=False), + help="The path to file containing Dewrangle FHIR server configuration", +) +@click.argument("dewrangle_org_name") +def setup_all_studies( + dewrangle_org_name, dewrangle_fhir_servers_file, dataservice_study_ids_file +): + """ + Setup all Dataservice studies within Dewrangle organization. + Upsert all studies in Dataservice into org then attach the org's + FHIR servers to each study + """ + from d3b_api_client_cli.config import log + + log.init_logger() + + logger.info( + "✨ Setting up all Dataservice studies in Dewrangle organization:" + f" {dewrangle_org_name}" + ) + + study_ids = None + if dataservice_study_ids_file: + study_ids = read_json(dataservice_study_ids_file) + + fhir_servers = read_json(dewrangle_fhir_servers_file) + + return setup.setup_all_studies( + fhir_servers, study_ids=study_ids, dewrangle_org_name=dewrangle_org_name + ) diff --git a/d3b_api_client_cli/cli/dewrangle/study_commands.py b/d3b_api_client_cli/cli/dewrangle/study_commands.py deleted file mode 100644 index 0c66e23..0000000 --- a/d3b_api_client_cli/cli/dewrangle/study_commands.py +++ /dev/null @@ -1,105 +0,0 @@ -""" -Dewrangle study commands -""" - -import logging - -import click - -from d3b_api_client_cli.config import config -from d3b_api_client_cli.config.log import init_logger -from d3b_api_client_cli.utils import read_json -from d3b_api_client_cli.dewrangle import graphql as gql_client - -logger = logging.getLogger(__name__) -DEWRANGLE_DIR = config["dewrangle"]["output_dir"] - - -@click.command() -@click.option( - "--study-id", - help="Either 
the KF ID or Dewrangle global ID of the study", -) -@click.argument( - "filepath", - type=click.Path(exists=True, file_okay=True, dir_okay=False), -) -@click.argument( - "organization_id", -) -def upsert_study(filepath, organization_id, study_id): - """ - Upsert study in Dewrangle - - \b - Arguments: - \b - filepath - Path to file defining Dewrangle study - organization_id - ID of the Dewrangle org this study will belong to - """ - init_logger() - - if not study_id: - data = read_json(filepath) - kf_id = data.pop("kf_id", None) - else: - kf_id = study_id - - return gql_client.upsert_study(data, organization_id, study_id=kf_id) - - -@click.command() -@click.argument( - "study_id", -) -@click.option( - "--disable-delete-safety-check", - is_flag=True, - help="This will allow deleting of the organization", -) -def delete_study(study_id, disable_delete_safety_check): - """ - Delete study in Dewrangle by KF ID or global ID - - \b - Arguments: - \b - node_id - Either a Kids First ID or Dewrangle global ID - """ - init_logger() - - return gql_client.delete_study(study_id, not disable_delete_safety_check) - - -@click.command() -@click.option( - "--output-dir", - default=DEWRANGLE_DIR, - type=click.Path(exists=False, file_okay=False, dir_okay=True), - help="The path to the data dir where studies will be written", -) -def read_studies(output_dir): - """ - Fetch studies from Dewrangle - """ - init_logger() - - return gql_client.read_studies(output_dir) - - -@click.command() -@click.argument( - "node_id", -) -def get_study(node_id): - """ - Get study in Dewrangle by Dewrangle GraphQL node ID - - \b - Arguments: - \b - node_id - ID of the study in Dewrangle - """ - init_logger() - - return gql_client.read_study(node_id) diff --git a/d3b_api_client_cli/cli/dewrangle/volume_commands.py b/d3b_api_client_cli/cli/dewrangle/volume_commands.py deleted file mode 100644 index e1d6162..0000000 --- a/d3b_api_client_cli/cli/dewrangle/volume_commands.py +++ /dev/null @@ -1,264 +0,0 @@ -""" 
-Dewrangle volume commands -""" - -import logging - -import click - -from d3b_api_client_cli.config import config -from d3b_api_client_cli.config.log import init_logger -from d3b_api_client_cli.utils import read_json -from d3b_api_client_cli.dewrangle import graphql as gql_client - -logger = logging.getLogger(__name__) -DEWRANGLE_DIR = config["dewrangle"]["output_dir"] -AWS_DEFAULT_REGION = config["aws"]["region"] - - -@click.command() -@click.option( - "--bucket", - help="Name of the S3 bucket this Volume points to", -) -@click.option( - "--path-prefix", - help="Path in the S3 bucket this Volume points to", -) -@click.option( - "--region", - default=AWS_DEFAULT_REGION, - help="AWS region the S3 bucket of the Volume is in", -) -@click.option( - "--credential-key", - required=True, - help="Credential key, if supplied, must be used with study-global-id" - " to look up existing volume", -) -@click.option( - "--study-id", - help="Graphql node id of the Study this volume belongs to", -) -@click.option( - "--study-global-id", - help="Global ID of the study this volume belongs to. If supplied, must be" - "used with --credential-key to look up existing volume", -) -@click.option( - "--filepath", - type=click.Path(exists=True, file_okay=True, dir_okay=False), -) -def upsert_volume( - bucket, - path_prefix, - region, - credential_key, - study_id, - study_global_id, - filepath, -): - """ - Upsert volume in Dewrangle. Either provide values in a JSON file or - provide values via CLI options. 
If both are provided, CLI options - take precedence and will overwrite the values from the file - - You can either provide the volume's bucket, path_prefix, and study global - ID or volume graphql node ID to lookup the volume for an update - """ - init_logger() - - cli_opts = { - "name": bucket, - "pathPrefix": path_prefix, - "region": region, - } - input_data = {} - if filepath: - input_data.update(read_json(filepath)) - for k, value in cli_opts.items(): - if value: - input_data[k] = value - else: - input_data = cli_opts - - return gql_client.upsert_volume( - input_data, - study_id=study_id, - study_global_id=study_global_id, - credential_key=credential_key, - ) - - -@click.command() -@click.option( - "--node-id", - help="Credential graphql node ID", -) -@click.option( - "--bucket", - help="Name of the S3 bucket this Volume points to", -) -@click.option( - "--path-prefix", - help="Path in the S3 bucket this Volume points to", -) -@click.option( - "--study-global-id", - help="Global ID of the study this volume belongs to", -) -@click.option( - "--disable-delete-safety-check", - is_flag=True, - help="This will allow deleting of the organization", -) -def delete_volume( - node_id, bucket, path_prefix, study_global_id, disable_delete_safety_check -): - """ - Delete volume - - You can either provide the volume's bucket, path_prefix, and study global - ID or volume graphql node ID to lookup the volume - """ - init_logger() - - return gql_client.delete_volume( - node_id=node_id, - bucket=bucket, - path_prefix=path_prefix, - study_global_id=study_global_id, - delete_safety_check=not disable_delete_safety_check, - ) - - -@click.command() -@click.option( - "--output-dir", - default=DEWRANGLE_DIR, - type=click.Path(exists=False, file_okay=False, dir_okay=True), - help="The path to the data dir where volumes will be written", -) -@click.option( - "--study-global-id", - help="Global ID of the study to filter volumes by", -) -def read_volumes(output_dir, study_global_id): - """ 
- Fetch volumes from Dewrangle - """ - init_logger() - - return gql_client.read_volumes( - study_global_id, - output_dir, - ) - - -@click.command() -@click.argument( - "node_id", -) -def get_volume(node_id): - """ - Get volume in Dewrangle by Dewrangle GraphQL node ID - - \b - Arguments: - \b - node_id - ID of the volume in Dewrangle - """ - init_logger() - - return gql_client.read_volume(node_id) - - -@click.command() -@click.option( - "--volume-id", - help="Volume graphql node ID", -) -@click.option( - "--billing-group-id", - help="Graphql ID of the biling group in Dewrangle", -) -@click.option( - "--bucket", - help="Name of the S3 bucket this Volume points to", -) -@click.option( - "--path-prefix", - help="Path in the S3 bucket this Volume points to", -) -@click.option( - "--study-global-id", - help="Global ID of the study this volume belongs to", -) -def list_and_hash_volume( - volume_id, - billing_group_id, - bucket, - path_prefix, - study_global_id, -): - """ - Trigger a list and hash volume job in Dewrangle - - You can either provide the volume's bucket, path_prefix, and study global - ID or volume graphql node ID to lookup the volume - """ - init_logger() - - return gql_client.list_and_hash( - volume_id=volume_id, - billing_group_id=billing_group_id, - bucket=bucket, - path_prefix=path_prefix, - study_global_id=study_global_id, - ) - - -@click.command() -@click.option( - "--volume-id", - help="Volume graphql node ID", -) -@click.option( - "--billing-group-id", - help="Graphql ID of the biling group in Dewrangle", -) -@click.option( - "--bucket", - help="Name of the S3 bucket this Volume points to", -) -@click.option( - "--path-prefix", - help="Path in the S3 bucket this Volume points to", -) -@click.option( - "--study-global-id", - help="Global ID of the study this volume belongs to", -) -def hash_volume_and_wait( - volume_id, - billing_group_id, - bucket, - path_prefix, - study_global_id, -): - """ - Trigger a list and hash volume job and poll for job 
status until the - job is complete or fails - - You can either provide the volume's bucket, path_prefix, and study global - ID or volume graphql node ID to lookup the volume - """ - init_logger() - - return gql_client.hash_and_wait( - volume_id=volume_id, - billing_group_id=billing_group_id, - bucket=bucket, - path_prefix=path_prefix, - study_global_id=study_global_id, - ) diff --git a/d3b_api_client_cli/dewrangle/__init__.py b/d3b_api_client_cli/cli/fhir/__init__.py similarity index 100% rename from d3b_api_client_cli/dewrangle/__init__.py rename to d3b_api_client_cli/cli/fhir/__init__.py diff --git a/d3b_api_client_cli/cli/fhir/commands.py b/d3b_api_client_cli/cli/fhir/commands.py new file mode 100644 index 0000000..d4cd988 --- /dev/null +++ b/d3b_api_client_cli/cli/fhir/commands.py @@ -0,0 +1,342 @@ +""" +All CLI commands related to FHIR operations +""" + +import logging +from pprint import pformat +import click + +from d3b_api_client_cli.config import ( + config, + valid_kids_first_fhir_types, + valid_fhir_types, +) +from d3b_api_client_cli import utils +from d3b_api_client_cli.fhir import builder, loader, counts +from d3b_api_client_cli.fhir import delete as trasher + +logger = logging.getLogger(__name__) + + +@click.command(help="Fetch total counts of KF FHIR types in FHIR server") +@click.argument( + "kf_study_id", +) +def total_counts(kf_study_id): + """ + Fetch total counts of KF FHIR types in FHIR server. Types are listed in + d3b_api_client_cli.config.KidsFirstFhirEntity + + \b + Arguments: + \b + kf_study_id - Kids First ID of study in Dataservice + """ + from d3b_api_client_cli.config import log + + log.init_logger() + + return counts.get_counts(kf_study_id) + + +@click.command(help="Transform Dataservice tables into FHIR resource tables") +@click.option( + "--entity-types", + help="Comma delimited list of the Kids First FHIR entity types to build." 
+ f" Must be one or more of: {pformat(valid_kids_first_fhir_types)}", +) +@click.option( + "--dest-dir", + type=click.Path(exists=False, file_okay=False, dir_okay=True), + help="The dir path to where FHIR JSON will be written by" " build process", +) +@click.argument( + "source_dir", +) +def build(source_dir, dest_dir, entity_types): + """ + Transform Dataservice tables into FHIR JSON + + \b + Arguments: + \b + source_dir - Directory of tables to merge + """ + from d3b_api_client_cli.config import log + + log.init_logger() + + if entity_types: + entity_types = [ + et.strip() for et in utils.multisplit(entity_types, [","]) + ] + if not set(entity_types) <= valid_kids_first_fhir_types: + raise click.BadParameter( + "Bad --entity-types value. Must be comma delimited list " + "where each item is one of: " + f"{pformat(valid_kids_first_fhir_types)}" + ) + suffix = "all" if not entity_types else entity_types + logger.info( + f"🏭 Building FHIR JSON from Dataservice tables for {suffix} entities" + ) + builder.build_entities( + source_dir, + dest_dir=dest_dir, + kf_fhir_entity_types=entity_types, + ) + + +@click.command(name="load", help="Load FHIR JSON data into FHIR service") +@click.option( + "--fhir-url", + default=config["fhir"]["base_url"], + help="The base url of the FHIR service", +) +@click.option( + "--ignore-load-errors", + default=False, + help="Whether to let the FHIR loader keep going even if it encounters " + "load errors", +) +@click.argument( + "source_dir", + type=click.Path(exists=True, file_okay=False, dir_okay=True), +) +def load_fhir(source_dir, fhir_url, ignore_load_errors): + """ + Load data into FHIR service + + \b + Arguments: + \b + source_dir - Directory of FHIR JSON files + """ + from d3b_api_client_cli.config import log + + log.init_logger() + + logger.info("🏭 Start loading data into FHIR service ...") + loader.load_data( + fhir_url, data_dir=source_dir, ignore_load_errors=ignore_load_errors + ) + + +@click.command(help="Delete FHIR resources 
in FHIR service") +@click.option( + "--disable-safety-check", + is_flag=True, + help="The base url of the FHIR service", +) +@click.option( + "--entity-types", + help="Comma delimited list of the Kids First FHIR entity types to delete." + f" Must be one or more of: {pformat(valid_kids_first_fhir_types)}", +) +@click.option( + "--fhir-url", + default=config["fhir"]["base_url"], + help="The base url of the FHIR service", +) +@click.argument( + "source_dir", + type=click.Path(exists=True, file_okay=False, dir_okay=True), +) +def delete_from_file(source_dir, fhir_url, entity_types, disable_safety_check): + """ + Delete the given FHIR resources in the FHIR service + + \b + Arguments: + \b + source_dir - Directory FHIR JSON files + """ + from d3b_api_client_cli.config import log + + log.init_logger() + + if entity_types: + entity_types = [ + et.strip() for et in utils.multisplit(entity_types, [","]) + ] + if not set(entity_types) <= valid_kids_first_fhir_types: + raise click.BadParameter( + "Bad --entity-types value. Must be comma delimited list " + "where each item is one of: " + f"{pformat(valid_kids_first_fhir_types)}" + ) + + safety_check = not disable_safety_check + logger.info( + f"🚮 Start deleting data in FHIR service. SAFETY_CHECK: {safety_check}" + ) + if safety_check: + logger.info( + "When safety check is enabled, only resources in localhost" + " can be deleted" + ) + trasher.delete_from_file( + fhir_url, + source_dir, + entity_types=entity_types, + safety_check=safety_check, + ) + + +def _delete_all( + study_id, + fhir_url, + entity_types, + output_dir, + disable_safety_check, + legacy_server, +): + """ + Helper for delete-all and delete-study cmds + """ + from d3b_api_client_cli.config import log + + log.init_logger() + + if entity_types: + entity_types = [ + et.strip() for et in utils.multisplit(entity_types, [","]) + ] + if not set(entity_types) <= valid_fhir_types: + raise click.BadParameter( + "Bad --entity-types value. 
Must be comma delimited list " + "where each item is one of: " + f"{pformat(valid_fhir_types)}" + ) + else: + entity_types = valid_fhir_types + + safety_check = not disable_safety_check + logger.info( + f"🚮 Start deleting data in FHIR service: {fhir_url} " + f"SAFETY_CHECK: {safety_check}" + ) + if safety_check: + logger.info( + "When safety check is enabled, only resources in localhost" + " can be deleted" + ) + + if legacy_server: + use_kf_entity_tags = False + else: + use_kf_entity_tags = True + + trasher.delete_all( + fhir_url, + entity_types=entity_types, + output_dir=output_dir, + study_id=study_id, + safety_check=safety_check, + use_kf_entity_tags=use_kf_entity_tags, + ) + + +@click.command(help="Delete all FHIR resources in FHIR service by entity_type") +@click.option( + "--legacy-server", + is_flag=True, + help="If the FHIR service is a legacy server (without OAuth2)", +) +@click.option( + "--disable-safety-check", + is_flag=True, + help="The base url of the FHIR service", +) +@click.option( + "--output-dir", + type=click.Path(exists=False, file_okay=False, dir_okay=True), + help="The path to dir where delete results will be written", +) +@click.option( + "--study-id", + help="The KF ID of the study to delete data for", +) +@click.option( + "--entity-types", + help="Comma delimited list of the FHIR resource types to delete." + f" Must be one or more of: {pformat(valid_kids_first_fhir_types)}", +) +@click.option( + "--fhir-url", + default=config["fhir"]["base_url"], + help="The base url of the FHIR service", +) +def delete_all( + fhir_url, + entity_types, + study_id, + output_dir, + disable_safety_check, + legacy_server, +): + """ + Delete FHIR resources by entity_type in the FHIR service + """ + _delete_all( + study_id, + fhir_url, + entity_types, + output_dir, + disable_safety_check, + legacy_server, + ) + + +@click.command( + help="Delete all FHIR resources in FHIR service by study." 
+ " Different from delete-all since it requires a study" +) +@click.option( + "--legacy-server", + is_flag=True, + help="If the FHIR service is a legacy server (without OAuth2)", +) +@click.option( + "--disable-safety-check", + is_flag=True, + help="The base url of the FHIR service", +) +@click.option( + "--output-dir", + type=click.Path(exists=False, file_okay=False, dir_okay=True), + help="The path to dir where delete results will be written", +) +@click.option( + "--entity-types", + help="Comma delimited list of the FHIR resource types to delete." + f" Must be one or more of: {pformat(valid_kids_first_fhir_types)}", +) +@click.option( + "--fhir-url", + default=config["fhir"]["base_url"], + help="The base url of the FHIR service", +) +@click.argument( + "kf_study_id", +) +def delete_fhir_study( + kf_study_id, + fhir_url, + entity_types, + output_dir, + disable_safety_check, + legacy_server, +): + """ + Delete FHIR resources by entity_type in the FHIR service + """ + + _delete_all( + kf_study_id, + fhir_url, + entity_types, + output_dir, + disable_safety_check, + legacy_server, + ) diff --git a/d3b_api_client_cli/config/__init__.py b/d3b_api_client_cli/config/__init__.py index 0c95cbd..88875d3 100644 --- a/d3b_api_client_cli/config/__init__.py +++ b/d3b_api_client_cli/config/__init__.py @@ -4,6 +4,7 @@ import os from dataclasses import dataclass +from enum import Enum from dotenv import find_dotenv, load_dotenv @@ -11,22 +12,62 @@ ROOT_DIR = os.path.dirname(os.path.dirname(os.path.dirname((__file__)))) ROOT_DATA_DIR = os.path.join(ROOT_DIR, "data") ROOT_FAKE_DATA_DIR = os.path.join(ROOT_DATA_DIR, "fake_data") +DATA_DIR = os.path.join(ROOT_DATA_DIR, "generated") +EXPORT_DIR = os.path.join(ROOT_DATA_DIR, "exported") +TRANSFORM_DIR = os.path.join(ROOT_DATA_DIR, "transformed") +DELETE_DIR = os.path.join(ROOT_DATA_DIR, "deleted") +LOAD_DIR = os.path.join(ROOT_DATA_DIR, "loaded") +STATS_DIR = os.path.join(ROOT_DATA_DIR, "stats") +DEWRANGLE_DIR = 
os.path.join(ROOT_DATA_DIR, "dewrangle") +FHIR_JSON_DIR = os.path.join(ROOT_DATA_DIR, "fhir") LOG_DIR = os.path.join(ROOT_DATA_DIR, "logs") +SAMPLE_ETL_DIR = os.path.join(ROOT_DATA_DIR, "sample_etl") +SAMPLE_ETL_MANIFEST = "sample_etl_manifest.json" + +# DB +DB_HOST = os.environ.get("DB_HOST") +DB_PORT = os.environ.get("DB_PORT") +DB_NAME = os.environ.get("DB_NAME") +DB_USER = os.environ.get("DB_USER") +DB_USER_PW = os.environ.get("DB_USER_PW") DOTENV_PATH = find_dotenv() if DOTENV_PATH: load_dotenv(DOTENV_PATH) +# Unit test environment variable +EXPORT_DIR_ENV_VAR = "DWDS_PYEXPORT_DIR_ENV_VAR" + +# Dataservice +DEV_STUDY_ID = "SD_ME0WME0W" +TEST_STUDY_ID = "SD_11111111" +DATASERVICE_DB_NAME = os.environ.get("DATASERVICE_DB_NAME") +DATASERVICE_DB_HOST = os.environ.get("DATASERVICE_DB_HOST") +DATASERVICE_DB_PORT = os.environ.get("DATASERVICE_DB_PORT") +DATASERVICE_DB_ADMIN_USER = os.environ.get("POSTGRES_ADMIN_USER") +DATASERVICE_DB_ADMIN_PW = os.environ.get("POSTGRES_ADMIN_PW") + # Dewrangle DEWRANGLE_DEV_PAT = os.environ.get("DEWRANGLE_DEV_PAT") -DEWRANGLE_BASE_URL = os.environ.get("DEWRANGLE_BASE_URL") +DEWRANGLE_BASE_URL = os.environ.get("DEWRANGLE_BASE_URL") or ( + "http://localhost:3000" +) +DEWRANGLE_FHIR_SERVERS_FILEPATH = os.path.join(ROOT_DIR, ".fhir_servers.json") +DEWRANGLE_MAX_PAGE_SIZE = 10 -# DB -DB_HOST = os.environ.get("DB_HOST") -DB_PORT = os.environ.get("DB_PORT") -DB_NAME = os.environ.get("DB_NAME") -DB_USER = os.environ.get("DB_USER") -DB_USER_PW = os.environ.get("DB_USER_PW") +# KF Gen3 Service where file metadata is stored: Indexd +INDEXD_BASE_URL = os.environ.get("INDEXD_BASE_URL") or ( + "https://data.kidsfirstdrc.org" +) +# NCI Gen3 Service where file metadata is stored: DCF +DCF_BASE_URL = os.environ.get("DCF_BASE_URL") or ( + "https://nci-crdc.datacommons.io" +) +INDEXD_ENDPOINT = os.environ.get("INDEXD_ENDPOINT") or "index/index" + +# FHIR +FHIR_BASE_URL = os.environ.get("FHIR_BASE_URL") +KF_FHIR_QA_OIDC_CLIENT_SECRET = 
os.environ.get("KF_FHIR_QA_OIDC_CLIENT_SECRET") @dataclass @@ -52,7 +93,98 @@ class SECRETS: """ DEWRANGLE_DEV_PAT = "DEWRANGLE_DEV_PAT" - DB_USER_PW = "DB_USER_PW" + CAVATICA_DEVELOPER_TOKEN = "CAVATICA_DEVELOPER_TOKEN" + KF_FHIR_QA_OIDC_CLIENT_SECRET = "KF_FHIR_QA_OIDC_CLIENT_SECRET" + DATASERVICE_DB_ADMIN_PW = "DATASERVICE_DB_ADMIN_PW" + DATASERVICE_DB_ADMIN_PW = "DATASERVICE_DB_ADMIN_PW" + POSTGRES_ADMIN_PW = "POSTGRES_ADMIN_PW" + D3B_WAREHOUSE_DB_USER_PW = "D3B_WAREHOUSE_DB_USER_PW" + D3B_WAREHOUSE_DB_ADMIN_PW = "D3B_WAREHOUSE_DB_ADMIN_PW" + FHIR_APP_ADMIN_PW = "FHIR_APP_ADMIN_PW" + AWS_ACCESS_KEY_ID = "AWS_ACCESS_KEY_ID" + AWS_ACCESS_KEY_SECRET = "AWS_ACCESS_KEY_SECRET" + POSTGRES_ADMIN_USER = "POSTGRES_ADMIN_USER" + POSTGRES_ADMIN_PW = "POSTGRES_ADMIN_PW" + + +class IdTypes(Enum): + """ + Used in CLI option definitions + """ + + KIDS_FIRST = "kids_first" + DEWRANGLE = "dewrangle" + + +class KidsFirstFhirEntity(Enum): + """ + Names of Kids First clinical entities in FHIR. These map to FHIR + resource types but more than one entity can map to the same FHIR resource + type + """ + + # NOTE: Do not change order. These are in the order in which they must + # be loaded into the server + organization = "organization" + practitioner = "practitioner" + practitioner_role = "practitioner_role" + patient = "patient" + research_study = "research_study" + research_subject = "research_subject" + proband_status = "proband_status" + family = "family" + family_relationship = "family_relationship" + sequencing_center = "sequencing_center" + phenotype = "phenotype" + disease = "disease" + vital_status = "vital_status" + parent_specimen = "parent_specimen" + child_specimen = "child_specimen" + histopathology = "histopathology" + drs_document_reference = "drs_document_reference" + drs_document_reference_index = "drs_document_reference_index" + + +class ImagingFhirEntity(Enum): + """ + Names of imaging data entities in FHIR. 
These map to FHIR + resource types but more than one entity can map to the same FHIR resource + type + """ + + # NOTE: Do not change order. These are in the order in which they must + # be loaded into the server + imaging_device = "imaging_device" + imaging_document_reference = "imaging_document_reference" + imaging_study = "imaging_study" + + +IMAGING_ENTITY_TYPES = { + "Patient": "imaging_patient", + "Device": "imaging_device", + "DocumentReference": "imaging_document_reference", + "ImagingStudy": "imaging_study", +} +IMAGING_RESOURCE_TYPES = ["Device", "DocumentReference", "ImagingStudy"] + +valid_kids_first_fhir_types = {et.value for et in KidsFirstFhirEntity} +valid_fhir_types = set( + [et.value for et in KidsFirstFhirEntity] + + [et.value for et in ImagingFhirEntity] +) + +SKIP_ENTITIES = {"practitioner", "practitioner_role", "organization"} + +ETL_STAGES = "etbl" + + +class TargetAPI(Enum): + """ + Enum used in CLI option definitions + """ + + DEWRANGLE = "dewrangle" + FHIR = "fhir" def check_dewrangle_http_config(): @@ -69,33 +201,210 @@ def check_dewrangle_http_config(): config = { "logging": { - "default_log_filename": "d3b_api_client_cli", + "default_log_filename": "dwds_fhir_etl", "default_log_level": "info", "default_log_dir": LOG_DIR, }, + "fhir": { + "base_url": FHIR_BASE_URL or "http://localhost:8000", + "username": os.environ.get("FHIR_APP_ADMIN") or "admin", + "password": os.environ.get("FHIR_APP_ADMIN_PW") or "password", + "resource_types": { + "Group": "gr", + "Observation": "ob", + "DocumentReference": "dr", + "Specimen": "bs", + "Condition": "cn", + "ResearchSubject": "rs", + "Patient": "pt", + "PractitionerRole": "pr", + "Organization": "or", + "Practitioner": "pc", + "ResearchStudy": "sd", + }, + "mapping": { + "organization": {"endpoint": "/Organization", "params": {}}, + "research_study": {"endpoint": "/ResearchStudy", "params": {}}, + "practitioner": {"endpoint": "/Practitioner", "params": {}}, + "practitioner_role": { + "endpoint": 
"/PractitionerRole", + "params": {}, + }, + "patient": {"endpoint": "/Patient", "params": {}}, + "research_subject": {"endpoint": "/ResearchSubject", "params": {}}, + "proband_status": { + "endpoint": "/Observation", + "params": {"code": "85900004"}, + }, + "family": {"endpoint": "/Group", "params": {"code": "FAMMEMB"}}, + "family_relationship": { + "endpoint": "/Observation", + "params": {"code": "FAMMEMB"}, + }, + "sequencing_center": {"endpoint": "/Organization", "params": {}}, + "phenotype": { + "endpoint": "/Condition", + "params": { + "_profile:below": "https://ncpi-fhir.github.io/ncpi-fhir-ig/StructureDefinition/phenotype", + }, + }, + "disease": { + "endpoint": "/Condition", + "params": { + "_profile:below": "https://ncpi-fhir.github.io/ncpi-fhir-ig/StructureDefinition/disease", + }, + }, + "vital_status": { + "endpoint": "/Observation", + "params": {"code": "263493007"}, + }, + "child_specimen": {"endpoint": "/Specimen", "params": {}}, + "parent_specimen": {"endpoint": "/Specimen", "params": {}}, + "histopathology": { + "endpoint": "/Observation", + "params": {"code": "250537006"}, + }, + "drs_document_reference": { + "endpoint": "/DocumentReference", + "params": {}, + }, + "drs_document_reference_index": { + "endpoint": "/DocumentReference", + "params": {}, + }, + "imaging_device": { + "endpoint": "/Device", + "params": {}, + }, + "imaging_document_reference": { + "endpoint": "/DocumentReference", + "params": {}, + }, + "imaging_study": { + "endpoint": "/ImagingStudy", + "params": {}, + }, + }, + }, "dewrangle": { "base_url": DEWRANGLE_BASE_URL, - "pagination": {"max_page_size": 10}, - "client": {"execution_timeout": 30}, # seconds + "pagination": {"max_page_size": DEWRANGLE_MAX_PAGE_SIZE}, "endpoints": { "graphql": "/api/graphql", "rest": { "study_file": "api/rest/studies/{dewrangle_study_id}/files/{filename}", "global_id": "api/rest/studies/{dewrangle_study_id}/global-descriptors", - "job_errors": "/api/rest/jobs/{job_id}/errors", + "job_errors": 
"api/rest/jobs/{job_id}/errors", }, }, - "output_dir": os.path.join(ROOT_DATA_DIR, "dewrangle"), - "credential_type": "AWS", - "billing_group_id": os.environ.get("CAVATICA_BILLING_GROUP_ID"), + "ingest": [e.value for e in KidsFirstFhirEntity], }, - "faker": {"global_id": {"fhir_resource_types": FHIR_RESOURCE_TYPES}}, - "aws": { - "region": os.environ.get("AWS_DEFAULT_REGION") or "us-east-1", - "s3": { - "aws_access_key_id": os.environ.get("AWS_ACCESS_KEY_ID"), - "aws_secret_access_key": os.environ.get("AWS_SECRET_ACCESS_KEY"), - "test_bucket_name": os.environ.get("AWS_BUCKET_DATA_TRANSFER_TEST"), + "dataservice": { + "api_url": os.environ.get("DATASERVICE_BASE_URL") + or "http://localhost:5000", + "prefix": { + "SequencingCenter": "SC", + "Investigator": "IG", + "Study": "SD", + "Family": "FM", + "FamilyRelationship": "FR", + "Participant": "PT", + "Phenotype": "PH", + "Diagnosis": "DG", + "Outcome": "OC", + "Biospecimen": "BS", + "Sample": "SA", + "SampleRelationship": "SR", + "GenomicFile": "GF", + "SequencingExperiment": "SE", + "SequencingExperimentGenomicFile": "SG", + "BiospecimenGenomicFile": "BG", + "BiospecimenDiagnosis": "BD", }, + "seeder": { + "SequencingCenter": {"total": 1}, + "Investigator": {"total": 1}, + "Study": { + "foreign_keys": ["Investigator"], + "total": 2, # total study entities + }, + "Family": {"total": 3}, + "Participant": { + "foreign_keys": [ + "Family", + "Study", + ], + "total": 10, + }, + # Family relationship will be manually + # created. Therefore no need to specify fk + "FamilyRelationship": {"total": 3}, + "Phenotype": {"foreign_keys": ["Participant"], "total": 20}, + "Diagnosis": {"foreign_keys": ["Participant"], "total": 20}, + "Outcome": {"foreign_keys": ["Participant"], "total": 20}, + "Sample": { + "foreign_keys": ["Participant"], + "total": 40, + }, + # Sample relationship will be manually + # created. 
Therefore no need to specify fk + "SampleRelationship": {"total": 20}, + "Biospecimen": { + "foreign_keys": ["SequencingCenter", "Participant", "Sample"], + "total": 40, + }, + "GenomicFile": {"total": 40}, + "BiospecimenGenomicFile": { + "foreign_keys": [ + "Biospecimen", + "GenomicFile", + ], + "total": 40, + }, + "BiospecimenDiagnosis": { + "foreign_keys": [ + "Biospecimen", + "Diagnosis", + ], + "total": 40, + }, + "SequencingExperiment": { + "foreign_keys": [ + "SequencingCenter", + ], + "total": 40, + }, + "SequencingExperimentGenomicFile": { + "foreign_keys": [ + "SequencingExperiment", + "GenomicFile", + ], + "total": 40, + }, + }, + # Must be in the order that satisfies foreign key relationships + "endpoints": { + "SequencingCenter": "/sequencing-centers", + "Investigator": "/investigators", + "Study": "/studies", + "Family": "/families", + "Participant": "/participants", + "FamilyRelationship": "/family-relationships", + "Sample": "/samples", + "Biospecimen": "/biospecimens", + "SampleRelationship": "/sample-relationships", + "Diagnosis": "/diagnoses", + "Phenotype": "/phenotypes", + "Outcome": "/outcomes", + "GenomicFile": "/genomic-files", + "BiospecimenGenomicFile": "/biospecimen-genomic-files", + "BiospecimenDiagnosis": "/biospecimen-diagnoses", + "ReadGroup": "/read-groups", + "SequencingExperiment": "/sequencing-experiments", + "ReadGroupGenomicFile": "/read-group-genomic-files", + "SequencingExperimentGenomicFile": "/sequencing-experiment-genomic-files", + }, + "cached_schema_filepath": os.path.join(ROOT_DIR, "cached_schema.json"), }, + "faker": {"global_id": {"fhir_resource_types": FHIR_RESOURCE_TYPES}}, } diff --git a/d3b_api_client_cli/config/concept_schema.py b/d3b_api_client_cli/config/concept_schema.py new file mode 100644 index 0000000..3f048fa --- /dev/null +++ b/d3b_api_client_cli/config/concept_schema.py @@ -0,0 +1,424 @@ +""" +Classes that define the standard set of columns for an intermediate data model +used by the transform stage of 
ETLs. + +These columns are namespaced by entity type and create completely unambiguous +column names. For example all Dataservice tables have the kf_id column to +denote the primary key. However, when two tables are merged there will be +a collision on this column. + +Thus we map the source columns to these namespaced columns to eliminate +ambiguity. For example kf_id in the genomic file table becomes +GENOMIC_FILE.TARGET_SERVICE_ID +""" + +import inspect + +DELIMITER = "|" +UNIQUE_ID_ATTR = "UNIQUE_KEY" + + +def obj_attrs_to_dict(cls): + """ + Create a dict of obj attributes and values, including inherited attrs + """ + # Get non function attributes + attributes = inspect.getmembers(cls, lambda x: not (inspect.isroutine(x))) + + # Get non-hidden attrs + attributes = [ + a + for a in attributes + if not (a[0].startswith("__") and a[0].endswith("__")) + ] + return dict(attributes) + + +class QuantityMixin: + VALUE = None + UNITS = None + + +class FileMixin: + SIZE = None + FILE_NAME = None + HASH_DICT = None + URL_LIST = None + ACL = None + AUTHZ = None + AVAILABILITY = None + CONTROLLED_ACCESS = None + FILE_FORMAT = None + DATA_TYPE = None + DATA_CATEGORY = None + ACCESS_URL = None + FORMAT = None + + +class PropertyMixin: + _CONCEPT_NAME = None + UNIQUE_KEY = None + ID = None + TARGET_SERVICE_ID = None + HIDDEN = None # obverse of VISIBLE + VISIBLE = None # obverse of HIDDEN + + +class CONCEPT: + class PROJECT: + ID = None + + class INVESTIGATOR(PropertyMixin): + NAME = None + INSTITUTION = None + + class STUDY(PropertyMixin): + AUTHORITY = None + DOMAIN = None + SHORT_CODE = None + PROGRAM = None + VERSION = None + NAME = None + SHORT_NAME = None + ATTRIBUTION = None + RELEASE_STATUS = None + CATEGORY = None + BIOBANK_EMAIL = None + BIOBANK_NAME = None + BIOBANK_REQUEST_LINK = None + BIOBANK_REQUEST_INSTRUCTIONS = None + + class STUDY_FILE(PropertyMixin, FileMixin): + pass + + class FAMILY(PropertyMixin): + pass + + class FAMILY_RELATIONSHIP(PropertyMixin): + 
class PERSON1(PropertyMixin): + GENDER = None + pass + + class PERSON2(PropertyMixin): + GENDER = None + pass + + RELATION_FROM_1_TO_2 = None + + class SAMPLE_RELATIONSHIP(PropertyMixin): + NOTES = None + + class PARENT(PropertyMixin): + pass + + class CHILD(PropertyMixin): + pass + + class PARTICIPANT(PropertyMixin): + IS_PROBAND = None + FATHER_ID = None + MOTHER_ID = None + PROBAND_ID = None + RELATIONSHIP_TO_PROBAND = None + GENDER = None + SEX = None + ETHNICITY = None + RACE = None + CONSENT_TYPE = None + # affected by diagnoses/phenotypes specifically mentioned by the study + IS_AFFECTED_UNDER_STUDY = None + SPECIES = None + ENROLLMENT_AGE_DAYS = None + LAST_CONTACT_AGE_DAYS = None + + class ENROLLMENT_AGE(QuantityMixin): + pass + + class LAST_CONTACT_AGE(QuantityMixin): + pass + + class OUTCOME(PropertyMixin): + VITAL_STATUS = None + EVENT_AGE_DAYS = None + + class EVENT_AGE(QuantityMixin): + pass + + DISEASE_RELATED = None + + class DIAGNOSIS(PropertyMixin): + NAME = None + TUMOR_LOCATION = None + SPATIAL_DESCRIPTOR = None + CATEGORY = None + UBERON_TUMOR_LOCATION_ID = None + EVENT_AGE_DAYS = None + VERIFICATION = None + + class ABATEMENT_EVENT_AGE(QuantityMixin): + pass + + class EVENT_AGE(QuantityMixin): + pass + + MONDO_ID = None + NCIT_ID = None + ICD_ID = None + + class PHENOTYPE(PropertyMixin): + NAME = None + HPO_ID = None + SNOMED_ID = None + OBSERVED = None + EVENT_AGE_DAYS = None + INTERPRETATION = None + VERIFICATION = None + + class ABATEMENT_EVENT_AGE(QuantityMixin): + pass + + class EVENT_AGE(QuantityMixin): + pass + + class OBSERVATION(PropertyMixin): + NAME = None + ONTOLOGY_ONTOBEE_URI = None + ONTOLOGY_CODE = None + CATEGORY = None + INTERPRETATION = None + STATUS = None + ANATOMY_SITE = None + UBERON_ANATOMY_SITE_ID = None + + class EVENT_AGE(QuantityMixin): + pass + + class BIOSPECIMEN_GROUP(PropertyMixin): + pass + + class BIOSPECIMEN(PropertyMixin): + TISSUE_TYPE = None + NCIT_TISSUE_TYPE_ID = None + ANATOMY_SITE = None + 
NCIT_ANATOMY_SITE_ID = None + UBERON_ANATOMY_SITE_ID = None + TUMOR_DESCRIPTOR = None + COMPOSITION = None + STATUS = None + EVENT_AGE_DAYS = None + + class EVENT_AGE(QuantityMixin): + pass + + class QUANTITY(QuantityMixin): + pass + + class CONCENTRATION(QuantityMixin): + pass + + SPATIAL_DESCRIPTOR = None + SHIPMENT_ORIGIN = None + SHIPMENT_DATE = None + ANALYTE = None + CONCENTRATION_MG_PER_ML = None + VOLUME_UL = None + SAMPLE_PROCUREMENT = None + DBGAP_STYLE_CONSENT_CODE = None + CONSENT_SHORT_NAME = None + PRESERVATION_METHOD = None + HAS_MATCHED_NORMAL_SAMPLE = None + + class SAMPLE(PropertyMixin): + SAMPLE_TYPE = None + ANATOMY_SITE = None + NCIT_ANATOMY_SITE_ID = None + UBERON_ANATOMY_SITE_ID = None + TISSUE_TYPE = None + SAMPLE_PROCUREMENT = None + PRESERVATION_METHOD = None + DBGAP_STYLE_CONSENT_CODE = None + CONSENT_SHORT_NAME = None + VOLUME_UL = None + EVENT_AGE_DAYS = None + HAS_MATCHED_NORMAL_SAMPLE = None + EXTERNAL_COLLECTION_ID = None + + class EVENT_AGE(QuantityMixin): + pass + + class GENOMIC_FILE(PropertyMixin, FileMixin): + HARMONIZED = None + SOURCE_FILE = None + REFERENCE_GENOME = None + INDEXD_ID = None + DRS_URI = None + + class GENOMIC_INDEX_FILE(PropertyMixin, FileMixin): + HARMONIZED = None + SOURCE_FILE = None + REFERENCE_GENOME = None + INDEXD_ID = None + DRS_URI = None + + class READ_GROUP(PropertyMixin): + PAIRED_END = None + FLOW_CELL = None + LANE_NUMBER = None + QUALITY_SCALE = None + + class SEQUENCING(PropertyMixin): + DATE = None + STRATEGY = None + PAIRED_END = None + LIBRARY_NAME = None + LIBRARY_STRAND = None + LIBRARY_SELECTION = None + LIBRARY_PREP = None + PLATFORM = None + INSTRUMENT = None + INSERT_SIZE = None + REFERENCE_GENOME = None + MAX_INSERT_SIZE = None + MEAN_INSERT_SIZE = None + MEAN_DEPTH = None + TOTAL_READS = None + MEAN_READ_LENGTH = None + + class CENTER(PropertyMixin): + NAME = None + + class BIOSPECIMEN_GENOMIC_FILE(PropertyMixin): + pass + + class SEQUENCING_GENOMIC_FILE(PropertyMixin): + pass + + 
class BIOSPECIMEN_DIAGNOSIS(PropertyMixin): + pass + + class READ_GROUP_GENOMIC_FILE(PropertyMixin): + pass + + class IMAGING_DEVICE(PropertyMixin): + MANUFACTURER = None + MANUFACTURER_MODEL_NAME = None + MAGNETIC_FIELD_STRENGTH = None + SOFTWARE_VERSION = None + + class IMAGING_STUDY(PropertyMixin): + STATUS = None + TECHNIQUE = None + SEQUENCE = None + TOTAL_ACQUISITIONS = None + EVENT_AGE_DAYS = None + PROJECT_NAME = None + + class IMAGING_ACQUISITION(PropertyMixin): + BODY_SITE = None + NUMBER = None + MODALITY = None + LABEL = None + + class IMAGING_FILE(PropertyMixin, FileMixin): + TECHNIQUE = None + SEQUENCE = None + pass + + +def compile_schema(): + """ + "Compile" the concept schema + + Populate every concept class attribute with a string that represents + a path in the concept class hierarchy to reach that attribute. + + Store all the concept property strings in a set for later reference and + validation. + + This approach eliminates the need to manually assign concept class + attributes to a string. + """ + + property_path = [] + property_paths = set() + _set_cls_attrs( + CONCEPT, None, property_path, property_paths, include_root=False + ) + return property_paths + + +str_to_CONCEPT = {} + + +def _set_cls_attrs( + node, prev_node, property_path, property_paths, include_root=False +): + """ + Recursive method to traverse a class hierarchy and set class attributes + equal to a string which represents a path in the hierarchy to reach the + attribute. 
+ + For example, after running the method on this class definition: + class A: + class B: + ID = None + class C: + ID = None + AGE = None + + Given a delimiter set to '|', the values of the attributes would be: + A.AGE = "A|AGE" + A.B.ID = "A|B|ID" + A.B.C.ID = "A|B|C|ID" + """ + # Process a class or child node + if callable(node): + # Add class name to property path + property_path.append(str(node.__name__)) + # Iterate over class attrs + for attr_name, value in obj_attrs_to_dict(node).items(): + # Recurse + if callable(value): + _set_cls_attrs( + value, + node, + property_path, + property_paths, + include_root=include_root, + ) + else: + _set_cls_attrs( + attr_name, + node, + property_path, + property_paths, + include_root=include_root, + ) + # Process leaf nodes + else: + # Don't include root in property path + if not include_root: + property_path = property_path[1:] + # Create current path str + concept_name_str = DELIMITER.join(property_path) + # Add attribute to property path + property_path.append(node) + # Create property string + property_path_str = DELIMITER.join(property_path) + # Set attribute on class to equal the property path string OR + # The concept name path string if the attribute is _CONCEPT_NAME + if node == "_CONCEPT_NAME": + setattr(prev_node, node, concept_name_str) + str_to_CONCEPT[concept_name_str] = prev_node + else: + setattr(prev_node, node, property_path_str) + + # Add property string to list of property path strings + property_paths.add(property_path_str) + + property_path.pop() + + +# Set the concept class attributes with their serialized property strings +# Create a set of the serialized concept property strings +concept_property_set = compile_schema() diff --git a/d3b_api_client_cli/config/log.py b/d3b_api_client_cli/config/log.py index a6da0be..0325c96 100644 --- a/d3b_api_client_cli/config/log.py +++ b/d3b_api_client_cli/config/log.py @@ -9,12 +9,14 @@ import logging from logging.handlers import RotatingFileHandler -from 
d3b_api_client_cli.config import SECRETS, LOG_DIR -from d3b_api_client_cli.utils.misc import timestamp +from d3b_api_client_cli import utils +from d3b_api_client_cli.config import SECRETS, config -DEFAULT_LOG_FILENAME = "d3b_data_transfer_pipeline" -DEFAULT_LOG_LEVEL = "info" -DEFAULT_LOG_DIR = LOG_DIR +config = config["logging"] + +DEFAULT_LOG_LEVEL = config["default_log_level"] +DEFAULT_LOG_FILENAME = config["default_log_filename"] +DEFAULT_LOG_DIR = config["default_log_dir"] VERBOTEN_PATTERNS = { re.escape(os.environ[v]): f"" @@ -47,7 +49,7 @@ def format(self, record): DEFAULT_FORMATTER = NoTokenFormatter(DEFAULT_FORMAT) -def init_logger(log_level=None, log_dir=None, write_logs=True): +def init_logger(log_level=None, log_dir=None): """ Configure and create the logger @@ -70,21 +72,18 @@ def init_logger(log_level=None, log_dir=None, write_logs=True): root.setLevel(log_level) root.addHandler(console_handler) - log_filepath = None - if write_logs: - if not log_dir: - log_dir = DEFAULT_LOG_DIR - os.makedirs(log_dir, exist_ok=True) + # Also log to file + if not log_dir: + log_dir = DEFAULT_LOG_DIR + os.makedirs(log_dir, exist_ok=True) - # Create a new log file named with a timestamp - filename = f"{DEFAULT_LOG_FILENAME}-{timestamp()}.log" - log_filepath = os.path.join(log_dir, filename) + # Create a new log file named with a timestamp + filename = f"{DEFAULT_LOG_FILENAME}-{utils.timestamp()}.log" + log_filepath = os.path.join(log_dir, filename) - file_handler = RotatingFileHandler( - log_filepath, mode="w", maxBytes=MB_50 - ) - file_handler.setFormatter(DEFAULT_FORMATTER) + file_handler = RotatingFileHandler(log_filepath, mode="w", maxBytes=MB_50) + file_handler.setFormatter(DEFAULT_FORMATTER) - root.addHandler(file_handler) + root.addHandler(file_handler) return log_filepath diff --git a/d3b_api_client_cli/dewrangle/global_id.py b/d3b_api_client_cli/dewrangle/global_id.py deleted file mode 100644 index feed126..0000000 --- a/d3b_api_client_cli/dewrangle/global_id.py 
+++ /dev/null @@ -1,302 +0,0 @@ -""" -Dewrangle functions to create, update, remove global descriptors in Dewrangle -""" - -from enum import Enum -from typing import Optional -from pprint import pformat -import logging -import os - -import pandas - -from d3b_api_client_cli.dewrangle.graphql import study as study_api -from d3b_api_client_cli.dewrangle.rest.files import download_file - -from d3b_api_client_cli.config import config, ROOT_DATA_DIR, FhirResourceType -from d3b_api_client_cli.dewrangle.rest import ( - upload_study_file, -) -from d3b_api_client_cli.utils import timestamp - -logger = logging.getLogger(__name__) - -CSV_CONTENT_TYPE = "text/csv" -DEWRANGLE_BASE_URL = config["dewrangle"]["base_url"].rstrip("/") -DEFAULT_FILENAME = f"dewrangle-file-{timestamp()}.csv" - - -class GlobalIdDescriptorOptions(Enum): - """ - Used in download_global_descriptors - """ - - DOWNLOAD_ALL_DESC = "all" - DOWNLOAD_MOST_RECENT = "most-recent" - - -def upsert_and_download_global_descriptor( - descriptor: str, - fhir_resource_type: FhirResourceType, - global_id: Optional[str] = None, - study_global_id: Optional[str] = None, - dewrangle_study_id: Optional[str] = None, - skip_unavailable_descriptors: Optional[bool] = True, - download_all: Optional[bool] = True, - output_dir: Optional[str] = None, - output_filepath: Optional[str] = None, -) -> str: - """ - Upsert a single global descriptor and download created/updated - global descriptors and ID from Dewrangle - - Args: - See upsert_global_descriptors and - d3b_api_client_cli.dewrangle.rest.download_global_descriptors - - Options: - See upsert_global_descriptors and - d3b_api_client_cli.dewrangle.rest.download_global_descriptors - - Returns: - filepath: path to downloaded global ID descriptors - """ - if not output_dir: - output_dir = os.path.join(ROOT_DATA_DIR) - os.makedirs(output_dir, exist_ok=True) - - s_id = study_global_id if (study_global_id) else dewrangle_study_id - - filepath = os.path.join(output_dir, 
f"global-descriptors-{s_id}.csv") - - logger.info("✏️ Preparing to upsert single global descriptor ...") - logger.info("Writing parameters to file %s", filepath) - - row = {"descriptor": descriptor, "fhirResourceType": fhir_resource_type} - if global_id: - row["globalId"] = global_id - - pandas.DataFrame([row]).to_csv(filepath, index=False) - - return upsert_and_download_global_descriptors( - filepath, - study_global_id=study_global_id, - dewrangle_study_id=dewrangle_study_id, - skip_unavailable_descriptors=skip_unavailable_descriptors, - download_all=download_all, - output_dir=output_dir, - output_filepath=output_filepath, - ) - - -def upsert_and_download_global_descriptors( - input_filepath: str, - study_global_id: Optional[str] = None, - dewrangle_study_id: Optional[str] = None, - skip_unavailable_descriptors: Optional[bool] = True, - download_all: Optional[bool] = True, - output_dir: Optional[str] = None, - output_filepath: Optional[str] = None, -) -> str: - """ - Send request to upsert global descriptors and download created/updated - global descriptors and ID from Dewrangle - - Args: - See upsert_global_descriptors and - d3b_api_client_cli.dewrangle.rest.download_global_descriptors - - Options: - See upsert_global_descriptors and - d3b_api_client_cli.dewrangle.rest.download_global_descriptors - - Returns: - filepath: path to downloaded global ID descriptors - """ - if not output_dir: - output_dir = os.path.join(ROOT_DATA_DIR) - os.makedirs(output_dir, exist_ok=True) - - result = upsert_global_descriptors( - input_filepath, - study_global_id=study_global_id, - dewrangle_study_id=dewrangle_study_id, - skip_unavailable_descriptors=skip_unavailable_descriptors, - ) - - job_id = result["job"]["id"] - dewrangle_study_id = result["study_id"] - - filepath = download_global_descriptors( - dewrangle_study_id=dewrangle_study_id, - job_id=job_id, - download_all=download_all, - filepath=output_filepath, - output_dir=output_dir, - ) - - return filepath - - -def 
upsert_global_descriptors( - filepath: str, - study_global_id: Optional[str], - dewrangle_study_id: Optional[str], - skip_unavailable_descriptors: Optional[bool] = True, -): - """ - Upsert global descriptors to Dewrangle - - This happens in two steps: - 1. Upload the global descriptor csv file to the study file endpoint - 2. Invoke the graphQL mutation to upsert global descriptors - - Args: - - skip_unavailable_descriptors (bool): If true any errors due to a - descriptor already having a global ID assigned will be ignored - - Options: - - study_global_id - Provide this when you don't know the study's - GraphQL ID in Dewrangle. - - study_id - Study GraphQL ID in Dewrangle - - You must provide either the study_global_id OR the study_id but not both - - Raise: - ValueError if the study does not exist in Dewrangle - """ - if dewrangle_study_id: - study = study_api.read_study(dewrangle_study_id) - else: - study = study_api.find_study(study_global_id) - - if not study: - raise ValueError( - f"❌ Study " - f"{study_global_id if study_global_id else dewrangle_study_id}" - " does not exist in Dewrangle. 
Aborting" - ) - - study_global_id = study["globalId"] - dewrangle_study_id = study["id"] - - logger.info( - "🛸 Upsert global IDs in %s to Dewrangle for study %s", - filepath, - study_global_id, - ) - - filepath = os.path.abspath(filepath) - base_url = config["dewrangle"]["base_url"] - endpoint_template = config["dewrangle"]["endpoints"]["rest"]["study_file"] - endpoint = endpoint_template.format( - dewrangle_study_id=dewrangle_study_id, - filename=os.path.split(filepath)[-1], - ) - - url = f"{base_url}/{endpoint}" - logger.info("🛸 POST global IDs file %s to Dewrangle %s", filepath, url) - - result = upload_study_file(dewrangle_study_id, filepath=filepath) - study_file_id = result["id"] - - # Trigger global descriptor upsert mutation - resp = study_api.upsert_global_descriptors( - study_file_id, skip_unavailable_descriptors=skip_unavailable_descriptors - ) - result = resp["globalDescriptorUpsert"] - job_id = result["job"]["id"] - result["study_global_id"] = study_global_id - result["study_id"] = study["id"] - - logger.info( - "✅ Completed request to upsert global descriptors. Job ID: %s", job_id - ) - - return result - - -def download_global_descriptors( - dewrangle_study_id: Optional[str] = None, - study_global_id: Optional[str] = None, - job_id: Optional[str] = None, - download_all: Optional[bool] = True, - filepath: Optional[str] = None, - output_dir: Optional[str] = None, -) -> str: - """ - Download study's global IDs from Dewrangle - - Args: - - dewrangle_study_id: GraphQL ID of study in Dewrangle - - filepath: GraphQL ID of study in Dewrangle - - Options: - - job_id: The job ID returned from the upsert_global_descriptors - method. If this is provided, only global IDs from that - job will be returned. - - - download_all: Determines how many descriptors - will be returned for the global ID. 
- - If True, return all descriptors associated - with the global ID - - If False, return the most recent - descriptor associated with the global ID - - - filepath: If filepath is provided, download content to that filepath - - - output_dir: If output_dir is provided, get filename from - Content-Disposition header and download the file to the - output directory with that filename - """ - if dewrangle_study_id: - study = study_api.read_study(dewrangle_study_id) - else: - study = study_api.find_study(study_global_id) - - if not study: - raise ValueError( - f"❌ Study " - f"{study_global_id if study_global_id else dewrangle_study_id}" - " does not exist in Dewrangle. Aborting" - ) - - study_global_id = study["globalId"] - dewrangle_study_id = study["id"] - - if download_all: - descriptors = GlobalIdDescriptorOptions.DOWNLOAD_ALL_DESC.value - else: - descriptors = GlobalIdDescriptorOptions.DOWNLOAD_MOST_RECENT.value - - base_url = config["dewrangle"]["base_url"] - endpoint_template = config["dewrangle"]["endpoints"]["rest"]["global_id"] - endpoint = endpoint_template.format(dewrangle_study_id=dewrangle_study_id) - url = f"{base_url}/{endpoint}" - - # Download global IDs associated with this job only - params = {} - if job_id: - params.update({"job": job_id}) - - # Download all descriptors associated with each affected global id - if descriptors: - params.update({"descriptors": descriptors}) - - logger.info( - "🛸 Start download of global IDs for study %s from Dewrangle: %s" - " Params: %s", - study_global_id, - url, - pformat(params), - ) - - filepath = download_file( - url, output_dir=output_dir, filepath=filepath, params=params - ) - - logger.info("✅ Completed download of global IDs: %s", filepath) - - return filepath diff --git a/d3b_api_client_cli/dewrangle/graphql/__init__.py b/d3b_api_client_cli/dewrangle/graphql/__init__.py index c1da0d4..c78b7b7 100644 --- a/d3b_api_client_cli/dewrangle/graphql/__init__.py +++ b/d3b_api_client_cli/dewrangle/graphql/__init__.py @@ 
-3,14 +3,11 @@ - CRUD organization(s) - CRUD study(ies) -- CRUD volume(s) -- CRUD credential(s) -- Read jobs +- Read fhir ingest jobs +- CRUD fhir servers """ from d3b_api_client_cli.dewrangle.graphql.organization import * from d3b_api_client_cli.dewrangle.graphql.study import * -from d3b_api_client_cli.dewrangle.graphql.credential import * -from d3b_api_client_cli.dewrangle.graphql.volume import * from d3b_api_client_cli.dewrangle.graphql.job import * -from d3b_api_client_cli.dewrangle.graphql.billing_group import * +from d3b_api_client_cli.dewrangle.graphql.fhir_server import * diff --git a/d3b_api_client_cli/dewrangle/graphql/billing_group/__init__.py b/d3b_api_client_cli/dewrangle/graphql/billing_group/__init__.py deleted file mode 100644 index 986ece9..0000000 --- a/d3b_api_client_cli/dewrangle/graphql/billing_group/__init__.py +++ /dev/null @@ -1,240 +0,0 @@ -""" -GraphQL methods to CRUD billing_group in Dewrangle -""" - -import os -import logging -from pprint import pformat, pprint - -import gql - -from d3b_api_client_cli.dewrangle.graphql.common import ( - exec_query, -) -from d3b_api_client_cli.dewrangle.graphql.billing_group import ( - queries, - mutations, -) -from d3b_api_client_cli.dewrangle.graphql.organization import ( - paginate_organizations, -) -from d3b_api_client_cli.config import config -from d3b_api_client_cli.utils import ( - write_json, -) - -logger = logging.getLogger(__name__) - -DEWRANGLE_DIR = config["dewrangle"]["output_dir"] -DEWRANGLE_MAX_PAGE_SIZE = config["dewrangle"]["pagination"]["max_page_size"] - - -def create_or_find_billing_group( - organization_id: str, cavatica_billing_group_id: str -) -> dict: - """ - Create billing_group if it does not exist, otherwise return - the existing billing group in Dewrangle - - Arguments: - organization_id - Dewrangle ID of organization - cavatica_billing_group_id - Cavatica billing group ID - - Returns: - Dewrangle billing_group dict - """ - billing_group = create_billing_group( - 
organization_id, cavatica_billing_group_id - ) - if not billing_group: - return find_billing_group(cavatica_billing_group_id) - else: - return billing_group - - -def create_billing_group( - organization_id: str, cavatica_billing_group_id: str -) -> dict: - """ - Create billing_group in Dewrangle - - Arguments: - organization_id - Dewrangle ID of organization - cavatica_billing_group_id - Cavatica billing group ID - - Returns: - Dewrangle billing_group dict - """ - params = { - "input": { - "organizationId": organization_id, - "cavaticaBillingGroupId": cavatica_billing_group_id, - } - } - - key = "Create" - resp = exec_query(mutations.create_billing_group, variables=params) - - errors = resp.get(f"billingGroup{key}", {}).get("errors") - if errors: - logger.error("❌ %s billing_group failed:\n%s", key, pformat(resp)) - else: - logger.info("✅ %s billing_group succeeded:\n%s", key, pformat(resp)) - - result = resp["billingGroupCreate"]["billingGroup"] - if not errors: - result["organization_id"] = organization_id - - return result - - -def delete_billing_group( - _id: str, - delete_safety_check: bool = True, -) -> dict: - """ - Delete billing_group in Dewrangle - - Arguments: - _id - Dewrangle ID - delete_safety_check - only delete if this is False - - Returns: - Response from Dewrangle - """ - node_id = _id - - resp = exec_query( - mutations.delete_billing_group, - variables={"id": node_id}, - delete_safety_check=delete_safety_check, - ) - - errors = resp.get("billingGroupDelete", {}).get("errors") - key = "Delete" - if errors: - result = errors - logger.error("❌ %s billing_group failed:\n%s", key, pformat(resp)) - else: - logger.info("✅ %s billing_group succeeded:\n%s", key, pformat(resp)) - result = resp["billingGroupDelete"]["billingGroup"] - result["id"] = node_id - - return result - - -def read_billing_groups( - output_dir: str = DEWRANGLE_DIR, log_output: bool = True -) -> list[dict]: - """ - Fetch billing_groups that the client has access to - - Arguments: - 
output_dir - directory where billing_group metadata will be written - log_output - whether to log billing_group dicts - - Returns: - List of billing_group dicts - """ - data = paginate_billing_groups() - - if output_dir: - os.makedirs(output_dir, exist_ok=True) - filepath = os.path.join(output_dir, "BillingGroup.json") - write_json(data, filepath) - logger.info("✏️ Wrote %s billing_group to %s", len(data), filepath) - - if log_output: - logger.info("💰 BillingGroups:\n%s", pformat(data)) - - return data - - -def read_billing_group(node_id: str) -> dict: - """ - Fetch billing_group by node id - """ - variables = {"id": node_id} - resp = exec_query(queries.billing_group, variables=variables) - billing_group = resp.get("node", {}) - - if billing_group: - logger.info( - "🔎 Found Dewrangle billing_group %s:\n%s", - billing_group["id"], - pformat(billing_group), - ) - else: - logger.error("❌ Not Found: dewrangle billing_group %s", node_id) - - return billing_group - - -def paginate_billing_groups( - organizations=None, billing_group_page_size=DEWRANGLE_MAX_PAGE_SIZE -): - """ - Fetch all billing_groups in all organizations that the viewer has access to - - Use Relay graphql pagination - """ - if not organizations: - organizations = paginate_organizations() - - logger.info("📄 Paginating Dewrangle billing_groups ...") - - billing_groups = {} - for org in organizations: - variables = {"first": billing_group_page_size, "id": org["id"]} - resp = exec_query(queries.org_billing_groups, variables=variables) - - count = 0 - has_next_page = True - while has_next_page: - org_billing_groups = resp["node"]["billingGroups"]["edges"] - page_info = resp["node"]["billingGroups"]["pageInfo"] - - count += len(org_billing_groups) - total = resp["node"]["billingGroups"]["totalCount"] - - if not total: - has_next_page = False - continue - - logger.info( - "Collecting %s billing_groups for org %s", - count / total, - org["name"], - ) - # Add billing_groups to ouput - for s in 
org_billing_groups: - billing_group = s["node"] - billing_group["organization_id"] = org["id"] - billing_groups[billing_group["cavaticaBillingGroupId"]] = ( - billing_group - ) - logger.info( - "Found billing_group %s", - billing_group["cavaticaBillingGroupId"], - ) - - # Fetch next page if there is one - has_next_page = page_info["hasNextPage"] - end_cursor = page_info["endCursor"] - if has_next_page and end_cursor: - variables.update({"after": end_cursor}) - resp = exec_query( - queries.org_billing_groups, variables=variables - ) - - return billing_groups - - -def find_billing_group(cavatica_billing_group_id: str) -> dict: - """ - Find billing_group using cavatica billing group id. - Use this when you don't know the org ID - """ - billing_groups = paginate_billing_groups() - - return billing_groups.get(cavatica_billing_group_id, {}) diff --git a/d3b_api_client_cli/dewrangle/graphql/billing_group/mutations.py b/d3b_api_client_cli/dewrangle/graphql/billing_group/mutations.py deleted file mode 100644 index 68c8eb4..0000000 --- a/d3b_api_client_cli/dewrangle/graphql/billing_group/mutations.py +++ /dev/null @@ -1,48 +0,0 @@ -""" -Dewrangle GraphQL mutation definitions -""" - -from gql import gql - - -create_billing_group = gql( - """ - mutation billingGroupCreateMutation($input: BillingGroupCreateInput!) { - billingGroupCreate(input: $input) { - errors { - ... on MutationError { - __typename - message - field - } - } - billingGroup { - id - name - cavaticaBillingGroupId - } - } - } - """ -) - -delete_billing_group = gql( - """ - mutation billingGroupDeleteMutation($id: ID!) { - billingGroupDelete(id: $id) { - errors { - ... 
on MutationError { - __typename - message - field - } - } - billingGroup { - id - name - cavaticaBillingGroupId - } - } - } - """ -) diff --git a/d3b_api_client_cli/dewrangle/graphql/billing_group/queries.py b/d3b_api_client_cli/dewrangle/graphql/billing_group/queries.py deleted file mode 100644 index 10a4cee..0000000 --- a/d3b_api_client_cli/dewrangle/graphql/billing_group/queries.py +++ /dev/null @@ -1,54 +0,0 @@ -""" -Dewrangle GraphQL query definitions -""" - -from gql import gql - -billing_group = gql( - """ - query BillingGroupQuery($id: ID!) { - node(id: $id) { - id - ... on BillingGroup { - cavaticaBillingGroupId - name - id - organization { - name - id - } - } - } - } - """ -) - -org_billing_groups = gql( - """ - query orgBillingGroups($id: ID!, $first: Int, $after: ID) { - node(id: $id) { - id - ... on Organization { - name - id - billingGroups(first: $first, after: $after) { - totalCount - pageInfo { - hasNextPage - endCursor - } - edges { - cursor - node - { - id - cavaticaBillingGroupId - name - } - } - } - } - } - } - """ -) diff --git a/d3b_api_client_cli/dewrangle/graphql/common.py b/d3b_api_client_cli/dewrangle/graphql/common.py index 96d9b7b..ff0537d 100644 --- a/d3b_api_client_cli/dewrangle/graphql/common.py +++ b/d3b_api_client_cli/dewrangle/graphql/common.py @@ -5,20 +5,19 @@ import logging from gql import Client -from gql.transport.aiohttp import AIOHTTPTransport from graphql import print_ast +from gql.transport.aiohttp import AIOHTTPTransport from d3b_api_client_cli.config import ( config, DEWRANGLE_DEV_PAT, - check_dewrangle_http_config, + DEWRANGLE_MAX_PAGE_SIZE, +) +from d3b_api_client_cli.dewrangle.graphql.queries import ( + viewer_query, ) from d3b_api_client_cli import utils -DEWRANGLE_BASE_URL = config["dewrangle"]["base_url"] -DEWRANGLE_MAX_PAGE_SIZE = config["dewrangle"]["pagination"]["max_page_size"] -EXECUTION_TIMEOUT = config["dewrangle"]["client"]["execution_timeout"] - logger = logging.getLogger(__name__) graphql_client = None 
@@ -26,20 +25,24 @@ gql_logger.setLevel(level=logging.CRITICAL) -def create_graphql_client() -> Client: +def create_graphql_client(): """ Create a gql GraphQL client that will exec queries asynchronously """ - # Ensure env vars are set - check_dewrangle_http_config() - base_url = config["dewrangle"]["base_url"] endpoint = config["dewrangle"]["endpoints"]["graphql"] - url = f"{base_url}/{endpoint}" - logger.info( - "🛠️ Setting up GraphQL client for %s", - f"{base_url.rstrip('/')}/{endpoint}", - ) + base = base_url.rstrip("/") + path = endpoint.lstrip("/") + url = f"{base}/{path}" + logger.info(f"🛠️ Setting up GraphQL client for {url}") + + if not DEWRANGLE_DEV_PAT: + raise Exception( + "❌ Cannot continue GraphQL operation because the environment" + " variable, DEWRANGLE_DEV_PAT, is not set. Please create a" + " personal access token on Dewrangle and set it in your environment" + " in the DEWRANGLE_DEV_PAT variable" + ) headers = {"x-api-key": DEWRANGLE_DEV_PAT} transport = AIOHTTPTransport(url=url, headers=headers) @@ -47,12 +50,12 @@ def create_graphql_client() -> Client: # Create a GraphQL client using the defined transport return Client( transport=transport, - fetch_schema_from_transport=True, - execute_timeout=EXECUTION_TIMEOUT, + fetch_schema_from_transport=False, + execute_timeout=30, ) -def exec_query(gql_query, variables=None, delete_safety_check=True): +def exec_query(gql_query, variables=None): """ Execute a graphql query and handle errors gracefully @@ -65,7 +68,7 @@ def exec_query(gql_query, variables=None, delete_safety_check=True): """ base_url = config["dewrangle"]["base_url"] str_query = print_ast(gql_query) - if delete_safety_check and "delete" in str_query.lower(): + if "delete" in str_query.lower(): utils.delete_safety_check(base_url) global graphql_client @@ -73,3 +76,50 @@ def exec_query(gql_query, variables=None, delete_safety_check=True): graphql_client = create_graphql_client() return graphql_client.execute(gql_query, 
variable_values=variables) + + +def viewer_entities(): + """ + Execute the viewer query, extract entities and flatten into lists of + entities. Return a dict with lists of flattened entities + """ + # NOTE: We should really implement pagination here, but for now we're + # just going to try and pull all the studies in one query + variables = {"first": DEWRANGLE_MAX_PAGE_SIZE} + resp = exec_query(viewer_query, variables=variables) + org_nodes = resp["viewer"]["organizationUsers"]["edges"] + if org_nodes: + organizations = [ + edge["node"]["organization"] + for edge in resp["viewer"]["organizationUsers"]["edges"] + ] + else: + organizations = [] + + studies = [] + fhir_servers = [] + + # orgs + for org in organizations: + # studies + for edge in org["studies"]["edges"]: + study = edge["node"] + study["organization_id"] = org["id"] + # fhir servers + server = study["fhirServerDeployments"] + ids = { + "study_fhir_server_id": server["id"], + "fhir_server_id": server["fhirServer"]["id"], + "study_id": study["id"], + "study_global_id": study["globalId"], + "organization_id": org["id"], + } + server.update(ids) + fhir_servers.append(server) + studies.append(study) + + return { + "organizations": organizations, + "studies": studies, + "study_fhir_servers": fhir_servers, + } diff --git a/d3b_api_client_cli/dewrangle/graphql/credential/__init__.py b/d3b_api_client_cli/dewrangle/graphql/credential/__init__.py deleted file mode 100644 index 736ff88..0000000 --- a/d3b_api_client_cli/dewrangle/graphql/credential/__init__.py +++ /dev/null @@ -1,280 +0,0 @@ -""" -GraphQL methods to created and delete AWS credential in Dewrangle -""" - -import os -import logging -from pprint import pformat, pprint -from collections import defaultdict - -import gql - -from d3b_api_client_cli.dewrangle.graphql.common import ( - exec_query, -) -from d3b_api_client_cli.dewrangle.graphql.study import ( - paginate_studies, - find_study, -) -from d3b_api_client_cli.dewrangle.graphql.credential import ( - 
queries, - mutations, -) -from d3b_api_client_cli.config import config -from d3b_api_client_cli.utils import ( - write_json, -) - -logger = logging.getLogger(__name__) - -DEWRANGLE_DIR = config["dewrangle"]["output_dir"] -DEWRANGLE_MAX_PAGE_SIZE = config["dewrangle"]["pagination"]["max_page_size"] - - -def upsert_credential( - variables: dict, study_id=None, study_global_id=None -) -> dict: - """ - Upsert credential in Dewrangle - - Arguments: - variables - Credential attributes (see Dewrangle graphql schema) - study_id - Graphql node ID of the credential's study - study_global_id - Global ID of credential's study - Returns: - Dewrangle credential dict - """ - if not (study_id or study_global_id): - raise ValueError( - "❌ Either the graphql node ID or global ID of the credential's" - " study must be provided to either create or update the credential" - ) - - credential_key = variables.get("key") - - # If no study id provided, try querying for it via global ID - if not study_id: - study_id = find_study(study_global_id).get("id") - - # Try finding existing credential - if not credential_key: - credential = None - else: - credential = find_credential(credential_key, study_id) - - params = {"input": variables} - - if credential: - key = "Update" - - # Remove any immutable fields - for immutable_attr in ["key", "secret", "type", "studyId"]: - params["input"].pop(immutable_attr, None) - - params.update({"id": credential["id"]}) - resp = exec_query(mutations.update_credential, variables=params) - else: - key = "Create" - params["input"].update({"studyId": study_id}) - params.pop("id", None) - resp = exec_query(mutations.create_credential, variables=params) - - errors = resp.get(f"credential{key}", {}).get("errors") - if errors: - logger.error("❌ %s credential failed:\n%s", key, pformat(resp)) - result = errors - else: - logger.info("✅ %s credential succeeded:\n%s", key, pformat(resp)) - - result = resp[f"credential{key}"]["credential"] - result["id"] = result["id"] - 
result["study_id"] = result["study"]["id"] - - return result - - -def delete_credential( - node_id: str = None, - credential_key: str = None, - study_global_id: str = None, - delete_safety_check: bool = True, -) -> dict: - """ - Delete credential in Dewrangle - - Arguments: - node_id - Dewrangle graphql node ID - credential_key - Credential key - study_global_id - Global ID of credential's study - delete_safety_check - only delete if this is False - - Returns: - Response from Dewrangle - """ - if not (node_id or (credential_key and study_global_id)): - raise ValueError( - "❌ You must provide either the credential graphql ID or" - " credential key and study global ID to look up the credential" - ) - if credential_key: - study_id = find_study(study_global_id).get("id") - credential = find_credential(credential_key, study_id) - node_id = credential.get("id") - if not node_id: - logger.warning( - "⚠️ Could not find associated dewrangle ID." - " Delete credential %s ABORTED", - node_id, - ) - return - - resp = exec_query( - mutations.delete_credential, - variables={"id": node_id}, - delete_safety_check=delete_safety_check, - ) - - errors = resp.get("credentialDelete", {}).get("errors") - key = "Delete" - if errors: - result = errors - logger.error("❌ %s credential failed:\n%s", key, pformat(resp)) - else: - logger.info("✅ %s credential succeeded:\n%s", key, pformat(resp)) - result = resp["credentialDelete"]["credential"] - result["id"] = node_id - - return result - - -def read_credential(node_id: str) -> dict: - """ - Fetch credential by node id - """ - variables = {"id": node_id} - resp = exec_query(queries.credential, variables=variables) - credential = resp.get("node", {}) - - if credential: - logger.info( - "🔎 Found Dewrangle credential %s:\n%s", - credential["globalId"], - pformat(credential), - ) - else: - logger.error("❌ Not Found: dewrangle credential %s", node_id) - - return credential - - -def read_credentials( - study_global_id=None, - output_dir: str = 
DEWRANGLE_DIR, - log_output: bool = True, -) -> list[dict]: - """ - Fetch credentials that the client has access to - - Arguments: - study_global_id - Global ID of credential's study - output_dir - directory where study metadata will be written - log_output - whether to log study dicts - - Returns: - List of credential dicts - """ - study_id = None - if study_global_id: - study_id = find_study(study_global_id).get("id") - - data = paginate_credentials(study_id=study_id) - - if output_dir: - os.makedirs(output_dir, exist_ok=True) - filepath = os.path.join(output_dir, "Credential.json") - write_json(data, filepath) - logger.info("✏️ Wrote %s credential to %s", len(data), filepath) - - if log_output: - logger.info("🔐 Credentials:\n%s", pformat(data)) - - return data - - -def paginate_credentials( - studies=None, study_id=None, dewrangle_page_size=DEWRANGLE_MAX_PAGE_SIZE -) -> dict: - """ - Fetch all credentials in all studies that the viewer has access to - - Optionally filter credentials by study. Uses Relay graphql pagination - - Returns: - dict that looks like this - { - "credential_key1": { - "study1": { - - }, - "study2": { - - } - }, - "credential_key2": ... 
- } - """ - if not studies: - studies = paginate_studies() - - logger.info("📄 Paginating Dewrangle studies ...") - - credentials = defaultdict(lambda: defaultdict(dict)) - for study in studies.values(): - if study_id and (study.get("id") != study_id): - continue - variables = {"first": dewrangle_page_size, "id": study["id"]} - resp = exec_query(queries.study_credentials, variables=variables) - - count = 0 - has_next_page = True - while has_next_page: - study_credentials = resp["node"]["credentials"]["edges"] - page_info = resp["node"]["credentials"]["pageInfo"] - - count += len(study_credentials) - total = resp["node"]["credentials"]["totalCount"] - - if not total: - has_next_page = False - continue - - logger.info( - "Collecting %s credentials for study %s", - count / total, - study["name"], - ) - # Add credentials to ouput - for s in study_credentials: - credential = s["node"] - credential["study_id"] = study["id"] - credential["study_global_id"] = study["globalId"] - credentials[credential["key"]][study["id"]] = credential - - # Fetch next page if there is one - has_next_page = page_info["hasNextPage"] - end_cursor = page_info["endCursor"] - if has_next_page and end_cursor: - variables.update({"after": end_cursor}) - resp = exec_query( - queries.study_credentials, variables=variables - ) - - return dict(credentials) - - -def find_credential(credential_key: str, study_id: str) -> dict: - """ - Find credential using credential key and study id. 
- """ - credentials = paginate_credentials(study_id=study_id) - return credentials.get(credential_key, {}).get(study_id, {}) diff --git a/d3b_api_client_cli/dewrangle/graphql/credential/mutations.py b/d3b_api_client_cli/dewrangle/graphql/credential/mutations.py deleted file mode 100644 index eafbe35..0000000 --- a/d3b_api_client_cli/dewrangle/graphql/credential/mutations.py +++ /dev/null @@ -1,83 +0,0 @@ -""" -Dewrangle GraphQL mutation definitions -""" - -from gql import gql - - -create_credential = gql( - """ - mutation credentialCreateMutation($input: CredentialCreateInput!) { - credentialCreate(input: $input) { - errors { - ... on MutationError { - __typename - message - field - } - } - credential { - id - name - key - study { - name - id - globalId - } - } - } - } - """ -) -update_credential = gql( - """ - mutation credentialUpdateMutation($id: ID!, $input: CredentialUpdateInput!) { - credentialUpdate(id: $id, input: $input) { - errors { - ... on MutationError { - __typename - message - field - } - } - credential { - id - name - key - study { - name - id - globalId - } - } - } - } - """ -) - -delete_credential = gql( - """ - mutation credentialDeleteMutation($id: ID!) { - credentialDelete(id: $id) { - errors { - ... on MutationError { - __typename - message - field - } - } - credential { - id - name - key - study { - name - id - globalId - } - } - } - } - """ -) diff --git a/d3b_api_client_cli/dewrangle/graphql/credential/queries.py b/d3b_api_client_cli/dewrangle/graphql/credential/queries.py deleted file mode 100644 index 10d0252..0000000 --- a/d3b_api_client_cli/dewrangle/graphql/credential/queries.py +++ /dev/null @@ -1,56 +0,0 @@ -""" -Dewrangle GraphQL query definitions -""" - -from gql import gql - -credential = gql( - """ - query credentialQuery($id: ID!) { - node(id: $id) { - id - ... 
on Credential { - id - name - key - study { - name - id - globalId - } - } - } - } - """ -) - -study_credentials = gql( - """ - query studyCredentials($id: ID!, $first: Int, $after: ID) { - node(id: $id) { - id - ... on Study { - name - id - globalId - credentials(first: $first, after: $after) { - totalCount - pageInfo { - hasNextPage - endCursor - } - edges { - cursor - node - { - id - name - key - } - } - } - } - } - } - """ -) diff --git a/d3b_api_client_cli/dewrangle/graphql/fhir_server/__init__.py b/d3b_api_client_cli/dewrangle/graphql/fhir_server/__init__.py new file mode 100644 index 0000000..2205d24 --- /dev/null +++ b/d3b_api_client_cli/dewrangle/graphql/fhir_server/__init__.py @@ -0,0 +1,116 @@ +""" +GraphQL methods to CRUD fhir_server in Dewrangle +""" + +import os +import logging +from pprint import pformat + +from d3b_api_client_cli.dewrangle.graphql.common import ( + exec_query, +) +from d3b_api_client_cli.dewrangle.graphql.fhir_server import ( + queries, + mutations, +) +from d3b_api_client_cli.config import DEWRANGLE_DIR +from d3b_api_client_cli.utils import write_json + +logger = logging.getLogger(__name__) + + +def upsert_fhir_server( + dewrangle_organization_id, variables, oidc_client_secret=None +): + """ + Upsert fhir_server in Dewrangle + + :param variables: FhirServer attributes (see Dewrangle graphql schema) + :type variables: dict + :rtype: dict + :returns: the fhir_server + """ + variables.update({"organizationId": dewrangle_organization_id}) + if oidc_client_secret: + auth_config = variables.get("authConfig", {}) + auth_config["clientSecret"] = oidc_client_secret + variables.update({"authConfig": auth_config}) + + params = {"input": variables} + + # Check if this is an update or create + servers = read_fhir_servers(dewrangle_organization_id) + update = False + for server in servers: + if server["name"] == variables["name"]: + update = True + break + + if update: + key = "Update" + params["input"].pop("organizationId", None) + 
params["input"].pop("type", None) + params.update({"id": server["id"]}) + dwid = server["id"] + resp = exec_query(mutations.update_fhir_server, variables=params) + else: + key = "Create" + resp = exec_query(mutations.create_fhir_server, variables=params) + + errors = resp.get(f"fhirServer{key}", {}).get("errors") + if errors: + logger.warning(f"‼️ {key} fhir_server failed:\n{pformat(resp)}") + else: + logger.info(f"✅ {key} fhir_server succeeded:\n{pformat(resp)}") + + result = resp[f"fhirServer{key}"]["fhirServer"] + + return result + + +def read_fhir_servers(dewrangle_organization_id, output_dir=DEWRANGLE_DIR): + """ + Fetch FhirServers that the client has access to + + :rtype: dict + :returns: the FHIR servers + """ + params = {"id": dewrangle_organization_id} + resp = exec_query(queries.organization_fhir_servers, variables=params) + data = [ + edge.get("node", {}) for edge in resp["node"]["fhirServers"]["edges"] + ] + logger.info(f"Fetched {len(data)} fhir_servers") + + if output_dir: + os.makedirs(output_dir, exist_ok=True) + filepath = os.path.join(output_dir, "FhirServer.json") + write_json(data, filepath) + logger.info(f"✏️ Wrote {len(data)} fhir_server to {filepath}") + + logger.info(f"🔥 FhirServers:\n{pformat(data)}") + + return data + + +def delete_fhir_server(node_id): + """ + Delete fhir_server in Dewrangle + + :param node_id: Dewrangle node ID of the fhir_server + :type node_id: str + :rtype: dict + :returns: the response + """ + resp = exec_query(mutations.delete_fhir_server, variables={"id": node_id}) + + errors = resp.get("fhirServerDelete", {}).get("errors") + if errors: + result = errors + logger.warning(f"🚮 ‼️ Delete fhir_server failed:\n{pformat(resp)}") + else: + logger.info(f"🚮 Deleted fhir_server:\n{pformat(resp)}") + result = resp["fhirServerDelete"]["fhirServer"] + result["id"] = node_id + + return result diff --git a/d3b_api_client_cli/dewrangle/graphql/fhir_server/mutations.py 
b/d3b_api_client_cli/dewrangle/graphql/fhir_server/mutations.py new file mode 100644 index 0000000..e40487e --- /dev/null +++ b/d3b_api_client_cli/dewrangle/graphql/fhir_server/mutations.py @@ -0,0 +1,85 @@ +""" +Dewrangle GraphQL mutation definitions +""" + +from gql import gql + +create_fhir_server = gql( + """ + mutation fhirServerCreateMutation($input: FhirServerCreateInput!) { + fhirServerCreate(input: $input) { + errors { + ... on MutationError { + __typename + message + field + } + } + fhirServer { + id + name + url + type + authType + authConfig { + ... on FhirServerAuthConfigOIDCClientCredential { + issuerBaseUrl + clientId + } + } + } + } + } + """ +) +update_fhir_server = gql( + """ + mutation fhirServerUpdateMutation( + $id: ID! + $input: FhirServerUpdateInput! + ) { + fhirServerUpdate(id: $id, input: $input) { + errors { + ... on MutationError { + __typename + message + field + } + } + fhirServer { + id + name + url + type + authType + authConfig { + ... on FhirServerAuthConfigOIDCClientCredential { + issuerBaseUrl + clientId + } + } + } + } + } + """ +) + +delete_fhir_server = gql( + """ + mutation fhirServerDeleteMutation($id: ID!) { + fhirServerDelete(id: $id) { + errors { + ... on MutationError { + __typename + message + field + } + } + fhirServer { + id + name + } + } + } + """ +) diff --git a/d3b_api_client_cli/dewrangle/graphql/fhir_server/queries.py b/d3b_api_client_cli/dewrangle/graphql/fhir_server/queries.py new file mode 100644 index 0000000..921723b --- /dev/null +++ b/d3b_api_client_cli/dewrangle/graphql/fhir_server/queries.py @@ -0,0 +1,35 @@ +""" +Dewrangle GraphQL query definitions +""" + +from gql import gql + +organization_fhir_servers = gql( + """ + query organizationQuery($id: ID!) { + node(id: $id) { + id + ... on Organization { + id + name + fhirServers { + edges { + node { + id + name + url + authType + authConfig { + ... 
on FhirServerAuthConfigOIDCClientCredential { + issuerBaseUrl + clientId + } + } + } + } + } + } + } + } + """ +) diff --git a/d3b_api_client_cli/dewrangle/graphql/job/__init__.py b/d3b_api_client_cli/dewrangle/graphql/job/__init__.py index 6142d8c..3c0046c 100644 --- a/d3b_api_client_cli/dewrangle/graphql/job/__init__.py +++ b/d3b_api_client_cli/dewrangle/graphql/job/__init__.py @@ -1,198 +1,80 @@ """ -GraphQL methods for jobs in Dewrangle +GraphQL methods to CRUD organization in Dewrangle """ -import time import os import logging from pprint import pformat -from typing import Callable, Optional - -from graphql import DocumentNode from d3b_api_client_cli.dewrangle.graphql.common import exec_query from d3b_api_client_cli.dewrangle.graphql.job import ( queries, mutations, ) -from d3b_api_client_cli.config import config -from d3b_api_client_cli.utils import ( - write_json, -) +from d3b_api_client_cli.config import DEWRANGLE_DIR +from d3b_api_client_cli.utils import write_json logger = logging.getLogger(__name__) -DEWRANGLE_DIR = config["dewrangle"]["output_dir"] -DEWRANGLE_MAX_PAGE_SIZE = config["dewrangle"]["pagination"]["max_page_size"] - - -def poll_job( - job_id: str, - timeout_seconds: Optional[int] = None, - interval_seconds: Optional[int] = 30, -): - """ - Poll for status on a Dewrangle FHIR ingest job - - See _poll_job for details - """ - job_query = queries.job - - def is_complete(resp): - complete = resp["node"]["completedAt"] is not None - success = not resp["node"]["errors"]["edges"] - - return {"complete": complete, "success": success} - - return _poll_job( - job_id, - job_query, - is_complete, - timeout_seconds=timeout_seconds, - interval_seconds=interval_seconds, - ) - - -def _validate_status_format(status: dict): - """ - Validate that the deveoper supplied a properly formatted function for - job completion - """ - expected = {"complete": "", "success": ""} - for key in ["complete", "success"]: - if key not in status: - raise ValueError( - "❌ Invalid 
poll job complete function. Must return a dict " - f"with the following format: {pformat(expected)}" - ) - - -def _poll_job( - job_id: str, - job_query: DocumentNode, - complete_function: Callable[[dict], dict], - timeout_seconds: Optional[int] = None, - interval_seconds: Optional[int] = 30, -) -> dict: - """ - Poll for status on a Dewrangle job - - If timeout is not set poll until job is complete. - If timeout is set, poll until job is complete or timeout expires - - Wait interval_seconds between each status request to Dewrangle. - - Arguments: - node_id - Dewrangle node ID of the job - - job_query - A GraphQL query to fetch the job - - complete_function - A method which determines when the job is - complete and if it succeeded. This method will take in a dict containing - the output of the graphql query and it must return a dict containing the - following: { "complete": boolean, "success": boolean } - - Returns: - a dict of the form {"success": boolean or None, "job": job_dict} - - success = True if job is complete without errors - success = False if job is complete with errors - success = None if timeout is set, and timeout is exceeded and - job is not complete +def read_fhir_ingest_job(node_id, output_dir=DEWRANGLE_DIR): """ - elapsed_time_seconds = 0 - start_time = time.time() - - while True: - # Fetch job - params = {"id": job_id} - resp = exec_query(job_query, variables=params) - - job = resp["node"] - node_id = job["id"] - operation = job["operation"].lower().replace("_", "-") - - # Check completion status - status = complete_function(resp) - _validate_status_format(status) - - # Job completed - if status["complete"] or (not status["success"]): - success = status["success"] - emoji = "✅ " if success else "❌" - suffix = "" if success else " with errors" - logger.info( - "%s Job %s %s completed%s:\n%s", - emoji, - operation, - node_id, - suffix, - pformat(job), - ) - - return {"success": status["complete"], "job": job} - - elapsed_time_seconds = time.time() - 
start_time - elapsed_formatted = time.strftime( - "%H:%M:%S", time.gmtime(elapsed_time_seconds) - ) - - # Timeout exceeded - if (timeout_seconds is not None) and ( - elapsed_time_seconds > timeout_seconds - ): - logger.warning( - "⚠️ Timeout of %s seconds expired." - " Current job %s %s result:\n%s" - "\n✌️ Dewrangle must still be working, but CLI is exiting", - timeout_seconds, - operation, - node_id, - pformat(job), - ) - return {"success": None, "job": job} - - # Continue polling - logger.info( - "⏰ Waiting for job %s %s to" - " complete. Elapsed time (hh:mm:ss): %s", - operation, - node_id, - elapsed_formatted, - ) - - time.sleep(interval_seconds) + Fetch Job by ID from Dewrangle - -def read_job(node_id: str, output_dir: str = DEWRANGLE_DIR) -> dict: - """ - Fetch Job by ID from Dewrangle. Mostly for developer debugging purposes + :param node_id: Dewrangle node ID of the job + :type node_id: str + :rtype: dict + :returns: the job """ params = {"id": node_id} - resp = exec_query(queries.job, variables=params) + resp = exec_query(queries.fhir_resource_ingest_job, variables=params) + result = resp["node"] - logger.info("Fetched job %s", result["id"]) operation = result["operation"].lower().replace("_", "-") - errors = result["errors"]["edges"] + logger.info(f"Fetched job {result['id']}") if errors: - logger.error("❌ Read job %s failed:\n%s", operation, pformat(errors)) + logger.warning(f"‼️ Read job {operation} failed:\n{pformat(errors)}") else: - logger.info("🚦 Job-%s:\n%s", operation, pformat(result)) + logger.info(f"🚦 Job-{operation}:\n{pformat(result)}") if output_dir: os.makedirs(output_dir, exist_ok=True) filepath = os.path.join(output_dir, f"Job-{operation}.json") write_json(result, filepath) - emoji = "❌" if errors else "✅" + emoji = "‼️ " if errors else "✅" logger.info( - "✏️ Wrote %s job to %s. %s Found" " %s errors", - operation, - filepath, - emoji, - len(errors), + f"✏️ Wrote {operation} job to {filepath}. 
{emoji} Found" + f" {len(errors)} errors" ) return result + + +def fhir_resource_ingest(dewrangle_study_id, dewrangle_file_ids): + """ + Start a Dewrangle job to ingest previously uploaded study files + containing FHIR resources. + + :param dewrangle_study_id: GraphQL ID of the Dewrangle study. + :type dewrangle_study_id: str + + :param dewrangle_file_ids: List of Dewrangle StudyFile input objects + to ingest. Each item must match the GraphQL type + `FhirResourceIngestFileInput`, typically: + [{"id": ""}] + :type dewrangle_file_ids: list[dict] + + :rtype: dict + :returns: Raw GraphQL response from the `fhirResourceIngest` mutation. + """ + params = { + "input": { + "studyId": dewrangle_study_id, + "studyFileIds": dewrangle_file_ids, + } + } + resp = exec_query(mutations.fhir_resource_ingest, variables=params) + + return resp diff --git a/d3b_api_client_cli/dewrangle/graphql/job/mutations.py b/d3b_api_client_cli/dewrangle/graphql/job/mutations.py index 74531e2..2c0d7eb 100644 --- a/d3b_api_client_cli/dewrangle/graphql/job/mutations.py +++ b/d3b_api_client_cli/dewrangle/graphql/job/mutations.py @@ -1,3 +1,25 @@ """ Dewrangle GraphQL mutation definitions """ + +from gql import gql + +fhir_resource_ingest = gql( + """ + mutation fhirResourceIngestMutation( + $input: FhirResourceIngestInput! + ) { + fhirResourceIngest(input: $input) { + job { + id + } + errors { + ... on MutationError { + message + field + } + } + } + } + """ +) diff --git a/d3b_api_client_cli/dewrangle/graphql/job/queries.py b/d3b_api_client_cli/dewrangle/graphql/job/queries.py index 4ab7093..62c18c5 100644 --- a/d3b_api_client_cli/dewrangle/graphql/job/queries.py +++ b/d3b_api_client_cli/dewrangle/graphql/job/queries.py @@ -4,13 +4,44 @@ from gql import gql -job = gql( +fhir_resource_ingest_job = gql( """ - query jobQuery($id: ID!) { + query fhirResourceIngestJobQuery($id: ID!) { + node(id: $id) { + id + ... on Job { + operation + completedAt + result { + ... 
on JobResultFhirResource { + resources { + count + resourceType + } + } + } + errors { + edges { + node { + id + name + message + } + } + } + + } + } + } + """ +) + +job_status_query = gql( + """ + query jobStatusQuery($id: ID!) { node(id: $id) { id ... on Job { - id operation completedAt errors { @@ -22,6 +53,7 @@ } } } + } } } diff --git a/d3b_api_client_cli/dewrangle/graphql/organization/__init__.py b/d3b_api_client_cli/dewrangle/graphql/organization/__init__.py index b836c60..26e94f1 100644 --- a/d3b_api_client_cli/dewrangle/graphql/organization/__init__.py +++ b/d3b_api_client_cli/dewrangle/graphql/organization/__init__.py @@ -6,6 +6,7 @@ import logging from pprint import pformat +from d3b_api_client_cli.config import DEWRANGLE_MAX_PAGE_SIZE from d3b_api_client_cli.dewrangle.graphql.common import ( exec_query, ) @@ -13,34 +14,34 @@ queries, mutations, ) -from d3b_api_client_cli.config import config +from d3b_api_client_cli.config import DEWRANGLE_DIR from d3b_api_client_cli.utils import write_json -DEWRANGLE_MAX_PAGE_SIZE = config["dewrangle"]["pagination"]["max_page_size"] -DEWRANGLE_DIR = config["dewrangle"]["output_dir"] logger = logging.getLogger(__name__) -def upsert_organization(variables: dict) -> dict: +def upsert_organization(variables): """ Upsert organization in Dewrangle - Args: - variables: Organization attributes (see Dewrangle graphql schema) + :param variables: Organization attributes (see Dewrangle graphql schema) + :type variables: dict + :rtype: dict + :returns: the organization """ params = {"input": variables} - # Check if this is an update or create orgs = read_organizations(log_output=False) - found_org = None + update = False for org in orgs: if org["name"] == variables["name"]: - found_org = org + update = True break - if found_org: + if update: key = "Update" - params.update({"id": found_org["id"]}) + params.update({"id": org["id"]}) + dwid = org["id"] resp = exec_query(mutations.update_organization, variables=params) else: key = 
"Create" @@ -48,33 +49,28 @@ def upsert_organization(variables: dict) -> dict: errors = resp.get(f"organization{key}", {}).get("errors") if errors: - logger.error("❌ %s organization failed:\n%s", key, pformat(resp)) + logger.warning(f"‼️ {key} organization failed:\n{pformat(resp)}") else: - logger.info("✅ %s organization succeeded:\n%s", key, pformat(resp)) + logger.warning(f"✅ {key} organization succeeded:\n{pformat(resp)}") result = resp[f"organization{key}"]["organization"] return result -def delete_organization( - dewrangle_org_id: str = None, - dewrangle_org_name: str = None, - delete_safety_check: bool = True, -) -> dict: +def delete_organization(dewrangle_org_id=None, dewrangle_org_name=None): """ - Delete organization in Dewrangle by graphql node ID or name - - Args: - dewrangle_org_id: Dewrangle node ID of the organization - dewrangle_org_name: Dewrangle name of organization - delete_safety_check: only delete if this is False - - Returns: - the response from Dewrangle + Delete organization in Dewrangle by node ID or name + + :param dewrangle_org_id: Dewrangle node ID of the organization + :type dewrangle_org_id: str + :param dewrangle_org_name: Dewrangle name of organization + :type dewrangle_org_name: str + :rtype: dict + :returns: the response """ if not (dewrangle_org_id or dewrangle_org_name): - raise ValueError( + raise Exception( "You must provide either the dewrangle_org_id or dewrangle_org_name" ) @@ -85,55 +81,48 @@ def delete_organization( else: node_id = dewrangle_org_id - resp = exec_query( - mutations.delete_organization, - variables={"id": node_id}, - delete_safety_check=delete_safety_check, - ) + resp = exec_query(mutations.delete_organization, variables={"id": node_id}) - key = "Delete" errors = resp.get("organizationDelete", {}).get("errors") if errors: - logger.error("❌ %s organization failed:\n%s", key, pformat(resp)) + result = errors + logger.warning(f"🚮 ‼️ Delete organization failed:\n{pformat(resp)}") else: - logger.info("✅ %s 
organization succeeded:\n%s", key, pformat(resp)) + logger.info(f"🚮 Deleted organization:\n{pformat(resp)}") result = resp["organizationDelete"]["organization"] result["id"] = node_id return result -def read_organizations( - output_dir: str = DEWRANGLE_DIR, log_output: bool = True -) -> list[dict]: +def read_organizations(output_dir=DEWRANGLE_DIR, log_output=True): """ Fetch organizations that the client has access to + + :rtype: dict + :returns: the organizations """ organizations = paginate_organizations() - logger.info("Fetched %s organizations", len(organizations)) + logger.info(f"Fetched {len(organizations)} organizations") if output_dir: os.makedirs(output_dir, exist_ok=True) filepath = os.path.join(output_dir, "Organization.json") write_json(organizations, filepath) - logger.info( - "✏️ Wrote %s organization to %s", len(organizations), filepath - ) + logger.info(f"✏️ Wrote {len(organizations)} organization to {filepath}") if log_output: - logger.info("👨‍👩‍👦 Organizations:\n%s", pformat(organizations)) + logger.info(f"👨‍👩‍👦 Organizations:\n{pformat(organizations)}") return organizations -def read_organization( - dewrangle_org_id: str = None, dewrangle_org_name: str = None -) -> dict: +def read_organization(dewrangle_org_name=None, dewrangle_org_id=None): """ Fetch Dewrangle organization by name """ if not (dewrangle_org_id or dewrangle_org_name): - raise ValueError( + raise Exception( "You must provide either the dewrangle_org_id or dewrangle_org_name" ) key = "id" if dewrangle_org_id else "name" @@ -149,9 +138,7 @@ def read_organization( return found_org -def paginate_organizations( - org_page_size: int = DEWRANGLE_MAX_PAGE_SIZE, -) -> list[dict]: +def paginate_organizations(org_page_size=DEWRANGLE_MAX_PAGE_SIZE): """ Fetch all organizations that the viewer has access to @@ -175,7 +162,7 @@ def paginate_organizations( has_next_page = False continue - logger.info("Collecting %s organizations", f"{count}/{total}") + logger.info(f"Collecting {count}/{total} 
organizations") for org_user in org_users: organizations.append(org_user["node"]["organization"]) @@ -191,7 +178,7 @@ def paginate_organizations( return organizations -def get_org_by_name(org_name: str) -> dict: +def get_org_by_name(org_name): """ Fetch organization from Dewrangle """ diff --git a/d3b_api_client_cli/dewrangle/graphql/organization/queries.py b/d3b_api_client_cli/dewrangle/graphql/organization/queries.py index b9aacc7..0abbfb8 100644 --- a/d3b_api_client_cli/dewrangle/graphql/organization/queries.py +++ b/d3b_api_client_cli/dewrangle/graphql/organization/queries.py @@ -27,6 +27,23 @@ description email website + fhirServers { + edges { + node { + id + name + url + type + authType + authConfig { + ... on FhirServerAuthConfigOIDCClientCredential { + issuerBaseUrl + clientId + } + } + } + } + } } } } diff --git a/d3b_api_client_cli/dewrangle/graphql/queries.py b/d3b_api_client_cli/dewrangle/graphql/queries.py new file mode 100644 index 0000000..5511d8d --- /dev/null +++ b/d3b_api_client_cli/dewrangle/graphql/queries.py @@ -0,0 +1,61 @@ +""" +Dewrangle GraphQL query definitions +""" + +from gql import gql + +viewer_query = gql( + """ + query($first: Int, $after: ID) { + viewer { + name + organizationUsers { + edges { + node { + organization { + id + name + description + email + website + fhirServers { + edges { + node { + id + name + } + } + } + studies(first: $first, after: $after) { + edges { + node + { + id + globalId + name + fhirServerDeployments { + id + fhirServer { + id + name + url + authType + authConfig { + ...on FhirServerAuthConfigOIDCClientCredential { + issuerBaseUrl + clientId + } + } + } + } + } + } + } + } + } + } + } + } +} + """ +) diff --git a/d3b_api_client_cli/dewrangle/graphql/study/__init__.py b/d3b_api_client_cli/dewrangle/graphql/study/__init__.py index 5ed1bec..521bf74 100644 --- a/d3b_api_client_cli/dewrangle/graphql/study/__init__.py +++ b/d3b_api_client_cli/dewrangle/graphql/study/__init__.py @@ -5,7 +5,6 @@ import os import 
logging from pprint import pformat -from typing import Optional import gql @@ -19,87 +18,95 @@ from d3b_api_client_cli.dewrangle.graphql.organization import ( paginate_organizations, ) -from d3b_api_client_cli.config import config +from d3b_api_client_cli.config import ( + DEWRANGLE_DIR, + IdTypes, + DEWRANGLE_MAX_PAGE_SIZE, +) from d3b_api_client_cli.utils import ( write_json, kf_id_to_global_id, global_id_to_kf_id, ) +from d3b_api_client_cli.dewrangle.poll_job import ( + poll_descriptor_upsert_job, +) logger = logging.getLogger(__name__) -DEWRANGLE_DIR = config["dewrangle"]["output_dir"] -DEWRANGLE_MAX_PAGE_SIZE = config["dewrangle"]["pagination"]["max_page_size"] - def upsert_global_descriptors( - study_file_id: str, skip_unavailable_descriptors: Optional[bool] = True -) -> dict: + dewrangle_study_id, study_file_id, skip_unavailable_descriptors=True +): """ Trigger the operation to upsert global descriptors in Dewrangle - Args: - - skip_unavailable_descriptors: If true any errors due to a descriptor + :param study_file_id: The Dewrangle ID returned from uploading a study + file to Dewrangle + :type study_file_id: str + :param skip_unavailable_descriptors: If true any errors due to a descriptor + already having a global ID assigned will be ignored + :type skip_unavailable_descriptors: boolean """ - logger.info( - "🛸 Upsert global descriptors for study file: %s", study_file_id - ) + logger.info(f"🛸 Upsert global descriptors for study file: {study_file_id}") variables = { "input": { - "studyFileId": study_file_id, + "studyId": dewrangle_study_id, + "studyFileIds": [{"id": study_file_id}], "skipUnavailableDescriptors": skip_unavailable_descriptors, } } - resp = exec_query(mutations.upsert_global_descriptors, variables=variables) + initial_resp = exec_query( + mutations.upsert_global_descriptors, variables=variables + ) + job_id = initial_resp["globalDescriptorUpsert"]["job"]["id"] - key = "globalDescriptorUpsert" - mutation_errors = resp.get(key, {}).get("errors") - 
job_errors = ( - resp.get(key, {}).get("job", {}).get("errors", {}).get("edges", []) + timeout_seconds = 60 + + poll_result = poll_descriptor_upsert_job( + job_id, + timeout_seconds=timeout_seconds, ) - if mutation_errors or job_errors: - logger.error("❌ %s for study failed", key) - if mutation_errors: - logger.error("❌ Mutation Errors:\n%s", pformat(mutation_errors)) - if job_errors: - logger.error("❌ Job Errors:\n%s", pformat(job_errors)) - else: - logger.info("✅ %s for study succeeded:\n%s", key, pformat(resp)) + is_complete_within_timeout = poll_result["status"] + final_job_state = poll_result["job"] - return resp + if not is_complete_within_timeout: + raise TimeoutError( + f"Job {job_id} did not complete within {timeout_seconds} seconds." + ) + + if final_job_state.get("errors", {}).get("totalCount", 0) > 0: + error_message = f"--- ❌ Job {job_id} failed on the server. ---" + raise RuntimeError(error_message) + return initial_resp -def upsert_study( - variables: dict, organization_id: str, study_id: str = None -) -> dict: + +def upsert_study(variables, organization_id, kf_study_id=None): """ Upsert study in Dewrangle - Arguments: - variables - Study attributes (see Dewrangle graphql schema) - organization_id - Dewrangle ID of organization - study_id - Kids First study ID or Dewrangle global ID + :param variables: Study attributes (see Dewrangle graphql schema) + :type variables: dict + :param organization_id: Dewrangle ID of organization + :type organization_id: str + :param kf_study_id: Kids First study ID + :type kf_study_id: str - Returns: - Dewrangle study dict + :rtype: dict + :returns: the study """ update = False - global_id = None - if study_id and study_id.startswith("SD_"): - global_id = kf_id_to_global_id(study_id) - - if not global_id: + if not kf_study_id: global_id = variables.get("globalId", "") + kf_study_id = global_id_to_kf_id(global_id) - study = None - if global_id: - study = find_study(global_id) - + study = get_study_by_kf_id(kf_study_id) 
if study: update = True if study["organization_id"] != organization_id: - raise ValueError( + raise Exception( "❌ This study is already part of another organization:" f" {study['organization_id']}. You cannot change its" " organization" @@ -121,9 +128,9 @@ def upsert_study( errors = resp.get(f"study{key}", {}).get("errors") if errors: - logger.error("❌ %s study failed:\n%s", key, pformat(resp)) + logger.warning(f"‼️ {key} study failed:\n{pformat(resp)}") else: - logger.info("✅ %s study succeeded:\n%s", key, pformat(resp)) + logger.info(f"✅ {key} study succeeded:\n{pformat(resp)}") result = resp[f"study{key}"]["study"] result["id"] = dwid @@ -132,63 +139,50 @@ def upsert_study( return result -def delete_study( - _id: str, - delete_safety_check: bool = True, -) -> dict: +def delete_study(_id, id_type=IdTypes.DEWRANGLE.value): """ Delete study in Dewrangle - Arguments: - _id - either a Kids First formatted ID or Dewrangle global ID - delete_safety_check - only delete if this is False - - Returns: - Response from Dewrangle + :param node_id: Dewrangle node ID of the study + :type node_id: str + :param id_type: The system/service this ID was generated by. One of + IdTypes + :type id_type: str + :rtype: dict + :returns: the response """ node_id = _id - if _id.startswith("SD_"): - study = find_study(kf_id_to_global_id(_id)) + if id_type == IdTypes.KIDS_FIRST.value: + study = get_study_by_kf_id(_id) node_id = study.get("id") if not node_id: logger.warning( "⚠️ Could not find associated dewrangle ID." 
- " Delete study %s ABORTED", - _id, + f" Delete study {_id} ABORTED" ) return - resp = exec_query( - mutations.delete_study, - variables={"id": node_id}, - delete_safety_check=delete_safety_check, - ) + resp = exec_query(mutations.delete_study, variables={"id": node_id}) errors = resp.get("studyDelete", {}).get("errors") - key = "Delete" if errors: result = errors - logger.error("❌ %s study failed:\n%s", key, pformat(resp)) + logger.warning(f"🚮 ‼️ Delete study failed:\n{pformat(resp)}") else: - logger.info("✅ %s study succeeded:\n%s", key, pformat(resp)) + logger.info(f"🚮 Deleted study:\n{pformat(resp)}") + result = resp["studyDelete"]["study"] result["id"] = node_id return result -def read_studies( - output_dir: str = DEWRANGLE_DIR, log_output: bool = True -) -> list[dict]: +def read_studies(output_dir=DEWRANGLE_DIR, log_output=True): """ Fetch studies that the client has access to - Arguments: - output_dir - directory where study metadata will be written - log_output - whether to log study dicts - - Returns: - List of study dicts + :rtype: dict + :returns: the studies """ data = paginate_studies() @@ -196,17 +190,20 @@ def read_studies( os.makedirs(output_dir, exist_ok=True) filepath = os.path.join(output_dir, "Study.json") write_json(data, filepath) - logger.info("✏️ Wrote %s study to %s", len(data), filepath) + logger.info(f"✏️ Wrote {len(data)} study to {filepath}") if log_output: - logger.info("🔬 Studies:\n%s", pformat(data)) + logger.info(f"🔬 Studies:\n{pformat(data)}") return data -def read_study(node_id: str) -> dict: +def read_study(node_id): """ Fetch study by node id + + :rtype: dict + :returns: the study """ variables = {"id": node_id} resp = exec_query(queries.study, variables=variables) @@ -214,12 +211,10 @@ def read_study(node_id: str) -> dict: if study: logger.info( - "🔎 Found Dewrangle study %s:\n%s", - study["globalId"], - pformat(study), + f"🔎 Found Dewrangle study {study['globalId']}:\n{pformat(study)}" ) else: - logger.error("❌ Not Found: 
dewrangle study %s", node_id) + logger.warning(f"❌ Not Found: dewrangle study {node_id}") return study @@ -255,9 +250,8 @@ def paginate_studies( has_next_page = False continue - logger.info("******* Organization %s *******", org["name"]) logger.info( - "Collecting %s/%s studies for org %s", count, total, org["name"] + f"Collecting {count}/{total} studies for org {org['name']}" ) # Add studies to ouput for s in org_studies: @@ -265,8 +259,7 @@ def paginate_studies( study["organization_id"] = org["id"] kf_id = global_id_to_kf_id(study["globalId"]) study["kf_id"] = kf_id - studies[study["globalId"]] = study - logger.info("Found study %s", study["globalId"]) + studies[kf_id] = study # Fetch next page if there is one has_next_page = page_info["hasNextPage"] @@ -278,39 +271,10 @@ def paginate_studies( return studies -def find_study(study_global_id: str) -> dict: +def get_study_by_kf_id(kf_study_id): """ - Find study using Dewrangle global ID. Use this when you don't know the - org ID + Fetch study from Dewrangle """ studies = paginate_studies() - return studies.get(study_global_id, {}) - - -def get_study_by_id(study_id: str, org_node_id: str) -> dict: - """ - Fetch study from Dewrangle by KF ID or global ID - """ - if study_id.startswith("SD_"): - study_id = kf_id_to_global_id(study_id) - resp = exec_query( - queries.study_by_global_id, - variables={"id": org_node_id, "filter": {"query": study_id}}, - ) - - errors = resp.get("studyQuery", {}).get("errors") - key = "Get" - if errors: - result = errors - logger.error("❌ %s study failed:\n%s", key, pformat(resp)) - else: - result = {} - logger.info("✅ %s study succeeded:\n%s", key, pformat(resp)) - edges = resp["studyQuery"]["node"]["studies"]["edges"] - if not edges: - logger.error("❌ Study %s not found!", study_id) - else: - result = edges[0]["node"] - - return result + return studies.get(kf_study_id, {}) diff --git a/d3b_api_client_cli/dewrangle/graphql/study/mutations.py 
b/d3b_api_client_cli/dewrangle/graphql/study/mutations.py index 48c96d3..e5222be 100644 --- a/d3b_api_client_cli/dewrangle/graphql/study/mutations.py +++ b/d3b_api_client_cli/dewrangle/graphql/study/mutations.py @@ -83,13 +83,11 @@ job { id completedAt - globalDescriptors { + descriptors { totalCount edges { node { descriptor - globalId - fhirResourceType } } } diff --git a/d3b_api_client_cli/dewrangle/graphql/study/queries.py b/d3b_api_client_cli/dewrangle/graphql/study/queries.py index 001c09d..c91da2b 100644 --- a/d3b_api_client_cli/dewrangle/graphql/study/queries.py +++ b/d3b_api_client_cli/dewrangle/graphql/study/queries.py @@ -4,6 +4,7 @@ from gql import gql + study = gql( """ query studyQuery($id: ID!) { @@ -17,25 +18,18 @@ name id } - } - } - } - """ -) - -study_by_global_id = gql( - """ - query studyQuery($id: ID!, $filter: StudyFilter!) { - node(id: $id) { - id - ... on Organization { - name - studies(filter: $filter) { - edges { - node { - id - name - globalId + fhirServerDeployments { + id + fhirServer { + id + name + url + authType + authConfig { + ...on FhirServerAuthConfigOIDCClientCredential { + issuerBaseUrl + clientId + } } } } @@ -65,24 +59,17 @@ id globalId name - studyFhirServers { - edges { - node { - id - ... on StudyFhirServer { - fhirServer { - id - name - url - type - authType - authConfig { - ... 
on FhirServerAuthConfigOIDCClientCredential { - issuerBaseUrl - clientId - } - } - } + fhirServerDeployments { + id + fhirServer { + id + name + url + authType + authConfig { + ...on FhirServerAuthConfigOIDCClientCredential { + issuerBaseUrl + clientId } } } diff --git a/d3b_api_client_cli/dewrangle/graphql/volume/__init__.py b/d3b_api_client_cli/dewrangle/graphql/volume/__init__.py deleted file mode 100644 index 5b0f184..0000000 --- a/d3b_api_client_cli/dewrangle/graphql/volume/__init__.py +++ /dev/null @@ -1,414 +0,0 @@ -""" -GraphQL methods to crud Volumes in Dewrangle -""" - -import os -import logging -from pprint import pformat, pprint -from collections import defaultdict - -import gql - -from d3b_api_client_cli.dewrangle.graphql.common import ( - exec_query, -) -from d3b_api_client_cli.dewrangle.graphql.study import ( - paginate_studies, - find_study, -) -from d3b_api_client_cli.dewrangle.graphql.volume import ( - queries, - mutations, -) -from d3b_api_client_cli.dewrangle.graphql.credential import find_credential -from d3b_api_client_cli.dewrangle.graphql.job import poll_job -from d3b_api_client_cli.config import config -from d3b_api_client_cli.utils import ( - write_json, -) - -logger = logging.getLogger(__name__) - -DEWRANGLE_DIR = config["dewrangle"]["output_dir"] -DEWRANGLE_MAX_PAGE_SIZE = config["dewrangle"]["pagination"]["max_page_size"] -DELIMITER = "::" -# Wait 30s between querying Dewrangle -POLL_LIST_AND_HASH_INTERVAL_SECS = 30 - - -def upsert_volume( - variables: dict, study_id=None, study_global_id=None, credential_key=None -) -> dict: - """ - Upsert volume in Dewrangle - - Arguments: - variables - Volume attributes (see Dewrangle graphql schema) - study_id - Graphql node ID of the credential's study - study_global_id - Global ID of credential's study - Returns: - Dewrangle volume dict - """ - if not (study_id or study_global_id): - raise ValueError( - "❌ Either the graphql node ID or global ID of the volume's" - " study must be provided to 
either create or update the volume" - ) - - # If no study id provided, try querying for it via global ID - if not study_id: - study_id = find_study(study_global_id).get("id") - - # If not credential provided, try querying for it - credential_id = variables.get("credentialId") - if not credential_id: - credential_id = find_credential(credential_key, study_id)["id"] - variables["credentialId"] = credential_id - - # Try finding existing volume - bucket = variables.get("name") - path_prefix = variables.get("pathPrefix") - if not (bucket and path_prefix): - volume = None - else: - volume = find_volume(bucket, path_prefix, study_id) - - params = {"input": variables} - - if volume: - key = "Update" - - params["input"] = {"credentialId": credential_id} - params.update({"id": volume["id"]}) - resp = exec_query(mutations.update_volume, variables=params) - else: - key = "Create" - params["input"].update({"studyId": study_id}) - params.pop("id", None) - resp = exec_query(mutations.create_volume, variables=params) - - errors = resp.get(f"volume{key}", {}).get("errors") - entity = "volume" - if errors: - logger.error("❌ %s %s failed:\n%s", key, entity, pformat(resp)) - result = errors - else: - logger.info("✅ %s %s succeeded:\n%s", key, entity, pformat(resp)) - - result = resp[f"{entity}{key}"][entity] - result["id"] = result["id"] - result["study_id"] = result["study"]["id"] - - return result - - -def delete_volume( - node_id: str = None, - bucket: str = None, - path_prefix: str = None, - study_global_id: str = None, - delete_safety_check: bool = True, -) -> dict: - """ - Delete volume in Dewrangle - - Arguments: - node_id - Dewrangle graphql node ID - bucket - S3 bucket name - path_prefix - Path in the S3 bucket - study_global_id - Global ID of credential's study - delete_safety_check - only delete if this is False - - Returns: - Response from Dewrangle - """ - if not (node_id or (bucket and study_global_id)): - raise ValueError( - "❌ You must provide either the volume graphql 
ID or" - " volume key and study global ID to look up the volume" - ) - if bucket: - study_id = find_study(study_global_id).get("id") - volume = find_volume(bucket, path_prefix, study_id) - node_id = volume.get("id") - if not node_id: - logger.warning( - "⚠️ Could not find associated dewrangle ID." - " Delete volume %s ABORTED", - node_id, - ) - return - - resp = exec_query( - mutations.delete_volume, - variables={"id": node_id}, - delete_safety_check=delete_safety_check, - ) - - errors = resp.get("volumeDelete", {}).get("errors") - key = "Delete" - if errors: - result = errors - logger.error("❌ %s volume failed:\n%s", key, pformat(resp)) - else: - logger.info("✅ %s volume succeeded:\n%s", key, pformat(resp)) - result = resp["volumeDelete"]["volume"] - result["id"] = node_id - - return result - - -def read_volume(node_id: str) -> dict: - """ - Fetch volume by node id - """ - variables = {"id": node_id} - resp = exec_query(queries.volume, variables=variables) - volume = resp.get("node", {}) - - if volume: - logger.info( - "🔎 Found Dewrangle volume %s:\n%s", - volume["globalId"], - pformat(volume), - ) - else: - logger.error("❌ Not Found: dewrangle volume %s", node_id) - - return volume - - -def read_volumes( - study_global_id=None, - output_dir: str = DEWRANGLE_DIR, - log_output: bool = True, -) -> list[dict]: - """ - Fetch volumes that the client has access to - - Arguments: - study_global_id - Global ID of volume's study - output_dir - directory where study metadata will be written - log_output - whether to log study dicts - - Returns: - List of volume dicts - """ - study_id = None - if study_global_id: - study_id = find_study(study_global_id).get("id") - - data = paginate_volumes(study_id=study_id) - - if output_dir: - os.makedirs(output_dir, exist_ok=True) - filepath = os.path.join(output_dir, "Volume.json") - write_json(data, filepath) - logger.info("✏️ Wrote %s volume to %s", len(data), filepath) - - if log_output: - logger.info("🔐 Volumes:\n%s", pformat(data)) 
- - return data - - -def _volume_key(bucket: str, path_prefix: str) -> str: - """ - Helper method to create a dict key for the output of paginate_volumes - - This key uniquely identifies a volume among volumes within a study - """ - if path_prefix is None: - path_prefix = "" - return f"{bucket}{DELIMITER}{path_prefix}" - - -def _store_volumes( - study_volumes: list[dict], volumes: dict, study: dict -) -> dict: - """ - Storage page of volumes into volumes dict - """ - for s in study_volumes: - volume = s["node"] - volume["study_id"] = study["id"] - volume["study_global_id"] = study["globalId"] - - bucket = volume["name"] - path_prefix = volume["pathPrefix"] - volumes[_volume_key(bucket, path_prefix)][study["id"]] = volume - - return volumes - - -def paginate_volumes( - studies=None, study_id=None, dewrangle_page_size=DEWRANGLE_MAX_PAGE_SIZE -) -> dict: - """ - Fetch all volumes in all studies that the viewer has access to - - Optionally filter volumes by study. Uses Relay graphql pagination - - Returns: - dict that looks like this - { - "bucket1::/path/to/myfile": { - "study1": { - - }, - "study2": { - - } - }, - "bucket1::/path/to/another": ..., - "bucket2::": ... 
- } - """ - if not studies: - studies = paginate_studies() - - logger.info("📄 Paginating Dewrangle volumes ...") - - volumes = defaultdict(lambda: defaultdict(dict)) - for study in studies.values(): - if study_id and (study.get("id") != study_id): - continue - variables = {"first": dewrangle_page_size, "id": study["id"]} - resp = exec_query(queries.study_volumes, variables=variables) - - count = 0 - has_next_page = True - while has_next_page: - study_volumes = resp["node"]["volumes"]["edges"] - page_info = resp["node"]["volumes"]["pageInfo"] - - count += len(study_volumes) - total = resp["node"]["volumes"]["totalCount"] - - if not total: - has_next_page = False - continue - - logger.info( - "Collecting %s volumes for study %s", - count / total, - study["name"], - ) - # Add volumes in output - volumes = _store_volumes(study_volumes, volumes, study) - - # Fetch next page if there is one - has_next_page = page_info["hasNextPage"] - end_cursor = page_info["endCursor"] - if has_next_page and end_cursor: - variables.update({"after": end_cursor}) - resp = exec_query(queries.study_volumes, variables=variables) - - return dict(volumes) - - -def find_volume(bucket: str, path_prefix: str, study_id: str) -> dict: - """ - Find volume using S3 bucket name, path prefix, and study id. 
- """ - volumes = paginate_volumes(study_id=study_id) - return volumes.get(_volume_key(bucket, path_prefix), {}).get(study_id, {}) - - -def list_and_hash( - billing_group_id: str, - volume_id: str = None, - bucket: str = None, - path_prefix: str = None, - study_global_id: str = None, -) -> dict: - """ - Trigger a list and hash volume job in Dewrangle - - Arguments: - billing_group_id - Dewrangle graphql ID of billing group - volume_id - Dewrangle graphql ID of volume - bucket - S3 bucket name - path_prefix - Path in the S3 bucket - study_global_id - Global ID of volume's study - - Returns: - Job from Dewrangle - """ - key = "ListAndHash" - - if not billing_group_id: - raise ValueError( - "❌ Billing group ID is missing and required to hash a volume!" - ) - - if not (volume_id or (bucket and study_global_id)): - raise ValueError( - "❌ You must provide either the volume graphql ID or" - " volume name and study global ID to look up the volume" - ) - # Try querying for volume - if not volume_id: - study_id = find_study(study_global_id).get("id") - if not study_id: - raise ValueError( - "❌ Could not find associated dewrangle ID for " - f" study with ID {study_global_id}." - ) - - volume_id = find_volume(bucket, path_prefix, study_id).get("id") - - if not volume_id: - raise ValueError( - "❌ Could not find associated dewrangle ID for " - f" {key} volume with ID {volume_id}." 
- ) - - variables = { - "input": { - "billingGroupId": billing_group_id, - }, - "id": volume_id, - } - resp = exec_query(mutations.list_and_hash, variables=variables) - - errors = resp.get(f"volume{key}", {}).get("errors") - if errors: - result = errors - logger.error("❌ %s volume failed:\n%s", key, pformat(resp)) - else: - logger.info("✅ %s volume succeeded:\n%s", key, pformat(resp)) - result = resp[f"volume{key}"]["job"] - - return result - - -def hash_and_wait( - billing_group_id: str, - volume_id: str, - bucket: str = None, - path_prefix: str = None, - study_global_id: str = None, -): - """ - Trigger a list and hash volume job and poll for job status until the - job is complete or fails - - Arguments: - billing_group_id - Dewrangle graphql ID of billing group - volume_id - Dewrangle graphql ID of volume - bucket - S3 bucket name - path_prefix - Path in the S3 bucket - study_global_id - Global ID of volume's study - """ - # List and hash volume - job = list_and_hash( - volume_id=volume_id, - billing_group_id=billing_group_id, - bucket=bucket, - path_prefix=path_prefix, - study_global_id=study_global_id, - ) - return poll_job( - job["id"], interval_seconds=POLL_LIST_AND_HASH_INTERVAL_SECS - ) diff --git a/d3b_api_client_cli/dewrangle/graphql/volume/mutations.py b/d3b_api_client_cli/dewrangle/graphql/volume/mutations.py deleted file mode 100644 index 9a98262..0000000 --- a/d3b_api_client_cli/dewrangle/graphql/volume/mutations.py +++ /dev/null @@ -1,119 +0,0 @@ -""" -Dewrangle GraphQL mutation definitions -""" - -from gql import gql - - -create_volume = gql( - """ - mutation volumeCreateMutation($input: VolumeCreateInput!) { - volumeCreate(input: $input) { - errors { - ... on MutationError { - __typename - message - field - } - } - volume { - id - name - region - type - pathPrefix - study { - id - globalId - } - credential { - id - type - key - } - } - } - } - """ -) -update_volume = gql( - """ - mutation volumeUpdateMutation($id: ID!, $input: VolumeUpdateInput!) 
{ - volumeUpdate(id: $id, input: $input) { - errors { - ... on MutationError { - __typename - message - field - } - } - volume { - id - name - region - type - pathPrefix - study { - id - globalId - } - credential { - id - type - key - } - } - } - } - """ -) - -delete_volume = gql( - """ - mutation volumeDeleteMutation($id: ID!) { - volumeDelete(id: $id) { - errors { - ... on MutationError { - __typename - message - field - } - } - volume { - id - name - region - type - pathPrefix - } - } - } - """ -) - -list_and_hash = gql( - """ - mutation volumeListAndHashMutation( - $id: ID! - $input: VolumeListAndHashInput! - ) { - volumeListAndHash(id: $id, input: $input) { - errors { - ... on MutationError { - __typename - message - field - } - } - job { - id - temporalWorkflowId - operation - target { - id - } - } - } - } - """ -) diff --git a/d3b_api_client_cli/dewrangle/graphql/volume/queries.py b/d3b_api_client_cli/dewrangle/graphql/volume/queries.py deleted file mode 100644 index 008d75e..0000000 --- a/d3b_api_client_cli/dewrangle/graphql/volume/queries.py +++ /dev/null @@ -1,73 +0,0 @@ -""" -Dewrangle GraphQL query definitions -""" - -from gql import gql - -volume = gql( - """ - query volumeQuery($id: ID!) { - node(id: $id) { - id - ... on Volume { - id - name - region - type - pathPrefix - study { - id - globalId - } - credential { - id - type - key - } - } - } - } - """ -) - -study_volumes = gql( - """ - query studyVolumes($id: ID!, $first: Int, $after: ID) { - node(id: $id) { - id - ... 
on Study { - name - id - globalId - volumes(first: $first, after: $after) { - totalCount - pageInfo { - hasNextPage - endCursor - } - edges { - cursor - node - { - id - name - region - type - pathPrefix - study { - id - globalId - } - credential { - id - type - key - } - } - } - } - } - } - } - """ -) diff --git a/d3b_api_client_cli/dewrangle/ingest.py b/d3b_api_client_cli/dewrangle/ingest.py new file mode 100644 index 0000000..0377cfa --- /dev/null +++ b/d3b_api_client_cli/dewrangle/ingest.py @@ -0,0 +1,185 @@ +""" +Ingest FHIR data into Dewrangle + +Single File Ingest +------------------ +- Upload study file containing FHIR resources to Dewrangle study files endpoint +- Start Dewrangle FHIR resource ingest job for the file + +Study Ingest +------------ +- Start job to ingest all study files into Dewrangle given a KF study ID +""" + +import os +import logging +from pprint import pformat + +from d3b_api_client_cli.utils import read_json, kf_id_to_global_id +from d3b_api_client_cli.config import ( + config, + KidsFirstFhirEntity, + SKIP_ENTITIES, +) +from d3b_api_client_cli.dewrangle import graphql as gql_client +from d3b_api_client_cli.dewrangle.graphql.study import ( + get_study_by_kf_id, +) +from d3b_api_client_cli.dewrangle.rest import upload_study_file +from d3b_api_client_cli.dewrangle.poll_job import poll_fhir_ingest_job + +logger = logging.getLogger(__name__) +config = config["dewrangle"] +valid_kids_first_fhir_types = set([et.value for et in KidsFirstFhirEntity]) + + +def upload_and_ingest_study_file(kf_study_id, filepath): + """ + Upload study file to Dewrangle and start FHIR resource ingest job + + * For dev and debugging purposes * + + :param kf_study_id: the ID of the study in Dewrangle + :type kf_study_id: str + :param filepath: the path of the file to ingest into Dewrangle + :type filepath: str + + :rtype: dict + :returns: GraphQL response from ingest mutation + """ + # Upload study file to Dewrangle study + dewrangle_study = 
def ingest_study_files(
    study_data_dir_or_file, dewrangle_study_node_id=None, entities_to_load=None
):
    """
    Start job to ingest a study's FHIR resource files into Dewrangle

    - Get study in Dewrangle either by KF ID lookup or direct lookup via
      the GraphQL node ID `dewrangle_study_node_id`
    - Upload the study's FHIR resource files to the study file endpoint
    - Start job to ingest the uploaded files and poll it for completion

    :param study_data_dir_or_file: the path to the file or dir of files to
    ingest. A Study.json file is always read from the directory (the file's
    parent directory when a file is given) to get the study's KF ID
    :type study_data_dir_or_file: str
    :param dewrangle_study_node_id: the GraphQL node ID of the Dewrangle
    study. When not provided the study is looked up by its KF ID
    :type dewrangle_study_node_id: str
    :param entities_to_load: names of the entity types to ingest. When empty
    or None, all KidsFirstFhirEntity values are used. Entities in
    SKIP_ENTITIES are always excluded
    :type entities_to_load: iterable of str

    :rtype: dict or None
    :returns: None if the study cannot be found in Dewrangle, the response
    of the FHIR resource ingest mutation if it reported errors, otherwise
    the result of polling the ingest job
    """
    if not entities_to_load:
        entities_to_load = valid_kids_first_fhir_types
    else:
        entities_to_load = set(entities_to_load)

    # NOTE - This is temporary until Dewrangle is able to support having
    # these types in multiple studies
    entities_to_load = entities_to_load - SKIP_ENTITIES

    # Determine if file or dir
    ingest_one_file = False
    study_data_dir = study_data_dir_or_file
    if os.path.isfile(study_data_dir_or_file):
        study_data_dir = os.path.dirname(study_data_dir_or_file)
        ingest_one_file = True

    # Get study from Dewrangle
    study_filepath = os.path.join(study_data_dir, "Study.json")
    study_params = read_json(study_filepath)
    kf_study_id = study_params["kf_id"]

    if dewrangle_study_node_id:
        dewrangle_study = gql_client.read_study(dewrangle_study_node_id)
    else:
        dewrangle_study = get_study_by_kf_id(kf_study_id)

    if not dewrangle_study:
        logger.error(
            f"‼️ Study {kf_study_id} does not exist in Dewrangle."
            " Please run the 'dwds dewrangle setup-dewrangle-study' command"
            " to create and setup your study in Dewrangle"
        )
        return
    dewrangle_study_id = dewrangle_study["id"]

    # Configure file upload endpoint (used for logging only; the upload
    # itself is done by upload_study_file)
    base_url = config["base_url"]
    endpoint = config["endpoints"]["rest"]["study_file"]
    logger.info(f"🛸 Starting upload of study files to {base_url}/{endpoint}")

    # Upload FHIR resource files
    entities = config["ingest"]
    to_ingest = []
    for kf_entity in entities:
        if kf_entity not in entities_to_load:
            # The leading space on the second literal keeps the two words
            # from running together in the logged message
            logger.info(
                f"⏭️ Skip {kf_entity}. User did not include it in"
                " entities_to_load"
            )
            continue

        if kf_entity == "sequencing_center":
            continue

        filepath = os.path.join(study_data_dir, f"{kf_entity}.json")
        # In single file mode only the file the user pointed at is uploaded
        if ingest_one_file and (filepath != study_data_dir_or_file):
            continue

        if not os.path.exists(filepath):
            logger.info(f"ℹ️ Skipping ingest of {kf_entity}. No data exists")
            continue

        resp = upload_study_file(dewrangle_study_id, filepath)
        to_ingest.append({"id": resp["id"]})

    # Submit request to ingest FHIR resource files into Dewrangle
    resp = gql_client.fhir_resource_ingest(dewrangle_study_id, to_ingest)
    logger.info(
        f"✅ Submitted request to start FHIR resource ingest for"
        f" {study_data_dir_or_file}:\n{pformat(resp)}"
    )
    # Check on job status
    job = resp["fhirResourceIngest"]
    errors = job.get("errors")
    if errors:
        logger.warning(f"⚠️ Dewrangle error:\n{pformat(errors)}")
        return resp

    # Return job status
    node_id = job["job"]["id"]
    result = poll_fhir_ingest_job(node_id)

    return result
def _poll_job(
    job_id,
    job_query,
    complete_function,
    timeout_seconds=None,
    interval_seconds=DEFAULT_POLL_INTERVAL,
):
    """
    Poll for status on a Dewrangle job. If timeout is not set poll until job
    is complete. If timeout is set, poll until job is complete or timeout
    expires

    :param job_id: Dewrangle node ID of the job
    :type job_id: str
    :param job_query: A GraphQL query to fetch the job
    :type job_query: gql
    :param complete_function: A method which determines when the job is
    complete and if it succeeded. This method will take in a dict containing
    the output of the graphql query and it must return a dict containing the
    following: { "complete": boolean, "success": boolean }
    :type complete_function: A Python function
    :param timeout_seconds: max seconds to keep polling, None means no limit
    :type timeout_seconds: int, float or None
    :param interval_seconds: seconds to sleep between polls
    :type interval_seconds: int or float

    :rtype: dict
    :returns: { "status": boolean, "job": dict } where "job" is the last
    fetched job node. "status" is True when the job completed and False
    when the timeout expired before completion

    NOTE(review): "status" is True for a job that completed WITH errors too;
    success is only reflected in the log message. Confirm that callers
    inspect the returned job for errors
    """
    elapsed_time_seconds = 0
    start_time = time.time()

    while True:
        # Fetch job
        params = {"id": job_id}
        resp = exec_query(job_query, variables=params)

        job = resp["node"]
        node_id = job["id"]
        operation = job["operation"].lower().replace("_", "-")

        # Check completion status
        status = complete_function(resp)
        _validate_status_format(status)

        # Job completed
        if status["complete"]:
            success = status["success"]
            emoji = "✅ " if success else "‼️ "
            suffix = "" if success else " with errors"
            logger.info(
                f"{emoji} Job {operation} {node_id}"
                f" completed{suffix}:\n{pformat(job)}"
            )

            return {"status": status["complete"], "job": job}

        elapsed_time_seconds = time.time() - start_time
        t = time.strftime("%H:%M:%S", time.gmtime(elapsed_time_seconds))

        # Timeout exceeded
        if (timeout_seconds is not None) and (
            elapsed_time_seconds > timeout_seconds
        ):
            logger.warning(
                f"⚠️ Timeout of {timeout_seconds} seconds expired."
                f" Current job {operation} {node_id} result:\n{pformat(job)}"
                f"\n✌️ Dewrangle must still be working ... but CLI is moving "
                "on"
            )
            return {"status": False, "job": job}

        # Continue polling
        logger.info(
            f"⏰ Waiting for job {operation} {node_id} to"
            f" complete. Elapsed time (hh:mm:ss): {t}"
        )

        time.sleep(interval_seconds)
def request_global_ids(
    kf_study_id,
    filepath,
    content_type=CSV_CONTENT_TYPE,
    skip_unavailable_descriptors=True,
):
    """
    Request global IDs from Dewrangle for the given FHIR resources

    This happens in two steps:
    1. Upload the global descriptor csv file to the study file endpoint
    2. Invoke the graphQL mutation to upsert global descriptors

    :param kf_study_id: Kids First ID of the study in Dataservice
    :type kf_study_id: str
    :param filepath: path to the global descriptors csv file
    :type filepath: str
    :param content_type: value of the Content-Type header to use in the
    upload file request
    :type content_type: str
    :param skip_unavailable_descriptors: If true any errors due to a descriptor
    already having a global ID assigned will be ignored
    :type skip_unavailable_descriptors: boolean

    :rtype: dict
    :returns: the "globalDescriptorUpsert" part of the mutation response
    :raises Exception: if the study cannot be found in Dewrangle
    """
    logger.info(f"🛸 Fetch global IDs from Dewrangle for {kf_study_id} ...")
    dewrangle_study = get_study_by_kf_id(kf_study_id)
    if not dewrangle_study:
        raise Exception(
            f"‼️ Study {kf_study_id} does not exist in Dewrangle."
            " Please run the 'dwds dewrangle setup-dewrangle-study' command"
            " to create and setup your study in Dewrangle"
        )
    dewrangle_study_id = dewrangle_study["id"]

    filepath = os.path.abspath(filepath)
    base_url = config["dewrangle"]["base_url"]
    endpoint_template = config["dewrangle"]["endpoints"]["rest"]["study_file"]
    endpoint = endpoint_template.format(
        dewrangle_study_id=dewrangle_study_id,
        filename=os.path.split(filepath)[-1],
    )
    # Join with exactly one slash so that a base URL with a trailing slash
    # does not produce "//" in the request URL
    base = base_url.rstrip("/")
    path = endpoint.lstrip("/")
    url = f"{base}/{path}"

    logger.info(f"🛸 POST global IDs request file {filepath} to Dewrangle")
    with open(filepath, "rb") as request_file:
        headers = {
            "x-api-key": DEWRANGLE_DEV_PAT,
            "Content-Type": content_type,
        }
        resp = send_request(
            "post",
            url,
            headers=headers,
            data=request_file,
        )
    result = resp.json()
    study_file_id = result["id"]

    # Trigger global descriptor upsert mutation
    resp = upsert_global_descriptors(
        dewrangle_study_id,
        study_file_id,
        skip_unavailable_descriptors=skip_unavailable_descriptors,
    )
    result = resp["globalDescriptorUpsert"]
    job_id = result["job"]["id"]

    logger.info(
        f"✅ Completed request to generate global IDs. Job ID: {job_id}"
    )

    return result
def download_job_errors(job_id, filepath):
    """
    Download the errors of a Dewrangle job and write them to a file

    :param job_id: A Dewrangle generated ID of the job that has errors
    :type job_id: str
    :param filepath: Path to the file where errors will be written
    :type filepath: str

    :rtype: str
    :returns: the path of the file the errors were written to
    """
    base_url = config["dewrangle"]["base_url"]
    endpoint_template = config["dewrangle"]["endpoints"]["rest"]["job_errors"]
    endpoint = endpoint_template.format(job_id=job_id)
    # Join with exactly one slash so that a base URL with a trailing slash
    # does not produce "//" in the request URL
    base = base_url.rstrip("/")
    path = endpoint.lstrip("/")
    url = f"{base}/{path}"

    logger.info(f"🛸 Start downloading job {job_id} errors from Dewrangle ...")

    headers = {
        "x-api-key": DEWRANGLE_DEV_PAT,
    }
    # Make the request before opening the output file so that a failed
    # request does not leave behind an empty (or truncated) file
    resp = send_request(
        "get",
        url,
        headers=headers,
        # Set timeout to infinity so that downloads don't timeout
        timeout=-1,
    )

    with open(filepath, "wb") as csvfile:
        csvfile.write(resp.content)

    logger.info(f"✅ Completed download job errors: {filepath}")
    logger.info(resp.url)

    return filepath
+ return filepath diff --git a/d3b_api_client_cli/dewrangle/rest/files.py b/d3b_api_client_cli/dewrangle/rest/files.py index eb082eb..4bbcd9a 100644 --- a/d3b_api_client_cli/dewrangle/rest/files.py +++ b/d3b_api_client_cli/dewrangle/rest/files.py @@ -6,7 +6,8 @@ from pprint import pformat, pprint import logging import os -import cgi +from email.message import Message +from urllib.parse import unquote from d3b_api_client_cli.config import ( @@ -24,13 +25,38 @@ DEFAULT_FILENAME = f"dewrangle-file-{timestamp()}.csv" -def _filename_from_headers(headers: dict) -> str: +def _filename_from_headers(headers: dict) -> str | None: """ - Helper to get the filename from the Content-Disposition - header of an HTTP response + Helper to get the filename from the Content-Disposition header. + + Supports both: + - filename="foo.csv" + - filename*=UTF-8''foo%20bar.csv (RFC 5987) """ - _, params = cgi.parse_header(headers["Content-Disposition"]) - return params.get("filename") + cd = headers.get("Content-Disposition") + if not cd: + return None + + msg = Message() + msg["content-disposition"] = cd + + # email.Message.get_param handles quoted values + filename = msg.get_param("filename", header="content-disposition") + if filename: + return filename + + # RFC 5987: filename*=charset''urlencoded + filename_star = msg.get_param("filename*", header="content-disposition") + if not filename_star: + return None + + # Example: UTF-8''foo%20bar.csv + try: + _, encoded = filename_star.split("''", 1) + except ValueError: + encoded = filename_star + + return unquote(encoded) def upload_file(url: str, filepath: str, params: Optional[dict] = None): diff --git a/d3b_api_client_cli/dewrangle/setup.py b/d3b_api_client_cli/dewrangle/setup.py new file mode 100644 index 0000000..ab99e9c --- /dev/null +++ b/d3b_api_client_cli/dewrangle/setup.py @@ -0,0 +1,273 @@ +""" +Script to initialize the Kids First organization with all studies in Dewrangle + +- Upsert organization into Dewrangle +- Create FHIR 
def get_studies(kf_id=None, only_visible=True):
    """
    Get all studies or one study by kf_id from db

    When fetching all studies, the KF IDs of the fetched studies are also
    written to dataservice_studies.json in ROOT_DATA_DIR

    :param kf_id: if provided, fetch only the study with this KF ID
    :type kf_id: str
    :param only_visible: if true, only fetch studies with visible=true
    :type only_visible: boolean

    :rtype: dict or list of dicts
    :returns: the study row if kf_id is provided, otherwise all study rows
    :raises psycopg2.OperationalError: if the db connection fails
    :raises Exception: if kf_id is provided and no matching study is found
    """
    from contextlib import closing

    try:
        # psycopg2's `with connection` only ends the transaction, it does
        # not close the connection. closing() makes sure it gets released
        with closing(
            psycopg2.connect(
                dbname=DATASERVICE_DB_NAME,
                user=DATASERVICE_DB_ADMIN_USER,
                password=DATASERVICE_DB_ADMIN_PW,
                host=DATASERVICE_DB_HOST,
                port=DATASERVICE_DB_PORT,
            )
        ) as conn:
            with conn.cursor(
                cursor_factory=psycopg2.extras.RealDictCursor
            ) as cursor:
                if kf_id:
                    if only_visible:
                        suffix = " and study.visible=true"
                    else:
                        suffix = ""
                    # kf_id is passed as a query parameter, only the fixed
                    # visibility suffix is concatenated into the SQL
                    cursor.execute(
                        "select * from study where kf_id= %s" + suffix + ";",
                        (kf_id,),
                    )
                else:
                    if only_visible:
                        suffix = " where study.visible=true"
                    else:
                        suffix = ""

                    cursor.execute("select * from study" + suffix + ";")

                rows = cursor.fetchall()
    except psycopg2.OperationalError:
        logger.error(
            "❌ Could not fetch study(ies) due to failure to connect to db:"
            f" {DATASERVICE_DB_HOST}:{DATASERVICE_DB_PORT}/"
            f"{DATASERVICE_DB_NAME}"
            " Check your connection details in the environment"
        )
        raise

    # Single study: return the row itself, nothing is logged or written
    if kf_id:
        if len(rows) == 0:
            raise Exception(
                f"‼️ Could not find study {kf_id} in Dataservice DB"
                f" {DATASERVICE_DB_HOST}:{DATASERVICE_DB_PORT}/"
                f"{DATASERVICE_DB_NAME}. Check study visibility!"
            )
        return rows[0]
    what = "studies"

    logger.info(
        f"🛸 Exported {what} from {DATASERVICE_DB_HOST}:{DATASERVICE_DB_PORT}"
    )

    fp = os.path.join(ROOT_DATA_DIR, "dataservice_studies.json")
    utils.write_json([row["kf_id"] for row in rows], fp)

    logger.info(f"✏️ Wrote exported studies to {fp}")

    return rows
def setup_all_studies(
    fhir_servers, study_ids=None, dewrangle_org_name=None, dewrangle_org_id=None
):
    """
    Run the Dewrangle study setup for the Kids First Dataservice studies
    within an org in Dewrangle that's already setup with FHIR servers

    :param fhir_servers: FHIR servers passed through to each study setup
    :param study_ids: if provided, only studies with these KF IDs are setup
    :param dewrangle_org_name: name of the org, used to look up the org
    when dewrangle_org_id is not provided
    :param dewrangle_org_id: ID of the org in Dewrangle

    :rtype: list
    :returns: the result of setup_dewrangle_study for each study setup
    """
    if not (dewrangle_org_id or dewrangle_org_name):
        raise Exception(
            "You must provide either the dewrangle_org_id or dewrangle_org_name"
        )

    # Only look the org up by name when its ID was not handed in
    if not dewrangle_org_id:
        org = gql_client.read_organization(
            dewrangle_org_name=dewrangle_org_name
        )
        if not org:
            raise Exception(
                f"‼️ Cannot find org ID for '{dewrangle_org_name}'. Aborting "
                "study setup"
            )
        dewrangle_org_id = org["id"]

    # An empty/None study_ids means no filtering: every study is setup
    selected = set(study_ids) if study_ids else None

    complete = []
    # The index counts all fetched studies, including the filtered out ones
    for i, study in enumerate(get_studies()):
        if selected and (study["kf_id"] not in selected):
            continue

        logger.info(f"🛠️ Setting up study {i}: {study['kf_id']} ...")
        complete.append(
            setup_dewrangle_study(
                study["kf_id"],
                fhir_servers,
                upsert_fhir_servers=False,
                dewrangle_org_id=dewrangle_org_id,
            )
        )
    return complete
def build_entities(
    source_dir, dest_dir=None, kf_fhir_entity_types=None, cleanup=False
):
    """
    Build Kids First FHIR resources

    If kf_fhir_entity_types=None, produce all entities. Otherwise, only
    produce the entities specified by kf_fhir_entity_types

    The built resources are written to one JSON file per entity type in
    dest_dir, along with a copy of the study's Study.json

    :param source_dir: dir where tables will be read from. Must contain a
    Study.json file with the study's kf_id and parent_study_id
    :type source_dir: str
    :param dest_dir: dir where the FHIR JSON will be written. Defaults to
    a dir named after the study's KF ID within FHIR_JSON_DIR
    :type dest_dir: str
    :param kf_fhir_entity_types: if not None, only build these entity types
    :type kf_fhir_entity_types: iterable of str
    :param cleanup: if true, delete dest_dir and its contents before
    building
    :type cleanup: boolean

    :returns: None. Results are written to files in dest_dir
    """
    if not kf_fhir_entity_types:
        entities_to_build = valid_kids_first_fhir_types
    else:
        entities_to_build = set(kf_fhir_entity_types)

    # Get study id
    fp = os.path.join(source_dir, "Study.json")
    study = utils.read_json(fp)
    study_id = study["kf_id"]
    parent_study_id = study["parent_study_id"]

    # Resolve the output dir before cleanup so that the default dir is
    # cleaned up too (rmtree on an unresolved None path removes nothing)
    if not dest_dir:
        dest_dir = os.path.join(FHIR_JSON_DIR, study_id)

    if cleanup:
        logger.info(f"🚮 Removing old fhir directory: {dest_dir}")
        shutil.rmtree(dest_dir, ignore_errors=True)

    # Setup output dir
    os.makedirs(dest_dir, exist_ok=True)

    # Copy study definition file to output
    src = fp
    dst = os.path.join(dest_dir, "Study.json")
    shutil.copyfile(src, dst)

    builders = import_builders()
    logger.info(f"Found builders: {pformat(builders.keys())}")

    counts = {}
    # For each fhir entity, import the entity builder
    for entity_type, builder_class in builders.items():
        if entity_type not in entities_to_build:
            logger.info(
                f"⏭️ Skip building {entity_type}. User did not include it in"
                f" input options: {pformat(kf_fhir_entity_types)}"
            )
            continue

        # Build the target entities for FHIR
        builder = builder_class(study_id, parent_study_id=parent_study_id)
        try:
            resources = builder.build(source_dir, dest_dir)
        except FileNotFoundError as e:
            # Missing input data only skips this entity type, the build
            # continues with the remaining types
            logger.error(
                f"‼️ Error building {entity_type}. Missing data files:"
                f" {str(e)}"
            )
            continue

        counts[entity_type] = len(resources)

        if len(resources):
            # Write entities to file
            out_fp = os.path.join(dest_dir, f"{entity_type}.json")
            utils.write_json(resources, out_fp)
            parts = os.path.split(out_fp)

            logger.info(f"✏️ Wrote {parts[-1]} to {parts[0]}")

    logger.info(
        f"✅ Completed building FHIR JSON for {study_id}:\n{pformat(counts)}"
    )
+"""
+
+
+class COMMON:  # generic values, several of which mark missing data
+    CANNOT_COLLECT = "Not Allowed To Collect"
+    MULTIPLE = "Multiple"
+    NOT_APPLICABLE = "Not Applicable"
+    NOT_AVAILABLE = "Not Available"
+    NOT_REPORTED = "Not Reported"
+    NO_MATCH = "No Match"
+    NOT_ABLE_TO_PROVIDE = "Not Able to Provide"
+    OTHER = "Other"
+    UNKNOWN = "Reported Unknown"
+    TRUE = "True"
+    FALSE = "False"
+
+
+MISSING_DATA_VALUES = {  # every COMMON value except MULTIPLE, TRUE, FALSE
+    COMMON.CANNOT_COLLECT,
+    COMMON.NO_MATCH,
+    COMMON.NOT_ABLE_TO_PROVIDE,
+    COMMON.NOT_AVAILABLE,
+    COMMON.NOT_APPLICABLE,
+    COMMON.NOT_REPORTED,
+    COMMON.OTHER,
+    COMMON.UNKNOWN,
+}
+
+
+class AGE:
+    class UNITS:
+        DAYS = "Days"
+        MONTHS = "Months"
+        YEARS = "Years"
+
+
+class FILE:
+    class HASH:
+        MD5 = "md5"
+        SHA1 = "sha1"
+        SHA256 = "sha256"
+        SHA512 = "sha512"
+        S3_ETAG = "etag"
+
+
+class SPECIMEN:
+    class COMPOSITION:
+        BLOOD = "Peripheral Whole Blood"
+        BONE = "Bone"
+        BONE_MARROW = "Bone Marrow"
+        BUCCAL_SWAB = "Buccal Cells"
+        EBVI = "Epstein-Barr Virus Immortalized Cells"
+        FIBROBLASTS = "Fibroblasts"
+        LINE = "Derived Cell Line"
+        LYMPHOBLASTOID_CELL_LINES = "Lymphoblastoid Cell Lines"
+        LYMPHOCYTES = "Lymphocytes"
+        MNC = "Mononuclear Cells"
+        PLASMA = "Plasma"
+        SALIVA = "Saliva"
+        TISSUE = "Solid Tissue"
+        XENOGRAFT = "Xenograft Tissue"
+
+    class SAMPLE_PROCUREMENT:
+        AUTOPSY = "Autopsy"
+        BIOPSY = "Biopsy"
+        BLOOD_DRAW = "Blood Draw"
+        SALIVA_KIT = "Saliva Kit"
+        SUBTOTAL_RESECTION = "Subtotal Resection"
+        TOTAL_RESECTION = "Gross Total Resection"
+        BONE_MARROW_ASPIRATION = "Bone Marrow Aspiration"
+
+    class ANATOMY_SITE:
+        ARM = "Arm"
+        BONE_MARROW = "Bone Marrow"
+        HAIR = "Hair"
+        MOUTH = "Mouth"
+        SKULL = "Skull"
+        UMBILICAL_CORD = "Umbilical Cord"
+        CNS = "Central Nervous System"
+
+    class TISSUE_TYPE:
+        GERMLINE = "Normal"  # NOTE(review): same value as NORMAL - confirm
+        NORMAL = "Normal"
+        TUMOR = "Tumor"
+
+    class STATUS:
+        DISPOSED = "Disposed"
+        NOT_AVAILABLE = "Not Available"
+        ON_SITE = "OnSite"
+        OTHER = "Other"
+        PATHOLOGY_GOVERNED = "Pathology governed"
+        SHIPPED = "Shipped"
+        SHIPPED_GENOMIC_DATA = "Shipped - Genomic Data Available"
+        UNKNOWN = "Unknown"
+        VIRTUAL = "Virtual"
+
+
+class GENOMIC_FILE:
+    class AVAILABILITY:
+        IMMEDIATE = "Immediate Download"
+        COLD_STORAGE = "Cold Storage"
+
+    class DATA_TYPE:
+        ALIGNED_READS = "Aligned Reads"
+        ALIGNED_READS_INDEX = "Aligned Reads Index"
+        EXPRESSION = "Expression"
+        GVCF = "gVCF"
+        GVCF_INDEX = "gVCF Index"
+        HISTOLOGY_IMAGES = "Histology Images"
+        NUCLEOTIDE_VARIATION = "Simple Nucleotide Variations"
+        OPERATION_REPORTS = "Operation Reports"
+        PATHOLOGY_REPORTS = "Pathology Reports"
+        RADIOLOGY_IMAGES = "Radiology Images"
+        RADIOLOGY_REPORTS = "Radiology Reports"
+        UNALIGNED_READS = "Unaligned Reads"
+        VARIANT_CALLS = "Variant Calls"
+        VARIANT_CALLS_INDEX = "Variant Calls Index"
+        ANNOTATED_SOMATIC_MUTATIONS = "Annotated Somatic Mutations"
+        GENE_EXPRESSION = "Gene Expression"
+        GENE_FUSIONS = "Gene Fusions"
+        ISOFORM_EXPRESSION = "Isoform Expression"
+        SOMATIC_COPY_NUMBER_VARIATIONS = "Somatic Copy Number Variations"
+        SOMATIC_STRUCTURAL_VARIATIONS = "Somatic Structural Variations"
+
+    class FORMAT:
+        BAI = "bai"
+        BAM = "bam"
+        CRAI = "crai"
+        CRAM = "cram"
+        DCM = "dcm"
+        FASTQ = "fastq"
+        GPR = "gpr"
+        GVCF = "gvcf"
+        IDAT = "idat"
+        PDF = "pdf"
+        SVS = "svs"
+        TBI = "tbi"
+        VCF = "vcf"
+        HTML = "html"
+        MAF = "maf"
+
+
+class READ_GROUP:
+    class QUALITY_SCALE:
+        ILLUMINA13 = "Illumina13"
+        ILLUMINA15 = "Illumina15"
+        ILLUMINA18 = "Illumina18"
+        SANGER = "Sanger"
+        SOLEXA = "Solexa"
+
+
+class SEQUENCING:
+    class REFERENCE_GENOME:
+        GRCH38 = "GRCh38"
+        GRCH37 = "GRCh37"
+        HS37D5 = "hs37d5"
+        HG19 = "hg19"
+
+    class PLATFORM:
+        GENOMICS = "Complete Genomics"
+        ILLUMINA = "Illumina"
+        ION_TORRENT = "Ion Torrent"
+        LS454 = "LS454"
+        PACBIO = "PacBio"
+        SOLID = "SOLiD"
+        ONT = "ONT"
+
+    class INSTRUMENT:
+        HISEQ_X_v2_5 = "HiSeq X v2.5"
+        HISEQ_X_10 = "HiSeq X Ten"
+
+    class STRAND:
+        FIRST = "First Stranded"
+        SECOND = "Second Stranded"
+        STRANDED = "Stranded"
+        UNSTRANDED = "Unstranded"
+
+    class CENTER:  # one nested class per center: display NAME plus KF_ID
+        class ASHION:
+            NAME = "Ashion"
+            KF_ID = "SC_0CNMF82N"
+
+        class BAYLOR:
+            NAME = "Baylor College of Medicine"
+            KF_ID = "SC_A1JNZAZH"
+
+        class BC_CANCER_AGENCY:
+            NAME = "British Columbia Cancer Agency Genome Sciences Centre"
+            KF_ID = "SC_FN7NH453"
+
+        class BGI:
+            NAME = "BGI@CHOP Genome Center"
+            KF_ID = "SC_WWEQ9HFY"
+
+        class BGI_CHINA:
+            NAME = "BGI"
+            KF_ID = "SC_FAD4KCQG"
+
+        class BROAD:
+            NAME = "Broad Institute"
+            KF_ID = "SC_DGDDQVYR"
+
+        class CBTN_UNSEQUENCED:
+            NAME = "CBTN Unsequenced"
+            KF_ID = "SC_31369RXZ"
+
+        class CHOP:
+            NAME = "CHOP"
+            KF_ID = "SC_9NSC532X"
+
+        class CHOP_DGD:
+            NAME = "CHOP DGD"
+            KF_ID = "SC_ZZPPF973"
+
+        class COMPLETE_GENOMICS:
+            NAME = "Complete Genomics"
+            KF_ID = "SC_D30SEWS4"
+
+        class CSIR:
+            NAME = "CSIR - Institute of Genomics and Integrative Biology, Delhi, India"
+            KF_ID = "SC_MDY0AZMZ"
+
+        class FELINE_DIAGNOSTICS:
+            NAME = "Feline Diagnostics LLC"
+            KF_ID = "SC_CATTVETT"
+
+        class FG:
+            NAME = "fg"
+            KF_ID = "SC_XXXXXXX2"  # NOTE(review): looks like a placeholder - confirm
+
+        class HGT:
+            NAME = "Humangenetik Tübingen"
+            KF_ID = "SC_75KENA7A"
+
+        class HUDSON_ALPHA:
+            NAME = "HudsonAlpha Institute for Biotechnology"
+            KF_ID = "SC_X1N69WJM"
+
+        class NANT:
+            NAME = "NantOmics"
+            KF_ID = "SC_N1EVHSME"
+
+        class NOVOGENE:
+            NAME = "Novogene"
+            KF_ID = "SC_2ZBAMKK0"
+
+        class NYGC:
+            NAME = "New York Genome Center"
+            KF_ID = "SC_BJW95TMY"
+
+        class NCI:
+            NAME = "National Cancer Institute, Khan Lab"
+            KF_ID = "SC_F6RZ51K9"
+
+        class NIH:
+            NAME = "National Institutes of Health"
+            KF_ID = "SC_HEXD2E5R"
+
+        class SICKKIDS:
+            NAME = "SickKids"
+            KF_ID = "SC_9WMJKQ1X"
+
+        class SIDRA:
+            NAME = "Genomic Clinical Core at Sidra Medical and Research Center"
+            KF_ID = "SC_KE2ASNJM"
+
+        class ST_JUDE:
+            NAME = "St Jude"
+            KF_ID = "SC_1K3QGW4V"
+
+        class TEMP:
+            NAME = "TEMP"
+            KF_ID = "SC_SJJ0B9GN"
+
+        class TEMPUS:
+            NAME = "Tempus"
+            KF_ID = "SC_TQ8HJWGE"
+
+        class TGEN:
+            NAME = "The Translational Genomics Research Institute"
+            KF_ID = "SC_KQ9JZG3P"
+
+        class UNKNOWN_CHRIS_JONES:
+            NAME = "UNKNOWN:CHRIS_JONES"
+            KF_ID = "SC_5A2B1T4K"
+
+        class VIRTUAL:
+            NAME = "Virtual"
+            KF_ID = "SC_BATJDPHB"
+
+        class WASHU:
+            NAME = "Washington University"
+            KF_ID = "SC_K52V7463"
+
+        class YALE:
+            NAME = "Yale"
+            KF_ID = "SC_31W52VNX"
+
+        class FREDHUTCH:
+            NAME = "Fred Hutchinson Cancer Research Center"
+            KF_ID = "SC_8JXH42X1"
+
+    class STRATEGY:
+        LINKED_WGS = "Linked-Read WGS (10x Chromium)"
+        METHYL = "Methylation"
+        MRNA = "miRNA-Seq"  # NOTE(review): named MRNA, value is miRNA - confirm
+        RNA = "RNA-Seq"
+        WGS = "WGS"
+        WXS = "WXS"
+        TARGETED = "Targeted Sequencing"
+        PANEL = "Panel"
+        SCRNA = "Single Cell RNA-Seq"
+
+    class ANALYTE:
+        DNA = "DNA"
+        RNA = "RNA"
+        VIRTUAL = "Virtual"
+
+    class LIBRARY:
+        class SELECTION:
+            HYBRID = "Hybrid Selection"
+            PCR = "PCR"
+            AFFINITY_ENRICHMENT = "Affinity Enrichment"
+            POLYT_ENRICHMENT = "Poly-T Enrichment"
+            RANDOM = "Random"
+            RNA_DEPLETION = "rRNA Depletion"
+            MRNA_SIZE_FRACTIONATION = "miRNA Size Fractionation"
+
+        class PREP:
+            POLYA = "polyA"
+            TOTALRNASEQ = "totalRNAseq"
+
+
+class STUDY:
+    CANCER = "Cancer"
+    STRUCTURAL_DEFECT = "Structural Birth Defect"
+    STRUCTURAL_DEFECT_AND_CANCER = "Structural Birth Defect and Cancer"
+
+    class STATUS:
+        CANCELED = "Canceled"
+        FAILED = "Failed"
+        PENDING = "Pending"
+        PUBLISHED = "Published"
+        PUBLISHING = "Publishing"
+        RUNNING = "Running"
+        STAGED = "Staged"
+        WAITING = "Waiting"
+
+
+class CONSENT_TYPE:  # each description comment follows its constant
+    DS_OBDR_MDS = "DS-OBDR-MDS"
+    # Disease-Specific (Orofacial birth defects and related phenotypes,
+    # MDS) (DS-OBDR-MDS) - Use of the data must be related to Orofacial
+    # birth defects and related phenotypes. Use of the data includes
+    # methods development research (e.g., development of software or
+    # algorithms). Includes related diseases and phenotypes such as medical
+    # and dental data, speech characteristics, images and derived data
+    # (e.g. 3D facial measures, ultrasounds of the upper lip), and all
+    # other physical assessments taken on the study subjects.
+
+    DS_OBD_MDS = "DS-OBD-MDS"
+    # Disease-Specific (Orofacial birth defects, MDS) (DS-OBD-MDS) - Use of
+    # the data must be related to Orofacial birth defects. Use of the data
+    # includes methods development research (e.g., development of software
+    # or algorithms).
+    DS_OC_PUB_MDS = "DS-OC-PUB-MDS"
+    # Disease-Specific (Oral Clefts, PUB, MDS) (DS-OC-PUB-MDS) - Use of the
+    # data must be related to Oral Clefts. Requestor agrees to make results
+    # of studies using the data available to the larger scientific
+    # community. Use of the data includes methods development research
+    # (e.g., development of software or algorithms).
+    HMB_MDS = "HMB-MDS"
+    # Health/Medical/Biomedical (MDS) (HMB-MDS) - Use of this data is
+    # limited to health/medical/biomedical purposes, does not include the
+    # study of population origins or ancestry. Use of the data includes
+    # methods development research (e.g., development of software or
+    # algorithms).
+    HMB_IRB = "HMB-IRB"
+    # Health/Medical/Biomedical general research
+    DS_CHD_IRB = "DS-CHD-IRB"
+    # Disease specific Cardiac Heart Defect
+    DS_CHD = "DS-CHD"
+    # Disease specific Cardiac Heart Defect
+    HMB_NPU = "HMB-NPU"
+    # Health/Medical/Biomedical (NPU) (HMB-NPU) - Use of this data is
+    # limited to health/medical/biomedical purposes, does not include the
+    # study of population origins or ancestry. Use of the data is limited to
+    # not-for-profit organizations.
+    GRU = "GRU"
+    # General Research Use
+    GRU_NPU = "GRU-NPU"
+    # General Research Use (NPU) (GRU-NPU) - Use of this data is
+    # limited to general research use. Use of the data is limited to
+    # not-for-profit organizations.
+    HMB_GSO = "HMB-GSO"
+    # Health/Medical/Biomedical (GSO) (HMB-GSO) Genetic studies only.
+    # Use of the data is limited to genetic studies only
+    # (i.e., no “phenotype-only” research).
+
+
+class AUTHORITY:
+    DBGAP = "dbGaP"
+
+
+class OUTCOME:
+    class DISEASE_RELATED:
+        YES = "Yes"
+        NO = "No"
+
+    class VITAL_STATUS:
+        ALIVE = "Alive"
+        DEAD = "Deceased"
+
+
+# ######################## NOTE ###############################################
+# "Twin" relationships here mean identical twins.
+# For non-identical twins please just use the normal sibling relationships.
+# #############################################################################
+
+
+class RELATIONSHIP:
+    PROBAND = "Proband"
+    # PARTNERS:
+    WIFE = "Wife"
+    HUSBAND = "Husband"
+    SPOUSE = "Spouse"
+    # PARENTS:
+    MOTHER = "Mother"
+    FATHER = "Father"
+    PARENT = "Parent"
+    # CHILDREN:
+    DAUGHTER = "Daughter"
+    SON = "Son"
+    CHILD = "Child"
+    # SIBLINGS:
+    SISTER = "Sister"
+    BROTHER = "Brother"
+    SIBLING = "Sibling"
+    TWIN_SISTER = "Twin Sister"
+    TWIN_BROTHER = "Twin Brother"
+    TWIN = "Twin"
+    MATERNAL_HALF_SISTER = "Maternal Half-Sister"
+    MATERNAL_HALF_BROTHER = "Maternal Half-Brother"
+    MATERNAL_HALF_SIBLING = "Maternal Half-Sibling"
+    PATERNAL_HALF_SISTER = "Paternal Half-Sister"
+    PATERNAL_HALF_BROTHER = "Paternal Half-Brother"
+    PATERNAL_HALF_SIBLING = "Paternal Half-Sibling"
+    HALF_SISTER = "Half-Sister"
+    HALF_BROTHER = "Half-Brother"
+    HALF_SIBLING = "Half-Sibling"
+    # GRANDPARENTS:
+    MATERNAL_GRANDMOTHER = "Maternal Grandmother"
+    MATERNAL_GRANDFATHER = "Maternal Grandfather"
+    MATERNAL_GRANDPARENT = "Maternal Grandparent"
+    PATERNAL_GRANDMOTHER = "Paternal Grandmother"
+    PATERNAL_GRANDFATHER = "Paternal Grandfather"
+    PATERNAL_GRANDPARENT = "Paternal Grandparent"
+    GRANDMOTHER = "Grandmother"
+    GRANDFATHER = "Grandfather"
+    GRANDPARENT = "Grandparent"
+    # GRANDCHILDREN:
+    MATERNAL_GRANDDAUGHTER = "Maternal Granddaughter"
+    MATERNAL_GRANDSON = "Maternal Grandson"
+    MATERNAL_GRANDCHILD = "Maternal Grandchild"
+    PATERNAL_GRANDDAUGHTER = "Paternal Granddaughter"
+    PATERNAL_GRANDSON = "Paternal Grandson"
+    PATERNAL_GRANDCHILD = "Paternal Grandchild"
+    GRANDDAUGHTER = "Granddaughter"
+    GRANDSON = "Grandson"
+    GRANDCHILD = "Grandchild"
+    # GREAT GRANDPARENTS:
+    MATERNAL_GREAT_GRANDMOTHER = "Maternal Great-Grandmother"
+    MATERNAL_GREAT_GRANDFATHER = "Maternal Great-Grandfather"
+    MATERNAL_GREAT_GRANDPARENT = "Maternal Great-Grandparent"
+    PATERNAL_GREAT_GRANDMOTHER = "Paternal Great-Grandmother"
+    PATERNAL_GREAT_GRANDFATHER = "Paternal Great-Grandfather"
+    PATERNAL_GREAT_GRANDPARENT = "Paternal Great-Grandparent"
+    GREAT_GRANDMOTHER = "Great-Grandmother"
+    GREAT_GRANDFATHER = "Great-Grandfather"
+    GREAT_GRANDPARENT = "Great-Grandparent"
+    # GREAT GRANDCHILDREN:
+    MATERNAL_GREAT_GRANDDAUGHTER = "Maternal Great-Granddaughter"
+    MATERNAL_GREAT_GRANDSON = "Maternal Great-Grandson"
+    MATERNAL_GREAT_GRANDCHILD = "Maternal Great-Grandchild"
+    PATERNAL_GREAT_GRANDDAUGHTER = "Paternal Great-Granddaughter"
+    PATERNAL_GREAT_GRANDSON = "Paternal Great-Grandson"
+    PATERNAL_GREAT_GRANDCHILD = "Paternal Great-Grandchild"
+    GREAT_GRANDDAUGHTER = "Great-Granddaughter"
+    GREAT_GRANDSON = "Great-Grandson"
+    GREAT_GRANDCHILD = "Great-Grandchild"
+    # AUNTS/UNCLES:
+    MATERNAL_AUNT = "Maternal Aunt"
+    MATERNAL_UNCLE = "Maternal Uncle"
+    MATERNAL_PIBLING = "Maternal Aunt or Uncle"
+    PATERNAL_AUNT = "Paternal Aunt"
+    PATERNAL_UNCLE = "Paternal Uncle"
+    PATERNAL_PIBLING = "Paternal Aunt or Uncle"
+    AUNT = "Aunt"
+    UNCLE = "Uncle"
+    PIBLING = "Aunt or Uncle"
+    # NIECES/NEPHEWS:
+    MATERNAL_NIECE = "Maternal Niece"
+    MATERNAL_NEPHEW = "Maternal Nephew"
+    MATERNAL_NIBLING = "Maternal Niece or Nephew"
+    PATERNAL_NIECE = "Paternal Niece"
+    PATERNAL_NEPHEW = "Paternal Nephew"
+    PATERNAL_NIBLING = "Paternal Niece or Nephew"
+    NIECE = "Niece"
+    NEPHEW = "Nephew"
+    NIBLING = "Niece or Nephew"
+    # GREAT AUNTS/UNCLES:
+    MATERNAL_GREAT_AUNT = "Maternal Great Aunt"
+    MATERNAL_GREAT_UNCLE = "Maternal Great Uncle"
+    MATERNAL_GREAT_PIBLING = "Maternal Great Aunt or Uncle"
+    PATERNAL_GREAT_AUNT = "Paternal Great Aunt"
+    PATERNAL_GREAT_UNCLE = "Paternal Great Uncle"
+    PATERNAL_GREAT_PIBLING = "Paternal Great Aunt or Uncle"
+    GREAT_AUNT = "Great Aunt"
+    GREAT_UNCLE = "Great Uncle"
+    GREAT_PIBLING = "Great Aunt or Uncle"
+    # GREAT NIECES/NEPHEWS:
+    MATERNAL_GREAT_NIECE = "Maternal Great Niece"
+    MATERNAL_GREAT_NEPHEW = "Maternal Great Nephew"
+    MATERNAL_GREAT_NIBLING = "Maternal Great Niece or Nephew"
+    PATERNAL_GREAT_NIECE = "Paternal Great Niece"
+    PATERNAL_GREAT_NEPHEW = "Paternal Great Nephew"
+    PATERNAL_GREAT_NIBLING = "Paternal Great Niece or Nephew"
+    GREAT_NIECE = "Great Niece"
+    GREAT_NEPHEW = "Great Nephew"
+    GREAT_NIBLING = "Great Niece or Nephew"
+    # COUSINS:
+    MATERNAL_FIRST_COUSIN = "Maternal First Cousin"
+    PATERNAL_FIRST_COUSIN = "Paternal First Cousin"
+    FIRST_COUSIN = "First Cousin"
+    MATERNAL_SECOND_COUSIN = "Maternal Second Cousin"
+    PATERNAL_SECOND_COUSIN = "Paternal Second Cousin"
+    SECOND_COUSIN = "Second Cousin"
+    MATERNAL_COUSIN = "Maternal Cousin"
+    PATERNAL_COUSIN = "Paternal Cousin"
+    COUSIN = "Cousin"
+
+
+R = RELATIONSHIP  # short alias used to build the lookup tables
+
+
+REVERSE_RELATIONSHIPS = {  # term -> ungendered term for the other direction
+    # PARTNERS:
+    R.WIFE: R.SPOUSE,
+    R.HUSBAND: R.SPOUSE,
+    R.SPOUSE: R.SPOUSE,
+    # PARENTS:
+    R.MOTHER: R.CHILD,
+    R.FATHER: R.CHILD,
+    R.PARENT: R.CHILD,
+    # CHILDREN:
+    R.DAUGHTER: R.PARENT,
+    R.SON: R.PARENT,
+    R.CHILD: R.PARENT,
+    # SIBLINGS:
+    R.BROTHER: R.SIBLING,
+    R.SISTER: R.SIBLING,
+    R.SIBLING: R.SIBLING,
+    R.TWIN_BROTHER: R.TWIN,
+    R.TWIN_SISTER: R.TWIN,
+    R.TWIN: R.TWIN,
+    R.MATERNAL_HALF_SISTER: R.MATERNAL_HALF_SIBLING,
+    R.MATERNAL_HALF_BROTHER: R.MATERNAL_HALF_SIBLING,
+    R.MATERNAL_HALF_SIBLING: R.MATERNAL_HALF_SIBLING,
+    R.PATERNAL_HALF_SISTER: R.PATERNAL_HALF_SIBLING,
+    R.PATERNAL_HALF_BROTHER: R.PATERNAL_HALF_SIBLING,
+    R.PATERNAL_HALF_SIBLING: R.PATERNAL_HALF_SIBLING,
+    R.HALF_SISTER: R.HALF_SIBLING,
+    R.HALF_BROTHER: R.HALF_SIBLING,
+    R.HALF_SIBLING: R.HALF_SIBLING,
+    # GRANDPARENTS:
+    R.MATERNAL_GRANDMOTHER: R.MATERNAL_GRANDCHILD,
+    R.MATERNAL_GRANDFATHER: R.MATERNAL_GRANDCHILD,
+    R.MATERNAL_GRANDPARENT: R.MATERNAL_GRANDCHILD,
+    R.PATERNAL_GRANDMOTHER: R.PATERNAL_GRANDCHILD,
+    R.PATERNAL_GRANDFATHER: R.PATERNAL_GRANDCHILD,
+    R.PATERNAL_GRANDPARENT: R.PATERNAL_GRANDCHILD,
+    R.GRANDMOTHER: R.GRANDCHILD,
+    R.GRANDFATHER: R.GRANDCHILD,
+    R.GRANDPARENT: R.GRANDCHILD,
+    # GRANDCHILDREN:
+    R.MATERNAL_GRANDDAUGHTER: R.MATERNAL_GRANDPARENT,
+    R.MATERNAL_GRANDSON: R.MATERNAL_GRANDPARENT,
+    R.MATERNAL_GRANDCHILD: R.MATERNAL_GRANDPARENT,
+    R.PATERNAL_GRANDDAUGHTER: R.PATERNAL_GRANDPARENT,
+    R.PATERNAL_GRANDSON: R.PATERNAL_GRANDPARENT,
+    R.PATERNAL_GRANDCHILD: R.PATERNAL_GRANDPARENT,
+    R.GRANDDAUGHTER: R.GRANDPARENT,
+    R.GRANDSON: R.GRANDPARENT,
+    R.GRANDCHILD: R.GRANDPARENT,
+    # GREAT GRANDPARENTS:
+    R.MATERNAL_GREAT_GRANDMOTHER: R.MATERNAL_GREAT_GRANDCHILD,
+    R.MATERNAL_GREAT_GRANDFATHER: R.MATERNAL_GREAT_GRANDCHILD,
+    R.MATERNAL_GREAT_GRANDPARENT: R.MATERNAL_GREAT_GRANDCHILD,
+    R.PATERNAL_GREAT_GRANDMOTHER: R.PATERNAL_GREAT_GRANDCHILD,
+    R.PATERNAL_GREAT_GRANDFATHER: R.PATERNAL_GREAT_GRANDCHILD,
+    R.PATERNAL_GREAT_GRANDPARENT: R.PATERNAL_GREAT_GRANDCHILD,
+    R.GREAT_GRANDMOTHER: R.GREAT_GRANDCHILD,
+    R.GREAT_GRANDFATHER: R.GREAT_GRANDCHILD,
+    R.GREAT_GRANDPARENT: R.GREAT_GRANDCHILD,
+    # GREAT GRANDCHILDREN:
+    R.MATERNAL_GREAT_GRANDDAUGHTER: R.MATERNAL_GREAT_GRANDPARENT,
+    R.MATERNAL_GREAT_GRANDSON: R.MATERNAL_GREAT_GRANDPARENT,
+    R.MATERNAL_GREAT_GRANDCHILD: R.MATERNAL_GREAT_GRANDPARENT,
+    R.PATERNAL_GREAT_GRANDDAUGHTER: R.PATERNAL_GREAT_GRANDPARENT,
+    R.PATERNAL_GREAT_GRANDSON: R.PATERNAL_GREAT_GRANDPARENT,
+    R.PATERNAL_GREAT_GRANDCHILD: R.PATERNAL_GREAT_GRANDPARENT,
+    R.GREAT_GRANDDAUGHTER: R.GREAT_GRANDPARENT,
+    R.GREAT_GRANDSON: R.GREAT_GRANDPARENT,
+    R.GREAT_GRANDCHILD: R.GREAT_GRANDPARENT,
+    # AUNTS/UNCLES:
+    R.MATERNAL_AUNT: R.MATERNAL_NIBLING,
+    R.MATERNAL_UNCLE: R.MATERNAL_NIBLING,
+    R.MATERNAL_PIBLING: R.MATERNAL_NIBLING,
+    R.PATERNAL_AUNT: R.PATERNAL_NIBLING,
+    R.PATERNAL_UNCLE: R.PATERNAL_NIBLING,
+    R.PATERNAL_PIBLING: R.PATERNAL_NIBLING,
+    R.AUNT: R.NIBLING,
+    R.UNCLE: R.NIBLING,
+    R.PIBLING: R.NIBLING,
+    # NIECES/NEPHEWS:
+    R.MATERNAL_NIECE: R.MATERNAL_PIBLING,
+    R.MATERNAL_NEPHEW: R.MATERNAL_PIBLING,
+    R.MATERNAL_NIBLING: R.MATERNAL_PIBLING,
+    R.PATERNAL_NIECE: R.PATERNAL_PIBLING,
+    R.PATERNAL_NEPHEW: R.PATERNAL_PIBLING,
+    R.PATERNAL_NIBLING: R.PATERNAL_PIBLING,
+    R.NIECE: R.PIBLING,
+    R.NEPHEW: R.PIBLING,
+    R.NIBLING: R.PIBLING,
+    # GREAT AUNTS/UNCLES:
+    R.MATERNAL_GREAT_AUNT: R.MATERNAL_GREAT_NIBLING,
+    R.MATERNAL_GREAT_UNCLE: R.MATERNAL_GREAT_NIBLING,
+    R.MATERNAL_GREAT_PIBLING: R.MATERNAL_GREAT_NIBLING,
+    R.PATERNAL_GREAT_AUNT: R.PATERNAL_GREAT_NIBLING,
+    R.PATERNAL_GREAT_UNCLE: R.PATERNAL_GREAT_NIBLING,
+    R.PATERNAL_GREAT_PIBLING: R.PATERNAL_GREAT_NIBLING,
+    R.GREAT_AUNT: R.GREAT_NIBLING,
+    R.GREAT_UNCLE: R.GREAT_NIBLING,
+    R.GREAT_PIBLING: R.GREAT_NIBLING,
+    # GREAT NIECES/NEPHEWS:
+    R.MATERNAL_GREAT_NIECE: R.MATERNAL_GREAT_PIBLING,
+    R.MATERNAL_GREAT_NEPHEW: R.MATERNAL_GREAT_PIBLING,
+    R.MATERNAL_GREAT_NIBLING: R.MATERNAL_GREAT_PIBLING,
+    R.PATERNAL_GREAT_NIECE: R.PATERNAL_GREAT_PIBLING,
+    R.PATERNAL_GREAT_NEPHEW: R.PATERNAL_GREAT_PIBLING,
+    R.PATERNAL_GREAT_NIBLING: R.PATERNAL_GREAT_PIBLING,
+    R.GREAT_NIECE: R.GREAT_PIBLING,
+    R.GREAT_NEPHEW: R.GREAT_PIBLING,
+    R.GREAT_NIBLING: R.GREAT_PIBLING,
+    # COUSINS:
+    R.MATERNAL_FIRST_COUSIN: R.MATERNAL_FIRST_COUSIN,
+    R.PATERNAL_FIRST_COUSIN: R.PATERNAL_FIRST_COUSIN,
+    R.FIRST_COUSIN: R.FIRST_COUSIN,
+    R.MATERNAL_SECOND_COUSIN: R.MATERNAL_SECOND_COUSIN,
+    R.PATERNAL_SECOND_COUSIN: R.PATERNAL_SECOND_COUSIN,
+    R.SECOND_COUSIN: R.SECOND_COUSIN,
+    R.MATERNAL_COUSIN: R.MATERNAL_COUSIN,
+    R.PATERNAL_COUSIN: R.PATERNAL_COUSIN,
+    R.COUSIN: R.COUSIN,
+}
+
+
+class GENDER:
+    MALE = "Male"
+    FEMALE = "Female"
+    OTHER = "Other"
+
+
+# relates gendered relationship terms to their genders.
+GENDER_FROM_RELATION = {  # flattens the per-gender sets into {term: gender}
+    r: g
+    for g, rs in {
+        GENDER.FEMALE: {
+            # PARTNERS (FEMALE):
+            R.WIFE,
+            # PARENTS (FEMALE):
+            R.MOTHER,
+            # CHILDREN (FEMALE):
+            R.DAUGHTER,
+            # SIBLINGS (FEMALE):
+            R.SISTER,
+            R.TWIN_SISTER,
+            R.MATERNAL_HALF_SISTER,
+            R.PATERNAL_HALF_SISTER,
+            R.HALF_SISTER,
+            # GRANDPARENTS (FEMALE):
+            R.MATERNAL_GRANDMOTHER,
+            R.PATERNAL_GRANDMOTHER,
+            R.GRANDMOTHER,
+            # GRANDCHILDREN (FEMALE):
+            R.MATERNAL_GRANDDAUGHTER,
+            R.PATERNAL_GRANDDAUGHTER,
+            R.GRANDDAUGHTER,
+            # GREAT GRANDPARENTS (FEMALE):
+            R.MATERNAL_GREAT_GRANDMOTHER,
+            R.PATERNAL_GREAT_GRANDMOTHER,
+            R.GREAT_GRANDMOTHER,
+            # GREAT GRANDCHILDREN (FEMALE):
+            R.MATERNAL_GREAT_GRANDDAUGHTER,
+            R.PATERNAL_GREAT_GRANDDAUGHTER,
+            R.GREAT_GRANDDAUGHTER,
+            # AUNTS/UNCLES (FEMALE):
+            R.MATERNAL_AUNT,
+            R.PATERNAL_AUNT,
+            R.AUNT,
+            # NIECES/NEPHEWS (FEMALE):
+            R.MATERNAL_NIECE,
+            R.PATERNAL_NIECE,
+            R.NIECE,
+            # GREAT AUNTS/UNCLES (FEMALE):
+            R.MATERNAL_GREAT_AUNT,
+            R.PATERNAL_GREAT_AUNT,
+            R.GREAT_AUNT,
+            # GREAT NIECES/NEPHEWS (FEMALE):
+            R.MATERNAL_GREAT_NIECE,
+            R.PATERNAL_GREAT_NIECE,
+            R.GREAT_NIECE,
+            # COUSINS (FEMALE): n/a
+        },
+        GENDER.MALE: {
+            # PARTNERS (MALE):
+            R.HUSBAND,
+            # PARENTS (MALE):
+            R.FATHER,
+            # CHILDREN (MALE):
+            R.SON,
+            # SIBLINGS (MALE):
+            R.BROTHER,
+            R.TWIN_BROTHER,
+            R.MATERNAL_HALF_BROTHER,
+            R.PATERNAL_HALF_BROTHER,
+            R.HALF_BROTHER,
+            # GRANDPARENTS (MALE):
+            R.MATERNAL_GRANDFATHER,
+            R.PATERNAL_GRANDFATHER,
+            R.GRANDFATHER,
+            # GRANDCHILDREN (MALE):
+            R.MATERNAL_GRANDSON,
+            R.PATERNAL_GRANDSON,
+            R.GRANDSON,
+            # GREAT GRANDPARENTS (MALE):
+            R.MATERNAL_GREAT_GRANDFATHER,
+            R.PATERNAL_GREAT_GRANDFATHER,
+            R.GREAT_GRANDFATHER,
+            # GREAT GRANDCHILDREN (MALE):
+            R.MATERNAL_GREAT_GRANDSON,
+            R.PATERNAL_GREAT_GRANDSON,
+            R.GREAT_GRANDSON,
+            # AUNTS/UNCLES (MALE):
+            R.MATERNAL_UNCLE,
+            R.PATERNAL_UNCLE,
+            R.UNCLE,
+            # NIECES/NEPHEWS (MALE):
+            R.MATERNAL_NEPHEW,
+            R.PATERNAL_NEPHEW,
+            R.NEPHEW,
+            # GREAT AUNTS/UNCLES (MALE):
+            R.MATERNAL_GREAT_UNCLE,
+            R.PATERNAL_GREAT_UNCLE,
+            R.GREAT_UNCLE,
+            # GREAT NIECES/NEPHEWS (MALE):
+            R.MATERNAL_GREAT_NEPHEW,
+            R.PATERNAL_GREAT_NEPHEW,
+            R.GREAT_NEPHEW,
+            # COUSINS (MALE): n/a
+        },
+    }.items()
+    for r in rs
+}
+
+# relates generic relationship terms to gendered ones
+GENDERED_RELATIONSHIPS = {  # looked up by genderize_relationship
+    # PARTNERS:
+    R.SPOUSE: {
+        GENDER.MALE: R.HUSBAND,
+        GENDER.FEMALE: R.WIFE,
+    },
+    # PARENTS:
+    R.PARENT: {
+        GENDER.MALE: R.FATHER,
+        GENDER.FEMALE: R.MOTHER,
+    },
+    # CHILDREN:
+    R.CHILD: {
+        GENDER.MALE: R.SON,
+        GENDER.FEMALE: R.DAUGHTER,
+    },
+    # SIBLINGS:
+    R.SIBLING: {
+        GENDER.MALE: R.BROTHER,
+        GENDER.FEMALE: R.SISTER,
+    },
+    R.TWIN: {
+        GENDER.MALE: R.TWIN_BROTHER,
+        GENDER.FEMALE: R.TWIN_SISTER,
+    },
+    R.MATERNAL_HALF_SIBLING: {
+        GENDER.MALE: R.MATERNAL_HALF_BROTHER,
+        GENDER.FEMALE: R.MATERNAL_HALF_SISTER,
+    },
+    R.PATERNAL_HALF_SIBLING: {
+        GENDER.MALE: R.PATERNAL_HALF_BROTHER,
+        GENDER.FEMALE: R.PATERNAL_HALF_SISTER,
+    },
+    R.HALF_SIBLING: {
+        GENDER.MALE: R.HALF_BROTHER,
+        GENDER.FEMALE: R.HALF_SISTER,
+    },
+    # GRANDPARENTS:
+    R.MATERNAL_GRANDPARENT: {
+        GENDER.MALE: R.MATERNAL_GRANDFATHER,
+        GENDER.FEMALE: R.MATERNAL_GRANDMOTHER,
+    },
+    R.PATERNAL_GRANDPARENT: {
+        GENDER.MALE: R.PATERNAL_GRANDFATHER,
+        GENDER.FEMALE: R.PATERNAL_GRANDMOTHER,
+    },
+    R.GRANDPARENT: {
+        GENDER.MALE: R.GRANDFATHER,
+        GENDER.FEMALE: R.GRANDMOTHER,
+    },
+    # GRANDCHILDREN:
+    R.MATERNAL_GRANDCHILD: {
+        GENDER.MALE: R.MATERNAL_GRANDSON,
+        GENDER.FEMALE: R.MATERNAL_GRANDDAUGHTER,
+    },
+    R.PATERNAL_GRANDCHILD: {
+        GENDER.MALE: R.PATERNAL_GRANDSON,
+        GENDER.FEMALE: R.PATERNAL_GRANDDAUGHTER,
+    },
+    R.GRANDCHILD: {
+        GENDER.MALE: R.GRANDSON,
+        GENDER.FEMALE: R.GRANDDAUGHTER,
+    },
+    # GREAT GRANDPARENTS:
+    R.MATERNAL_GREAT_GRANDPARENT: {
+        GENDER.MALE: R.MATERNAL_GREAT_GRANDFATHER,
+        GENDER.FEMALE: R.MATERNAL_GREAT_GRANDMOTHER,
+    },
+    R.PATERNAL_GREAT_GRANDPARENT: {
+        GENDER.MALE: R.PATERNAL_GREAT_GRANDFATHER,
+        GENDER.FEMALE: R.PATERNAL_GREAT_GRANDMOTHER,
+    },
+    R.GREAT_GRANDPARENT: {
+        GENDER.MALE: R.GREAT_GRANDFATHER,
+        GENDER.FEMALE: R.GREAT_GRANDMOTHER,
+    },
+    # GREAT GRANDCHILDREN:
+    R.MATERNAL_GREAT_GRANDCHILD: {
+        GENDER.MALE: R.MATERNAL_GREAT_GRANDSON,
+        GENDER.FEMALE: R.MATERNAL_GREAT_GRANDDAUGHTER,
+    },
+    R.PATERNAL_GREAT_GRANDCHILD: {
+        GENDER.MALE: R.PATERNAL_GREAT_GRANDSON,
+        GENDER.FEMALE: R.PATERNAL_GREAT_GRANDDAUGHTER,
+    },
+    R.GREAT_GRANDCHILD: {
+        GENDER.MALE: R.GREAT_GRANDSON,
+        GENDER.FEMALE: R.GREAT_GRANDDAUGHTER,
+    },
+    # AUNTS/UNCLES:
+    R.MATERNAL_PIBLING: {
+        GENDER.MALE: R.MATERNAL_UNCLE,
+        GENDER.FEMALE: R.MATERNAL_AUNT,
+    },
+    R.PATERNAL_PIBLING: {
+        GENDER.MALE: R.PATERNAL_UNCLE,
+        GENDER.FEMALE: R.PATERNAL_AUNT,
+    },
+    R.PIBLING: {
+        GENDER.MALE: R.UNCLE,
+        GENDER.FEMALE: R.AUNT,
+    },
+    # NIECES/NEPHEWS:
+    R.MATERNAL_NIBLING: {
+        GENDER.MALE: R.MATERNAL_NEPHEW,
+        GENDER.FEMALE: R.MATERNAL_NIECE,
+    },
+    R.PATERNAL_NIBLING: {
+        GENDER.MALE: R.PATERNAL_NEPHEW,
+        GENDER.FEMALE: R.PATERNAL_NIECE,
+    },
+    R.NIBLING: {
+        GENDER.MALE: R.NEPHEW,
+        GENDER.FEMALE: R.NIECE,
+    },
+    # GREAT AUNTS/UNCLES:
+    R.MATERNAL_GREAT_PIBLING: {
+        GENDER.MALE: R.MATERNAL_GREAT_UNCLE,
+        GENDER.FEMALE: R.MATERNAL_GREAT_AUNT,
+    },
+    R.PATERNAL_GREAT_PIBLING: {
+        GENDER.MALE: R.PATERNAL_GREAT_UNCLE,
+        GENDER.FEMALE: R.PATERNAL_GREAT_AUNT,
+    },
+    R.GREAT_PIBLING: {
+        GENDER.MALE: R.GREAT_UNCLE,
+        GENDER.FEMALE: R.GREAT_AUNT,
+    },
+    # GREAT NIECES/NEPHEWS:
+    R.MATERNAL_GREAT_NIBLING: {
+        GENDER.MALE: R.MATERNAL_GREAT_NEPHEW,
+        GENDER.FEMALE: R.MATERNAL_GREAT_NIECE,
+    },
+    R.PATERNAL_GREAT_NIBLING: {
+        GENDER.MALE: R.PATERNAL_GREAT_NEPHEW,
+        GENDER.FEMALE: R.PATERNAL_GREAT_NIECE,
+    },
+    R.GREAT_NIBLING: {
+        GENDER.MALE: R.GREAT_NEPHEW,
+        GENDER.FEMALE: R.GREAT_NIECE,
+    },
+    # COUSINS: n/a
+}
+
+# Relates sibling groups to their known parents.
+# Don't include proband in siblings or parents.
+RELATIONSHIP_PARENTS = [ + # SIBLINGS & PARENTS: + { + "siblings": { + R.BROTHER, + R.SISTER, + R.SIBLING, + R.TWIN_BROTHER, + R.TWIN_SISTER, + R.TWIN, + R.MATERNAL_HALF_BROTHER, + R.MATERNAL_HALF_SISTER, + R.MATERNAL_HALF_SIBLING, + R.PATERNAL_HALF_BROTHER, + R.PATERNAL_HALF_SISTER, + R.PATERNAL_HALF_SIBLING, + R.HALF_BROTHER, + R.HALF_SISTER, + R.HALF_SIBLING, + }, + "mother": R.MOTHER, + "father": R.FATHER, + "generic": R.PARENT, + }, + # MATERNAL GRANDPARENTS: + { + "siblings": {R.MOTHER}, + "mother": R.MATERNAL_GRANDMOTHER, + "father": R.MATERNAL_GRANDFATHER, + "generic": R.MATERNAL_GRANDPARENT, + }, + # PATERNAL GRANDPARENTS: + { + "siblings": {R.FATHER}, + "mother": R.PATERNAL_GRANDMOTHER, + "father": R.PATERNAL_GRANDFATHER, + "generic": R.PATERNAL_GRANDPARENT, + }, + # GREAT GRANDPARENTS: + { + "siblings": {R.GRANDMOTHER}, + "mother": R.GREAT_GRANDMOTHER, + "father": R.GREAT_GRANDFATHER, + "generic": R.GREAT_GRANDPARENT, + }, + { + "siblings": {R.GRANDFATHER}, + "mother": R.GREAT_GRANDMOTHER, + "father": R.GREAT_GRANDFATHER, + "generic": R.GREAT_GRANDPARENT, + }, + # MATERNAL GREAT GRANDPARENTS: + { + "siblings": {R.MATERNAL_GRANDMOTHER}, + "mother": R.MATERNAL_GREAT_GRANDMOTHER, + "father": R.MATERNAL_GREAT_GRANDFATHER, + "generic": R.MATERNAL_GREAT_GRANDPARENT, + }, + { + "siblings": {R.MATERNAL_GRANDFATHER}, + "mother": R.MATERNAL_GREAT_GRANDMOTHER, + "father": R.MATERNAL_GREAT_GRANDFATHER, + "generic": R.MATERNAL_GREAT_GRANDPARENT, + }, + # PATERNAL GREAT GRANDPARENTS: + { + "siblings": {R.PATERNAL_GRANDMOTHER}, + "mother": R.PATERNAL_GREAT_GRANDMOTHER, + "father": R.PATERNAL_GREAT_GRANDFATHER, + "generic": R.PATERNAL_GREAT_GRANDPARENT, + }, + { + "siblings": {R.PATERNAL_GRANDFATHER}, + "mother": R.PATERNAL_GREAT_GRANDMOTHER, + "father": R.PATERNAL_GREAT_GRANDFATHER, + "generic": R.PATERNAL_GREAT_GRANDPARENT, + }, +] + + +def genderize_relationship(relationship, gender): + r = GENDERED_RELATIONSHIPS.get(relationship) + if r: + return r.get(gender, 
relationship) + else: + return relationship + + +class RACE: + WHITE = "White" + NATIVE_AMERICAN = "American Indian or Alaska Native" + BLACK = "Black or African American" + ASIAN = "Asian" + PACIFIC = "Native Hawaiian or Other Pacific Islander" + MULTIPLE = "More Than One Race" + + +class ETHNICITY: + NON_HISPANIC = "Not Hispanic or Latino" + HISPANIC = "Hispanic or Latino" + + +class SPECIES: + DOG = "Canis lupus familiaris" + FLY = "Drosophila melanogaster" + HUMAN = "Homo sapiens" + MOUSE = "Mus musculus" + + +class PHENOTYPE: + class OBSERVED: + YES = "Positive" + NO = "Negative" diff --git a/d3b_api_client_cli/fhir/counts.py b/d3b_api_client_cli/fhir/counts.py new file mode 100644 index 0000000..a8ead48 --- /dev/null +++ b/d3b_api_client_cli/fhir/counts.py @@ -0,0 +1,86 @@ +""" +Generate total counts for KF FHIR types +""" + +import os +import logging +import random +from pprint import pformat, pprint +import pandas + +from d3b_api_client_cli.fhir.get import get +from d3b_api_client_cli.utils import read_json, elapsed_time_hms +from d3b_api_client_cli.config import ( + config, + KidsFirstFhirEntity, +) + +pandas.set_option("display.max_colwidth", None) + +config = config["fhir"] +fhir_base_url = config["base_url"] +fhir_username = config["username"] +fhir_password = config["password"] +kf_fhir_types = [et.value for et in KidsFirstFhirEntity] + +logger = logging.getLogger(__name__) + +queries = config["mapping"] + + +def get_counts( + study_id, + fhir_base_url=fhir_base_url, + fhir_username=fhir_username, + fhir_password=fhir_password, + legacy_server=True, +): + """ + Check that the total counts match up between the generated FHIR resources + and the FHIR resources in the legacy FHIR server + """ + if not (fhir_base_url and fhir_username and fhir_password): + raise Exception( + "You must set the FHIR server connection details" + "in your environment. 
See .env.sample for variable names" + ) + + results = [] + for kf_fhir_type in kf_fhir_types: + logger.info(f"🛜 Fetching counts for {kf_fhir_type}") + + query = queries.get(kf_fhir_type) + url = f"{fhir_base_url}{query['endpoint']}" + headers = {"Content-Type": "application/json"} + + # Setup query params + params = query["params"] + params["_total"] = "accurate" + if legacy_server: + tags = [] + else: + # In new FHIR servers, we tag the resource with the KF entity type + # and we MUST include this in the query parameters otherwise we + # won't be able to differentiate between some entities of the + # same resource type (e.g. drs doc ref vs drs doc ref index which + # both have resource type DocumentReference) + tags = [kf_fhir_type] + tags.append(study_id) + params["_tag"] = tags + + result = get( + url, fhir_username, fhir_password, headers=headers, params=params + ) + + results.append( + { + "resource_type": query["endpoint"].strip("/"), + "entity_type": kf_fhir_type, + "total": result["total"], + } + ) + + df = pandas.DataFrame(results) + logger.info(f"🔢 KF FHIR type counts for {fhir_base_url}:\n{df}") + + return results diff --git a/d3b_api_client_cli/fhir/delete.py b/d3b_api_client_cli/fhir/delete.py new file mode 100644 index 0000000..0dff4e6 --- /dev/null +++ b/d3b_api_client_cli/fhir/delete.py @@ -0,0 +1,537 @@ +""" +Delete FHIR resources in the FHIR server efficiently +""" + +import os +import time +import logging +from concurrent.futures import ThreadPoolExecutor, as_completed +from urllib.parse import urlparse +from pprint import pformat + +import requests +from requests.auth import HTTPBasicAuth + +from d3b_api_client_cli.config import ( + config, + KidsFirstFhirEntity, + ImagingFhirEntity, + DELETE_DIR, +) +from d3b_api_client_cli import utils +from d3b_api_client_cli.fhir.get import get_all, get + +logger = logging.getLogger(__name__) + +config = config["fhir"] +kf_to_fhir_mapping = config["mapping"] +entity_delete_order = reversed( + [et.value for 
et in KidsFirstFhirEntity] + + [et.value for et in ImagingFhirEntity] +) + +LOCAL_HOSTS = { + "localhost", + "127.0.0.1", +} + + +def _do_delete(base_url, endpoint, resource_id, username, password): + """ + Helper function to delete FHIR resource + """ + url = "/".join( + part.strip("/") for part in [base_url, endpoint, resource_id] + ) + headers = {"Content-Type": "application/json"} + results = { + "url": url, + "resp": None, + "failed": False, + } + try: + resp = utils.send_request( + "delete", + url, + ignore_status_codes={409}, + headers=headers, + auth=HTTPBasicAuth(username, password), + ) + except requests.exceptions.HTTPError as e: + if "404 Client Error" in str(e): + results["failed"] = True + results["resp"] = {"id": resource_id} + else: + raise + else: + results["resp"] = resp.json() + + if resp.status_code == 409: + logger.info(f"Conflict detected for {url}." " Delete in next round") + issue = resp.json()["issue"] + logger.info(f"Conflict Details: {pformat(issue)}") + + return results + + +def _process_result(result, results, i, total): + url = result.get("url") + url_path = urlparse(url).path + resp = result.get("resp") + failed = result["failed"] + if not failed: + results["success"].append(resp) + logger.info(f"DELETE {url_path}, #{i + 1}/{total}") + else: + results["failed"].append(resp) + logger.info(f"FAILED DELETE {url_path} #{i + 1}/{total}") + + return resp + + +def delete_entities( + base_url, entities, username, password, use_async=True, safety_check=True +): + """ + Delete the supplied FHIR entities in FHIR server + + Default behavior only deletes resources at localhost unless + safety_check=False + + :param base_url: Base url of the FHIR service + :type base_url: str + :param entities: list of FHIR resources to delete + :type entities: list of dict + :param username: username of the API admin user + :type username: str + :param password: password of the API admin user + :type password: str + :param use_async: A flag to determine whether to 
def _delete_specimens(
    base_url,
    output_dir,
    study_id,
    username,
    password,
    use_async,
    safety_check,
):
    """
    Delete FHIR specimens in FHIR server

    Specimens must be deleted in a different way than the other entities
    since their references to other specimens are not known ahead of time.

    This deletion works by attempting to delete specimens in whatever order
    they are fetched until there are no more specimens left in the server.
    The loop also stops when a round does not reduce the number of
    remaining specimens, since repeating it would never terminate.

    :param base_url: Base url of the FHIR service
    :type base_url: str
    :param output_dir: Where delete results will be written
    :type output_dir: str
    :param study_id: value of the _tag query param used to fetch specimens
    :type study_id: str
    :param username: username of the API admin user
    :type username: str
    :param password: password of the API admin user
    :type password: str
    :param use_async: passed through to delete_entities
    :type use_async: boolean
    :param safety_check: passed through to delete_entities
    :type safety_check: boolean
    :returns: dict with keys ``success`` and ``failed``, each a list of the
        responses accumulated over all rounds
    :rtype: dict
    """
    entity_type = "specimen"
    params = {"_total": "accurate", "_tag": study_id}
    # delete_entities returns lists, so accumulate into lists. Updating a
    # dict with a list of response dicts raises or stores garbage.
    results = {"success": [], "failed": []}
    previous_count = None
    while True:
        # Get all specimens
        data = get_all(
            base_url,
            "Specimen",
            username,
            password,
            params=params,
        )
        if not data:
            logger.info("0️⃣ No more Specimen resources to delete")
            break

        # Guard against looping forever when nothing can be deleted
        if previous_count is not None and len(data) >= previous_count:
            logger.warning(
                f"⚠️ {len(data)} Specimen resources remain and the last"
                " round deleted none of them. Stopping"
            )
            break
        previous_count = len(data)

        logger.info(f"🚮 Try deleting Specimen {len(data)} resources")

        # Try deleting specimens
        r = delete_entities(
            base_url,
            data,
            username,
            password,
            use_async=use_async,
            safety_check=safety_check,
        )
        results["success"].extend(r["success"])
        results["failed"].extend(r["failed"])

    if results["failed"]:
        fp = os.path.join(output_dir, "failed", f"{entity_type}.json")
        utils.write_json(results["failed"], fp)
        logger.info(
            f"❌ Failed to delete {len(results['failed'])} {entity_type}"
        )

    if results["success"]:
        fp = os.path.join(output_dir, "success", f"{entity_type}.json")
        utils.write_json(results["success"], fp)
        logger.info(f"✅ Deleted {len(results['success'])} {entity_type}")

    return results
If study_id is provided + then delete only the resources which have been tagged by that study + + :param base_url: Base url of the FHIR service + :type base_url: str + :param entity_type: One of Kids First FHIR entity types in + d3b_api_client_cli.config.KidsFirstFhirEntity + :type entity_type: str + :param output_dir: Where delete results will be written + :type output_dir: str + :param study_id: the study_id to filter resources to delete + :type study_id: str + :param username: username of the API admin user + :type username: str + :param password: password of the API admin user + :type password: str + :param use_async: A flag to determine whether to use multi-threading when + sending requests to the server + :type use_async: boolean + :param safety_check: A flag to prevent deleting in non-local servers + :type safety_check: boolean + :param use_kf_entity_tags: A flag to include the KF entity type in the + query params. This is used when deleting from the new FHIR servers + :type use_kf_entity_tags: boolean + + :returns: None + """ + if safety_check and (not utils.is_localhost(base_url)): + raise Exception( + f"❌ Cannot delete from {base_url} because safety_check is ENABLED. " + f"Resources that are not in {LOCAL_HOSTS} will not be deleted " + "unless you set safety_check=False." + ) + + if not (username and password): + username = config["username"] + password = config["password"] + + start_time = time.time() + + # Get the mapping of KF type to FHIR resource type and the required + # query parameters to search for this entity in FHIR + entity_config = kf_to_fhir_mapping.get(entity_type) + if not entity_config: + raise Exception( + f"❌ Aborting delete. 
No configuration for {entity_type} in:\n" + f"{pformat(kf_to_fhir_mapping)}" + ) + + # Setup query params + tags = [] + if use_kf_entity_tags: + # In new FHIR servers, we tag the resource with the KF entity type + # and we MUST include this in the query parameters otherwise we + # won't be able to differentiate between some entities of the + # same resource type (e.g. drs doc ref vs drs doc ref index which + # both have resource type DocumentReference) + tags = [entity_type] + + if study_id: + tags.append(study_id) + + params = {"_tag": tags} + params.update(entity_config["params"]) + endpoint = entity_config["endpoint"] + + # Fetch all pages of data + data = get_all( + base_url, + endpoint, + username, + password, + params=params, + ) + # Delete all entities of type + results = {"failed": [], "success": []} + if data: + logger.info(f"🚮 Begin deleting {entity_type} {len(data)} resources") + results = delete_entities( + base_url, + data, + username, + password, + use_async=use_async, + safety_check=safety_check, + ) + if results["failed"]: + fp = os.path.join(output_dir, "failed", f"{entity_type}.json") + utils.write_json(results["failed"], fp) + logger.info( + f"❌ Failed to delete {len(results['failed'])} {entity_type}" + ) + + if results["success"]: + fp = os.path.join(output_dir, "success", f"{entity_type}.json") + utils.write_json(results["success"], fp) + logger.info(f"✅ Deleted {len(results['success'])} {entity_type}") + else: + logger.info(f"0️⃣ No {endpoint} resources to delete. 
Aborting") + + logger.info( + f"⏰ Elapsed time (hh:mm:ss): {utils.elapsed_time_hms(start_time)}" + ) + logger.info(f"📝 Wrote FHIR delete results for {study_id} to {output_dir}") + logger.info(f"✅ Completed FHIR delete for {entity_type}") + + return results + + +def delete_all( + base_url, + entity_types=None, + output_dir=None, + study_id=None, + username=None, + password=None, + use_async=True, + safety_check=True, + use_kf_entity_tags=True, +): + """ + Delete all FHIR resources in the server by type. If study_id is provided + then delete only the resources which have been tagged by that study + + :param base_url: Base url of the FHIR service + :type base_url: str + :param entity_types: Kids First FHIR entity types in + d3b_api_client_cli.config.KidsFirstFhirEntity + :type entity_types: str + :param study_id: the study_id to filter resources to delete + :type study_id: str + :param username: username of the API admin user + :type username: str + :param password: password of the API admin user + :type password: str + :param use_async: A flag to determine whether to use multi-threading when + sending requests to the server + :type use_async: boolean + :param safety_check: A flag to prevent deleting in non-local servers + :type safety_check: boolean + :param use_kf_entity_tags: A flag to include the KF entity type in the + query params. This is used when deleting from the new FHIR servers + :type use_kf_entity_tags: boolean + + :returns: None + """ + if safety_check and (not utils.is_localhost(base_url)): + raise Exception( + f"❌ Cannot delete from {base_url} because safety_check is ENABLED. " + f"Resources that are not in {LOCAL_HOSTS} will not be deleted " + "unless you set safety_check=False." 
def delete_from_file(
    base_url,
    data_dir,
    entity_types=None,
    username=None,
    password=None,
    use_async=True,
    safety_check=True,
):
    """
    Read FHIR json files and delete the resources by ID in FHIR service

    One file named ``<entity_type>.json`` is read from data_dir for each
    entity type in entity_delete_order. Missing or empty files are skipped.

    :param base_url: Base url of the FHIR service
    :type base_url: str
    :param data_dir: Dir where data is loaded from
    :type data_dir: str
    :param entity_types: Kids First FHIR entity types in
    d3b_api_client_cli.config.KidsFirstFhirEntity. When provided, only these
    types are deleted
    :type entity_types: list of str
    :param username: username of the API admin user
    :type username: str
    :param password: password of the API admin user
    :type password: str
    :param use_async: A flag to determine whether to use multi-threading when
    sending requests to the server
    :type use_async: boolean
    :param safety_check: A flag to prevent deleting in non-local servers
    :type safety_check: boolean
    :returns: dict with keys ``failed`` and ``success``, each a list of the
    responses collected across every entity type that was processed. Both
    lists are empty when nothing was deleted
    :rtype: dict
    """
    if safety_check and (not utils.is_localhost(base_url)):
        raise Exception(
            f"❌ Cannot delete from {base_url} because safety_check is ENABLED. "
            f"Resources that are not in {LOCAL_HOSTS} will not be deleted "
            "unless you set safety_check=False."
        )

    if not (username and password):
        username = config["username"]
        password = config["password"]

    # Build the filter once instead of on every loop iteration
    entity_filter = set(entity_types) if entity_types else None

    # Always defined, so returning is safe even when every type is skipped
    results = {"failed": [], "success": []}

    start_time = time.time()
    for entity_type in entity_delete_order:
        if entity_filter is not None and entity_type not in entity_filter:
            logger.info(
                f"Skipping {entity_type}, not in filters:"
                f" {pformat(entity_types)}"
            )
            continue

        filename = f"{entity_type}.json"
        filepath = os.path.join(data_dir, filename)
        if not os.path.exists(filepath):
            logger.warning(
                f"⚠️ Skipping {entity_type}, data file does not exist"
            )
            continue

        data = utils.read_json(filepath)
        if not data:
            logger.info(f"0 {entity_type} resources to delete. Aborting")
            continue
        logger.info(f"🗑️ Begin deleting {entity_type} {len(data)} resources")
        type_results = delete_entities(
            base_url,
            data,
            username,
            password,
            use_async=use_async,
            safety_check=safety_check,
        )
        results["failed"].extend(type_results["failed"])
        results["success"].extend(type_results["success"])

    logger.info(
        f"⏰ Elapsed time (hh:mm:ss): {utils.elapsed_time_hms(start_time)}"
    )
    logger.info("✅ Completed delete")

    return results
"valueCode": "after", + }, + { + "url": "offset", + "valueDuration": { + "value": int(event_age_days), + "unit": "day", + "system": "http://unitsofmeasure.org", + "code": "d", + }, + }, + ], + "url": "http://hl7.org/fhir/StructureDefinition/cqf-relativeDateTime", + } + ] + } + + +def set_id_prefix(resource_type): + """ + Given a FHIR resource type return the two char prefix for resource IDs + having that resource type + """ + return fhir_resource_types.get(resource_type, "none") + + +class MissingSourceDataError(Exception): + """ + Error when source tables in entity builder cannot be read or loaded + """ + + pass + + +class FhirResourceBuilder: + """ + Abstract class defining interface for FHIR resource builders for Kids + First FHIR entities + """ + + sources = None + resource_type = None + kf_id_col = None + kf_id_system = None + external_id_col = None + external_id_system = None + reference_builders = None + id_prefix = None + + def __init__(self, study_id, parent_study_id=None): + self.study_id = study_id + self.parent_study_id = parent_study_id + self.entity_type = type(self).__name__ + + n = ".".join(__name__.split(".")[:-1]) + self.logger = logging.getLogger(f"{n}.{self.entity_type}") + + for attr in [ + "sources", + "resource_type", + "kf_id_col", + "kf_id_system", + "external_id_col", + "external_id_system", + "id_prefix", + ]: + self.validate_cls_attribute(attr, "") + + def validate_cls_attribute( + self, cls_attribute_name, msg, validate_method=None + ): + """ + Ensure that the class attribute is implemented and valid + """ + cls_attribute = getattr(self, cls_attribute_name) + if not cls_attribute: + raise NotImplementedError( + f"Must implement {self.entity_type}.{cls_attribute_name}." 
+ f" {msg}" + ) + if validate_method: + validate_method() + + @classmethod + def fhir_id(cls, kf_id): + """ + Create the resource FHIR ID from the Dataservice Kids First ID + """ + return utils.kf_id_to_global_id(kf_id, replace_prefix=cls.id_prefix) + + @classmethod + def fhir_reference(cls, kf_id): + """ + Create a dict representing a reference to this resource + """ + ref_id = cls.fhir_id(kf_id) + return {"reference": f"{cls.resource_type}/{ref_id}"} + + def import_source_data(self, source_dir): + """ + Read source tables from file + """ + # Ensure class attribute is implemented + msg = ( + "Must be a list of Dataservice entity types:" + f" {pformat(dataservice_types)}" + ) + sources = self.sources.get("required", []) + self.sources.get( + "optional", [] + ) + assert set(sources) <= set(dataservice_types), msg + + dfs = common.read_dfs(source_dir, sources) + missing = {"required": [], "optional": []} + for source in sources: + if source not in dfs: + if source in self.sources["required"]: + missing["required"].append(source) + else: + missing["optional"].append(source) + + msg = ( + f"⚠️ One or more source tables do not exist: {pformat(missing)}" + f" during build of {self.resource_type}/{self.entity_type}!" + ) + + if missing["required"]: + logger.error(msg) + raise MissingSourceDataError(msg) + + if missing["optional"]: + logger.warning(msg) + + return dfs + + def import_reference_builders(self): + """ + Import entity builders for referenced FHIR resources. 
+ + We need these to translate KF IDs to FHIR resource IDs using the + builder.fhir_id method + """ + msg = ( + "Must be one of the Kids First FHIR entity builders:" + f" {pformat(fhir_entity_builders)}" + ) + self.validate_cls_attribute("reference_builders", msg) + assert set(self.reference_builders) <= set(fhir_entity_builders), msg + + return import_builders(entity_types=self.reference_builders) + + def init_resource(self, row): + """ + Create the initial FHIR resource w content common to all resource types + """ + # Ensure class attribute is implemented + msg = ( + "Must be one of FHIR resource types:" + f" {pformat(fhir_resource_types.keys())}" + ) + assert self.resource_type in set(fhir_resource_types.keys()), msg + + kf_id = row[self.kf_id_col] + kf_id_system = self.kf_id_system + + external_id = str(row.get(self.external_id_col)) + if external_id == "None" or pandas.isnull(external_id): + external_id = constants.COMMON.NOT_REPORTED + external_id_system = self.external_id_system + + # Default tags for every resource + tags = [ + { + "system": f"{dataservice_url}/studies/", + "code": self.study_id, + }, + { + "system": "urn:kids_first_fhir_type", + "code": utils.camel_to_snake(self.entity_type), + }, + ] + # If this study has a parent study, tag all resources with both + # the child study ID and the parent study ID + if self.parent_study_id: + tags.append( + { + "system": f"{dataservice_url}/studies?parent_study_id=", + "code": self.parent_study_id, + }, + ) + + return { + "resourceType": self.resource_type, + "id": self.fhir_id(kf_id), + "meta": { + "profile": [ + f"http://hl7.org/fhir/StructureDefinition/{self.resource_type}" + ], + "tag": tags, + }, + "identifier": [ + { + "use": "official", + "system": f"{dataservice_url}/{kf_id_system.lstrip('/').rstrip('/')}/", + "value": kf_id, + }, + { + "use": "secondary", + "system": ( + f"{dataservice_url}/{external_id_system.lstrip('/')}" + ), + "value": external_id, + }, + ], + } + + def _build(self, source_dir, 
output_dir, **kwargs): + """ + Implemented by subclasses. Called by build + """ + raise NotImplementedError( + "All entity builders must implement this method. This should " + "return a list of dicts representing the list of FHIR JSON" + ) + + def build(self, source_dir, output_dir, **kwargs): + """ + The main method to build Kids First FHIR resources + :param source_dir: Directory of source tables + :type source_dir: str + :param output_dir: Directory FHIR JSON will be written + :type output_dir: str + """ + self.logger.info(f"🧱 Building {self.entity_type} FHIR entities") + + try: + resources = self._build(source_dir, **kwargs) + except MissingSourceDataError: + return [] + + return resources diff --git a/d3b_api_client_cli/fhir/entity_builders/child_specimen/__init__.py b/d3b_api_client_cli/fhir/entity_builders/child_specimen/__init__.py new file mode 100644 index 0000000..730d511 --- /dev/null +++ b/d3b_api_client_cli/fhir/entity_builders/child_specimen/__init__.py @@ -0,0 +1,184 @@ +# noqa +""" +Module responsible for building FHIR JSON that captures the patient child +specimen info +""" + +import logging +import pandas + +from d3b_api_client_cli.config.concept_schema import CONCEPT +from d3b_api_client_cli.fhir.entity_builders.child_specimen.mapping import * +from d3b_api_client_cli.fhir.entity_builders.specimen_common import * +from d3b_api_client_cli.fhir.entity_builders.base import ( + FhirResourceBuilder, + set_id_prefix, + extension_age_at_event, +) + + +class ChildSpecimen(FhirResourceBuilder): + """ + Build FHIR Specimen from Dataservice Biospecimen. 
Link this Specimen to + the parent FHIR Specimen + """ + + sources = {"required": ["Biospecimen"], "optional": []} + resource_type = "Specimen" + id_prefix = set_id_prefix(resource_type) + + kf_id_col = CONCEPT.BIOSPECIMEN.TARGET_SERVICE_ID + external_id_col = CONCEPT.BIOSPECIMEN.ID + kf_id_system = "biospecimens" + external_id_system = f"{kf_id_system}?external_aliquot_id=" + + reference_builders = ["patient", "parent_specimen"] + + def _build_resource( + self, row, patient_reference, parent_specimen_reference + ): + """ + Build FHIR JSON from csv row + """ + # Initalize resource with basic content + resource = self.init_resource(row) + + consent_type = row.get(CONCEPT.BIOSPECIMEN.CONSENT_SHORT_NAME) + dbgap_consent_code = row.get( + CONCEPT.BIOSPECIMEN.DBGAP_STYLE_CONSENT_CODE + ) + external_sample_id = row.get(CONCEPT.SAMPLE.ID) + external_aliquot_id = row.get(CONCEPT.BIOSPECIMEN.ID) + analyte_type = row[CONCEPT.BIOSPECIMEN.ANALYTE] + event_age_days = row.get(CONCEPT.BIOSPECIMEN.EVENT_AGE_DAYS) + volume_ul = row.get(CONCEPT.BIOSPECIMEN.VOLUME_UL) + specimen_status = row.get(CONCEPT.BIOSPECIMEN.STATUS) + tissue_type = row.get(CONCEPT.BIOSPECIMEN.TISSUE_TYPE) + has_matched_normal_sample = row.get( + CONCEPT.BIOSPECIMEN.HAS_MATCHED_NORMAL_SAMPLE + ) + + preservation_method = row.get(CONCEPT.BIOSPECIMEN.PRESERVATION_METHOD) + + # Add references + participant_kf_id = row[CONCEPT.PARTICIPANT.TARGET_SERVICE_ID] + parent_kf_id = row[CONCEPT.SAMPLE.TARGET_SERVICE_ID] + resource["subject"] = patient_reference(participant_kf_id) + if parent_kf_id: + resource["parent"] = [parent_specimen_reference(parent_kf_id)] + + # Add other content + resource["status"] = status_mapping.get(specimen_status, "unavailable") + + # meta.tag + tags = [] + + # tissue_type + if tissue_type: + tags.append( + {"system": "urn:kids_first_tissue_type", "code": tissue_type} + ) + + # has_paired_normal + if has_matched_normal_sample is not None: + tags.append( + { + "system": 
"urn:kids_first_has_paired_normal", + "code": has_matched_normal_sample, + } + ) + # external_sample_id + if external_sample_id: + tags.append( + { + "system": "urn:kids_first_external_collection_id", + "code": external_sample_id, + } + ) + + if tags: + resource["meta"]["tag"].extend(tags) + + # meta.security + if consent_type: + resource["meta"].setdefault("security", []).append( + { + "system": "https://kf-api-dataservice.kidsfirstdrc.org/biospecimens?consent_type=", + "code": consent_type, + } + ) + if dbgap_consent_code: + resource["meta"].setdefault("security", []).append( + { + "system": "https://kf-api-dataservice.kidsfirstdrc.org/biospecimens?dbgap_consent_code=", + "code": dbgap_consent_code, + } + ) + + # type + if analyte_type == constants.COMMON.NOT_APPLICABLE: + analyte_type = row.get(CONCEPT.BIOSPECIMEN.COMPOSITION) + specimen_type = {"text": analyte_type} + if type_coding.get(analyte_type): + specimen_type["coding"] = type_coding[analyte_type] + resource["type"] = specimen_type + + # collection + collection = {} + + # collection.collectedDateTime + try: + collection["_collectedDateTime"] = extension_age_at_event( + patient_reference(participant_kf_id), event_age_days + ) + # collection.quantity + collection["quantity"] = { + "value": float(volume_ul), + "unit": "microliter", + "system": "http://unitsofmeasure.org", + "code": "uL", + } + except (ValueError, TypeError): + pass + + if collection: + resource["collection"] = collection + + # processing + processing = {} + + # preservation_method + if preservation_method_coding.get(preservation_method): + procedure = { + "coding": preservation_method_coding[preservation_method] + } + processing["procedure"] = procedure + + if processing: + resource["processing"] = processing + + return resource + + def _build(self, source_dir, **kwargs): + """ + See d3b_api_client_cli.fhir.entity_builders.base + """ + self.source_tables = self.import_source_data(source_dir) + df = list(self.source_tables.values())[0] + + 
builders = self.import_reference_builders() + patient_reference = builders["patient"].fhir_reference + parent_specimen_reference = builders["parent_specimen"].fhir_reference + + resources = [] + total = df.shape[0] + for i, row in df.iterrows(): + resource = self._build_resource( + row, patient_reference, parent_specimen_reference + ) + resources.append(resource) + self.logger.info( + f"Built {self.resource_type} {i+1}/{total}: {resource['id']}" + ) + + return resources diff --git a/d3b_api_client_cli/fhir/entity_builders/child_specimen/mapping.py b/d3b_api_client_cli/fhir/entity_builders/child_specimen/mapping.py new file mode 100644 index 0000000..3ea9d44 --- /dev/null +++ b/d3b_api_client_cli/fhir/entity_builders/child_specimen/mapping.py @@ -0,0 +1,22 @@ +# noqa + +""" +Map a string from d3b_api_client_cli.fhir.constants to +a FHIR coding type +""" + +from d3b_api_client_cli.fhir import constants + +from d3b_api_client_cli.fhir.entity_builders.specimen_common.mapping import * + +status_mapping = { + constants.SPECIMEN.STATUS.DISPOSED: "unavailable", + constants.SPECIMEN.STATUS.NOT_AVAILABLE: "unavailable", + constants.SPECIMEN.STATUS.ON_SITE: "available", + constants.SPECIMEN.STATUS.OTHER: "unavailable", + constants.SPECIMEN.STATUS.PATHOLOGY_GOVERNED: "unavailable", + constants.SPECIMEN.STATUS.SHIPPED: "unavailable", + constants.SPECIMEN.STATUS.SHIPPED_GENOMIC_DATA: "unavailable", + constants.SPECIMEN.STATUS.UNKNOWN: "unavailable", + constants.SPECIMEN.STATUS.VIRTUAL: "unavailable", +} diff --git a/d3b_api_client_cli/fhir/entity_builders/disease/__init__.py b/d3b_api_client_cli/fhir/entity_builders/disease/__init__.py new file mode 100644 index 0000000..4847a36 --- /dev/null +++ b/d3b_api_client_cli/fhir/entity_builders/disease/__init__.py @@ -0,0 +1,152 @@ +# noqa +""" +Module responsible for building FHIR JSON that captures the patient disease +info +""" + +import logging +import pandas + +from d3b_api_client_cli.config.concept_schema import CONCEPT +from 
d3b_api_client_cli.fhir.constants import MISSING_DATA_VALUES +from d3b_api_client_cli.fhir.entity_builders.disease.mapping import * +from d3b_api_client_cli.fhir.entity_builders.base import ( + FhirResourceBuilder, + set_id_prefix, + extension_age_at_event, +) + + +class Disease(FhirResourceBuilder): + """ + Build FHIR Condition from Dataservice Diagnosis + """ + + sources = {"required": ["Diagnosis"], "optional": []} + resource_type = "Condition" + id_prefix = set_id_prefix(resource_type) + + kf_id_col = CONCEPT.DIAGNOSIS.TARGET_SERVICE_ID + external_id_col = CONCEPT.DIAGNOSIS.ID + kf_id_system = "diagnoses" + external_id_system = f"{kf_id_system}?external_id=" + + reference_builders = ["patient"] + + def _build_resource(self, row, patient_reference): + """ + Build FHIR JSON from csv row + """ + # Initalize resource with basic content + resource = self.init_resource(row) + + # Add references + participant_kf_id = row[CONCEPT.PARTICIPANT.TARGET_SERVICE_ID] + resource["subject"] = patient_reference(participant_kf_id) + + name = row[CONCEPT.DIAGNOSIS.NAME] + mondo_id = row.get(CONCEPT.DIAGNOSIS.MONDO_ID) + icd_id = row.get(CONCEPT.DIAGNOSIS.ICD_ID) + ncit_id = row.get(CONCEPT.DIAGNOSIS.NCIT_ID) + tumor_location = row.get(CONCEPT.DIAGNOSIS.TUMOR_LOCATION) + uberon_id = row.get(CONCEPT.DIAGNOSIS.UBERON_TUMOR_LOCATION_ID) + event_age_days = row.get(CONCEPT.DIAGNOSIS.EVENT_AGE_DAYS) + + # Add other content + resource["meta"]["profile"] = [ + "https://ncpi-fhir.github.io/ncpi-fhir-ig/StructureDefinition/disease" + ] + resource.update( + { + "clinicalStatus": { + "coding": [ + { + "system": "http://terminology.hl7.org/CodeSystem/condition-clinical", + "code": "active", + "display": "Active", + } + ], + "text": "Active", + }, + "category": [ + { + "coding": [ + { + "system": "http://terminology.hl7.org/CodeSystem/condition-category", + "code": "encounter-diagnosis", + "display": "Encounter Diagnosis", + } + ] + } + ], + } + ) + # code + code = {"text": name} + if mondo_id 
and mondo_id not in MISSING_DATA_VALUES: + code.setdefault("coding", []).append( + { + "system": "http://purl.obolibrary.org/obo/mondo.owl", + "code": mondo_id, + } + ) + if icd_id and icd_id not in MISSING_DATA_VALUES: + code.setdefault("coding", []).append( + { + "system": "https://www.who.int/classifications/classification-of-diseases", + "code": icd_id, + } + ) + if ncit_id and ncit_id not in MISSING_DATA_VALUES: + code.setdefault("coding", []).append( + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": ncit_id, + } + ) + resource["code"] = code + + # bodySite + body_site = {} + if tumor_location: + body_site["text"] = tumor_location + if uberon_id and uberon_id not in MISSING_DATA_VALUES: + body_site.setdefault("coding", []).append( + { + "system": "http://purl.obolibrary.org/obo/uberon.owl", + "code": uberon_id, + } + ) + if body_site: + resource.setdefault("bodySite", []).append(body_site) + + # recordedDate + try: + resource["_recordedDate"] = extension_age_at_event( + patient_reference(participant_kf_id), event_age_days + ) + except (ValueError, TypeError): + pass + + return resource + + def _build(self, source_dir, **kwargs): + """ + See d3b_api_client_cli.fhir.entity_builders.base + """ + self.source_tables = self.import_source_data(source_dir) + builders = self.import_reference_builders() + patient_reference = builders["patient"].fhir_reference + + df = list(self.source_tables.values())[0] + + resources = [] + total = df.shape[0] + for i, row in df.iterrows(): + resource = self._build_resource(row, patient_reference) + resources.append(resource) + self.logger.info( + f"Built {self.resource_type} {i+1}/{total}: {resource['id']}" + ) + + return resources diff --git a/d3b_api_client_cli/fhir/entity_builders/disease/mapping.py b/d3b_api_client_cli/fhir/entity_builders/disease/mapping.py new file mode 100644 index 0000000..4d40879 --- /dev/null +++ b/d3b_api_client_cli/fhir/entity_builders/disease/mapping.py @@ -0,0 +1,8 @@ +# noqa + +""" 
def _build_content_list(
    resource, file_name, urls, drs_uri, hash_dict, file_format, size
):
    """
    Build the content list with file metadata for the resource

    The returned list holds, in order: an entry for the s3 url (if any),
    one entry per additional url that differs from drs_uri, and finally the
    DRS content entry carrying format, size, hashes, DRS uri and title.

    When drs_uri is set, a ``urn:drs_hostname`` tag with the uri's host is
    appended to ``resource["meta"]["tag"]`` (resource is modified in place).

    :param resource: FHIR resource being built. Must have meta.tag
    :type resource: dict
    :param file_name: file name for the attachment title. Falls back to the
    last path segment of the s3 url
    :type file_name: str
    :param urls: dict with keys ``s3_url`` and ``others``
    :type urls: dict
    :param drs_uri: DRS uri of the file
    :type drs_uri: str
    :param hash_dict: hash algorithm -> hash value, or the string repr of
    such a dict
    :type hash_dict: dict or str
    :param file_format: file format code
    :type file_format: str
    :param size: file size. Skipped when it cannot be converted to int
    :type size: int or str
    :returns: list of DocumentReference content entries
    :rtype: list of dict
    """
    # content
    content_list = []

    # DRS content
    content = {}

    # format
    if file_format and (file_format not in MISSING_DATA_VALUES):
        content["format"] = {
            "system": file_format_coding["system"],
            "code": file_format,
            "display": file_format,
        }

    # attachment
    attachment = {}

    # size
    # Convert first and only then touch the attachment, so a bad size does
    # not leave an empty "extension" list behind
    try:
        file_size = int(size)
    except (TypeError, ValueError, OverflowError):
        file_size = None
    if file_size is not None:
        attachment.setdefault("extension", []).append(
            {
                "url": "https://nih-ncpi.github.io/ncpi-fhir-ig/StructureDefinition/file-size",
                "valueDecimal": file_size,
            }
        )

    # hash
    if hash_dict:
        if isinstance(hash_dict, str):
            hash_dict = ast.literal_eval(hash_dict)
        for algorithm, hash_value in hash_dict.items():
            attachment.setdefault("extension", []).append(
                {
                    "url": "https://nih-ncpi.github.io/ncpi-fhir-ig/StructureDefinition/hashes",
                    "valueCodeableConcept": {
                        "coding": [{"display": algorithm}],
                        "text": hash_value,
                    },
                }
            )

    # DRS URI
    if drs_uri:
        attachment["url"] = drs_uri

        # Add tag for file source
        tag = {
            "system": "urn:drs_hostname",
            "code": urlparse(drs_uri).netloc,
        }
        resource["meta"]["tag"].append(tag)

    # File name
    s3_url = urls["s3_url"]
    if not file_name and s3_url:
        file_name = os.path.split(s3_url)[-1]

    if file_name:
        attachment["title"] = file_name

    # s3 url
    if s3_url:
        content_list.append({"attachment": {"url": s3_url}})

    # Additional urls
    for url in urls["others"] or []:
        if url != drs_uri:
            content_list.append({"attachment": {"url": url}})

    if attachment:
        content["attachment"] = attachment

    if content:
        content_list.append(content)

    return content_list
def _mark_index_files(self, df):
    """
    Mark which rows are index files in the table

    Adds these columns to a de-duplicated copy of ``df``:
    - ``is_index``: True when the last dot-separated part of the file name
      ends with "i"
    - ``<genomic file KF ID column>_REF``: KF ID of a non-index file whose
      name equals this row's name with the index extension removed. This
      is used in the drs_document_reference_index builder to create the
      reference to the drs_document_reference

    :param df: table with one or more rows per genomic file
    :returns: table with one row per genomic file KF ID
    """
    gf_kf_id = CONCEPT.GENOMIC_FILE.TARGET_SERVICE_ID
    gf_filename = CONCEPT.GENOMIC_FILE.FILE_NAME
    gf_url_list = CONCEPT.GENOMIC_FILE.URL_LIST
    gf_ref = CONCEPT.GENOMIC_FILE.TARGET_SERVICE_ID + "_REF"

    # Drop duplicate gfs. Copy so the column assignments below act on an
    # independent frame rather than on a slice of the caller's table.
    df = df.drop_duplicates(gf_kf_id).copy()

    # If we don't have file metadata from Indexd then we cannot
    # figure out which index files go with which genomic files since
    # we need the genomic filename. Mark all the files as normal files.
    # This has to be checked before the filename column is (re)created
    # below, otherwise the column always exists and the check never fires.
    if gf_filename not in df.columns and gf_url_list not in df.columns:
        df[CONCEPT.GENOMIC_INDEX_FILE.TARGET_SERVICE_ID] = None
        df["is_index"] = False
        return df

    # Fix filenames if needed
    def fix_file_name(row):
        file_name = row.get(gf_filename)
        url_list = row.get(gf_url_list)
        if file_name or not url_list:
            return file_name

        # Try getting file name from s3 url
        if isinstance(url_list, str):
            url_list = ast.literal_eval(url_list)
        for url in url_list:
            if url.startswith("s3://"):
                # Last path component of the first S3 url
                return os.path.split(url)[-1]

        return file_name

    df[gf_filename] = df.apply(fix_file_name, axis=1)

    # Mark index files
    logger.info("Mark index files")
    df["is_index"] = df[gf_filename].apply(
        lambda file_name: (
            file_name.split(".")[-1].endswith("i") if file_name else False
        )
    )

    # Get file name without index extension
    logger.info("Get filename without extension")
    df["without_extension"] = df.apply(
        lambda row: (
            ".".join(row[gf_filename].split(".")[0:-1])
            if row[gf_filename] and row["is_index"]
            else row[gf_filename]
        ),
        axis=1,
    )

    # Mark non-index file KF_ID column as something different so
    # we can merge it into the original df. After merge we should
    # see that index file rows have a referenced non-index file KF ID
    logger.info("Collect non-index files")
    non_index_files = (
        df.loc[~df["is_index"].astype(bool), [gf_kf_id, "without_extension"]]
        .rename(columns={gf_kf_id: gf_ref})
        .drop_duplicates(gf_ref)
    )

    logger.info(
        f"Left merging files together. All gfs {df.shape}, non-index"
        f" gfs {non_index_files.shape}"
    )
    logger.info("Set index")
    df = df.set_index("without_extension")
    logger.info("Complete set index")

    # A name can match several non-index files; keep one row per genomic file
    df = pandas.merge(
        df, non_index_files, how="left", on="without_extension"
    ).drop_duplicates(gf_kf_id)

    logger.info("Finished merge")

    return df
def _build_resource(self, row, patient_reference, specimen_reference):
    """
    Build a DRS DocumentReference resource from a CSV row

    :param row: mapping-like row (supports ``.get``) holding the genomic
        file columns plus the "PARTICIPANT|LIST" and "BIOSPECIMEN|LIST"
        lists
    :param patient_reference: callable turning a participant KF ID into the
        value stored in ``resource["subject"]``
    :param specimen_reference: callable turning a biospecimen KF ID into an
        entry of ``resource["context"]["related"]``
    :returns: the resource dict created by ``self.init_resource`` with
        profile, subject, context, status, type, category, securityLabel
        and content filled in
    """
    # Initialize resource with basic content
    resource = self.init_resource(row)
    strategy = row.get(CONCEPT.SEQUENCING.STRATEGY)
    controlled_access = row.get(CONCEPT.GENOMIC_FILE.CONTROLLED_ACCESS)
    data_type = row.get(CONCEPT.GENOMIC_FILE.DATA_TYPE)
    file_format = row.get(CONCEPT.GENOMIC_FILE.FILE_FORMAT)
    file_name = row.get(CONCEPT.GENOMIC_FILE.FILE_NAME)
    hash_dict = row.get(CONCEPT.GENOMIC_FILE.HASH_DICT)
    size = row.get(CONCEPT.GENOMIC_FILE.SIZE)
    acl_list = set_authorization(row)

    drs_uri = row.get(CONCEPT.GENOMIC_FILE.DRS_URI)
    urls = _extract_urls(row, resource)

    resource["meta"]["profile"] = [
        "https://ncpi-fhir.github.io/ncpi-fhir-ig/StructureDefinition/ncpi-drs-document-reference"
    ]

    # TEMPORARY: Impute data_type
    index_capable_types = {
        "Simple Nucleotide Variations",
        constants.GENOMIC_FILE.DATA_TYPE.SOMATIC_STRUCTURAL_VARIATIONS,
    }
    if data_type in index_capable_types and file_format == "tbi":
        data_type = f"{data_type} Index"

    # -- Add references --
    # Patient Reference
    # NOTE - This approach only works for DRS document references that
    # link to 1 participant. We must find a better way in the future
    participant_kf_id = row.get("PARTICIPANT|LIST")[0]
    resource["subject"] = patient_reference(participant_kf_id)

    # Biospecimen Reference
    resource.setdefault("context", {})["related"] = [
        specimen_reference(biospecimen_kf_id)
        for biospecimen_kf_id in row.get("BIOSPECIMEN|LIST")
    ]

    # Add other content
    resource["status"] = "current"
    resource["docStatus"] = "final"

    # type
    if data_type:
        doc_type = {"text": data_type}
        if type_coding.get(data_type):
            doc_type.setdefault("coding", []).append(type_coding[data_type])
        resource["type"] = doc_type

    # category
    category = []
    if strategy:
        # Experimental strategy
        experimental_strategy = {"text": strategy}
        if experimental_strategy_coding.get(strategy):
            experimental_strategy.setdefault("coding", []).append(
                experimental_strategy_coding[strategy]
            )
        category.append(experimental_strategy)

        # Data Category
        # NOTE(review): the text here is the sequencing strategy, not the
        # category name - confirm this is intended
        data_cateogry = {"text": strategy}
        if data_cateogry_coding.get(strategy):
            data_cateogry.setdefault("coding", []).append(
                data_cateogry_coding[strategy]
            )
        category.append(data_cateogry)
    if category:
        resource["category"] = category

    # securityLabel
    # A boolean False is a real answer ("false" has its own coding), so it
    # must not be folded into "not reported" the way None/"" are.
    access_text = str(controlled_access).lower()
    if access_text not in data_access_coding and not controlled_access:
        access_text = str(COMMON.NOT_REPORTED).lower()
    security_label_list = [
        {
            "text": access_text,
            "coding": [
                data_access_coding.get(access_text)
                or data_access_coding.get("default")
            ],
        }
    ]
    for acl in acl_list or []:
        security_label = {"text": acl}
        acl_parts = acl.split(".")
        if len(acl_parts) > 1:
            # Second dot-separated part is recorded as the consent code
            security_label.setdefault("coding", []).append(
                {
                    "code": acl_parts[1],
                    "system": "urn:dbgap_consent_code",
                }
            )
        security_label_list.append(security_label)
    resource["securityLabel"] = security_label_list

    # content
    content_list = _build_content_list(
        resource, file_name, urls, drs_uri, hash_dict, file_format, size
    )
    if content_list:
        resource["content"] = content_list

    return resource
# https://includedcc.org/fhir/code-systems/data_types
_DATA_TYPE_SYSTEM = "https://includedcc.org/fhir/code-systems/data_types"


def _data_type_coding(code, display=None):
    """
    Return a new FHIR coding dict in the data_types code system

    :param code: value of the coding's "code"
    :param display: value of the coding's "display"; defaults to ``code``
        with every hyphen replaced by a space
    """
    return {
        "system": _DATA_TYPE_SYSTEM,
        "code": code,
        "display": code.replace("-", " ") if display is None else display,
    }


# Maps a source data type string to its coding. Every key is listed once;
# several source strings intentionally share the same code.
type_coding = {
    # Non-index file data types
    "Aligned Read": _data_type_coding("Aligned-Reads"),
    constants.GENOMIC_FILE.DATA_TYPE.ALIGNED_READS: _data_type_coding(
        "Aligned-Reads"
    ),
    constants.GENOMIC_FILE.DATA_TYPE.ALIGNED_READS_INDEX: _data_type_coding(
        "Aligned-Reads-Index"
    ),
    "Alternative Splicing": _data_type_coding("Alternative-Splicing"),
    "Annotated Gene Fusion": _data_type_coding("Annotated-Gene-Fusion"),
    "Annotated Germline Structural Variation": _data_type_coding(
        "Germline-Structural-Variations"
    ),
    "Annotated Somatic Copy Number Segment": _data_type_coding(
        "Somatic-Copy-Number-Variations"
    ),
    "Annotated Somatic Mutation": _data_type_coding(
        "Simple-Nucleotide-Variations"
    ),
    "Annotated Somatic Mutation Index": _data_type_coding(
        "Simple-Nucleotide-Variations-Index"
    ),
    "Annotated Somatic Mutations": _data_type_coding(
        "Simple-Nucleotide-Variations"
    ),
    "Annotated Variant Call": _data_type_coding("Simple-Nucleotide-Variations"),
    "Annotated Variant Call Index": _data_type_coding(
        "Simple-Nucleotide-Variations-Index"
    ),
    "Consensus Somatic Mutation": _data_type_coding(
        "Simple-Nucleotide-Variations"
    ),
    "Consensus Somatic Mutation Index": _data_type_coding(
        "Simple-Nucleotide-Variations-Index"
    ),
    constants.GENOMIC_FILE.DATA_TYPE.EXPRESSION: _data_type_coding(
        "Gene-Expression-Quantifications"
    ),
    # Display keeps its hyphen, so it cannot be derived from the code
    "Extra-Chromosomal DNA": _data_type_coding(
        "Extra-Chromosomal-DNA", "Extra-Chromosomal DNA"
    ),
    "Familial Relationship Report": _data_type_coding(
        "Familial-Relationship-Report"
    ),
    "Gene Expression": _data_type_coding("Gene-Expression-Quantifications"),
    "Gene Expression Quantification": _data_type_coding(
        "Gene-Expression-Quantifications"
    ),
    constants.GENOMIC_FILE.DATA_TYPE.GENE_FUSIONS: _data_type_coding(
        "Gene-Fusions"
    ),
    "Gene Level Copy Number": _data_type_coding("Gene-Level-Copy-Number"),
    "Genome Aligned Read": _data_type_coding("Aligned-Reads"),
    "Genome Aligned Read Index": _data_type_coding("Aligned-Reads-Index"),
    "Genomic Variant": _data_type_coding("gVCF"),
    "Genomic Variant Index": _data_type_coding("gVCF-Index"),
    constants.GENOMIC_FILE.DATA_TYPE.GVCF: _data_type_coding("gVCF"),
    constants.GENOMIC_FILE.DATA_TYPE.GVCF_INDEX: _data_type_coding(
        "gVCF-Index"
    ),
    "Isoform Expression": _data_type_coding(
        "Isoform-Expression-Quantifications"
    ),
    "Isoform Expression Quantifications": _data_type_coding(
        "Isoform-Expression-Quantifications"
    ),
    "Masked Consensus Somatic Mutation": _data_type_coding(
        "Somatic-Simple-Nucleotide-Variations"
    ),
    "Masked Consensus Somatic Mutation Index": _data_type_coding(
        "Somatic-Simple-Nucleotide-Variations-Index"
    ),
    "Masked Somatic Mutation": _data_type_coding(
        "Somatic-Simple-Nucleotide-Variations"
    ),
    "Masked Somatic Mutation Index": _data_type_coding(
        "Somatic-Simple-Nucleotide-Variations-Index"
    ),
    "Pre-pass Somatic Structural Variation": _data_type_coding(
        "Somatic-Structural-Variations"
    ),
    "Pre-pass Somatic Structural Variation Index": _data_type_coding(
        "Somatic-Structural-Variations-Index"
    ),
    "Raw Gene Fusion": _data_type_coding("Raw-Gene-Fusions"),
    "Raw Germline Structural Variation": _data_type_coding(
        "Germline-Structural-Variations"
    ),
    "Raw Germline Structural Variation Index": _data_type_coding(
        "Germline-Structural-Variations-Index"
    ),
    "Raw Simple Somatic Mutation": _data_type_coding(
        "Somatic-Simple-Nucleotide-Variations"
    ),
    "Raw Simple Somatic Mutation Index": _data_type_coding(
        "Somatic-Simple-Nucleotide-Variations-Index"
    ),
    "Raw Somatic Copy Number Segment": _data_type_coding(
        "Somatic-Copy-Number-Variations"
    ),
    "Raw Somatic Structural Variation": _data_type_coding(
        "Somatic-Structural-Variations"
    ),
    "Raw Somatic Structural Variation Index": _data_type_coding(
        "Somatic-Structural-Variations-Index"
    ),
    "Simple Nucleotide Variations": _data_type_coding(
        "Somatic-Simple-Nucleotide-Variations"
    ),
    "Somatic Copy Number Variation": _data_type_coding(
        "Somatic-Copy-Number-Variations"
    ),
    constants.GENOMIC_FILE.DATA_TYPE.SOMATIC_COPY_NUMBER_VARIATIONS: _data_type_coding(
        "Somatic-Copy-Number-Variations"
    ),
    "Somatic Structural Variation": _data_type_coding(
        "Somatic-Structural-Variations"
    ),
    constants.GENOMIC_FILE.DATA_TYPE.SOMATIC_STRUCTURAL_VARIATIONS: _data_type_coding(
        "Somatic-Structural-Variations"
    ),
    "Transcriptome Aligned Read": _data_type_coding("Aligned-Reads"),
    "Tumor Copy Number Segment": _data_type_coding("Tumor-Copy-Number-Segment"),
    # NOTE(review): this code contains spaces while every other code is
    # hyphenated - confirm against the code system
    "Tumor Ploidy and Purity": _data_type_coding("Tumor Ploidy and Purity"),
    "Unaligned Reads": _data_type_coding("Unaligned-Reads"),
    constants.GENOMIC_FILE.DATA_TYPE.VARIANT_CALLS: _data_type_coding(
        "Variant-Calls"
    ),
    # Index File Data Types
    constants.GENOMIC_FILE.DATA_TYPE.VARIANT_CALLS_INDEX: _data_type_coding(
        "Variant-Calls-Index"
    ),
    # Metric File types
    "HLA Genotyping": _data_type_coding("HLA-Genotyping"),
    "Artifact Metrics": _data_type_coding("Artifact-Metrics"),
    "Cutadapter Metrics": _data_type_coding("Cutadapter-Metrics"),
    "GC Metrics": _data_type_coding("GC-Metrics"),
    "Gender Metrics": _data_type_coding("Gender-Metrics"),
    "Gender QC Metrics": _data_type_coding("Gender-QC-Metrics"),
    "Het Call QC Metrics": _data_type_coding("Het-Call-QC-Metrics"),
    # NOTE(review): maps to the Cutadapter code, which looks like a
    # copy/paste slip - confirm the intended code before changing it
    "Insert Size Metrics": _data_type_coding("Cutadapter-Metrics"),
    "QC Metrics": _data_type_coding("QC-Metrics"),
    "Relatedness QC Metrics": _data_type_coding("Relatedness-QC-Metrics"),
    "RNAseq Alignment Metrics": _data_type_coding("RNAseq-Alignment-Metrics"),
    "Somatic Copy Number Metrics": _data_type_coding(
        "Somatic-Copy-Number-Metrics"
    ),
    "Variant Calling Metrics": _data_type_coding("Variant-Calling-Metrics"),
    "WGS Metrics": _data_type_coding("WGS-Metrics"),
    "WXS Metrics": _data_type_coding("WXS-Metrics"),
    "Genome Aligned Reads": _data_type_coding("Genome-Aligned-Reads"),
    "Genome Aligned Reads Index": _data_type_coding(
        "Genome-Aligned-Reads-Index"
    ),
}
# https://includedcc.org/fhir/code-systems/data_categories
def _data_category_coding(name):
    """
    Return a new coding dict in the data_categories code system whose code
    and display are both ``name``
    """
    return {
        "system": "https://includedcc.org/fhir/code-systems/data_categories",
        "code": name,
        "display": name,
    }


# Sequencing strategy -> data category coding
data_cateogry_coding = {
    strategy: _data_category_coding(category_name)
    for strategy, category_name in (
        (constants.SEQUENCING.STRATEGY.LINKED_WGS, "Genomics"),
        (constants.SEQUENCING.STRATEGY.METHYL, "Genomics"),
        (constants.SEQUENCING.STRATEGY.MRNA, "Transcriptomics"),
        ("scRNA-Seq", "Transcriptomics"),
        ("snRNA-Seq", "Transcriptomics"),
        (constants.SEQUENCING.STRATEGY.RNA, "Transcriptomics"),
        (constants.SEQUENCING.STRATEGY.TARGETED, "Genomics"),
        (constants.SEQUENCING.STRATEGY.WGS, "Genomics"),
        (constants.SEQUENCING.STRATEGY.WXS, "Genomics"),
        # long reads strategies
        ("ONT WGS", "Genomics"),
        ("Circular Consensus Sequencing WGS", "Genomics"),
        ("Circular Consensus Sequencing RNA-Seq", "Transcriptomics"),
        ("Continuous Long Reads WGS", "Genomics"),
        ("Continuous Long Reads RNA-Seq", "Transcriptomics"),
    )
}
+ "display": "Registered", + }, + "default": { + "system": "https://includedcc.org/fhir/code-systems/data_access_types", + "code": "controlled", + "display": "Controlled", + }, +} + +file_format_coding = { + "system": "https://includedcc.org/fhir/code-systems/file_formats", +} diff --git a/d3b_api_client_cli/fhir/entity_builders/drs_document_reference/__init__.py b/d3b_api_client_cli/fhir/entity_builders/drs_document_reference/__init__.py new file mode 100644 index 0000000..29dd3f5 --- /dev/null +++ b/d3b_api_client_cli/fhir/entity_builders/drs_document_reference/__init__.py @@ -0,0 +1,25 @@ +# noqa + +import logging +import pandas + +from d3b_api_client_cli import utils +from d3b_api_client_cli.config.concept_schema import CONCEPT +from d3b_api_client_cli.fhir.entity_builders.drs_document_reference.mapping import * +from d3b_api_client_cli.fhir.entity_builders.drs_doc_ref_common import ( + DocumentReferenceBase, +) + +logger = logging.getLogger(__name__) + + +class DrsDocumentReference(DocumentReferenceBase): + """ + Build FHIR DocumentReference from Dataservice GenomicFile + """ + + def _build(self, source_dir, **kwargs): + """ + See d3b_api_client_cli.fhir.entity_builders.base + """ + return super()._build(source_dir, only_index_files=False) diff --git a/d3b_api_client_cli/fhir/entity_builders/drs_document_reference/mapping.py b/d3b_api_client_cli/fhir/entity_builders/drs_document_reference/mapping.py new file mode 100644 index 0000000..4d40879 --- /dev/null +++ b/d3b_api_client_cli/fhir/entity_builders/drs_document_reference/mapping.py @@ -0,0 +1,8 @@ +# noqa + +""" +Map a string from d3b_api_client_cli.fhir.constants to +a FHIR coding type +""" + +from d3b_api_client_cli.fhir import constants diff --git a/d3b_api_client_cli/fhir/entity_builders/drs_document_reference_index/__init__.py b/d3b_api_client_cli/fhir/entity_builders/drs_document_reference_index/__init__.py new file mode 100644 index 0000000..a90e4b2 --- /dev/null +++ 
class DrsDocumentReferenceIndex(DocumentReferenceBase):
    """
    Build FHIR DocumentReference resources for Dataservice GenomicFile
    index files (e.g. .crai, .bai)
    """

    def _modify_resource(self, resource, row):
        """
        Link an index file's resource to the non-index file it refers to

        A "transforms" entry is appended to ``relatesTo`` only when the row
        holds both the index file's own ID and the ID found in the
        ``<target service id>_REF`` column; otherwise the resource is
        returned unchanged.

        :param resource: FHIR resource dict to update in place
        :param row: source row for the index file (dict-like, supports .get)
        :returns: the same resource dict
        """
        id_col = CONCEPT.GENOMIC_FILE.TARGET_SERVICE_ID
        own_id = row.get(id_col)
        referenced_id = row.get(id_col + "_REF")

        # Nothing to link unless both ends of the relationship are known
        if pandas.isnull(own_id) or pandas.isnull(referenced_id):
            return resource

        relations = resource.setdefault("relatesTo", [])
        relations.append(
            {
                "code": "transforms",
                "target": self.fhir_reference(referenced_id),
            }
        )
        return resource

    def _build(self, source_dir, **kwargs):
        """
        See d3b_api_client_cli.fhir.entity_builders.base

        Restricts the shared build to index files. Extra keyword arguments
        are accepted but not forwarded to the parent implementation.
        """
        return super()._build(source_dir, only_index_files=True)
# --- d3b_api_client_cli/fhir/entity_builders/family/__init__.py ---


class Family(FhirResourceBuilder):
    """
    Build FHIR Group from Dataservice Participant and Family
    """

    sources = {"required": ["Participant", "Family"], "optional": []}
    resource_type = "Group"
    id_prefix = set_id_prefix(resource_type)

    kf_id_col = CONCEPT.FAMILY.TARGET_SERVICE_ID
    external_id_col = CONCEPT.FAMILY.ID
    kf_id_system = "families"
    external_id_system = f"{kf_id_system}?external_id="

    reference_builders = ["patient"]

    def _transform(self, source_tables):
        """
        Do necessary transformations to source tables before FHIR resource
        creation. Return a resulting table as a list of dicts/rows.

        :param source_tables: dict of DataFrames keyed by source name; the
            "Participant" and "Family" tables are read
        :returns: one dict per family holding the family IDs, the first
            species value seen in the family and the array of unique member
            participant IDs
        """
        participant_df = source_tables["Participant"]
        family_df = source_tables["Family"]

        participant_cols = [
            CONCEPT.FAMILY.TARGET_SERVICE_ID,
            CONCEPT.PARTICIPANT.TARGET_SERVICE_ID,
            CONCEPT.PARTICIPANT.SPECIES,
        ]

        # Merge participants with families
        df = utils.merge_wo_duplicates(
            participant_df[participant_cols],
            family_df,
            on=CONCEPT.FAMILY.TARGET_SERVICE_ID,
        )
        # Group rows by family ID so we can have one row per family where
        # that row will have a list of family members
        rows = [
            {
                CONCEPT.FAMILY.TARGET_SERVICE_ID: family_id,
                CONCEPT.FAMILY.ID: group.get(CONCEPT.FAMILY.ID).unique()[0],
                CONCEPT.PARTICIPANT.SPECIES: group.get(
                    CONCEPT.PARTICIPANT.SPECIES
                ).unique()[0],
                CONCEPT.PARTICIPANT.TARGET_SERVICE_ID: group.get(
                    CONCEPT.PARTICIPANT.TARGET_SERVICE_ID
                ).unique(),
            }
            for family_id, group in df.groupby(CONCEPT.FAMILY.TARGET_SERVICE_ID)
        ]
        return rows

    def _build_resource(self, row, patient_reference):
        """
        Build FHIR JSON from a transformed family row

        :param row: one dict produced by _transform
        :param patient_reference: callable mapping a participant ID to a
            FHIR reference
        :returns: the Group resource dict
        """
        # Initialize resource with basic content
        resource = self.init_resource(row)
        species = row.get(CONCEPT.PARTICIPANT.SPECIES)

        # Add references
        members = [
            {
                "entity": patient_reference(participant_kf_id),
                "inactive": False,
            }
            for participant_kf_id in row.get(
                CONCEPT.PARTICIPANT.TARGET_SERVICE_ID, []
            )
        ]
        # quantity/member are only emitted for non-empty families
        if members:
            resource["quantity"] = len(members)
            resource["member"] = members

        # Add other content
        resource.update(
            {
                # Species without a mapping fall back to "person"
                "type": type_code.get(species) or "person",
                "actual": True,
                "code": {
                    "coding": [
                        {
                            "system": "http://terminology.hl7.org/CodeSystem/v3-RoleCode",
                            "code": "FAMMEMB",
                            "display": "family member",
                        },
                    ]
                },
            }
        )

        return resource

    def _build(self, source_dir, **kwargs):
        """
        See d3b_api_client_cli.fhir.entity_builders.base
        """
        self.source_tables = self.import_source_data(source_dir)

        # Transform the source tables before processing to collect
        # family members in each family row
        rows = self._transform(self.source_tables)

        builders = self.import_reference_builders()
        patient_reference = builders["patient"].fhir_reference

        resources = []
        total = len(rows)
        for count, row in enumerate(rows, start=1):
            resource = self._build_resource(row, patient_reference)
            resources.append(resource)
            self.logger.info(
                f"Built {self.resource_type} {count}/{total}: {resource['id']}"
            )

        return resources


# --- d3b_api_client_cli/fhir/entity_builders/family/mapping.py ---

# Species constant -> FHIR Group.type code
type_code = {
    constants.SPECIES.DOG: "animal",
    constants.SPECIES.FLY: "animal",
    constants.SPECIES.HUMAN: "person",
    constants.SPECIES.MOUSE: "animal",
}


# --- d3b_api_client_cli/fhir/entity_builders/family_relationship/__init__.py ---


class FamilyRelationship(FhirResourceBuilder):
    """
    Build FHIR Observation from Dataservice FamilyRelationship
    """

    sources = {"required": ["FamilyRelationship"], "optional": []}
    resource_type = "Observation"
    id_prefix = set_id_prefix(resource_type)

    kf_id_col = CONCEPT.FAMILY_RELATIONSHIP.TARGET_SERVICE_ID
    external_id_col = CONCEPT.FAMILY_RELATIONSHIP.ID
    kf_id_system = "family_relationships"
    external_id_system = f"{kf_id_system}?external_id="

    reference_builders = ["patient"]

    def _build_resource(self, row, patient_reference):
        """
        Build FHIR JSON from csv row

        :param row: FamilyRelationship source row
        :param patient_reference: callable mapping a participant ID to a
            FHIR reference
        :returns: the Observation resource dict
        :raises KeyError: if the relation or either person ID column is
            missing from the row
        """
        # Initialize resource with basic content
        resource = self.init_resource(row)
        relation_from_1_to_2 = row[
            CONCEPT.FAMILY_RELATIONSHIP.RELATION_FROM_1_TO_2
        ]

        # Add references
        person1_kf_id = row[
            CONCEPT.FAMILY_RELATIONSHIP.PERSON1.TARGET_SERVICE_ID
        ]
        person2_kf_id = row[
            CONCEPT.FAMILY_RELATIONSHIP.PERSON2.TARGET_SERVICE_ID
        ]
        resource["subject"] = patient_reference(person1_kf_id)
        resource["focus"] = [patient_reference(person2_kf_id)]

        # Add other content
        # NOTE(review): relies on init_resource having created "meta"
        resource["meta"]["profile"] = [
            "https://ncpi-fhir.github.io/ncpi-fhir-ig/StructureDefinition/family-relationship"
        ]
        resource["status"] = "final"
        resource["code"] = {
            "coding": [
                {
                    "system": "http://terminology.hl7.org/CodeSystem/v3-RoleCode",
                    "code": "FAMMEMB",
                    "display": "family member",
                }
            ],
            "text": "Family Relationship",
        }
        # valueCodeableConcept
        # NaN is truthy, so check for missing values explicitly; otherwise a
        # blank cell would be emitted as {"text": nan}
        if pandas.notnull(relation_from_1_to_2) and relation_from_1_to_2:
            value = {"text": relation_from_1_to_2}
            coding = code_coding.get(relation_from_1_to_2)
            if coding:
                value["coding"] = [coding]
            resource["valueCodeableConcept"] = value

        return resource

    def _build(self, source_dir, **kwargs):
        """
        See d3b_api_client_cli.fhir.entity_builders.base
        """
        self.source_tables = self.import_source_data(source_dir)
        df = list(self.source_tables.values())[0]

        builders = self.import_reference_builders()
        patient_reference = builders["patient"].fhir_reference

        resources = []
        total = df.shape[0]
        # Count positions ourselves: the DataFrame index label is not
        # guaranteed to be a 0-based integer
        for count, (_, row) in enumerate(df.iterrows(), start=1):
            resource = self._build_resource(row, patient_reference)
            resources.append(resource)
            self.logger.info(
                f"Built {self.resource_type} {count}/{total}: {resource['id']}"
            )

        return resources


# --- d3b_api_client_cli/fhir/entity_builders/family_relationship/mapping.py ---

_ROLE_CODE_SYSTEM = "http://terminology.hl7.org/CodeSystem/v3-RoleCode"
_NULL_FLAVOR_SYSTEM = "http://terminology.hl7.org/CodeSystem/v3-NullFlavor"


def _role(code, display, system=_ROLE_CODE_SYSTEM):
    """
    Return a new FHIR coding dict for a family member role
    """
    return {"system": system, "code": code, "display": display}


def _extended():
    """
    Return a new coding for relations without a more specific role code
    """
    return _role("EXT", "extended family member")


# http://terminology.hl7.org/ValueSet/v3-FamilyMember
code_coding = {
    "Aunt": _role("AUNT", "aunt"),
    constants.RELATIONSHIP.BROTHER: _role("BRO", "brother"),
    "Brother-in-law": _role("BROINLAW", "brother-in-law"),
    "Brother-Monozygotic Twin": _role("TWINBRO", "twin brother"),
    constants.RELATIONSHIP.CHILD: _role("CHILD", "child"),
    "Cousin": _role("COUSN", "cousin"),
    constants.RELATIONSHIP.DAUGHTER: _role("DAUC", "daughter"),
    "father": _role("FTH", "father"),
    constants.RELATIONSHIP.FATHER: _role("FTH", "father"),
    "First cousin once removed": _extended(),
    constants.RELATIONSHIP.GRANDCHILD: _role("GRNDCHILD", "grandchild"),
    constants.RELATIONSHIP.GRANDDAUGHTER: _role("GRNDDAU", "granddaughter"),
    constants.RELATIONSHIP.GRANDFATHER: _role("GRFTH", "grandfather"),
    constants.RELATIONSHIP.GRANDMOTHER: _role("GRMTH", "grandmother"),
    constants.RELATIONSHIP.GRANDSON: _role("GRNDSON", "grandson"),
    "Great Nephew": _extended(),
    constants.RELATIONSHIP.HUSBAND: _role("HUSB", "husband"),
    "Married in aunt": _extended(),
    "Married in Husband": _role("HUSB", "husband"),
    "Married in-Spouse": _role("SPS", "spouse"),
    "Maternal aunt": _role("MAUNT", "maternal aunt"),
    "Maternal Aunt": _role("MAUNT", "maternal aunt"),
    "Maternal cousin": _role("MCOUSN", "maternal cousin"),
    "Maternal Cousin": _role("MCOUSN", "maternal cousin"),
    "Maternal grandfather": _role("MGRFTH", "maternal grandfather"),
    constants.RELATIONSHIP.MATERNAL_GRANDDAUGHTER: _role(
        "GRNDDAU", "granddaughter"
    ),
    constants.RELATIONSHIP.MATERNAL_GRANDFATHER: _role(
        "MGRFTH", "maternal grandfather"
    ),
    "Maternal grandmother": _role("MGRMTH", "maternal grandmother"),
    constants.RELATIONSHIP.MATERNAL_GRANDMOTHER: _role(
        "MGRMTH", "maternal grandmother"
    ),
    "Maternal great aunt": _extended(),
    "Maternal Great Aunt": _extended(),
    "Maternal Great Aunt (Mother's paternal aunt)": _extended(),
    "Maternal Great Grandmother": _role(
        "MGGRMTH", "maternal great-grandmother"
    ),
    "Maternal Great Uncle": _extended(),
    "Maternal half-sister": _extended(),
    "Maternal Relation": _extended(),
    "Maternal uncle": _role("MUNCLE", "maternal uncle"),
    "mother": _role("MTH", "mother"),
    constants.RELATIONSHIP.MOTHER: _role("MTH", "mother"),
    "Nephew": _role("NEPHEW", "nephew"),
    "Niece": _role("NIECE", "niece"),
    "Paternal aunt": _role("PAUNT", "paternal aunt"),
    "Paternal cousin": _role("PCOUSN", "paternal cousin"),
    "Paternal Cousin": _role("PCOUSN", "paternal cousin"),
    "Paternal grandfather": _role("PGRFTH", "paternal grandfather"),
    "Paternal grandmother": _role("PGRMTH", "paternal grandmother"),
    constants.RELATIONSHIP.PATERNAL_GRANDMOTHER: _role(
        "PGRMTH", "paternal grandmother"
    ),
    "Paternal uncle": _role("PUNCLE", "paternal uncle"),
    constants.RELATIONSHIP.PROBAND: _role("CHILD", "child"),
    constants.RELATIONSHIP.SIBLING: _role("SIB", "sibling"),
    constants.RELATIONSHIP.SISTER: _role("SIS", "sister"),
    constants.RELATIONSHIP.SON: _role("SONC", "son"),
    constants.RELATIONSHIP.SPOUSE: _role("SPS", "spouse"),
    "Twin": _role("TWIN", "twin"),
    constants.RELATIONSHIP.TWIN_BROTHER: _role("TWINBRO", "twin brother"),
    constants.RELATIONSHIP.TWIN_SISTER: _role("TWINSIS", "twin sister"),
    "Uncle": _role("UNCLE", "uncle"),
    "Uncle-married in": _extended(),
    "Wife": _role("WIFE", "wife"),
    constants.COMMON.OTHER: _role("OTH", "other", system=_NULL_FLAVOR_SYSTEM),
}
class Histopathology(FhirResourceBuilder):
    """
    Build FHIR Observation from Dataservice BiospecimenDiagnosis
    """

    sources = {"required": ["BiospecimenDiagnosis"], "optional": []}
    resource_type = "Observation"
    id_prefix = set_id_prefix(resource_type)

    kf_id_col = CONCEPT.BIOSPECIMEN_DIAGNOSIS.TARGET_SERVICE_ID
    external_id_col = CONCEPT.BIOSPECIMEN_DIAGNOSIS.ID
    kf_id_system = "biospecimen-diagnoses"
    external_id_system = f"{kf_id_system}?external_id="

    reference_builders = ["patient", "child_specimen", "disease"]

    def _build_resource(
        self, row, patient_reference, specimen_reference, disease_reference
    ):
        """
        Build FHIR JSON from csv row

        :param row: BiospecimenDiagnosis source row
        :param patient_reference: callable mapping a participant ID to a
            FHIR reference
        :param specimen_reference: callable mapping a biospecimen ID to a
            FHIR reference
        :param disease_reference: callable mapping a diagnosis ID to a
            FHIR reference
        :returns: the Observation resource dict
        :raises KeyError: if the participant, biospecimen or diagnosis ID
            column is missing from the row
        """
        # Initialize resource with basic content
        resource = self.init_resource(row)

        tumor_descriptor = row.get(CONCEPT.BIOSPECIMEN.TUMOR_DESCRIPTOR)

        # Add references
        participant_kf_id = row[CONCEPT.PARTICIPANT.TARGET_SERVICE_ID]
        biospecimen_kf_id = row[CONCEPT.BIOSPECIMEN.TARGET_SERVICE_ID]
        diagnosis_kf_id = row[CONCEPT.DIAGNOSIS.TARGET_SERVICE_ID]
        resource["subject"] = patient_reference(participant_kf_id)
        resource["specimen"] = specimen_reference(biospecimen_kf_id)
        resource["focus"] = [disease_reference(diagnosis_kf_id)]

        # Add other content
        resource["status"] = "final"
        resource["code"] = {
            "coding": [
                {
                    "system": "http://snomed.info/sct",
                    "code": "250537006",
                    "display": "Histopathology finding (finding)",
                }
            ],
            "text": "Histopathology",
        }
        resource["category"] = [
            {
                "coding": [
                    {
                        "system": "http://terminology.hl7.org/CodeSystem/observation-category",
                        "code": "laboratory",
                        "display": "Laboratory",
                    }
                ],
                "text": "Histopathology",
            }
        ]

        # NaN is truthy, so test for missing values explicitly before the
        # truthiness and MISSING_DATA_VALUES checks
        if (
            pandas.notnull(tumor_descriptor)
            and tumor_descriptor
            and tumor_descriptor not in MISSING_DATA_VALUES
        ):
            resource["valueCodeableConcept"] = {"text": tumor_descriptor}

        return resource

    def _build(self, source_dir, **kwargs):
        """
        See d3b_api_client_cli.fhir.entity_builders.base
        """
        self.source_tables = self.import_source_data(source_dir)
        df = list(self.source_tables.values())[0]

        builders = self.import_reference_builders()
        patient_reference = builders["patient"].fhir_reference
        specimen_reference = builders["child_specimen"].fhir_reference
        disease_reference = builders["disease"].fhir_reference

        resources = []
        total = df.shape[0]
        # Count positions ourselves: the DataFrame index label is not
        # guaranteed to be a 0-based integer
        for count, (_, row) in enumerate(df.iterrows(), start=1):
            resource = self._build_resource(
                row, patient_reference, specimen_reference, disease_reference
            )
            resources.append(resource)
            self.logger.info(
                f"Built {self.resource_type} {count}/{total}: {resource['id']}"
            )

        return resources
class Organization(FhirResourceBuilder):
    """
    Build FHIR Organization from Dataservice Investigator
    """

    sources = {"required": ["Investigator"], "optional": []}
    resource_type = "Organization"
    id_prefix = set_id_prefix(resource_type)

    kf_id_col = CONCEPT.INVESTIGATOR.TARGET_SERVICE_ID
    external_id_col = CONCEPT.INVESTIGATOR.ID
    kf_id_system = "investigators"
    external_id_system = f"{kf_id_system}?external_id="

    def _build_resource(self, row):
        """
        Build FHIR JSON from a csv row

        :param row: Investigator source row
        :returns: the Organization resource dict; "name" is set only when
            the row has a non-missing, non-empty institution
        """
        # Initialize resource with basic content
        resource = self.init_resource(row)
        institution = row.get(CONCEPT.INVESTIGATOR.INSTITUTION)

        # Add other content
        resource["active"] = True
        # NaN is truthy, so check for missing values explicitly; otherwise a
        # blank cell would be written out as the organization name
        if pandas.notnull(institution) and institution:
            resource["name"] = institution

        return resource

    def _build(self, source_dir, **kwargs):
        """
        See d3b_api_client_cli.fhir.entity_builders.base
        """
        self.source_tables = self.import_source_data(source_dir)
        df = list(self.source_tables.values())[0]

        resources = []
        total = df.shape[0]
        # Count positions ourselves: the DataFrame index label is not
        # guaranteed to be a 0-based integer
        for count, (_, row) in enumerate(df.iterrows(), start=1):
            resource = self._build_resource(row)
            resources.append(resource)
            self.logger.info(
                f"Built {self.resource_type} {count}/{total}: {resource['id']}"
            )

        return resources
class ParentSpecimen(FhirResourceBuilder):
    """
    Build FHIR Specimen from Dataservice Biospecimen and Sample

    Sample rows are left-merged with their Biospecimen columns and turned
    into FHIR Specimen payloads. When a SampleRelationship table is
    supplied, child Specimens are linked to their parents and the payloads
    are returned with the Specimen trees in level order.
    """

    sources = {
        "required": ["Biospecimen", "Sample"],
        "optional": ["SampleRelationship"],
    }
    resource_type = "Specimen"
    id_prefix = set_id_prefix(resource_type)

    kf_id_col = CONCEPT.SAMPLE.TARGET_SERVICE_ID
    external_id_col = CONCEPT.SAMPLE.ID
    kf_id_system = "samples"
    external_id_system = f"{kf_id_system}?external_id="

    reference_builders = ["patient"]

    def _build_resource(self, row, patient_reference):
        """
        Build FHIR JSON from csv row

        :param row: one merged Sample/Biospecimen row
        :param patient_reference: callable that turns a participant KF ID
            into a FHIR reference
        :returns: dict holding the FHIR Specimen payload
        """
        # Initialize resource with basic content
        resource = self.init_resource(row)

        # Add references
        participant_kf_id = row[CONCEPT.PARTICIPANT.TARGET_SERVICE_ID]
        resource["subject"] = patient_reference(participant_kf_id)

        # Not used below; the lookup is kept so that a row without the ID
        # column keeps failing at this point
        kf_id = row[self.kf_id_col]
        sample_type = str(row[CONCEPT.SAMPLE.SAMPLE_TYPE])

        # TODO Where is the consent going to come from?!
        consent_type = row.get(CONCEPT.BIOSPECIMEN.CONSENT_SHORT_NAME)
        dbgap_consent_code = row.get(
            CONCEPT.BIOSPECIMEN.DBGAP_STYLE_CONSENT_CODE
        )

        tissue_type = row.get(CONCEPT.SAMPLE.TISSUE_TYPE)
        has_matched_normal_sample = row.get(
            CONCEPT.SAMPLE.HAS_MATCHED_NORMAL_SAMPLE
        )
        external_collection_id = row.get(CONCEPT.SAMPLE.EXTERNAL_COLLECTION_ID)

        # ncit_id_tissue_type = row.get(CONCEPT.SAMPLE.NCIT_TISSUE_TYPE_ID)
        event_age_days = row.get(CONCEPT.SAMPLE.EVENT_AGE_DAYS)
        volume_ul = row.get(CONCEPT.SAMPLE.VOLUME_UL)
        sample_procurement = row.get(CONCEPT.SAMPLE.SAMPLE_PROCUREMENT)
        anatomy_site = row.get(CONCEPT.SAMPLE.ANATOMY_SITE)
        uberon_anatomy_site_id = row.get(
            CONCEPT.BIOSPECIMEN.UBERON_ANATOMY_SITE_ID
        )
        ncit_anatomy_site_id = row.get(CONCEPT.BIOSPECIMEN.NCIT_ANATOMY_SITE_ID)

        preservation_method = row.get(CONCEPT.SAMPLE.PRESERVATION_METHOD)

        # Add other content
        resource["status"] = "unavailable"

        # meta.tag
        tags = []

        # tissue_type
        if tissue_type:
            tags.append(
                {"system": "urn:kids_first_tissue_type", "code": tissue_type}
            )

        # has_paired_normal (False is a meaningful value, so only None is
        # treated as missing)
        if has_matched_normal_sample is not None:
            tags.append(
                {
                    "system": "urn:kids_first_has_paired_normal",
                    "code": has_matched_normal_sample,
                }
            )

        # external_collection_id
        if external_collection_id:
            tags.append(
                {
                    "system": "urn:kids_first_external_collection_id",
                    "code": external_collection_id,
                }
            )

        if tags:
            resource["meta"]["tag"].extend(tags)

        # meta.security
        if consent_type:
            resource["meta"].setdefault("security", []).append(
                {
                    "system": "https://kf-api-dataservice.kidsfirstdrc.org/samples?consent_type=",
                    "code": consent_type,
                }
            )
        if dbgap_consent_code:
            resource["meta"].setdefault("security", []).append(
                {
                    "system": "https://kf-api-dataservice.kidsfirstdrc.org/samples?dbgap_consent_code=",
                    "code": dbgap_consent_code,
                }
            )

        # type
        specimen_type = {"text": sample_type}
        if type_coding.get(sample_type):
            specimen_type["coding"] = type_coding[sample_type]
        resource["type"] = specimen_type

        # collection
        collection = {}

        # collection.collectedDateTime
        # Handled separately from the quantity so that an unusable age does
        # not also discard a valid volume
        try:
            collection["_collectedDateTime"] = extension_age_at_event(
                patient_reference(participant_kf_id), event_age_days
            )
        except (ValueError, TypeError):
            # Best effort: leave the element out when the age is unusable
            pass

        # collection.quantity
        try:
            collection["quantity"] = {
                "value": float(volume_ul),
                "unit": "microliter",
                "system": "http://unitsofmeasure.org",
                "code": "uL",
            }
        except (ValueError, TypeError):
            # Best effort: volume is missing (None) or not numeric
            pass

        # method
        method = {}
        if sample_procurement:
            method["text"] = sample_procurement
            # if collection_method_coding.get(sample_procurement):
            #     method.setdefault("coding", []).append(
            #         collection_method_coding[sample_procurement]
            #     )
        if method:
            collection["method"] = method

        # bodySite
        body_site = {}
        if anatomy_site:
            body_site["text"] = anatomy_site
        if uberon_anatomy_site_id:
            body_site_coding = {"code": uberon_anatomy_site_id}
            # The column may carry either an UBERON or an EFO identifier;
            # any other prefix is kept as a code without a system
            if uberon_anatomy_site_id.startswith("UBERON:"):
                body_site_coding["system"] = (
                    "http://purl.obolibrary.org/obo/uberon.owl"
                )
            elif uberon_anatomy_site_id.startswith("EFO:"):
                body_site_coding["system"] = "http://www.ebi.ac.uk/efo/efo.owl"
            body_site.setdefault("coding", []).append(body_site_coding)
        if ncit_anatomy_site_id and ncit_anatomy_site_id.startswith("NCIT:"):
            body_site.setdefault("coding", []).append(
                {
                    "system": "http://purl.obolibrary.org/obo/ncit.owl",
                    "code": ncit_anatomy_site_id,
                }
            )
        if body_site:
            collection["bodySite"] = body_site

        if collection:
            resource["collection"] = collection

        # processing
        processing = {}

        # preservation_method
        if preservation_method_coding.get(preservation_method):
            procedure = {
                "coding": preservation_method_coding[preservation_method]
            }
            processing["procedure"] = procedure

        if processing:
            resource["processing"] = processing

        return resource

    def _transform(self, source_tables):
        """
        Do necessary transformations to source tables before FHIR resource
        creation

        :param source_tables: dict of DataFrames keyed by table name; the
            "Biospecimen" and "Sample" entries are read
        :returns: DataFrame with one row per sample ID and NaN replaced
            by None
        """
        # Now we cannot inner merge bc not all Samples have linked Biospecimens
        # Left merge Biospecimens into Samples so that we can get
        # some of the sample info from Biospecimens (consent, etc)
        self.logger.info(
            f"🏭 Start {self.entity_type} pre-processing/transformation ..."
        )
        biospecimens = source_tables["Biospecimen"]
        samples = source_tables["Sample"]
        df = utils.merge_wo_duplicates(
            samples,
            biospecimens[
                [
                    c
                    for c in biospecimens.columns
                    if c.startswith(CONCEPT.BIOSPECIMEN._CONCEPT_NAME)
                ]
                + [CONCEPT.SAMPLE.TARGET_SERVICE_ID]
            ],
            how="left",
            on=CONCEPT.SAMPLE.TARGET_SERVICE_ID,
        )
        df = (
            df.drop_duplicates(CONCEPT.SAMPLE.TARGET_SERVICE_ID)
            .replace({nan: None})
            .reset_index()
        )

        self.logger.info(
            f"Completed pre-processing of {df.shape[0]} {self.entity_type} ✅"
        )

        return df

    def _create_specimens(self, sample_df):
        """
        Create FHIR Specimens from rows of Samples

        :param sample_df: DataFrame produced by _transform
        :returns: dict of specimens keyed by FHIR IDs
        """
        self.logger.info(f"🏭 Starting {self.entity_type} resource build ...")
        builders = self.import_reference_builders()
        patient_reference = builders["patient"].fhir_reference

        specimen_dict = {}
        total = sample_df.shape[0]
        for i, row in sample_df.iterrows():
            resource = self._build_resource(row, patient_reference)
            self.logger.info(
                f"Built {self.resource_type} {i+1}/{total}: {resource['id']}"
            )
            specimen_dict[resource["id"]] = resource

        self.logger.info(
            f"Completed building {total} {self.entity_type} resources ✅"
        )

        return specimen_dict

    def _link_specimens(self, specimen_dict, sample_relationship_df):
        """
        Update the FHIR Specimens with references to their appropriate
        specimens, using the sample_relationships table

        :param specimen_dict: dict of specimens keyed by FHIR IDs; the
            child entries are updated in place
        :param sample_relationship_df: DataFrame of parent/child sample IDs
        :returns: dict of specimens keyed by FHIR IDs
        """
        self.logger.info(f"🏭 Start linking {len(specimen_dict)} specimens ...")
        # Iterate over child specimens in the relationships
        for _, row in sample_relationship_df.iterrows():
            child_id = row.get(
                CONCEPT.SAMPLE_RELATIONSHIP.CHILD.TARGET_SERVICE_ID
            )
            # Look up child FHIR specimen
            child_fhir_id = self.fhir_id(child_id)
            child_specimen = specimen_dict[child_fhir_id]

            # Link each child FHIR specimen to its parent FHIR specimen
            # (rows without a parent describe tree roots and add no link)
            parent_id = row.get(
                CONCEPT.SAMPLE_RELATIONSHIP.PARENT.TARGET_SERVICE_ID
            )
            if parent_id:
                child_specimen["parent"] = [self.fhir_reference(parent_id)]

        self.logger.info("Completed specimen linking ✅")

        return specimen_dict

    def _order_specimens(self, specimen_dict, sample_relationship_df):
        """
        Determine load order of Specimens so that references are respected

        Iterate over sample_relationships table to build Specimen trees
        Traverse Specimen trees using BFS/level order traversal
        Add the Specimens from each traversal to the output list of Specimens

        Return the list of ordered Specimens

        :param specimen_dict: dict of specimens keyed by FHIR IDs
        :param sample_relationship_df: DataFrame of parent/child sample IDs;
            it is not modified
        :returns: list of dicts
        :raises tree.MissingDataException: when fewer specimens were ordered
            than were built
        """
        self.logger.info(
            f"🏭 Start order operation for {len(specimen_dict)} specimens ..."
        )
        ordered_specimens = []
        ordered_ids = []

        # Work on a copy so the caller's table keeps its original IDs
        sample_relationship_df = sample_relationship_df.copy()

        # kf_id -> fhir id
        # Null IDs (None/NaN) are left as they are: NaN is truthy, and a
        # converted null would no longer be recognised as a tree root
        for c in [PID, CID]:
            sample_relationship_df[c] = sample_relationship_df[c].apply(
                lambda sid: (
                    sid if pandas.isnull(sid) or not sid else self.fhir_id(sid)
                )
            )

        # Collect Specimens that are not part of Specimen trees
        specimens_in_relationships = {
            sid
            for c in [PID, CID]
            for sid in sample_relationship_df[c].values
            if not pandas.isnull(sid)
        }

        all_specimens = set(specimen_dict.keys())

        # Add non-tree Specimens to output, in the order they were built
        ordered_ids.extend(
            sid
            for sid in specimen_dict
            if sid not in specimens_in_relationships
        )

        self.logger.debug(
            f"Found {len(ordered_ids)} specimens not in relationships"
        )
        self.logger.debug(
            f"Found {len(specimens_in_relationships)} specimens"
            " that are part of a sample tree"
        )

        # Specimens that are part of Specimen trees need to be ordered
        # in a way such that references are respected
        if utils.df_exists(sample_relationship_df):
            self.logger.info(
                "🧪 Detected sample relationships table of m x n: "
                f" {sample_relationship_df.shape}."
                " Starting tree processing ..."
            )
            ordered_ids.extend(
                tree.level_ordered_specimens(sample_relationship_df)
            )

        # A specimen can be reached more than once (e.g. repeated rows);
        # keep its first position only so the count check below is reliable
        ordered_ids = list(dict.fromkeys(ordered_ids))

        # Populate payloads list
        ordered_specimens = [specimen_dict[sid] for sid in ordered_ids]

        if len(ordered_specimens) < len(all_specimens):
            raise tree.MissingDataException(
                "❌ Detected problems in specimen data during "
                "ordering operation. This is likely due to a missing "
                "sample relationship that designates a sample is the "
                "root of a sample tree. Each root sample must have a row "
                "in the sample_relationship like this: parent_id=null "
                "child_id="
            )
        self.logger.info(
            f"Completed ordering {len(ordered_specimens)} specimens ✅"
        )

        return ordered_specimens

    def _build(self, source_dir, **kwargs):
        """
        See d3b_api_client_cli.fhir.entity_builders.base
        """
        # Import source tables
        source_tables = self.import_source_data(source_dir)

        # Sample pre-processing
        sample_df = self._transform(source_tables)

        # Build specimens
        specimen_dict = self._create_specimens(sample_df)

        # If sample relationships exist, link specimens to each other
        if "SampleRelationship" in source_tables:
            relationship_df = source_tables["SampleRelationship"]
            # -- Link specimens to each other --
            specimen_dict = self._link_specimens(specimen_dict, relationship_df)

            # -- Determine Specimen load order -- #
            resources = self._order_specimens(specimen_dict, relationship_df)
        else:
            resources = list(specimen_dict.values())

        return resources
class TreeNode:
    """
    Represents a Specimen in a Specimen tree

    :param node_id: identifier of the Specimen held by this node
    :param children: optional iterable of TreeNode objects placed directly
        under this node
    """

    def __init__(self, node_id, children=None):
        self.node_id = node_id
        # Store the supplied children (previously the argument was ignored).
        # A node without children keeps None, which is what level_order
        # and build_tree test for.
        self.children = list(children) if children else None

    def level_order(self):
        """
        Traverse tree with breadth first search to output a list of tree
        nodes in level-order

        :returns: list of node IDs, root first; empty when this node has
            no ID
        """
        output = []
        if not self.node_id:
            return output

        q = Queue()
        q.put(self)

        while not q.empty():
            current = q.get()
            output.append(current.node_id)

            if current.children:
                for c in current.children:
                    q.put(c)

        return output
class Patient(FhirResourceBuilder):
    """
    Build FHIR Patient from Dataservice Participant
    """

    sources = {"required": ["Participant"], "optional": []}
    resource_type = "Patient"
    id_prefix = set_id_prefix(resource_type)

    kf_id_col = CONCEPT.PARTICIPANT.TARGET_SERVICE_ID
    external_id_col = CONCEPT.PARTICIPANT.ID
    kf_id_system = "participants"
    external_id_system = f"{kf_id_system}?external_id="

    def _build_resource(self, row):
        """
        Turn one participant row into a FHIR Patient payload

        :param row: one row of the Participant table
        :returns: dict holding the FHIR Patient payload
        """
        # Start from the basic content shared by all resources
        patient = self.init_resource(row)

        race = row.get(CONCEPT.PARTICIPANT.RACE)
        ethnicity = row.get(CONCEPT.PARTICIPANT.ETHNICITY)
        gender = row.get(CONCEPT.PARTICIPANT.GENDER)

        # US Core Race: free text always, OMB category when one is mapped
        if race:
            race_details = [{"url": "text", "valueString": race}]
            omb_race = omb_race_category.get(race)
            if omb_race:
                race_details.append(omb_race)
            patient.setdefault("extension", []).append(
                {
                    "url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race",
                    "extension": race_details,
                }
            )

        # US Core Ethnicity: same layout as race
        if ethnicity:
            ethnicity_details = [{"url": "text", "valueString": ethnicity}]
            omb_ethnicity = omb_ethnicity_category.get(ethnicity)
            if omb_ethnicity:
                ethnicity_details.append(omb_ethnicity)
            patient.setdefault("extension", []).append(
                {
                    "url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity",
                    "extension": ethnicity_details,
                }
            )

        # gender is only set when the source value has a mapped code
        fhir_gender = administrative_gender_code.get(gender)
        if fhir_gender:
            patient["gender"] = fhir_gender

        return patient

    def _build(self, source_dir, **kwargs):
        """
        See d3b_api_client_cli.fhir.entity_builders.base
        """
        self.source_tables = self.import_source_data(source_dir)
        # The first imported table is the one the payloads are built from
        participants = list(self.source_tables.values())[0]

        total = participants.shape[0]
        built = []
        for idx, row in participants.iterrows():
            payload = self._build_resource(row)
            built.append(payload)
            self.logger.info(
                f"Built {self.resource_type} {idx+1}/{total}: {payload['id']}"
            )

        return built
constants.RACE.NATIVE_AMERICAN: { + "url": "ombCategory", + "valueCoding": { + "system": "urn:oid:2.16.840.1.113883.6.238", + "code": "1002-5", + "display": "American Indian or Alaska Native", + }, + }, + constants.RACE.ASIAN: { + "url": "ombCategory", + "valueCoding": { + "system": "urn:oid:2.16.840.1.113883.6.238", + "code": "2028-9", + "display": "Asian", + }, + }, + constants.RACE.BLACK: { + "url": "ombCategory", + "valueCoding": { + "system": "urn:oid:2.16.840.1.113883.6.238", + "code": "2054-5", + "display": "Black or African American", + }, + }, + constants.RACE.PACIFIC: { + "url": "ombCategory", + "valueCoding": { + "system": "urn:oid:2.16.840.1.113883.6.238", + "code": "2076-8", + "display": "Native Hawaiian or Other Pacific Islander", + }, + }, + constants.RACE.WHITE: { + "url": "ombCategory", + "valueCoding": { + "system": "urn:oid:2.16.840.1.113883.6.238", + "code": "2106-3", + "display": "White", + }, + }, + constants.COMMON.OTHER: { + "url": "ombCategory", + "valueCoding": { + "system": "http://terminology.hl7.org/CodeSystem/v3-NullFlavor", + "code": "OTH", + "display": "other", + }, + }, + constants.COMMON.NOT_AVAILABLE: { + "url": "ombCategory", + "valueCoding": { + "system": "http://terminology.hl7.org/CodeSystem/v3-NullFlavor", + "code": "NAVU", + "display": "not available", + }, + }, + constants.COMMON.NOT_REPORTED: { + "url": "ombCategory", + "valueCoding": { + "system": "http://terminology.hl7.org/CodeSystem/v3-NullFlavor", + "code": "NI", + "display": "NoInformation", + }, + }, + constants.COMMON.UNKNOWN: { + "url": "ombCategory", + "valueCoding": { + "system": "http://terminology.hl7.org/CodeSystem/v3-NullFlavor", + "code": "UNK", + "display": "unknown", + }, + }, +} + +# https://hl7.org/fhir/us/core/ValueSet-omb-ethnicity-category.html +omb_ethnicity_category = { + constants.ETHNICITY.HISPANIC: { + "url": "ombCategory", + "valueCoding": { + "system": "urn:oid:2.16.840.1.113883.6.238", + "code": "2135-2", + "display": "Hispanic or Latino", + 
class Phenotype(FhirResourceBuilder):
    """
    Build FHIR Condition from Dataservice Phenotype
    """

    sources = {"required": ["Phenotype"], "optional": []}
    resource_type = "Condition"
    id_prefix = set_id_prefix(resource_type)

    kf_id_col = CONCEPT.PHENOTYPE.TARGET_SERVICE_ID
    external_id_col = CONCEPT.PHENOTYPE.ID
    kf_id_system = "phenotypes"
    external_id_system = f"{kf_id_system}?external_id="

    reference_builders = ["patient"]

    def _build_resource(self, row, patient_reference):
        """
        Turn one phenotype row into a FHIR Condition payload

        :param row: one row of the Phenotype table
        :param patient_reference: callable that turns a participant KF ID
            into a FHIR reference
        :returns: dict holding the FHIR Condition payload
        """
        # Start from the basic content shared by all resources
        condition = self.init_resource(row)

        observed = row.get(CONCEPT.PHENOTYPE.OBSERVED)
        name = row.get(CONCEPT.PHENOTYPE.NAME)
        hpo_id = row.get(CONCEPT.PHENOTYPE.HPO_ID)
        snomed_id = row.get(CONCEPT.PHENOTYPE.SNOMED_ID)
        event_age_days = row.get(CONCEPT.PHENOTYPE.EVENT_AGE_DAYS)

        # Reference to the patient
        participant_kf_id = row[CONCEPT.PARTICIPANT.TARGET_SERVICE_ID]
        condition["subject"] = patient_reference(participant_kf_id)

        condition["meta"]["profile"] = [
            "https://ncpi-fhir.github.io/ncpi-fhir-ig/StructureDefinition/phenotype"
        ]

        # verificationStatus: unmapped values use the "default" coding
        status_coding = (
            verification_status_coding.get(observed)
            or verification_status_coding["default"]
        )
        condition["verificationStatus"] = {
            "text": observed,
            "coding": [status_coding],
        }

        # code: free text plus one coding per usable ontology ID
        ontology_ids = (
            (hpo_id, "http://purl.obolibrary.org/obo/hp.owl"),
            (snomed_id, "http://snomed.info/sct"),
        )
        codings = [
            {"system": system, "code": value}
            for value, system in ontology_ids
            if value and value not in MISSING_DATA_VALUES
        ]
        code = {"text": name}
        if codings:
            code["coding"] = codings
        condition["code"] = code

        # recordedDate: left out when the age cannot be used
        try:
            condition["_recordedDate"] = extension_age_at_event(
                patient_reference(participant_kf_id), event_age_days
            )
        except (ValueError, TypeError):
            pass

        return condition

    def _build(self, source_dir, **kwargs):
        """
        See d3b_api_client_cli.fhir.entity_builders.base
        """
        self.source_tables = self.import_source_data(source_dir)
        patient_reference = self.import_reference_builders()[
            "patient"
        ].fhir_reference

        # The first imported table is the one the payloads are built from
        phenotypes = list(self.source_tables.values())[0]

        total = phenotypes.shape[0]
        built = []
        for idx, row in phenotypes.iterrows():
            payload = self._build_resource(row, patient_reference)
            built.append(payload)

            self.logger.info(
                f"Built {self.resource_type} {idx+1}/{total}: {payload['id']}"
            )

        return built
class Practitioner(FhirResourceBuilder):
    """
    Build FHIR Practitioner from Dataservice Investigator
    """

    sources = {"required": ["Investigator"], "optional": []}
    resource_type = "Practitioner"
    id_prefix = set_id_prefix(resource_type)

    kf_id_col = CONCEPT.INVESTIGATOR.TARGET_SERVICE_ID
    external_id_col = CONCEPT.INVESTIGATOR.ID
    kf_id_system = "investigators"
    external_id_system = f"{kf_id_system}?external_id="

    def _build_resource(self, row):
        """
        Turn one investigator row into a FHIR Practitioner payload

        :param row: one row of the Investigator table
        :returns: dict holding the FHIR Practitioner payload
        """
        # Start from the basic content shared by all resources
        practitioner = self.init_resource(row)
        investigator_name = row.get(CONCEPT.INVESTIGATOR.NAME)

        practitioner["active"] = True
        # name is optional and only added when the source has a value
        if investigator_name:
            practitioner["name"] = [{"text": investigator_name}]

        return practitioner

    def _build(self, source_dir, **kwargs):
        """
        See d3b_api_client_cli.fhir.entity_builders.base
        """
        self.source_tables = self.import_source_data(source_dir)
        # The first imported table is the one the payloads are built from
        investigators = list(self.source_tables.values())[0]

        total = investigators.shape[0]
        built = []
        for idx, row in investigators.iterrows():
            payload = self._build_resource(row)
            built.append(payload)
            self.logger.info(
                f"Built {self.resource_type} {idx+1}/{total}: {payload['id']}"
            )

        return built
b/d3b_api_client_cli/fhir/entity_builders/practitioner_role/__init__.py new file mode 100644 index 0000000..4057c38 --- /dev/null +++ b/d3b_api_client_cli/fhir/entity_builders/practitioner_role/__init__.py @@ -0,0 +1,87 @@ +# noqa +""" +Module responsible for building FHIR JSON that captures the practitioner_role +info +""" + +import logging +import pandas + +from d3b_api_client_cli.config.concept_schema import CONCEPT +from d3b_api_client_cli.fhir.entity_builders.practitioner_role.mapping import * +from d3b_api_client_cli.fhir.entity_builders.base import ( + FhirResourceBuilder, + set_id_prefix, +) + + +class PractitionerRole(FhirResourceBuilder): + """ + Build FHIR PractitionerRole from Dataservice Investigator + """ + + sources = {"required": ["Investigator"], "optional": []} + resource_type = "PractitionerRole" + id_prefix = set_id_prefix(resource_type) + + kf_id_col = CONCEPT.INVESTIGATOR.TARGET_SERVICE_ID + external_id_col = CONCEPT.INVESTIGATOR.ID + kf_id_system = "investigators" + external_id_system = f"{kf_id_system}?external_id=" + + reference_builders = ["practitioner", "organization"] + + def _build_resource( + self, row, practitioner_reference, organization_reference + ): + """ + Build FHIR JSON from csv row + """ + # Initalize resource with basic content + resource = self.init_resource(row) + + # Add other content + resource["active"] = True + resource["code"] = [ + { + "coding": [ + { + "system": "http://terminology.hl7.org/CodeSystem/practitioner-role", + "code": "researcher", + "display": "Researcher", + } + ] + } + ] + + # Add references + investigator_kf_id = row[CONCEPT.INVESTIGATOR.TARGET_SERVICE_ID] + resource["organization"] = organization_reference(investigator_kf_id) + resource["practitioner"] = practitioner_reference(investigator_kf_id) + + return resource + + def _build(self, source_dir, **kwargs): + """ + See d3b_api_client_cli.fhir.entity_builders.base + """ + self.source_tables = self.import_source_data(source_dir) + + builders = 
class ProbandStatus(FhirResourceBuilder):
    """
    Build FHIR Observation from Dataservice Participant
    """

    sources = {"required": ["Participant"], "optional": []}
    resource_type = "Observation"
    id_prefix = set_id_prefix(resource_type)

    kf_id_col = CONCEPT.PARTICIPANT.TARGET_SERVICE_ID
    external_id_col = CONCEPT.PARTICIPANT.ID
    kf_id_system = "participants"
    external_id_system = f"{kf_id_system}?external_id="

    reference_builders = ["patient"]

    def _build_resource(self, row, patient_reference):
        """
        Build FHIR JSON from csv row

        :param row: one row of the Participant table
        :param patient_reference: callable that turns a participant KF ID
            into a FHIR reference
        :returns: dict holding the FHIR Observation payload
        :raises KeyError: when the proband value has no entry in
            value_coding
        """
        # Initialize resource with basic content
        resource = self.init_resource(row)
        proband_status = row[CONCEPT.PARTICIPANT.IS_PROBAND]
        # value_coding keys its "no information" entry under None, so a
        # pandas null (NaN/NA) is normalised to None instead of failing
        # the lookup below
        if pandas.isnull(proband_status):
            proband_status = None

        # Add references
        participant_kf_id = row[CONCEPT.PARTICIPANT.TARGET_SERVICE_ID]
        resource["subject"] = patient_reference(participant_kf_id)

        # Add other content
        resource["status"] = "final"
        resource["code"] = {
            "coding": [
                {
                    "system": "http://snomed.info/sct",
                    "code": "85900004",
                    "display": "Proband (finding)",
                }
            ],
            "text": "Proband status",
        }
        resource["valueCodeableConcept"] = {
            "coding": [value_coding[proband_status]],
            "text": proband_status,
        }
        return resource

    def _build(self, source_dir, **kwargs):
        """
        See d3b_api_client_cli.fhir.entity_builders.base
        """
        self.source_tables = self.import_source_data(source_dir)
        # The first imported table is the one the payloads are built from
        df = list(self.source_tables.values())[0]

        builders = self.import_reference_builders()
        patient_reference = builders["patient"].fhir_reference

        resources = []
        total = df.shape[0]
        for i, row in df.iterrows():
            resource = self._build_resource(row, patient_reference)
            resources.append(resource)
            self.logger.info(
                f"Built {self.resource_type} {i+1}/{total}: {resource['id']}"
            )

        return resources
"http://terminology.hl7.org/CodeSystem/v2-0136", + "code": "Y", + "display": "Yes", + }, + constants.COMMON.FALSE: { + "system": "http://terminology.hl7.org/CodeSystem/v2-0136", + "code": "N", + "display": "No", + }, + None: { + "system": "http://terminology.hl7.org/CodeSystem/v3-NullFlavor", + "code": "NI", + "display": "NoInformation", + }, +} diff --git a/d3b_api_client_cli/fhir/entity_builders/research_study/__init__.py b/d3b_api_client_cli/fhir/entity_builders/research_study/__init__.py new file mode 100644 index 0000000..4721235 --- /dev/null +++ b/d3b_api_client_cli/fhir/entity_builders/research_study/__init__.py @@ -0,0 +1,190 @@ +# noqa +""" +Module responsible for building FHIR JSON to capture the reseearch study info +""" + +import logging +import pandas + +from d3b_api_client_cli.config.concept_schema import CONCEPT +from d3b_api_client_cli.fhir.entity_builders.research_study.mapping import * +from d3b_api_client_cli.fhir.entity_builders.base import ( + FhirResourceBuilder, + set_id_prefix, +) +from d3b_api_client_cli import utils + + +class ResearchStudy(FhirResourceBuilder): + """ + Build FHIR ResearchStudy from Dataservice Study + """ + + sources = {"required": ["Study"], "optional": []} + resource_type = "ResearchStudy" + id_prefix = set_id_prefix(resource_type) + + kf_id_col = CONCEPT.STUDY.TARGET_SERVICE_ID + external_id_col = CONCEPT.STUDY.ID + kf_id_system = "studies" + external_id_system = f"{kf_id_system}?external_id=" + + reference_builders = ["practitioner_role"] + + def _build_resource(self, row, practitioner_role_reference): + """ + Build FHIR JSON from a csv row + """ + # Initalize resource with basic content + resource = self.init_resource(row) + + external_id = row.get(CONCEPT.STUDY.ID) + version = row.get(CONCEPT.STUDY.VERSION) + study_name = row.get(CONCEPT.STUDY.NAME) + domain = row.get(CONCEPT.STUDY.DOMAIN) + program = row.get(CONCEPT.STUDY.PROGRAM) + short_code = row.get(CONCEPT.STUDY.SHORT_CODE) + biobank_name = 
row.get(CONCEPT.STUDY.BIOBANK_NAME) + biobank_email = row.get(CONCEPT.STUDY.BIOBANK_EMAIL) + biobank_request_link = row.get(CONCEPT.STUDY.BIOBANK_REQUEST_LINK) + biobank_request_instructions = row.get( + CONCEPT.STUDY.BIOBANK_REQUEST_INSTRUCTIONS + ) + institution = row.get(CONCEPT.INVESTIGATOR.INSTITUTION) + investigator_name = row.get(CONCEPT.INVESTIGATOR.NAME) + + # NOTE - Uncomment when Dewrangle supports multi-study pr + # # Add references + # investigator_kf_id = row.get(CONCEPT.INVESTIGATOR.TARGET_SERVICE_ID) + # if investigator_kf_id: + # resource["principalInvestigator"] = practitioner_role_reference( + # investigator_kf_id + # ) + + # Additional payload content + resource["status"] = "completed" + + # identifier + if external_id and external_id.startswith("phs"): + accession = external_id.split(".")[0].strip() + if version and version.startswith("v"): + accession = ".".join([accession, version.strip()]) + resource["identifier"].append( + { + "use": "secondary", + "system": "https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=", + "value": accession, + } + ) + + # title + if study_name: + resource["title"] = study_name + + # Contacts + contact_list = [] + + # Biobank contact + biobank_contact = {} + if biobank_name: + biobank_contact["name"] = biobank_name + if biobank_email: + biobank_contact["telecom"] = [ + {"system": "email", "value": biobank_email} + ] + if biobank_request_link: + biobank_contact["telecom"].append( + {"system": "url", "value": biobank_request_link} + ) + + if biobank_contact: + contact_list = [biobank_contact] + + # Investigator info + investigator_contact = {} + if institution: + investigator_contact["extension"] = [ + { + "url": "https://include-dcc.github.io/include-model-forge/StructureDefinition/contact-detail-institution", + "valueString": institution, + } + ] + if investigator_name: + investigator_contact["name"] = investigator_name + + if investigator_contact: + contact_list.append(investigator_contact) + + 
resource["contact"] = contact_list + + # VBR Request README + if biobank_request_instructions: + resource["note"] = [{"text": biobank_request_instructions}] + + # category + category = {} + if domain: + category["text"] = domain + if category_coding.get(domain): + category.setdefault("coding", []).append( + category_coding[domain] + ) + elif domain == "CANCERANDBIRTHDEFECT": + category["coding"] = [ + category_coding["CANCER"], + category_coding["BIRTHDEFECT"], + ] + if category: + resource.setdefault("category", []).append(category) + + # keyword + if program: + resource.setdefault("keyword", []).append( + { + "coding": [ + { + "system": "https://kf-api-dataservice.kidsfirstdrc.org/studies?program=", + "code": program, + "display": program, + } + ], + "text": program, + } + ) + if short_code: + resource.setdefault("keyword", []).append( + { + "coding": [ + { + "system": "https://kf-api-dataservice.kidsfirstdrc.org/studies?short_code=", + "code": short_code, + "display": short_code, + } + ], + "text": short_code, + } + ) + return resource + + def _build(self, source_dir, **kwargs): + """ + See d3b_api_client_cli.fhir.entity_builders.base + """ + self.source_tables = self.import_source_data(source_dir) + + builders = self.import_reference_builders() + practitioner_role_reference = builders[ + "practitioner_role" + ].fhir_reference + + df = list(self.source_tables.values())[0] + resources = [] + total = df.shape[0] + for i, row in df.iterrows(): + resource = self._build_resource(row, practitioner_role_reference) + resources.append(resource) + self.logger.info( + f"Built {self.resource_type} {i+1}/{total}: {resource['id']}" + ) + + return resources diff --git a/d3b_api_client_cli/fhir/entity_builders/research_study/mapping.py b/d3b_api_client_cli/fhir/entity_builders/research_study/mapping.py new file mode 100644 index 0000000..08922cf --- /dev/null +++ b/d3b_api_client_cli/fhir/entity_builders/research_study/mapping.py @@ -0,0 +1,26 @@ +# noqa + +""" +Map a string 
from d3b_api_client_cli.fhir.constants to +a FHIR coding type +""" + +from d3b_api_client_cli.fhir import constants + +category_coding = { + "BIRTHDEFECT": { + "system": "http://snomed.info/sct", + "code": "276720006", + "display": "Dysmorphism (disorder)", + }, + "CANCER": { + "system": "http://snomed.info/sct", + "code": "86049000", + "display": "Malignant neoplasm, primary (morphologic abnormality)", + }, + "COVID19": { + "system": "http://snomed.info/sct", + "code": "840539006", + "display": "Disease caused by Severe acute respiratory syndrome coronavirus 2", + }, +} diff --git a/d3b_api_client_cli/fhir/entity_builders/research_subject/__init__.py b/d3b_api_client_cli/fhir/entity_builders/research_subject/__init__.py new file mode 100644 index 0000000..9f08dee --- /dev/null +++ b/d3b_api_client_cli/fhir/entity_builders/research_subject/__init__.py @@ -0,0 +1,60 @@ +# noqa + +import logging +import pandas + +from d3b_api_client_cli.config.concept_schema import CONCEPT +from d3b_api_client_cli.fhir.entity_builders.research_subject.mapping import * +from d3b_api_client_cli.fhir.entity_builders.base import ( + FhirResourceBuilder, + set_id_prefix, +) + + +class ResearchSubject(FhirResourceBuilder): + """ + Build FHIR ResearchSubject from Dataservice Participant + """ + + sources = {"required": ["Participant"], "optional": []} + resource_type = "ResearchSubject" + id_prefix = set_id_prefix(resource_type) + + kf_id_col = CONCEPT.PARTICIPANT.TARGET_SERVICE_ID + external_id_col = CONCEPT.PARTICIPANT.ID + kf_id_system = "participants" + external_id_system = f"{kf_id_system}?external_id=" + + reference_builders = ["research_study", "patient"] + + def _build(self, source_dir, **kwargs): + """ + See d3b_api_client_cli.fhir.entity_builders.base + """ + self.source_tables = self.import_source_data(source_dir) + builders = self.import_reference_builders() + patient_reference = builders["patient"].fhir_reference + study_reference = builders["research_study"].fhir_reference + + 
df = list(self.source_tables.values())[0] + + resources = [] + total = df.shape[0] + for i, row in df.iterrows(): + # Initialize resource with basic content + resource = self.init_resource(row) + participant_kf_id = row[CONCEPT.PARTICIPANT.TARGET_SERVICE_ID] + study_kf_id = self.study_id + + resource["status"] = "off-study" + + # Add references + resource["study"] = study_reference(study_kf_id) + resource["individual"] = patient_reference(participant_kf_id) + + self.logger.info( + f"Built {self.resource_type} {i+1}/{total}: {resource['id']}" + ) + resources.append(resource) + + return resources diff --git a/d3b_api_client_cli/fhir/entity_builders/research_subject/mapping.py b/d3b_api_client_cli/fhir/entity_builders/research_subject/mapping.py new file mode 100644 index 0000000..4d40879 --- /dev/null +++ b/d3b_api_client_cli/fhir/entity_builders/research_subject/mapping.py @@ -0,0 +1,8 @@ +# noqa + +""" +Map a string from d3b_api_client_cli.fhir.constants to +a FHIR coding type +""" + +from d3b_api_client_cli.fhir import constants diff --git a/d3b_api_client_cli/fhir/entity_builders/sequencing_center/__init__.py b/d3b_api_client_cli/fhir/entity_builders/sequencing_center/__init__.py new file mode 100644 index 0000000..18cba8b --- /dev/null +++ b/d3b_api_client_cli/fhir/entity_builders/sequencing_center/__init__.py @@ -0,0 +1,63 @@ +# noqa +""" +Module responsible for building FHIR JSON that captures the sequencing center +info +""" + +import logging +import pandas + +from d3b_api_client_cli.config.concept_schema import CONCEPT +from d3b_api_client_cli.fhir.entity_builders.sequencing_center.mapping import * +from d3b_api_client_cli.fhir.entity_builders.base import ( + FhirResourceBuilder, + set_id_prefix, +) + + +class SequencingCenter(FhirResourceBuilder): + """ + Build FHIR Organization from Dataservice SequencingCenter + """ + + sources = {"required": ["SequencingCenter"], "optional": []} + resource_type = "Organization" + id_prefix = set_id_prefix(resource_type) + 
+ + kf_id_col = CONCEPT.SEQUENCING.CENTER.TARGET_SERVICE_ID + external_id_col = CONCEPT.SEQUENCING.CENTER.ID + kf_id_system = "sequencing_centers" + external_id_system = f"{kf_id_system}?external_id=" + + def _build_resource(self, row): + """ + Build FHIR JSON from csv row + """ + # Initialize resource with basic content + resource = self.init_resource(row) + name = row.get(CONCEPT.SEQUENCING.CENTER.NAME) + + # Add other content + # name + if name: + resource["name"] = name + + return resource + + def _build(self, source_dir, **kwargs): + """ + See d3b_api_client_cli.fhir.entity_builders.base + """ + self.source_tables = self.import_source_data(source_dir) + df = list(self.source_tables.values())[0] + + resources = [] + total = df.shape[0] + for i, row in df.iterrows(): + resource = self._build_resource(row) + resources.append(resource) + self.logger.info( + f"Built {self.resource_type} {i+1}/{total}: {resource['id']}" + ) + + return resources diff --git a/d3b_api_client_cli/fhir/entity_builders/sequencing_center/mapping.py b/d3b_api_client_cli/fhir/entity_builders/sequencing_center/mapping.py new file mode 100644 index 0000000..4d40879 --- /dev/null +++ b/d3b_api_client_cli/fhir/entity_builders/sequencing_center/mapping.py @@ -0,0 +1,8 @@ +# noqa + +""" +Map a string from d3b_api_client_cli.fhir.constants to +a FHIR coding type +""" + +from d3b_api_client_cli.fhir import constants diff --git a/d3b_api_client_cli/fhir/entity_builders/specimen_common/mapping.py b/d3b_api_client_cli/fhir/entity_builders/specimen_common/mapping.py new file mode 100644 index 0000000..d255d34 --- /dev/null +++ b/d3b_api_client_cli/fhir/entity_builders/specimen_common/mapping.py @@ -0,0 +1,935 @@ +""" +Sample type to FHIR coding mapping +For FHIR Specimens +""" + +type_coding = { + "Amniocytes": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C118138", + "display": "Reactive Amniocyte", + }, + { + "code": "Reactive-Amniocyte", + "display": "Reactive Amniocyte", + 
"system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "amniotic fluid": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C13188", + "display": "Amniotic Fluid", + }, + { + "code": "Amniotic-Fluid", + "display": "Amniotic Fluid", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "blood": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C17610", + "display": "Blood Sample", + }, + { + "code": "Peripheral-Whole-Blood", + "display": "Peripheral Whole Blood", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Peripheral Whole Blood": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C17610", + "display": "Blood Sample", + }, + { + "code": "Peripheral-Whole-Blood", + "display": "Peripheral Whole Blood", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Blood": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C17610", + "display": "Blood Sample", + }, + { + "code": "Peripheral-Whole-Blood", + "display": "Peripheral Whole Blood", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Blood Derived Cancer - Bone Marrow, Post-treatment": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C164009", + "display": "Bone Marrow Sample", + }, + { + "code": "Bone-Marrow", + "display": "Bone Marrow", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Blood Derived Cancer - Peripheral Blood, Post-treatment": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C17610", + "display": "Blood Sample", + }, + { + "code": "Peripheral-Whole-Blood", + "display": "Peripheral Whole Blood", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Blood EDTA": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C158462", + "display": "EDTA Blood 
Cell Fraction", + }, + { + "code": "EDTA-Blood-Cell-Fraction", + "display": "EDTA Blood Cell Fraction", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Blood-Lymphocyte": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C12535", + "display": "Lymphocyte", + }, + { + "code": "Lymphocyte", + "display": "Lymphocyte", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "bone": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C12366", + "display": "Bone", + }, + { + "code": "Bone", + "display": "Bone", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Bone": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C12366", + "display": "Bone", + }, + { + "code": "Bone", + "display": "Bone", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Bone marrow": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C164009", + "display": "Bone Marrow Sample", + }, + { + "code": "Bone-Marrow", + "display": "Bone Marrow", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Bone Marrow": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C164009", + "display": "Bone Marrow Sample", + }, + { + "code": "Bone-Marrow", + "display": "Bone Marrow", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "brain": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C12439", + "display": "Brain", + }, + { + "code": "Brain", + "display": "Brain", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Brain Tissue": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C12439", + "display": "Brain", + }, + { + "code": "Brain", + "display": "Brain", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Buccal": [ + { + 
"system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C172264", + "display": "Buccal Cell Sample", + }, + { + "code": "Buccal-Cells", + "display": "Buccal Cells", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Buccal Cell Normal": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C172264", + "display": "Buccal Cell Sample", + }, + { + "code": "Buccal-Cells", + "display": "Buccal Cells", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Buccal Cells": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C172264", + "display": "Buccal Cell Sample", + }, + { + "code": "Buccal-Cells", + "display": "Buccal Cells", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Buccal Mucosa": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C12505", + "display": "Buccal Mucosa", + }, + { + "code": "Buccal-Mucosa", + "display": "Buccal Mucosa", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Buffy Coat": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C84507", + "display": "Buffy Coat", + }, + { + "code": "Buffy-Coat", + "display": "Buffy Coat", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Cartilage": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C12373", + "display": "Cartilage", + }, + { + "code": "Cartilage", + "display": "Cartilage", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Cell Freeze": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C12508", + "display": "Cell", + }, + { + "code": "Cell", + "display": "Cell", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Cells": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C12508", + "display": "Cell", + }, + { + "code": "Cell", + 
"display": "Cell", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Cerebral Spinal Fluid": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C185194", + "display": "Cerebrospinal Fluid Sample", + }, + { + "code": "Cerebrospinal-Fluid-Sample", + "display": "Cerebrospinal Fluid Sample", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Cheek Swab": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C113747", + "display": "Buccal Swab", + }, + { + "code": "Buccal-Swab", + "display": "Buccal Swab", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "chest wall": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C62484", + "display": "Chest Wall", + }, + { + "code": "Chest-Wall", + "display": "Chest Wall", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Cyst Fluid": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C2978", + "display": "Cyst", + }, + { + "code": "Cyst", + "display": "Cyst", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "DNA": [ + { + "system": "http://purl.obolibrary.org/obo/obi.owl", + "code": "OBI:0001051", + "display": "DNA extract", + }, + { + "code": "DNA", + "display": "DNA", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "dura": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C32488", + "display": "Dura Mater", + }, + { + "code": "Dura-Mater", + "display": "Dura Mater", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Epstein-Barr Virus Immortalized Cells": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C163993", + "display": "EBV Immortalized Lymphocytes", + }, + { + "code": "EBV-Immortalized-Lymphocytes", + "display": "EBV Immortalized Lymphocytes", + "system": 
"https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Fetal Tissue Liver": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C34169", + "display": "Fetal Liver", + }, + { + "code": "Fetal-Liver", + "display": "Fetal Liver", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Fetal Tissue Unspecified": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C17730", + "display": "Fetal Tissue", + }, + { + "code": "Fetal-Tissue", + "display": "Fetal Tissue", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Fibroblast": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C12482", + "display": "Fibroblast", + }, + { + "code": "Fibroblast", + "display": "Fibroblast", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Fibroblasts": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C12482", + "display": "Fibroblast", + }, + { + "code": "Fibroblast", + "display": "Fibroblast", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Fibroblasts from Bone Marrow Normal": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C12482", + "display": "Fibroblast", + }, + { + "code": "Fibroblast", + "display": "Fibroblast", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "groin": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C12726", + "display": "Inguinal Region", + }, + { + "code": "Inguinal-Region", + "display": "Inguinal Region", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Hair": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C32705", + "display": "Hair", + }, + { + "code": "Hair", + "display": "Hair", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Derived Cell Line": [ + { + "system": 
"http://purl.obolibrary.org/obo/ncit.owl", + "code": "C156445", + "display": "Derived Cell Line", + }, + { + "code": "Derived-Cell-Line", + "display": "Derived Cell Line", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "LCL": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C32941", + "display": "Lateral Ligament", + }, + { + "code": "Lateral-Ligament", + "display": "Lateral Ligament", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Leukocyte": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C12529", + "display": "Leukocyte", + }, + { + "code": "Leukocyte", + "display": "Leukocyte", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "lung": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C33024", + "display": "Lung Tissue", + }, + { + "code": "Lung-Tissue", + "display": "Lung Tissue", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "lymph node": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C12745", + "display": "Lymph Node", + }, + { + "code": "Lymph-Node", + "display": "Lymph Node", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Lymphocytes": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C12535", + "display": "Lymphocyte", + }, + { + "code": "Lymphocyte", + "display": "Lymphocyte", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "marrow": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C164009", + "display": "Bone Marrow Sample", + }, + { + "code": "Bone-Marrow", + "display": "Bone Marrow", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "mediastinum": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C12748", + "display": "Mediastinum", + }, + { + "code": 
"Mediastinum", + "display": "Mediastinum", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Mononuclear Cells": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C178965", + "display": "Peripheral Blood Mononuclear Cell Sample", + }, + { + "code": "Peripheral-Blood-Mononuclear-Cell-Sample", + "display": "Peripheral Blood Mononuclear Cell Sample", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "muscle": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C12435", + "display": "Muscle Tissue", + }, + { + "code": "Muscle-Tissue", + "display": "Muscle Tissue", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Muscle": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C12435", + "display": "Muscle Tissue", + }, + { + "code": "Muscle-Tissue", + "display": "Muscle Tissue", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Myocyte": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C12612", + "display": "Muscle Cell", + }, + { + "code": "Muscle-Cell", + "display": "Muscle Cell", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Negative Lymph Node": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C36174", + "display": "Negative Lymph Node", + }, + { + "code": "Negative-Lymph-Node", + "display": "Negative Lymph Node", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Patient Derived Xenograft": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C122936", + "display": "Patient Derived Xenograft", + }, + { + "code": "Patient-Derived-Xenograft", + "display": "Patient Derived Xenograft", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "PBMC": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C178965", + 
"display": "Peripheral Blood Mononuclear Cell Sample", + }, + { + "code": "Peripheral-Blood-Mononuclear-Cell-Sample", + "display": "Peripheral Blood Mononuclear Cell Sample", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Peripheral blood": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C17610", + "display": "Blood Sample", + }, + { + "code": "Peripheral-Whole-Blood", + "display": "Peripheral Whole Blood", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Plasma": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C185204", + "display": "Plasma Sample", + }, + { + "code": "Plasma", + "display": "Plasma", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Primary Blood Derived Cancer - Bone Marrow": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C164009", + "display": "Bone Marrow Sample", + }, + { + "code": "Bone-Marrow", + "display": "Bone Marrow", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Primary Blood Derived Cancer - Peripheral Blood": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C17610", + "display": "Blood Sample", + }, + { + "code": "Peripheral-Whole-Blood", + "display": "Peripheral Whole Blood", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Recurrent Blood Derived Cancer - Peripheral Blood": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C17610", + "display": "Blood Sample", + }, + { + "code": "Peripheral-Whole-Blood", + "display": "Peripheral Whole Blood", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "RNA": [ + { + "system": "http://purl.obolibrary.org/obo/obi.owl", + "code": "OBI:0000880", + "display": "RNA extract", + }, + { + "system": "https://includedcc.org/fhir/code-systems/sample_types", + "code": "RNA", + "display": 
"RNA", + }, + ], + "Saliva": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C174119", + "display": "Saliva Sample", + }, + { + "code": "Saliva", + "display": "Saliva", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "saliva": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C174119", + "display": "Saliva Sample", + }, + { + "code": "Saliva", + "display": "Saliva", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Serum": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C178987", + "display": "Serum Sample", + }, + { + "code": "Serum", + "display": "Serum", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "skin": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C33563", + "display": "Skin Tissue", + }, + { + "code": "Skin-Tissue", + "display": "Skin Tissue", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Solid Tissue": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C164014", + "display": "Solid Tissue Specimen", + }, + { + "code": "Solid-Tissue", + "display": "Solid Tissue", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Tissue Cell Culture": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C17201", + "display": "Tissue Culture", + }, + { + "code": "Tissue-Culture", + "display": "Tissue Culture", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Tissue FFPE": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C172265", + "display": "Formalin-Fixed Paraffin-Embedded Tissue Sample", + }, + { + "code": "Formalin-Fixed-Paraffin-Embedded-Tissue-Sample", + "display": "Formalin-Fixed Paraffin-Embedded Tissue Sample", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Tissue Flash 
Frozen": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C158417", + "display": "Frozen Tissue", + }, + { + "code": "Frozen-Tissue", + "display": "Frozen Tissue", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Tissue Freezing Media": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C158417", + "display": "Frozen Tissue", + }, + { + "code": "Frozen-Tissue", + "display": "Frozen Tissue", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Tissue Perineum": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C33301", + "display": "Perineum", + }, + { + "code": "Perineum", + "display": "Perineum", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Tumor": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C18009", + "display": "Tumor Tissue", + }, + { + "code": "Tumor-Tissue", + "display": "Tumor Tissue", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], + "Vascular tissue": [ + { + "system": "http://purl.obolibrary.org/obo/ncit.owl", + "code": "C33853", + "display": "Vascular Smooth Muscle Tissue", + }, + { + "code": "Vascular-Smooth-Muscle-Tissue", + "display": "Vascular Smooth Muscle Tissue", + "system": "https://includedcc.org/fhir/code-systems/sample_types", + }, + ], +} + +preservation_method_coding = { + "OCT": { + "code": "OCT", + "display": "Optimal cutting temperature compound", + "system": "https://includedcc.org/fhir/code-systems/preservation_method", + }, + "FFRZ": { + "code": "FFRZ", + "display": "Flash Frozen", + "system": "https://includedcc.org/fhir/code-systems/preservation_method", + }, + "Flash Frozen": { + "code": "FFRZ", + "display": "Flash Frozen", + "system": "https://includedcc.org/fhir/code-systems/preservation_method", + }, + "RT": { + "code": "RT", + "display": "Room Temp", + "system": 
"https://includedcc.org/fhir/code-systems/preservation_method", + }, + "Not Reported": { + "code": "not-reported", + "display": "Not Reported", + "system": "https://includedcc.org/fhir/code-systems/preservation_method", + }, + "CRYO": { + "code": "CRYO", + "display": "CRYO", + "system": "https://includedcc.org/fhir/code-systems/preservation_method", + }, + "FRZN80": { + "code": "FRZN80", + "display": "Frozen at -80", + "system": "https://includedcc.org/fhir/code-systems/preservation_method", + }, + "Cerbral Spinal Fluid": { + "code": "not-reported", + "display": "Not Reported", + "system": "https://includedcc.org/fhir/code-systems/preservation_method", + }, + "FRZN": { + "code": "FRZN", + "display": "Frozen", + "system": "https://includedcc.org/fhir/code-systems/preservation_method", + }, + "FRZN20": { + "code": "FRZN20", + "display": "Frozen At -20", + "system": "https://includedcc.org/fhir/code-systems/preservation_method", + }, + "FRZM": { + "code": "FRZM", + "display": "Freezing Media Type", + "system": "https://includedcc.org/fhir/code-systems/preservation_method", + }, + "RNAL": { + "code": "RNAL", + "display": "RNAlater", + "system": "https://includedcc.org/fhir/code-systems/preservation_method", + }, + "EDTA": { + "code": "EDTA", + "display": "EDTA tube", + "system": "https://includedcc.org/fhir/code-systems/preservation_method", + }, + "FRZNLN": { + "code": "FRZNLN", + "display": "Frozen In Liquid Nitrogen Vapor", + "system": "https://includedcc.org/fhir/code-systems/preservation_method", + }, + "FFPE": { + "code": "FFPE", + "display": "Formalin Fixed Parrafin Embedded", + "system": "https://includedcc.org/fhir/code-systems/preservation_method", + }, + "FRZNDI": { + "code": "FRZNDI", + "display": "Frozen Dry Ice", + "system": "https://includedcc.org/fhir/code-systems/preservation_method", + }, + "UFPE": { + "code": "UFPE", + "display": "Unknown Fixed Parrafin Embedded", + "system": "https://includedcc.org/fhir/code-systems/preservation_method", + }, +} 
diff --git a/d3b_api_client_cli/fhir/entity_builders/vital_status/__init__.py b/d3b_api_client_cli/fhir/entity_builders/vital_status/__init__.py new file mode 100644 index 0000000..4395154 --- /dev/null +++ b/d3b_api_client_cli/fhir/entity_builders/vital_status/__init__.py @@ -0,0 +1,92 @@ +# noqa + +import logging +import pandas + +from d3b_api_client_cli.config.concept_schema import CONCEPT +from d3b_api_client_cli.fhir.entity_builders.vital_status.mapping import * +from d3b_api_client_cli.fhir.entity_builders.base import ( + FhirResourceBuilder, + set_id_prefix, + extension_age_at_event, +) + + +class VitalStatus(FhirResourceBuilder): + """ + Build FHIR Observation from Dataservice Outcome + """ + + sources = {"required": ["Outcome"], "optional": []} + resource_type = "Observation" + id_prefix = set_id_prefix(resource_type) + + kf_id_col = CONCEPT.OUTCOME.TARGET_SERVICE_ID + external_id_col = CONCEPT.OUTCOME.ID + kf_id_system = "outcomes" + external_id_system = f"{kf_id_system}?external_id=" + + reference_builders = ["patient"] + + def _build_resource(self, row, patient_reference): + """ + Build FHIR JSON from csv row + """ + # Initialize resource with basic content + resource = self.init_resource(row) + vital_status = row.get(CONCEPT.OUTCOME.VITAL_STATUS) + event_age_days = row.get(CONCEPT.OUTCOME.EVENT_AGE_DAYS) + + # Add references + participant_kf_id = row[CONCEPT.PARTICIPANT.TARGET_SERVICE_ID] + resource["subject"] = patient_reference(participant_kf_id) + + # Add other content + resource["status"] = "final" + resource["code"] = { + "coding": [ + { + "system": "http://snomed.info/sct", + "code": "263493007", + "display": "Clinical status (attribute)", + } + ], + "text": "Clinical status", + } + # effectiveDateTime + try: + resource["_effectiveDateTime"] = extension_age_at_event( + patient_reference(participant_kf_id), event_age_days + ) + except (ValueError, TypeError): + pass + + # valueCodeableConcept + if vital_status: + value = {"text": vital_status} + if 
code_coding.get(vital_status): + value.setdefault("coding", []).append(code_coding[vital_status]) + resource["valueCodeableConcept"] = value + + return resource + + def _build(self, source_dir, **kwargs): + """ + See d3b_api_client_cli.fhir.entity_builders.base + """ + self.source_tables = self.import_source_data(source_dir) + df = list(self.source_tables.values())[0] + + builders = self.import_reference_builders() + patient_reference = builders["patient"].fhir_reference + + resources = [] + total = df.shape[0] + for i, row in df.iterrows(): + resource = self._build_resource(row, patient_reference) + self.logger.info( + f"Built {self.resource_type} {i+1}/{total}: {resource['id']}" + ) + resources.append(resource) + + return resources diff --git a/d3b_api_client_cli/fhir/entity_builders/vital_status/mapping.py b/d3b_api_client_cli/fhir/entity_builders/vital_status/mapping.py new file mode 100644 index 0000000..fca4b19 --- /dev/null +++ b/d3b_api_client_cli/fhir/entity_builders/vital_status/mapping.py @@ -0,0 +1,21 @@ +# noqa + +""" +Map a string from d3b_api_client_cli.fhir.constants to +a FHIR coding type +""" + +from d3b_api_client_cli.fhir import constants + +code_coding = { + constants.OUTCOME.VITAL_STATUS.ALIVE: { + "system": "http://snomed.info/sct", + "code": "438949009", + "display": "Alive (finding)", + }, + constants.OUTCOME.VITAL_STATUS.DEAD: { + "system": "http://snomed.info/sct", + "code": "419099009", + "display": "Dead (finding)", + }, +} diff --git a/d3b_api_client_cli/fhir/get.py b/d3b_api_client_cli/fhir/get.py new file mode 100644 index 0000000..a8c7eef --- /dev/null +++ b/d3b_api_client_cli/fhir/get.py @@ -0,0 +1,128 @@ +""" +Get FHIR resources from the FHIR server efficiently +""" + +import time +import logging +from urllib.parse import urlparse + +from requests.auth import HTTPBasicAuth + +from d3b_api_client_cli.config import ( + config, +) +from d3b_api_client_cli.utils import ( + elapsed_time_hms, + send_request, +) + +logger = 
logging.getLogger(__name__) + +config = config["fhir"] + + +def get(url, username, password, headers=None, params=None): + """ + Helper to GET FHIR resource + """ + resp = send_request( + "get", + url, + params=params, + auth=HTTPBasicAuth(username, password), + headers=headers, + ) + return resp.json() + + +def _replace_host_with_origin(base_url, link_url): + """ + Replace hostname in link_url with base_url hostname if different + + This is needed to support pagination on the legacy FHIR servers since + some of the legacy servers do not return pagination links with the + externally resolvable hostname that was requested and + instead return http://localhost:8000 or wherever it is deployed behind + the load balancer + """ + link_result = urlparse(link_url) + base_result = urlparse(base_url) + if link_result.netloc != base_result.netloc: + link_url = ( + f"{base_result.scheme}://{base_result.netloc}" + f"{link_result.path}?{link_result.query}" + ) + + return link_url + + +def get_all(base_url, resource_type, username, password, params=None): + """Paginate and fetch all FHIR resources by resource_type and + any query params supplied by caller + + :param base_url: Base url of the FHIR service + :type base_url: str + :param resource_type: FHIR endpoint + :type resource_type: str + :param params: dict of query params + :type params: dict + :returns: list of dict responses + """ + url = "/".join(part.strip("/") for part in [base_url, resource_type]) + headers = {"Content-Type": "application/json"} + + if not params: + params = {} + + # Get total + params["_total"] = "accurate" + total = get(url, username, password, headers, params=params)["total"] + + # Remove this bc getting total can be an expensive FHIR operation + params.pop("_total") + + logger.info(f"🧺 Begin fetching {resource_type} {total} entities") + start_time = time.time() + + # Fetch pages of resources + resources = [] + count = 0 + while count < total: + params.update({"_count": 100}) + body = get(
_replace_host_with_origin(base_url, url), + username, + password, + headers=headers, + params=params, + ) + # Check if any resources were in the page + entry = body.get("entry", []) + if not entry: + return + + # Add page of resources to output + page = [entry.get("resource") for entry in body.get("entry", [])] + resources.extend(page) + + count += int(len(entry)) + logger.info(f"📄 Fetched page of {resource_type}. Seen {count}/{total}") + + # Check if there are any more pages to fetch + url = None + links = body.get("link", []) + for link in links: + if link["relation"] == "next": + url = link["url"] + break + # This is the last page. Last page link is in self + if not url: + for link in links: + if link["relation"] == "self": + url = link["url"] + break + + logger.info(f"⏰ Elapsed time (hh:mm:ss): {elapsed_time_hms(start_time)}") + logger.info("✅ Completed fetching resources") + + return resources diff --git a/d3b_api_client_cli/fhir/loader.py b/d3b_api_client_cli/fhir/loader.py new file mode 100644 index 0000000..359df86 --- /dev/null +++ b/d3b_api_client_cli/fhir/loader.py @@ -0,0 +1,240 @@ +""" +Load FHIR JSON into the FHIR server efficiently +""" + +import os +import shutil +import time +import logging +from collections import defaultdict +from urllib.parse import urlparse +from concurrent.futures import ThreadPoolExecutor, as_completed +from pprint import pformat + +import requests +from requests.auth import HTTPBasicAuth + +from d3b_api_client_cli.config import ( + config, + LOAD_DIR, + SKIP_ENTITIES, + KidsFirstFhirEntity, + ImagingFhirEntity, +) +from d3b_api_client_cli import utils + +logger = logging.getLogger(__name__) + +config = config["fhir"] +entity_load_order = [et.value for et in KidsFirstFhirEntity] + [ + et.value for et in ImagingFhirEntity +] +valid_fhir_types = set(entity_load_order) + + +def _do_put( + base_url, endpoint, username, password, payload, ignore_errors=None +): + """ + Helper function to PUT FHIR resource + """ + resource_id = 
payload["id"] + url = "/".join( + part.strip("/") for part in [base_url, endpoint, resource_id] + ) + headers = {"Content-Type": "application/json"} + results = { + "url": url, + "resp": None, + "failed": False, + } + try: + resp = utils.send_request( + "put", + url, + ignore_status_codes=None, + headers=headers, + auth=HTTPBasicAuth(username, password), + json=payload, + ) + except requests.exceptions.HTTPError as e: + results["failed"] = True + results["resp"] = {"id": resource_id} + results["error"] = str(e) + + if ignore_errors is False: + raise + else: + results["resp"] = resp.json() + + return results + + +def _process_result(result, results, i, total, counts, entity_type): + """ + Helper to process result of _do_put + """ + url = result.get("url") + url_path = urlparse(url).path + resp = result.get("resp") + failed = result["failed"] + + if entity_type not in counts: + counts[entity_type] = {"success": 0, "failed": 0} + + if not failed: + counts[entity_type]["success"] += 1 + results["success"].append(resp) + logger.info(f"PUT {url_path}, #{i + 1}/{total}") + else: + counts[entity_type]["failed"] += 1 + results["failed"].append({"resp": resp, "error": result["error"]}) + logger.info(f"FAILED PUT {url_path} #{i + 1}/{total}") + + return resp + + +def _write_results(entity_type, results, output_dir): + """ + Write put results to file + """ + if results["failed"]: + fp = os.path.join(output_dir, "failed", f"{entity_type}.json") + utils.write_json(results["failed"], fp) + logger.info( + f"❌ Failed to update {len(results['failed'])} {entity_type}" + ) + + if results["success"]: + fp = os.path.join(output_dir, "success", f"{entity_type}.json") + utils.write_json(results["success"], fp) + logger.info(f"✅ Updated {len(results['success'])} {entity_type}") + + logger.info(f"Wrote results to {output_dir}") + + +def load_data( + base_url, + data_dir, + output_dir=None, + username=None, + password=None, + use_async=True, + entities_to_load=None, + 
ignore_load_errors=False, + cleanup=False, +): + """ + Read FHIR json files and load into FHIR service + + :param base_url: Base url of the FHIR service + :type base_url: str + :param data_dir: Dir where data is loaded from + :type data_dir: str + :param use_async: A flag to determine whether to use multi-threading when + sending requests to the server + :param cleanup: if true, delete the contents of ETL stage directories + before running ETL + :type cleanup: boolean + :type use_async: boolean + :returns: None + """ + if not output_dir: + output_dir = LOAD_DIR + + if cleanup: + shutil.rmtree(output_dir, ignore_errors=True) + os.makedirs(os.path.join(output_dir, "success"), exist_ok=True) + os.makedirs(os.path.join(output_dir, "failed"), exist_ok=True) + + start_time = time.time() + + if not entities_to_load: + entities_to_load = valid_fhir_types + else: + entities_to_load = set(entities_to_load) + + # NOTE - This is temporary until Dewrangle is able to support these having + # these types in multiple studies + entities_to_load = entities_to_load - SKIP_ENTITIES + + study_id = None + if not (username and password): + username = config["username"] + password = config["password"] + + counts = defaultdict(int) + for entity_type in entity_load_order: + if entity_type not in entities_to_load: + logger.info( + f"⏭️ Skip {entity_type}. 
User did not include it in" + " entities_to_load" + ) + continue + + # NOTE: VERY IMPORTANT + # Non-aliquot specimens must be loaded synchronously because they + # have to be loaded in a specific order which satisfies the specimen + # tree + save_async = use_async + if entity_type == "parent_specimen": + use_async = False + else: + use_async = save_async + + filename = f"{entity_type}.json" + filepath = os.path.join(data_dir, filename) + if not os.path.exists(filepath): + logger.warning( + f"⚠️ Skipping {entity_type}, data file does not exist" + ) + continue + data = utils.read_json(filepath) + + if not study_id and (entity_type == "research_study"): + study_id = data[0]["id"].upper().replace("-", "_") + + results = {"failed": [], "success": []} + total = len(data) + logger.info(f"👉 Begin loading {entity_type} {total} entities") + + if use_async: + logger.info("⚡️ Using async loading ...") + with ThreadPoolExecutor() as tpex: + futures = [] + for i, payload in enumerate(data): + endpoint = payload["resourceType"] + futures.append( + tpex.submit( + _do_put, + base_url, + endpoint, + username, + password, + payload, + ignore_errors=ignore_load_errors, + ) + ) + for i, f in enumerate(as_completed(futures)): + _process_result( + f.result(), results, i, total, counts, entity_type + ) + else: + logger.info("🐌 Using synchronous loading ...") + for i, payload in enumerate(data): + endpoint = payload["resourceType"] + result = _do_put( + base_url, endpoint, username, password, payload + ) + _process_result(result, results, i, total, counts, entity_type) + + # Write results + _write_results(entity_type, results, output_dir) + + logger.info(f"🔢 Load Counts:\n{pformat(dict(counts))}") + logger.info( + f"⏰ Elapsed time (hh:mm:ss): {utils.elapsed_time_hms(start_time)}" + ) + logger.info(f"✅ Completed load") + + return counts diff --git a/d3b_api_client_cli/fhir/stats.py b/d3b_api_client_cli/fhir/stats.py new file mode 100644 index 0000000..c2f8f45 --- /dev/null +++ 
b/d3b_api_client_cli/fhir/stats.py @@ -0,0 +1,118 @@ +""" +Collect stats like total counts of KF entities in FHIR server +""" + +import os +import logging +import urllib.parse + +import pandas + +from d3b_api_client_cli.config import config +from d3b_api_client_cli.fhir.counts import get_counts + +logger = logging.getLogger(__name__) + +config = config["fhir"] +FHIR_BASE_URL = config["base_url"] +FHIR_USERNAME = config["username"] +FHIR_PW = config["password"] + +fhir_queries = config["mapping"] +dataservice_to_kf_fhir_map = { + "Participant": ["patient"], + "Biospecimen": ["child_specimen", "parent_specimen"], + "Family": ["family"], + "FamilyRelationship": ["family_relationship"], + "BiospecimenDiagnosis": ["histopathology"], + "Phenotype": ["phenotype"], + "Diagnosis": ["disease"], + "Outcome": ["vital_status"], + "GenomicFile": ["drs_document_reference", "drs_document_reference_index"], +} + + +def count_df(studies, fhir_url, output_dir): + """ + Write out the entity counts to file + """ + study_ids = [s["kf_id"] for s in studies] + + # Dataservice entity counts + fp = os.path.join(output_dir, "fhirservice_counts.csv") + counts = api_entity_counts(study_ids, fhir_base_url=fhir_url) + df = pandas.DataFrame(counts) + + # Join Study info with counts + study_df = pandas.DataFrame(studies) + df = pandas.merge(df, study_df, left_on="study_id", right_on="kf_id") + df = df[["study_id", "short_code", "name", "entity_type", "count"]] + df.to_csv(fp, index=False) + + logger.info(f"✏️ Wrote FHIR service counts to {fp}") + + return df + + +def api_entity_counts( + study_ids, + fhir_base_url=FHIR_BASE_URL, + username=FHIR_USERNAME, + password=FHIR_PW, + legacy_server=True, +): + """ + Produce a table of entity type counts for each study in FHIR service + + Format: + + -------------------------------------------------- + | Study KF ID | Entity Type | Count | Query String| + -------------------------------------------------- + | ... 
| + """ + + logger.info("🔢 Collecting FHIR resource counts ...") + + # We have to do this for legacy server bc the GenomicFile + # total = drs_document_reference + drs_document_reference_index + # but in legacy server these two queries are the same since + # there is no way to differentiate between the two + # Same is true for Biospecimen. Biospecimen = + # child_specimen + parent_specimen but the queries don't + # differentiate + if legacy_server: + dataservice_to_kf_fhir_map["Biospecimen"] = ["child_specimen"] + dataservice_to_kf_fhir_map["GenomicFile"] = ["drs_document_reference"] + + data = [] + for study_id in study_ids: + logger.info(f"👩🏻‍🔬 Study {study_id} ...") + + # Get KF FHIR entity counts + counts = get_counts(study_id, legacy_server=legacy_server) + count_by_type = {item["entity_type"]: item["total"] for item in counts} + # Map KF FHIR entity counts to Dataservice entity counts + for entity_type, kf_fhir_entities in dataservice_to_kf_fhir_map.items(): + total = 0 + queries = [] + endpoints = [] + for kf_fhir_entity in kf_fhir_entities: + cfg = fhir_queries[kf_fhir_entity] + queries.append(cfg["params"]) + endpoints.append(cfg["endpoint"]) + total += count_by_type[kf_fhir_entity] + + stats = { + "study_id": study_id, + "entity_type": entity_type, + "count": total, + "url": fhir_base_url, + "endpoint": ",".join(endpoints), + "query": ",".join( + urllib.parse.urlencode(params) for params in queries + ), + } + data.append(stats) + logger.info(f"{entity_type} count: {stats['count']}") + return data diff --git a/d3b_api_client_cli/utils/__init__.py b/d3b_api_client_cli/utils/__init__.py index d52bf46..aec2dad 100644 --- a/d3b_api_client_cli/utils/__init__.py +++ b/d3b_api_client_cli/utils/__init__.py @@ -1,6 +1,4 @@ -""" -Top-level utilities module -""" - from d3b_api_client_cli.utils.misc import * +from d3b_api_client_cli.utils.data import * +from d3b_api_client_cli.utils.db import * from d3b_api_client_cli.utils.io import * diff --git 
a/d3b_api_client_cli/utils/data.py b/d3b_api_client_cli/utils/data.py new file mode 100644 index 0000000..be7ba02 --- /dev/null +++ b/d3b_api_client_cli/utils/data.py @@ -0,0 +1,337 @@ +""" +Miscellaneous utility functions for dealing with data +""" + +import os +import re +from itertools import zip_longest +from pprint import pprint +from inspect import signature +from collections import defaultdict +import logging + +import requests +import pandas + +from d3b_api_client_cli.config import config + +DEFAULT_TABLE_BATCH_SIZE = 1000 + +logger = logging.getLogger(__name__) + + +def read_df(filepath, cleanup=True): + """ + Read table in from disk and safely convert values to strings + + This is necessary since pandas does not do a good job coercing values to + their correct types + + NaN convert to None + All other values get converted to strings + """ + fp = filepath + fn = os.path.split(fp)[-1] + df = None + + logger.info(f"📚 Reading data for {fn} ...") + df = pandas.read_csv(fp, index_col=False) + + if cleanup: + df = clean_up_df(df) + logger.info(f"🧼 Cleaning up dataframe {fn}, m x n: {df.shape}") + + return df + + +def write_df(df, filepath, msg=None, cleanup=False, **kwargs): + """ + Write Pandas Dataframe to disk. 
Optionally call clean_up_df on it before + writing it to file + """ + fn = os.path.split(filepath)[-1] + if cleanup: + logger.info(f"🧼 Cleaning up dataframe {fn} before write") + df = clean_up_df(df) + + df.to_csv(filepath, index=False, **kwargs) + + +def df_exists(df): + """ + Check that DF is a pandas DataFrame and not empty + """ + return isinstance(df, pandas.DataFrame) and (not df.empty) + + +def chunked_dataframe_reader( + filepath, batch_size=DEFAULT_TABLE_BATCH_SIZE, **read_csv_kwargs +): + """ + Return a generator to iterate over the Dataframe rows in batches + + :param filepath: Path to the tabular file + :type filepath: str + :param batch_size: Number of rows to read from the df in one iteration + :type batch_size: int + :param read_csv_kwargs: pandas.read_csv kwargs + :type read_csv_kwargs: dict + :yields: pandas.Dataframe + """ + read_csv_kwargs.pop("chunksize", None) + count = 0 + for i, chunk in enumerate( + pandas.read_csv(filepath, chunksize=batch_size, **read_csv_kwargs) + ): + nrows = chunk.shape[0] + count += nrows + logger.debug(f"Reading {nrows} rows of {count} seen") + yield chunk + + +def merge_wo_duplicates(left, right, left_name=None, right_name=None, **kwargs): + """ + Merge two dataframes and return a dataframe with no duplicate columns. + + If duplicate columns result from the merge, resolve duplicates by + filling nans on the left column with values from the right column. 
+ + :param left: left dataframe + :type left: Pandas.DataFrame + :param left_name: Optional name of left DataFrame to use in logging + the DataFrame's uniques using nunique() + :type left_name: str + :param right_name: Optional name of right DataFrame to use in logging + the DataFrame's uniques using nunique() + :type right_name: str + :param right: right dataframe + :type right: Pandas.DataFrame + :param kwargs: keyword args expected by Pandas.merge function + """ + left = left.astype(object) + right = right.astype(object) + left_name = left_name or "Left" + right_name = right_name or "Right" + + # Check if merge col(s) are present in DataFrame + def check_merge_col(merge_on, df_name, df, err_msgs): + if isinstance(merge_on, str): + merge_on = [merge_on] + for col in merge_on: + if col not in df.columns: + err_msgs.append(f"'{col}' not found in {df_name}: {df.columns}") + return err_msgs + + err = [] + if "on" in kwargs: + on = kwargs["on"] + err = check_merge_col(on, left_name, left, err) + err = check_merge_col(on, right_name, right, err) + elif ("left_on" in kwargs) and ("right_on" in kwargs): + err = check_merge_col(kwargs["left_on"], left_name, left, err) + err = check_merge_col(kwargs["right_on"], right_name, right, err) + else: + err = [ + ( + "Missing merge column keyword argument(s). Must supply either `on`" + " or both `left_on` and `right_on` arguments." 
+ ) + ] + + if err: + raise Exception("\n".join(err)) + + def resolve_duplicates(df, suffixes): + l_suffix = suffixes[0] + r_suffix = suffixes[1] + + while True: + to_del = set() + for coll in df.columns: + if coll.endswith(l_suffix): + firstpart = coll.split(l_suffix)[0] + colr = firstpart + r_suffix + inconsistent = ( + (df[coll] != df[colr]) + & df[coll].notna() + & df[colr].notna() + ) + if any(inconsistent): + raise Exception( + "Inconsistent data between left and right DFs.\n" + f"Kwargs: {kwargs}\n" + f"Left side was:\n{left}\n" + f"Right side was:\n{right}\n" + f"Intermediate was:\n{df}\n" + f"Merge collision between: {coll} and {colr}\n" + "Mismatching values:\n" + f"{df[[coll, colr]][inconsistent]}" + ) + df[firstpart] = df[coll].fillna(df[colr]) + to_del.update([coll, colr]) + if not to_del: + break + else: + for c in to_del: + del df[c] + return df + + merged = pandas.merge(left, right, **kwargs) + reduced = resolve_duplicates(merged, kwargs.pop("suffixes", ("_x", "_y"))) + + default_how = signature(pandas.merge).parameters["how"].default + + # Hopefully this will help us know that we didn't lose anything important + # in the merge + collective_uniques = defaultdict(set) + for c in left.columns: + collective_uniques[c] |= set(left[c]) + for c in right.columns: + collective_uniques[c] |= set(right[c]) + collective_uniques = pandas.DataFrame( + {c: [len(v)] for c, v in collective_uniques.items()} + ) + msg = ( + f'*** {kwargs.get("how", default_how).title()} merge {left_name} with ' + f"{right_name}***\n" + f"-- Left+Right Collective Uniques --\n{collective_uniques}\n" + f"-- Merged DataFrame Uniques --\n{reduced.nunique()}" + ) + logger.debug(msg) + + return reduced + + +def convert_to_downcasted_str(val, replace_na=False, na=None): + """ + Converts values to stripped strings while collapsing downcastable floats. 
+ + Examples: + convert_to_downcasted_str(1) -> "1" + convert_to_downcasted_str(1.0) -> "1" + convert_to_downcasted_str("1_1 ") -> "1_1" + convert_to_downcasted_str(None) -> None + convert_to_downcasted_str(None, True, "") -> "" + + If you're wondering what this is good for, try the following: + import pandas + df1 = pandas.DataFrame({"a":[1,2,3,None]}, dtype=object) + df2 = pandas.read_json(df1.to_json(), dtype=object) + str(df1['a'][0]) == str(df2['a'][0]) # this returns False. Yuck. + df1 = df1.map(convert_to_downcasted_str) + df2 = df2.map(convert_to_downcasted_str) + str(df1['a'][0]) == str(df2['a'][0]) # this returns True. Good. + + :param val: any basic type + :param replace_na: should None/NaN/blank values be replaced with something + :type replace_na: boolean + :param na: if replace_na is True, what should None/NaN/blank values be + replaced with + + + :return: new representation of `val` + """ + if isinstance(val, list): + # make hashable without changing style or losing comparability + return str(sorted(convert_to_downcasted_str(v) for v in val)) + if isinstance(val, dict): + # make hashable without changing style or losing comparability + return str( + dict( + sorted( + (k, convert_to_downcasted_str(v)) for k, v in val.items() + ) + ) + ) + if pandas.isnull(val): + if replace_na: + return na + else: + return val + + val = str(val).strip() + if val != "": + # Try downcasting val + i_val = None + f_val = None + try: + f_val = float(val) + i_val = int(f_val) + except Exception: + pass + + # Don't automatically change anything with leading zeros + # (except something that equates to int 0), scientific + # notation, or underscores (I don't care what PEP 515 says). 
+ if (i_val != 0) and ( + (val[0] == "0") or (not re.fullmatch(r"[\d.]+", val)) + ): + return val + + # Return str version of downcasted val + if i_val == f_val: + return str(i_val) + + elif replace_na: + return na + + return val + + +def str_to_obj(var): + """ + Convert a string that looks like a list, dict, tuple, or bool back into its + native object form. + """ + if not isinstance(var, str): + return var + elif var.startswith(("[", "{", "(")): + try: + return ast.literal_eval(var) + except Exception: + pass + else: + lowvar = var.strip().lower() + if lowvar == "false": + return False + elif lowvar == "true": + return True + return var + + +def recover_containers_from_df_strings(df): + """ + Undo one bit of necessary madness imposed by clean_up_df where lists and + dicts (both unhashable) are sorted and then converted to strings for + safekeeping. This finds strings that look like lists, dicts, or tuples and + converts them back to their native forms. + + :param df: a pandas DataFrame + :return: Dataframe with object-like strings converted to native objects + :rtype: DataFrame + """ + return df.map(str_to_obj) + + +def clean_up_df(df): + """ + We can't universally control which null type will get used by a data + file loader, and it might also change, so let's always push them all + to None because other nulls are not our friends. It's easier for a + configurator to equate empty spreadsheet cells with None than e.g. + numpy.nan. + + Typed loaders like pandas.read_json force us into storing numerically + typed values. And then nulls, which read_json does not let you handle + inline, cause pandas to convert perfectly good ints into ugly floats. + So here we get any untidy values back to nice and tidy strings. 
+ + :param df: a pandas DataFrame + :return: Dataframe with numbers converted to strings and NaNs/blanks + converted to None + :rtype: DataFrame + """ + + return df.map( + lambda x: convert_to_downcasted_str(x, replace_na=True, na=None) + ).drop_duplicates() diff --git a/d3b_api_client_cli/utils/db.py b/d3b_api_client_cli/utils/db.py new file mode 100644 index 0000000..2590dd4 --- /dev/null +++ b/d3b_api_client_cli/utils/db.py @@ -0,0 +1,214 @@ +""" +Database utility functions +""" + +import os +import time +import logging + +from psycopg2 import sql, connect +import psycopg2.extras +import sqlalchemy + +from d3b_api_client_cli.utils.data import chunked_dataframe_reader +from d3b_api_client_cli.utils.misc import elapsed_time_hms + +logger = logging.getLogger(__name__) + +DEFAULT_BATCH_SIZE = 10000 + + +def load_table_from_file( + filepath, + schema_name, + table_name, + batch_size=DEFAULT_BATCH_SIZE, + sqla_engine=None, + dispose_at_end=False, + **db_conn_args, +): + """ + Create/update a table within the specified schema and load the data in + into the specified table with the data from the file + + :param filepath: Path to csv file containing table data + :type filepath: str + :param schema_name: Name of db schema where tables get created + :type schema_name: str + :param table_name: Name of db table where table will be upserted + :type table_name: str + :param batch_size: Number of rows to upsert at a time + :type batch_size: int + :param sqla_engine: Existing sqlalchemy engine instance + :type sqla_engine: sqlalchemy.engine.Engine + :param dispose_at_end: Whether to close the connection after load + :type dispose_at_end: boolean + :param db_conn_args: + :type db_conn_args: + """ + logger.info( + f"🗃️ Starting to load {filepath} into {schema_name}.{table_name}" + ) + start_time = time.time() + + if not sqla_engine: + # Create connection to db + try: + username = db_conn_args["username"] + password = db_conn_args["password"] + hostname = db_conn_args["host"] + 
port = db_conn_args["port"] + db_name = db_conn_args["db_name"] + except KeyError: + logger.error("❌ Not enough inputs to connect to database!") + raise + + sqla_engine = sqlalchemy.create_engine( + f"postgresql://{username}:{password}@{hostname}:{port}/{db_name}", + connect_args={"connect_timeout": 5}, + ) + dispose_at_end = True + + filename = os.path.split(filepath)[-1] + count = 0 + + # Stream data from file + logger.info(f"Streaming file {filename} into db table {table_name}...") + for _, df in enumerate(chunked_dataframe_reader(filepath)): + # Bulk insert rows into db table + df.to_sql( + table_name, + sqla_engine, + schema=schema_name, + if_exists="replace", + index=False, + method="multi", + chunksize=batch_size, + ) + count += df.shape[0] + logger.info(f"-- Loaded {count} total rows") + + if dispose_at_end: + sqla_engine.dispose() + + end_time = elapsed_time_hms(start_time) + logger.info(f"⏰ Elapsed time (hh:mm:ss): {end_time}") + + +def create_db_schema(conn, schema_name): + """ + Create schema in db specified by the conn Connection object + + :param conn: an existing psycopg2 connection + :type conn: resulting object from psycopg2.connect + :param schema_name: Name of db schema to create + :type schema_name: str + """ + logger.info(f"✨ Creating new schema {schema_name} in database ...") + with conn.cursor() as cursor: + conn.autocommit = True + cursor.execute( + sql.SQL( + "CREATE SCHEMA {0};", + ).format( + sql.Identifier(schema_name), + ) + ) + + +def create_db_user(conn, user, password): + """ + Alter db user if it exists + Create a new db user if it does not exist + + :param conn: an existing psycopg2 connection + :type conn: resulting object from psycopg2.connect + :param user: username to create + :type user: str + :param password: password to set for the user + :type password: str + """ + logger.info(f"✨ Upserting user {user} in database ...") + with conn.cursor() as cursor: + conn.autocommit = True + cursor.execute( + sql.SQL( + """ + DO $$ + BEGIN + IF NOT EXISTS
( + SELECT 1 + FROM pg_roles + WHERE rolname = {user_literal} + ) THEN + CREATE USER {user} WITH + CONNECTION LIMIT 1000 + LOGIN ENCRYPTED PASSWORD {password}; + ELSE + ALTER USER {user} WITH + ENCRYPTED PASSWORD {password}; + END IF; + END + $$; + """ + ).format( + user_literal=sql.Literal(user), + user=sql.Identifier(user), + password=sql.Literal(password), + ) + ) + + +def grant_db_privileges(conn, schema_name, schema_owner, username): + """ + Grant select privileges to user on schema + + :param conn: an existing psycopg2 connection + :type conn: resulting object from psycopg2.connect + :param schema_name: Name of db schema to grant privileges to + :type schema_name: str + :param username: username to grant select privileges to + :type username: str + """ + with conn.cursor() as cursor: + conn.autocommit = True + query = sql.SQL( + "GRANT USAGE ON SCHEMA {schema_name} TO {username};" + "ALTER DEFAULT PRIVILEGES FOR USER {schema_owner} IN SCHEMA " + "{schema_name} " + "GRANT ALL ON TABLES TO {username};" + ).format( + username=sql.Identifier(username), + schema_owner=sql.Identifier(schema_owner), + schema_name=sql.Identifier(schema_name), + ) + cursor.execute(query) + + +def run_query(query_string, username, password, dbname, host, port): + """ + Run a basic sql query with no variables + """ + + try: + with connect( + dbname=dbname, + user=username, + password=password, + host=host, + port=port, + ) as conn: + with conn.cursor( + cursor_factory=psycopg2.extras.RealDictCursor + ) as cursor: + cursor.execute(query_string) + rows = cursor.fetchall() + except psycopg2.OperationalError as e: + logger.error( + "❌ Could not execute query due to failure to connect to db:" + f" {host}:{port}/{dbname}" + " Check your connection details in the environment" + ) + raise e + + return rows diff --git a/d3b_api_client_cli/utils/io/__init__.py b/d3b_api_client_cli/utils/io.py similarity index 100% rename from d3b_api_client_cli/utils/io/__init__.py rename to 
d3b_api_client_cli/utils/io.py diff --git a/d3b_api_client_cli/utils/misc.py b/d3b_api_client_cli/utils/misc.py new file mode 100644 index 0000000..57a242a --- /dev/null +++ b/d3b_api_client_cli/utils/misc.py @@ -0,0 +1,361 @@ +""" +Miscellaneous utility functions +""" + +import os +import importlib +import re +import time +import logging +import json +import datetime +from pprint import pformat +from urllib.parse import urlparse +import pandas +import requests +from requests.adapters import HTTPAdapter +from requests.packages.urllib3.util.retry import Retry + +from d3b_api_client_cli.config import config + +DEFAULT_TABLE_BATCH_SIZE = 1000 + +logger = logging.getLogger(__name__) + +LOCAL_HOSTS = { + "localhost", + "127.0.0.1", +} + + +def df_exists(df: pandas.DataFrame) -> bool: + """ + Check that DF is a pandas DataFrame and not empty + """ + return isinstance(df, pandas.DataFrame) and (not df.empty) + + +def timestamp(): + """ + Helper to create an ISO 8601 formatted string that represents local time + and includes the timezone info. 
+ """ + # Calculate the offset taking into account daylight saving time + # https://stackoverflow.com/questions/2150739/iso-time-iso-8601-in-python + if time.localtime().tm_isdst: + utc_offset_sec = time.altzone + else: + utc_offset_sec = time.timezone + utc_offset = datetime.timedelta(seconds=-utc_offset_sec) + t = ( + datetime.datetime.now() + .replace(tzinfo=datetime.timezone(offset=utc_offset)) + .isoformat() + ) + + return str(t) + + +def get_id(link_name, entity): + """ + Extract a KF ID from the _links dict in an entity that was in a Dataservice + response json + """ + return entity.get("_links", {}).get(link_name, "").split("/")[-1] + + +def get_total(base_url, endpoint, study_id, params={}): + """ + Get total entity count in Dataservice + """ + params.update({"study_id": study_id, "limit": 1}) + url = "/".join(part.strip("/") for part in [base_url, endpoint]) + headers = {"Content-Type": "application/json"} + + resp = send_request("get", url, headers=headers, params=params) + return resp.json()["total"] + + +def is_localhost(url): + """ + Determine whether url is on localhost + """ + url = url.strip("/") + host = urlparse(url).netloc.split(":")[0] + return (host in LOCAL_HOSTS) or ( + any([url.startswith(h) for h in LOCAL_HOSTS]) + ) + + +def delete_safety_check(url, error_msg=None): + """ + Check if the url is on localhost and raise an exception if it is not. This + method is used in delete operations where you want to protect against + deletions on hosts other than localhost + """ + DELETE_SAFETY_CHECK = os.environ.get("DWDS_DELETE_SAFETY_CHECK", True) + if str(DELETE_SAFETY_CHECK).lower() == "false": + return + + if is_localhost(url): + # If localhost, we are allowed to delete + pass + else: + if not error_msg: + error_msg = ( + f"❌ Cannot delete from {url} because env variable" + f" DWDS_DELETE_SAFETY_CHECK=True. Resources that are not in" + f" {LOCAL_HOSTS} will not be deleted. 
def import_module_from_file(filepath):
    """
    Import a Python module given a filepath

    The module name is the file's base name up to the first ".". The module
    code is executed and the module object returned; this function does not
    add it to sys.modules.

    :param filepath: path to the Python source file
    :type filepath: str
    :raises ImportError: if no import spec or loader can be determined for
        filepath (for example, an unsupported file extension)
    :returns: the imported module
    """
    # `import importlib` alone does not guarantee that the `util` submodule
    # is loaded, so import it explicitly instead of relying on some other
    # package having imported it first
    import importlib.util

    module_name = os.path.basename(filepath).split(".")[0]
    spec = importlib.util.spec_from_file_location(module_name, filepath)
    # spec_from_file_location returns None when it cannot pick a loader
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot import a Python module from {filepath!r}")

    imported_module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(imported_module)
    return imported_module
def send_request(method, *args, ignore_status_codes=None, **kwargs):
    """Send http request. Raise exception on HTTP error status codes

    The response is checked with `raise_for_status()`; the resulting
    HTTPError is logged and re-raised unless the response status code is
    listed in ignore_status_codes, in which case the response is returned.

    :param method: name of the requests method to call
    :type method: str
    :param args: positional arguments passed to the requests method
    :param ignore_status_codes: status code(s) that should not raise. May be
        a single int or str, or an iterable of ints/strs
    :param kwargs: keyword arguments passed to the requests method, except:
        - use_retries (default False): send through a retrying session
        - session: session to use when use_retries is truthy; when omitted,
          requests_retry_session() is used
        - timeout: a falsy value selects the default (6.05, 120); a negative
          number removes the timeout; anything else is passed through
    :raises: requests.exceptions.HTTPError
    :returns: requests Response object
    :rtype: requests.Response
    """
    # Compare codes as strings so that 404, "404", [404] and ["404"] all
    # match the integer status code of the response
    if ignore_status_codes is None:
        ignored_codes = set()
    elif isinstance(ignore_status_codes, (str, int)):
        ignored_codes = {str(ignore_status_codes)}
    else:
        ignored_codes = {str(code) for code in ignore_status_codes}

    # NOTE: Set timeout so requests don't hang
    # See https://requests.readthedocs.io/en/latest/user/advanced/#timeouts
    timeout = kwargs.get("timeout")
    if not timeout:
        # connect timeout, read timeout
        kwargs["timeout"] = (6.05, 120)
    # If timeout is a negative number, remove it so there is no timeout
    # limit. A (connect, read) tuple cannot be ordered against 0, so only
    # plain numbers are checked
    elif isinstance(timeout, (int, float)) and timeout < 0:
        kwargs.pop("timeout", None)
    else:
        logger.info(
            f"⌚️ Applying user timeout: {kwargs['timeout']} (connect, read)"
            " seconds to request"
        )

    # Use retries if the user specifies it
    use_retries = kwargs.pop("use_retries", False)
    session = kwargs.pop("session", None)
    if use_retries:
        requests_op = getattr(
            session or requests_retry_session(), method.lower()
        )
    else:
        requests_op = getattr(requests, method.lower())

    resp = None
    try:
        resp = requests_op(*args, **kwargs)
        resp.raise_for_status()
    except requests.exceptions.HTTPError as e:
        # The error may have been raised by the call itself, before resp
        # was bound; fall back to the response attached to the exception
        failed = resp if resp is not None else e.response
        if failed is not None and str(failed.status_code) in ignored_codes:
            return failed

        body = ""
        if failed is not None:
            # Best effort only: formatting the body must never mask the
            # HTTP error that is about to be re-raised
            try:
                body = pformat(failed.json())
            except Exception:
                body = failed.text

        # NOTE(review): kwargs are logged as-is and may contain headers or
        # credentials passed by the caller
        logged_kwargs = {**kwargs, "use_retries": use_retries}
        msg = (
            f"❌ Problem sending {method} request to server\n"
            f"{str(e)}\n"
            f"args: {args}\n"
            f"kwargs: {pformat(logged_kwargs)}\n"
            f"{body}\n"
        )
        logger.error(msg)
        raise

    return resp
def write_json(data, filepath, **kwargs):
    r"""
    Serialize Python data to a JSON file.

    :param data: your data
    :param filepath: where to write your JSON file
    :type filepath: str
    :param \**kwargs: keyword arguments to pass to json.dump. When not
        given, indent is 4 and sort_keys is False
    :returns: None
    """
    # Caller-supplied options take precedence over the defaults
    dump_options = {"indent": 4, "sort_keys": False}
    dump_options.update(kwargs)

    with open(filepath, "w") as handle:
        json.dump(data, handle, **dump_options)
- -""" - -import time -from urllib.parse import urlparse -import datetime - -import pandas - -LOCAL_HOSTS = { - "localhost", - "127.0.0.1", -} - - -def df_exists(df: pandas.DataFrame) -> bool: - """ - Check that DF is a pandas DataFrame and not empty - """ - return isinstance(df, pandas.DataFrame) and (not df.empty) - - -def timestamp() -> str: - """ - Helper to create an ISO 8601 formatted string that represents local time - and includes the timezone info. - """ - # Calculate the offset taking into account daylight saving time - # https://stackoverflow.com/questions/2150739/iso-time-iso-8601-in-python - if time.localtime().tm_isdst: - utc_offset_sec = time.altzone - else: - utc_offset_sec = time.timezone - utc_offset = datetime.timedelta(seconds=-utc_offset_sec) - t = ( - datetime.datetime.now() - .replace(tzinfo=datetime.timezone(offset=utc_offset)) - .isoformat() - ) - - return str(t) - - -def is_localhost(url: str) -> bool: - """ - Determine whether url is on localhost - """ - url = url.strip("/") - host = urlparse(url).netloc.split(":")[0] - return (host in LOCAL_HOSTS) or ( - any([url.startswith(h) for h in LOCAL_HOSTS]) - ) - - -def delete_safety_check(url: str, error_msg: str = None) -> None: - """ - Check if the url is on localhost and raise an exception if it is. - - This method is used in delete operations where you want to protect against - deletions on hosts other than localhost - """ - if is_localhost(url): - # If localhost, we are allowed delete - pass - else: - if not error_msg: - error_msg = ( - f"❌ Cannot delete from {url} because env variable" - " DELETE_SAFETY_CHECK=True. Resources that are not in" - f" {LOCAL_HOSTS} will not be deleted. 
To disable safety check," - " set DELETE_SAFETY_CHECK=False in your environment" - ) - raise ValueError(error_msg) - - -def kf_id_to_global_id(kf_id: str, replace_prefix: str = None) -> str: - """ - Convert Kids First ID to Dewrangle global ID - - Example - - KF_ID: SD_ME0WME0W -> Dewrnagle global id: sd-me0wme0w - """ - parts = str(kf_id).lower().split("_") - prefix = parts[0] - rest = parts[-1] - - if replace_prefix: - prefix = replace_prefix - - return "-".join([prefix, rest]) - - -def global_id_to_kf_id(global_id: str) -> str: - """ - Convert Dewrangle global ID format to Kids First ID format - - Example - - Dewrnagle global id: sd-me0wme0w -> KF_ID: SD_ME0WME0W - """ - return global_id.replace("-", "_").upper() - - -def elapsed_time_hms(start_time: float) -> str: - """ - Gets the time elapsed since `start_time` in hh:mm:ss string format. - - Args: - start_time (datetime.datetime): The starting time from which to calculate the elapsed time. - - Returns: - str: A time string formatted as hh:mm:ss. 
- """ - elapsed = time.time() - start_time - return time.strftime("%H:%M:%S", time.gmtime(elapsed)) diff --git a/pyproject.toml b/pyproject.toml index 9c6d996..58d250d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,30 +7,40 @@ authors = [ ] description = "A CLI tool with functions that interact with commonly used D3b APIs" readme = "README.md" -requires-python = ">=3.12" +requires-python = ">=3.9,<3.13" classifiers = [ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: Apache License", - "Operating System :: Linux", + "Programming Language :: Python :: 3", + "License :: OSI Approved :: Apache License", + "Operating System :: POSIX :: Linux", + "Operating System :: MacOS", ] dependencies = [ "click==8.1.7", - "python-dotenv==1.0.1", - "gql==3.5.0", - "gql[aiohttp]==3.5.0", - "requests==2.32.3", - "pandas==2.2.3", - "psycopg2-binary==2.9.10", - "sqlalchemy==2.0.36", + "python-dotenv==1.0.0", + "requests==2.31.0", + "Faker==19.3.1", + "pandas>=2.2.0,<3", + "numpy<2.0.0", + "psycopg2-binary==2.9.9", + "base32-crockford==0.3.0", + "gql==3.4.1", + "aiohttp>=3.9.5,<4", + "deepdiff==6.7.1", + "pyyaml==6.0.1", + "urllib3", + "SQLAlchemy==2.0.32", ] [project.optional-dependencies] dev = [ - "pytest==8.3.3", - "pytest-mock==3.14.0", + "autopep8", + "pylint", "black==24.10.0", + "pytest==7.4.2", + "requests-mock==1.11.0", + "deepdiff==6.7.1", + "boto3==1.35.25", "testcontainers[postgres]==4.9.0", - "requests-mock==1.12.1", ] [project.scripts] @@ -41,7 +51,7 @@ Homepage = "https://github.com/d3b-center/d3b-api-client-cli" Issues = "https://github.com/d3b-center/d3b-api-client-cli/issues" [build-system] -requires = ["setuptools >= 61.0"] +requires = ["setuptools>=61.0"] build-backend = "setuptools.build_meta" [tool.setuptools.packages.find] diff --git a/tests/conftest.py b/tests/conftest.py index 2f4542e..3089f8b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,195 +1,149 @@ -""" -Reusable fixtures and helpers for all tests -""" - import 
def get_study_id():
    """
    Gets a single study from Dataservice

    Used in various tests

    :raises IndexError: if the response contains no studies
    :returns: the "kf_id" of the first study in the response
    """
    base_url = config["dataservice"]["api_url"]
    url = f"{base_url}/studies?limit=1"
    resp = send_request("get", url)

    results = resp.json()["results"]
    if not results:
        # Same exception type as indexing an empty list, but with a message
        # that says what is missing and where
        raise IndexError(f"No studies found in Dataservice at {url}")

    return results[0]["kf_id"]
@pytest.fixture
def load_fhirservice(delete_fhir_data, fhir_json_data):
    """
    Load all test data in FHIR server

    This fixture is function-scoped because it depends on the
    function-scoped fhir_json_data fixture; pytest raises ScopeMismatch when
    a session-scoped fixture requests a narrower-scoped one.

    Returns:
        (study_id, resources) where resources maps the base name of every
        file in the FHIR directory to its parsed JSON content
    """
    # fhir_json_data returns three values; the list of entities to load is
    # not needed here
    study_id, fhir_json_dir, _entities_to_load = fhir_json_data

    runner = CliRunner()
    result = runner.invoke(load_fhir, [fhir_json_dir])
    assert result.exit_code == 0

    resources = {
        os.path.splitext(fn)[0]: read_json(os.path.join(fhir_json_dir, fn))
        for fn in os.listdir(fhir_json_dir)
    }

    return study_id, resources
@pytest.fixture(scope="session")
def dewrangle_study(dewrangle_org):
    """
    Session-wide Dewrangle Study shared by other tests

    Upserts the study described in tests/data/test-study.json into the
    organization provided by the dewrangle_org fixture and returns the
    command's return value with an extra "organization_name" key.
    """
    study_file = os.path.join(ROOT_DIR, "tests/data/test-study.json")
    cli_args = [study_file, dewrangle_org["id"]]

    outcome = CliRunner().invoke(upsert_study, cli_args, standalone_mode=False)
    assert outcome.exit_code == 0

    upserted = outcome.return_value
    upserted["organization_name"] = dewrangle_org["name"]
    return upserted
runner = CliRunner() + fp = os.path.join(ROOT_DIR, "tests/data/kidsfirst-qa-upgrade-server.json") + server = read_json(fp) + runner = CliRunner() result = runner.invoke( - upsert_credential, + upsert_fhir_server, [ - "--name", - "e2e", - "--key", - AWS_ACCESS_KEY_ID, - "--secret", - AWS_SECRET_ACCESS_KEY, - "--study-id", - study["id"], + fp, + dewrangle_org["id"], + "--oidc-client-secret", + KF_FHIR_QA_OIDC_CLIENT_SECRET, ], standalone_mode=False, ) assert result.exit_code == 0 - credential = result.return_value - credential["study_global_id"] = study["globalId"] - credential["study_id"] = study["id"] + server["id"] = result.return_value["id"] - return credential + return server -@pytest.fixture(scope="session") -def make_dewrangle_volume(dewrangle_credential): - """ - Return a function taht creates a dewrangle volume - """ - - def _make_volume(jira_ticket_number): - """ - Create a dewrangle volume - """ - study_id = dewrangle_credential["study_id"] - runner = CliRunner() - bucket = AWS_BUCKET_DATA_TRANSFER_TEST - path_prefix = jira_ticket_number - - # Create - result = runner.invoke( - upsert_volume, - [ - "--bucket", - bucket, - "--path-prefix", - path_prefix, - "--study-id", - study_id, - "--credential-key", - dewrangle_credential["key"], - ], - standalone_mode=False, - ) - assert result.exit_code == 0 - - volume = result.return_value - volume["study_id"] = study_id - volume["study_global_id"] = dewrangle_credential["study_global_id"] - - return volume - - return _make_volume - - -# Postgres DB Fixtures @pytest.fixture(scope="module") def postgres_db(request): """ diff --git a/tests/data/fhir_minimal/encounter.json b/tests/data/fhir_minimal/encounter.json new file mode 100644 index 0000000..bde2867 --- /dev/null +++ b/tests/data/fhir_minimal/encounter.json @@ -0,0 +1,16 @@ +{ + "resourceType": "Encounter", + "id": "encounter-1", + "status": "finished", + "class": { + "system": "http://terminology.hl7.org/CodeSystem/v3-ActCode", + "code": "AMB" + }, + "subject": 
{ + "reference": "Patient/patient-1" + }, + "period": { + "start": "2024-01-01T10:00:00Z", + "end": "2024-01-01T11:00:00Z" + } +} diff --git a/tests/data/fhir_minimal/observation.json b/tests/data/fhir_minimal/observation.json new file mode 100644 index 0000000..9f458b3 --- /dev/null +++ b/tests/data/fhir_minimal/observation.json @@ -0,0 +1,31 @@ +{ + "resourceType": "Observation", + "id": "observation-1", + "status": "final", + "category": [ + { + "coding": [ + { + "system": "http://terminology.hl7.org/CodeSystem/observation-category", + "code": "vital-signs" + } + ] + } + ], + "code": { + "coding": [ + { + "system": "http://loinc.org", + "code": "8867-4", + "display": "Heart rate" + } + ] + }, + "subject": { + "reference": "Patient/patient-1" + }, + "valueQuantity": { + "value": 72, + "unit": "beats/minute" + } +} diff --git a/tests/data/fhir_minimal/patient.json b/tests/data/fhir_minimal/patient.json new file mode 100644 index 0000000..ab708da --- /dev/null +++ b/tests/data/fhir_minimal/patient.json @@ -0,0 +1,18 @@ +{ + "resourceType": "Patient", + "id": "patient-1", + "identifier": [ + { + "system": "http://example.org/mrn", + "value": "12345" + } + ], + "name": [ + { + "family": "Doe", + "given": ["John"] + } + ], + "gender": "male", + "birthDate": "1990-01-01" +} diff --git a/tests/data/indexd.json b/tests/data/indexd.json new file mode 100644 index 0000000..aee6fe4 --- /dev/null +++ b/tests/data/indexd.json @@ -0,0 +1,52 @@ +[ + { + "acl": ["drc_test", "SD_11111111"], + "authz": ["/programs/drc_test", "/programs/SD_11111111"], + "baseid": "dde67421-333c-416b-8b6a-5ed9cf4645f0", + "created_date": "2023-10-11T20:01:54.359835", + "did": "6903df49-96a9-41da-ae46-395a7dbb1166", + "file_name": "genomic-file-0.cram", + "form": "object", + "hashes": { + "md5": "b4a3728a635d2ca72834f7e119235f38", + "sha256": "dcb8eb82b12c4c4ddc111ac11193b3fb4106e12c12f1ed2e7aec3eeea74a7869" + }, + "metadata": {}, + "rev": "eaa7205e", + "size": 874, + "updated_date": 
"2023-10-16T14:38:03.574371", + "uploader": null, + "urls": [ + "s3://kf-study-us-east-1-prd-sd-me0wme0w/source/genomic-files/genomic-file-0.cram" + ], + "urls_metadata": { + "s3://kf-study-us-east-1-prd-sd-me0wme0w/source/genomic-files/genomic-file-0.cram": {} + }, + "version": null + }, + { + "acl": ["drc_test", "SD_11111111"], + "authz": ["/programs/drc_test", "/programs/SD_11111111"], + "baseid": "dde67421-333c-416b-8b6a-5ed9cf4645f0", + "created_date": "2023-10-11T20:01:54.359835", + "did": "6903df49-96a9-41da-ae46-395a7dbb1166", + "file_name": "genomic-file-0.cram.crai", + "form": "object", + "hashes": { + "md5": "b4a3728a635d2ca72834f7e119235f38", + "sha256": "dcb8eb82b12c4c4ddc111ac11193b3fb4106e12c12f1ed2e7aec3eeea74a7869" + }, + "metadata": {}, + "rev": "eaa7205e", + "size": 874, + "updated_date": "2023-10-16T14:38:03.574371", + "uploader": null, + "urls": [ + "s3://kf-study-us-east-1-prd-sd-me0wme0w/source/genomic-files/genomic-file-0.cram.crai" + ], + "urls_metadata": { + "s3://kf-study-us-east-1-prd-sd-me0wme0w/source/genomic-files/genomic-file-0.cram.crai": {} + }, + "version": null + } +] diff --git a/tests/data/test-org.json b/tests/data/test-org.json new file mode 100644 index 0000000..f2b87da --- /dev/null +++ b/tests/data/test-org.json @@ -0,0 +1,5 @@ +{ + "name": "Integration Test Org", + "description": "Test org for integration testing", + "visibility": "PRIVATE" +} diff --git a/tests/data/test-study.json b/tests/data/test-study.json new file mode 100644 index 0000000..380b8b4 --- /dev/null +++ b/tests/data/test-study.json @@ -0,0 +1,4 @@ +{ + "name": "Integration Test Study", + "globalId": "sd-11111111" +} diff --git a/tests/integration/db/test_postgres_save.py b/tests/integration/db/test_postgres_save.py index b740f6f..ef78e8b 100644 --- a/tests/integration/db/test_postgres_save.py +++ b/tests/integration/db/test_postgres_save.py @@ -4,9 +4,10 @@ import os import pytest - +from sqlalchemy import create_engine import pandas as pd from psycopg2 
import connect, sql +from urllib.parse import quote_plus from d3b_api_client_cli.db.postgres.save import save_df_to_db from d3b_api_client_cli.db.postgres.admin import create_db_schema @@ -55,6 +56,9 @@ def test_df_to_db(postgres_db, test_dataframe): host=db_host, port=db_port, ) + engine = create_engine( + f"postgresql+psycopg2://{db_user}:{quote_plus(db_password)}@{db_host}:{db_port}/{db_name}" + ) create_db_schema(conn, TEST_SCHEMA) @@ -69,5 +73,5 @@ def test_df_to_db(postgres_db, test_dataframe): query = sql.SQL( "SELECT * FROM {0}.test_table;", ).format(sql.Identifier(TEST_SCHEMA)) - df_db = pd.read_sql_query(query.as_string(conn), conn) + df_db = pd.read_sql_query(query.as_string(conn), engine) assert df_db.shape[0] == 3 diff --git a/tests/integration/dewrangle/test_crud_billing_groups.py b/tests/integration/dewrangle/test_crud_billing_groups.py deleted file mode 100644 index 65738bf..0000000 --- a/tests/integration/dewrangle/test_crud_billing_groups.py +++ /dev/null @@ -1,96 +0,0 @@ -""" -Test CRUD Dewrangle billing_group functions/cmds -""" - -import os - -import pytest -from click.testing import CliRunner - -from d3b_api_client_cli.config import config -from d3b_api_client_cli.cli.dewrangle import ( - upsert_organization, - create_billing_group, - read_billing_groups, - delete_billing_group, -) -from d3b_api_client_cli.dewrangle.graphql import organization - -CAVATICA_BILLING_GROUP_ID = config["dewrangle"]["billing_group_id"] - - -@pytest.fixture(scope="session") -def test_org(organization_file): - """ - Upsert an Organization in Dewrangle for other tests to use - """ - fp = organization_file(org_name="Billing Group Tests") - runner = CliRunner() - result = runner.invoke(upsert_organization, [fp], standalone_mode=False) - assert result.exit_code == 0 - - yield result.return_value, fp - - organization.delete_organization( - result.return_value["id"], delete_safety_check=False - ) - - -def test_crud_billing_group(tmp_path, test_org): - """ - Test 
`d3b-clients dewrangle create-billing-group` command - Test `d3b-clients dewrangle read-billing-groups` command - Test `d3b-clients dewrangle delete-billing-group` command - """ - org, _ = test_org - - # Create - runner = CliRunner() - result = runner.invoke( - create_billing_group, - [ - "--organization-id", - org["id"], - "--cavatica-billing-group-id", - CAVATICA_BILLING_GROUP_ID, - ], - standalone_mode=False, - ) - assert result.exit_code == 0 - billing_group_id = result.return_value["id"] - assert billing_group_id - - # Can't create same billing group in org - result = runner.invoke( - create_billing_group, - [ - "--organization-id", - org["id"], - "--cavatica-billing-group-id", - CAVATICA_BILLING_GROUP_ID, - ], - standalone_mode=False, - ) - assert result.return_value is None - - # Read - temp_dir = tmp_path / "output" - temp_dir.mkdir() - - runner = CliRunner() - result = runner.invoke( - read_billing_groups, ["--output-dir", temp_dir], standalone_mode=False - ) - assert result.exit_code == 0 - assert len(result.return_value) > 0 - assert os.path.exists(os.path.join(temp_dir, "BillingGroup.json")) - - # Delete - runner = CliRunner() - result = runner.invoke( - delete_billing_group, - [billing_group_id, "--disable-delete-safety-check"], - standalone_mode=False, - ) - assert result.exit_code == 0 - assert result.return_value["id"] diff --git a/tests/integration/dewrangle/test_crud_credentials.py b/tests/integration/dewrangle/test_crud_credentials.py deleted file mode 100644 index 5bfa1ed..0000000 --- a/tests/integration/dewrangle/test_crud_credentials.py +++ /dev/null @@ -1,129 +0,0 @@ -""" -Test CRUD Dewrangle credential functions/cmds -""" - -import os -from pprint import pprint - -from click.testing import CliRunner - -from d3b_api_client_cli.cli.dewrangle import * -from d3b_api_client_cli.dewrangle.graphql import credential - -from d3b_api_client_cli.config import config - -AWS_ACCESS_KEY_ID = config["aws"]["s3"]["aws_access_key_id"] -AWS_SECRET_ACCESS_KEY 
= config["aws"]["s3"]["aws_secret_access_key"] - - -def test_upsert_credential_bad_input(): - """ - Test `d3b-clients dewrangle upsert-credential` command - """ - runner = CliRunner() - - # Create - result = runner.invoke( - upsert_credential, - [ - "--name", - "e2e", - "--key", - AWS_ACCESS_KEY_ID, - "--secret", - AWS_SECRET_ACCESS_KEY, - ], - standalone_mode=False, - ) - assert result.exit_code == 1 - assert "the graphql node ID or global ID" in str(result.exc_info) - - -def test_crud_credential(tmp_path, dewrangle_study): - """ - Test `d3b-clients dewrangle upsert-credential` command - Test `d3b-clients dewrangle delete-credential` command - Test `d3b-clients dewrangle read-credentials` command - """ - study, _ = dewrangle_study - runner = CliRunner() - - # Create - result = runner.invoke( - upsert_credential, - [ - "--name", - "e2e", - "--key", - AWS_ACCESS_KEY_ID, - "--secret", - AWS_SECRET_ACCESS_KEY, - "--study-id", - study["id"], - ], - standalone_mode=False, - ) - assert result.exit_code == 0 - assert result.return_value["id"] - assert result.return_value["name"] == "e2e" - - # Read - temp_dir = tmp_path / "output" - temp_dir.mkdir() - - runner = CliRunner() - result = runner.invoke( - read_credentials, - [ - "--output-dir", - temp_dir, - "--study-global-id", - study["globalId"], - ], - standalone_mode=False, - ) - assert result.exit_code == 0 - assert len(result.return_value) > 0 - assert os.path.exists(os.path.join(temp_dir, "Credential.json")) - - # Update - result = runner.invoke( - upsert_credential, - [ - "--name", - "foobar", - "--key", - AWS_ACCESS_KEY_ID, - "--secret", - AWS_SECRET_ACCESS_KEY, - "--study-global-id", - study["globalId"], - ], - standalone_mode=False, - ) - assert result.exit_code == 0 - assert result.return_value["id"] - assert result.return_value["name"] == "foobar" - - # Delete - result = runner.invoke( - delete_credential, - [ - "--node-id", - result.return_value["id"], - "--disable-delete-safety-check", - ], - 
def test_crud_fhir_server(tmp_path, fhir_server_file, dewrangle_org):
    """
    Exercise the Dewrangle FHIR server commands in sequence:

    - `dwds dewrangle upsert-fhir-server`
    - `dwds dewrangle read-fhir-servers`
    - `dwds dewrangle delete-fhir-server`
    """
    org_id = dewrangle_org["id"]

    # Upsert
    upserted = CliRunner().invoke(
        upsert_fhir_server,
        [fhir_server_file, org_id],
        standalone_mode=False,
    )
    assert upserted.exit_code == 0
    node_id = upserted.return_value["id"]
    assert node_id

    # Read
    output_dir = tmp_path / "output"
    output_dir.mkdir()
    fetched = CliRunner().invoke(
        read_fhir_servers,
        [org_id, "--output-dir", output_dir],
        standalone_mode=False,
    )
    assert fetched.exit_code == 0
    assert len(fetched.return_value) > 0
    assert os.path.exists(os.path.join(output_dir, "FhirServer.json"))

    # Delete
    deleted = CliRunner().invoke(
        delete_fhir_server, [node_id], standalone_mode=False
    )
    assert deleted.exit_code == 0
    assert deleted.return_value["id"]

    # The server created above must no longer be listed for the organization
    remaining = fhir_server.read_fhir_servers(org_id)
    if remaining:
        remaining_names = [server["name"] for server in remaining]
        assert "Test FHIR Server" not in remaining_names
test_upsert_organization(tmp_path, organization_file): + +def test_upsert_organization(organization_file): """ - Test `d3b-clients dewrangle upsert-organization` command + Test `dwds dewrangle upsert-organization` command """ - # Create - fp = organization_file(org_name=TEST_ORG_NAME) - organization = read_json(fp) + fp = organization_file runner = CliRunner() result = runner.invoke(upsert_organization, [fp], standalone_mode=False) assert result.exit_code == 0 assert result.return_value["id"] - # Update - organization["name"] = "foobar" - write_json(organization, fp) - result = runner.invoke(upsert_organization, [fp], standalone_mode=False) - assert result.exit_code == 0 - assert result.return_value["name"] == "foobar" - - # Delete it - result = runner.invoke( - delete_organization, - [ - "--dewrangle-org-id", - result.return_value["id"], - "--disable-delete-safety-check", - ], - standalone_mode=False, - ) - def test_read_organization(tmp_path): """ - Test `d3b-clients dewrangle read-organizations` command + Test `dwds dewrangle read-organizations` command """ temp_dir = tmp_path / "output" temp_dir.mkdir() @@ -62,38 +53,9 @@ def test_read_organization(tmp_path): assert os.path.exists(os.path.join(temp_dir, "Organization.json")) -def test_delete_organization_safety_check_on(): - """ - Test `d3b-clients dewrangle delete-organization` command - with safety check enabled for delete - """ - orgs = organization.read_organizations() - if not orgs: - return - - runner = CliRunner() - result = runner.invoke( - delete_organization, - ["--dewrangle-org-name", TEST_ORG_NAME], - standalone_mode=False, - ) - assert result.exit_code == 1 - assert "DELETE_SAFETY_CHECK" in str(result.exc_info) - - orgs = organization.read_organizations() - found_org = None - if orgs: - for org in orgs: - if org["name"] == TEST_ORG_NAME: - found_org = org - break - assert found_org - - -def test_delete_organization_safety_check_off(): +def test_delete_organization(): """ - Test `d3b-clients 
dewrangle delete-organization` command - with safety check disabled for delete + Test `dwds dewrangle delete-organization` command """ orgs = organization.read_organizations() if not orgs: @@ -101,7 +63,7 @@ def test_delete_organization_safety_check_off(): dwid = None for org in orgs: - if org["name"] == TEST_ORG_NAME: + if org["name"] == "TestOrg": dwid = org["id"] break @@ -111,13 +73,11 @@ def test_delete_organization_safety_check_off(): runner = CliRunner() result = runner.invoke( - delete_organization, - ["--dewrangle-org-id", dwid, "--disable-delete-safety-check"], - standalone_mode=False, + delete_organization, ["--dewrangle-org-id", dwid], standalone_mode=False ) assert result.exit_code == 0 assert result.return_value["id"] orgs = organization.read_organizations() if orgs: - assert all([org["name"] != TEST_ORG_NAME for org in orgs]) + assert all([org["name"] != "TestOrg" for org in orgs]) diff --git a/tests/integration/dewrangle/test_crud_studies.py b/tests/integration/dewrangle/test_crud_studies.py deleted file mode 100644 index 361f271..0000000 --- a/tests/integration/dewrangle/test_crud_studies.py +++ /dev/null @@ -1,51 +0,0 @@ -""" -Test CRUD Dewrangle study functions/cmds -""" - -import os -from pprint import pprint - -from click.testing import CliRunner - -from d3b_api_client_cli.cli.dewrangle import * -from d3b_api_client_cli.dewrangle.graphql import study - - -def test_crud_study(tmp_path, dewrangle_study): - """ - Test `d3b-clients dewrangle upsert-study` command - Test `d3b-clients dewrangle read-studies` command - Test `d3b-clients dewrangle delete-study` command - """ - study_obj, fp = dewrangle_study - - # Update - runner = CliRunner() - result = runner.invoke( - upsert_study, [fp, study_obj["organization_id"]], standalone_mode=False - ) - study_id = result.return_value["id"] - assert result.exit_code == 0 - assert study_id - - # Read - temp_dir = tmp_path / "output" - temp_dir.mkdir() - - runner = CliRunner() - result = runner.invoke( - 
read_studies, ["--output-dir", temp_dir], standalone_mode=False - ) - assert result.exit_code == 0 - assert len(result.return_value) > 0 - assert os.path.exists(os.path.join(temp_dir, "Study.json")) - - # Delete - runner = CliRunner() - result = runner.invoke( - delete_study, - [study_id, "--disable-delete-safety-check"], - standalone_mode=False, - ) - assert result.exit_code == 0 - assert result.return_value["id"] diff --git a/tests/integration/dewrangle/test_crud_study.py b/tests/integration/dewrangle/test_crud_study.py new file mode 100644 index 0000000..7187d95 --- /dev/null +++ b/tests/integration/dewrangle/test_crud_study.py @@ -0,0 +1,67 @@ +import os + +import pytest +from click.testing import CliRunner + +from d3b_api_client_cli.utils import read_json, kf_id_to_global_id +from d3b_api_client_cli.config import ( + config, + ROOT_DIR, + IdTypes, + TEST_STUDY_ID, +) +from d3b_api_client_cli.cli import * +from d3b_api_client_cli.dewrangle.graphql import study + + +def test_upsert_study(dewrangle_org): + """ + Test `dwds dewrangle upsert-study` command + """ + study_id = TEST_STUDY_ID + + fp = os.path.join(ROOT_DIR, "tests/data/test-study.json") + runner = CliRunner() + result = runner.invoke( + upsert_study, [fp, dewrangle_org["id"]], standalone_mode=False + ) + assert result.exit_code == 0 + assert result.return_value["id"] + + +def test_read_studies(tmp_path): + """ + Test `dwds dewrangle read-studies` command + """ + temp_dir = tmp_path / "output" + temp_dir.mkdir() + + runner = CliRunner() + result = runner.invoke( + read_studies, ["--output-dir", temp_dir], standalone_mode=False + ) + assert result.exit_code == 0 + assert len(result.return_value) > 0 + assert os.path.exists(os.path.join(temp_dir, "Study.json")) + + +def test_delete_study(): + """ + Test `dwds dewrangle delete-study` command + """ + runner = CliRunner() + result = runner.invoke( + delete_study, + [TEST_STUDY_ID, "--id-type", IdTypes.KIDS_FIRST.value], + standalone_mode=False, + ) + assert 
result.exit_code == 0 + assert result.return_value + + studies = study.read_studies() + assert all( + [ + study["globalId"] != kf_id_to_global_id(TEST_STUDY_ID) + for study in studies.values() + ] + ) diff --git a/tests/integration/dewrangle/test_crud_volumes.py b/tests/integration/dewrangle/test_crud_volumes.py deleted file mode 100644 index 345325a..0000000 --- a/tests/integration/dewrangle/test_crud_volumes.py +++ /dev/null @@ -1,126 +0,0 @@ -""" -Test CRUD Dewrangle volume functions/cmds -""" - -import os -from pprint import pprint - -from click.testing import CliRunner - -from d3b_api_client_cli.cli.dewrangle import * -from d3b_api_client_cli.dewrangle.graphql import volume - -from d3b_api_client_cli.config import config - -AWS_ACCESS_KEY_ID = config["aws"]["s3"]["aws_access_key_id"] -AWS_SECRET_ACCESS_KEY = config["aws"]["s3"]["aws_secret_access_key"] -AWS_BUCKET_DATA_TRANSFER_TEST = config["aws"]["s3"]["test_bucket_name"] - - -def test_upsert_volume_bad_input(): - """ - Test `d3b-clients dewrangle upsert-volume` command - """ - runner = CliRunner() - - # Create - result = runner.invoke( - upsert_volume, - ["--bucket", "e2e", "--credential-key", "foo"], - standalone_mode=False, - ) - assert result.exit_code == 1 - assert "the graphql node ID or global ID" in str(result.exc_info) - - -def test_crud_volume(tmp_path, dewrangle_credential): - """ - Test `d3b-clients dewrangle upsert-volume` command - Test `d3b-clients dewrangle delete-volume` command - Test `d3b-clients dewrangle read-volumes` command - """ - study_id = dewrangle_credential["study_id"] - runner = CliRunner() - bucket = AWS_BUCKET_DATA_TRANSFER_TEST - path_prefix = "AD-4000" - - # Create - result = runner.invoke( - upsert_volume, - [ - "--bucket", - bucket, - "--path-prefix", - path_prefix, - "--study-id", - study_id, - "--credential-key", - dewrangle_credential["key"], - ], - standalone_mode=False, - ) - assert result.exit_code == 0 - assert result.return_value["id"] - assert 
result.return_value["name"] == bucket - assert result.return_value["pathPrefix"] == path_prefix - - # Read - temp_dir = tmp_path / "output" - temp_dir.mkdir() - - runner = CliRunner() - result = runner.invoke( - read_volumes, - [ - "--output-dir", - temp_dir, - "--study-global-id", - dewrangle_credential["study_global_id"], - ], - standalone_mode=False, - ) - assert result.exit_code == 0 - assert len(result.return_value) > 0 - assert os.path.exists(os.path.join(temp_dir, "Volume.json")) - - # Update - result = runner.invoke( - upsert_volume, - [ - "--bucket", - bucket, - "--path-prefix", - path_prefix, - "--credential-key", - dewrangle_credential["key"], - "--study-id", - study_id, - ], - standalone_mode=False, - ) - assert result.exit_code == 0 - assert result.return_value["id"] - assert "Update" in result.stdout - - # Delete - result = runner.invoke( - delete_volume, - [ - "--node-id", - result.return_value["id"], - "--disable-delete-safety-check", - ], - standalone_mode=False, - ) - assert result.exit_code == 0 - assert result.return_value["id"] - - -def test_delete_volume_bad_input(): - """ - Test `d3b-clients dewrangle delete-volume` command - """ - runner = CliRunner() - result = runner.invoke(delete_volume, [], standalone_mode=False) - assert result.exit_code == 1 - assert "must provide" in str(result.exc_info) diff --git a/tests/integration/dewrangle/test_global_ids.py b/tests/integration/dewrangle/test_global_ids.py index 22b73ab..2d5a0c3 100644 --- a/tests/integration/dewrangle/test_global_ids.py +++ b/tests/integration/dewrangle/test_global_ids.py @@ -1,185 +1,67 @@ -""" -Test Dewrangle global ID commands -""" - import os +from pprint import pprint +import pandas import pytest from click.testing import CliRunner -import pandas -from d3b_api_client_cli.cli.dewrangle.global_id_commands import ( - upsert_global_descriptors, - download_global_descriptors, - upsert_and_download_global_descriptors, - upsert_and_download_global_descriptor, -) -from 
d3b_api_client_cli.dewrangle.global_id import ( - upsert_global_descriptors as _upsert_global_descriptors, -) -from d3b_api_client_cli.faker.global_id import ( - generate_global_id_file, -) +from d3b_api_client_cli import utils +from d3b_api_client_cli.config import config, ROOT_DIR +from d3b_api_client_cli.cli import * @pytest.fixture(scope="session") -def upserted_global_descriptors(dewrangle_study): +def global_id_request(tmp_path_factory, dewrangle_study): """ - Upsert global descriptors + Fixture that requests global IDs from Dewrangle """ - study, fp = dewrangle_study - output_dir = os.path.dirname(fp) - - filepath = generate_global_id_file(output_dir=output_dir) - - runner = CliRunner() - result = runner.invoke( - upsert_global_descriptors, - [filepath, "--study-id", study["id"]], - standalone_mode=False, - ) - assert result.exit_code == 0 - assert result.return_value - - return result.return_value, filepath + global_id = dewrangle_study["globalId"] + kf_id = utils.global_id_to_kf_id(global_id) + output_dir = str(tmp_path_factory.mktemp("output")) + input_filepath = os.path.join(output_dir, "global_id_request.csv") - -@pytest.fixture(scope="session") -def downloaded_global_descriptors(upserted_global_descriptors): - """ - Download newly created global descriptors - """ - result, filepath = upserted_global_descriptors - output_dir = os.path.dirname(filepath) - study_id = result["study_id"] - job_id = result["job"]["id"] - - runner = CliRunner() - - result = runner.invoke( - download_global_descriptors, + df = pandas.DataFrame( [ - "--study-id", - study_id, - "--job-id", - job_id, - "--output-dir", - output_dir, - ], - standalone_mode=False, + {"descriptor": f"P{i}", "fhirResourceType": "Patient"} + for i in range(5) + ] ) - assert result.exit_code == 0 - filepath = result.return_value - - return study_id, filepath - - -def test_upsert_global_descriptors(upserted_global_descriptors): - """ - Test d3b-clients dewrangle upsert-global-descriptors - """ - 
upserted_global_descriptors - - -def test_download_global_descriptors(downloaded_global_descriptors): - """ - Test d3b-clients dewrangle download-global-descriptors - """ - _, filepath = downloaded_global_descriptors - df = pandas.read_csv(filepath) - assert df.shape[0] == 10 - - -def test_upsert_and_download_global_descriptors(downloaded_global_descriptors): - """ - Test d3b-clients dewrangle upsert-and-download-global-descriptors - """ - study_id, filepath = downloaded_global_descriptors - output_dir = os.path.dirname(filepath) - - # Update the descriptors - df = pandas.read_csv(filepath) - df = df[[c for c in ("fhirResourceType", "descriptor", "globalId")]] - df["descriptor"] = df["descriptor"].apply(lambda d: d + "1") - df.to_csv(filepath, index=False) + df.to_csv(input_filepath, index=False) runner = CliRunner() - - # Upsert and download the descriptors result = runner.invoke( - upsert_and_download_global_descriptors, - [filepath, "--study-id", study_id, "--output-dir", output_dir], + upsert_global_ids, + [kf_id, input_filepath], standalone_mode=False, ) assert result.exit_code == 0 - filepath = result.return_value - df = pandas.read_csv(filepath) - assert df.shape[0] == 10 + return kf_id, df -def test_download_all_descriptors(dewrangle_study): +def test_upsert_global_ids(global_id_request): """ - Test d3b-clients dewrangle download-global-descriptors for all ids + Test `dwds dewrangle request-global-ids` command """ - study, filepath = dewrangle_study - output_dir = os.path.dirname(filepath) - - runner = CliRunner() - result = runner.invoke( - download_global_descriptors, - [ - "--study-id", - study["id"], - "--download-all", - "--output-dir", - output_dir, - ], - standalone_mode=False, - ) - assert result.exit_code == 0 - filepath = result.return_value + pass - df = pandas.read_csv(filepath) - # Should have double the descriptors plus one for the study - assert df.shape[0] == 21 - - -def test_one_upsert_and_download_global_descriptor( - 
downloaded_global_descriptors, -): +def test_download_global_ids(tmp_path_factory, global_id_request): """ - Test d3b-clients dewrangle upsert-and-download-global-descriptor + Test `dwds dewrangle request-global-ids` command """ - study_id, filepath = downloaded_global_descriptors - output_dir = os.path.dirname(filepath) - - # Get an existing global ID - df = pandas.read_csv(filepath) - row = df.to_dict(orient="records")[0] + kf_id, df_ids = global_id_request + output_dir = str(tmp_path_factory.mktemp("output")) + filepath = os.path.join(output_dir, "downloaded_global_ids.csv") runner = CliRunner() - - # Upsert and download the descriptors result = runner.invoke( - upsert_and_download_global_descriptor, - [ - "--descriptor", - "foo", - "--fhir-resource-type", - row["fhirResourceType"], - "--global-id", - row["globalId"], - "--study-id", - study_id, - "--output-dir", - output_dir, - ], + download_global_ids, + [kf_id, filepath], standalone_mode=False, ) assert result.exit_code == 0 - filepath = result.return_value df = pandas.read_csv(filepath) - assert df.shape[0] == 1 + assert utils.df_exists(df) + assert df[df["fhirResourceType"] == "Patient"].shape[0] == df_ids.shape[0] diff --git a/tests/integration/dewrangle/test_ingest.py b/tests/integration/dewrangle/test_ingest.py new file mode 100644 index 0000000..31606b5 --- /dev/null +++ b/tests/integration/dewrangle/test_ingest.py @@ -0,0 +1,45 @@ +import os + +from d3b_api_client_cli.dewrangle import ingest + + +def _assert_ok(result): + assert ( + result is not None + ), "ingest returned None (study lookup/setup failed)" + assert result.get("status") is True, result + + +def test_ingest_study_file(fhir_json_data): + """ + Ingest a single FHIR file into an existing Dewrangle study. 
+ """ + dewrangle_study_node_id, fhir_json_dir, entities_to_load = fhir_json_data + fp = os.path.join(fhir_json_dir, "patient.json") + + job = ingest.ingest_study_files( + fp, + dewrangle_study_node_id=dewrangle_study_node_id, + entities_to_load=entities_to_load, + ) + + _assert_ok(job) + for resource_result in job["job"]["result"]["resources"]: + assert resource_result["count"] >= 0 + + +def test_ingest_study_files(fhir_json_data): + """ + Ingest a directory of FHIR files into an existing Dewrangle study. + """ + dewrangle_study_node_id, fhir_json_dir, entities_to_load = fhir_json_data + + job = ingest.ingest_study_files( + fhir_json_dir, + dewrangle_study_node_id=dewrangle_study_node_id, + entities_to_load=entities_to_load, + ) + + _assert_ok(job) + for resource_result in job["job"]["result"]["resources"]: + assert resource_result["count"] >= 0 diff --git a/tests/unit/dewrangle/test_download.py b/tests/unit/dewrangle/test_download.py deleted file mode 100644 index a0e26d5..0000000 --- a/tests/unit/dewrangle/test_download.py +++ /dev/null @@ -1,110 +0,0 @@ -""" -Test downloading volume hash files (job errors) from Dewrangle -""" - -import os -import requests_mock -import pytest - -from d3b_api_client_cli.dewrangle.rest import files -from d3b_api_client_cli.config import config - -DEWRANGLE_BASE_URL = config["dewrangle"]["base_url"] - - -def test_filename_from_headers(): - """ - Test helper function to extract filename from http response headers - """ - headers = {"Content-Disposition": "attachment; filename=file.csv"} - filename = files._filename_from_headers(headers) - assert filename == "file.csv" - - -def test_download_file_original_filename(tmp_path): - """ - Test download file from Dewrangle with original filename - """ - output_dir = tmp_path / "output" - output_dir.mkdir() - - url = "https://dewrangle.com/files" - expected_filename = "volume_hash_file.csv" - headers = { - "Content-Disposition": f"attachment; filename={expected_filename}" - } - with 
requests_mock.Mocker() as m: - # Setup mock - m.get(url, content=b"foo", headers=headers) - filepath = files.download_file(url, output_dir=output_dir) - - _, filename = os.path.split(filepath) - assert filename == expected_filename - assert os.path.isfile(os.path.join(output_dir, expected_filename)) - - -def test_download_file_to_filepath(tmp_path): - """ - Test download file from Dewrangle to provided filepath - """ - output_dir = tmp_path / "output" - output_dir.mkdir() - - expected_filename = "foo.csv" - expected_filepath = os.path.join(output_dir, expected_filename) - - url = "https://dewrangle.com/files" - headers = {"Content-Disposition": "attachment; filename=dewrangle.csv"} - with requests_mock.Mocker() as m: - # Setup mock - m.get(url, content=b"foo", headers=headers) - filepath = files.download_file(url, filepath=expected_filepath) - - _, filename = os.path.split(filepath) - assert filename == expected_filename - assert os.path.isfile(expected_filepath) - - -def test_download_job_errors(mocker): - """ - Test download Dewrangle job errors - """ - mock_download_file = mocker.patch( - "d3b_api_client_cli.dewrangle.rest.files.download_file" - ) - - files.download_job_errors("job-id", output_dir="output") - - endpoint_template = config["dewrangle"]["endpoints"]["rest"]["job_errors"] - endpoint = endpoint_template.format(job_id="job-id") - url = f"{DEWRANGLE_BASE_URL.rstrip('/')}/{endpoint.lstrip('/')}" - - mock_download_file.assert_called_with( - url, output_dir="output", filepath=None - ) - - -@pytest.mark.parametrize("download_method", [files.download_job_errors]) -@pytest.mark.parametrize( - "token,url, expected_msg", - [ - (None, None, "Missing required configuration"), - ("foo", None, "Missing required configuration"), - (None, "foo", "Missing required configuration"), - ], -) -def test_missing_configuration( - mocker, download_method, token, url, expected_msg -): - """ - Test download files with missing configuration - """ - 
mocker.patch("d3b_api_client_cli.config.DEWRANGLE_DEV_PAT", token) - mocker.patch("d3b_api_client_cli.config.DEWRANGLE_BASE_URL", url) - - if expected_msg: - with pytest.raises(ValueError) as e: - download_method("1234") - assert expected_msg in str(e) - else: - assert download_method("1234") diff --git a/tests/unit/dewrangle/test_global_ids.py b/tests/unit/dewrangle/test_global_ids.py deleted file mode 100644 index e061f03..0000000 --- a/tests/unit/dewrangle/test_global_ids.py +++ /dev/null @@ -1,76 +0,0 @@ -""" -Unit test global ID command -""" - -import pytest -from click.testing import CliRunner - -from d3b_api_client_cli.cli.dewrangle.global_id_commands import ( - upsert_global_descriptors, -) -from d3b_api_client_cli.dewrangle.global_id import ( - upsert_global_descriptors as _upsert_global_descriptors, - download_global_descriptors as _download_global_descriptors, -) - - -def test_upsert_global_descriptors_cli_errors(): - """ - Test d3b-clients dewrangle upser-global-descriptor errors - """ - runner = CliRunner() - - result = runner.invoke( - upsert_global_descriptors, - ["global_ids.csv"], - standalone_mode=False, - ) - assert result.exit_code == 1 - assert "BadParameter" in str(result.exc_info) - assert "global ID" in str(result.exc_info) - - -@pytest.mark.parametrize( - "kwargs", - [ - {"dewrangle_study_id": None, "study_global_id": "foo"}, - {"dewrangle_study_id": "foo", "study_global_id": None}, - ], -) -def test_upsert_global_descriptors_no_study(mocker, kwargs): - """ - Test d3b-clients dewrangle upsert-global-descriptors when study - is not found - """ - mock_study_api = mocker.patch( - "d3b_api_client_cli.dewrangle.global_id.study_api" - ) - mock_study_api.read_study.return_value = {} - mock_study_api.find_study.return_value = {} - - with pytest.raises(ValueError) as e: - _upsert_global_descriptors("global_ids.csv", **kwargs) - assert "does not exist" in str(e) - - -@pytest.mark.parametrize( - "kwargs", - [ - {"dewrangle_study_id": None, 
"study_global_id": "foo"}, - {"dewrangle_study_id": "foo", "study_global_id": None}, - ], -) -def test_download_global_descriptors_no_study(mocker, kwargs): - """ - Test d3b-clients dewrangle download-global-descriptors when study - is not found - """ - mock_study_api = mocker.patch( - "d3b_api_client_cli.dewrangle.global_id.study_api" - ) - mock_study_api.read_study.return_value = {} - mock_study_api.find_study.return_value = {} - - with pytest.raises(ValueError) as e: - _download_global_descriptors(**kwargs) - assert "does not exist" in str(e) diff --git a/tests/unit/dewrangle/test_graphql_common.py b/tests/unit/dewrangle/test_graphql_common.py deleted file mode 100644 index 619931c..0000000 --- a/tests/unit/dewrangle/test_graphql_common.py +++ /dev/null @@ -1,27 +0,0 @@ -import pytest - -from d3b_api_client_cli.dewrangle.graphql.common import create_graphql_client - - -@pytest.mark.parametrize( - "token,url, expected_msg", - [ - (None, None, "Missing required configuration"), - ("foo", None, "Missing required configuration"), - (None, "foo", "Missing required configuration"), - ("bar", "foo", None), - ], -) -def test_missing_configuration(mocker, token, url, expected_msg): - """ - Test create_graphql_client with missing configuration - """ - mocker.patch("d3b_api_client_cli.config.DEWRANGLE_DEV_PAT", token) - mocker.patch("d3b_api_client_cli.config.DEWRANGLE_BASE_URL", url) - - if expected_msg: - with pytest.raises(ValueError) as e: - create_graphql_client() - assert expected_msg in str(e) - else: - assert create_graphql_client() diff --git a/tests/unit/dewrangle/test_job.py b/tests/unit/dewrangle/test_job.py deleted file mode 100644 index d1dbc93..0000000 --- a/tests/unit/dewrangle/test_job.py +++ /dev/null @@ -1,179 +0,0 @@ -""" -Test job related functionality - -Poll job -""" - -import os -import time -import pytest -from click.testing import CliRunner - -from d3b_api_client_cli.dewrangle.graphql import job -from d3b_api_client_cli.cli import read_job - - 
-@pytest.mark.parametrize( - "status,expected_exc", - [ - ({"complete": True, "success": True}, None), - ({"done": True, "success": True}, ValueError), - ], -) -def test_validate_status_format(status, expected_exc): - """ - Test function that validates the complete function, - called after polling completes, returned a valid result - """ - if expected_exc: - with pytest.raises(expected_exc): - job._validate_status_format(status) - else: - job._validate_status_format(status) - - -def test_poll_job_success(mocker): - """ - Test polling Dewrangle for job status on successful completion - """ - # Test success case - mock_results = [ - { - "node": { - "id": "foo", - "operation": "VOLUME_LIST_AND_HASH", - "completedAt": None, - "errors": {"edges": []}, - } - } - for i in range(3) - ] - mock_results[-1]["node"]["completedAt"] = "completed date" - mock_exec_query = mocker.patch( - "d3b_api_client_cli.dewrangle.graphql.job.exec_query", - side_effect=mock_results, - ) - - output = job.poll_job("job_id") - - assert output["success"] == True - assert mock_exec_query.call_count == 3 - - -def test_poll_job_errors(mocker): - """ - Test polling Dewrangle for job status with errors - """ - # Test success case - mock_results = [ - { - "node": { - "id": "foo", - "operation": "VOLUME_LIST_AND_HASH", - "completedAt": None, - "errors": {"edges": []}, - } - } - for i in range(3) - ] - mock_results[-1]["node"]["errors"] = { - "edges": [{"message": "something went wrong"}] - } - mock_exec_query = mocker.patch( - "d3b_api_client_cli.dewrangle.graphql.job.exec_query", - side_effect=mock_results, - ) - - output = job.poll_job("job_id") - - assert output["success"] is False - assert output["job"]["errors"]["edges"] - assert mock_exec_query.call_count == 3 - - -def test_poll_job_timeout(mocker): - """ - Test polling Dewrangle for job status with a timeout - """ - - def mock_func(query, variables): - time.sleep(2) - return { - "node": { - "id": "foo", - "operation": "VOLUME_LIST_AND_HASH", - 
"completedAt": None, - "errors": {"edges": []}, - } - } - - mock_exec_query = mocker.patch( - "d3b_api_client_cli.dewrangle.graphql.job.exec_query", - side_effect=mock_func, - ) - - output = job.poll_job("job_id", timeout_seconds=1) - - assert output["success"] is None - assert mock_exec_query.call_count == 1 - - -def test_read_job(tmp_path, mocker): - """ - Test d3b dewrangle read-job command - """ - output_dir = tmp_path / "output" - output_dir.mkdir() - - mock_results = [ - { - "node": { - "id": "foo", - "operation": "VOLUME_LIST_AND_HASH", - "completedAt": None, - "errors": {"edges": []}, - } - } - ] - mocker.patch( - "d3b_api_client_cli.dewrangle.graphql.job.exec_query", - side_effect=mock_results, - ) - runner = CliRunner() - result = runner.invoke( - read_job, ["job-id", "--output-dir", output_dir], standalone_mode=False - ) - assert result.exit_code == 0 - assert os.path.isfile( - os.path.join(output_dir, "Job-volume-list-and-hash.json") - ) - - -def test_read_job_errors(mocker): - """ - Test d3b dewrangle read-job command - """ - mock_results = [ - { - "node": { - "id": "foo", - "operation": "VOLUME_LIST_AND_HASH", - "completedAt": None, - "errors": {"edges": [{"message": "bad"}]}, - } - } - ] - mocker.patch( - "d3b_api_client_cli.dewrangle.graphql.job.exec_query", - side_effect=mock_results, - ) - runner = CliRunner() - result = runner.invoke( - read_job, - [ - "job-id", - ], - standalone_mode=False, - ) - assert result.exit_code == 0 - assert result.return_value["errors"]["edges"] diff --git a/tests/unit/dewrangle/test_volume.py b/tests/unit/dewrangle/test_volume.py deleted file mode 100644 index 76fa98e..0000000 --- a/tests/unit/dewrangle/test_volume.py +++ /dev/null @@ -1,81 +0,0 @@ -""" -Test Dewrangle volume related functionality -""" - -import pytest -from d3b_api_client_cli.dewrangle.graphql.volume import list_and_hash - - -def test_list_and_hash(mocker): - """ - Test volume.list_and_hash - """ - mocker.patch( - 
"d3b_api_client_cli.dewrangle.graphql.volume.find_study", - return_value={"id": "study"}, - ) - mocker.patch( - "d3b_api_client_cli.dewrangle.graphql.volume.find_volume", - return_value={"id": "volume"}, - ) - mocker.patch( - "d3b_api_client_cli.dewrangle.graphql.volume.exec_query", - return_value={"volumeListAndHash": {"job": {"id": "foo"}}}, - ) - result = list_and_hash("billing", bucket="vol", study_global_id="study") - assert result["id"] - - -def test_list_and_hash_missing_inputs(): - """ - Test volume.list_and_hash with missing inputs to lookup volume - """ - - with pytest.raises(ValueError) as e: - list_and_hash("billing", bucket="vol") - assert "must provide" in str(e) - - with pytest.raises(ValueError) as e: - list_and_hash("billing", study_global_id="study") - assert "must provide" in str(e) - - -def test_list_and_hash_no_billing_group(mocker): - """ - Test volume.list_and_hash empty billing group - """ - with pytest.raises(ValueError) as e: - list_and_hash("", bucket="vol", study_global_id="study") - assert "Billing group" in str(e) - - -def test_list_and_hash_no_study(mocker): - """ - Test volume.list_and_hash without existing study - """ - mocker.patch( - "d3b_api_client_cli.dewrangle.graphql.volume.find_study", - return_value={}, - ) - - with pytest.raises(ValueError) as e: - list_and_hash("billing", bucket="vol", study_global_id="study") - assert "study with ID" in str(e) - - -def test_list_and_hash_no_volume(mocker): - """ - Test volume.list_and_hash without existing volume - """ - mocker.patch( - "d3b_api_client_cli.dewrangle.graphql.volume.find_study", - return_value={"id": "study"}, - ) - mocker.patch( - "d3b_api_client_cli.dewrangle.graphql.volume.find_volume", - return_value={}, - ) - - with pytest.raises(ValueError) as e: - list_and_hash("billing", bucket="vol", study_global_id="study") - assert "volume with ID" in str(e)