diff --git a/cache/directory.py b/cache/directory.py index 9522b78..c0adeec 100644 --- a/cache/directory.py +++ b/cache/directory.py @@ -55,6 +55,24 @@ def download(self, output: str | PathLike): ) } }, + "UniProt": { + # We use FTP when possible, but we delegate to the UniProt REST API in cases that would save significant bandwidth. + "9606": { + # We prefer manually curated genes. + "SwissProt_9606.tsv": CacheItem( + cached="https://drive.google.com/uc?id=1h2Cl-60qcKse-djcsqlRXm_n60mVY7lk", + online="https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Cid%2Cprotein_name%2Cgene_names&format=tsv&query=%28*%29+AND+%28reviewed%3Atrue%29+AND+%28model_organism%3A9606%29" + ), + "HUMAN_9606_idmapping_selected.tab.gz": CacheItem( + cached="https://drive.google.com/uc?id=1Oysa5COq31H771rVeyrs-6KFhE3VJqoX", + online="https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping_selected.tab.gz" + ), + "HUMAN_9606_idmapping.dat.gz": CacheItem( + cached="https://drive.google.com/uc?id=1lGxrx_kGyNdupwIOUXzfIZScc7rQKP-O", + online="https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz" + ) + } + }, "DISEASES": { # Instead of going through https://unmtid-shinyapps.net/shiny/tiga/, we use their # archived files directory instead. @@ -80,6 +98,28 @@ def download(self, output: str | PathLike): cached="https://drive.google.com/uc?id=1-gPrDoluXIGydzWKjWEnW-nWhYu3YkHL", online=fetch_biomart_url((dir_path / "biomart" / "ensg-ensp.xml").read_text()) ) + }, + "DepMap": { + "OmicsProfiles.csv": CacheItem( + cached="https://drive.google.com/uc?id=1i54aKfO0Ci2QKLTNJnuQ_jgGhH4c9rTL", + online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F2025-05-01-master-mapping-table-28c2.12%2Fpublic_release_date.2025-05-01.master_mapping_table.csv&dl_name=OmicsProfiles.csv&bucket=depmap-external-downloads" + ), + "CRISPRGeneDependency.csv": CacheItem( + cached="https://drive.google.com/uc?id=122rWNqT_u3M7B_11WYZMtOLiPbBykkaz", + online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F25q2-public-557c.3%2FCRISPRGeneDependency.csv&dl_name=CRISPRGeneDependency.csv&bucket=depmap-external-downloads" + ), + "OmicsSomaticMutationsMatrixDamaging.csv": CacheItem( + cached="https://drive.google.com/uc?id=1W7N2H0Qi7NwmTmNChcwa2ZZ4WxAuz-Xh", + online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.87%2FOmicsSomaticMutationsMatrixDamaging.csv&dl_name=OmicsSomaticMutationsMatrixDamaging.csv&bucket=depmap-external-downloads" + ), + "OmicsExpressionProteinCodingGenesTPMLogp1.csv": CacheItem( + cached="https://drive.google.com/uc?id=1P0m88eXJ8GPdru8h9oOcHPeXKU7ljIrP", + online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.73%2FOmicsExpressionProteinCodingGenesTPMLogp1.csv&dl_name=OmicsExpressionProteinCodingGenesTPMLogp1.csv&bucket=depmap-external-downloads" + ), + "OmicsCNGeneWGS.csv": CacheItem( + cached="https://drive.google.com/uc?id=1TPp3cfK7OZUrftucr3fLO-krXSQAA6Ub", + online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.104%2FOmicsCNGeneWGS.csv&dl_name=OmicsCNGeneWGS.csv&bucket=depmap-external-downloads" + ) } } diff --git a/configs/dmmm.yaml b/configs/dmmm.yaml index 775aa69..a3ed1af 100644 --- a/configs/dmmm.yaml +++ b/configs/dmmm.yaml @@ -44,12 +44,12 @@ datasets: # HIV: https://github.com/Reed-CompBio/spras-benchmarking/blob/0293ae4dc0be59502fac06b42cfd9796a4b4413e/hiv-benchmarking/spras-config/config.yaml - label: dmmmhiv060 node_files: ["processed_prize_060.txt"] - edge_files: ["phosphosite-irefindex13.0-uniprot.txt"] + edge_files: ["../../../databases/irefindex/phosphosite-irefindex13.0-uniprot.txt"] other_files: [] data_dir: "datasets/hiv/processed" - label: dmmmhiv05 node_files: ["processed_prize_05.txt"] - edge_files: ["phosphosite-irefindex13.0-uniprot.txt"] + edge_files: ["../../../databases/irefindex/phosphosite-irefindex13.0-uniprot.txt"] other_files: [] data_dir: "datasets/hiv/processed" # Yeast: https://github.com/tristan-f-r/spras-benchmarking/blob/9477d85871024a5e3a4b0b8b9be7e78c0d0ee961/yeast-osmotic-stress/config.yaml @@ -72,7 +72,11 @@ datasets: node_files: - prize_files/diabetes_mellitus_prizes.txt other_files: [] - + - label: dmmm_cellline_fadu + data_dir: datasets/depmap + edge_files: ["../../databases/irefindex/phosphosite-irefindex13.0-uniprot.txt"] + node_files: ["processed/FADU_cell_line_prizes_input_nonzero.txt"] + other_files: [] gold_standards: - label: gs0 node_files: ['GS_files/Alopecia_areata_GS.txt'] @@ -82,3 +86,7 @@ gold_standards: node_files: ['GS_files/Diabetes_mellitus_GS.txt'] data_dir: "datasets/diseases" dataset_labels: ["dmmm_diabetes_mellitus"] + - label: gs_fadu + node_files: ["processed/FADU_gold_standard.txt"] + data_dir: datasets/depmap + dataset_labels: ["dmmm_cellline_fadu"] diff --git a/databases/irefindex/README.md b/databases/irefindex/README.md new file mode 100644 index 0000000..de7f4bb --- /dev/null +++ b/databases/irefindex/README.md @@ -0,0 +1 @@ +The input edge file for the background network can be obtained from the SPRAS repo [`input/phosphosite-irefindex13.0-uniprot.txt`](https://github.com/Reed-CompBio/spras/blob/b5d7a2499afa8eab14c60ce0f99fa7e8a23a2c64/input/phosphosite-irefindex13.0-uniprot.txt). The actual originating site for this dataset is down. \ No newline at end of file diff --git a/datasets/hiv/raw/phosphosite-irefindex13.0-uniprot.txt b/databases/irefindex/phosphosite-irefindex13.0-uniprot.txt similarity index 100% rename from datasets/hiv/raw/phosphosite-irefindex13.0-uniprot.txt rename to databases/irefindex/phosphosite-irefindex13.0-uniprot.txt diff --git a/databases/stringdb.py b/databases/stringdb.py index 706b1c7..3c6717a 100644 --- a/databases/stringdb.py +++ b/databases/stringdb.py @@ -1,8 +1,7 @@ import argparse -import gzip import os from pathlib import Path -import shutil +from databases.util import uncompress from cache.directory import get_cache_item @@ -32,13 +31,6 @@ def parse_args(): return parser.parse_args() -def uncompress(source: Path, target: Path): - """Uncompresses a .gz file""" - # Uncompressing a .gz file: https://stackoverflow.com/a/44712152/7589775 - with gzip.open(source, "rb") as f_compressed: - with open(target, "wb") as f_uncompressed: - shutil.copyfileobj(f_compressed, f_uncompressed) - def main(): args = parse_args() string_path.mkdir(exist_ok=True) diff --git a/databases/util.py b/databases/util.py new file mode 100644 index 0000000..8ffb500 --- /dev/null +++ b/databases/util.py @@ -0,0 +1,10 @@ +from pathlib import Path +import gzip +import shutil + +def uncompress(source: Path, target: Path): + """Uncompresses a .gz file""" + # Uncompressing a .gz file: https://stackoverflow.com/a/44712152/7589775 + with gzip.open(source, "rb") as f_compressed: + with open(target, "wb") as f_uncompressed: + shutil.copyfileobj(f_compressed, f_uncompressed) diff --git a/datasets/depmap/.gitignore b/datasets/depmap/.gitignore new file mode 100644 index 0000000..dcba146 --- /dev/null +++ b/datasets/depmap/.gitignore @@ -0,0 +1,3 @@ +raw +testing +processed diff --git a/datasets/depmap/README.md b/datasets/depmap/README.md new file mode 100644 index 0000000..03e721a --- /dev/null +++ b/datasets/depmap/README.md @@ -0,0 +1,50 @@ +# Cancer Dependency Map Dataset + +This folder contains the processed data and the scripts for data analysis and preparation on datasets from The [Cancer Dependency Map](https://depmap.org/portal/), an initiative led by the Broad Institute to provide large-scale omics data in identifying cancer dependencies/vulnerabilities. + +You can read more about DepMap and the projects included here: https://www.broadinstitute.org/cancer/cancer-dependency-map + +## Raw Data +You can visit the DepMap all data downloads portal at: https://depmap.org/portal/data_page/?tab=allData +Download the following datasets under the primary files section of DepMap and move them to a directory named `raw` that you create. The dataset descriptions from the website are also included: + +Currently used files: + +- `OmicsProfiles.csv`: Omics metadata and ID mapping information for files indexed by Profile ID. This dataset is used for mapping cell line names to DepMap model IDs as a basis for data processing. (file URL: https://depmap.org/portal/data_page/?tab=allData&releasename=DepMap%20Public%2025Q2&filename=OmicsProfiles.csv) +- `CRISPRGeneDependency.csv`: Gene dependency probability estimates for all models in the integrated gene effect. This dataset is used to identify gold standard genes in each cell line, a dependency probability cutoff of 0.5 is currently used to get the genes with considerable impact on the cell line. (file URL: https://depmap.org/portal/data_page/?tab=allData&releasename=DepMap%20Public%2025Q2&filename=CRISPRGeneDependency.csv) +- `OmicsSomaticMutationsMatrixDamaging.csv`: Genotyped matrix determining for each cell line whether each gene has at least one damaging mutation. A variant is considered a damaging mutation if LikelyLoF == True. (0 == no mutation; If there is one or more damaging mutations in the same gene for the same cell line, the allele frequencies are summed, and if the sum is greater than 0.95, a value of 2 is assigned and if not, a value of 1 is assigned.). This dataset is used to prepare the input prize file. (file URL: https://depmap.org/portal/data_page/?tab=allData&releasename=DepMap%20Public%2025Q2&filename=OmicsSomaticMutationsMatrixDamaging.csv) +- `OmicsExpressionProteinCodingGenesTPMLogp1.csv`: Model-level TPMs derived from Salmon v1.10.0 (Patro et al 2017) Rows: Model IDs Columns: Gene names. (file URL: https://depmap.org/portal/data_page/?tab=allData&releasename=DepMap%20Public%2025Q2&filename=OmicsExpressionProteinCodingGenesTPMLogp1.csv) +- `OmicsCNGeneWGS.csv`: Gene-level copy number data inferred from WGS data only. Additional copy number datasets are available for download as part of the full DepMap Data Release. (file URL: https://depmap.org/portal/data_page/?tab=allData&releasename=DepMap%20Public%2025Q2&filename=OmicsCNGeneWGS.csv) + + +## Scripts +Currently contains: +- `local_cell_line_preprocessing.ipynb`: Jupyter notebook for exploratory data analysis and initial pipeline development. Includes CRISPR dependency analysis with multiple thresholds, visualization of gene dependency distributions, UniProt ID mapping workflow (both gene symbols and gene numbers approaches currently), and step-by-step generation of prize input files and gold standard files for individual cell lines. +- `cell_line_processing.py`: General cell line processing pipeline for generating prize input files and gold standard files converted into Python scripts. Should be reproducible for any cell line name, could be further organized and refined. +- `uniprot_mapping.py`: Gene symbol extraction script for UniProt ID mapping preparation. Parses gene symbols from any DepMap dataset column headers (e.g., "GENE_NAME (12345)" format) and saves them as CSV files ready for input to the UniProt web service. Currently used to extract gene symbols from `OmicsSomaticMutationsMatrixDamaging.csv`, but should be compatible with any omics dataset. + + +Files used for preparing required files: +- `OmicsProfiles.csv` used for mapping cell line names to DepMap model IDs. +- `OmicsSomaticMutationsMatrixDamaging.csv` used for preparing prize input file. +- `CRISPRGeneDependency.csv` used for preparing gold standard output. +- `OmicsExpressionProteinCodingGenesTPMLogp1.csv`: Model-level TPMs derived from Salmon v1.10.0 (Patro et al 2017) Rows: Model IDs Columns: Gene names. +- `OmicsCNGeneWGS.csv`: Gene-level copy number data inferred from WGS data only. Additional copy number datasets are available for download as part of the full DepMap Data Release. + +## Processed Data +Files used for UniProt ID mapping: +- `DamagingMutationsGeneSymbols.csv`: Gene symbols and Gene IDs parsed from gene columns in `OmicsSomaticMutationsMatrixDamaging.csv` on the date described +- `DamagingMutations_idMapping.tsv`: Gene symbols from `DamagingMutationsGeneSymbols_20250718.csv` mapped to UniProt SwissProt IDs, using Gene ID data +to provide more accurate mappings when possible, since gene symbol -> UniProt mappings are not one-to-one mapping. (TODO: some Gene ID -> UniProt +mappings are also not one-to-one: the accuracy could be improved by identifying the gene via the mutations present in the associated matrix.) + +Started processing with the FADU cell line: +- Input prize file prepared from the damaging mutations dataset +- Gold standard file prepared from the CRISPR gene dependency dataset + +## Config +Example Config file used to get preliminary results on OmicsIntegrator1 and 2 following the EGFR dataset example. Will test out more parameters and update. + +## Release Citation +For DepMap Release data, including CRISPR Screens, PRISM Drug Screens, Copy Number, Mutation, Expression, and Fusions: +DepMap, Broad (2025). DepMap Public 25Q2. Dataset. depmap.org diff --git a/datasets/depmap/Snakefile b/datasets/depmap/Snakefile new file mode 100644 index 0000000..2182663 --- /dev/null +++ b/datasets/depmap/Snakefile @@ -0,0 +1,45 @@ +rule all: + # We currently only care about the FADU cell line. + input: + "processed/FADU_cell_line_prizes_input_nonzero.txt", + "processed/FADU_cell_line_prizes.txt", + "processed/FADU_gold_standard_thresh_0_5.txt" + +rule fetch: + output: + "raw/CRISPRGeneDependency.csv", + "raw/OmicsProfiles.csv", + "raw/OmicsSomaticMutationsMatrixDamaging.csv", + "raw/OmicsExpressionProteinCodingGenesTPMLogp1.csv", + "raw/OmicsCNGeneWGS.csv", + "raw/HUMAN_9606_idmapping.tsv", + "raw/HUMAN_9606_idmapping_selected.tsv", + "raw/SwissProt_9606.tsv" + shell: + "uv run scripts/fetch.py" + +rule mapping: + input: + "raw/SwissProt_9606.tsv", + "raw/HUMAN_9606_idmapping.tsv", + "raw/HUMAN_9606_idmapping_selected.tsv", + "raw/OmicsSomaticMutationsMatrixDamaging.csv" + output: + "processed/DamagingMutations_idMapping.tsv" + shell: + "uv run scripts/uniprot_mapping.py" + +rule process: + input: + "processed/DamagingMutations_idMapping.tsv", + "raw/OmicsSomaticMutationsMatrixDamaging.csv", + "raw/OmicsProfiles.csv", + "raw/OmicsExpressionProteinCodingGenesTPMLogp1.csv", + "raw/OmicsCNGeneWGS.csv", + "raw/CRISPRGeneDependency.csv" + output: + "processed/FADU_cell_line_prizes_input_nonzero.txt", + "processed/FADU_cell_line_prizes.txt", + "processed/FADU_gold_standard_thresh_0_5.txt" + shell: + "uv run scripts/cell_line_processing.py" diff --git a/datasets/depmap/scripts/cell_line_processing.py b/datasets/depmap/scripts/cell_line_processing.py new file mode 100644 index 0000000..0fad7d9 --- /dev/null +++ b/datasets/depmap/scripts/cell_line_processing.py @@ -0,0 +1,221 @@ +import pandas as pd +import re +import os +from pathlib import Path + +# configuration - change cell line list and dependency cutoff as needed +cell_line_names = ["FADU", "BHY", "SCC4"] +dependency_threshold = 0.5 +require_all_datasets = False # set to true to require all data types + +dir_path = Path(os.path.dirname(os.path.realpath(__file__))) + + +class CellLineProcessingError(Exception): + """Custom exception for cell line processing errors.""" + + pass + + +def check_cell_line(cell_line_name, omics_profiles, damaging_mutations_df, CRISPR_dependency, omics_expression, omics_cnv): + match = omics_profiles[omics_profiles["StrippedCellLineName"].str.lower() == cell_line_name.lower()] + + if match.empty: + raise CellLineProcessingError(f"Cell line '{cell_line_name}' not found in OmicsProfiles.") + + model_id = match.index[0] + print(f"Found '{cell_line_name}' cell line, model ID: {model_id}") + + # Check required datasets (mutations + CRISPR) - always raise errors if missing + if model_id not in damaging_mutations_df.index: + raise CellLineProcessingError(f"Cell line '{cell_line_name}' not found in required damaging mutations matrix") + else: + row_mut = damaging_mutations_df.index.get_loc(model_id) + print(f"{cell_line_name} cell line (Model ID: {model_id}) present in mutations matrix at row {row_mut}") + + if model_id not in CRISPR_dependency.index: + raise CellLineProcessingError(f"Cell line '{cell_line_name}' not found in required CRISPR dependency data") + else: + row_dep = CRISPR_dependency.index.get_loc(model_id) + print(f"{cell_line_name} cell line (Model ID: {model_id}) present in CRISPR dependencies at row {row_dep}") + + # check other datasets + if require_all_datasets: + if model_id not in omics_cnv.index: + raise CellLineProcessingError(f"Cell line '{cell_line_name}' not found in required omics CNV data") + else: + row_cnv = omics_cnv.index.get_loc(model_id) + print(f"{cell_line_name} cell line (Model ID: {model_id}) present in omics CNV at row {row_cnv}") + + if model_id not in omics_expression.index: + raise CellLineProcessingError(f"Cell line '{cell_line_name}' not found in required expression data") + else: + row_exp = omics_expression.index.get_loc(model_id) + print(f"{cell_line_name} cell line (Model ID: {model_id}) present in omics expressions at row {row_exp}") + + print(f"All required datasets contain the cell line '{cell_line_name}' (Model ID: {model_id})") + else: + # log availability of optional datasets + cnv_present = model_id in omics_cnv.index + expression_present = model_id in omics_expression.index + + if cnv_present: + row_cnv = omics_cnv.index.get_loc(model_id) + print(f"{cell_line_name} cell line (Model ID: {model_id}) present in omics CNV at row {row_cnv}") + else: + print(f"WARNING: {cell_line_name} not found in omics CNV data (optional dataset)") + + if expression_present: + row_exp = omics_expression.index.get_loc(model_id) + print(f"{cell_line_name} cell line (Model ID: {model_id}) present in omics expressions at row {row_exp}") + else: + print(f"WARNING: {cell_line_name} not found in expression data (optional dataset)") + + print(f"Required datasets contain the cell line '{cell_line_name}' (Model ID: {model_id})") + + return True, model_id + + +def generate_prize_files(cell_line_name, model_id, damaging_mutations_df, uniprot_map): + """Generate prize input file for the cell line based on damaging mutations and uniprot mapping.""" + # Extract mutation data for the cell line + mutation_row = damaging_mutations_df.loc[model_id] + # mapping gene symbols to Uniprot IDs + gene_to_uniprot = dict(zip(uniprot_map["From"], uniprot_map["Entry Name"])) + rows = [] + for col, score in mutation_row.items(): + # extract gene symbols for mapping + match = re.match(r"^(.*?) \(", col) + gene_symbol = match.group(1) if match else col + # only map for gene symbols in uniprot map + if gene_symbol in gene_to_uniprot: + uniprot_id = gene_to_uniprot[gene_symbol] + rows.append([gene_symbol, uniprot_id, score]) + mapped_prizes_df = pd.DataFrame(rows, columns=["GeneSymbol", "UniprotID", "Prize"]) + + # prize input file for all genes + prizes_input_file = mapped_prizes_df[mapped_prizes_df.columns[1:]].rename(columns={"UniprotID": "NODEID", "Prize": "prize"}) + output_path = dir_path / ".." / "processed" / f"{cell_line_name}_cell_line_prizes.txt" + prizes_input_file.to_csv(output_path, sep="\t", index=False, header=True) + print(f"Prize file saved for cell line '{cell_line_name}' at: {output_path}") + + # nonzero prizes input file + nonzero_prizes_input_file = prizes_input_file[prizes_input_file["prize"] > 0] + nonzero_output_path = dir_path / ".." / "processed" / f"{cell_line_name}_cell_line_prizes_input_nonzero.txt" + nonzero_prizes_input_file.to_csv(nonzero_output_path, sep="\t", index=False, header=True) + print(f"Nonzero prize file saved for cell line '{cell_line_name}' at: {nonzero_output_path}") + + return gene_to_uniprot + + +def process_single_cell_line( + cell_line_name: str, + omics_profiles: pd.DataFrame, + damaging_mutations_df: pd.DataFrame, + CRISPR_dependency: pd.DataFrame, + omics_expression: pd.DataFrame, + omics_cnv: pd.DataFrame, + uniprot_map: pd.DataFrame, +): + """Process a single cell line and generate output files.""" + print(f"\n=== Processing cell line: {cell_line_name} ===") + + is_present, model_id = check_cell_line(cell_line_name, omics_profiles, damaging_mutations_df, CRISPR_dependency, omics_expression, omics_cnv) + + if is_present: + # make prize input files + gene_to_uniprot = generate_prize_files(cell_line_name, model_id, damaging_mutations_df, uniprot_map) + + # make gold standard file + generate_gold_standard(cell_line_name, model_id, CRISPR_dependency, gene_to_uniprot, dependency_threshold) + print(f"Processing for cell line '{cell_line_name}' completed successfully.") + return True + +def generate_gold_standard(cell_line_name, model_id, CRISPR_dependency, gene_to_uniprot, threshold: float): + """Generate gold standard file for the cell line based on CRISPR dependency and gene to Uniprot mapping.""" + # map Uniprot IDs to gene symbols in the CRISPR dependency data + cell_line_dependency = CRISPR_dependency.loc[model_id] + filtered_dependency = cell_line_dependency[cell_line_dependency > threshold] + mapped_dependency = [] + for gene, dependency in filtered_dependency.items(): + match = re.match(r"^(.*?) \(", gene) + gene_symbol = match.group(1) if match else gene + if gene_symbol in gene_to_uniprot: + uniprot_id = gene_to_uniprot[gene_symbol] + mapped_dependency.append([gene_symbol, uniprot_id, dependency]) + + mapped_dependency_df = pd.DataFrame(mapped_dependency, columns=["GeneSymbol", "UniprotID", "Dependency"]) + + # save mapped dependency as gold standard file + gold_standard = mapped_dependency_df[mapped_dependency_df.columns[1]] + threshold_str = str(dependency_threshold).replace(".", "_") + gold_standard_output_path = dir_path / ".." / "processed" / f"{cell_line_name}_gold_standard_thresh_{threshold_str}.txt" + gold_standard.to_csv(gold_standard_output_path, sep="\t", index=False, header=False) + print(f"Gold standard file saved for cell line '{cell_line_name}' at: {gold_standard_output_path}") + print(f"Threshold: {dependency_threshold} Number of genes in gold standard: {len(gold_standard)}") + + +def main(): + print(f"Processing cell lines: {cell_line_names}") + print(f"Require all datasets: {require_all_datasets}") + + try: + # Load raw dataset files + print("Loading datasets...") + base_dir = dir_path / ".." / "raw" + damaging_mutations_df = pd.read_csv(base_dir / "OmicsSomaticMutationsMatrixDamaging.csv", index_col=0) + omics_profiles = pd.read_csv(base_dir / "OmicsProfiles.csv", index_col=0) + omics_expression = pd.read_csv(base_dir / "OmicsExpressionProteinCodingGenesTPMLogp1.csv", index_col=0) + omics_cnv = pd.read_csv(base_dir / "OmicsCNGeneWGS.csv", index_col=0) + CRISPR_dependency = pd.read_csv(base_dir / "CRISPRGeneDependency.csv", index_col=0) + + # Load uniprot mapping file form gene symbols + print("Loading UniProt mapping file...") + uniprot_map = pd.read_csv(dir_path / ".." / "processed" / "DamagingMutations_idMapping.tsv", sep="\t") + + except FileNotFoundError as e: + print(f"ERROR: Required data file not found: {e}") + raise SystemExit(1) + except Exception as e: + print(f"ERROR: Failed to load data files: {e}") + raise SystemExit(1) + + # Process each cell line + successful_count = 0 + failed_count = 0 + successful_cell_lines = [] + failed_cell_lines = [] + + for cell_line_name in cell_line_names: + success = process_single_cell_line( + cell_line_name, omics_profiles, damaging_mutations_df, CRISPR_dependency, omics_expression, omics_cnv, uniprot_map + ) + if success: + successful_count += 1 + successful_cell_lines.append(cell_line_name) + else: + failed_count += 1 + failed_cell_lines.append(cell_line_name) + + # summary stats + print("\n=== Processing Summary ===") + print(f"Total cell lines processed: {len(cell_line_names)}") + print(f"Successfully processed: {successful_count}") + print(f"Failed to process: {failed_count}") + + if successful_cell_lines: + print(f"Successfully processed cell lines: {', '.join(successful_cell_lines)}") + + if failed_cell_lines: + print(f"Failed cell lines: {', '.join(failed_cell_lines)}") + + # Exit with error code if any cell line processing failed + if failed_count > 0: + print("Some cell lines failed to process. Check error messages above.") + raise SystemExit(1) + else: + print("All cell lines processed successfully.") + + +if __name__ == "__main__": + main() diff --git a/datasets/depmap/scripts/fetch.py b/datasets/depmap/scripts/fetch.py new file mode 100644 index 0000000..b922312 --- /dev/null +++ b/datasets/depmap/scripts/fetch.py @@ -0,0 +1,48 @@ +""" +Fetches the latest DepMap data we need + +Download page: https://depmap.org/portal/data_page/?tab=allData +""" + +from pathlib import Path +import os +from cache.directory import get_cache_item +from databases.util import uncompress + +# https://stackoverflow.com/a/5137509/7589775 +dir_path = os.path.dirname(os.path.realpath(__file__)) + +raw_dir = Path(dir_path, "..", "raw") + + +def main(): + raw_dir.mkdir(exist_ok=True) + + print("Fetching DepMap omics metadata") + get_cache_item(["DepMap", "OmicsProfiles.csv"]).download(raw_dir / "OmicsProfiles.csv") + + print("Fetching DepMap gene dependency probability estimates...") + get_cache_item(["DepMap", "CRISPRGeneDependency.csv"]).download(raw_dir / "CRISPRGeneDependency.csv") + + print("Fetching DepMap genotyped matrix...") + get_cache_item(["DepMap", "OmicsSomaticMutationsMatrixDamaging.csv"]).download(raw_dir / "OmicsSomaticMutationsMatrixDamaging.csv") + + print("Fetching DepMap model-level TPMs...") + get_cache_item(["DepMap", "OmicsExpressionProteinCodingGenesTPMLogp1.csv"]).download(raw_dir / "OmicsExpressionProteinCodingGenesTPMLogp1.csv") + + print("Fetching DepMap gene-level copy number data...") + get_cache_item(["DepMap", "OmicsCNGeneWGS.csv"]).download(raw_dir / "OmicsCNGeneWGS.csv") + + print("Fetching UniProt internal id mapping...") + get_cache_item(["UniProt", "9606", "HUMAN_9606_idmapping.dat.gz"]).download(raw_dir / "HUMAN_9606_idmapping.dat.gz") + uncompress(raw_dir / "HUMAN_9606_idmapping.dat.gz", raw_dir / "HUMAN_9606_idmapping.tsv") + + print("Fetching UniProt id external database mapping...") + get_cache_item(["UniProt", "9606", "HUMAN_9606_idmapping_selected.tab.gz"]).download(raw_dir / "HUMAN_9606_idmapping_selected.tab.gz") + uncompress(raw_dir / "HUMAN_9606_idmapping_selected.tab.gz", raw_dir / "HUMAN_9606_idmapping_selected.tsv") + + print("Fetching UniProt SwissProt genes...") + get_cache_item(["UniProt", "9606", "SwissProt_9606.tsv"]).download(raw_dir / "SwissProt_9606.tsv") + +if __name__ == "__main__": + main() diff --git a/datasets/depmap/scripts/uniprot_mapping.py b/datasets/depmap/scripts/uniprot_mapping.py new file mode 100644 index 0000000..4198366 --- /dev/null +++ b/datasets/depmap/scripts/uniprot_mapping.py @@ -0,0 +1,79 @@ +import pandas as pd +import os +from pathlib import Path + +"""Extracts gene symbols from dataset for UniProt mapping""" + +dir_path = Path(os.path.dirname(os.path.realpath(__file__))) + + +def extract_gene_symbols(input_df: pd.DataFrame) -> pd.DataFrame: + """ + Extracts gene symbols from the input file. + """ + # The gene symbols are stored in the columns of the matrix, as GENE_NAME (GENE_ID) + gene_columns = input_df.columns.tolist()[1:] + gene_symbols = [ + # We want to extract GENE_NAME from GENE_NAME (Unknown) + (col[:col.find("(") - 1], None) if "(Unknown)" in col else + # or GENE_ID from "GENE_NAME (GENE_ID)" + (col[:col.find("(") - 1], col[col.find("(") + 1:-1]) if "(" in col else + (col, None) + for col in gene_columns + ] + + gene_symbols_df = pd.DataFrame(gene_symbols, columns=["GeneSymbol", "GeneID"]) + + print(f"Extracted {len(gene_symbols_df)} gene symbols from the input file.") + print(f"First 5 gene symbols: {gene_symbols_df['GeneSymbol'].head().tolist()}") + + return gene_symbols_df + + +def main(): + # Load the dataset + # We only read the first row since we only care about the column names of the matrix + input_df = pd.read_csv(dir_path / ".." / "raw" / "OmicsSomaticMutationsMatrixDamaging.csv", index_col=0, nrows=1) + + gene_symbols_df = extract_gene_symbols(input_df) + + processed_path = dir_path / ".." / "processed" + processed_path.mkdir(exist_ok=True) + + # We split gene_symbols_df into where GeneID does and does not exist + gene_symbols_df_id = gene_symbols_df[gene_symbols_df["GeneID"].notnull()] + gene_symbols_df_nid = gene_symbols_df[~gene_symbols_df["GeneID"].notnull()] + + # Now, we map it through HUMAN_9606_idmapping_selected.tsv from UniProt (see fetch.py): + # Associated documentation of this format is at https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/README + # We have two files: idmapping_selected will be used for GeneID -> UniProtKB-AC mapping, + # while idmapping will be used for GeneSymbol -> UniProtKB-AC mapping. + + # We'll also take the idmapping data and trim for specifically Swiss-Prot (curated) genes. + curated_df = pd.read_csv(dir_path / ".." / "raw" / "SwissProt_9606.tsv", sep='\t', usecols=["Entry", "Entry Name", "Gene Names"]) + curated_df.columns = ["UniProtKB-AC", "Entry Name", "Gene Names"] + + idmapping_df = pd.read_csv( + dir_path / ".." / "raw" / "HUMAN_9606_idmapping.tsv", + header=None, names=["UniProtKB-AC", "ID_type", "Value"], sep='\t') + idmapping_df = idmapping_df[idmapping_df["ID_type"] == "Gene_Name"].drop(columns=["ID_type"]).rename(columns={"Value": "GeneSymbol"}) + idmapping_df = idmapping_df.merge(curated_df, on="UniProtKB-AC", how="inner") + gene_symbols_df_nid = gene_symbols_df_nid.merge(idmapping_df, on="GeneSymbol", how="inner").drop(columns=["GeneID"]) + + idmapping_selected_df = pd.read_csv( + dir_path / ".." / "raw" / "HUMAN_9606_idmapping_selected.tsv", + header=None, usecols=[0, 1, 2], names=["UniProtKB-AC", "UniProtKB-ID", "GeneID"], sep='\t' + ) + idmapping_selected_df = idmapping_selected_df[~idmapping_selected_df["GeneID"].isna()] + idmapping_selected_df = idmapping_selected_df.merge(curated_df, on="UniProtKB-AC", how="inner") + gene_symbols_df_id = gene_symbols_df_id.merge(idmapping_selected_df, on="GeneID", how="inner") + gene_symbols_df_id = gene_symbols_df_id.drop(columns=["GeneID", "UniProtKB-ID"]) + + gene_symbol_df = gene_symbols_df_id.merge(gene_symbols_df_nid, on=["GeneSymbol", "UniProtKB-AC", "Entry Name", "Gene Names"], how="outer") + gene_symbol_df = gene_symbol_df.drop(columns=["Gene Names"]) + gene_symbol_df = gene_symbol_df.rename(columns={"GeneSymbol": "From"}) + gene_symbol_df.to_csv(dir_path / ".." / "processed" / "DamagingMutations_idMapping.tsv", sep='\t', index=False) + + +if __name__ == "__main__": + main() diff --git a/datasets/hiv/Snakefile b/datasets/hiv/Snakefile index 14c1d6b..a91d136 100644 --- a/datasets/hiv/Snakefile +++ b/datasets/hiv/Snakefile @@ -2,7 +2,6 @@ rule all: input: "processed/processed_prize_05.txt", "processed/processed_prize_060.txt", - "processed/phosphosite-irefindex13.0-uniprot.txt" rule data_prep: input: @@ -30,11 +29,3 @@ rule spras_formatting: "processed/processed_prize_060.txt" shell: "uv run Scripts/SPRAS_Formatting.py" - -rule copy_network: - input: - "raw/phosphosite-irefindex13.0-uniprot.txt" - output: - "processed/phosphosite-irefindex13.0-uniprot.txt" - shell: - "cp raw/phosphosite-irefindex13.0-uniprot.txt processed/phosphosite-irefindex13.0-uniprot.txt" \ No newline at end of file diff --git a/run_snakemake.sh b/run_snakemake.sh index d48a951..2430524 100755 --- a/run_snakemake.sh +++ b/run_snakemake.sh @@ -17,6 +17,7 @@ main() { uv run snakemake --cores 1 -d datasets/hiv -s datasets/hiv/Snakefile uv run snakemake --cores 1 -d datasets/diseases -s datasets/diseases/Snakefile uv run snakemake --cores 1 -d datasets/rn-muscle-skeletal -s datasets/rn-muscle-skeletal/Snakefile + uv run snakemake --cores 1 -d datasets/depmap -s datasets/depmap/Snakefile } main "$@"