Reed-CompBio · tristan-f-r · Dec 30, 2025 · Jul 30, 2025 · Jul 30, 2025 · Jul 31, 2025
diff --git a/cache/directory.py b/cache/directory.py
@@ -55,6 +55,24 @@ def download(self, output: str | PathLike):
             )
         }
     },
+    "UniProt": {
+        # We use FTP when possible, but we delegate to the UniProt REST API in cases that would save significant bandwidth.
+        "9606": {
+            # We prefer manually curated genes.
+            "SwissProt_9606.tsv": CacheItem(
+                cached="https://drive.google.com/uc?id=1h2Cl-60qcKse-djcsqlRXm_n60mVY7lk",
+                online="https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Cid%2Cprotein_name%2Cgene_names&format=tsv&query=%28*%29+AND+%28reviewed%3Atrue%29+AND+%28model_organism%3A9606%29"
+            ),
+            "HUMAN_9606_idmapping_selected.tab.gz": CacheItem(
+                cached="https://drive.google.com/uc?id=1Oysa5COq31H771rVeyrs-6KFhE3VJqoX",
+                online="https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping_selected.tab.gz"
+            ),
+            "HUMAN_9606_idmapping.dat.gz": CacheItem(
+                cached="https://drive.google.com/uc?id=1lGxrx_kGyNdupwIOUXzfIZScc7rQKP-O",
+                online="https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz"
+            )
+        }
+    },
     "DISEASES": {
         # Instead of going through https://unmtid-shinyapps.net/shiny/tiga/, we use their
         # archived files directory instead.
@@ -80,6 +98,28 @@ def download(self, output: str | PathLike):
             cached="https://drive.google.com/uc?id=1-gPrDoluXIGydzWKjWEnW-nWhYu3YkHL",
             online=fetch_biomart_url((dir_path / "biomart" / "ensg-ensp.xml").read_text())
         )
+    },
+    "DepMap": {
+        "OmicsProfiles.csv": CacheItem(
+            cached="https://drive.google.com/uc?id=1i54aKfO0Ci2QKLTNJnuQ_jgGhH4c9rTL",
+            online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F2025-05-01-master-mapping-table-28c2.12%2Fpublic_release_date.2025-05-01.master_mapping_table.csv&dl_name=OmicsProfiles.csv&bucket=depmap-external-downloads"
+        ),
+        "CRISPRGeneDependency.csv": CacheItem(
+            cached="https://drive.google.com/uc?id=122rWNqT_u3M7B_11WYZMtOLiPbBykkaz",
+            online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F25q2-public-557c.3%2FCRISPRGeneDependency.csv&dl_name=CRISPRGeneDependency.csv&bucket=depmap-external-downloads"
+        ),
+        "OmicsSomaticMutationsMatrixDamaging.csv": CacheItem(
+            cached="https://drive.google.com/uc?id=1W7N2H0Qi7NwmTmNChcwa2ZZ4WxAuz-Xh",
+            online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.87%2FOmicsSomaticMutationsMatrixDamaging.csv&dl_name=OmicsSomaticMutationsMatrixDamaging.csv&bucket=depmap-external-downloads"
+        ),
+        "OmicsExpressionProteinCodingGenesTPMLogp1.csv": CacheItem(
+            cached="https://drive.google.com/uc?id=1P0m88eXJ8GPdru8h9oOcHPeXKU7ljIrP",
+            online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.73%2FOmicsExpressionProteinCodingGenesTPMLogp1.csv&dl_name=OmicsExpressionProteinCodingGenesTPMLogp1.csv&bucket=depmap-external-downloads"
+        ),
+        "OmicsCNGeneWGS.csv": CacheItem(
+            cached="https://drive.google.com/uc?id=1TPp3cfK7OZUrftucr3fLO-krXSQAA6Ub",
+            online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.104%2FOmicsCNGeneWGS.csv&dl_name=OmicsCNGeneWGS.csv&bucket=depmap-external-downloads"
+        )
     }
 }
 

diff --git a/configs/dmmm.yaml b/configs/dmmm.yaml
@@ -44,12 +44,12 @@ datasets:
   # HIV: https://github.com/Reed-CompBio/spras-benchmarking/blob/0293ae4dc0be59502fac06b42cfd9796a4b4413e/hiv-benchmarking/spras-config/config.yaml
   - label: dmmmhiv060
     node_files: ["processed_prize_060.txt"]
-    edge_files: ["phosphosite-irefindex13.0-uniprot.txt"]
+    edge_files: ["../../../databases/irefindex/phosphosite-irefindex13.0-uniprot.txt"]
     other_files: []
     data_dir: "datasets/hiv/processed"
   - label: dmmmhiv05
     node_files: ["processed_prize_05.txt"]
-    edge_files: ["phosphosite-irefindex13.0-uniprot.txt"]
+    edge_files: ["../../../databases/irefindex/phosphosite-irefindex13.0-uniprot.txt"]
     other_files: []
     data_dir: "datasets/hiv/processed"
   # Yeast: https://github.com/tristan-f-r/spras-benchmarking/blob/9477d85871024a5e3a4b0b8b9be7e78c0d0ee961/yeast-osmotic-stress/config.yaml
@@ -72,7 +72,11 @@ datasets:
     node_files:
       - prize_files/diabetes_mellitus_prizes.txt
     other_files: []
-
+  - label: dmmm_cellline_fadu
+    data_dir: datasets/depmap
+    edge_files: ["../../databases/irefindex/phosphosite-irefindex13.0-uniprot.txt"]
+    node_files: ["processed/FADU_cell_line_prizes_input_nonzero.txt"]
+    other_files: []
 gold_standards:
   - label: gs0
     node_files: ['GS_files/Alopecia_areata_GS.txt']
@@ -82,3 +86,7 @@ gold_standards:
     node_files: ['GS_files/Diabetes_mellitus_GS.txt']
     data_dir: "datasets/diseases"
     dataset_labels: ["dmmm_diabetes_mellitus"]
+  - label: gs_fadu
+    node_files: ["processed/FADU_gold_standard.txt"]
+    data_dir: datasets/depmap
+    dataset_labels: ["dmmm_cellline_fadu"]
diff --git a/databases/irefindex/README.md b/databases/irefindex/README.md
@@ -0,0 +1 @@
+The input edge file for the background network can be obtained from the SPRAS repo [`input/phosphosite-irefindex13.0-uniprot.txt`](https://github.com/Reed-CompBio/spras/blob/b5d7a2499afa8eab14c60ce0f99fa7e8a23a2c64/input/phosphosite-irefindex13.0-uniprot.txt). The actual originating site for this dataset is down.
diff --git a/...raw/phosphosite-irefindex13.0-uniprot.txt → ...dex/phosphosite-irefindex13.0-uniprot.txt b/...raw/phosphosite-irefindex13.0-uniprot.txt → ...dex/phosphosite-irefindex13.0-uniprot.txt
diff --git a/databases/stringdb.py b/databases/stringdb.py
@@ -1,8 +1,7 @@
 import argparse
-import gzip
 import os
 from pathlib import Path
-import shutil
+from databases.util import uncompress
 
 from cache.directory import get_cache_item
 
@@ -32,13 +31,6 @@ def parse_args():
 
     return parser.parse_args()
 
-def uncompress(source: Path, target: Path):
-    """Uncompresses a .gz file"""
-    # Uncompressing a .gz file: https://stackoverflow.com/a/44712152/7589775
-    with gzip.open(source, "rb") as f_compressed:
-        with open(target, "wb") as f_uncompressed:
-            shutil.copyfileobj(f_compressed, f_uncompressed)
-
 def main():
     args = parse_args()
     string_path.mkdir(exist_ok=True)

diff --git a/databases/util.py b/databases/util.py
@@ -0,0 +1,10 @@
+from pathlib import Path
+import gzip
+import shutil
+
+def uncompress(source: Path, target: Path) -> None:
+    """Decompress the gzip file at `source` and write the raw bytes to `target`."""
+    # Stream-copy between the two file objects; approach from https://stackoverflow.com/a/44712152/7589775
+    with gzip.open(source, "rb") as f_compressed:
+        with open(target, "wb") as f_uncompressed:
+            shutil.copyfileobj(f_compressed, f_uncompressed)
diff --git a/datasets/depmap/.gitignore b/datasets/depmap/.gitignore
@@ -0,0 +1,3 @@
+raw
+testing
+processed
diff --git a/datasets/depmap/README.md b/datasets/depmap/README.md
@@ -0,0 +1,50 @@
+# Cancer Dependency Map Dataset
+
+This folder contains the processed data and the scripts for data analysis and preparation of datasets from the [Cancer Dependency Map](https://depmap.org/portal/), an initiative led by the Broad Institute to provide large-scale omics data for identifying cancer dependencies/vulnerabilities.
+
+You can read more about DepMap and the projects included here: https://www.broadinstitute.org/cancer/cancer-dependency-map
+
+## Raw Data
+You can visit the DepMap all data downloads portal at: https://depmap.org/portal/data_page/?tab=allData
+Download the following datasets under the primary files section of DepMap and move them to a directory named `raw` that you create. The dataset descriptions from the website are also included:
+
+Currently used files:
+
+- `OmicsProfiles.csv`: Omics metadata and ID mapping information for files indexed by Profile ID. This dataset is used for mapping cell line names to DepMap model IDs as a basis for data processing. (file URL: https://depmap.org/portal/data_page/?tab=allData&releasename=DepMap%20Public%2025Q2&filename=OmicsProfiles.csv)
+- `CRISPRGeneDependency.csv`: Gene dependency probability estimates for all models in the integrated gene effect. This dataset is used to identify gold standard genes in each cell line; a dependency probability cutoff of 0.5 is currently used to select the genes with considerable impact on the cell line. (file URL: https://depmap.org/portal/data_page/?tab=allData&releasename=DepMap%20Public%2025Q2&filename=CRISPRGeneDependency.csv)
+- `OmicsSomaticMutationsMatrixDamaging.csv`: Genotyped matrix determining for each cell line whether each gene has at least one damaging mutation. A variant is considered a damaging mutation if LikelyLoF == True. (0 == no mutation; If there is one or more damaging mutations in the same gene for the same cell line, the allele frequencies are summed, and if the sum is greater than 0.95, a value of 2 is assigned and if not, a value of 1 is assigned.). This dataset is used to prepare the input prize file. (file URL: https://depmap.org/portal/data_page/?tab=allData&releasename=DepMap%20Public%2025Q2&filename=OmicsSomaticMutationsMatrixDamaging.csv)
+- `OmicsExpressionProteinCodingGenesTPMLogp1.csv`: Model-level TPMs derived from Salmon v1.10.0 (Patro et al 2017) Rows: Model IDs Columns: Gene names. (file URL: https://depmap.org/portal/data_page/?tab=allData&releasename=DepMap%20Public%2025Q2&filename=OmicsExpressionProteinCodingGenesTPMLogp1.csv)
+- `OmicsCNGeneWGS.csv`: Gene-level copy number data inferred from WGS data only. Additional copy number datasets are available for download as part of the full DepMap Data Release. (file URL: https://depmap.org/portal/data_page/?tab=allData&releasename=DepMap%20Public%2025Q2&filename=OmicsCNGeneWGS.csv)
+
+
+## Scripts
+Currently contains:
+- `local_cell_line_preprocessing.ipynb`: Jupyter notebook for exploratory data analysis and initial pipeline development. Includes CRISPR dependency analysis with multiple thresholds, visualization of gene dependency distributions, the UniProt ID mapping workflow (currently using both the gene symbol and the gene number approaches), and step-by-step generation of prize input files and gold standard files for individual cell lines.
+- `cell_line_processing.py`: General cell line processing pipeline, converted into a Python script, for generating prize input files and gold standard files. It should be reproducible for any cell line name, but could be further organized and refined.
+- `uniprot_mapping.py`: Gene symbol extraction script for UniProt ID mapping preparation. Parses gene symbols from any DepMap dataset column headers (e.g., "GENE_NAME (12345)" format) and saves them as CSV files ready for input to the UniProt web service. Currently used to extract gene symbols from `OmicsSomaticMutationsMatrixDamaging.csv`, but should be compatible with any omics dataset.
+
+
+Files used for preparing required files:
+- `OmicsProfiles.csv` used for mapping cell line names to DepMap model IDs.
+- `OmicsSomaticMutationsMatrixDamaging.csv` used for preparing prize input file.
+- `CRISPRGeneDependency.csv` used for preparing gold standard output.  
+- `OmicsExpressionProteinCodingGenesTPMLogp1.csv`: Model-level TPMs derived from Salmon v1.10.0 (Patro et al 2017) Rows: Model IDs Columns: Gene names.
+- `OmicsCNGeneWGS.csv`: Gene-level copy number data inferred from WGS data only. Additional copy number datasets are available for download as part of the full DepMap Data Release.
+
+## Processed Data
+Files used for UniProt ID mapping:
+- `DamagingMutationsGeneSymbols.csv`: Gene symbols and Gene IDs parsed from gene columns in `OmicsSomaticMutationsMatrixDamaging.csv` on the date described
+- `DamagingMutations_idMapping.tsv`: Gene symbols from `DamagingMutationsGeneSymbols.csv` mapped to UniProt SwissProt IDs, using Gene ID data
+to provide more accurate mappings when possible, since gene symbol -> UniProt mappings are not one-to-one. (TODO: some Gene ID -> UniProt
+mappings are also not one-to-one: the accuracy could be improved by identifying the gene via the mutations present in the associated matrix.)
+
+Started processing with the FADU cell line:
+- Input prize file prepared from the damaging mutations dataset
+- Gold standard file prepared from the CRISPR gene dependency dataset
+
+## Config
+Example Config file used to get preliminary results on OmicsIntegrator1 and 2 following the EGFR dataset example. Will test out more parameters and update.
+
+## Release Citation
+For DepMap Release data, including CRISPR Screens, PRISM Drug Screens, Copy Number, Mutation, Expression, and Fusions:
+DepMap, Broad (2025). DepMap Public 25Q2. Dataset. depmap.org
diff --git a/datasets/depmap/Snakefile b/datasets/depmap/Snakefile
@@ -0,0 +1,45 @@
+rule all:
+    # We currently only care about the FADU cell line.
+    input:
+        "processed/FADU_cell_line_prizes_input_nonzero.txt",
+        "processed/FADU_cell_line_prizes.txt",
+        "processed/FADU_gold_standard_thresh_0_5.txt"
+
+rule fetch:
+    output:
+        "raw/CRISPRGeneDependency.csv",
+        "raw/OmicsProfiles.csv",
+        "raw/OmicsSomaticMutationsMatrixDamaging.csv",
+        "raw/OmicsExpressionProteinCodingGenesTPMLogp1.csv",
+        "raw/OmicsCNGeneWGS.csv",
+        "raw/HUMAN_9606_idmapping.tsv",
+        "raw/HUMAN_9606_idmapping_selected.tsv",
+        "raw/SwissProt_9606.tsv"
+    shell:
+        "uv run scripts/fetch.py"
+
+rule mapping:
+    input:
+        "raw/SwissProt_9606.tsv",
+        "raw/HUMAN_9606_idmapping.tsv",
+        "raw/HUMAN_9606_idmapping_selected.tsv",
+        "raw/OmicsSomaticMutationsMatrixDamaging.csv"
+    output:
+        "processed/DamagingMutations_idMapping.tsv"
+    shell:
+        "uv run scripts/uniprot_mapping.py"
+
+rule process:
+    input:
+        "processed/DamagingMutations_idMapping.tsv",
+        "raw/OmicsSomaticMutationsMatrixDamaging.csv",
+        "raw/OmicsProfiles.csv",
+        "raw/OmicsExpressionProteinCodingGenesTPMLogp1.csv",
+        "raw/OmicsCNGeneWGS.csv",
+        "raw/CRISPRGeneDependency.csv"
+    output:
+        "processed/FADU_cell_line_prizes_input_nonzero.txt",
+        "processed/FADU_cell_line_prizes.txt",
+        "processed/FADU_gold_standard_thresh_0_5.txt"
+    shell:
+        "uv run scripts/cell_line_processing.py"
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		The input edge file for the background network can be obtained from the SPRAS repo [`input/phosphosite-irefindex13.0-uniprot.txt`](https://github.com/Reed-CompBio/spras/blob/b5d7a2499afa8eab14c60ce0f99fa7e8a23a2c64/input/phosphosite-irefindex13.0-uniprot.txt). The actual originating site for this dataset is down.