Reed-CompBio · tristan-f-r · Dec 30, 2025 · Jul 30, 2025 · Jul 30, 2025 · Jul 31, 2025
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
@@ -44,24 +44,24 @@ jobs:
           environment-file: spras/environment.yml
           auto-activate-base: false
           miniconda-version: 'latest'
-      # Install spras in the environment using pip
       - name: Install spras in conda env
+        # Install spras in the environment using pip
         shell: bash --login {0}
         run: pip install ./spras
-      # Log conda environment contents
       - name: Log conda environment
+        # Log conda environment contents
         shell: bash --login {0}
         run: conda list
       - name: Process raw data through Snakemake
         run: sh run_snakemake.sh
       - name: Run Snakemake workflow for DMMMs
         shell: bash --login {0}
         run: snakemake --cores 4 --configfile configs/dmmm.yaml --show-failed-logs -s spras/Snakefile
-      # TODO: re-enable PRAs once RN/synthetic data PRs are merged.
       # - name: Run Snakemake workflow for PRAs
       #   shell: bash --login {0}
       #   run: snakemake --cores 1 --configfile configs/pra.yaml --show-failed-logs -s spras/Snakefile
       - name: Setup PNPM
+        # TODO: re-enable the commented-out PRA workflow step above once RN/synthetic data PRs are merged.
         uses: pnpm/action-setup@v4
         with:
           version: 10

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -10,11 +10,11 @@ repos:
     hooks:
       # Attempts to load all yaml files to verify syntax.
       - id: check-yaml
-      # Attempts to load all TOML files to verify syntax.
       - id: check-toml
-      # Trims trailing whitespace.
+        # Attempts to load all TOML files to verify syntax.
       - id: trailing-whitespace
+        # Trims trailing whitespace.
         # Preserves Markdown hard linebreaks.
         args: [--markdown-linebreak-ext=md]
         # Do not trim whitespace from all files, input files may need trailing whitespace for empty values in columns.
         types_or: [markdown, python, yaml]

diff --git a/configs/dmmm.yaml b/configs/dmmm.yaml
@@ -55,8 +55,8 @@ datasets:
     edge_files: ["phosphosite-irefindex13.0-uniprot.txt"]
     other_files: []
     data_dir: "datasets/hiv/processed"
-  # Yeast: https://github.com/tristan-f-r/spras-benchmarking/blob/9477d85871024a5e3a4b0b8b9be7e78c0d0ee961/yeast-osmotic-stress/config.yaml
   - label: dmmmyeast
+    # Yeast: https://github.com/tristan-f-r/spras-benchmarking/blob/9477d85871024a5e3a4b0b8b9be7e78c0d0ee961/yeast-osmotic-stress/config.yaml
     node_files: ["prizes1_dummies.txt"]
     edge_files: ["network1.txt"]
     other_files: []

diff --git a/datasets/depmap/README.md b/datasets/depmap/README.md
@@ -0,0 +1,34 @@
+# Cancer Dependency Map Dataset 
+
+This folder contains the processed data and the scripts for data analysis and preparation of datasets from the Cancer Dependency Map (DepMap), an initiative led by the Broad Institute that provides large-scale omics data for identifying cancer dependencies/vulnerabilities.
+
+You can read more about DepMap and the projects it includes here: https://www.broadinstitute.org/cancer/cancer-dependency-map
+
+## raw data 
+You can visit the DepMap all data downloads portal at: https://depmap.org/portal/data_page/?tab=allData 
+Download the following datasets from the primary files section and move them to the raw folder. The dataset descriptions from the website are also included:
+
+
+- OmicsProfiles.csv: Omics metadata and ID mapping information for files indexed by Profile ID. This dataset is used for mapping cell line names to DepMap model IDs as a basis for data processing. (file URL: https://depmap.org/portal/data_page/?tab=allData&releasename=DepMap%20Public%2025Q2&filename=OmicsProfiles.csv)
+- CRISPRGeneDependency.csv: Gene dependency probability estimates for all models in the integrated gene effect.
+This dataset is used to identify gold standard genes in each cell line; a dependency probability cutoff of 0.5 is currently used to select the genes with considerable impact on the cell line. (file URL: https://depmap.org/portal/data_page/?tab=allData&releasename=DepMap%20Public%2025Q2&filename=CRISPRGeneDependency.csv)
+- OmicsCNGeneWGS.csv: Gene-level copy number data inferred from WGS data only. Additional copy number datasets are available for download as part of the full DepMap Data Release. (file URL: https://depmap.org/portal/data_page/?tab=allData&releasename=DepMap%20Public%2025Q2&filename=OmicsCNGeneWGS.csv)
+- OmicsSomaticMutationsMatrixDamaging.csv: Genotyped matrix determining for each cell line whether each gene has at least one damaging mutation. A variant is considered a damaging mutation if LikelyLoF == True. (0 == no mutation; If there is one or more damaging mutations in the same gene for the same cell line, the allele frequencies are summed, and if the sum is greater than 0.95, a value of 2 is assigned and if not, a value of 1 is assigned.). This dataset is used to prepare the input prize file. (file URL: https://depmap.org/portal/data_page/?tab=allData&releasename=DepMap%20Public%2025Q2&filename=OmicsSomaticMutationsMatrixDamaging.csv) 
+- OmicsExpressionProteinCodingGenesTPMLogp1.csv: Model-level TPMs derived from Salmon v1.10.0 (Patro et al 2017). Rows: Model IDs; Columns: Gene names. (file URL: https://depmap.org/portal/data_page/?tab=allData&releasename=DepMap%20Public%2025Q2&filename=OmicsExpressionProteinCodingGenesTPMLogp1.csv)
+
+## scripts
+Currently this contains only the Jupyter notebook used to analyze the dependency data and run the data processing locally to produce the input prize file and gold standards. It should be reproducible for any cell line name, but it is not yet organized or refined for GitHub.
+
+## processed data 
+Files used for UniProt ID mapping:
+- Gene symbols parsed
+- Gene symbols mapped to UniProt IDs
+- Folder of processed data from an attempt to do UniProt mapping with the gene index numbers instead. This attempt got stuck due to duplicate matches for the same gene number. A future step could be to refer to the original mutations file (OmicsSomaticMutations.csv on DepMap, URL: https://depmap.org/portal/data_page/?tab=allData&releasename=DepMap%20Public%2025Q2&filename=OmicsSomaticMutations.csv) for gene numbers with duplicate matches, and resolve them to exact matches by checking where the mutation is located, to get more accurate mappings.
+
+Started processing with the FADU cell line: 
+- input prize file prepared from the damaging mutations dataset 
+- gold standard file prepared from the CRISPR gene dependency dataset
+
+## config 
+Example config file used to get preliminary results with OmicsIntegrator1 and OmicsIntegrator2, following the EGFR dataset example. More parameters will be tested and this file will be updated accordingly.
+
diff --git a/datasets/depmap/config/cellline_fadu.yaml b/datasets/depmap/config/cellline_fadu.yaml
@@ -0,0 +1,94 @@
+# The length of the hash used to identify a parameter combination
+hash_length: 7
+
+# If true, use Singularity instead of Docker
+# Singularity support is only available on Unix
+singularity: false
+
+algorithms:  # one list entry per algorithm; each entry has a name and a params mapping
+  -
+    name: pathlinker
+    params:
+      include: false  # presumably excludes this algorithm from the run - TODO confirm against the SPRAS config docs
+      run1:  # named parameter set
+        k:
+          - 10
+          - 20
+  -
+    name: omicsintegrator1
+    params:
+      include: true  # the dataset README names OmicsIntegrator1 and 2 as the algorithms used for preliminary results
+      run1:
+        b:  # three values listed; presumably each is tried as a separate parameter combination - TODO confirm
+          - 0.55
+          - 2
+          - 10
+        d:
+          - 10
+        g:
+          - 1e-3  # NOTE(review): no decimal point, so YAML 1.1 loaders (e.g. PyYAML) may read this as the string "1e-3", not a float - confirm the consumer handles it
+        r:
+          - 0.01
+        w:
+          - 0.1
+        mu:
+          - 0.008
+        dummy_mode: ["terminal"]
+  -
+    name: omicsintegrator2
+    params:
+      include: true
+      run1:
+        b:
+          - 4
+        g:
+          - 0
+      run2:  # second named parameter set for the same algorithm, alongside run1
+        b:
+          - 2
+        g:
+          - 3
+  -
+    name: meo
+    params:
+      include: false
+      run1:
+        local_search:
+          - "Yes"  # quoted so it stays the string "Yes" instead of being read as a YAML boolean
+        max_path_length:
+          - 3
+        rand_restarts:
+          - 10
+  -
+    name: domino
+    params:
+      include: false
+      run1:
+        slice_threshold:
+          - 0.3
+        module_threshold:
+          - 0.05
+datasets:  # a single dataset entry is defined here
+  -
+    data_dir: input
+    edge_files:
+      - phosphosite-irefindex13.0-uniprot.txt  # same edge file name that configs/dmmm.yaml references
+    label: cellline
+    node_files:
+      - cellline_fadu_nodes.txt  # presumably the FADU input prize file described in the dataset README - TODO confirm
+    other_files: []
+reconstruction_settings:
+  locations:
+    reconstruction_dir: output/cellline_fadu
+  run: true
+analysis:  # per-analysis include flags; only cytoscape and summary are set to true here
+  graphspace:
+    include: false
+  cytoscape:
+    include: true
+  summary:
+    include: true
+  ml:
+    include: false
+  evaluation:
+    include: false