Merged

44 commits
15c1ba8
added Diseases data and processing
AMINOexe Jul 15, 2025
5945aa2
added diseases vizualization
AMINOexe Jul 15, 2025
8b69ac7
generated gold standard files
AMINOexe Jul 28, 2025
3c01555
refactored disease scripts
AMINOexe Jul 28, 2025
c196a58
Merge branch 'main' into diseases_dataset
tristan-f-r Jul 29, 2025
3028762
fix: rm DS store
tristan-f-r Jul 29, 2025
96bf124
updated diseases config
AMINOexe Jul 29, 2025
9039d6e
updated diseases config
AMINOexe Jul 29, 2025
bea5d95
style: fmt
tristan-f-r Jul 29, 2025
31a4c27
style: fmt
tristan-f-r Jul 29, 2025
0923d1d
fix: drop dsstore
tristan-f-r Jul 29, 2025
ba4c568
Merge branch 'main' into diseases_dataset
tristan-f-r Jul 30, 2025
a36f3ea
refactor: begin moving aroun
tristan-f-r Jul 30, 2025
bf56e7b
chore: some cleanup
tristan-f-r Jul 30, 2025
5e7bf8e
feat: tiga / DO fetching
tristan-f-r Jul 30, 2025
3db1d02
fix: use provided cwd path for string
tristan-f-r Jul 30, 2025
31e0173
added readme with information about summer work
annaritz Nov 3, 2025
6b2cce9
Merge branch 'main' into diseases_dataset
tristan-f-r Dec 14, 2025
3e642b9
Merge branch 'main' into diseases_dataset
tristan-f-r Dec 14, 2025
047f093
style: fmt
tristan-f-r Dec 14, 2025
e543755
fix: create directories, make snakemake
tristan-f-r Dec 15, 2025
6921df5
chore: drop unnecessary cmt
tristan-f-r Dec 15, 2025
bfa2fb6
use csv instead of pickle, merge diseases yaml to dmmm yaml
tristan-f-r Dec 15, 2025
0350891
chore: mv around files, correctly use non-pickled file
tristan-f-r Dec 15, 2025
c273418
Merge branch 'main' into diseases_dataset
tristan-f-r Dec 19, 2025
96f01d4
chore: bump spras
tristan-f-r Dec 19, 2025
168f580
Merge branch 'main' into diseases_dataset
tristan-f-r Dec 19, 2025
1117233
feat: cache, databases
tristan-f-r Dec 22, 2025
728dbb6
fix: properly download cache items
tristan-f-r Dec 23, 2025
4f04bcf
chore: cleanup
tristan-f-r Dec 23, 2025
fce53cb
feat: consider aliases
tristan-f-r Dec 23, 2025
3b07532
feat(inputs.py): drop use of STRING API
tristan-f-r Dec 23, 2025
eba0fda
feat: use protein aliases for gold_standard
tristan-f-r Dec 23, 2025
be5253f
fix: drop duplicates of non str_id subset
tristan-f-r Dec 23, 2025
2846eb9
style: fmt
tristan-f-r Dec 24, 2025
21a6961
chore: bump
tristan-f-r Dec 24, 2025
775af24
fix: correct snakefile
tristan-f-r Dec 24, 2025
b1d4ced
fix: preserve str_id
tristan-f-r Dec 24, 2025
50b8aa4
feat: setup biomart fetching
tristan-f-r Dec 26, 2025
45b3304
feat: use biomart instead of the gconvert wrapper
tristan-f-r Dec 26, 2025
964c3d9
docs(biomart): provide context to biomart folder
tristan-f-r Dec 26, 2025
1a43d73
docs: more biomart
tristan-f-r Dec 26, 2025
5e85513
config: correct dmmm prefix
tristan-f-r Dec 26, 2025
832e773
chore: drop dataPreb.ipynb
tristan-f-r Dec 26, 2025
3 changes: 3 additions & 0 deletions .gitignore
@@ -164,3 +164,6 @@ cython_debug/

# pnpm
.pnpm-store

# mac
.DS_Store
Empty file added __init__.py
Empty file.
Empty file added cache/__init__.py
Empty file.
3 changes: 3 additions & 0 deletions cache/biomart/README.md
@@ -0,0 +1,3 @@
# BioMart XML Queries

Directory for storing XML queries generated from [the BioMart interface](https://www.ensembl.org/info/data/biomart/index.html).
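
These queries are submitted through the BioMart REST API: `fetch_biomart_url` in `cache/directory.py` URL-encodes the XML onto the `martservice` endpoint. A minimal sketch of what that amounts to:

```python
import urllib.parse
from pathlib import Path

# Build the REST URL for a stored query, mirroring fetch_biomart_url.
xml = Path("cache/biomart/ensg-ensp.xml").read_text()
url = "http://www.ensembl.org/biomart/martservice?query=" + urllib.parse.quote_plus(xml)
```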
9 changes: 9 additions & 0 deletions cache/biomart/ensg-ensp.xml
@@ -0,0 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" >

	<Dataset name = "hsapiens_gene_ensembl" interface = "default" >
		<Attribute name = "ensembl_peptide_id" />
		<Attribute name = "ensembl_gene_id" />
	</Dataset>
</Query>
100 changes: 100 additions & 0 deletions cache/directory.py
@@ -0,0 +1,100 @@
from dataclasses import dataclass
from typing import Union
from os import PathLike
from tempfile import NamedTemporaryFile
import urllib.request
import filecmp
import urllib.parse
import os
from pathlib import Path

import gdown

dir_path = Path(os.path.dirname(os.path.realpath(__file__)))

def fetch_biomart_url(xml: str) -> str:
    """
    Access BioMart data through the BioMart REST API:
    https://useast.ensembl.org/info/data/biomart/biomart_restful.html#biomartxml
    """
    ROOT = "http://www.ensembl.org/biomart/martservice?query="
    return ROOT + urllib.parse.quote_plus(xml)

@dataclass
class CacheItem:
    """Class for differentiating between offline and online items in a cache."""

    cached: str
    online: str

    def download(self, output: str | PathLike):
        print(f"Downloading {self.online}...")

        urllib.request.urlretrieve(self.online, output)

        with NamedTemporaryFile() as cached_file:
            print(f"Downloading cache {self.cached}...")
            gdown.download(self.cached, cached_file)
            print("Checking that downloaded artifact matches with cached artifact...")
            # Fail loudly if the fresh download does not match the cached copy.
            if not filecmp.cmp(output, cached_file.name, shallow=False):
                raise ValueError(f"Downloaded artifact {output} does not match its cached copy {self.cached}")


CacheDirectory = dict[str, Union[CacheItem, "CacheDirectory"]]

# An *unversioned* directory list.
directory: CacheDirectory = {
    "STRING": {
        "9606": {
            "links": CacheItem(
                cached="https://drive.google.com/uc?id=1fvjdIbgzbgJrdJxWRRRwwS1zuegf6DOj",
                online="http://stringdb-downloads.org/download/protein.links.v12.0/9606.protein.links.v12.0.txt.gz",
            ),
            "aliases": CacheItem(
                cached="https://drive.google.com/uc?id=1IWrQeTVCcw1A-jDk-4YiReWLnwP0S9bY",
                online="https://stringdb-downloads.org/download/protein.aliases.v12.0/9606.protein.aliases.v12.0.txt.gz",
            ),
        }
    },
    "DISEASES": {
        # Instead of going through https://unmtid-shinyapps.net/shiny/tiga/, we use
        # their archived files directory.
        "tiga_gene-trait_stats.tsv": CacheItem(
            cached="https://drive.google.com/uc?id=114qyuNDy4qdmYDHHJAW-yBeTxcGTDUnK",
            online="https://unmtid-dbs.net/download/TIGA/20250916/tiga_gene-trait_stats.tsv",
        ),
        "HumanDO.tsv": CacheItem(
            cached="https://drive.google.com/uc?id=1lfB1DGJgrXTxP_50L6gGu_Nq6OyDjiIi",
            online="https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/016a4ec33d1a1508d669650086cd92ccebe138e6/DOreports/HumanDO.tsv",
        ),
        "human_disease_textmining_filtered.tsv": CacheItem(
            cached="https://drive.google.com/uc?id=1vD8KbT9sk04VEJx9r3_LglCTGYJdhN0D",
            online="https://download.jensenlab.org/human_disease_textmining_filtered.tsv",
        ),
        "human_disease_knowledge_filtered.tsv": CacheItem(
            cached="https://drive.google.com/uc?id=1qGUnjVwF9-8p5xvp8_6CfVsbMSM_wkld",
            online="https://download.jensenlab.org/human_disease_knowledge_filtered.tsv",
        ),
    },
    "BioMart": {
        "ensg-ensp.tsv": CacheItem(
            cached="https://drive.google.com/uc?id=1-gPrDoluXIGydzWKjWEnW-nWhYu3YkHL",
            online=fetch_biomart_url((dir_path / "biomart" / "ensg-ensp.xml").read_text()),
        )
    },
}


def get_cache_item(path: list[str]) -> CacheItem:
    """Takes a path and gets the underlying cache item."""
    assert len(path) != 0

    current_item = directory
    for entry in path:
        if isinstance(current_item, CacheItem):
            raise ValueError(f"Path {path} leads to a cache item too early!")
        current_item = current_item[entry]

    if not isinstance(current_item, CacheItem):
        raise ValueError(f"Path {path} doesn't lead to a cache item")

    return current_item
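
A usage sketch of this module, mirroring how `datasets/diseases/scripts/fetch.py` and `databases/stringdb.py` below call into it:

```python
from pathlib import Path

from cache.directory import get_cache_item

# Resolve a nested entry in the directory tree, then download it;
# CacheItem.download also checks the fresh file against the Drive-cached copy.
raw_dir = Path("raw")
raw_dir.mkdir(exist_ok=True)
get_cache_item(["DISEASES", "HumanDO.tsv"]).download(raw_dir / "HumanDO.tsv")
```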
1 change: 1 addition & 0 deletions cache/index.py
@@ -0,0 +1 @@
# Artifact caching
24 changes: 24 additions & 0 deletions configs/dmmm.yaml
@@ -58,3 +58,27 @@ datasets:
edge_files: ["network1.txt"]
other_files: []
data_dir: "datasets/yeast-osmotic-stress/processed"
- label: dmmm_alopecia_areata
data_dir: datasets/diseases
edge_files:
- raw/string_interactome.txt
node_files:
- prize_files/alopecia_areata_prizes.txt
other_files: []
- label: dmmm_diabetes_mellitus
data_dir: datasets/diseases
edge_files:
- raw/string_interactome.txt
node_files:
- prize_files/diabetes_mellitus_prizes.txt
other_files: []

gold_standards:
- label: gs0
node_files: ['GS_files/Alopecia_areata_GS.txt']
data_dir: "datasets/diseases"
dataset_labels: ["dmmm_alopecia_areata"]
- label: gs1
node_files: ['GS_files/Diabetes_mellitus_GS.txt']
data_dir: "datasets/diseases"
dataset_labels: ["dmmm_diabetes_mellitus"]
1 change: 1 addition & 0 deletions databases/.gitignore
@@ -0,0 +1 @@
/string
3 changes: 3 additions & 0 deletions databases/README.md
@@ -0,0 +1,3 @@
# databases

A catalog of CLIs wrapping various common background PPI databases.
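
For example, the STRING wrapper below is invoked from the `datasets/diseases` Snakefile as `uv run ../../databases/stringdb.py --id 9606`, which downloads and uncompresses the human links and aliases files into `databases/string/` (kept out of version control by the `.gitignore` above).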
Empty file added databases/__init__.py
Empty file.
57 changes: 57 additions & 0 deletions databases/stringdb.py
@@ -0,0 +1,57 @@
import argparse
import gzip
import os
from pathlib import Path
import shutil

from cache.directory import get_cache_item

# https://stackoverflow.com/a/5137509/7589775
dir_path = os.path.dirname(os.path.realpath(__file__))

string_path = Path(dir_path, "string")


def parse_args():
    parser = argparse.ArgumentParser(
        prog="STRING DB Fetcher", description="Downloads STRING DB background interactomes for a specified organism."
    )

    parser.add_argument(
        "-i",
        "--id",
        help="""
        The specified organism ID to use.
        See https://string-db.org/cgi/download for more info.
        For example, 9606 is the homo sapiens background interactome.
        For an example usage, see datasets/diseases's Snakefile.
        """,
        type=int,
        required=True,
    )

    return parser.parse_args()

def uncompress(source: Path, target: Path):
    """Uncompresses a .gz file"""
    # Uncompressing a .gz file: https://stackoverflow.com/a/44712152/7589775
    with gzip.open(source, "rb") as f_compressed:
        with open(target, "wb") as f_uncompressed:
            shutil.copyfileobj(f_compressed, f_uncompressed)

def main():
    args = parse_args()
    string_path.mkdir(exist_ok=True)

    # We download the links file
    links_file = string_path / f"{args.id}.protein.links.v12.0.txt.gz"
    get_cache_item(["STRING", str(args.id), "links"]).download(links_file)
    uncompress(links_file, links_file.with_suffix(""))  # with_suffix("") strips the trailing `.gz` suffix

    # and its associated aliases
    aliases_file = string_path / f"{args.id}.protein.aliases.v12.0.txt.gz"
    get_cache_item(["STRING", str(args.id), "aliases"]).download(aliases_file)
    uncompress(aliases_file, aliases_file.with_suffix(""))

if __name__ == "__main__":
    main()
9 changes: 9 additions & 0 deletions datasets/diseases/.gitignore
@@ -0,0 +1,9 @@
*.tsv
*.pkl
data

# prize and gold standard files
GS_files
prize_files
raw
Pickles
61 changes: 61 additions & 0 deletions datasets/diseases/README.md
@@ -0,0 +1,61 @@
# GWAS-based Disease Gene Prediction

In this dataset collection, we identify a number of disease-related trait-gene associations from a GWAS database (TIGA). This resource is one of many that are integrated into the DISEASES database, which predicts disease-gene associations.

Here, we ask: **how well does GWAS data predict disease-gene associations when they are considered prize nodes within a protein interactome?**

The inputs are the GWAS trait-gene associations (TIGA) and the interactome (STRING-DB). The gold standard dataset is the DISEASES database, which uses other sources of evidence (such as co-occurrence in texts) to establish disease-gene associations.

## DISEASES Database

This dataset comes from the [DISEASES Database](https://diseases.jensenlab.org/About). Relevant papers include:
- Grissa et al., [DISEASES 2.0: a weekly updated database of disease–gene associations from text mining and data integration](https://academic.oup.com/database/article/doi/10.1093/database/baac019/6554833). DATABASE 2022.
- Pletscher-Frankild et al., [DISEASES: Text mining and data integration of disease-gene associations](https://www.sciencedirect.com/science/article/pii/S1046202314003831). Methods, 2015.

Lars Juhl Jensen's lab developed and maintains the STRING database (in collaboration with other groups). The DISEASES Database scores disease-gene associations in the same manner as the text-mining scores in STRING v9.1.

Additionally, the DISEASES Database is updated weekly, which makes it a good source of current information, but it also means we should take care to record the dates/times at which the database was accessed.

The DISEASES Database has three channels: text mining, knowledge, and experiments. **We only consider the text mining and knowledge channels when building the gold standard, to avoid overlapping data with the inputs.**

The data can be obtained from [their Downloads page](https://diseases.jensenlab.org/Downloads).

## TIGA

The most recent DISEASES paper (Grissa et al. 2022) integrates a GWAS database called Target Illumination by GWAS Analytics (TIGA), also by the Jensen lab:
- Yang et al., [TIGA: target illumination GWAS analytics](https://academic.oup.com/bioinformatics/article/37/21/3865/6292081). Bioinformatics 2021.

TIGA calculates confidence scores for gene-trait associations across genome-wide association studies. They include both citation-based and SNP-based measures in their confidence scores (their mean rank scores); we only take their SNP data for the inputs. Each SNP is weighted by an inverse exponential of its distance to the gene to handle linkage disequilibrium, as described in their paper (the resulting weighted count is called `N_snpw`). The SNPs themselves are collected from an Ensembl pipeline - TIGA does not do any novel mapping.
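
(In their notation, a SNP at distance `d` from the gene contributes a weight that decays exponentially with distance, something like `w(d) = exp(-d / d0)`; treat this form as a placeholder, since the exact decay constant and parameterization are defined in the TIGA paper. `N_snpw` is then the sum of these weights over the trait's SNPs.)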

The TIGA gene-trait association data can be obtained from [their shiny app page](https://unmtid-shinyapps.net/shiny/tiga/), either as the full set of gene-trait associations or from their [archived files](https://unmtid-dbs.net/download/TIGA/).

## Disease Ontology

Finally, we use the Disease Ontology to get from gene-trait associations to gene-disease associations by limiting the traits to diseases. The Disease Ontology data can be obtained from [their Downloads page](https://disease-ontology.org/downloads/).

## Putting it all together

We hashed out this pipeline on the whiteboard in July:

![whiteboard-image](figs/DISEASES-board.jpg)

Briefly, the steps are:

**A. Gold Standard Dataset Generation**:
- Use the text mining and knowledge channels from DISEASES.
- For every disease-gene association, take the max confidence value across those two channels (we believe the confidence scores aren't averaged, though averaging would also make sense - we should double-check).
- Remove all disease-gene associations that have a confidence score of less than 4 (retain all with scores of 4 or 5 out of 5). Call these "high confidence disease-gene pairs."
- Then, remove every disease that has fewer than 10 high confidence disease-gene pairs.

By our count, 41 diseases pass these filters, each with 10 or more high confidence disease-gene pairs.
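
A minimal pandas sketch of step A, assuming both channels have already been loaded into DataFrames with hypothetical `gene`, `disease`, and `confidence` columns (the real download files use their own headers):

```python
import pandas as pd


def build_gold_standard(textmining: pd.DataFrame, knowledge: pd.DataFrame) -> pd.DataFrame:
    """Illustrative sketch; inputs are assumed to have (gene, disease, confidence) columns."""
    # Max confidence across the two channels for each disease-gene pair.
    pairs = (
        pd.concat([textmining, knowledge])
        .groupby(["disease", "gene"], as_index=False)["confidence"]
        .max()
    )
    # Keep high confidence pairs (scores of 4 or 5 out of 5)...
    high = pairs[pairs["confidence"] >= 4]
    # ...then keep only diseases with at least 10 such pairs.
    counts = high.groupby("disease")["gene"].transform("size")
    return high[counts >= 10]
```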

**B. GWAS Dataset Creation**:
- Take the TIGA trait-gene associations and the Disease Ontology (DO) annotations.
- Retain all TIGA trait-gene associations where the trait is in the Disease Ontology. Call these "DO-gene associations". Every gene in these associations carries an `N_snpw` score.
- Retain the DO-gene associations for the 41 diseases from the gold standard dataset, as in the sketch below. (We discussed a version 2 where we also run DO-gene associations for diseases _not_ in the validation set; that's a later project.)
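
A companion sketch for step B, under the same assumptions (hypothetical column names; the real TIGA file has its own headers):

```python
import pandas as pd


def build_do_gene(tiga: pd.DataFrame, do_ids: set[str], gs_diseases: set[str]) -> pd.DataFrame:
    """Illustrative sketch; `tiga` is assumed to have (trait, gene, N_snpw) columns,
    `do_ids` the set of Disease Ontology trait IDs, and `gs_diseases` the
    41 diseases retained in step A."""
    # Keep trait-gene associations whose trait is a disease in the DO.
    do_gene = tiga[tiga["trait"].isin(do_ids)]
    # Restrict to the diseases that survived the gold standard filters.
    return do_gene[do_gene["trait"].isin(gs_diseases)]
```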

**C. SPRAS Inputs**:
- Use the STRING-DB interactome (there is a benchmark file for the DISEASES database with STRING v9.1, but we might want to use the most recent STRING version).
- Each of the 41 diseases will be a separate node prizes dataset. For each disease, convert the `N_snpw` scores into prizes and make a `node-prizes.txt` file (sketched below).
- Each of the 41 diseases will have a validation dataset comprising the high confidence disease-gene pairs from the DISEASES text mining and/or knowledge channels. Each pair has a score (a 4 or a 5), but we assumed we would consider them all "high confidence" and thus treat them as a gene set.
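
Finally, a sketch of writing the per-disease prize files for step C. Using `N_snpw` directly as the prize, the file naming, and the `NODEID`/`prize` header are all assumptions here - the actual score-to-prize transformation and output format may differ:

```python
from pathlib import Path

import pandas as pd


def write_prize_files(do_gene: pd.DataFrame, out_dir: str = "prize_files") -> None:
    """Illustrative sketch; assumes `do_gene` has (trait, gene, N_snpw) columns."""
    out = Path(out_dir)
    out.mkdir(exist_ok=True)
    for disease, group in do_gene.groupby("trait"):
        # e.g. "Alopecia areata" -> prize_files/alopecia_areata_prizes.txt
        name = str(disease).lower().replace(" ", "_")
        group[["gene", "N_snpw"]].to_csv(
            out / f"{name}_prizes.txt",
            sep="\t",
            header=["NODEID", "prize"],
            index=False,
        )
```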
56 changes: 56 additions & 0 deletions datasets/diseases/Snakefile
@@ -0,0 +1,56 @@
rule all:
    input:
        "GS_files/Alopecia_areata_GS.txt",
        "GS_files/Diabetes_mellitus_GS.txt",
        "prize_files/alopecia_areata_prizes.txt",
        "prize_files/diabetes_mellitus_prizes.txt"

rule of_db:
    output:
        "../../databases/string/9606.protein.links.v12.0.txt",
        "../../databases/string/9606.protein.aliases.v12.0.txt"
    shell:
        "uv run ../../databases/stringdb.py --id 9606"

rule fetch:
    output:
        "raw/human_disease_knowledge_filtered.tsv",
        "raw/human_disease_textmining_filtered.tsv",
        "raw/HumanDO.tsv",
        "raw/tiga_gene-trait_stats.tsv"
    shell:
        "uv run scripts/fetch.py"

rule inputs:
    input:
        "raw/HumanDO.tsv",
        "raw/tiga_gene-trait_stats.tsv",
        "../../databases/string/9606.protein.aliases.v12.0.txt"
    output:
        "data/inputs.csv"
    shell:
        "uv run scripts/inputs.py"

rule gold_standard:
    input:
        "raw/human_disease_knowledge_filtered.tsv",
        "raw/human_disease_textmining_filtered.tsv",
        "../../databases/string/9606.protein.aliases.v12.0.txt"
    output:
        "data/gold_standard.csv"
    shell:
        "uv run scripts/gold_standard.py"

rule files:
    input:
        "data/inputs.csv",
        "data/gold_standard.csv",
        "../../databases/string/9606.protein.links.v12.0.txt"
    output:
        # These are the two we use for the SPRAS run for now
        "GS_files/Alopecia_areata_GS.txt",
        "GS_files/Diabetes_mellitus_GS.txt",
        "prize_files/alopecia_areata_prizes.txt",
        "prize_files/diabetes_mellitus_prizes.txt"
    shell:
        "uv run scripts/files.py"
Binary file added datasets/diseases/figs/DISEASES-board.jpg
44 changes: 44 additions & 0 deletions datasets/diseases/scripts/fetch.py
@@ -0,0 +1,44 @@
"""
Fetches the latest DISEASES database channels, TIGA data, and human disease ontology data that we need.
Download pages:
- DISEASES: https://diseases.jensenlab.org/Downloads
- TIGA: https://unmtid-shinyapps.net/shiny/tiga/
- Disease Ontology: https://disease-ontology.org/downloads/
"""

from pathlib import Path
import os
from cache.directory import get_cache_item

# https://stackoverflow.com/a/5137509/7589775
dir_path = os.path.dirname(os.path.realpath(__file__))

raw_dir = Path(dir_path, "..", "raw")


def main():
# We only need the text mining and knowledge channels
# and avoid the integrated channel as it is the multiplied probabilities of all
# three channels (personal correspondence with Damian Szklarczyk)

raw_dir.mkdir(exist_ok=True)

print("Fetching DISEASES text channel...")
get_cache_item(["DISEASES", "human_disease_textmining_filtered.tsv"]).download(raw_dir / "human_disease_textmining_filtered.tsv")

print("Fetching DISEASES knowledge channel...")
get_cache_item(["DISEASES", "human_disease_knowledge_filtered.tsv"]).download(raw_dir / "human_disease_knowledge_filtered.tsv")

print("Fetching TIGA data...")
get_cache_item(["DISEASES", "tiga_gene-trait_stats.tsv"]).download(raw_dir / "tiga_gene-trait_stats.tsv")

print("Fetching human disease ontology data...")
get_cache_item(["DISEASES", "HumanDO.tsv"]).download(raw_dir / "HumanDO.tsv")

print("Fetching BioMart ENSG - ENSP mapping...")
get_cache_item(["BioMart", "ensg-ensp.tsv"]).download(raw_dir / "ensg-ensp.tsv")


if __name__ == "__main__":
main()