Merged

44 commits
15c1ba8
added Diseases data and processing
AMINOexe Jul 15, 2025
5945aa2
added diseases vizualization
AMINOexe Jul 15, 2025
8b69ac7
generated gold standard files
AMINOexe Jul 28, 2025
3c01555
refactored disease scripts
AMINOexe Jul 28, 2025
c196a58
Merge branch 'main' into diseases_dataset
tristan-f-r Jul 29, 2025
3028762
fix: rm DS store
tristan-f-r Jul 29, 2025
96bf124
updated diseases config
AMINOexe Jul 29, 2025
9039d6e
updated diseases config
AMINOexe Jul 29, 2025
bea5d95
style: fmt
tristan-f-r Jul 29, 2025
31a4c27
style: fmt
tristan-f-r Jul 29, 2025
0923d1d
fix: drop dsstore
tristan-f-r Jul 29, 2025
ba4c568
Merge branch 'main' into diseases_dataset
tristan-f-r Jul 30, 2025
a36f3ea
refactor: begin moving aroun
tristan-f-r Jul 30, 2025
bf56e7b
chore: some cleanup
tristan-f-r Jul 30, 2025
5e7bf8e
feat: tiga / DO fetching
tristan-f-r Jul 30, 2025
3db1d02
fix: use provided cwd path for string
tristan-f-r Jul 30, 2025
31e0173
added readme with information about summer work
annaritz Nov 3, 2025
6b2cce9
Merge branch 'main' into diseases_dataset
tristan-f-r Dec 14, 2025
3e642b9
Merge branch 'main' into diseases_dataset
tristan-f-r Dec 14, 2025
047f093
style: fmt
tristan-f-r Dec 14, 2025
e543755
fix: create directories, make snakemake
tristan-f-r Dec 15, 2025
6921df5
chore: drop unnecessary cmt
tristan-f-r Dec 15, 2025
bfa2fb6
use csv instead of pickle, merge diseases yaml to dmmm yaml
tristan-f-r Dec 15, 2025
0350891
chore: mv around files, correctly use non-pickled file
tristan-f-r Dec 15, 2025
c273418
Merge branch 'main' into diseases_dataset
tristan-f-r Dec 19, 2025
96f01d4
chore: bump spras
tristan-f-r Dec 19, 2025
168f580
Merge branch 'main' into diseases_dataset
tristan-f-r Dec 19, 2025
1117233
feat: cache, databases
tristan-f-r Dec 22, 2025
728dbb6
fix: properly download cache items
tristan-f-r Dec 23, 2025
4f04bcf
chore: cleanup
tristan-f-r Dec 23, 2025
fce53cb
feat: consider aliases
tristan-f-r Dec 23, 2025
3b07532
feat(inputs.py): drop use of STRING API
tristan-f-r Dec 23, 2025
eba0fda
feat: use protein aliases for gold_standard
tristan-f-r Dec 23, 2025
be5253f
fix: drop duplicates of non str_id subset
tristan-f-r Dec 23, 2025
2846eb9
style: fmt
tristan-f-r Dec 24, 2025
21a6961
chore: bump
tristan-f-r Dec 24, 2025
775af24
fix: correct snakefile
tristan-f-r Dec 24, 2025
b1d4ced
fix: preserve str_id
tristan-f-r Dec 24, 2025
50b8aa4
feat: setup biomart fetching
tristan-f-r Dec 26, 2025
45b3304
feat: use biomart instead of the gconvert wrapper
tristan-f-r Dec 26, 2025
964c3d9
docs(biomart): provide context to biomart folder
tristan-f-r Dec 26, 2025
1a43d73
docs: more biomart
tristan-f-r Dec 26, 2025
5e85513
config: correct dmmm prefix
tristan-f-r Dec 26, 2025
832e773
chore: drop dataPreb.ipynb
tristan-f-r Dec 26, 2025
3 changes: 3 additions & 0 deletions .gitignore
@@ -164,3 +164,6 @@ cython_debug/

# pnpm
.pnpm-store

# mac
.DS_Store
Empty file added __init__.py
Empty file.
Empty file added cache/__init__.py
Empty file.
3 changes: 3 additions & 0 deletions cache/biomart/README.md
@@ -0,0 +1,3 @@
# BioMart XML Queries

Directory for storing XML queries generated from [the BioMart interface](https://www.ensembl.org/info/data/biomart/index.html).
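
These queries are submitted through the BioMart REST API: `fetch_biomart_url` in `cache/directory.py` URL-encodes the XML onto the `martservice` endpoint. A minimal sketch of what that amounts to:

```python
import urllib.parse
from pathlib import Path

# Build the REST URL for a stored query, mirroring fetch_biomart_url.
xml = Path("cache/biomart/ensg-ensp.xml").read_text()
url = "http://www.ensembl.org/biomart/martservice?query=" + urllib.parse.quote_plus(xml)
```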
9 changes: 9 additions & 0 deletions cache/biomart/ensg-ensp.xml
@@ -0,0 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" >

	<Dataset name = "hsapiens_gene_ensembl" interface = "default" >
		<Attribute name = "ensembl_peptide_id" />
		<Attribute name = "ensembl_gene_id" />
	</Dataset>
</Query>
100 changes: 100 additions & 0 deletions cache/directory.py
@@ -0,0 +1,100 @@
from dataclasses import dataclass
from typing import Union
from os import PathLike
from tempfile import NamedTemporaryFile
import urllib.request
import filecmp
import urllib.parse
import os
from pathlib import Path

import gdown

dir_path = Path(os.path.dirname(os.path.realpath(__file__)))

def fetch_biomart_url(xml: str) -> str:
    """
    Access BioMart data through the BioMart REST API:
    https://useast.ensembl.org/info/data/biomart/biomart_restful.html#biomartxml
    """
    ROOT = "http://www.ensembl.org/biomart/martservice?query="
    return ROOT + urllib.parse.quote_plus(xml)

@dataclass
class CacheItem:
    """Class for differentiating between offline and online items in a cache."""

    cached: str
    online: str

    def download(self, output: str | PathLike):
        print(f"Downloading {self.online}...")

        urllib.request.urlretrieve(self.online, output)

        with NamedTemporaryFile() as cached_file:
            print(f"Downloading cache {self.cached}...")
            gdown.download(self.cached, cached_file)
            print("Checking that downloaded artifact matches with cached artifact...")
            # Fail loudly if the fresh download does not match the cached copy.
            if not filecmp.cmp(output, cached_file.name, shallow=False):
                raise ValueError(f"Downloaded artifact {output} does not match its cached copy {self.cached}")


CacheDirectory = dict[str, Union[CacheItem, "CacheDirectory"]]

# An *unversioned* directory list.
directory: CacheDirectory = {
    "STRING": {
        "9606": {
            "links": CacheItem(
                cached="https://drive.google.com/uc?id=1fvjdIbgzbgJrdJxWRRRwwS1zuegf6DOj",
                online="http://stringdb-downloads.org/download/protein.links.v12.0/9606.protein.links.v12.0.txt.gz",
            ),
            "aliases": CacheItem(
                cached="https://drive.google.com/uc?id=1IWrQeTVCcw1A-jDk-4YiReWLnwP0S9bY",
                online="https://stringdb-downloads.org/download/protein.aliases.v12.0/9606.protein.aliases.v12.0.txt.gz",
            ),
        }
    },
    "DISEASES": {
        # Instead of going through https://unmtid-shinyapps.net/shiny/tiga/, we use
        # their archived files directory.
        "tiga_gene-trait_stats.tsv": CacheItem(
            cached="https://drive.google.com/uc?id=114qyuNDy4qdmYDHHJAW-yBeTxcGTDUnK",
            online="https://unmtid-dbs.net/download/TIGA/20250916/tiga_gene-trait_stats.tsv",
        ),
        "HumanDO.tsv": CacheItem(
            cached="https://drive.google.com/uc?id=1lfB1DGJgrXTxP_50L6gGu_Nq6OyDjiIi",
            online="https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/016a4ec33d1a1508d669650086cd92ccebe138e6/DOreports/HumanDO.tsv",
        ),
        "human_disease_textmining_filtered.tsv": CacheItem(
            cached="https://drive.google.com/uc?id=1vD8KbT9sk04VEJx9r3_LglCTGYJdhN0D",
            online="https://download.jensenlab.org/human_disease_textmining_filtered.tsv",
        ),
        "human_disease_knowledge_filtered.tsv": CacheItem(
            cached="https://drive.google.com/uc?id=1qGUnjVwF9-8p5xvp8_6CfVsbMSM_wkld",
            online="https://download.jensenlab.org/human_disease_knowledge_filtered.tsv",
        ),
    },
    "BioMart": {
        "ensg-ensp.tsv": CacheItem(
            cached="https://drive.google.com/uc?id=1-gPrDoluXIGydzWKjWEnW-nWhYu3YkHL",
            online=fetch_biomart_url((dir_path / "biomart" / "ensg-ensp.xml").read_text()),
        )
    },
}


def get_cache_item(path: list[str]) -> CacheItem:
    """Takes a path and gets the underlying cache item."""
    assert len(path) != 0

    current_item = directory
    for entry in path:
        if isinstance(current_item, CacheItem):
            raise ValueError(f"Path {path} leads to a cache item too early!")
        current_item = current_item[entry]

    if not isinstance(current_item, CacheItem):
        raise ValueError(f"Path {path} doesn't lead to a cache item")

    return current_item
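
A usage sketch of this module, mirroring how `datasets/diseases/scripts/fetch.py` and `databases/stringdb.py` below call into it:

```python
from pathlib import Path

from cache.directory import get_cache_item

# Resolve a nested entry in the directory tree, then download it;
# CacheItem.download also checks the fresh file against the Drive-cached copy.
raw_dir = Path("raw")
raw_dir.mkdir(exist_ok=True)
get_cache_item(["DISEASES", "HumanDO.tsv"]).download(raw_dir / "HumanDO.tsv")
```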
1 change: 1 addition & 0 deletions cache/index.py
@@ -0,0 +1 @@
# Artifact caching
24 changes: 24 additions & 0 deletions configs/dmmm.yaml
@@ -58,3 +58,27 @@ datasets:
edge_files: ["network1.txt"]
other_files: []
data_dir: "datasets/yeast-osmotic-stress/processed"
- label: dmmm_alopecia_areata
data_dir: datasets/diseases
edge_files:
- raw/string_interactome.txt
node_files:
- prize_files/alopecia_areata_prizes.txt
other_files: []
- label: dmmm_diabetes_mellitus
data_dir: datasets/diseases
edge_files:
- raw/string_interactome.txt
node_files:
- prize_files/diabetes_mellitus_prizes.txt
other_files: []

gold_standards:
- label: gs0
node_files: ['GS_files/Alopecia_areata_GS.txt']
data_dir: "datasets/diseases"
dataset_labels: ["dmmm_alopecia_areata"]
- label: gs1
node_files: ['GS_files/Diabetes_mellitus_GS.txt']
data_dir: "datasets/diseases"
dataset_labels: ["dmmm_diabetes_mellitus"]
1 change: 1 addition & 0 deletions databases/.gitignore
@@ -0,0 +1 @@
/string
3 changes: 3 additions & 0 deletions databases/README.md
@@ -0,0 +1,3 @@
# databases

A catalog of CLIs wrapping various common background PPI databases.
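
For example, the STRING wrapper below is invoked from the `datasets/diseases` Snakefile as `uv run ../../databases/stringdb.py --id 9606`, which downloads and uncompresses the human links and aliases files into `databases/string/` (kept out of version control by the `.gitignore` above).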
Empty file added databases/__init__.py
Empty file.
57 changes: 57 additions & 0 deletions databases/stringdb.py
@@ -0,0 +1,57 @@
import argparse
import gzip
import os
from pathlib import Path
import shutil

from cache.directory import get_cache_item

# https://stackoverflow.com/a/5137509/7589775
dir_path = os.path.dirname(os.path.realpath(__file__))

string_path = Path(dir_path, "string")


def parse_args():
    parser = argparse.ArgumentParser(
        prog="STRING DB Fetcher", description="Downloads STRING DB background interactomes for a specified organism."
    )

    parser.add_argument(
        "-i",
        "--id",
        help="""
        The specified organism ID to use.
        See https://string-db.org/cgi/download for more info.
        For example, 9606 is the homo sapiens background interactome.
        For an example usage, see datasets/diseases's Snakefile.
        """,
        type=int,
        required=True,
    )

    return parser.parse_args()

def uncompress(source: Path, target: Path):
    """Uncompresses a .gz file"""
    # Uncompressing a .gz file: https://stackoverflow.com/a/44712152/7589775
    with gzip.open(source, "rb") as f_compressed:
        with open(target, "wb") as f_uncompressed:
            shutil.copyfileobj(f_compressed, f_uncompressed)

def main():
    args = parse_args()
    string_path.mkdir(exist_ok=True)

    # We download the links file
    links_file = string_path / f"{args.id}.protein.links.v12.0.txt.gz"
    get_cache_item(["STRING", str(args.id), "links"]).download(links_file)
    uncompress(links_file, links_file.with_suffix(""))  # with_suffix("") strips the trailing `.gz` suffix

    # and its associated aliases
    aliases_file = string_path / f"{args.id}.protein.aliases.v12.0.txt.gz"
    get_cache_item(["STRING", str(args.id), "aliases"]).download(aliases_file)
    uncompress(aliases_file, aliases_file.with_suffix(""))

if __name__ == "__main__":
    main()
9 changes: 9 additions & 0 deletions datasets/diseases/.gitignore
@@ -0,0 +1,9 @@
*.tsv
*.pkl
data

# prize and gold standard files
GS_files
prize_files
raw
Pickles
61 changes: 61 additions & 0 deletions datasets/diseases/README.md
@@ -0,0 +1,61 @@
# GWAS-based Disease Gene Prediction

In this dataset collection, we identify a number of disease-related trait-gene associations from a GWAS database (TIGA). This resource is one of many that are integrated into the DISEASES database, which predicts disease-gene associations.

Here, we ask: **how well does GWAS data predict disease-gene associations when they are considered prize nodes within a protein interactome?**

The inputs are the GWAS trait-gene associations (TIGA) and the interactome (STRING-DB). The gold standard dataset is the DISEASES database, which uses other sources of evidence (such as co-occurrence in texts) to establish disease-gene associations.

## DISEASES Database

This dataset comes from the [DISEASES Database](https://diseases.jensenlab.org/About). Relevant papers include:
- Grissa et al., [DISEASES 2.0: a weekly updated database of disease–gene associations from text mining and data integration](https://academic.oup.com/database/article/doi/10.1093/database/baac019/6554833). DATABASE 2022.
- Pletscher-Frankild et al., [DISEASES: Text mining and data integration of disease-gene associations](https://www.sciencedirect.com/science/article/pii/S1046202314003831). Methods, 2015.

Lars Juhl Jensen's lab developed and maintains the STRING database (in collaboration with other groups). The DISEASES Database scores disease-gene associations in the same manner as the text-mining scores in STRING v9.1.

Additionally, the DISEASES Database is updated weekly, which makes it a good source of current information, but it also means we should take care to record the dates/times at which the database was accessed.

The DISEASES Database has three channels: text mining, knowledge, and experiments. **We only consider the text mining and knowledge channels when building the gold standard, to avoid overlapping data with the inputs.**

The data can be obtained from [their Downloads page](https://diseases.jensenlab.org/Downloads).

## TIGA

The most recent DISEASES paper (Grissa et al. 2022) integrates a GWAS database called Target Illumination by GWAS Analytics (TIGA), also by the Jensen lab:
- Yang et al., [TIGA: target illumination GWAS analytics](https://academic.oup.com/bioinformatics/article/37/21/3865/6292081). Bioinformatics 2021.

TIGA calculates confidence scores for gene-trait associations across genome-wide association studies. They include both citation-based and SNP-based measures in their confidence scores (their mean rank scores); we only take their SNP data for the inputs. Each SNP is weighted by an inverse exponential of its distance to the gene to handle linkage disequilibrium, as described in their paper (the resulting weighted count is called `N_snpw`). The SNPs themselves are collected from an Ensembl pipeline - TIGA does not do any novel mapping.
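
(In their notation, a SNP at distance `d` from the gene contributes a weight that decays exponentially with distance, something like `w(d) = exp(-d / d0)`; treat this form as a placeholder, since the exact decay constant and parameterization are defined in the TIGA paper. `N_snpw` is then the sum of these weights over the trait's SNPs.)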

The TIGA gene-trait association data can be obtained from [their shiny app page](https://unmtid-shinyapps.net/shiny/tiga/), either as the full set of gene-trait associations or from their [archived files](https://unmtid-dbs.net/download/TIGA/).

## Disease Ontology

Finally, we use the Disease Ontology to get from gene-trait associations to gene-disease associations by limiting the traits to diseases. The Disease Ontology data can be obtained from [their Downloads page](https://disease-ontology.org/downloads/).

## Putting it all together

We hashed out this pipeline on the whiteboard in July:

![whiteboard-image](figs/DISEASES-board.jpg)

Briefly, the steps are:

**A. Gold Standard Dataset Generation**:
- Use the text mining and knowledge channels from DISEASES.
- For every disease-gene association, take the max confidence value across those two channels (we believe the confidence scores aren't averaged, though averaging would also make sense - we should double-check).
- Remove all disease-gene associations that have a confidence score of less than 4 (retain all with scores of 4 or 5 out of 5). Call these "high confidence disease-gene pairs."
- Then, remove every disease that has fewer than 10 high confidence disease-gene pairs.

By our count, 41 diseases pass these filters, each with 10 or more high confidence disease-gene pairs.
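
A minimal pandas sketch of step A, assuming both channels have already been loaded into DataFrames with hypothetical `gene`, `disease`, and `confidence` columns (the real download files use their own headers):

```python
import pandas as pd


def build_gold_standard(textmining: pd.DataFrame, knowledge: pd.DataFrame) -> pd.DataFrame:
    """Illustrative sketch; inputs are assumed to have (gene, disease, confidence) columns."""
    # Max confidence across the two channels for each disease-gene pair.
    pairs = (
        pd.concat([textmining, knowledge])
        .groupby(["disease", "gene"], as_index=False)["confidence"]
        .max()
    )
    # Keep high confidence pairs (scores of 4 or 5 out of 5)...
    high = pairs[pairs["confidence"] >= 4]
    # ...then keep only diseases with at least 10 such pairs.
    counts = high.groupby("disease")["gene"].transform("size")
    return high[counts >= 10]
```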

**B. GWAS Dataset Creation**:
- Take the TIGA trait-gene associations and the Disease Ontology (DO) annotations.
- Retain all TIGA trait-gene associations where the trait is in the Disease Ontology. Call these "DO-gene associations". Every gene in these associations carries an `N_snpw` score.
- Retain the DO-gene associations for the 41 diseases from the gold standard dataset, as in the sketch below. (We discussed a version 2 where we also run DO-gene associations for diseases _not_ in the validation set; that's a later project.)
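
A companion sketch for step B, under the same assumptions (hypothetical column names; the real TIGA file has its own headers):

```python
import pandas as pd


def build_do_gene(tiga: pd.DataFrame, do_ids: set[str], gs_diseases: set[str]) -> pd.DataFrame:
    """Illustrative sketch; `tiga` is assumed to have (trait, gene, N_snpw) columns,
    `do_ids` the set of Disease Ontology trait IDs, and `gs_diseases` the
    41 diseases retained in step A."""
    # Keep trait-gene associations whose trait is a disease in the DO.
    do_gene = tiga[tiga["trait"].isin(do_ids)]
    # Restrict to the diseases that survived the gold standard filters.
    return do_gene[do_gene["trait"].isin(gs_diseases)]
```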

**C. SPRAS Inputs**:
- Use the STRING-DB interactome (there is a benchmark file for the DISEASES database with STRING v9.1, but we might want to use the most recent STRING version).
- Each of the 41 diseases will be a separate node prizes dataset. For each disease, convert the `N_snpw` scores into prizes and make a `node-prizes.txt` file (sketched below).
- Each of the 41 diseases will have a validation dataset comprising the high confidence disease-gene pairs from the DISEASES text mining and/or knowledge channels. Each pair has a score (a 4 or a 5), but we assumed we would consider them all "high confidence" and thus treat them as a gene set.
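
Finally, a sketch of writing the per-disease prize files for step C. Using `N_snpw` directly as the prize, the file naming, and the `NODEID`/`prize` header are all assumptions here - the actual score-to-prize transformation and output format may differ:

```python
from pathlib import Path

import pandas as pd


def write_prize_files(do_gene: pd.DataFrame, out_dir: str = "prize_files") -> None:
    """Illustrative sketch; assumes `do_gene` has (trait, gene, N_snpw) columns."""
    out = Path(out_dir)
    out.mkdir(exist_ok=True)
    for disease, group in do_gene.groupby("trait"):
        # e.g. "Alopecia areata" -> prize_files/alopecia_areata_prizes.txt
        name = str(disease).lower().replace(" ", "_")
        group[["gene", "N_snpw"]].to_csv(
            out / f"{name}_prizes.txt",
            sep="\t",
            header=["NODEID", "prize"],
            index=False,
        )
```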
56 changes: 56 additions & 0 deletions datasets/diseases/Snakefile
@@ -0,0 +1,56 @@
rule all:
    input:
        "GS_files/Alopecia_areata_GS.txt",
        "GS_files/Diabetes_mellitus_GS.txt",
        "prize_files/alopecia_areata_prizes.txt",
        "prize_files/diabetes_mellitus_prizes.txt"

rule of_db:
    output:
        "../../databases/string/9606.protein.links.v12.0.txt",
        "../../databases/string/9606.protein.aliases.v12.0.txt"
    shell:
        "uv run ../../databases/stringdb.py --id 9606"

rule fetch:
    output:
        "raw/human_disease_knowledge_filtered.tsv",
        "raw/human_disease_textmining_filtered.tsv",
        "raw/HumanDO.tsv",
        "raw/tiga_gene-trait_stats.tsv"
    shell:
        "uv run scripts/fetch.py"

rule inputs:
    input:
        "raw/HumanDO.tsv",
        "raw/tiga_gene-trait_stats.tsv",
        "../../databases/string/9606.protein.aliases.v12.0.txt"
    output:
        "data/inputs.csv"
    shell:
        "uv run scripts/inputs.py"

rule gold_standard:
    input:
        "raw/human_disease_knowledge_filtered.tsv",
        "raw/human_disease_textmining_filtered.tsv",
        "../../databases/string/9606.protein.aliases.v12.0.txt"
    output:
        "data/gold_standard.csv"
    shell:
        "uv run scripts/gold_standard.py"

rule files:
    input:
        "data/inputs.csv",
        "data/gold_standard.csv",
        "../../databases/string/9606.protein.links.v12.0.txt"
    output:
        # These are the two we use for the SPRAS run for now
        "GS_files/Alopecia_areata_GS.txt",
        "GS_files/Diabetes_mellitus_GS.txt",
        "prize_files/alopecia_areata_prizes.txt",
        "prize_files/diabetes_mellitus_prizes.txt"
    shell:
        "uv run scripts/files.py"
Binary file added datasets/diseases/figs/DISEASES-board.jpg
44 changes: 44 additions & 0 deletions datasets/diseases/scripts/fetch.py
@@ -0,0 +1,44 @@
"""
Fetches the latest DISEASES database channels, TIGA data, and human disease ontology data that we need.
Download pages:
- DISEASES: https://diseases.jensenlab.org/Downloads
- TIGA: https://unmtid-shinyapps.net/shiny/tiga/
- Disease Ontology: https://disease-ontology.org/downloads/
"""

from pathlib import Path
import os
from cache.directory import get_cache_item

# https://stackoverflow.com/a/5137509/7589775
dir_path = os.path.dirname(os.path.realpath(__file__))

raw_dir = Path(dir_path, "..", "raw")


def main():
# We only need the text mining and knowledge channels
# and avoid the integrated channel as it is the multiplied probabilities of all
# three channels (personal correspondence with Damian Szklarczyk)

raw_dir.mkdir(exist_ok=True)

print("Fetching DISEASES text channel...")
get_cache_item(["DISEASES", "human_disease_textmining_filtered.tsv"]).download(raw_dir / "human_disease_textmining_filtered.tsv")

print("Fetching DISEASES knowledge channel...")
get_cache_item(["DISEASES", "human_disease_knowledge_filtered.tsv"]).download(raw_dir / "human_disease_knowledge_filtered.tsv")

print("Fetching TIGA data...")
get_cache_item(["DISEASES", "tiga_gene-trait_stats.tsv"]).download(raw_dir / "tiga_gene-trait_stats.tsv")

print("Fetching human disease ontology data...")
get_cache_item(["DISEASES", "HumanDO.tsv"]).download(raw_dir / "HumanDO.tsv")

print("Fetching BioMart ENSG - ENSP mapping...")
get_cache_item(["BioMart", "ensg-ensp.tsv"]).download(raw_dir / "ensg-ensp.tsv")


if __name__ == "__main__":
main()