Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cache/directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ def download(self, output: str | PathLike):
"STRING": {
"9606": {
"links": CacheItem(
cached="https://drive.google.com/uc?id=1fvjdIbgzbgJrdJxWRRRwwS1zuegf6DOj",
online="http://stringdb-downloads.org/download/protein.links.v12.0/9606.protein.links.v12.0.txt.gz",
cached="https://drive.google.com/uc?id=13tE_-A6g7McZs_lZGz9As7iE-5cBFvqE",
online="http://stringdb-downloads.org/download/protein.links.full.v12.0/9606.protein.links.full.v12.0.txt.gz",
),
"aliases": CacheItem(
cached="https://drive.google.com/uc?id=1IWrQeTVCcw1A-jDk-4YiReWLnwP0S9bY",
Expand Down
2 changes: 1 addition & 1 deletion databases/stringdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def main():
string_path.mkdir(exist_ok=True)

# We download the links file
links_file = string_path / f"{args.id}.protein.links.v12.0.txt.gz"
links_file = string_path / f"{args.id}.protein.links.full.v12.0.txt.gz"
get_cache_item(["STRING", str(args.id), "links"]).download(links_file)
uncompress(links_file, links_file.with_suffix("")) # an extra call of with_suffix strips the `.gz` prefix

Expand Down
4 changes: 2 additions & 2 deletions datasets/diseases/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ rule all:

rule of_db:
output:
"../../databases/string/9606.protein.links.v12.0.txt",
"../../databases/string/9606.protein.links.full.v12.0.txt",
"../../databases/string/9606.protein.aliases.v12.0.txt"
shell:
"uv run ../../databases/stringdb.py --id 9606"
Expand Down Expand Up @@ -45,7 +45,7 @@ rule files:
input:
"data/inputs.csv",
"data/gold_standard.csv",
"../../databases/string/9606.protein.links.v12.0.txt"
"../../databases/string/9606.protein.links.full.v12.0.txt"
output:
# These are the two we use for the SPRAS run for now
"GS_files/Alopecia_areata_GS.txt",
Expand Down
2 changes: 1 addition & 1 deletion datasets/diseases/scripts/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def main():

# See /databases/stringdb.py for information on how this was grabbed.
# 9606 is the organism code for homo sapiens and the required background interactome of DISEASES.
string = pd.read_csv(diseases_path / ".." / ".." / "databases" / "string" / "9606.protein.links.v12.0.txt", sep=" ", skiprows=[0], header=None)
string = pd.read_csv(diseases_path / ".." / ".." / "databases" / "string" / "9606.protein.links.full.v12.0.txt", sep=" ", skiprows=[0], header=None)

# Threshold anything above a confidence score of 900 to trim down the background interactome
string = string[string.iloc[:, 2] > 900]
Expand Down
2 changes: 2 additions & 0 deletions datasets/synthetic-data/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
intermediate
processed
93 changes: 93 additions & 0 deletions datasets/synthetic-data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# Synthetic Data

## Download STRING Human Interactome
1. Download the STRING *Homo sapiens* `9606.protein.links.full.v12.0.txt.gz` database file from [STRING](https://string-db.org/cgi/download?sessionId=bL9sRTdIaUEt&species_text=Homo+sapiens&settings_expanded=0&min_download_score=0&filter_redundant_pairs=0&delimiter_type=txt).
2. Move the downloaded file into the `raw/human-interactome/` folder.
3. From the `raw/` directory, extract the file using:

```
gunzip human-interactome/9606.protein.links.full.v12.0.txt.gz
```

## Download New PANTHER Pathways
1. Visit [Pathway Commons](https://www.pathwaycommons.org/).
2. Search for the desired pathway (e.g., "signaling") and filter the results by the **PANTHER pathway** data source.
Example: [Search for "Signaling" filtered by PANTHER pathway](https://apps.pathwaycommons.org/search?datasource=panther&q=Signaling&type=Pathway)
3. Click on the desired pathway and download the **Extended SIF** version of the pathway.
4. In the `raw/pathway-data/` folder, create a new subfolder named after the pathway you downloaded.
5. Move the downloaded Extended SIF file to this new folder (as a `.txt` file). Rename the file to match the subfolder name exactly.

## Sources and Targets

[Sources](https://www.pnas.org/doi/full/10.1073/pnas.1808790115) are in silico human surfaceome receptors.

[Targets](https://academic.oup.com/nar/article/51/D1/D39/6765312) are human transcription factors.

## Steps to Generate SPRAS-Compatible Pathways

### 1. Process PANTHER Pathways

1. Open `process_panther_pathway.py` and add the name of any new pathways to the `pathways` vector on **line 6**.
2. Run the command:
```
uv run src/process_panther_pathway.py
```
3. This will create five new files in each subfolder of the `pathway-data/` directory:
- `EDGES.txt`
- `NODES.txt`
- `PRIZES-100.txt`
- `SOURCES.txt`
- `TARGETS.txt`

### 2. Convert Pathways to SPRAS-Compatible Format
1. In `SPRAS_compatible_files.py`, add the name of any new pathways to the `pathway_dirs` list on **line 8**.
2. From the synthetic-data/ directory, run the command:
```
python src/SPRAS_compatible_files.py
```
3. This will create a new folder named `spras-compatible-pathway-data`, containing subfolders for each PANTHER pathway in SPRAS-compatible format.
Each subfolder will include the following three files:
- `<pathway_name>_gs_edges.txt`
- `<pathway_name>_gs_nodes.txt`
- `<pathway_name>_node_prizes.txt`

4. From the synthetic-data/ directory, run the command:
```
python src/ratios.py
```
5. This will create a new file `data_ratio.txt` in `spras-compatible-pathway-data` to explain the edge to target/sources ratios.

## Steps to get the interactomes
### 1. Steps to get threshold interactomes
1. From the synthetic-data/ directory, run the command:
```
python src/threshold_interactomes.py
```
2. This will create a new folder named `interactomes`, containing a subfolder called `uniprot-threshold-interactomes`.
The subfolder will include the following 12 files:
- 10 thresholded interactomes: `uniprot_human_interactome_<threshold>.txt` (thresholds range from 1 to 900)
- `proteins_missing_aliases.csv`: STRING IDs that are missing UniProt accession identifiers
- `removed_edges.txt`: All edges removed from the uniprot_human_interactome_<threshold>.txt files

### 2. Steps to get combined interactomes (Panther pathways and threshold interactomes)
1. In `combine.py`, adjust the `pathway_dirs` list on **line 11** to be the pathways to be included in the combined networks
2. From the synthetic-data/ directory, run the command:
```
python src/combine.py
```
3. This will create a new subfolder called `uniprot-combined-threshold-interactomes` in `interactomes`.
This subfolder will include 12 files:
- 10 combined threshold interactomes combined with the chosen pathways: `uniprot_combined_interactome_<threshold>.txt` (thresholds range from 1 to 900)
- `overlap_combined_info.csv`
- `overlap_info.csv`

# Pilot Data
For the pilot data, use the list `["Wnt_signaling", "JAK_STAT_signaling", "Interferon_gamma_signaling", "FGF_signaling", "Ras"]` in both:
- the list in `combine.py`
- the list in `overlap_analytics.py`

Make sure the pathways in this list are also added to:
- the `pathways` vector in `process_panther_pathway.py`
- the list in `SPRAS_compatible_files.py`

**Once you’ve updated the pathway lists in all relevant scripts, run all the steps above to generate the Pilot dataset.**
92 changes: 92 additions & 0 deletions datasets/synthetic-data/Snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
pathways = ["Apoptosis_signaling", "B_cell_activation",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Each of the files have this variable. I think we should have it only in the snakefile and send this list to each of the files that use this pathway list

"Beta3_adrenergic_rec", "Cadherin_signaling",
"Hedgehog_signaling", "Insulin_IGF",
"Interleukin_signaling", "Notch_signaling",
"PDGF_signaling", "Ras", "T_cell_activation",
"Toll_signaling", "Wnt_signaling", "p38_MAPK",
"Nicotinic_acetylchol", "Fas_signaling",
"FGF_signaling", "Interferon_gamma_signaling",
"JAK_STAT_signaling", "VEGF_signaling"]
# TODO: deduplicate this from thresholding scripts by passing it in?
thresholds = [1, 100, 200, 300, 400, 500, 600, 700, 800, 900]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

similar thing for the thresholds


rule all:
input:
"../../databases/string/9606.protein.links.full.v12.0.txt",
expand([
"processed/{pathway}/{pathway}_node_prizes.txt",
"processed/{pathway}/{pathway}_gs_edges.txt",
"processed/{pathway}/{pathway}_gs_nodes.txt"
], pathway=pathways),
expand(
"processed/interactomes/uniprot-combined-threshold-interactomes/uniprot_combined_interactome_{threshold}.txt",
threshold=thresholds)

# Download the STRING v12.0 full links file for human (taxon 9606) into the
# shared databases directory via the project's download helper.
# Uses `shell:` rather than `run:` — the `run:` directive executes Python
# code, so a bare string literal under it is a no-op expression and the
# declared output would never be produced. The diseases Snakefile runs the
# same command with `shell:`, which this now matches.
rule of_db:
    output:
        "../../databases/string/9606.protein.links.full.v12.0.txt"
    shell:
        "uv run ../../databases/stringdb.py --id 9606"

rule combine_interactomes:
input:
expand("processed/{pathway}/{pathway}_gs_edges.txt", pathway=pathways),
expand(
"processed/interactomes/uniprot-threshold-interactomes/uniprot_human_interactome_{threshold}.txt",
threshold=thresholds)
output:
"processed/interactomes/uniprot-combined-threshold-interactomes/overlap_info.csv",
"processed/interactomes/uniprot-combined-threshold-interactomes/overlap_combined_info.csv",
expand(
"processed/interactomes/uniprot-combined-threshold-interactomes/uniprot_combined_interactome_{threshold}.txt",
threshold=thresholds)
shell:
"uv run src/combine.py"

rule threshold_interactomes:
input:
"../../databases/string/9606.protein.links.full.v12.0.txt",
expand([
"processed/{pathway}/{pathway}_node_prizes.txt",
"processed/{pathway}/{pathway}_gs_edges.txt",
"processed/{pathway}/{pathway}_gs_nodes.txt"
], pathway=pathways)
output:
"processed/interactomes/uniprot-threshold-interactomes/proteins_missing_aliases.csv",
"processed/interactomes/uniprot-threshold-interactomes/removed_edges.txt",
expand(
"processed/interactomes/uniprot-threshold-interactomes/uniprot_human_interactome_{threshold}.txt",
threshold=thresholds)
shell:
"uv run src/threshold_interactomes.py"

rule process_panther_pathway:
input: "raw/pathway-data/{pathway}.txt"
output:
[
"intermediate/{pathway}/EDGES.txt",
"intermediate/{pathway}/NODES.txt",
"intermediate/{pathway}/TARGETS.txt",
"intermediate/{pathway}/SOURCES.txt",
"intermediate/{pathway}/PRIZES.txt"
]
shell:
"uv run src/process_panther_pathway.py {wildcards.pathway}"

rule make_spras_compatible:
input:
[
"intermediate/{pathway}/EDGES.txt",
"intermediate/{pathway}/NODES.txt",
"intermediate/{pathway}/TARGETS.txt",
"intermediate/{pathway}/SOURCES.txt",
"intermediate/{pathway}/PRIZES.txt"
]
output:
[
"processed/{pathway}/{pathway}_node_prizes.txt",
"processed/{pathway}/{pathway}_gs_edges.txt",
"processed/{pathway}/{pathway}_gs_nodes.txt"
]
shell:
"uv run src/SPRAS_compatible_files.py {wildcards.pathway}"
Loading
Loading