4 changes: 2 additions & 2 deletions configs/dmmm.yaml
@@ -46,12 +46,12 @@ datasets:
 # TODO: use old parameters for datasets
 # HIV: https://github.com/Reed-CompBio/spras-benchmarking/blob/0293ae4dc0be59502fac06b42cfd9796a4b4413e/hiv-benchmarking/spras-config/config.yaml
 - label: dmmmhiv060
-  node_files: ["processed_prize_060.txt"]
+  node_files: ["processed_prizes_060.txt"]
   edge_files: ["phosphosite-irefindex13.0-uniprot.txt"]
   other_files: []
   data_dir: "datasets/hiv/processed"
 - label: dmmmhiv05
-  node_files: ["processed_prize_05.txt"]
+  node_files: ["processed_prizes_05.txt"]
   edge_files: ["phosphosite-irefindex13.0-uniprot.txt"]
   other_files: []
   data_dir: "datasets/hiv/processed"
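
As a quick sanity check (a hypothetical helper, not part of this PR), the renamed prize files can be verified against the config; this sketch assumes the top-level `datasets` key shown in the hunk above and PyYAML being available:

```python
# Hypothetical check that every node/edge file named in configs/dmmm.yaml
# actually exists under its dataset's data_dir.
from pathlib import Path

import yaml

config = yaml.safe_load(Path("configs/dmmm.yaml").read_text())
for dataset in config["datasets"]:
    data_dir = Path(dataset["data_dir"])
    for name in dataset["node_files"] + dataset["edge_files"]:
        assert (data_dir / name).exists(), f"{dataset['label']}: missing {name}"
```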
3 changes: 2 additions & 1 deletion datasets/hiv/.gitignore
@@ -1 +1,2 @@
-processed
+/processed
+/Pickles
15 changes: 15 additions & 0 deletions datasets/hiv/README.md
@@ -0,0 +1,15 @@
# HIV dataset

Collaborator:

I would find it helpful to have more of the context from https://github.com/Reed-CompBio/spras-benchmarking/blob/0293ae4dc0be59502fac06b42cfd9796a4b4413e/hiv-benchmarking/README.md. What is the overall goal of this benchmarking dataset?

Now that we know more about the types of datasets in the SPRAS benchmark, we can categorize this one: it uses omic data as input and curated data as a gold standard, but we determined the curated data to be a poor fit, so the dataset lacks a good gold standard.

## Raw files

See `raw/README.md`.

## File organization

See the `Snakefile` for how all of the I/O files are connected.

1. `fetch.py` - Downloads the score files from https://doi.org/10.1371/journal.ppat.1011492.
1. `prepare.py` - Cleans up the prize files in `raw`, specifically by removing duplicates (a rough sketch follows this list).
Collaborator:

The original readme was more detailed about these processing steps. For instance, one of the steps removed isoform identifiers, which I don't see mentioned here. My goal is to be able to visit the readme for a benchmarking dataset and write the methods section of the manuscript from that information without re-reading all the code. We'll get bogged down during writing if we have to read all the source to remember how we processed each dataset.

Contributor Author @tristan-f-r (Aug 8, 2025):

This motivation makes sense - I was very confused about who the audience for a dataset-wide README is. My main worry is that, like the other READMEs in this repository, we would end up with outdated documentation that doesn't match what actually happens during processing, so I want to ask a follow-up question (to determine how much we should worry about maintaining the README):

Would future users of spras-benchmarking also write down the methodology described in the README, or is it specifically for the first benchmarking paper?

Contributor Author @tristan-f-r (Aug 10, 2025):

(My question was answered in another comment)

1. `name_mapping.py` - Converts from UniProtKB AC/ID to UniProtKB to meet in the middle with `kegg_orthology.py`. We chose UniProtKB for its generality. (A sketch using UniProt's ID-mapping service follows this list.)
1. `spras_formatting.py` - Converts the input files into a SPRAS-ready format.
1. `kegg_orthology.py` - Generates the KEGG ortholog file for gold standards; this has yet to be finalized.
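
To make these steps concrete, here is a minimal sketch of the deduplication in `prepare.py`. It assumes the raw prize files are tab-separated with the node identifier in the first column and that the cleaned tables are pickled for the downstream rules; the actual script may differ.

```python
# Hypothetical sketch of scripts/prepare.py (not the actual script):
# drop repeated node identifiers from the raw prize tables.
import pandas as pd

frames = {}
for threshold in ("05", "060"):
    prizes = pd.read_csv(f"raw/prizes_{threshold}.tsv", sep="\t")
    # Keep one row per node identifier, preferring the first occurrence.
    prizes = prizes.drop_duplicates(subset=prizes.columns[0], keep="first")
    frames[threshold] = prizes

# The Snakefile expects the cleaned node IDs at Pickles/NodeIDs.pkl.
pd.to_pickle(frames, "Pickles/NodeIDs.pkl")
```

And a sketch of the ID translation in `name_mapping.py`, assuming it uses UniProt's public ID-mapping REST service (the real script may batch, cache, or parse differently):

```python
# Hypothetical sketch of scripts/name_mapping.py (not the actual script):
# map UniProtKB AC/ID identifiers to canonical UniProtKB accessions.
import time

import requests

def map_to_uniprotkb(ids: list[str]) -> dict[str, str]:
    job = requests.post(
        "https://rest.uniprot.org/idmapping/run",
        data={"from": "UniProtKB_AC-ID", "to": "UniProtKB", "ids": ",".join(ids)},
    ).json()
    # Poll until the mapping job finishes; the status endpoint redirects to
    # the results once the job is done.
    while True:
        status = requests.get(
            f"https://rest.uniprot.org/idmapping/status/{job['jobId']}"
        ).json()
        if status.get("jobStatus") == "FINISHED" or "results" in status:
            break
        time.sleep(1)
    results = requests.get(
        f"https://rest.uniprot.org/idmapping/results/{job['jobId']}"
    ).json()
    return {row["from"]: row["to"]["primaryAccession"] for row in results["results"]}
```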
31 changes: 0 additions & 31 deletions datasets/hiv/Scripts/Data_Prep.py

This file was deleted.

64 changes: 0 additions & 64 deletions datasets/hiv/Scripts/Kegg_Orthology.py

This file was deleted.

28 changes: 0 additions & 28 deletions datasets/hiv/Scripts/SPRAS_Formatting.py

This file was deleted.

28 changes: 18 additions & 10 deletions datasets/hiv/Snakefile
@@ -1,40 +1,48 @@
 rule all:
     input:
-        "processed/processed_prize_05.txt",
-        "processed/processed_prize_060.txt",
+        "processed/processed_prizes_05.txt",
+        "processed/processed_prizes_060.txt",
         "processed/phosphosite-irefindex13.0-uniprot.txt"
 
+rule fetch:
+    output:
+        "raw/prizes_05.tsv",
+        "raw/prizes_060.tsv",
+        "raw/ko03250.xml"
+    shell:
+        "uv run scripts/fetch.py"
+
 rule data_prep:
     input:
-        "raw/prize_05.csv",
-        "raw/prize_060.csv"
+        "raw/prizes_05.tsv",
+        "raw/prizes_060.tsv"
     output:
         "Pickles/NodeIDs.pkl"
     shell:
-        "uv run Scripts/Data_Prep.py"
+        "uv run scripts/prepare.py"
 
 rule name_mapping:
     input:
         "Pickles/NodeIDs.pkl"
     output:
         "Pickles/UniprotIDs.pkl"
     shell:
-        "uv run Scripts/Name_Mapping.py"
+        "uv run scripts/name_mapping.py"
 
 rule spras_formatting:
     input:
         "Pickles/NodeIDs.pkl",
         "Pickles/UniprotIDs.pkl"
     output:
-        "processed/processed_prize_05.txt",
-        "processed/processed_prize_060.txt"
+        "processed/processed_prizes_05.txt",
+        "processed/processed_prizes_060.txt"
     shell:
-        "uv run Scripts/SPRAS_Formatting.py"
+        "uv run scripts/spras_formatting.py"
 
 rule copy_network:
     input:
         "raw/phosphosite-irefindex13.0-uniprot.txt"
     output:
         "processed/phosphosite-irefindex13.0-uniprot.txt"
     shell:
-        "cp raw/phosphosite-irefindex13.0-uniprot.txt processed/phosphosite-irefindex13.0-uniprot.txt"
+        "cp raw/phosphosite-irefindex13.0-uniprot.txt processed/phosphosite-irefindex13.0-uniprot.txt"
3 changes: 3 additions & 0 deletions datasets/hiv/raw/.gitignore
@@ -0,0 +1,3 @@
prizes_05.tsv
prizes_060.tsv
ko03250.xml
10 changes: 10 additions & 0 deletions datasets/hiv/raw/README.md
@@ -0,0 +1,10 @@
# raw
Collaborator:

I'm missing where the rest of the input files come from. Having it in fetch.py does not tell me where the inputs for this dataset come from and what they are.

Contributor Author:

I'm a little confused about this comment (though there was some clarification missing on a paper link, which I've just committed) - do you want the docs from `fetch.py` duplicated into `raw/README.md`? The inline and top-level comments in `fetch.py` describe more than the original hiv README did, including the origin of `ko03250.xml`.

Collaborator:

We can call it duplication, but yes. I want the readme(s) to tell the story of the dataset. There will be multiple scripts needed to execute the benchmark, and I see two different goals:

  1. the code with good comments
  2. documenting the dataset globally, so that we (and potentially external readers of a manuscript) can easily go from a dataset directory to an overall understanding of how all the pieces work together and where external information comes from, and then gather that information into a manuscript.

We could go through the exercise of writing one of the paragraphs for the benchmarking paper manuscript to see what information we need to expose.

Contributor:

This is the current level of detail I was going for in the benchmarking report:

Structure this section so each dataset explains what the data is and the biological context, where all of the data came from, the sources/targets/prizes (include the biological roles of the inputs?), the IDs (explain why the translation is needed and why that ID type was chosen), the gold standard, the interactome, and the preprocessing for everything.


Some `raw` files are fetched by `../scripts/fetch.py`.

The `phosphosite-irefindex13.0-uniprot.txt` file is a background interactome provided by SPRAS: https://github.com/Reed-CompBio/spras/blob/be8bc7f8d71880d7ce9c9ceeeddfefa6eb60c522/input/phosphosite-irefindex13.0-uniprot.txt.

The `ko03250.xml` file comes from https://www.kegg.jp/entry/ko03250. Specifically, clicking the pathway image in that entry leads to https://www.kegg.jp/pathway/ko03250, where the KGML file (formatted as `.xml`) can be downloaded under `Download` -> `KGML`. (The final file is at https://www.kegg.jp/kegg-bin/download?entry=ko03250&format=kgml.)
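
As a rough illustration (not necessarily how `fetch.py` retrieves it), the KGML can also be fetched and parsed programmatically from that download URL:

```python
# Sketch: download the KGML for pathway ko03250 and list its KO entries.
import xml.etree.ElementTree as ET

import requests

KGML_URL = "https://www.kegg.jp/kegg-bin/download?entry=ko03250&format=kgml"

response = requests.get(KGML_URL, timeout=30)
response.raise_for_status()

# KGML's root element is <pathway>; <entry> elements of type "ortholog"
# carry the KO identifiers (e.g. name="ko:K12345").
root = ET.fromstring(response.text)
kos = [entry.get("name") for entry in root.iter("entry")
       if entry.get("type") == "ortholog"]
print(f"{len(kos)} ortholog entries in ko03250")
```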