Reed-CompBio · tristan-f-r · May 27, 2025 · May 27, 2025 · May 28, 2025 · May 28, 2025
diff --git a/Snakefile b/Snakefile
@@ -6,6 +6,7 @@ from spras.dataset import Dataset
 from spras.evaluation import Evaluation
 from spras.analysis import ml, summary, cytoscape
 import spras.config.config as _config
+from spras.util import extend_filename
 
 # Snakemake updated the behavior in the 6.5.0 release https://github.com/snakemake/snakemake/pull/1037
 # and using the wrong separator prevents Snakemake from matching filenames to the rules that can produce them
@@ -209,7 +210,9 @@ checkpoint prepare_input:
         # Use the algorithm's generate_inputs function to load the merged dataset, extract the relevant columns,
         # and write the output files specified by required_inputs
         # The filename_map provides the output file path for each required input file type
-        filename_map = {input_type: SEP.join([out_dir, 'prepared', f'{wildcards.dataset}-{wildcards.algorithm}-inputs', f'{input_type}.txt']) for input_type in runner.get_required_inputs(wildcards.algorithm)}
+        filename_map = {input_type: SEP.join(
+            [out_dir, 'prepared', f'{wildcards.dataset}-{wildcards.algorithm}-inputs', extend_filename(input_type)]
+        ) for input_type in runner.get_required_inputs(wildcards.algorithm)}
         runner.prepare_inputs(wildcards.algorithm, input.dataset_file, filename_map)
 
 # Collect the prepared input files from the specified directory
@@ -227,7 +230,7 @@ def collect_prepared_input(wildcards):
     prepared_dir = SEP.join([out_dir, 'prepared', f'{wildcards.dataset}-{wildcards.algorithm}-inputs'])
 
     # Construct the list of expected prepared input files for the reconstruction algorithm
-    prepared_inputs = expand(f'{prepared_dir}{SEP}{{type}}.txt',type=runner.get_required_inputs(algorithm=wildcards.algorithm))
+    prepared_inputs = expand(f'{prepared_dir}{SEP}{{type}}',type=map(extend_filename, runner.get_required_inputs(algorithm=wildcards.algorithm)))
     # If the directory is missing, do nothing because the missing output triggers running prepare_input
     if os.path.isdir(prepared_dir):
         # First, check if .snakemake_timestamp, the last written file in a directory rule,

diff --git a/_typos.toml b/_typos.toml
@@ -1,4 +1,4 @@
 [type.txt]
 # Ignore data files
-extend-glob = ["*.txt"]
+extend-glob = ["*.txt", "*.sif"]
 check-file = false
diff --git a/docker-wrappers/DOMINO/0001-fix-split-runner-domino-and-runner-slice.patch b/docker-wrappers/DOMINO/0001-fix-split-runner-domino-and-runner-slice.patch
@@ -0,0 +1,102 @@
+From 49b7580db0700980b8e8c8ce3777165ab56a31c2 Mon Sep 17 00:00:00 2001
+From: "Tristan F.-R." <[email protected]>
+Date: Tue, 27 May 2025 13:56:32 -0700
+Subject: [PATCH 1/2] fix: split runner domino and runner slice
+
+before this, it was only possible to install DOMINO through pypi, because the setup.py script has a broken dependency list.
+---
+ src/runner.py       |  1 -
+ src/runner_slice.py | 71 +++++++++++++++++++++++++++++++++++++++++++++
+ 2 files changed, 71 insertions(+), 1 deletion(-)
+ create mode 100644 src/runner_slice.py
+
+diff --git a/src/runner.py b/src/runner.py
+index a8e5ce5..7a504ab 100755
+--- a/src/runner.py
++++ b/src/runner.py
+@@ -68,5 +68,4 @@ def main_slicer():
+
+
+ if __name__=="__main__":
+-    main_slicer()
+     main_domino()
+diff --git a/src/runner_slice.py b/src/runner_slice.py
+new file mode 100644
+index 0000000..a00130b
+--- /dev/null
++++ b/src/runner_slice.py
+@@ -0,0 +1,71 @@
++import argparse
++import os
++from src.core.domino import main as domino_main
++from src.core.preprocess_slices import create_slices
++from src.utils.visualize_modules import visualize_modules
++import src.constants as constants
++def main_domino():
++
++    parser = argparse.ArgumentParser(description='DOMINO: An active module identification algorithm with reduce rate of false.\n NOTE YOU SHOULD RUN THE SLICES SCRIPT FIRST! (more info, type slicer -h) \n Example input files are available @ https://github.com/Shamir-Lab/DOMINO/tree/master/examples')
++    parser.add_argument('-a', '--active_genes_files', dest='active_genes_files', help='Comma delimited list of absolute paths to files, each containing a list of active genes, separated by a new line char (\\n). e.g. /path/to/active_genes_files_1,/path/to/active_genes_files_2.', default="examples/tnfa_active_genes_file.txt")
++    parser.add_argument('-n', '--network_file', dest='network_file', help='A path to network file (sif format). e.g. /path/to/network_file.sif', default="examples/huri.sif")
++    parser.add_argument('-s', '--slices_file', dest='slices_file', help='A path to slices file (i.e. the output of "slicer" script). e.g., /path/to/slices_file.txt', default="examples/huri_slices.txt")
++    parser.add_argument('-o', '--output_folder', dest='output_folder', help='A folder where output files will be written e.g., /path/to/output', default="examples/output")
++    parser.add_argument('-c', '--use_cache', dest='use_cache', help='Use auto-generated cache network files (*.pkl) from previous executions with the same network. NOTE: (1) THIS IS NOT THE SLICES FILE! (2) If the content of the file has changed, you should set this option to "false"', default="true")
++    parser.add_argument('-p', '--parallelization', dest='parallelization', help='The number of threads allocated to the run (usually single thread is enough)', default="1")
++    parser.add_argument('-v', '--visualization', dest='visualization', help='Indicates whether a visualization of the modules ought to be generated', default="true")
++    parser.add_argument('-sth', '--slice_threshold', dest='slice_threshold', default="0.3", help='The threshold for considering a slice as relevant')
++    parser.add_argument('-mth', '--module_threshold', dest='module_threshold', default="0.05", help='The threshold for considering a putative module as final module')
++
++
++    args = parser.parse_args()
++    active_genes_files = args.active_genes_files.split(",")
++    output_folder = args.output_folder
++    network_file = args.network_file
++    slices_file = args.slices_file
++    slice_threshold = float(args.slice_threshold)
++    module_threshold = float(args.module_threshold)
++    use_cache = args.use_cache=="true"
++    parallelization = int(args.parallelization)
++    visualization = args.visualization=="true"
++
++    constants.N_OF_THREADS=parallelization
++    constants.USE_CACHE=use_cache
++
++    for cur_ag in active_genes_files:
++        G_final_modules=domino_main(active_genes_file=cur_ag, network_file=network_file, slices_file=slices_file, slice_threshold=slice_threshold, module_threshold=module_threshold)
++        activity_name=os.path.splitext(os.path.split(cur_ag)[-1])[0]
++        report_folder=os.path.join(output_folder,activity_name)
++        try:
++            os.makedirs(report_folder)
++        except:
++            pass
++
++        out_file=os.path.join(report_folder, "modules.out") 
++        if len(G_final_modules) !=0:
++            open(out_file, 'w+').write("\n".join(['[%s]' % ', '.join(list(m.nodes)) for m in G_final_modules])+"\n")
++        else:
++            open(out_file, 'w+').write("")
++
++        print(f'{len(G_final_modules)} final modules are reported at {out_file}')
++        print(visualization)
++        if visualization:
++            visualize_modules(os.path.splitext(cur_ag.split('/')[-1])[0], G_final_modules, None, network_file, report_folder)
++
++def main_slicer():
++
++    parser = argparse.ArgumentParser(description='Slicer for DOMINO (step #0): A preprocessing step for the network')
++    parser.add_argument('-n', '--network_file', dest='network_file', help='A path to network file (sif format). e.g. /path/to/network_file.sif', default="examples/huri.sif")
++    parser.add_argument('-o', '--output_file', dest='output_file', default="examples/huri.sif", help='A path to the output slices file. e.g., /path/to/output/slices_file.txt')
++
++
++    args = parser.parse_args()
++    network_file = args.network_file
++    output_file = args.output_file
++    create_slices(network_file, output_file)
++
++
++
++
++if __name__=="__main__":
++    main_slicer()
+-- 
+2.47.0
+
diff --git a/docker-wrappers/DOMINO/0002-fix-update-imports.patch b/docker-wrappers/DOMINO/0002-fix-update-imports.patch
@@ -0,0 +1,98 @@
+From 9781c1e6c6b884f3666f3ade47d584dd7a2b50fe Mon Sep 17 00:00:00 2001
+From: "Tristan F.-R." <[email protected]>
+Date: Tue, 27 May 2025 14:40:05 -0700
+Subject: [PATCH 2/2] fix: update imports
+
+since we are no longer using venv, we need to update all of our imports.
+---
+ src/core/domino.py               | 8 ++++----
+ src/runner.py                    | 8 ++++----
+ src/runner_slice.py              | 8 ++++----
+ src/utils/ensembl2gene_symbol.py | 2 +-
+ src/utils/visualize_modules.py   | 4 ++--
+ 5 files changed, 15 insertions(+), 15 deletions(-)
+
+diff --git a/src/core/domino.py b/src/core/domino.py
+index 89c3e87..d4e1663 100644
+--- a/src/core/domino.py
++++ b/src/core/domino.py
+@@ -18,10 +18,10 @@ from networkx.algorithms.community.centrality import girvan_newman
+ from networkx.algorithms.components import connected_components
+
+ from functools import reduce
+-from src.utils.graph_influence_linear_th import linear_threshold
+-from src.core.preprocess_slices import read_preprocessed_slices
+-from src.core.network_builder import build_network
+-import src.constants as constants
++from utils.graph_influence_linear_th import linear_threshold
++from core.preprocess_slices import read_preprocessed_slices
++from core.network_builder import build_network
++import constants as constants
+
+ G_modularity = None
+
+diff --git a/src/runner.py b/src/runner.py
+index 7a504ab..955e465 100755
+--- a/src/runner.py
++++ b/src/runner.py
+@@ -1,9 +1,9 @@
+ import argparse
+ import os
+-from src.core.domino import main as domino_main
+-from src.core.preprocess_slices import create_slices
+-from src.utils.visualize_modules import visualize_modules
+-import src.constants as constants
++from core.domino import main as domino_main
++from core.preprocess_slices import create_slices
++from utils.visualize_modules import visualize_modules
++import constants as constants
+ def main_domino():
+
+     parser = argparse.ArgumentParser(description='DOMINO: An active module identification algorithm with reduce rate of false.\n NOTE YOU SHOULD RUN THE SLICES SCRIPT FIRST! (more info, type slicer -h) \n Example input files are available @ https://github.com/Shamir-Lab/DOMINO/tree/master/examples')
+diff --git a/src/runner_slice.py b/src/runner_slice.py
+index a00130b..3ea8e04 100644
+--- a/src/runner_slice.py
++++ b/src/runner_slice.py
+@@ -1,9 +1,9 @@
+ import argparse
+ import os
+-from src.core.domino import main as domino_main
+-from src.core.preprocess_slices import create_slices
+-from src.utils.visualize_modules import visualize_modules
+-import src.constants as constants
++from core.domino import main as domino_main
++from core.preprocess_slices import create_slices
++from utils.visualize_modules import visualize_modules
++import constants as constants
+ def main_domino():
+
+     parser = argparse.ArgumentParser(description='DOMINO: An active module identification algorithm with reduce rate of false.\n NOTE YOU SHOULD RUN THE SLICES SCRIPT FIRST! (more info, type slicer -h) \n Example input files are available @ https://github.com/Shamir-Lab/DOMINO/tree/master/examples')
+diff --git a/src/utils/ensembl2gene_symbol.py b/src/utils/ensembl2gene_symbol.py
+index 72d395a..62ae9d2 100755
+--- a/src/utils/ensembl2gene_symbol.py
++++ b/src/utils/ensembl2gene_symbol.py
+@@ -1,4 +1,4 @@
+-import src.constants as constants
++import constants as constants
+ import os
+ g2e_dict = None
+ e2g_dict = None
+diff --git a/src/utils/visualize_modules.py b/src/utils/visualize_modules.py
+index aecc29f..c27587b 100755
+--- a/src/utils/visualize_modules.py
++++ b/src/utils/visualize_modules.py
+@@ -9,8 +9,8 @@ import json
+ import pandas as pd
+
+-from src import constants
+-from src.utils.scripts import format_script
+-from src.utils.ensembl2gene_symbol import  e2g_convertor
++import constants
++from utils.scripts import format_script
++from utils.ensembl2gene_symbol import  e2g_convertor
+ import zipfile
+
+ import multiprocessing
+-- 
+2.47.0
+
diff --git a/docker-wrappers/DOMINO/Dockerfile b/docker-wrappers/DOMINO/Dockerfile
@@ -1,11 +1,17 @@
 # DOMINO wrapper
 # https://github.com/Shamir-Lab/DOMINO
-FROM python:3.7
+FROM python:3.8.20-bullseye
 
-RUN pip install domino-python==0.1.1
+COPY requirements.txt .
 
-# DOMINO requires data files in hard-coded locations
-RUN cd /usr/local/lib/python3.7/site-packages/src/data && \
-    wget https://raw.githubusercontent.com/Shamir-Lab/DOMINO/master/src/data/ensg2gene_symbol.txt && \
-    wget https://raw.githubusercontent.com/Shamir-Lab/DOMINO/master/src/data/ensmusg2gene_symbol.txt && \
-    wget https://raw.githubusercontent.com/Shamir-Lab/DOMINO/master/src/data/graph.html.format
+RUN pip install -r requirements.txt
+
+COPY *.patch .
+
+RUN git clone https://github.com/Shamir-Lab/DOMINO/ && \
+    cd /DOMINO && \
+    git reset --hard 85dad1515717b425b17f58f92b13a063ccccb85d && \
+    git config user.email "[email protected]" && \
+    git config user.name "Non-existent User" && \
+    # https://stackoverflow.com/a/4832785/7589775
+    git apply --ignore-space-change --ignore-whitespace --verbose /*.patch
diff --git a/docker-wrappers/DOMINO/requirements.txt b/docker-wrappers/DOMINO/requirements.txt
@@ -0,0 +1,7 @@
+networkx==2.4
+numpy==1.22.0
+scipy==1.10.0
+pandas==1.5.1
+pcst-fast==1.0.7
+statsmodels==0.11.0
+python-louvain==0.14
diff --git a/spras/domino.py b/spras/domino.py
@@ -39,7 +39,7 @@ class DominoParams(BaseModel):
 - it can include repeated and bidirectional edges
 """
 class DOMINO(PRM[DominoParams]):
-    required_inputs = ['network', 'active_genes']
+    required_inputs = ['network.sif', 'active_genes']
     dois = ["10.15252/msb.20209593"]
 
     @staticmethod
@@ -79,7 +79,7 @@ def generate_inputs(data, filename_map):
         edges_df['Interactor1'] = edges_df['Interactor1'].apply(pre_domino_id_transform)
         edges_df['Interactor2'] = edges_df['Interactor2'].apply(pre_domino_id_transform)
 
-        edges_df.to_csv(filename_map['network'], sep='\t', index=False, columns=['Interactor1', 'ppi', 'Interactor2'],
+        edges_df.to_csv(filename_map['network.sif'], sep='\t', index=False, columns=['Interactor1', 'ppi', 'Interactor2'],
                         header=['ID_interactor_A', 'ppi', 'ID_interactor_B'])
 
     @staticmethod
@@ -93,7 +93,7 @@ def run(inputs, output_file, args=None, container_settings=None):
         # Each volume is a tuple (source, destination)
         volumes = list()
 
-        bind_path, network_file = prepare_volume(inputs["network"], work_dir, container_settings)
+        bind_path, network_file = prepare_volume(inputs["network.sif"], work_dir, container_settings)
         volumes.append(bind_path)
 
         bind_path, node_file = prepare_volume(inputs["active_genes"], work_dir, container_settings)
@@ -109,11 +109,11 @@ def run(inputs, output_file, args=None, container_settings=None):
         volumes.append(bind_path)
 
         # Make the Python command to run within the container
-        slicer_command = ['slicer',
+        slicer_command = ['python', '/DOMINO/src/runner_slice.py',
                           '--network_file', network_file,
                           '--output_file', mapped_slices_file]
 
-        container_suffix = "domino"
+        container_suffix = "domino:latest"
         try:
             run_container_and_log('slicer',
                                 container_suffix,
@@ -131,8 +131,7 @@ def run(inputs, output_file, args=None, container_settings=None):
                 raise err
 
         # Make the Python command to run within the container
-        # Let visualization be always true, parallelization be always 1 thread, and use_cache be always false.
-        domino_command = ['domino',
+        domino_command = ['python', '/DOMINO/src/runner.py',
                           '--active_genes_files', node_file,
                           '--network_file', network_file,
                           '--slices_file', mapped_slices_file,
@@ -183,7 +182,7 @@ def run(inputs, output_file, args=None, container_settings=None):
         # Clean up DOMINO intermediate and pickle files
         slices_file.unlink(missing_ok=True)
         Path(out_dir, 'network.slices.pkl').unlink(missing_ok=True)
-        Path(str(inputs['network']) + '.pkl').unlink(missing_ok=True)
+        Path(str(inputs['network.sif']) + '.pkl').unlink(missing_ok=True)
 
     @staticmethod
     def parse_output(raw_pathway_file, standardized_pathway_file, params):
@@ -242,7 +241,8 @@ def parse_output(raw_pathway_file, standardized_pathway_file, params):
 
 def pre_domino_id_transform(node_id):
     """
-    DOMINO requires module edges to have the 'ENSG0' string as a prefix for visualization.
+    DOMINO requires module edges to have the 'ENSG0' string (Ensembl format)
+    as a prefix for visualization.
     Prepend each node id with this ID_PREFIX.
     @param node_id: the node id to transform
     @return the node id with the prefix added

diff --git a/spras/util.py b/spras/util.py
@@ -131,3 +131,13 @@ def duplicate_edges(df: pd.DataFrame) -> tuple[pd.DataFrame, bool]:
     unique_edges_df = df_sorted.drop_duplicates(subset=["Node1", "Node2", "Direction"], keep="first", ignore_index=True)
 
     return unique_edges_df, not unique_edges_df.equals(df)
+
+# https://stackoverflow.com/a/49689414/7589775
+def extend_filename(file_name: str, extension=".txt") -> str:
+    """
+    Appends `extension` to `file_name` when it has no extension of its own; otherwise returns `file_name` unchanged.
+    """
+    root, ext = os.path.splitext(file_name)
+    if not ext:
+        ext = extension
+    return f'{root}{ext}'