HelikarLab
diff --git a/‎.github/workflows/unit_tests.yml
+1-1 b/‎.github/workflows/unit_tests.yml
+1-1
diff --git a/‎Dockerfile
+5-6 b/‎Dockerfile
+5-6
diff --git a/‎environment.yaml
+11-11 b/‎environment.yaml
+11-11
diff --git a/‎main/COMO.ipynb
+141-427 b/‎main/COMO.ipynb
+141-427
diff --git a/‎main/como/__init__.py
+1 b/‎main/como/__init__.py
+1
diff --git a/‎main/como/cluster_rnaseq.py
+271 b/‎main/como/cluster_rnaseq.py
+271
diff --git a/‎main/src/cluster_sources.py ‎main/como/cluster_sources.py b/‎main/src/cluster_sources.py ‎main/como/cluster_sources.py
diff --git a/‎main/como/combine_distributions.py
+19 b/‎main/como/combine_distributions.py
+19
diff --git a/‎main/src/como_utilities.py ‎main/como/como_utilities.py b/‎main/src/como_utilities.py ‎main/como/como_utilities.py
@@ -35,6 +35,6 @@ jobs:
             python=${{ matrix.python-version }}
 
       - name: Run tests
-        run: python -m pytest
+        run: "cd main && python -m pytest"
         shell: micromamba-shell {0}
 
@@ -12,14 +12,13 @@ RUN sed -i '/^python/d' /opt/conda/conda-meta/pinned && \
     echo "c.ServerApp.root_dir = '${HOME}/main'" >> "${HOME}/.jupyter/jupyter_notebook_config.py" && \
     echo "c.ServerApp.token = ''" >> "${HOME}/.jupyter/jupyter_notebook_config.py" && \
     echo "c.ServerApp.password = ''" >> "${HOME}/.jupyter/jupyter_notebook_config.py" && \
-    conda config --quiet --add channels conda-forge && \
-    conda config --quiet --add channels bioconda && \
-    conda config --quiet --add channels r && \
-    rm -rf "${HOME}/main/tests"  # Remove tests, they are not required for running COMO
+    mamba config --quiet --add channels conda-forge && \
+    mamba config --quiet --add channels bioconda && \
+    mamba config --quiet --add channels r && \
+    rm -rf "${HOME}/main/tests"  # Tests not required for running COMO
 
 # Update base environment
-RUN ls "${HOME}" && \
-    mamba env update --name=base --file="${HOME}/environment.yaml" && \
+RUN mamba env update --name=base --file="${HOME}/environment.yaml" && \
     R -e "devtools::install_github('babessell1/zFPKM')" && \
     pip cache purge && \
     mamba clean --all --yes
 
@@ -9,14 +9,14 @@ channels:
 dependencies:
   - bioconda::bioconductor-affyio~=1.64.0
   - bioconda::bioconductor-affy~=1.72.0
-  - bioconda::bioconductor-agilp
-  - bioconda::bioconductor-biomart
-  - bioconda::bioconductor-deseq2
-  - bioconda::bioconductor-edger
-  - bioconda::bioconductor-genefilter
-  - bioconda::bioconductor-genomeinfodbdata  # ~=1.2.11  # Required or else hgu133acdf fails to install
+  - bioconda::bioconductor-agilp~=3.26.0
+  - bioconda::bioconductor-biomart~=2.50.0
+  - bioconda::bioconductor-deseq2~=1.34.0
+  - bioconda::bioconductor-edger~=3.36.0
+  - bioconda::bioconductor-genefilter~=1.76.0
+  # - bioconda::bioconductor-genomeinfodbdata~=1.2.11  # Required or else hgu133acdf fails to install
   - bioconda::bioconductor-hgu133acdf~=2.18.0
-  - bioconda::bioconductor-limma
+  # - bioconda::bioconductor-limma~=3.50.1
   - bioconda::crux-toolkit~=4.1
   - bioconda::thermorawfileparser~=1.4.0
   - conda-forge::aioftp~=0.21.2
@@ -26,7 +26,7 @@ dependencies:
   - conda-forge::git~=2.37.0  # Required for pip-related dependencies
   - conda-forge::jupyterlab~=4.0.0
   - conda-forge::lxml~=4.9.1
-  - conda-forge::numpy
+  - conda-forge::numpy~=1.23.0
   - conda-forge::openpyxl~=3.0.10
   # - conda-forge::optlang~=1.5.2
   - conda-forge::pandas<=3.0.0
@@ -45,8 +45,8 @@ dependencies:
   - conda-forge::r-repr~=1.1.4
   - conda-forge::r-rzmq~=0.9.8
   - conda-forge::r-sjmisc~=2.8.9
-  - conda-forge::r-stringr
-  - conda-forge::r-tidyverse
+  - conda-forge::r-stringr~=1.4.0
+  - conda-forge::r-tidyverse~=1.3.1
   - conda-forge::r-uwot~=0.1.11
   - conda-forge::r-zoo~=1.8_10
   # - conda-forge::requests~=2.28.1
@@ -60,10 +60,10 @@ dependencies:
   # - conda-forge::xlrd~=2.0.1
   - gurobi::gurobi
   - pip:
+      - git+https://github.com/JoshLoecker/fast_bioservices
       # - escher==1.7.3
       - git+https://github.com/JoshLoecker/escher.git@python38#subdirectory=py
       - framed==0.5.*
       - memote<=1.0
       - git+https://github.com/JoshLoecker/cobamp.git
       - git+https://github.com/JoshLoecker/troppo.git
-      - git+https://github.com/JoshLoecker/fast_bioservices.git
@@ -0,0 +1 @@
+from .como_utilities import stringlist_to_list
@@ -0,0 +1,271 @@
+import argparse
+from pathlib import Path
+
+import numpy as np
+import rpy2_api
+from como_utilities import stringlist_to_list
+from project import Config
+
+# Project configuration and the location of the R clustering script that main() hands to rpy2_api.Rpy2
+configs = Config()
+r_file_path = Path(configs.code_dir, "rscripts", "cluster_samples.R")
+
+
+def _default_or_number(value, cast, message, minimum=None, maximum=None):
+    """Validate an option that is either the string 'default' or a number.
+
+    :param value: The raw option value; a string from the command line or a numeric argparse default.
+    :param cast: Callable (``int`` or ``float``) used to convert a non-'default' string.
+    :param message: Text of the ``ValueError`` raised when conversion or the range check fails.
+    :param minimum: Inclusive lower bound, or ``None`` for no lower bound.
+    :param maximum: Inclusive upper bound, or ``None`` for no upper bound.
+    :return: ``value`` unchanged when it spells 'default' (case-insensitive), otherwise the validated number.
+    :raises ValueError: If the string cannot be converted or the number is outside the bounds.
+    """
+    if isinstance(value, str):
+        if value.lower() == "default":
+            return value
+        try:
+            value = cast(value)
+        except ValueError as err:
+            raise ValueError(message) from err
+
+    # Written as "not >=" / "not <=" so that NaN is rejected as well
+    if minimum is not None and not value >= minimum:
+        raise ValueError(message)
+    if maximum is not None and not value <= maximum:
+        raise ValueError(message)
+    return value
+
+
+def main() -> None:
+    """
+    Cluster RNA-seq Data
+
+    Parse and validate the command-line options, then forward them to the ``cluster_samples_main``
+    function of the R script at ``r_file_path`` through ``rpy2_api.Rpy2``.
+
+    :raises ValueError: If an option is neither 'default' nor a valid number where one is required,
+        is outside its allowed range, or names an unknown technique / clustering algorithm.
+    """
+
+    parser = argparse.ArgumentParser(
+        prog="cluster_rnaseq.py",
+        description="Cluster RNA-seq Data using Multiple Correspondence Analysis or UMAP. Clusters at the replicate, "
+        "batch/study, and context levels.",
+        epilog="For additional help, please post questions/issues in the MADRID GitHub repo at "
+        "https://github.com/HelikarLab/MADRID or email [email protected]",
+    )
+    parser.add_argument(
+        "-n",
+        "--context-names",
+        type=str,
+        required=True,
+        dest="context_names",
+        help="""Tissue/cell name of models to generate. If making multiple models in a batch, then
+                             use the format: \"['context1', 'context2', ... etc]\". Note the outer double-quotes and the 
+                             inner single-quotes are required to be interpreted. This a string, not a python list""",
+    )
+    parser.add_argument(
+        "-t",
+        "--filt-technique",
+        type=str,
+        required=True,
+        dest="technique",
+        help="'zfpkm', 'quantile', or 'cpm'",
+    )
+    parser.add_argument(
+        "-a",
+        "--cluster-algorithm",
+        type=str,
+        required=False,
+        default="umap",
+        dest="clust_algo",
+        help="""Clustering algorithm to use. 'mca' or 'umap'.""",
+    )
+    # NOTE(review): with type=str any value given on the command line (including "False") arrives as a
+    # string, while the default is the boolean True - confirm how the R script interprets both.
+    parser.add_argument(
+        "-l",
+        "--label",
+        type=str,
+        required=False,
+        default=True,
+        dest="label",
+        help="""True to label replicate/batch/context names on the plots. May be ugly for large sets""",
+    )
+    parser.add_argument(
+        "-d",
+        "--min-dist",
+        type=float,
+        required=False,
+        default=0.01,
+        dest="min_dist",
+        help="""Minimum distance for UMAP clustering. Must be between 0 and 1""",
+    )
+    # The options below accept either the word 'default' or a number, so argparse keeps them as
+    # strings ("str or float" always evaluated to plain str); conversion happens after parsing.
+    parser.add_argument(
+        "-r",
+        "--replicate-ratio",
+        type=str,
+        required=False,
+        default=0.9,
+        dest="rep_ratio",
+        help="""Ratio of genes active in replicates for a batch/study to be active""",
+    )
+    parser.add_argument(
+        "-b",
+        "--batch-ratio",
+        type=str,
+        required=False,
+        default=0.9,
+        dest="batch_ratio",
+        help="""Ratio of genes active in a batch/study to be active in the context""",
+    )
+    parser.add_argument(
+        "-nr",
+        "--n-neighbors-rep",
+        type=str,
+        required=False,
+        default="default",
+        dest="n_neigh_rep",
+        help="""N nearest neighbors for replicate clustering, 'default' is total number of replicates""",
+    )
+    parser.add_argument(
+        "-nb",
+        "--n-neighbors-batch",
+        type=str,
+        required=False,
+        default="default",
+        dest="n_neigh_batch",
+        help="""N nearest neighbors for batch clustering, 'default' is total number of batches""",
+    )
+    parser.add_argument(
+        "-nc",
+        "--n-neighbors-context",
+        type=str,
+        required=False,
+        default="default",
+        dest="n_neigh_cont",
+        help="""N nearest neighbors for context clustering, 'default' is total number of contexts""",
+    )
+    parser.add_argument(
+        "-c",
+        "--min-count",
+        type=str,
+        required=False,
+        default="default",
+        dest="min_count",
+        help="""Minimum count. Must be either 'default' or an integer""",
+    )
+    parser.add_argument(
+        "-q",
+        "--quantile",
+        type=str,
+        required=False,
+        default=0.5,
+        dest="quantile",
+        help="""Quantile cutoff. Must be either 'default' or an integer between 0 and 100""",
+    )
+    parser.add_argument(
+        "-s",
+        "--seed",
+        type=int,
+        required=False,
+        default=-1,
+        dest="seed",
+        help="""Random seed for clustering algorithm initialization""",
+    )
+    args = parser.parse_args()
+
+    context_names = stringlist_to_list(args.context_names)
+    technique = args.technique.lower()
+    clust_algo = args.clust_algo.lower()
+    label = args.label
+    min_dist = args.min_dist
+
+    # Set a random seed if none provided (-1 is the "not provided" sentinel)
+    seed = np.random.randint(0, 100000) if args.seed == -1 else args.seed
+
+    # Previously the 'default' test was inverted here, so the default value itself always raised
+    min_count = _default_or_number(
+        args.min_count,
+        int,
+        "--min-count must be either 'default' or an integer > 0",
+        minimum=0,
+    )
+    # Range checks used to be written as "0 > x > hi", which can never be true
+    quantile = _default_or_number(
+        args.quantile,
+        int,
+        "--quantile must be either 'default' or an integer between 0 and 100",
+        minimum=0,
+        maximum=100,
+    )
+    rep_ratio = _default_or_number(
+        args.rep_ratio,
+        float,
+        "--rep-ratio must be 'default' or a float between 0 and 1",
+        minimum=0,
+        maximum=1.0,
+    )
+    batch_ratio = _default_or_number(
+        args.batch_ratio,
+        float,
+        "--batch-ratio must be 'default' or a float between 0 and 1",
+        minimum=0,
+        maximum=1.0,
+    )
+
+    if technique not in ["quantile", "tpm", "cpm", "zfpkm"]:
+        raise ValueError("--technique must be either 'quantile', 'tpm', 'cpm', 'zfpkm'")
+
+    # 'tpm' is accepted as an alias and handled as 'quantile'
+    if technique == "tpm":
+        technique = "quantile"
+
+    if clust_algo not in ["mca", "umap"]:
+        raise ValueError("--clust_algo must be either 'mca', 'umap'")
+
+    # argparse already converted --min-dist to float
+    if not 0 <= min_dist <= 1.0:
+        raise ValueError("--min_dist must be a float between 0 and 1")
+
+    n_neigh_rep = _default_or_number(
+        args.n_neigh_rep,
+        int,
+        "--n_neigh_rep must be either 'default' or an integer greater than 1 and less than or equal to "
+        "the total number of replicates being clustered across all contexts.",
+        minimum=2,
+    )
+    n_neigh_batch = _default_or_number(
+        args.n_neigh_batch,
+        int,
+        "--n_neigh_batch must be either 'default' or an integer greater than 1 and less than or equal to "
+        "the total number of batches being clustered across all contexts.",
+        minimum=2,
+    )
+    n_neigh_cont = _default_or_number(
+        args.n_neigh_cont,
+        int,
+        "--n_neigh_context must be either 'default' or an integer greater than 1 and less than or equal to "
+        "the total number of contexts being clustered.",
+        minimum=2,
+    )
+
+    cluster_samples = rpy2_api.Rpy2(
+        r_file_path=r_file_path,
+        wd=configs.result_dir,
+        context_names=context_names,
+        technique=technique,
+        clust_algo=clust_algo,
+        label=label,
+        min_dist=min_dist,
+        n_neigh_rep=n_neigh_rep,
+        n_neigh_batch=n_neigh_batch,
+        n_neigh_cont=n_neigh_cont,
+        rep_ratio=rep_ratio,
+        batch_ratio=batch_ratio,
+        quantile=quantile,
+        min_count=min_count,
+        seed=seed,
+    )
+    cluster_samples.call_function("cluster_samples_main")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,19 @@
+from pathlib import Path
+from typing import Literal, Union
+
+
+def parse_contexts_zfpkm(wd: Union[str, Path], contexts: list[str], prep: Literal["mrna", "total"]) -> list[str]:
+    """Collect the names of the zFPKM matrix files found for the given contexts.
+
+    For every context, the directory ``<wd>/<context>/<prep>`` is searched for files matching
+    ``zFPKM_Matrix_<prep>_*.csv``.
+
+    :param wd: Directory that holds one sub-directory per context.
+    :param contexts: Context names to search, in order.
+    :param prep: Library preparation; used both as the sub-directory name and inside the file pattern.
+    :return: File stems (file names without the ``.csv`` suffix) of every match, grouped by context in
+        the order given. The order within one context is whatever ``Path.glob`` yields.
+    """
+    root = Path(wd)
+    pattern = f"zFPKM_Matrix_{prep}_*.csv"
+    return [
+        matrix_file.stem
+        for context in contexts
+        for matrix_file in (root / context / prep).glob(pattern)
+    ]
+
+
+if __name__ == "__main__":
+    # Ad-hoc manual check; the results directory below is an absolute path on one developer's machine
+    print(parse_contexts_zfpkm(wd="/Users/joshl/PycharmProjects/COMO/main/data/results", contexts=["naiveB"], prep="total"))
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+from .como_utilities import stringlist_to_list`