Skip to content

Commit 8884922

Browse files
authored
Merge pull request #184 from HelikarLab/update-cc
Introduce package
2 parents ce2c833 + cc93c17 commit 8884922

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

60 files changed

+4682
-7231
lines changed

.github/workflows/unit_tests.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,6 @@ jobs:
3535
python=${{ matrix.python-version }}
3636
3737
- name: Run tests
38-
run: python -m pytest
38+
run: "cd main && python -m pytest"
3939
shell: micromamba-shell {0}
4040

Dockerfile

+5-6
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,13 @@ RUN sed -i '/^python/d' /opt/conda/conda-meta/pinned && \
1212
echo "c.ServerApp.root_dir = '${HOME}/main'" >> "${HOME}/.jupyter/jupyter_notebook_config.py" && \
1313
echo "c.ServerApp.token = ''" >> "${HOME}/.jupyter/jupyter_notebook_config.py" && \
1414
echo "c.ServerApp.password = ''" >> "${HOME}/.jupyter/jupyter_notebook_config.py" && \
15-
conda config --quiet --add channels conda-forge && \
16-
conda config --quiet --add channels bioconda && \
17-
conda config --quiet --add channels r && \
18-
rm -rf "${HOME}/main/tests" # Remove tests, they are not required for running COMO
15+
mamba config --quiet --add channels conda-forge && \
16+
mamba config --quiet --add channels bioconda && \
17+
mamba config --quiet --add channels r && \
18+
rm -rf "${HOME}/main/tests" # Tests not required for running COMO
1919

2020
# Update base environment
21-
RUN ls "${HOME}" && \
22-
mamba env update --name=base --file="${HOME}/environment.yaml" && \
21+
RUN mamba env update --name=base --file="${HOME}/environment.yaml" && \
2322
R -e "devtools::install_github('babessell1/zFPKM')" && \
2423
pip cache purge && \
2524
mamba clean --all --yes

environment.yaml

+11-11
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,14 @@ channels:
99
dependencies:
1010
- bioconda::bioconductor-affyio~=1.64.0
1111
- bioconda::bioconductor-affy~=1.72.0
12-
- bioconda::bioconductor-agilp
13-
- bioconda::bioconductor-biomart
14-
- bioconda::bioconductor-deseq2
15-
- bioconda::bioconductor-edger
16-
- bioconda::bioconductor-genefilter
17-
- bioconda::bioconductor-genomeinfodbdata # ~=1.2.11 # Required or else hgu133acdf fails to install
12+
- bioconda::bioconductor-agilp~=3.26.0
13+
- bioconda::bioconductor-biomart~=2.50.0
14+
- bioconda::bioconductor-deseq2~=1.34.0
15+
- bioconda::bioconductor-edger~=3.36.0
16+
- bioconda::bioconductor-genefilter~=1.76.0
17+
# - bioconda::bioconductor-genomeinfodbdata~=1.2.11 # Required or else hgu133acdf fails to install
1818
- bioconda::bioconductor-hgu133acdf~=2.18.0
19-
- bioconda::bioconductor-limma
19+
# - bioconda::bioconductor-limma~=3.50.1
2020
- bioconda::crux-toolkit~=4.1
2121
- bioconda::thermorawfileparser~=1.4.0
2222
- conda-forge::aioftp~=0.21.2
@@ -26,7 +26,7 @@ dependencies:
2626
- conda-forge::git~=2.37.0 # Required for pip-related dependencies
2727
- conda-forge::jupyterlab~=4.0.0
2828
- conda-forge::lxml~=4.9.1
29-
- conda-forge::numpy
29+
- conda-forge::numpy~=1.23.0
3030
- conda-forge::openpyxl~=3.0.10
3131
# - conda-forge::optlang~=1.5.2
3232
- conda-forge::pandas<=3.0.0
@@ -45,8 +45,8 @@ dependencies:
4545
- conda-forge::r-repr~=1.1.4
4646
- conda-forge::r-rzmq~=0.9.8
4747
- conda-forge::r-sjmisc~=2.8.9
48-
- conda-forge::r-stringr
49-
- conda-forge::r-tidyverse
48+
- conda-forge::r-stringr~=1.4.0
49+
- conda-forge::r-tidyverse~=1.3.1
5050
- conda-forge::r-uwot~=0.1.11
5151
- conda-forge::r-zoo~=1.8_10
5252
# - conda-forge::requests~=2.28.1
@@ -60,10 +60,10 @@ dependencies:
6060
# - conda-forge::xlrd~=2.0.1
6161
- gurobi::gurobi
6262
- pip:
63+
- git+https://github.com/JoshLoecker/fast_bioservices
6364
# - escher==1.7.3
6465
- git+https://github.com/JoshLoecker/escher.git@python38#subdirectory=py
6566
- framed==0.5.*
6667
- memote<=1.0
6768
- git+https://github.com/JoshLoecker/cobamp.git
6869
- git+https://github.com/JoshLoecker/troppo.git
69-
- git+https://github.com/JoshLoecker/fast_bioservices.git

main/COMO.ipynb

+141-427
Large diffs are not rendered by default.

main/como/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from .como_utilities import stringlist_to_list

main/como/cluster_rnaseq.py

+271
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,271 @@
1+
import argparse
2+
from pathlib import Path
3+
4+
import numpy as np
5+
import rpy2_api
6+
from como_utilities import stringlist_to_list
7+
from project import Config
8+
9+
# Load the project configuration and locate the R script (rscripts/cluster_samples.R) that main() hands off to
10+
configs = Config()
11+
r_file_path = Path(configs.code_dir, "rscripts", "cluster_samples.R")
12+
13+
14+
def _coerce_unless_default(value, cast, message):
    """Convert a command-line string with ``cast`` unless it is the word 'default'.

    Values that are not strings (the numeric argparse defaults) and the
    case-insensitive string 'default' are returned unchanged.

    :param value: Raw value taken from the parsed arguments.
    :param cast: Callable used for the conversion (``int`` or ``float``).
    :param message: Text of the ``ValueError`` raised when conversion fails.
    :raises ValueError: If ``value`` is a non-default string that ``cast`` rejects.
    """
    if not isinstance(value, str) or value.lower() == "default":
        return value
    try:
        return cast(value)
    except ValueError as err:
        raise ValueError(message) from err


def _require_between(value, low, high, message):
    """Raise ``ValueError(message)`` when a numeric ``value`` lies outside [low, high].

    Strings (the 'default' sentinel) are not range-checked.
    """
    if not isinstance(value, str) and not low <= value <= high:
        raise ValueError(message)


def _parse_n_neighbors(value, message):
    """Return ``value`` as an int >= 2, or unchanged when it is 'default'.

    :raises ValueError: If the value is not an integer or is smaller than 2.
    """
    value = _coerce_unless_default(value, int, message)
    if not isinstance(value, str) and value < 2:
        raise ValueError(message)
    return value


def main() -> None:
    """
    Cluster RNA-seq Data

    Parses the command line, validates every option, and then calls the R
    function ``cluster_samples_main`` from ``r_file_path`` through
    ``rpy2_api.Rpy2``, using ``configs.result_dir`` as the working directory.

    Options that accept the word 'default' are passed through as that string;
    any other value is converted to a number and range-checked first.

    :raises ValueError: If an option cannot be converted or is out of range.
    """

    parser = argparse.ArgumentParser(
        prog="cluster_rnaseq.py",
        description="Cluster RNA-seq Data using Multiple Correspondence Analysis or UMAP. Clusters at the replicate, "
        "batch/study, and context levels.",
        # NOTE(review): the contact address below looks redacted in this copy of the file - restore the real one.
        epilog="For additional help, please post questions/issues in the MADRID GitHub repo at "
        "https://github.com/HelikarLab/MADRID or email [email protected]",
    )
    parser.add_argument(
        "-n",
        "--context-names",
        type=str,
        required=True,
        dest="context_names",
        help="""Tissue/cell name of models to generate. If making multiple models in a batch, then
        use the format: \"['context1', 'context2', ... etc]\". Note the outer double-quotes and the
        inner single-quotes are required to be interpreted. This a string, not a python list""",
    )
    parser.add_argument(
        "-t",
        "--filt-technique",
        type=str,
        required=True,
        dest="technique",
        help="'zfpkm', 'quantile', or 'cpm'",
    )
    parser.add_argument(
        "-a",
        "--cluster-algorithm",
        type=str,
        required=False,
        default="umap",
        dest="clust_algo",
        help="""Clustering algorithm to use. 'mca' or 'umap'.""",
    )
    parser.add_argument(
        "-l",
        "--label",
        type=str,
        required=False,
        default=True,
        dest="label",
        help="""True to label replicate/batch/context names on the plots. May be ugly for large sets""",
    )
    parser.add_argument(
        "-d",
        "--min-dist",
        type=float,
        required=False,
        default=0.01,
        dest="min_dist",
        help="""Minimum distance for UMAP clustering. Must be between 0 and 1""",
    )
    parser.add_argument(
        "-r",
        "--replicate-ratio",
        type=str,
        required=False,
        default=0.9,
        dest="rep_ratio",
        help="""Ratio of genes active in replicates for a batch/study to be active""",
    )
    # The options below are read as strings so that the word 'default' is accepted;
    # they are converted to numbers after parsing. (The previous `type=str or float`
    # expressions evaluated to plain `str`, so this is the same parser behaviour.)
    parser.add_argument(
        "-b",
        "--batch-ratio",
        type=str,
        required=False,
        default=0.9,
        dest="batch_ratio",
        help="""Ratio of genes active in a batch/study to be active in the context""",
    )
    parser.add_argument(
        "-nr",
        "--n-neighbors-rep",
        type=str,
        required=False,
        default="default",
        dest="n_neigh_rep",
        help="""N nearest neighbors for replicate clustering, 'default' is total number of replicates""",
    )
    parser.add_argument(
        "-nb",
        "--n-neighbors-batch",
        type=str,
        required=False,
        default="default",
        dest="n_neigh_batch",
        help="""N nearest neighbors for batch clustering, 'default' is total number of batches""",
    )
    parser.add_argument(
        "-nc",
        "--n-neighbors-context",
        type=str,
        required=False,
        default="default",
        dest="n_neigh_cont",
        help="""N nearest neighbors for context clustering, 'default' is total number of contexts""",
    )
    # NOTE(review): the help text of --min-count and --quantile duplicates --batch-ratio; confirm and reword.
    parser.add_argument(
        "-c",
        "--min-count",
        type=str,
        required=False,
        default="default",
        dest="min_count",
        help="""Ratio of active genes in a batch/study to be active in the context""",
    )
    parser.add_argument(
        "-q",
        "--quantile",
        type=str,
        required=False,
        default=0.5,
        dest="quantile",
        help="""Ratio of active genes in a batch/study to be active in the context""",
    )
    parser.add_argument(
        "-s",
        "--seed",
        type=int,
        required=False,
        default=-1,
        dest="seed",
        help="""Random seed for clustering algorithm initialization""",
    )
    args = parser.parse_args()

    context_names = stringlist_to_list(args.context_names)
    technique = args.technique.lower()
    clust_algo = args.clust_algo.lower()
    label = args.label
    min_dist = args.min_dist

    # Set a random seed if none provided
    seed = np.random.randint(0, 100000) if int(args.seed) == -1 else args.seed

    # Convert only when the value is NOT 'default' (the old condition was inverted, so the
    # default always raised and real numbers were never converted).
    # NOTE(review): the message says "> 0" while the check allows 0; kept as in the original.
    min_count_msg = "--min-count must be either 'default' or an integer > 0"
    min_count = _coerce_unless_default(args.min_count, int, min_count_msg)
    if not isinstance(min_count, str) and min_count < 0:
        raise ValueError(min_count_msg)

    # Range checks use `not low <= x <= high`; the previous `0 > x > high` could never be true.
    quantile_msg = "--quantile must be either 'default' or an integer between 0 and 100"
    quantile = _coerce_unless_default(args.quantile, int, quantile_msg)
    _require_between(quantile, 0, 100, quantile_msg)

    rep_ratio_msg = "--rep-ratio must be 'default' or a float between 0 and 1"
    rep_ratio = _coerce_unless_default(args.rep_ratio, float, rep_ratio_msg)
    _require_between(rep_ratio, 0, 1.0, rep_ratio_msg)

    batch_ratio_msg = "--batch-ratio must be 'default' or a float between 0 and 1"
    batch_ratio = _coerce_unless_default(args.batch_ratio, float, batch_ratio_msg)
    _require_between(batch_ratio, 0, 1.0, batch_ratio_msg)

    if technique not in ["quantile", "tpm", "cpm", "zfpkm"]:
        raise ValueError("--technique must be either 'quantile', 'tpm', 'cpm', 'zfpkm'")

    # 'tpm' is handled as the 'quantile' technique
    if technique == "tpm":
        technique = "quantile"

    if clust_algo not in ["mca", "umap"]:
        raise ValueError("--clust_algo must be either 'mca', 'umap'")

    _require_between(min_dist, 0, 1.0, "--min_dist must be a float between 0 and 1")

    n_neigh_rep = _parse_n_neighbors(
        args.n_neigh_rep,
        "--n_neigh_rep must be either 'default' or an integer greater than 1 and less than or equal to "
        "the total number of replicates being clustered across all contexts.",
    )
    n_neigh_batch = _parse_n_neighbors(
        args.n_neigh_batch,
        "--n_neigh_batch must be either 'default' or an integer greater than 1 and less than or equal to "
        "the total number of batches being clustered across all contexts.",
    )
    # The conversion error for this option used to name --n_neigh_batch (copy-paste slip).
    n_neigh_cont = _parse_n_neighbors(
        args.n_neigh_cont,
        "--n_neigh_context must be either 'default' or an integer greater than 1 and less than or equal to "
        "the total number of contexts being clustered.",
    )

    cluster_samples = rpy2_api.Rpy2(
        r_file_path=r_file_path,
        wd=configs.result_dir,
        context_names=context_names,
        technique=technique,
        clust_algo=clust_algo,
        label=label,
        min_dist=min_dist,
        n_neigh_rep=n_neigh_rep,
        n_neigh_batch=n_neigh_batch,
        n_neigh_cont=n_neigh_cont,
        rep_ratio=rep_ratio,
        batch_ratio=batch_ratio,
        quantile=quantile,
        min_count=min_count,
        seed=seed,
    )
    cluster_samples.call_function("cluster_samples_main")


if __name__ == "__main__":
    main()
File renamed without changes.

main/como/combine_distributions.py

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
from pathlib import Path
2+
from typing import Literal, Union
3+
4+
5+
def parse_contexts_zfpkm(wd: Union[str, Path], contexts: list[str], prep: Literal["mrna", "total"]) -> list[str]:
    """Collect the names of the zFPKM matrix files found for each context.

    For every entry in ``contexts`` the directory ``<wd>/<context>/<prep>`` is
    searched for files matching ``zFPKM_Matrix_<prep>_*.csv``; the stem (file
    name without ``.csv``) of each match is recorded.

    :param wd: Directory that contains one sub-directory per context.
    :param contexts: Context names to search, in the order they should appear in the result.
    :param prep: Library preparation sub-directory and file-name component.
    :returns: File stems grouped by context in the given order and sorted by
        name within each context. ``Path.glob`` yields in arbitrary order, so
        sorting keeps the result reproducible. A context with no matching
        files (or no directory) contributes nothing.
    """
    root = Path(wd)

    batches: list[str] = []
    for context in contexts:
        matches = Path(root, context, prep).glob(f"zFPKM_Matrix_{prep}_*.csv")
        batches.extend(sorted(match.stem for match in matches))

    return batches


if __name__ == "__main__":
    # NOTE(review): developer-local smoke test; this absolute path only exists on the author's machine.
    result = parse_contexts_zfpkm(wd="/Users/joshl/PycharmProjects/COMO/main/data/results", contexts=["naiveB"], prep="total")
    print(result)
File renamed without changes.

0 commit comments

Comments
 (0)