Skip to content

Commit 6ac1dca

Browse files
authored
Merge pull request #51 from metagenlab/dev
Update apptainer mounts and add art-args
2 parents 90e07ea + 3c83209 commit 6ac1dca

9 files changed

Lines changed: 133 additions & 68 deletions

File tree

mess/__main__.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -85,6 +85,7 @@
8585
"--frag-len",
8686
"--frag-sd",
8787
"--errfree",
88+
"--art-args",
8889
],
8990
},
9091
{

mess/util.py

Lines changed: 18 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -15,18 +15,27 @@
1515
)
1616

1717

18-
def get_fasta_dirs(config):
18+
def fasta_path(config):
1919
if os.path.isfile(config["args"]["input"]):
2020
files = [config["args"]["input"]]
2121
else:
2222
files = glob.glob(os.path.join(config["args"]["input"], "*.tsv"))
2323
df = pd.concat([pd.read_csv(file, sep="\t") for file in files])
2424
if "path" in df.columns:
25-
return set(os.path.abspath(os.path.dirname(p)) for p in df["path"])
25+
return os.path.commonpath(df["path"].to_list())
2626
else:
2727
return False
2828

2929

30+
def custom_taxonkit_dir(config):
31+
if config["args"]["taxonkit"] == os.path.join(os.getcwd(), ".taxonkit"):
32+
return False
33+
else:
34+
return os.path.dirname(
35+
os.path.realpath(os.path.join(config["args"]["taxonkit"], "names.dmp"))
36+
)
37+
38+
3039
def snake_base(rel_path):
3140
"""Get the filepath to a Snaketool system file (relative to __main__.py)"""
3241
return os.path.join(os.path.dirname(os.path.realpath(__file__)), rel_path)
@@ -124,9 +133,10 @@ def run_snakemake(
124133
os.path.join(os.path.dirname(os.path.realpath(__file__))),
125134
os.path.abspath(snake_config["args"]["output"]),
126135
]
127-
if get_fasta_dirs(snake_config):
128-
for path in get_fasta_dirs(snake_config):
129-
paths.append(path)
136+
if fasta_path(snake_config):
137+
paths.append(fasta_path(snake_config))
138+
if custom_taxonkit_dir(snake_config):
139+
paths.append(custom_taxonkit_dir(snake_config))
130140

131141
sdm_args = " ".join([f"-B {path}:{path}" for path in paths])
132142

@@ -344,6 +354,9 @@ def sim_options(func):
344354
type=str,
345355
default=None,
346356
),
357+
click.option(
358+
"--art-args", help="additional art_illumina args", type=str, default=""
359+
),
347360
click.option(
348361
"--errfree",
349362
help="Generate error free alignments with art_illumina",

mess/workflow/rules/preflight/functions.smk

Lines changed: 14 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -183,13 +183,22 @@ def get_value(value, wildcards):
183183

184184

185185
def get_asm_summary(wildcards):
186+
if (
187+
("seq_len" in tsv_df.columns)
188+
and ("seq_num" in tsv_df.columns)
189+
and ("path" in tsv_df.columns)
190+
):
191+
return os.path.join(dir.out.base, "replicates.tsv")
192+
if ("taxon" in tsv_df.columns) or ("accession" in tsv_df.columns):
193+
if PRIMERSEARCH:
194+
return [
195+
checkpoints.download_assemblies.get(**wildcards).output[0],
196+
os.path.join(dir.out.processing, "seqkit_stats.tsv"),
197+
]
198+
else:
199+
return checkpoints.download_assemblies.get(**wildcards).output[0]
186200
if PRIMERSEARCH or FASTA_DIR or FASTA_PATH:
187201
return os.path.join(dir.out.processing, "seqkit_stats.tsv")
188-
else:
189-
try:
190-
return checkpoints.download_assemblies.get(**wildcards).output[0]
191-
except AttributeError:
192-
return os.path.join(dir.out.processing, "seqkit_stats.tsv")
193202

194203

195204
tsv_cache = {}

mess/workflow/rules/processing/coverages.smk

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@ rule replicates_table:
1717

1818
checkpoint calculate_genome_coverages:
1919
input:
20-
df=os.path.join(dir.out.base, "replicates.tsv"),
20+
rep=os.path.join(dir.out.base, "replicates.tsv"),
2121
asm=get_asm_summary,
2222
output:
2323
os.path.join(dir.out.base, "coverages.tsv"),

mess/workflow/rules/processing/fastas.smk

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -129,7 +129,6 @@ checkpoint split_contigs:
129129
params:
130130
circular=CIRCULAR,
131131
rotate=ROTATE,
132-
amplicons=PRIMERSEARCH,
133132
read_len=MEAN_LEN,
134133
resources:
135134
mem_mb=config.resources.sml.mem,

mess/workflow/rules/processing/reads.smk

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -367,7 +367,11 @@ rule cat_fastqs:
367367
mem=str(config.resources.sml.mem) + "MB",
368368
time=config.resources.norm.time,
369369
message:
370-
"Concatenating {wildcards.sample} reads : {params.head} ... "
370+
(
371+
"Concatenating {wildcards.sample} R{wildcards.p} reads : {params.head} ... "
372+
if PAIRED
373+
else "Concatenating {wildcards.sample} reads : {params.head} ... "
374+
)
371375
shell:
372376
"""
373377
find {params.dir} -name "{params.name}" | sort | xargs cat > {output}

mess/workflow/rules/simulate/short_reads.smk

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,8 @@ if BAM or TAX:
2323
if ERRFREE:
2424
art_args += "-ef "
2525

26+
art_args += f"{config.args.art_args}"
27+
2628

2729
fq_prefix = os.path.join(dir.out.short, "{sample}", "{fasta}", "{contig}")
2830
if CIRCULAR:

mess/workflow/scripts/calculate_cov.py

Lines changed: 90 additions & 52 deletions
Original file line number | Diff line number | Diff line change
@@ -48,143 +48,170 @@ def strip_fasta_ext(filename):
4848
Main
4949
"""
5050
# Set seed for distributions
51-
np.random.seed(snakemake.params.seed)
51+
np.random.seed(snakemake.params["seed"])
5252

5353
# Set seed for read simulators
54-
random.seed(snakemake.params.seed)
54+
random.seed(snakemake.params["seed"])
5555

5656
# Pairing
57-
if snakemake.params.pairing:
57+
if snakemake.params["pairing"]:
5858
p = 2
5959
else:
6060
p = 1
6161

62+
entry_df = pd.read_csv(snakemake.input["rep"], sep="\t")
6263

63-
# Get table with assembly genome sizes and their taxonomy
6464

65+
if ("fasta" not in entry_df.columns) and ("path" in entry_df.columns):
66+
entry_df["fasta"] = [
67+
strip_fasta_ext(os.path.basename(path)) for path in entry_df["path"]
68+
]
6569

66-
asm_df = pd.read_csv(snakemake.input.asm, sep="\t")
67-
entry_df = pd.read_csv(snakemake.input.df, sep="\t")
6870

69-
if snakemake.params.fa_dir:
71+
if snakemake.params["fa_dir"]:
7072
entry_df["fasta"] = entry_df["fasta"].apply(strip_fasta_ext)
7173
entry_df["path"] = [
72-
glob.glob(os.path.join(snakemake.params.fa_dir, f"{fa}*"))[0]
74+
glob.glob(os.path.join(snakemake.params["fa_dir"], f"{fa}*"))[0]
7375
for fa in entry_df["fasta"]
7476
]
75-
if snakemake.params.fa_path:
77+
if snakemake.params["fa_path"]:
7678
entry_df["fasta"] = [
7779
strip_fasta_ext(os.path.basename(path)) for path in entry_df["path"]
7880
]
7981

80-
if snakemake.params.fa_dir or snakemake.params.fa_path:
81-
asm_df = pd.read_csv(snakemake.input.asm, sep="\t")
82-
asm_df.rename(
82+
if isinstance(snakemake.input["asm"], list):
83+
summary_df = pd.read_csv(snakemake.input["asm"][0], sep="\t")
84+
summary_df["fasta"] = [
85+
strip_fasta_ext(os.path.basename(path)) for path in summary_df["path"]
86+
]
87+
stats_df = pd.read_csv(snakemake.input["asm"][1], sep="\t")
88+
stats_df.rename(
8389
columns={
8490
"file": "fasta",
85-
"sum_len": "total_sequence_length",
86-
"num_seqs": "number_of_contigs",
91+
"sum_len": "seq_len",
92+
"num_seqs": "seq_num",
8793
},
8894
inplace=True,
8995
)
90-
asm_df["fasta"] = asm_df["fasta"].apply(strip_fasta_ext)
96+
stats_df["fasta"] = stats_df["fasta"].apply(strip_fasta_ext)
97+
stats_df["fasta"] = stats_df["fasta"].str.replace(".amplicons", "")
98+
asm_df = pd.merge(stats_df, summary_df, on="fasta")
9199

92-
if "fasta" not in asm_df.columns:
93-
asm_df["fasta"] = [
94-
strip_fasta_ext(os.path.basename(path)) for path in asm_df["path"]
95-
]
96100

97-
if snakemake.params.amplicons:
98-
asm_df["fasta"] = asm_df["fasta"].str.replace(".amplicons", "")
101+
else:
102+
asm_df = pd.read_csv(snakemake.input["asm"], sep="\t")
103+
if "file" in asm_df.columns:
104+
asm_df.rename(
105+
columns={
106+
"file": "fasta",
107+
"sum_len": "seq_len",
108+
"num_seqs": "seq_num",
109+
},
110+
inplace=True,
111+
)
112+
asm_df["fasta"] = asm_df["fasta"].apply(strip_fasta_ext)
113+
114+
if ("fasta" not in asm_df.columns) and ("path" in asm_df.columns):
115+
asm_df["fasta"] = [
116+
strip_fasta_ext(os.path.basename(path)) for path in asm_df["path"]
117+
]
118+
if snakemake.params["amplicons"]:
119+
asm_df["fasta"] = asm_df["fasta"].str.replace(".amplicons", "")
120+
if (
121+
"total_sequence_length" in asm_df.columns
122+
and "number_of_contigs" in asm_df.columns
123+
):
124+
asm_df = asm_df.rename(
125+
columns={"total_sequence_length": "seq_len", "number_of_contigs": "seq_num"}
126+
)
127+
99128

100129
same_cols = list(np.intersect1d(entry_df.columns, asm_df.columns))
101130
df = pd.merge(entry_df, asm_df, how="left", on=same_cols)
102131

103132

104133
# Get total bases
105-
bases = parse_size(snakemake.params.bases)
134+
bases = parse_size(snakemake.params["bases"])
106135

107136

108137
if "tax_id" in df.columns:
109138
df["tax_id"] = df["tax_id"].astype(int)
110139
# Calculate proportion with dist
111-
if snakemake.params.dist == "even":
140+
if snakemake.params["dist"] == "even":
112141
df = get_even_dist(df)
113142
df["tax_abundance"] = df["proportion"] / df["count"]
114-
df["genome_bases"] = df["total_sequence_length"] * df["tax_abundance"]
143+
df["genome_bases"] = df["seq_len"] * df["tax_abundance"]
115144
df["sum_genome_bases"] = df.groupby("samplename")["genome_bases"].transform("sum")
116145
df["cov_obtained"] = bases / df["sum_genome_bases"]
117146
df["cov_sim"] = df["tax_abundance"] * df["cov_obtained"]
118147
df["sum_cov"] = df.groupby("samplename")["cov_sim"].transform("sum")
119-
df["bases"] = df["cov_sim"] * df["total_sequence_length"]
120-
df["reads"] = df["bases"] / snakemake.params.read_len
148+
df["bases"] = df["cov_sim"] * df["seq_len"]
149+
df["reads"] = df["bases"] / snakemake.params["read_len"]
121150
df["sum_bases"] = df.groupby("samplename")["bases"].transform("sum")
122151
df["seq_abundance"] = df["bases"] / df["sum_bases"]
123152

124153

125-
elif snakemake.params.dist == "lognormal":
126-
df = get_lognormal_dist(df, mu=snakemake.params.mu, sigma=snakemake.params.sigma)
154+
elif snakemake.params["dist"] == "lognormal":
155+
df = get_lognormal_dist(
156+
df, mu=snakemake.params["mu"], sigma=snakemake.params["sigma"]
157+
)
127158
df["bases"] = df["seq_abundance"] * bases
128-
df["reads"] = df["bases"] / snakemake.params.read_len
129-
df["cov_sim"] = df["bases"] / df["total_sequence_length"]
159+
df["reads"] = df["bases"] / snakemake.params["read_len"]
160+
df["cov_sim"] = df["bases"] / df["seq_len"]
130161
df["sum_cov"] = df.groupby("samplename")["cov_sim"].transform("sum")
131162
df["tax_abundance"] = df["cov_sim"] / df["sum_cov"]
132163
df["sum_bases"] = df.groupby("samplename")["bases"].transform("sum")
133164
df["seq_abundance"] = df["bases"] / df["sum_bases"]
134165
else:
135166
if "tax_abundance" in entry_df.columns:
136-
df["genome_bases"] = df["total_sequence_length"] * df["tax_abundance"]
167+
df["genome_bases"] = df["seq_len"] * df["tax_abundance"]
137168
df["sum_genome_bases"] = df.groupby("samplename")["genome_bases"].transform(
138169
"sum"
139170
)
140171
df["cov_obtained"] = bases / df["sum_genome_bases"]
141172
df["cov_sim"] = df["tax_abundance"] * df["cov_obtained"]
142173
df["sum_cov"] = df.groupby("samplename")["cov_sim"].transform("sum")
143-
df["bases"] = df["cov_sim"] * df["total_sequence_length"]
144-
df["reads"] = df["bases"] / snakemake.params.read_len
174+
df["bases"] = df["cov_sim"] * df["seq_len"]
175+
df["reads"] = df["bases"] / snakemake.params["read_len"]
145176
df["sum_bases"] = df.groupby("samplename")["bases"].transform("sum")
146177
df["seq_abundance"] = df["bases"] / df["sum_bases"]
147178

148179
if "seq_abundance" in entry_df.columns:
149180
df["bases"] = df["seq_abundance"] * bases
150-
df["reads"] = df["bases"] / snakemake.params.read_len
151-
df["cov_sim"] = df["bases"] / df["total_sequence_length"]
181+
df["reads"] = df["bases"] / snakemake.params["read_len"]
182+
df["cov_sim"] = df["bases"] / df["seq_len"]
152183
df["sum_cov"] = df.groupby("samplename")["cov_sim"].transform("sum")
153184
df["tax_abundance"] = df["cov_sim"] / df["sum_cov"]
154185

155186
if "reads" in entry_df.columns:
156-
df["bases"] = df["reads"] * snakemake.params.read_len * p
187+
df["bases"] = df["reads"] * snakemake.params["read_len"] * p
157188
df["sum_bases"] = df.groupby("samplename")["bases"].transform("sum")
158189
df["seq_abundance"] = df["bases"] / df["sum_bases"]
159-
df["cov_sim"] = df["bases"] / df["total_sequence_length"]
190+
df["cov_sim"] = df["bases"] / df["seq_len"]
160191
df["sum_cov"] = df.groupby("samplename")["cov_sim"].transform("sum")
161192
df["tax_abundance"] = df["cov_sim"] / df["sum_cov"]
162193

163194
if "bases" in entry_df.columns:
164-
df["reads"] = df["bases"] / snakemake.params.read_len
195+
df["reads"] = df["bases"] / snakemake.params["read_len"]
165196
df["sum_bases"] = df.groupby("samplename")["bases"].transform("sum")
166197
df["seq_abundance"] = df["bases"] / df["sum_bases"]
167-
df["cov_sim"] = df["bases"] / df["total_sequence_length"]
198+
df["cov_sim"] = df["bases"] / df["seq_len"]
168199
df["sum_cov"] = df.groupby("samplename")["cov_sim"].transform("sum")
169200
df["tax_abundance"] = df["cov_sim"] / df["sum_cov"]
170201

171202
elif "cov_sim" in entry_df.columns:
172203
df["sum_cov"] = df.groupby("samplename")["cov_sim"].transform("sum")
173204
df["tax_abundance"] = df["cov_sim"] / df["sum_cov"]
174-
df["bases"] = df["cov_sim"] * df["total_sequence_length"]
205+
df["bases"] = df["cov_sim"] * df["seq_len"]
175206
df["sum_bases"] = df.groupby("samplename")["bases"].transform("sum")
176-
df["reads"] = df["bases"] / snakemake.params.read_len
207+
df["reads"] = df["bases"] / snakemake.params["read_len"]
177208
df["seq_abundance"] = df["bases"] / df["sum_bases"]
178209

179210

180211
df["seed"] = random.sample(range(1, 1000000), len(df))
181-
df = df.rename(
182-
columns={"total_sequence_length": "seq_len", "number_of_contigs": "seq_num"}
183-
)
184212
cols = [
185213
"samplename",
186214
"fasta",
187-
"path",
188215
"seq_len",
189216
"seq_num",
190217
"reads",
@@ -200,10 +227,21 @@ def strip_fasta_ext(filename):
200227
cols.append("tax_id")
201228

202229
# replace values with 0 for empty amplicon fastas
203-
df.loc[
204-
df["seq_len"] == 0,
205-
["seq_num", "reads", "bases", "cov_sim", "tax_abundance", "seq_abundance", "seed"],
206-
] = 0
207-
df[cols].replace(0, np.nan).convert_dtypes().to_csv(
208-
snakemake.output[0], sep="\t", index=False
209-
)
230+
231+
if (df["seq_len"] == 0).any():
232+
df.loc[
233+
df["seq_len"] == 0,
234+
[
235+
"seq_num",
236+
"reads",
237+
"bases",
238+
"cov_sim",
239+
"tax_abundance",
240+
"seq_abundance",
241+
"seed",
242+
],
243+
] = 0
244+
df = df[cols].replace(0, np.nan)
245+
df[cols].sort_values(
246+
["samplename", "fasta", "cov_sim"], ascending=[True, True, False]
247+
).convert_dtypes().to_csv(snakemake.output[0], sep="\t", index=False)

mess/workflow/scripts/split_contigs.py

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -31,13 +31,12 @@ def split_fasta(fa, outdir, suffix):
3131
os.mkdir(snakemake.output.dir)
3232
id2fa = []
3333
suffix = ".fasta"
34-
if snakemake.params.amplicons:
35-
suffix = ".amplicons.fasta"
3634
for fa in snakemake.input.fa:
35+
if ".amplicons" in fa:
36+
suffix = ".amplicons"
3737
id2fa.append(split_fasta(fa, snakemake.output.dir, suffix))
3838
id2fa = list(chain.from_iterable(id2fa))
3939
contig_df = pd.DataFrame.from_records(id2fa)
40-
4140
df = pd.merge(contig_df, cov_df, how="left", on="fasta")
4241

4342
cols = [

0 commit comments

Comments
 (0)