@@ -48,143 +48,170 @@ def strip_fasta_ext(filename):
4848Main
4949"""
5050# Set seed for distributions
51- np .random .seed (snakemake .params . seed )
51+ np .random .seed (snakemake .params [ " seed" ] )
5252
5353# Set seed for read simulators
54- random .seed (snakemake .params . seed )
54+ random .seed (snakemake .params [ " seed" ] )
5555
5656# Pairing
57- if snakemake .params . pairing :
57+ if snakemake .params [ " pairing" ] :
5858 p = 2
5959else :
6060 p = 1
6161
62+ entry_df = pd .read_csv (snakemake .input ["rep" ], sep = "\t " )
6263
63- # Get table with assembly genome sizes and their taxonomy
6464
65+ if ("fasta" not in entry_df .columns ) and ("path" in entry_df .columns ):
66+ entry_df ["fasta" ] = [
67+ strip_fasta_ext (os .path .basename (path )) for path in entry_df ["path" ]
68+ ]
6569
66- asm_df = pd .read_csv (snakemake .input .asm , sep = "\t " )
67- entry_df = pd .read_csv (snakemake .input .df , sep = "\t " )
6870
69- if snakemake .params . fa_dir :
71+ if snakemake .params [ " fa_dir" ] :
7072 entry_df ["fasta" ] = entry_df ["fasta" ].apply (strip_fasta_ext )
7173 entry_df ["path" ] = [
72- glob .glob (os .path .join (snakemake .params . fa_dir , f"{ fa } *" ))[0 ]
74+ glob .glob (os .path .join (snakemake .params [ " fa_dir" ] , f"{ fa } *" ))[0 ]
7375 for fa in entry_df ["fasta" ]
7476 ]
75- if snakemake .params . fa_path :
77+ if snakemake .params [ " fa_path" ] :
7678 entry_df ["fasta" ] = [
7779 strip_fasta_ext (os .path .basename (path )) for path in entry_df ["path" ]
7880 ]
7981
80- if snakemake .params .fa_dir or snakemake .params .fa_path :
81- asm_df = pd .read_csv (snakemake .input .asm , sep = "\t " )
82- asm_df .rename (
82+ if isinstance (snakemake .input ["asm" ], list ):
83+ summary_df = pd .read_csv (snakemake .input ["asm" ][0 ], sep = "\t " )
84+ summary_df ["fasta" ] = [
85+ strip_fasta_ext (os .path .basename (path )) for path in summary_df ["path" ]
86+ ]
87+ stats_df = pd .read_csv (snakemake .input ["asm" ][1 ], sep = "\t " )
88+ stats_df .rename (
8389 columns = {
8490 "file" : "fasta" ,
85- "sum_len" : "total_sequence_length " ,
86- "num_seqs" : "number_of_contigs " ,
91+ "sum_len" : "seq_len " ,
92+ "num_seqs" : "seq_num " ,
8793 },
8894 inplace = True ,
8995 )
90- asm_df ["fasta" ] = asm_df ["fasta" ].apply (strip_fasta_ext )
96+ stats_df ["fasta" ] = stats_df ["fasta" ].apply (strip_fasta_ext )
97+ stats_df ["fasta" ] = stats_df ["fasta" ].str .replace (".amplicons" , "" )
98+ asm_df = pd .merge (stats_df , summary_df , on = "fasta" )
9199
92- if "fasta" not in asm_df .columns :
93- asm_df ["fasta" ] = [
94- strip_fasta_ext (os .path .basename (path )) for path in asm_df ["path" ]
95- ]
96100
97- if snakemake .params .amplicons :
98- asm_df ["fasta" ] = asm_df ["fasta" ].str .replace (".amplicons" , "" )
101+ else :
102+ asm_df = pd .read_csv (snakemake .input ["asm" ], sep = "\t " )
103+ if "file" in asm_df .columns :
104+ asm_df .rename (
105+ columns = {
106+ "file" : "fasta" ,
107+ "sum_len" : "seq_len" ,
108+ "num_seqs" : "seq_num" ,
109+ },
110+ inplace = True ,
111+ )
112+ asm_df ["fasta" ] = asm_df ["fasta" ].apply (strip_fasta_ext )
113+
114+ if ("fasta" not in asm_df .columns ) and ("path" in asm_df .columns ):
115+ asm_df ["fasta" ] = [
116+ strip_fasta_ext (os .path .basename (path )) for path in asm_df ["path" ]
117+ ]
118+ if snakemake .params ["amplicons" ]:
119+ asm_df ["fasta" ] = asm_df ["fasta" ].str .replace (".amplicons" , "" )
120+ if (
121+ "total_sequence_length" in asm_df .columns
122+ and "number_of_contigs" in asm_df .columns
123+ ):
124+ asm_df = asm_df .rename (
125+ columns = {"total_sequence_length" : "seq_len" , "number_of_contigs" : "seq_num" }
126+ )
127+
99128
100129same_cols = list (np .intersect1d (entry_df .columns , asm_df .columns ))
101130df = pd .merge (entry_df , asm_df , how = "left" , on = same_cols )
102131
103132
104133# Get total bases
105- bases = parse_size (snakemake .params . bases )
134+ bases = parse_size (snakemake .params [ " bases" ] )
106135
107136
108137if "tax_id" in df .columns :
109138 df ["tax_id" ] = df ["tax_id" ].astype (int )
110139# Calculate prportion with dist
111- if snakemake .params . dist == "even" :
140+ if snakemake .params [ " dist" ] == "even" :
112141 df = get_even_dist (df )
113142 df ["tax_abundance" ] = df ["proportion" ] / df ["count" ]
114- df ["genome_bases" ] = df ["total_sequence_length " ] * df ["tax_abundance" ]
143+ df ["genome_bases" ] = df ["seq_len " ] * df ["tax_abundance" ]
115144 df ["sum_genome_bases" ] = df .groupby ("samplename" )["genome_bases" ].transform ("sum" )
116145 df ["cov_obtained" ] = bases / df ["sum_genome_bases" ]
117146 df ["cov_sim" ] = df ["tax_abundance" ] * df ["cov_obtained" ]
118147 df ["sum_cov" ] = df .groupby ("samplename" )["cov_sim" ].transform ("sum" )
119- df ["bases" ] = df ["cov_sim" ] * df ["total_sequence_length " ]
120- df ["reads" ] = df ["bases" ] / snakemake .params . read_len
148+ df ["bases" ] = df ["cov_sim" ] * df ["seq_len " ]
149+ df ["reads" ] = df ["bases" ] / snakemake .params [ " read_len" ]
121150 df ["sum_bases" ] = df .groupby ("samplename" )["bases" ].transform ("sum" )
122151 df ["seq_abundance" ] = df ["bases" ] / df ["sum_bases" ]
123152
124153
125- elif snakemake .params .dist == "lognormal" :
126- df = get_lognormal_dist (df , mu = snakemake .params .mu , sigma = snakemake .params .sigma )
154+ elif snakemake .params ["dist" ] == "lognormal" :
155+ df = get_lognormal_dist (
156+ df , mu = snakemake .params ["mu" ], sigma = snakemake .params ["sigma" ]
157+ )
127158 df ["bases" ] = df ["seq_abundance" ] * bases
128- df ["reads" ] = df ["bases" ] / snakemake .params . read_len
129- df ["cov_sim" ] = df ["bases" ] / df ["total_sequence_length " ]
159+ df ["reads" ] = df ["bases" ] / snakemake .params [ " read_len" ]
160+ df ["cov_sim" ] = df ["bases" ] / df ["seq_len " ]
130161 df ["sum_cov" ] = df .groupby ("samplename" )["cov_sim" ].transform ("sum" )
131162 df ["tax_abundance" ] = df ["cov_sim" ] / df ["sum_cov" ]
132163 df ["sum_bases" ] = df .groupby ("samplename" )["bases" ].transform ("sum" )
133164 df ["seq_abundance" ] = df ["bases" ] / df ["sum_bases" ]
134165else :
135166 if "tax_abundance" in entry_df .columns :
136- df ["genome_bases" ] = df ["total_sequence_length " ] * df ["tax_abundance" ]
167+ df ["genome_bases" ] = df ["seq_len " ] * df ["tax_abundance" ]
137168 df ["sum_genome_bases" ] = df .groupby ("samplename" )["genome_bases" ].transform (
138169 "sum"
139170 )
140171 df ["cov_obtained" ] = bases / df ["sum_genome_bases" ]
141172 df ["cov_sim" ] = df ["tax_abundance" ] * df ["cov_obtained" ]
142173 df ["sum_cov" ] = df .groupby ("samplename" )["cov_sim" ].transform ("sum" )
143- df ["bases" ] = df ["cov_sim" ] * df ["total_sequence_length " ]
144- df ["reads" ] = df ["bases" ] / snakemake .params . read_len
174+ df ["bases" ] = df ["cov_sim" ] * df ["seq_len " ]
175+ df ["reads" ] = df ["bases" ] / snakemake .params [ " read_len" ]
145176 df ["sum_bases" ] = df .groupby ("samplename" )["bases" ].transform ("sum" )
146177 df ["seq_abundance" ] = df ["bases" ] / df ["sum_bases" ]
147178
148179 if "seq_abundance" in entry_df .columns :
149180 df ["bases" ] = df ["seq_abundance" ] * bases
150- df ["reads" ] = df ["bases" ] / snakemake .params . read_len
151- df ["cov_sim" ] = df ["bases" ] / df ["total_sequence_length " ]
181+ df ["reads" ] = df ["bases" ] / snakemake .params [ " read_len" ]
182+ df ["cov_sim" ] = df ["bases" ] / df ["seq_len " ]
152183 df ["sum_cov" ] = df .groupby ("samplename" )["cov_sim" ].transform ("sum" )
153184 df ["tax_abundance" ] = df ["cov_sim" ] / df ["sum_cov" ]
154185
155186 if "reads" in entry_df .columns :
156- df ["bases" ] = df ["reads" ] * snakemake .params . read_len * p
187+ df ["bases" ] = df ["reads" ] * snakemake .params [ " read_len" ] * p
157188 df ["sum_bases" ] = df .groupby ("samplename" )["bases" ].transform ("sum" )
158189 df ["seq_abundance" ] = df ["bases" ] / df ["sum_bases" ]
159- df ["cov_sim" ] = df ["bases" ] / df ["total_sequence_length " ]
190+ df ["cov_sim" ] = df ["bases" ] / df ["seq_len " ]
160191 df ["sum_cov" ] = df .groupby ("samplename" )["cov_sim" ].transform ("sum" )
161192 df ["tax_abundance" ] = df ["cov_sim" ] / df ["sum_cov" ]
162193
163194 if "bases" in entry_df .columns :
164- df ["reads" ] = df ["bases" ] / snakemake .params . read_len
195+ df ["reads" ] = df ["bases" ] / snakemake .params [ " read_len" ]
165196 df ["sum_bases" ] = df .groupby ("samplename" )["bases" ].transform ("sum" )
166197 df ["seq_abundance" ] = df ["bases" ] / df ["sum_bases" ]
167- df ["cov_sim" ] = df ["bases" ] / df ["total_sequence_length " ]
198+ df ["cov_sim" ] = df ["bases" ] / df ["seq_len " ]
168199 df ["sum_cov" ] = df .groupby ("samplename" )["cov_sim" ].transform ("sum" )
169200 df ["tax_abundance" ] = df ["cov_sim" ] / df ["sum_cov" ]
170201
171202 elif "cov_sim" in entry_df .columns :
172203 df ["sum_cov" ] = df .groupby ("samplename" )["cov_sim" ].transform ("sum" )
173204 df ["tax_abundance" ] = df ["cov_sim" ] / df ["sum_cov" ]
174- df ["bases" ] = df ["cov_sim" ] * df ["total_sequence_length " ]
205+ df ["bases" ] = df ["cov_sim" ] * df ["seq_len " ]
175206 df ["sum_bases" ] = df .groupby ("samplename" )["bases" ].transform ("sum" )
176- df ["reads" ] = df ["bases" ] / snakemake .params . read_len
207+ df ["reads" ] = df ["bases" ] / snakemake .params [ " read_len" ]
177208 df ["seq_abundance" ] = df ["bases" ] / df ["sum_bases" ]
178209
179210
180211df ["seed" ] = random .sample (range (1 , 1000000 ), len (df ))
181- df = df .rename (
182- columns = {"total_sequence_length" : "seq_len" , "number_of_contigs" : "seq_num" }
183- )
184212cols = [
185213 "samplename" ,
186214 "fasta" ,
187- "path" ,
188215 "seq_len" ,
189216 "seq_num" ,
190217 "reads" ,
@@ -200,10 +227,21 @@ def strip_fasta_ext(filename):
200227 cols .append ("tax_id" )
201228
202229# replace values with 0 for empty amplicon fastas
203- df .loc [
204- df ["seq_len" ] == 0 ,
205- ["seq_num" , "reads" , "bases" , "cov_sim" , "tax_abundance" , "seq_abundance" , "seed" ],
206- ] = 0
207- df [cols ].replace (0 , np .nan ).convert_dtypes ().to_csv (
208- snakemake .output [0 ], sep = "\t " , index = False
209- )
230+
231+ if (df ["seq_len" ] == 0 ).any ():
232+ df .loc [
233+ df ["seq_len" ] == 0 ,
234+ [
235+ "seq_num" ,
236+ "reads" ,
237+ "bases" ,
238+ "cov_sim" ,
239+ "tax_abundance" ,
240+ "seq_abundance" ,
241+ "seed" ,
242+ ],
243+ ] = 0
244+ df = df [cols ].replace (0 , np .nan )
245+ df [cols ].sort_values (
246+ ["samplename" , "fasta" , "cov_sim" ], ascending = [True , True , False ]
247+ ).convert_dtypes ().to_csv (snakemake .output [0 ], sep = "\t " , index = False )
0 commit comments