-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathscore.py
More file actions
executable file
·333 lines (278 loc) · 10.6 KB
/
score.py
File metadata and controls
executable file
·333 lines (278 loc) · 10.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
#!/usr/bin/env python3
import os
import sys
import pandas as pd
import taxonomy as t
from multiprocessing import Pool
__version__ = "1.0.0-RC5"
def loadUniqueness(uniqueness_file):
uniq_info = {}
try:
with open(uniqueness_file, 'r', encoding='UTF-8') as f:
for line in f:
(tid, raw_len, sk, p, c, o, f, g, s, sn) = line.rstrip('\r\n').split('\t')
if not tid in uniq_info: uniq_info[tid]={}
uniq_info[tid]["raw"] = int(raw_len)
uniq_info[tid]["superkingdom"] = int(sk)
uniq_info[tid]["phylum"] = int(p) if p != '-' else p
uniq_info[tid]["class"] = int(c) if c != '-' else c
uniq_info[tid]["order"] = int(o) if o != '-' else o
uniq_info[tid]["family"] = int(f) if f != '-' else f
uniq_info[tid]["genus"] = int(g) if g != '-' else g
uniq_info[tid]["species"] = int(s) if s != '-' else s
uniq_info[tid]["strain"] = int(sn) if sn != '-' else sn
except IOError:
sys.stderr.write( "Failed to open uniqueness file: %s.\n" % uniqueness_file )
return uniq_info
def scoreStrainUniqueness(res, tid, uniq_meta, method="default"):
ranks = ['strain','species','genus','family','order','class','phylum','superkingdom']
ranks_priority = dict(zip(ranks, range(8)))
inlvl = res['LVL']
p = 1
u_len = 0
t_len = 0
if tid in uniq_meta:
t_len = uniq_meta[tid]['raw']
# add up score
tol_score = 0
#tid found in uniq metrics
p_of_this_rank=1
if tid in uniq_meta:
(mapped, nm, read_len, read_num, read_rnr, dcnr) = (0,0,0,0,0,0)
# interating all ranks from strain to superkingdom
for lcr_lvl in ranks:
# only procede when uniq info available in current rank
if uniq_meta[tid][lcr_lvl] != "-":
u_len = uniq_meta[tid][lcr_lvl]
u_pct = (t_len-u_len)/t_len
#RC5
p = 0.90*u_pct+0.05
# v2.4.9
# p = (u_pct+1)/2 #default
if ":" in method:
(low, hi) = method.split(':')
try:
low = float(low)
hi = float(hi)
if low > 1 and hi > 1:
low = low/100
hi = hi/100
p = (hi-low)*u_pct+low
except:
sys.stderr.write( f"[ERROR] {method} is an invalid normalization parameter for scoring function.\n")
sys.exit()
else:
if p>0.995: p=0.995
if p<0.005: p=0.005
# P(Gr) = Pr
# P(Xr|Gr) = Cr
# P(X) = 𝑃(𝑋_𝑟│𝐺_𝑟) ∙ 𝑃(𝐺_𝑟) + 𝑃(𝑋_𝑟│~𝐺_𝑟) ∙ 𝑃(~𝐺_𝑟)
# P(G|X) = ∑ ∝ P(Xr|Gr)*P(G) / P(X)
# = ∑ ∝ ( C*P / (2CP-P-C+1) )
# P(Gr) = Pr = p
# prior propability of the genome presence in the sample
if uniq_meta[tid][lcr_lvl] == "-": # the rank is more specific than target taxonomy
if lcr_lvl in res:
# add-up rank-specific info to target rank
mapped += res[lcr_lvl]["M"]
nm += res[lcr_lvl]["N"]
read_len += res[lcr_lvl]["T"]
read_num += res[lcr_lvl]["R"]
read_rnr += res[lcr_lvl]["Rnr"]
dcnr += res[lcr_lvl]["DCnr"]
else:
continue
else:
if lcr_lvl in res or mapped>0:
if lcr_lvl in res:
mapped += res[lcr_lvl]["M"]
nm += res[lcr_lvl]["N"]
read_len += res[lcr_lvl]["T"]
read_num += res[lcr_lvl]["R"]
read_rnr += res[lcr_lvl]["Rnr"]
dcnr += res[lcr_lvl]["DCnr"]
# skip this rank because the taxonomy is higher than this rank
if uniq_meta[tid][lcr_lvl] == "-": continue
# P(Xr|Gr) = Cr = c
# Given genome exists, the probability of observing X number of reads mapped to this genome
c = dcnr
# if "propDCnr" in method:
# propDCnr = dcnr/res["DCnr"]
# c = propDCnr
if c > 0.995:
c = 0.995
# else:
# if c > 0.99:
# c = 0.99
try:
#tol_score += (c*p/(2*c*p-p-c+1)) * read_rnr
tol_score += (c*p/(2*c*p-p-c+1)) * p_of_this_rank
p_of_this_rank *= 1-p
(mapped, nm, read_len, read_num, read_rnr, dcnr) = (0,0,0,0,0,0)
except:
e = sys.exc_info()[0]
sys.stderr.write( "[ERROR] %s.\n" % e )
sys.stderr.write( "[INFO] t_len: %s, u_len: %s, lcr_lvl: %s.\n" % (t_len, u_len, lcr_lvl) )
#sys.stderr.write( "[INFO] tid: %s, c: %s, p: %s, read_rnr: %s.\n" % (tid, c, p, read_rnr) )
sys.exit()
else:
p_of_this_rank *= 1-p
# P(G|X)
#return tol_score/res["MR"] if tol_score/res["MR"] < 1 else 1
if res["MRNr"] == 0:
return 0
else:
#return tol_score/res["MRNr"] if tol_score/res["MRNr"] < 1 else 1
return tol_score if tol_score < 1 else 1
else:
# Not in uniqueness matrix
return "NA"
def clacRefMask(res_rollup, refs, bg_mask, tempdir, mb, mr, ml, mc, verbose, debug):
depth={}
# retrieve depths
for ref in refs:
# skip this ref if its organism doesn't pass thresholds
(acc,slen,tid,tag) = ref.split('|')
if not tid in res_rollup: continue
if mb > res_rollup[tid]["rsMRNb"]: continue
if mr > res_rollup[tid]["MR"]: continue
if ml > res_rollup[tid]["LL"]: continue
if mc > res_rollup[tid]["LC"]: continue
depthfile = "%s/merged_sam/%s.sorted.depth"%(tempdir, ref)
if not os.path.isfile(depthfile) or not os.path.getsize(depthfile):
continue
if ref in bg_mask:
if debug: sys.stderr.write( "[DEBUG] Processing reference %s...\n"%ref)
# load bg mask to dataframe
maskbinstr = bin(bg_mask[ref])[2:]
dfmask = pd.DataFrame( list(maskbinstr) )
dfmask.columns = ['bg_mapped']
dfmask.index += (int(slen)-len(maskbinstr)+1)
if debug: sys.stderr.write( "[DEBUG] Printing mask...\n")
if debug: sys.stderr.write( str(dfmask)+"\n" )
# load depth
df = pd.read_csv(depthfile,
sep='\t',
names=['ref','pos','dep'],
usecols=['pos','dep'],
dtype={'pos':int,'dep':int},
index_col=['pos']
)
if debug: sys.stderr.write( "[DEBUG] Printing base-by-base depths in target dataset...\n")
if debug: sys.stderr.write( str(df)+"\n" )
# join bg mask and the deps
df1 = df.join(dfmask, on=df.index, how='left')
df1['dep_mask'] = df1['dep']
df1.loc[df1.bg_mapped=='1','dep_mask']=0
if debug: sys.stderr.write( "[DEBUG] Printing MASKED base-by-base depths...\n")
if debug: sys.stderr.write( str(df1)+"\n" )
if tid in depth:
depth[tid]['orig'].append(df1['dep'])
depth[tid]['mask'].append(df1['dep_mask'])
else:
depth[tid]={}
depth[tid]['orig'] = df1['dep'].copy()
depth[tid]['mask'] = df1['dep_mask'].copy()
return depth
def scoreBg(res_rollup, refs, bg_mask, tempdir, numthreads, mb, mr, ml, mc, verbose, debug, method="overlapping_prop_raw"):
"""
Calculate a score by performing the 2-sample Kolmogorov-Smirnov test between
the depth of coverage (DoC) and background masked DoC.
"""
from scipy import stats
import numpy as np
if debug: sys.stderr.write( "[DEBUG] Using score.py version:%s\n"%__version__ )
pool = Pool(processes=numthreads)
jobs = []
depth={}
n = 10 #number of refs per chunk
chunks = [refs[i:i+n] for i in range(0, len(refs), n)]
for chunk in chunks:
jobs.append( pool.apply_async(clacRefMask, (res_rollup, chunk, bg_mask, tempdir, mb, mr, ml, mc, verbose, debug)) )
tol_jobs = len(jobs)
cnt=0
for job in jobs:
depth_chunk = job.get()
if depth_chunk:
for tid in depth_chunk:
lng = t.taxid2lineageDICT(tid, 1, 1)
for rank in lng:
if rank == "type": continue #skip "type" level
#res_tree[pid][tid] = 1
taxid = lng[rank]['taxid']
# if no particular rank for this organism, use strain
if taxid == 0:
taxid = lng[rank]['name'] # for example: 'Flaviviridae - no_o_rank - no_c_rank'
if taxid in depth:
depth[taxid]['orig'].append(depth_chunk[tid]['orig'])
depth[taxid]['mask'].append(depth_chunk[tid]['mask'])
else:
depth[taxid] = {}
depth[taxid]['orig'] = depth_chunk[tid]['orig'].copy()
depth[taxid]['mask'] = depth_chunk[tid]['mask'].copy()
cnt+=1
if verbose: print( "[INFO] Progress: %s/%s (%.1f%%) chunks done."%(cnt, tol_jobs, cnt/tol_jobs*100))
#clean up
pool.close()
for tid in res_rollup:
# skipping un-qualified taxas
if mb > res_rollup[tid]["rsMRNb"] or mr > res_rollup[tid]["MR"] or ml > res_rollup[tid]["LL"] or mc > res_rollup[tid]["LC"]:
continue
if tid in depth:
if debug: sys.stderr.write( "[DEBUG] Processing %s (taxid:%s)...\n"%(tid if '-' in tid else t.taxid2name(tid), tid))
def transform_freq(orig, mask):
dep = pd.DataFrame(data={
'orig': orig, 'mask': mask
})
dep_co = dep.groupby(['orig']).count()
dep_cm = dep.groupby(['mask']).count()
dep_co.index.names = ['depth']
dep_cm.index.names = ['depth']
dep_comb = dep_co.join(dep_cm, how='outer').fillna(0).rename({'orig':'mask','mask':'orig'}, axis='columns')
if debug: sys.stderr.write( "[DEBUG] Freq of binned depths...\n" )
if debug: sys.stderr.write( str(dep_comb)+"\n" )
return dep_comb
def overlapping_prop_raw(orig, mask):
non_overlapping_area = np.absolute(orig-mask).sum()
sum_area = (orig+mask).sum()
union_area = (sum_area+non_overlapping_area)/2
non_overlapping_prop = non_overlapping_area/union_area
if debug: sys.stderr.write( "[DEBUG] (non_overlapping_area, union_area, non_overlapping_prop) = (%s, %s, %s)\n"%(non_overlapping_area, union_area, non_overlapping_prop))
return 1-non_overlapping_prop
def overlapping_prop_freq(orig, mask):
dep_comb = transform_freq(orig, mask)
return overlapping_prop_raw(dep_comb['orig'], dep_comb['mask'])
def KS_2samp_freq(orig, mask):
"""
2-sample K-S test
"""
dep_comb = transform_freq(orig, mask)
d, pval = stats.ks_2samp( dep_comb['orig'], dep_comb['mask'] )
if debug: sys.stderr.write( "[DEBUG] (d, pval)=(%s, %s)\n"%(d, pval) )
return pval
def chi2_contingency_freq(orig, mask):
"""
Chi2 test
"""
dep_comb = transform_freq(orig, mask)
chi2, pval, dof, ex = stats.chi2_contingency([dep_comb['orig'], dep_comb['mask']])
return pval
def KS_2samp_freq_removedZero(orig, mask):
dep_comb = transform_freq(orig, mask)
if 0 in dep_comb.index:
dep_comb = dep_comb.drop(0)
d, pval = stats.ks_2samp( dep_comb['orig'], dep_comb['mask'] )
if debug: sys.stderr.write( "[DEBUG] (d, pval)=(%s, %s)\n"%(d, pval) )
return pval
if debug: sys.stderr.write( "[DEBUG] Using method: %s...\n"%method)
if method=="overlapping_prop_freq":
res_rollup[tid]["S_BG"] = overlapping_prop_freq( depth[tid]['orig'], depth[tid]['mask'] )
elif method=="KS_2samp_freq":
res_rollup[tid]["S_BG"] = KS_2samp_freq( depth[tid]['orig'], depth[tid]['mask'] )
elif method=="KS_2samp_freq_removedZero":
res_rollup[tid]["S_BG"] = KS_2samp_freq_removedZero( depth[tid]['orig'], depth[tid]['mask'] )
else:
res_rollup[tid]["S_BG"] = overlapping_prop_raw( depth[tid]['orig'], depth[tid]['mask'] )
else:
res_rollup[tid]["S_BG"] = 1
return res_rollup