
Commit

addressed issues #5 and #6 and made python 3 compatible (issue #1)
Rebecca G. Elyanow committed Nov 20, 2017
1 parent eb59166 commit 2bed001
Showing 10 changed files with 75 additions and 58 deletions.
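The changes below follow a small set of Python 2/3 compatibility idioms: the `print` function, `iteritems` from the `future` package (imported in several of the modified files), and explicit `int()` around divisions that previously relied on Python 2 floor division. A minimal sketch of the three patterns, using illustrative values rather than code from the repository:

```python
from __future__ import print_function, division  # Python 2: py3-style print and true division
from future.utils import iteritems                # dict.iteritems on py2, dict.items on py3
import collections

counts = collections.defaultdict(int)
counts[('21', 18750000)] += 1

# print is a function under both interpreters once print_function is imported
print('observed %i bins' % len(counts))

# iterate over key/value pairs without choosing between iteritems() and items()
for key, value in iteritems(counts):
    print(key, value)

# with true division, binning must floor explicitly
pos, d = 18759465, 10000
bin_start = int(pos / d) * d            # the pattern used throughout this commit
assert bin_start == (pos // d) * d == 18750000
```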
16 changes: 8 additions & 8 deletions NAIBR.py
@@ -1,3 +1,4 @@
from __future__ import print_function,division
import os,sys,time,json,pysam,collections
sys.path.insert(0, 'src')
if len(sys.argv) < 2:
@@ -10,7 +11,6 @@
from rank import *



def run_NAIBR_user(cand):
'''
use user input candidate novel adjacencies
@@ -32,18 +32,18 @@ def run_NAIBR(chrom):
if len(reads_by_LR) > 0:
cands,p_len,p_rate = get_candidates(discs,reads_by_LR)
if cands == None:
print 'No candidates from',chrom
print('No candidates from %s'%chrom)
return reads_by_LR,LRs_by_pos,discs_by_barcode,interchrom_discs,coverage,scores
print 'ranking',len(cands),'candidates from',chrom
print('ranking %i candidates from %s'%(len(cands),chrom))
scores = predict_NAs(reads_by_LR,LRs_by_pos,discs_by_barcode,cands,p_len,p_rate,coverage,False)
else:
print 'No candidates from',chrom
print('No candidates from %s'%chrom)
return reads_by_LR,LRs_by_pos,discs_by_barcode,interchrom_discs,coverage,scores

def main():
starttime = time.time()
if len(candidates) > 0:
print 'user defined candidates'
print('user defined candidates')
with open(candidates) as f:
cands = f.read().split('\n')
cands = [x.split('\t') for x in cands]
@@ -73,15 +73,15 @@ def main():
scores += scores_chrom
cands,p_len,p_rate = get_candidates(discs,reads_by_LR)
if cands != None:
print 'ranking',len(cands),'interchromosomal candidates'
print('ranking %i interchromosomal candidates'%len(cands))
scores += predict_NAs(reads_by_LR,LRs_by_pos,discs_by_barcode,cands,p_len,p_rate,np.mean(coverage),True)
else:
print 'No interchromosomal candidates'
print('No interchromosomal candidates')
write_scores(scores)



print 'Finished in',(time.time()-starttime)/60.0,'minutes'
print('Finished in',(time.time()-starttime)/60.0,'minutes')
return

if __name__ == "__main__":
14 changes: 12 additions & 2 deletions README.md
@@ -1,6 +1,7 @@
## Overview

NAIBR (Novel Adjacency Identification with Barcoded Reads) identifies novel adjacencies created by structural variation events such as deletions, duplications, inversions, and complex rearrangements using linked-read whole-genome sequencing data produced by 10X Genomics.
NAIBR (Novel Adjacency Identification with Barcoded Reads) identifies novel adjacencies created by structural variation events such as deletions, duplications, inversions, and complex rearrangements using linked-read whole-genome sequencing data produced by 10X Genomics. Please refer to the [publication](https://doi.org/10.1093/bioinformatics/btx712) for details about the method.


NAIBR takes as input a BAM file produced by 10X Genomics' Long Ranger pipeline and outputs a BEDPE file containing predicted novel adjacencies and a likelihood score for each adjacency.

@@ -44,6 +45,15 @@ python NAIBR.py example/example.config

will produce the file 'example/NAIBR_SVs.bedpe'.


### Citing NAIBR
Elyanow, Rebecca, Hsin-Ta Wu, and Benjamin J. Raphael. "Identifying structural variants using linked-read sequencing data." Bioinformatics (2017).
```
@article{elyanow2017identifying,
title={Identifying structural variants using linked-read sequencing data},
author={Elyanow, Rebecca and Wu, Hsin-Ta and Raphael, Benjamin J},
journal={Bioinformatics},
year={2017}
}
```


2 changes: 1 addition & 1 deletion example/NAIBR_SVs.bedpe
@@ -1,2 +1,2 @@
Chr1 Break1 Chr2 Break2 Split molecules Discordant reads Orientation Haplotype Score Pass filter
21 18759465 21 18906782 28.0 0.0 -- 1,1 572.050370637 PASS
21 18759465 21 18906782 28.0 0.0 -- 1,1 568.4063351114613 PASS
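For reference, the output columns shown above can be loaded directly from the tab-separated BEDPE file; a small sketch assuming pandas is available (the path is the example output named in the README):

```python
import pandas as pd

# The header row matches the columns above: Chr1, Break1, Chr2, Break2,
# Split molecules, Discordant reads, Orientation, Haplotype, Score, Pass filter.
svs = pd.read_csv('example/NAIBR_SVs.bedpe', sep='\t')
passing = svs[svs['Pass filter'] == 'PASS']
print(passing[['Chr1', 'Break1', 'Chr2', 'Break2', 'Score']])
```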
4 changes: 2 additions & 2 deletions example/example.config
@@ -19,8 +19,8 @@ d=10000
# minimum size of structural variant (default=lmax)
min_sv=1000

# number of cores (default=1)
cores=1
# number of threads (default=1)
threads=1

# minimum number of barcode overlaps supporting a candidate NA (default = 3)
k=3
24 changes: 13 additions & 11 deletions src/distributions.py
@@ -1,3 +1,5 @@
from __future__ import print_function,division
from future.utils import iteritems
import os,sys,pysam,collections,time,gc,random,itertools
from utils import *
import numpy as np
@@ -36,7 +38,7 @@ def fit(self, data, p = None, r = None):
va = np.var(data)
r = (av*av)/(va-av)
p = (va-av)/(va)
sm = np.sum(data)/len(data)
sm = np.sum(data)/(len(data))
x = optimize.fsolve(self.mleFun, np.array([p, r]), args=(data, sm))
self.p = x[0]
self.r = x[1]
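For context, the initial guess passed to `optimize.fsolve` above comes from the method of moments: with sample mean m and variance v, this negative-binomial parameterization has mean r·p/(1−p) and variance r·p/(1−p)², giving r = m²/(v−m) and p = (v−m)/v. A small check of that algebra on synthetic data (not code from the repository):

```python
import numpy as np

rng = np.random.default_rng(0)
data = rng.negative_binomial(n=5, p=0.4, size=100000)

m, v = np.mean(data), np.var(data)
r0 = m * m / (v - m)          # moment estimate of r
p0 = (v - m) / v              # moment estimate of p

# The estimates reproduce the sample moments under this parameterization.
print(r0 * p0 / (1 - p0), m)          # both ~7.5 for these parameters
print(r0 * p0 / (1 - p0) ** 2, v)     # both ~18.75
```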
@@ -87,8 +89,8 @@ def get_overlap(barcode_LRs):
LR1,LR2 = [LR2,LR1]
chr1,start1,end1,count1 = LR1
chr2,start2,end2,count2 = LR2
index1 = set([start1/d*d,end1/d*d])
index2 = set([start2/d*d,end2/d*d])
index1 = set([int(start1/d)*d,int(end1/d)*d])
index2 = set([int(start2/d)*d,int(end2/d)*d])
for id1 in index1:
for id2 in index2:
barcode_overlap[(chr1,id1,chr2,id2)] += 1
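The `int()` wrappers above restore the Python 2 floor behaviour: with true division, `start1/d*d` would return a float equal to the original coordinate rather than a bin start. A short illustration with arbitrary values:

```python
d = 10000
start1 = 18759465

print(start1 / d * d)        # a float near 18759465.0 -- not snapped to a bin start
print(int(start1 / d) * d)   # 18750000 -- the fix used in this commit
print(start1 // d * d)       # 18750000 -- equivalent floor-division form
```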
@@ -100,27 +102,27 @@ def get_distributions(reads_by_LR):
global barcode_overlap,LRs_by_barcode
LRs_by_barcode = collections.defaultdict(list)
barcode_overlap = collections.defaultdict(int)
for key,reads in reads_by_LR.iteritems():
for key,reads in iteritems(reads_by_LR):
chrom,barcode = key
barcode_LRS = linked_reads(reads,chrom)
LRs += barcode_LRS
LRs_by_barcode[barcode] += barcode_LRS
for barcode,barcode_LRS in LRs_by_barcode.iteritems():
for barcode,barcode_LRS in iteritems(LRs_by_barcode):
if len(barcode_LRS) > 1:
get_overlap(barcode_LRS)

if len(LRs) == 0:
return None,None,None
elif len(LRs) < 100:
print('WARNING: Few linked-reads detected')
p_rate = get_rate_distr(LRs)
p_len = get_length_distr(LRs)
return p_len,p_rate,barcode_overlap

def get_length_distr(LRs):
lengths = [x[2]-x[1] for x in LRs]
lengths.sort()
lengths = lengths[len(lengths)/10:len(lengths)/10*9]
#print np.mean(lengths)
#lengths = random.sample(lengths,100000)
if len(lengths) > 10:
lengths = lengths[int(len(lengths)/10):int(len(lengths)/10*9)]
b = negBin()
b.fit(lengths)
p = b.pdf
@@ -134,8 +136,8 @@ def get_length_distr(LRs):
def get_rate_distr(LRs):
rate = [x[3]/float(x[2]-x[1]) for x in LRs]
rate.sort()
rate = rate[len(rate)/10:len(rate)/10*9]

if len(rate) > 10:
rate = rate[int(len(rate)/10):int(len(rate)/10*9)]
alpha,loc,beta = scipy.stats.gamma.fit(rate)
p = scipy.stats.gamma(alpha,loc,beta).cdf
pp = lambda x: max(1e-20,float(p([max(x,1e-6)])[0]-p([max(x,1e-6)-1e-6])[0]))
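The lambda above approximates a per-rate probability as the gamma CDF mass over a 1e-6-wide window, floored at 1e-20 so downstream log terms never see a zero. A standalone sketch of the same idea (the gamma parameters here are illustrative, not fitted values from the repository):

```python
import scipy.stats

alpha, loc, beta = 2.0, 0.0, 0.05            # illustrative gamma parameters
cdf = scipy.stats.gamma(alpha, loc, beta).cdf

def rate_prob(x, eps=1e-6, floor=1e-20):
    x = max(x, eps)                           # clamp away from zero
    return max(floor, float(cdf(x) - cdf(x - eps)))

print(rate_prob(0.02))
print(rate_prob(0.0))                         # clamped, never returns 0
```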
5 changes: 3 additions & 2 deletions src/estimate_params.py
@@ -1,3 +1,4 @@
from __future__ import print_function,division
import os,sys,subprocess,pysam,collections,time,gc
import multiprocessing as mp
from utils import *
@@ -10,7 +11,7 @@ def estimate_lmin_lmax():
ls = []
num = 0
for i,chrm in enumerate(reads.references):
start = reads.lengths[i]/2
start = int(reads.lengths[i]/2)
end = start+1000000
for read in reads.fetch(chrm,start,end):
num += 1
@@ -49,7 +50,7 @@ def estimate_delta(read_list):
prev_end = end
mean = np.mean(gaps)
gaps.sort()
d = max(1000,gaps[len(gaps)-int((len(gaps)/100)*5)]/1000*1000)
d = max(1000,int(gaps[len(gaps)-int((len(gaps)/100)*5)]/1000)*1000)
return d


30 changes: 16 additions & 14 deletions src/get_reads.py
@@ -1,3 +1,5 @@
from __future__ import print_function,division
from future.utils import iteritems
import os,sys,subprocess,pysam,collections,time,gc,json,random,copy
import multiprocessing as mp
from utils import *
@@ -29,7 +31,7 @@ def make_barcodeDict(chrom):
Returns: dict barcoded reads, dict of reads overlapping ref positions,
discordant reads, candidate NAs, total coverage
'''
print 'getting candidates for chrom %s'%chrom
print('getting candidates for chrom %s'%chrom)
cov = 0
global discs
global reads_by_barcode
@@ -71,11 +73,11 @@ def make_barcodeDict(chrom):
if peread.chrm == peread.nextchrm:
discs = add_disc(peread,discs)
else:
interchrom_discs[(peread.chrm,peread.i/r*r,peread.nextchrm,peread.j/r*r,peread.orient)].append(peread)
interchrom_discs[(peread.chrm,int(peread.i/r)*r,peread.nextchrm,int(peread.j/r)*r,peread.orient)].append(peread)
elif read.is_proper_pair and fragment_length(read) > lmin:
reads_by_LR[(peread.chrm,barcode)].append((peread.start,peread.nextend,peread.hap,peread.mapq))
if barcode not in LRs_by_pos[(peread.chrm,peread.mid()/R*R)]:
LRs_by_pos[(peread.chrm,peread.mid()/R*R)].append(barcode)
if barcode not in LRs_by_pos[(peread.chrm,int(peread.mid()/R)*R)]:
LRs_by_pos[(peread.chrm,int(peread.mid()/R)*R)].append(barcode)
return reads_by_LR,LRs_by_pos,discs_by_barcode,discs,interchrom_discs,cov/float(lengths)

def signi(disc):
@@ -93,10 +95,10 @@ def signj(disc):

def add_disc(peread,discs):
r = lmax
for a in [-r/2,0,r/2]:
for b in [-r/2,0,r/2]:
if (a == 0 or (peread.i+a)/r*r != peread.i/r*r) and (b == 0 or (peread.j+b)/r*r != peread.j/r*r):
discs[(peread.chrm,(peread.i+a)/r*r,peread.nextchrm,(peread.j+b)/r*r,peread.orient)].append(peread)
for a in [int(-r/2),0,int(r/2)]:
for b in [int(-r/2),0,int(r/2)]:
if (a == 0 or int((peread.i+a)/r)*r != int(peread.i/r)*r) and (b == 0 or int((peread.j+b)/r)*r != int(peread.j/r)*r):
discs[(peread.chrm,int((peread.i+a)/r)*r,peread.nextchrm,int((peread.j+b)/r)*r,peread.orient)].append(peread)
return discs
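`add_disc` registers each discordant pair under its home bin of width r = lmax and, when a coordinate lies within r/2 of a bin boundary, under the neighbouring bin as well, so pairs split across a boundary still share a key. A small sketch of which bins one pair would reach (the helper name, r value, and coordinates are made up for illustration):

```python
r = 10000  # stands in for lmax; the real value is estimated from the BAM

def disc_bins(i, j):
    bins = set()
    for a in (-(r // 2), 0, r // 2):
        for b in (-(r // 2), 0, r // 2):
            # only step into a neighbouring bin when the shift actually crosses a boundary
            if (a == 0 or (i + a) // r != i // r) and (b == 0 or (j + b) // r != j // r):
                bins.add(((i + a) // r * r, (j + b) // r * r))
    return bins

# both coordinates lie near bin boundaries, so the pair lands in four bins
print(sorted(disc_bins(19900, 41000)))
# [(10000, 30000), (10000, 40000), (20000, 30000), (20000, 40000)]
```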

def largest_overlap(items):
@@ -111,13 +113,13 @@ def largest_overlap(items):
positions_j[pos] += 1
i = 0
overlap_i = 0
for pos,overlap in positions_i.iteritems():
for pos,overlap in iteritems(positions_i):
if overlap > overlap_i:
overlap_i = overlap
i = pos
j = 0
overlap_j = 0
for pos,overlap in positions_j.iteritems():
for pos,overlap in iteritems(positions_j):
if overlap > overlap_j:
overlap_j = overlap
j = pos
@@ -151,7 +153,7 @@ def get_candidates(discs,reads_by_LR):
if p_len == None:
return None,None,None
num_cands = 0
for key,items in discs.iteritems():
for key,items in iteritems(discs):
orient = key[4]
si,ei,sj,ej = disc_intersection(items)
if si and sj and len(items) >= MIN_DISCS:
@@ -160,7 +162,7 @@
cand = copy.copy(items[0])
cand.i = i
cand.j = j
barcode_overlaps = barcode_overlap[(cand.chrm,cand.i/d*d,cand.nextchrm,cand.j/d*d)]
barcode_overlaps = barcode_overlap[(cand.chrm,int(cand.i/d)*d,cand.nextchrm,int(cand.j/d)*d)]
if not inblacklist(cand) and ((cand.chrm == cand.nextchrm and cand.j-cand.i < d)\
or barcode_overlaps >= k):
already_appended = sum([1 for x in candidates if x.i == cand.i and x.j == cand.j])
@@ -224,8 +226,8 @@ def make_barcodeDict_user(candidate):
discs_by_barcode[(peread.chrm,peread.nextchrm,peread.barcode)].append(peread)
elif read.is_proper_pair and fragment_length(read) > lmin:
reads_by_LR[(peread.chrm,barcode)].append((peread.start,peread.nextend,peread.hap,peread.mapq))
if barcode not in LRs_by_pos[(peread.chrm,peread.mid()/R*R)]:
LRs_by_pos[(peread.chrm,peread.mid()/R*R)].append(barcode)
if barcode not in LRs_by_pos[(peread.chrm,int(peread.mid()/R)*R)]:
LRs_by_pos[(peread.chrm,int(peread.mid()/R)*R)].append(barcode)
cand = copy.copy(peread)
cand.chrm = chrm1.strip('chr')
cand.i = break1
1 change: 1 addition & 0 deletions src/global_vars.py
@@ -1,4 +1,5 @@
### GLOBALS ####
from __future__ import print_function
import os,json,sys,collections,pysam
import numpy as np

14 changes: 6 additions & 8 deletions src/rank.py
@@ -1,3 +1,4 @@
from __future__ import print_function,division
import os,sys,subprocess,pysam,collections,time,gc,math,json,copy
import scipy.stats,operator,itertools
import multiprocessing as mp
@@ -114,10 +115,7 @@ def spanning(x,candidate):

def discs(candidate,barcode):
ds = discs_by_barcode[(candidate.chrm,candidate.nextchrm,barcode)]
# for d in ds:
# if d.chrm != d.nextchrm:
# print d.chrm,d.nextchrm,d.i,d.j
ds = [(read.mapq+read.nextmapq)/2 for read in ds if \
ds = [int((read.mapq+read.nextmapq)/2) for read in ds if \
candidate.chrm == read.chrm and candidate.nextchrm == read.nextchrm\
and is_close(candidate.i,read.i,read.orient[0]) and \
is_close(candidate.j,read.j,read.orient[1]) and candidate.orient == read.orient]
@@ -182,10 +180,10 @@ def get_LRs(candidate,barcodes):

def get_CSMs(i):
candidate = candidates[i]
break1_barcodes = set(LRs_by_pos[(candidate.chrm,candidate.i/R*R)]+\
LRs_by_pos[(candidate.chrm,candidate.i/R*R-R)]+LRs_by_pos[(candidate.chrm,candidate.i/R*R+R)])
break2_barcodes = set(list(LRs_by_pos[(candidate.nextchrm,candidate.j/R*R)])+\
LRs_by_pos[(candidate.nextchrm,candidate.j/R*R-R)]+LRs_by_pos[(candidate.nextchrm,candidate.j/R*R+R)])
break1_barcodes = set(LRs_by_pos[(candidate.chrm,int(candidate.i/R)*R)]+\
LRs_by_pos[(candidate.chrm,int(candidate.i/R)*R-R)]+LRs_by_pos[(candidate.chrm,int(candidate.i/R)*R+R)])
break2_barcodes = set(list(LRs_by_pos[(candidate.nextchrm,int(candidate.j/R)*R)])+\
LRs_by_pos[(candidate.nextchrm,int(candidate.j/R)*R-R)]+LRs_by_pos[(candidate.nextchrm,int(candidate.j/R)*R+R)])
CSMs,spans = get_LRs(candidate,break1_barcodes.intersection(break2_barcodes))
key = (candidate.i,candidate.j,candidate.orient)
return key,CSMs,spans
23 changes: 13 additions & 10 deletions src/utils.py
@@ -1,3 +1,5 @@
from __future__ import print_function,division
from future.utils import iteritems
from global_vars import *
import math,mpmath,copy
import numpy as np
@@ -95,7 +97,7 @@ def length(self):

class PERead:
__slots__ = ['barcode', 'chrm','orient','start','disc','mapq','end','hap','nextchrm',\
'nextstart','nextend','nexthap','nextmapq']
'nextstart','nextend','nexthap','nextmapq','i','j']
def __init__(self,read):
read_hap = get_hap(read)
r = [read.reference_name.strip('chr'),read.reference_start,read.reference_end,\
@@ -238,7 +240,7 @@ def NB(k,m,r):
return mpmath.gamma(k + r)/(mpmath.gamma(k+1)*mpmath.gamma(r))*np.power(m/(r+m),k)*np.power(r/(r+m),r)

def Pois(k,lam):
return (np.power(lam,k)*math.exp(-lam))/mpmath.gamma(k+1)
return (np.power(lam,k)*math.exp(-lam))/(mpmath.gamma(k+1))

def get_hap(r):
if r.has_tag('HP'):
return r.get_tag('HP')
@@ -305,7 +308,7 @@ def get_orientation(read,mate):
return ['++',read.reference_end,mate.reference_end]

def rnd(num):
return num/R*R
return int(num/R)*R

def flatten(l):
return [item for sublist in l for item in sublist]
@@ -339,21 +342,21 @@ def collapse(a,c):
score = float(linea[8].split(':')[-1])
if 'Y' not in chrom1 and 'Y' not in chrom2:
l.append([chrom1,int(s1),chrom2,int(s2),split,disc,orient,haps,score])
r = lmax/100*100*5
r = int(lmax/100)*100*5
l.sort(key = lambda x: x[-1],reverse=True)
l2 = []
nas = collections.defaultdict(list)
for chrom1,s1,chrom2,s2,split,disc,orient,haps,score in l:
already_appended = False
for i in [-r,0,r]:
for j in [-r,0,r]:
if (chrom1,s1/r*r+i,chrom2,s2/r*r+j) in nas:
ch1,ds1,ch2,ds2 = nas[(chrom1,s1/r*r+i,chrom2,s2/r*r+j)][0:4]
if (chrom1,int(s1/r)*r+i,chrom2,int(s2/r)*r+j) in nas:
ch1,ds1,ch2,ds2 = nas[(chrom1,int(s1/r)*r+i,chrom2,int(s2/r)*r+j)][0:4]
if abs(ds1-s1) < r and abs(ds2-s2) < r:
already_appended = True
if not already_appended:
nas[(chrom1,s1/r*r,chrom2,s2/r*r)] = [chrom1,s1,chrom2,s2,split,disc,orient,haps,score]
for i,elem in nas.iteritems():
nas[(chrom1,int(s1/r)*r,chrom2,int(s2/r)*r)] = [chrom1,s1,chrom2,s2,split,disc,orient,haps,score]
for i,elem in iteritems(nas):
if elem[-1] >= c:
l2.append(elem+['PASS'])
else:
@@ -365,7 +368,7 @@

def write_scores(scores):
fname = 'NAIBR_SVs.bedpe'
print os.path.join(DIR,fname)
print(os.path.join(DIR,fname))
f = open(os.path.join(DIR,fname),'w')
f.write('\t'.join(['Chr1','Break1','Chr2','Break2','Split molecules','Discordant reads',\
'Orientation','Haplotype','Score','Pass filter\n']))
@@ -378,7 +381,7 @@ def parallel_execute(function,input_list):
if NUM_THREADS != 1:
pool = mp.Pool(NUM_THREADS,maxtasksperchild=1)
map_fn = pool.map
print 'running on %s threads'%str(NUM_THREADS)
print('running on %s threads'%str(NUM_THREADS))
data = map_fn(function,input_list)
pool.close()
pool.join()
