
Commit

addressed issues #5 and #6 and made python 3 compatible (issue #1)
Rebecca G. Elyanow committed Nov 20, 2017
1 parent eb59166 commit 2bed001
Showing 10 changed files with 75 additions and 58 deletions.
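The changes below follow a small set of Python 2/3 compatibility idioms: the `print` function, `iteritems` from the `future` package (imported in several of the modified files), and explicit `int()` around divisions that previously relied on Python 2 floor division. A minimal sketch of the three patterns, using illustrative values rather than code from the repository:

```python
from __future__ import print_function, division  # Python 2: py3-style print and true division
from future.utils import iteritems                # dict.iteritems on py2, dict.items on py3
import collections

counts = collections.defaultdict(int)
counts[('21', 18750000)] += 1

# print is a function under both interpreters once print_function is imported
print('observed %i bins' % len(counts))

# iterate over key/value pairs without choosing between iteritems() and items()
for key, value in iteritems(counts):
    print(key, value)

# with true division, binning must floor explicitly
pos, d = 18759465, 10000
bin_start = int(pos / d) * d            # the pattern used throughout this commit
assert bin_start == (pos // d) * d == 18750000
```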
16 changes: 8 additions & 8 deletions NAIBR.py
@@ -1,3 +1,4 @@
from __future__ import print_function,division
import os,sys,time,json,pysam,collections
sys.path.insert(0, 'src')
if len(sys.argv) < 2:
@@ -10,7 +11,6 @@
from rank import *



def run_NAIBR_user(cand):
'''
use user input candidate novel adjacencies
@@ -32,18 +32,18 @@ def run_NAIBR(chrom):
if len(reads_by_LR) > 0:
cands,p_len,p_rate = get_candidates(discs,reads_by_LR)
if cands == None:
print 'No candidates from',chrom
print('No candidates from %s'%chrom)
return reads_by_LR,LRs_by_pos,discs_by_barcode,interchrom_discs,coverage,scores
print 'ranking',len(cands),'candidates from',chrom
print('ranking %i candidates from %s'%(len(cands),chrom))
scores = predict_NAs(reads_by_LR,LRs_by_pos,discs_by_barcode,cands,p_len,p_rate,coverage,False)
else:
print 'No candidates from',chrom
print('No candidates from %s'%chrom)
return reads_by_LR,LRs_by_pos,discs_by_barcode,interchrom_discs,coverage,scores

def main():
starttime = time.time()
if len(candidates) > 0:
print 'user defined candidates'
print('user defined candidates')
with open(candidates) as f:
cands = f.read().split('\n')
cands = [x.split('\t') for x in cands]
@@ -73,15 +73,15 @@ def main():
scores += scores_chrom
cands,p_len,p_rate = get_candidates(discs,reads_by_LR)
if cands != None:
print 'ranking',len(cands),'interchromosomal candidates'
print('ranking %i interchromosomal candidates'%len(cands))
scores += predict_NAs(reads_by_LR,LRs_by_pos,discs_by_barcode,cands,p_len,p_rate,np.mean(coverage),True)
else:
print 'No interchromosomal candidates'
print('No interchromosomal candidates')
write_scores(scores)



print 'Finished in',(time.time()-starttime)/60.0,'minutes'
print('Finished in',(time.time()-starttime)/60.0,'minutes')
return

if __name__ == "__main__":
14 changes: 12 additions & 2 deletions README.md
@@ -1,6 +1,7 @@
## Overview

NAIBR (Novel Adjacency Identification with Barcoded Reads) identifies novel adjacencies created by structural variation events such as deletions, duplications, inversions, and complex rearrangements using linked-read whole-genome sequencing data produced by 10X Genomics.
NAIBR (Novel Adjacency Identification with Barcoded Reads) identifies novel adjacencies created by structural variation events such as deletions, duplications, inversions, and complex rearrangements using linked-read whole-genome sequencing data produced by 10X Genomics. Please refer to the [publication](https://doi.org/10.1093/bioinformatics/btx712) for details about the method.


NAIBR takes as input a BAM file produced by 10X Genomics' Long Ranger pipeline and outputs a BEDPE file containing predicted novel adjacencies and a likelihood score for each adjacency.

@@ -44,6 +45,15 @@ python NAIBR.py example/example.config

will produce the file 'example/NAIBR_SVs.bedpe'.


### Citing NAIBR
Elyanow, Rebecca, Hsin-Ta Wu, and Benjamin J. Raphael. "Identifying structural variants using linked-read sequencing data." Bioinformatics (2017).
```
@article{elyanow2017identifying,
title={Identifying structural variants using linked-read sequencing data},
author={Elyanow, Rebecca and Wu, Hsin-Ta and Raphael, Benjamin J},
journal={Bioinformatics},
year={2017}
}
```


2 changes: 1 addition & 1 deletion example/NAIBR_SVs.bedpe
@@ -1,2 +1,2 @@
Chr1 Break1 Chr2 Break2 Split molecules Discordant reads Orientation Haplotype Score Pass filter
21 18759465 21 18906782 28.0 0.0 -- 1,1 572.050370637 PASS
21 18759465 21 18906782 28.0 0.0 -- 1,1 568.4063351114613 PASS
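For reference, the output columns shown above can be loaded directly from the tab-separated BEDPE file; a small sketch assuming pandas is available (the path is the example output named in the README):

```python
import pandas as pd

# The header row matches the columns above: Chr1, Break1, Chr2, Break2,
# Split molecules, Discordant reads, Orientation, Haplotype, Score, Pass filter.
svs = pd.read_csv('example/NAIBR_SVs.bedpe', sep='\t')
passing = svs[svs['Pass filter'] == 'PASS']
print(passing[['Chr1', 'Break1', 'Chr2', 'Break2', 'Score']])
```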
4 changes: 2 additions & 2 deletions example/example.config
@@ -19,8 +19,8 @@ d=10000
# minimum size of structural variant (default=lmax)
min_sv=1000

# number of cores (default=1)
cores=1
# number of threads (default=1)
threads=1

# minimum number of barcode overlaps supporting a candidate NA (default = 3)
k=3
24 changes: 13 additions & 11 deletions src/distributions.py
@@ -1,3 +1,5 @@
from __future__ import print_function,division
from future.utils import iteritems
import os,sys,pysam,collections,time,gc,random,itertools
from utils import *
import numpy as np
@@ -36,7 +38,7 @@ def fit(self, data, p = None, r = None):
va = np.var(data)
r = (av*av)/(va-av)
p = (va-av)/(va)
sm = np.sum(data)/len(data)
sm = np.sum(data)/(len(data))
x = optimize.fsolve(self.mleFun, np.array([p, r]), args=(data, sm))
self.p = x[0]
self.r = x[1]
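For context, the initial guess passed to `optimize.fsolve` above comes from the method of moments: with sample mean m and variance v, this negative-binomial parameterization has mean r·p/(1−p) and variance r·p/(1−p)², giving r = m²/(v−m) and p = (v−m)/v. A small check of that algebra on synthetic data (not code from the repository):

```python
import numpy as np

rng = np.random.default_rng(0)
data = rng.negative_binomial(n=5, p=0.4, size=100000)

m, v = np.mean(data), np.var(data)
r0 = m * m / (v - m)          # moment estimate of r
p0 = (v - m) / v              # moment estimate of p

# The estimates reproduce the sample moments under this parameterization.
print(r0 * p0 / (1 - p0), m)          # both ~7.5 for these parameters
print(r0 * p0 / (1 - p0) ** 2, v)     # both ~18.75
```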
@@ -87,8 +89,8 @@ def get_overlap(barcode_LRs):
LR1,LR2 = [LR2,LR1]
chr1,start1,end1,count1 = LR1
chr2,start2,end2,count2 = LR2
index1 = set([start1/d*d,end1/d*d])
index2 = set([start2/d*d,end2/d*d])
index1 = set([int(start1/d)*d,int(end1/d)*d])
index2 = set([int(start2/d)*d,int(end2/d)*d])
for id1 in index1:
for id2 in index2:
barcode_overlap[(chr1,id1,chr2,id2)] += 1
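The `int()` wrappers above restore the Python 2 floor behaviour: with true division, `start1/d*d` would return a float equal to the original coordinate rather than a bin start. A short illustration with arbitrary values:

```python
d = 10000
start1 = 18759465

print(start1 / d * d)        # a float near 18759465.0 -- not snapped to a bin start
print(int(start1 / d) * d)   # 18750000 -- the fix used in this commit
print(start1 // d * d)       # 18750000 -- equivalent floor-division form
```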
@@ -100,27 +102,27 @@ def get_distributions(reads_by_LR):
global barcode_overlap,LRs_by_barcode
LRs_by_barcode = collections.defaultdict(list)
barcode_overlap = collections.defaultdict(int)
for key,reads in reads_by_LR.iteritems():
for key,reads in iteritems(reads_by_LR):
chrom,barcode = key
barcode_LRS = linked_reads(reads,chrom)
LRs += barcode_LRS
LRs_by_barcode[barcode] += barcode_LRS
for barcode,barcode_LRS in LRs_by_barcode.iteritems():
for barcode,barcode_LRS in iteritems(LRs_by_barcode):
if len(barcode_LRS) > 1:
get_overlap(barcode_LRS)

if len(LRs) == 0:
return None,None,None
elif len(LRs) < 100:
print('WARNING: Few linked-reads detected')
p_rate = get_rate_distr(LRs)
p_len = get_length_distr(LRs)
return p_len,p_rate,barcode_overlap

def get_length_distr(LRs):
lengths = [x[2]-x[1] for x in LRs]
lengths.sort()
lengths = lengths[len(lengths)/10:len(lengths)/10*9]
#print np.mean(lengths)
#lengths = random.sample(lengths,100000)
if len(lengths) > 10:
lengths = lengths[int(len(lengths)/10):int(len(lengths)/10*9)]
b = negBin()
b.fit(lengths)
p = b.pdf
@@ -134,8 +136,8 @@ def get_length_distr(LRs):
def get_rate_distr(LRs):
rate = [x[3]/float(x[2]-x[1]) for x in LRs]
rate.sort()
rate = rate[len(rate)/10:len(rate)/10*9]

if len(rate) > 10:
rate = rate[int(len(rate)/10):int(len(rate)/10*9)]
alpha,loc,beta = scipy.stats.gamma.fit(rate)
p = scipy.stats.gamma(alpha,loc,beta).cdf
pp = lambda x: max(1e-20,float(p([max(x,1e-6)])[0]-p([max(x,1e-6)-1e-6])[0]))
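The lambda above approximates a per-rate probability as the gamma CDF mass over a 1e-6-wide window, floored at 1e-20 so downstream log terms never see a zero. A standalone sketch of the same idea (the gamma parameters here are illustrative, not fitted values from the repository):

```python
import scipy.stats

alpha, loc, beta = 2.0, 0.0, 0.05            # illustrative gamma parameters
cdf = scipy.stats.gamma(alpha, loc, beta).cdf

def rate_prob(x, eps=1e-6, floor=1e-20):
    x = max(x, eps)                           # clamp away from zero
    return max(floor, float(cdf(x) - cdf(x - eps)))

print(rate_prob(0.02))
print(rate_prob(0.0))                         # clamped, never returns 0
```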
5 changes: 3 additions & 2 deletions src/estimate_params.py
@@ -1,3 +1,4 @@
from __future__ import print_function,division
import os,sys,subprocess,pysam,collections,time,gc
import multiprocessing as mp
from utils import *
@@ -10,7 +11,7 @@ def estimate_lmin_lmax():
ls = []
num = 0
for i,chrm in enumerate(reads.references):
start = reads.lengths[i]/2
start = int(reads.lengths[i]/2)
end = start+1000000
for read in reads.fetch(chrm,start,end):
num += 1
@@ -49,7 +50,7 @@ def estimate_delta(read_list):
prev_end = end
mean = np.mean(gaps)
gaps.sort()
d = max(1000,gaps[len(gaps)-int((len(gaps)/100)*5)]/1000*1000)
d = max(1000,int(gaps[len(gaps)-int((len(gaps)/100)*5)]/1000)*1000)
return d


30 changes: 16 additions & 14 deletions src/get_reads.py
@@ -1,3 +1,5 @@
from __future__ import print_function,division
from future.utils import iteritems
import os,sys,subprocess,pysam,collections,time,gc,json,random,copy
import multiprocessing as mp
from utils import *
@@ -29,7 +31,7 @@ def make_barcodeDict(chrom):
Returns: dict barcoded reads, dict of reads overlapping ref positions,
discordant reads, candidate NAs, total coverage
'''
print 'getting candidates for chrom %s'%chrom
print('getting candidates for chrom %s'%chrom)
cov = 0
global discs
global reads_by_barcode
@@ -71,11 +73,11 @@ def make_barcodeDict(chrom):
if peread.chrm == peread.nextchrm:
discs = add_disc(peread,discs)
else:
interchrom_discs[(peread.chrm,peread.i/r*r,peread.nextchrm,peread.j/r*r,peread.orient)].append(peread)
interchrom_discs[(peread.chrm,int(peread.i/r)*r,peread.nextchrm,int(peread.j/r)*r,peread.orient)].append(peread)
elif read.is_proper_pair and fragment_length(read) > lmin:
reads_by_LR[(peread.chrm,barcode)].append((peread.start,peread.nextend,peread.hap,peread.mapq))
if barcode not in LRs_by_pos[(peread.chrm,peread.mid()/R*R)]:
LRs_by_pos[(peread.chrm,peread.mid()/R*R)].append(barcode)
if barcode not in LRs_by_pos[(peread.chrm,int(peread.mid()/R)*R)]:
LRs_by_pos[(peread.chrm,int(peread.mid()/R)*R)].append(barcode)
return reads_by_LR,LRs_by_pos,discs_by_barcode,discs,interchrom_discs,cov/float(lengths)

def signi(disc):
@@ -93,10 +95,10 @@ def signj(disc):

def add_disc(peread,discs):
r = lmax
for a in [-r/2,0,r/2]:
for b in [-r/2,0,r/2]:
if (a == 0 or (peread.i+a)/r*r != peread.i/r*r) and (b == 0 or (peread.j+b)/r*r != peread.j/r*r):
discs[(peread.chrm,(peread.i+a)/r*r,peread.nextchrm,(peread.j+b)/r*r,peread.orient)].append(peread)
for a in [int(-r/2),0,int(r/2)]:
for b in [int(-r/2),0,int(r/2)]:
if (a == 0 or int((peread.i+a)/r)*r != int(peread.i/r)*r) and (b == 0 or int((peread.j+b)/r)*r != int(peread.j/r)*r):
discs[(peread.chrm,int((peread.i+a)/r)*r,peread.nextchrm,int((peread.j+b)/r)*r,peread.orient)].append(peread)
return discs
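`add_disc` registers each discordant pair under its home bin of width r = lmax and, when a coordinate lies within r/2 of a bin boundary, under the neighbouring bin as well, so pairs split across a boundary still share a key. A small sketch of which bins one pair would reach (the helper name, r value, and coordinates are made up for illustration):

```python
r = 10000  # stands in for lmax; the real value is estimated from the BAM

def disc_bins(i, j):
    bins = set()
    for a in (-(r // 2), 0, r // 2):
        for b in (-(r // 2), 0, r // 2):
            # only step into a neighbouring bin when the shift actually crosses a boundary
            if (a == 0 or (i + a) // r != i // r) and (b == 0 or (j + b) // r != j // r):
                bins.add(((i + a) // r * r, (j + b) // r * r))
    return bins

# both coordinates lie near bin boundaries, so the pair lands in four bins
print(sorted(disc_bins(19900, 41000)))
# [(10000, 30000), (10000, 40000), (20000, 30000), (20000, 40000)]
```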

def largest_overlap(items):
@@ -111,13 +113,13 @@ def largest_overlap(items):
positions_j[pos] += 1
i = 0
overlap_i = 0
for pos,overlap in positions_i.iteritems():
for pos,overlap in iteritems(positions_i):
if overlap > overlap_i:
overlap_i = overlap
i = pos
j = 0
overlap_j = 0
for pos,overlap in positions_j.iteritems():
for pos,overlap in iteritems(positions_j):
if overlap > overlap_j:
overlap_j = overlap
j = pos
@@ -151,7 +153,7 @@ def get_candidates(discs,reads_by_LR):
if p_len == None:
return None,None,None
num_cands = 0
for key,items in discs.iteritems():
for key,items in iteritems(discs):
orient = key[4]
si,ei,sj,ej = disc_intersection(items)
if si and sj and len(items) >= MIN_DISCS:
@@ -160,7 +162,7 @@
cand = copy.copy(items[0])
cand.i = i
cand.j = j
barcode_overlaps = barcode_overlap[(cand.chrm,cand.i/d*d,cand.nextchrm,cand.j/d*d)]
barcode_overlaps = barcode_overlap[(cand.chrm,int(cand.i/d)*d,cand.nextchrm,int(cand.j/d)*d)]
if not inblacklist(cand) and ((cand.chrm == cand.nextchrm and cand.j-cand.i < d)\
or barcode_overlaps >= k):
already_appended = sum([1 for x in candidates if x.i == cand.i and x.j == cand.j])
@@ -224,8 +226,8 @@ def make_barcodeDict_user(candidate):
discs_by_barcode[(peread.chrm,peread.nextchrm,peread.barcode)].append(peread)
elif read.is_proper_pair and fragment_length(read) > lmin:
reads_by_LR[(peread.chrm,barcode)].append((peread.start,peread.nextend,peread.hap,peread.mapq))
if barcode not in LRs_by_pos[(peread.chrm,peread.mid()/R*R)]:
LRs_by_pos[(peread.chrm,peread.mid()/R*R)].append(barcode)
if barcode not in LRs_by_pos[(peread.chrm,int(peread.mid()/R)*R)]:
LRs_by_pos[(peread.chrm,int(peread.mid()/R)*R)].append(barcode)
cand = copy.copy(peread)
cand.chrm = chrm1.strip('chr')
cand.i = break1
1 change: 1 addition & 0 deletions src/global_vars.py
@@ -1,4 +1,5 @@
### GLOBALS ####
from __future__ import print_function
import os,json,sys,collections,pysam
import numpy as np

14 changes: 6 additions & 8 deletions src/rank.py
@@ -1,3 +1,4 @@
from __future__ import print_function,division
import os,sys,subprocess,pysam,collections,time,gc,math,json,copy
import scipy.stats,operator,itertools
import multiprocessing as mp
@@ -114,10 +115,7 @@ def spanning(x,candidate):

def discs(candidate,barcode):
ds = discs_by_barcode[(candidate.chrm,candidate.nextchrm,barcode)]
# for d in ds:
# if d.chrm != d.nextchrm:
# print d.chrm,d.nextchrm,d.i,d.j
ds = [(read.mapq+read.nextmapq)/2 for read in ds if \
ds = [int((read.mapq+read.nextmapq)/2) for read in ds if \
candidate.chrm == read.chrm and candidate.nextchrm == read.nextchrm\
and is_close(candidate.i,read.i,read.orient[0]) and \
is_close(candidate.j,read.j,read.orient[1]) and candidate.orient == read.orient]
@@ -182,10 +180,10 @@ def get_LRs(candidate,barcodes):

def get_CSMs(i):
candidate = candidates[i]
break1_barcodes = set(LRs_by_pos[(candidate.chrm,candidate.i/R*R)]+\
LRs_by_pos[(candidate.chrm,candidate.i/R*R-R)]+LRs_by_pos[(candidate.chrm,candidate.i/R*R+R)])
break2_barcodes = set(list(LRs_by_pos[(candidate.nextchrm,candidate.j/R*R)])+\
LRs_by_pos[(candidate.nextchrm,candidate.j/R*R-R)]+LRs_by_pos[(candidate.nextchrm,candidate.j/R*R+R)])
break1_barcodes = set(LRs_by_pos[(candidate.chrm,int(candidate.i/R)*R)]+\
LRs_by_pos[(candidate.chrm,int(candidate.i/R)*R-R)]+LRs_by_pos[(candidate.chrm,int(candidate.i/R)*R+R)])
break2_barcodes = set(list(LRs_by_pos[(candidate.nextchrm,int(candidate.j/R)*R)])+\
LRs_by_pos[(candidate.nextchrm,int(candidate.j/R)*R-R)]+LRs_by_pos[(candidate.nextchrm,int(candidate.j/R)*R+R)])
CSMs,spans = get_LRs(candidate,break1_barcodes.intersection(break2_barcodes))
key = (candidate.i,candidate.j,candidate.orient)
return key,CSMs,spans
23 changes: 13 additions & 10 deletions src/utils.py
@@ -1,3 +1,5 @@
from __future__ import print_function,division
from future.utils import iteritems
from global_vars import *
import math,mpmath,copy
import numpy as np
@@ -95,7 +97,7 @@ def length(self):

class PERead:
__slots__ = ['barcode', 'chrm','orient','start','disc','mapq','end','hap','nextchrm',\
'nextstart','nextend','nexthap','nextmapq']
'nextstart','nextend','nexthap','nextmapq','i','j']
def __init__(self,read):
read_hap = get_hap(read)
r = [read.reference_name.strip('chr'),read.reference_start,read.reference_end,\
@@ -238,7 +240,7 @@ def NB(k,m,r):
return mpmath.gamma(k + r)/(mpmath.gamma(k+1)*mpmath.gamma(r))*np.power(m/(r+m),k)*np.power(r/(r+m),r)

def Pois(k,lam):
return (np.power(lam,k)*math.exp(-lam))/mpmath.gamma(k+1)
return (np.power(lam,k)*math.exp(-lam))/(mpmath.gamma(k+1))

def get_hap(r):
if r.has_tag('HP'):
return r.get_tag('HP')
@@ -305,7 +308,7 @@ def get_orientation(read,mate):
return ['++',read.reference_end,mate.reference_end]

def rnd(num):
return num/R*R
return int(num/R)*R

def flatten(l):
return [item for sublist in l for item in sublist]
@@ -339,21 +342,21 @@ def collapse(a,c):
score = float(linea[8].split(':')[-1])
if 'Y' not in chrom1 and 'Y' not in chrom2:
l.append([chrom1,int(s1),chrom2,int(s2),split,disc,orient,haps,score])
r = lmax/100*100*5
r = int(lmax/100)*100*5
l.sort(key = lambda x: x[-1],reverse=True)
l2 = []
nas = collections.defaultdict(list)
for chrom1,s1,chrom2,s2,split,disc,orient,haps,score in l:
already_appended = False
for i in [-r,0,r]:
for j in [-r,0,r]:
if (chrom1,s1/r*r+i,chrom2,s2/r*r+j) in nas:
ch1,ds1,ch2,ds2 = nas[(chrom1,s1/r*r+i,chrom2,s2/r*r+j)][0:4]
if (chrom1,int(s1/r)*r+i,chrom2,int(s2/r)*r+j) in nas:
ch1,ds1,ch2,ds2 = nas[(chrom1,int(s1/r)*r+i,chrom2,int(s2/r)*r+j)][0:4]
if abs(ds1-s1) < r and abs(ds2-s2) < r:
already_appended = True
if not already_appended:
nas[(chrom1,s1/r*r,chrom2,s2/r*r)] = [chrom1,s1,chrom2,s2,split,disc,orient,haps,score]
for i,elem in nas.iteritems():
nas[(chrom1,int(s1/r)*r,chrom2,int(s2/r)*r)] = [chrom1,s1,chrom2,s2,split,disc,orient,haps,score]
for i,elem in iteritems(nas):
if elem[-1] >= c:
l2.append(elem+['PASS'])
else:
@@ -365,7 +368,7 @@

def write_scores(scores):
fname = 'NAIBR_SVs.bedpe'
print os.path.join(DIR,fname)
print(os.path.join(DIR,fname))
f = open(os.path.join(DIR,fname),'w')
f.write('\t'.join(['Chr1','Break1','Chr2','Break2','Split molecules','Discordant reads',\
'Orientation','Haplotype','Score','Pass filter\n']))
@@ -378,7 +381,7 @@ def parallel_execute(function,input_list):
if NUM_THREADS != 1:
pool = mp.Pool(NUM_THREADS,maxtasksperchild=1)
map_fn = pool.map
print 'running on %s threads'%str(NUM_THREADS)
print('running on %s threads'%str(NUM_THREADS))
data = map_fn(function,input_list)
pool.close()
pool.join()
