diff --git a/RE_sites.py b/RE_sites.py index 130a93b..0829136 100755 --- a/RE_sites.py +++ b/RE_sites.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python import re import sys import argparse @@ -11,7 +11,7 @@ def parse_fasta(fh): if ln[0] == '>': # check for bad name if ":" in ln: - print >> sys.stderr, "Error, your fasta records contain a ':' (%s). SALSA requires fasta to not contain this character, please update your fasta and re-try"%(ln[1:].rstrip()) + print("Error, your fasta records contain a ':' (%s). SALSA requires fasta to not contain this character, please update your fasta and re-try"%(ln[1:].rstrip()), file=sys.stderr) sys.exit(1) # new name line; remember current sequence's short name @@ -22,7 +22,7 @@ def parse_fasta(fh): # append nucleotides to current sequence fa[current_short_name].append(ln.rstrip()) # Part 2: join lists into strings - for short_name, nuc_list in fa.iteritems(): + for short_name, nuc_list in fa.items(): # join this sequence's lines into one long string fa[short_name] = ''.join(nuc_list) return fa @@ -39,14 +39,14 @@ def main(): if args.enzyme == "DNASE": for key in f: id, seq = key, f[key] - print id, len(seq)/2, len(seq)/2 + print(id, len(seq)/2, len(seq)/2) sys.exit(0) enzymes_input = args.enzyme.replace(' ','').split(',') final_enzymes = [] for each in enzymes_input: if not re.match("^[ACGTN]+$", each): - print >> sys.stderr, "Error, enzyme should be restriction site sequence (e.g. AACTT) not enzyme name or DNASE, you input %s"%(each) + print("Error, enzyme should be restriction site sequence (e.g. AACTT) not enzyme name or DNASE, you input %s"%(each), file=sys.stderr) sys.exit(1) if 'N' in each: @@ -85,7 +85,7 @@ def main(): # else: # rigt_count += 1 - print id, left_count, rigt_count + print(id, left_count, rigt_count) diff --git a/alignments2txt.py b/alignments2txt.py index a2c6be4..3af751c 100755 --- a/alignments2txt.py +++ b/alignments2txt.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python import os,sys import argparse import pickle @@ -48,7 +48,7 @@ def update_bed(expanded_scaffold): path = expanded_scaffold[key] scaffold_length[key] = 0 offset = 0 - for i in xrange(0,len(path)-1,2): + for i in range(0,len(path)-1,2): contig = path[i].split(':')[0] contig2scaffold[contig] = key ori = path[i].split(':')[1] + path[i+1].split(':')[1] @@ -117,7 +117,7 @@ def update_bed(expanded_scaffold): #o_lines += curr_scaffold+'\t'+str(new_curr_start)+'\t'+str(new_curr_end)+'\t'+curr_attrs[3]+'\n' count += 1 if count == 1000000: - print olines + print(olines) #output.write(o_lines) count = 0 olines = "" @@ -125,7 +125,7 @@ def update_bed(expanded_scaffold): prev_line = line #write remaining lines - print olines + print(olines) #output.write(o_lines) #output.close() diff --git a/correct.py b/correct.py index 88421e3..ab42e00 100755 --- a/correct.py +++ b/correct.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python import sys #reads input assembly, breakpoints given by the method and outputs new contig file with lengths @@ -18,7 +18,7 @@ def parse_fasta(fh): # append nucleotides to current sequence fa[current_short_name].append(ln.rstrip()) # Part 2: join lists into strings - for short_name, nuc_list in fa.iteritems(): + for short_name, nuc_list in fa.items(): # join this sequence's lines into one long string fa[short_name] = ''.join(nuc_list) return fa @@ -94,7 +94,7 @@ def parse_fasta(fh): ofasta = open(sys.argv[4]+'/asm.cleaned.fasta','w') for seq in contig2newseq: contig_seq = contig2newseq[seq] - chunks = [contig_seq[i:i+80] for i in xrange(0,len(contig_seq),80)] + chunks = [contig_seq[i:i+80] for i in range(0,len(contig_seq),80)] ofasta.write('>'+seq+'\n') for chunk in chunks: ofasta.write(chunk+'\n') diff --git a/fast_scaled_scores.py b/fast_scaled_scores.py index cfb9c87..35ecb2d 100755 --- a/fast_scaled_scores.py +++ b/fast_scaled_scores.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python import networkx as nx import sys import argparse diff --git a/get_seq.py b/get_seq.py index 8484298..e92a429 100755 --- a/get_seq.py +++ b/get_seq.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python import pickle import argparse @@ -16,7 +16,7 @@ def parse_fasta(fh): # append nucleotides to current sequence fa[current_short_name].append(ln.rstrip()) # Part 2: join lists into strings - for short_name, nuc_list in fa.iteritems(): + for short_name, nuc_list in fa.items(): # join this sequence's lines into one long string fa[short_name] = ''.join(nuc_list) return fa @@ -30,7 +30,7 @@ def parse_fasta(fh): args = parser.parse_args() revcompl = lambda x: ''.join([{'A':'T','B':'N','C':'G','G':'C','T':'A','N':'N','R':'N','M':'N','Y':'N','S':'N','W':'N','K':'N','a':'t','c':'g','g':'c','t':'a','n':'n',' ':'',}[B] for B in x][::-1]) -scaff_map = pickle.load(open(args.map,'r')) +scaff_map = pickle.load(open(args.map,'rb')) contig_length = {} id2seq = {} @@ -46,11 +46,11 @@ def parse_fasta(fh): for scaffold in scaff_map: path = scaff_map[scaffold] length = 0 - for i in xrange(0,len(path)-1,2): + for i in range(0,len(path)-1,2): length += contig_length[path[i].split(':')[0]] scaff2length[scaffold] = length -sorted_scaffolds = sorted(scaff2length.items(), key=lambda x: x[1],reverse=True) +sorted_scaffolds = sorted(list(scaff2length.items()), key=lambda x: x[1],reverse=True) c_id = 1 line = "" @@ -114,7 +114,7 @@ def parse_fasta(fh): # rec = SeqRecord(Seq(curr_contig,generic_dna),id='scaffold_'+str(c_id)) # recs.append(rec) # print c_id - chunks = [curr_contig[i:i+80] for i in xrange(0,len(curr_contig),80)] + chunks = [curr_contig[i:i+80] for i in range(0,len(curr_contig),80)] ofile.write('>scaffold_'+str(c_id)+'\n') for chunk in chunks: ofile.write(chunk+'\n') diff --git a/layout_unitigs.py b/layout_unitigs.py index b6a4bb5..820e33e 100755 --- a/layout_unitigs.py +++ b/layout_unitigs.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python import networkx as nx import sys import operator @@ -177,7 +177,7 @@ def load_unitig_mapping(): unitig_graph.add_edge(curr_unitig+':E',prev_unitig+':E') prev_line = line - print >> sys.stderr, 'Unitig tiling graph loaded, nodes = ' + str(len(unitig_graph.nodes())) + ' edges = ' + str(len(unitig_graph.edges())) + print('Unitig tiling graph loaded, nodes = ' + str(len(unitig_graph.nodes())) + ' edges = ' + str(len(unitig_graph.edges())), file=sys.stderr) return unitig_graph @@ -219,7 +219,7 @@ def load_tenx_graph(cutoff): contig2scaffold = {} if int(args.iteration) > 1: try: - previous_scaffolds = pickle.load(open(args.directory+'/scaffolds_iteration_'+str(iteration-1)+'.p','r')) + previous_scaffolds = pickle.load(open(args.directory+'/scaffolds_iteration_'+str(iteration-1)+'.p','rb')) for key in previous_scaffolds: contigs_1 = previous_scaffolds[key] first = contigs_1[0].split(':')[0] @@ -240,7 +240,7 @@ def load_tenx_graph(cutoff): #Load the best score graph first, keep log of used and unused links -print >> sys.stderr, 'Loading Hi-C links ' +print('Loading Hi-C links ', file=sys.stderr) ''' Counts to keep log of each types of edge loaded in the graph ''' @@ -257,11 +257,11 @@ def test_edge(scaffold_first,scaffold_second,G_test,type): last_first = scaffold_first[-1] first_second = scaffold_second[0] last_second = scaffold_second[-1] - print 'testing' - print first_first - print first_second - print last_first - print last_second + print('testing') + print(first_first) + print(first_second) + print(last_first) + print(last_second) if G_test.has_edge(last_first,first_second): v1 = c1+':E' v2 = c2 +':B' @@ -579,8 +579,8 @@ def generate_scaffold_graph(): for ctg in list(contigs): G.add_edge(ctg+":B", ctg+":E", t="c", score=0, linktype='implicit') - print >> sys.stderr, 'Hybrid scaffold graph loaded, nodes = ' + str(len(G.nodes())) + ' edges = ' + str(len(G.edges())) - print >> sys.stderr, 'Hi-C implied edges = ' + str(hic_edges) + print('Hybrid scaffold graph loaded, nodes = ' + str(len(G.nodes())) + ' edges = ' + str(len(G.edges())), file=sys.stderr) + print('Hi-C implied edges = ' + str(hic_edges), file=sys.stderr) ''' @@ -592,7 +592,8 @@ def get_seed_scaffold(): seed_scaffolds = {} #this stores initial long scaffolds to_merge = set() - for subg in nx.connected_component_subgraphs(G): + #for subg in nx.connected_component_subgraphs(G): + for subg in (G.subgraph(c) for c in nx.connected_components(G)): p = [] for node in subg.nodes(): if subg.degree(node) == 1: @@ -664,7 +665,7 @@ def insert(assignment, seed_scaffolds): orientation = '' pos = -1 #first check at all middle positions - for i in xrange(1,len(path)-1,2): + for i in range(1,len(path)-1,2): score_fow = -1 score_rev = -1 if all_G.has_edge(five_prime,path[i]) and all_G.has_edge(three_prime,path[i+1]): @@ -722,7 +723,7 @@ def update_bed(expanded_scaffold): path = expanded_scaffold[key] scaffold_length[key] = 0 offset = 0 - for i in xrange(0,len(path),2): + for i in range(0,len(path),2): contig = path[i].split(':')[0] contig2scaffold[contig] = key ori = path[i].split(':')[1] + path[i+1].split(':')[1] @@ -735,7 +736,7 @@ def update_bed(expanded_scaffold): scaffold_re = {} for key in expanded_scaffold: - print key + print(key) path = expanded_scaffold[key] length = scaffold_length[key] offset = 0 @@ -751,7 +752,7 @@ def update_bed(expanded_scaffold): scaffold_re[key] = (right,left) else: midpoint = length/2 - for i in xrange(0,len(path),2): + for i in range(0,len(path),2): contig = path[i].split(':')[0] contig2scaffold[contig] = key left,right = re_counts[contig] @@ -915,7 +916,8 @@ def merge(contigs): #print best_hic_graph.nodes() - for g in nx.connected_component_subgraphs(best_hic_graph): + #for g in nx.connected_component_subgraphs(best_hic_graph): + for g in (best_hic_graph.subgraph(c) for c in nx.connected_components(best_hic_graph)): #print g.nodes() p = [] for node in g.nodes(): @@ -983,7 +985,7 @@ def correct_scaffolds(curr_scaffolds): path = final_scaffolds[key] new_path = [] #print path - for i in xrange(0,len(path)-1,2): + for i in range(0,len(path)-1,2): #print path contig = path[i].split(':')[0] scaffolded_contigs[contig] = True @@ -1010,7 +1012,7 @@ def correct_scaffolds(curr_scaffolds): oline = '' oline += 'scaffold_'+str(key) +'\t' cum_len = 0 - for i in xrange(0,len(path)-1,2): + for i in range(0,len(path)-1,2): #print path contig = path[i].split(':')[0] cum_len += prev_len[contig] @@ -1095,7 +1097,7 @@ def correct_scaffolds(curr_scaffolds): # expanded_scaffold_paths = updated_scaffolds -pickle.dump(expanded_scaffold_paths,open(args.directory+'/scaffolds_iteration_'+str(args.iteration)+'.p','w')) +pickle.dump(expanded_scaffold_paths,open(args.directory+'/scaffolds_iteration_'+str(args.iteration)+'.p','wb')) update_bed(expanded_scaffold_paths) diff --git a/make_links.py b/make_links.py index 26c3e07..fe10a34 100755 --- a/make_links.py +++ b/make_links.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python import operator import math import sys @@ -55,7 +55,7 @@ def main(): coordinates_map = {} first_in_pair = {} second_in_pair = {} - print >> sys.stderr, 'bedfile started' + print('bedfile started', file=sys.stderr) prev_line = '' with open(args.mapping,'r') as f: for line in f: @@ -78,8 +78,8 @@ def main(): if prev_read.split('/')[0] == curr_read.split('/')[0]: #print 'here' - pos1 = (long(prev_attrs[1]) + long(prev_attrs[2]))/2.0 - pos2 = (long(attrs[1]) + long(attrs[2]))/2.0 + pos1 = (int(prev_attrs[1]) + int(prev_attrs[2]))/2.0 + pos2 = (int(attrs[1]) + int(attrs[2]))/2.0 # l1 = 300000.0 # l2 = 300000.0 #print pos1, pos2 @@ -152,11 +152,11 @@ def main(): # continue - print >> sys.stderr, 'bedfile loaded' + print('bedfile loaded', file=sys.stderr) - print len(contig_links) + print(len(contig_links)) ofile = open(args.directory+'/contig_links_iteration_'+str(iteration),'w') for key in contig_links: if key != '': diff --git a/refactor_breaks.py b/refactor_breaks.py index 109e0a4..df10c1a 100755 --- a/refactor_breaks.py +++ b/refactor_breaks.py @@ -10,7 +10,7 @@ iteration = int(args.iteration) breakpoints = {} -scaffolds_current = pickle.load(open(args.directory+'/scaffolds_iteration_'+str(iteration-1)+'.p','r')) +scaffolds_current = pickle.load(open(args.directory+'/scaffolds_iteration_'+str(iteration-1)+'.p','rb')) with open(args.directory+'/misasm_iteration_'+args.iteration+'.report','r') as f: for line in f: @@ -22,7 +22,7 @@ if len(attrs) < 2: continue breakpoints[attrs[0]] = [] - for i in xrange(1,len(attrs)): + for i in range(1,len(attrs)): breakpoints[attrs[0]].append(int(attrs[i])) except: continue @@ -49,7 +49,7 @@ def update_bed(expanded_scaffold): path = expanded_scaffold[key] scaffold_length[key] = 0 offset = 0 - for i in xrange(0,len(path)-1,2): + for i in range(0,len(path)-1,2): contig = path[i].split(':')[0] contig2scaffold[contig] = key ori = path[i].split(':')[1] + path[i+1].split(':')[1] @@ -77,7 +77,7 @@ def update_bed(expanded_scaffold): scaffold_re[key] = (right,left) else: midpoint = length/2 - for i in xrange(0,len(path),2): + for i in range(0,len(path),2): contig = path[i].split(':')[0] left,right = re_counts[contig] curr_contig_start = contig2info[contig][0] @@ -218,7 +218,7 @@ def update_bed(expanded_scaffold): #print key cum_len = 0 breaks_list = [] - for i in xrange(1,len(scaffolds_current[key]),2): + for i in range(1,len(scaffolds_current[key]),2): utg = scaffolds_current[key][i].split(':')[0] cum_len += unitig_length[utg] positions = breakpoints[key] @@ -228,7 +228,7 @@ def update_bed(expanded_scaffold): #print "found" breaks_list.append(i) break - print breaks_list + print(breaks_list) if len(breaks_list) == 0: scaffolds_new['scaffold_'+str(scaff_id)] = scaffolds_current[key] scaff_id += 1 @@ -237,8 +237,8 @@ def update_bed(expanded_scaffold): #print 'here' first_part = scaffolds_current[key][:breaks_list[0]+1] second_part = scaffolds_current[key][breaks_list[0]+1:] - print "first part : " + str(first_part) - print "second part : " + str(second_part) + print("first part : " + str(first_part)) + print("second part : " + str(second_part)) first_id = 'scaffold_'+str(scaff_id) scaff_id += 1 second_id = 'scaffold_'+str(scaff_id) @@ -250,23 +250,23 @@ def update_bed(expanded_scaffold): else: prev = 0 prev_scaffold = '' - for i in xrange(len(breaks_list)): + for i in range(len(breaks_list)): if i == len(breaks_list) - 1: - print "start : " + str(prev)+"\tend : " + str(breaks_list[i]+1) + print("start : " + str(prev)+"\tend : " + str(breaks_list[i]+1)) scaff = scaffolds_current[key][prev:breaks_list[i]+1] - print "part : " + str(scaff) + print("part : " + str(scaff)) scaffolds_new['scaffold_'+str(scaff_id)] = scaff scaff_id += 1 - print "start : "+str(breaks_list[i]+1) + print("start : "+str(breaks_list[i]+1)) scaff = scaffolds_current[key][breaks_list[i]+1:] - print "part : " + str(scaff) + print("part : " + str(scaff)) scaffolds_new['scaffold_'+str(scaff_id)] = scaff avoid_file.write(prev_scaffold+'\t'+'scaffold_'+str(scaff_id)+'\n') scaff_id += 1 continue - print 'start = '+str(prev)+'\tend = '+ str(breaks_list[i]+1) + print('start = '+str(prev)+'\tend = '+ str(breaks_list[i]+1)) scaff = scaffolds_current[key][prev:breaks_list[i]+1] - print "part : " + str(scaff) + print("part : " + str(scaff)) prev = breaks_list[i]+1 scaffolds_new['scaffold_'+str(scaff_id)] = scaff if prev_scaffold == '': @@ -275,9 +275,9 @@ def update_bed(expanded_scaffold): avoid_file.write(prev_scaffold+'\t'+'scaffold_'+str(scaff_id)+'\n') prev_scaffold = 'scaffold_'+str(scaff_id) scaff_id += 1 - print "=================" + print("=================") #print scaffolds_new.keys() -pickle.dump(scaffolds_new,open(args.directory+'/scaffolds_iteration_'+str(int(args.iteration) -1)+'.p','w')) +pickle.dump(scaffolds_new,open(args.directory+'/scaffolds_iteration_'+str(int(args.iteration) -1)+'.p','wb')) update_bed(scaffolds_new) ofile = open(args.directory+'/misasm_'+args.iteration+'.DONE','w') ofile.close() diff --git a/run_pipeline.py b/run_pipeline.py index fc18819..96b2dc5 100755 --- a/run_pipeline.py +++ b/run_pipeline.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python import os import stat import argparse @@ -24,9 +24,9 @@ def NG50(lengths,sz = 0): genome_size = sz if genome_size == 0: genome_size = sum(lengths.values()) - contig_lengths = sorted(lengths.values(),reverse=True) + contig_lengths = sorted(list(lengths.values()),reverse=True) lensum = 0 - for i in xrange(len(contig_lengths)): + for i in range(len(contig_lengths)): lensum += contig_lengths[i] if lensum >= genome_size/2: return contig_lengths[i] @@ -75,7 +75,7 @@ def main(): p = subprocess.check_output(cmd, shell=True) os.chmod(args.output + "/scaffold_length_iteration_1", stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IROTH) except subprocess.CalledProcessError as err: - print >> sys.stderr, str(err.output) + print(str(err.output), file=sys.stderr) sys.exit(1) @@ -104,14 +104,14 @@ def main(): try: p = subprocess.check_output(cmd,shell=True) except subprocess.CalledProcessError as err: - print >> sys.stderr,str(err.output) + print(str(err.output), file=sys.stderr) - cmd = 'python2 ' +bin+'/correct.py ' + args.assembly + ' '+args.output+'/input_breaks '+args.output+'/alignment_iteration_1.bed '+args.output + cmd = 'python ' +bin+'/correct.py ' + args.assembly + ' '+args.output+'/input_breaks '+args.output+'/alignment_iteration_1.bed '+args.output log.write(cmd) try: p = subprocess.check_output(cmd,shell=True) except subprocess.CalledProcessError as err: - print >> sys.stderr, str(err.output) + print(str(err.output), file=sys.stderr) os.system("mv " + args.output+"//alignment_iteration_1.tmp.bed " + args.output+"//alignment_iteration_1.bed") os.system("mv " + args.output+"//asm.cleaned.fasta " + args.output+"//assembly.cleaned.fasta") @@ -124,28 +124,28 @@ def main(): #First get RE sites if not fileExists(args.output+'/re_counts_iteration_'+str(iter_num)): try: - cmd = 'python2 '+bin+'/RE_sites.py -a '+args.output + '/assembly.cleaned.fasta -e '+ args.enzyme + ' > '+ args.output+'/re_counts_iteration_'+str(iter_num) - print cmd + cmd = 'python '+bin+'/RE_sites.py -a '+args.output + '/assembly.cleaned.fasta -e '+ args.enzyme + ' > '+ args.output+'/re_counts_iteration_'+str(iter_num) + print(cmd) log.write(cmd+'\n') p = subprocess.check_output(cmd,shell=True) except subprocess.CalledProcessError as err: # if os.path.isfile(args.output+'/re_counts_iteration_'+str(iter_num)): # os.system(args.output+'/RE_sites_iteration_'+str(iter_num)) - print >> sys.stderr, str(err.output) + print(str(err.output), file=sys.stderr) sys.exit(1) #Now compute normal links with old new_links code - print >> sys.stderr, "Starting Iteration "+ str(iter_num) + print("Starting Iteration "+ str(iter_num), file=sys.stderr) if not fileExists(args.output+'/contig_links_iteration_'+str(iter_num)): try: - cmd = 'python2 '+bin+'/make_links.py -b '+ args.output+'/alignment_iteration_1.bed' + ' -d '+ args.output +' -i '+str(iter_num) + ' -x ' + args.dup - print cmd + cmd = 'python '+bin+'/make_links.py -b '+ args.output+'/alignment_iteration_1.bed' + ' -d '+ args.output +' -i '+str(iter_num) + ' -x ' + args.dup + print(cmd) log.write(cmd+'\n') p = subprocess.check_output(cmd,shell=True) except subprocess.CalledProcessError as err: - print >> sys.stderr, str(err.output) + print(str(err.output), file=sys.stderr) if os.path.isfile(args.output+'/contig_links_iteration_'+str(iter_num)): os.system('rm '+args.output+'/contig_links_iteration_'+str(iter_num)) sys.exit(1) @@ -153,14 +153,14 @@ def main(): #now use Serge's code to calculate if not fileExists(args.output+'/contig_links_scaled_iteration_'+str(iter_num)): try: - cmd = 'python2 '+bin+'/fast_scaled_scores.py -d '+args.output+' -i '+str(iter_num) + cmd = 'python '+bin+'/fast_scaled_scores.py -d '+args.output+' -i '+str(iter_num) log.write(cmd+'\n') - print cmd + print(cmd) p = subprocess.check_output(cmd,shell=True) except subprocess.CalledProcessError as err: if os.path.isfile(args.output+'/contig_links_scaled_iteration_'+str(iter_num)): os.system('rm '+args.output+'/contig_links_scaled_iteration_'+str(iter_num)) - print >> sys.stderr, str(err.output) + print(str(err.output), file=sys.stderr) sys.exit(1) #Sort the links by column 5 @@ -168,12 +168,12 @@ def main(): try: cmd = 'sort -k 5 -gr '+args.output+'/contig_links_scaled_iteration_'+str(iter_num)+ ' > '+ args.output+'/contig_links_scaled_sorted_iteration_'+str(iter_num) log.write(cmd+'\n') - print cmd + print(cmd) p = subprocess.check_output(cmd,shell=True) except subprocess.CalledProcessError as err: if os.path.isfile(args.output+'/contig_links_scaled_sorted_iteration_'+str(iter_num)): os.system('rm '+args.output+'/contig_links_scaled_sorted_iteration_'+str(iter_num)) - print >> sys.stderr, str(err.output) + print(str(err.output), file=sys.stderr) sys.exit(1) if args.gfa != 'abc' and not os.path.isfile(args.output+'/tmp.links'): @@ -182,7 +182,7 @@ def main(): log.write(cmd+'\n') p = subprocess.check_output(cmd,shell=True) except subprocess.CalledProcessError as err: - print >> sys.stderr, str(err.output) + print(str(err.output), file=sys.stderr) sys.exit(1) os.system('mv ' + args.output+'/tmp.links '+args.output+'/contig_links_scaled_sorted_iteration_1') @@ -190,46 +190,46 @@ def main(): if not os.path.isfile(args.output+'/scaffolds_iteration_1.p'): try: - cmd = 'python2 '+bin+'/layout_unitigs.py -x '+args.gfa + ' -l '+args.output+'/contig_links_scaled_sorted_iteration_'+str(iter_num) +' -c ' +str(args.cutoff)+' -i 1 -d '+args.output + cmd = 'python '+bin+'/layout_unitigs.py -x '+args.gfa + ' -l '+args.output+'/contig_links_scaled_sorted_iteration_'+str(iter_num) +' -c ' +str(args.cutoff)+' -i 1 -d '+args.output log.write(cmd+'\n') - print cmd + print(cmd) p = subprocess.check_output(cmd,shell=True) except subprocess.CalledProcessError as err: if os.path.isfile(args.output+'scaffolds_iteration_1.p'): os.system('rm '+args.output+'scaffolds_iteration_1.p') - print >> sys.stderr, str(err.output) + print(str(err.output), file=sys.stderr) sys.exit(1) if not fileExists(args.output+'/misasm_iteration_'+str(iter_num+1)+'.report'): try: cmd = bin+'/break_contigs -a ' + args.output+'/alignment_iteration_'+str(iter_num+1)+'.bed -b ' + args.output+'/breakpoints_iteration_'+str(iter_num+1)+'.txt -l '+ args.output+'/scaffold_length_iteration_'+str(iter_num+1) + ' -i '+str(iter_num+1)+' -s 100 > ' + args.output+'/misasm_iteration_'+str(iter_num+1)+'.report' p = subprocess.check_output(cmd,shell=True) - print cmd + print(cmd) log.write(cmd+'\n') except subprocess.CalledProcessError as err: - print >> sys.stderr, str(err.output) + print(str(err.output), file=sys.stderr) sys.exit(1) if not fileExists(args.output+'/misasm_'+str(iter_num+1)+'.DONE'): try: - cmd = 'python2 '+bin+'/refactor_breaks.py -d ' + args.output + ' -i ' + str(iter_num+1) + cmd = 'python '+bin+'/refactor_breaks.py -d ' + args.output + ' -i ' + str(iter_num+1) p = subprocess.check_output(cmd,shell=True) - print cmd + print(cmd) log.write(cmd+'\n') #os.system('mv scaffold_length_iteration_'+str(iter_num+1)+'_tmp scaffold_length_iteration'+str(iter_num+1)) #os.system('mv re_counts_iteration_'+str(iter_num+1)+'_tmp re_counts_iteration_'+str(iter_num+1)) #os.system('mv alignment_iteration_'+str(iter_num+1)+'_tmp.bed alignment_iteration_'+str(iter_num+1)+'.bed') except subprocess.CalledProcessError as err: - print >> sys.stderr, str(err.output) + print(str(err.output), file=sys.stderr) sys.exit(1) if args.prnt == 'yes': - cmd = 'python2 ' + bin+'/get_seq.py -a '+ args.output +'/assembly.cleaned.fasta -f ' + args.output+'/scaffolds_ITERATION_'+str(iter_num)+'.fasta -g ' + args.output+'/scaffolds_ITERATION_'+str(iter_num)+'.agp -p '+ args.output+'/scaffolds_iteration_'+str(iter_num)+'.p' + cmd = 'python ' + bin+'/get_seq.py -a '+ args.output +'/assembly.cleaned.fasta -f ' + args.output+'/scaffolds_ITERATION_'+str(iter_num)+'.fasta -g ' + args.output+'/scaffolds_ITERATION_'+str(iter_num)+'.agp -p '+ args.output+'/scaffolds_iteration_'+str(iter_num)+'.p' log.write(cmd+'\n') try: p = subprocess.check_output(cmd,shell=True) except subprocess.CalledProcessError as err: - print >> sys.stderr, str(err.output) + print(str(err.output), file=sys.stderr) iter_num += 1 @@ -240,37 +240,37 @@ def main(): scaffold_length[attrs[0]] = int(attrs[1]) ng50.append(NG50(scaffold_length,genome_size)) if iter_num - 1 == int(args.iter): - cmd ='python2 '+bin+'/get_seq.py -a '+ args.output + '/assembly.cleaned.fasta -f ' + args.output+'/scaffolds_FINAL.fasta -g ' + args.output+'/scaffolds_FINAL.agp -p '+args.output+'/scaffolds_iteration_'+str(args.iter)+'.p' + cmd ='python '+bin+'/get_seq.py -a '+ args.output + '/assembly.cleaned.fasta -f ' + args.output+'/scaffolds_FINAL.fasta -g ' + args.output+'/scaffolds_FINAL.agp -p '+args.output+'/scaffolds_iteration_'+str(args.iter)+'.p' log.write(cmd+'\n') os.system(cmd) sys.exit(0) #now do iterative while True: - print >> sys.stderr, "Starting Iteration "+ str(iter_num) + print("Starting Iteration "+ str(iter_num), file=sys.stderr) if not fileExists(args.output+'/contig_links_iteration_'+str(iter_num)): try: - cmd = 'python2 '+bin+'/make_links.py -b '+ args.output+'/alignment_iteration_'+str(iter_num)+'.bed' + ' -d '+ args.output +' -i '+str(iter_num) - print cmd + cmd = 'python '+bin+'/make_links.py -b '+ args.output+'/alignment_iteration_'+str(iter_num)+'.bed' + ' -d '+ args.output +' -i '+str(iter_num) + print(cmd) p = subprocess.check_output(cmd,shell=True) log.write(cmd+'\n') #os.system("rm "+args.output+'/alignment_iteration_'+str(iter_num)+'.bed') except subprocess.CalledProcessError as err: - print >> sys.stderr, str(err.output) + print(str(err.output), file=sys.stderr) if os.path.isfile(args.output+'/contig_links_iteration_'+str(iter_num)): os.system('rm '+args.output+'/contig_links_iteration_'+str(iter_num)) sys.exit(1) - print >> sys.stderr, "Starting Iteration "+ str(iter_num) + print("Starting Iteration "+ str(iter_num), file=sys.stderr) if not os.path.isfile(args.output+'/contig_links_scaled_iteration_'+str(iter_num)): try: - cmd = 'python2 '+bin+'/fast_scaled_scores.py -d '+args.output+' -i '+str(iter_num) + cmd = 'python '+bin+'/fast_scaled_scores.py -d '+args.output+' -i '+str(iter_num) log.write(cmd+'\n') p = subprocess.check_output(cmd,shell=True) except subprocess.CalledProcessError as err: if os.path.isfile(args.output+'/contig_links_scaled_iteration_'+str(iter_num)): os.system('rm '+args.output+'/contig_links_scaled_iteration_'+str(iter_num)) - print >> sys.stderr, str(err.output) + print(str(err.output), file=sys.stderr) sys.exit(1) if not fileExists(args.output+'/contig_links_scaled_sorted_iteration_'+str(iter_num)): @@ -281,57 +281,57 @@ def main(): except subprocess.CalledProcessError as err: if os.path.isfile(args.output+'/new_links_scaled_sorted_iteration_'+str(iter_num)): os.system('rm '+args.output+'/new_links_scaled_sorted_iteration_'+str(iter_num)) - print >> sys.stderr, str(err.output) + print(str(err.output), file=sys.stderr) sys.exit(1) #NOW check if any useful link here if check(args.output+'/contig_links_scaled_sorted_iteration_'+str(iter_num)): - print >> sys.stderr, "WARNING: Not enough Hi-C reads for scaffolding" + print("WARNING: Not enough Hi-C reads for scaffolding", file=sys.stderr) if not fileExists(args.output+'/scaffolds_iteration_'+str(iter_num)+'.p'): try: - cmd = 'python2 '+bin+'/layout_unitigs.py -x abc -l '+args.output+'/contig_links_scaled_sorted_iteration_'+str(iter_num) +' -c ' +str(args.cutoff)+' -i '+str(iter_num)+' -d '+args.output - print cmd + cmd = 'python '+bin+'/layout_unitigs.py -x abc -l '+args.output+'/contig_links_scaled_sorted_iteration_'+str(iter_num) +' -c ' +str(args.cutoff)+' -i '+str(iter_num)+' -d '+args.output + print(cmd) log.write(cmd+'\n') p = subprocess.check_output(cmd,shell=True) except subprocess.CalledProcessError as err: if os.path.isfile(args.output+'scaffolds_iteration_'+str(iter_num)+'.p'): os.system('rm '+args.output+'scaffolds_iteration_'+str(iter_num)+'.p') - print >> sys.stderr, str(err.output) + print(str(err.output), file=sys.stderr) sys.exit(1) if not os.path.isfile(args.output+'/misasm_iteration_'+str(iter_num+1)+'.report'): try: cmd = bin+'/break_contigs -a ' + args.output+'/alignment_iteration_'+str(iter_num+1)+'.bed -b ' + args.output+'/breakpoints_iteration_'+str(iter_num+1)+'.txt -l '+ args.output+'/scaffold_length_iteration_'+str(iter_num+1) + ' -i '+str(iter_num+1)+' -s 100 > ' + args.output+'/misasm_iteration_'+str(iter_num+1)+'.report' - print cmd + print(cmd) log.write(cmd+'\n') p = subprocess.check_output(cmd,shell=True) except subprocess.CalledProcessError as err: - print >> sys.stderr, str(err.output) + print(str(err.output), file=sys.stderr) sys.exit(1) if not os.path.isfile(args.output+'/misasm_'+str(iter_num+1)+'.DONE'): try: - cmd = 'python2 '+bin+'/refactor_breaks.py -d ' + args.output + ' -i ' + str(iter_num+1) + ' > '+args.output+'/misasm_'+str(iter_num+1)+'.log' - print cmd + cmd = 'python '+bin+'/refactor_breaks.py -d ' + args.output + ' -i ' + str(iter_num+1) + ' > '+args.output+'/misasm_'+str(iter_num+1)+'.log' + print(cmd) log.write(cmd+'\n') p = subprocess.check_output(cmd,shell=True) #os.system('mv scaffold_length_iteration_'+str(iter_num+1)+'_tmp scaffold_length_iteration'+str(iter_num+1)) #os.system('mv re_counts_iteration_'+str(iter_num+1)+'_tmp re_counts_iteration_'+str(iter_num+1)) #os.system('mv alignment_iteration_'+str(iter_num+1)+'_tmp.bed alignment_iteration_'+str(iter_num+1)+'.bed') except subprocess.CalledProcessError as err: - print >> sys.stderr, str(err.output) + print(str(err.output), file=sys.stderr) sys.exit(1) if args.prnt == 'yes': - cmd = 'python2 ' + bin+'/get_seq.py -a '+ args.output +'/assembly.cleaned.fasta -f ' + args.output+'/scaffolds_ITERATION_'+str(iter_num)+'.fasta -g ' + args.output+'/scaffolds_ITERATION_'+str(iter_num)+'.agp -p '+ args.output+'/scaffolds_iteration_'+str(iter_num)+'.p' + cmd = 'python ' + bin+'/get_seq.py -a '+ args.output +'/assembly.cleaned.fasta -f ' + args.output+'/scaffolds_ITERATION_'+str(iter_num)+'.fasta -g ' + args.output+'/scaffolds_ITERATION_'+str(iter_num)+'.agp -p '+ args.output+'/scaffolds_iteration_'+str(iter_num)+'.p' log.write(cmd+'\n') try: p = subprocess.check_output(cmd,shell=True) except subprocess.CalledProcessError as err: - print >> sys.stderr, str(err.output) + print(str(err.output), file=sys.stderr) scaffold_length = {} with open(args.output+'/scaffold_length_iteration_'+str(iter_num+1),'r') as f: @@ -341,14 +341,14 @@ def main(): ng50.append(NG50(scaffold_length,genome_size)) curr_sz = len(ng50) if ng50[curr_sz - 1] == ng50[curr_sz - 2]: - cmd ='python2 '+bin+'/get_seq.py -a '+ args.output + '/assembly.cleaned.fasta -f ' + args.output+'/scaffolds_FINAL.fasta -g ' + args.output+'/scaffolds_FINAL.agp -p '+args.output+'/scaffolds_iteration_'+str(iter_num-1)+'.p' + cmd ='python '+bin+'/get_seq.py -a '+ args.output + '/assembly.cleaned.fasta -f ' + args.output+'/scaffolds_FINAL.fasta -g ' + args.output+'/scaffolds_FINAL.agp -p '+args.output+'/scaffolds_iteration_'+str(iter_num-1)+'.p' log.write(cmd+'\n') os.system(cmd) sys.exit(0) if iter_num - 1 == int(args.iter): - cmd ='python2 '+bin+'/get_seq.py -a '+ args.output + '/assembly.cleaned.fasta -f ' + args.output+'/scaffolds_FINAL.fasta -g ' + args.output+'/scaffolds_FINAL.agp -p '+args.output+'/scaffolds_iteration_'+str(args.iter)+'.p' + cmd ='python '+bin+'/get_seq.py -a '+ args.output + '/assembly.cleaned.fasta -f ' + args.output+'/scaffolds_FINAL.fasta -g ' + args.output+'/scaffolds_FINAL.agp -p '+args.output+'/scaffolds_iteration_'+str(args.iter)+'.p' log.write(cmd+'\n') os.system(cmd) sys.exit(0) diff --git a/stitch.py b/stitch.py index 720ff6e..b5ef337 100755 --- a/stitch.py +++ b/stitch.py @@ -22,12 +22,12 @@ def parse_fasta(fh): fa[current_short_name] = [] else: fa[current_short_name].append(ln.rstrip()) - for short_name, nuc_list in fa.iteritems(): + for short_name, nuc_list in fa.items(): fa[short_name] = ''.join(nuc_list) return fa -scaffolds = pickle.load(open(args.pickle,'r')) +scaffolds = pickle.load(open(args.pickle,'rb')) contig_seqs = parse_fasta(open(args.contigs,'r')) unitig_seqs = parse_fasta(open(args.unitigs,'r')) @@ -68,7 +68,7 @@ def parse_fasta(fh): contig_end = 0 path_normal = [] path_with_ori = [] - for i in xrange(0,len(path),2): + for i in range(0,len(path),2): path_normal.append(path[i].split(':')[0]) if path[i].split(':')[1] + path[i+1].split(':')[1] == 'BE': path_with_ori.append(path[i].split(':')[0]+':FOW') @@ -166,9 +166,9 @@ def parse_fasta(fh): scaffold_seq = '' gap = '' - for i in xrange(500): + for i in range(500): gap += 'N' - for i in xrange(len(new_path)): + for i in range(len(new_path)): if len(new_path[i]) != 4: utg,ori = new_path[i].split(':') if ori == 'FOW': @@ -191,14 +191,14 @@ def parse_fasta(fh): scaffold_seq += gap ofile.write('>scaffold_'+str(scaffold_id)+'\n') scaffold_id += 1 - chunks = [scaffold_seq[i:i+80] for i in xrange(0,len(scaffold_seq),80)] + chunks = [scaffold_seq[i:i+80] for i in range(0,len(scaffold_seq),80)] for each in chunks: ofile.write(each+'\n') - print "==============================" - print path_with_ori - print new_path - print "=============================" + print("==============================") + print(path_with_ori) + print(new_path) + print("=============================")