forked from enormandeau/Scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathalignment_cluster.py
More file actions
executable file
·190 lines (143 loc) · 4.61 KB
/
alignment_cluster.py
File metadata and controls
executable file
·190 lines (143 loc) · 4.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Cluster aligned sequences based on the number of mismatches
__authors__ = "Eric Normandeau"
__program_name__ = "alignment_cluster"
__version_info__ = ('0', '0', '1')
__version__ = '.'.join(__version_info__)
__copyright__ = "Copyright (c) 2011 Eric Normandeau"
__license__ = "GPLv3"
__revision_date__ = "2011-06-13"
# Importing modules
import os
import sys
import re
import getopt
import platform
try:
from Bio import SeqIO
except:
print "This program requires the Biopython library"
sys.exit(0)
# Function definitions
def help():
    """Print the man-page style help text of the program.

    The page is filled in from the module-level metadata
    (__program_name__, __authors__, __version__, __revision_date__).
    When platform.system() reports 'Windows', the ANSI bold/underline
    escape sequences are stripped from the text before it is printed.
    """
    name = __program_name__
    text = """
%s(1) User Commands %s(1)
\033[1mNAME\033[0m
\t%s - Cluster aligned sequences
\033[1mSYNOPSIS\033[0m
\t\033[1mpython %s.py \033[0m[\033[4mOPTIONS\033[0m]
\033[1mDESCRIPTION\033[0m
\tRegroup aligned sequences based on similarity
\t%s takes a fasta alignment file for input and clusters the sequences.
\033[1mOPTIONS\033[0m
\t\033[1m-h, --help\033[0m
\t\tDisplay the help of this program
\t\033[1m-i, --input\033[0m
\t\tInput file
\t\033[1m-o, --output\033[0m
\t\tOutput file
\t\033[1m-m, --mismatch\033[0m
\t\tCost of mismatches (integer, default=1)
\t\033[1m-d, --indel\033[0m
\t\tCost of insertions or deletions (integer, default=1)
\t\033[1m-c, --cutoff\033[0m
\t\tMaximum score accepted to cluster sequences (integer, default=3)
\033[1mAUTHORS\033[0m
\t%s
%s %s %s %s(1)
"""%(name, name, name, name, name, __authors__, name, __version__,
        __revision_date__, name)
    if platform.system() == 'Windows':
        # Remove the bold / reset / underline escape codes used above
        for escape_code in ("\033[1m", "\033[0m", "\033[4m"):
            text = text.replace(escape_code, "")
    # A parenthesised single argument prints identically on Python 2 and 3
    print(text)
def short_help(msg):
    """Report bad user input and terminate the program.

    msg -- explanation of what was wrong with the command line

    The message and a pointer to the -h option are written to standard
    error, then SystemExit is raised with status 1 so that a calling
    shell or pipeline can tell that the run failed.
    """
    sys.stderr.write("%s\n" % (msg,))
    sys.stderr.write("Use -h for help\n")
    sys.exit(1)
def score_sequences(s1, s2, m, d):
    """Return the dissimilarity score of two aligned sequences.

    s1, s2 -- aligned sequences of identical length; "-" marks a gap
    m      -- cost added for every column holding two different non-gap
              characters
    d      -- cost added for every column where the characters differ
              and at least one of them is a gap

    Columns holding the same character (gap against gap included) add
    nothing to the score.

    Raises ValueError when the two sequences differ in length, because
    their columns can then no longer be compared one to one.
    """
    if len(s1) != len(s2):
        raise ValueError(
            "aligned sequences must have the same length (%d != %d)"
            % (len(s1), len(s2)))
    score = 0
    for n1, n2 in zip(s1, s2):
        if n1 == n2:
            continue
        score += d if "-" in (n1, n2) else m
    return score
# Does alignment_clean
# neatify your sequences
# up to your standards?
# The program itself
def _non_negative_int(value, option_name):
    """Convert a raw option value to an integer >= 0.

    value       -- string received from getopt
    option_name -- long name of the option, used in the error message

    Calls short_help() when the value is not an integer or is negative.
    """
    try:
        number = int(value)
    except ValueError:
        number = -1
    # Explicit comparison instead of `assert`, which `python -O` strips
    if number < 0:
        short_help("Positive integer needed for '%s' option" % option_name)
    return number


if __name__ == "__main__":
    try:
        opts, _positional = getopt.getopt(
            sys.argv[1:], "hi:o:m:d:c:",
            ["help", "input=", "output=", "mismatch=", "indel=", "cutoff="])
    except getopt.GetoptError as e:
        # short_help() prints the "Use -h for help" hint itself
        short_help("Input error: %s" % e)

    # Defaults; the file names stay None until -i / -o are seen
    input_file = None
    output_file = None
    mismatch = 1
    indel = 1
    cutoff = 3

    for option, value in opts:
        if option in ('-h', '--help'):
            help()
            sys.exit(0)
        elif option in ('-i', '--input'):
            input_file = value
        elif option in ('-o', '--output'):
            output_file = value
        elif option in ('-m', '--mismatch'):
            mismatch = _non_negative_int(value, "mismatch")
        elif option in ('-d', '--indel'):
            indel = _non_negative_int(value, "indel")
        elif option in ('-c', '--cutoff'):
            cutoff = _non_negative_int(value, "cutoff")

    if input_file is None:
        short_help("Input Error: No input file specified or file not found.")
    if output_file is None:
        short_help("Output Error: No output file specified or incorect path.")

    # Load every record once, keeping only what the comparison needs.
    # str(record.seq) replaces the removed Seq.tostring() method, and the
    # `with` block closes the input handle as soon as parsing is done.
    try:
        with open(input_file) as handle:
            records = [(record.id, str(record.seq))
                       for record in SeqIO.parse(handle, 'fasta')]
    except IOError:
        short_help("Input Error: No input file specified or file not found.")

    # Column-by-column scoring only makes sense on a true alignment
    if len(set(len(sequence) for _id, sequence in records)) > 1:
        short_help("Input Error: sequences in the alignment differ in length.")

    try:
        out = open(output_file, "w")
    except IOError:
        short_help("Output Error: No output file specified or incorect path.")

    print("%s %s" % (__program_name__, __version__))
    print(__copyright__)
    print("")

    with out:
        for i, (id1, seq1) in enumerate(records):
            # records[:i] pairs each sequence with every earlier one,
            # the first record included, so each unordered pair is
            # scored exactly once.
            for id2, seq2 in records[:i]:
                if score_sequences(seq1, seq2, mismatch, indel) <= cutoff:
                    out.write(id1 + " " + id2 + "\n")
    print("Done")