-
Notifications
You must be signed in to change notification settings - Fork 1
/
js_elmSeqDist_host_flu_fuzzyELMmatches.py
171 lines (148 loc) · 6.55 KB
/
js_elmSeqDist_host_flu_fuzzyELMmatches.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
"""Use Jensen-Shannon divergence to find distances between chicken/human
host/flu. Since host/flu share few ELM sequences, only consider
flu ELMs & find host ELMs that are closest in edit distance."""
import itertools, sys, os, utils, random, global_settings, numpy, math
import Bio.Cluster
from collections import defaultdict
def print_results(elm, clusters, overlap):
"""Print out clustering of ELM sequences.
Ignore sequences in overlap that cannot
be assigned."""
for cluster in clusters:
for seq in clusters[cluster]:
if seq not in overlap:
print('%s\t%d\t%s' %
(elm, cluster, seq))
# this comes from my scratch experiments
human_distance_file = '../../scratch/human_flu_distances'
chicken_distance_file = '../../scratch/chicken_flu_distances'
both_distance_file = 'working/runs/Jun24/closest_dis'
distance_file = both_distance_file
# map each flu elmSeq to a host elmSeq
host_flu_elmSeq_mapping = {}
flu_host_elmSeq_mapping = {}
host_flu_elmSeq_best_dis = {}
flu_host_elmSeq_best_dis = {}
mapped = {}
with open(distance_file) as f:
for line in f:
(elm, flu_seq, host_seq, distance) = line.strip().split('\t')
dis = int(distance)
host_elmSeq = elm + ':' + host_seq
flu_elmSeq = elm + ':' + flu_seq
if dis < 5:
if elm not in flu_host_elmSeq_best_dis:
flu_host_elmSeq_best_dis[elm] = {}
flu_host_elmSeq_mapping[elm] = {}
if flu_elmSeq not in flu_host_elmSeq_best_dis[elm]:
flu_host_elmSeq_best_dis[elm][flu_elmSeq] = {}
flu_host_elmSeq_mapping[elm][flu_elmSeq] = {}
if host_elmSeq not in flu_host_elmSeq_best_dis[elm][flu_elmSeq]:
flu_host_elmSeq_best_dis[elm][flu_elmSeq][host_elmSeq] = dis
flu_host_elmSeq_mapping[elm][flu_elmSeq][host_elmSeq] = True
# if dis < flu_host_elmSeq_best_dis[elm][flu_elmSeq][host_elmSeq]:
# # replace
# flu_host_elmSeq_best_dis[elm][flu_elmSeq][host_elmSeq] = dis
# flu_host_elmSeq_mapping[elm][flu_elmSeq] = {}
# flu_host_elmSeq_mapping[elm][flu_elmSeq][host_elmSeq] = True
# elif dis == flu_host_elmSeq_best_dis[elm][flu_elmSeq][host_elmSeq]:
# append
flu_host_elmSeq_mapping[elm][flu_elmSeq][host_elmSeq] = True
for elm in flu_host_elmSeq_mapping:
dis_mat = []
for seq1,seq2 in itertools.combinations(flu_host_elmSeq_mapping[elm], 2):
match = float(len(set(seq1) & set(seq2)))
dis = numpy.average([match/float(x) for x in [len(seq1), len(seq2)]])
dis_mat.append(dis)
mat = numpy.array(dis_mat)
num_clusters = 10
percent_overlap = float(1)
while num_clusters > 1 and percent_overlap > float(.1):
if len(flu_host_elmSeq_mapping[elm]) > num_clusters:
ans, error, nfound = Bio.Cluster.kmedoids(mat, nclusters=num_clusters,
npass=10)
clusters = defaultdict(dict)
total = {}
for flu_seq, cluster_id in zip(flu_host_elmSeq_mapping[elm],
ans):
clusters[cluster_id][flu_seq] = True
total[flu_seq] = True
for host_seq in flu_host_elmSeq_mapping[elm][flu_seq]:
clusters[cluster_id][host_seq] = True
total[host_seq] = True
overlap = {}
for cluster1, cluster2 in itertools.combinations(clusters, 2):
for gene in set(clusters[cluster1]) & set(clusters[cluster2]):
overlap[gene] = True
percent_overlap = float(len(overlap))/float(len(total))
if percent_overlap < float(.1):
print_results(elm, clusters, overlap)
break
else:
num_clusters -= 1
else:
break
sys.exit(0)
for flu_elmSeq in flu_host_elmSeq_mapping:
for host_elmSeq in flu_host_elmSeq_mapping:
if not host_elmSeq in host_flu_elmSeq_mapping:
host_flu_elmSeq_mapping[host_elmSeq] = {}
host_flu_elmSeq_mapping[host_elmSeq][flu_elmSeq] = True
mapped[flu_elmSeq] = True
def count_0s(ls):
count = 0
for item in ls:
if not item:
count += 1
return count
hosts = ('H_sapiens', 'Gallus_gallus')
flus = ('human', 'chicken')
# count elm:seq occurence
flu_counts = {}
pre_flu_counts = {}
host_counts = {}
all_elmSeqs = {}
for flu in flus:
pre_flu_counts[flu] = utils.get_flu_counts('results/' + flu + '.H5N1.elms',
global_settings.FLU_PROTEINS)
# sample protein sequenes from chicken
new_chicken_counts = {}
new_chicken_proteins = {}
for protein in proteins:
c = len(pre_flu_counts['human'][protein].keys())
new_chicken_proteins[protein] = random.sample(pre_flu_counts['chicken'][protein], c)
for protein in new_chicken_proteins:
new_chicken_counts[protein] = {}
for seq in new_chicken_proteins[protein]:
new_chicken_counts[protein][seq] = pre_flu_counts['chicken'][protein][seq]
flu_counts['human'] = utils.count_flu(pre_flu_counts['human'], all_elmSeqs)
flu_counts['chicken'] = utils.count_flu(new_chicken_counts, all_elmSeqs)
for host in hosts:
host_counts[host] = defaultdict(utils.init_zero)
with open('working/runs/Jun24/elmdict_' + host + '.init') as f:
for line in f:
(elm, seq, count, fq) = line.strip().split('\t')
elmSeq = elm + ':' + seq
if elmSeq in host_flu_elmSeq_mapping:
for a_flu_elmSeq in host_flu_elmSeq_mapping[elmSeq]:
host_counts[host][a_flu_elmSeq] += int(count)
# else:
# host_counts[host][elmSeq] += int(count)
# all_elmSeqs[elmSeq] = True
flu_vecs = utils.mk_count_vecs(flu_counts, mapped)
flu_dists = utils.mk_count_dists(flu_vecs)
host_vecs = utils.mk_count_vecs(host_counts, mapped)
host_dists = utils.mk_count_dists(host_vecs)
js_distances = defaultdict(dict)
for host in hosts:
for flu in flus:
js_dis = utils.jensen_shannon_dists(host_dists[host],
flu_dists[flu])
js_distances[host][flu] = js_dis
print host, flu, js_dis
def print_it(name, vec):
print name, float(count_0s(vec))/float(len(vec))
print_it('chicken_flu', flu_vecs['chicken'])
print_it('human flu', flu_vecs['human'])
print_it('H sapiens', host_vecs['H_sapiens'])
print_it('Gallus gallus', host_vecs['Gallus_gallus'])