-
Notifications
You must be signed in to change notification settings - Fork 1
/
hi_lo_2.py
156 lines (134 loc) · 5.53 KB
/
hi_lo_2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
"""Do sequences uniq to either flu
show a preference for hi/low
distributions"""
from collections import defaultdict
import utils, os, sys
use_host = sys.argv[1]
def get_annotations(afile):
d = defaultdict(dict)
with open(afile) as f:
for line in f:
protein, elm = line.strip().split('\t')
d[protein][elm] = True
return d
def get_important(ls1, ls2):
"""Get annotations important on flu"""
elms = {}
for ls in (ls1, ls2):
for protein in ls:
for elm in ls[protein]:
elms[protein + ':' + elm] = True
return elms
def get_uniq(uniq, ls1, ls2):
for protein in ls1:
for elm in ls1[protein]:
if elm not in ls2[protein]:
uniq[protein + ':' + elm] = True
def get_freqs(afile, seq_percents):
with open(afile) as f:
for line in f:
protein, elmseq, freq = line.strip().split('\t')
key = ':'.join([protein, elmseq])
seq_percents[key].append(float(freq))
def get_host_freqs(afile):
freqs = {}
with open(afile) as f:
for line in f:
elm, seq, count, freq = line.strip().split('\t')
freqs[elm + ':' + seq] = float(freq)
return freqs
dir = 'working/Jul7'
years = range(2000,2011,1)
# bird
birds = ('chicken','duck')
strains = ('H9N2', 'H5N1')
seq_percents_bird = defaultdict(list)
total_bird = 0
for bird in birds:
for strain in strains:
for year in years:
file = os.path.join(dir, '.'.join((bird, strain, str(year),
'elms.conservation')))
if os.path.exists(file):
total_bird += 1
get_freqs(file, seq_percents_bird)
# human
hosts = ('human',)
strains = ('H5N1', 'H1N1', 'H3N2', 'H3N8')
seq_percents_human = defaultdict(list)
total_human = 0
for host in hosts:
for strain in strains:
for year in years:
file = os.path.join(dir, '.'.join((host, strain, str(year),
'elms.conservation')))
if os.path.exists(file):
total_human += 1
get_freqs(file, seq_percents_human)
human_elmseqs_file = 'working/Jul7/mammal_elms'
bird_elmseqs_file = 'working/Jul7/bird_elms'
human_elmseqs = get_annotations(human_elmseqs_file)
bird_elmseqs = get_annotations(bird_elmseqs_file)
uniq_human = {}
uniq_bird = {}
get_uniq(uniq_human, human_elmseqs, bird_elmseqs)
get_uniq(uniq_bird, bird_elmseqs, human_elmseqs)
impt_elms = get_important(human_elmseqs, bird_elmseqs)
# uniq = {}
if use_host == 'mammal':
#get_uniq(uniq, human_elmseqs, bird_elmseqs)
sp = seq_percents_human
elif use_host == 'bird':
#get_uniq(uniq, bird_elmseqs, human_elmseqs)
sp = seq_percents_bird
# for elmseq in seq_percents_bird:
# if elmseq in uniq:
# print elmseq + '\t' + str(sum(seq_percents_bird[elmseq])/float(total_bird)) + '\t' + str(sum(seq_percents_human[elmseq])/float(total_human))
human_host_file = 'working/Jul7/elmdict_H_sapiens.RWinit'
chicken_host_file = 'working/Jul7/elmdict_Gallus_gallus.RWinit'
human_host_freqs = get_host_freqs(human_host_file)
chicken_host_freqs = get_host_freqs(chicken_host_file)
if sys.argv[1] == 'bird':
with open('working/Jul7/hi_bird', 'w') as hi:
for elmseq in uniq_bird:
#'X' not in elmseq and 'J' not in elmseq and 'B' not in elmseq and
if sum(seq_percents_bird[elmseq])/float(total_bird) - sum(seq_percents_human[elmseq])/float(total_human) > float(20):
dels = elmseq.split(':')
#new_seq = utils.mk_sub(dels[2])
key = dels[1] + ':' + dels[2]
#frac = 0
if key in chicken_host_freqs:
frac = chicken_host_freqs[key]
hi.write(str(frac) + '\n')
with open('working/Jul7/low_bird', 'w') as low:
for elmseq in uniq_human:
# 'X' not in elmseq and 'J' not in elmseq and 'B' not in elmseq and
if sum(seq_percents_human[elmseq])/float(total_human) - sum(seq_percents_bird[elmseq])/float(total_bird) > float(20):
dels = elmseq.split(':')
#new_seq = utils.mk_sub(dels[2])
key = dels[1] + ':' + dels[2]
if key in chicken_host_freqs:
frac = chicken_host_freqs[key]
low.write(str(frac) + '\n')
elif sys.argv[1] == 'human':
with open('working/Jul7/hi_human', 'w') as hi:
for elmseq in uniq_human:
if sum(seq_percents_human[elmseq])/float(total_human) - sum(seq_percents_bird[elmseq])/float(total_bird) > float(20):
dels = elmseq.split(':')
#new_seq = utils.mk_sub(dels[2])
key = dels[1] + ':' + dels[2]
#frac = 0
if key in human_host_freqs:
frac = human_host_freqs[key]
cfrac = human_host_freqs[key]
hi.write(str(frac) + '\t' + str(cfrac) + '\n')
with open('working/Jul7/low_human', 'w') as low:
for elmseq in uniq_bird:
if sum(seq_percents_bird[elmseq])/float(total_bird) - sum(seq_percents_human[elmseq])/float(total_human) > float(20):
dels = elmseq.split(':')
#new_seq = utils.mk_sub(dels[2])
key = dels[1] + ':' + dels[2]
if key in human_host_freqs:
frac = human_host_freqs[key]
cfrac = human_host_freqs[key]
low.write(str(frac) + '\t' + str(cfrac) + '\n')