-
Notifications
You must be signed in to change notification settings - Fork 1
/
get_uniq_flu.py
109 lines (93 loc) · 3.97 KB
/
get_uniq_flu.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
"""What ELM sequences are present on all mammal/bird flus,
but missing from all of the opposite class?"""
import os, utils
from collections import defaultdict
def get_protein_counts(dir, hosts, strains, years):
"""Count seqs you have per protein"""
counts = {}
for host in hosts:
for strain in strains:
for year in years:
key = '.'.join((host, strain, str(year)))
f = os.path.join(dir, key + '.fa')
if os.path.exists(f):
counts[key] = defaultdict(dict)
for ID, seq in utils.fasta_iter(f):
protein = ID.split('.')[-1]
counts[key][protein][ID] = True
return counts
def get_cons_elms(dir, protein_counts, limit):
"""Get all ELM conservation for proteins w/ over LIMIT seqs"""
cons = {}
for key in protein_counts:
for protein in protein_counts[key]:
if len(protein_counts[key][protein]) > limit:
cons[key] = defaultdict(dict)
f = os.path.join(dir, key + '.elms.conservation')
with open(f) as elms:
for line in elms:
protein, elm, cons_st = line.strip().split('\t')
cons[key][protein][elm] = float(cons_st)
return cons
def get_all_cons(cons, protein_counts, seq_limit, cons_limit):
"""Get protein/elm pairs that are annotated or all flu strains"""
protein_elm_count = defaultdict(utils.init_zero)
protein_count = defaultdict(utils.init_zero)
for key in protein_counts:
for protein in protein_counts[key]:
if len(protein_counts[key][protein]) > seq_limit:
protein_count[protein] += 1
if protein in cons[key]:
for elm in cons[key][protein]:
if cons[key][protein][elm] > cons_limit:
protein_elm_count[protein + '=' + elm] += 1
protein_elm_pass = defaultdict(dict)
for protein_elm in protein_elm_count:
protein, elm = protein_elm.split('=')
if protein_elm_count[protein_elm] == protein_count[protein]:
protein_elm_pass[protein][elm] = True
return protein_elm_pass
def get_uniq(all_cons, cons, low_cons_cut):
"""What protein/ELM pairs are in all_cons, but are below
low_cons_cut for the other flu strains in cons?
Ex. mammal is all_cons, and bird is cons"""
uniq = defaultdict(dict)
for protein in all_cons:
for elm in all_cons[protein]:
found = False
for key in cons:
if protein in cons[key]:
if elm in cons[key][protein]:
if cons[key][protein][elm] > low_cons_cut:
found = True
break
if not found:
uniq[protein][elm] = True
return uniq
def count_it(uniq):
count = 0
for protein in uniq:
for elm in uniq[protein]:
count += 1
return count
dir = 'working/Jul12'
years = range(2000,2011,1)
mammal_hosts = ('human','swine','horse')
mammal_strains = ('H5N1','H1N1')
bird_hosts = ('chicken','duck')
bird_strains = ('H5N1','H9N2')
limit = 50
cons_cut = float(90)
low_cons_cut = float(85)
mammal_protein_counts = get_protein_counts(dir, mammal_hosts,
mammal_strains, years)
bird_protein_counts = get_protein_counts(dir, bird_hosts,
bird_strains, years)
mammal_cons = get_cons_elms(dir, mammal_protein_counts, limit)
bird_cons = get_cons_elms(dir, bird_protein_counts, limit)
mammal_all_cons = get_all_cons(mammal_cons, mammal_protein_counts, limit, cons_cut)
bird_all_cons = get_all_cons(bird_cons, bird_protein_counts, limit, cons_cut)
mammal_uniq = get_uniq(mammal_all_cons, bird_cons, low_cons_cut)
bird_uniq = get_uniq(bird_all_cons, mammal_cons, low_cons_cut)
print 'MAMMAL', count_it(mammal_uniq)
print 'BIRD', count_it(bird_uniq)