-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathspecification_controller.py
202 lines (179 loc) · 9.15 KB
/
specification_controller.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
import logging
import random
import numpy
import upload_controller
from vector_retrieval import retrieve_vector_multiple
from vector_retrieval import retrieve_uploaded_vector_multiple
from data_controller import fasttext_vocab as ft_vocab
from data_controller import fasttext_vectors as ft_vecs
from data_controller import glove_vocab as gv_vocab
from data_controller import glove_vectors as gv_vecs
from data_controller import cbow_vocab as cb_vocab
from data_controller import cbow_vectors as cb_vecs
from data_controller import simlex_vocab as simlex
from data_controller import wordsim_vocab as wordsim
def format_sets(t1, t2, a1, a2):
    """Balance the two target sets and the two attribute sets pairwise.

    Each pair is handed to ``format_set_sizes``; the keys removed from both
    pairs are concatenated (targets first, then attributes).

    Returns:
        ``(t1, t2, a1, a2, deleted_keys)`` as produced by the two
        ``format_set_sizes`` calls.
    """
    balanced_t1, balanced_t2, dropped_targets = format_set_sizes(t1, t2)
    balanced_a1, balanced_a2, dropped_attributes = format_set_sizes(a1, a2)
    return (
        balanced_t1,
        balanced_t2,
        balanced_a1,
        balanced_a2,
        dropped_targets + dropped_attributes,
    )
def format_set_sizes(vector_set1, vector_set2):
    """Trim the larger of two dictionaries so both end up the same size.

    Randomly chosen keys are deleted *in place* from whichever dictionary is
    larger until the lengths match. Nothing is removed when the lengths
    already match or when either dictionary is empty.

    Args:
        vector_set1: First dictionary; mutated if it is the larger one.
        vector_set2: Second dictionary; mutated if it is the larger one.

    Returns:
        ``(vector_set1, vector_set2, deleted_keys)`` - the same two objects
        that were passed in, plus the list of keys that were removed.
    """
    size1, size2 = len(vector_set1), len(vector_set2)

    # An empty set is never balanced against: doing so would empty the other one.
    if size1 == 0 or size2 == 0 or size1 == size2:
        return vector_set1, vector_set2, []

    if size1 > size2:
        larger = vector_set1
        message = "SpecController: Removed keys from dictionary 1: %s"
    else:
        larger = vector_set2
        message = "SpecController: Removed keys from dictionary 2: %s"

    # Draw all surplus keys in one pass instead of rebuilding the key list
    # for every single removal.
    deleted_keys = random.sample(list(larger), abs(size1 - size2))
    for key in deleted_keys:
        del larger[key]
        logging.info(message, key)

    return vector_set1, vector_set2, deleted_keys
def get_vectors_for_spec(space, lower, uploaded, t1, t2, a1, a2, aug1=None, aug2=None):
    """Look up vectors for the word lists of a specification.

    Args:
        space: Name of the embedding space ('fasttext', 'glove' or 'cbow'),
            or the identifier passed on to the uploaded-vector lookup.
        lower: The string 'true' lower-cases every word before the lookup.
        uploaded: The string 'true' additionally queries the uploaded space;
            that lookup runs last and its results replace any earlier ones.
        t1, t2: Target word lists.
        a1, a2: Attribute word lists.
        aug1, aug2: Optional augmentation word lists; only used when both
            are given.

    Returns:
        Without augmentations: ``(t1, t2, a1, a2, not_found, deleted_keys)``.
        With both augmentations:
        ``(t1, t2, a1, a2, aug1, aug2, not_found, deleted_keys)``.
        Target and attribute pairs are size-balanced via ``format_set_sizes``.
    """
    use_augments = aug1 is not None and aug2 is not None

    if lower == 'true':
        t1 = [word.lower() for word in t1]
        t2 = [word.lower() for word in t2]
        a1 = [word.lower() for word in a1]
        a2 = [word.lower() for word in a2]
        if use_augments:
            aug1 = [word.lower() for word in aug1]
            aug2 = [word.lower() for word in aug2]

    # Gather every lookup that applies. They are executed in this order and
    # each one overwrites the results of the one before it.
    lookups = []
    if space == 'fasttext':
        lookups.append(lambda words: retrieve_vector_multiple(ft_vocab, ft_vecs, words))
    if space == 'glove':
        lookups.append(lambda words: retrieve_vector_multiple(gv_vocab, gv_vecs, words))
    if space == 'cbow':
        lookups.append(lambda words: retrieve_vector_multiple(cb_vocab, cb_vecs, words))
    if uploaded == 'true':
        lookups.append(lambda words: retrieve_uploaded_vector_multiple(space, words))

    t1_hits, t2_hits, a1_hits, a2_hits = {}, {}, {}, {}
    aug1_hits, aug2_hits = {}, {}
    missing = []
    for lookup in lookups:
        t1_hits, t1_missing = lookup(t1)
        t2_hits, t2_missing = lookup(t2)
        a1_hits, a1_missing = lookup(a1)
        a2_hits, a2_missing = lookup(a2)
        missing = t1_missing + t2_missing + a1_missing + a2_missing
        if use_augments:
            aug1_hits, aug1_missing = lookup(aug1)
            aug2_hits, aug2_missing = lookup(aug2)
            missing += aug1_missing + aug2_missing

    t1, t2, removed_targets = format_set_sizes(t1_hits, t2_hits)
    a1, a2, removed_attributes = format_set_sizes(a1_hits, a2_hits)
    removed = removed_targets + removed_attributes

    if not use_augments:
        return t1, t2, a1, a2, missing, removed

    # Augmentation sets are handed back exactly as found; they are not
    # size-balanced here.
    logging.info("SpecController: Returning found vectors")
    logging.info("SpecController: NotFound: " + str(missing) + " ; DeletedKeys: " + str(removed))
    return t1, t2, a1, a2, aug1_hits, aug2_hits, missing, removed
def string_dicts_to_numpy_array_dicts(t1, t2, a1, a2):
    """Convert four word->vector dictionaries to float NumPy arrays and balance them.

    Every value is wrapped in ``numpy.array`` and cast to float, then the
    target pair and the attribute pair are size-balanced via ``format_sets``.

    Args:
        t1, t2: Target dictionaries mapping a key to a vector-like value.
        a1, a2: Attribute dictionaries mapping a key to a vector-like value.

    Returns:
        ``(target1, target2, attribute1, attribute2, deleted)`` as returned
        by ``format_sets``.
    """
    def to_float_arrays(vectors):
        # Builtin ``float`` replaces the ``numpy.float`` alias, which was
        # removed in NumPy 1.24; both denote the same 64-bit float dtype.
        return {key: numpy.array(vectors[key]).astype(float) for key in vectors}

    target1 = to_float_arrays(t1)
    target2 = to_float_arrays(t2)
    attribute1 = to_float_arrays(a1)
    attribute2 = to_float_arrays(a2)
    target1, target2, attribute1, attribute2, deleted = format_sets(target1, target2, attribute1, attribute2)
    return target1, target2, attribute1, attribute2, deleted
def string_lex_to_numpy_array_dicts(data):
    """Convert every value of a dictionary into a float NumPy array.

    Args:
        data: Dictionary mapping a key to a vector-like value (numbers or
            numeric strings).

    Returns:
        A new dictionary with the same keys whose values are NumPy arrays
        cast to float.
    """
    # Builtin ``float`` replaces the ``numpy.float`` alias, which was removed
    # in NumPy 1.24; both denote the same 64-bit float dtype.
    return {key: numpy.array(data[key]).astype(float) for key in data}
def get_vectors_for_augments(space, lower, uploaded, aug1_list, aug2_list):
    """Look up vectors for two augmentation word lists and balance the results.

    Args:
        space: Name of the embedding space ('fasttext', 'glove' or 'cbow'),
            or the identifier passed on to the uploaded-vector lookup.
        lower: The string 'true' lower-cases every word before the lookup.
        uploaded: The string 'true' additionally queries the uploaded space;
            that lookup runs last and its results replace any earlier ones.
        aug1_list, aug2_list: Word lists to look up.

    Returns:
        ``(aug1, aug2, not_found, deleted)`` where the two found-vector
        dictionaries have been size-balanced via ``format_set_sizes``.
    """
    if lower == 'true':
        aug1_list = [word.lower() for word in aug1_list]
        aug2_list = [word.lower() for word in aug2_list]

    # Gather every lookup that applies; a later one overwrites an earlier one.
    lookups = []
    if space == 'fasttext':
        lookups.append(lambda words: retrieve_vector_multiple(ft_vocab, ft_vecs, words))
    if space == 'glove':
        lookups.append(lambda words: retrieve_vector_multiple(gv_vocab, gv_vecs, words))
    if space == 'cbow':
        lookups.append(lambda words: retrieve_vector_multiple(cb_vocab, cb_vecs, words))
    if uploaded == 'true':
        lookups.append(lambda words: retrieve_uploaded_vector_multiple(space, words))

    hits1, hits2 = {}, {}
    missing1, missing2 = [], []
    for lookup in lookups:
        hits1, missing1 = lookup(aug1_list)
        hits2, missing2 = lookup(aug2_list)

    missing = missing1 + missing2
    balanced1, balanced2, removed = format_set_sizes(hits1, hits2)
    return balanced1, balanced2, missing, removed
def get_lex_dict(space, uploaded, lex):
    """Return the found vectors for the 'simlex' or 'wordsim' word list.

    Args:
        space: Name of the embedding space ('fasttext', 'glove' or 'cbow'),
            or the identifier passed on to the uploaded-vector lookup.
        uploaded: The string 'true' additionally queries the uploaded space;
            that lookup runs last and its result replaces any earlier one.
        lex: Either "simlex" or "wordsim"; any other value yields ``{}``.

    Returns:
        The found-vectors value from the last lookup that ran, or an empty
        dictionary when no lookup applied.
    """
    if lex == "simlex":
        words = simlex
    elif lex == "wordsim":
        words = wordsim
    else:
        return {}

    vectors = {}
    # The not-found part of each lookup result is not needed here.
    if space == 'fasttext':
        vectors, _ = retrieve_vector_multiple(ft_vocab, ft_vecs, words)
    if space == 'glove':
        vectors, _ = retrieve_vector_multiple(gv_vocab, gv_vecs, words)
    if space == 'cbow':
        vectors, _ = retrieve_vector_multiple(cb_vocab, cb_vecs, words)
    if uploaded == 'true':
        vectors, _ = retrieve_uploaded_vector_multiple(space, words)
    return vectors
def return_vocab_vecs(space, uploaded):
    """Return the vocabulary and vectors for the requested embedding space.

    The built-in spaces ('fasttext', 'glove', 'cbow') are checked first. If
    none matches and ``uploaded`` is the string 'true', the result of
    ``upload_controller.get_vocab_vecs_from_upload()`` is returned. Otherwise
    an empty dictionary and an empty list are returned.
    """
    if space == 'fasttext':
        return ft_vocab, ft_vecs
    if space == 'glove':
        return gv_vocab, gv_vecs
    if space == 'cbow':
        return cb_vocab, cb_vecs
    if uploaded == 'true':
        return upload_controller.get_vocab_vecs_from_upload()
    # Nothing matched: hand back empty placeholders.
    return {}, []