align.py
from __future__ import division # Corporate math still sucks
from util import dct
from util.fnc import cur,compose,pipe,negate,iseq,named
from util.reflect import postmortem
from util.lst import concat, avg, fst, snd, car, cdr
from itertools import imap
import lev
from unifeat import unify
from operator import sub, or_, and_
fs = ['../phonology/dialect/utp02datanew.txt',
'../phonology/dialect/see26datanew.txt',
'../phonology/dialect/sgb20datanew.txt',
'../phonology/dialect/sgj20datanew.txt',
'../phonology/dialect/sgl20datanew.txt',
'../phonology/dialect/sif20datanew.txt',
'../phonology/dialect/siw20datanew.txt',
'../phonology/dialect/siz20datanew.txt',
'../phonology/dialect/siy20datanew.txt',
'../phonology/dialect/smd20datanew.txt',]
def read_unicode(f):
"filename->[[utf-8-char]]"
return map(lambda u: map(lambda s:s.encode('utf8'), u),
file(f).read().decode('utf16').split(u'\n'))
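# Shape sketch (hypothetical input): for a UTF-16 file whose two lines are
# u'ab' and u'c', read_unicode would return [['a', 'b'], ['c']] -- one inner
# list per line, one UTF-8-encoded byte string per character.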
def self_sub(change):
"lev.Rule -> bool -- Is this a boring self-substitution?"
return change.type==lev.SUB and change.src==change.dst
class Hash():
"Box with proxied __eq__ and __hash__ to allow custom hashing (dict & set)"
def __init__(self, eq, hash, x):
lev.init_attrs(self, locals())
def __str__(self):
return 'Hash(%s, eq=%s, hash=%s)' % (self.x, self.eq, self.hash)
def __repr__(self):
return 'Hash(eq=%r, hash=%r, x=%r)' % (self.eq, self.hash, self.x)
def __hash__(self):
return self.hash(self.x)
def __eq__(self, other):
return self.eq(self.x, other.x)
def get(self):
return self.x
def cmpset(l, eq, hash):
return set(hx.get() for hx in set(Hash(eq, hash, x) for x in l))
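# Minimal usage sketch of cmpset with custom comparators (toy data, not from
# the dialect corpus): deduplicate strings case-insensitively.
#   cmpset(['A', 'a', 'b'],
#          eq=lambda x, y: x.lower() == y.lower(),
#          hash=lambda x: hash(x.lower()))
#   => a two-element set; which of 'A'/'a' survives is an implementation detail.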
def collapse_envs(rules):
"[lev.Rule] -> set<lev.Rule>"
return cmpset(rules, lev.Rule.eq_env, lev.Rule.hash_env)
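# i.e. keep one representative rule per environment-equivalence class, as
# defined by lev.Rule.eq_env / lev.Rule.hash_env. (Currently unused in this
# module: classify builds a plain set instead -- see the comment below.)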
def classify(row):
"[[lev.Rule]] -> {utf-8-char:set<lev.Rule>}"
return dct.map(set, #collapse_envs,
dct.collapse(filter(negate(self_sub), concat(row)),
keymap=lambda rule:rule.src))
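# Shape sketch (hypothetical rules): classify(compare(a, b)) might look like
#   {'t': set([<Rule t->d /...>]), 'k': set([<Rule k->g /...>, ...]), ...}
# i.e. every rule that isn't a trivial self-substitution, keyed by rule.src.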
def compare(l1, l2):
"str*str -> [[lev.Rule]]"
lang1 = read_unicode(l1)
lang2 = read_unicode(l2)
dist = lev.totalavgdistance(map(unify, lang1), map(unify, lang2))
return map(lambda s1,s2:(lev.enviro(s2,s1,dist) if s2 else []),
lang1, lang2)
def run_compare_to_base(fs):
"[str] -> [{utf-8-char:set<lev.Rule>}]"
return map(pipe(cur(compare, fs[0]), classify), fs)
def run_compare_all_to_sgbsiy(fs):
"""[str] -> {utf-8-char:set<lev.Rule>}
(siy<=>sgb) - (map (base<=>) rest)"""
sgb = fs[2]
siy = fs[8]
base = fs[0]
del fs[8]; del fs[2]; del fs[0] # dangerous but who cares
diff = classify(compare(sgb, siy))
others = map(compose(classify, cur(compare, base)), fs)
# return dct_mapall(lambda v,*rest: reduce(sub, rest, v), diff, *others)
kws = {'default':set()}
return dct.zipwith((lambda v,*rest: reduce(sub, rest, v)), diff, *others, **kws)
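# The zipwith combiner is a left fold of set difference over the other
# dialects' rule sets, e.g.
#   reduce(sub, [set([2]), set([3])], set([1, 2, 3])) == set([1])
# so a rule attested in any base<=>other comparison is dropped for that char.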
def run_compare_sgbsiy_to_base(fs):
"""[str] -> {utf-8-char:set<lev.Rule>}
((sgb <=> base) | (siy <=> base)) - (map (<=> base) rest)"""
sgb = fs[2]
siy = fs[8]
base = fs[0]
del fs[8]; del fs[2]; del fs[0] # dangerous but who cares
outsiders = dct.zipwith(or_,
classify(compare(base, sgb)),
classify(compare(base, siy)),
default=set())
others = map(compose(classify, cur(compare, base)), fs)
kws = {'default':set()}
return dct.zipwith((lambda v,*rest: reduce(sub, rest, v)), outsiders, *others, **kws)
def run_compare_sgb_and_siy_to_base(fs):
"""[str] -> {utf-8-char:set<lev.Rule>}
((sgb <=> base) & (siy <=> base)) - (map (<=> base) rest)"""
sgb = fs[2]
siy = fs[8]
base = fs[0]
del fs[8]; del fs[2]; del fs[0] # dangerous but who cares
outsiders = dct.zipwith(and_,
classify(compare(base, sgb)),
classify(compare(base, siy)),
default=set())
others = map(compose(classify, cur(compare, base)), fs)
kws = {'default':set()}
return dct.zipwith((lambda v,*rest: reduce(sub, rest, v)), outsiders, *others, **kws)
def run_compare_shared_sgbsiy(fs):
"""this really needs a lenient definition of eq?
(sgb <=> base) & (siy <=> base)"""
sgb = fs[2]
siy = fs[8]
base = fs[0]
del fs[8]; del fs[2]; del fs[0] # dangerous but who cares
return dct.zipwith(and_,
classify(compare(base, sgb)),
classify(compare(base, siy)),
default=set())
getsrc = named('src', lambda rule: rule.src)
getdst = named('dst', lambda rule: rule.dst)
getpair = named('rule', lambda rule: (rule.dst, rule.src))
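# Note: getpair yields (dst, src); to_html_group_differences prints sub[1]
# then sub[0], so the table still reads "src -> dst".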
def run_collapse_differences(fs, get=getdst):
base = fs[0]
del fs[0]
subs = [[get(rule) for rule in concat(compare(base,f))
if rule.type==lev.SUB and rule.dst!=rule.src]
for f in fs]
return dct.zip(dct.count(concat(subs)), default=0, *map(dct.count, subs))
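# Assuming dct.zip pairs each key with (total, count_f1, count_f2, ...), the
# result maps each extracted value (segment or pair, depending on `get`) to
# its overall count followed by one count per remaining dialect, defaulting
# to 0 where a dialect never shows that substitution.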
def lst_except(l, *ns):
"""Totally inefficient! You have been warned, dude!
(requiring ns to be ordered could help a lot if I actually cared)"""
acc = []
for i,x in enumerate(l):
if i not in ns:
acc.append(x)
return acc
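# e.g. lst_except([10, 11, 12, 13], 0, 2) == [11, 13] -- drop by position.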
def find_collapsed(f, collapsed):
"{char:[int]} -> [(char,int)] (sorted)"
return sorted(dct.map(f, collapsed).items(), key=snd, reverse=True)
diff = lambda freqs:avg([freqs[2],freqs[8]]) - avg(lst_except(freqs,0,2,8))
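# With the count tuples laid out as (total, then one count per dialect with
# base removed), index 2 lands on sgb20 and index 8 on siy20, so diff scores
# how much more often sgb/siy show a change than the remaining dialects do.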
def variance(freqs):
average = avg(cdr(freqs))
return sum((average - c)**2 for c in cdr(freqs)) / average
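# Note: this skips the leading total (cdr) and returns the squared deviations
# summed and scaled by the mean -- a dispersion index rather than the textbook
# variance, which would divide by the number of counts instead.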
find_difference = cur(find_collapsed, diff)
find_variance = cur(find_collapsed, variance)
def to_html_group_differences(f, name, differences):
print >>f, "<h1>%s</h1>" % name
print >>f, "<table border=1 cellspacing=0 bordercolor='black'><tr><td></td><th>Char</th><th>Variance</th>",
for i, (sub,variance) in enumerate(differences):
if isinstance(sub, tuple):
s = "<tr><td>%s</td><td>%s → %s</td><td>%s</td></tr>"
row = i, sub[1], sub[0], variance
else:
s = "<tr><td>%s</td><td>%s</td><td>%s</td></tr>"
row = i, sub, variance
print >>f, s % row
print >>f, "</table>"
def to_html_variances(f, name, variances):
print >>f, "<h1>%s</h1>" % name
print >>f, "<table border=1 cellspacing=0 bordercolor='black'><tr><th>Char</th><th>Variance</th>",
for pair in variances:
print >>f, "<tr><td>%s</td><td>%s</td></tr>" % pair
print >>f, "</table>"
def to_html_differences(f, name, combined):
"file*str*{char:[int]} "
print >>f, '''<h1>%s</h1>''' % name
print >>f, "<table border=1 cellspacing=0 bordercolor='black'><tr><th>Char</th><th>All</th>",
print >>f, ''.join('<th>%s</th>' % path[21:24] for path in fs[1:]), "<th>Avg</th></tr>"
for char,counts in combined.items():
print >>f, "<tr><td>%s</td>" % char,
print >>f, ''.join("<td>%s</td>" % c for c in counts),
print >>f, "<td>%.2f</td></tr>" % avg(counts[1:])
print >>f, "</table>"
def to_html(f,name,row):
print >>f, '''<h1>%s</h1>''' % name
for char,changes in row.items():
print >>f, '<h2>%s</h2><p>' % char
for change in changes:
print >>f, '%s<br/>' % change.to_html()
print >>f, '</p>'
if __name__=="__main__":
setup = ((run_compare_sgb_and_siy_to_base,
'rule/smartenv',
'((sgb <=> base) & (siy <=> base)) - (map (<=> base) rest), eq?-rule/smartenv',
'sgb_and_siy_to_base'),
(run_compare_sgb_and_siy_to_base,
'rule',
'((sgb <=> base) & (siy <=> base)) - (map (<=> base) rest), eq?-rule',
'sgb_and_siy_to_base-simple'),
(run_compare_sgbsiy_to_base,
'rule',
'((sgb <=> base) | (siy <=> base)) - (map (<=> base) rest), eq?-rule',
'sgbsiy_to_base-simple'),
(run_compare_shared_sgbsiy,
'all',
'(sgb <=> base) & (siy <=> base), eq?-all',
'shared_sgbsiy-full'),
(run_compare_shared_sgbsiy,
'rule',
'(sgb <=> base) & (siy <=> base), eq?-rule',
'shared_sgbsiy-simple'),
(run_compare_shared_sgbsiy,
'rule/smartenv',
'(sgb <=> base) & (siy <=> base), eq?-rule/smartenv',
'shared_sgbsiy')
)
setup = ((run_collapse_differences,
'xxx',
'Counting differences',
'count_differences'),)
for run,rule,title,fname in setup:
#f = open('align_'+fname+'-revised.html', 'w')
f = open(fname+'.html', 'w')
print >>f, '''<html><head>
<meta http-equiv="content-type" content="text/html; charset=utf-8">
<title>Observed changes from baseline English</title></head><body>'''
lev.setRuleCompare(rule)
#to_html(f, title, run(list(fs)))
for attr in (getsrc,getdst,getpair):
to_html_group_differences(f,
"%s – %s" % (title,attr.func_name),
find_difference(run(list(fs), attr)))
## to_html_differences(f,
## "%s – %s" % (title,attr.func_name),
## run(list(fs), attr))
## map(to_html,
## ('utp02', 'see26', 'sgb20', 'sgj20', 'sgl20', 'sif20','siw20','siz20'),
## run(fs))
print >>f, '</body></html>'
f.close()
# number of things that got thrown out because they were shared
{'': 18,
'\xc9\x99': 3,
'\xc9\x9b': 1,
'b': 7,
'e': 1,
'd': 2,
'\xc9\x91': 6,
'k': 8,
'j': 1,
'\xca\xb0': 6,
'\xc9\x94': 3,
'o': 3,
'n': 2,
'p': 1,
's': 3,
'\xc9\xaa': 4,
't': 7,
'\xca\x8a': 7,
'v': 0,
'w': 2,
'\xca\x83': 0}
{'': 31,
'\xc9\x99': 4,
'\xc9\x9b': 2,
'b': 8,
'e': 1,
'd': 2,
'\xc9\x91': 7,
'k': 8,
'j': 2,
'\xca\xb0': 8,
'\xc9\x94': 4,
'o': 4,
'n': 3,
'p': 5,
's': 6,
'\xc9\xaa': 10,
't': 9,
'\xca\x8a': 7,
'v': 1,
'w': 2,
'\xca\x83': 1}
# changes that weren't even shared by at least one of the others
# (so actually these should be calculated also at some point)
set(['\xc9\x9c', '\xc9\x92', '\xc9\xbe', '\xc3\xa7', '\xc3\xa6', '\xc9\xa8', '\xc9\xab', '\xc9\xac', '\xc9\xaf', '\xc3\xb0', '\xca\x94', '\xce\xb8', '\xca\x8f', '\xca\x89', '\xca\x82', 'a', '\xca\x8c', 'g', 'f', 'i', 'h', 'm', 'l', 'r', 'z'])
# segments with [dorsal] cause a violation (in the OT paper) (this def is a
# little weak)
# try lining up everything logically for the HTML dump
# clustering is really clustering pathologies and then I would like to extract
# the details so that treatment can be prescribed once particular patterns of
# deafness are identified and categorised
# worked on Tuesday 1.5 h looking for this paper.
# Wednesday 2 h + 8.25-22
# Thursday: 8:30 - 11:45
# Friday: 9:15 - 9:45
# try to find 'guidelines for constraints' paper again
# I uh, can't find this, but here are some cool papers on the ROA (abstract only)
# 909: Boersma and Hamann show that the 'prototype effect' can be derived
# by OT simulations that optimise their grammars.
# 484: Jonas Kuhn's thesis on computational OT syntax (OT-LFG)
# 895: An alternative to iterative footing (might be relevant, the abstract
# seems a little confused)
# 888: Proves that the computational complexity of stochastic OT learning
# algorithms is k-1
# 883: Tessier's BCD dissertation
# 878 (823) (844.8): McCarthy's OT-CC 'Slouching toward optimality'
# 873: Anttila's T-orders
# 872: Pater et al: Harmonic Grammars translate into linear systems. I think
# these grammars are supersets of OT grammars. They have code available.
# 863/864: Andries Coetzee: I think this is the weird non-OT talk he gave at
# Phonology Fest weekend. (His dissertation is at 687)
# 858: Hayes and Wilson's Maximum Entropy Learning
# 851: Oostendorp argues against Port's incomplete neutralisation
# 835: On-line learning of underlying forms. 10 pages! But it's magic!
# 818: Use a freakin' machine to do OT! Also, Finnish is hard.
# 811: Learn underlying forms by restricting search to a lexical subspace. (short too)
# 798: Prince turns OT back into Harmony Theory via 'utility functions'??
# 844.12: Tesar talks about learning paradigms
# 794: Hey! It's those FRed people! But with a paper instead of bad Ruby.
# 780: Pater shows how to handle variation with an RCD-family algorithm
# 746: Apoussidou and Boersma compare GLA and EDCD for learning stress.
# GLA is better. Surprise!
# 739: Pater modifies BCD to learn Stratal OT (?) grammars.
# 695: Tesar creates Contrast Analysis for learning (see 811)
# 688: Generating 'contenders' from an infinite list of candidates. FSTs+RCD
# 683: McCarthy shows how to learn faithful /B/->[B] mappings after having
# learnt the harder /A/->[B] one.
# 672..675: Keller and Asudeh: GLA sucks! (although RCD does too)
# 638: Boersma reviews Tesar & Smolensky 2000 and says that learnability means
# that not all factorial typologies are possible??!
# 625: Jaeger: compares Stochastic OT with Boersma's MaxEnt model and shows
# that you can get GLA to work with Maximum Entropy too and you get
# guaranteed convergence
# 620: Tesar and Prince use phonotactics (?) to learn phono. alternations
# 618/619: " " et al add inconsistency detection to BCD, speeding it up
# 610: A U Mass thesis on syncope
# 600: Some constraints generate violations quadratic in the length of the word
# like Align(Foot, Word), so you can prove that OT phonology is not regular.
# 592: Catalan may have similar syllabification to Mongolian in its clitics
# 562: Prince explains comparative tableaux (I think I have this already)
# 544: Jaeger: Proposes Bidirectional GLA (sets up a speaker/hearer loop?)
# 537: Prince and Smolensky's original OT manuscript, revised slightly
# 536: Prince explores alternative architectures more similar to Harmony Theory
# from the 80s. And sees what happens.
# 500: Entailed Rankings Arguments: Prince formalises what a machine needs to
# know to do OT. (I think I have this already)
# 463: Somebody wrote a constraint runner in 2001. As usual, works on stress.
# 459: More candidates than atoms in the universe: somebody bad at math debunks
# OT again. (n m) not n!m! maybe...
# 446: Broselow writes about Stress-epenthesis interactions.
# (I may have this already)
# 426: (Tesar introduces inconsistency detection)
# 418: Lombardi explains why L2 English speakers use either [s] or [t] based on L1
# 400: Minimal constraint demotion in (human) acquisition of German
# 392: Argument that pure Lexicon Optimisation is too restrictive
# 390: Michael Hammond does some more logic<=>OT isomorphisms
#