\chapter{Statistical Machine Translation}
\label{chap:smt}
% TODOFINAL fix vertical space after "even if one strives"
% TODOFINAL define oracles and how to compute them
% TODOFINAL talk about cmert/lmert/pro in all exps
% TODOFINAL grep for paragraph and replace by sub or subsubsection where appropriate
% TODOFINAL if time, add more hiero rule types, e.g. oov, deletions, etc.
% TODONEVER if time, expand the hiero rule extraction section with the actual implementation where holes are poled
% TODOFINAL review all equations and make sure preceded by colon rather than period.
% TODOFINAL rule filtering
% TODOFINAL grep for mbox and replace by text
% TODOFINAL grep for | and replace by \mid
% TODOFINAL grep footnote and replace by citation when appropriate
% TODOFINAL grep for ?? in pdf
% TODOFINAL check space after argmax
% TODOFINAL (done ?) ask Bill if MTTK and/or GIZA can be used for word based decoding
% TODOFINAL check word alignment vs. word-alignment
% TODOFINAL check notations with \hat: bold inside or outside the hat
% TODOFINAL grep all occurrences of start and end of sentence and use texttt
% TODOFINAL (done ?) remove the clearpage linebreaks newpages, etc.
% TODOFINAL review all source to target target to source and add hyphens
% TODOFINAL review all itemize and use consistent punctuation
% TODOFINAL add urls to thesis.bib where possible
% TODOFINAL (done !) grep for \cite and correct into \citet or \citep
% TODOFINAL review all transitions between sections
% TODOFINAL check title capitalization
% TODOFINAL review and expand all captions
% TODOFINAL check where sentence pair defined
% TODOFINAL check where parallel sentence, parallel corpus defined
% TODOFINAL review proper use of em dashes
% TODOFINAL check use of hierarchical phrase-based and replace with synchr gram where appropriate
% TODOFINAL check acronym MT and SMT
% TODOFINAL grep for source channel and put a hyphen
% TODOFINAL (done !) (remove "and colleagues")
% TODOFINAL or TODONEVER add some related work for reparameterization of model 2 and maybe discriminative alignment ?
% TODOFINAL remove vertical bar for conditional proba
% TODOFINAL replace \ by \, in multiplications
% TODOFINAL replace all {\em
% TODOFINAL remove all \bf and \it and \bfseries and \itshape
% TODOFINAL replace all refs by autoref
% TODOFINAL put a tilda before all citep
% TODOFINAL grep for trailing spaces
% TODOFINAL (done ?) add a note in the extraction from posterior chapter that we use word-to-word HMM phrase posteriors as opposed to word-to-phrase HMM phrase posteriors
% TODOFINAL (done ?) source channel or noisy channel model ?
% TODOFINAL (done ?) Bill quick question: in MTTK f2e directory, what direction is it ?
% TODOFINAL for phrase pair extraction, cite also IEEE paper, not just Yonggang thesis
% TODOFINAL (done) : papers to read: chen and goodman, yonggang IEEE, koehn's book
% TODOFINAL see alternative presentation of rule extraction in one my reading group presentations
% TODOFINAL (done !) replace all refs by autoref
% TODOFINAL replace all HMMs by mbox{HMM}
% TODOFINAL remove all mbox and replace by text
% TODOFINAL maybe cite Yee Whye Teh's 2006 paper for lm
% TODOFINAL (done) soften down claim that koehn et al is wrong in extraction from posteriors (which contradicts etc.)
% TODOFINAL (done) maybe include alignment template paper
% TODOFINAL read papers by Papineni et al 1997, Papineni et al 1998
% TODOFINAL (done) check instance of noisy channel and source channel
% TODONEVER maybe include mention of Berger et al. 1994 The Candide System
% TODOFINAL look at this paper by Venugopal et al 2003: Effective Phrase Translation Extraction from Alignment Models
% TODOFINAL look at this paper: Papineni, Roukos, and Ward (1997, 1998)
% TODOFINAL look at this paper: Comparing and Integrating Alignment Template and Standard Phrase-Based Statistical Machine Translation
% TODOFINAL look at Knight 1999 that MT search is NP-complete
% TODOFINAL learn about A* search
% TODOFINAL learn about (admissible) heuristics used in phrase-based MT
% TODOFINAL review Zens et al 2002 KI 2002
% TODOFINAL maybe look at ngram translation models
% TODOFINAL (done ?) note on how hiero models reordering directly but still can be improved a usual lexicalized reordering model
% TODOFINAL comment on hypothesis recombination and how a hyp may be lost for rescoring
% TODOFINAL maybe look at Tillmann et al 1997 for stack decoding
% TODOFINAL maybe look at A* search Och et al 2001
% TODOFINAL make a section with definitions
% TODOFINAL read the mathematics of SMT, damn it
% TODOFINAL grep for hyphen (phrase-based etc.)
% TODOFINAL harmonize notation \bm{f} vs. f_1^J
% TODOFINAL (done) grep for cite grep -v citep or citet
% TODOFINAL (done) grep for noindent
% TODOFINAL grep for {\em}, grep for {\bf}
% TODOFINAL grep for mbox
% TODOFINAL (done) grep for linebreak
% This is a background chapter on SMT with emphasis on the parts
% that will be extended further for research.
%1st year report:
%-- definition hiero grammar
%-- rule patterns
%-- generative model, log linear model
%-- features
%-- metrics
%-- mert
%-- language modelling
%-- word alignment: hmm , ctext hmm, w2p hmm, symmetrisation
%-- rule extraction
%-- decoding with wfst
%-- rescoring
%currently:
%-- generative model
%-- word alignment: hmm, w2p hmm, symmetrisation
%-- language modelling
%-- phrase-based translation
%-- mert
%-- rescoring
%missing:
%-- mapreduce
% TODOFINAL refs to background in other chapters:
%-- rule extraction alignment constraints
%-- wphmm
%-- lexical feature formula
%-- patterns and pattern filtering
%-- HiFST
%-- standard hiero grammar (patterns, etc.)
%-- mapreduce
%-- extraction constraints (retrieval and extraction)
%-- notation in background harmonized with the rest of chapters
%-- filters for grammar retrieval
%original source channel formulation OK
%word alignment OK
%newer log-linear formulation OK
%phrase-based translation OK
%hierarchical phrase-based translation OK
%features OK
%language modelling: one of the features OK
%optimization: metrics, mert OK
%hifst OK
%rescoring OK
%missing: mapreduce, phrase based translation
%\section{Overview}
% This should be an overview of the translation pipeline.
% TODOFINAL TODONEVER ? Maybe move it later as a summary and explanation of how things
% are done in practice
Statistical Machine Translation (SMT)~\citep{brown-dellapietra-dellapietra-mercer-1993,lopez:2008:ACMComputingSurveys,koehn:2010:book}
has become the dominant approach to machine translation, as increasing
amounts of data and computing power have become available.
In the SMT paradigm, given a sentence in a source language,
conceptually all
possible sentences in a target language are assigned a probability, a
score, or a cost and the best translation is picked according to a certain
decision criterion that relates to these probabilities, scores or costs.
The research challenge is to develop models that assign scores that
reflect human judgements of translation quality.
% TODOFINAL TODONEVER ? add some blabla paragraph
In this chapter, we first review the historical background of SMT in
\autoref{sec:historicalBackground}.
We then present the original source-channel model for SMT
in \autoref{sec:sourceChannelModel}.
Word alignment models, which we review in
\autoref{sec:StatisticalMachineTranslationWordAlignment},
were introduced within the framework of the source-channel
model. The original source-channel model was extended into the log-linear
model, presented in \autoref{sec:loglinearModel}.
The field of SMT shifted from word-based models to phrase-based
models, introduced in \autoref{sec:phraseBasedTranslation}, while
retaining word-based models in their first formulation as a preliminary step.
Phrase-based translation was extended into hierarchical
phrase-based translation, which we review in
\autoref{sec:hierarchicalPhraseBasedTranslation}.
% TODOFINAL (done ?) put this somewhere else, e.g. in motivation section of hierarchical phrase based mt
%Hierarchical phrase-based translation model
%``gappy'' phrases and reordering
%with a probabilistic synchronous context-free grammar.
We then
examine various features employed in state-of-the-art decoders in
\autoref{sec:features}. The target language model, which
is one of the most important features in translation, is explored in
more detail in \autoref{sec:languageModelling}. In
\autoref{sec:optimization}, we review optimisation techniques that
are employed in order to tune the decoder parameters. We finally present
how finite state transducers can be used in decoding
in \autoref{sec:hifst}. Various rescoring
procedures are reviewed in \autoref{sec:rescoring}.
% TODO MapReduce !!!!!!!!!!!!!!!!!!!!!!
\section{Historical Background}
\label{sec:historicalBackground}
%\begin{itemize}
% \item warren weaver and the Translation report
% \item development of rule based systems
% \item development of word based systems + source channel model
% \item development of phrase based systems + discr model
% \item development of syntactic systems
% \item neural networks ????
%\end{itemize}
%warren weaver
%historical survey: From First Conception to First Demonstration: the Nascent Years of Machine Translation, 1947–1954. A Chronology
%emphasize that the initial idea at a time when the idea is possible to implement dates from warren weaver. before that, just speculation since computers did not exist.
%talk about resurgence in mt of interlingua methods (tomas mikolov and word2vec software)
%mention quicksort as application of MT ?
%warren weaver Translation report: word 2 word translation not good. multiple meaning solution: look at context
%translation and cryptography a book written in Chinese is simply a book written in English which was coded into the the "Chinese code".
%translate with a certain confidence.
%language invariants: machine translation pyramid, interlingua, shout from building to building or go through the tunnel
In this section, we present a brief historical background of \emph{statistical}
machine translation. A more comprehensive account of the history
of machine translation in general can be found
elsewhere~\citep{hutchins:1997:MT,hutchins:2000:MT}.
Warren Weaver can be considered the father of modern SMT.
At a time when the first computers were being developed, he
examined their potential application to the problem of machine
translation. In his memorandum~\citep{weaver:1955:Translation}, he
addressed the problem of multiple meanings of a source word
by considering the context of that source word, which heralds
phrase-based translation techniques and the use of context
in machine translation. He was also
the first to frame machine translation as a source-channel
model by considering that a sentence in a foreign language
is some form of code that needs to be broken, in analogy
to the field of cryptography. Finally, he emphasised the
statistical aspect of machine translation. He also predicted,
however, that the most successful approaches to machine
translation would take advantage of language invariants by
using an intermediate language representation in the translation
process. Even though state-of-the-art statistical translation systems do not
use this kind of approach, we do notice a resurgence in intermediate
language representation techniques~\citep{mikolov-le-sutskever:2013:arxiv}.
The first successful implementations of Warren Weaver's ideas
were carried out by IBM in the 1990s. The source-channel
model together with a series of word alignment models were introduced
by~\citet{brown-dellapietra-dellapietra-mercer-1993} while
\citet{berger-dellapietra-dellapietra:1996:CL} addressed the problem
of multiple meanings using context in a maximum entropy framework.
Word-based models were extended into various
phrase-based models in the late 1990s and early
2000s~\citep{och-tillmann-ney:1999:EMNLP,koehn-och-marcu:2003:NAACL,och-ney:2004:CL}
and later on into synchronous context-free grammar
models~\citep{chiang:2005:ACL,chiang:2007:CL}.
\section{Source-Channel Model}
\label{sec:sourceChannelModel}
% TODOFINAL (done) check whether we say noisy channel or source channel model
% TODOFINAL (done ?) normalize SMT
%brown et al series of papers directly inspired by warren weaver
%additional papers:
%brown et al 90: a statistical approach to machine translation
%notes for: a statistical approach to language translation
%glossary creation
%in the intro, phrase-based translation is basically described !!! partition source text into set of fixed locution (~ phrase), use glossary + contextual info to translate
%the phrases, arrange words in the target !!!!!!!!!!!!!!!!!!!!!!!!!!
%find word pairs using maximum mutual information criterion
Statistical machine translation was originally framed as a source-channel
model~\citep{shannon:1948:BellSystemTechnicalJournal,brown-cocke-dellapietra-dellapietra-jelinek-lafferty-mercer-roossin:1990:CL,brown-dellapietra-dellapietra-mercer-1993}.
Given a
foreign sentence $\bm{f}$, we want to find the original English sentence
$\bm{e}$ that went through a noisy channel and produced $\bm{f}$. Note that in
the source-channel model notation, what we would like to
recover---the English sentence---is
called the \emph{source} while what is observed---the foreign sentence---is
called the \emph{target}. A source-channel model assigns probabilities
from source (English) to target (foreign) but in translation, the model
is used to infer the source that was most likely to have generated
the target.
We do not use this convention here and call the
\emph{source} what we are translating from and the \emph{target} what we are
translating into. This convention is frequently
adopted~\citep{och-tillmann-ney:1999:EMNLP,och-ney:2002:ACL,och-ney:2004:CL}
in SMT,
and more so since SMT has been framed as a log-linear
model (see \autoref{sec:loglinearModel}). We use the
decision rule in \autoref{eq:noisy}, which minimises the risk under
a zero-one loss function (see \autoref{sec:lmbr}):
%
\begin{align}
\bm{\hat{e}} &= \argmax_{\bm{e}} p(\bm{e} \mid \bm{f}) \nonumber \\
&= \argmax_{\bm{e}} \frac{p(\bm{f} \mid \bm{e}) \, p(\bm{e})}{p(\bm{f})} \text{ (Bayes' rule)} \nonumber \\
&= \argmax_{\bm{e}} p(\bm{f} \mid \bm{e}) \, p(\bm{e}) \label{eq:noisy}
\end{align}
%
$\bm{\hat{e}}$ is the hypothesis to be selected.
$p(\bm{f} \mid \bm{e})$ is called the \emph{translation model} while
$p(\bm{e})$ is called the (target) \emph{language model}.
The translation model and the language model are estimated separately
for practical reasons: the amount of parallel data used to train the translation
model is in general orders of magnitude smaller than the amount of monolingual
data used to train the language model. Another justification is that
using two separate models makes the translation process modular: improving
the translation model may help improve \emph{adequacy}, i.e.\ how well the meaning
of the source text is preserved in the translated text, while improving
the language model may help improve \emph{fluency}, i.e.\ how well-formed the
translation is. It is therefore considered
preferable to train both a translation model and a language model.
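The decision rule in \autoref{eq:noisy} can be sketched with toy stand-ins
for the two models. Everything below (the candidate list, the probability
tables, the function names) is illustrative: a real decoder never enumerates
candidate sentences explicitly, and both models are decomposed much further,
as described in the remainder of this chapter.

```python
import math

def noisy_channel_decode(f, candidates, tm, lm):
    """Pick the candidate e maximising p(f | e) * p(e).

    tm(f, e) and lm(e) are toy stand-ins for the translation and
    language models. Scores are combined in log space to avoid
    numerical underflow on long sentences.
    """
    best, best_score = None, float("-inf")
    for e in candidates:
        score = math.log(tm(f, e)) + math.log(lm(e))
        if score > best_score:
            best, best_score = e, score
    return best

# Toy example: both word orders are equally likely under the translation
# model, but the language model prefers the fluent one.
tm = lambda f, e: {"the house": 0.6, "house the": 0.6}.get(e, 1e-9)
lm = lambda e: {"the house": 0.5, "house the": 0.01}.get(e, 1e-9)

print(noisy_channel_decode("la maison", ["the house", "house the"], tm, lm))
```

This illustrates the modularity argument above: the language model alone
breaks the tie between two translations of equal adequacy.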
In these models, parallel sentence pairs and target sentences are
not used directly as parameters because of an obvious sparsity
problem. Parallel sentence pairs are further broken down using
word-based models (see \autoref{sec:StatisticalMachineTranslationWordAlignment}),
phrase-based models (see \autoref{sec:phraseBasedTranslation})
and hierarchical phrase-based models
(see \autoref{sec:hierarchicalPhraseBasedTranslation}). For language
modelling, sentences are broken down into windows of consecutive
words using $n$-gram language models (see \autoref{sec:languageModelling}).
We will see in the next section how to decompose
the translation model using word alignment, which is
introduced as a latent variable into the source-channel model.
\section{Word Alignment}
\label{sec:StatisticalMachineTranslationWordAlignment}
In the previous section, we have briefly described the source-channel model, which
describes the translation process. This model cannot be used directly in
practice as it has too many parameters, namely all imaginable
sentence pairs and target sentences. In order to address this issue,
the \emph{alignment} between source words and target words will be
introduced as a latent variable in the source-channel model.
Given a sentence pair $(\bm{f}, \bm{e})$ with source sentence
length $J = |\bm{f}|$ and target sentence
length $I = |\bm{e}|$, a \emph{word alignment} $\bm{a}$
for this sentence pair is a mapping between the source and target
words. In other words, $\bm{a}$ is a subset of the cross product
of the set of source words and their positions and the set of target
words and their positions, as defined in \autoref{eq:alignmentSetDefinition}:
%
\begin{equation}
\bm{a} \subset \{((f_j, j), (e_i, i)), (j, i) \in [1, J] \times [1, I]\}
\label{eq:alignmentSetDefinition}
\end{equation}
%
When it is clear from context which sentence pair $(\bm{f}, \bm{e})$ is
being word-aligned, we may simply consider source word positions
and target word positions. In that case, $\bm{a}$ is simply defined
as a set of (source position, target position) pairs, as in
\autoref{eq:alignmentSetDefinitionSimpler}:
%
\begin{equation}
\bm{a} \subset [1, J] \times [1, I]
\label{eq:alignmentSetDefinitionSimpler}
\end{equation}
%
Each element of $\bm{a}$ is called an \emph{alignment link}.
Alignment links between source and target words
correspond to semantic or syntactic equivalences shared by these words in the
source and target language and in a particular
sentence pair. Alignments can present many-to-one and one-to-many
mappings as well as reordering as highlighted by crossing links. An example
of word alignment is
shown in \autoref{fig:examplealign}.
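The set-of-links view in \autoref{eq:alignmentSetDefinitionSimpler} is
straightforward to manipulate programmatically. The sketch below encodes
the alignment of \autoref{fig:examplealign} as 1-indexed (source, target)
position pairs and checks the properties just discussed; the helper names
are illustrative, not from any alignment toolkit.

```python
# Alignment for "Soñé con una piedra lunar pálida" -> "I dreamt of a pale
# moonstone", following the positions in the example figure.
links = {(1, 1), (1, 2), (2, 3), (3, 4), (4, 6), (5, 6), (6, 5)}

def aligns_to_many(a, j):
    """True if source position j is linked to more than one target word."""
    return sum(1 for (s, t) in a if s == j) > 1

def has_crossing(a):
    """True if any two links cross, indicating reordering."""
    return any(s1 < s2 and t1 > t2 for (s1, t1) in a for (s2, t2) in a)

print(aligns_to_many(links, 1))  # Soñé links to both "I" and "dreamt"
print(has_crossing(links))       # pálida--pale crosses lunar--moonstone
```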
% TODONEVER maybe add circles around the words
%
\begin{figure}
\begin{center}
\begin{tikzpicture} [node distance = 2cm, text height=1.5ex, text depth=.25ex]
% place nodes
\node (Sone) {Soñé};
\node [right of = Sone] (con) {con};
\node [right of = con] (una) {una};
\node [right of = una] (piedra) {piedra};
\node [right of = piedra] (lunar) {lunar};
\node [right of = lunar] (palida) {pálida};
\node [below of = Sone] (I) {I};
\node [right of = I] (dreamt) {dreamt};
\node [right of = dreamt] (of) {of};
\node [right of = of] (a) {a};
\node [right of = a] (pale) {pale};
\node [right of = pale] (moonstone) {moonstone};
% draw edges
\draw (Sone) -- (I);
\draw (Sone) -- (dreamt);
\draw (con) -- (of);
\draw (una) -- (a);
\draw (piedra) -- (moonstone);
\draw (lunar) -- (moonstone);
\draw (palida) -- (pale);
\end{tikzpicture}
\end{center}
\caption{Example of word alignment $\bm{a}$ for a Spanish-English sentence pair.
$\bm{f}$ is the Spanish sentence, $\bm{e}$ is the English sentence.
Both the source (Spanish) length $J$ and the
target (English) length $I$ are 6. This alignment
exhibits many-to-one mappings (\emph{I} and \emph{dreamt} align
to \emph{Soñé}), one-to-many mappings (\emph{moonstone} aligns
to \emph{piedra} and \emph{lunar}), as well as crossing links
(the link \emph{pale}---\emph{pálida} crosses the
link \emph{moonstone}---\emph{lunar}).}
\label{fig:examplealign}
\end{figure}
%
\citet{brown-dellapietra-dellapietra-mercer-1993} introduce the
alignment $\bm{a}$ as a latent variable in the translation model
$p(\bm{f} \mid \bm{e})$, as in \autoref{eq:introduceAlignment}:
\begin{equation}
p(\bm{f} \mid \bm{e}) = \sum_{\bm{a}} p(\bm{f}, \bm{a} \mid \bm{e})
\label{eq:introduceAlignment}
\end{equation}
%
We abuse notation by calling $\bm{a}$ both the latent variable
and the set of alignment links, which is an instance of the latent
variable.
For mathematical convenience and in order to allow simplifications,
given a sentence pair $(\bm{f}, \bm{e})$ with source length
$J$ and target length $I$,
$\bm{a}$ is restricted to be a function from source word positions
to target word positions, as in \autoref{eq:alignmentDefinition}:
%
\begin{equation}
\begin{split}
\bm{a} : [1, J] &\longrightarrow [0, I] \\
j &\longmapsto a_j
\end{split}
\label{eq:alignmentDefinition}
\end{equation}
%
The target position zero is included to
model source words not aligned to any target word; these unaligned source words
are virtually aligned to a so-called \emph{null word}. Note that this definition
is not symmetric: it only allows many-to-one mappings from source to target.
Various symmetrisation strategies, presented in \autoref{sec:symmetrisationHeuristics},
have been devised to address this limitation.
Also note that we did not
take into account the null word in our initial definition of alignments
in \autoref{eq:alignmentSetDefinition} because in general, alignments
are obtained from symmetrisation heuristics
(see \autoref{sec:symmetrisationHeuristics}) where the null word
is ignored.
We can use the latent variable
$\bm{a}$ to rewrite the translation model in
\autoref{eq:generalEquationIBMModels}, with $\bm{f} = f_1^J$, $\bm{e} = e_1^I$
and $\bm{a} = a_1^J$:
%
\begin{equation}
\begin{split}
p(f_1^J \mid e_1^I) &= \sum_{a_1^J} p(f_1^J, a_1^J \mid e_1^I) \\
&= \sum_{a_1^J} \prod_{j = 1}^J p(f_j, a_j \mid f_1^{j - 1}, a_1^{j - 1}, e_1^I) \\
&= \sum_{a_1^J} \prod_{j = 1}^J p(f_j \mid f_1^{j - 1}, a_1^j, e_1^I) \, p(a_j \mid f_1^{j - 1}, a_1^{j - 1}, e_1^I)
\end{split}
\label{eq:generalEquationIBMModels}
\end{equation}
%
\citet{brown-dellapietra-dellapietra-mercer-1993} present a series
of five translation models of increasing complexity that parameterise the terms
$p(f_j \mid f_1^{j - 1}, a_1^j, e_1^I)$ and
$p(a_j \mid f_1^{j-1}, a_1^{j-1}, e_1^I)$.
Parameter estimation is carried out with the
expectation-maximisation algorithm~\citep{dempster-laird-rubin:1977:JRSS}.
Also based on \autoref{eq:generalEquationIBMModels}, \citet{vogel-ney-tillmann}
introduce an HMM model~\citep{rabiner:1989:IEEE} for
word alignment and \citet{deng-and-byrne:2008:ASLP}
extend the HMM model to a word-to-phrase HMM model. We describe these two models in the following
sections.
% TODONEVER should I present all IBM models ???
% TODOFINAL think about where to put this
We have described word alignment models in the context of the source-channel
model. In that context, word alignment models can be used directly for word-based
decoding.\footnote{e.g. \url{http://www.isi.edu/licensed-sw/rewrite-decoder}} % TODOFINAL (done) check whether MTTK and/or GIZA++ can also be used in decoding mode
However, nowadays, word alignment models are used as a preliminary
step in the machine translation training pipeline, namely
prior to rule extraction (see \autoref{sec:phrasextract} and
\autoref{sec:hierruleextract}). In that case,
the word alignment models are used to produce Viterbi alignments, defined
in \autoref{eq:viterbiAlignment}:
%
\begin{equation}
%\begin{split}
%\hat{a}_1^J &= \argmax_{a_1^J} p(a_1^J \mid f_1^J, e_1^I) \\
\hat{a}_1^J = \argmax_{a_1^J} p(f_1^J, a_1^J \mid e_1^I)
%\end{split}
\label{eq:viterbiAlignment}
\end{equation}
%
One contribution of this
thesis is to use alignment posterior probabilities instead of
Viterbi alignments for rule
extraction (see \autoref{chap:extractionFromPosteriors}).
\subsection{HMM and Word-to-Phrase Alignment Models}
\label{sec:statisticalMachineTranslationHmmAlignmentModel}
% notes on Yonggang's 2008 journal paper
%in the paper notation, assuming translation is from foreign to English,
%source denotes English (target in thesis notation) and target denotes
%foreign (source in thesis notation)
%source sentence of I words s = s_1^I
%target sentence of J words t = t_1^J
%target sentence segmented into K target phrases
%target phrases: v_1^K
%each v_k generated by a single word in the source phrase
%correspondence source words target phrases: alignment a_1^K
%s_{a_k} -> v_k
%number of words in each target phrase: phi_k
%constraint: J = sum_{k=1}^K phi_k
%NULL source word
%alternative to NULL word: h_1^K hallucination sequence
%h_k = 0: NULL -> v_k; h_k = 1: s_{a_k} -> v_k
%a = (phi_1^K, a_1^K, h_1^K, K)
%p(t,a|s) = p(v_1^K, K, a_1^K, h_1^K, phi_1^K | s)
We review HMM and word-to-phrase HMM models as these models are used
in experiments throughout this thesis.
\citet{vogel-ney-tillmann} introduce an HMM alignment model
that treats target word positions as hidden states and source words as
observations. The model is written in \autoref{eq:HmmAlignmentDefinition}:
%
\begin{equation}
p(f_1^J, a_1^J \mid e_1^I) = \prod_{j=1}^J p(a_j \mid a_{j-1},I) \, p(f_j \mid e_{a_j})
\label{eq:HmmAlignmentDefinition}
\end{equation}
%
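Under the factorisation in \autoref{eq:HmmAlignmentDefinition}, both the
alignment marginal $p(\bm{f} \mid \bm{e})$ and the Viterbi score can be
computed with standard HMM dynamic programming. The sketch below is a
minimal illustration: it omits the null word, and the transition and
emission tables, like all names here, are made-up toy values.

```python
def hmm_forward_viterbi(f, e, trans, emit):
    """Forward sum and Viterbi max for the HMM alignment model
    p(f, a | e) = prod_j p(a_j | a_{j-1}, I) p(f_j | e_{a_j}).

    trans[(i_prev, i)] is the transition probability; trans[(None, i)]
    is the initial distribution. emit[(fw, ew)] = p(fw | ew).
    """
    I = len(e)
    # alpha[i]: sum over alignment prefixes ending in target position i
    # delta[i]: max over alignment prefixes ending in target position i
    alpha = [trans[(None, i)] * emit[(f[0], e[i])] for i in range(I)]
    delta = list(alpha)
    for j in range(1, len(f)):
        alpha = [sum(alpha[k] * trans[(k, i)] for k in range(I))
                 * emit[(f[j], e[i])] for i in range(I)]
        delta = [max(delta[k] * trans[(k, i)] for k in range(I))
                 * emit[(f[j], e[i])] for i in range(I)]
    return sum(alpha), max(delta)

# Toy French-English example with illustrative probabilities.
trans = {(None, 0): 0.7, (None, 1): 0.3,
         (0, 0): 0.4, (0, 1): 0.6, (1, 0): 0.5, (1, 1): 0.5}
emit = {("la", "the"): 0.8, ("la", "house"): 0.1,
        ("maison", "the"): 0.1, ("maison", "house"): 0.9}
total, best = hmm_forward_viterbi(["la", "maison"], ["the", "house"],
                                  trans, emit)
print(total, best)  # marginal p(f | e) and best single-alignment score
```

The marginal is always at least the Viterbi score, since it sums the
Viterbi alignment together with all competing alignments.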
Word-to-phrase HMM models~\citep{deng-and-byrne:2008:ASLP} were designed to
capture interesting properties of IBM Model
4~\citep{brown-dellapietra-dellapietra-mercer-1993} in an HMM framework in order
to keep alignment and estimation procedures exact. We now present this model
in more detail, using our usual source/target convention, which is the reverse
of the one adopted in the original
publication.\footnote{$\bm{s}$ in the publication corresponds to $\bm{e}$ in this thesis
and $\bm{t}$ corresponds to $\bm{f}$; the lengths $J$ and $I$ keep the same meaning.}
In the word-to-phrase HMM alignment model, the source sentence
$\bm{f}$ is segmented into source phrases $v_1^K$.
The alignment $\bm{a}$ is represented by a set of variables
$(\phi_1^K, a_1^K, h_1^K, K)$ where:
%
\begin{itemize}
\item $K$ is the number of source phrases that form a segmentation of the source sentence $\bm{f}$.
\item $a_1^K$ is the alignment from target words to source phrases.
\item $\phi_1^K$ indicates the length of each source phrase.
\item $h_1^K$ is a \emph{hallucination} sequence that indicates whether
a source phrase was generated by the target null word
or by a usual target word.
\end{itemize}
%
The general form of the model is presented in \autoref{eq:word2phraseHmmGeneral}:
%
\begin{equation}
\begin{split}
p(\bm{f}, \bm{a} \mid \bm{e}) = & \; p(v_1^K, K, a_1^K, h_1^K, \phi_1^K \mid \bm{e}) \\
= & \; p(K \mid J, \bm{e}) \times \\
& \; p(a_1^K, \phi_1^K, h_1^K \mid K, J, \bm{e}) \times \\
& \; p(v_1^K \mid a_1^K, h_1^K, \phi_1^K, K, J, \bm{e})
\end{split}
\label{eq:word2phraseHmmGeneral}
\end{equation}
%
We now review the modelling decisions taken for each
of the components from \autoref{eq:word2phraseHmmGeneral}.
The first component is simply modelled by:
%
\begin{equation}
p(K \mid J, \bm{e}) = \eta^K
\end{equation}
%
where $\eta$ is a parameter that controls the number of source phrases in the segmentation.
The second component is modelled using the Markov assumption:
%
\begin{align}
p(a_1^K, \phi_1^K, h_1^K \mid K, J, \bm{e})
&= \prod_{k = 1}^K p(a_k, h_k, \phi_k \mid a_{k - 1}, \phi_{k - 1}, h_{k - 1}, K, J, \bm{e}) \nonumber \\
&= \prod_{k = 1}^K p(a_k \mid a_{k - 1}, h_k, I) \, d(h_k) \, n(\phi_k \mid e_{a_k})
\end{align}
%
As in the HMM word alignment model, $a_k$ depends only on $a_{k - 1}$, the target
length $I$ and the binary value $h_k$. The distribution $d(h_k)$ is controlled by a
single parameter $p_0$, with $d(0) = p_0$. $n(\phi_k \mid e_{a_k})$ is a finite distribution
over source phrase lengths that depends on each target word. This parameter
is analogous to the fertility parameter introduced in IBM
Model 3~\citep{brown-dellapietra-dellapietra-mercer-1993},
which controls how many source words are aligned to a given target word.
The third component from \autoref{eq:word2phraseHmmGeneral} is defined in
\autoref{eq:word2phraseTranslation} and represents the word-to-phrase translation parameter:
%
\begin{equation}
p(v_1^K \mid a_1^K, h_1^K, \phi_1^K, K, J, \bm{e}) = \prod_{k = 1}^K p(v_k \mid e_{a_k}, h_k, \phi_k)
\label{eq:word2phraseTranslation}
\end{equation}
%
One key contribution of the word-to-phrase HMM model is the use of bigram translation
probabilities to model a single phrase translation, as shown
in \autoref{eq:bigramTranslation}:
%
\begin{equation}
p(v_k \mid e_{a_k}, h_k, \phi_k) = t_1(v_k[1] \mid h_k \cdot e_{a_k}) \prod_{j = 2}^{\phi_k} t_2(v_k[j] \mid v_k[j - 1], h_k \cdot e_{a_k})
\label{eq:bigramTranslation}
\end{equation}
%
where $h_k \cdot e_{a_k}$ is $e_{a_k}$ if $h_k = 1$ and the null word otherwise, $t_1$ is
a word-to-word translation probability and $t_2$ is a bigram translation probability.
\autoref{fig:wordtophrase}
shows a simplified version of the generative story for an HMM word-to-phrase
alignment model: first, pick the number of source phrases $K$ according to
$p(K \mid J, I)$; then
pick a target word given the previously chosen one; finally, generate each source
phrase from its aligned target word using fertility and bigram translation probabilities.
For example, we generate the source phrase \emph{les vaches} from
the target word \emph{cows}
according to \autoref{eq:exampleBigramTranslation}:
%
\begin{equation}
p(\text{\emph{les vaches}} \mid \text{\emph{cows}}) = p(\text{\emph{les}} \mid \text{\emph{cows}}) \, p(\text{\emph{vaches}} \mid \text{\emph{cows}}, \text{\emph{les}})
\label{eq:exampleBigramTranslation}
\end{equation}
%
Thus bigram probabilities take into account the context of the target word to
some extent.
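To make the factorisation concrete, the following sketch evaluates the joint probability of \autoref{eq:word2phraseHmmGeneral} for the \emph{fat cows} example under invented toy parameters; $\eta$, $p_0$ and all probability tables below are assumptions made purely for illustration, not estimated values:

```python
# Toy evaluation of the word-to-phrase HMM joint probability p(f, a | e).
# All parameter values (ETA, P0, transition, fertility, and translation
# tables) are invented for illustration.

ETA = 0.9   # segment-count parameter: p(K | J, e) = ETA ** K
P0 = 0.1    # null-alignment parameter: d(0) = P0, d(1) = 1 - P0

# Transition probabilities p(a_k | a_{k-1}) over target positions.
trans = {(1, 2): 0.4, (2, 3): 0.4, (3, 5): 0.2, (5, 4): 0.1}

# Fertility n(phi_k | e_{a_k}): source-phrase length given the target word.
fert = {("The", 1): 0.9, ("wolf", 1): 0.9, ("loves", 1): 0.9,
        ("cows", 2): 0.3, ("fat", 1): 0.9}

# Unigram (t1) and bigram (t2) translation tables.
t1 = {("Le", "The"): 0.5, ("loup", "wolf"): 0.6, ("aime", "loves"): 0.5,
      ("les", "cows"): 0.2, ("grasses", "fat"): 0.4}
t2 = {("vaches", "les", "cows"): 0.7}

def phrase_prob(phrase, e_word):
    """p(v_k | e_{a_k}, h_k = 1, phi_k): t1 for the first word, t2 after."""
    p = t1[(phrase[0], e_word)]
    for prev, cur in zip(phrase, phrase[1:]):
        p *= t2[(cur, prev, e_word)]
    return p

# Segmentation of "Le loup aime les vaches grasses" into K = 5 phrases,
# each paired with its aligned target word a_k (h_k = 1 throughout).
segments = [(("Le",), "The", 1), (("loup",), "wolf", 2),
            (("aime",), "loves", 3), (("les", "vaches"), "cows", 5),
            (("grasses",), "fat", 4)]

p = ETA ** len(segments)               # p(K | J, e)
prev = None
for phrase, e_word, a_k in segments:
    if prev is not None:
        p *= trans[(prev, a_k)]        # p(a_k | a_{k-1}, h_k, I)
    p *= (1 - P0)                      # d(h_k) with h_k = 1
    p *= fert[(e_word, len(phrase))]   # n(phi_k | e_{a_k})
    p *= phrase_prob(phrase, e_word)   # p(v_k | e_{a_k}, h_k, phi_k)
    prev = a_k

print(f"p(les vaches | cows) = {phrase_prob(('les', 'vaches'), 'cows'):.2f}")
print(f"p(f, a | e) = {p:.3e}")
```

The bigram decomposition for \emph{les vaches} follows \autoref{eq:exampleBigramTranslation}: $0.2 \times 0.7 = 0.14$ under these toy tables.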
%
%\begin{figure}
% \begin{center}
% \includegraphics[scale=0.5]{figures/wordtophrase2.eps}
% \end{center}
% \caption{Illustrative example of an HMM word-to-phrase alignment model. The adjective noun sequence ``fat cows'' is
% reordered into the noun adjective sequence ``vaches grasses''. The word ``cows'' has fertility 2 as it is translated
% into the target phrase ``les vaches''.}
% \label{fig:wordtophrase}
%\end{figure}
\begin{figure}
\begin{center}
\begin{tikzpicture} [node distance = 3cm, text height=1.5ex, text depth = .25ex, auto]
% place nodes
\node (the) {The};
\node [right of = the] (wolf) {wolf};
\node [right of = wolf] (loves) {loves};
\node [right of = loves] (fat) {fat};
\node [right of = fat] (cows) {cows};
\node [above of = loves] (chooseSeg) {$p(K = 5 \mid I = 5, J = 6)$};
\node [below of = the] (le) {Le};
\node [below of = wolf] (loup) {loup};
\node [below of = loves] (aime) {aime};
\node [below of = fat] (lesVaches) {les vaches};
\node [below of = cows] (grasses) {grasses};
% draw edges
\draw [->] (the) to node {$\phi_1 = 1$} (le);
\draw [->] (wolf) to node {$\phi_2 = 1$} (loup);
\draw [->] (loves) to node {$\phi_3 = 1$} (aime);
\draw [->] (fat) to node [right, xshift = 1.5em, yshift = -1em] {$\phi_5 = 1$} (grasses);
\draw [->] (cows) to node [left, xshift = -1.5em, yshift = -1em] {$\phi_4 = 2$} (lesVaches);
\draw [->] (the) to [bend left = 45] node {$p(2 \mid 1)$} (wolf);
\draw [->] (wolf) to [bend left = 45] node {$p(3 \mid 2)$} (loves);
\draw [->] (loves) to [bend left = 45] node {$p(5 \mid 3)$} (cows);
\draw [->] (cows) to [bend right = 45] node {$p(4 \mid 5)$} (fat);
\end{tikzpicture}
\end{center}
\caption{Simplified generative story for an HMM word-to-phrase alignment model.
Adapted from~\citep{deng-and-byrne:2008:ASLP}.
The adjective noun sequence \emph{fat cows} is
reordered into the noun adjective sequence \emph{vaches grasses}.
The word \emph{cows} has fertility 2 as it is translated
into the target phrase \emph{les vaches}.}
\label{fig:wordtophrase}
\end{figure}
\subsection{Symmetrisation Heuristics}
\label{sec:symmetrisationHeuristics}
We have mentioned that the IBM and HMM alignment models
are not symmetric: they only allow a many-to-one mapping from
source words to target words.
In order to address this issue, one can train
alignment models in both source-to-target and target-to-source
directions, obtain Viterbi alignments from
both models and apply symmetrisation
strategies~\citep{och-tillmann-ney:1999:EMNLP,och-ney:2003:CL,koehn-och-marcu:2003:NAACL}.
\citet{och-tillmann-ney:1999:EMNLP} designed
a first symmetrisation heuristic, later dubbed
the \emph{grow} heuristic. \citet{koehn-och-marcu:2003:NAACL}
later extended the \emph{grow} heuristic into
the \emph{grow-diag} and \emph{grow-diag-final} heuristics
and examined the impact on translation
performance for each heuristic.
Alignments from source to target (i.e. in which the alignment is
a function from source positions to target positions)
and target to source are denoted
$\bm{a}_{f2e}$ and $\bm{a}_{e2f}$ respectively.
Let us consider a sentence pair $(\bm{f}, \bm{e})$, and
source-to-target and target-to-source Viterbi alignments
$\bm{a}_{f2e}$ and $\bm{a}_{e2f}$.
The \emph{intersection} and \emph{union} heuristics
are defined as follows:
%
\begin{itemize}
\item \emph{intersection}: $\bm{a} = \bm{a}_{e2f} \cap \bm{a}_{f2e}$
\item \emph{union}: $\bm{a} = \bm{a}_{e2f} \cup \bm{a}_{f2e}$
\end{itemize}
%
The \emph{intersection} heuristic typically produces high-precision alignments
while the \emph{union} heuristic typically produces high-recall
alignments~\citep{och-ney:2003:CL}.
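With alignments represented as sets of $(j, i)$ links, the two heuristics are plain set operations. The link sets below are invented for illustration:

```python
# Intersection and union symmetrisation heuristics on toy alignments,
# with alignments as sets of (source position j, target position i) links.
# The link sets are invented for illustration.

a_f2e = {(1, 1), (2, 2), (3, 2)}   # source-to-target Viterbi alignment
a_e2f = {(1, 1), (2, 2), (2, 3)}   # target-to-source Viterbi alignment

intersection = a_f2e & a_e2f   # high precision: links both models agree on
union = a_f2e | a_e2f          # high recall: links proposed by either model

print(sorted(intersection))    # [(1, 1), (2, 2)]
print(sorted(union))           # [(1, 1), (2, 2), (2, 3), (3, 2)]
```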
We now present the \emph{grow} heuristic and its variants, which are based
on the initial \emph{intersection} and \emph{union} heuristics.
The \emph{grow} heuristic algorithm is presented in
\autoref{alg:growHeuristic}.
%
\begin{figure}
%\begin{footnotesize}
\begin{algorithmic}[1]
\Function{Grow}{$f_1^J, e_1^I, \bm{a}_{f2e}, \bm{a}_{e2f}$}
\State{$\bm{a} \gets \bm{a}_{f2e} \cap \bm{a}_{e2f}$} \hypertarget{alg:line:initGrow}{} \label{alg:line:initGrow}
\While{\textbf{true}}
\State{added $\gets$ \textbf{false}}
\For{$i \in [1, I]$}
\For{$j \in [1, J]$}
\If{$(j, i) \in \bm{a}$}
\For{$(k, l) \in $ \Call{Neighbours}{$(j,i)$} $\cap (\bm{a}_{f2e} \cup \bm{a}_{e2f})$} \hypertarget{alg:line:neighbours}{} \label{alg:line:neighbours}
\If{$k$ not aligned in $\bm{a}$ \textbf{or} $l$ not aligned in $\bm{a}$} \hypertarget{alg:line:notAlreadyAligned}{} \label{alg:line:notAlreadyAligned}
\State{$\bm{a} \gets \bm{a} \cup \{(k, l)\}$} \hypertarget{alg:line:addLink}{} \label{alg:line:addLink}
\State{added $\gets$ \textbf{true}}
\EndIf
\EndFor
\EndIf
\EndFor
\EndFor
\If{not added}
\State{\textbf{break}}
\EndIf
\EndWhile
\State{\Return $\bm{a}$}
\EndFunction
\end{algorithmic}
%\end{footnotesize}
\caption{Algorithm for the \emph{grow} symmetrisation
heuristic~\citep{koehn:2010:book}.
The alignment is initialised with the intersection of the two directional
alignments; links from the union that neighbour existing alignment links are then
iteratively added if the source or the target word is not already aligned.}
\label{alg:growHeuristic}
\end{figure}
%
The input is a sentence
pair $(f_1^J, e_1^I)$, a source-to-target
alignment $\bm{a}_{f2e}$ and a target-to-source alignment
$\bm{a}_{e2f}$. The resulting alignment $\bm{a}$ is initialised
with the
intersection (\hyperlink{alg:line:initGrow}{line \ref{alg:line:initGrow}}).
Then alignment links that are in the union and that are neighbours of already existing
alignment links (\hyperlink{alg:line:neighbours}{line \ref{alg:line:neighbours}})
are considered. If the source or the target word is not already
aligned (\hyperlink{alg:line:notAlreadyAligned}{line \ref{alg:line:notAlreadyAligned}}),
then the link is added to the resulting
alignment (\hyperlink{alg:line:addLink}{line \ref{alg:line:addLink}}).
This is repeated until no more links are added.
In the \emph{grow} heuristic, neighbours are defined as horizontal or
vertical neighbours. If diagonal neighbours are also considered, then
the heuristic becomes \emph{grow-diag}. The \emph{grow} heuristic
also has an optional step called \emph{final}: alignment
links in the union whose source or target word is not already
aligned are added to the resulting alignment. If only links
in the union whose source \emph{and} target words are both unaligned
are considered for the \emph{final} procedure, then
the optional step is called \emph{final-and}.
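The procedure of \autoref{alg:growHeuristic} and its \emph{grow-diag} variant can be sketched as runnable code; the directional alignments below are invented toy examples:

```python
# Sketch of the grow symmetrisation heuristic; diag=True gives grow-diag.
# Alignments are sets of (source position j, target position i) links.

def grow(a_f2e, a_e2f, diag=False):
    a = a_f2e & a_e2f                     # initialise with the intersection
    union = a_f2e | a_e2f
    offsets = [(-1, 0), (1, 0), (0, -1), (0, 1)]
    if diag:                              # grow-diag: also diagonal neighbours
        offsets += [(-1, -1), (-1, 1), (1, -1), (1, 1)]
    added = True
    while added:                          # repeat until no link is added
        added = False
        for j, i in sorted(a):
            for dj, di in offsets:
                link = (j + dj, i + di)
                if link not in union or link in a:
                    continue              # only consider new links from the union
                src_aligned = any(jj == link[0] for jj, _ in a)
                tgt_aligned = any(ii == link[1] for _, ii in a)
                if not src_aligned or not tgt_aligned:
                    a.add(link)
                    added = True
    return a

# Toy directional Viterbi alignments.
a_f2e = {(1, 1), (2, 2), (4, 3)}
a_e2f = {(1, 1), (3, 2), (4, 3)}

print(sorted(grow(a_f2e, a_e2f)))             # no horizontal/vertical growth
print(sorted(grow(a_f2e, a_e2f, diag=True)))  # diagonal neighbours added
```

In this example the plain \emph{grow} heuristic adds nothing beyond the intersection, while \emph{grow-diag} recovers the two remaining links from the union via diagonal neighbours.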
Symmetrisation heuristics have been shown to be beneficial
both in alignment quality, as measured
by comparing automatic alignments to human alignments, and
in translation quality when alignments are used as an
intermediate step in the translation pipeline.
%
%Symmetrised alignments have be shown to produce better translation results than
%unidirectional alignments. However, we find in one of our experiments that this
%is not always the case (see \autoref{sec:extractionFromPosteriorsSymmetrising}).
% This should review phrase based SMT.
% Why: the gyro decoder is like a phrase based SMT decoder.
% phrase based extraction + stack base decoding
\section{Log-Linear Model of Machine Translation}
\label{sec:loglinearModel}
%notes on adam berger paper
%model conditional prob
%p(y | x): x is phrase containing word "in", y is translation of "in"
%collect stats ptilda(x, y)
%binary feature: f(x, y) = 1 if y = en and April follows in
%expected value of f: ptilda(f) = sum_x,y ptilda(x,y) f(x,y)
%p(f) = sum_x,y ptilda(x) p(y | x) f(x,y)
%constraint: p(f) = ptilda(f)
%max entropy principle: choose p that satisfies the constraints
%and that maximizes entropy
%H(p) = -sum_x,y ptilda(x) p(y|x) log p(y|x)
%use Lagrange multiplier and Kuhn Tucker theorem to find that the solution
%is p(y|x) propto exp(\sum lambda_i f_i(x,y)). find lambda by max dual problem.
%also: p that satisfies constr and max entropy is also the
%p in the parametric family ... that maximizes likelihood of training sample.
%application to Candide.
%use max entropy modelling to predict translation of word in context.
%use max entropy modelling to predict word order.
%use max entropy modelling to segment.
%context dependent translation:
%first viterbi align training. then create events (x,y) (6 words
%surrounding in)
%incorporate this context dep translation into general translation model.
As we have seen in \autoref{sec:sourceChannelModel}, SMT was historically
framed as a source-channel model. As an alternative,
\citet{berger-dellapietra-dellapietra:1996:CL} introduce maximum entropy
models for natural language processing. In maximum entropy modelling, we
wish to estimate a conditional probability $p(\bm{y} \mid \bm{x})$.
Given a training sample, various feature functions deemed to be relevant
are picked. We then constrain $p$ such that the expected value of each
feature function $f$ with respect to the empirical distribution is equal
to the expected value of $f$ with respect to the model $p$. Finally, $p$
is chosen among all models that satisfy the constraints defined by the
features and such that its entropy is maximum.
\citet{berger-dellapietra-dellapietra:1996:CL} show how a maximum entropy
model can be parameterised as an exponential, or log-linear model. They
apply this model to three machine translation related tasks. First, they
use a maximum entropy model to predict the translation of a word using
the context for that word. Then, they use a maximum entropy model to
predict the target language word order. Finally, they apply maximum
entropy modelling in order to predict the source sentence segmentation.
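The constraint that empirical and model feature expectations match can be illustrated numerically. The toy sample, the feature and the candidate translations below are invented for illustration, not taken from \citet{berger-dellapietra-dellapietra:1996:CL}:

```python
# Numeric sketch of the maximum entropy constraint. The training sample,
# the feature, and the candidate translations are all invented.
import math
from collections import Counter

# Toy sample of (context, translation) events for the English word "in".
sample = [("in April", "en"), ("in April", "en"), ("in April", "dans"),
          ("in town", "dans"), ("in town", "dans"), ("in town", "en")]
p_emp = {xy: c / len(sample) for xy, c in Counter(sample).items()}
p_x = {x: sum(p for (xx, _), p in p_emp.items() if xx == x)
       for x in ("in April", "in town")}

# Binary feature: fires when "April" follows "in" and the translation is "en".
def f(x, y):
    return 1.0 if x == "in April" and y == "en" else 0.0

# Empirical expectation: sum_{x,y} ptilde(x, y) f(x, y).
emp_f = sum(p * f(x, y) for (x, y), p in p_emp.items())

# Log-linear (maximum entropy) model with a single weight lam on f.
def p_model(y, x, lam):
    z = sum(math.exp(lam * f(x, yy)) for yy in ("en", "dans"))
    return math.exp(lam * f(x, y)) / z

# Model expectation: sum_{x,y} ptilde(x) p(y | x) f(x, y).
def model_f(lam):
    return sum(p_x[x] * p_model(y, x, lam) * f(x, y)
               for x in p_x for y in ("en", "dans"))

print(emp_f, model_f(0.0))                        # untrained model violates the constraint
print(abs(model_f(math.log(2)) - emp_f) < 1e-9)   # lambda = ln 2 satisfies it
```

Training searches for the weight at which the two expectations coincide; here the constraint forces $p(\text{en} \mid \text{in April}) = 2/3$, which a single weight of $\ln 2$ achieves.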
%notes och and ney 2002
%search done by the maximum approximation
%first present log linear model
%log linear model generalization of source channel model
%log linear model presented with additional alignment variable
%p(e_1^I, a_1^J | f_1^J) propto exp(sum_1^M lambda_m h_m(e_1^I, f_1^J, a_1^J))
%alignment template model
%p(f_1^J | e_1^I) = sum_{z_1^K, a_1^K} p(a_1^K | e_1^I) . p(z_1^K | a_1^K, e_1^I) . p(f_1^J | z_1^K, a_1^K, e_1^I)
%each component modeled with max entropy model
\citet{och-tillmann-ney:1999:EMNLP} notice that using an erroneous
version of the source-channel model, that is, the following decision rule:
%
\begin{equation}
\bm{\hat{e}} = \argmax_{\bm{e}} p(\bm{e} \mid \bm{f}) \, p(\bm{e})
\end{equation}
%
gives performance comparable to the correct
formulation of the source-channel model given in \autoref{eq:noisy}.
Then \citet{och-ney:2002:ACL} propose the following log-linear model extension:
%
\begin{align}
\bm{\hat{e}} &= \argmax_{\bm{e}} p(\bm{e} \mid \bm{f}) \nonumber \\
&= \argmax_{\bm{e}} \frac{\exp(\sum_{m=1}^M \lambda_m h_m(\bm{e}, \bm{f}))}{\sum_{\bm{e'}}\exp(\sum_{m=1}^M \lambda_m h_m(\bm{e'}, \bm{f}))} \nonumber \\
&= \argmax_{\bm{e}} \exp(\sum_{m=1}^M \lambda_m h_m(\bm{e}, \bm{f})) \label{eq:loglinearModel}
\end{align}
%
where $h_m$ are called \emph{feature functions} and $\lambda_m$
are called \emph{feature weights}. The log-linear model is an extension
to the source-channel model because it can be reduced to the original
source-channel model with the following settings:
%
\begin{itemize}
\item $M = 2$
\item $h_1(\bm{e}, \bm{f}) = \log (p(\bm{f}|\bm{e}))$
\item $h_2(\bm{e}, \bm{f}) = \log (p(\bm{e}))$
\item $\lambda_1 = \lambda_2 = 1$
\end{itemize}
%
Log-linear models were originally trained with the maximum likelihood
criterion, which makes them exactly equivalent to maximum entropy
models~\citep{berger-dellapietra-dellapietra:1996:CL}. More
effective training techniques such as minimum error rate
training~\citep{och:2003:ACL} were introduced subsequently
(see \autoref{sec:mert}). An advantage of minimum error rate training
is that the criterion for optimisation and the evaluation metric are
consistent. Because minimum error rate training does not require computing
a normalisation constant, in practice, SMT models effectively become linear models, with the objective
function presented in \autoref{eq:linearModel}:
%
\begin{equation}
\bm{\hat{e}} = \argmax_{\bm{e}} \sum_{m=1}^M \lambda_m h_m(\bm{e}, \bm{f})
\label{eq:linearModel}
\end{equation}
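As a toy illustration of this linear decision rule, the sketch below scores a handful of hypothetical candidate translations with two feature functions (translation-model and language-model log-probabilities); the candidates, feature values and weights are all invented:

```python
# Toy scoring of candidate translations with the linear model:
# pick the candidate maximising sum_m lambda_m h_m(e, f).
# Candidates, feature values, and weights are invented.

# Each candidate has feature values (h_1, h_2) = (log p(f|e), log p(e)).
candidates = {
    "the wolf loves the cows": (-4.2, -6.1),
    "the wolf likes cows":     (-5.0, -5.5),
    "wolf love cow the":       (-3.9, -9.8),
}
weights = (1.0, 0.8)  # feature weights lambda_1, lambda_2

def score(feats):
    """Weighted sum of feature functions."""
    return sum(lam * h for lam, h in zip(weights, feats))

best = max(candidates, key=lambda e: score(candidates[e]))
print(best)   # the candidate with the highest weighted score
```

Note that no normalisation constant is needed to take the argmax, which is what makes the model effectively linear at decoding time.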
%In practice, \citet{och-ney:2002:ACL} do not use \autoref{eq:loglinearModel}.
%They introduce several latent variables in a so called \emph{alignment template}
%approach. The translation model is defined in \autoref{eq:alignmentTemplate}:
%
%\begin{equation}
% p(e_1^I \mid f_1^J) = \sum_{z_1^K, a_1^K} p(a_1^K \mid e_1^I) p(z_1^K \mid a_1^K, e_1^I) p(f_1^J \mid z_1^K, a_1^K, e_1^I)
% \label{eq:alignmentTemplate}
%\end{equation}
%
% TODOFINAL check if it is "correct" to invert the notation in above equation wrt to original publication
%where the variables $z_1^K$ and $a_1^K$ are alignment templates and the alignment of alignment templates.
%Each term in \autoref{eq:alignmentTemplate} is modelled as a maximum entropy model and search
%is carried out using the maximum approximation defined in \autoref{eq:maxApproximation}:
%
% TODONEVER finish this
%\begin{align}
% \hat{e_1^I} &= \argmax_{z_1}
% \label{eq:maxApproximation}
%\end{align}
% TODONEVER talk about training methods: GIS vs MERT
\section{Phrase-Based Translation}
\label{sec:phraseBasedTranslation}
%notes on och et al 1999
%compare word based and phrase based
%e = argmax p(e) p(f|e)
%word based model: each source word (french) assigned exactly one target word (english)
%difficult to model context and also to translate compound words
%model two alignment levels: phrase level alignment between phrases and word level alignment between words within phrases
%word based approach: use an HMM
%p(f_1^J | e_1^I) = sum_{a_1^J} prod_j p(a_j | a_{j - 1}) p(f_j | e_{a_j})
%some restrictions are used (so called monotonicity): alignment jump between a_{j - 1} and a_j can only be 0, 1, 2.
%Q_{e'}(j, e): probability of best partial hypothesis (e_1^i, a_1^j) with e_i = e, e_{i - 1} = e' and a_j = i
%search: mapping j -> (a_j, e_{a_j})
%DP recursion:
%Q_e'(j, e) = p(f_j | e) . max {
% p(0) . Q_e'(j - 1, e),
% p(1) . max_e'' {p(e | e', e'') Q_e''(j - 1, e')},
% p(2) . max_{e'', e'''} {p(e | e', e'') . p(e' | e'', e''') . Q_e'''(j - 1, e'')}
% }
%optimal translation: max_e',e Q_e'(J, e) . p(\$ | e, e')
%extension to one-to-many alignment model.
%solution: reverse translation direction, then extend English vocab with multiple words, then redo standard training for original translation direction.
%alignment template approach
%problem with word based models: only allow one to many or many to one, or many to many but hacky solution
%model phrase to phrase is a way to model context.
%alignment template z: triple (Ftilda, Etilda, Atilda) alignment Atilda between source class sequence Ftilda and
%target class sequence Etilda.
%Atilda: matrix with binary values
%Atilda allows for many to many
%Ftilda and Etilda automatically trained bilingual classes
%use classes for better generalization
%alignment template applicable to sequence source words ftilda if
%alignment template classes and classes of source words are equal
%application of alignment template contraints the target words to have the right classes
%selection target words: p(etilda | z, ftilda)
%p(ftilda | (Ftilda, Etilda, Atilda), etilda) = delta(classes(etilda), Etilda) delta(classes(ftilda), Ftilda) prod_{j=1}^J (I???) p(f_j | Atilda, etilda)
%p(f_j | Atilda, etilda) = sum_{i = 0}^I p(i | j; Atilda) p(f_j | e_i)
%p(i | j, Atilda) = Atilda(i, j) / (sum_i Atilda(i, j))
%rien compris
%phrase level alignment
%decompose f_1^J and e_1^I into sequence of phrases
%f_1^J = ftilda_1^K
%assume that there is only one possible segmentation (possibly one of the differences between Och and Koehn)
%p(f_1^J | e_1^I) = p(ftilda_1^K | etilda_1^K)
% = sum_{atilda_1^K} p(atilda_1^K, ftilda_1^K | etilda_1^K)
% = sum_{atilda_1^K} p(atilda_1^K | etilda_1^K) p(ftilda_1^K | atilda_1^K, etilda_1^K)
% = sum_{atilda_1^K} prod_{k = 1}^K p(atilda_k | atilda_1^{k-1}, K) p(ftilda_k | etilda_{atilda_k})
%p(ftilda | etilda) = sum_z p(z| etilda) p(ftilda | z, etilda)
%training: s2t and t2s hmm without the max approximation
%get the viterbi alignments a_1^J and b_1^I
%use the grow diag symmetrisation heuristic.
%estimate bilingual word lexicon p(f|e): n_A(f, e) | n(e)
%train world classes
%extract consistent phrase pairs from alignment
%obtain n(z) of how often alignment template occurs in aligned corpus.
%relative freq estimate: p(z = (Ftilda, Etilda, Atilda) | etilda) = n(z) . delta(classes(etilda), Etilda) / n(classes(etilda))
%decoding search
%objective: argmax_{e_1^I} p(e_1^I p(e_1^I | f_1^J)) (wrong version of source channel model)
%use class-based 5g lm
%preprocessing before translation: filter alignment templates per source sentence, segment source sentence
%segmentation objective: argmax_{ftilda_1...ftilda_k = f_1^J} prod_{k=1}^K max_z p(z | ftilda_k)
%search: produce partial hypotheses with info: last target word, language model state,
%source coverage, last alignment template, position of last target word in alignment template
%instantiation (???), cost so far, backpointer
%integrate future cost because compare hypotheses that cover different parts of the input
%notes on och-ney 2004 (journal paper version of och et al. 1999)
%overview: align, extract, extracted phrases with alignment and word classes are
%called alignment templates
%log-linear model: hat{e_1^I} = argmax_{e_1^I} p(e_1^I | f_1^J)
% = argmax exp(sum_m=1^M lambda_m h_m(e_1^I, f_1^J))/Z
%parameters lambda trained with MLE (same as maximum entropy)
%or trained with MERT
%use latent variable
%p(e_1^I, a_1^J | f_1^J) = (1/Z) . exp(sum_1^M lambda_m h_m(e_1^I, f_1^J, a_1^J))
%description of word alignments etc.
%description of symmetrisation heuristics etc.
%symmetrized Viterbi alignments used to compute translation lexicon
%description of phrase-extract
%alignment templates: replace words with word classes and store
%alignment info for each phrase pair
%alignment template z = (F_1^J', E_1^I', Atilda)
%F_1^J' source class sequence
%E_1^I' target class sequence
%Atilda: alignment between source class seq and target class seq
%automatically train bilingual classes
%notation: etilda, ftilda target and source phrases
%p(z = (F_1^J', E_1^I', Atilda) | ftilda) = N(z) . delta(F_1^J', C(ftilda)) / N(C(ftilda%))
%remove alignment templates with prob less than 0.01
%limit on source phrase: between 4 and 7
%translation model
%f_1^J = ftilda_1^K
%e_1^I = etilda_1^K