import tensorflow as tf

import commons


def transformer_prepare_encoder(inputs, hparams):
  """Prepare one shard of the model for the encoder.

  Args:
    inputs: a Tensor.
    hparams: run hyperparameters

  Returns:
    encoder_input: a Tensor, bottom of encoder stack
    encoder_self_attention_bias: a bias tensor for use in encoder self-attention
    encoder_decoder_attention_bias: a bias tensor for use in encoder-decoder
      attention
  """
  encoder_input = inputs
  encoder_padding = commons.embedding_to_padding(encoder_input)
  ignore_padding = commons.attention_bias_ignore_padding(encoder_padding)
  encoder_self_attention_bias = ignore_padding
  encoder_decoder_attention_bias = ignore_padding
  encoder_input = commons.add_timing_signal_1d(encoder_input)
  return (encoder_input, encoder_self_attention_bias,
          encoder_decoder_attention_bias)
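

# Usage sketch (not part of the original file): `inputs` is assumed to be an
# already-embedded [batch, length, hidden_size] tensor, where
# commons.embedding_to_padding is expected to flag all-zero positions as
# padding so the returned bias tensors can mask them in the attention logits.
def _example_prepare_encoder(hparams):
  """Hypothetical call showing the expected input shape."""
  inputs = tf.random_normal([2, 7, hparams.hidden_size])  # fake embeddings
  enc_input, enc_self_bias, encdec_bias = transformer_prepare_encoder(
      inputs, hparams)
  return enc_input, enc_self_bias, encdec_bias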


def transformer_encoder(encoder_input,
                        encoder_self_attention_bias,
                        hparams,
                        name="encoder",
                        make_image_summary=True):
  """A stack of transformer layers.

  Args:
    encoder_input: a Tensor
    encoder_self_attention_bias: bias Tensor for self-attention
    hparams: hyperparameters for model
    name: a string
    make_image_summary: Whether to make an attention image summary.

  Returns:
    y: a Tensor
  """
  x = encoder_input
  with tf.variable_scope(name):
    for layer in range(hparams.num_encoder_layers or
                       hparams.num_hidden_layers):
      with tf.variable_scope("layer_%d" % layer):
        with tf.variable_scope("self_attention"):
          y = commons.multihead_attention(
              commons.layer_preprocess(x, hparams),
              None,
              encoder_self_attention_bias,
              hparams.attention_key_channels or hparams.hidden_size,
              hparams.attention_value_channels or hparams.hidden_size,
              hparams.hidden_size,
              hparams.num_heads,
              hparams.attention_dropout,
              make_image_summary=make_image_summary)
          x = commons.layer_postprocess(x, y, hparams)
        with tf.variable_scope("ffn"):
          y = transformer_ffn_layer(
              commons.layer_preprocess(x, hparams), hparams)
          x = commons.layer_postprocess(x, y, hparams)
  # If normalization is done in layer_preprocess, then it should also be done
  # on the output, since the output can grow very large, being the sum of
  # a whole stack of unnormalized layer outputs.
  return commons.layer_preprocess(x, hparams)
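

# A minimal encode-side sketch (an assumption, not the project's own driver
# code): wires transformer_prepare_encoder into transformer_encoder. It relies
# only on functions defined in this file plus the commons helpers they call.
def _example_encode(inputs, hparams):
  """Returns the encoder output and the encoder-decoder attention bias."""
  enc_input, enc_self_bias, encdec_bias = transformer_prepare_encoder(
      inputs, hparams)
  enc_output = transformer_encoder(enc_input, enc_self_bias, hparams)
  return enc_output, encdec_bias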


def transformer_prepare_decoder(targets, hparams, features=None):
  """Prepare one shard of the model for the decoder.

  Args:
    targets: a Tensor.
    hparams: run hyperparameters
    features: optionally pass the entire features dictionary as well.
      This is needed now for "packed" datasets.

  Returns:
    decoder_input: a Tensor, bottom of decoder stack
    decoder_self_attention_bias: a bias tensor for use in decoder self-attention
  """
  decoder_self_attention_bias = (
      commons.attention_bias_lower_triangle(
          commons.shape_list(targets)[1]))
  decoder_input = commons.shift_right_3d(targets)
  decoder_input = commons.add_timing_signal_1d(decoder_input)
  return (decoder_input, decoder_self_attention_bias)
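

# Usage sketch (hypothetical): for teacher-forced training the embedded gold
# targets are passed in directly; transformer_prepare_decoder shifts them
# right by one position and builds a lower-triangular bias so no position can
# attend to later ones.
def _example_prepare_decoder(targets, hparams):
  """Hypothetical call; `targets` is an embedded [batch, length, hidden] Tensor."""
  dec_input, dec_self_bias = transformer_prepare_decoder(targets, hparams)
  return dec_input, dec_self_bias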


def transformer_decoder(decoder_input,
                        encoder_output,
                        decoder_self_attention_bias,
                        encoder_decoder_attention_bias,
                        hparams,
                        cache=None,
                        name="decoder",
                        make_image_summary=True):
  """A stack of transformer layers.

  Args:
    decoder_input: a Tensor
    encoder_output: a Tensor
    decoder_self_attention_bias: bias Tensor for self-attention
    encoder_decoder_attention_bias: bias Tensor for encoder-decoder attention
    hparams: hyperparameters for model
    cache: dict, containing tensors which are the results of previous
      attentions, used for fast decoding.
    name: a string
    make_image_summary: Whether to make an attention image summary.

  Returns:
    y: a Tensor
  """
  x = decoder_input
  with tf.variable_scope(name):
    for layer in range(hparams.num_decoder_layers or
                       hparams.num_hidden_layers):
      layer_name = "layer_%d" % layer
      layer_cache = cache[layer_name] if cache is not None else None
      with tf.variable_scope(layer_name):
        with tf.variable_scope("self_attention"):
          y = commons.multihead_attention(
              commons.layer_preprocess(x, hparams),
              None,
              decoder_self_attention_bias,
              hparams.attention_key_channels or hparams.hidden_size,
              hparams.attention_value_channels or hparams.hidden_size,
              hparams.hidden_size,
              hparams.num_heads,
              hparams.attention_dropout,
              cache=layer_cache,
              make_image_summary=make_image_summary)
          x = commons.layer_postprocess(x, y, hparams)
        if encoder_output is not None:
          with tf.variable_scope("encdec_attention"):
            y = commons.multihead_attention(
                commons.layer_preprocess(x, hparams),
                encoder_output,
                encoder_decoder_attention_bias,
                hparams.attention_key_channels or hparams.hidden_size,
                hparams.attention_value_channels or hparams.hidden_size,
                hparams.hidden_size,
                hparams.num_heads,
                hparams.attention_dropout,
                cache=layer_cache,
                make_image_summary=make_image_summary)
            x = commons.layer_postprocess(x, y, hparams)
        with tf.variable_scope("ffn"):
          y = transformer_ffn_layer(
              commons.layer_preprocess(x, hparams), hparams)
          x = commons.layer_postprocess(x, y, hparams)
  # If normalization is done in layer_preprocess, then it should also be done
  # on the output, since the output can grow very large, being the sum of
  # a whole stack of unnormalized layer outputs.
  return commons.layer_preprocess(x, hparams)
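

# End-to-end wiring sketch (an assumption about how these pieces are combined,
# based only on the signatures above; the real entry point lives elsewhere in
# the project). Embedding, loss computation and beam-search decoding are out
# of scope here.
def _example_transformer_body(inputs, targets, hparams):
  """Runs a full encode-decode pass for teacher-forced training."""
  enc_input, enc_self_bias, encdec_bias = transformer_prepare_encoder(
      inputs, hparams)
  enc_output = transformer_encoder(enc_input, enc_self_bias, hparams)
  dec_input, dec_self_bias = transformer_prepare_decoder(
      targets, hparams, features=None)
  dec_output = transformer_decoder(dec_input, enc_output, dec_self_bias,
                                   encdec_bias, hparams)
  return dec_output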


def transformer_ffn_layer(x, hparams):
  """Feed-forward layer in the transformer.

  Args:
    x: a Tensor of shape [batch_size, length, hparams.hidden_size]
    hparams: hyperparameters for model

  Returns:
    a Tensor of shape [batch_size, length, hparams.hidden_size]
  """
  x = tf.layers.dense(
      x, hparams.filter_size, activation=tf.nn.relu, name="conv1")
  if hparams.relu_dropout != 0.0:
    x = tf.nn.dropout(x, 1.0 - hparams.relu_dropout)
  x = tf.layers.dense(x, hparams.hidden_size, name="conv2")
  return x
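

# transformer_ffn_layer expands each position from hidden_size to filter_size
# and projects back. The class below is a hypothetical stand-in for the run's
# hparams object, listing only the attributes this file reads directly; the
# values are illustrative, and commons.layer_preprocess/layer_postprocess may
# require additional fields (e.g. layer-norm and dropout settings) not visible
# from this module.
class _ExampleHParams(object):
  """Hypothetical hyperparameter container for the functions above."""
  num_hidden_layers = 6
  num_encoder_layers = 0        # 0 falls back to num_hidden_layers
  num_decoder_layers = 0        # 0 falls back to num_hidden_layers
  hidden_size = 512
  num_heads = 8
  attention_key_channels = 0    # 0 falls back to hidden_size
  attention_value_channels = 0  # 0 falls back to hidden_size
  attention_dropout = 0.1
  filter_size = 2048            # inner width of transformer_ffn_layer
  relu_dropout = 0.1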