object_motion_nets.py
# coding=utf-8
# Copyright 2021 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Networks for motion estimation."""
import tensorflow.compat.v1 as tf
from depth_and_motion_learning import maybe_summary
from tensorflow.contrib import framework as contrib_framework
from tensorflow.contrib import layers as contrib_layers
layers = contrib_layers
arg_scope = contrib_framework.arg_scope
def motion_vector_net(images, weight_reg, predict_intrinsics=True):
"""Predict object-motion vectors from a stack of frames or embeddings.
Args:
images: Input tensor with shape [B, h, w, 2c], containing two
depth-concatenated images.
weight_reg: A float scalar, the amount of weight regularization.
predict_intrinsics: A boolean, if True the network will predict the
intrinsic matrix as well.
Returns:
A tuple of 3 tf.Tensors, (rotation, translation, intrinsic_mat), of shapes
    [B, 3], [B, 3] and [B, 3, 3] respectively, representing rotation angles,
    translation vectors, and the predicted intrinsic matrix. If
    predict_intrinsics is False, None is returned in place of the intrinsic
    matrix.
"""
with tf.variable_scope('MotionVectorNet'):
with arg_scope([layers.conv2d],
weights_regularizer=layers.l2_regularizer(weight_reg),
activation_fn=tf.nn.relu,
stride=2):
conv1 = layers.conv2d(images, 16, [7, 7], scope='Conv1')
conv2 = layers.conv2d(conv1, 32, [5, 5], scope='Conv2')
conv3 = layers.conv2d(conv2, 64, [3, 3], scope='Conv3')
conv4 = layers.conv2d(conv3, 128, [3, 3], scope='Conv4')
conv5 = layers.conv2d(conv4, 256, [3, 3], scope='Conv5')
conv6 = layers.conv2d(conv5, 256, [3, 3], scope='Conv6')
conv7 = layers.conv2d(conv6, 256, [3, 3], scope='Conv7')
bottleneck = tf.reduce_mean(conv7, axis=[1, 2], keepdims=True)
with arg_scope([layers.conv2d],
biases_initializer=None,
activation_fn=None,
stride=1):
rotation = layers.conv2d(bottleneck, 3, [1, 1], scope='Rotation')
translation = layers.conv2d(bottleneck, 3, [1, 1], scope='Translation')
rotation = tf.squeeze(rotation, axis=(1, 2))
translation = tf.squeeze(translation, axis=(1, 2))
image_height, image_width = tf.unstack(tf.shape(images)[1:3])
rot_scale, trans_scale = create_scales(0.001)
if predict_intrinsics:
intrinsic_mat = add_intrinsics_head(bottleneck, image_height, image_width)
return rotation * rot_scale, translation * trans_scale, intrinsic_mat
    # Return None for the intrinsics so that callers always unpack three values.
return rotation * rot_scale, translation * trans_scale, None
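
# Illustrative usage sketch (not part of the original file): how
# motion_vector_net might be called on a pair of RGB frames concatenated along
# the channel axis. The input resolution [128, 416] and the weight_reg value
# are assumptions chosen only for illustration.
def _example_motion_vector_net_usage():
  # Two depth-concatenated RGB frames: 3 + 3 = 6 channels.
  image_pair = tf.placeholder(tf.float32, shape=[None, 128, 416, 6])
  rotation, translation, intrinsic_mat = motion_vector_net(
      image_pair, weight_reg=1e-4, predict_intrinsics=True)
  # rotation, translation: [B, 3]; intrinsic_mat: [B, 3, 3].
  return rotation, translation, intrinsic_mat
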
def add_intrinsics_head(bottleneck, image_height, image_width):
"""Adds a head the preficts camera intrinsics.
Args:
    bottleneck: A tf.Tensor of shape [B, 1, 1, C], typically the bottleneck
      features of a network.
    image_height: A scalar tf.Tensor or a Python scalar, the image height in
      pixels.
    image_width: A scalar tf.Tensor or a Python scalar, the image width in
      pixels.
  image_height and image_width are used to provide the right scale for the
  focal length and the offset parameters.
Returns:
    A tf.Tensor of shape [B, 3, 3] and type float32, containing the intrinsic
    matrix: (fx, 0, x0), (0, fy, y0), (0, 0, 1).
"""
with tf.variable_scope('CameraIntrinsics'):
# Since the focal lengths in pixels tend to be in the order of magnitude of
# the image width and height, we multiply the network prediction by them.
focal_lengths = tf.squeeze(
layers.conv2d(
bottleneck,
2, [1, 1],
stride=1,
activation_fn=tf.nn.softplus,
weights_regularizer=None,
scope='foci'),
axis=(1, 2)) * tf.to_float(
tf.convert_to_tensor([[image_width, image_height]]))
    # The pixel offsets tend to be around the center of the image, and they
    # are typically a fraction of the image width and height in pixels. We thus
    # multiply the network prediction by the width and height; the additional
    # 0.5 places them by default at the center of the image.
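    # For example, with image_width=416 and image_height=128, a raw network
    # prediction of (0, 0) yields offsets of (208, 64), the image center.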
offsets = (tf.squeeze(
layers.conv2d(
bottleneck,
2, [1, 1],
stride=1,
activation_fn=None,
weights_regularizer=None,
biases_initializer=None,
scope='offsets'),
axis=(1, 2)) + 0.5) * tf.to_float(
tf.convert_to_tensor([[image_width, image_height]]))
foci = tf.linalg.diag(focal_lengths)
maybe_summary.scalar('foci', tf.reduce_mean(foci))
maybe_summary.scalar('offsets', tf.reduce_mean(offsets))
intrinsic_mat = tf.concat([foci, tf.expand_dims(offsets, -1)], axis=2)
batch_size = tf.shape(bottleneck)[0]
last_row = tf.tile([[[0.0, 0.0, 1.0]]], [batch_size, 1, 1])
intrinsic_mat = tf.concat([intrinsic_mat, last_row], axis=1)
return intrinsic_mat
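
# Illustrative sketch (not part of the original file): reading the individual
# intrinsic parameters back out of the [B, 3, 3] matrix produced by
# add_intrinsics_head, following the (fx, 0, x0), (0, fy, y0), (0, 0, 1)
# layout documented above.
def _example_unpack_intrinsics(intrinsic_mat):
  fx = intrinsic_mat[:, 0, 0]  # Focal length along x, in pixels.
  fy = intrinsic_mat[:, 1, 1]  # Focal length along y, in pixels.
  x0 = intrinsic_mat[:, 0, 2]  # Principal point x offset, in pixels.
  y0 = intrinsic_mat[:, 1, 2]  # Principal point y offset, in pixels.
  return fx, fy, x0, y0
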
def motion_field_net(images,
weight_reg=0.0,
align_corners=True,
auto_mask=False):
"""Predict object-motion vectors from a stack of frames or embeddings.
Args:
images: Input tensor with shape [B, h, w, 2c], containing two
depth-concatenated images.
weight_reg: A float scalar, the amount of weight regularization.
align_corners: align_corners in resize_bilinear. Only used in version 2.
    auto_mask: True to automatically mask out the residual translations by
      thresholding on their mean values.
Returns:
    A tuple of 4 tf.Tensors:
    rotation: [B, 3], global rotation angles.
    background_translation: [B, 1, 1, 3], global translation vectors.
    residual_translation: [B, h, w, 3], residual translation vector field. The
      overall translation field is background_translation + residual_translation.
    intrinsic_mat: [B, 3, 3], the predicted camera intrinsic matrix.
"""
with tf.variable_scope('MotionFieldNet'):
with arg_scope([layers.conv2d],
weights_regularizer=layers.l2_regularizer(weight_reg),
activation_fn=tf.nn.relu):
conv1 = layers.conv2d(images, 16, [3, 3], stride=2, scope='Conv1')
conv2 = layers.conv2d(conv1, 32, [3, 3], stride=2, scope='Conv2')
conv3 = layers.conv2d(conv2, 64, [3, 3], stride=2, scope='Conv3')
conv4 = layers.conv2d(conv3, 128, [3, 3], stride=2, scope='Conv4')
conv5 = layers.conv2d(conv4, 256, [3, 3], stride=2, scope='Conv5')
conv6 = layers.conv2d(conv5, 512, [3, 3], stride=2, scope='Conv6')
conv7 = layers.conv2d(conv6, 1024, [3, 3], stride=2, scope='Conv7')
bottleneck = tf.reduce_mean(conv7, axis=[1, 2], keepdims=True)
background_motion = layers.conv2d(
bottleneck,
6, [1, 1],
stride=1,
activation_fn=None,
biases_initializer=None,
scope='background_motion')
rotation = background_motion[:, 0, 0, :3]
background_translation = background_motion[:, :, :, 3:]
residual_translation = layers.conv2d(
background_motion,
3, [1, 1],
stride=1,
activation_fn=None,
scope='unrefined_residual_translation')
residual_translation = _refine_motion_field(
residual_translation, conv7, align_corners, scope='Refine7')
residual_translation = _refine_motion_field(
residual_translation, conv6, align_corners, scope='Refine6')
residual_translation = _refine_motion_field(
residual_translation, conv5, align_corners, scope='Refine5')
residual_translation = _refine_motion_field(
residual_translation, conv4, align_corners, scope='Refine4')
residual_translation = _refine_motion_field(
residual_translation, conv3, align_corners, scope='Refine3')
residual_translation = _refine_motion_field(
residual_translation, conv2, align_corners, scope='Refine2')
residual_translation = _refine_motion_field(
residual_translation, conv1, align_corners, scope='Refine1')
residual_translation = _refine_motion_field(
residual_translation, images, align_corners, scope='RefineImages')
rot_scale, trans_scale = create_scales(0.001)
background_translation *= trans_scale
residual_translation *= trans_scale
rotation *= rot_scale
if auto_mask:
sq_residual_translation = tf.sqrt(
tf.reduce_sum(residual_translation**2, axis=3, keepdims=True))
mean_sq_residual_translation = tf.reduce_mean(
sq_residual_translation, axis=[0, 1, 2])
      # A mask of shape [B, h, w, 1]: 1.0 where the per-pixel residual
      # translation norm exceeds its mean (over batch and spatial dimensions),
      # 0.0 elsewhere.
mask_residual_translation = tf.cast(
sq_residual_translation > mean_sq_residual_translation,
residual_translation.dtype.base_dtype)
residual_translation *= mask_residual_translation
image_height, image_width = tf.unstack(tf.shape(images)[1:3])
intrinsic_mat = add_intrinsics_head(bottleneck, image_height, image_width)
return (rotation, background_translation, residual_translation,
intrinsic_mat)
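
# Illustrative usage sketch (not part of the original file): composing the
# outputs of motion_field_net into a full per-pixel translation field. The
# input resolution [128, 416] and the weight_reg value are assumptions chosen
# only for illustration.
def _example_motion_field_net_usage():
  image_pair = tf.placeholder(tf.float32, shape=[None, 128, 416, 6])
  rotation, background_translation, residual_translation, intrinsic_mat = (
      motion_field_net(image_pair, weight_reg=1e-4, auto_mask=True))
  # [B, 1, 1, 3] broadcasts against [B, h, w, 3] to give the overall field.
  translation_field = background_translation + residual_translation
  return rotation, translation_field, intrinsic_mat
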
def create_scales(constraint_minimum):
"""Creates variables representing rotation and translation scaling factors.
Args:
constraint_minimum: A scalar, the variables will be constrained to not fall
below it.
Returns:
Two scalar variables, rotation and translation scale.
"""
def constraint(x):
return tf.nn.relu(x - constraint_minimum) + constraint_minimum
with tf.variable_scope('Scales', initializer=0.01, constraint=constraint):
rot_scale = tf.get_variable('rotation')
trans_scale = tf.get_variable('translation')
maybe_summary.scalar('rotation', rot_scale)
maybe_summary.scalar('translation', trans_scale)
return rot_scale, trans_scale
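
# Illustrative sketch (not part of the original file): the constraint above is
# relu(x - constraint_minimum) + constraint_minimum, which leaves values above
# the minimum unchanged and clamps anything below it back to the minimum, so
# the learned scales stay strictly positive.
def _example_create_scales_usage():
  rot_scale, trans_scale = create_scales(0.001)
  # Both variables start at 0.01 (the scope-level initializer) and are
  # trainable, but any update below 0.001 is projected back to 0.001.
  return rot_scale, trans_scale
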
def _refine_motion_field(motion_field, layer, align_corners, scope=None):
"""Refines a motion field using features from another layer.
This function builds an element of a UNet-like architecture. `motion_field`
has a lower spatial resolution than `layer`. First motion_field is resized to
`layer`'s spatial resolution using bilinear interpolation, then convolutional
filters are applied on `layer` and the result is added to the upscaled
`motion_field`.
This scheme is inspired by FlowNet (https://arxiv.org/abs/1504.06852), and the
  realization that keeping the bottlenecks at the same (low) dimension as the
motion field will pressure the network to gradually transfer details from
depth channels to space.
  The specifics are slightly different from FlowNet: we use two parallel towers,
  a 3x3 convolution, and two successive 3x3 convolutions, as opposed to one
  3x3 convolution in FlowNet. Also, we add the result to the upscaled
`motion_field`, forming a residual connection, unlike FlowNet. These changes
seemed to improve the depth prediction metrics, but exploration was far from
exhaustive.
Args:
motion_field: a tf.Tensor of shape [B, h1, w1, m]. m is the number of
dimensions in the motion field, for example, 3 in case of a 3D translation
field.
layer: tf.Tensor of shape [B, h2, w2, c].
align_corners: align_corners in resize_bilinear.
scope: the variable scope.
Returns:
A tf.Tensor of shape [B, h2, w2, m], obtained by upscaling motion_field to
h2, w2, and mixing it with layer using a few convolutions.
"""
with tf.variable_scope(scope):
_, h, w, _ = tf.unstack(tf.shape(layer))
# Only align_corners=True is supported on TPU
upsampled_motion_field = tf.image.resize_bilinear(
motion_field, [h, w], align_corners=align_corners)
conv_input = tf.concat([upsampled_motion_field, layer], axis=3)
# pyformat: disable
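    # Two parallel towers, as described in the docstring: a single 3x3
    # convolution (conv_output), and two successive 3x3 convolutions (the
    # reassigned conv_input followed by conv_output2), both starting from the
    # same concatenated input.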
conv_output = layers.conv2d(
conv_input, max(4, layer.shape.as_list()[-1]), [3, 3], stride=1)
conv_input = layers.conv2d(
conv_input, max(4, layer.shape.as_list()[-1]), [3, 3], stride=1)
conv_output2 = layers.conv2d(
conv_input, max(4, layer.shape.as_list()[-1]), [3, 3], stride=1)
# pyformat: enable
conv_output = tf.concat([conv_output, conv_output2], axis=-1)
return upsampled_motion_field + layers.conv2d(
conv_output,
motion_field.shape.as_list()[-1], [1, 1],
stride=1,
activation_fn=None,
biases_initializer=None)
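
# Illustrative sketch (not part of the original file): a single refinement
# step. _refine_motion_field upsamples a coarse motion field to the spatial
# resolution of a feature map and mixes in its features, so chaining it from
# the bottleneck back to the input (as in motion_field_net above)
# progressively restores spatial detail. The shapes below are assumptions
# chosen only for illustration.
def _example_refine_motion_field_usage():
  coarse_field = tf.placeholder(tf.float32, shape=[None, 4, 13, 3])
  features = tf.placeholder(tf.float32, shape=[None, 8, 26, 64])
  refined = _refine_motion_field(
      coarse_field, features, align_corners=True, scope='ExampleRefine')
  # refined has shape [B, 8, 26, 3]: the spatial size of `features` and the
  # channel count of `coarse_field`.
  return refined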