Skip to content

Commit 214495c

Browse files
Batchnorm1d optimized and parameterized point cnn layer in phoneme det
1 parent 190d3b4 commit 214495c

File tree

8 files changed

+88
-49
lines changed

8 files changed

+88
-49
lines changed

Diff for: c_reference/include/conv1d.h

+10-6
Original file line numberDiff line numberDiff line change
@@ -158,18 +158,22 @@ int avgpool1d(float* output_signal, unsigned out_time, const float* input_signal
158158
* @param[in] input_signal pointer to the input signal. size = in_time * in_channels
159159
* @param[in] in_time number of time steps in the input
160160
* @param[in] in_channels number of input channels. The output will have the same number of channels
161-
* @param[in] mean pointer to the mean for the batch normalization, size = in_channels
162-
* @param[in] var pointer to the variance for the batch normalization, size = in_channels
163-
* @param[in] affine whether the affine operations are applied
164-
* @param[in] gamma pointer to the scaling factors for the post-norm affine operation, size = in_channels. Provide Null/0 if affine is False(non-zero)
165-
* @param[in] beta pointer to the offsets for the post-norm affine operation, size = in_channels. Provide Null/0 if affine is False(non-zero)
161+
* @param[in] mean pointer to the mean for the batch normalization, size = in_channels. if affine_config = 2, then pass a NULL/0
162+
* @param[in] var pointer to the variance for the batch normalization, size = in_channels. if affine_config = 2, then pass a NULL/0
163+
* @param[in] affine_config configuration mode selecting how the affine operations are applied (0, 1 or 2)
164+
* if affine_config = 0, then only mean and var are used
165+
* if affine_config = 1, then mean, var, gamma and beta are used for the final computation.
166+
* if affine_config = 2, then only the gamma and beta are used. gamma = original_gamma/sqrt(var), beta = original_beta - gamma * mean/sqrt(var)
167+
* Note: Use affine_config = 2 for faster calculations. The new gamma and beta would need to be pre-computed, stored and passed
168+
* @param[in] gamma pointer to the scaling factors for the post-norm affine operation, size = in_channels. Provide NULL/0 if affine_config is 0
169+
* @param[in] beta pointer to the offsets for the post-norm affine operation, size = in_channels. Provide NULL/0 if affine_config is 0
166170
* @param[in] in_place in-place computation of the batchnorm i.e. the output is stored in-place of the input signal. Storage efficient
167171
* @param[in] eps a very small +ve value to avoid division by 0. For the default value, assign = 0.00001
168172
*/
169173
int batchnorm1d(float* output_signal, float* input_signal,
170174
unsigned in_time, unsigned in_channels,
171175
const float* const mean, const float* const var,
172-
unsigned affine, const float* const gamma , const float* const beta,
176+
unsigned affine_config, const float* const gamma , const float* const beta,
173177
unsigned in_place, float eps);
174178

175179
#endif

Diff for: c_reference/include/dscnn.h

+26-13
Original file line numberDiff line numberDiff line change
@@ -4,18 +4,27 @@
44
#ifndef __DSCNN_H__
55
#define __DSCNN_H__
66

7+
// Function pointer for the Conv layer to be passed as a parameter. (conv1d or conv1d_lr only)
8+
typedef int (*conv_layer)(float*, unsigned, unsigned, const float*,
9+
unsigned, unsigned, unsigned, unsigned,
10+
const void*, unsigned, unsigned);
11+
712
/**
813
* @brief Model definition for the 1D Convolution block applied before the RNN
914
* @brief sub-layers : batchnorm1d -> conv1d_lr
1015
* @param[out] output_signal pointer to the final output signal, minimum size = out_time * in_channels. out_time has to be calculated based on the reduction from all the conv and pool layers
1116
* @param[in] input_signal pointer to the input signal. size = in_time * in_channels
1217
* @param[in] in_time number of time steps in the input_signal
1318
* @param[in] in_channels number of input channels
14-
* @param[in] mean pointer to the mean for the batch normalization, size = in_channels
15-
* @param[in] var pointer to the variance for the batch normalization, size = in_channels
16-
* @param[in] affine whether the affine operations are applied
17-
* @param[in] gamma pointer to the scaling factors for the post-norm affine operation, size = in_channels
18-
* @param[in] beta pointer to the offsets for the post-norm affine operation, size = in_channels
19+
* @param[in] mean pointer to the mean for the batch normalization, size = in_channels. Pass NULL/0 for affine_config = 2
20+
* @param[in] var pointer to the variance for the batch normalization, size = in_channels. Pass NULL/0 for affine_config = 2
21+
* @param[in] affine_config configuration mode selecting how the affine operations are applied (0, 1 or 2)
22+
* if affine_config = 0, then only mean and var are used
23+
* if affine_config = 1, then mean, var, gamma and beta are used for the final computation.
24+
* if affine_config = 2, then only the gamma and beta are used. gamma = original_gamma/sqrt(var), beta = original_beta - gamma * mean/sqrt(var)
25+
* Note: Use affine_config = 2 for faster calculations. The new gamma and beta would need to be pre-computed, stored and passed
26+
* @param[in] gamma pointer to the scaling factors for the post-norm affine operation, size = in_channels. Pass NULL/0 for affine_config = 0
27+
* @param[in] beta pointer to the offsets for the post-norm affine operation, size = in_channels. Pass NULL/0 for affine_config = 0
1928
* @param[in] in_place in-place computation check for the batchnorm. Storage efficient
2029
* @param[in] cnn_hidden hidden state/out_channels dimensions for the low-rank CNN. The final channel size of this block
2130
* @param[in] cnn_padding padding for the low-rank CNN layer. Note: applied to both sides of the input
@@ -31,7 +40,7 @@
3140
int phon_pred_lr_cnn(float* output_signal, float* input_signal,
3241
unsigned in_time, unsigned in_channels,
3342
const float* const mean, const float* const var,
34-
unsigned affine, float* gamma, float* beta, unsigned in_place,
43+
unsigned affine_config, float* gamma, float* beta, unsigned in_place,
3544
unsigned cnn_hidden, unsigned cnn_padding, unsigned cnn_kernel_size,
3645
const void* cnn_params, unsigned cnn_stride, unsigned cnn_activation);
3746

@@ -42,11 +51,15 @@ int phon_pred_lr_cnn(float* output_signal, float* input_signal,
4251
* @param[in] input_signal pointer to the input signal. size = in_time * in_channels
4352
* @param[in] in_time number of time steps in the input
4453
* @param[in] in_channels number of input channels
45-
* @param[in] mean pointer to the mean for the batch normalization, size = in_channels
46-
* @param[in] var pointer to the variance for the batch normalization, size = in_channels
47-
* @param[in] affine whether the affine operations are applied
48-
* @param[in] gamma pointer to the scaling factors for the post-norm affine operation, size = in_channels
49-
* @param[in] beta pointer to the offsets for the post-norm affine operation, size = in_channels
54+
* @param[in] mean pointer to the mean for the batch normalization, size = in_channels. Pass NULL/0 for affine_config = 2
55+
* @param[in] var pointer to the variance for the batch normalization, size = in_channels. Pass NULL/0 for affine_config = 2
56+
* @param[in] affine_config configuration mode selecting how the affine operations are applied (0, 1 or 2)
57+
* if affine_config = 0, then only mean and var are used
58+
* if affine_config = 1, then mean, var, gamma and beta are used for the final computation.
59+
* if affine_config = 2, then only the gamma and beta are used. gamma = original_gamma/sqrt(var), beta = original_beta - gamma * mean/sqrt(var)
60+
* Note: Use affine_config = 2 for faster calculations. The new gamma and beta would need to be pre-computed, stored and passed
61+
* @param[in] gamma pointer to the scaling factors for the post-norm affine operation, size = in_channels. Pass NULL/0 for affine_config = 0
62+
* @param[in] beta pointer to the offsets for the post-norm affine operation, size = in_channels. Pass NULL/0 for affine_config = 0
5063
* @param[in] in_place in-place computation of the batchnorm. Storage efficient
5164
* @param[in] depth_cnn_padding padding for the depth CNN layer. Note: applied to both sides of the input to the depth CNN
5265
* @param[in] depth_cnn_kernel_size kernel size of the depth CNN
@@ -77,9 +90,9 @@ int phon_pred_lr_cnn(float* output_signal, float* input_signal,
7790
* 3: relu
7891
*/
7992
int phon_pred_depth_point_lr_cnn(float* output_signal, float* input_signal,
80-
unsigned in_time, unsigned in_channels,
93+
conv_layer point_cnn, unsigned in_time, unsigned in_channels,
8194
const float* const mean, const float* const var,
82-
unsigned affine, const float* const gamma, const float* const beta, unsigned in_place,
95+
unsigned affine_config, const float* const gamma, const float* const beta, unsigned in_place,
8396
unsigned depth_cnn_padding, unsigned depth_cnn_kernel_size,
8497
const void* depth_cnn_params, unsigned depth_cnn_stride, unsigned depth_cnn_activation,
8598
unsigned point_cnn_hidden, unsigned point_cnn_padding, unsigned point_cnn_kernel_size,

Diff for: c_reference/src/conv1d.c

+28-6
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,7 @@ int avgpool1d(float* output_signal, unsigned out_time, const float* input_signal
171171
unsigned padding, unsigned kernel_size, unsigned stride, unsigned activation) {
172172

173173
// Iterate over the time steps and average them. Similar to Conv1D_Dept with a filter kernel of ones
174+
float scale = 1.0/(float)kernel_size;
174175
for (unsigned t_in = 0, t_out = 0; t_out < out_time; t_out++, t_in += stride) {
175176
for (unsigned ci = 0; ci < in_channels; ci++) {
176177
float sum = 0;
@@ -183,16 +184,16 @@ int avgpool1d(float* output_signal, unsigned out_time, const float* input_signal
183184
}
184185
}
185186
if (activation == 1) {
186-
output_signal[t_out * in_channels + ci] = sigmoid(sum / (float)kernel_size);
187+
output_signal[t_out * in_channels + ci] = sigmoid(sum * scale);
187188
}
188189
else if (activation == 2) {
189-
output_signal[t_out * in_channels + ci] = tanh(sum / (float)kernel_size);
190+
output_signal[t_out * in_channels + ci] = tanh(sum * scale);
190191
}
191192
else if (activation == 3) {
192-
output_signal[t_out * in_channels + ci] = relu(sum / (float)kernel_size);
193+
output_signal[t_out * in_channels + ci] = relu(sum * scale);
193194
}
194195
else {
195-
output_signal[t_out * in_channels + ci] = sum / (float)kernel_size;
196+
output_signal[t_out * in_channels + ci] = sum * scale;
196197
}
197198
}
198199
}
@@ -202,10 +203,10 @@ int avgpool1d(float* output_signal, unsigned out_time, const float* input_signal
202203
int batchnorm1d(float* output_signal, float* input_signal,
203204
unsigned in_time, unsigned in_channels,
204205
const float* const mean, const float* const var,
205-
unsigned affine, const float* const gamma , const float* const beta,
206+
unsigned affine_config, const float* const gamma , const float* const beta,
206207
unsigned in_place, float eps) {
207208
// Check if affine values was learnt
208-
if (affine) {
209+
if (affine_config == 1) {
209210
// Check for in-place computation
210211
if (in_place) {
211212
for (unsigned t = 0; t < in_time; t++) {
@@ -228,6 +229,27 @@ int batchnorm1d(float* output_signal, float* input_signal,
228229
}
229230
}
230231
}
232+
else if (affine_config == 2) {
233+
// Check for in-place computation
234+
if (in_place) {
235+
for (unsigned t = 0; t < in_time; t++) {
236+
for (unsigned d = 0; d < in_channels; d++) {
237+
input_signal[t * in_channels + d] = (gamma[d]
238+
* input_signal[t * in_channels + d])
239+
+ beta[d];
240+
}
241+
}
242+
}
243+
else {
244+
for (unsigned t = 0; t < in_time; t++) {
245+
for (unsigned d = 0; d < in_channels; d++) {
246+
output_signal[t * in_channels + d] = (gamma[d]
247+
* input_signal[t * in_channels + d])
248+
+ beta[d];
249+
}
250+
}
251+
}
252+
}
231253
else {
232254
// Check for in-place computation
233255
if (in_place) {

Diff for: c_reference/src/dscnn.c

+8-8
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
int phon_pred_lr_cnn(float* output_signal, float* input_signal,
1212
unsigned in_time, unsigned in_channels,
1313
const float* const mean, const float* const var,
14-
unsigned affine, float* gamma, float* beta, unsigned in_place,
14+
unsigned affine_config, float* gamma, float* beta, unsigned in_place,
1515
unsigned cnn_hidden, unsigned cnn_padding, unsigned cnn_kernel_size,
1616
const void* cnn_params, unsigned cnn_stride, unsigned cnn_activation) {
1717

@@ -20,7 +20,7 @@ int phon_pred_lr_cnn(float* output_signal, float* input_signal,
2020
// BatchNorm
2121
batchnorm1d(0, input_signal,
2222
in_time, in_channels,
23-
mean, var, affine, gamma, beta,
23+
mean, var, affine_config, gamma, beta,
2424
in_place, 0.00001);
2525
// CNN
2626
conv1d_lr(output_signal, out_time, cnn_hidden, input_signal,
@@ -32,7 +32,7 @@ int phon_pred_lr_cnn(float* output_signal, float* input_signal,
3232
float* norm_out = (float*)malloc(in_time * in_channels * sizeof(float));
3333
batchnorm1d(norm_out, input_signal,
3434
in_time, in_channels,
35-
mean, var, affine, gamma, beta,
35+
mean, var, affine_config, gamma, beta,
3636
in_place, 0.00001);
3737
// CNN
3838
conv1d_lr(output_signal, out_time, cnn_hidden, norm_out,
@@ -44,9 +44,9 @@ int phon_pred_lr_cnn(float* output_signal, float* input_signal,
4444
}
4545

4646
int phon_pred_depth_point_lr_cnn(float* output_signal, float* input_signal,
47-
unsigned in_time, unsigned in_channels,
47+
conv_layer point_cnn, unsigned in_time, unsigned in_channels,
4848
const float* const mean, const float* const var,
49-
unsigned affine, const float* const gamma, const float* const beta, unsigned in_place,
49+
unsigned affine_config, const float* const gamma, const float* const beta, unsigned in_place,
5050
unsigned depth_cnn_padding, unsigned depth_cnn_kernel_size,
5151
const void* depth_cnn_params, unsigned depth_cnn_stride, unsigned depth_cnn_activation,
5252
unsigned point_cnn_hidden, unsigned point_cnn_padding, unsigned point_cnn_kernel_size,
@@ -66,7 +66,7 @@ int phon_pred_depth_point_lr_cnn(float* output_signal, float* input_signal,
6666
batchnorm1d(0, act_out,
6767
in_time, in_channels,
6868
mean, var,
69-
affine, gamma, beta,
69+
affine_config, gamma, beta,
7070
in_place, 0.00001);
7171
// Depth CNN
7272
depth_out = (float*)malloc(out_time * in_channels * sizeof(float));
@@ -81,7 +81,7 @@ int phon_pred_depth_point_lr_cnn(float* output_signal, float* input_signal,
8181
batchnorm1d(norm_out, act_out,
8282
in_time, in_channels,
8383
mean, var,
84-
affine, gamma, beta,
84+
affine_config, gamma, beta,
8585
in_place, 0.00001);
8686
free(act_out);
8787
// Depth CNN
@@ -96,7 +96,7 @@ int phon_pred_depth_point_lr_cnn(float* output_signal, float* input_signal,
9696
in_time = out_time;
9797
out_time = in_time - point_cnn_kernel_size + 2 * point_cnn_padding + 1;
9898
float* point_out = (float*)malloc(out_time * point_cnn_hidden * sizeof(float));
99-
conv1d_lr(point_out, out_time, point_cnn_hidden, depth_out,
99+
point_cnn(point_out, out_time, point_cnn_hidden, depth_out,
100100
in_time, in_channels, point_cnn_padding, point_cnn_kernel_size,
101101
point_cnn_params, point_cnn_stride, point_cnn_activation);
102102
free(depth_out);

Diff for: c_reference/tests/kws/postcnn_params.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:f00e0e07085a9eb22763815dcdf7a0740a00f847892153e1e50f2511244322a2
3-
size 1377207
2+
oid sha256:1f25d6df179c6316dd0832e49b280ef5ff1f30b9c9508fcd8c79f54d4cab9036
3+
size 1395958

Diff for: c_reference/tests/kws/precnn_params.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:d3fef863a8ac348169326a66b7b7af68310c48ba430efccc3d157f4682a2c3be
3-
size 520621
2+
oid sha256:00406a0d0e0ed8fb87d4f10a81728f282ffd959d3a1a218ac0998d86b5cc206b
3+
size 522657

Diff for: c_reference/tests/kws/rnn_params.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:c7611deba4e8b46c5f8ddc3f162d00f94474d9017c42d0e3bbd3606d2cd50d18
3-
size 1303414
2+
oid sha256:54b39b200e68a9f8c73d1d45bb3a2b6094f490a16bf469773a2d982d6a2b2093
3+
size 1312526

Diff for: c_reference/tests/kws/test_phoneme_det_cnn_rnn.c

+10-10
Original file line numberDiff line numberDiff line change
@@ -165,12 +165,12 @@ void phoneme_prediction(float* mem_buf) {
165165
// Use the in-place computation only if the input can be discarded/altered. Else avoid in-place computation for this layer
166166
phon_pred_lr_cnn(cnn1_out, mem_buf,
167167
in_time, PRE_CNN_IN_FEATURES,
168-
BNORM_CNN1_MEAN, BNORM_CNN1_VAR, 0, 0, 0, PRE_CNN_BNORM_INPLACE,
168+
0, 0, PRE_CNN_BNORM_AFFINE, CNN1_SCALE, CNN1_OFFSET, PRE_CNN_BNORM_INPLACE,
169169
PRE_CNN_OUT_FEATURES, PRE_CNN_FILT_PAD, PRE_CNN_FILT,
170170
&conv_params, PRE_CNN_STRIDE, PRE_CNN_FILT_ACT); // regular tanh activation
171171

172172
batchnorm1d(0, cnn1_out, in_time, RNN_IN_FEATURES,
173-
BNORM_RNN_MEAN, BNORM_RNN_VAR, 0, 0, 0, 1, 0.00001); // Currently in-place only and no affine values
173+
0, 0, RNN_BNORM_AFFINE, RNN_SCALE, RNN_OFFSET, 1, 0.00001);
174174

175175
/* Bricked Bi-FastGRNN Block */
176176
out_time = in_time/RNN_HOP + 1;
@@ -194,8 +194,8 @@ void phoneme_prediction(float* mem_buf) {
194194
out_time = out_time - POST_CNN_POOL + (POST_CNN_POOL_PAD << 1) + 1;
195195
float* cnn2_out = (float*)malloc(out_time * POST_CNN_INTER_FEATURES * sizeof(float));
196196
phon_pred_depth_point_lr_cnn(cnn2_out, rnn_out,
197-
in_time, POST_CNN_INTER_FEATURES,
198-
CNN2_BNORM_MEAN, CNN2_BNORM_VAR, 0, 0, 0, POST_CNN_BNORM_INPLACE,
197+
conv1d_lr, in_time, POST_CNN_INTER_FEATURES,
198+
0, 0, POST_CNN_BNORM_AFFINE, CNN2_SCALE, CNN2_OFFSET, POST_CNN_BNORM_INPLACE,
199199
POST_CNN_DEPTH_PAD, POST_CNN_DEPTH_FILT,
200200
&depth_param_2, POST_CNN_DEPTH_STRIDE, POST_CNN_DEPTH_ACT,
201201
POST_CNN_INTER_FEATURES, POST_CNN_POINT_PAD, POST_CNN_POINT_FILT,
@@ -209,8 +209,8 @@ void phoneme_prediction(float* mem_buf) {
209209
out_time = out_time - POST_CNN_POOL + (POST_CNN_POOL_PAD << 1) + 1;
210210
float* cnn3_out = (float*)malloc(out_time * POST_CNN_INTER_FEATURES * sizeof(float));
211211
phon_pred_depth_point_lr_cnn(cnn3_out, cnn2_out,
212-
in_time, POST_CNN_INTER_FEATURES,
213-
CNN3_BNORM_MEAN, CNN3_BNORM_VAR, 0, 0, 0, POST_CNN_BNORM_INPLACE,
212+
conv1d_lr, in_time, POST_CNN_INTER_FEATURES,
213+
0, 0, POST_CNN_BNORM_AFFINE, CNN3_SCALE, CNN3_OFFSET, POST_CNN_BNORM_INPLACE,
214214
POST_CNN_DEPTH_PAD, POST_CNN_DEPTH_FILT,
215215
&depth_param_3, POST_CNN_DEPTH_STRIDE, POST_CNN_DEPTH_ACT,
216216
POST_CNN_INTER_FEATURES, POST_CNN_POINT_PAD, POST_CNN_POINT_FILT,
@@ -224,8 +224,8 @@ void phoneme_prediction(float* mem_buf) {
224224
out_time = out_time - POST_CNN_POOL + (POST_CNN_POOL_PAD << 1) + 1;
225225
float* cnn4_out = (float*)malloc(out_time * POST_CNN_INTER_FEATURES * sizeof(float));
226226
phon_pred_depth_point_lr_cnn(cnn4_out, cnn3_out,
227-
in_time, POST_CNN_INTER_FEATURES,
228-
CNN4_BNORM_MEAN, CNN4_BNORM_VAR, 0, 0, 0, POST_CNN_BNORM_INPLACE,
227+
conv1d_lr, in_time, POST_CNN_INTER_FEATURES,
228+
0, 0, POST_CNN_BNORM_AFFINE, CNN4_SCALE, CNN4_OFFSET, POST_CNN_BNORM_INPLACE,
229229
POST_CNN_DEPTH_PAD, POST_CNN_DEPTH_FILT,
230230
&depth_param_4, POST_CNN_DEPTH_STRIDE, POST_CNN_DEPTH_ACT,
231231
POST_CNN_INTER_FEATURES, POST_CNN_POINT_PAD, POST_CNN_POINT_FILT,
@@ -239,8 +239,8 @@ void phoneme_prediction(float* mem_buf) {
239239
out_time = out_time - POST_CNN_POOL + (POST_CNN_POOL_PAD << 1) + 1;
240240
float* pred = (float*)malloc(out_time * POST_CNN_OUT_FEATURES * sizeof(float));
241241
phon_pred_depth_point_lr_cnn(pred, cnn4_out,
242-
in_time, POST_CNN_INTER_FEATURES,
243-
CNN5_BNORM_MEAN, CNN5_BNORM_VAR, 0, 0, 0, POST_CNN_BNORM_INPLACE,
242+
conv1d_lr, in_time, POST_CNN_INTER_FEATURES,
243+
0, 0, POST_CNN_BNORM_AFFINE, CNN5_SCALE, CNN5_OFFSET, POST_CNN_BNORM_INPLACE,
244244
POST_CNN_DEPTH_PAD, POST_CNN_DEPTH_FILT,
245245
&depth_param_5, POST_CNN_DEPTH_STRIDE, POST_CNN_DEPTH_ACT,
246246
POST_CNN_OUT_FEATURES, POST_CNN_POINT_PAD, POST_CNN_POINT_FILT,

0 commit comments

Comments
 (0)