Skip to content

Commit f98d9d9

Browse files
Change short sequence threshold for parallel versions
1 parent 2b712a1 commit f98d9d9

File tree

2 files changed

+40
-27
lines changed

2 files changed

+40
-27
lines changed

Diff for: c_reference/include/conv1d.h

+17-13
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
99
NOTES for the conv layers
1010
-> The conv1d & conv1d_lr layers work for all cases and can be used unconstrained.
11-
There are no hard constraints for the parallel version, but a points regarding the optimal usage are given below
11+
There are no hard constraints for the parallel version, but a few points regarding its optimal usage are given below
1212
-> Dilation = 1 (no dilation) for all cases
1313
-> For the non-depthwise cases, store the matrices as described below. Permutation might be necessary
1414
-> The low-rank decomposition cannot be applied to the depthwise weight matrices. This is due to the out_channels/in_channels = 0 constraint imposed by the depthwise convolution.
@@ -22,10 +22,10 @@
2222
2323
Important points regarding parallel versions
2424
-> Due to the above reason, the parallel layers is only recommended for large in_time inputs
25-
This should typically be for in_time (without the padding) > 2 * (kernel_size + stride). Else there would not be enough time-steps to efficiently parallelize
26-
For other shorter input cases, the code will skip the MatMul computation and use MatVec instead (but the MatMul-variable computation overhead would remain)
27-
For such cases, the MatVec code (conv1d and conv1d_lr) would work more efficiently
28-
The RAM usage would be lower and the function would not have any overheads (calculation of the iterators and MatMul-auxiliary variables)
25+
This should typically be for in_time (without the padding) > 2 * num_steps_one_row + stride. Else there would not be enough time-steps to efficiently parallelise
26+
We need at least 2 rows for a good MatMul performance. In the worst case the starting time step would be (stride - 1). Hence we choose 2 * num_steps_one_row + stride as the threshold
27+
For the short input cases, the code will skip the MatMul computation and use MatVec instead (but the MatMul-variable computation overhead would remain)
28+
For such cases, the MatVec code (conv1d and conv1d_lr) would work more efficiently due to the lower RAM usage and lack of any major overheads
2929
-> There is no support for depthwise for conv1d_parallel
3030
The regular convolution acts on all the channels while the depthwise acts only on one channel at a time
3131
This results in a non-contiguous memory access. MatMul would need to process multiple such time-steps, while the MatVec would only need to process one
@@ -66,8 +66,9 @@ typedef struct ConvLayers_Params {
6666
* 2: tanh
6767
* 3: relu
6868
*/
69-
int conv1d(float* output_signal, unsigned out_time, unsigned out_channels, const float* input_signal,
70-
unsigned in_time, unsigned in_channels, unsigned padding, unsigned kernel_size,
69+
int conv1d(float* output_signal, unsigned out_time, unsigned out_channels,
70+
const float* input_signal, unsigned in_time, unsigned in_channels,
71+
unsigned padding, unsigned kernel_size,
7172
const void* params, unsigned stride, unsigned activation);
7273

7374
/**
@@ -102,8 +103,9 @@ typedef struct ConvLayers_Parallel_Params {
102103
* 2: tanh
103104
* 3: relu
104105
*/
105-
int conv1d_parallel(float* output_signal, unsigned out_time, unsigned out_channels, const float* input_signal,
106-
unsigned in_time, unsigned in_channels, unsigned padding, unsigned kernel_size,
106+
int conv1d_parallel(float* output_signal, unsigned out_time, unsigned out_channels,
107+
const float* input_signal, unsigned in_time, unsigned in_channels,
108+
unsigned padding, unsigned kernel_size,
107109
const void* params, unsigned stride, unsigned activation);
108110

109111
/**
@@ -141,8 +143,9 @@ typedef struct ConvLayers_LR_Params {
141143
* 2: tanh
142144
* 3: relu
143145
*/
144-
int conv1d_lr(float* output_signal, unsigned out_time, unsigned out_channels, const float* input_signal,
145-
unsigned in_time, unsigned in_channels, unsigned padding, unsigned kernel_size,
146+
int conv1d_lr(float* output_signal, unsigned out_time, unsigned out_channels,
147+
const float* input_signal, unsigned in_time, unsigned in_channels,
148+
unsigned padding, unsigned kernel_size,
146149
const void* params, unsigned stride, unsigned activation);
147150

148151
/**
@@ -184,8 +187,9 @@ typedef struct ConvLayers_LR_Parallel_Params {
184187
* 2: tanh
185188
* 3: relu
186189
*/
187-
int conv1d_lr_parallel(float* output_signal, unsigned out_time, unsigned out_channels, const float* input_signal,
188-
unsigned in_time, unsigned in_channels, unsigned padding, unsigned kernel_size,
190+
int conv1d_lr_parallel(float* output_signal, unsigned out_time, unsigned out_channels,
191+
const float* input_signal, unsigned in_time, unsigned in_channels,
192+
unsigned padding, unsigned kernel_size,
189193
const void* params, unsigned stride, unsigned activation);
190194

191195
// Auxiliary Layers

Diff for: c_reference/src/conv1d.c

+23-14
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,9 @@
77
#include "conv1d.h"
88
#include "utils.h"
99

10-
int conv1d_lr(float* output_signal, unsigned out_time, unsigned out_channels, const float* input_signal,
11-
unsigned in_time, unsigned in_channels, unsigned padding, unsigned kernel_size,
10+
int conv1d_lr(float* output_signal, unsigned out_time, unsigned out_channels,
11+
const float* input_signal, unsigned in_time, unsigned in_channels,
12+
unsigned padding, unsigned kernel_size,
1213
const void* params, unsigned stride, unsigned activation) {
1314

1415
const ConvLayers_LR_Params* tparams= (ConvLayers_LR_Params*) params;
@@ -96,8 +97,9 @@ int conv1d_lr(float* output_signal, unsigned out_time, unsigned out_channels, co
9697
return 0;
9798
}
9899

99-
int conv1d_lr_parallel(float* output_signal, unsigned out_time, unsigned out_channels, const float* input_signal,
100-
unsigned in_time, unsigned in_channels, unsigned padding, unsigned kernel_size,
100+
int conv1d_lr_parallel(float* output_signal, unsigned out_time, unsigned out_channels,
101+
const float* input_signal, unsigned in_time, unsigned in_channels,
102+
unsigned padding, unsigned kernel_size,
101103
const void* params, unsigned stride, unsigned activation) {
102104

103105
unsigned ncols = kernel_size * in_channels, num_iter = 0, num_steps_one_row = 0;
@@ -112,7 +114,10 @@ int conv1d_lr_parallel(float* output_signal, unsigned out_time, unsigned out_cha
112114
// Perform the convolution. Zero-pad is from 0 to padding and in_time + padding to in_time + 2 * padding
113115
// Buffer to hold the output. For corner cases, this will be relatively big.
114116
// But will be needed for the central condition (filter inside input).
115-
unsigned buffer_steps = in_time / num_steps_one_row, rank = tparams->rank;
117+
// If there are not enough time steps to linearise into one row, then allocate only 1 time step
118+
unsigned buffer_steps = ((in_time / num_steps_one_row) > 1) ?
119+
in_time / num_steps_one_row : 1;
120+
unsigned rank = tparams->rank;
116121
// Buffer for W2 out
117122
float* temp_rank_out = (float*)malloc(buffer_steps * rank * sizeof(float));
118123
// Buffer for W1 out
@@ -147,9 +152,9 @@ int conv1d_lr_parallel(float* output_signal, unsigned out_time, unsigned out_cha
147152
// Hence we use the num_steps_one_row for calculating the number of time steps to be linearized in one row
148153
// Using the above logic, we can convert the MatVec operation into a MatMul operation
149154
// Ideally both implementations would be the same. However for edge devices the matMul was found to be faster than matVec (both tiled)
150-
// Skip if atleast 2 rows cannot be formed. The condition 2 * (kernel_size + stride) is the worst case criteria to form 2 rows
155+
// Skip if at least 2 rows cannot be formed. The condition 2 * num_steps_one_row + stride is the worst case criterion
151156
// The MatVec will be used for the computation in case the following block is skipped
152-
if (in_time > ((kernel_size + stride) << 1)) {
157+
if (in_time > ((num_steps_one_row << 1) + stride)) {
153158
t_in_start -= padding; // remove the padding offset temporarily
154159
t_in_end -= padding; // Used to keep track of the final processed index
155160
for (unsigned iter = 0; (iter < num_iter) && (t_out < out_channels);
@@ -251,8 +256,9 @@ int conv1d_lr_parallel(float* output_signal, unsigned out_time, unsigned out_cha
251256
return 0;
252257
}
253258

254-
int conv1d(float* output_signal, unsigned out_time, unsigned out_channels, const float* input_signal,
255-
unsigned in_time, unsigned in_channels, unsigned padding, unsigned kernel_size,
259+
int conv1d(float* output_signal, unsigned out_time, unsigned out_channels,
260+
const float* input_signal, unsigned in_time, unsigned in_channels,
261+
unsigned padding, unsigned kernel_size,
256262
const void* params, unsigned stride, unsigned activation) {
257263

258264
const ConvLayers_Params* tparams= (ConvLayers_Params*) params;
@@ -331,8 +337,9 @@ int conv1d(float* output_signal, unsigned out_time, unsigned out_channels, const
331337
return 0;
332338
}
333339

334-
int conv1d_parallel(float* output_signal, unsigned out_time, unsigned out_channels, const float* input_signal,
335-
unsigned in_time, unsigned in_channels, unsigned padding, unsigned kernel_size,
340+
int conv1d_parallel(float* output_signal, unsigned out_time, unsigned out_channels,
341+
const float* input_signal, unsigned in_time, unsigned in_channels,
342+
unsigned padding, unsigned kernel_size,
336343
const void* params, unsigned stride, unsigned activation) {
337344

338345
unsigned ncols = kernel_size * in_channels, num_iter = 0, num_steps_one_row = 0;
@@ -347,7 +354,9 @@ int conv1d_parallel(float* output_signal, unsigned out_time, unsigned out_channe
347354
// Perform the Convolution. Pad is from 0 to padding and in_time + padding to in_time + 2 * padding
348355
// Buffer to hold the output. For corner cases, this will be relatively big.
349356
// But will be needed for the central condition (filter inside input).
350-
unsigned buffer_steps = in_time / num_steps_one_row;
357+
// If there are not enough time steps to linearise into one row, then allocate only 1 time step
358+
unsigned buffer_steps = ((in_time / num_steps_one_row) > 1) ?
359+
in_time / num_steps_one_row : 1;
351360
float* temp_out = (float*)malloc(buffer_steps * out_channels * sizeof(float));
352361
unsigned t_in_start, t_in_end, t_out; // Values are needed outside the loops. Hence declared here
353362
for (t_in_start = 0, t_in_end = kernel_size - 1, t_out = 0;
@@ -375,9 +384,9 @@ int conv1d_parallel(float* output_signal, unsigned out_time, unsigned out_channe
375384
// Hence we use the num_steps_one_row for calculating the number of time steps to be linearized in one row
376385
// Using the above logic, we can convert the MatVec operation into a MatMul operation
377386
// Ideally both implementations would be the same. However for edge devices the matMul was found to be faster than matVec (both tiled)
378-
// Skip if atleast 2 rows cannot be formed. The condition 2 * (kernel_size + stride) is the worst case criteria to form 2 rows
387+
// Skip if at least 2 rows cannot be formed. The condition 2 * num_steps_one_row + stride is the worst case criterion
379388
// The MatVec will be used for the computation in case the following block is skipped
380-
if (in_time > ((kernel_size + stride) << 1)) {
389+
if (in_time > ((num_steps_one_row << 1) + stride)) {
381390
t_in_start -= padding; // remove the padding offset temporarily
382391
t_in_end -= padding; // Used to keep track of the final processed index
383392
for (unsigned iter = 0; (iter < num_iter) && (t_out < out_channels);

0 commit comments

Comments
 (0)