* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+ #include <NvInfer.h>
#include "cudaWrapper.h"
#include "ioHelper.h"
- #include <NvInfer.h>
#include <NvOnnxParser.h>
#include <algorithm>
#include <cassert>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
+ #include <numeric>
+ #include <cmath> // provides round() and exp(); supersedes the C header <math.h>

using namespace nvinfer1;
using namespace std;
@@ -46,52 +49,49 @@ constexpr double ABS_EPSILON = 0.005;
// Maximum relative tolerance for output tensor comparison against reference.
constexpr double REL_EPSILON = 0.05;

- ICudaEngine* createCudaEngine(string const& onnxModelPath, int batchSize)
+ nvinfer1::ICudaEngine* createCudaEngine(string const& onnxModelPath, int batchSize)
{
- unique_ptr<IBuilder, Destroy<IBuilder>> builder{createInferBuilder(gLogger)};
- unique_ptr<INetworkDefinition, Destroy<INetworkDefinition>> network{builder->createNetwork()};
+ const auto explicitBatch = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
+ unique_ptr<nvinfer1::IBuilder, Destroy<nvinfer1::IBuilder>> builder{nvinfer1::createInferBuilder(gLogger)};
+ unique_ptr<nvinfer1::INetworkDefinition, Destroy<nvinfer1::INetworkDefinition>> network{builder->createNetworkV2(explicitBatch)};
unique_ptr<nvonnxparser::IParser, Destroy<nvonnxparser::IParser>> parser{nvonnxparser::createParser(*network, gLogger)};
+ unique_ptr<nvinfer1::IBuilderConfig, Destroy<nvinfer1::IBuilderConfig>> config{builder->createBuilderConfig()};

if (!parser->parseFromFile(onnxModelPath.c_str(), static_cast<int>(ILogger::Severity::kINFO)))
{
cout << "ERROR: could not parse input engine." << endl;
return nullptr;
}

- return builder->buildCudaEngine(*network); // Build and return TensorRT engine.
+ builder->setMaxBatchSize(batchSize);
+ config->setMaxWorkspaceSize((1 << 30));
+
+ auto profile = builder->createOptimizationProfile();
+ profile->setDimensions(network->getInput(0)->getName(), OptProfileSelector::kMIN, Dims4{1, 3, 256, 256});
+ profile->setDimensions(network->getInput(0)->getName(), OptProfileSelector::kOPT, Dims4{1, 3, 256, 256});
+ profile->setDimensions(network->getInput(0)->getName(), OptProfileSelector::kMAX, Dims4{32, 3, 256, 256});
+ config->addOptimizationProfile(profile);
+
+ return builder->buildEngineWithConfig(*network, *config);
}
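
With kEXPLICIT_BATCH, the batch dimension lives in the network definition itself, so the optimization profile above is what actually bounds the runtime batch (1 to 32 here); setMaxBatchSize only matters for implicit-batch engines. A minimal caller sketch, assuming a placeholder model path:

// Hedged sketch; "model.onnx" is a hypothetical path, not part of the sample.
nvinfer1::ICudaEngine* engine = createCudaEngine("model.onnx", 32);
if (!engine)
    return 1; // createCudaEngine returns nullptr on a parse or build failure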

- static int getBindingInputIndex(IExecutionContext* context)
+ static int getBindingInputIndex(nvinfer1::IExecutionContext* context)
{
return !context->getEngine().bindingIsInput(0); // 0 (false) if bindingIsInput(0), 1 (true) otherwise
}

void launchInference(IExecutionContext* context, cudaStream_t stream, vector<float> const& inputTensor, vector<float>& outputTensor, void** bindings, int batchSize)
{
int inputId = getBindingInputIndex(context);
-
cudaMemcpyAsync(bindings[inputId], inputTensor.data(), inputTensor.size() * sizeof(float), cudaMemcpyHostToDevice, stream);
- context->enqueue(batchSize, bindings, stream, nullptr);
+ context->enqueueV2(bindings, stream, nullptr);
cudaMemcpyAsync(outputTensor.data(), bindings[1 - inputId], outputTensor.size() * sizeof(float), cudaMemcpyDeviceToHost, stream);
- }
-
- void softmax(vector<float>& tensor, int batchSize)
- {
- size_t batchElements = tensor.size() / batchSize;
-
- for (int i = 0; i < batchSize; ++i)
- {
- float* batchVector = &tensor[i * batchElements];
- double maxValue = *max_element(batchVector, batchVector + batchElements);
- double expSum = accumulate(batchVector, batchVector + batchElements, 0.0, [=](double acc, float value) { return acc + exp(value - maxValue); });
-
- transform(batchVector, batchVector + batchElements, batchVector, [=](float input) { return static_cast<float>(std::exp(input - maxValue) / expSum); });
- }
}
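
Note that enqueueV2 takes no batch argument: with an explicit-batch engine, the batch size comes from the binding dimensions set on the execution context, so those must be bound before launching. A sketch of the calling sequence, using the same names as this sample's main() and the 3x256x256 shape assumed from the profile:

// Dimensions first, then enqueue; outputTensor is valid only after the sync.
context->setBindingDimensions(0, Dims4{batchSize, 3, 256, 256});
launchInference(context.get(), stream, inputTensor, outputTensor, bindings, batchSize);
cudaStreamSynchronize(stream);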

- void verifyOutput(vector<float> const& outputTensor, vector<float> const& referenceTensor)
+ void verifyOutput(vector<float> const& outputTensor, vector<float> const& referenceTensor, int size)
{
- for (size_t i = 0; i < referenceTensor.size(); ++i)
+ for (size_t i = 0; i < static_cast<size_t>(size); ++i)
{
double reference = static_cast<double>(referenceTensor[i]);
// Check absolute and relative tolerance.
@@ -102,8 +102,31 @@ void verifyOutput(vector<float> const& outputTensor, vector<float> const& refere
return;
}
}
+ cout << "OK" << endl;
+ }

- cout << "OK" << endl;

+ void saveImageAsPGM(vector<float>& outputTensor, int H, int W)
+ {
+ FILE* pgmimg;
+ pgmimg = fopen("output.pgm", "wb");
+
+ fprintf(pgmimg, "P2\n");
+ // Write width, then height, as the PGM header requires.
+ fprintf(pgmimg, "%d %d\n", W, H);
+ // Write the maximum gray value.
+ fprintf(pgmimg, "255\n");
+
+ for (int i = 0; i < H; ++i)
+ {
+ for (int j = 0; j < W; ++j)
+ {
+ // Row stride is the image width W.
+ int temp = round(255 * outputTensor[i * W + j]);
+ fprintf(pgmimg, "%d ", temp);
+ }
+ fprintf(pgmimg, "\n");
+ }
+
+ fclose(pgmimg);
}
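
saveImageAsPGM writes the ASCII (P2) PGM variant and scales each float by 255, which assumes values already in [0, 1] (e.g. after a sigmoid). If the network emitted raw scores instead, clamping first would keep gray values in range; a sketch under that assumption:

// Hedged sketch: only needed if outputs can fall outside [0, 1].
for (float& v : outputTensor)
    v = std::max(0.0f, std::min(1.0f, v));
saveImageAsPGM(outputTensor, 256, 256); // 256x256 assumed from the optimization profile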

int main(int argc, char* argv[])
@@ -141,13 +164,14 @@ int main(int argc, char* argv[])
for (int i = 0; i < engine->getNbBindings(); ++i)
{
Dims dims{engine->getBindingDimensions(i)};
- size_t size = accumulate(dims.d, dims.d + dims.nbDims, batchSize, multiplies<size_t>());
+ size_t size = accumulate(dims.d + 1, dims.d + dims.nbDims, batchSize, multiplies<size_t>());
// Create CUDA buffer for Tensor.
- cudaMalloc(&bindings[i], size * sizeof(float));
+ cudaMalloc(&bindings[i], size * sizeof(float)); // size already includes batchSize

// Resize CPU buffers to fit Tensor.
- if (engine->bindingIsInput(i))
+ if (engine->bindingIsInput(i)) {
inputTensor.resize(size);
+ }
else
outputTensor.resize(size);
}
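
A note on the size arithmetic above: with an explicit-batch engine and a dynamic batch, engine->getBindingDimensions reports -1 in d[0], so the product starts at dims.d + 1 and substitutes batchSize; starting at dims.d would fold the -1 placeholder into the count.

// Worked example (shape assumed from the profile above, not queried):
// dims.d = {-1, 3, 256, 256}, batchSize = 1
// accumulate(dims.d + 1, dims.d + 4, batchSize, multiplies<size_t>()) == 1 * 3 * 256 * 256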
@@ -158,31 +182,39 @@ int main(int argc, char* argv[])
cout << "Couldn't read input Tensor" << endl;
return 1;
}
+

// Create Execution Context.
context.reset(engine->createExecutionContext());
+
+ Dims dims_i{engine->getBindingDimensions(0)};
+ Dims4 inputDims{batchSize, dims_i.d[1], dims_i.d[2], dims_i.d[3]};
+ context->setBindingDimensions(0, inputDims);

launchInference(context.get(), stream, inputTensor, outputTensor, bindings, batchSize);
// Wait until the work is finished.
cudaStreamSynchronize(stream);
+
+ // Save only after the sync: the device-to-host copy in launchInference is asynchronous.
+ Dims dims{engine->getBindingDimensions(1)};
+ saveImageAsPGM(outputTensor, dims.d[2], dims.d[3]);

vector<string> referenceFiles;
for (string path : inputFiles)
referenceFiles.push_back(path.replace(path.rfind("input"), 5, "output"));
// Try to read and compare against reference tensor from protobuf file.
+
referenceTensor.resize(outputTensor.size());
if (readTensor(referenceFiles, referenceTensor) != referenceTensor.size())
{
cout << "Couldn't read reference Tensor" << endl;
return 1;
}

- // Apply a softmax on the CPU to create a normalized distribution suitable for measuring relative error in probabilities.
- softmax(outputTensor, batchSize);
- softmax(referenceTensor, batchSize);
-
- verifyOutput(outputTensor, referenceTensor);
-
+ Dims dims_o{engine->getBindingDimensions(1)};
+ int size = batchSize * dims_o.d[2] * dims_o.d[3];
+ verifyOutput(outputTensor, referenceTensor, size);
+
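
The element count above multiplies only the batch and the two spatial dimensions, which assumes a single-channel output (dims_o.d[1] == 1), as in this segmentation sample. A channel-general count would include d[1]:

// Hedged alternative for a C-channel output:
int size = batchSize * dims_o.d[1] * dims_o.d[2] * dims_o.d[3];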
for (void* ptr : bindings)
cudaFree(ptr);