diff --git a/launch/tier4_perception_launch/launch/object_recognition/detection/detection.launch.xml b/launch/tier4_perception_launch/launch/object_recognition/detection/detection.launch.xml index 8f64b624540e7..38712790feeb0 100644 --- a/launch/tier4_perception_launch/launch/object_recognition/detection/detection.launch.xml +++ b/launch/tier4_perception_launch/launch/object_recognition/detection/detection.launch.xml @@ -211,6 +211,28 @@ + + + + + + + + + + + + + + + + + + + + + + diff --git a/launch/tier4_perception_launch/launch/object_recognition/detection/detector/camera_bev_detector.launch.xml b/launch/tier4_perception_launch/launch/object_recognition/detection/detector/camera_bev_detector.launch.xml new file mode 100644 index 0000000000000..a8b1f3f63985d --- /dev/null +++ b/launch/tier4_perception_launch/launch/object_recognition/detection/detector/camera_bev_detector.launch.xml @@ -0,0 +1,48 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/perception/autoware_tensorrt_bevdet/CMakeLists.txt b/perception/autoware_tensorrt_bevdet/CMakeLists.txt new file mode 100644 index 0000000000000..a383274e5a617 --- /dev/null +++ b/perception/autoware_tensorrt_bevdet/CMakeLists.txt @@ -0,0 +1,102 @@ +cmake_minimum_required(VERSION 3.17) +project(autoware_tensorrt_bevdet) + +add_compile_options(-W) +add_compile_options(-std=c++17) +set(CMAKE_CXX_FLAGS_RELEASE "-Wno-deprecated-declarations -O2") +set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++17 -O0 -Xcompiler -fPIC -g -w -gencode=arch=compute_86,code=sm_86") +find_package(tensorrt_common) +if(NOT ${tensorrt_common_FOUND}) + message(WARNING "The tensorrt_common package is not found. Please check its dependencies.") + return() +endif() + +find_package(ament_cmake REQUIRED) +find_package(cudnn_cmake_module REQUIRED) +find_package(tensorrt_cmake_module REQUIRED) +find_package(rclcpp REQUIRED) +find_package(yaml-cpp REQUIRED) +find_package(Eigen3 REQUIRED) +find_package(OpenCV REQUIRED) +find_package(PCL REQUIRED) +find_package(sensor_msgs REQUIRED) +find_package(pcl_conversions REQUIRED) +find_package(cv_bridge REQUIRED) +find_package(tf2_geometry_msgs REQUIRED) +find_package(autoware_perception_msgs REQUIRED) + +find_package(CUDA REQUIRED) +find_package(CUDAToolkit) +find_package(CUDNN) +find_package(TENSORRT) +if(NOT (CUDAToolkit_FOUND AND CUDNN_FOUND AND TENSORRT_FOUND)) + message(WARNING "cuda, cudnn, tensorrt libraries are not found") + return() +endif() + +include_directories( + include + SYSTEM + ${YAML_CPP_INCLUDE_DIRS} + ${EIGEN3_INCLUDE_DIRS} + ${OpenCV_INCLUDE_DIRS} + ${PCL_INCLUDE_DIRS} + ${cv_bridge_INCLUDE_DIRS} + ${tf2_geometry_msgs_INCLUDE_DIRS} + ${autoware_perception_msgs_INCLUDE_DIRS} +) + +cuda_add_executable(${PROJECT_NAME}_node + src/bevdet_node.cpp + src/bevdet.cpp + src/preprocess.cu + src/iou3d_nms.cu + src/postprocess.cu + src/data.cpp + src/cpu_jpegdecoder.cpp + src/nvjpegdecoder.cpp + + src/preprocess_plugin.cu + src/bevpool_plugin.cu + src/alignbev_plugin.cu + src/gatherbev_plugin.cu +) + +# Link libraries to executable +ament_target_dependencies(${PROJECT_NAME}_node + "rclcpp" + "sensor_msgs" + "pcl_conversions" + "cv_bridge" + "autoware_perception_msgs" + "tf2_geometry_msgs" +) + +target_link_libraries(${PROJECT_NAME}_node + yaml-cpp + libnvinfer.so + libnvonnxparser.so + libz.so + libjpeg.so + rclcpp::rclcpp + stdc++fs + ${NVINFER} + ${TENSORRT_LIBRARIES} + ${CUDA_LIBRARIES} + ${CUBLAS_LIBRARIES} + ${CUDNN_LIBRARY} + ${OpenCV_LIBS} + ${PCL_LIBRARIES} +) + +install(TARGETS ${PROJECT_NAME}_node + DESTINATION lib/${PROJECT_NAME} +) + +install(DIRECTORY + launch + config + DESTINATION share/${PROJECT_NAME}/ +) + +ament_package() diff --git a/perception/autoware_tensorrt_bevdet/README.md b/perception/autoware_tensorrt_bevdet/README.md new file mode 100644 index 0000000000000..ce6b1cc02c2bb --- /dev/null +++ b/perception/autoware_tensorrt_bevdet/README.md @@ -0,0 +1,69 @@ +# tensorrt_bevdet + +## Purpose + +tensorrt_bevdet is a dynamic 3D bev object detection package based on 6 surround view cameras. + +## Inner-workings / Algorithms + +BEVDet is a BEV perception algorithm based on panoramic cameras. It unifies multi-view images into the perspective of BEV for 3D object detection task. In this implementation, BEVDet network to inference with TensorRT. + +## Inputs / Outputs + +### Inputs + +| Name | Type | Description | +| ---------------------------------- | ------------------------------ | ----------------------------------- | +| `~/input/topic_img_fl` | `sensor_msgs::msg::Image` | input front_left camera image | +| `~/input/topic_img_f` | `sensor_msgs::msg::Image` | input front camera image | +| `~/input/topic_img_fr` | `sensor_msgs::msg::Image` | input front_right camera image | +| `~/input/topic_img_bl` | `sensor_msgs::msg::Image` | input back_left camera image | +| `~/input/topic_img_b` | `sensor_msgs::msg::Image` | input back camera image | +| `~/input/topic_img_br` | `sensor_msgs::msg::Image` | input back_right camera image | +| `~/input/topic_img_fl/camera_info` | `sensor_msgs::msg::CameraInfo` | input front_left camera parameters | +| `~/input/topic_img_f/camera_info` | `sensor_msgs::msg::CameraInfo` | input front camera parameters | +| `~/input/topic_img_fr/camera_info` | `sensor_msgs::msg::CameraInfo` | input front_right camera parameters | +| `~/input/topic_img_bl/camera_info` | `sensor_msgs::msg::CameraInfo` | input back_left camera parameters | +| `~/input/topic_img_b/camera_info` | `sensor_msgs::msg::CameraInfo` | input back camera parameters | +| `~/input/topic_img_br/camera_info` | `sensor_msgs::msg::CameraInfo` | input back_right camera parameters | + +### Outputs + +| Name | Type | Description | +| ---------------- | ------------------------------------------------ | ---------------- | +| `~/output/boxes` | `autoware_perception_msgs::msg::DetectedObjects` | detected objects | + +## How to Use Tensorrt BEVDet Node + +1. launch `tensorrt_bevdet_node` + + ```bash + + ros2 launch autoware_tensorrt_bevdet tensorrt_bevdet_node.launch.xml + ``` + +2. play ros2 bag of nuScenes data + + Please refer to open source repository [ros2_dataset_bridge](https://github.com/Owen-Liuyuxuan/ros2_dataset_bridge) to publish the ROS 2 topics. + +## Limitation + +The model is trained on open-source dataset `NuScenes` and has poor generalization on its own dataset, If you want to use this model to infer your data, you need to retrain it. + +## Trained Models + +You can download the onnx format of trained models by clicking on the links below. + +- BEVDet: [bevdet_one_lt_d.onnx](https://drive.google.com/file/d/1eMGJfdCVlDPBphBTjMcnIh3wdW7Q7WZB/view?usp=sharing) + +The model was trained in NuScenes database for 20 epochs. + +If you want to train model using the [TIER IV's internal database(~2600 key frames)](https://drive.google.com/file/d/1UaarK88HZu09sf7Ix-bEVl9zGNGFwTVL/view?usp=sharing), please refer to the following repositories:[BEVDet adapted to TIER IV dataset](https://github.com/cyn-liu/BEVDet/tree/train_export) + +## References/External links + +[1] + +[2] + +[3] diff --git a/perception/autoware_tensorrt_bevdet/config/bevdet.param.yaml b/perception/autoware_tensorrt_bevdet/config/bevdet.param.yaml new file mode 100644 index 0000000000000..e7140328a5e6e --- /dev/null +++ b/perception/autoware_tensorrt_bevdet/config/bevdet.param.yaml @@ -0,0 +1,14 @@ +/**: + ros__parameters: + # weight files + onnx_path: "$(var model_path)/$(var model_name).onnx" + engine_path: "$(var model_path)/$(var model_name).engine" + data_params: + N: 6 # camera num + H: 900 # image height + W: 1600 # image width + cams: ["CAM_FRONT_LEFT", "CAM_FRONT", "CAM_FRONT_RIGHT", "CAM_BACK_LEFT", "CAM_BACK", "CAM_BACK_RIGHT"] + post_process_params: + # post-process params + score_threshold: 0.2 + class_names: ["car", "truck", "construction_vehicle", "bus", "trailer", "barrier", "motorcycle", "bicycle", "pedestrian", "traffic_cone"] diff --git a/perception/autoware_tensorrt_bevdet/config/bevdet_r50_4dlongterm_depth.yaml b/perception/autoware_tensorrt_bevdet/config/bevdet_r50_4dlongterm_depth.yaml new file mode 100644 index 0000000000000..8f0e4ac785a97 --- /dev/null +++ b/perception/autoware_tensorrt_bevdet/config/bevdet_r50_4dlongterm_depth.yaml @@ -0,0 +1,63 @@ +bev_range: [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +mean: [123.675, 116.28, 103.53] +std: [58.395, 57.12, 57.375] +use_depth: true +use_adj: true +adj_num: 8 +transform_size: 6 +cam_params_size: 27 +sampling: nearest # nearest # or bicubic +data_config: + Ncams: 6 + cams: [CAM_FRONT_LEFT, CAM_FRONT, CAM_FRONT_RIGHT, CAM_BACK_LEFT, CAM_BACK, CAM_BACK_RIGHT] + resize_radio: 0.44 + crop: [140, 0] + flip: true + input_size: [256, 704] + resize: [-0.06, 0.11] + resize_test: 0.0 + rot: [-5.4, 5.4] + src_size: [900, 1600] +grid_config: + depth: [1.0, 60.0, 0.5] + x: [-51.2, 51.2, 0.8] + y: [-51.2, 51.2, 0.8] + z: [-5, 3, 8] +model: + bevpool_channels: 80 + coder: + code_size: 9 + max_num: 500 + post_center_range: [-61.2, -61.2, -10.0, 61.2, 61.2, 10.0] + score_threshold: 0.1 + common_head: + channels: [2, 1, 3, 2, 2] + names: [reg, height, dim, rot, vel] + down_sample: 16 + tasks: + - class_names: + [ + car, + truck, + construction_vehicle, + bus, + trailer, + barrier, + motorcycle, + bicycle, + pedestrian, + traffic_cone, + ] + num_class: 10 +test_cfg: + max_per_img: 500 + max_pool_nms: false + min_radius: [4, 12, 10, 1, 0.85, 0.175] + nms_rescale_factor: + - [1.0, 0.7, 0.7, 0.4, 0.55, 1.1, 1.0, 1.0, 1.5, 3.5] + nms_thr: [0.2] + nms_type: [rotate] + post_center_limit_range: [-61.2, -61.2, -10.0, 61.2, 61.2, 10.0] + post_max_size: 500 + pre_max_size: 1000 + score_threshold: 0.1 diff --git a/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/alignbev_plugin.hpp b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/alignbev_plugin.hpp new file mode 100644 index 0000000000000..beeeb8ed656d8 --- /dev/null +++ b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/alignbev_plugin.hpp @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef AUTOWARE__TENSORRT_BEVDET__ALIGNBEV_PLUGIN_HPP_ +#define AUTOWARE__TENSORRT_BEVDET__ALIGNBEV_PLUGIN_HPP_ + +#include +#include + +#include +#include +#include +#include + +namespace +{ +static const char * ALIGN_PLUGIN_NAME{"AlignBEV"}; +static const char * ALIGN_PLUGIN_VERSION{"1"}; +} // namespace + +namespace nvinfer1 +{ +class AlignBEVPlugin : public IPluginV2DynamicExt +{ +private: + const std::string name_; + std::string namespace_; + struct + { + } m_; + +public: + AlignBEVPlugin() = delete; + explicit AlignBEVPlugin(const std::string & name); + AlignBEVPlugin(const std::string & name, const void * buffer, size_t length); + ~AlignBEVPlugin(); + + // Method inherited from IPluginV2 + const char * getPluginType() const noexcept override; + const char * getPluginVersion() const noexcept override; + int32_t getNbOutputs() const noexcept override; + int32_t initialize() noexcept override; + void terminate() noexcept override; + size_t getSerializationSize() const noexcept override; + void serialize(void * buffer) const noexcept override; + void destroy() noexcept override; + void setPluginNamespace(const char * pluginNamespace) noexcept override; + const char * getPluginNamespace() const noexcept override; + + // Method inherited from IPluginV2Ext + DataType getOutputDataType( + int32_t index, DataType const * inputTypes, int32_t nbInputs) const noexcept override; + void attachToContext( + cudnnContext * contextCudnn, cublasContext * contextCublas, + IGpuAllocator * gpuAllocator) noexcept override; + void detachFromContext() noexcept override; + + // Method inherited from IPluginV2DynamicExt + IPluginV2DynamicExt * clone() const noexcept override; + DimsExprs getOutputDimensions( + int32_t outputIndex, const DimsExprs * inputs, int32_t nbInputs, + IExprBuilder & exprBuilder) noexcept override; + bool supportsFormatCombination( + int32_t pos, const PluginTensorDesc * inOut, int32_t nbInputs, + int32_t nbOutputs) noexcept override; + void configurePlugin( + const DynamicPluginTensorDesc * in, int32_t nbInputs, const DynamicPluginTensorDesc * out, + int32_t nbOutputs) noexcept override; + size_t getWorkspaceSize( + const PluginTensorDesc * inputs, int32_t nbInputs, const PluginTensorDesc * outputs, + int32_t nbOutputs) const noexcept override; + int32_t enqueue( + const PluginTensorDesc * inputDesc, const PluginTensorDesc * outputDesc, + const void * const * inputs, void * const * outputs, void * workspace, + cudaStream_t stream) noexcept override; + +protected: + // To prevent compiler warnings + using nvinfer1::IPluginV2::enqueue; + using nvinfer1::IPluginV2::getOutputDimensions; + using nvinfer1::IPluginV2::getWorkspaceSize; + using nvinfer1::IPluginV2Ext::configurePlugin; +}; + +class AlignBEVPluginCreator : public IPluginCreator +{ +private: + static PluginFieldCollection fc_; + static std::vector attr_; + std::string namespace_; + +public: + AlignBEVPluginCreator(); + ~AlignBEVPluginCreator(); + const char * getPluginName() const noexcept override; + const char * getPluginVersion() const noexcept override; + const PluginFieldCollection * getFieldNames() noexcept override; + IPluginV2DynamicExt * createPlugin( + const char * name, const PluginFieldCollection * fc) noexcept override; + IPluginV2DynamicExt * deserializePlugin( + const char * name, const void * serialData, size_t serialLength) noexcept override; + void setPluginNamespace(const char * pluginNamespace) noexcept override; + const char * getPluginNamespace() const noexcept override; +}; + +} // namespace nvinfer1 +#endif // AUTOWARE__TENSORRT_BEVDET__ALIGNBEV_PLUGIN_HPP_ diff --git a/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/bevdet.hpp b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/bevdet.hpp new file mode 100644 index 0000000000000..8c6241ba4e647 --- /dev/null +++ b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/bevdet.hpp @@ -0,0 +1,239 @@ +// Copyright 2024 AutoCore, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#ifndef AUTOWARE__TENSORRT_BEVDET__BEVDET_HPP_ +#define AUTOWARE__TENSORRT_BEVDET__BEVDET_HPP_ + +#include "NvInfer.h" +#include "common.hpp" +#include "data.hpp" +#include "postprocess.hpp" + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +namespace autoware::tensorrt_bevdet +{ +class AdjFrame +{ +public: + AdjFrame() {} + AdjFrame(int n, size_t buf_size) + : n_(n), buf_size_(buf_size), scenes_token_(n), ego2global_rot_(n), ego2global_trans_(n) + { + CHECK_CUDA(cudaMalloc(reinterpret_cast(&adj_buffer), n * buf_size)); + CHECK_CUDA(cudaMemset(adj_buffer, 0, n * buf_size)); + last_ = 0; + buffer_num_ = 0; + init_ = false; + + for (auto & rot : ego2global_rot_) { + rot = Eigen::Quaternion(0.f, 0.f, 0.f, 0.f); + } + for (auto & trans : ego2global_trans_) { + trans = Eigen::Translation3f(0.f, 0.f, 0.f); + } + } + const std::string & lastScenesToken() const { return scenes_token_[last_]; } + + void reset() + { + last_ = 0; // origin -1 + buffer_num_ = 0; + init_ = false; + } + + void saveFrameBuffer( + const void * curr_buffer, const std::string & curr_token, + const Eigen::Quaternion & ego2global_rot, const Eigen::Translation3f & ego2global_trans) + { + int iters = init_ ? 1 : n_; + while (iters--) { + last_ = (last_ + 1) % n_; + CHECK_CUDA(cudaMemcpy( + reinterpret_cast(adj_buffer) + last_ * buf_size_, curr_buffer, buf_size_, + cudaMemcpyDeviceToDevice)); + scenes_token_[last_] = curr_token; + ego2global_rot_[last_] = ego2global_rot; + ego2global_trans_[last_] = ego2global_trans; + buffer_num_ = std::min(buffer_num_ + 1, n_); + } + init_ = true; + } + int havingBuffer(int idx) { return static_cast(idx < buffer_num_); } + + const void * getFrameBuffer(int idx) + { + idx = (-idx + last_ + n_) % n_; + return reinterpret_cast(adj_buffer) + idx * buf_size_; + } + void getEgo2Global( + int idx, Eigen::Quaternion & adj_ego2global_rot, + Eigen::Translation3f & adj_ego2global_trans) + { + idx = (-idx + last_ + n_) % n_; + adj_ego2global_rot = ego2global_rot_[idx]; + adj_ego2global_trans = ego2global_trans_[idx]; + } + + ~AdjFrame() { CHECK_CUDA(cudaFree(adj_buffer)); } + +private: + int n_; + size_t buf_size_; + + int last_; + int buffer_num_; + bool init_; + + std::vector scenes_token_; + std::vector> ego2global_rot_; + std::vector ego2global_trans_; + + void * adj_buffer; +}; + +class BEVDet +{ +public: + BEVDet() {} + BEVDet( + const std::string & config_file, int n_img, std::vector cams_intrin, + std::vector> cams2ego_rot, + std::vector cams2ego_trans, const std::string & onnx_file, + const std::string & engine_file); + + int doInfer( + const camsData & cam_data, std::vector & out_detections, float & cost_time, int idx = -1); + ~BEVDet(); + +protected: + void initParams(const std::string & config_file); + + void initViewTransformer( + std::vector & ranks_bev, std::vector & ranks_depth, std::vector & ranks_feat, + std::vector & interval_starts, std::vector & interval_lengths); + void exportEngine(const std::string & onnxFile, const std::string & trtFile); + int initEngine(const std::string & engine_file); + + int deserializeTRTEngine(const std::string & engine_file, nvinfer1::ICudaEngine ** engine_ptr); + + void mallocDeviceMemory(); + + void initCamParams( + const std::vector> & curr_cams2ego_rot, + const std::vector & curr_cams2ego_trans, + const std::vector & cams_intrin); + + void getAdjBEVFeature( + const std::string & curr_scene_token, const Eigen::Quaternion & ego2global_rot, + const Eigen::Translation3f & ego2global_trans); + + void getCurr2AdjTransform( + const Eigen::Quaternion & curr_ego2global_rot, + const Eigen::Quaternion & adj_ego2global_rot, + const Eigen::Translation3f & curr_ego2global_trans, + const Eigen::Translation3f & adj_ego2global_trans, float * transform_dev); + +private: + int n_img_; + + int src_img_h_; + int src_img_w_; + int input_img_h_; + int input_img_w_; + int crop_h_; + int crop_w_; + float resize_radio_; + int down_sample_; + int feat_h_; + int feat_w_; + int bev_h_; + int bev_w_; + int bevpool_channel_; + + float depth_start_; + float depth_end_; + float depth_step_; + int depth_num_; + + float x_start_; + float x_end_; + float x_step_; + int xgrid_num_; + + float y_start_; + float y_end_; + float y_step_; + int ygrid_num_; + + float z_start_; + float z_end_; + float z_step_; + int zgrid_num_; + + std::vector mean_; + std::vector std_; + + bool use_depth_; + bool use_adj_; + int adj_num_; + + int class_num_; + float score_thresh_; + float nms_overlap_thresh_; + int nms_pre_maxnum_; + int nms_post_maxnum_; + std::vector nms_rescale_factor_; + std::vector class_num_pre_task_; + std::map out_num_task_head_; + + std::vector cams_intrin_; + std::vector> cams2ego_rot_; + std::vector cams2ego_trans_; + + Eigen::Matrix3f post_rot_; + Eigen::Translation3f post_trans_; + + std::vector trt_buffer_sizes_; + void ** trt_buffer_dev_; + std::vector cam_params_host_; + void ** post_buffer_; + + std::map buffer_map_; + + int valid_feat_num_; + int unique_bev_num_; + + int transform_size_; + int cam_params_size_; + + Logger g_logger_; + + nvinfer1::ICudaEngine * trt_engine_; + nvinfer1::IExecutionContext * trt_context_; + + std::unique_ptr postprocess_ptr_; + std::unique_ptr adj_frame_ptr_; +}; +} // namespace autoware::tensorrt_bevdet +#endif // AUTOWARE__TENSORRT_BEVDET__BEVDET_HPP_ diff --git a/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/bevdet_node.hpp b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/bevdet_node.hpp new file mode 100644 index 0000000000000..1a493bb2f8889 --- /dev/null +++ b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/bevdet_node.hpp @@ -0,0 +1,221 @@ +// Copyright 2024 AutoCore, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// cspell:ignore BEVDET, thre, TRTBEV, bevdet, caminfo, intrin +#ifndef AUTOWARE__TENSORRT_BEVDET__BEVDET_NODE_HPP_ +#define AUTOWARE__TENSORRT_BEVDET__BEVDET_NODE_HPP_ + +#include "bevdet.hpp" +#include "cpu_jpegdecoder.hpp" +#include "tf2_ros/buffer.h" +#include "tf2_ros/transform_listener.h" + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include // msg2pcl +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +uint8_t getSemanticType(const std::string & class_name); + +void box3DToDetectedObjects( + const std::vector & boxes, autoware_perception_msgs::msg::DetectedObjects & objects, + const std::vector & class_names, float score_thre, const bool has_twist); + +// Get the rotation and translation from a geometry_msgs::msg::TransformStamped +void getTransform( + const geometry_msgs::msg::TransformStamped & transform, Eigen::Quaternion & rot, + Eigen::Translation3f & translation); + +// Get the camera intrinsics from a sensor_msgs::msg::CameraInfo +void getCameraIntrinsics( + const sensor_msgs::msg::CameraInfo::SharedPtr msg, Eigen::Matrix3f & intrinsics); + +class TRTBEVDetNode : public rclcpp::Node +{ + /** + * @class TRTBEVDetNode + * @brief This class represents a node for performing object detection using TensorRT on BEV + * (Bird's Eye View) images. + */ +private: + size_t img_N_; ///< Number of images + uint32_t img_w_; ///< Width of the images + uint32_t img_h_; ///< Height of the images + + std::string model_config_; ///< Path to the model configuration file + + std::string onnx_file_; ///< Path to the ONNX file + std::string engine_file_; ///< Path to the TensorRT engine file + + std::vector imgs_name_; ///< Names of the images + std::vector class_names_; ///< Names of the object classes + + camsData sampleData_; ///< Sample data for camera parameters + std::shared_ptr + bevdet_; ///< Object for performing object detection + + uchar * imgs_dev_ = nullptr; ///< Device pointer for storing the images + float score_thre_; ///< Score threshold for object detection + + rclcpp::Publisher::SharedPtr + pub_boxes_; ///< Publisher for publishing the detected objects + + // Subscribers of camera info for each camera, no need to synchronize + rclcpp::Subscription::SharedPtr + sub_f_caminfo_; ///< Subscriber for front camera info + rclcpp::Subscription::SharedPtr + sub_fl_caminfo_; ///< Subscriber for front-left camera info + rclcpp::Subscription::SharedPtr + sub_fr_caminfo_; ///< Subscriber for front-right camera info + rclcpp::Subscription::SharedPtr + sub_b_caminfo_; ///< Subscriber for back camera info + rclcpp::Subscription::SharedPtr + sub_bl_caminfo_; ///< Subscriber for back-left camera info + rclcpp::Subscription::SharedPtr + sub_br_caminfo_; ///< Subscriber for back-right camera info + std::vector + caminfo_received_; ///< Flag indicating if camera info has been received for each camera + bool camera_info_received_flag_ = + false; ///< Flag indicating if camera info has been received for all cameras + bool initialized_ = false; /// Flag indicating if img_w_ and img_h_ has been initialized + + // tf listener + std::shared_ptr tf_listener_{ + nullptr}; ///< TF listener for transforming coordinates + std::unique_ptr tf_buffer_; ///< Buffer for storing TF transforms + + // Camera parameters; + std::vector cams_intrin; ///< Intrinsic camera parameters for each camera + std::vector> + cams2ego_rot; ///< Rotation from camera frame to ego frame for each camera + std::vector + cams2ego_trans; ///< Translation from camera frame to ego frame for each camera + + // Subscribers of images for each camera, synchronized + message_filters::Subscriber + sub_f_img_; ///< Subscriber for front camera image + message_filters::Subscriber + sub_fl_img_; ///< Subscriber for front-left camera image + message_filters::Subscriber + sub_fr_img_; ///< Subscriber for front-right camera image + message_filters::Subscriber + sub_b_img_; ///< Subscriber for back camera image + message_filters::Subscriber + sub_bl_img_; ///< Subscriber for back-left camera image + message_filters::Subscriber + sub_br_img_; ///< Subscriber for back-right camera image + + typedef message_filters::sync_policies::ApproximateTime< + sensor_msgs::msg::Image, sensor_msgs::msg::Image, sensor_msgs::msg::Image, + sensor_msgs::msg::Image, sensor_msgs::msg::Image, sensor_msgs::msg::Image> + MySyncPolicy; + + typedef message_filters::Synchronizer Sync; + std::shared_ptr sync_; ///< Synchronizer for synchronizing image callbacks + + // Timer for checking initialization + rclcpp::TimerBase::SharedPtr timer_; ///< Timer for checking initialization + + /** + * @brief Starts the subscription to camera info topics for each camera. + */ + void startCameraInfoSubscription(); + + /** + * @brief Checks if the node has been initialized properly. + */ + void checkInitialization(); + + /** + * @brief Initializes the object detection model. + */ + void initModel(); + + /** + * @brief Starts the subscription to image topics for each camera. + */ + void startImageSubscription(); + +public: + /** + * @brief Constructor for TRTBEVDetNode. + * @param node_name The name of the node. + * @param options The options for the node. + */ + TRTBEVDetNode(const std::string & node_name, const rclcpp::NodeOptions & options); + + /** + * @brief Destructor for TRTBEVDetNode. + */ + ~TRTBEVDetNode(); + + /** + * @brief Callback function for synchronized image messages. + * @param msg_fl_img The front-left camera image message. + * @param msg_f_img The front camera image message. + * @param msg_fr_img The front-right camera image message. + * @param msg_bl_img The back-left camera image message. + * @param msg_b_img The back camera image message. + * @param msg_br_img The back-right camera image message. + */ + void callback( + const sensor_msgs::msg::Image::ConstSharedPtr & msg_fl_img, + const sensor_msgs::msg::Image::ConstSharedPtr & msg_f_img, + const sensor_msgs::msg::Image::ConstSharedPtr & msg_fr_img, + const sensor_msgs::msg::Image::ConstSharedPtr & msg_bl_img, + const sensor_msgs::msg::Image::ConstSharedPtr & msg_b_img, + const sensor_msgs::msg::Image::ConstSharedPtr & msg_br_img); + + /** + * @brief Callback function for camera info messages. This function also reads from TF to get the + * transformation from the camera frame to the ego frame. + * @param idx The index of the camera. + * @param msg The camera info message. + */ + void camera_info_callback(int idx, const sensor_msgs::msg::CameraInfo::SharedPtr msg); +}; + +#endif // AUTOWARE__TENSORRT_BEVDET__BEVDET_NODE_HPP_ diff --git a/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/bevpool_plugin.hpp b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/bevpool_plugin.hpp new file mode 100644 index 0000000000000..0a1f6e114589a --- /dev/null +++ b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/bevpool_plugin.hpp @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef AUTOWARE__TENSORRT_BEVDET__BEVPOOL_PLUGIN_HPP_ +#define AUTOWARE__TENSORRT_BEVDET__BEVPOOL_PLUGIN_HPP_ + +#include +#include + +#include +#include +#include +#include +#include + +namespace +{ +static const char * BEVPOOL_PLUGIN_NAME{"BEVPool"}; +static const char * BEVPOOL_PLUGIN_VERSION{"1"}; +} // namespace + +namespace nvinfer1 +{ +class BEVPoolPlugin : public IPluginV2DynamicExt +{ +private: + const std::string name_; + std::string namespace_; + struct + { + int bev_h; + int bev_w; + int n; + } m_; + +public: + BEVPoolPlugin() = delete; + BEVPoolPlugin(const std::string & name, int bev_h, int bev_w, int n); + BEVPoolPlugin(const std::string & name, const void * buffer, size_t length); + ~BEVPoolPlugin(); + + // Method inherited from IPluginV2 + const char * getPluginType() const noexcept override; + const char * getPluginVersion() const noexcept override; + int32_t getNbOutputs() const noexcept override; + int32_t initialize() noexcept override; + void terminate() noexcept override; + size_t getSerializationSize() const noexcept override; + void serialize(void * buffer) const noexcept override; + void destroy() noexcept override; + void setPluginNamespace(const char * pluginNamespace) noexcept override; + const char * getPluginNamespace() const noexcept override; + + // Method inherited from IPluginV2Ext + DataType getOutputDataType( + int32_t index, DataType const * inputTypes, int32_t nbInputs) const noexcept override; + void attachToContext( + cudnnContext * contextCudnn, cublasContext * contextCublas, + IGpuAllocator * gpuAllocator) noexcept override; + void detachFromContext() noexcept override; + + // Method inherited from IPluginV2DynamicExt + IPluginV2DynamicExt * clone() const noexcept override; + DimsExprs getOutputDimensions( + int32_t outputIndex, const DimsExprs * inputs, int32_t nbInputs, + IExprBuilder & exprBuilder) noexcept override; + bool supportsFormatCombination( + int32_t pos, const PluginTensorDesc * inOut, int32_t nbInputs, + int32_t nbOutputs) noexcept override; + void configurePlugin( + const DynamicPluginTensorDesc * in, int32_t nbInputs, const DynamicPluginTensorDesc * out, + int32_t nbOutputs) noexcept override; + size_t getWorkspaceSize( + const PluginTensorDesc * inputs, int32_t nbInputs, const PluginTensorDesc * outputs, + int32_t nbOutputs) const noexcept override; + int32_t enqueue( + const PluginTensorDesc * inputDesc, const PluginTensorDesc * outputDesc, + const void * const * inputs, void * const * outputs, void * workspace, + cudaStream_t stream) noexcept override; + +protected: + // To prevent compiler warnings + using nvinfer1::IPluginV2::enqueue; + using nvinfer1::IPluginV2::getOutputDimensions; + using nvinfer1::IPluginV2::getWorkspaceSize; + using nvinfer1::IPluginV2Ext::configurePlugin; +}; + +class BEVPoolPluginCreator : public IPluginCreator +{ +private: + static PluginFieldCollection fc_; + static std::vector attr_; + std::string namespace_; + +public: + BEVPoolPluginCreator(); + ~BEVPoolPluginCreator(); + const char * getPluginName() const noexcept override; + const char * getPluginVersion() const noexcept override; + const PluginFieldCollection * getFieldNames() noexcept override; + IPluginV2DynamicExt * createPlugin( + const char * name, const PluginFieldCollection * fc) noexcept override; + IPluginV2DynamicExt * deserializePlugin( + const char * name, const void * serialData, size_t serialLength) noexcept override; + void setPluginNamespace(const char * pluginNamespace) noexcept override; + const char * getPluginNamespace() const noexcept override; +}; + +} // namespace nvinfer1 +#endif // AUTOWARE__TENSORRT_BEVDET__BEVPOOL_PLUGIN_HPP_ diff --git a/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/common.hpp b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/common.hpp new file mode 100644 index 0000000000000..a1a745d9e604f --- /dev/null +++ b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/common.hpp @@ -0,0 +1,148 @@ +// Copyright 2024 AutoCore, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#ifndef AUTOWARE__TENSORRT_BEVDET__COMMON_HPP_ +#define AUTOWARE__TENSORRT_BEVDET__COMMON_HPP_ + +#include "rclcpp/logger.hpp" +#include "rclcpp/rclcpp.hpp" + +#include +#include +#include +#include +#include +#include + +#include +#include + +typedef unsigned char uchar; + +#define NUM_THREADS 512 + +#define CHECK_CUDA(ans) \ + { \ + GPUAssert((ans), __FILE__, __LINE__); \ + } + +#define DIVUP(m, n) (((m) + (n) - 1) / (n)) + +inline void GPUAssert(cudaError_t code, const char * file, int line, bool abort = true) +{ + if (code != cudaSuccess) { + RCLCPP_ERROR( + rclcpp::get_logger("GPUAssert"), "GPUassert: %s %s %d", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +}; + +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) + +const int MAXTENSORDIMS = 6; +struct TensorDesc +{ + int shape[MAXTENSORDIMS]; + int stride[MAXTENSORDIMS]; + int dim; +}; + +inline int GET_BLOCKS(const int N) +{ + int optimal_block_num = DIVUP(N, NUM_THREADS); + int max_block_num = 4096; + return optimal_block_num < max_block_num ? optimal_block_num : max_block_num; +} + +__inline__ std::string dataTypeToString(nvinfer1::DataType dataType) +{ + switch (dataType) { + case nvinfer1::DataType::kFLOAT: + return std::string("FP32 "); + case nvinfer1::DataType::kHALF: + return std::string("FP16 "); + case nvinfer1::DataType::kINT8: + return std::string("INT8 "); + case nvinfer1::DataType::kINT32: + return std::string("INT32"); + case nvinfer1::DataType::kBOOL: + return std::string("BOOL "); + default: + return std::string("Unknown"); + } +} + +__inline__ size_t dataTypeToSize(nvinfer1::DataType dataType) +{ + switch (dataType) { + case nvinfer1::DataType::kFLOAT: + return 4; + case nvinfer1::DataType::kHALF: + return 2; + case nvinfer1::DataType::kINT8: + return 1; + case nvinfer1::DataType::kINT32: + return 4; + case nvinfer1::DataType::kBOOL: + return 1; + default: + return 4; + } +} + +__inline__ std::string shapeToString(nvinfer1::Dims32 dim) +{ + std::string output("("); + if (dim.nbDims == 0) { + return output + std::string(")"); + } + for (int i = 0; i < dim.nbDims - 1; ++i) { + output += std::to_string(dim.d[i]) + std::string(", "); + } + output += std::to_string(dim.d[dim.nbDims - 1]) + std::string(")"); + return output; +} + +class Logger : public nvinfer1::ILogger +{ +public: + explicit Logger(Severity severity = Severity::kWARNING) : reportable_severity(severity) {} + + void log(Severity severity, const char * msg) noexcept override + { + // suppress messages with severity enum value greater than the reportable + if (severity > reportable_severity) return; + switch (severity) { + case Severity::kINTERNAL_ERROR: + RCLCPP_ERROR(rclcpp::get_logger("autoware_tensorrt_bevdet"), "INTERNAL_ERROR: %s", msg); + break; + case Severity::kERROR: + RCLCPP_ERROR(rclcpp::get_logger("autoware_tensorrt_bevdet"), "ERROR: %s", msg); + break; + case Severity::kWARNING: + RCLCPP_WARN(rclcpp::get_logger("autoware_tensorrt_bevdet"), "WARNING: %s", msg); + break; + case Severity::kINFO: + RCLCPP_INFO(rclcpp::get_logger("autoware_tensorrt_bevdet"), "INFO: %s", msg); + break; + default: + RCLCPP_INFO(rclcpp::get_logger("autoware_tensorrt_bevdet"), "UNKNOWN: %s", msg); + break; + } + } + + Severity reportable_severity; +}; + +#endif // AUTOWARE__TENSORRT_BEVDET__COMMON_HPP_ diff --git a/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/cpu_jpegdecoder.hpp b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/cpu_jpegdecoder.hpp new file mode 100644 index 0000000000000..94a301976fe6a --- /dev/null +++ b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/cpu_jpegdecoder.hpp @@ -0,0 +1,24 @@ +// Copyright 2024 AutoCore, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#ifndef AUTOWARE__TENSORRT_BEVDET__CPU_JPEGDECODER_HPP_ +#define AUTOWARE__TENSORRT_BEVDET__CPU_JPEGDECODER_HPP_ + +#include "common.hpp" + +#include + +int decode_cpu( + const std::vector> & files_data, uchar * out_imgs, size_t width, size_t height); + +#endif // AUTOWARE__TENSORRT_BEVDET__CPU_JPEGDECODER_HPP_ diff --git a/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/data.hpp b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/data.hpp new file mode 100644 index 0000000000000..153795b9d0f16 --- /dev/null +++ b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/data.hpp @@ -0,0 +1,72 @@ +// Copyright 2024 AutoCore, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// cspell:ignore BEVDET, intrin, Quater, BEVDET +#ifndef AUTOWARE__TENSORRT_BEVDET__DATA_HPP_ +#define AUTOWARE__TENSORRT_BEVDET__DATA_HPP_ + +#include "common.hpp" +#include "nvjpegdecoder.hpp" + +#include +#include + +#include + +#include +#include + +struct camParams +{ + camParams() = default; + camParams(const YAML::Node & config, int n, std::vector & cams_name); + camParams( + const std::vector & _cams_intrin, + const std::vector> & _cams2ego_rot, + const std::vector & _cams2ego_trans) + : cams_intrin(_cams_intrin), cams2ego_rot(_cams2ego_rot), cams2ego_trans(_cams2ego_trans) + { + } + + int N_img; + + Eigen::Quaternion ego2global_rot; + Eigen::Translation3f ego2global_trans; + + Eigen::Quaternion lidar2ego_rot; + Eigen::Translation3f lidar2ego_trans; + // + std::vector cams_intrin; + std::vector> cams2ego_rot; + std::vector cams2ego_trans; + // + std::vector imgs_file; + + std::int64_t timestamp; + std::string scene_token; +}; + +struct camsData +{ + camsData() = default; + explicit camsData(const camParams & _param) : param(_param), imgs_dev(nullptr) {} + camParams param; + uchar * imgs_dev; +}; + +Eigen::Translation3f fromYamlTrans(YAML::Node x); +Eigen::Quaternion fromYamlQuater(YAML::Node x); +Eigen::Matrix3f fromYamlMatrix3f(YAML::Node x); + +#endif // AUTOWARE__TENSORRT_BEVDET__DATA_HPP_ diff --git a/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/gatherbev_plugin.hpp b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/gatherbev_plugin.hpp new file mode 100644 index 0000000000000..3d91763ed88a1 --- /dev/null +++ b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/gatherbev_plugin.hpp @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef AUTOWARE__TENSORRT_BEVDET__GATHERBEV_PLUGIN_HPP_ +#define AUTOWARE__TENSORRT_BEVDET__GATHERBEV_PLUGIN_HPP_ + +#include +#include + +#include +#include +#include +#include +#include + +namespace +{ +static const char * GATHERBEV_PLUGIN_NAME{"GatherBEV"}; +static const char * GATHERBEV_PLUGIN_VERSION{"1"}; +} // namespace + +namespace nvinfer1 +{ +class GatherBEVPlugin : public IPluginV2DynamicExt +{ +private: + const std::string name_; + std::string namespace_; + struct + { + } m_; + +public: + GatherBEVPlugin() = delete; + explicit GatherBEVPlugin(const std::string & name); + GatherBEVPlugin(const std::string & name, const void * buffer, size_t length); + ~GatherBEVPlugin(); + + // Method inherited from IPluginV2 + const char * getPluginType() const noexcept override; + const char * getPluginVersion() const noexcept override; + int32_t getNbOutputs() const noexcept override; + int32_t initialize() noexcept override; + void terminate() noexcept override; + size_t getSerializationSize() const noexcept override; + void serialize(void * buffer) const noexcept override; + void destroy() noexcept override; + void setPluginNamespace(const char * pluginNamespace) noexcept override; + const char * getPluginNamespace() const noexcept override; + + // Method inherited from IPluginV2Ext + DataType getOutputDataType( + int32_t index, DataType const * inputTypes, int32_t nbInputs) const noexcept override; + void attachToContext( + cudnnContext * contextCudnn, cublasContext * contextCublas, + IGpuAllocator * gpuAllocator) noexcept override; + void detachFromContext() noexcept override; + + // Method inherited from IPluginV2DynamicExt + IPluginV2DynamicExt * clone() const noexcept override; + DimsExprs getOutputDimensions( + int32_t outputIndex, const DimsExprs * inputs, int32_t nbInputs, + IExprBuilder & exprBuilder) noexcept override; + bool supportsFormatCombination( + int32_t pos, const PluginTensorDesc * inOut, int32_t nbInputs, + int32_t nbOutputs) noexcept override; + void configurePlugin( + const DynamicPluginTensorDesc * in, int32_t nbInputs, const DynamicPluginTensorDesc * out, + int32_t nbOutputs) noexcept override; + size_t getWorkspaceSize( + const PluginTensorDesc * inputs, int32_t nbInputs, const PluginTensorDesc * outputs, + int32_t nbOutputs) const noexcept override; + int32_t enqueue( + const PluginTensorDesc * inputDesc, const PluginTensorDesc * outputDesc, + const void * const * inputs, void * const * outputs, void * workspace, + cudaStream_t stream) noexcept override; + +protected: + // To prevent compiler warnings + using nvinfer1::IPluginV2::enqueue; + using nvinfer1::IPluginV2::getOutputDimensions; + using nvinfer1::IPluginV2::getWorkspaceSize; + using nvinfer1::IPluginV2Ext::configurePlugin; +}; + +class GatherBEVPluginCreator : public IPluginCreator +{ +private: + static PluginFieldCollection fc_; + static std::vector attr_; + std::string namespace_; + +public: + GatherBEVPluginCreator(); + ~GatherBEVPluginCreator(); + const char * getPluginName() const noexcept override; + const char * getPluginVersion() const noexcept override; + const PluginFieldCollection * getFieldNames() noexcept override; + IPluginV2DynamicExt * createPlugin( + const char * name, const PluginFieldCollection * fc) noexcept override; + IPluginV2DynamicExt * deserializePlugin( + const char * name, const void * serialData, size_t serialLength) noexcept override; + void setPluginNamespace(const char * pluginNamespace) noexcept override; + const char * getPluginNamespace() const noexcept override; +}; + +} // namespace nvinfer1 +#endif // AUTOWARE__TENSORRT_BEVDET__GATHERBEV_PLUGIN_HPP_ diff --git a/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/iou3d_nms.hpp b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/iou3d_nms.hpp new file mode 100644 index 0000000000000..609b36717feab --- /dev/null +++ b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/iou3d_nms.hpp @@ -0,0 +1,74 @@ +// Copyright 2024 AutoCore, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/* +3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others) +Written by Shaoshuai Shi +All Rights Reserved 2019-2022. +*/ +#pragma once +#ifndef AUTOWARE__TENSORRT_BEVDET__IOU3D_NMS_HPP_ +#define AUTOWARE__TENSORRT_BEVDET__IOU3D_NMS_HPP_ + +#include "common.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +const int THREADS_PER_BLOCK = 16; +const int THREADS_PER_BLOCK_NMS = sizeof(std::int64_t) * 8; +const float EPS = 1e-8; + +class Iou3dNmsCuda +{ +public: + Iou3dNmsCuda(const int head_x_size, const int head_y_size, const float nms_overlap_thresh); + ~Iou3dNmsCuda() = default; + + int DoIou3dNms( + const int box_num_pre, const float * res_box, const int * res_sorted_indices, + std::int64_t * host_keep_data); + +private: + const int head_x_size_; + const int head_y_size_; + const float nms_overlap_thresh_; +}; + +static const int kBoxBlockSize = 9; + +struct Box +{ + float x; + float y; + float z; + float l; + float w; + float h; + float r; + float vx = 0.0f; // optional + float vy = 0.0f; // optional + float score; + int label; + bool is_drop; // for nms +}; +#endif // AUTOWARE__TENSORRT_BEVDET__IOU3D_NMS_HPP_ diff --git a/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/nvjpegdecoder.hpp b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/nvjpegdecoder.hpp new file mode 100644 index 0000000000000..dad94eb9c4af2 --- /dev/null +++ b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/nvjpegdecoder.hpp @@ -0,0 +1,118 @@ +// Copyright 2024 AutoCore, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#ifndef AUTOWARE__TENSORRT_BEVDET__NVJPEGDECODER_HPP_ +#define AUTOWARE__TENSORRT_BEVDET__NVJPEGDECODER_HPP_ + +#ifdef __HAVE_NVJPEG__ +#include "common.hpp" + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#define DECODE_RGB 3 +#define DECODE_BGR 4 +#define DECODE_RGBI 5 +#define DECODE_BGRI 6 +#define DECODE_UNCHANGED 0 + +#define CHECK_NVJPEG(call) \ + { \ + NvJpegAssert((call), __FILE__, __LINE__); \ + } + +inline void NvJpegAssert(nvjpegStatus_t code, const char * file, int line) +{ + if (code != NVJPEG_STATUS_SUCCESS) { + RCLCPP_ERROR( + rclcpp::get_logger("autoware_tensorrt_bevdet"), "NVJPEG failure: '#%d' at %s:%d", code, file, + line); + exit(1); + } +} + +struct decode_params_t +{ + decode_params_t() {} + + cudaStream_t stream; + + // used with decoupled API + nvjpegBufferPinned_t pinned_buffers[2]; // 2 buffers for pipelining + nvjpegBufferDevice_t device_buffer; + nvjpegJpegStream_t jpeg_streams[2]; // 2 streams for pipelining + + nvjpegDecodeParams_t nvjpeg_decode_params; + nvjpegJpegDecoder_t nvjpeg_decoder; + nvjpegJpegState_t nvjpeg_decoupled_state; +}; + +struct share_params +{ + share_params() { fmt = NVJPEG_OUTPUT_UNCHANGED; } + explicit share_params(size_t _fmt) + { + if ( + _fmt == DECODE_RGB || _fmt == DECODE_BGR || _fmt == DECODE_RGBI || _fmt == DECODE_BGRI || + _fmt == DECODE_UNCHANGED) { + fmt = (nvjpegOutputFormat_t)_fmt; + } else { + RCLCPP_WARN( + rclcpp::get_logger("autoware_tensorrt_bevdet"), "Unknown format! Auto switch BGR!"); + fmt = NVJPEG_OUTPUT_BGR; + } + } + + nvjpegOutputFormat_t fmt; + nvjpegJpegState_t nvjpeg_state; + nvjpegHandle_t nvjpeg_handle; + + bool hw_decode_available; +}; + +class nvjpegDecoder +{ +public: + nvjpegDecoder() {} + nvjpegDecoder(size_t n, size_t _fmt); + + int decode(const std::vector> & imgs_data, uchar * out_imgs); + int init(); + ~nvjpegDecoder(); + +private: + size_t N_img; + std::vector iout; + std::vector isz; + + std::vector widths; + std::vector heights; + + share_params share_param; + std::vector params; +}; + +#endif + +#endif // AUTOWARE__TENSORRT_BEVDET__NVJPEGDECODER_HPP_ diff --git a/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/postprocess.hpp b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/postprocess.hpp new file mode 100644 index 0000000000000..35a2fe0d44829 --- /dev/null +++ b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/postprocess.hpp @@ -0,0 +1,72 @@ +// Copyright 2024 AutoCore, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#ifndef AUTOWARE__TENSORRT_BEVDET__POSTPROCESS_HPP_ +#define AUTOWARE__TENSORRT_BEVDET__POSTPROCESS_HPP_ + +#include "common.hpp" +#include "iou3d_nms.hpp" + +#include +#include + +class PostprocessGPU +{ +public: + PostprocessGPU() {} + PostprocessGPU( + const int _class_num, const float _score_thresh, const float _nms_thresh, + const int _nms_pre_maxnum, const int _nms_post_maxnum, const int _down_sample, + const int _output_h, const int _output_w, const float _x_step, const float _y_step, + const float _x_start, const float _y_start, const std::vector & _class_num_pre_task, + const std::vector & _nms_rescale_factor); + + void DoPostprocess(void ** const bev_buffer, std::vector & out_detections); + ~PostprocessGPU(); + +private: + int class_num; + float score_thresh; + float nms_thresh; + int nms_pre_maxnum; + int nms_post_maxnum; + int down_sample; + int output_h; + int output_w; + float x_step; + float y_step; + float x_start; + float y_start; + int map_size; + int task_num; + + std::vector class_num_pre_task; + std::vector nms_rescale_factor; + + std::unique_ptr iou3d_nms; + + float * boxes_dev = nullptr; + float * score_dev = nullptr; + int * cls_dev = nullptr; + int * valid_box_num = nullptr; + int * sorted_indices_dev = nullptr; + std::int64_t * keep_data_host = nullptr; + int * sorted_indices_host = nullptr; + float * boxes_host = nullptr; + float * score_host = nullptr; + int * cls_host = nullptr; + + float * nms_rescale_factor_dev = nullptr; +}; + +#endif // AUTOWARE__TENSORRT_BEVDET__POSTPROCESS_HPP_ diff --git a/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/preprocess.hpp b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/preprocess.hpp new file mode 100644 index 0000000000000..1c09fc925d8b5 --- /dev/null +++ b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/preprocess.hpp @@ -0,0 +1,21 @@ +// Copyright 2024 AutoCore, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#ifndef AUTOWARE__TENSORRT_BEVDET__PREPROCESS_HPP_ +#define AUTOWARE__TENSORRT_BEVDET__PREPROCESS_HPP_ + +#include "common.hpp" + +void convert_RGBHWC_to_BGRCHW(uchar * input, uchar * output, int channels, int height, int width); + +#endif // AUTOWARE__TENSORRT_BEVDET__PREPROCESS_HPP_ diff --git a/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/preprocess_plugin.hpp b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/preprocess_plugin.hpp new file mode 100644 index 0000000000000..ed6bbf861ab22 --- /dev/null +++ b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/preprocess_plugin.hpp @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef AUTOWARE__TENSORRT_BEVDET__PREPROCESS_PLUGIN_HPP_ +#define AUTOWARE__TENSORRT_BEVDET__PREPROCESS_PLUGIN_HPP_ + +#include +#include + +#include +#include +#include +#include + +namespace +{ +static const char * PRE_PLUGIN_NAME{"Preprocess"}; +static const char * PRE_PLUGIN_VERSION{"1"}; +} // namespace + +namespace nvinfer1 +{ +class PreprocessPlugin : public IPluginV2DynamicExt +{ +private: + const std::string name_; + std::string namespace_; + struct + { + int crop_h; + int crop_w; + float resize_radio; + } m_; + +public: + PreprocessPlugin() = delete; + PreprocessPlugin(const std::string & name, int crop_h, int crop_w, float resize_radio); + PreprocessPlugin(const std::string & name, const void * buffer, size_t length); + ~PreprocessPlugin(); + + // Method inherited from IPluginV2 + const char * getPluginType() const noexcept override; + const char * getPluginVersion() const noexcept override; + int32_t getNbOutputs() const noexcept override; + int32_t initialize() noexcept override; + void terminate() noexcept override; + size_t getSerializationSize() const noexcept override; + void serialize(void * buffer) const noexcept override; + void destroy() noexcept override; + void setPluginNamespace(const char * pluginNamespace) noexcept override; + const char * getPluginNamespace() const noexcept override; + + // Method inherited from IPluginV2Ext + DataType getOutputDataType( + int32_t index, DataType const * inputTypes, int32_t nbInputs) const noexcept override; + void attachToContext( + cudnnContext * contextCudnn, cublasContext * contextCublas, + IGpuAllocator * gpuAllocator) noexcept override; + void detachFromContext() noexcept override; + + // Method inherited from IPluginV2DynamicExt + IPluginV2DynamicExt * clone() const noexcept override; + DimsExprs getOutputDimensions( + int32_t outputIndex, const DimsExprs * inputs, int32_t nbInputs, + IExprBuilder & exprBuilder) noexcept override; + bool supportsFormatCombination( + int32_t pos, const PluginTensorDesc * inOut, int32_t nbInputs, + int32_t nbOutputs) noexcept override; + void configurePlugin( + const DynamicPluginTensorDesc * in, int32_t nbInputs, const DynamicPluginTensorDesc * out, + int32_t nbOutputs) noexcept override; + size_t getWorkspaceSize( + const PluginTensorDesc * inputs, int32_t nbInputs, const PluginTensorDesc * outputs, + int32_t nbOutputs) const noexcept override; + int32_t enqueue( + const PluginTensorDesc * inputDesc, const PluginTensorDesc * outputDesc, + const void * const * inputs, void * const * outputs, void * workspace, + cudaStream_t stream) noexcept override; + +protected: + // To prevent compiler warnings + using nvinfer1::IPluginV2::enqueue; + using nvinfer1::IPluginV2::getOutputDimensions; + using nvinfer1::IPluginV2::getWorkspaceSize; + using nvinfer1::IPluginV2Ext::configurePlugin; +}; + +class PreprocessPluginCreator : public IPluginCreator +{ +private: + static PluginFieldCollection fc_; + static std::vector attr_; + std::string namespace_; + +public: + PreprocessPluginCreator(); + ~PreprocessPluginCreator(); + const char * getPluginName() const noexcept override; + const char * getPluginVersion() const noexcept override; + const PluginFieldCollection * getFieldNames() noexcept override; + IPluginV2DynamicExt * createPlugin( + const char * name, const PluginFieldCollection * fc) noexcept override; + IPluginV2DynamicExt * deserializePlugin( + const char * name, const void * serialData, size_t serialLength) noexcept override; + void setPluginNamespace(const char * pluginNamespace) noexcept override; + const char * getPluginNamespace() const noexcept override; +}; + +} // namespace nvinfer1 +#endif // AUTOWARE__TENSORRT_BEVDET__PREPROCESS_PLUGIN_HPP_ diff --git a/perception/autoware_tensorrt_bevdet/launch/tensorrt_bevdet.launch.xml b/perception/autoware_tensorrt_bevdet/launch/tensorrt_bevdet.launch.xml new file mode 100644 index 0000000000000..0017ec5748cc8 --- /dev/null +++ b/perception/autoware_tensorrt_bevdet/launch/tensorrt_bevdet.launch.xml @@ -0,0 +1,49 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/perception/autoware_tensorrt_bevdet/package.xml b/perception/autoware_tensorrt_bevdet/package.xml new file mode 100644 index 0000000000000..d73a11cf6bad9 --- /dev/null +++ b/perception/autoware_tensorrt_bevdet/package.xml @@ -0,0 +1,39 @@ + + + autoware_tensorrt_bevdet + 0.0.1 + tensorrt library implementation for bevdet + + Cynthia + Cynthia + Yuxuan Liu + Apache License 2.0 + + ament_cmake_auto + autoware_cmake + cudnn_cmake_module + tensorrt_cmake_module + + cudnn_cmake_module + tensorrt_cmake_module + + autoware_perception_msgs + cv_bridge + geometry_msgs + libopencv-dev + pcl_conversions + pcl_ros + rclcpp + rclcpp_components + sensor_msgs + tensorrt_common + tf2_geometry_msgs + yaml-cpp + + ament_lint_auto + autoware_lint_common + + + ament_cmake + + diff --git a/perception/autoware_tensorrt_bevdet/src/alignbev_plugin.cu b/perception/autoware_tensorrt_bevdet/src/alignbev_plugin.cu new file mode 100644 index 0000000000000..0321bf103f049 --- /dev/null +++ b/perception/autoware_tensorrt_bevdet/src/alignbev_plugin.cu @@ -0,0 +1,352 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "autoware/tensorrt_bevdet/alignbev_plugin.hpp" +#include "autoware/tensorrt_bevdet/common.hpp" + +#include +#include + +static inline __device__ bool within_bounds_2d(int h, int w, int H, int W) +{ + return h >= 0 && h < H && w >= 0 && w < W; +} + +template +__global__ void align_bev_kernel( + const int nthreads, const T1 * input, const float * trans, T2 * output, TensorDesc output_desc) +{ + int C = output_desc.shape[1]; // 80 + int out_H = output_desc.shape[2]; // 128 + int out_W = output_desc.shape[3]; // 128 + int out_sN = output_desc.stride[0]; // 80 * 128 * 128 + int out_sC = output_desc.stride[1]; // 128 * 128 + int out_sH = output_desc.stride[2]; // 128 + int out_sW = output_desc.stride[3]; // 1 + + CUDA_1D_KERNEL_LOOP(index, nthreads) + { + const int w = index % out_W; // j + const int h = (index / out_W) % out_H; // i + const int n = index / (out_H * out_W); // batch + + float ix = + trans[n * 6 + 0 * 3 + 0] * w + trans[n * 6 + 0 * 3 + 1] * h + trans[n * 6 + 0 * 3 + 2]; + float iy = + trans[n * 6 + 1 * 3 + 0] * w + trans[n * 6 + 1 * 3 + 1] * h + trans[n * 6 + 1 * 3 + 2]; + + // NE, NW, SE, SW point + int ix_nw = static_cast(::floor(ix)); + int iy_nw = static_cast(::floor(iy)); + int ix_ne = ix_nw + 1; + int iy_ne = iy_nw; + int ix_sw = ix_nw; + int iy_sw = iy_nw + 1; + int ix_se = ix_nw + 1; + int iy_se = iy_nw + 1; + + T2 nw = (ix_se - ix) * (iy_se - iy); + T2 ne = (ix - ix_sw) * (iy_sw - iy); + T2 sw = (ix_ne - ix) * (iy - iy_ne); + T2 se = (ix - ix_nw) * (iy - iy_nw); + + // bilinear + auto inp_ptr_NC = input + n * out_sN; + auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW; + for (int c = 0; c < C; ++c, inp_ptr_NC += out_sC, out_ptr_NCHW += out_sC) { + *out_ptr_NCHW = static_cast(0); + if (within_bounds_2d(iy_nw, ix_nw, out_H, out_W)) { + *out_ptr_NCHW += static_cast(inp_ptr_NC[iy_nw * out_sH + ix_nw * out_sW]) * nw; + } + if (within_bounds_2d(iy_ne, ix_ne, out_H, out_W)) { + *out_ptr_NCHW += static_cast(inp_ptr_NC[iy_ne * out_sH + ix_ne * out_sW]) * ne; + } + if (within_bounds_2d(iy_sw, ix_sw, out_H, out_W)) { + *out_ptr_NCHW += static_cast(inp_ptr_NC[iy_sw * out_sH + ix_sw * out_sW]) * sw; + } + if (within_bounds_2d(iy_se, ix_se, out_H, out_W)) { + *out_ptr_NCHW += static_cast(inp_ptr_NC[iy_se * out_sH + ix_se * out_sW]) * se; + } + } + } +} + +void create_desc(const int * dims, int nb_dims, TensorDesc & desc) +{ + memcpy(&desc.shape[0], dims, sizeof(int) * nb_dims); + desc.stride[nb_dims - 1] = 1; + for (int i = nb_dims - 2; i >= 0; --i) { + desc.stride[i] = desc.stride[i + 1] * desc.shape[i + 1]; + } +} + +namespace nvinfer1 +{ +// class AlignBEVPlugin +AlignBEVPlugin::AlignBEVPlugin(const std::string & name) : name_(name) +{ +} + +AlignBEVPlugin::AlignBEVPlugin(const std::string & name, const void * buffer, size_t length) +: name_(name) +{ + memcpy(&m_, buffer, sizeof(m_)); +} + +AlignBEVPlugin::~AlignBEVPlugin() +{ +} + +IPluginV2DynamicExt * AlignBEVPlugin::clone() const noexcept +{ + auto p = new AlignBEVPlugin(name_, &m_, sizeof(m_)); + p->setPluginNamespace(namespace_.c_str()); + return p; +} + +int32_t AlignBEVPlugin::getNbOutputs() const noexcept +{ + return 1; +} + +DataType AlignBEVPlugin::getOutputDataType( + int32_t index, DataType const * inputTypes, int32_t nbInputs) const noexcept +{ + return DataType::kFLOAT; // FIXME +} + +DimsExprs AlignBEVPlugin::getOutputDimensions( + int32_t outputIndex, const DimsExprs * inputs, int32_t nbInputs, + IExprBuilder & exprBuilder) noexcept +{ + return inputs[0]; +} + +bool AlignBEVPlugin::supportsFormatCombination( + int32_t pos, const PluginTensorDesc * inOut, int32_t nbInputs, int32_t nbOutputs) noexcept +{ + // adj_feat + if (pos == 0) { + return (inOut[pos].type == DataType::kFLOAT || inOut[pos].type == DataType::kHALF) && + inOut[pos].format == TensorFormat::kLINEAR; + } else if (pos == 2) { // out + return (inOut[pos].type == DataType::kFLOAT || inOut[pos].type == DataType::kHALF) && + inOut[pos].format == TensorFormat::kLINEAR; + } else if (pos == 1) { // transform + return inOut[pos].type == DataType::kFLOAT && inOut[pos].format == TensorFormat::kLINEAR; + } + return false; +} + +size_t AlignBEVPlugin::getWorkspaceSize( + const PluginTensorDesc * inputs, int32_t nbInputs, const PluginTensorDesc * outputs, + int32_t nbOutputs) const noexcept +{ + return 0; +} + +int32_t AlignBEVPlugin::enqueue( + const PluginTensorDesc * inputDesc, const PluginTensorDesc * outputDesc, + const void * const * inputs, void * const * outputs, void * workspace, + cudaStream_t stream) noexcept +{ + // TODO + // inputs[0] == adj_feat b x 8 x 80 x 128 x 128 + // inputs[1] == transform b x 8 x 6 + + int bev_channel = inputDesc[0].dims.d[2]; + int bev_h = inputDesc[0].dims.d[3]; + int bev_w = inputDesc[0].dims.d[4]; + int adj_num = inputDesc[0].dims.d[1]; + + int output_dim[4] = {8, bev_channel, bev_h, bev_w}; + + TensorDesc output_desc; + create_desc(output_dim, 4, output_desc); + + int count = 1; + for (int i = 0; i < 4; ++i) { + if (i == 1) { + continue; + } + count *= output_desc.shape[i]; + } + + switch (int(outputDesc[0].type)) { + case int(DataType::kFLOAT): + if (inputDesc[0].type == DataType::kFLOAT) { + // align : fp32, fp32; + align_bev_kernel<<>>( + count, reinterpret_cast(inputs[0]), + reinterpret_cast(inputs[1]), reinterpret_cast(outputs[0]), + output_desc); + } else { + // align : fp16, fp32 + align_bev_kernel<__half, float><<>>( + count, reinterpret_cast(inputs[0]), + reinterpret_cast(inputs[1]), reinterpret_cast(outputs[0]), + output_desc); + } + break; + case int(DataType::kHALF): + if (inputDesc[0].type == DataType::kFLOAT) { + // align : fp32, fp16 + align_bev_kernel<<>>( + count, reinterpret_cast(inputs[0]), + reinterpret_cast(inputs[1]), reinterpret_cast<__half *>(outputs[0]), + output_desc); + } else { + // align : fp16, fp16 + align_bev_kernel<__half, __half><<>>( + count, reinterpret_cast(inputs[0]), + reinterpret_cast(inputs[1]), reinterpret_cast<__half *>(outputs[0]), + output_desc); + } + break; + default: // should NOT be here + RCLCPP_ERROR(rclcpp::get_logger("AlignBEVPlugin"), "\tUnsupported datatype!"); + } + + return 0; +} + +void AlignBEVPlugin::destroy() noexcept +{ + delete this; + return; +} + +void AlignBEVPlugin::configurePlugin( + const DynamicPluginTensorDesc * in, int32_t nbInputs, const DynamicPluginTensorDesc * out, + int32_t nbOutputs) noexcept +{ + return; +} + +int32_t AlignBEVPlugin::initialize() noexcept +{ + return 0; +} + +void AlignBEVPlugin::terminate() noexcept +{ + return; +} + +size_t AlignBEVPlugin::getSerializationSize() const noexcept +{ + return sizeof(m_); +} + +void AlignBEVPlugin::serialize(void * buffer) const noexcept +{ + memcpy(buffer, &m_, sizeof(m_)); + return; +} + +void AlignBEVPlugin::setPluginNamespace(const char * pluginNamespace) noexcept +{ + namespace_ = pluginNamespace; + return; +} + +const char * AlignBEVPlugin::getPluginNamespace() const noexcept +{ + return namespace_.c_str(); +} + +const char * AlignBEVPlugin::getPluginType() const noexcept +{ + return ALIGN_PLUGIN_NAME; +} + +const char * AlignBEVPlugin::getPluginVersion() const noexcept +{ + return ALIGN_PLUGIN_VERSION; +} + +void AlignBEVPlugin::attachToContext( + cudnnContext * contextCudnn, cublasContext * contextCublas, IGpuAllocator * gpuAllocator) noexcept +{ + return; +} + +void AlignBEVPlugin::detachFromContext() noexcept +{ + return; +} + +// class AlignBEVPluginCreator +PluginFieldCollection AlignBEVPluginCreator::fc_{}; +std::vector AlignBEVPluginCreator::attr_; + +AlignBEVPluginCreator::AlignBEVPluginCreator() +{ + attr_.clear(); + fc_.nbFields = attr_.size(); + fc_.fields = attr_.data(); +} + +AlignBEVPluginCreator::~AlignBEVPluginCreator() +{ +} + +IPluginV2DynamicExt * AlignBEVPluginCreator::createPlugin( + const char * name, const PluginFieldCollection * fc) noexcept +{ + // const PluginField *fields = fc->fields; + AlignBEVPlugin * pObj = new AlignBEVPlugin(name); + pObj->setPluginNamespace(namespace_.c_str()); + return pObj; +} + +IPluginV2DynamicExt * AlignBEVPluginCreator::deserializePlugin( + const char * name, const void * serialData, size_t serialLength) noexcept +{ + AlignBEVPlugin * pObj = new AlignBEVPlugin(name, serialData, serialLength); + pObj->setPluginNamespace(namespace_.c_str()); + return pObj; +} + +void AlignBEVPluginCreator::setPluginNamespace(const char * pluginNamespace) noexcept +{ + namespace_ = pluginNamespace; + return; +} + +const char * AlignBEVPluginCreator::getPluginNamespace() const noexcept +{ + return namespace_.c_str(); +} + +const char * AlignBEVPluginCreator::getPluginName() const noexcept +{ + return ALIGN_PLUGIN_NAME; +} + +const char * AlignBEVPluginCreator::getPluginVersion() const noexcept +{ + return ALIGN_PLUGIN_VERSION; +} + +const PluginFieldCollection * AlignBEVPluginCreator::getFieldNames() noexcept +{ + return &fc_; +} + +REGISTER_TENSORRT_PLUGIN(AlignBEVPluginCreator); + +} // namespace nvinfer1 diff --git a/perception/autoware_tensorrt_bevdet/src/bevdet.cpp b/perception/autoware_tensorrt_bevdet/src/bevdet.cpp new file mode 100644 index 0000000000000..0296a829f1e35 --- /dev/null +++ b/perception/autoware_tensorrt_bevdet/src/bevdet.cpp @@ -0,0 +1,687 @@ +// Copyright 2024 AutoCore, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// cspell:ignore BEVDET, thre, TRTBEV, bevdet, caminfo, intrin, Ncams, bevfeat, bevpool, maxnum, +// zgrid cspell:ignore gridbev, egobev, adjgrid, currgrid +#include "autoware/tensorrt_bevdet/bevdet.hpp" + +#include "autoware/tensorrt_bevdet/alignbev_plugin.hpp" +#include "autoware/tensorrt_bevdet/bevpool_plugin.hpp" +#include "autoware/tensorrt_bevdet/gatherbev_plugin.hpp" +#include "autoware/tensorrt_bevdet/preprocess_plugin.hpp" + +#include +#include + +#include +#include +#include +#include + +using std::chrono::duration; +using std::chrono::high_resolution_clock; + +namespace autoware::tensorrt_bevdet +{ +BEVDet::BEVDet( + const std::string & config_file, int n_img, std::vector cams_intrin, + std::vector> cams2ego_rot, + std::vector cams2ego_trans, const std::string & onnx_file, + const std::string & engine_file) +: cams_intrin_(cams_intrin), cams2ego_rot_(cams2ego_rot), cams2ego_trans_(cams2ego_trans) +{ + initParams(config_file); + if (n_img != n_img_) { + RCLCPP_ERROR( + rclcpp::get_logger("BEVDet"), "BEVDet need %d images, but given %d images!", n_img_, n_img); + } + auto start = high_resolution_clock::now(); + + std::vector ranks_bev; + std::vector ranks_depth; + std::vector ranks_feat; + std::vector interval_starts; + std::vector interval_lengths; + + initViewTransformer(ranks_bev, ranks_depth, ranks_feat, interval_starts, interval_lengths); + auto end = high_resolution_clock::now(); + duration t = end - start; + RCLCPP_INFO( + rclcpp::get_logger("BEVDet"), "InitVewTransformer cost time : %.4lf ms", t.count() * 1000); + + if (access(engine_file.c_str(), F_OK) == 0) { + RCLCPP_INFO(rclcpp::get_logger("BEVDet"), "Inference engine prepared."); + } else { + // onnx to engine + RCLCPP_WARN( + rclcpp::get_logger("BEVDet"), "Could not find %s, try making TensorRT engine from onnx", + engine_file.c_str()); + exportEngine(onnx_file, engine_file); + } + initEngine(engine_file); // FIXME + mallocDeviceMemory(); + + if (use_adj_) { + adj_frame_ptr_.reset(new AdjFrame(adj_num_, trt_buffer_sizes_[buffer_map_["curr_bevfeat"]])); + } + + cam_params_host_.resize(n_img_ * cam_params_size_); + + CHECK_CUDA(cudaMemcpy( + trt_buffer_dev_[buffer_map_["ranks_bev"]], ranks_bev.data(), valid_feat_num_ * sizeof(int), + cudaMemcpyHostToDevice)); + CHECK_CUDA(cudaMemcpy( + trt_buffer_dev_[buffer_map_["ranks_depth"]], ranks_depth.data(), valid_feat_num_ * sizeof(int), + cudaMemcpyHostToDevice)); + CHECK_CUDA(cudaMemcpy( + trt_buffer_dev_[buffer_map_["ranks_feat"]], ranks_feat.data(), valid_feat_num_ * sizeof(int), + cudaMemcpyHostToDevice)); + CHECK_CUDA(cudaMemcpy( + trt_buffer_dev_[buffer_map_["interval_starts"]], interval_starts.data(), + unique_bev_num_ * sizeof(int), cudaMemcpyHostToDevice)); + CHECK_CUDA(cudaMemcpy( + trt_buffer_dev_[buffer_map_["interval_lengths"]], interval_lengths.data(), + unique_bev_num_ * sizeof(int), cudaMemcpyHostToDevice)); + + CHECK_CUDA(cudaMemcpy( + trt_buffer_dev_[buffer_map_["mean"]], mean_.data(), 3 * sizeof(float), cudaMemcpyHostToDevice)); + CHECK_CUDA(cudaMemcpy( + trt_buffer_dev_[buffer_map_["std"]], std_.data(), 3 * sizeof(float), cudaMemcpyHostToDevice)); +} + +void BEVDet::initCamParams( + const std::vector> & curr_cams2ego_rot, + const std::vector & curr_cams2ego_trans, + const std::vector & curr_cams_intrin) +{ + for (int i = 0; i < n_img_; i++) { + cam_params_host_[i * cam_params_size_ + 0] = curr_cams_intrin[i](0, 0); + cam_params_host_[i * cam_params_size_ + 1] = curr_cams_intrin[i](1, 1); + cam_params_host_[i * cam_params_size_ + 2] = curr_cams_intrin[i](0, 2); + cam_params_host_[i * cam_params_size_ + 3] = curr_cams_intrin[i](1, 2); + cam_params_host_[i * cam_params_size_ + 4] = post_rot_(0, 0); + cam_params_host_[i * cam_params_size_ + 5] = post_rot_(0, 1); + cam_params_host_[i * cam_params_size_ + 6] = post_trans_.translation()(0); + cam_params_host_[i * cam_params_size_ + 7] = post_rot_(1, 0); + cam_params_host_[i * cam_params_size_ + 8] = post_rot_(1, 1); + cam_params_host_[i * cam_params_size_ + 9] = post_trans_.translation()(1); + cam_params_host_[i * cam_params_size_ + 10] = 1.f; // bda 0 0 + cam_params_host_[i * cam_params_size_ + 11] = 0.f; // bda 0 1 + cam_params_host_[i * cam_params_size_ + 12] = 0.f; // bda 1 0 + cam_params_host_[i * cam_params_size_ + 13] = 1.f; // bda 1 1 + cam_params_host_[i * cam_params_size_ + 14] = 1.f; // bda 2 2 + cam_params_host_[i * cam_params_size_ + 15] = curr_cams2ego_rot[i].matrix()(0, 0); + cam_params_host_[i * cam_params_size_ + 16] = curr_cams2ego_rot[i].matrix()(0, 1); + cam_params_host_[i * cam_params_size_ + 17] = curr_cams2ego_rot[i].matrix()(0, 2); + cam_params_host_[i * cam_params_size_ + 18] = curr_cams2ego_trans[i].translation()(0); + cam_params_host_[i * cam_params_size_ + 19] = curr_cams2ego_rot[i].matrix()(1, 0); + cam_params_host_[i * cam_params_size_ + 20] = curr_cams2ego_rot[i].matrix()(1, 1); + cam_params_host_[i * cam_params_size_ + 21] = curr_cams2ego_rot[i].matrix()(1, 2); + cam_params_host_[i * cam_params_size_ + 22] = curr_cams2ego_trans[i].translation()(1); + cam_params_host_[i * cam_params_size_ + 23] = curr_cams2ego_rot[i].matrix()(2, 0); + cam_params_host_[i * cam_params_size_ + 24] = curr_cams2ego_rot[i].matrix()(2, 1); + cam_params_host_[i * cam_params_size_ + 25] = curr_cams2ego_rot[i].matrix()(2, 2); + cam_params_host_[i * cam_params_size_ + 26] = curr_cams2ego_trans[i].translation()(2); + } + CHECK_CUDA(cudaMemcpy( + trt_buffer_dev_[buffer_map_["cam_params"]], cam_params_host_.data(), + trt_buffer_sizes_[buffer_map_["cam_params"]], cudaMemcpyHostToDevice)); +} + +void BEVDet::initParams(const std::string & config_file) +{ + mean_ = std::vector(3); + std_ = std::vector(3); + + YAML::Node model_config = YAML::LoadFile(config_file); + n_img_ = model_config["data_config"]["Ncams"].as(); + src_img_h_ = model_config["data_config"]["src_size"][0].as(); + src_img_w_ = model_config["data_config"]["src_size"][1].as(); + input_img_h_ = model_config["data_config"]["input_size"][0].as(); + input_img_w_ = model_config["data_config"]["input_size"][1].as(); + resize_radio_ = model_config["data_config"]["resize_radio"].as(); + crop_h_ = model_config["data_config"]["crop"][0].as(); + crop_w_ = model_config["data_config"]["crop"][1].as(); + mean_[0] = model_config["mean"][0].as(); + mean_[1] = model_config["mean"][1].as(); + mean_[2] = model_config["mean"][2].as(); + std_[0] = model_config["std"][0].as(); + std_[1] = model_config["std"][1].as(); + std_[2] = model_config["std"][2].as(); + down_sample_ = model_config["model"]["down_sample"].as(); + depth_start_ = model_config["grid_config"]["depth"][0].as(); + depth_end_ = model_config["grid_config"]["depth"][1].as(); + depth_step_ = model_config["grid_config"]["depth"][2].as(); + x_start_ = model_config["grid_config"]["x"][0].as(); + x_end_ = model_config["grid_config"]["x"][1].as(); + x_step_ = model_config["grid_config"]["x"][2].as(); + y_start_ = model_config["grid_config"]["y"][0].as(); + y_end_ = model_config["grid_config"]["y"][1].as(); + y_step_ = model_config["grid_config"]["y"][2].as(); + z_start_ = model_config["grid_config"]["z"][0].as(); + z_end_ = model_config["grid_config"]["z"][1].as(); + z_step_ = model_config["grid_config"]["z"][2].as(); + bevpool_channel_ = model_config["model"]["bevpool_channels"].as(); + nms_pre_maxnum_ = model_config["test_cfg"]["max_per_img"].as(); + nms_post_maxnum_ = model_config["test_cfg"]["post_max_size"].as(); + score_thresh_ = model_config["test_cfg"]["score_threshold"].as(); + nms_overlap_thresh_ = model_config["test_cfg"]["nms_thr"][0].as(); + use_depth_ = model_config["use_depth"].as(); + use_adj_ = model_config["use_adj"].as(); + transform_size_ = model_config["transform_size"].as(); + cam_params_size_ = model_config["cam_params_size"].as(); + + std::vector> nms_factor_temp = + model_config["test_cfg"]["nms_rescale_factor"].as>>(); + nms_rescale_factor_.clear(); + for (const auto & task_factors : nms_factor_temp) { + for (float factor : task_factors) { + nms_rescale_factor_.push_back(factor); + } + } + + std::vector> class_name_pre_task; + class_num_ = 0; + YAML::Node tasks = model_config["model"]["tasks"]; + class_num_pre_task_ = std::vector(); + for (auto it : tasks) { + int num = it["num_class"].as(); + class_num_pre_task_.push_back(num); + class_num_ += num; + class_name_pre_task.push_back(it["class_names"].as>()); + } + + YAML::Node common_head_channel = model_config["model"]["common_head"]["channels"]; + YAML::Node common_head_name = model_config["model"]["common_head"]["names"]; + for (size_t i = 0; i < common_head_channel.size(); i++) { + out_num_task_head_[common_head_name[i].as()] = common_head_channel[i].as(); + } + + feat_h_ = input_img_h_ / down_sample_; + feat_w_ = input_img_w_ / down_sample_; + depth_num_ = (depth_end_ - depth_start_) / depth_step_; + xgrid_num_ = (x_end_ - x_start_) / x_step_; + ygrid_num_ = (y_end_ - y_start_) / y_step_; + zgrid_num_ = (z_end_ - z_start_) / z_step_; + bev_h_ = ygrid_num_; + bev_w_ = xgrid_num_; + + post_rot_ << resize_radio_, 0, 0, 0, resize_radio_, 0, 0, 0, 1; + post_trans_.translation() << -crop_w_, -crop_h_, 0; + + adj_num_ = 0; + if (use_adj_) { + adj_num_ = model_config["adj_num"].as(); + } + + postprocess_ptr_.reset(new PostprocessGPU( + class_num_, score_thresh_, nms_overlap_thresh_, nms_pre_maxnum_, nms_post_maxnum_, down_sample_, + bev_h_, bev_w_, x_step_, y_step_, x_start_, y_start_, class_num_pre_task_, + nms_rescale_factor_)); +} + +void BEVDet::mallocDeviceMemory() +{ + trt_buffer_sizes_.resize(trt_engine_->getNbBindings()); + trt_buffer_dev_ = reinterpret_cast(new void *[trt_engine_->getNbBindings()]); + for (int i = 0; i < trt_engine_->getNbBindings(); i++) { + nvinfer1::Dims32 dim = trt_context_->getBindingDimensions(i); + size_t size = 1; + for (int j = 0; j < dim.nbDims; j++) { + size *= dim.d[j]; + } + size *= dataTypeToSize(trt_engine_->getBindingDataType(i)); + trt_buffer_sizes_[i] = size; + CHECK_CUDA(cudaMalloc(&trt_buffer_dev_[i], size)); + } + + RCLCPP_INFO(rclcpp::get_logger("BEVDet"), "img num binding : %d", trt_engine_->getNbBindings()); + + post_buffer_ = reinterpret_cast(new void *[class_num_pre_task_.size() * 6]); + for (size_t i = 0; i < class_num_pre_task_.size(); i++) { + post_buffer_[i * 6 + 0] = trt_buffer_dev_[buffer_map_["reg_" + std::to_string(i)]]; + post_buffer_[i * 6 + 1] = trt_buffer_dev_[buffer_map_["height_" + std::to_string(i)]]; + post_buffer_[i * 6 + 2] = trt_buffer_dev_[buffer_map_["dim_" + std::to_string(i)]]; + post_buffer_[i * 6 + 3] = trt_buffer_dev_[buffer_map_["rot_" + std::to_string(i)]]; + post_buffer_[i * 6 + 4] = trt_buffer_dev_[buffer_map_["vel_" + std::to_string(i)]]; + post_buffer_[i * 6 + 5] = trt_buffer_dev_[buffer_map_["heatmap_" + std::to_string(i)]]; + } + + return; +} + +void BEVDet::initViewTransformer( + std::vector & ranks_bev, std::vector & ranks_depth, std::vector & ranks_feat, + std::vector & interval_starts, std::vector & interval_lengths) +{ + int num_points = n_img_ * depth_num_ * feat_h_ * feat_w_; + std::vector frustum(num_points); + + for (int i = 0; i < n_img_; i++) { + for (int d_ = 0; d_ < depth_num_; d_++) { + for (int h_ = 0; h_ < feat_h_; h_++) { + for (int w_ = 0; w_ < feat_w_; w_++) { + int offset = + i * depth_num_ * feat_h_ * feat_w_ + d_ * feat_h_ * feat_w_ + h_ * feat_w_ + w_; + frustum[offset].x() = static_cast(w_) * (input_img_w_ - 1) / (feat_w_ - 1); + frustum[offset].y() = static_cast(h_) * (input_img_h_ - 1) / (feat_h_ - 1); + frustum[offset].z() = static_cast(d_) * depth_step_ + depth_start_; + + // eliminate post transformation + frustum[offset] -= post_trans_.translation(); + frustum[offset] = post_rot_.inverse() * frustum[offset]; + // + frustum[offset].x() *= frustum[offset].z(); + frustum[offset].y() *= frustum[offset].z(); + // img to ego -> rot -> trans + frustum[offset] = cams2ego_rot_[i] * cams_intrin_[i].inverse() * frustum[offset] + + cams2ego_trans_[i].translation(); + + // voxelization + frustum[offset] -= Eigen::Vector3f(x_start_, y_start_, z_start_); + frustum[offset].x() = static_cast(frustum[offset].x() / x_step_); + frustum[offset].y() = static_cast(frustum[offset].y() / y_step_); + frustum[offset].z() = static_cast(frustum[offset].z() / z_step_); + } + } + } + } + + std::vector _ranks_depth(num_points); + std::vector _ranks_feat(num_points); + + for (int i = 0; i < num_points; i++) { + _ranks_depth[i] = i; + } + for (int i = 0; i < n_img_; i++) { + for (int d_ = 0; d_ < depth_num_; d_++) { + for (int u = 0; u < feat_h_ * feat_w_; u++) { + int offset = i * (depth_num_ * feat_h_ * feat_w_) + d_ * (feat_h_ * feat_w_) + u; + _ranks_feat[offset] = i * feat_h_ * feat_w_ + u; + } + } + } + + std::vector kept; + for (int i = 0; i < num_points; i++) { + if ( + static_cast(frustum[i].x()) >= 0 && static_cast(frustum[i].x()) < xgrid_num_ && + static_cast(frustum[i].y()) >= 0 && static_cast(frustum[i].y()) < ygrid_num_ && + static_cast(frustum[i].z()) >= 0 && static_cast(frustum[i].z()) < zgrid_num_) { + kept.push_back(i); + } + } + + valid_feat_num_ = kept.size(); + ; + ranks_depth.resize(valid_feat_num_); + ranks_feat.resize(valid_feat_num_); + ranks_bev.resize(valid_feat_num_); + std::vector order(valid_feat_num_); + + for (int i = 0; i < valid_feat_num_; i++) { + Eigen::Vector3f & p = frustum[kept[i]]; + ranks_bev[i] = static_cast(p.z()) * xgrid_num_ * ygrid_num_ + + static_cast(p.y()) * xgrid_num_ + static_cast(p.x()); + order[i] = i; + } + + thrust::sort_by_key(ranks_bev.begin(), ranks_bev.end(), order.begin()); + for (int i = 0; i < valid_feat_num_; i++) { + ranks_depth[i] = _ranks_depth[kept[order[i]]]; + ranks_feat[i] = _ranks_feat[kept[order[i]]]; + } + + interval_starts.push_back(0); + int len = 1; + for (int i = 1; i < valid_feat_num_; i++) { + if (ranks_bev[i] != ranks_bev[i - 1]) { + interval_starts.push_back(i); + interval_lengths.push_back(len); + len = 1; + } else { + len++; + } + } + + interval_lengths.push_back(len); + unique_bev_num_ = interval_lengths.size(); + + RCLCPP_INFO(rclcpp::get_logger("BEVDet"), "valid_feat_num: %d", valid_feat_num_); + RCLCPP_INFO(rclcpp::get_logger("BEVDet"), "unique_bev_num: %d", unique_bev_num_); +} + +void print_dim(nvinfer1::Dims dim, const std::string & name) +{ + std::ostringstream oss; + oss << name << " : "; + for (auto i = 0; i < dim.nbDims; i++) { + oss << dim.d[i] << ' '; + } + RCLCPP_INFO(rclcpp::get_logger("BEVDet"), "%s", oss.str().c_str()); +} + +int BEVDet::initEngine(const std::string & engine_file) +{ + if (deserializeTRTEngine(engine_file, &trt_engine_)) { + return EXIT_FAILURE; + } + + if (trt_engine_ == nullptr) { + RCLCPP_ERROR(rclcpp::get_logger("BEVDet"), "Failed to deserialize engine file!"); + return EXIT_FAILURE; + } + trt_context_ = trt_engine_->createExecutionContext(); + + if (trt_context_ == nullptr) { + RCLCPP_ERROR(rclcpp::get_logger("BEVDet"), "Failed to create TensorRT Execution Context!"); + return EXIT_FAILURE; + } + + // set bindings + std::vector shapes{ + {4, {n_img_, 3, src_img_h_, src_img_w_ / 4}}, + {1, {3}}, + {1, {3}}, + {3, {1, n_img_, cam_params_size_}}, + {1, {valid_feat_num_}}, + {1, {valid_feat_num_}}, + {1, {valid_feat_num_}}, + {1, {unique_bev_num_}}, + {1, {unique_bev_num_}}, + {5, {1, adj_num_, bevpool_channel_, bev_h_, bev_w_}}, + {3, {1, adj_num_, transform_size_}}, + {2, {1, 1}}}; + + for (size_t i = 0; i < shapes.size(); i++) { + trt_context_->setBindingDimensions(i, shapes[i]); + } + + buffer_map_.clear(); + for (auto i = 0; i < trt_engine_->getNbBindings(); i++) { + auto dim = trt_context_->getBindingDimensions(i); + auto name = trt_engine_->getBindingName(i); + buffer_map_[name] = i; + print_dim(dim, name); + } + + return EXIT_SUCCESS; +} + +int BEVDet::deserializeTRTEngine( + const std::string & engine_file, nvinfer1::ICudaEngine ** engine_ptr) +{ + std::stringstream engine_stream; + engine_stream.seekg(0, engine_stream.beg); + + std::ifstream file(engine_file); + engine_stream << file.rdbuf(); + file.close(); + + nvinfer1::IRuntime * runtime = nvinfer1::createInferRuntime(g_logger_); + if (runtime == nullptr) { + std::string msg("Failed to build runtime parser!"); + g_logger_.log(nvinfer1::ILogger::Severity::kERROR, msg.c_str()); + return EXIT_FAILURE; + } + engine_stream.seekg(0, std::ios::end); + const int engine_size = engine_stream.tellg(); + + engine_stream.seekg(0, std::ios::beg); + void * engine_str = malloc(engine_size); + engine_stream.read(reinterpret_cast(engine_str), engine_size); + + nvinfer1::ICudaEngine * engine = runtime->deserializeCudaEngine(engine_str, engine_size, NULL); + if (engine == nullptr) { + std::string msg("Failed to build engine parser!"); + g_logger_.log(nvinfer1::ILogger::Severity::kERROR, msg.c_str()); + return EXIT_FAILURE; + } + *engine_ptr = engine; + for (int bi = 0; bi < engine->getNbBindings(); bi++) { + if (engine->bindingIsInput(bi) == true) { + RCLCPP_INFO( + rclcpp::get_logger("BEVDet"), "Binding %d (%s): Input.", bi, engine->getBindingName(bi)); + } else { + RCLCPP_INFO( + rclcpp::get_logger("BEVDet"), "Binding %d (%s): Output.", bi, engine->getBindingName(bi)); + } + } + return EXIT_SUCCESS; +} + +void BEVDet::getAdjBEVFeature( + const std::string & curr_scene_token, const Eigen::Quaternion & ego2global_rot, + const Eigen::Translation3f & ego2global_trans) +{ + int flag = 1; + if (adj_frame_ptr_->lastScenesToken() != curr_scene_token) { + adj_frame_ptr_->reset(); + flag = 0; + } + + // the smaller the idx, the newer th adj_bevfeat + for (int i = 0; i < adj_num_; i++) { + const void * adj_buffer = adj_frame_ptr_->getFrameBuffer(i); + + size_t buf_size = trt_buffer_sizes_[buffer_map_["adj_feats"]] / adj_num_; + + CHECK_CUDA(cudaMemcpy( + reinterpret_cast(trt_buffer_dev_[buffer_map_["adj_feats"]]) + i * buf_size, + adj_buffer, buf_size, cudaMemcpyDeviceToDevice)); + + Eigen::Quaternion adj_ego2global_rot; + Eigen::Translation3f adj_ego2global_trans; + adj_frame_ptr_->getEgo2Global(i, adj_ego2global_rot, adj_ego2global_trans); + + getCurr2AdjTransform( + ego2global_rot, adj_ego2global_rot, ego2global_trans, adj_ego2global_trans, + reinterpret_cast(trt_buffer_dev_[buffer_map_["transforms"]]) + i * transform_size_); + } + CHECK_CUDA(cudaMemcpy( + trt_buffer_dev_[buffer_map_["flag"]], &flag, trt_buffer_sizes_[buffer_map_["flag"]], + cudaMemcpyHostToDevice)); +} + +void BEVDet::getCurr2AdjTransform( + const Eigen::Quaternion & curr_ego2global_rot, + const Eigen::Quaternion & adj_ego2global_rot, + const Eigen::Translation3f & curr_ego2global_trans, + const Eigen::Translation3f & adj_ego2global_trans, float * transform_dev) +{ + Eigen::Matrix4f curr_e2g_transform; + Eigen::Matrix4f adj_e2g_transform; + + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 3; j++) { + curr_e2g_transform(i, j) = curr_ego2global_rot.matrix()(i, j); + adj_e2g_transform(i, j) = adj_ego2global_rot.matrix()(i, j); + } + } + for (int i = 0; i < 3; i++) { + curr_e2g_transform(i, 3) = curr_ego2global_trans.vector()(i); + adj_e2g_transform(i, 3) = adj_ego2global_trans.vector()(i); + + curr_e2g_transform(3, i) = 0.f; + adj_e2g_transform(3, i) = 0.f; + } + curr_e2g_transform(3, 3) = 1.f; + adj_e2g_transform(3, 3) = 1.f; + + Eigen::Matrix4f currEgo2adjEgo = adj_e2g_transform.inverse() * curr_e2g_transform; + Eigen::Matrix3f currEgo2adjEgo_2d; + for (int i = 0; i < 2; i++) { + for (int j = 0; j < 2; j++) { + currEgo2adjEgo_2d(i, j) = currEgo2adjEgo(i, j); + } + } + currEgo2adjEgo_2d(2, 0) = 0.f; + currEgo2adjEgo_2d(2, 1) = 0.f; + currEgo2adjEgo_2d(2, 2) = 1.f; + currEgo2adjEgo_2d(0, 2) = currEgo2adjEgo(0, 3); + currEgo2adjEgo_2d(1, 2) = currEgo2adjEgo(1, 3); + + Eigen::Matrix3f gridbev2egobev; + gridbev2egobev(0, 0) = x_step_; + gridbev2egobev(1, 1) = y_step_; + gridbev2egobev(0, 2) = x_start_; + gridbev2egobev(1, 2) = y_start_; + gridbev2egobev(2, 2) = 1.f; + + gridbev2egobev(0, 1) = 0.f; + gridbev2egobev(1, 0) = 0.f; + gridbev2egobev(2, 0) = 0.f; + gridbev2egobev(2, 1) = 0.f; + + Eigen::Matrix3f currgrid2adjgrid = gridbev2egobev.inverse() * currEgo2adjEgo_2d * gridbev2egobev; + + CHECK_CUDA(cudaMemcpy( + transform_dev, Eigen::Matrix3f(currgrid2adjgrid.transpose()).data(), + transform_size_ * sizeof(float), cudaMemcpyHostToDevice)); +} + +int BEVDet::doInfer( + const camsData & cam_data, std::vector & out_detections, float & cost_time, int idx) +{ + RCLCPP_DEBUG(rclcpp::get_logger("BEVDet"), "---------%d---------", idx + 1); + + auto start = high_resolution_clock::now(); + CHECK_CUDA(cudaMemcpy( + trt_buffer_dev_[buffer_map_["images"]], cam_data.imgs_dev, + trt_buffer_sizes_[buffer_map_["images"]], cudaMemcpyDeviceToDevice)); + + initCamParams( + cam_data.param.cams2ego_rot, cam_data.param.cams2ego_trans, cam_data.param.cams_intrin); + + getAdjBEVFeature( + cam_data.param.scene_token, cam_data.param.ego2global_rot, cam_data.param.ego2global_trans); + + if (!trt_context_->executeV2(trt_buffer_dev_)) { + RCLCPP_ERROR(rclcpp::get_logger("BEVDet"), "BEVDet forward failing!"); + } + + adj_frame_ptr_->saveFrameBuffer( + trt_buffer_dev_[buffer_map_["curr_bevfeat"]], cam_data.param.scene_token, + cam_data.param.ego2global_rot, cam_data.param.ego2global_trans); + + auto end = high_resolution_clock::now(); + + postprocess_ptr_->DoPostprocess(post_buffer_, out_detections); + CHECK_CUDA(cudaDeviceSynchronize()); + + auto post_end = high_resolution_clock::now(); + + duration infer_t = post_end - start; + duration engine_t = end - start; + duration post_t = post_end - end; + + cost_time = infer_t.count() * 1000; + RCLCPP_DEBUG(rclcpp::get_logger("BEVDet"), "Inference time: %.5lf ms", engine_t.count() * 1000); + RCLCPP_DEBUG(rclcpp::get_logger("BEVDet"), "Postprocess time: %.5lf ms", post_t.count() * 1000); + RCLCPP_DEBUG(rclcpp::get_logger("BEVDet"), "Total time: %.5lf ms", infer_t.count() * 1000); + RCLCPP_DEBUG(rclcpp::get_logger("BEVDet"), "Detect %ld objects", out_detections.size()); + + return EXIT_SUCCESS; +} + +void BEVDet::exportEngine(const std::string & onnxFile, const std::string & trtFile) +{ + CHECK_CUDA(cudaSetDevice(0)); + nvinfer1::ICudaEngine * engine = nullptr; + std::vector min_shapes{ + {4, {6, 3, 900, 400}}, {1, {3}}, {1, {3}}, {3, {1, 6, 27}}, {1, {200000}}, + {1, {200000}}, {1, {200000}}, {1, {8000}}, {1, {8000}}, {5, {1, 8, 80, 128, 128}}, + {3, {1, 8, 6}}, {2, {1, 1}}}; + + std::vector opt_shapes{ + {4, {6, 3, 900, 400}}, {1, {3}}, {1, {3}}, {3, {1, 6, 27}}, {1, {356760}}, + {1, {356760}}, {1, {356760}}, {1, {13360}}, {1, {13360}}, {5, {1, 8, 80, 128, 128}}, + {3, {1, 8, 6}}, {2, {1, 1}}}; + + std::vector max_shapes{ + {4, {6, 3, 900, 400}}, {1, {3}}, {1, {3}}, {3, {1, 6, 27}}, {1, {370000}}, + {1, {370000}}, {1, {370000}}, {1, {14000}}, {1, {14000}}, {5, {1, 8, 80, 128, 128}}, + {3, {1, 8, 6}}, {2, {1, 1}}}; + nvinfer1::IBuilder * builder = nvinfer1::createInferBuilder(g_logger_); + nvinfer1::INetworkDefinition * network = + builder->createNetworkV2(1U << int(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); + nvinfer1::IOptimizationProfile * profile = builder->createOptimizationProfile(); + nvinfer1::IBuilderConfig * config = builder->createBuilderConfig(); + config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 3UL << 32UL); + + config->setFlag(nvinfer1::BuilderFlag::kFP16); + + nvonnxparser::IParser * parser = nvonnxparser::createParser(*network, g_logger_); + + if (!parser->parseFromFile(onnxFile.c_str(), static_cast(g_logger_.reportable_severity))) { + RCLCPP_ERROR(rclcpp::get_logger("BEVDet"), "Failed parsing .onnx file!"); + for (int i = 0; i < parser->getNbErrors(); ++i) { + auto * error = parser->getError(i); + RCLCPP_ERROR( + rclcpp::get_logger("BEVDet"), "Error code: %d, Description: %s", + static_cast(error->code()), error->desc()); + } + + return; + } + RCLCPP_INFO(rclcpp::get_logger("BEVDet"), "Succeeded parsing .onnx file!"); + + for (size_t i = 0; i < min_shapes.size(); i++) { + nvinfer1::ITensor * it = network->getInput(i); + profile->setDimensions(it->getName(), nvinfer1::OptProfileSelector::kMIN, min_shapes[i]); + profile->setDimensions(it->getName(), nvinfer1::OptProfileSelector::kOPT, opt_shapes[i]); + profile->setDimensions(it->getName(), nvinfer1::OptProfileSelector::kMAX, max_shapes[i]); + } + config->addOptimizationProfile(profile); + + nvinfer1::IHostMemory * engineString = builder->buildSerializedNetwork(*network, *config); + if (engineString == nullptr || engineString->size() == 0) { + RCLCPP_ERROR(rclcpp::get_logger("BEVDet"), "Failed building serialized engine!"); + return; + } + RCLCPP_INFO(rclcpp::get_logger("BEVDet"), "Succeeded building serialized engine!"); + + nvinfer1::IRuntime * runtime{nvinfer1::createInferRuntime(g_logger_)}; + engine = runtime->deserializeCudaEngine(engineString->data(), engineString->size()); + if (engine == nullptr) { + RCLCPP_ERROR(rclcpp::get_logger("BEVDet"), "Failed building engine!"); + return; + } + RCLCPP_INFO(rclcpp::get_logger("BEVDet"), "Succeeded building engine!"); + + std::ofstream engineFile(trtFile, std::ios::binary); + if (!engineFile) { + RCLCPP_ERROR(rclcpp::get_logger("BEVDet"), "Failed opening file to write"); + return; + } + engineFile.write(static_cast(engineString->data()), engineString->size()); + if (engineFile.fail()) { + RCLCPP_ERROR(rclcpp::get_logger("BEVDet"), "Failed saving .engine file!"); + return; + } + RCLCPP_INFO(rclcpp::get_logger("BEVDet"), "Succeeded saving .engine file!"); +} + +BEVDet::~BEVDet() +{ + for (int i = 0; i < trt_engine_->getNbBindings(); i++) { + CHECK_CUDA(cudaFree(trt_buffer_dev_[i])); + } + delete[] trt_buffer_dev_; + delete[] post_buffer_; + + trt_context_->destroy(); + trt_engine_->destroy(); +} +} // namespace autoware::tensorrt_bevdet diff --git a/perception/autoware_tensorrt_bevdet/src/bevdet_node.cpp b/perception/autoware_tensorrt_bevdet/src/bevdet_node.cpp new file mode 100644 index 0000000000000..46bac289cd501 --- /dev/null +++ b/perception/autoware_tensorrt_bevdet/src/bevdet_node.cpp @@ -0,0 +1,342 @@ +// Copyright 2024 AutoCore, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// cspell:ignore BEVDET, thre, TRTBEV, bevdet, caminfo, intrin, Ncams, bevfeat, dlongterm, RGBHWC, +// BGRCHW +#include "autoware/tensorrt_bevdet/bevdet_node.hpp" + +#include "autoware/tensorrt_bevdet/preprocess.hpp" + +using Label = autoware_perception_msgs::msg::ObjectClassification; +std::map> colormap{ + {0, {0, 0, 255}}, // dodger blue + {1, {0, 201, 87}}, {2, {0, 201, 87}}, {3, {160, 32, 240}}, {4, {3, 168, 158}}, {5, {255, 0, 0}}, + {6, {255, 97, 0}}, {7, {30, 0, 255}}, {8, {255, 0, 0}}, {9, {0, 0, 255}}, {10, {0, 0, 0}}}; + +uint8_t getSemanticType(const std::string & class_name) +{ + if (class_name == "car") { + return Label::CAR; + } else if (class_name == "truck") { + return Label::TRUCK; + } else if (class_name == "bus") { + return Label::BUS; + } else if (class_name == "trailer") { + return Label::TRAILER; + } else if (class_name == "bicycle") { + return Label::BICYCLE; + } else if (class_name == "motorcycle") { + return Label::MOTORCYCLE; + } else if (class_name == "pedestrian") { + return Label::PEDESTRIAN; + } else { + return Label::UNKNOWN; + } +} + +void box3DToDetectedObjects( + const std::vector & boxes, autoware_perception_msgs::msg::DetectedObjects & bevdet_objects, + const std::vector & class_names, float score_thre, const bool has_twist = true) +{ + for (auto b : boxes) { + if (b.score < score_thre) continue; + autoware_perception_msgs::msg::DetectedObject obj; + + Eigen::Vector3f center(b.x, b.y, b.z + b.h / 2.); + + obj.existence_probability = b.score; + // classification + autoware_perception_msgs::msg::ObjectClassification classification; + classification.probability = 1.0f; + if (b.label >= 0 && static_cast(b.label) < class_names.size()) { + classification.label = getSemanticType(class_names[b.label]); + } else { + classification.label = Label::UNKNOWN; + } + obj.classification.emplace_back(classification); + + // pose and shape + geometry_msgs::msg::Point p; + p.x = center.x(); + p.y = center.y(); + p.z = center.z(); + obj.kinematics.pose_with_covariance.pose.position = p; + + tf2::Quaternion q; + q.setRPY(0, 0, b.r); + obj.kinematics.pose_with_covariance.pose.orientation = tf2::toMsg(q); + + obj.shape.type = autoware_perception_msgs::msg::Shape::BOUNDING_BOX; + + geometry_msgs::msg::Vector3 v; + v.x = b.l; + v.y = b.w; + v.z = b.h; + obj.shape.dimensions = v; + if (has_twist) { + float vel_x = b.vx; + float vel_y = b.vy; + geometry_msgs::msg::Twist twist; + twist.linear.x = std::sqrt(std::pow(vel_x, 2) + std::pow(vel_y, 2)); + twist.angular.z = 2 * (std::atan2(vel_y, vel_x) - b.r); + obj.kinematics.twist_with_covariance.twist = twist; + obj.kinematics.has_twist = has_twist; + } + + bevdet_objects.objects.emplace_back(obj); + } +} + +void getTransform( + const geometry_msgs::msg::TransformStamped & transform, Eigen::Quaternion & rot, + Eigen::Translation3f & translation) +{ + rot = Eigen::Quaternion( + transform.transform.rotation.w, transform.transform.rotation.x, transform.transform.rotation.y, + transform.transform.rotation.z); + translation = Eigen::Translation3f( + transform.transform.translation.x, transform.transform.translation.y, + transform.transform.translation.z); +} + +void getCameraIntrinsics( + const sensor_msgs::msg::CameraInfo::SharedPtr msg, Eigen::Matrix3f & intrinsics) +{ + intrinsics << msg->k[0], msg->k[1], msg->k[2], msg->k[3], msg->k[4], msg->k[5], msg->k[6], + msg->k[7], msg->k[8]; +} + +TRTBEVDetNode::TRTBEVDetNode( + const std::string & node_name, const rclcpp::NodeOptions & node_options) +: rclcpp::Node(node_name, node_options) +{ // Only start camera info subscription and tf listener at the beginning + img_N_ = this->declare_parameter("data_params.N", 6); // camera num 6 + + caminfo_received_ = std::vector(img_N_, false); + cams_intrin = std::vector(img_N_); + cams2ego_rot = std::vector>(img_N_); + cams2ego_trans = std::vector(img_N_); + + tf_buffer_ = std::make_unique(this->get_clock()); + tf_listener_ = std::make_shared(*tf_buffer_); + + startCameraInfoSubscription(); + + // Wait for camera info and tf transform initialization + timer_ = this->create_wall_timer( + std::chrono::milliseconds(100), std::bind(&TRTBEVDetNode::checkInitialization, this)); +} + +void TRTBEVDetNode::initModel() +{ + score_thre_ = this->declare_parameter("post_process_params.score_threshold", 0.2); + + model_config_ = this->declare_parameter("model_config", "bevdet_r50_4dlongterm_depth.yaml"); + + onnx_file_ = this->declare_parameter("onnx_path", "bevdet_one_lt_d.onnx"); + engine_file_ = this->declare_parameter("engine_path", "bevdet_one_lt_d.engine"); + + imgs_name_ = this->declare_parameter>("data_params.cams"); + class_names_ = + this->declare_parameter>("post_process_params.class_names"); + + RCLCPP_INFO_STREAM(this->get_logger(), "Successful load config!"); + + sampleData_.param = camParams(cams_intrin, cams2ego_rot, cams2ego_trans); + + RCLCPP_INFO_STREAM(this->get_logger(), "Successful load image params!"); + + bevdet_ = std::make_shared( + model_config_, img_N_, sampleData_.param.cams_intrin, sampleData_.param.cams2ego_rot, + sampleData_.param.cams2ego_trans, onnx_file_, engine_file_); + + RCLCPP_INFO_STREAM(this->get_logger(), "Successful create bevdet!"); + + CHECK_CUDA(cudaMalloc( + reinterpret_cast(&imgs_dev_), img_N_ * 3 * img_w_ * img_h_ * sizeof(uchar))); + + pub_boxes_ = this->create_publisher( + "~/output/boxes", rclcpp::QoS{1}); +} + +void TRTBEVDetNode::checkInitialization() +{ + if (camera_info_received_flag_) { + RCLCPP_INFO_STREAM( + this->get_logger(), "Camera Info and TF Transform Initialization completed!"); + initModel(); + startImageSubscription(); + timer_->cancel(); + timer_.reset(); + } else { + RCLCPP_INFO_THROTTLE( + this->get_logger(), *this->get_clock(), 5000, + "Waiting for Camera Info and TF Transform Initialization..."); + } +} + +void TRTBEVDetNode::startImageSubscription() +{ + using std::placeholders::_1; + using std::placeholders::_2; + using std::placeholders::_3; + using std::placeholders::_4; + using std::placeholders::_5; + using std::placeholders::_6; + + sub_f_img_.subscribe( + this, "~/input/topic_img_f", + rclcpp::QoS{1}.get_rmw_qos_profile()); // rmw_qos_profile_sensor_data + sub_b_img_.subscribe(this, "~/input/topic_img_b", rclcpp::QoS{1}.get_rmw_qos_profile()); + + sub_fl_img_.subscribe(this, "~/input/topic_img_fl", rclcpp::QoS{1}.get_rmw_qos_profile()); + sub_fr_img_.subscribe(this, "~/input/topic_img_fr", rclcpp::QoS{1}.get_rmw_qos_profile()); + + sub_bl_img_.subscribe(this, "~/input/topic_img_bl", rclcpp::QoS{1}.get_rmw_qos_profile()); + sub_br_img_.subscribe(this, "~/input/topic_img_br", rclcpp::QoS{1}.get_rmw_qos_profile()); + + sync_ = std::make_shared( + MySyncPolicy(10), sub_fl_img_, sub_f_img_, sub_fr_img_, sub_bl_img_, sub_b_img_, sub_br_img_); + + sync_->registerCallback(std::bind(&TRTBEVDetNode::callback, this, _1, _2, _3, _4, _5, _6)); +} + +void TRTBEVDetNode::startCameraInfoSubscription() +{ + // cams: ["CAM_FRONT_LEFT", "CAM_FRONT", "CAM_FRONT_RIGHT", "CAM_BACK_LEFT", "CAM_BACK", + // "CAM_BACK_RIGHT"] + sub_fl_caminfo_ = this->create_subscription( + "~/input/topic_img_fl/camera_info", rclcpp::QoS{1}, + [this](const sensor_msgs::msg::CameraInfo::SharedPtr msg) { camera_info_callback(0, msg); }); + + sub_f_caminfo_ = this->create_subscription( + "~/input/topic_img_f/camera_info", rclcpp::QoS{1}, + [this](const sensor_msgs::msg::CameraInfo::SharedPtr msg) { camera_info_callback(1, msg); }); + + sub_fr_caminfo_ = this->create_subscription( + "~/input/topic_img_fr/camera_info", rclcpp::QoS{1}, + [this](const sensor_msgs::msg::CameraInfo::SharedPtr msg) { camera_info_callback(2, msg); }); + + sub_bl_caminfo_ = this->create_subscription( + "~/input/topic_img_bl/camera_info", rclcpp::QoS{1}, + [this](const sensor_msgs::msg::CameraInfo::SharedPtr msg) { camera_info_callback(3, msg); }); + + sub_b_caminfo_ = this->create_subscription( + "~/input/topic_img_b/camera_info", rclcpp::QoS{1}, + [this](const sensor_msgs::msg::CameraInfo::SharedPtr msg) { camera_info_callback(4, msg); }); + + sub_br_caminfo_ = this->create_subscription( + "~/input/topic_img_br/camera_info", rclcpp::QoS{1}, + [this](const sensor_msgs::msg::CameraInfo::SharedPtr msg) { camera_info_callback(5, msg); }); +} + +void image_transport(std::vector imgs, uchar * out_imgs, size_t width, size_t height) +{ + uchar * temp = new uchar[width * height * 3]; + uchar * temp_gpu = nullptr; + CHECK_CUDA(cudaMalloc(&temp_gpu, width * height * 3)); + + for (size_t i = 0; i < imgs.size(); i++) { + cv::cvtColor(imgs[i], imgs[i], cv::COLOR_BGR2RGB); + CHECK_CUDA(cudaMemcpy(temp_gpu, imgs[i].data, width * height * 3, cudaMemcpyHostToDevice)); + convert_RGBHWC_to_BGRCHW(temp_gpu, out_imgs + i * width * height * 3, 3, height, width); + } + delete[] temp; + CHECK_CUDA(cudaFree(temp_gpu)); +} + +void TRTBEVDetNode::callback( + const sensor_msgs::msg::Image::ConstSharedPtr & msg_fl_img, + const sensor_msgs::msg::Image::ConstSharedPtr & msg_f_img, + const sensor_msgs::msg::Image::ConstSharedPtr & msg_fr_img, + const sensor_msgs::msg::Image::ConstSharedPtr & msg_bl_img, + const sensor_msgs::msg::Image::ConstSharedPtr & msg_b_img, + const sensor_msgs::msg::Image::ConstSharedPtr & msg_br_img) +{ + cv::Mat img_fl, img_f, img_fr, img_bl, img_b, img_br; + std::vector imgs; + img_fl = cv_bridge::toCvShare(msg_fl_img, "bgr8")->image; + img_f = cv_bridge::toCvShare(msg_f_img, "bgr8")->image; + img_fr = cv_bridge::toCvShare(msg_fr_img, "bgr8")->image; + img_bl = cv_bridge::toCvShare(msg_bl_img, "bgr8")->image; + img_b = cv_bridge::toCvShare(msg_b_img, "bgr8")->image; + img_br = cv_bridge::toCvShare(msg_br_img, "bgr8")->image; + + imgs.emplace_back(img_fl); + imgs.emplace_back(img_f); + imgs.emplace_back(img_fr); + imgs.emplace_back(img_bl); + imgs.emplace_back(img_b); + imgs.emplace_back(img_br); + + image_transport(imgs, imgs_dev_, img_w_, img_h_); + + // uchar *imgs_dev + sampleData_.imgs_dev = imgs_dev_; + + std::vector ego_boxes; + ego_boxes.clear(); + float time = 0.f; + + bevdet_->doInfer(sampleData_, ego_boxes, time); + + autoware_perception_msgs::msg::DetectedObjects bevdet_objects; + bevdet_objects.header.frame_id = "base_link"; + bevdet_objects.header.stamp = msg_f_img->header.stamp; + + box3DToDetectedObjects(ego_boxes, bevdet_objects, class_names_, score_thre_); + + pub_boxes_->publish(bevdet_objects); +} + +void TRTBEVDetNode::camera_info_callback(int idx, const sensor_msgs::msg::CameraInfo::SharedPtr msg) +{ + if (caminfo_received_[idx]) + return; // already received; not expected to modify because of we init the model only once + + if (!initialized_) { // get image width and height + img_w_ = msg->width; + img_h_ = msg->height; + initialized_ = true; + } + Eigen::Matrix3f intrinsics; + getCameraIntrinsics(msg, intrinsics); + cams_intrin[idx] = intrinsics; + + Eigen::Quaternion rot; + Eigen::Translation3f translation; + getTransform( + tf_buffer_->lookupTransform("base_link", msg->header.frame_id, rclcpp::Time(0)), rot, + translation); + cams2ego_rot[idx] = rot; + cams2ego_trans[idx] = translation; + + caminfo_received_[idx] = true; + camera_info_received_flag_ = + std::all_of(caminfo_received_.begin(), caminfo_received_.end(), [](bool i) { return i; }); +} + +TRTBEVDetNode::~TRTBEVDetNode() +{ + delete imgs_dev_; +} + +int main(int argc, char ** argv) +{ + rclcpp::init(argc, argv); + rclcpp::NodeOptions node_options; + auto bevdet_node = std::make_shared("tensorrt_bevdet_node", node_options); + rclcpp::spin(bevdet_node); + return 0; +} diff --git a/perception/autoware_tensorrt_bevdet/src/bevpool_plugin.cu b/perception/autoware_tensorrt_bevdet/src/bevpool_plugin.cu new file mode 100644 index 0000000000000..a498a2a1685ba --- /dev/null +++ b/perception/autoware_tensorrt_bevdet/src/bevpool_plugin.cu @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "autoware/tensorrt_bevdet/bevpool_plugin.hpp" +#include "autoware/tensorrt_bevdet/common.hpp" + +#include +#include + +// input[0] depth b*n x d x h x w +// input[1] feat b*n x c x h x w +// input[2] ranks_depth m +// input[3] ranks_feat m +// input[4] ranks_bev m +// input[5] interval_starts k +// input[6] interval_lengths k + +// out[0] bevfeat b x c x h x w + +template +__global__ void bev_pool_v2_kernel( + int channel, int n_intervals, int map_size, int img_size, const T1 * __restrict__ depth, + const T1 * __restrict__ feat, const int * __restrict__ ranks_depth, + const int * __restrict__ ranks_feat, const int * __restrict__ ranks_bev, + const int * __restrict__ interval_starts, const int * __restrict__ interval_lengths, + T2 * __restrict__ out) +{ + CUDA_1D_KERNEL_LOOP(idx, n_intervals * channel) + { + int index = idx / channel; // bev grid index + int curr_c = idx % channel; // channel index + int interval_start = interval_starts[index]; + int interval_length = interval_lengths[index]; + + int curr_step = curr_c * img_size; + int chan_step = channel * img_size; + + T2 sum = 0; + + int feat_offset = 0; + for (int i = 0; i < interval_length; i++) { + feat_offset = ranks_feat[interval_start + i] / img_size * chan_step + curr_step + + ranks_feat[interval_start + i] % img_size; + + sum += static_cast(feat[feat_offset]) * + static_cast(depth[ranks_depth[interval_start + i]]); + } + out[curr_c * map_size + ranks_bev[interval_start]] = sum; + } +} + +namespace nvinfer1 +{ +// class BEVPoolPlugin +BEVPoolPlugin::BEVPoolPlugin(const std::string & name, int bev_h, int bev_w, int n) : name_(name) +{ + m_.bev_h = bev_h; + m_.bev_w = bev_w; + m_.n = n; +} + +BEVPoolPlugin::BEVPoolPlugin(const std::string & name, const void * buffer, size_t length) +: name_(name) +{ + memcpy(&m_, buffer, sizeof(m_)); +} + +BEVPoolPlugin::~BEVPoolPlugin() +{ +} + +IPluginV2DynamicExt * BEVPoolPlugin::clone() const noexcept +{ + auto p = new BEVPoolPlugin(name_, &m_, sizeof(m_)); + p->setPluginNamespace(namespace_.c_str()); + return p; +} + +int32_t BEVPoolPlugin::getNbOutputs() const noexcept +{ + return 1; +} + +DataType BEVPoolPlugin::getOutputDataType( + int32_t index, DataType const * inputTypes, int32_t nbInputs) const noexcept +{ + return DataType::kFLOAT; +} + +DimsExprs BEVPoolPlugin::getOutputDimensions( + int32_t outputIndex, const DimsExprs * inputs, int32_t nbInputs, + IExprBuilder & exprBuilder) noexcept +{ + // input[0] depth b*n x d x h x w + // input[1] feat b*n x c x h x w + + // input[2] ranks_depth m + // input[3] ranks_feat m + // input[4] ranks_bev m + // input[5] interval_starts k + // input[6] interval_lengths k + + // out[0] bevfeat b x c x h x w + + DimsExprs ret; + ret.nbDims = inputs[1].nbDims; + ret.d[0] = exprBuilder.operation( + DimensionOperation::kFLOOR_DIV, *inputs[1].d[0], *exprBuilder.constant(m_.n)); + ret.d[1] = inputs[1].d[1]; + ret.d[2] = exprBuilder.constant(m_.bev_h); + ret.d[3] = exprBuilder.constant(m_.bev_w); + + return ret; +} + +bool BEVPoolPlugin::supportsFormatCombination( + int32_t pos, const PluginTensorDesc * inOut, int32_t nbInputs, int32_t nbOutputs) noexcept +{ + // depth + if (pos == 0) { + return (inOut[pos].type == DataType::kFLOAT || inOut[pos].type == DataType::kHALF) && + inOut[pos].format == TensorFormat::kLINEAR; + } else if (pos == 1) { // feat + return inOut[0].type == inOut[1].type && inOut[pos].format == TensorFormat::kLINEAR; + } else if (pos == 7) { // out + return (inOut[pos].type == DataType::kFLOAT || inOut[pos].type == DataType::kHALF) && + inOut[pos].format == TensorFormat::kLINEAR; + } else { + return inOut[pos].type == DataType::kINT32 && inOut[pos].format == TensorFormat::kLINEAR; + } + return false; +} + +void BEVPoolPlugin::configurePlugin( + const DynamicPluginTensorDesc * in, int32_t nbInputs, const DynamicPluginTensorDesc * out, + int32_t nbOutputs) noexcept +{ + return; +} + +size_t BEVPoolPlugin::getWorkspaceSize( + const PluginTensorDesc * inputs, int32_t nbInputs, const PluginTensorDesc * outputs, + int32_t nbOutputs) const noexcept +{ + return 0; +} + +int32_t BEVPoolPlugin::enqueue( + const PluginTensorDesc * inputDesc, const PluginTensorDesc * outputDesc, + const void * const * inputs, void * const * outputs, void * workspace, + cudaStream_t stream) noexcept +{ + // input[0] == depth b*n x d x h x w + // input[1] == feat b*n x c x h x w + // input[2] == ranks_depth m + // input[3] == ranks_feat m + // input[4] == ranks_bev m + // input[5] == interval_starts k + // input[6] == interval_lengths k + + int channel = inputDesc[1].dims.d[1]; + int n_intervals = inputDesc[5].dims.d[0]; + int map_size = m_.bev_h * m_.bev_w; + int img_size = inputDesc[0].dims.d[2] * inputDesc[0].dims.d[3]; + + dim3 grid(GET_BLOCKS(n_intervals * channel)); + dim3 block(NUM_THREADS); + + switch (int(outputDesc[0].type)) { + case int(DataType::kFLOAT): + if (inputDesc[0].type == DataType::kFLOAT) { + // fp32 fp32 + bev_pool_v2_kernel<<>>( + channel, n_intervals, map_size, img_size, reinterpret_cast(inputs[0]), + reinterpret_cast(inputs[1]), reinterpret_cast(inputs[2]), + reinterpret_cast(inputs[3]), reinterpret_cast(inputs[4]), + reinterpret_cast(inputs[5]), reinterpret_cast(inputs[6]), + reinterpret_cast(outputs[0])); + } else { + // fp16 fp32 + bev_pool_v2_kernel<__half, float><<>>( + channel, n_intervals, map_size, img_size, reinterpret_cast(inputs[0]), + reinterpret_cast(inputs[1]), reinterpret_cast(inputs[2]), + reinterpret_cast(inputs[3]), reinterpret_cast(inputs[4]), + reinterpret_cast(inputs[5]), reinterpret_cast(inputs[6]), + reinterpret_cast(outputs[0])); + } + break; + case int(DataType::kHALF): + if (inputDesc[0].type == DataType::kFLOAT) { + // fp32 fp16 + bev_pool_v2_kernel<<>>( + channel, n_intervals, map_size, img_size, reinterpret_cast(inputs[0]), + reinterpret_cast(inputs[1]), reinterpret_cast(inputs[2]), + reinterpret_cast(inputs[3]), reinterpret_cast(inputs[4]), + reinterpret_cast(inputs[5]), reinterpret_cast(inputs[6]), + reinterpret_cast<__half *>(outputs[0])); + } else { + // fp16 fp16 + bev_pool_v2_kernel<__half, __half><<>>( + channel, n_intervals, map_size, img_size, reinterpret_cast(inputs[0]), + reinterpret_cast(inputs[1]), reinterpret_cast(inputs[2]), + reinterpret_cast(inputs[3]), reinterpret_cast(inputs[4]), + reinterpret_cast(inputs[5]), reinterpret_cast(inputs[6]), + reinterpret_cast<__half *>(outputs[0])); + } + break; + default: // should NOT be here + RCLCPP_ERROR(rclcpp::get_logger("BEVPoolPlugin"), "\tUnsupport datatype!"); + } + return 0; +} + +void BEVPoolPlugin::destroy() noexcept +{ + delete this; + return; +} + +int32_t BEVPoolPlugin::initialize() noexcept +{ + return 0; +} + +void BEVPoolPlugin::terminate() noexcept +{ + return; +} + +size_t BEVPoolPlugin::getSerializationSize() const noexcept +{ + return sizeof(m_); +} + +void BEVPoolPlugin::serialize(void * buffer) const noexcept +{ + memcpy(buffer, &m_, sizeof(m_)); + return; +} + +void BEVPoolPlugin::setPluginNamespace(const char * pluginNamespace) noexcept +{ + namespace_ = pluginNamespace; + return; +} + +const char * BEVPoolPlugin::getPluginNamespace() const noexcept +{ + return namespace_.c_str(); +} + +const char * BEVPoolPlugin::getPluginType() const noexcept +{ + return BEVPOOL_PLUGIN_NAME; +} + +const char * BEVPoolPlugin::getPluginVersion() const noexcept +{ + return BEVPOOL_PLUGIN_VERSION; +} + +void BEVPoolPlugin::attachToContext( + cudnnContext * contextCudnn, cublasContext * contextCublas, IGpuAllocator * gpuAllocator) noexcept +{ + return; +} + +void BEVPoolPlugin::detachFromContext() noexcept +{ + return; +} + +// class BEVPoolPluginCreator +PluginFieldCollection BEVPoolPluginCreator::fc_{}; +std::vector BEVPoolPluginCreator::attr_; + +BEVPoolPluginCreator::BEVPoolPluginCreator() +{ + attr_.clear(); + attr_.emplace_back(PluginField("bev_h", nullptr, PluginFieldType::kINT32, 1)); + attr_.emplace_back(PluginField("bev_w", nullptr, PluginFieldType::kINT32, 1)); + attr_.emplace_back(PluginField("n", nullptr, PluginFieldType::kINT32, 1)); + + fc_.nbFields = attr_.size(); + fc_.fields = attr_.data(); +} + +BEVPoolPluginCreator::~BEVPoolPluginCreator() +{ +} + +IPluginV2DynamicExt * BEVPoolPluginCreator::createPlugin( + const char * name, const PluginFieldCollection * fc) noexcept +{ + const PluginField * fields = fc->fields; + + int bev_h = -1; + int bev_w = -1; + int n = -1; + + for (int i = 0; i < fc->nbFields; ++i) { + if (std::string(fc->fields[i].name) == std::string("bev_h")) { + bev_h = *reinterpret_cast(fc->fields[i].data); + } else if (std::string(fc->fields[i].name) == std::string("bev_w")) { + bev_w = *reinterpret_cast(fc->fields[i].data); + } else if (std::string(fc->fields[i].name) == std::string("n")) { + n = *reinterpret_cast(fc->fields[i].data); + } + } + BEVPoolPlugin * pObj = new BEVPoolPlugin(name, bev_h, bev_w, n); + pObj->setPluginNamespace(namespace_.c_str()); + return pObj; +} + +IPluginV2DynamicExt * BEVPoolPluginCreator::deserializePlugin( + const char * name, const void * serialData, size_t serialLength) noexcept +{ + BEVPoolPlugin * pObj = new BEVPoolPlugin(name, serialData, serialLength); + pObj->setPluginNamespace(namespace_.c_str()); + return pObj; +} + +void BEVPoolPluginCreator::setPluginNamespace(const char * pluginNamespace) noexcept +{ + namespace_ = pluginNamespace; + return; +} + +const char * BEVPoolPluginCreator::getPluginNamespace() const noexcept +{ + return namespace_.c_str(); +} + +const char * BEVPoolPluginCreator::getPluginName() const noexcept +{ + return BEVPOOL_PLUGIN_NAME; +} + +const char * BEVPoolPluginCreator::getPluginVersion() const noexcept +{ + return BEVPOOL_PLUGIN_VERSION; +} + +const PluginFieldCollection * BEVPoolPluginCreator::getFieldNames() noexcept +{ + return &fc_; +} + +REGISTER_TENSORRT_PLUGIN(BEVPoolPluginCreator); + +} // namespace nvinfer1 diff --git a/perception/autoware_tensorrt_bevdet/src/cpu_jpegdecoder.cpp b/perception/autoware_tensorrt_bevdet/src/cpu_jpegdecoder.cpp new file mode 100644 index 0000000000000..1afe91becfb95 --- /dev/null +++ b/perception/autoware_tensorrt_bevdet/src/cpu_jpegdecoder.cpp @@ -0,0 +1,93 @@ +// Copyright 2024 AutoCore, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "autoware/tensorrt_bevdet/cpu_jpegdecoder.hpp" + +#include "autoware/tensorrt_bevdet/preprocess.hpp" + +#include +#include +#include +#include +#include + +#include + +struct jpeg_error_handler +{ + struct jpeg_error_mgr pub; + jmp_buf setjmp_buffer; +}; + +int decode_jpeg(const std::vector & buffer, uchar * output) +{ + struct jpeg_decompress_struct cinfo; + struct jpeg_error_handler jerr; + cinfo.err = jpeg_std_error(&jerr.pub); + + jerr.pub.error_exit = [](j_common_ptr cinfo) { + jpeg_error_handler * myerr = reinterpret_cast(cinfo->err); + longjmp(myerr->setjmp_buffer, 1); + }; + + if (setjmp(jerr.setjmp_buffer)) { + RCLCPP_ERROR( + rclcpp::get_logger("decode_jpeg"), "Failed to decompress jpeg: %s", + jerr.pub.jpeg_message_table[jerr.pub.msg_code]); + jpeg_destroy_decompress(&cinfo); + return EXIT_FAILURE; + } + jpeg_create_decompress(&cinfo); + + if (buffer.size() == 0) { + RCLCPP_ERROR(rclcpp::get_logger("decode_jpeg"), "buffer size is 0"); + return EXIT_FAILURE; + } + jpeg_mem_src(&cinfo, reinterpret_cast(buffer.data()), buffer.size()); + if (jpeg_read_header(&cinfo, TRUE) != JPEG_HEADER_OK) { + RCLCPP_ERROR(rclcpp::get_logger("decode_jpeg"), "Failed to read jpeg header"); + return EXIT_FAILURE; + } + if (jpeg_start_decompress(&cinfo) != TRUE) { + RCLCPP_ERROR(rclcpp::get_logger("decode_jpeg"), "Failed to start decompress"); + return EXIT_FAILURE; + } + + while (cinfo.output_scanline < cinfo.output_height) { + auto row = cinfo.output_scanline; + uchar * ptr = output + row * cinfo.image_width * 3; + jpeg_read_scanlines(&cinfo, (JSAMPARRAY)&ptr, 1); + } + jpeg_finish_decompress(&cinfo); + jpeg_destroy_decompress(&cinfo); + return EXIT_SUCCESS; +} + +int decode_cpu( + const std::vector> & files_data, uchar * out_imgs, size_t width, size_t height) +{ + uchar * temp = new uchar[width * height * 3]; + uchar * temp_gpu = nullptr; + CHECK_CUDA(cudaMalloc(&temp_gpu, width * height * 3)); + for (size_t i = 0; i < files_data.size(); i++) { + if (decode_jpeg(files_data[i], temp)) { + return EXIT_FAILURE; + } + CHECK_CUDA(cudaMemcpy(temp_gpu, temp, width * height * 3, cudaMemcpyHostToDevice)); + convert_RGBHWC_to_BGRCHW(temp_gpu, out_imgs + i * width * height * 3, 3, height, width); + CHECK_CUDA(cudaDeviceSynchronize()); + } + CHECK_CUDA(cudaFree(temp_gpu)); + delete[] temp; + return EXIT_SUCCESS; +} diff --git a/perception/autoware_tensorrt_bevdet/src/data.cpp b/perception/autoware_tensorrt_bevdet/src/data.cpp new file mode 100644 index 0000000000000..6dbb02015b5a2 --- /dev/null +++ b/perception/autoware_tensorrt_bevdet/src/data.cpp @@ -0,0 +1,84 @@ +// Copyright 2024 AutoCore, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "autoware/tensorrt_bevdet/data.hpp" + +#include "autoware/tensorrt_bevdet/cpu_jpegdecoder.hpp" + +#include + +#include +#include +#include +#include + +using std::chrono::duration; +using std::chrono::high_resolution_clock; + +camParams::camParams(const YAML::Node & config, int n, std::vector & cams_name) +: N_img(n) +{ + if (static_cast(n) != cams_name.size()) { + RCLCPP_ERROR( + rclcpp::get_logger("camParams"), "Error! Need %d camera param, but given %zu camera names!", + n, cams_name.size()); + } + ego2global_rot = fromYamlQuater(config["ego2global_rotation"]); + ego2global_trans = fromYamlTrans(config["ego2global_translation"]); + + lidar2ego_rot = fromYamlQuater(config["lidar2ego_rotation"]); + lidar2ego_trans = fromYamlTrans(config["lidar2ego_translation"]); + + timestamp = config["timestamp"].as(); + scene_token = config["scene_token"].as(); + + imgs_file.clear(); + + cams_intrin.clear(); + cams2ego_rot.clear(); + cams2ego_trans.clear(); + + for (const std::string & name : cams_name) { + imgs_file.push_back("." + config["cams"][name]["data_path"].as()); + + // + cams_intrin.push_back(fromYamlMatrix3f(config["cams"][name]["cam_intrinsic"])); + cams2ego_rot.push_back(fromYamlQuater(config["cams"][name]["sensor2ego_rotation"])); + cams2ego_trans.push_back(fromYamlTrans(config["cams"][name]["sensor2ego_translation"])); + // + } +} + +Eigen::Translation3f fromYamlTrans(YAML::Node x) +{ + std::vector trans = x.as>(); + return Eigen::Translation3f(trans[0], trans[1], trans[2]); +} + +Eigen::Quaternion fromYamlQuater(YAML::Node x) +{ + std::vector quater = x.as>(); + return Eigen::Quaternion(quater[0], quater[1], quater[2], quater[3]); +} + +Eigen::Matrix3f fromYamlMatrix3f(YAML::Node x) +{ + std::vector> m = x.as>>(); + Eigen::Matrix3f mat; + for (size_t i = 0; i < m.size(); i++) { + for (size_t j = 0; j < m[0].size(); j++) { + mat(i, j) = m[i][j]; + } + } + return mat; +} diff --git a/perception/autoware_tensorrt_bevdet/src/gatherbev_plugin.cu b/perception/autoware_tensorrt_bevdet/src/gatherbev_plugin.cu new file mode 100644 index 0000000000000..e9fecf8cef37a --- /dev/null +++ b/perception/autoware_tensorrt_bevdet/src/gatherbev_plugin.cu @@ -0,0 +1,308 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "autoware/tensorrt_bevdet/common.hpp" +#include "autoware/tensorrt_bevdet/gatherbev_plugin.hpp" + +#include +#include + +// input[0] == adj_feat b x 8 x 80 x 128 x 128 +// input[1] == curr_feat b x 80 x 128 x 128 +// input[2] == flag b x 1 +// out[0] b x (8+1)*80 x 128 x 128 +template +__global__ void copy_feat_kernel( + int nthreads, // b * (adj_num + 1) * map_size + int adj_num, int channel, int map_size, const T * adj_feats, const T * curr_feat, + const int * flag, T * out_feats) +{ + CUDA_1D_KERNEL_LOOP(idx, nthreads) + { + int b = idx / ((adj_num + 1) * map_size); + int n = (idx / map_size) % (adj_num + 1); + int m = idx % map_size; + + int start = b * (adj_num + 1) * channel * map_size + n * channel * map_size + m; + int end = start + channel * map_size; + for (int i = start, c = 0; i < end; i += map_size, c++) { + if (flag[b] == 0 || n == 0) { + out_feats[i] = curr_feat[b * channel * map_size + c * map_size + m]; + } else { + out_feats[i] = adj_feats[i - channel * map_size]; + } + } + } +} + +namespace nvinfer1 +{ +// class GatherBEVPlugin +GatherBEVPlugin::GatherBEVPlugin(const std::string & name) : name_(name) +{ +} + +GatherBEVPlugin::GatherBEVPlugin(const std::string & name, const void * buffer, size_t length) +: name_(name) +{ + memcpy(&m_, buffer, sizeof(m_)); +} + +GatherBEVPlugin::~GatherBEVPlugin() +{ +} + +IPluginV2DynamicExt * GatherBEVPlugin::clone() const noexcept +{ + auto p = new GatherBEVPlugin(name_, &m_, sizeof(m_)); + p->setPluginNamespace(namespace_.c_str()); + return p; +} + +int32_t GatherBEVPlugin::getNbOutputs() const noexcept +{ + return 1; +} + +DataType GatherBEVPlugin::getOutputDataType( + int32_t index, DataType const * inputTypes, int32_t nbInputs) const noexcept +{ + return inputTypes[0]; // 与adj_feat一致 +} + +DimsExprs GatherBEVPlugin::getOutputDimensions( + int32_t outputIndex, const DimsExprs * inputs, int32_t nbInputs, + IExprBuilder & exprBuilder) noexcept +{ + // input[0] == adj_feat b x 8 x 80 x 128 x 128 + // input[1] == curr_feat b x 80 x 128 x 128 + // input[2] == flag b x 1 + // out[0] b x (8+1)*80 x 128 x 128 + DimsExprs ret; + ret.nbDims = inputs[0].nbDims - 1; + + IDimensionExpr const * n_feat = + exprBuilder.operation(DimensionOperation::kSUM, *inputs[0].d[1], *exprBuilder.constant(1)); + ret.d[0] = inputs[0].d[0]; + ret.d[1] = exprBuilder.operation(DimensionOperation::kPROD, *n_feat, *inputs[0].d[2]); + ret.d[2] = inputs[0].d[3]; + ret.d[3] = inputs[0].d[4]; + + return ret; +} + +bool GatherBEVPlugin::supportsFormatCombination( + int32_t pos, const PluginTensorDesc * inOut, int32_t nbInputs, int32_t nbOutputs) noexcept +{ + // adj_feat + if (pos == 0) { + return (inOut[pos].type == DataType::kFLOAT || inOut[pos].type == DataType::kHALF) && + inOut[pos].format == TensorFormat::kLINEAR; + } else if (pos == 1) { // curr_feat + return inOut[0].type == inOut[1].type && inOut[pos].format == TensorFormat::kLINEAR; + } else if (pos == 3) { // out + return inOut[0].type == inOut[3].type && inOut[pos].format == TensorFormat::kLINEAR; + } else if (pos == 2) { + return inOut[pos].type == DataType::kINT32 && inOut[pos].format == TensorFormat::kLINEAR; + } + return false; +} + +void GatherBEVPlugin::configurePlugin( + const DynamicPluginTensorDesc * in, int32_t nbInputs, const DynamicPluginTensorDesc * out, + int32_t nbOutputs) noexcept +{ + return; +} + +size_t GatherBEVPlugin::getWorkspaceSize( + const PluginTensorDesc * inputs, int32_t nbInputs, const PluginTensorDesc * outputs, + int32_t nbOutputs) const noexcept +{ + return 0; +} + +int32_t GatherBEVPlugin::enqueue( + const PluginTensorDesc * inputDesc, const PluginTensorDesc * outputDesc, + const void * const * inputs, void * const * outputs, void * workspace, + cudaStream_t stream) noexcept +{ + // input[0] == adj_feat b x 8 x 80 x 128 x 128 + // input[1] == curr_feat b x 80 x 128 x 128 + // input[2] == flag b x 1 + // out[0] b x (8+1)*80 x 128 x 128 + + // int nthreads, // b * (adj_num + 1) * map_size + // int adj_num, + // int channel, + // int map_size, + + // int flag = 0; + // CHECK_CUDA(cudaMemcpy(&flag, inputs[2], sizeof(int), cudaMemcpyDeviceToHost)); + + int b = inputDesc[0].dims.d[0]; + int adj_num = inputDesc[0].dims.d[1]; + int map_size = inputDesc[0].dims.d[3] * inputDesc[0].dims.d[4]; + int channel = inputDesc[0].dims.d[2]; + + int feat_step = inputDesc[1].dims.d[1] * inputDesc[1].dims.d[2] * inputDesc[1].dims.d[3]; + + int nthreads = b * (adj_num + 1) * map_size; + + dim3 grid(GET_BLOCKS(nthreads)); + dim3 block(NUM_THREADS); + + switch (int(outputDesc[0].type)) { + case int(DataType::kFLOAT): + // fp32 + copy_feat_kernel<<>>( + nthreads, adj_num, channel, map_size, reinterpret_cast(inputs[0]), + reinterpret_cast(inputs[1]), reinterpret_cast(inputs[2]), + reinterpret_cast(outputs[0])); + + break; + case int(DataType::kHALF): + // fp16 + copy_feat_kernel<<>>( + nthreads, adj_num, channel, map_size, reinterpret_cast(inputs[0]), + reinterpret_cast(inputs[1]), reinterpret_cast(inputs[2]), + reinterpret_cast<__half *>(outputs[0])); + break; + default: // should NOT be here + RCLCPP_ERROR(rclcpp::get_logger("GatherBEVPlugin"), "\tUnsupport datatype!"); + } + return 0; +} + +void GatherBEVPlugin::destroy() noexcept +{ + delete this; + return; +} + +int32_t GatherBEVPlugin::initialize() noexcept +{ + return 0; +} + +void GatherBEVPlugin::terminate() noexcept +{ + return; +} + +size_t GatherBEVPlugin::getSerializationSize() const noexcept +{ + return sizeof(m_); +} + +void GatherBEVPlugin::serialize(void * buffer) const noexcept +{ + memcpy(buffer, &m_, sizeof(m_)); + return; +} + +void GatherBEVPlugin::setPluginNamespace(const char * pluginNamespace) noexcept +{ + namespace_ = pluginNamespace; + return; +} + +const char * GatherBEVPlugin::getPluginNamespace() const noexcept +{ + return namespace_.c_str(); +} + +const char * GatherBEVPlugin::getPluginType() const noexcept +{ + return GATHERBEV_PLUGIN_NAME; +} + +const char * GatherBEVPlugin::getPluginVersion() const noexcept +{ + return GATHERBEV_PLUGIN_VERSION; +} + +void GatherBEVPlugin::attachToContext( + cudnnContext * contextCudnn, cublasContext * contextCublas, IGpuAllocator * gpuAllocator) noexcept +{ + return; +} + +void GatherBEVPlugin::detachFromContext() noexcept +{ + return; +} + +// class GatherBEVPluginCreator +PluginFieldCollection GatherBEVPluginCreator::fc_{}; +std::vector GatherBEVPluginCreator::attr_; + +GatherBEVPluginCreator::GatherBEVPluginCreator() +{ + attr_.clear(); + fc_.nbFields = attr_.size(); + fc_.fields = attr_.data(); +} + +GatherBEVPluginCreator::~GatherBEVPluginCreator() +{ +} + +IPluginV2DynamicExt * GatherBEVPluginCreator::createPlugin( + const char * name, const PluginFieldCollection * fc) noexcept +{ + GatherBEVPlugin * pObj = new GatherBEVPlugin(name); + pObj->setPluginNamespace(namespace_.c_str()); + return pObj; +} + +IPluginV2DynamicExt * GatherBEVPluginCreator::deserializePlugin( + const char * name, const void * serialData, size_t serialLength) noexcept +{ + GatherBEVPlugin * pObj = new GatherBEVPlugin(name, serialData, serialLength); + pObj->setPluginNamespace(namespace_.c_str()); + return pObj; +} + +void GatherBEVPluginCreator::setPluginNamespace(const char * pluginNamespace) noexcept +{ + namespace_ = pluginNamespace; + return; +} + +const char * GatherBEVPluginCreator::getPluginNamespace() const noexcept +{ + return namespace_.c_str(); +} + +const char * GatherBEVPluginCreator::getPluginName() const noexcept +{ + return GATHERBEV_PLUGIN_NAME; +} + +const char * GatherBEVPluginCreator::getPluginVersion() const noexcept +{ + return GATHERBEV_PLUGIN_VERSION; +} + +const PluginFieldCollection * GatherBEVPluginCreator::getFieldNames() noexcept +{ + return &fc_; +} + +REGISTER_TENSORRT_PLUGIN(GatherBEVPluginCreator); + +} // namespace nvinfer1 diff --git a/perception/autoware_tensorrt_bevdet/src/iou3d_nms.cu b/perception/autoware_tensorrt_bevdet/src/iou3d_nms.cu new file mode 100644 index 0000000000000..fe68a6f03bb0f --- /dev/null +++ b/perception/autoware_tensorrt_bevdet/src/iou3d_nms.cu @@ -0,0 +1,633 @@ +// Copyright 2024 AutoCore, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/* +3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others) +Written by Shaoshuai Shi +All Rights Reserved 2019-2022. +*/ + +#include "autoware/tensorrt_bevdet/iou3d_nms.hpp" + +struct Point +{ + float x, y; + __device__ Point() {} + __device__ Point(double _x, double _y) { x = _x, y = _y; } + + __device__ void set(float _x, float _y) + { + x = _x; + y = _y; + } + + __device__ Point operator+(const Point & b) const { return Point(x + b.x, y + b.y); } + + __device__ Point operator-(const Point & b) const { return Point(x - b.x, y - b.y); } +}; + +__device__ inline float cross(const Point & a, const Point & b) +{ + return a.x * b.y - a.y * b.x; +} + +__device__ inline float cross(const Point & p1, const Point & p2, const Point & p0) +{ + return (p1.x - p0.x) * (p2.y - p0.y) - (p2.x - p0.x) * (p1.y - p0.y); +} + +__device__ int check_rect_cross( + const Point & p1, const Point & p2, const Point & q1, const Point & q2) +{ + int ret = min(p1.x, p2.x) <= max(q1.x, q2.x) && min(q1.x, q2.x) <= max(p1.x, p2.x) && + min(p1.y, p2.y) <= max(q1.y, q2.y) && min(q1.y, q2.y) <= max(p1.y, p2.y); + return ret; +} + +__device__ inline int check_in_box2d(const float * box, const Point & p) +{ + // params: (7) [x, y, z, dx, dy, dz, heading] + const float kMargin = 1e-2; + + float center_x = box[0], center_y = box[1]; + // rotate the point in the opposite direction of box + float angle_cos = cos(-box[6]); + float angle_sin = sin(-box[6]); + float rot_x = (p.x - center_x) * angle_cos + (p.y - center_y) * (-angle_sin); + float rot_y = (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos; + + return (fabs(rot_x) < box[3] / 2 + kMargin && fabs(rot_y) < box[4] / 2 + kMargin); +} + +__device__ inline int intersection( + const Point & p1, const Point & p0, const Point & q1, const Point & q0, Point & ans) +{ + // fast exclusion + if (check_rect_cross(p0, p1, q0, q1) == 0) return 0; + + // check cross standing + float s1 = cross(q0, p1, p0); + float s2 = cross(p1, q1, p0); + float s3 = cross(p0, q1, q0); + float s4 = cross(q1, p1, q0); + + if (!(s1 * s2 > 0 && s3 * s4 > 0)) return 0; + + // calculate intersection of two lines + float s5 = cross(q1, p1, p0); + if (fabs(s5 - s1) > EPS) { + ans.x = (s5 * q0.x - s1 * q1.x) / (s5 - s1); + ans.y = (s5 * q0.y - s1 * q1.y) / (s5 - s1); + + } else { + float a0 = p0.y - p1.y, b0 = p1.x - p0.x, c0 = p0.x * p1.y - p1.x * p0.y; + float a1 = q0.y - q1.y, b1 = q1.x - q0.x, c1 = q0.x * q1.y - q1.x * q0.y; + float D = a0 * b1 - a1 * b0; + + ans.x = (b0 * c1 - b1 * c0) / D; + ans.y = (a1 * c0 - a0 * c1) / D; + } + + return 1; +} + +__device__ inline void rotate_around_center( + const Point & center, const float angle_cos, const float angle_sin, Point & p) +{ + float new_x = (p.x - center.x) * angle_cos + (p.y - center.y) * (-angle_sin) + center.x; + float new_y = (p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y; + p.set(new_x, new_y); +} + +__device__ inline int point_cmp(const Point & a, const Point & b, const Point & center) +{ + return atan2(a.y - center.y, a.x - center.x) > atan2(b.y - center.y, b.x - center.x); +} + +__device__ inline float box_overlap(const float * box_a, const float * box_b) +{ + // params box_a: [x, y, z, dx, dy, dz, heading] + // params box_b: [x, y, z, dx, dy, dz, heading] + + float a_angle = box_a[6], b_angle = box_b[6]; + float a_dx_half = box_a[3] / 2, b_dx_half = box_b[3] / 2, a_dy_half = box_a[4] / 2, + b_dy_half = box_b[4] / 2; + float a_x1 = box_a[0] - a_dx_half, a_y1 = box_a[1] - a_dy_half; + float a_x2 = box_a[0] + a_dx_half, a_y2 = box_a[1] + a_dy_half; + float b_x1 = box_b[0] - b_dx_half, b_y1 = box_b[1] - b_dy_half; + float b_x2 = box_b[0] + b_dx_half, b_y2 = box_b[1] + b_dy_half; + + Point center_a(box_a[0], box_a[1]); + Point center_b(box_b[0], box_b[1]); + +#ifdef DEBUG + RCLCPP_DEBUG( + rclcpp::get_logger("iou3d_nms"), + "a: (%.3f, %.3f, %.3f, %.3f, %.3f), b: (%.3f, %.3f, %.3f, %.3f, %.3f)", a_x1, a_y1, a_x2, a_y2, + a_angle, b_x1, b_y1, b_x2, b_y2, b_angle); + RCLCPP_DEBUG( + rclcpp::get_logger("iou3d_nms"), "center a: (%.3f, %.3f), b: (%.3f, %.3f)", center_a.x, + center_a.y, center_b.x, center_b.y); +#endif + + Point box_a_corners[5]; + box_a_corners[0].set(a_x1, a_y1); + box_a_corners[1].set(a_x2, a_y1); + box_a_corners[2].set(a_x2, a_y2); + box_a_corners[3].set(a_x1, a_y2); + + Point box_b_corners[5]; + box_b_corners[0].set(b_x1, b_y1); + box_b_corners[1].set(b_x2, b_y1); + box_b_corners[2].set(b_x2, b_y2); + box_b_corners[3].set(b_x1, b_y2); + + // get oriented corners + float a_angle_cos = cos(a_angle), a_angle_sin = sin(a_angle); + float b_angle_cos = cos(b_angle), b_angle_sin = sin(b_angle); + + for (int k = 0; k < 4; k++) { +#ifdef DEBUG + RCLCPP_DEBUG( + rclcpp::get_logger("iou3d_nms"), "before corner %d: a(%.3f, %.3f), b(%.3f, %.3f)", k, + box_a_corners[k].x, box_a_corners[k].y, box_b_corners[k].x, box_b_corners[k].y); +#endif + rotate_around_center(center_a, a_angle_cos, a_angle_sin, box_a_corners[k]); + rotate_around_center(center_b, b_angle_cos, b_angle_sin, box_b_corners[k]); +#ifdef DEBUG + RCLCPP_DEBUG( + rclcpp::get_logger("iou3d_nms"), "corner %d: a(%.3f, %.3f), b(%.3f, %.3f)", k, + box_a_corners[k].x, box_a_corners[k].y, box_b_corners[k].x, box_b_corners[k].y); +#endif + } + + box_a_corners[4] = box_a_corners[0]; + box_b_corners[4] = box_b_corners[0]; + + // get intersection of lines + Point cross_points[16]; + Point poly_center; + int cnt = 0, flag = 0; + + poly_center.set(0, 0); + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + flag = intersection( + box_a_corners[i + 1], box_a_corners[i], box_b_corners[j + 1], box_b_corners[j], + cross_points[cnt]); + if (flag) { + poly_center = poly_center + cross_points[cnt]; + cnt++; +#ifdef DEBUG + RCLCPP_DEBUG( + rclcpp::get_logger("iou3d_nms"), + "Cross points (%.3f, %.3f): a(%.3f, %.3f)->(%.3f, %.3f), b(%.3f, %.3f)->(%.3f, %.3f)", + cross_points[cnt - 1].x, cross_points[cnt - 1].y, box_a_corners[i].x, box_a_corners[i].y, + box_a_corners[i + 1].x, box_a_corners[i + 1].y, box_b_corners[i].x, box_b_corners[i].y, + box_b_corners[i + 1].x, box_b_corners[i + 1].y); +#endif + } + } + } + + // check corners + for (int k = 0; k < 4; k++) { + if (check_in_box2d(box_a, box_b_corners[k])) { + poly_center = poly_center + box_b_corners[k]; + cross_points[cnt] = box_b_corners[k]; + cnt++; +#ifdef DEBUG + RCLCPP_DEBUG( + rclcpp::get_logger("iou3d_nms"), "b corners in a: corner_b(%.3f, %.3f)", + cross_points[cnt - 1].x, cross_points[cnt - 1].y); +#endif + } + if (check_in_box2d(box_b, box_a_corners[k])) { + poly_center = poly_center + box_a_corners[k]; + cross_points[cnt] = box_a_corners[k]; + cnt++; +#ifdef DEBUG + RCLCPP_DEBUG( + rclcpp::get_logger("iou3d_nms"), "a corners in b: corner_a(%.3f, %.3f)", + cross_points[cnt - 1].x, cross_points[cnt - 1].y); +#endif + } + } + + poly_center.x /= cnt; + poly_center.y /= cnt; + + // sort the points of polygon + Point temp; + for (int j = 0; j < cnt - 1; j++) { + for (int i = 0; i < cnt - j - 1; i++) { + if (point_cmp(cross_points[i], cross_points[i + 1], poly_center)) { + temp = cross_points[i]; + cross_points[i] = cross_points[i + 1]; + cross_points[i + 1] = temp; + } + } + } + +#ifdef DEBUG + RCLCPP_DEBUG(rclcpp::get_logger("iou3d_nms"), "cnt=%d", cnt); + for (int i = 0; i < cnt; i++) { + RCLCPP_DEBUG( + rclcpp::get_logger("iou3d_nms"), "All cross point %d: (%.3f, %.3f)", i, cross_points[i].x, + cross_points[i].y); + } +#endif + + // get the overlap areas + float area = 0; + for (int k = 0; k < cnt - 1; k++) { + area += cross(cross_points[k] - cross_points[0], cross_points[k + 1] - cross_points[0]); + } + + return fabs(area) / 2.0; +} + +__device__ inline float iou_bev(const float * box_a, const float * box_b) +{ + // params box_a: [x, y, z, dx, dy, dz, heading] + // params box_b: [x, y, z, dx, dy, dz, heading] + float sa = box_a[3] * box_a[4]; + float sb = box_b[3] * box_b[4]; + float s_overlap = box_overlap(box_a, box_b); + return s_overlap / fmaxf(sa + sb - s_overlap, EPS); +} + +__device__ inline float iou_normal(float const * const a, float const * const b) +{ + // params: a: [x, y, z, dx, dy, dz, heading] + // params: b: [x, y, z, dx, dy, dz, heading] + float left = fmaxf(a[0] - a[3] / 2, b[0] - b[3] / 2), + right = fminf(a[0] + a[3] / 2, b[0] + b[3] / 2); + float top = fmaxf(a[1] - a[4] / 2, b[1] - b[4] / 2), + bottom = fminf(a[1] + a[4] / 2, b[1] + b[4] / 2); + float width = fmaxf(right - left, 0.f), height = fmaxf(bottom - top, 0.f); + float interS = width * height; + float Sa = a[3] * a[4]; + float Sb = b[3] * b[4]; + return interS / fmaxf(Sa + Sb - interS, EPS); +} + +__global__ void boxes_overlap_kernel( + const int num_a, const float * boxes_a, const int num_b, const float * boxes_b, + float * ans_overlap) +{ + // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading] + // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading] + const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y; + const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x; + + if (a_idx >= num_a || b_idx >= num_b) { + return; + } + const float * cur_box_a = boxes_a + a_idx * kBoxBlockSize; + const float * cur_box_b = boxes_b + b_idx * kBoxBlockSize; + float s_overlap = box_overlap(cur_box_a, cur_box_b); + ans_overlap[a_idx * num_b + b_idx] = s_overlap; +} + +__global__ void boxes_iou_bev_kernel( + const int num_a, const float * boxes_a, const int num_b, const float * boxes_b, float * ans_iou) +{ + // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading] + // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading] + const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y; + const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x; + + if (a_idx >= num_a || b_idx >= num_b) { + return; + } + + const float * cur_box_a = boxes_a + a_idx * kBoxBlockSize; + const float * cur_box_b = boxes_b + b_idx * kBoxBlockSize; + float cur_iou_bev = iou_bev(cur_box_a, cur_box_b); + ans_iou[a_idx * num_b + b_idx] = cur_iou_bev; +} + +__global__ void RotatedNmsKernel( + const int boxes_num, const float nms_overlap_thresh, const float * boxes, std::int64_t * mask) +{ + // params: boxes (N, 7) [x, y, z, dx, dy, dz, heading] + // params: mask (N, N/THREADS_PER_BLOCK_NMS) + + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + // if (row_start > col_start) return; + + const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS); + const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS); + + __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * kBoxBlockSize]; + + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * kBoxBlockSize + 0] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * kBoxBlockSize + 0]; + block_boxes[threadIdx.x * kBoxBlockSize + 1] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * kBoxBlockSize + 1]; + block_boxes[threadIdx.x * kBoxBlockSize + 2] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * kBoxBlockSize + 2]; + block_boxes[threadIdx.x * kBoxBlockSize + 3] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * kBoxBlockSize + 3]; + block_boxes[threadIdx.x * kBoxBlockSize + 4] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * kBoxBlockSize + 4]; + block_boxes[threadIdx.x * kBoxBlockSize + 5] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * kBoxBlockSize + 5]; + block_boxes[threadIdx.x * kBoxBlockSize + 6] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * kBoxBlockSize + 6]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x; + const float * cur_box = boxes + cur_box_idx * kBoxBlockSize; + + int i = 0; + std::int64_t t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + if (iou_bev(cur_box, block_boxes + i * kBoxBlockSize) > nms_overlap_thresh) { + t |= 1ULL << i; + } + } + const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS); + mask[cur_box_idx * col_blocks + col_start] = t; + } +} + +__global__ void NmsKernel( + const int boxes_num, const float nms_overlap_thresh, const float * boxes, std::int64_t * mask) +{ + // params: boxes (N, 7) [x, y, z, dx, dy, dz, heading] + // params: mask (N, N/THREADS_PER_BLOCK_NMS) + + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + // if (row_start > col_start) return; + + const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS); + const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS); + + __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * kBoxBlockSize]; + + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * kBoxBlockSize + 0] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * kBoxBlockSize + 0]; + block_boxes[threadIdx.x * kBoxBlockSize + 1] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * kBoxBlockSize + 1]; + block_boxes[threadIdx.x * kBoxBlockSize + 2] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * kBoxBlockSize + 2]; + block_boxes[threadIdx.x * kBoxBlockSize + 3] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * kBoxBlockSize + 3]; + block_boxes[threadIdx.x * kBoxBlockSize + 4] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * kBoxBlockSize + 4]; + block_boxes[threadIdx.x * kBoxBlockSize + 5] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * kBoxBlockSize + 5]; + block_boxes[threadIdx.x * kBoxBlockSize + 6] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * kBoxBlockSize + 6]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x; + const float * cur_box = boxes + cur_box_idx * kBoxBlockSize; + + int i = 0; + std::int64_t t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + if (iou_normal(cur_box, block_boxes + i * kBoxBlockSize) > nms_overlap_thresh) { + t |= 1ULL << i; + } + } + const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS); + mask[cur_box_idx * col_blocks + col_start] = t; + } +} + +__global__ void RotatedNmsWithIndicesKernel( + const int boxes_num, const float nms_overlap_thresh, const float * res_box, + const int * res_sorted_indices, std::int64_t * mask) +{ + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS); + const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS); + + __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * kBoxBlockSize]; + + if (threadIdx.x < col_size) { // 419, 420 have two situations to discuss separately + const int col_actual_idx = res_sorted_indices[THREADS_PER_BLOCK_NMS * col_start + threadIdx.x]; + block_boxes[threadIdx.x * kBoxBlockSize + 0] = res_box[col_actual_idx * kBoxBlockSize + 0]; + block_boxes[threadIdx.x * kBoxBlockSize + 1] = res_box[col_actual_idx * kBoxBlockSize + 1]; + block_boxes[threadIdx.x * kBoxBlockSize + 2] = res_box[col_actual_idx * kBoxBlockSize + 2]; + block_boxes[threadIdx.x * kBoxBlockSize + 3] = res_box[col_actual_idx * kBoxBlockSize + 3]; + block_boxes[threadIdx.x * kBoxBlockSize + 4] = res_box[col_actual_idx * kBoxBlockSize + 4]; + block_boxes[threadIdx.x * kBoxBlockSize + 5] = res_box[col_actual_idx * kBoxBlockSize + 5]; + block_boxes[threadIdx.x * kBoxBlockSize + 6] = res_box[col_actual_idx * kBoxBlockSize + 6]; + } + __syncthreads(); // Waiting for threads within the block + + if (threadIdx.x < row_size) { // + const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x; + const int row_actual_idx = res_sorted_indices[cur_box_idx]; + + float cur_box[kBoxBlockSize]; + cur_box[0] = res_box[row_actual_idx * kBoxBlockSize + 0]; + cur_box[1] = res_box[row_actual_idx * kBoxBlockSize + 1]; + cur_box[2] = res_box[row_actual_idx * kBoxBlockSize + 2]; + cur_box[3] = res_box[row_actual_idx * kBoxBlockSize + 3]; + cur_box[4] = res_box[row_actual_idx * kBoxBlockSize + 4]; + cur_box[5] = res_box[row_actual_idx * kBoxBlockSize + 5]; + cur_box[6] = res_box[row_actual_idx * kBoxBlockSize + 6]; + + int i = 0; + std::int64_t t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; // Isn't it right now to compare with [0~threadIdx. x)? FIXME + } // I think this is correct, and there is no need to compare it with numbers smaller than my + // own. If col is smaller than row, there is also no need to compare it + for (i = start; i < col_size; i++) { + if (iou_bev(cur_box, block_boxes + i * kBoxBlockSize) > nms_overlap_thresh) { + t |= 1ULL << i; + } + } + + const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS); + // assume cur_box_idx = 21, col_start = 0, row_start = 0 , threadIdx = 21, + // mark 21 th box and top 64 boxes + mask[cur_box_idx * col_blocks + col_start] = t; + // or col_start * boxes_num + cur_box_idx + } +} + +__global__ void box_assign_kernel( + float * reg, float * height, float * dim, float * rot, float * boxes, float * score, int * label, + float * out_score, int * out_label, int * validIndexs, int head_x_size, int head_y_size) +{ + int boxId = blockIdx.x; + int channel = threadIdx.x; + int idx = validIndexs[boxId]; + if (channel == 0) { + boxes[boxId * kBoxBlockSize + 0] = reg[idx]; + } else if (channel == 1) { + boxes[boxId * kBoxBlockSize + 1] = reg[idx + head_x_size * head_y_size]; + } else if (channel == 2) { + boxes[boxId * kBoxBlockSize + 2] = height[idx]; + } else if (channel == 3) { + boxes[boxId * kBoxBlockSize + 3] = dim[idx]; + } else if (channel == 4) { + boxes[boxId * kBoxBlockSize + 4] = dim[idx + head_x_size * head_y_size]; + } else if (channel == 5) { + boxes[boxId * kBoxBlockSize + 5] = dim[idx + 2 * head_x_size * head_y_size]; + } else if (channel == 6) { + float theta = + atan2f(rot[0 * head_x_size * head_y_size + idx], rot[1 * head_x_size * head_y_size + idx]); + theta = -theta - 3.1415926 / 2; + boxes[boxId * kBoxBlockSize + 6] = theta; + } + // else if(channel == 7) + // out_score[boxId] = score[idx]; + else if (channel == 8) { + out_label[boxId] = label[idx]; + } +} + +void box_assign_launcher( + float * reg, float * height, float * dim, float * rot, float * boxes, float * score, int * label, + float * out_score, int * out_label, int * validIndexs, int boxSize, int head_x_size, + int head_y_size) +{ + box_assign_kernel<<>>( + reg, height, dim, rot, boxes, score, label, out_score, out_label, validIndexs, head_x_size, + head_y_size); +} + +__global__ void index_assign(int * indexs) +{ + int yIdx = blockIdx.x; + int xIdx = threadIdx.x; + int idx = yIdx * blockDim.x + xIdx; + indexs[idx] = idx; +} + +void index_assign_launcher(int * indexs, int head_x_size, int head_y_size) +{ + index_assign<<>>(indexs); +} + +void boxes_overlap_launcher( + const int num_a, const float * boxes_a, const int num_b, const float * boxes_b, + float * ans_overlap) +{ + dim3 blocks( + DIVUP(num_b, THREADS_PER_BLOCK), + DIVUP(num_a, THREADS_PER_BLOCK)); // blockIdx.x(col), blockIdx.y(row) + dim3 threads(THREADS_PER_BLOCK, THREADS_PER_BLOCK); + + boxes_overlap_kernel<<>>(num_a, boxes_a, num_b, boxes_b, ans_overlap); +#ifdef DEBUG + cudaDeviceSynchronize(); // for using printf in kernel function +#endif +} + +void boxes_iou_bev_launcher( + const int num_a, const float * boxes_a, const int num_b, const float * boxes_b, float * ans_iou) +{ + dim3 blocks( + DIVUP(num_b, THREADS_PER_BLOCK), + DIVUP(num_a, THREADS_PER_BLOCK)); // blockIdx.x(col), blockIdx.y(row) + dim3 threads(THREADS_PER_BLOCK, THREADS_PER_BLOCK); + + boxes_iou_bev_kernel<<>>(num_a, boxes_a, num_b, boxes_b, ans_iou); +#ifdef DEBUG + cudaDeviceSynchronize(); // for using printf in kernel function +#endif +} + +void nms_launcher(const float * boxes, std::int64_t * mask, int boxes_num, float nms_overlap_thresh) +{ + dim3 blocks(DIVUP(boxes_num, THREADS_PER_BLOCK_NMS), DIVUP(boxes_num, THREADS_PER_BLOCK_NMS)); + dim3 threads(THREADS_PER_BLOCK_NMS); + + RotatedNmsKernel<<>>(boxes_num, nms_overlap_thresh, boxes, mask); +} + +void nms_normal_launcher( + const float * boxes, std::int64_t * mask, int boxes_num, float nms_overlap_thresh) +{ + dim3 blocks(DIVUP(boxes_num, THREADS_PER_BLOCK_NMS), DIVUP(boxes_num, THREADS_PER_BLOCK_NMS)); + dim3 threads(THREADS_PER_BLOCK_NMS); + NmsKernel<<>>(boxes_num, nms_overlap_thresh, boxes, mask); +} + +Iou3dNmsCuda::Iou3dNmsCuda( + const int head_x_size, const int head_y_size, const float nms_overlap_thresh) +: head_x_size_(head_x_size), head_y_size_(head_y_size), nms_overlap_thresh_(nms_overlap_thresh) +{ +} + +int Iou3dNmsCuda::DoIou3dNms( + const int box_num_pre, const float * res_box, const int * res_sorted_indices, + std::int64_t * host_keep_data) +{ + const int col_blocks = DIVUP(box_num_pre, THREADS_PER_BLOCK_NMS); // How many blocks are divided + std::int64_t * dev_mask = NULL; + cudaMalloc((void **)&dev_mask, box_num_pre * col_blocks * sizeof(std::int64_t)); // + // THREADS_PER_BLOCK_NMS 掩码的长度 + dim3 blocks(DIVUP(box_num_pre, THREADS_PER_BLOCK_NMS), DIVUP(box_num_pre, THREADS_PER_BLOCK_NMS)); + dim3 threads(THREADS_PER_BLOCK_NMS); + RotatedNmsWithIndicesKernel<<>>( + box_num_pre, nms_overlap_thresh_, res_box, res_sorted_indices, dev_mask); + + std::int64_t host_mask[box_num_pre * col_blocks]; + cudaMemcpy( + host_mask, dev_mask, box_num_pre * col_blocks * sizeof(std::int64_t), cudaMemcpyDeviceToHost); + cudaFree(dev_mask); + + std::int64_t host_remv[col_blocks]; + memset(host_remv, 0, col_blocks * sizeof(std::int64_t)); + int num_to_keep = 0; + for (int i = 0; i < box_num_pre; i++) { + int nblock = i / THREADS_PER_BLOCK_NMS; + int inblock = i % THREADS_PER_BLOCK_NMS; + if (!(host_remv[nblock] & + (1ULL << inblock))) { // Meeting this requirement indicates that it does not conflict + // with any of the boxes selected earlier + host_keep_data[num_to_keep++] = i; // Add the satisfied box + for (int j = nblock; j < col_blocks; + j++) { // Consider boxes with index i that overlap beyond the threshold + host_remv[j] |= host_mask[i * col_blocks + j]; + } + } + } + + if (cudaSuccess != cudaGetLastError()) { + RCLCPP_ERROR( + rclcpp::get_logger("iou3d_nms"), "Error: %s", cudaGetErrorString(cudaGetLastError())); + } + return num_to_keep; +} diff --git a/perception/autoware_tensorrt_bevdet/src/nvjpegdecoder.cpp b/perception/autoware_tensorrt_bevdet/src/nvjpegdecoder.cpp new file mode 100644 index 0000000000000..7428c997e94e9 --- /dev/null +++ b/perception/autoware_tensorrt_bevdet/src/nvjpegdecoder.cpp @@ -0,0 +1,299 @@ +// Copyright 2024 AutoCore, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#ifdef __HAVE_NVJPEG__ + +#include "autoware/tensorrt_bevdet/nvjpegdecoder.hpp" + +#include + +using std::chrono::duration; +using std::chrono::high_resolution_clock; + +int dev_malloc(void ** p, size_t s) +{ + return static_cast(cudaMalloc(p, s)); +} + +int dev_free(void * p) +{ + return static_cast(cudaFree(p)); +} + +int host_malloc(void ** p, size_t s, unsigned int f) +{ + return static_cast(cudaHostAlloc(p, s, f)); +} + +int host_free(void * p) +{ + return static_cast(cudaFreeHost(p)); +} + +int prepare_buffers( + const std::vector> & file_data, std::vector & img_width, + std::vector & img_height, std::vector & ibuf, + std::vector & isz, share_params & share_param) +{ + int widths[NVJPEG_MAX_COMPONENT]; + int heights[NVJPEG_MAX_COMPONENT]; + int channels; + nvjpegChromaSubsampling_t subsampling; + + for (size_t i = 0; i < file_data.size(); i++) { + CHECK_NVJPEG(nvjpegGetImageInfo( + share_param.nvjpeg_handle, reinterpret_cast(file_data[i].data()), + file_data[i].size(), &channels, &subsampling, widths, heights)); + + int mul = 1; + // in the case of interleaved RGB output, write only to single channel, but + // 3 samples at once + if (share_param.fmt == NVJPEG_OUTPUT_RGBI || share_param.fmt == NVJPEG_OUTPUT_BGRI) { + channels = 1; + mul = 3; + } else if (share_param.fmt == NVJPEG_OUTPUT_RGB || share_param.fmt == NVJPEG_OUTPUT_BGR) { + // in the case of rgb create 3 buffers with sizes of original image + channels = 3; + widths[1] = widths[2] = widths[0]; + heights[1] = heights[2] = heights[0]; + } + + if (img_width[i] != widths[0] || img_height[i] != heights[0]) { + img_width[i] = widths[0]; + img_height[i] = heights[0]; + // realloc output buffer if required + for (int c = 0; c < channels; c++) { + int aw = mul * widths[c]; + int ah = heights[c]; + size_t sz = aw * ah; + ibuf[i].pitch[c] = aw; + if (sz > isz[i].pitch[c]) { + if (ibuf[i].channel[c]) { + CHECK_CUDA(cudaFree(ibuf[i].channel[c])); + } + CHECK_CUDA(cudaMalloc(reinterpret_cast(&ibuf[i].channel[c]), sz)); + isz[i].pitch[c] = sz; + } + } + } + } + return EXIT_SUCCESS; +} + +void create_decoupled_api_handles(std::vector & params, share_params & share_param) +{ + for (size_t i = 0; i < params.size(); i++) { + CHECK_NVJPEG(nvjpegDecoderCreate( + share_param.nvjpeg_handle, NVJPEG_BACKEND_DEFAULT, ¶ms[i].nvjpeg_decoder)); + CHECK_NVJPEG(nvjpegDecoderStateCreate( + share_param.nvjpeg_handle, params[i].nvjpeg_decoder, ¶ms[i].nvjpeg_decoupled_state)); + + CHECK_NVJPEG( + nvjpegBufferPinnedCreate(share_param.nvjpeg_handle, NULL, ¶ms[i].pinned_buffers[0])); + CHECK_NVJPEG( + nvjpegBufferPinnedCreate(share_param.nvjpeg_handle, NULL, ¶ms[i].pinned_buffers[1])); + CHECK_NVJPEG( + nvjpegBufferDeviceCreate(share_param.nvjpeg_handle, NULL, ¶ms[i].device_buffer)); + + CHECK_NVJPEG(nvjpegJpegStreamCreate(share_param.nvjpeg_handle, ¶ms[i].jpeg_streams[0])); + CHECK_NVJPEG(nvjpegJpegStreamCreate(share_param.nvjpeg_handle, ¶ms[i].jpeg_streams[1])); + + CHECK_NVJPEG( + nvjpegDecodeParamsCreate(share_param.nvjpeg_handle, ¶ms[i].nvjpeg_decode_params)); + } +} + +void destroy_decoupled_api_handles( + std::vector & params, share_params & share_param) +{ + for (size_t i = 0; i < params.size(); i++) { + CHECK_NVJPEG(nvjpegDecodeParamsDestroy(params[i].nvjpeg_decode_params)); + + CHECK_NVJPEG(nvjpegJpegStreamDestroy(params[i].jpeg_streams[0])); + CHECK_NVJPEG(nvjpegJpegStreamDestroy(params[i].jpeg_streams[1])); + CHECK_NVJPEG(nvjpegBufferPinnedDestroy(params[i].pinned_buffers[0])); + CHECK_NVJPEG(nvjpegBufferPinnedDestroy(params[i].pinned_buffers[1])); + CHECK_NVJPEG(nvjpegBufferDeviceDestroy(params[i].device_buffer)); + + CHECK_NVJPEG(nvjpegJpegStateDestroy(params[i].nvjpeg_decoupled_state)); + CHECK_NVJPEG(nvjpegDecoderDestroy(params[i].nvjpeg_decoder)); + } +} + +void release_buffers(std::vector & ibuf) +{ + for (size_t i = 0; i < ibuf.size(); i++) { + for (size_t c = 0; c < NVJPEG_MAX_COMPONENT; c++) + if (ibuf[i].channel[c]) CHECK_CUDA(cudaFree(ibuf[i].channel[c])); + } +} + +int get_img( + const uchar * d_chanR, int pitchR, const uchar * d_chanG, int pitchG, const uchar * d_chanB, + int pitchB, size_t width, size_t height, uchar * img) +{ + size_t step = width * height; + + CHECK_CUDA(cudaMemcpy2D( + img + 0 * step, static_cast(width), d_chanR, static_cast(pitchR), width, height, + cudaMemcpyDeviceToDevice)); + CHECK_CUDA(cudaMemcpy2D( + img + 1 * step, static_cast(width), d_chanG, static_cast(pitchG), width, height, + cudaMemcpyDeviceToDevice)); + CHECK_CUDA(cudaMemcpy2D( + img + 2 * step, static_cast(width), d_chanB, static_cast(pitchB), width, height, + cudaMemcpyDeviceToDevice)); + + return EXIT_SUCCESS; +} + +void decode_single_image( + const std::vector & img_data, nvjpegImage_t & out, decode_params_t & params, + share_params & share_param, double & time) +{ + CHECK_CUDA(cudaStreamSynchronize(params.stream)); + cudaEvent_t startEvent = NULL, stopEvent = NULL; + float loopTime = 0; + // cudaEventBlockingSync + CHECK_CUDA(cudaEventCreate(&startEvent)); + CHECK_CUDA(cudaEventCreate(&stopEvent)); + + CHECK_CUDA(cudaEventRecord(startEvent, params.stream)); + + CHECK_NVJPEG(nvjpegStateAttachDeviceBuffer(params.nvjpeg_decoupled_state, params.device_buffer)); + int buffer_index = 0; + CHECK_NVJPEG(nvjpegDecodeParamsSetOutputFormat(params.nvjpeg_decode_params, share_param.fmt)); + + CHECK_NVJPEG(nvjpegJpegStreamParse( + share_param.nvjpeg_handle, reinterpret_cast(img_data.data()), img_data.size(), 0, + 0, params.jpeg_streams[buffer_index])); + + CHECK_NVJPEG(nvjpegStateAttachPinnedBuffer( + params.nvjpeg_decoupled_state, params.pinned_buffers[buffer_index])); + + CHECK_NVJPEG(nvjpegDecodeJpegHost( + share_param.nvjpeg_handle, params.nvjpeg_decoder, params.nvjpeg_decoupled_state, + params.nvjpeg_decode_params, params.jpeg_streams[buffer_index])); + + CHECK_CUDA(cudaStreamSynchronize(params.stream)); + + CHECK_NVJPEG(nvjpegDecodeJpegTransferToDevice( + share_param.nvjpeg_handle, params.nvjpeg_decoder, params.nvjpeg_decoupled_state, + params.jpeg_streams[buffer_index], params.stream)); + + buffer_index = 1 - buffer_index; // switch pinned buffer in pipeline mode to avoid an extra sync + RCLCPP_DEBUG(rclcpp::get_logger("nvjpegDecoder"), buffer_index); + + CHECK_NVJPEG(nvjpegDecodeJpegDevice( + share_param.nvjpeg_handle, params.nvjpeg_decoder, params.nvjpeg_decoupled_state, &out, + params.stream)); + + // CHECK_CUDA(cudaStreamSynchronize(params.stream)); //TODO new add + + CHECK_CUDA(cudaEventRecord(stopEvent, params.stream)); + + CHECK_CUDA(cudaEventSynchronize(stopEvent)); + CHECK_CUDA(cudaEventElapsedTime(&loopTime, startEvent, stopEvent)); + time = 0.001 * static_cast(loopTime); // cudaEventElapsedTime returns milliseconds + // time = 1.0; +} + +nvjpegDecoder::nvjpegDecoder(size_t n, size_t _fmt) +: N_img(n), iout(n), isz(n), widths(n), heights(n), params(n), share_param(_fmt) +{ + init(); +} + +int nvjpegDecoder::decode(const std::vector> & files_data, uchar * out_imgs) +{ + for (size_t i = 0; i < params.size(); i++) { + CHECK_CUDA(cudaStreamCreate(¶ms[i].stream)); + } + if (prepare_buffers(files_data, widths, heights, iout, isz, share_param)) return EXIT_FAILURE; + + double times[6]; + + auto decode_start = high_resolution_clock::now(); + std::vector threads(files_data.size()); + for (size_t i = 0; i < files_data.size(); i++) { + threads[i] = std::thread( + decode_single_image, std::ref(files_data[i]), std::ref(iout[i]), std::ref(params[i]), + std::ref(share_param), std::ref(times[i])); + } + for (size_t i = 0; i < files_data.size(); i++) { + threads[i].join(); + } + auto decode_end = high_resolution_clock::now(); + duration decode_time = decode_end - decode_start; + + RCLCPP_INFO( + rclcpp::get_logger("nvjpegDecoder"), "Decode total time : %.4lf ms", + decode_time.count() * 1000); + + for (size_t i = 0; i < files_data.size(); i++) { + get_img( + iout[i].channel[0], iout[i].pitch[0], iout[i].channel[1], iout[i].pitch[1], + iout[i].channel[2], iout[i].pitch[2], widths[i], heights[i], + out_imgs + i * 3 * widths[i] * heights[i]); + } + + for (size_t i = 0; i < params.size(); i++) { + CHECK_CUDA(cudaStreamDestroy(params[i].stream)); + } + return EXIT_SUCCESS; +} + +int nvjpegDecoder::init() +{ + nvjpegDevAllocator_t dev_allocator = {&dev_malloc, &dev_free}; + nvjpegPinnedAllocator_t pinned_allocator = {&host_malloc, &host_free}; + + nvjpegStatus_t status = nvjpegCreateEx( + NVJPEG_BACKEND_HARDWARE, &dev_allocator, &pinned_allocator, NVJPEG_FLAGS_DEFAULT, + &share_param.nvjpeg_handle); + share_param.hw_decode_available = true; + if (status == NVJPEG_STATUS_ARCH_MISMATCH) { + RCLCPP_WARN( + rclcpp::get_logger("nvjpegDecoder"), + "Hardware Decoder not supported. Falling back to default backend"); + CHECK_NVJPEG(nvjpegCreateEx( + NVJPEG_BACKEND_DEFAULT, &dev_allocator, &pinned_allocator, NVJPEG_FLAGS_DEFAULT, + &share_param.nvjpeg_handle)); + share_param.hw_decode_available = false; + } else { + CHECK_NVJPEG(status); + } + + CHECK_NVJPEG(nvjpegJpegStateCreate(share_param.nvjpeg_handle, &share_param.nvjpeg_state)); + + create_decoupled_api_handles(params, share_param); + + for (size_t i = 0; i < iout.size(); i++) { + for (size_t c = 0; c < NVJPEG_MAX_COMPONENT; c++) { + iout[i].channel[c] = NULL; + iout[i].pitch[c] = 0; + isz[i].pitch[c] = 0; + } + } + return EXIT_SUCCESS; +} + +nvjpegDecoder::~nvjpegDecoder() +{ + release_buffers(iout); + destroy_decoupled_api_handles(params, share_param); + CHECK_NVJPEG(nvjpegJpegStateDestroy(share_param.nvjpeg_state)); + CHECK_NVJPEG(nvjpegDestroy(share_param.nvjpeg_handle)); +} + +#endif diff --git a/perception/autoware_tensorrt_bevdet/src/postprocess.cu b/perception/autoware_tensorrt_bevdet/src/postprocess.cu new file mode 100644 index 0000000000000..c7ac66a3236e6 --- /dev/null +++ b/perception/autoware_tensorrt_bevdet/src/postprocess.cu @@ -0,0 +1,212 @@ +// Copyright 2024 AutoCore, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "autoware/tensorrt_bevdet/postprocess.hpp" + +#include +#include +#include +#include +#include + +__device__ float sigmoid_gpu(const float x) +{ + return 1.0f / (1.0f + expf(-x)); +} + +__global__ void BEVDecodeObjectKernel( + const int map_size, // 40000 + const float score_thresh, // 0.1 + // const int nms_pre_max_size, // 4096 + const float x_start, const float y_start, const float x_step, const float y_step, + const int output_h, const int output_w, const int downsample_size, const int num_class_in_task, + const int cls_range, const float * reg, const float * hei, const float * dim, const float * rot, + const float * vel, const float * cls, float * res_box, float * res_conf, int * res_cls, + int * res_box_num, float * rescale_factor) +{ // According to the confidence level, the initial screening showed that there were res-box_num + // boxes after screening, not exceeding nms_pre_max_size 4096 + int idx = threadIdx.x + blockDim.x * blockIdx.x; + if (idx >= map_size) return; + + float max_score = cls[idx]; // Initialize as the first class of the task + int label = cls_range; // Initialize as the first class of the task + for (int i = 1; i < num_class_in_task; ++i) { + float cur_score = cls[idx + i * map_size]; + if (cur_score > max_score) { + max_score = cur_score; + label = i + cls_range; + } + } + + int coor_x = idx % output_h; // + int coor_y = idx / output_w; // + + float conf = sigmoid_gpu(max_score); // Calculate confidence level + if (conf > score_thresh) { + int cur_valid_box_id = atomicAdd(res_box_num, 1); + res_box[cur_valid_box_id * kBoxBlockSize + 0] = + (reg[idx + 0 * map_size] + coor_x) * x_step + x_start; + res_box[cur_valid_box_id * kBoxBlockSize + 1] = + (reg[idx + 1 * map_size] + coor_y) * y_step + y_start; + res_box[cur_valid_box_id * kBoxBlockSize + 2] = hei[idx]; + res_box[cur_valid_box_id * kBoxBlockSize + 3] = + expf(dim[idx + 0 * map_size]) * rescale_factor[label]; // nms scale + res_box[cur_valid_box_id * kBoxBlockSize + 4] = + expf(dim[idx + 1 * map_size]) * rescale_factor[label]; + res_box[cur_valid_box_id * kBoxBlockSize + 5] = + expf(dim[idx + 2 * map_size]) * rescale_factor[label]; + res_box[cur_valid_box_id * kBoxBlockSize + 6] = atan2f(rot[idx], rot[idx + map_size]); + res_box[cur_valid_box_id * kBoxBlockSize + 7] = vel[idx]; + res_box[cur_valid_box_id * kBoxBlockSize + 8] = vel[idx + map_size]; + + res_conf[cur_valid_box_id] = conf; + res_cls[cur_valid_box_id] = label; + } +} + +PostprocessGPU::PostprocessGPU( + const int _class_num, const float _score_thresh, const float _nms_thresh, + const int _nms_pre_maxnum, const int _nms_post_maxnum, const int _down_sample, + const int _output_h, const int _output_w, const float _x_step, const float _y_step, + const float _x_start, const float _y_start, const std::vector & _class_num_pre_task, + const std::vector & _nms_rescale_factor) +: class_num(_class_num), + score_thresh(_score_thresh), + nms_thresh(_nms_thresh), + nms_pre_maxnum(_nms_pre_maxnum), + nms_post_maxnum(_nms_post_maxnum), + down_sample(_down_sample), + output_h(_output_h), + output_w(_output_w), + x_step(_x_step), + y_step(_y_step), + x_start(_x_start), + y_start(_y_start), + map_size(output_h * output_w), + class_num_pre_task(_class_num_pre_task), + nms_rescale_factor(_nms_rescale_factor), + task_num(_class_num_pre_task.size()) +{ + CHECK_CUDA(cudaMalloc((void **)&boxes_dev, kBoxBlockSize * map_size * sizeof(float))); + CHECK_CUDA(cudaMalloc((void **)&score_dev, map_size * sizeof(float))); + CHECK_CUDA(cudaMalloc((void **)&cls_dev, map_size * sizeof(int))); + CHECK_CUDA(cudaMalloc((void **)&sorted_indices_dev, map_size * sizeof(int))); + CHECK_CUDA(cudaMalloc((void **)&valid_box_num, sizeof(int))); + CHECK_CUDA(cudaMalloc((void **)&nms_rescale_factor_dev, class_num * sizeof(float))); + + CHECK_CUDA(cudaMallocHost((void **)&boxes_host, kBoxBlockSize * map_size * sizeof(float))); + CHECK_CUDA(cudaMallocHost((void **)&score_host, nms_pre_maxnum * sizeof(float))); + CHECK_CUDA(cudaMallocHost((void **)&cls_host, map_size * sizeof(float))); + CHECK_CUDA(cudaMallocHost((void **)&sorted_indices_host, nms_pre_maxnum * sizeof(int))); + CHECK_CUDA(cudaMallocHost((void **)&keep_data_host, nms_pre_maxnum * sizeof(std::int64_t))); + + CHECK_CUDA(cudaMemcpy( + nms_rescale_factor_dev, nms_rescale_factor.data(), class_num * sizeof(float), + cudaMemcpyHostToDevice)); + + iou3d_nms.reset(new Iou3dNmsCuda(output_h, output_w, nms_thresh)); + + std::ostringstream oss; + for (auto i = 0; i < nms_rescale_factor.size(); i++) { + oss << std::fixed << std::setprecision(2) << nms_rescale_factor[i] << ' '; + } + RCLCPP_INFO(rclcpp::get_logger("PostprocessGPU"), "%s", oss.str().c_str()); +} +PostprocessGPU::~PostprocessGPU() +{ + CHECK_CUDA(cudaFree(boxes_dev)); + CHECK_CUDA(cudaFree(score_dev)); + CHECK_CUDA(cudaFree(cls_dev)); + CHECK_CUDA(cudaFree(sorted_indices_dev)); + CHECK_CUDA(cudaFree(valid_box_num)); + CHECK_CUDA(cudaFree(nms_rescale_factor_dev)); + + CHECK_CUDA(cudaFreeHost(boxes_host)); + CHECK_CUDA(cudaFreeHost(score_host)); + CHECK_CUDA(cudaFreeHost(cls_host)); + CHECK_CUDA(cudaFreeHost(sorted_indices_host)); + CHECK_CUDA(cudaFreeHost(keep_data_host)); +} + +void PostprocessGPU::DoPostprocess(void ** const bev_buffer, std::vector & out_detections) +{ + // bev_buffer : BEV_feat, reg_0, hei_0, dim_0, rot_0, vel_0, heatmap_0, reg_1 ... + const int task_num = class_num_pre_task.size(); + int cur_start_label = 0; + for (int i = 0; i < task_num; i++) { + float * reg = (float *)bev_buffer[i * 6 + 0]; // 2 x 128 x 128 + float * hei = (float *)bev_buffer[i * 6 + 1]; // 1 x 128 x 128 + float * dim = (float *)bev_buffer[i * 6 + 2]; // 3 x 128 x 128 + float * rot = (float *)bev_buffer[i * 6 + 3]; // 2 x 128 x 128 + float * vel = (float *)bev_buffer[i * 6 + 4]; // 2 x 128 x 128 + float * heatmap = (float *)bev_buffer[i * 6 + 5]; // c x 128 x 128 + + dim3 grid(DIVUP(map_size, NUM_THREADS)); + CHECK_CUDA(cudaMemset(valid_box_num, 0, sizeof(int))); + BEVDecodeObjectKernel<<>>( + map_size, score_thresh, x_start, y_start, x_step, y_step, output_h, output_w, down_sample, + class_num_pre_task[i], cur_start_label, reg, hei, dim, rot, vel, heatmap, boxes_dev, + score_dev, cls_dev, valid_box_num, nms_rescale_factor_dev); + + /* + at this point, boxes_dev, score_dev, cls_dev have valid_box_num elements,which may be greater + than nms_pre_maxnum, and it's arranged in disorder + */ + int box_num_pre = 0; + CHECK_CUDA(cudaMemcpy(&box_num_pre, valid_box_num, sizeof(int), cudaMemcpyDeviceToHost)); + + thrust::sequence(thrust::device, sorted_indices_dev, sorted_indices_dev + box_num_pre); + thrust::sort_by_key( + thrust::device, score_dev, score_dev + box_num_pre, sorted_indices_dev, + thrust::greater()); + // score_dev is sorted in descending order, while sorted_indices_dev indexes the original + // order,, sorted_indices_dev[i] = j; i:current,j:pre-index; j:[0, map_size] + + box_num_pre = std::min(box_num_pre, nms_pre_maxnum); + + int box_num_post = + iou3d_nms->DoIou3dNms(box_num_pre, boxes_dev, sorted_indices_dev, keep_data_host); + + box_num_post = std::min(box_num_post, nms_post_maxnum); + + CHECK_CUDA(cudaMemcpy( + sorted_indices_host, sorted_indices_dev, box_num_pre * sizeof(int), cudaMemcpyDeviceToHost)); + CHECK_CUDA(cudaMemcpy( + boxes_host, boxes_dev, kBoxBlockSize * map_size * sizeof(float), cudaMemcpyDeviceToHost)); + CHECK_CUDA( + cudaMemcpy(score_host, score_dev, box_num_pre * sizeof(float), cudaMemcpyDeviceToHost)); + CHECK_CUDA(cudaMemcpy(cls_host, cls_dev, map_size * sizeof(float), cudaMemcpyDeviceToHost)); + + for (auto j = 0; j < box_num_post; j++) { + int k = keep_data_host[j]; + int idx = sorted_indices_host[k]; + Box box; + box.x = boxes_host[idx * kBoxBlockSize + 0]; + box.y = boxes_host[idx * kBoxBlockSize + 1]; + box.z = boxes_host[idx * kBoxBlockSize + 2]; + box.l = boxes_host[idx * kBoxBlockSize + 3] / nms_rescale_factor[cls_host[idx]]; + box.w = boxes_host[idx * kBoxBlockSize + 4] / nms_rescale_factor[cls_host[idx]]; + box.h = boxes_host[idx * kBoxBlockSize + 5] / nms_rescale_factor[cls_host[idx]]; + box.r = boxes_host[idx * kBoxBlockSize + 6]; + box.vx = boxes_host[idx * kBoxBlockSize + 7]; + box.vy = boxes_host[idx * kBoxBlockSize + 8]; + + box.label = cls_host[idx]; + box.score = score_host[k]; + box.z -= box.h * 0.5; // bottom height + out_detections.push_back(box); + } + + cur_start_label += class_num_pre_task[i]; + } +} diff --git a/perception/autoware_tensorrt_bevdet/src/preprocess.cu b/perception/autoware_tensorrt_bevdet/src/preprocess.cu new file mode 100644 index 0000000000000..2a1bdc426a42f --- /dev/null +++ b/perception/autoware_tensorrt_bevdet/src/preprocess.cu @@ -0,0 +1,37 @@ +// Copyright 2024 AutoCore, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "autoware/tensorrt_bevdet/preprocess.hpp" + +#include + +#include + +__global__ void convert_RGBHWC_to_BGRCHW_kernel( + uchar * input, uchar * output, int channels, int height, int width) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < channels * height * width) { + int y = index / 3 / width; + int x = index / 3 % width; + int c = 2 - index % 3; // RGB to BGR + + output[c * height * width + y * width + x] = input[index]; + } +} +// RGBHWC to BGRCHW +void convert_RGBHWC_to_BGRCHW(uchar * input, uchar * output, int channels, int height, int width) +{ + convert_RGBHWC_to_BGRCHW_kernel<<>>( + input, output, channels, height, width); +} diff --git a/perception/autoware_tensorrt_bevdet/src/preprocess_plugin.cu b/perception/autoware_tensorrt_bevdet/src/preprocess_plugin.cu new file mode 100644 index 0000000000000..25abf687aaae8 --- /dev/null +++ b/perception/autoware_tensorrt_bevdet/src/preprocess_plugin.cu @@ -0,0 +1,347 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "autoware/tensorrt_bevdet/common.hpp" +#include "autoware/tensorrt_bevdet/preprocess_plugin.hpp" + +#include +#include + +#include + +// kernel for GPU + +template +__global__ void preprocess_kernel( + const uint8_t * src_dev, T * dst_dev, int src_row_step, int dst_row_step, int src_img_step, + int dst_img_step, int src_h, int src_w, float radio_h, float radio_w, float offset_h, + float offset_w, const float * mean, const float * std, int dst_h, int dst_w, int n) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= dst_h * dst_w * n) return; + + int i = (idx / n) / dst_w; + int j = (idx / n) % dst_w; + int k = idx % n; + + int pX = (int)roundf((i / radio_h) + offset_h); + int pY = (int)roundf((j / radio_w) + offset_w); + + if (pX < src_h && pX >= 0 && pY < src_w && pY >= 0) { + int s1 = k * src_img_step + 0 * src_img_step / 3 + pX * src_row_step + pY; + int s2 = k * src_img_step + 1 * src_img_step / 3 + pX * src_row_step + pY; + int s3 = k * src_img_step + 2 * src_img_step / 3 + pX * src_row_step + pY; + + int d1 = k * dst_img_step + 0 * dst_img_step / 3 + i * dst_row_step + j; + int d2 = k * dst_img_step + 1 * dst_img_step / 3 + i * dst_row_step + j; + int d3 = k * dst_img_step + 2 * dst_img_step / 3 + i * dst_row_step + j; + + dst_dev[d1] = (static_cast(src_dev[s1]) - static_cast(mean[0])) / static_cast(std[0]); + dst_dev[d2] = (static_cast(src_dev[s2]) - static_cast(mean[1])) / static_cast(std[1]); + dst_dev[d3] = (static_cast(src_dev[s3]) - static_cast(mean[2])) / static_cast(std[2]); + } +} + +namespace nvinfer1 +{ +// class PreprocessPlugin +PreprocessPlugin::PreprocessPlugin( + const std::string & name, int crop_h, int crop_w, float resize_radio) +: name_(name) +{ + m_.crop_h = crop_h; + m_.crop_w = crop_w; + m_.resize_radio = resize_radio; +} + +PreprocessPlugin::PreprocessPlugin(const std::string & name, const void * buffer, size_t length) +: name_(name) +{ + memcpy(&m_, buffer, sizeof(m_)); +} + +PreprocessPlugin::~PreprocessPlugin() +{ +} + +IPluginV2DynamicExt * PreprocessPlugin::clone() const noexcept +{ + auto p = new PreprocessPlugin(name_, &m_, sizeof(m_)); + p->setPluginNamespace(namespace_.c_str()); + return p; +} + +int32_t PreprocessPlugin::getNbOutputs() const noexcept +{ + return 1; +} + +DataType PreprocessPlugin::getOutputDataType( + int32_t index, DataType const * inputTypes, int32_t nbInputs) const noexcept +{ + // return DataType::kHALF; + return DataType::kFLOAT; +} + +DimsExprs PreprocessPlugin::getOutputDimensions( + int32_t outputIndex, const DimsExprs * inputs, int32_t nbInputs, + IExprBuilder & exprBuilder) noexcept +{ + int input_h = inputs[0].d[2]->getConstantValue(); + int input_w = inputs[0].d[3]->getConstantValue() * 4; + + int output_h = input_h * m_.resize_radio - m_.crop_h; + int output_w = input_w * m_.resize_radio - m_.crop_w; + + DimsExprs ret; + ret.nbDims = inputs[0].nbDims; + ret.d[0] = inputs[0].d[0]; + ret.d[1] = inputs[0].d[1]; + ret.d[2] = exprBuilder.constant(output_h); + ret.d[3] = exprBuilder.constant(output_w); + + return ret; +} + +bool PreprocessPlugin::supportsFormatCombination( + int32_t pos, const PluginTensorDesc * inOut, int32_t nbInputs, int32_t nbOutputs) noexcept +{ + bool res; + switch (pos) { + case 0: // images + res = (inOut[0].type == DataType::kINT8 || inOut[0].type == DataType::kINT32) && + inOut[0].format == TensorFormat::kLINEAR; + break; + case 1: // mean + res = (inOut[1].type == DataType::kFLOAT) && inOut[1].format == TensorFormat::kLINEAR; + break; + case 2: // std + res = (inOut[2].type == DataType::kFLOAT) && inOut[2].format == TensorFormat::kLINEAR; + break; + case 3: // output img tensor + res = (inOut[3].type == DataType::kFLOAT || inOut[3].type == DataType::kHALF) && + inOut[3].format == inOut[0].format; + + // res = inOut[3].type == DataType::kHALF && inOut[3].format == inOut[0].format; + break; + default: + res = false; + } + return res; +} + +void PreprocessPlugin::configurePlugin( + const DynamicPluginTensorDesc * in, int32_t nbInputs, const DynamicPluginTensorDesc * out, + int32_t nbOutputs) noexcept +{ + return; +} + +size_t PreprocessPlugin::getWorkspaceSize( + const PluginTensorDesc * inputs, int32_t nbInputs, const PluginTensorDesc * outputs, + int32_t nbOutputs) const noexcept +{ + return 0; +} + +int32_t PreprocessPlugin::enqueue( + const PluginTensorDesc * inputDesc, const PluginTensorDesc * outputDesc, + const void * const * inputs, void * const * outputs, void * workspace, + cudaStream_t stream) noexcept +{ + int n_img = inputDesc[0].dims.d[0]; + int src_img_h = inputDesc[0].dims.d[2]; + int src_img_w = inputDesc[0].dims.d[3] * 4; + + int dst_img_h = outputDesc[0].dims.d[2]; + int dst_img_w = outputDesc[0].dims.d[3]; + + int src_row_step = src_img_w; + int dst_row_step = dst_img_w; + + int src_img_step = src_img_w * src_img_h * 3; + int dst_img_step = dst_img_w * dst_img_h * 3; + + float offset_h = m_.crop_h / m_.resize_radio; + float offset_w = m_.crop_w / m_.resize_radio; + + dim3 grid(DIVUP(dst_img_h * dst_img_w * n_img, NUM_THREADS)); + dim3 block(NUM_THREADS); + + switch (int(outputDesc[0].type)) { + case int(DataType::kFLOAT): + // float + preprocess_kernel<<>>( + reinterpret_cast(inputs[0]), reinterpret_cast(outputs[0]), + src_row_step, dst_row_step, src_img_step, dst_img_step, src_img_h, src_img_w, + m_.resize_radio, m_.resize_radio, offset_h, offset_w, + reinterpret_cast(inputs[1]), reinterpret_cast(inputs[2]), + dst_img_h, dst_img_w, n_img); + break; + case int(DataType::kHALF): + // half + preprocess_kernel<<>>( + reinterpret_cast(inputs[0]), reinterpret_cast<__half *>(outputs[0]), + src_row_step, dst_row_step, src_img_step, dst_img_step, src_img_h, src_img_w, + m_.resize_radio, m_.resize_radio, offset_h, offset_w, + reinterpret_cast(inputs[1]), reinterpret_cast(inputs[2]), + dst_img_h, dst_img_w, n_img); + + break; + default: // should NOT be here + RCLCPP_ERROR(rclcpp::get_logger("PreprocessPlugin"), "\tUnsupport datatype!"); + } + return 0; +} + +void PreprocessPlugin::destroy() noexcept +{ + delete this; + return; +} + +int32_t PreprocessPlugin::initialize() noexcept +{ + return 0; +} + +void PreprocessPlugin::terminate() noexcept +{ + return; +} + +size_t PreprocessPlugin::getSerializationSize() const noexcept +{ + return sizeof(m_); +} + +void PreprocessPlugin::serialize(void * buffer) const noexcept +{ + memcpy(buffer, &m_, sizeof(m_)); + return; +} + +void PreprocessPlugin::setPluginNamespace(const char * pluginNamespace) noexcept +{ + namespace_ = pluginNamespace; + return; +} + +const char * PreprocessPlugin::getPluginNamespace() const noexcept +{ + return namespace_.c_str(); +} + +const char * PreprocessPlugin::getPluginType() const noexcept +{ + return PRE_PLUGIN_NAME; +} + +const char * PreprocessPlugin::getPluginVersion() const noexcept +{ + return PRE_PLUGIN_VERSION; +} + +void PreprocessPlugin::attachToContext( + cudnnContext * contextCudnn, cublasContext * contextCublas, IGpuAllocator * gpuAllocator) noexcept +{ + return; +} + +void PreprocessPlugin::detachFromContext() noexcept +{ + return; +} + +// class PreprocessPluginCreator +PluginFieldCollection PreprocessPluginCreator::fc_{}; +std::vector PreprocessPluginCreator::attr_; + +PreprocessPluginCreator::PreprocessPluginCreator() +{ + attr_.clear(); + attr_.emplace_back(PluginField("crop_h", nullptr, PluginFieldType::kINT32, 1)); + attr_.emplace_back(PluginField("crop_w", nullptr, PluginFieldType::kINT32, 1)); + attr_.emplace_back(PluginField("resize_radio", nullptr, PluginFieldType::kFLOAT32, 1)); + + fc_.nbFields = attr_.size(); + fc_.fields = attr_.data(); +} + +PreprocessPluginCreator::~PreprocessPluginCreator() +{ +} + +IPluginV2DynamicExt * PreprocessPluginCreator::createPlugin( + const char * name, const PluginFieldCollection * fc) noexcept +{ + const PluginField * fields = fc->fields; + + int crop_h = -1; + int crop_w = -1; + float resize_radio = 0.f; + + for (int i = 0; i < fc->nbFields; ++i) { + if (std::string(fc->fields[i].name) == std::string("crop_h")) { + crop_h = *reinterpret_cast(fc->fields[i].data); + } else if (std::string(fc->fields[i].name) == std::string("crop_w")) { + crop_w = *reinterpret_cast(fc->fields[i].data); + } else if (std::string(fc->fields[i].name) == std::string("resize_radio")) { + resize_radio = *reinterpret_cast(fc->fields[i].data); + } + } + PreprocessPlugin * pObj = new PreprocessPlugin(name, crop_h, crop_w, resize_radio); + pObj->setPluginNamespace(namespace_.c_str()); + return pObj; +} + +IPluginV2DynamicExt * PreprocessPluginCreator::deserializePlugin( + const char * name, const void * serialData, size_t serialLength) noexcept +{ + PreprocessPlugin * pObj = new PreprocessPlugin(name, serialData, serialLength); + pObj->setPluginNamespace(namespace_.c_str()); + return pObj; +} + +void PreprocessPluginCreator::setPluginNamespace(const char * pluginNamespace) noexcept +{ + namespace_ = pluginNamespace; + return; +} + +const char * PreprocessPluginCreator::getPluginNamespace() const noexcept +{ + return namespace_.c_str(); +} + +const char * PreprocessPluginCreator::getPluginName() const noexcept +{ + return PRE_PLUGIN_NAME; +} + +const char * PreprocessPluginCreator::getPluginVersion() const noexcept +{ + return PRE_PLUGIN_VERSION; +} + +const PluginFieldCollection * PreprocessPluginCreator::getFieldNames() noexcept +{ + return &fc_; +} + +REGISTER_TENSORRT_PLUGIN(PreprocessPluginCreator); + +} // namespace nvinfer1