diff --git a/launch/tier4_perception_launch/launch/object_recognition/detection/detection.launch.xml b/launch/tier4_perception_launch/launch/object_recognition/detection/detection.launch.xml
index 8f64b624540e7..38712790feeb0 100644
--- a/launch/tier4_perception_launch/launch/object_recognition/detection/detection.launch.xml
+++ b/launch/tier4_perception_launch/launch/object_recognition/detection/detection.launch.xml
@@ -211,6 +211,28 @@
       <arg name="radar_object_clustering_param_path" value="$(var radar_object_clustering_param_path)"/>
     </include>
   </group>
+  <group if="$(eval '&quot;$(var mode)&quot;==&quot;camera&quot;')">
+    <!-- Camera-based bev detectors-->
+    <include file="$(find-pkg-share tier4_perception_launch)/launch/object_recognition/detection/detector/camera_bev_detector.launch.xml">
+      <arg name="input/camera0/image" value="$(var input/camera0/image)"/>
+      <arg name="input/camera0/info" value="$(var input/camera0/info)"/>
+      <arg name="input/camera1/image" value="$(var input/camera1/image)"/>
+      <arg name="input/camera1/info" value="$(var input/camera1/info)"/>
+      <arg name="input/camera2/image" value="$(var input/camera2/image)"/>
+      <arg name="input/camera2/info" value="$(var input/camera2/info)"/>
+      <arg name="input/camera3/image" value="$(var input/camera3/image)"/>
+      <arg name="input/camera3/info" value="$(var input/camera3/info)"/>
+      <arg name="input/camera4/image" value="$(var input/camera4/image)"/>
+      <arg name="input/camera4/info" value="$(var input/camera4/info)"/>
+      <arg name="input/camera5/image" value="$(var input/camera5/image)"/>
+      <arg name="input/camera5/info" value="$(var input/camera5/info)"/>
+      <arg name="input/camera6/image" value="$(var input/camera6/image)"/>
+      <arg name="input/camera6/info" value="$(var input/camera6/info)"/>
+      <arg name="input/camera7/image" value="$(var input/camera7/image)"/>
+      <arg name="input/camera7/info" value="$(var input/camera7/info)"/>
+      <arg name="number_of_cameras" value="$(var number_of_cameras)"/>
+    </include>
+  </group>
   <!-- DetectionByTracker -->
   <group if="$(var switch/detector/tracker_based)">
     <include file="$(find-pkg-share tier4_perception_launch)/launch/object_recognition/detection/detector/tracker_based_detector.launch.xml">
diff --git a/launch/tier4_perception_launch/launch/object_recognition/detection/detector/camera_bev_detector.launch.xml b/launch/tier4_perception_launch/launch/object_recognition/detection/detector/camera_bev_detector.launch.xml
new file mode 100644
index 0000000000000..a8b1f3f63985d
--- /dev/null
+++ b/launch/tier4_perception_launch/launch/object_recognition/detection/detector/camera_bev_detector.launch.xml
@@ -0,0 +1,48 @@
+<?xml version="1.0"?>
+<launch>
+  <!-- Camera parameters -->
+  <arg name="input/camera0/image"/>
+  <arg name="input/camera0/info"/>
+  <arg name="input/camera1/image"/>
+  <arg name="input/camera1/info"/>
+  <arg name="input/camera2/image"/>
+  <arg name="input/camera2/info"/>
+  <arg name="input/camera3/image"/>
+  <arg name="input/camera3/info"/>
+  <arg name="input/camera4/image"/>
+  <arg name="input/camera4/info"/>
+  <arg name="input/camera5/image"/>
+  <arg name="input/camera5/info"/>
+  <arg name="input/camera6/image"/>
+  <arg name="input/camera6/info"/>
+  <arg name="input/camera7/image"/>
+  <arg name="input/camera7/info"/>
+  <arg name="number_of_cameras"/>
+
+  <!-- Model parameters -->
+  <arg name="data_path" default="$(env HOME)/autoware_data" description="packages data and artifacts directory path"/>
+  <arg name="bevdet_model_name" default="bevdet_one_lt_d"/>
+  <arg name="bevdet_model_path" default="$(var data_path)/tensorrt_bevdet"/>
+  <!-- BEVDet -->
+  <group>
+    <push-ros-namespace namespace="bevdet"/>
+    <include file="$(find-pkg-share autoware_tensorrt_bevdet)/launch/tensorrt_bevdet.launch.xml">
+      <arg name="input/img_fl" value="$(var input/camera0/image)"/>
+      <arg name="input/img_fl/camera_info" value="$(var input/camera0/info)"/>
+      <arg name="input/img_f" value="$(var input/camera1/image)"/>
+      <arg name="input/img_f/camera_info" value="$(var input/camera1/info)"/>
+      <arg name="input/img_fr" value="$(var input/camera2/image)"/>
+      <arg name="input/img_fr/camera_info" value="$(var input/camera2/info)"/>
+      <arg name="input/img_bl" value="$(var input/camera3/image)"/>
+      <arg name="input/img_bl/camera_info" value="$(var input/camera3/info)"/>
+      <arg name="input/img_b" value="$(var input/camera4/image)"/>
+      <arg name="input/img_b/camera_info" value="$(var input/camera4/info)"/>
+      <arg name="input/img_br" value="$(var input/camera5/image)"/>
+      <arg name="input/img_br/camera_info" value="$(var input/camera5/info)"/>
+      <arg name="output/boxes" value="boxes"/>
+      <arg name="data_path" value="$(var data_path)"/>
+      <arg name="model_name" value="$(var bevdet_model_name)"/>
+      <arg name="model_path" value="$(var bevdet_model_path)"/>
+    </include>
+  </group>
+</launch>
diff --git a/perception/autoware_tensorrt_bevdet/CMakeLists.txt b/perception/autoware_tensorrt_bevdet/CMakeLists.txt
new file mode 100644
index 0000000000000..a383274e5a617
--- /dev/null
+++ b/perception/autoware_tensorrt_bevdet/CMakeLists.txt
@@ -0,0 +1,102 @@
+cmake_minimum_required(VERSION 3.17)
+project(autoware_tensorrt_bevdet)
+
+add_compile_options(-W)
+add_compile_options(-std=c++17)
+set(CMAKE_CXX_FLAGS_RELEASE "-Wno-deprecated-declarations -O2")
+set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++17 -O0 -Xcompiler -fPIC -g -w -gencode=arch=compute_86,code=sm_86")
+find_package(tensorrt_common)
+if(NOT ${tensorrt_common_FOUND})
+  message(WARNING "The tensorrt_common package is not found. Please check its dependencies.")
+  return()
+endif()
+
+find_package(ament_cmake REQUIRED)
+find_package(cudnn_cmake_module REQUIRED)
+find_package(tensorrt_cmake_module REQUIRED)
+find_package(rclcpp REQUIRED)
+find_package(yaml-cpp REQUIRED)
+find_package(Eigen3 REQUIRED)
+find_package(OpenCV REQUIRED)
+find_package(PCL REQUIRED)
+find_package(sensor_msgs REQUIRED)
+find_package(pcl_conversions REQUIRED)
+find_package(cv_bridge REQUIRED)
+find_package(tf2_geometry_msgs REQUIRED)
+find_package(autoware_perception_msgs REQUIRED)
+
+find_package(CUDA REQUIRED)
+find_package(CUDAToolkit)
+find_package(CUDNN)
+find_package(TENSORRT)
+if(NOT (CUDAToolkit_FOUND AND CUDNN_FOUND AND TENSORRT_FOUND))
+  message(WARNING "cuda, cudnn, tensorrt libraries are not found")
+  return()
+endif()
+
+include_directories(
+  include
+  SYSTEM
+  ${YAML_CPP_INCLUDE_DIRS}
+  ${EIGEN3_INCLUDE_DIRS}
+  ${OpenCV_INCLUDE_DIRS}
+  ${PCL_INCLUDE_DIRS}
+  ${cv_bridge_INCLUDE_DIRS}
+  ${tf2_geometry_msgs_INCLUDE_DIRS}
+  ${autoware_perception_msgs_INCLUDE_DIRS}
+)
+
+cuda_add_executable(${PROJECT_NAME}_node
+    src/bevdet_node.cpp
+    src/bevdet.cpp
+    src/preprocess.cu
+    src/iou3d_nms.cu
+    src/postprocess.cu
+    src/data.cpp
+    src/cpu_jpegdecoder.cpp
+    src/nvjpegdecoder.cpp
+
+    src/preprocess_plugin.cu
+    src/bevpool_plugin.cu
+    src/alignbev_plugin.cu
+    src/gatherbev_plugin.cu
+)
+
+# Link libraries to executable
+ament_target_dependencies(${PROJECT_NAME}_node
+  "rclcpp"
+  "sensor_msgs"
+  "pcl_conversions"
+  "cv_bridge"
+  "autoware_perception_msgs"
+  "tf2_geometry_msgs"
+)
+
+target_link_libraries(${PROJECT_NAME}_node
+    yaml-cpp
+    libnvinfer.so
+    libnvonnxparser.so
+    libz.so
+    libjpeg.so
+    rclcpp::rclcpp
+    stdc++fs
+    ${NVINFER}
+    ${TENSORRT_LIBRARIES}
+    ${CUDA_LIBRARIES}
+    ${CUBLAS_LIBRARIES}
+    ${CUDNN_LIBRARY}
+    ${OpenCV_LIBS}
+    ${PCL_LIBRARIES}
+)
+
+install(TARGETS ${PROJECT_NAME}_node
+  DESTINATION lib/${PROJECT_NAME}
+)
+
+install(DIRECTORY
+  launch
+  config
+  DESTINATION share/${PROJECT_NAME}/
+)
+
+ament_package()
diff --git a/perception/autoware_tensorrt_bevdet/README.md b/perception/autoware_tensorrt_bevdet/README.md
new file mode 100644
index 0000000000000..ce6b1cc02c2bb
--- /dev/null
+++ b/perception/autoware_tensorrt_bevdet/README.md
@@ -0,0 +1,69 @@
+# tensorrt_bevdet <!-- cspell: ignore bevdet -->
+
+## Purpose
+
+tensorrt_bevdet is a dynamic 3D bev object detection package based on 6 surround view cameras.
+
+## Inner-workings / Algorithms
+
+BEVDet is a BEV perception algorithm based on panoramic cameras. It unifies multi-view images into the perspective of BEV for 3D object detection task. In this implementation, BEVDet network to inference with TensorRT.
+
+## Inputs / Outputs
+
+### Inputs
+
+| Name                               | Type                           | Description                         |
+| ---------------------------------- | ------------------------------ | ----------------------------------- |
+| `~/input/topic_img_fl`             | `sensor_msgs::msg::Image`      | input front_left camera image       |
+| `~/input/topic_img_f`              | `sensor_msgs::msg::Image`      | input front camera image            |
+| `~/input/topic_img_fr`             | `sensor_msgs::msg::Image`      | input front_right camera image      |
+| `~/input/topic_img_bl`             | `sensor_msgs::msg::Image`      | input back_left camera image        |
+| `~/input/topic_img_b`              | `sensor_msgs::msg::Image`      | input back camera image             |
+| `~/input/topic_img_br`             | `sensor_msgs::msg::Image`      | input back_right camera image       |
+| `~/input/topic_img_fl/camera_info` | `sensor_msgs::msg::CameraInfo` | input front_left camera parameters  |
+| `~/input/topic_img_f/camera_info`  | `sensor_msgs::msg::CameraInfo` | input front camera parameters       |
+| `~/input/topic_img_fr/camera_info` | `sensor_msgs::msg::CameraInfo` | input front_right camera parameters |
+| `~/input/topic_img_bl/camera_info` | `sensor_msgs::msg::CameraInfo` | input back_left camera parameters   |
+| `~/input/topic_img_b/camera_info`  | `sensor_msgs::msg::CameraInfo` | input back camera parameters        |
+| `~/input/topic_img_br/camera_info` | `sensor_msgs::msg::CameraInfo` | input back_right camera parameters  |
+
+### Outputs
+
+| Name             | Type                                             | Description      |
+| ---------------- | ------------------------------------------------ | ---------------- |
+| `~/output/boxes` | `autoware_perception_msgs::msg::DetectedObjects` | detected objects |
+
+## How to Use Tensorrt BEVDet Node
+
+1. launch `tensorrt_bevdet_node`
+
+   ```bash
+
+   ros2 launch autoware_tensorrt_bevdet tensorrt_bevdet_node.launch.xml
+   ```
+
+2. play ros2 bag of nuScenes data
+
+   Please refer to open source repository [ros2_dataset_bridge](https://github.com/Owen-Liuyuxuan/ros2_dataset_bridge) to publish the ROS 2 topics.
+
+## Limitation
+
+The model is trained on open-source dataset `NuScenes` and has poor generalization on its own dataset, If you want to use this model to infer your data, you need to retrain it.
+
+## Trained Models
+
+You can download the onnx format of trained models by clicking on the links below.
+
+- BEVDet: [bevdet_one_lt_d.onnx](https://drive.google.com/file/d/1eMGJfdCVlDPBphBTjMcnIh3wdW7Q7WZB/view?usp=sharing)
+
+The model was trained in NuScenes database for 20 epochs.
+
+If you want to train model using the [TIER IV's internal database(~2600 key frames)](https://drive.google.com/file/d/1UaarK88HZu09sf7Ix-bEVl9zGNGFwTVL/view?usp=sharing), please refer to the following repositories:[BEVDet adapted to TIER IV dataset](https://github.com/cyn-liu/BEVDet/tree/train_export)
+
+## References/External links
+
+[1] <https://github.com/HuangJunJie2017/BEVDet/tree/dev2.1>
+
+[2] <https://github.com/LCH1238/BEVDet/tree/export>
+
+[3] <https://github.com/LCH1238/bevdet-tensorrt-cpp/tree/one>
diff --git a/perception/autoware_tensorrt_bevdet/config/bevdet.param.yaml b/perception/autoware_tensorrt_bevdet/config/bevdet.param.yaml
new file mode 100644
index 0000000000000..e7140328a5e6e
--- /dev/null
+++ b/perception/autoware_tensorrt_bevdet/config/bevdet.param.yaml
@@ -0,0 +1,14 @@
+/**:
+    ros__parameters:
+      # weight files
+      onnx_path: "$(var model_path)/$(var model_name).onnx"
+      engine_path: "$(var model_path)/$(var model_name).engine"
+      data_params:
+        N: 6  # camera num
+        H: 900 # image height
+        W: 1600 # image width
+        cams: ["CAM_FRONT_LEFT", "CAM_FRONT", "CAM_FRONT_RIGHT", "CAM_BACK_LEFT", "CAM_BACK", "CAM_BACK_RIGHT"]
+      post_process_params:
+        # post-process params
+        score_threshold: 0.2
+        class_names: ["car", "truck", "construction_vehicle", "bus", "trailer", "barrier", "motorcycle", "bicycle", "pedestrian", "traffic_cone"]
diff --git a/perception/autoware_tensorrt_bevdet/config/bevdet_r50_4dlongterm_depth.yaml b/perception/autoware_tensorrt_bevdet/config/bevdet_r50_4dlongterm_depth.yaml
new file mode 100644
index 0000000000000..8f0e4ac785a97
--- /dev/null
+++ b/perception/autoware_tensorrt_bevdet/config/bevdet_r50_4dlongterm_depth.yaml
@@ -0,0 +1,63 @@
+bev_range: [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+mean: [123.675, 116.28, 103.53]
+std: [58.395, 57.12, 57.375]
+use_depth: true
+use_adj: true
+adj_num: 8
+transform_size: 6
+cam_params_size: 27
+sampling: nearest # nearest # or bicubic
+data_config:
+  Ncams: 6
+  cams: [CAM_FRONT_LEFT, CAM_FRONT, CAM_FRONT_RIGHT, CAM_BACK_LEFT, CAM_BACK, CAM_BACK_RIGHT]
+  resize_radio: 0.44
+  crop: [140, 0]
+  flip: true
+  input_size: [256, 704]
+  resize: [-0.06, 0.11]
+  resize_test: 0.0
+  rot: [-5.4, 5.4]
+  src_size: [900, 1600]
+grid_config:
+  depth: [1.0, 60.0, 0.5]
+  x: [-51.2, 51.2, 0.8]
+  y: [-51.2, 51.2, 0.8]
+  z: [-5, 3, 8]
+model:
+  bevpool_channels: 80
+  coder:
+    code_size: 9
+    max_num: 500
+    post_center_range: [-61.2, -61.2, -10.0, 61.2, 61.2, 10.0]
+    score_threshold: 0.1
+  common_head:
+    channels: [2, 1, 3, 2, 2]
+    names: [reg, height, dim, rot, vel]
+  down_sample: 16
+  tasks:
+    - class_names:
+        [
+          car,
+          truck,
+          construction_vehicle,
+          bus,
+          trailer,
+          barrier,
+          motorcycle,
+          bicycle,
+          pedestrian,
+          traffic_cone,
+        ]
+      num_class: 10
+test_cfg:
+  max_per_img: 500
+  max_pool_nms: false
+  min_radius: [4, 12, 10, 1, 0.85, 0.175]
+  nms_rescale_factor:
+    - [1.0, 0.7, 0.7, 0.4, 0.55, 1.1, 1.0, 1.0, 1.5, 3.5]
+  nms_thr: [0.2]
+  nms_type: [rotate]
+  post_center_limit_range: [-61.2, -61.2, -10.0, 61.2, 61.2, 10.0]
+  post_max_size: 500
+  pre_max_size: 1000
+  score_threshold: 0.1
diff --git a/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/alignbev_plugin.hpp b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/alignbev_plugin.hpp
new file mode 100644
index 0000000000000..beeeb8ed656d8
--- /dev/null
+++ b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/alignbev_plugin.hpp
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef AUTOWARE__TENSORRT_BEVDET__ALIGNBEV_PLUGIN_HPP_
+#define AUTOWARE__TENSORRT_BEVDET__ALIGNBEV_PLUGIN_HPP_
+
+#include <NvInfer.h>
+#include <unistd.h>
+
+#include <cmath>
+#include <iostream>
+#include <string>
+#include <vector>
+
+namespace
+{
+static const char * ALIGN_PLUGIN_NAME{"AlignBEV"};
+static const char * ALIGN_PLUGIN_VERSION{"1"};
+}  // namespace
+
+namespace nvinfer1
+{
+class AlignBEVPlugin : public IPluginV2DynamicExt
+{
+private:
+  const std::string name_;
+  std::string namespace_;
+  struct
+  {
+  } m_;
+
+public:
+  AlignBEVPlugin() = delete;
+  explicit AlignBEVPlugin(const std::string & name);
+  AlignBEVPlugin(const std::string & name, const void * buffer, size_t length);
+  ~AlignBEVPlugin();
+
+  // Method inherited from IPluginV2
+  const char * getPluginType() const noexcept override;
+  const char * getPluginVersion() const noexcept override;
+  int32_t getNbOutputs() const noexcept override;
+  int32_t initialize() noexcept override;
+  void terminate() noexcept override;
+  size_t getSerializationSize() const noexcept override;
+  void serialize(void * buffer) const noexcept override;
+  void destroy() noexcept override;
+  void setPluginNamespace(const char * pluginNamespace) noexcept override;
+  const char * getPluginNamespace() const noexcept override;
+
+  // Method inherited from IPluginV2Ext
+  DataType getOutputDataType(
+    int32_t index, DataType const * inputTypes, int32_t nbInputs) const noexcept override;
+  void attachToContext(
+    cudnnContext * contextCudnn, cublasContext * contextCublas,
+    IGpuAllocator * gpuAllocator) noexcept override;
+  void detachFromContext() noexcept override;
+
+  // Method inherited from IPluginV2DynamicExt
+  IPluginV2DynamicExt * clone() const noexcept override;
+  DimsExprs getOutputDimensions(
+    int32_t outputIndex, const DimsExprs * inputs, int32_t nbInputs,
+    IExprBuilder & exprBuilder) noexcept override;
+  bool supportsFormatCombination(
+    int32_t pos, const PluginTensorDesc * inOut, int32_t nbInputs,
+    int32_t nbOutputs) noexcept override;
+  void configurePlugin(
+    const DynamicPluginTensorDesc * in, int32_t nbInputs, const DynamicPluginTensorDesc * out,
+    int32_t nbOutputs) noexcept override;
+  size_t getWorkspaceSize(
+    const PluginTensorDesc * inputs, int32_t nbInputs, const PluginTensorDesc * outputs,
+    int32_t nbOutputs) const noexcept override;
+  int32_t enqueue(
+    const PluginTensorDesc * inputDesc, const PluginTensorDesc * outputDesc,
+    const void * const * inputs, void * const * outputs, void * workspace,
+    cudaStream_t stream) noexcept override;
+
+protected:
+  // To prevent compiler warnings
+  using nvinfer1::IPluginV2::enqueue;
+  using nvinfer1::IPluginV2::getOutputDimensions;
+  using nvinfer1::IPluginV2::getWorkspaceSize;
+  using nvinfer1::IPluginV2Ext::configurePlugin;
+};
+
+class AlignBEVPluginCreator : public IPluginCreator
+{
+private:
+  static PluginFieldCollection fc_;
+  static std::vector<PluginField> attr_;
+  std::string namespace_;
+
+public:
+  AlignBEVPluginCreator();
+  ~AlignBEVPluginCreator();
+  const char * getPluginName() const noexcept override;
+  const char * getPluginVersion() const noexcept override;
+  const PluginFieldCollection * getFieldNames() noexcept override;
+  IPluginV2DynamicExt * createPlugin(
+    const char * name, const PluginFieldCollection * fc) noexcept override;
+  IPluginV2DynamicExt * deserializePlugin(
+    const char * name, const void * serialData, size_t serialLength) noexcept override;
+  void setPluginNamespace(const char * pluginNamespace) noexcept override;
+  const char * getPluginNamespace() const noexcept override;
+};
+
+}  // namespace nvinfer1
+#endif  // AUTOWARE__TENSORRT_BEVDET__ALIGNBEV_PLUGIN_HPP_
diff --git a/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/bevdet.hpp b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/bevdet.hpp
new file mode 100644
index 0000000000000..8c6241ba4e647
--- /dev/null
+++ b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/bevdet.hpp
@@ -0,0 +1,239 @@
+// Copyright 2024 AutoCore, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef AUTOWARE__TENSORRT_BEVDET__BEVDET_HPP_
+#define AUTOWARE__TENSORRT_BEVDET__BEVDET_HPP_
+
+#include "NvInfer.h"
+#include "common.hpp"
+#include "data.hpp"
+#include "postprocess.hpp"
+
+#include <Eigen/Core>
+#include <Eigen/Geometry>
+
+#include <NvOnnxParser.h>
+#include <yaml-cpp/yaml.h>
+
+#include <algorithm>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace autoware::tensorrt_bevdet
+{
+class AdjFrame
+{
+public:
+  AdjFrame() {}
+  AdjFrame(int n, size_t buf_size)
+  : n_(n), buf_size_(buf_size), scenes_token_(n), ego2global_rot_(n), ego2global_trans_(n)
+  {
+    CHECK_CUDA(cudaMalloc(reinterpret_cast<void **>(&adj_buffer), n * buf_size));
+    CHECK_CUDA(cudaMemset(adj_buffer, 0, n * buf_size));
+    last_ = 0;
+    buffer_num_ = 0;
+    init_ = false;
+
+    for (auto & rot : ego2global_rot_) {
+      rot = Eigen::Quaternion<float>(0.f, 0.f, 0.f, 0.f);
+    }
+    for (auto & trans : ego2global_trans_) {
+      trans = Eigen::Translation3f(0.f, 0.f, 0.f);
+    }
+  }
+  const std::string & lastScenesToken() const { return scenes_token_[last_]; }
+
+  void reset()
+  {
+    last_ = 0;  // origin -1
+    buffer_num_ = 0;
+    init_ = false;
+  }
+
+  void saveFrameBuffer(
+    const void * curr_buffer, const std::string & curr_token,
+    const Eigen::Quaternion<float> & ego2global_rot, const Eigen::Translation3f & ego2global_trans)
+  {
+    int iters = init_ ? 1 : n_;
+    while (iters--) {
+      last_ = (last_ + 1) % n_;
+      CHECK_CUDA(cudaMemcpy(
+        reinterpret_cast<char *>(adj_buffer) + last_ * buf_size_, curr_buffer, buf_size_,
+        cudaMemcpyDeviceToDevice));
+      scenes_token_[last_] = curr_token;
+      ego2global_rot_[last_] = ego2global_rot;
+      ego2global_trans_[last_] = ego2global_trans;
+      buffer_num_ = std::min(buffer_num_ + 1, n_);
+    }
+    init_ = true;
+  }
+  int havingBuffer(int idx) { return static_cast<int>(idx < buffer_num_); }
+
+  const void * getFrameBuffer(int idx)
+  {
+    idx = (-idx + last_ + n_) % n_;
+    return reinterpret_cast<char *>(adj_buffer) + idx * buf_size_;
+  }
+  void getEgo2Global(
+    int idx, Eigen::Quaternion<float> & adj_ego2global_rot,
+    Eigen::Translation3f & adj_ego2global_trans)
+  {
+    idx = (-idx + last_ + n_) % n_;
+    adj_ego2global_rot = ego2global_rot_[idx];
+    adj_ego2global_trans = ego2global_trans_[idx];
+  }
+
+  ~AdjFrame() { CHECK_CUDA(cudaFree(adj_buffer)); }
+
+private:
+  int n_;
+  size_t buf_size_;
+
+  int last_;
+  int buffer_num_;
+  bool init_;
+
+  std::vector<std::string> scenes_token_;
+  std::vector<Eigen::Quaternion<float>> ego2global_rot_;
+  std::vector<Eigen::Translation3f> ego2global_trans_;
+
+  void * adj_buffer;
+};
+
+class BEVDet
+{
+public:
+  BEVDet() {}
+  BEVDet(
+    const std::string & config_file, int n_img, std::vector<Eigen::Matrix3f> cams_intrin,
+    std::vector<Eigen::Quaternion<float>> cams2ego_rot,
+    std::vector<Eigen::Translation3f> cams2ego_trans, const std::string & onnx_file,
+    const std::string & engine_file);
+
+  int doInfer(
+    const camsData & cam_data, std::vector<Box> & out_detections, float & cost_time, int idx = -1);
+  ~BEVDet();
+
+protected:
+  void initParams(const std::string & config_file);
+
+  void initViewTransformer(
+    std::vector<int> & ranks_bev, std::vector<int> & ranks_depth, std::vector<int> & ranks_feat,
+    std::vector<int> & interval_starts, std::vector<int> & interval_lengths);
+  void exportEngine(const std::string & onnxFile, const std::string & trtFile);
+  int initEngine(const std::string & engine_file);
+
+  int deserializeTRTEngine(const std::string & engine_file, nvinfer1::ICudaEngine ** engine_ptr);
+
+  void mallocDeviceMemory();
+
+  void initCamParams(
+    const std::vector<Eigen::Quaternion<float>> & curr_cams2ego_rot,
+    const std::vector<Eigen::Translation3f> & curr_cams2ego_trans,
+    const std::vector<Eigen::Matrix3f> & cams_intrin);
+
+  void getAdjBEVFeature(
+    const std::string & curr_scene_token, const Eigen::Quaternion<float> & ego2global_rot,
+    const Eigen::Translation3f & ego2global_trans);
+
+  void getCurr2AdjTransform(
+    const Eigen::Quaternion<float> & curr_ego2global_rot,
+    const Eigen::Quaternion<float> & adj_ego2global_rot,
+    const Eigen::Translation3f & curr_ego2global_trans,
+    const Eigen::Translation3f & adj_ego2global_trans, float * transform_dev);
+
+private:
+  int n_img_;
+
+  int src_img_h_;
+  int src_img_w_;
+  int input_img_h_;
+  int input_img_w_;
+  int crop_h_;
+  int crop_w_;
+  float resize_radio_;
+  int down_sample_;
+  int feat_h_;
+  int feat_w_;
+  int bev_h_;
+  int bev_w_;
+  int bevpool_channel_;
+
+  float depth_start_;
+  float depth_end_;
+  float depth_step_;
+  int depth_num_;
+
+  float x_start_;
+  float x_end_;
+  float x_step_;
+  int xgrid_num_;
+
+  float y_start_;
+  float y_end_;
+  float y_step_;
+  int ygrid_num_;
+
+  float z_start_;
+  float z_end_;
+  float z_step_;
+  int zgrid_num_;
+
+  std::vector<float> mean_;
+  std::vector<float> std_;
+
+  bool use_depth_;
+  bool use_adj_;
+  int adj_num_;
+
+  int class_num_;
+  float score_thresh_;
+  float nms_overlap_thresh_;
+  int nms_pre_maxnum_;
+  int nms_post_maxnum_;
+  std::vector<float> nms_rescale_factor_;
+  std::vector<int> class_num_pre_task_;
+  std::map<std::string, int> out_num_task_head_;
+
+  std::vector<Eigen::Matrix3f> cams_intrin_;
+  std::vector<Eigen::Quaternion<float>> cams2ego_rot_;
+  std::vector<Eigen::Translation3f> cams2ego_trans_;
+
+  Eigen::Matrix3f post_rot_;
+  Eigen::Translation3f post_trans_;
+
+  std::vector<size_t> trt_buffer_sizes_;
+  void ** trt_buffer_dev_;
+  std::vector<float> cam_params_host_;
+  void ** post_buffer_;
+
+  std::map<std::string, int> buffer_map_;
+
+  int valid_feat_num_;
+  int unique_bev_num_;
+
+  int transform_size_;
+  int cam_params_size_;
+
+  Logger g_logger_;
+
+  nvinfer1::ICudaEngine * trt_engine_;
+  nvinfer1::IExecutionContext * trt_context_;
+
+  std::unique_ptr<PostprocessGPU> postprocess_ptr_;
+  std::unique_ptr<AdjFrame> adj_frame_ptr_;
+};
+}  // namespace autoware::tensorrt_bevdet
+#endif  // AUTOWARE__TENSORRT_BEVDET__BEVDET_HPP_
diff --git a/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/bevdet_node.hpp b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/bevdet_node.hpp
new file mode 100644
index 0000000000000..1a493bb2f8889
--- /dev/null
+++ b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/bevdet_node.hpp
@@ -0,0 +1,221 @@
+// Copyright 2024 AutoCore, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// cspell:ignore BEVDET, thre, TRTBEV, bevdet, caminfo, intrin
+#ifndef AUTOWARE__TENSORRT_BEVDET__BEVDET_NODE_HPP_
+#define AUTOWARE__TENSORRT_BEVDET__BEVDET_NODE_HPP_
+
+#include "bevdet.hpp"
+#include "cpu_jpegdecoder.hpp"
+#include "tf2_ros/buffer.h"
+#include "tf2_ros/transform_listener.h"
+
+#include <Eigen/Core>
+#include <Eigen/Geometry>
+#include <opencv2/opencv.hpp>
+
+#include <autoware_perception_msgs/msg/detected_object_kinematics.hpp>
+#include <autoware_perception_msgs/msg/detected_objects.hpp>
+#include <autoware_perception_msgs/msg/object_classification.hpp>
+#include <autoware_perception_msgs/msg/shape.hpp>
+#include <geometry_msgs/msg/point32.hpp>
+#include <geometry_msgs/msg/quaternion.hpp>
+#include <geometry_msgs/msg/vector3.hpp>
+#include <sensor_msgs/msg/camera_info.hpp>
+#include <sensor_msgs/msg/image.hpp>
+#include <sensor_msgs/msg/point_cloud2.hpp>
+#include <tf2_geometry_msgs/tf2_geometry_msgs.hpp>
+
+#include <cv_bridge/cv_bridge.h>
+#include <message_filters/subscriber.h>
+#include <message_filters/sync_policies/approximate_time.h>
+#include <message_filters/synchronizer.h>
+#include <pcl/common/transforms.h>
+#include <pcl/io/pcd_io.h>
+#include <pcl/point_types.h>
+#include <pcl_conversions/pcl_conversions.h>  // msg2pcl
+#include <yaml-cpp/yaml.h>
+
+#include <cassert>
+#include <chrono>
+#include <cstdio>
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <string>
+#include <thread>
+#include <vector>
+
+uint8_t getSemanticType(const std::string & class_name);
+
+void box3DToDetectedObjects(
+  const std::vector<Box> & boxes, autoware_perception_msgs::msg::DetectedObjects & objects,
+  const std::vector<std::string> & class_names, float score_thre, const bool has_twist);
+
+// Get the rotation and translation from a geometry_msgs::msg::TransformStamped
+void getTransform(
+  const geometry_msgs::msg::TransformStamped & transform, Eigen::Quaternion<float> & rot,
+  Eigen::Translation3f & translation);
+
+// Get the camera intrinsics from a sensor_msgs::msg::CameraInfo
+void getCameraIntrinsics(
+  const sensor_msgs::msg::CameraInfo::SharedPtr msg, Eigen::Matrix3f & intrinsics);
+
+class TRTBEVDetNode : public rclcpp::Node
+{
+  /**
+   * @class TRTBEVDetNode
+   * @brief This class represents a node for performing object detection using TensorRT on BEV
+   * (Bird's Eye View) images.
+   */
+private:
+  size_t img_N_;    ///< Number of images
+  uint32_t img_w_;  ///< Width of the images
+  uint32_t img_h_;  ///< Height of the images
+
+  std::string model_config_;  ///< Path to the model configuration file
+
+  std::string onnx_file_;    ///< Path to the ONNX file
+  std::string engine_file_;  ///< Path to the TensorRT engine file
+
+  std::vector<std::string> imgs_name_;    ///< Names of the images
+  std::vector<std::string> class_names_;  ///< Names of the object classes
+
+  camsData sampleData_;  ///< Sample data for camera parameters
+  std::shared_ptr<autoware::tensorrt_bevdet::BEVDet>
+    bevdet_;  ///< Object for performing object detection
+
+  uchar * imgs_dev_ = nullptr;  ///< Device pointer for storing the images
+  float score_thre_;            ///< Score threshold for object detection
+
+  rclcpp::Publisher<autoware_perception_msgs::msg::DetectedObjects>::SharedPtr
+    pub_boxes_;  ///< Publisher for publishing the detected objects
+
+  // Subscribers of camera info for each camera, no need to synchronize
+  rclcpp::Subscription<sensor_msgs::msg::CameraInfo>::SharedPtr
+    sub_f_caminfo_;  ///< Subscriber for front camera info
+  rclcpp::Subscription<sensor_msgs::msg::CameraInfo>::SharedPtr
+    sub_fl_caminfo_;  ///< Subscriber for front-left camera info
+  rclcpp::Subscription<sensor_msgs::msg::CameraInfo>::SharedPtr
+    sub_fr_caminfo_;  ///< Subscriber for front-right camera info
+  rclcpp::Subscription<sensor_msgs::msg::CameraInfo>::SharedPtr
+    sub_b_caminfo_;  ///< Subscriber for back camera info
+  rclcpp::Subscription<sensor_msgs::msg::CameraInfo>::SharedPtr
+    sub_bl_caminfo_;  ///< Subscriber for back-left camera info
+  rclcpp::Subscription<sensor_msgs::msg::CameraInfo>::SharedPtr
+    sub_br_caminfo_;  ///< Subscriber for back-right camera info
+  std::vector<bool>
+    caminfo_received_;  ///< Flag indicating if camera info has been received for each camera
+  bool camera_info_received_flag_ =
+    false;                    ///< Flag indicating if camera info has been received for all cameras
+  bool initialized_ = false;  /// Flag indicating if img_w_ and img_h_ has been initialized
+
+  // tf listener
+  std::shared_ptr<tf2_ros::TransformListener> tf_listener_{
+    nullptr};                                   ///< TF listener for transforming coordinates
+  std::unique_ptr<tf2_ros::Buffer> tf_buffer_;  ///< Buffer for storing TF transforms
+
+  // Camera parameters;
+  std::vector<Eigen::Matrix3f> cams_intrin;  ///< Intrinsic camera parameters for each camera
+  std::vector<Eigen::Quaternion<float>>
+    cams2ego_rot;  ///< Rotation from camera frame to ego frame for each camera
+  std::vector<Eigen::Translation3f>
+    cams2ego_trans;  ///< Translation from camera frame to ego frame for each camera
+
+  // Subscribers of images for each camera, synchronized
+  message_filters::Subscriber<sensor_msgs::msg::Image>
+    sub_f_img_;  ///< Subscriber for front camera image
+  message_filters::Subscriber<sensor_msgs::msg::Image>
+    sub_fl_img_;  ///< Subscriber for front-left camera image
+  message_filters::Subscriber<sensor_msgs::msg::Image>
+    sub_fr_img_;  ///< Subscriber for front-right camera image
+  message_filters::Subscriber<sensor_msgs::msg::Image>
+    sub_b_img_;  ///< Subscriber for back camera image
+  message_filters::Subscriber<sensor_msgs::msg::Image>
+    sub_bl_img_;  ///< Subscriber for back-left camera image
+  message_filters::Subscriber<sensor_msgs::msg::Image>
+    sub_br_img_;  ///< Subscriber for back-right camera image
+
+  typedef message_filters::sync_policies::ApproximateTime<
+    sensor_msgs::msg::Image, sensor_msgs::msg::Image, sensor_msgs::msg::Image,
+    sensor_msgs::msg::Image, sensor_msgs::msg::Image, sensor_msgs::msg::Image>
+    MySyncPolicy;
+
+  typedef message_filters::Synchronizer<MySyncPolicy> Sync;
+  std::shared_ptr<Sync> sync_;  ///< Synchronizer for synchronizing image callbacks
+
+  // Timer for checking initialization
+  rclcpp::TimerBase::SharedPtr timer_;  ///< Timer for checking initialization
+
+  /**
+   * @brief Starts the subscription to camera info topics for each camera.
+   */
+  void startCameraInfoSubscription();
+
+  /**
+   * @brief Checks if the node has been initialized properly.
+   */
+  void checkInitialization();
+
+  /**
+   * @brief Initializes the object detection model.
+   */
+  void initModel();
+
+  /**
+   * @brief Starts the subscription to image topics for each camera.
+   */
+  void startImageSubscription();
+
+public:
+  /**
+   * @brief Constructor for TRTBEVDetNode.
+   * @param node_name The name of the node.
+   * @param options The options for the node.
+   */
+  TRTBEVDetNode(const std::string & node_name, const rclcpp::NodeOptions & options);
+
+  /**
+   * @brief Destructor for TRTBEVDetNode.
+   */
+  ~TRTBEVDetNode();
+
+  /**
+   * @brief Callback function for synchronized image messages.
+   * @param msg_fl_img The front-left camera image message.
+   * @param msg_f_img The front camera image message.
+   * @param msg_fr_img The front-right camera image message.
+   * @param msg_bl_img The back-left camera image message.
+   * @param msg_b_img The back camera image message.
+   * @param msg_br_img The back-right camera image message.
+   */
+  void callback(
+    const sensor_msgs::msg::Image::ConstSharedPtr & msg_fl_img,
+    const sensor_msgs::msg::Image::ConstSharedPtr & msg_f_img,
+    const sensor_msgs::msg::Image::ConstSharedPtr & msg_fr_img,
+    const sensor_msgs::msg::Image::ConstSharedPtr & msg_bl_img,
+    const sensor_msgs::msg::Image::ConstSharedPtr & msg_b_img,
+    const sensor_msgs::msg::Image::ConstSharedPtr & msg_br_img);
+
+  /**
+   * @brief Callback function for camera info messages. This function also reads from TF to get the
+   * transformation from the camera frame to the ego frame.
+   * @param idx The index of the camera.
+   * @param msg The camera info message.
+   */
+  void camera_info_callback(int idx, const sensor_msgs::msg::CameraInfo::SharedPtr msg);
+};
+
+#endif  // AUTOWARE__TENSORRT_BEVDET__BEVDET_NODE_HPP_
diff --git a/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/bevpool_plugin.hpp b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/bevpool_plugin.hpp
new file mode 100644
index 0000000000000..0a1f6e114589a
--- /dev/null
+++ b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/bevpool_plugin.hpp
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef AUTOWARE__TENSORRT_BEVDET__BEVPOOL_PLUGIN_HPP_
+#define AUTOWARE__TENSORRT_BEVDET__BEVPOOL_PLUGIN_HPP_
+
+#include <NvInfer.h>
+#include <unistd.h>
+
+#include <cmath>
+#include <iostream>
+#include <map>
+#include <string>
+#include <vector>
+
+namespace
+{
+static const char * BEVPOOL_PLUGIN_NAME{"BEVPool"};
+static const char * BEVPOOL_PLUGIN_VERSION{"1"};
+}  // namespace
+
+namespace nvinfer1
+{
+class BEVPoolPlugin : public IPluginV2DynamicExt
+{
+private:
+  const std::string name_;
+  std::string namespace_;
+  struct
+  {
+    int bev_h;
+    int bev_w;
+    int n;
+  } m_;
+
+public:
+  BEVPoolPlugin() = delete;
+  BEVPoolPlugin(const std::string & name, int bev_h, int bev_w, int n);
+  BEVPoolPlugin(const std::string & name, const void * buffer, size_t length);
+  ~BEVPoolPlugin();
+
+  // Method inherited from IPluginV2
+  const char * getPluginType() const noexcept override;
+  const char * getPluginVersion() const noexcept override;
+  int32_t getNbOutputs() const noexcept override;
+  int32_t initialize() noexcept override;
+  void terminate() noexcept override;
+  size_t getSerializationSize() const noexcept override;
+  void serialize(void * buffer) const noexcept override;
+  void destroy() noexcept override;
+  void setPluginNamespace(const char * pluginNamespace) noexcept override;
+  const char * getPluginNamespace() const noexcept override;
+
+  // Method inherited from IPluginV2Ext
+  DataType getOutputDataType(
+    int32_t index, DataType const * inputTypes, int32_t nbInputs) const noexcept override;
+  void attachToContext(
+    cudnnContext * contextCudnn, cublasContext * contextCublas,
+    IGpuAllocator * gpuAllocator) noexcept override;
+  void detachFromContext() noexcept override;
+
+  // Method inherited from IPluginV2DynamicExt
+  IPluginV2DynamicExt * clone() const noexcept override;
+  DimsExprs getOutputDimensions(
+    int32_t outputIndex, const DimsExprs * inputs, int32_t nbInputs,
+    IExprBuilder & exprBuilder) noexcept override;
+  bool supportsFormatCombination(
+    int32_t pos, const PluginTensorDesc * inOut, int32_t nbInputs,
+    int32_t nbOutputs) noexcept override;
+  void configurePlugin(
+    const DynamicPluginTensorDesc * in, int32_t nbInputs, const DynamicPluginTensorDesc * out,
+    int32_t nbOutputs) noexcept override;
+  size_t getWorkspaceSize(
+    const PluginTensorDesc * inputs, int32_t nbInputs, const PluginTensorDesc * outputs,
+    int32_t nbOutputs) const noexcept override;
+  int32_t enqueue(
+    const PluginTensorDesc * inputDesc, const PluginTensorDesc * outputDesc,
+    const void * const * inputs, void * const * outputs, void * workspace,
+    cudaStream_t stream) noexcept override;
+
+protected:
+  // To prevent compiler warnings
+  using nvinfer1::IPluginV2::enqueue;
+  using nvinfer1::IPluginV2::getOutputDimensions;
+  using nvinfer1::IPluginV2::getWorkspaceSize;
+  using nvinfer1::IPluginV2Ext::configurePlugin;
+};
+
+class BEVPoolPluginCreator : public IPluginCreator
+{
+private:
+  static PluginFieldCollection fc_;
+  static std::vector<PluginField> attr_;
+  std::string namespace_;
+
+public:
+  BEVPoolPluginCreator();
+  ~BEVPoolPluginCreator();
+  const char * getPluginName() const noexcept override;
+  const char * getPluginVersion() const noexcept override;
+  const PluginFieldCollection * getFieldNames() noexcept override;
+  IPluginV2DynamicExt * createPlugin(
+    const char * name, const PluginFieldCollection * fc) noexcept override;
+  IPluginV2DynamicExt * deserializePlugin(
+    const char * name, const void * serialData, size_t serialLength) noexcept override;
+  void setPluginNamespace(const char * pluginNamespace) noexcept override;
+  const char * getPluginNamespace() const noexcept override;
+};
+
+}  // namespace nvinfer1
+#endif  // AUTOWARE__TENSORRT_BEVDET__BEVPOOL_PLUGIN_HPP_
diff --git a/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/common.hpp b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/common.hpp
new file mode 100644
index 0000000000000..a1a745d9e604f
--- /dev/null
+++ b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/common.hpp
@@ -0,0 +1,148 @@
+// Copyright 2024 AutoCore, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef AUTOWARE__TENSORRT_BEVDET__COMMON_HPP_
+#define AUTOWARE__TENSORRT_BEVDET__COMMON_HPP_
+
+#include "rclcpp/logger.hpp"
+#include "rclcpp/rclcpp.hpp"
+
+#include <NvInfer.h>
+#include <assert.h>
+#include <cuda_runtime.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <thrust/sort.h>
+
+#include <iostream>
+#include <string>
+
+typedef unsigned char uchar;
+
+#define NUM_THREADS 512
+
+#define CHECK_CUDA(ans)                   \
+  {                                       \
+    GPUAssert((ans), __FILE__, __LINE__); \
+  }
+
+#define DIVUP(m, n) (((m) + (n) - 1) / (n))
+
+inline void GPUAssert(cudaError_t code, const char * file, int line, bool abort = true)
+{
+  if (code != cudaSuccess) {
+    RCLCPP_ERROR(
+      rclcpp::get_logger("GPUAssert"), "GPUassert: %s %s %d", cudaGetErrorString(code), file, line);
+    if (abort) exit(code);
+  }
+};
+
+#define CUDA_1D_KERNEL_LOOP(i, n) \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
+
+const int MAXTENSORDIMS = 6;
+struct TensorDesc
+{
+  int shape[MAXTENSORDIMS];
+  int stride[MAXTENSORDIMS];
+  int dim;
+};
+
+inline int GET_BLOCKS(const int N)
+{
+  int optimal_block_num = DIVUP(N, NUM_THREADS);
+  int max_block_num = 4096;
+  return optimal_block_num < max_block_num ? optimal_block_num : max_block_num;
+}
+
+__inline__ std::string dataTypeToString(nvinfer1::DataType dataType)
+{
+  switch (dataType) {
+    case nvinfer1::DataType::kFLOAT:
+      return std::string("FP32 ");
+    case nvinfer1::DataType::kHALF:
+      return std::string("FP16 ");
+    case nvinfer1::DataType::kINT8:
+      return std::string("INT8 ");
+    case nvinfer1::DataType::kINT32:
+      return std::string("INT32");
+    case nvinfer1::DataType::kBOOL:
+      return std::string("BOOL ");
+    default:
+      return std::string("Unknown");
+  }
+}
+
+__inline__ size_t dataTypeToSize(nvinfer1::DataType dataType)
+{
+  switch (dataType) {
+    case nvinfer1::DataType::kFLOAT:
+      return 4;
+    case nvinfer1::DataType::kHALF:
+      return 2;
+    case nvinfer1::DataType::kINT8:
+      return 1;
+    case nvinfer1::DataType::kINT32:
+      return 4;
+    case nvinfer1::DataType::kBOOL:
+      return 1;
+    default:
+      return 4;
+  }
+}
+
+__inline__ std::string shapeToString(nvinfer1::Dims32 dim)
+{
+  std::string output("(");
+  if (dim.nbDims == 0) {
+    return output + std::string(")");
+  }
+  for (int i = 0; i < dim.nbDims - 1; ++i) {
+    output += std::to_string(dim.d[i]) + std::string(", ");
+  }
+  output += std::to_string(dim.d[dim.nbDims - 1]) + std::string(")");
+  return output;
+}
+
+class Logger : public nvinfer1::ILogger
+{
+public:
+  explicit Logger(Severity severity = Severity::kWARNING) : reportable_severity(severity) {}
+
+  void log(Severity severity, const char * msg) noexcept override
+  {
+    // suppress messages with severity enum value greater than the reportable
+    if (severity > reportable_severity) return;
+    switch (severity) {
+      case Severity::kINTERNAL_ERROR:
+        RCLCPP_ERROR(rclcpp::get_logger("autoware_tensorrt_bevdet"), "INTERNAL_ERROR: %s", msg);
+        break;
+      case Severity::kERROR:
+        RCLCPP_ERROR(rclcpp::get_logger("autoware_tensorrt_bevdet"), "ERROR: %s", msg);
+        break;
+      case Severity::kWARNING:
+        RCLCPP_WARN(rclcpp::get_logger("autoware_tensorrt_bevdet"), "WARNING: %s", msg);
+        break;
+      case Severity::kINFO:
+        RCLCPP_INFO(rclcpp::get_logger("autoware_tensorrt_bevdet"), "INFO: %s", msg);
+        break;
+      default:
+        RCLCPP_INFO(rclcpp::get_logger("autoware_tensorrt_bevdet"), "UNKNOWN: %s", msg);
+        break;
+    }
+  }
+
+  Severity reportable_severity;
+};
+
+#endif  // AUTOWARE__TENSORRT_BEVDET__COMMON_HPP_
diff --git a/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/cpu_jpegdecoder.hpp b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/cpu_jpegdecoder.hpp
new file mode 100644
index 0000000000000..94a301976fe6a
--- /dev/null
+++ b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/cpu_jpegdecoder.hpp
@@ -0,0 +1,24 @@
+// Copyright 2024 AutoCore, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef AUTOWARE__TENSORRT_BEVDET__CPU_JPEGDECODER_HPP_
+#define AUTOWARE__TENSORRT_BEVDET__CPU_JPEGDECODER_HPP_
+
+#include "common.hpp"
+
+#include <vector>
+
+int decode_cpu(
+  const std::vector<std::vector<char>> & files_data, uchar * out_imgs, size_t width, size_t height);
+
+#endif  // AUTOWARE__TENSORRT_BEVDET__CPU_JPEGDECODER_HPP_
diff --git a/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/data.hpp b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/data.hpp
new file mode 100644
index 0000000000000..153795b9d0f16
--- /dev/null
+++ b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/data.hpp
@@ -0,0 +1,72 @@
+// Copyright 2024 AutoCore, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// cspell:ignore BEVDET, intrin, Quater, BEVDET
+#ifndef AUTOWARE__TENSORRT_BEVDET__DATA_HPP_
+#define AUTOWARE__TENSORRT_BEVDET__DATA_HPP_
+
+#include "common.hpp"
+#include "nvjpegdecoder.hpp"
+
+#include <Eigen/Core>
+#include <Eigen/Geometry>
+
+#include <yaml-cpp/yaml.h>
+
+#include <string>
+#include <vector>
+
+struct camParams
+{
+  camParams() = default;
+  camParams(const YAML::Node & config, int n, std::vector<std::string> & cams_name);
+  camParams(
+    const std::vector<Eigen::Matrix3f> & _cams_intrin,
+    const std::vector<Eigen::Quaternion<float>> & _cams2ego_rot,
+    const std::vector<Eigen::Translation3f> & _cams2ego_trans)
+  : cams_intrin(_cams_intrin), cams2ego_rot(_cams2ego_rot), cams2ego_trans(_cams2ego_trans)
+  {
+  }
+
+  int N_img;
+
+  Eigen::Quaternion<float> ego2global_rot;
+  Eigen::Translation3f ego2global_trans;
+
+  Eigen::Quaternion<float> lidar2ego_rot;
+  Eigen::Translation3f lidar2ego_trans;
+  //
+  std::vector<Eigen::Matrix3f> cams_intrin;
+  std::vector<Eigen::Quaternion<float>> cams2ego_rot;
+  std::vector<Eigen::Translation3f> cams2ego_trans;
+  //
+  std::vector<std::string> imgs_file;
+
+  std::int64_t timestamp;
+  std::string scene_token;
+};
+
+struct camsData
+{
+  camsData() = default;
+  explicit camsData(const camParams & _param) : param(_param), imgs_dev(nullptr) {}
+  camParams param;
+  uchar * imgs_dev;
+};
+
+Eigen::Translation3f fromYamlTrans(YAML::Node x);
+Eigen::Quaternion<float> fromYamlQuater(YAML::Node x);
+Eigen::Matrix3f fromYamlMatrix3f(YAML::Node x);
+
+#endif  // AUTOWARE__TENSORRT_BEVDET__DATA_HPP_
diff --git a/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/gatherbev_plugin.hpp b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/gatherbev_plugin.hpp
new file mode 100644
index 0000000000000..3d91763ed88a1
--- /dev/null
+++ b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/gatherbev_plugin.hpp
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef AUTOWARE__TENSORRT_BEVDET__GATHERBEV_PLUGIN_HPP_
+#define AUTOWARE__TENSORRT_BEVDET__GATHERBEV_PLUGIN_HPP_
+
+#include <NvInfer.h>
+#include <unistd.h>
+
+#include <cmath>
+#include <iostream>
+#include <map>
+#include <string>
+#include <vector>
+
+namespace
+{
+static const char * GATHERBEV_PLUGIN_NAME{"GatherBEV"};
+static const char * GATHERBEV_PLUGIN_VERSION{"1"};
+}  // namespace
+
+namespace nvinfer1
+{
+class GatherBEVPlugin : public IPluginV2DynamicExt
+{
+private:
+  const std::string name_;
+  std::string namespace_;
+  struct
+  {
+  } m_;
+
+public:
+  GatherBEVPlugin() = delete;
+  explicit GatherBEVPlugin(const std::string & name);
+  GatherBEVPlugin(const std::string & name, const void * buffer, size_t length);
+  ~GatherBEVPlugin();
+
+  // Method inherited from IPluginV2
+  const char * getPluginType() const noexcept override;
+  const char * getPluginVersion() const noexcept override;
+  int32_t getNbOutputs() const noexcept override;
+  int32_t initialize() noexcept override;
+  void terminate() noexcept override;
+  size_t getSerializationSize() const noexcept override;
+  void serialize(void * buffer) const noexcept override;
+  void destroy() noexcept override;
+  void setPluginNamespace(const char * pluginNamespace) noexcept override;
+  const char * getPluginNamespace() const noexcept override;
+
+  // Method inherited from IPluginV2Ext
+  DataType getOutputDataType(
+    int32_t index, DataType const * inputTypes, int32_t nbInputs) const noexcept override;
+  void attachToContext(
+    cudnnContext * contextCudnn, cublasContext * contextCublas,
+    IGpuAllocator * gpuAllocator) noexcept override;
+  void detachFromContext() noexcept override;
+
+  // Method inherited from IPluginV2DynamicExt
+  IPluginV2DynamicExt * clone() const noexcept override;
+  DimsExprs getOutputDimensions(
+    int32_t outputIndex, const DimsExprs * inputs, int32_t nbInputs,
+    IExprBuilder & exprBuilder) noexcept override;
+  bool supportsFormatCombination(
+    int32_t pos, const PluginTensorDesc * inOut, int32_t nbInputs,
+    int32_t nbOutputs) noexcept override;
+  void configurePlugin(
+    const DynamicPluginTensorDesc * in, int32_t nbInputs, const DynamicPluginTensorDesc * out,
+    int32_t nbOutputs) noexcept override;
+  size_t getWorkspaceSize(
+    const PluginTensorDesc * inputs, int32_t nbInputs, const PluginTensorDesc * outputs,
+    int32_t nbOutputs) const noexcept override;
+  int32_t enqueue(
+    const PluginTensorDesc * inputDesc, const PluginTensorDesc * outputDesc,
+    const void * const * inputs, void * const * outputs, void * workspace,
+    cudaStream_t stream) noexcept override;
+
+protected:
+  // To prevent compiler warnings
+  using nvinfer1::IPluginV2::enqueue;
+  using nvinfer1::IPluginV2::getOutputDimensions;
+  using nvinfer1::IPluginV2::getWorkspaceSize;
+  using nvinfer1::IPluginV2Ext::configurePlugin;
+};
+
+class GatherBEVPluginCreator : public IPluginCreator
+{
+private:
+  static PluginFieldCollection fc_;
+  static std::vector<PluginField> attr_;
+  std::string namespace_;
+
+public:
+  GatherBEVPluginCreator();
+  ~GatherBEVPluginCreator();
+  const char * getPluginName() const noexcept override;
+  const char * getPluginVersion() const noexcept override;
+  const PluginFieldCollection * getFieldNames() noexcept override;
+  IPluginV2DynamicExt * createPlugin(
+    const char * name, const PluginFieldCollection * fc) noexcept override;
+  IPluginV2DynamicExt * deserializePlugin(
+    const char * name, const void * serialData, size_t serialLength) noexcept override;
+  void setPluginNamespace(const char * pluginNamespace) noexcept override;
+  const char * getPluginNamespace() const noexcept override;
+};
+
+}  // namespace nvinfer1
+#endif  // AUTOWARE__TENSORRT_BEVDET__GATHERBEV_PLUGIN_HPP_
diff --git a/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/iou3d_nms.hpp b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/iou3d_nms.hpp
new file mode 100644
index 0000000000000..609b36717feab
--- /dev/null
+++ b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/iou3d_nms.hpp
@@ -0,0 +1,74 @@
+// Copyright 2024 AutoCore, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+/*
+3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others)
+Written by Shaoshuai Shi
+All Rights Reserved 2019-2022.
+*/
+#pragma once
+#ifndef AUTOWARE__TENSORRT_BEVDET__IOU3D_NMS_HPP_
+#define AUTOWARE__TENSORRT_BEVDET__IOU3D_NMS_HPP_
+
+#include "common.hpp"
+
+#include <stdio.h>
+#include <thrust/count.h>
+#include <thrust/device_vector.h>
+#include <thrust/gather.h>
+#include <thrust/host_vector.h>
+#include <thrust/sequence.h>
+#include <thrust/sort.h>
+#include <thrust/transform.h>
+
+#include <iostream>
+#include <vector>
+
+const int THREADS_PER_BLOCK = 16;
+const int THREADS_PER_BLOCK_NMS = sizeof(std::int64_t) * 8;
+const float EPS = 1e-8;
+
+class Iou3dNmsCuda
+{
+public:
+  Iou3dNmsCuda(const int head_x_size, const int head_y_size, const float nms_overlap_thresh);
+  ~Iou3dNmsCuda() = default;
+
+  int DoIou3dNms(
+    const int box_num_pre, const float * res_box, const int * res_sorted_indices,
+    std::int64_t * host_keep_data);
+
+private:
+  const int head_x_size_;
+  const int head_y_size_;
+  const float nms_overlap_thresh_;
+};
+
+static const int kBoxBlockSize = 9;
+
+struct Box
+{
+  float x;
+  float y;
+  float z;
+  float l;
+  float w;
+  float h;
+  float r;
+  float vx = 0.0f;  // optional
+  float vy = 0.0f;  // optional
+  float score;
+  int label;
+  bool is_drop;  // for nms
+};
+#endif  // AUTOWARE__TENSORRT_BEVDET__IOU3D_NMS_HPP_
diff --git a/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/nvjpegdecoder.hpp b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/nvjpegdecoder.hpp
new file mode 100644
index 0000000000000..dad94eb9c4af2
--- /dev/null
+++ b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/nvjpegdecoder.hpp
@@ -0,0 +1,118 @@
+// Copyright 2024 AutoCore, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef AUTOWARE__TENSORRT_BEVDET__NVJPEGDECODER_HPP_
+#define AUTOWARE__TENSORRT_BEVDET__NVJPEGDECODER_HPP_
+
+#ifdef __HAVE_NVJPEG__
+#include "common.hpp"
+
+#include <dirent.h>
+#include <nvjpeg.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <thread>
+#include <vector>
+
+#define DECODE_RGB 3
+#define DECODE_BGR 4
+#define DECODE_RGBI 5
+#define DECODE_BGRI 6
+#define DECODE_UNCHANGED 0
+
+#define CHECK_NVJPEG(call)                    \
+  {                                           \
+    NvJpegAssert((call), __FILE__, __LINE__); \
+  }
+
+inline void NvJpegAssert(nvjpegStatus_t code, const char * file, int line)
+{
+  if (code != NVJPEG_STATUS_SUCCESS) {
+    RCLCPP_ERROR(
+      rclcpp::get_logger("autoware_tensorrt_bevdet"), "NVJPEG failure: '#%d' at %s:%d", code, file,
+      line);
+    exit(1);
+  }
+}
+
+struct decode_params_t
+{
+  decode_params_t() {}
+
+  cudaStream_t stream;
+
+  // used with decoupled API
+  nvjpegBufferPinned_t pinned_buffers[2];  // 2 buffers for pipelining
+  nvjpegBufferDevice_t device_buffer;
+  nvjpegJpegStream_t jpeg_streams[2];  //  2 streams for pipelining
+
+  nvjpegDecodeParams_t nvjpeg_decode_params;
+  nvjpegJpegDecoder_t nvjpeg_decoder;
+  nvjpegJpegState_t nvjpeg_decoupled_state;
+};
+
+struct share_params
+{
+  share_params() { fmt = NVJPEG_OUTPUT_UNCHANGED; }
+  explicit share_params(size_t _fmt)
+  {
+    if (
+      _fmt == DECODE_RGB || _fmt == DECODE_BGR || _fmt == DECODE_RGBI || _fmt == DECODE_BGRI ||
+      _fmt == DECODE_UNCHANGED) {
+      fmt = (nvjpegOutputFormat_t)_fmt;
+    } else {
+      RCLCPP_WARN(
+        rclcpp::get_logger("autoware_tensorrt_bevdet"), "Unknown format! Auto switch BGR!");
+      fmt = NVJPEG_OUTPUT_BGR;
+    }
+  }
+
+  nvjpegOutputFormat_t fmt;
+  nvjpegJpegState_t nvjpeg_state;
+  nvjpegHandle_t nvjpeg_handle;
+
+  bool hw_decode_available;
+};
+
+class nvjpegDecoder
+{
+public:
+  nvjpegDecoder() {}
+  nvjpegDecoder(size_t n, size_t _fmt);
+
+  int decode(const std::vector<std::vector<char>> & imgs_data, uchar * out_imgs);
+  int init();
+  ~nvjpegDecoder();
+
+private:
+  size_t N_img;
+  std::vector<nvjpegImage_t> iout;
+  std::vector<nvjpegImage_t> isz;
+
+  std::vector<int> widths;
+  std::vector<int> heights;
+
+  share_params share_param;
+  std::vector<decode_params_t> params;
+};
+
+#endif
+
+#endif  // AUTOWARE__TENSORRT_BEVDET__NVJPEGDECODER_HPP_
diff --git a/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/postprocess.hpp b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/postprocess.hpp
new file mode 100644
index 0000000000000..35a2fe0d44829
--- /dev/null
+++ b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/postprocess.hpp
@@ -0,0 +1,72 @@
+// Copyright 2024 AutoCore, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef AUTOWARE__TENSORRT_BEVDET__POSTPROCESS_HPP_
+#define AUTOWARE__TENSORRT_BEVDET__POSTPROCESS_HPP_
+
+#include "common.hpp"
+#include "iou3d_nms.hpp"
+
+#include <memory>
+#include <vector>
+
+class PostprocessGPU
+{
+public:
+  PostprocessGPU() {}
+  PostprocessGPU(
+    const int _class_num, const float _score_thresh, const float _nms_thresh,
+    const int _nms_pre_maxnum, const int _nms_post_maxnum, const int _down_sample,
+    const int _output_h, const int _output_w, const float _x_step, const float _y_step,
+    const float _x_start, const float _y_start, const std::vector<int> & _class_num_pre_task,
+    const std::vector<float> & _nms_rescale_factor);
+
+  void DoPostprocess(void ** const bev_buffer, std::vector<Box> & out_detections);
+  ~PostprocessGPU();
+
+private:
+  int class_num;
+  float score_thresh;
+  float nms_thresh;
+  int nms_pre_maxnum;
+  int nms_post_maxnum;
+  int down_sample;
+  int output_h;
+  int output_w;
+  float x_step;
+  float y_step;
+  float x_start;
+  float y_start;
+  int map_size;
+  int task_num;
+
+  std::vector<int> class_num_pre_task;
+  std::vector<float> nms_rescale_factor;
+
+  std::unique_ptr<Iou3dNmsCuda> iou3d_nms;
+
+  float * boxes_dev = nullptr;
+  float * score_dev = nullptr;
+  int * cls_dev = nullptr;
+  int * valid_box_num = nullptr;
+  int * sorted_indices_dev = nullptr;
+  std::int64_t * keep_data_host = nullptr;
+  int * sorted_indices_host = nullptr;
+  float * boxes_host = nullptr;
+  float * score_host = nullptr;
+  int * cls_host = nullptr;
+
+  float * nms_rescale_factor_dev = nullptr;
+};
+
+#endif  // AUTOWARE__TENSORRT_BEVDET__POSTPROCESS_HPP_
diff --git a/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/preprocess.hpp b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/preprocess.hpp
new file mode 100644
index 0000000000000..1c09fc925d8b5
--- /dev/null
+++ b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/preprocess.hpp
@@ -0,0 +1,21 @@
+// Copyright 2024 AutoCore, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef AUTOWARE__TENSORRT_BEVDET__PREPROCESS_HPP_
+#define AUTOWARE__TENSORRT_BEVDET__PREPROCESS_HPP_
+
+#include "common.hpp"
+
+void convert_RGBHWC_to_BGRCHW(uchar * input, uchar * output, int channels, int height, int width);
+
+#endif  // AUTOWARE__TENSORRT_BEVDET__PREPROCESS_HPP_
diff --git a/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/preprocess_plugin.hpp b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/preprocess_plugin.hpp
new file mode 100644
index 0000000000000..ed6bbf861ab22
--- /dev/null
+++ b/perception/autoware_tensorrt_bevdet/include/autoware/tensorrt_bevdet/preprocess_plugin.hpp
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef AUTOWARE__TENSORRT_BEVDET__PREPROCESS_PLUGIN_HPP_
+#define AUTOWARE__TENSORRT_BEVDET__PREPROCESS_PLUGIN_HPP_
+
+#include <NvInfer.h>
+#include <unistd.h>
+
+#include <cmath>
+#include <iostream>
+#include <string>
+#include <vector>
+
+namespace
+{
+static const char * PRE_PLUGIN_NAME{"Preprocess"};
+static const char * PRE_PLUGIN_VERSION{"1"};
+}  // namespace
+
+namespace nvinfer1
+{
+class PreprocessPlugin : public IPluginV2DynamicExt
+{
+private:
+  const std::string name_;
+  std::string namespace_;
+  struct
+  {
+    int crop_h;
+    int crop_w;
+    float resize_radio;
+  } m_;
+
+public:
+  PreprocessPlugin() = delete;
+  PreprocessPlugin(const std::string & name, int crop_h, int crop_w, float resize_radio);
+  PreprocessPlugin(const std::string & name, const void * buffer, size_t length);
+  ~PreprocessPlugin();
+
+  // Method inherited from IPluginV2
+  const char * getPluginType() const noexcept override;
+  const char * getPluginVersion() const noexcept override;
+  int32_t getNbOutputs() const noexcept override;
+  int32_t initialize() noexcept override;
+  void terminate() noexcept override;
+  size_t getSerializationSize() const noexcept override;
+  void serialize(void * buffer) const noexcept override;
+  void destroy() noexcept override;
+  void setPluginNamespace(const char * pluginNamespace) noexcept override;
+  const char * getPluginNamespace() const noexcept override;
+
+  // Method inherited from IPluginV2Ext
+  DataType getOutputDataType(
+    int32_t index, DataType const * inputTypes, int32_t nbInputs) const noexcept override;
+  void attachToContext(
+    cudnnContext * contextCudnn, cublasContext * contextCublas,
+    IGpuAllocator * gpuAllocator) noexcept override;
+  void detachFromContext() noexcept override;
+
+  // Method inherited from IPluginV2DynamicExt
+  IPluginV2DynamicExt * clone() const noexcept override;
+  DimsExprs getOutputDimensions(
+    int32_t outputIndex, const DimsExprs * inputs, int32_t nbInputs,
+    IExprBuilder & exprBuilder) noexcept override;
+  bool supportsFormatCombination(
+    int32_t pos, const PluginTensorDesc * inOut, int32_t nbInputs,
+    int32_t nbOutputs) noexcept override;
+  void configurePlugin(
+    const DynamicPluginTensorDesc * in, int32_t nbInputs, const DynamicPluginTensorDesc * out,
+    int32_t nbOutputs) noexcept override;
+  size_t getWorkspaceSize(
+    const PluginTensorDesc * inputs, int32_t nbInputs, const PluginTensorDesc * outputs,
+    int32_t nbOutputs) const noexcept override;
+  int32_t enqueue(
+    const PluginTensorDesc * inputDesc, const PluginTensorDesc * outputDesc,
+    const void * const * inputs, void * const * outputs, void * workspace,
+    cudaStream_t stream) noexcept override;
+
+protected:
+  // To prevent compiler warnings
+  using nvinfer1::IPluginV2::enqueue;
+  using nvinfer1::IPluginV2::getOutputDimensions;
+  using nvinfer1::IPluginV2::getWorkspaceSize;
+  using nvinfer1::IPluginV2Ext::configurePlugin;
+};
+
+class PreprocessPluginCreator : public IPluginCreator
+{
+private:
+  static PluginFieldCollection fc_;
+  static std::vector<PluginField> attr_;
+  std::string namespace_;
+
+public:
+  PreprocessPluginCreator();
+  ~PreprocessPluginCreator();
+  const char * getPluginName() const noexcept override;
+  const char * getPluginVersion() const noexcept override;
+  const PluginFieldCollection * getFieldNames() noexcept override;
+  IPluginV2DynamicExt * createPlugin(
+    const char * name, const PluginFieldCollection * fc) noexcept override;
+  IPluginV2DynamicExt * deserializePlugin(
+    const char * name, const void * serialData, size_t serialLength) noexcept override;
+  void setPluginNamespace(const char * pluginNamespace) noexcept override;
+  const char * getPluginNamespace() const noexcept override;
+};
+
+}  // namespace nvinfer1
+#endif  // AUTOWARE__TENSORRT_BEVDET__PREPROCESS_PLUGIN_HPP_
diff --git a/perception/autoware_tensorrt_bevdet/launch/tensorrt_bevdet.launch.xml b/perception/autoware_tensorrt_bevdet/launch/tensorrt_bevdet.launch.xml
new file mode 100644
index 0000000000000..0017ec5748cc8
--- /dev/null
+++ b/perception/autoware_tensorrt_bevdet/launch/tensorrt_bevdet.launch.xml
@@ -0,0 +1,49 @@
+<?xml version="1.0"?>
+<launch>
+  <!-- cspell:ignore nuscenes, bevdet, dlongterm -->
+
+  <!-- input topic name -->
+  <arg name="input/img_fl" default="/CAM_FRONT_LEFT/image"/>
+  <arg name="input/img_f" default="/CAM_FRONT/image"/>
+  <arg name="input/img_fr" default="/CAM_FRONT_RIGHT/image"/>
+  <arg name="input/img_bl" default="/CAM_BACK_LEFT/image"/>
+  <arg name="input/img_b" default="/CAM_BACK/image"/>
+  <arg name="input/img_br" default="/CAM_BACK_RIGHT/image"/>
+  <arg name="output/boxes" default="boxes"/>
+
+  <arg name="input/img_fl/camera_info" default="/CAM_FRONT_LEFT/camera_info"/>
+  <arg name="input/img_f/camera_info" default="/CAM_FRONT/camera_info"/>
+  <arg name="input/img_fr/camera_info" default="/CAM_FRONT_RIGHT/camera_info"/>
+  <arg name="input/img_bl/camera_info" default="/CAM_BACK_LEFT/camera_info"/>
+  <arg name="input/img_b/camera_info" default="/CAM_BACK/camera_info"/>
+  <arg name="input/img_br/camera_info" default="/CAM_BACK_RIGHT/camera_info"/>
+
+  <arg name="data_path" default="$(env HOME)/autoware_data"/>
+  <arg name="model_name" default="bevdet_one_lt_d"/>
+  <arg name="model_path" default="$(var data_path)/tensorrt_bevdet"/>
+  <arg name="model_config" default="$(find-pkg-share autoware_tensorrt_bevdet)/config/bevdet_r50_4dlongterm_depth.yaml"/>
+  <arg name="param_file" default="$(find-pkg-share autoware_tensorrt_bevdet)/config/bevdet.param.yaml"/>
+
+  <!-- bevdet node -->
+  <node pkg="autoware_tensorrt_bevdet" exec="autoware_tensorrt_bevdet_node" name="tensorrt_bevdet_node" output="screen">
+    <!-- ros topic name -->
+    <remap from="~/input/topic_img_fl" to="$(var input/img_fl)"/>
+    <remap from="~/input/topic_img_f" to="$(var input/img_f)"/>
+    <remap from="~/input/topic_img_fr" to="$(var input/img_fr)"/>
+    <remap from="~/input/topic_img_bl" to="$(var input/img_bl)"/>
+    <remap from="~/input/topic_img_b" to="$(var input/img_b)"/>
+    <remap from="~/input/topic_img_br" to="$(var input/img_br)"/>
+
+    <remap from="~/input/topic_img_fl/camera_info" to="$(var input/img_fl/camera_info)"/>
+    <remap from="~/input/topic_img_f/camera_info" to="$(var input/img_f/camera_info)"/>
+    <remap from="~/input/topic_img_fr/camera_info" to="$(var input/img_fr/camera_info)"/>
+    <remap from="~/input/topic_img_bl/camera_info" to="$(var input/img_bl/camera_info)"/>
+    <remap from="~/input/topic_img_b/camera_info" to="$(var input/img_b/camera_info)"/>
+    <remap from="~/input/topic_img_br/camera_info" to="$(var input/img_br/camera_info)"/>
+
+    <remap from="~/output/boxes" to="$(var output/boxes)"/>
+
+    <param name="model_config" value="$(var model_config)"/>
+    <param from="$(var param_file)" allow_substs="true"/>
+  </node>
+</launch>
diff --git a/perception/autoware_tensorrt_bevdet/package.xml b/perception/autoware_tensorrt_bevdet/package.xml
new file mode 100644
index 0000000000000..d73a11cf6bad9
--- /dev/null
+++ b/perception/autoware_tensorrt_bevdet/package.xml
@@ -0,0 +1,39 @@
+<?xml version="1.0"?>
+<package format="3">
+  <name>autoware_tensorrt_bevdet</name>
+  <version>0.0.1</version>
+  <description>tensorrt library implementation for bevdet</description>
+
+  <author email="cynthia.liu@autocore.ai">Cynthia</author>
+  <maintainer email="cynthia.liu@autocore.ai">Cynthia</maintainer>
+  <maintainer email="yliuhb@connect.ust.hk">Yuxuan Liu</maintainer>
+  <license>Apache License 2.0</license>
+
+  <buildtool_depend>ament_cmake_auto</buildtool_depend>
+  <buildtool_depend>autoware_cmake</buildtool_depend>
+  <buildtool_depend>cudnn_cmake_module</buildtool_depend>
+  <buildtool_depend>tensorrt_cmake_module</buildtool_depend>
+
+  <buildtool_export_depend>cudnn_cmake_module</buildtool_export_depend>
+  <buildtool_export_depend>tensorrt_cmake_module</buildtool_export_depend>
+
+  <depend>autoware_perception_msgs</depend>
+  <depend>cv_bridge</depend>
+  <depend>geometry_msgs</depend>
+  <depend>libopencv-dev</depend>
+  <depend>pcl_conversions</depend>
+  <depend>pcl_ros</depend>
+  <depend>rclcpp</depend>
+  <depend>rclcpp_components</depend>
+  <depend>sensor_msgs</depend>
+  <depend>tensorrt_common</depend>
+  <depend>tf2_geometry_msgs</depend>
+  <depend>yaml-cpp</depend>
+
+  <test_depend>ament_lint_auto</test_depend>
+  <test_depend>autoware_lint_common</test_depend>
+
+  <export>
+    <build_type>ament_cmake</build_type>
+  </export>
+</package>
diff --git a/perception/autoware_tensorrt_bevdet/src/alignbev_plugin.cu b/perception/autoware_tensorrt_bevdet/src/alignbev_plugin.cu
new file mode 100644
index 0000000000000..0321bf103f049
--- /dev/null
+++ b/perception/autoware_tensorrt_bevdet/src/alignbev_plugin.cu
@@ -0,0 +1,352 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "autoware/tensorrt_bevdet/alignbev_plugin.hpp"
+#include "autoware/tensorrt_bevdet/common.hpp"
+
+#include <cuda_fp16.h>
+#include <cuda_runtime_api.h>
+
+static inline __device__ bool within_bounds_2d(int h, int w, int H, int W)
+{
+  return h >= 0 && h < H && w >= 0 && w < W;
+}
+
+template <typename T1, typename T2>
+__global__ void align_bev_kernel(
+  const int nthreads, const T1 * input, const float * trans, T2 * output, TensorDesc output_desc)
+{
+  int C = output_desc.shape[1];        // 80
+  int out_H = output_desc.shape[2];    // 128
+  int out_W = output_desc.shape[3];    // 128
+  int out_sN = output_desc.stride[0];  // 80 * 128 * 128
+  int out_sC = output_desc.stride[1];  // 128 * 128
+  int out_sH = output_desc.stride[2];  // 128
+  int out_sW = output_desc.stride[3];  // 1
+
+  CUDA_1D_KERNEL_LOOP(index, nthreads)
+  {
+    const int w = index % out_W;            //  j
+    const int h = (index / out_W) % out_H;  //  i
+    const int n = index / (out_H * out_W);  // batch
+
+    float ix =
+      trans[n * 6 + 0 * 3 + 0] * w + trans[n * 6 + 0 * 3 + 1] * h + trans[n * 6 + 0 * 3 + 2];
+    float iy =
+      trans[n * 6 + 1 * 3 + 0] * w + trans[n * 6 + 1 * 3 + 1] * h + trans[n * 6 + 1 * 3 + 2];
+
+    // NE, NW, SE, SW point
+    int ix_nw = static_cast<int>(::floor(ix));
+    int iy_nw = static_cast<int>(::floor(iy));
+    int ix_ne = ix_nw + 1;
+    int iy_ne = iy_nw;
+    int ix_sw = ix_nw;
+    int iy_sw = iy_nw + 1;
+    int ix_se = ix_nw + 1;
+    int iy_se = iy_nw + 1;
+
+    T2 nw = (ix_se - ix) * (iy_se - iy);
+    T2 ne = (ix - ix_sw) * (iy_sw - iy);
+    T2 sw = (ix_ne - ix) * (iy - iy_ne);
+    T2 se = (ix - ix_nw) * (iy - iy_nw);
+
+    // bilinear
+    auto inp_ptr_NC = input + n * out_sN;
+    auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW;
+    for (int c = 0; c < C; ++c, inp_ptr_NC += out_sC, out_ptr_NCHW += out_sC) {
+      *out_ptr_NCHW = static_cast<T2>(0);
+      if (within_bounds_2d(iy_nw, ix_nw, out_H, out_W)) {
+        *out_ptr_NCHW += static_cast<T2>(inp_ptr_NC[iy_nw * out_sH + ix_nw * out_sW]) * nw;
+      }
+      if (within_bounds_2d(iy_ne, ix_ne, out_H, out_W)) {
+        *out_ptr_NCHW += static_cast<T2>(inp_ptr_NC[iy_ne * out_sH + ix_ne * out_sW]) * ne;
+      }
+      if (within_bounds_2d(iy_sw, ix_sw, out_H, out_W)) {
+        *out_ptr_NCHW += static_cast<T2>(inp_ptr_NC[iy_sw * out_sH + ix_sw * out_sW]) * sw;
+      }
+      if (within_bounds_2d(iy_se, ix_se, out_H, out_W)) {
+        *out_ptr_NCHW += static_cast<T2>(inp_ptr_NC[iy_se * out_sH + ix_se * out_sW]) * se;
+      }
+    }
+  }
+}
+
+void create_desc(const int * dims, int nb_dims, TensorDesc & desc)
+{
+  memcpy(&desc.shape[0], dims, sizeof(int) * nb_dims);
+  desc.stride[nb_dims - 1] = 1;
+  for (int i = nb_dims - 2; i >= 0; --i) {
+    desc.stride[i] = desc.stride[i + 1] * desc.shape[i + 1];
+  }
+}
+
+namespace nvinfer1
+{
+// class AlignBEVPlugin
+AlignBEVPlugin::AlignBEVPlugin(const std::string & name) : name_(name)
+{
+}
+
+AlignBEVPlugin::AlignBEVPlugin(const std::string & name, const void * buffer, size_t length)
+: name_(name)
+{
+  memcpy(&m_, buffer, sizeof(m_));
+}
+
+AlignBEVPlugin::~AlignBEVPlugin()
+{
+}
+
+IPluginV2DynamicExt * AlignBEVPlugin::clone() const noexcept
+{
+  auto p = new AlignBEVPlugin(name_, &m_, sizeof(m_));
+  p->setPluginNamespace(namespace_.c_str());
+  return p;
+}
+
+int32_t AlignBEVPlugin::getNbOutputs() const noexcept
+{
+  return 1;
+}
+
+DataType AlignBEVPlugin::getOutputDataType(
+  int32_t index, DataType const * inputTypes, int32_t nbInputs) const noexcept
+{
+  return DataType::kFLOAT;  // FIXME
+}
+
+DimsExprs AlignBEVPlugin::getOutputDimensions(
+  int32_t outputIndex, const DimsExprs * inputs, int32_t nbInputs,
+  IExprBuilder & exprBuilder) noexcept
+{
+  return inputs[0];
+}
+
+bool AlignBEVPlugin::supportsFormatCombination(
+  int32_t pos, const PluginTensorDesc * inOut, int32_t nbInputs, int32_t nbOutputs) noexcept
+{
+  // adj_feat
+  if (pos == 0) {
+    return (inOut[pos].type == DataType::kFLOAT || inOut[pos].type == DataType::kHALF) &&
+           inOut[pos].format == TensorFormat::kLINEAR;
+  } else if (pos == 2) {  // out
+    return (inOut[pos].type == DataType::kFLOAT || inOut[pos].type == DataType::kHALF) &&
+           inOut[pos].format == TensorFormat::kLINEAR;
+  } else if (pos == 1) {  // transform
+    return inOut[pos].type == DataType::kFLOAT && inOut[pos].format == TensorFormat::kLINEAR;
+  }
+  return false;
+}
+
+size_t AlignBEVPlugin::getWorkspaceSize(
+  const PluginTensorDesc * inputs, int32_t nbInputs, const PluginTensorDesc * outputs,
+  int32_t nbOutputs) const noexcept
+{
+  return 0;
+}
+
+int32_t AlignBEVPlugin::enqueue(
+  const PluginTensorDesc * inputDesc, const PluginTensorDesc * outputDesc,
+  const void * const * inputs, void * const * outputs, void * workspace,
+  cudaStream_t stream) noexcept
+{
+  // TODO
+  // inputs[0] == adj_feat  b x 8 x 80 x 128 x 128
+  // inputs[1] == transform b x 8 x 6
+
+  int bev_channel = inputDesc[0].dims.d[2];
+  int bev_h = inputDesc[0].dims.d[3];
+  int bev_w = inputDesc[0].dims.d[4];
+  int adj_num = inputDesc[0].dims.d[1];
+
+  int output_dim[4] = {8, bev_channel, bev_h, bev_w};
+
+  TensorDesc output_desc;
+  create_desc(output_dim, 4, output_desc);
+
+  int count = 1;
+  for (int i = 0; i < 4; ++i) {
+    if (i == 1) {
+      continue;
+    }
+    count *= output_desc.shape[i];
+  }
+
+  switch (int(outputDesc[0].type)) {
+    case int(DataType::kFLOAT):
+      if (inputDesc[0].type == DataType::kFLOAT) {
+        // align : fp32, fp32;
+        align_bev_kernel<float, float><<<GET_BLOCKS(count), NUM_THREADS, 0, stream>>>(
+          count, reinterpret_cast<const float *>(inputs[0]),
+          reinterpret_cast<const float *>(inputs[1]), reinterpret_cast<float *>(outputs[0]),
+          output_desc);
+      } else {
+        // align : fp16, fp32
+        align_bev_kernel<__half, float><<<GET_BLOCKS(count), NUM_THREADS, 0, stream>>>(
+          count, reinterpret_cast<const __half *>(inputs[0]),
+          reinterpret_cast<const float *>(inputs[1]), reinterpret_cast<float *>(outputs[0]),
+          output_desc);
+      }
+      break;
+    case int(DataType::kHALF):
+      if (inputDesc[0].type == DataType::kFLOAT) {
+        // align : fp32, fp16
+        align_bev_kernel<float, __half><<<GET_BLOCKS(count), NUM_THREADS, 0, stream>>>(
+          count, reinterpret_cast<const float *>(inputs[0]),
+          reinterpret_cast<const float *>(inputs[1]), reinterpret_cast<__half *>(outputs[0]),
+          output_desc);
+      } else {
+        // align : fp16, fp16
+        align_bev_kernel<__half, __half><<<GET_BLOCKS(count), NUM_THREADS, 0, stream>>>(
+          count, reinterpret_cast<const __half *>(inputs[0]),
+          reinterpret_cast<const float *>(inputs[1]), reinterpret_cast<__half *>(outputs[0]),
+          output_desc);
+      }
+      break;
+    default:  // should NOT be here
+      RCLCPP_ERROR(rclcpp::get_logger("AlignBEVPlugin"), "\tUnsupported datatype!");
+  }
+
+  return 0;
+}
+
+void AlignBEVPlugin::destroy() noexcept
+{
+  delete this;
+  return;
+}
+
+void AlignBEVPlugin::configurePlugin(
+  const DynamicPluginTensorDesc * in, int32_t nbInputs, const DynamicPluginTensorDesc * out,
+  int32_t nbOutputs) noexcept
+{
+  return;
+}
+
+int32_t AlignBEVPlugin::initialize() noexcept
+{
+  return 0;
+}
+
+void AlignBEVPlugin::terminate() noexcept
+{
+  return;
+}
+
+size_t AlignBEVPlugin::getSerializationSize() const noexcept
+{
+  return sizeof(m_);
+}
+
+void AlignBEVPlugin::serialize(void * buffer) const noexcept
+{
+  memcpy(buffer, &m_, sizeof(m_));
+  return;
+}
+
+void AlignBEVPlugin::setPluginNamespace(const char * pluginNamespace) noexcept
+{
+  namespace_ = pluginNamespace;
+  return;
+}
+
+const char * AlignBEVPlugin::getPluginNamespace() const noexcept
+{
+  return namespace_.c_str();
+}
+
+const char * AlignBEVPlugin::getPluginType() const noexcept
+{
+  return ALIGN_PLUGIN_NAME;
+}
+
+const char * AlignBEVPlugin::getPluginVersion() const noexcept
+{
+  return ALIGN_PLUGIN_VERSION;
+}
+
+void AlignBEVPlugin::attachToContext(
+  cudnnContext * contextCudnn, cublasContext * contextCublas, IGpuAllocator * gpuAllocator) noexcept
+{
+  return;
+}
+
+void AlignBEVPlugin::detachFromContext() noexcept
+{
+  return;
+}
+
+// class AlignBEVPluginCreator
+PluginFieldCollection AlignBEVPluginCreator::fc_{};
+std::vector<PluginField> AlignBEVPluginCreator::attr_;
+
+AlignBEVPluginCreator::AlignBEVPluginCreator()
+{
+  attr_.clear();
+  fc_.nbFields = attr_.size();
+  fc_.fields = attr_.data();
+}
+
+AlignBEVPluginCreator::~AlignBEVPluginCreator()
+{
+}
+
+IPluginV2DynamicExt * AlignBEVPluginCreator::createPlugin(
+  const char * name, const PluginFieldCollection * fc) noexcept
+{
+  // const PluginField *fields = fc->fields;
+  AlignBEVPlugin * pObj = new AlignBEVPlugin(name);
+  pObj->setPluginNamespace(namespace_.c_str());
+  return pObj;
+}
+
+IPluginV2DynamicExt * AlignBEVPluginCreator::deserializePlugin(
+  const char * name, const void * serialData, size_t serialLength) noexcept
+{
+  AlignBEVPlugin * pObj = new AlignBEVPlugin(name, serialData, serialLength);
+  pObj->setPluginNamespace(namespace_.c_str());
+  return pObj;
+}
+
+void AlignBEVPluginCreator::setPluginNamespace(const char * pluginNamespace) noexcept
+{
+  namespace_ = pluginNamespace;
+  return;
+}
+
+const char * AlignBEVPluginCreator::getPluginNamespace() const noexcept
+{
+  return namespace_.c_str();
+}
+
+const char * AlignBEVPluginCreator::getPluginName() const noexcept
+{
+  return ALIGN_PLUGIN_NAME;
+}
+
+const char * AlignBEVPluginCreator::getPluginVersion() const noexcept
+{
+  return ALIGN_PLUGIN_VERSION;
+}
+
+const PluginFieldCollection * AlignBEVPluginCreator::getFieldNames() noexcept
+{
+  return &fc_;
+}
+
+REGISTER_TENSORRT_PLUGIN(AlignBEVPluginCreator);
+
+}  // namespace nvinfer1
diff --git a/perception/autoware_tensorrt_bevdet/src/bevdet.cpp b/perception/autoware_tensorrt_bevdet/src/bevdet.cpp
new file mode 100644
index 0000000000000..0296a829f1e35
--- /dev/null
+++ b/perception/autoware_tensorrt_bevdet/src/bevdet.cpp
@@ -0,0 +1,687 @@
+// Copyright 2024 AutoCore, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// cspell:ignore BEVDET, thre, TRTBEV, bevdet, caminfo, intrin, Ncams, bevfeat, bevpool, maxnum,
+// zgrid cspell:ignore gridbev, egobev, adjgrid, currgrid
+#include "autoware/tensorrt_bevdet/bevdet.hpp"
+
+#include "autoware/tensorrt_bevdet/alignbev_plugin.hpp"
+#include "autoware/tensorrt_bevdet/bevpool_plugin.hpp"
+#include "autoware/tensorrt_bevdet/gatherbev_plugin.hpp"
+#include "autoware/tensorrt_bevdet/preprocess_plugin.hpp"
+
+#include <thrust/functional.h>
+#include <thrust/sort.h>
+
+#include <chrono>
+#include <cstdio>
+#include <fstream>
+#include <iostream>
+
+using std::chrono::duration;
+using std::chrono::high_resolution_clock;
+
+namespace autoware::tensorrt_bevdet
+{
+BEVDet::BEVDet(
+  const std::string & config_file, int n_img, std::vector<Eigen::Matrix3f> cams_intrin,
+  std::vector<Eigen::Quaternion<float>> cams2ego_rot,
+  std::vector<Eigen::Translation3f> cams2ego_trans, const std::string & onnx_file,
+  const std::string & engine_file)
+: cams_intrin_(cams_intrin), cams2ego_rot_(cams2ego_rot), cams2ego_trans_(cams2ego_trans)
+{
+  initParams(config_file);
+  if (n_img != n_img_) {
+    RCLCPP_ERROR(
+      rclcpp::get_logger("BEVDet"), "BEVDet need %d images, but given %d images!", n_img_, n_img);
+  }
+  auto start = high_resolution_clock::now();
+
+  std::vector<int> ranks_bev;
+  std::vector<int> ranks_depth;
+  std::vector<int> ranks_feat;
+  std::vector<int> interval_starts;
+  std::vector<int> interval_lengths;
+
+  initViewTransformer(ranks_bev, ranks_depth, ranks_feat, interval_starts, interval_lengths);
+  auto end = high_resolution_clock::now();
+  duration<float> t = end - start;
+  RCLCPP_INFO(
+    rclcpp::get_logger("BEVDet"), "InitVewTransformer cost time : %.4lf ms", t.count() * 1000);
+
+  if (access(engine_file.c_str(), F_OK) == 0) {
+    RCLCPP_INFO(rclcpp::get_logger("BEVDet"), "Inference engine prepared.");
+  } else {
+    // onnx to engine
+    RCLCPP_WARN(
+      rclcpp::get_logger("BEVDet"), "Could not find %s, try making TensorRT engine from onnx",
+      engine_file.c_str());
+    exportEngine(onnx_file, engine_file);
+  }
+  initEngine(engine_file);  // FIXME
+  mallocDeviceMemory();
+
+  if (use_adj_) {
+    adj_frame_ptr_.reset(new AdjFrame(adj_num_, trt_buffer_sizes_[buffer_map_["curr_bevfeat"]]));
+  }
+
+  cam_params_host_.resize(n_img_ * cam_params_size_);
+
+  CHECK_CUDA(cudaMemcpy(
+    trt_buffer_dev_[buffer_map_["ranks_bev"]], ranks_bev.data(), valid_feat_num_ * sizeof(int),
+    cudaMemcpyHostToDevice));
+  CHECK_CUDA(cudaMemcpy(
+    trt_buffer_dev_[buffer_map_["ranks_depth"]], ranks_depth.data(), valid_feat_num_ * sizeof(int),
+    cudaMemcpyHostToDevice));
+  CHECK_CUDA(cudaMemcpy(
+    trt_buffer_dev_[buffer_map_["ranks_feat"]], ranks_feat.data(), valid_feat_num_ * sizeof(int),
+    cudaMemcpyHostToDevice));
+  CHECK_CUDA(cudaMemcpy(
+    trt_buffer_dev_[buffer_map_["interval_starts"]], interval_starts.data(),
+    unique_bev_num_ * sizeof(int), cudaMemcpyHostToDevice));
+  CHECK_CUDA(cudaMemcpy(
+    trt_buffer_dev_[buffer_map_["interval_lengths"]], interval_lengths.data(),
+    unique_bev_num_ * sizeof(int), cudaMemcpyHostToDevice));
+
+  CHECK_CUDA(cudaMemcpy(
+    trt_buffer_dev_[buffer_map_["mean"]], mean_.data(), 3 * sizeof(float), cudaMemcpyHostToDevice));
+  CHECK_CUDA(cudaMemcpy(
+    trt_buffer_dev_[buffer_map_["std"]], std_.data(), 3 * sizeof(float), cudaMemcpyHostToDevice));
+}
+
+void BEVDet::initCamParams(
+  const std::vector<Eigen::Quaternion<float>> & curr_cams2ego_rot,
+  const std::vector<Eigen::Translation3f> & curr_cams2ego_trans,
+  const std::vector<Eigen::Matrix3f> & curr_cams_intrin)
+{
+  for (int i = 0; i < n_img_; i++) {
+    cam_params_host_[i * cam_params_size_ + 0] = curr_cams_intrin[i](0, 0);
+    cam_params_host_[i * cam_params_size_ + 1] = curr_cams_intrin[i](1, 1);
+    cam_params_host_[i * cam_params_size_ + 2] = curr_cams_intrin[i](0, 2);
+    cam_params_host_[i * cam_params_size_ + 3] = curr_cams_intrin[i](1, 2);
+    cam_params_host_[i * cam_params_size_ + 4] = post_rot_(0, 0);
+    cam_params_host_[i * cam_params_size_ + 5] = post_rot_(0, 1);
+    cam_params_host_[i * cam_params_size_ + 6] = post_trans_.translation()(0);
+    cam_params_host_[i * cam_params_size_ + 7] = post_rot_(1, 0);
+    cam_params_host_[i * cam_params_size_ + 8] = post_rot_(1, 1);
+    cam_params_host_[i * cam_params_size_ + 9] = post_trans_.translation()(1);
+    cam_params_host_[i * cam_params_size_ + 10] = 1.f;  // bda 0 0
+    cam_params_host_[i * cam_params_size_ + 11] = 0.f;  // bda 0 1
+    cam_params_host_[i * cam_params_size_ + 12] = 0.f;  // bda 1 0
+    cam_params_host_[i * cam_params_size_ + 13] = 1.f;  // bda 1 1
+    cam_params_host_[i * cam_params_size_ + 14] = 1.f;  // bda 2 2
+    cam_params_host_[i * cam_params_size_ + 15] = curr_cams2ego_rot[i].matrix()(0, 0);
+    cam_params_host_[i * cam_params_size_ + 16] = curr_cams2ego_rot[i].matrix()(0, 1);
+    cam_params_host_[i * cam_params_size_ + 17] = curr_cams2ego_rot[i].matrix()(0, 2);
+    cam_params_host_[i * cam_params_size_ + 18] = curr_cams2ego_trans[i].translation()(0);
+    cam_params_host_[i * cam_params_size_ + 19] = curr_cams2ego_rot[i].matrix()(1, 0);
+    cam_params_host_[i * cam_params_size_ + 20] = curr_cams2ego_rot[i].matrix()(1, 1);
+    cam_params_host_[i * cam_params_size_ + 21] = curr_cams2ego_rot[i].matrix()(1, 2);
+    cam_params_host_[i * cam_params_size_ + 22] = curr_cams2ego_trans[i].translation()(1);
+    cam_params_host_[i * cam_params_size_ + 23] = curr_cams2ego_rot[i].matrix()(2, 0);
+    cam_params_host_[i * cam_params_size_ + 24] = curr_cams2ego_rot[i].matrix()(2, 1);
+    cam_params_host_[i * cam_params_size_ + 25] = curr_cams2ego_rot[i].matrix()(2, 2);
+    cam_params_host_[i * cam_params_size_ + 26] = curr_cams2ego_trans[i].translation()(2);
+  }
+  CHECK_CUDA(cudaMemcpy(
+    trt_buffer_dev_[buffer_map_["cam_params"]], cam_params_host_.data(),
+    trt_buffer_sizes_[buffer_map_["cam_params"]], cudaMemcpyHostToDevice));
+}
+
+void BEVDet::initParams(const std::string & config_file)
+{
+  mean_ = std::vector<float>(3);
+  std_ = std::vector<float>(3);
+
+  YAML::Node model_config = YAML::LoadFile(config_file);
+  n_img_ = model_config["data_config"]["Ncams"].as<int>();
+  src_img_h_ = model_config["data_config"]["src_size"][0].as<int>();
+  src_img_w_ = model_config["data_config"]["src_size"][1].as<int>();
+  input_img_h_ = model_config["data_config"]["input_size"][0].as<int>();
+  input_img_w_ = model_config["data_config"]["input_size"][1].as<int>();
+  resize_radio_ = model_config["data_config"]["resize_radio"].as<float>();
+  crop_h_ = model_config["data_config"]["crop"][0].as<int>();
+  crop_w_ = model_config["data_config"]["crop"][1].as<int>();
+  mean_[0] = model_config["mean"][0].as<float>();
+  mean_[1] = model_config["mean"][1].as<float>();
+  mean_[2] = model_config["mean"][2].as<float>();
+  std_[0] = model_config["std"][0].as<float>();
+  std_[1] = model_config["std"][1].as<float>();
+  std_[2] = model_config["std"][2].as<float>();
+  down_sample_ = model_config["model"]["down_sample"].as<int>();
+  depth_start_ = model_config["grid_config"]["depth"][0].as<float>();
+  depth_end_ = model_config["grid_config"]["depth"][1].as<float>();
+  depth_step_ = model_config["grid_config"]["depth"][2].as<float>();
+  x_start_ = model_config["grid_config"]["x"][0].as<float>();
+  x_end_ = model_config["grid_config"]["x"][1].as<float>();
+  x_step_ = model_config["grid_config"]["x"][2].as<float>();
+  y_start_ = model_config["grid_config"]["y"][0].as<float>();
+  y_end_ = model_config["grid_config"]["y"][1].as<float>();
+  y_step_ = model_config["grid_config"]["y"][2].as<float>();
+  z_start_ = model_config["grid_config"]["z"][0].as<float>();
+  z_end_ = model_config["grid_config"]["z"][1].as<float>();
+  z_step_ = model_config["grid_config"]["z"][2].as<float>();
+  bevpool_channel_ = model_config["model"]["bevpool_channels"].as<int>();
+  nms_pre_maxnum_ = model_config["test_cfg"]["max_per_img"].as<int>();
+  nms_post_maxnum_ = model_config["test_cfg"]["post_max_size"].as<int>();
+  score_thresh_ = model_config["test_cfg"]["score_threshold"].as<float>();
+  nms_overlap_thresh_ = model_config["test_cfg"]["nms_thr"][0].as<float>();
+  use_depth_ = model_config["use_depth"].as<bool>();
+  use_adj_ = model_config["use_adj"].as<bool>();
+  transform_size_ = model_config["transform_size"].as<int>();
+  cam_params_size_ = model_config["cam_params_size"].as<int>();
+
+  std::vector<std::vector<float>> nms_factor_temp =
+    model_config["test_cfg"]["nms_rescale_factor"].as<std::vector<std::vector<float>>>();
+  nms_rescale_factor_.clear();
+  for (const auto & task_factors : nms_factor_temp) {
+    for (float factor : task_factors) {
+      nms_rescale_factor_.push_back(factor);
+    }
+  }
+
+  std::vector<std::vector<std::string>> class_name_pre_task;
+  class_num_ = 0;
+  YAML::Node tasks = model_config["model"]["tasks"];
+  class_num_pre_task_ = std::vector<int>();
+  for (auto it : tasks) {
+    int num = it["num_class"].as<int>();
+    class_num_pre_task_.push_back(num);
+    class_num_ += num;
+    class_name_pre_task.push_back(it["class_names"].as<std::vector<std::string>>());
+  }
+
+  YAML::Node common_head_channel = model_config["model"]["common_head"]["channels"];
+  YAML::Node common_head_name = model_config["model"]["common_head"]["names"];
+  for (size_t i = 0; i < common_head_channel.size(); i++) {
+    out_num_task_head_[common_head_name[i].as<std::string>()] = common_head_channel[i].as<int>();
+  }
+
+  feat_h_ = input_img_h_ / down_sample_;
+  feat_w_ = input_img_w_ / down_sample_;
+  depth_num_ = (depth_end_ - depth_start_) / depth_step_;
+  xgrid_num_ = (x_end_ - x_start_) / x_step_;
+  ygrid_num_ = (y_end_ - y_start_) / y_step_;
+  zgrid_num_ = (z_end_ - z_start_) / z_step_;
+  bev_h_ = ygrid_num_;
+  bev_w_ = xgrid_num_;
+
+  post_rot_ << resize_radio_, 0, 0, 0, resize_radio_, 0, 0, 0, 1;
+  post_trans_.translation() << -crop_w_, -crop_h_, 0;
+
+  adj_num_ = 0;
+  if (use_adj_) {
+    adj_num_ = model_config["adj_num"].as<int>();
+  }
+
+  postprocess_ptr_.reset(new PostprocessGPU(
+    class_num_, score_thresh_, nms_overlap_thresh_, nms_pre_maxnum_, nms_post_maxnum_, down_sample_,
+    bev_h_, bev_w_, x_step_, y_step_, x_start_, y_start_, class_num_pre_task_,
+    nms_rescale_factor_));
+}
+
+void BEVDet::mallocDeviceMemory()
+{
+  trt_buffer_sizes_.resize(trt_engine_->getNbBindings());
+  trt_buffer_dev_ = reinterpret_cast<void **>(new void *[trt_engine_->getNbBindings()]);
+  for (int i = 0; i < trt_engine_->getNbBindings(); i++) {
+    nvinfer1::Dims32 dim = trt_context_->getBindingDimensions(i);
+    size_t size = 1;
+    for (int j = 0; j < dim.nbDims; j++) {
+      size *= dim.d[j];
+    }
+    size *= dataTypeToSize(trt_engine_->getBindingDataType(i));
+    trt_buffer_sizes_[i] = size;
+    CHECK_CUDA(cudaMalloc(&trt_buffer_dev_[i], size));
+  }
+
+  RCLCPP_INFO(rclcpp::get_logger("BEVDet"), "img num binding : %d", trt_engine_->getNbBindings());
+
+  post_buffer_ = reinterpret_cast<void **>(new void *[class_num_pre_task_.size() * 6]);
+  for (size_t i = 0; i < class_num_pre_task_.size(); i++) {
+    post_buffer_[i * 6 + 0] = trt_buffer_dev_[buffer_map_["reg_" + std::to_string(i)]];
+    post_buffer_[i * 6 + 1] = trt_buffer_dev_[buffer_map_["height_" + std::to_string(i)]];
+    post_buffer_[i * 6 + 2] = trt_buffer_dev_[buffer_map_["dim_" + std::to_string(i)]];
+    post_buffer_[i * 6 + 3] = trt_buffer_dev_[buffer_map_["rot_" + std::to_string(i)]];
+    post_buffer_[i * 6 + 4] = trt_buffer_dev_[buffer_map_["vel_" + std::to_string(i)]];
+    post_buffer_[i * 6 + 5] = trt_buffer_dev_[buffer_map_["heatmap_" + std::to_string(i)]];
+  }
+
+  return;
+}
+
+void BEVDet::initViewTransformer(
+  std::vector<int> & ranks_bev, std::vector<int> & ranks_depth, std::vector<int> & ranks_feat,
+  std::vector<int> & interval_starts, std::vector<int> & interval_lengths)
+{
+  int num_points = n_img_ * depth_num_ * feat_h_ * feat_w_;
+  std::vector<Eigen::Vector3f> frustum(num_points);
+
+  for (int i = 0; i < n_img_; i++) {
+    for (int d_ = 0; d_ < depth_num_; d_++) {
+      for (int h_ = 0; h_ < feat_h_; h_++) {
+        for (int w_ = 0; w_ < feat_w_; w_++) {
+          int offset =
+            i * depth_num_ * feat_h_ * feat_w_ + d_ * feat_h_ * feat_w_ + h_ * feat_w_ + w_;
+          frustum[offset].x() = static_cast<float>(w_) * (input_img_w_ - 1) / (feat_w_ - 1);
+          frustum[offset].y() = static_cast<float>(h_) * (input_img_h_ - 1) / (feat_h_ - 1);
+          frustum[offset].z() = static_cast<float>(d_) * depth_step_ + depth_start_;
+
+          // eliminate post transformation
+          frustum[offset] -= post_trans_.translation();
+          frustum[offset] = post_rot_.inverse() * frustum[offset];
+          //
+          frustum[offset].x() *= frustum[offset].z();
+          frustum[offset].y() *= frustum[offset].z();
+          // img to ego -> rot -> trans
+          frustum[offset] = cams2ego_rot_[i] * cams_intrin_[i].inverse() * frustum[offset] +
+                            cams2ego_trans_[i].translation();
+
+          // voxelization
+          frustum[offset] -= Eigen::Vector3f(x_start_, y_start_, z_start_);
+          frustum[offset].x() = static_cast<int>(frustum[offset].x() / x_step_);
+          frustum[offset].y() = static_cast<int>(frustum[offset].y() / y_step_);
+          frustum[offset].z() = static_cast<int>(frustum[offset].z() / z_step_);
+        }
+      }
+    }
+  }
+
+  std::vector<int> _ranks_depth(num_points);
+  std::vector<int> _ranks_feat(num_points);
+
+  for (int i = 0; i < num_points; i++) {
+    _ranks_depth[i] = i;
+  }
+  for (int i = 0; i < n_img_; i++) {
+    for (int d_ = 0; d_ < depth_num_; d_++) {
+      for (int u = 0; u < feat_h_ * feat_w_; u++) {
+        int offset = i * (depth_num_ * feat_h_ * feat_w_) + d_ * (feat_h_ * feat_w_) + u;
+        _ranks_feat[offset] = i * feat_h_ * feat_w_ + u;
+      }
+    }
+  }
+
+  std::vector<int> kept;
+  for (int i = 0; i < num_points; i++) {
+    if (
+      static_cast<int>(frustum[i].x()) >= 0 && static_cast<int>(frustum[i].x()) < xgrid_num_ &&
+      static_cast<int>(frustum[i].y()) >= 0 && static_cast<int>(frustum[i].y()) < ygrid_num_ &&
+      static_cast<int>(frustum[i].z()) >= 0 && static_cast<int>(frustum[i].z()) < zgrid_num_) {
+      kept.push_back(i);
+    }
+  }
+
+  valid_feat_num_ = kept.size();
+  ;
+  ranks_depth.resize(valid_feat_num_);
+  ranks_feat.resize(valid_feat_num_);
+  ranks_bev.resize(valid_feat_num_);
+  std::vector<int> order(valid_feat_num_);
+
+  for (int i = 0; i < valid_feat_num_; i++) {
+    Eigen::Vector3f & p = frustum[kept[i]];
+    ranks_bev[i] = static_cast<int>(p.z()) * xgrid_num_ * ygrid_num_ +
+                   static_cast<int>(p.y()) * xgrid_num_ + static_cast<int>(p.x());
+    order[i] = i;
+  }
+
+  thrust::sort_by_key(ranks_bev.begin(), ranks_bev.end(), order.begin());
+  for (int i = 0; i < valid_feat_num_; i++) {
+    ranks_depth[i] = _ranks_depth[kept[order[i]]];
+    ranks_feat[i] = _ranks_feat[kept[order[i]]];
+  }
+
+  interval_starts.push_back(0);
+  int len = 1;
+  for (int i = 1; i < valid_feat_num_; i++) {
+    if (ranks_bev[i] != ranks_bev[i - 1]) {
+      interval_starts.push_back(i);
+      interval_lengths.push_back(len);
+      len = 1;
+    } else {
+      len++;
+    }
+  }
+
+  interval_lengths.push_back(len);
+  unique_bev_num_ = interval_lengths.size();
+
+  RCLCPP_INFO(rclcpp::get_logger("BEVDet"), "valid_feat_num: %d", valid_feat_num_);
+  RCLCPP_INFO(rclcpp::get_logger("BEVDet"), "unique_bev_num: %d", unique_bev_num_);
+}
+
+void print_dim(nvinfer1::Dims dim, const std::string & name)
+{
+  std::ostringstream oss;
+  oss << name << " : ";
+  for (auto i = 0; i < dim.nbDims; i++) {
+    oss << dim.d[i] << ' ';
+  }
+  RCLCPP_INFO(rclcpp::get_logger("BEVDet"), "%s", oss.str().c_str());
+}
+
+int BEVDet::initEngine(const std::string & engine_file)
+{
+  if (deserializeTRTEngine(engine_file, &trt_engine_)) {
+    return EXIT_FAILURE;
+  }
+
+  if (trt_engine_ == nullptr) {
+    RCLCPP_ERROR(rclcpp::get_logger("BEVDet"), "Failed to deserialize engine file!");
+    return EXIT_FAILURE;
+  }
+  trt_context_ = trt_engine_->createExecutionContext();
+
+  if (trt_context_ == nullptr) {
+    RCLCPP_ERROR(rclcpp::get_logger("BEVDet"), "Failed to create TensorRT Execution Context!");
+    return EXIT_FAILURE;
+  }
+
+  // set bindings
+  std::vector<nvinfer1::Dims32> shapes{
+    {4, {n_img_, 3, src_img_h_, src_img_w_ / 4}},
+    {1, {3}},
+    {1, {3}},
+    {3, {1, n_img_, cam_params_size_}},
+    {1, {valid_feat_num_}},
+    {1, {valid_feat_num_}},
+    {1, {valid_feat_num_}},
+    {1, {unique_bev_num_}},
+    {1, {unique_bev_num_}},
+    {5, {1, adj_num_, bevpool_channel_, bev_h_, bev_w_}},
+    {3, {1, adj_num_, transform_size_}},
+    {2, {1, 1}}};
+
+  for (size_t i = 0; i < shapes.size(); i++) {
+    trt_context_->setBindingDimensions(i, shapes[i]);
+  }
+
+  buffer_map_.clear();
+  for (auto i = 0; i < trt_engine_->getNbBindings(); i++) {
+    auto dim = trt_context_->getBindingDimensions(i);
+    auto name = trt_engine_->getBindingName(i);
+    buffer_map_[name] = i;
+    print_dim(dim, name);
+  }
+
+  return EXIT_SUCCESS;
+}
+
+int BEVDet::deserializeTRTEngine(
+  const std::string & engine_file, nvinfer1::ICudaEngine ** engine_ptr)
+{
+  std::stringstream engine_stream;
+  engine_stream.seekg(0, engine_stream.beg);
+
+  std::ifstream file(engine_file);
+  engine_stream << file.rdbuf();
+  file.close();
+
+  nvinfer1::IRuntime * runtime = nvinfer1::createInferRuntime(g_logger_);
+  if (runtime == nullptr) {
+    std::string msg("Failed to build runtime parser!");
+    g_logger_.log(nvinfer1::ILogger::Severity::kERROR, msg.c_str());
+    return EXIT_FAILURE;
+  }
+  engine_stream.seekg(0, std::ios::end);
+  const int engine_size = engine_stream.tellg();
+
+  engine_stream.seekg(0, std::ios::beg);
+  void * engine_str = malloc(engine_size);
+  engine_stream.read(reinterpret_cast<char *>(engine_str), engine_size);
+
+  nvinfer1::ICudaEngine * engine = runtime->deserializeCudaEngine(engine_str, engine_size, NULL);
+  if (engine == nullptr) {
+    std::string msg("Failed to build engine parser!");
+    g_logger_.log(nvinfer1::ILogger::Severity::kERROR, msg.c_str());
+    return EXIT_FAILURE;
+  }
+  *engine_ptr = engine;
+  for (int bi = 0; bi < engine->getNbBindings(); bi++) {
+    if (engine->bindingIsInput(bi) == true) {
+      RCLCPP_INFO(
+        rclcpp::get_logger("BEVDet"), "Binding %d (%s): Input.", bi, engine->getBindingName(bi));
+    } else {
+      RCLCPP_INFO(
+        rclcpp::get_logger("BEVDet"), "Binding %d (%s): Output.", bi, engine->getBindingName(bi));
+    }
+  }
+  return EXIT_SUCCESS;
+}
+
+void BEVDet::getAdjBEVFeature(
+  const std::string & curr_scene_token, const Eigen::Quaternion<float> & ego2global_rot,
+  const Eigen::Translation3f & ego2global_trans)
+{
+  int flag = 1;
+  if (adj_frame_ptr_->lastScenesToken() != curr_scene_token) {
+    adj_frame_ptr_->reset();
+    flag = 0;
+  }
+
+  // the smaller the idx, the newer th adj_bevfeat
+  for (int i = 0; i < adj_num_; i++) {
+    const void * adj_buffer = adj_frame_ptr_->getFrameBuffer(i);
+
+    size_t buf_size = trt_buffer_sizes_[buffer_map_["adj_feats"]] / adj_num_;
+
+    CHECK_CUDA(cudaMemcpy(
+      reinterpret_cast<char *>(trt_buffer_dev_[buffer_map_["adj_feats"]]) + i * buf_size,
+      adj_buffer, buf_size, cudaMemcpyDeviceToDevice));
+
+    Eigen::Quaternion<float> adj_ego2global_rot;
+    Eigen::Translation3f adj_ego2global_trans;
+    adj_frame_ptr_->getEgo2Global(i, adj_ego2global_rot, adj_ego2global_trans);
+
+    getCurr2AdjTransform(
+      ego2global_rot, adj_ego2global_rot, ego2global_trans, adj_ego2global_trans,
+      reinterpret_cast<float *>(trt_buffer_dev_[buffer_map_["transforms"]]) + i * transform_size_);
+  }
+  CHECK_CUDA(cudaMemcpy(
+    trt_buffer_dev_[buffer_map_["flag"]], &flag, trt_buffer_sizes_[buffer_map_["flag"]],
+    cudaMemcpyHostToDevice));
+}
+
+void BEVDet::getCurr2AdjTransform(
+  const Eigen::Quaternion<float> & curr_ego2global_rot,
+  const Eigen::Quaternion<float> & adj_ego2global_rot,
+  const Eigen::Translation3f & curr_ego2global_trans,
+  const Eigen::Translation3f & adj_ego2global_trans, float * transform_dev)
+{
+  Eigen::Matrix4f curr_e2g_transform;
+  Eigen::Matrix4f adj_e2g_transform;
+
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 3; j++) {
+      curr_e2g_transform(i, j) = curr_ego2global_rot.matrix()(i, j);
+      adj_e2g_transform(i, j) = adj_ego2global_rot.matrix()(i, j);
+    }
+  }
+  for (int i = 0; i < 3; i++) {
+    curr_e2g_transform(i, 3) = curr_ego2global_trans.vector()(i);
+    adj_e2g_transform(i, 3) = adj_ego2global_trans.vector()(i);
+
+    curr_e2g_transform(3, i) = 0.f;
+    adj_e2g_transform(3, i) = 0.f;
+  }
+  curr_e2g_transform(3, 3) = 1.f;
+  adj_e2g_transform(3, 3) = 1.f;
+
+  Eigen::Matrix4f currEgo2adjEgo = adj_e2g_transform.inverse() * curr_e2g_transform;
+  Eigen::Matrix3f currEgo2adjEgo_2d;
+  for (int i = 0; i < 2; i++) {
+    for (int j = 0; j < 2; j++) {
+      currEgo2adjEgo_2d(i, j) = currEgo2adjEgo(i, j);
+    }
+  }
+  currEgo2adjEgo_2d(2, 0) = 0.f;
+  currEgo2adjEgo_2d(2, 1) = 0.f;
+  currEgo2adjEgo_2d(2, 2) = 1.f;
+  currEgo2adjEgo_2d(0, 2) = currEgo2adjEgo(0, 3);
+  currEgo2adjEgo_2d(1, 2) = currEgo2adjEgo(1, 3);
+
+  Eigen::Matrix3f gridbev2egobev;
+  gridbev2egobev(0, 0) = x_step_;
+  gridbev2egobev(1, 1) = y_step_;
+  gridbev2egobev(0, 2) = x_start_;
+  gridbev2egobev(1, 2) = y_start_;
+  gridbev2egobev(2, 2) = 1.f;
+
+  gridbev2egobev(0, 1) = 0.f;
+  gridbev2egobev(1, 0) = 0.f;
+  gridbev2egobev(2, 0) = 0.f;
+  gridbev2egobev(2, 1) = 0.f;
+
+  Eigen::Matrix3f currgrid2adjgrid = gridbev2egobev.inverse() * currEgo2adjEgo_2d * gridbev2egobev;
+
+  CHECK_CUDA(cudaMemcpy(
+    transform_dev, Eigen::Matrix3f(currgrid2adjgrid.transpose()).data(),
+    transform_size_ * sizeof(float), cudaMemcpyHostToDevice));
+}
+
+int BEVDet::doInfer(
+  const camsData & cam_data, std::vector<Box> & out_detections, float & cost_time, int idx)
+{
+  RCLCPP_DEBUG(rclcpp::get_logger("BEVDet"), "---------%d---------", idx + 1);
+
+  auto start = high_resolution_clock::now();
+  CHECK_CUDA(cudaMemcpy(
+    trt_buffer_dev_[buffer_map_["images"]], cam_data.imgs_dev,
+    trt_buffer_sizes_[buffer_map_["images"]], cudaMemcpyDeviceToDevice));
+
+  initCamParams(
+    cam_data.param.cams2ego_rot, cam_data.param.cams2ego_trans, cam_data.param.cams_intrin);
+
+  getAdjBEVFeature(
+    cam_data.param.scene_token, cam_data.param.ego2global_rot, cam_data.param.ego2global_trans);
+
+  if (!trt_context_->executeV2(trt_buffer_dev_)) {
+    RCLCPP_ERROR(rclcpp::get_logger("BEVDet"), "BEVDet forward failing!");
+  }
+
+  adj_frame_ptr_->saveFrameBuffer(
+    trt_buffer_dev_[buffer_map_["curr_bevfeat"]], cam_data.param.scene_token,
+    cam_data.param.ego2global_rot, cam_data.param.ego2global_trans);
+
+  auto end = high_resolution_clock::now();
+
+  postprocess_ptr_->DoPostprocess(post_buffer_, out_detections);
+  CHECK_CUDA(cudaDeviceSynchronize());
+
+  auto post_end = high_resolution_clock::now();
+
+  duration<double> infer_t = post_end - start;
+  duration<double> engine_t = end - start;
+  duration<double> post_t = post_end - end;
+
+  cost_time = infer_t.count() * 1000;
+  RCLCPP_DEBUG(rclcpp::get_logger("BEVDet"), "Inference time: %.5lf ms", engine_t.count() * 1000);
+  RCLCPP_DEBUG(rclcpp::get_logger("BEVDet"), "Postprocess time: %.5lf ms", post_t.count() * 1000);
+  RCLCPP_DEBUG(rclcpp::get_logger("BEVDet"), "Total time: %.5lf ms", infer_t.count() * 1000);
+  RCLCPP_DEBUG(rclcpp::get_logger("BEVDet"), "Detect %ld objects", out_detections.size());
+
+  return EXIT_SUCCESS;
+}
+
+void BEVDet::exportEngine(const std::string & onnxFile, const std::string & trtFile)
+{
+  CHECK_CUDA(cudaSetDevice(0));
+  nvinfer1::ICudaEngine * engine = nullptr;
+  std::vector<nvinfer1::Dims32> min_shapes{
+    {4, {6, 3, 900, 400}}, {1, {3}},      {1, {3}},    {3, {1, 6, 27}}, {1, {200000}},
+    {1, {200000}},         {1, {200000}}, {1, {8000}}, {1, {8000}},     {5, {1, 8, 80, 128, 128}},
+    {3, {1, 8, 6}},        {2, {1, 1}}};
+
+  std::vector<nvinfer1::Dims32> opt_shapes{
+    {4, {6, 3, 900, 400}}, {1, {3}},      {1, {3}},     {3, {1, 6, 27}}, {1, {356760}},
+    {1, {356760}},         {1, {356760}}, {1, {13360}}, {1, {13360}},    {5, {1, 8, 80, 128, 128}},
+    {3, {1, 8, 6}},        {2, {1, 1}}};
+
+  std::vector<nvinfer1::Dims32> max_shapes{
+    {4, {6, 3, 900, 400}}, {1, {3}},      {1, {3}},     {3, {1, 6, 27}}, {1, {370000}},
+    {1, {370000}},         {1, {370000}}, {1, {14000}}, {1, {14000}},    {5, {1, 8, 80, 128, 128}},
+    {3, {1, 8, 6}},        {2, {1, 1}}};
+  nvinfer1::IBuilder * builder = nvinfer1::createInferBuilder(g_logger_);
+  nvinfer1::INetworkDefinition * network =
+    builder->createNetworkV2(1U << int(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));
+  nvinfer1::IOptimizationProfile * profile = builder->createOptimizationProfile();
+  nvinfer1::IBuilderConfig * config = builder->createBuilderConfig();
+  config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 3UL << 32UL);
+
+  config->setFlag(nvinfer1::BuilderFlag::kFP16);
+
+  nvonnxparser::IParser * parser = nvonnxparser::createParser(*network, g_logger_);
+
+  if (!parser->parseFromFile(onnxFile.c_str(), static_cast<int>(g_logger_.reportable_severity))) {
+    RCLCPP_ERROR(rclcpp::get_logger("BEVDet"), "Failed parsing .onnx file!");
+    for (int i = 0; i < parser->getNbErrors(); ++i) {
+      auto * error = parser->getError(i);
+      RCLCPP_ERROR(
+        rclcpp::get_logger("BEVDet"), "Error code: %d, Description: %s",
+        static_cast<int>(error->code()), error->desc());
+    }
+
+    return;
+  }
+  RCLCPP_INFO(rclcpp::get_logger("BEVDet"), "Succeeded parsing .onnx file!");
+
+  for (size_t i = 0; i < min_shapes.size(); i++) {
+    nvinfer1::ITensor * it = network->getInput(i);
+    profile->setDimensions(it->getName(), nvinfer1::OptProfileSelector::kMIN, min_shapes[i]);
+    profile->setDimensions(it->getName(), nvinfer1::OptProfileSelector::kOPT, opt_shapes[i]);
+    profile->setDimensions(it->getName(), nvinfer1::OptProfileSelector::kMAX, max_shapes[i]);
+  }
+  config->addOptimizationProfile(profile);
+
+  nvinfer1::IHostMemory * engineString = builder->buildSerializedNetwork(*network, *config);
+  if (engineString == nullptr || engineString->size() == 0) {
+    RCLCPP_ERROR(rclcpp::get_logger("BEVDet"), "Failed building serialized engine!");
+    return;
+  }
+  RCLCPP_INFO(rclcpp::get_logger("BEVDet"), "Succeeded building serialized engine!");
+
+  nvinfer1::IRuntime * runtime{nvinfer1::createInferRuntime(g_logger_)};
+  engine = runtime->deserializeCudaEngine(engineString->data(), engineString->size());
+  if (engine == nullptr) {
+    RCLCPP_ERROR(rclcpp::get_logger("BEVDet"), "Failed building engine!");
+    return;
+  }
+  RCLCPP_INFO(rclcpp::get_logger("BEVDet"), "Succeeded building engine!");
+
+  std::ofstream engineFile(trtFile, std::ios::binary);
+  if (!engineFile) {
+    RCLCPP_ERROR(rclcpp::get_logger("BEVDet"), "Failed opening file to write");
+    return;
+  }
+  engineFile.write(static_cast<char *>(engineString->data()), engineString->size());
+  if (engineFile.fail()) {
+    RCLCPP_ERROR(rclcpp::get_logger("BEVDet"), "Failed saving .engine file!");
+    return;
+  }
+  RCLCPP_INFO(rclcpp::get_logger("BEVDet"), "Succeeded saving .engine file!");
+}
+
+BEVDet::~BEVDet()
+{
+  for (int i = 0; i < trt_engine_->getNbBindings(); i++) {
+    CHECK_CUDA(cudaFree(trt_buffer_dev_[i]));
+  }
+  delete[] trt_buffer_dev_;
+  delete[] post_buffer_;
+
+  trt_context_->destroy();
+  trt_engine_->destroy();
+}
+}  // namespace autoware::tensorrt_bevdet
diff --git a/perception/autoware_tensorrt_bevdet/src/bevdet_node.cpp b/perception/autoware_tensorrt_bevdet/src/bevdet_node.cpp
new file mode 100644
index 0000000000000..46bac289cd501
--- /dev/null
+++ b/perception/autoware_tensorrt_bevdet/src/bevdet_node.cpp
@@ -0,0 +1,342 @@
+// Copyright 2024 AutoCore, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// cspell:ignore BEVDET, thre, TRTBEV, bevdet, caminfo, intrin, Ncams, bevfeat, dlongterm, RGBHWC,
+// BGRCHW
+#include "autoware/tensorrt_bevdet/bevdet_node.hpp"
+
+#include "autoware/tensorrt_bevdet/preprocess.hpp"
+
+using Label = autoware_perception_msgs::msg::ObjectClassification;
+std::map<int, std::vector<int>> colormap{
+  {0, {0, 0, 255}},  // dodger blue
+  {1, {0, 201, 87}}, {2, {0, 201, 87}}, {3, {160, 32, 240}}, {4, {3, 168, 158}}, {5, {255, 0, 0}},
+  {6, {255, 97, 0}}, {7, {30, 0, 255}}, {8, {255, 0, 0}},    {9, {0, 0, 255}},   {10, {0, 0, 0}}};
+
+uint8_t getSemanticType(const std::string & class_name)
+{
+  if (class_name == "car") {
+    return Label::CAR;
+  } else if (class_name == "truck") {
+    return Label::TRUCK;
+  } else if (class_name == "bus") {
+    return Label::BUS;
+  } else if (class_name == "trailer") {
+    return Label::TRAILER;
+  } else if (class_name == "bicycle") {
+    return Label::BICYCLE;
+  } else if (class_name == "motorcycle") {
+    return Label::MOTORCYCLE;
+  } else if (class_name == "pedestrian") {
+    return Label::PEDESTRIAN;
+  } else {
+    return Label::UNKNOWN;
+  }
+}
+
+void box3DToDetectedObjects(
+  const std::vector<Box> & boxes, autoware_perception_msgs::msg::DetectedObjects & bevdet_objects,
+  const std::vector<std::string> & class_names, float score_thre, const bool has_twist = true)
+{
+  for (auto b : boxes) {
+    if (b.score < score_thre) continue;
+    autoware_perception_msgs::msg::DetectedObject obj;
+
+    Eigen::Vector3f center(b.x, b.y, b.z + b.h / 2.);
+
+    obj.existence_probability = b.score;
+    // classification
+    autoware_perception_msgs::msg::ObjectClassification classification;
+    classification.probability = 1.0f;
+    if (b.label >= 0 && static_cast<size_t>(b.label) < class_names.size()) {
+      classification.label = getSemanticType(class_names[b.label]);
+    } else {
+      classification.label = Label::UNKNOWN;
+    }
+    obj.classification.emplace_back(classification);
+
+    // pose and shape
+    geometry_msgs::msg::Point p;
+    p.x = center.x();
+    p.y = center.y();
+    p.z = center.z();
+    obj.kinematics.pose_with_covariance.pose.position = p;
+
+    tf2::Quaternion q;
+    q.setRPY(0, 0, b.r);
+    obj.kinematics.pose_with_covariance.pose.orientation = tf2::toMsg(q);
+
+    obj.shape.type = autoware_perception_msgs::msg::Shape::BOUNDING_BOX;
+
+    geometry_msgs::msg::Vector3 v;
+    v.x = b.l;
+    v.y = b.w;
+    v.z = b.h;
+    obj.shape.dimensions = v;
+    if (has_twist) {
+      float vel_x = b.vx;
+      float vel_y = b.vy;
+      geometry_msgs::msg::Twist twist;
+      twist.linear.x = std::sqrt(std::pow(vel_x, 2) + std::pow(vel_y, 2));
+      twist.angular.z = 2 * (std::atan2(vel_y, vel_x) - b.r);
+      obj.kinematics.twist_with_covariance.twist = twist;
+      obj.kinematics.has_twist = has_twist;
+    }
+
+    bevdet_objects.objects.emplace_back(obj);
+  }
+}
+
+void getTransform(
+  const geometry_msgs::msg::TransformStamped & transform, Eigen::Quaternion<float> & rot,
+  Eigen::Translation3f & translation)
+{
+  rot = Eigen::Quaternion<float>(
+    transform.transform.rotation.w, transform.transform.rotation.x, transform.transform.rotation.y,
+    transform.transform.rotation.z);
+  translation = Eigen::Translation3f(
+    transform.transform.translation.x, transform.transform.translation.y,
+    transform.transform.translation.z);
+}
+
+void getCameraIntrinsics(
+  const sensor_msgs::msg::CameraInfo::SharedPtr msg, Eigen::Matrix3f & intrinsics)
+{
+  intrinsics << msg->k[0], msg->k[1], msg->k[2], msg->k[3], msg->k[4], msg->k[5], msg->k[6],
+    msg->k[7], msg->k[8];
+}
+
+TRTBEVDetNode::TRTBEVDetNode(
+  const std::string & node_name, const rclcpp::NodeOptions & node_options)
+: rclcpp::Node(node_name, node_options)
+{  // Only start camera info subscription and tf listener at the beginning
+  img_N_ = this->declare_parameter<int>("data_params.N", 6);  // camera num 6
+
+  caminfo_received_ = std::vector<bool>(img_N_, false);
+  cams_intrin = std::vector<Eigen::Matrix3f>(img_N_);
+  cams2ego_rot = std::vector<Eigen::Quaternion<float>>(img_N_);
+  cams2ego_trans = std::vector<Eigen::Translation3f>(img_N_);
+
+  tf_buffer_ = std::make_unique<tf2_ros::Buffer>(this->get_clock());
+  tf_listener_ = std::make_shared<tf2_ros::TransformListener>(*tf_buffer_);
+
+  startCameraInfoSubscription();
+
+  // Wait for camera info and tf transform initialization
+  timer_ = this->create_wall_timer(
+    std::chrono::milliseconds(100), std::bind(&TRTBEVDetNode::checkInitialization, this));
+}
+
+void TRTBEVDetNode::initModel()
+{
+  score_thre_ = this->declare_parameter<float>("post_process_params.score_threshold", 0.2);
+
+  model_config_ = this->declare_parameter("model_config", "bevdet_r50_4dlongterm_depth.yaml");
+
+  onnx_file_ = this->declare_parameter<std::string>("onnx_path", "bevdet_one_lt_d.onnx");
+  engine_file_ = this->declare_parameter<std::string>("engine_path", "bevdet_one_lt_d.engine");
+
+  imgs_name_ = this->declare_parameter<std::vector<std::string>>("data_params.cams");
+  class_names_ =
+    this->declare_parameter<std::vector<std::string>>("post_process_params.class_names");
+
+  RCLCPP_INFO_STREAM(this->get_logger(), "Successful load config!");
+
+  sampleData_.param = camParams(cams_intrin, cams2ego_rot, cams2ego_trans);
+
+  RCLCPP_INFO_STREAM(this->get_logger(), "Successful load image params!");
+
+  bevdet_ = std::make_shared<autoware::tensorrt_bevdet::BEVDet>(
+    model_config_, img_N_, sampleData_.param.cams_intrin, sampleData_.param.cams2ego_rot,
+    sampleData_.param.cams2ego_trans, onnx_file_, engine_file_);
+
+  RCLCPP_INFO_STREAM(this->get_logger(), "Successful create bevdet!");
+
+  CHECK_CUDA(cudaMalloc(
+    reinterpret_cast<void **>(&imgs_dev_), img_N_ * 3 * img_w_ * img_h_ * sizeof(uchar)));
+
+  pub_boxes_ = this->create_publisher<autoware_perception_msgs::msg::DetectedObjects>(
+    "~/output/boxes", rclcpp::QoS{1});
+}
+
+void TRTBEVDetNode::checkInitialization()
+{
+  if (camera_info_received_flag_) {
+    RCLCPP_INFO_STREAM(
+      this->get_logger(), "Camera Info and TF Transform Initialization completed!");
+    initModel();
+    startImageSubscription();
+    timer_->cancel();
+    timer_.reset();
+  } else {
+    RCLCPP_INFO_THROTTLE(
+      this->get_logger(), *this->get_clock(), 5000,
+      "Waiting for Camera Info and TF Transform Initialization...");
+  }
+}
+
+void TRTBEVDetNode::startImageSubscription()
+{
+  using std::placeholders::_1;
+  using std::placeholders::_2;
+  using std::placeholders::_3;
+  using std::placeholders::_4;
+  using std::placeholders::_5;
+  using std::placeholders::_6;
+
+  sub_f_img_.subscribe(
+    this, "~/input/topic_img_f",
+    rclcpp::QoS{1}.get_rmw_qos_profile());  // rmw_qos_profile_sensor_data
+  sub_b_img_.subscribe(this, "~/input/topic_img_b", rclcpp::QoS{1}.get_rmw_qos_profile());
+
+  sub_fl_img_.subscribe(this, "~/input/topic_img_fl", rclcpp::QoS{1}.get_rmw_qos_profile());
+  sub_fr_img_.subscribe(this, "~/input/topic_img_fr", rclcpp::QoS{1}.get_rmw_qos_profile());
+
+  sub_bl_img_.subscribe(this, "~/input/topic_img_bl", rclcpp::QoS{1}.get_rmw_qos_profile());
+  sub_br_img_.subscribe(this, "~/input/topic_img_br", rclcpp::QoS{1}.get_rmw_qos_profile());
+
+  sync_ = std::make_shared<Sync>(
+    MySyncPolicy(10), sub_fl_img_, sub_f_img_, sub_fr_img_, sub_bl_img_, sub_b_img_, sub_br_img_);
+
+  sync_->registerCallback(std::bind(&TRTBEVDetNode::callback, this, _1, _2, _3, _4, _5, _6));
+}
+
+void TRTBEVDetNode::startCameraInfoSubscription()
+{
+  // cams: ["CAM_FRONT_LEFT", "CAM_FRONT", "CAM_FRONT_RIGHT", "CAM_BACK_LEFT", "CAM_BACK",
+  // "CAM_BACK_RIGHT"]
+  sub_fl_caminfo_ = this->create_subscription<sensor_msgs::msg::CameraInfo>(
+    "~/input/topic_img_fl/camera_info", rclcpp::QoS{1},
+    [this](const sensor_msgs::msg::CameraInfo::SharedPtr msg) { camera_info_callback(0, msg); });
+
+  sub_f_caminfo_ = this->create_subscription<sensor_msgs::msg::CameraInfo>(
+    "~/input/topic_img_f/camera_info", rclcpp::QoS{1},
+    [this](const sensor_msgs::msg::CameraInfo::SharedPtr msg) { camera_info_callback(1, msg); });
+
+  sub_fr_caminfo_ = this->create_subscription<sensor_msgs::msg::CameraInfo>(
+    "~/input/topic_img_fr/camera_info", rclcpp::QoS{1},
+    [this](const sensor_msgs::msg::CameraInfo::SharedPtr msg) { camera_info_callback(2, msg); });
+
+  sub_bl_caminfo_ = this->create_subscription<sensor_msgs::msg::CameraInfo>(
+    "~/input/topic_img_bl/camera_info", rclcpp::QoS{1},
+    [this](const sensor_msgs::msg::CameraInfo::SharedPtr msg) { camera_info_callback(3, msg); });
+
+  sub_b_caminfo_ = this->create_subscription<sensor_msgs::msg::CameraInfo>(
+    "~/input/topic_img_b/camera_info", rclcpp::QoS{1},
+    [this](const sensor_msgs::msg::CameraInfo::SharedPtr msg) { camera_info_callback(4, msg); });
+
+  sub_br_caminfo_ = this->create_subscription<sensor_msgs::msg::CameraInfo>(
+    "~/input/topic_img_br/camera_info", rclcpp::QoS{1},
+    [this](const sensor_msgs::msg::CameraInfo::SharedPtr msg) { camera_info_callback(5, msg); });
+}
+
+void image_transport(std::vector<cv::Mat> imgs, uchar * out_imgs, size_t width, size_t height)
+{
+  uchar * temp = new uchar[width * height * 3];
+  uchar * temp_gpu = nullptr;
+  CHECK_CUDA(cudaMalloc(&temp_gpu, width * height * 3));
+
+  for (size_t i = 0; i < imgs.size(); i++) {
+    cv::cvtColor(imgs[i], imgs[i], cv::COLOR_BGR2RGB);
+    CHECK_CUDA(cudaMemcpy(temp_gpu, imgs[i].data, width * height * 3, cudaMemcpyHostToDevice));
+    convert_RGBHWC_to_BGRCHW(temp_gpu, out_imgs + i * width * height * 3, 3, height, width);
+  }
+  delete[] temp;
+  CHECK_CUDA(cudaFree(temp_gpu));
+}
+
+void TRTBEVDetNode::callback(
+  const sensor_msgs::msg::Image::ConstSharedPtr & msg_fl_img,
+  const sensor_msgs::msg::Image::ConstSharedPtr & msg_f_img,
+  const sensor_msgs::msg::Image::ConstSharedPtr & msg_fr_img,
+  const sensor_msgs::msg::Image::ConstSharedPtr & msg_bl_img,
+  const sensor_msgs::msg::Image::ConstSharedPtr & msg_b_img,
+  const sensor_msgs::msg::Image::ConstSharedPtr & msg_br_img)
+{
+  cv::Mat img_fl, img_f, img_fr, img_bl, img_b, img_br;
+  std::vector<cv::Mat> imgs;
+  img_fl = cv_bridge::toCvShare(msg_fl_img, "bgr8")->image;
+  img_f = cv_bridge::toCvShare(msg_f_img, "bgr8")->image;
+  img_fr = cv_bridge::toCvShare(msg_fr_img, "bgr8")->image;
+  img_bl = cv_bridge::toCvShare(msg_bl_img, "bgr8")->image;
+  img_b = cv_bridge::toCvShare(msg_b_img, "bgr8")->image;
+  img_br = cv_bridge::toCvShare(msg_br_img, "bgr8")->image;
+
+  imgs.emplace_back(img_fl);
+  imgs.emplace_back(img_f);
+  imgs.emplace_back(img_fr);
+  imgs.emplace_back(img_bl);
+  imgs.emplace_back(img_b);
+  imgs.emplace_back(img_br);
+
+  image_transport(imgs, imgs_dev_, img_w_, img_h_);
+
+  // uchar *imgs_dev
+  sampleData_.imgs_dev = imgs_dev_;
+
+  std::vector<Box> ego_boxes;
+  ego_boxes.clear();
+  float time = 0.f;
+
+  bevdet_->doInfer(sampleData_, ego_boxes, time);
+
+  autoware_perception_msgs::msg::DetectedObjects bevdet_objects;
+  bevdet_objects.header.frame_id = "base_link";
+  bevdet_objects.header.stamp = msg_f_img->header.stamp;
+
+  box3DToDetectedObjects(ego_boxes, bevdet_objects, class_names_, score_thre_);
+
+  pub_boxes_->publish(bevdet_objects);
+}
+
+void TRTBEVDetNode::camera_info_callback(int idx, const sensor_msgs::msg::CameraInfo::SharedPtr msg)
+{
+  if (caminfo_received_[idx])
+    return;  // already received;  not expected to modify because of we init the model only once
+
+  if (!initialized_) {  // get image width and height
+    img_w_ = msg->width;
+    img_h_ = msg->height;
+    initialized_ = true;
+  }
+  Eigen::Matrix3f intrinsics;
+  getCameraIntrinsics(msg, intrinsics);
+  cams_intrin[idx] = intrinsics;
+
+  Eigen::Quaternion<float> rot;
+  Eigen::Translation3f translation;
+  getTransform(
+    tf_buffer_->lookupTransform("base_link", msg->header.frame_id, rclcpp::Time(0)), rot,
+    translation);
+  cams2ego_rot[idx] = rot;
+  cams2ego_trans[idx] = translation;
+
+  caminfo_received_[idx] = true;
+  camera_info_received_flag_ =
+    std::all_of(caminfo_received_.begin(), caminfo_received_.end(), [](bool i) { return i; });
+}
+
+TRTBEVDetNode::~TRTBEVDetNode()
+{
+  delete imgs_dev_;
+}
+
+int main(int argc, char ** argv)
+{
+  rclcpp::init(argc, argv);
+  rclcpp::NodeOptions node_options;
+  auto bevdet_node = std::make_shared<TRTBEVDetNode>("tensorrt_bevdet_node", node_options);
+  rclcpp::spin(bevdet_node);
+  return 0;
+}
diff --git a/perception/autoware_tensorrt_bevdet/src/bevpool_plugin.cu b/perception/autoware_tensorrt_bevdet/src/bevpool_plugin.cu
new file mode 100644
index 0000000000000..a498a2a1685ba
--- /dev/null
+++ b/perception/autoware_tensorrt_bevdet/src/bevpool_plugin.cu
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "autoware/tensorrt_bevdet/bevpool_plugin.hpp"
+#include "autoware/tensorrt_bevdet/common.hpp"
+
+#include <cuda_fp16.h>
+#include <cuda_runtime_api.h>
+
+// input[0] depth            b*n x d x h x w
+// input[1] feat             b*n x c x h x w
+// input[2] ranks_depth      m
+// input[3] ranks_feat       m
+// input[4] ranks_bev        m
+// input[5] interval_starts  k
+// input[6] interval_lengths k
+
+// out[0]   bevfeat          b x c x h x w
+
+template <typename T1, typename T2>
+__global__ void bev_pool_v2_kernel(
+  int channel, int n_intervals, int map_size, int img_size, const T1 * __restrict__ depth,
+  const T1 * __restrict__ feat, const int * __restrict__ ranks_depth,
+  const int * __restrict__ ranks_feat, const int * __restrict__ ranks_bev,
+  const int * __restrict__ interval_starts, const int * __restrict__ interval_lengths,
+  T2 * __restrict__ out)
+{
+  CUDA_1D_KERNEL_LOOP(idx, n_intervals * channel)
+  {
+    int index = idx / channel;   // bev grid index
+    int curr_c = idx % channel;  // channel index
+    int interval_start = interval_starts[index];
+    int interval_length = interval_lengths[index];
+
+    int curr_step = curr_c * img_size;
+    int chan_step = channel * img_size;
+
+    T2 sum = 0;
+
+    int feat_offset = 0;
+    for (int i = 0; i < interval_length; i++) {
+      feat_offset = ranks_feat[interval_start + i] / img_size * chan_step + curr_step +
+                    ranks_feat[interval_start + i] % img_size;
+
+      sum += static_cast<T2>(feat[feat_offset]) *
+             static_cast<T2>(depth[ranks_depth[interval_start + i]]);
+    }
+    out[curr_c * map_size + ranks_bev[interval_start]] = sum;
+  }
+}
+
+namespace nvinfer1
+{
+// class BEVPoolPlugin
+BEVPoolPlugin::BEVPoolPlugin(const std::string & name, int bev_h, int bev_w, int n) : name_(name)
+{
+  m_.bev_h = bev_h;
+  m_.bev_w = bev_w;
+  m_.n = n;
+}
+
+BEVPoolPlugin::BEVPoolPlugin(const std::string & name, const void * buffer, size_t length)
+: name_(name)
+{
+  memcpy(&m_, buffer, sizeof(m_));
+}
+
+BEVPoolPlugin::~BEVPoolPlugin()
+{
+}
+
+IPluginV2DynamicExt * BEVPoolPlugin::clone() const noexcept
+{
+  auto p = new BEVPoolPlugin(name_, &m_, sizeof(m_));
+  p->setPluginNamespace(namespace_.c_str());
+  return p;
+}
+
+int32_t BEVPoolPlugin::getNbOutputs() const noexcept
+{
+  return 1;
+}
+
+DataType BEVPoolPlugin::getOutputDataType(
+  int32_t index, DataType const * inputTypes, int32_t nbInputs) const noexcept
+{
+  return DataType::kFLOAT;
+}
+
+DimsExprs BEVPoolPlugin::getOutputDimensions(
+  int32_t outputIndex, const DimsExprs * inputs, int32_t nbInputs,
+  IExprBuilder & exprBuilder) noexcept
+{
+  // input[0] depth            b*n x d x h x w
+  // input[1] feat             b*n x c x h x w
+
+  // input[2] ranks_depth      m
+  // input[3] ranks_feat       m
+  // input[4] ranks_bev        m
+  // input[5] interval_starts  k
+  // input[6] interval_lengths k
+
+  // out[0]   bevfeat          b x c x h x w
+
+  DimsExprs ret;
+  ret.nbDims = inputs[1].nbDims;
+  ret.d[0] = exprBuilder.operation(
+    DimensionOperation::kFLOOR_DIV, *inputs[1].d[0], *exprBuilder.constant(m_.n));
+  ret.d[1] = inputs[1].d[1];
+  ret.d[2] = exprBuilder.constant(m_.bev_h);
+  ret.d[3] = exprBuilder.constant(m_.bev_w);
+
+  return ret;
+}
+
+bool BEVPoolPlugin::supportsFormatCombination(
+  int32_t pos, const PluginTensorDesc * inOut, int32_t nbInputs, int32_t nbOutputs) noexcept
+{
+  // depth
+  if (pos == 0) {
+    return (inOut[pos].type == DataType::kFLOAT || inOut[pos].type == DataType::kHALF) &&
+           inOut[pos].format == TensorFormat::kLINEAR;
+  } else if (pos == 1) {  // feat
+    return inOut[0].type == inOut[1].type && inOut[pos].format == TensorFormat::kLINEAR;
+  } else if (pos == 7) {  // out
+    return (inOut[pos].type == DataType::kFLOAT || inOut[pos].type == DataType::kHALF) &&
+           inOut[pos].format == TensorFormat::kLINEAR;
+  } else {
+    return inOut[pos].type == DataType::kINT32 && inOut[pos].format == TensorFormat::kLINEAR;
+  }
+  return false;
+}
+
+void BEVPoolPlugin::configurePlugin(
+  const DynamicPluginTensorDesc * in, int32_t nbInputs, const DynamicPluginTensorDesc * out,
+  int32_t nbOutputs) noexcept
+{
+  return;
+}
+
+size_t BEVPoolPlugin::getWorkspaceSize(
+  const PluginTensorDesc * inputs, int32_t nbInputs, const PluginTensorDesc * outputs,
+  int32_t nbOutputs) const noexcept
+{
+  return 0;
+}
+
+int32_t BEVPoolPlugin::enqueue(
+  const PluginTensorDesc * inputDesc, const PluginTensorDesc * outputDesc,
+  const void * const * inputs, void * const * outputs, void * workspace,
+  cudaStream_t stream) noexcept
+{
+  // input[0] == depth            b*n x d x h x w
+  // input[1] == feat             b*n x c x h x w
+  // input[2] == ranks_depth      m
+  // input[3] == ranks_feat       m
+  // input[4] == ranks_bev        m
+  // input[5] == interval_starts  k
+  // input[6] == interval_lengths k
+
+  int channel = inputDesc[1].dims.d[1];
+  int n_intervals = inputDesc[5].dims.d[0];
+  int map_size = m_.bev_h * m_.bev_w;
+  int img_size = inputDesc[0].dims.d[2] * inputDesc[0].dims.d[3];
+
+  dim3 grid(GET_BLOCKS(n_intervals * channel));
+  dim3 block(NUM_THREADS);
+
+  switch (int(outputDesc[0].type)) {
+    case int(DataType::kFLOAT):
+      if (inputDesc[0].type == DataType::kFLOAT) {
+        // fp32 fp32
+        bev_pool_v2_kernel<float, float><<<grid, block, 0, stream>>>(
+          channel, n_intervals, map_size, img_size, reinterpret_cast<const float *>(inputs[0]),
+          reinterpret_cast<const float *>(inputs[1]), reinterpret_cast<const int *>(inputs[2]),
+          reinterpret_cast<const int *>(inputs[3]), reinterpret_cast<const int *>(inputs[4]),
+          reinterpret_cast<const int *>(inputs[5]), reinterpret_cast<const int *>(inputs[6]),
+          reinterpret_cast<float *>(outputs[0]));
+      } else {
+        // fp16 fp32
+        bev_pool_v2_kernel<__half, float><<<grid, block, 0, stream>>>(
+          channel, n_intervals, map_size, img_size, reinterpret_cast<const __half *>(inputs[0]),
+          reinterpret_cast<const __half *>(inputs[1]), reinterpret_cast<const int *>(inputs[2]),
+          reinterpret_cast<const int *>(inputs[3]), reinterpret_cast<const int *>(inputs[4]),
+          reinterpret_cast<const int *>(inputs[5]), reinterpret_cast<const int *>(inputs[6]),
+          reinterpret_cast<float *>(outputs[0]));
+      }
+      break;
+    case int(DataType::kHALF):
+      if (inputDesc[0].type == DataType::kFLOAT) {
+        // fp32 fp16
+        bev_pool_v2_kernel<float, __half><<<grid, block, 0, stream>>>(
+          channel, n_intervals, map_size, img_size, reinterpret_cast<const float *>(inputs[0]),
+          reinterpret_cast<const float *>(inputs[1]), reinterpret_cast<const int *>(inputs[2]),
+          reinterpret_cast<const int *>(inputs[3]), reinterpret_cast<const int *>(inputs[4]),
+          reinterpret_cast<const int *>(inputs[5]), reinterpret_cast<const int *>(inputs[6]),
+          reinterpret_cast<__half *>(outputs[0]));
+      } else {
+        // fp16 fp16
+        bev_pool_v2_kernel<__half, __half><<<grid, block, 0, stream>>>(
+          channel, n_intervals, map_size, img_size, reinterpret_cast<const __half *>(inputs[0]),
+          reinterpret_cast<const __half *>(inputs[1]), reinterpret_cast<const int *>(inputs[2]),
+          reinterpret_cast<const int *>(inputs[3]), reinterpret_cast<const int *>(inputs[4]),
+          reinterpret_cast<const int *>(inputs[5]), reinterpret_cast<const int *>(inputs[6]),
+          reinterpret_cast<__half *>(outputs[0]));
+      }
+      break;
+    default:  // should NOT be here
+      RCLCPP_ERROR(rclcpp::get_logger("BEVPoolPlugin"), "\tUnsupport datatype!");
+  }
+  return 0;
+}
+
+void BEVPoolPlugin::destroy() noexcept
+{
+  delete this;
+  return;
+}
+
+int32_t BEVPoolPlugin::initialize() noexcept
+{
+  return 0;
+}
+
+void BEVPoolPlugin::terminate() noexcept
+{
+  return;
+}
+
+size_t BEVPoolPlugin::getSerializationSize() const noexcept
+{
+  return sizeof(m_);
+}
+
+void BEVPoolPlugin::serialize(void * buffer) const noexcept
+{
+  memcpy(buffer, &m_, sizeof(m_));
+  return;
+}
+
+void BEVPoolPlugin::setPluginNamespace(const char * pluginNamespace) noexcept
+{
+  namespace_ = pluginNamespace;
+  return;
+}
+
+const char * BEVPoolPlugin::getPluginNamespace() const noexcept
+{
+  return namespace_.c_str();
+}
+
+const char * BEVPoolPlugin::getPluginType() const noexcept
+{
+  return BEVPOOL_PLUGIN_NAME;
+}
+
+const char * BEVPoolPlugin::getPluginVersion() const noexcept
+{
+  return BEVPOOL_PLUGIN_VERSION;
+}
+
+void BEVPoolPlugin::attachToContext(
+  cudnnContext * contextCudnn, cublasContext * contextCublas, IGpuAllocator * gpuAllocator) noexcept
+{
+  return;
+}
+
+void BEVPoolPlugin::detachFromContext() noexcept
+{
+  return;
+}
+
+// class BEVPoolPluginCreator
+PluginFieldCollection BEVPoolPluginCreator::fc_{};
+std::vector<PluginField> BEVPoolPluginCreator::attr_;
+
+BEVPoolPluginCreator::BEVPoolPluginCreator()
+{
+  attr_.clear();
+  attr_.emplace_back(PluginField("bev_h", nullptr, PluginFieldType::kINT32, 1));
+  attr_.emplace_back(PluginField("bev_w", nullptr, PluginFieldType::kINT32, 1));
+  attr_.emplace_back(PluginField("n", nullptr, PluginFieldType::kINT32, 1));
+
+  fc_.nbFields = attr_.size();
+  fc_.fields = attr_.data();
+}
+
+BEVPoolPluginCreator::~BEVPoolPluginCreator()
+{
+}
+
+IPluginV2DynamicExt * BEVPoolPluginCreator::createPlugin(
+  const char * name, const PluginFieldCollection * fc) noexcept
+{
+  const PluginField * fields = fc->fields;
+
+  int bev_h = -1;
+  int bev_w = -1;
+  int n = -1;
+
+  for (int i = 0; i < fc->nbFields; ++i) {
+    if (std::string(fc->fields[i].name) == std::string("bev_h")) {
+      bev_h = *reinterpret_cast<const int *>(fc->fields[i].data);
+    } else if (std::string(fc->fields[i].name) == std::string("bev_w")) {
+      bev_w = *reinterpret_cast<const int *>(fc->fields[i].data);
+    } else if (std::string(fc->fields[i].name) == std::string("n")) {
+      n = *reinterpret_cast<const int *>(fc->fields[i].data);
+    }
+  }
+  BEVPoolPlugin * pObj = new BEVPoolPlugin(name, bev_h, bev_w, n);
+  pObj->setPluginNamespace(namespace_.c_str());
+  return pObj;
+}
+
+IPluginV2DynamicExt * BEVPoolPluginCreator::deserializePlugin(
+  const char * name, const void * serialData, size_t serialLength) noexcept
+{
+  BEVPoolPlugin * pObj = new BEVPoolPlugin(name, serialData, serialLength);
+  pObj->setPluginNamespace(namespace_.c_str());
+  return pObj;
+}
+
+void BEVPoolPluginCreator::setPluginNamespace(const char * pluginNamespace) noexcept
+{
+  namespace_ = pluginNamespace;
+  return;
+}
+
+const char * BEVPoolPluginCreator::getPluginNamespace() const noexcept
+{
+  return namespace_.c_str();
+}
+
+const char * BEVPoolPluginCreator::getPluginName() const noexcept
+{
+  return BEVPOOL_PLUGIN_NAME;
+}
+
+const char * BEVPoolPluginCreator::getPluginVersion() const noexcept
+{
+  return BEVPOOL_PLUGIN_VERSION;
+}
+
+const PluginFieldCollection * BEVPoolPluginCreator::getFieldNames() noexcept
+{
+  return &fc_;
+}
+
+REGISTER_TENSORRT_PLUGIN(BEVPoolPluginCreator);
+
+}  // namespace nvinfer1
diff --git a/perception/autoware_tensorrt_bevdet/src/cpu_jpegdecoder.cpp b/perception/autoware_tensorrt_bevdet/src/cpu_jpegdecoder.cpp
new file mode 100644
index 0000000000000..1afe91becfb95
--- /dev/null
+++ b/perception/autoware_tensorrt_bevdet/src/cpu_jpegdecoder.cpp
@@ -0,0 +1,93 @@
+// Copyright 2024 AutoCore, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "autoware/tensorrt_bevdet/cpu_jpegdecoder.hpp"
+
+#include "autoware/tensorrt_bevdet/preprocess.hpp"
+
+#include <cuda_runtime.h>
+#include <jpeglib.h>
+#include <setjmp.h>
+#include <stddef.h>
+#include <stdio.h>
+
+#include <vector>
+
+struct jpeg_error_handler
+{
+  struct jpeg_error_mgr pub;
+  jmp_buf setjmp_buffer;
+};
+
+int decode_jpeg(const std::vector<char> & buffer, uchar * output)
+{
+  struct jpeg_decompress_struct cinfo;
+  struct jpeg_error_handler jerr;
+  cinfo.err = jpeg_std_error(&jerr.pub);
+
+  jerr.pub.error_exit = [](j_common_ptr cinfo) {
+    jpeg_error_handler * myerr = reinterpret_cast<jpeg_error_handler *>(cinfo->err);
+    longjmp(myerr->setjmp_buffer, 1);
+  };
+
+  if (setjmp(jerr.setjmp_buffer)) {
+    RCLCPP_ERROR(
+      rclcpp::get_logger("decode_jpeg"), "Failed to decompress jpeg: %s",
+      jerr.pub.jpeg_message_table[jerr.pub.msg_code]);
+    jpeg_destroy_decompress(&cinfo);
+    return EXIT_FAILURE;
+  }
+  jpeg_create_decompress(&cinfo);
+
+  if (buffer.size() == 0) {
+    RCLCPP_ERROR(rclcpp::get_logger("decode_jpeg"), "buffer size is 0");
+    return EXIT_FAILURE;
+  }
+  jpeg_mem_src(&cinfo, reinterpret_cast<const uchar *>(buffer.data()), buffer.size());
+  if (jpeg_read_header(&cinfo, TRUE) != JPEG_HEADER_OK) {
+    RCLCPP_ERROR(rclcpp::get_logger("decode_jpeg"), "Failed to read jpeg header");
+    return EXIT_FAILURE;
+  }
+  if (jpeg_start_decompress(&cinfo) != TRUE) {
+    RCLCPP_ERROR(rclcpp::get_logger("decode_jpeg"), "Failed to start decompress");
+    return EXIT_FAILURE;
+  }
+
+  while (cinfo.output_scanline < cinfo.output_height) {
+    auto row = cinfo.output_scanline;
+    uchar * ptr = output + row * cinfo.image_width * 3;
+    jpeg_read_scanlines(&cinfo, (JSAMPARRAY)&ptr, 1);
+  }
+  jpeg_finish_decompress(&cinfo);
+  jpeg_destroy_decompress(&cinfo);
+  return EXIT_SUCCESS;
+}
+
+int decode_cpu(
+  const std::vector<std::vector<char>> & files_data, uchar * out_imgs, size_t width, size_t height)
+{
+  uchar * temp = new uchar[width * height * 3];
+  uchar * temp_gpu = nullptr;
+  CHECK_CUDA(cudaMalloc(&temp_gpu, width * height * 3));
+  for (size_t i = 0; i < files_data.size(); i++) {
+    if (decode_jpeg(files_data[i], temp)) {
+      return EXIT_FAILURE;
+    }
+    CHECK_CUDA(cudaMemcpy(temp_gpu, temp, width * height * 3, cudaMemcpyHostToDevice));
+    convert_RGBHWC_to_BGRCHW(temp_gpu, out_imgs + i * width * height * 3, 3, height, width);
+    CHECK_CUDA(cudaDeviceSynchronize());
+  }
+  CHECK_CUDA(cudaFree(temp_gpu));
+  delete[] temp;
+  return EXIT_SUCCESS;
+}
diff --git a/perception/autoware_tensorrt_bevdet/src/data.cpp b/perception/autoware_tensorrt_bevdet/src/data.cpp
new file mode 100644
index 0000000000000..6dbb02015b5a2
--- /dev/null
+++ b/perception/autoware_tensorrt_bevdet/src/data.cpp
@@ -0,0 +1,84 @@
+// Copyright 2024 AutoCore, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "autoware/tensorrt_bevdet/data.hpp"
+
+#include "autoware/tensorrt_bevdet/cpu_jpegdecoder.hpp"
+
+#include <stdlib.h>
+
+#include <chrono>
+#include <cstdio>
+#include <fstream>
+#include <iostream>
+
+using std::chrono::duration;
+using std::chrono::high_resolution_clock;
+
+camParams::camParams(const YAML::Node & config, int n, std::vector<std::string> & cams_name)
+: N_img(n)
+{
+  if (static_cast<size_t>(n) != cams_name.size()) {
+    RCLCPP_ERROR(
+      rclcpp::get_logger("camParams"), "Error! Need %d camera param, but given %zu camera names!",
+      n, cams_name.size());
+  }
+  ego2global_rot = fromYamlQuater(config["ego2global_rotation"]);
+  ego2global_trans = fromYamlTrans(config["ego2global_translation"]);
+
+  lidar2ego_rot = fromYamlQuater(config["lidar2ego_rotation"]);
+  lidar2ego_trans = fromYamlTrans(config["lidar2ego_translation"]);
+
+  timestamp = config["timestamp"].as<std::int64_t>();
+  scene_token = config["scene_token"].as<std::string>();
+
+  imgs_file.clear();
+
+  cams_intrin.clear();
+  cams2ego_rot.clear();
+  cams2ego_trans.clear();
+
+  for (const std::string & name : cams_name) {
+    imgs_file.push_back("." + config["cams"][name]["data_path"].as<std::string>());
+
+    //
+    cams_intrin.push_back(fromYamlMatrix3f(config["cams"][name]["cam_intrinsic"]));
+    cams2ego_rot.push_back(fromYamlQuater(config["cams"][name]["sensor2ego_rotation"]));
+    cams2ego_trans.push_back(fromYamlTrans(config["cams"][name]["sensor2ego_translation"]));
+    //
+  }
+}
+
+Eigen::Translation3f fromYamlTrans(YAML::Node x)
+{
+  std::vector<float> trans = x.as<std::vector<float>>();
+  return Eigen::Translation3f(trans[0], trans[1], trans[2]);
+}
+
+Eigen::Quaternion<float> fromYamlQuater(YAML::Node x)
+{
+  std::vector<float> quater = x.as<std::vector<float>>();
+  return Eigen::Quaternion<float>(quater[0], quater[1], quater[2], quater[3]);
+}
+
+Eigen::Matrix3f fromYamlMatrix3f(YAML::Node x)
+{
+  std::vector<std::vector<float>> m = x.as<std::vector<std::vector<float>>>();
+  Eigen::Matrix3f mat;
+  for (size_t i = 0; i < m.size(); i++) {
+    for (size_t j = 0; j < m[0].size(); j++) {
+      mat(i, j) = m[i][j];
+    }
+  }
+  return mat;
+}
diff --git a/perception/autoware_tensorrt_bevdet/src/gatherbev_plugin.cu b/perception/autoware_tensorrt_bevdet/src/gatherbev_plugin.cu
new file mode 100644
index 0000000000000..e9fecf8cef37a
--- /dev/null
+++ b/perception/autoware_tensorrt_bevdet/src/gatherbev_plugin.cu
@@ -0,0 +1,308 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "autoware/tensorrt_bevdet/common.hpp"
+#include "autoware/tensorrt_bevdet/gatherbev_plugin.hpp"
+
+#include <cuda_fp16.h>
+#include <cuda_runtime_api.h>
+
+// input[0] == adj_feat   b x 8 x 80 x 128 x 128
+// input[1] == curr_feat  b x 80 x 128 x 128
+// input[2] == flag       b x 1
+// out[0]                 b x (8+1)*80 x 128 x 128
+template <typename T>
+__global__ void copy_feat_kernel(
+  int nthreads,  // b * (adj_num + 1) * map_size
+  int adj_num, int channel, int map_size, const T * adj_feats, const T * curr_feat,
+  const int * flag, T * out_feats)
+{
+  CUDA_1D_KERNEL_LOOP(idx, nthreads)
+  {
+    int b = idx / ((adj_num + 1) * map_size);
+    int n = (idx / map_size) % (adj_num + 1);
+    int m = idx % map_size;
+
+    int start = b * (adj_num + 1) * channel * map_size + n * channel * map_size + m;
+    int end = start + channel * map_size;
+    for (int i = start, c = 0; i < end; i += map_size, c++) {
+      if (flag[b] == 0 || n == 0) {
+        out_feats[i] = curr_feat[b * channel * map_size + c * map_size + m];
+      } else {
+        out_feats[i] = adj_feats[i - channel * map_size];
+      }
+    }
+  }
+}
+
+namespace nvinfer1
+{
+// class GatherBEVPlugin
+GatherBEVPlugin::GatherBEVPlugin(const std::string & name) : name_(name)
+{
+}
+
+GatherBEVPlugin::GatherBEVPlugin(const std::string & name, const void * buffer, size_t length)
+: name_(name)
+{
+  memcpy(&m_, buffer, sizeof(m_));
+}
+
+GatherBEVPlugin::~GatherBEVPlugin()
+{
+}
+
+IPluginV2DynamicExt * GatherBEVPlugin::clone() const noexcept
+{
+  auto p = new GatherBEVPlugin(name_, &m_, sizeof(m_));
+  p->setPluginNamespace(namespace_.c_str());
+  return p;
+}
+
+int32_t GatherBEVPlugin::getNbOutputs() const noexcept
+{
+  return 1;
+}
+
+DataType GatherBEVPlugin::getOutputDataType(
+  int32_t index, DataType const * inputTypes, int32_t nbInputs) const noexcept
+{
+  return inputTypes[0];  // 与adj_feat一致
+}
+
+DimsExprs GatherBEVPlugin::getOutputDimensions(
+  int32_t outputIndex, const DimsExprs * inputs, int32_t nbInputs,
+  IExprBuilder & exprBuilder) noexcept
+{
+  // input[0] == adj_feat   b x 8 x 80 x 128 x 128
+  // input[1] == curr_feat  b x 80 x 128 x 128
+  // input[2] == flag       b x 1
+  // out[0]                 b x (8+1)*80 x 128 x 128
+  DimsExprs ret;
+  ret.nbDims = inputs[0].nbDims - 1;
+
+  IDimensionExpr const * n_feat =
+    exprBuilder.operation(DimensionOperation::kSUM, *inputs[0].d[1], *exprBuilder.constant(1));
+  ret.d[0] = inputs[0].d[0];
+  ret.d[1] = exprBuilder.operation(DimensionOperation::kPROD, *n_feat, *inputs[0].d[2]);
+  ret.d[2] = inputs[0].d[3];
+  ret.d[3] = inputs[0].d[4];
+
+  return ret;
+}
+
+bool GatherBEVPlugin::supportsFormatCombination(
+  int32_t pos, const PluginTensorDesc * inOut, int32_t nbInputs, int32_t nbOutputs) noexcept
+{
+  // adj_feat
+  if (pos == 0) {
+    return (inOut[pos].type == DataType::kFLOAT || inOut[pos].type == DataType::kHALF) &&
+           inOut[pos].format == TensorFormat::kLINEAR;
+  } else if (pos == 1) {  // curr_feat
+    return inOut[0].type == inOut[1].type && inOut[pos].format == TensorFormat::kLINEAR;
+  } else if (pos == 3) {  // out
+    return inOut[0].type == inOut[3].type && inOut[pos].format == TensorFormat::kLINEAR;
+  } else if (pos == 2) {
+    return inOut[pos].type == DataType::kINT32 && inOut[pos].format == TensorFormat::kLINEAR;
+  }
+  return false;
+}
+
+void GatherBEVPlugin::configurePlugin(
+  const DynamicPluginTensorDesc * in, int32_t nbInputs, const DynamicPluginTensorDesc * out,
+  int32_t nbOutputs) noexcept
+{
+  return;
+}
+
+size_t GatherBEVPlugin::getWorkspaceSize(
+  const PluginTensorDesc * inputs, int32_t nbInputs, const PluginTensorDesc * outputs,
+  int32_t nbOutputs) const noexcept
+{
+  return 0;
+}
+
+int32_t GatherBEVPlugin::enqueue(
+  const PluginTensorDesc * inputDesc, const PluginTensorDesc * outputDesc,
+  const void * const * inputs, void * const * outputs, void * workspace,
+  cudaStream_t stream) noexcept
+{
+  // input[0] == adj_feat   b x 8 x 80 x 128 x 128
+  // input[1] == curr_feat  b x 80 x 128 x 128
+  // input[2] == flag       b x 1
+  // out[0]                 b x (8+1)*80 x 128 x 128
+
+  // int nthreads, // b * (adj_num + 1) * map_size
+  // int adj_num,
+  // int channel,
+  // int map_size,
+
+  // int flag = 0;
+  // CHECK_CUDA(cudaMemcpy(&flag, inputs[2], sizeof(int), cudaMemcpyDeviceToHost));
+
+  int b = inputDesc[0].dims.d[0];
+  int adj_num = inputDesc[0].dims.d[1];
+  int map_size = inputDesc[0].dims.d[3] * inputDesc[0].dims.d[4];
+  int channel = inputDesc[0].dims.d[2];
+
+  int feat_step = inputDesc[1].dims.d[1] * inputDesc[1].dims.d[2] * inputDesc[1].dims.d[3];
+
+  int nthreads = b * (adj_num + 1) * map_size;
+
+  dim3 grid(GET_BLOCKS(nthreads));
+  dim3 block(NUM_THREADS);
+
+  switch (int(outputDesc[0].type)) {
+    case int(DataType::kFLOAT):
+      // fp32
+      copy_feat_kernel<<<grid, block, 0, stream>>>(
+        nthreads, adj_num, channel, map_size, reinterpret_cast<const float *>(inputs[0]),
+        reinterpret_cast<const float *>(inputs[1]), reinterpret_cast<const int *>(inputs[2]),
+        reinterpret_cast<float *>(outputs[0]));
+
+      break;
+    case int(DataType::kHALF):
+      // fp16
+      copy_feat_kernel<<<grid, block, 0, stream>>>(
+        nthreads, adj_num, channel, map_size, reinterpret_cast<const __half *>(inputs[0]),
+        reinterpret_cast<const __half *>(inputs[1]), reinterpret_cast<const int *>(inputs[2]),
+        reinterpret_cast<__half *>(outputs[0]));
+      break;
+    default:  // should NOT be here
+      RCLCPP_ERROR(rclcpp::get_logger("GatherBEVPlugin"), "\tUnsupport datatype!");
+  }
+  return 0;
+}
+
+void GatherBEVPlugin::destroy() noexcept
+{
+  delete this;
+  return;
+}
+
+int32_t GatherBEVPlugin::initialize() noexcept
+{
+  return 0;
+}
+
+void GatherBEVPlugin::terminate() noexcept
+{
+  return;
+}
+
+size_t GatherBEVPlugin::getSerializationSize() const noexcept
+{
+  return sizeof(m_);
+}
+
+void GatherBEVPlugin::serialize(void * buffer) const noexcept
+{
+  memcpy(buffer, &m_, sizeof(m_));
+  return;
+}
+
+void GatherBEVPlugin::setPluginNamespace(const char * pluginNamespace) noexcept
+{
+  namespace_ = pluginNamespace;
+  return;
+}
+
+const char * GatherBEVPlugin::getPluginNamespace() const noexcept
+{
+  return namespace_.c_str();
+}
+
+const char * GatherBEVPlugin::getPluginType() const noexcept
+{
+  return GATHERBEV_PLUGIN_NAME;
+}
+
+const char * GatherBEVPlugin::getPluginVersion() const noexcept
+{
+  return GATHERBEV_PLUGIN_VERSION;
+}
+
+void GatherBEVPlugin::attachToContext(
+  cudnnContext * contextCudnn, cublasContext * contextCublas, IGpuAllocator * gpuAllocator) noexcept
+{
+  return;
+}
+
+void GatherBEVPlugin::detachFromContext() noexcept
+{
+  return;
+}
+
+// class GatherBEVPluginCreator
+PluginFieldCollection GatherBEVPluginCreator::fc_{};
+std::vector<PluginField> GatherBEVPluginCreator::attr_;
+
+GatherBEVPluginCreator::GatherBEVPluginCreator()
+{
+  attr_.clear();
+  fc_.nbFields = attr_.size();
+  fc_.fields = attr_.data();
+}
+
+GatherBEVPluginCreator::~GatherBEVPluginCreator()
+{
+}
+
+IPluginV2DynamicExt * GatherBEVPluginCreator::createPlugin(
+  const char * name, const PluginFieldCollection * fc) noexcept
+{
+  GatherBEVPlugin * pObj = new GatherBEVPlugin(name);
+  pObj->setPluginNamespace(namespace_.c_str());
+  return pObj;
+}
+
+IPluginV2DynamicExt * GatherBEVPluginCreator::deserializePlugin(
+  const char * name, const void * serialData, size_t serialLength) noexcept
+{
+  GatherBEVPlugin * pObj = new GatherBEVPlugin(name, serialData, serialLength);
+  pObj->setPluginNamespace(namespace_.c_str());
+  return pObj;
+}
+
+void GatherBEVPluginCreator::setPluginNamespace(const char * pluginNamespace) noexcept
+{
+  namespace_ = pluginNamespace;
+  return;
+}
+
+const char * GatherBEVPluginCreator::getPluginNamespace() const noexcept
+{
+  return namespace_.c_str();
+}
+
+const char * GatherBEVPluginCreator::getPluginName() const noexcept
+{
+  return GATHERBEV_PLUGIN_NAME;
+}
+
+const char * GatherBEVPluginCreator::getPluginVersion() const noexcept
+{
+  return GATHERBEV_PLUGIN_VERSION;
+}
+
+const PluginFieldCollection * GatherBEVPluginCreator::getFieldNames() noexcept
+{
+  return &fc_;
+}
+
+REGISTER_TENSORRT_PLUGIN(GatherBEVPluginCreator);
+
+}  // namespace nvinfer1
diff --git a/perception/autoware_tensorrt_bevdet/src/iou3d_nms.cu b/perception/autoware_tensorrt_bevdet/src/iou3d_nms.cu
new file mode 100644
index 0000000000000..fe68a6f03bb0f
--- /dev/null
+++ b/perception/autoware_tensorrt_bevdet/src/iou3d_nms.cu
@@ -0,0 +1,633 @@
+// Copyright 2024 AutoCore, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+/*
+3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others)
+Written by Shaoshuai Shi
+All Rights Reserved 2019-2022.
+*/
+
+#include "autoware/tensorrt_bevdet/iou3d_nms.hpp"
+
+struct Point
+{
+  float x, y;
+  __device__ Point() {}
+  __device__ Point(double _x, double _y) { x = _x, y = _y; }
+
+  __device__ void set(float _x, float _y)
+  {
+    x = _x;
+    y = _y;
+  }
+
+  __device__ Point operator+(const Point & b) const { return Point(x + b.x, y + b.y); }
+
+  __device__ Point operator-(const Point & b) const { return Point(x - b.x, y - b.y); }
+};
+
+__device__ inline float cross(const Point & a, const Point & b)
+{
+  return a.x * b.y - a.y * b.x;
+}
+
+__device__ inline float cross(const Point & p1, const Point & p2, const Point & p0)
+{
+  return (p1.x - p0.x) * (p2.y - p0.y) - (p2.x - p0.x) * (p1.y - p0.y);
+}
+
+__device__ int check_rect_cross(
+  const Point & p1, const Point & p2, const Point & q1, const Point & q2)
+{
+  int ret = min(p1.x, p2.x) <= max(q1.x, q2.x) && min(q1.x, q2.x) <= max(p1.x, p2.x) &&
+            min(p1.y, p2.y) <= max(q1.y, q2.y) && min(q1.y, q2.y) <= max(p1.y, p2.y);
+  return ret;
+}
+
+__device__ inline int check_in_box2d(const float * box, const Point & p)
+{
+  // params: (7) [x, y, z, dx, dy, dz, heading]
+  const float kMargin = 1e-2;
+
+  float center_x = box[0], center_y = box[1];
+  // rotate the point in the opposite direction of box
+  float angle_cos = cos(-box[6]);
+  float angle_sin = sin(-box[6]);
+  float rot_x = (p.x - center_x) * angle_cos + (p.y - center_y) * (-angle_sin);
+  float rot_y = (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos;
+
+  return (fabs(rot_x) < box[3] / 2 + kMargin && fabs(rot_y) < box[4] / 2 + kMargin);
+}
+
+__device__ inline int intersection(
+  const Point & p1, const Point & p0, const Point & q1, const Point & q0, Point & ans)
+{
+  // fast exclusion
+  if (check_rect_cross(p0, p1, q0, q1) == 0) return 0;
+
+  // check cross standing
+  float s1 = cross(q0, p1, p0);
+  float s2 = cross(p1, q1, p0);
+  float s3 = cross(p0, q1, q0);
+  float s4 = cross(q1, p1, q0);
+
+  if (!(s1 * s2 > 0 && s3 * s4 > 0)) return 0;
+
+  // calculate intersection of two lines
+  float s5 = cross(q1, p1, p0);
+  if (fabs(s5 - s1) > EPS) {
+    ans.x = (s5 * q0.x - s1 * q1.x) / (s5 - s1);
+    ans.y = (s5 * q0.y - s1 * q1.y) / (s5 - s1);
+
+  } else {
+    float a0 = p0.y - p1.y, b0 = p1.x - p0.x, c0 = p0.x * p1.y - p1.x * p0.y;
+    float a1 = q0.y - q1.y, b1 = q1.x - q0.x, c1 = q0.x * q1.y - q1.x * q0.y;
+    float D = a0 * b1 - a1 * b0;
+
+    ans.x = (b0 * c1 - b1 * c0) / D;
+    ans.y = (a1 * c0 - a0 * c1) / D;
+  }
+
+  return 1;
+}
+
+__device__ inline void rotate_around_center(
+  const Point & center, const float angle_cos, const float angle_sin, Point & p)
+{
+  float new_x = (p.x - center.x) * angle_cos + (p.y - center.y) * (-angle_sin) + center.x;
+  float new_y = (p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y;
+  p.set(new_x, new_y);
+}
+
+__device__ inline int point_cmp(const Point & a, const Point & b, const Point & center)
+{
+  return atan2(a.y - center.y, a.x - center.x) > atan2(b.y - center.y, b.x - center.x);
+}
+
+__device__ inline float box_overlap(const float * box_a, const float * box_b)
+{
+  // params box_a: [x, y, z, dx, dy, dz, heading]
+  // params box_b: [x, y, z, dx, dy, dz, heading]
+
+  float a_angle = box_a[6], b_angle = box_b[6];
+  float a_dx_half = box_a[3] / 2, b_dx_half = box_b[3] / 2, a_dy_half = box_a[4] / 2,
+        b_dy_half = box_b[4] / 2;
+  float a_x1 = box_a[0] - a_dx_half, a_y1 = box_a[1] - a_dy_half;
+  float a_x2 = box_a[0] + a_dx_half, a_y2 = box_a[1] + a_dy_half;
+  float b_x1 = box_b[0] - b_dx_half, b_y1 = box_b[1] - b_dy_half;
+  float b_x2 = box_b[0] + b_dx_half, b_y2 = box_b[1] + b_dy_half;
+
+  Point center_a(box_a[0], box_a[1]);
+  Point center_b(box_b[0], box_b[1]);
+
+#ifdef DEBUG
+  RCLCPP_DEBUG(
+    rclcpp::get_logger("iou3d_nms"),
+    "a: (%.3f, %.3f, %.3f, %.3f, %.3f), b: (%.3f, %.3f, %.3f, %.3f, %.3f)", a_x1, a_y1, a_x2, a_y2,
+    a_angle, b_x1, b_y1, b_x2, b_y2, b_angle);
+  RCLCPP_DEBUG(
+    rclcpp::get_logger("iou3d_nms"), "center a: (%.3f, %.3f), b: (%.3f, %.3f)", center_a.x,
+    center_a.y, center_b.x, center_b.y);
+#endif
+
+  Point box_a_corners[5];
+  box_a_corners[0].set(a_x1, a_y1);
+  box_a_corners[1].set(a_x2, a_y1);
+  box_a_corners[2].set(a_x2, a_y2);
+  box_a_corners[3].set(a_x1, a_y2);
+
+  Point box_b_corners[5];
+  box_b_corners[0].set(b_x1, b_y1);
+  box_b_corners[1].set(b_x2, b_y1);
+  box_b_corners[2].set(b_x2, b_y2);
+  box_b_corners[3].set(b_x1, b_y2);
+
+  // get oriented corners
+  float a_angle_cos = cos(a_angle), a_angle_sin = sin(a_angle);
+  float b_angle_cos = cos(b_angle), b_angle_sin = sin(b_angle);
+
+  for (int k = 0; k < 4; k++) {
+#ifdef DEBUG
+    RCLCPP_DEBUG(
+      rclcpp::get_logger("iou3d_nms"), "before corner %d: a(%.3f, %.3f), b(%.3f, %.3f)", k,
+      box_a_corners[k].x, box_a_corners[k].y, box_b_corners[k].x, box_b_corners[k].y);
+#endif
+    rotate_around_center(center_a, a_angle_cos, a_angle_sin, box_a_corners[k]);
+    rotate_around_center(center_b, b_angle_cos, b_angle_sin, box_b_corners[k]);
+#ifdef DEBUG
+    RCLCPP_DEBUG(
+      rclcpp::get_logger("iou3d_nms"), "corner %d: a(%.3f, %.3f), b(%.3f, %.3f)", k,
+      box_a_corners[k].x, box_a_corners[k].y, box_b_corners[k].x, box_b_corners[k].y);
+#endif
+  }
+
+  box_a_corners[4] = box_a_corners[0];
+  box_b_corners[4] = box_b_corners[0];
+
+  // get intersection of lines
+  Point cross_points[16];
+  Point poly_center;
+  int cnt = 0, flag = 0;
+
+  poly_center.set(0, 0);
+  for (int i = 0; i < 4; i++) {
+    for (int j = 0; j < 4; j++) {
+      flag = intersection(
+        box_a_corners[i + 1], box_a_corners[i], box_b_corners[j + 1], box_b_corners[j],
+        cross_points[cnt]);
+      if (flag) {
+        poly_center = poly_center + cross_points[cnt];
+        cnt++;
+#ifdef DEBUG
+        RCLCPP_DEBUG(
+          rclcpp::get_logger("iou3d_nms"),
+          "Cross points (%.3f, %.3f): a(%.3f, %.3f)->(%.3f, %.3f), b(%.3f, %.3f)->(%.3f, %.3f)",
+          cross_points[cnt - 1].x, cross_points[cnt - 1].y, box_a_corners[i].x, box_a_corners[i].y,
+          box_a_corners[i + 1].x, box_a_corners[i + 1].y, box_b_corners[i].x, box_b_corners[i].y,
+          box_b_corners[i + 1].x, box_b_corners[i + 1].y);
+#endif
+      }
+    }
+  }
+
+  // check corners
+  for (int k = 0; k < 4; k++) {
+    if (check_in_box2d(box_a, box_b_corners[k])) {
+      poly_center = poly_center + box_b_corners[k];
+      cross_points[cnt] = box_b_corners[k];
+      cnt++;
+#ifdef DEBUG
+      RCLCPP_DEBUG(
+        rclcpp::get_logger("iou3d_nms"), "b corners in a: corner_b(%.3f, %.3f)",
+        cross_points[cnt - 1].x, cross_points[cnt - 1].y);
+#endif
+    }
+    if (check_in_box2d(box_b, box_a_corners[k])) {
+      poly_center = poly_center + box_a_corners[k];
+      cross_points[cnt] = box_a_corners[k];
+      cnt++;
+#ifdef DEBUG
+      RCLCPP_DEBUG(
+        rclcpp::get_logger("iou3d_nms"), "a corners in b: corner_a(%.3f, %.3f)",
+        cross_points[cnt - 1].x, cross_points[cnt - 1].y);
+#endif
+    }
+  }
+
+  poly_center.x /= cnt;
+  poly_center.y /= cnt;
+
+  // sort the points of polygon
+  Point temp;
+  for (int j = 0; j < cnt - 1; j++) {
+    for (int i = 0; i < cnt - j - 1; i++) {
+      if (point_cmp(cross_points[i], cross_points[i + 1], poly_center)) {
+        temp = cross_points[i];
+        cross_points[i] = cross_points[i + 1];
+        cross_points[i + 1] = temp;
+      }
+    }
+  }
+
+#ifdef DEBUG
+  RCLCPP_DEBUG(rclcpp::get_logger("iou3d_nms"), "cnt=%d", cnt);
+  for (int i = 0; i < cnt; i++) {
+    RCLCPP_DEBUG(
+      rclcpp::get_logger("iou3d_nms"), "All cross point %d: (%.3f, %.3f)", i, cross_points[i].x,
+      cross_points[i].y);
+  }
+#endif
+
+  // get the overlap areas
+  float area = 0;
+  for (int k = 0; k < cnt - 1; k++) {
+    area += cross(cross_points[k] - cross_points[0], cross_points[k + 1] - cross_points[0]);
+  }
+
+  return fabs(area) / 2.0;
+}
+
+__device__ inline float iou_bev(const float * box_a, const float * box_b)
+{
+  // params box_a: [x, y, z, dx, dy, dz, heading]
+  // params box_b: [x, y, z, dx, dy, dz, heading]
+  float sa = box_a[3] * box_a[4];
+  float sb = box_b[3] * box_b[4];
+  float s_overlap = box_overlap(box_a, box_b);
+  return s_overlap / fmaxf(sa + sb - s_overlap, EPS);
+}
+
+__device__ inline float iou_normal(float const * const a, float const * const b)
+{
+  // params: a: [x, y, z, dx, dy, dz, heading]
+  // params: b: [x, y, z, dx, dy, dz, heading]
+  float left = fmaxf(a[0] - a[3] / 2, b[0] - b[3] / 2),
+        right = fminf(a[0] + a[3] / 2, b[0] + b[3] / 2);
+  float top = fmaxf(a[1] - a[4] / 2, b[1] - b[4] / 2),
+        bottom = fminf(a[1] + a[4] / 2, b[1] + b[4] / 2);
+  float width = fmaxf(right - left, 0.f), height = fmaxf(bottom - top, 0.f);
+  float interS = width * height;
+  float Sa = a[3] * a[4];
+  float Sb = b[3] * b[4];
+  return interS / fmaxf(Sa + Sb - interS, EPS);
+}
+
+__global__ void boxes_overlap_kernel(
+  const int num_a, const float * boxes_a, const int num_b, const float * boxes_b,
+  float * ans_overlap)
+{
+  // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
+  // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]
+  const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y;
+  const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
+
+  if (a_idx >= num_a || b_idx >= num_b) {
+    return;
+  }
+  const float * cur_box_a = boxes_a + a_idx * kBoxBlockSize;
+  const float * cur_box_b = boxes_b + b_idx * kBoxBlockSize;
+  float s_overlap = box_overlap(cur_box_a, cur_box_b);
+  ans_overlap[a_idx * num_b + b_idx] = s_overlap;
+}
+
+__global__ void boxes_iou_bev_kernel(
+  const int num_a, const float * boxes_a, const int num_b, const float * boxes_b, float * ans_iou)
+{
+  // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
+  // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]
+  const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y;
+  const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
+
+  if (a_idx >= num_a || b_idx >= num_b) {
+    return;
+  }
+
+  const float * cur_box_a = boxes_a + a_idx * kBoxBlockSize;
+  const float * cur_box_b = boxes_b + b_idx * kBoxBlockSize;
+  float cur_iou_bev = iou_bev(cur_box_a, cur_box_b);
+  ans_iou[a_idx * num_b + b_idx] = cur_iou_bev;
+}
+
+__global__ void RotatedNmsKernel(
+  const int boxes_num, const float nms_overlap_thresh, const float * boxes, std::int64_t * mask)
+{
+  // params: boxes (N, 7) [x, y, z, dx, dy, dz, heading]
+  // params: mask (N, N/THREADS_PER_BLOCK_NMS)
+
+  const int row_start = blockIdx.y;
+  const int col_start = blockIdx.x;
+
+  // if (row_start > col_start) return;
+
+  const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS);
+  const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS);
+
+  __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * kBoxBlockSize];
+
+  if (threadIdx.x < col_size) {
+    block_boxes[threadIdx.x * kBoxBlockSize + 0] =
+      boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * kBoxBlockSize + 0];
+    block_boxes[threadIdx.x * kBoxBlockSize + 1] =
+      boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * kBoxBlockSize + 1];
+    block_boxes[threadIdx.x * kBoxBlockSize + 2] =
+      boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * kBoxBlockSize + 2];
+    block_boxes[threadIdx.x * kBoxBlockSize + 3] =
+      boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * kBoxBlockSize + 3];
+    block_boxes[threadIdx.x * kBoxBlockSize + 4] =
+      boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * kBoxBlockSize + 4];
+    block_boxes[threadIdx.x * kBoxBlockSize + 5] =
+      boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * kBoxBlockSize + 5];
+    block_boxes[threadIdx.x * kBoxBlockSize + 6] =
+      boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * kBoxBlockSize + 6];
+  }
+  __syncthreads();
+
+  if (threadIdx.x < row_size) {
+    const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
+    const float * cur_box = boxes + cur_box_idx * kBoxBlockSize;
+
+    int i = 0;
+    std::int64_t t = 0;
+    int start = 0;
+    if (row_start == col_start) {
+      start = threadIdx.x + 1;
+    }
+    for (i = start; i < col_size; i++) {
+      if (iou_bev(cur_box, block_boxes + i * kBoxBlockSize) > nms_overlap_thresh) {
+        t |= 1ULL << i;
+      }
+    }
+    const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
+    mask[cur_box_idx * col_blocks + col_start] = t;
+  }
+}
+
+__global__ void NmsKernel(
+  const int boxes_num, const float nms_overlap_thresh, const float * boxes, std::int64_t * mask)
+{
+  // params: boxes (N, 7) [x, y, z, dx, dy, dz, heading]
+  // params: mask (N, N/THREADS_PER_BLOCK_NMS)
+
+  const int row_start = blockIdx.y;
+  const int col_start = blockIdx.x;
+
+  // if (row_start > col_start) return;
+
+  const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS);
+  const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS);
+
+  __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * kBoxBlockSize];
+
+  if (threadIdx.x < col_size) {
+    block_boxes[threadIdx.x * kBoxBlockSize + 0] =
+      boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * kBoxBlockSize + 0];
+    block_boxes[threadIdx.x * kBoxBlockSize + 1] =
+      boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * kBoxBlockSize + 1];
+    block_boxes[threadIdx.x * kBoxBlockSize + 2] =
+      boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * kBoxBlockSize + 2];
+    block_boxes[threadIdx.x * kBoxBlockSize + 3] =
+      boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * kBoxBlockSize + 3];
+    block_boxes[threadIdx.x * kBoxBlockSize + 4] =
+      boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * kBoxBlockSize + 4];
+    block_boxes[threadIdx.x * kBoxBlockSize + 5] =
+      boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * kBoxBlockSize + 5];
+    block_boxes[threadIdx.x * kBoxBlockSize + 6] =
+      boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * kBoxBlockSize + 6];
+  }
+  __syncthreads();
+
+  if (threadIdx.x < row_size) {
+    const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
+    const float * cur_box = boxes + cur_box_idx * kBoxBlockSize;
+
+    int i = 0;
+    std::int64_t t = 0;
+    int start = 0;
+    if (row_start == col_start) {
+      start = threadIdx.x + 1;
+    }
+    for (i = start; i < col_size; i++) {
+      if (iou_normal(cur_box, block_boxes + i * kBoxBlockSize) > nms_overlap_thresh) {
+        t |= 1ULL << i;
+      }
+    }
+    const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
+    mask[cur_box_idx * col_blocks + col_start] = t;
+  }
+}
+
+__global__ void RotatedNmsWithIndicesKernel(
+  const int boxes_num, const float nms_overlap_thresh, const float * res_box,
+  const int * res_sorted_indices, std::int64_t * mask)
+{
+  const int row_start = blockIdx.y;
+  const int col_start = blockIdx.x;
+
+  const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS);
+  const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS);
+
+  __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * kBoxBlockSize];
+
+  if (threadIdx.x < col_size) {  // 419, 420 have two situations to discuss separately
+    const int col_actual_idx = res_sorted_indices[THREADS_PER_BLOCK_NMS * col_start + threadIdx.x];
+    block_boxes[threadIdx.x * kBoxBlockSize + 0] = res_box[col_actual_idx * kBoxBlockSize + 0];
+    block_boxes[threadIdx.x * kBoxBlockSize + 1] = res_box[col_actual_idx * kBoxBlockSize + 1];
+    block_boxes[threadIdx.x * kBoxBlockSize + 2] = res_box[col_actual_idx * kBoxBlockSize + 2];
+    block_boxes[threadIdx.x * kBoxBlockSize + 3] = res_box[col_actual_idx * kBoxBlockSize + 3];
+    block_boxes[threadIdx.x * kBoxBlockSize + 4] = res_box[col_actual_idx * kBoxBlockSize + 4];
+    block_boxes[threadIdx.x * kBoxBlockSize + 5] = res_box[col_actual_idx * kBoxBlockSize + 5];
+    block_boxes[threadIdx.x * kBoxBlockSize + 6] = res_box[col_actual_idx * kBoxBlockSize + 6];
+  }
+  __syncthreads();  // Waiting for threads within the block
+
+  if (threadIdx.x < row_size) {  //
+    const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
+    const int row_actual_idx = res_sorted_indices[cur_box_idx];
+
+    float cur_box[kBoxBlockSize];
+    cur_box[0] = res_box[row_actual_idx * kBoxBlockSize + 0];
+    cur_box[1] = res_box[row_actual_idx * kBoxBlockSize + 1];
+    cur_box[2] = res_box[row_actual_idx * kBoxBlockSize + 2];
+    cur_box[3] = res_box[row_actual_idx * kBoxBlockSize + 3];
+    cur_box[4] = res_box[row_actual_idx * kBoxBlockSize + 4];
+    cur_box[5] = res_box[row_actual_idx * kBoxBlockSize + 5];
+    cur_box[6] = res_box[row_actual_idx * kBoxBlockSize + 6];
+
+    int i = 0;
+    std::int64_t t = 0;
+    int start = 0;
+    if (row_start == col_start) {
+      start = threadIdx.x + 1;  // Isn't it right now to compare with [0~threadIdx. x)? FIXME
+    }  // I think this is correct, and there is no need to compare it with numbers smaller than my
+       // own. If col is smaller than row, there is also no need to compare it
+    for (i = start; i < col_size; i++) {
+      if (iou_bev(cur_box, block_boxes + i * kBoxBlockSize) > nms_overlap_thresh) {
+        t |= 1ULL << i;
+      }
+    }
+
+    const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
+    // assume cur_box_idx = 21, col_start = 0, row_start = 0 , threadIdx = 21,
+    // mark 21 th box and top 64 boxes
+    mask[cur_box_idx * col_blocks + col_start] = t;
+    // or col_start * boxes_num + cur_box_idx
+  }
+}
+
+__global__ void box_assign_kernel(
+  float * reg, float * height, float * dim, float * rot, float * boxes, float * score, int * label,
+  float * out_score, int * out_label, int * validIndexs, int head_x_size, int head_y_size)
+{
+  int boxId = blockIdx.x;
+  int channel = threadIdx.x;
+  int idx = validIndexs[boxId];
+  if (channel == 0) {
+    boxes[boxId * kBoxBlockSize + 0] = reg[idx];
+  } else if (channel == 1) {
+    boxes[boxId * kBoxBlockSize + 1] = reg[idx + head_x_size * head_y_size];
+  } else if (channel == 2) {
+    boxes[boxId * kBoxBlockSize + 2] = height[idx];
+  } else if (channel == 3) {
+    boxes[boxId * kBoxBlockSize + 3] = dim[idx];
+  } else if (channel == 4) {
+    boxes[boxId * kBoxBlockSize + 4] = dim[idx + head_x_size * head_y_size];
+  } else if (channel == 5) {
+    boxes[boxId * kBoxBlockSize + 5] = dim[idx + 2 * head_x_size * head_y_size];
+  } else if (channel == 6) {
+    float theta =
+      atan2f(rot[0 * head_x_size * head_y_size + idx], rot[1 * head_x_size * head_y_size + idx]);
+    theta = -theta - 3.1415926 / 2;
+    boxes[boxId * kBoxBlockSize + 6] = theta;
+  }
+  // else if(channel == 7)
+  // out_score[boxId] = score[idx];
+  else if (channel == 8) {
+    out_label[boxId] = label[idx];
+  }
+}
+
+void box_assign_launcher(
+  float * reg, float * height, float * dim, float * rot, float * boxes, float * score, int * label,
+  float * out_score, int * out_label, int * validIndexs, int boxSize, int head_x_size,
+  int head_y_size)
+{
+  box_assign_kernel<<<boxSize, 9>>>(
+    reg, height, dim, rot, boxes, score, label, out_score, out_label, validIndexs, head_x_size,
+    head_y_size);
+}
+
+__global__ void index_assign(int * indexs)
+{
+  int yIdx = blockIdx.x;
+  int xIdx = threadIdx.x;
+  int idx = yIdx * blockDim.x + xIdx;
+  indexs[idx] = idx;
+}
+
+void index_assign_launcher(int * indexs, int head_x_size, int head_y_size)
+{
+  index_assign<<<head_x_size, head_y_size>>>(indexs);
+}
+
+void boxes_overlap_launcher(
+  const int num_a, const float * boxes_a, const int num_b, const float * boxes_b,
+  float * ans_overlap)
+{
+  dim3 blocks(
+    DIVUP(num_b, THREADS_PER_BLOCK),
+    DIVUP(num_a, THREADS_PER_BLOCK));  // blockIdx.x(col), blockIdx.y(row)
+  dim3 threads(THREADS_PER_BLOCK, THREADS_PER_BLOCK);
+
+  boxes_overlap_kernel<<<blocks, threads>>>(num_a, boxes_a, num_b, boxes_b, ans_overlap);
+#ifdef DEBUG
+  cudaDeviceSynchronize();  // for using printf in kernel function
+#endif
+}
+
+void boxes_iou_bev_launcher(
+  const int num_a, const float * boxes_a, const int num_b, const float * boxes_b, float * ans_iou)
+{
+  dim3 blocks(
+    DIVUP(num_b, THREADS_PER_BLOCK),
+    DIVUP(num_a, THREADS_PER_BLOCK));  // blockIdx.x(col), blockIdx.y(row)
+  dim3 threads(THREADS_PER_BLOCK, THREADS_PER_BLOCK);
+
+  boxes_iou_bev_kernel<<<blocks, threads>>>(num_a, boxes_a, num_b, boxes_b, ans_iou);
+#ifdef DEBUG
+  cudaDeviceSynchronize();  // for using printf in kernel function
+#endif
+}
+
+void nms_launcher(const float * boxes, std::int64_t * mask, int boxes_num, float nms_overlap_thresh)
+{
+  dim3 blocks(DIVUP(boxes_num, THREADS_PER_BLOCK_NMS), DIVUP(boxes_num, THREADS_PER_BLOCK_NMS));
+  dim3 threads(THREADS_PER_BLOCK_NMS);
+
+  RotatedNmsKernel<<<blocks, threads>>>(boxes_num, nms_overlap_thresh, boxes, mask);
+}
+
+void nms_normal_launcher(
+  const float * boxes, std::int64_t * mask, int boxes_num, float nms_overlap_thresh)
+{
+  dim3 blocks(DIVUP(boxes_num, THREADS_PER_BLOCK_NMS), DIVUP(boxes_num, THREADS_PER_BLOCK_NMS));
+  dim3 threads(THREADS_PER_BLOCK_NMS);
+  NmsKernel<<<blocks, threads>>>(boxes_num, nms_overlap_thresh, boxes, mask);
+}
+
+Iou3dNmsCuda::Iou3dNmsCuda(
+  const int head_x_size, const int head_y_size, const float nms_overlap_thresh)
+: head_x_size_(head_x_size), head_y_size_(head_y_size), nms_overlap_thresh_(nms_overlap_thresh)
+{
+}
+
+int Iou3dNmsCuda::DoIou3dNms(
+  const int box_num_pre, const float * res_box, const int * res_sorted_indices,
+  std::int64_t * host_keep_data)
+{
+  const int col_blocks = DIVUP(box_num_pre, THREADS_PER_BLOCK_NMS);  // How many blocks are divided
+  std::int64_t * dev_mask = NULL;
+  cudaMalloc((void **)&dev_mask, box_num_pre * col_blocks * sizeof(std::int64_t));  //
+  // THREADS_PER_BLOCK_NMS 掩码的长度
+  dim3 blocks(DIVUP(box_num_pre, THREADS_PER_BLOCK_NMS), DIVUP(box_num_pre, THREADS_PER_BLOCK_NMS));
+  dim3 threads(THREADS_PER_BLOCK_NMS);
+  RotatedNmsWithIndicesKernel<<<blocks, threads>>>(
+    box_num_pre, nms_overlap_thresh_, res_box, res_sorted_indices, dev_mask);
+
+  std::int64_t host_mask[box_num_pre * col_blocks];
+  cudaMemcpy(
+    host_mask, dev_mask, box_num_pre * col_blocks * sizeof(std::int64_t), cudaMemcpyDeviceToHost);
+  cudaFree(dev_mask);
+
+  std::int64_t host_remv[col_blocks];
+  memset(host_remv, 0, col_blocks * sizeof(std::int64_t));
+  int num_to_keep = 0;
+  for (int i = 0; i < box_num_pre; i++) {
+    int nblock = i / THREADS_PER_BLOCK_NMS;
+    int inblock = i % THREADS_PER_BLOCK_NMS;
+    if (!(host_remv[nblock] &
+          (1ULL << inblock))) {  // Meeting this requirement indicates that it does not conflict
+                                 // with any of the boxes selected earlier
+      host_keep_data[num_to_keep++] = i;  // Add the satisfied box
+      for (int j = nblock; j < col_blocks;
+           j++) {  // Consider boxes with index i that overlap beyond the threshold
+        host_remv[j] |= host_mask[i * col_blocks + j];
+      }
+    }
+  }
+
+  if (cudaSuccess != cudaGetLastError()) {
+    RCLCPP_ERROR(
+      rclcpp::get_logger("iou3d_nms"), "Error: %s", cudaGetErrorString(cudaGetLastError()));
+  }
+  return num_to_keep;
+}
diff --git a/perception/autoware_tensorrt_bevdet/src/nvjpegdecoder.cpp b/perception/autoware_tensorrt_bevdet/src/nvjpegdecoder.cpp
new file mode 100644
index 0000000000000..7428c997e94e9
--- /dev/null
+++ b/perception/autoware_tensorrt_bevdet/src/nvjpegdecoder.cpp
@@ -0,0 +1,299 @@
+// Copyright 2024 AutoCore, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifdef __HAVE_NVJPEG__
+
+#include "autoware/tensorrt_bevdet/nvjpegdecoder.hpp"
+
+#include <chrono>
+
+using std::chrono::duration;
+using std::chrono::high_resolution_clock;
+
+int dev_malloc(void ** p, size_t s)
+{
+  return static_cast<int>(cudaMalloc(p, s));
+}
+
+int dev_free(void * p)
+{
+  return static_cast<int>(cudaFree(p));
+}
+
+int host_malloc(void ** p, size_t s, unsigned int f)
+{
+  return static_cast<int>(cudaHostAlloc(p, s, f));
+}
+
+int host_free(void * p)
+{
+  return static_cast<int>(cudaFreeHost(p));
+}
+
+int prepare_buffers(
+  const std::vector<std::vector<char>> & file_data, std::vector<int> & img_width,
+  std::vector<int> & img_height, std::vector<nvjpegImage_t> & ibuf,
+  std::vector<nvjpegImage_t> & isz, share_params & share_param)
+{
+  int widths[NVJPEG_MAX_COMPONENT];
+  int heights[NVJPEG_MAX_COMPONENT];
+  int channels;
+  nvjpegChromaSubsampling_t subsampling;
+
+  for (size_t i = 0; i < file_data.size(); i++) {
+    CHECK_NVJPEG(nvjpegGetImageInfo(
+      share_param.nvjpeg_handle, reinterpret_cast<uchar *>(file_data[i].data()),
+      file_data[i].size(), &channels, &subsampling, widths, heights));
+
+    int mul = 1;
+    // in the case of interleaved RGB output, write only to single channel, but
+    // 3 samples at once
+    if (share_param.fmt == NVJPEG_OUTPUT_RGBI || share_param.fmt == NVJPEG_OUTPUT_BGRI) {
+      channels = 1;
+      mul = 3;
+    } else if (share_param.fmt == NVJPEG_OUTPUT_RGB || share_param.fmt == NVJPEG_OUTPUT_BGR) {
+      // in the case of rgb create 3 buffers with sizes of original image
+      channels = 3;
+      widths[1] = widths[2] = widths[0];
+      heights[1] = heights[2] = heights[0];
+    }
+
+    if (img_width[i] != widths[0] || img_height[i] != heights[0]) {
+      img_width[i] = widths[0];
+      img_height[i] = heights[0];
+      // realloc output buffer if required
+      for (int c = 0; c < channels; c++) {
+        int aw = mul * widths[c];
+        int ah = heights[c];
+        size_t sz = aw * ah;
+        ibuf[i].pitch[c] = aw;
+        if (sz > isz[i].pitch[c]) {
+          if (ibuf[i].channel[c]) {
+            CHECK_CUDA(cudaFree(ibuf[i].channel[c]));
+          }
+          CHECK_CUDA(cudaMalloc(reinterpret_cast<void **>(&ibuf[i].channel[c]), sz));
+          isz[i].pitch[c] = sz;
+        }
+      }
+    }
+  }
+  return EXIT_SUCCESS;
+}
+
+void create_decoupled_api_handles(std::vector<decode_params_t> & params, share_params & share_param)
+{
+  for (size_t i = 0; i < params.size(); i++) {
+    CHECK_NVJPEG(nvjpegDecoderCreate(
+      share_param.nvjpeg_handle, NVJPEG_BACKEND_DEFAULT, &params[i].nvjpeg_decoder));
+    CHECK_NVJPEG(nvjpegDecoderStateCreate(
+      share_param.nvjpeg_handle, params[i].nvjpeg_decoder, &params[i].nvjpeg_decoupled_state));
+
+    CHECK_NVJPEG(
+      nvjpegBufferPinnedCreate(share_param.nvjpeg_handle, NULL, &params[i].pinned_buffers[0]));
+    CHECK_NVJPEG(
+      nvjpegBufferPinnedCreate(share_param.nvjpeg_handle, NULL, &params[i].pinned_buffers[1]));
+    CHECK_NVJPEG(
+      nvjpegBufferDeviceCreate(share_param.nvjpeg_handle, NULL, &params[i].device_buffer));
+
+    CHECK_NVJPEG(nvjpegJpegStreamCreate(share_param.nvjpeg_handle, &params[i].jpeg_streams[0]));
+    CHECK_NVJPEG(nvjpegJpegStreamCreate(share_param.nvjpeg_handle, &params[i].jpeg_streams[1]));
+
+    CHECK_NVJPEG(
+      nvjpegDecodeParamsCreate(share_param.nvjpeg_handle, &params[i].nvjpeg_decode_params));
+  }
+}
+
+void destroy_decoupled_api_handles(
+  std::vector<decode_params_t> & params, share_params & share_param)
+{
+  for (size_t i = 0; i < params.size(); i++) {
+    CHECK_NVJPEG(nvjpegDecodeParamsDestroy(params[i].nvjpeg_decode_params));
+
+    CHECK_NVJPEG(nvjpegJpegStreamDestroy(params[i].jpeg_streams[0]));
+    CHECK_NVJPEG(nvjpegJpegStreamDestroy(params[i].jpeg_streams[1]));
+    CHECK_NVJPEG(nvjpegBufferPinnedDestroy(params[i].pinned_buffers[0]));
+    CHECK_NVJPEG(nvjpegBufferPinnedDestroy(params[i].pinned_buffers[1]));
+    CHECK_NVJPEG(nvjpegBufferDeviceDestroy(params[i].device_buffer));
+
+    CHECK_NVJPEG(nvjpegJpegStateDestroy(params[i].nvjpeg_decoupled_state));
+    CHECK_NVJPEG(nvjpegDecoderDestroy(params[i].nvjpeg_decoder));
+  }
+}
+
+void release_buffers(std::vector<nvjpegImage_t> & ibuf)
+{
+  for (size_t i = 0; i < ibuf.size(); i++) {
+    for (size_t c = 0; c < NVJPEG_MAX_COMPONENT; c++)
+      if (ibuf[i].channel[c]) CHECK_CUDA(cudaFree(ibuf[i].channel[c]));
+  }
+}
+
+int get_img(
+  const uchar * d_chanR, int pitchR, const uchar * d_chanG, int pitchG, const uchar * d_chanB,
+  int pitchB, size_t width, size_t height, uchar * img)
+{
+  size_t step = width * height;
+
+  CHECK_CUDA(cudaMemcpy2D(
+    img + 0 * step, static_cast<size_t>(width), d_chanR, static_cast<size_t>(pitchR), width, height,
+    cudaMemcpyDeviceToDevice));
+  CHECK_CUDA(cudaMemcpy2D(
+    img + 1 * step, static_cast<size_t>(width), d_chanG, static_cast<size_t>(pitchG), width, height,
+    cudaMemcpyDeviceToDevice));
+  CHECK_CUDA(cudaMemcpy2D(
+    img + 2 * step, static_cast<size_t>(width), d_chanB, static_cast<size_t>(pitchB), width, height,
+    cudaMemcpyDeviceToDevice));
+
+  return EXIT_SUCCESS;
+}
+
+void decode_single_image(
+  const std::vector<char> & img_data, nvjpegImage_t & out, decode_params_t & params,
+  share_params & share_param, double & time)
+{
+  CHECK_CUDA(cudaStreamSynchronize(params.stream));
+  cudaEvent_t startEvent = NULL, stopEvent = NULL;
+  float loopTime = 0;
+  // cudaEventBlockingSync
+  CHECK_CUDA(cudaEventCreate(&startEvent));
+  CHECK_CUDA(cudaEventCreate(&stopEvent));
+
+  CHECK_CUDA(cudaEventRecord(startEvent, params.stream));
+
+  CHECK_NVJPEG(nvjpegStateAttachDeviceBuffer(params.nvjpeg_decoupled_state, params.device_buffer));
+  int buffer_index = 0;
+  CHECK_NVJPEG(nvjpegDecodeParamsSetOutputFormat(params.nvjpeg_decode_params, share_param.fmt));
+
+  CHECK_NVJPEG(nvjpegJpegStreamParse(
+    share_param.nvjpeg_handle, reinterpret_cast<const uchar *>(img_data.data()), img_data.size(), 0,
+    0, params.jpeg_streams[buffer_index]));
+
+  CHECK_NVJPEG(nvjpegStateAttachPinnedBuffer(
+    params.nvjpeg_decoupled_state, params.pinned_buffers[buffer_index]));
+
+  CHECK_NVJPEG(nvjpegDecodeJpegHost(
+    share_param.nvjpeg_handle, params.nvjpeg_decoder, params.nvjpeg_decoupled_state,
+    params.nvjpeg_decode_params, params.jpeg_streams[buffer_index]));
+
+  CHECK_CUDA(cudaStreamSynchronize(params.stream));
+
+  CHECK_NVJPEG(nvjpegDecodeJpegTransferToDevice(
+    share_param.nvjpeg_handle, params.nvjpeg_decoder, params.nvjpeg_decoupled_state,
+    params.jpeg_streams[buffer_index], params.stream));
+
+  buffer_index = 1 - buffer_index;  // switch pinned buffer in pipeline mode to avoid an extra sync
+  RCLCPP_DEBUG(rclcpp::get_logger("nvjpegDecoder"), buffer_index);
+
+  CHECK_NVJPEG(nvjpegDecodeJpegDevice(
+    share_param.nvjpeg_handle, params.nvjpeg_decoder, params.nvjpeg_decoupled_state, &out,
+    params.stream));
+
+  // CHECK_CUDA(cudaStreamSynchronize(params.stream)); //TODO new add
+
+  CHECK_CUDA(cudaEventRecord(stopEvent, params.stream));
+
+  CHECK_CUDA(cudaEventSynchronize(stopEvent));
+  CHECK_CUDA(cudaEventElapsedTime(&loopTime, startEvent, stopEvent));
+  time = 0.001 * static_cast<double>(loopTime);  // cudaEventElapsedTime returns milliseconds
+                                                 // time = 1.0;
+}
+
+nvjpegDecoder::nvjpegDecoder(size_t n, size_t _fmt)
+: N_img(n), iout(n), isz(n), widths(n), heights(n), params(n), share_param(_fmt)
+{
+  init();
+}
+
+int nvjpegDecoder::decode(const std::vector<std::vector<char>> & files_data, uchar * out_imgs)
+{
+  for (size_t i = 0; i < params.size(); i++) {
+    CHECK_CUDA(cudaStreamCreate(&params[i].stream));
+  }
+  if (prepare_buffers(files_data, widths, heights, iout, isz, share_param)) return EXIT_FAILURE;
+
+  double times[6];
+
+  auto decode_start = high_resolution_clock::now();
+  std::vector<std::thread> threads(files_data.size());
+  for (size_t i = 0; i < files_data.size(); i++) {
+    threads[i] = std::thread(
+      decode_single_image, std::ref(files_data[i]), std::ref(iout[i]), std::ref(params[i]),
+      std::ref(share_param), std::ref(times[i]));
+  }
+  for (size_t i = 0; i < files_data.size(); i++) {
+    threads[i].join();
+  }
+  auto decode_end = high_resolution_clock::now();
+  duration<double> decode_time = decode_end - decode_start;
+
+  RCLCPP_INFO(
+    rclcpp::get_logger("nvjpegDecoder"), "Decode total time : %.4lf ms",
+    decode_time.count() * 1000);
+
+  for (size_t i = 0; i < files_data.size(); i++) {
+    get_img(
+      iout[i].channel[0], iout[i].pitch[0], iout[i].channel[1], iout[i].pitch[1],
+      iout[i].channel[2], iout[i].pitch[2], widths[i], heights[i],
+      out_imgs + i * 3 * widths[i] * heights[i]);
+  }
+
+  for (size_t i = 0; i < params.size(); i++) {
+    CHECK_CUDA(cudaStreamDestroy(params[i].stream));
+  }
+  return EXIT_SUCCESS;
+}
+
+int nvjpegDecoder::init()
+{
+  nvjpegDevAllocator_t dev_allocator = {&dev_malloc, &dev_free};
+  nvjpegPinnedAllocator_t pinned_allocator = {&host_malloc, &host_free};
+
+  nvjpegStatus_t status = nvjpegCreateEx(
+    NVJPEG_BACKEND_HARDWARE, &dev_allocator, &pinned_allocator, NVJPEG_FLAGS_DEFAULT,
+    &share_param.nvjpeg_handle);
+  share_param.hw_decode_available = true;
+  if (status == NVJPEG_STATUS_ARCH_MISMATCH) {
+    RCLCPP_WARN(
+      rclcpp::get_logger("nvjpegDecoder"),
+      "Hardware Decoder not supported. Falling back to default backend");
+    CHECK_NVJPEG(nvjpegCreateEx(
+      NVJPEG_BACKEND_DEFAULT, &dev_allocator, &pinned_allocator, NVJPEG_FLAGS_DEFAULT,
+      &share_param.nvjpeg_handle));
+    share_param.hw_decode_available = false;
+  } else {
+    CHECK_NVJPEG(status);
+  }
+
+  CHECK_NVJPEG(nvjpegJpegStateCreate(share_param.nvjpeg_handle, &share_param.nvjpeg_state));
+
+  create_decoupled_api_handles(params, share_param);
+
+  for (size_t i = 0; i < iout.size(); i++) {
+    for (size_t c = 0; c < NVJPEG_MAX_COMPONENT; c++) {
+      iout[i].channel[c] = NULL;
+      iout[i].pitch[c] = 0;
+      isz[i].pitch[c] = 0;
+    }
+  }
+  return EXIT_SUCCESS;
+}
+
+nvjpegDecoder::~nvjpegDecoder()
+{
+  release_buffers(iout);
+  destroy_decoupled_api_handles(params, share_param);
+  CHECK_NVJPEG(nvjpegJpegStateDestroy(share_param.nvjpeg_state));
+  CHECK_NVJPEG(nvjpegDestroy(share_param.nvjpeg_handle));
+}
+
+#endif
diff --git a/perception/autoware_tensorrt_bevdet/src/postprocess.cu b/perception/autoware_tensorrt_bevdet/src/postprocess.cu
new file mode 100644
index 0000000000000..c7ac66a3236e6
--- /dev/null
+++ b/perception/autoware_tensorrt_bevdet/src/postprocess.cu
@@ -0,0 +1,212 @@
+// Copyright 2024 AutoCore, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "autoware/tensorrt_bevdet/postprocess.hpp"
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/sequence.h>
+#include <thrust/sort.h>
+#include <thrust/transform.h>
+
+__device__ float sigmoid_gpu(const float x)
+{
+  return 1.0f / (1.0f + expf(-x));
+}
+
+__global__ void BEVDecodeObjectKernel(
+  const int map_size,        // 40000
+  const float score_thresh,  // 0.1
+                             //    const int nms_pre_max_size, // 4096
+  const float x_start, const float y_start, const float x_step, const float y_step,
+  const int output_h, const int output_w, const int downsample_size, const int num_class_in_task,
+  const int cls_range, const float * reg, const float * hei, const float * dim, const float * rot,
+  const float * vel, const float * cls, float * res_box, float * res_conf, int * res_cls,
+  int * res_box_num, float * rescale_factor)
+{  // According to the confidence level, the initial screening showed that there were res-box_num
+   // boxes after screening, not exceeding nms_pre_max_size 4096
+  int idx = threadIdx.x + blockDim.x * blockIdx.x;
+  if (idx >= map_size) return;
+
+  float max_score = cls[idx];  // Initialize as the first class of the task
+  int label = cls_range;       // Initialize as the first class of the task
+  for (int i = 1; i < num_class_in_task; ++i) {
+    float cur_score = cls[idx + i * map_size];
+    if (cur_score > max_score) {
+      max_score = cur_score;
+      label = i + cls_range;
+    }
+  }
+
+  int coor_x = idx % output_h;  //
+  int coor_y = idx / output_w;  //
+
+  float conf = sigmoid_gpu(max_score);  // Calculate confidence level
+  if (conf > score_thresh) {
+    int cur_valid_box_id = atomicAdd(res_box_num, 1);
+    res_box[cur_valid_box_id * kBoxBlockSize + 0] =
+      (reg[idx + 0 * map_size] + coor_x) * x_step + x_start;
+    res_box[cur_valid_box_id * kBoxBlockSize + 1] =
+      (reg[idx + 1 * map_size] + coor_y) * y_step + y_start;
+    res_box[cur_valid_box_id * kBoxBlockSize + 2] = hei[idx];
+    res_box[cur_valid_box_id * kBoxBlockSize + 3] =
+      expf(dim[idx + 0 * map_size]) * rescale_factor[label];  // nms scale
+    res_box[cur_valid_box_id * kBoxBlockSize + 4] =
+      expf(dim[idx + 1 * map_size]) * rescale_factor[label];
+    res_box[cur_valid_box_id * kBoxBlockSize + 5] =
+      expf(dim[idx + 2 * map_size]) * rescale_factor[label];
+    res_box[cur_valid_box_id * kBoxBlockSize + 6] = atan2f(rot[idx], rot[idx + map_size]);
+    res_box[cur_valid_box_id * kBoxBlockSize + 7] = vel[idx];
+    res_box[cur_valid_box_id * kBoxBlockSize + 8] = vel[idx + map_size];
+
+    res_conf[cur_valid_box_id] = conf;
+    res_cls[cur_valid_box_id] = label;
+  }
+}
+
+PostprocessGPU::PostprocessGPU(
+  const int _class_num, const float _score_thresh, const float _nms_thresh,
+  const int _nms_pre_maxnum, const int _nms_post_maxnum, const int _down_sample,
+  const int _output_h, const int _output_w, const float _x_step, const float _y_step,
+  const float _x_start, const float _y_start, const std::vector<int> & _class_num_pre_task,
+  const std::vector<float> & _nms_rescale_factor)
+: class_num(_class_num),
+  score_thresh(_score_thresh),
+  nms_thresh(_nms_thresh),
+  nms_pre_maxnum(_nms_pre_maxnum),
+  nms_post_maxnum(_nms_post_maxnum),
+  down_sample(_down_sample),
+  output_h(_output_h),
+  output_w(_output_w),
+  x_step(_x_step),
+  y_step(_y_step),
+  x_start(_x_start),
+  y_start(_y_start),
+  map_size(output_h * output_w),
+  class_num_pre_task(_class_num_pre_task),
+  nms_rescale_factor(_nms_rescale_factor),
+  task_num(_class_num_pre_task.size())
+{
+  CHECK_CUDA(cudaMalloc((void **)&boxes_dev, kBoxBlockSize * map_size * sizeof(float)));
+  CHECK_CUDA(cudaMalloc((void **)&score_dev, map_size * sizeof(float)));
+  CHECK_CUDA(cudaMalloc((void **)&cls_dev, map_size * sizeof(int)));
+  CHECK_CUDA(cudaMalloc((void **)&sorted_indices_dev, map_size * sizeof(int)));
+  CHECK_CUDA(cudaMalloc((void **)&valid_box_num, sizeof(int)));
+  CHECK_CUDA(cudaMalloc((void **)&nms_rescale_factor_dev, class_num * sizeof(float)));
+
+  CHECK_CUDA(cudaMallocHost((void **)&boxes_host, kBoxBlockSize * map_size * sizeof(float)));
+  CHECK_CUDA(cudaMallocHost((void **)&score_host, nms_pre_maxnum * sizeof(float)));
+  CHECK_CUDA(cudaMallocHost((void **)&cls_host, map_size * sizeof(float)));
+  CHECK_CUDA(cudaMallocHost((void **)&sorted_indices_host, nms_pre_maxnum * sizeof(int)));
+  CHECK_CUDA(cudaMallocHost((void **)&keep_data_host, nms_pre_maxnum * sizeof(std::int64_t)));
+
+  CHECK_CUDA(cudaMemcpy(
+    nms_rescale_factor_dev, nms_rescale_factor.data(), class_num * sizeof(float),
+    cudaMemcpyHostToDevice));
+
+  iou3d_nms.reset(new Iou3dNmsCuda(output_h, output_w, nms_thresh));
+
+  std::ostringstream oss;
+  for (auto i = 0; i < nms_rescale_factor.size(); i++) {
+    oss << std::fixed << std::setprecision(2) << nms_rescale_factor[i] << ' ';
+  }
+  RCLCPP_INFO(rclcpp::get_logger("PostprocessGPU"), "%s", oss.str().c_str());
+}
+PostprocessGPU::~PostprocessGPU()
+{
+  CHECK_CUDA(cudaFree(boxes_dev));
+  CHECK_CUDA(cudaFree(score_dev));
+  CHECK_CUDA(cudaFree(cls_dev));
+  CHECK_CUDA(cudaFree(sorted_indices_dev));
+  CHECK_CUDA(cudaFree(valid_box_num));
+  CHECK_CUDA(cudaFree(nms_rescale_factor_dev));
+
+  CHECK_CUDA(cudaFreeHost(boxes_host));
+  CHECK_CUDA(cudaFreeHost(score_host));
+  CHECK_CUDA(cudaFreeHost(cls_host));
+  CHECK_CUDA(cudaFreeHost(sorted_indices_host));
+  CHECK_CUDA(cudaFreeHost(keep_data_host));
+}
+
+void PostprocessGPU::DoPostprocess(void ** const bev_buffer, std::vector<Box> & out_detections)
+{
+  // bev_buffer : BEV_feat, reg_0, hei_0, dim_0, rot_0, vel_0, heatmap_0, reg_1 ...
+  const int task_num = class_num_pre_task.size();
+  int cur_start_label = 0;
+  for (int i = 0; i < task_num; i++) {
+    float * reg = (float *)bev_buffer[i * 6 + 0];      // 2 x 128 x 128
+    float * hei = (float *)bev_buffer[i * 6 + 1];      // 1 x 128 x 128
+    float * dim = (float *)bev_buffer[i * 6 + 2];      // 3 x 128 x 128
+    float * rot = (float *)bev_buffer[i * 6 + 3];      // 2 x 128 x 128
+    float * vel = (float *)bev_buffer[i * 6 + 4];      // 2 x 128 x 128
+    float * heatmap = (float *)bev_buffer[i * 6 + 5];  // c x 128 x 128
+
+    dim3 grid(DIVUP(map_size, NUM_THREADS));
+    CHECK_CUDA(cudaMemset(valid_box_num, 0, sizeof(int)));
+    BEVDecodeObjectKernel<<<grid, NUM_THREADS>>>(
+      map_size, score_thresh, x_start, y_start, x_step, y_step, output_h, output_w, down_sample,
+      class_num_pre_task[i], cur_start_label, reg, hei, dim, rot, vel, heatmap, boxes_dev,
+      score_dev, cls_dev, valid_box_num, nms_rescale_factor_dev);
+
+    /*
+    at this point, boxes_dev, score_dev, cls_dev have valid_box_num elements，which may be greater
+    than nms_pre_maxnum, and it's arranged in disorder
+    */
+    int box_num_pre = 0;
+    CHECK_CUDA(cudaMemcpy(&box_num_pre, valid_box_num, sizeof(int), cudaMemcpyDeviceToHost));
+
+    thrust::sequence(thrust::device, sorted_indices_dev, sorted_indices_dev + box_num_pre);
+    thrust::sort_by_key(
+      thrust::device, score_dev, score_dev + box_num_pre, sorted_indices_dev,
+      thrust::greater<float>());
+    // score_dev is sorted in descending order, while sorted_indices_dev indexes the original
+    // order,， sorted_indices_dev[i] = j; i:current，j:pre-index;  j:[0, map_size]
+
+    box_num_pre = std::min(box_num_pre, nms_pre_maxnum);
+
+    int box_num_post =
+      iou3d_nms->DoIou3dNms(box_num_pre, boxes_dev, sorted_indices_dev, keep_data_host);
+
+    box_num_post = std::min(box_num_post, nms_post_maxnum);
+
+    CHECK_CUDA(cudaMemcpy(
+      sorted_indices_host, sorted_indices_dev, box_num_pre * sizeof(int), cudaMemcpyDeviceToHost));
+    CHECK_CUDA(cudaMemcpy(
+      boxes_host, boxes_dev, kBoxBlockSize * map_size * sizeof(float), cudaMemcpyDeviceToHost));
+    CHECK_CUDA(
+      cudaMemcpy(score_host, score_dev, box_num_pre * sizeof(float), cudaMemcpyDeviceToHost));
+    CHECK_CUDA(cudaMemcpy(cls_host, cls_dev, map_size * sizeof(float), cudaMemcpyDeviceToHost));
+
+    for (auto j = 0; j < box_num_post; j++) {
+      int k = keep_data_host[j];
+      int idx = sorted_indices_host[k];
+      Box box;
+      box.x = boxes_host[idx * kBoxBlockSize + 0];
+      box.y = boxes_host[idx * kBoxBlockSize + 1];
+      box.z = boxes_host[idx * kBoxBlockSize + 2];
+      box.l = boxes_host[idx * kBoxBlockSize + 3] / nms_rescale_factor[cls_host[idx]];
+      box.w = boxes_host[idx * kBoxBlockSize + 4] / nms_rescale_factor[cls_host[idx]];
+      box.h = boxes_host[idx * kBoxBlockSize + 5] / nms_rescale_factor[cls_host[idx]];
+      box.r = boxes_host[idx * kBoxBlockSize + 6];
+      box.vx = boxes_host[idx * kBoxBlockSize + 7];
+      box.vy = boxes_host[idx * kBoxBlockSize + 8];
+
+      box.label = cls_host[idx];
+      box.score = score_host[k];
+      box.z -= box.h * 0.5;  // bottom height
+      out_detections.push_back(box);
+    }
+
+    cur_start_label += class_num_pre_task[i];
+  }
+}
diff --git a/perception/autoware_tensorrt_bevdet/src/preprocess.cu b/perception/autoware_tensorrt_bevdet/src/preprocess.cu
new file mode 100644
index 0000000000000..2a1bdc426a42f
--- /dev/null
+++ b/perception/autoware_tensorrt_bevdet/src/preprocess.cu
@@ -0,0 +1,37 @@
+// Copyright 2024 AutoCore, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "autoware/tensorrt_bevdet/preprocess.hpp"
+
+#include <thrust/fill.h>
+
+#include <fstream>
+
+__global__ void convert_RGBHWC_to_BGRCHW_kernel(
+  uchar * input, uchar * output, int channels, int height, int width)
+{
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < channels * height * width) {
+    int y = index / 3 / width;
+    int x = index / 3 % width;
+    int c = 2 - index % 3;  // RGB to BGR
+
+    output[c * height * width + y * width + x] = input[index];
+  }
+}
+// RGBHWC to BGRCHW
+void convert_RGBHWC_to_BGRCHW(uchar * input, uchar * output, int channels, int height, int width)
+{
+  convert_RGBHWC_to_BGRCHW_kernel<<<DIVUP(channels * height * width, NUM_THREADS), NUM_THREADS>>>(
+    input, output, channels, height, width);
+}
diff --git a/perception/autoware_tensorrt_bevdet/src/preprocess_plugin.cu b/perception/autoware_tensorrt_bevdet/src/preprocess_plugin.cu
new file mode 100644
index 0000000000000..25abf687aaae8
--- /dev/null
+++ b/perception/autoware_tensorrt_bevdet/src/preprocess_plugin.cu
@@ -0,0 +1,347 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "autoware/tensorrt_bevdet/common.hpp"
+#include "autoware/tensorrt_bevdet/preprocess_plugin.hpp"
+
+#include <cuda_fp16.h>
+#include <cuda_runtime_api.h>
+
+#include <iostream>
+
+// kernel for GPU
+
+template <typename T>
+__global__ void preprocess_kernel(
+  const uint8_t * src_dev, T * dst_dev, int src_row_step, int dst_row_step, int src_img_step,
+  int dst_img_step, int src_h, int src_w, float radio_h, float radio_w, float offset_h,
+  float offset_w, const float * mean, const float * std, int dst_h, int dst_w, int n)
+{
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= dst_h * dst_w * n) return;
+
+  int i = (idx / n) / dst_w;
+  int j = (idx / n) % dst_w;
+  int k = idx % n;
+
+  int pX = (int)roundf((i / radio_h) + offset_h);
+  int pY = (int)roundf((j / radio_w) + offset_w);
+
+  if (pX < src_h && pX >= 0 && pY < src_w && pY >= 0) {
+    int s1 = k * src_img_step + 0 * src_img_step / 3 + pX * src_row_step + pY;
+    int s2 = k * src_img_step + 1 * src_img_step / 3 + pX * src_row_step + pY;
+    int s3 = k * src_img_step + 2 * src_img_step / 3 + pX * src_row_step + pY;
+
+    int d1 = k * dst_img_step + 0 * dst_img_step / 3 + i * dst_row_step + j;
+    int d2 = k * dst_img_step + 1 * dst_img_step / 3 + i * dst_row_step + j;
+    int d3 = k * dst_img_step + 2 * dst_img_step / 3 + i * dst_row_step + j;
+
+    dst_dev[d1] = (static_cast<T>(src_dev[s1]) - static_cast<T>(mean[0])) / static_cast<T>(std[0]);
+    dst_dev[d2] = (static_cast<T>(src_dev[s2]) - static_cast<T>(mean[1])) / static_cast<T>(std[1]);
+    dst_dev[d3] = (static_cast<T>(src_dev[s3]) - static_cast<T>(mean[2])) / static_cast<T>(std[2]);
+  }
+}
+
+namespace nvinfer1
+{
+// class PreprocessPlugin
+PreprocessPlugin::PreprocessPlugin(
+  const std::string & name, int crop_h, int crop_w, float resize_radio)
+: name_(name)
+{
+  m_.crop_h = crop_h;
+  m_.crop_w = crop_w;
+  m_.resize_radio = resize_radio;
+}
+
+PreprocessPlugin::PreprocessPlugin(const std::string & name, const void * buffer, size_t length)
+: name_(name)
+{
+  memcpy(&m_, buffer, sizeof(m_));
+}
+
+PreprocessPlugin::~PreprocessPlugin()
+{
+}
+
+IPluginV2DynamicExt * PreprocessPlugin::clone() const noexcept
+{
+  auto p = new PreprocessPlugin(name_, &m_, sizeof(m_));
+  p->setPluginNamespace(namespace_.c_str());
+  return p;
+}
+
+int32_t PreprocessPlugin::getNbOutputs() const noexcept
+{
+  return 1;
+}
+
+DataType PreprocessPlugin::getOutputDataType(
+  int32_t index, DataType const * inputTypes, int32_t nbInputs) const noexcept
+{
+  // return DataType::kHALF;
+  return DataType::kFLOAT;
+}
+
+DimsExprs PreprocessPlugin::getOutputDimensions(
+  int32_t outputIndex, const DimsExprs * inputs, int32_t nbInputs,
+  IExprBuilder & exprBuilder) noexcept
+{
+  int input_h = inputs[0].d[2]->getConstantValue();
+  int input_w = inputs[0].d[3]->getConstantValue() * 4;
+
+  int output_h = input_h * m_.resize_radio - m_.crop_h;
+  int output_w = input_w * m_.resize_radio - m_.crop_w;
+
+  DimsExprs ret;
+  ret.nbDims = inputs[0].nbDims;
+  ret.d[0] = inputs[0].d[0];
+  ret.d[1] = inputs[0].d[1];
+  ret.d[2] = exprBuilder.constant(output_h);
+  ret.d[3] = exprBuilder.constant(output_w);
+
+  return ret;
+}
+
+bool PreprocessPlugin::supportsFormatCombination(
+  int32_t pos, const PluginTensorDesc * inOut, int32_t nbInputs, int32_t nbOutputs) noexcept
+{
+  bool res;
+  switch (pos) {
+    case 0:  // images
+      res = (inOut[0].type == DataType::kINT8 || inOut[0].type == DataType::kINT32) &&
+            inOut[0].format == TensorFormat::kLINEAR;
+      break;
+    case 1:  // mean
+      res = (inOut[1].type == DataType::kFLOAT) && inOut[1].format == TensorFormat::kLINEAR;
+      break;
+    case 2:  // std
+      res = (inOut[2].type == DataType::kFLOAT) && inOut[2].format == TensorFormat::kLINEAR;
+      break;
+    case 3:  // output img tensor
+      res = (inOut[3].type == DataType::kFLOAT || inOut[3].type == DataType::kHALF) &&
+            inOut[3].format == inOut[0].format;
+
+      // res = inOut[3].type == DataType::kHALF && inOut[3].format == inOut[0].format;
+      break;
+    default:
+      res = false;
+  }
+  return res;
+}
+
+void PreprocessPlugin::configurePlugin(
+  const DynamicPluginTensorDesc * in, int32_t nbInputs, const DynamicPluginTensorDesc * out,
+  int32_t nbOutputs) noexcept
+{
+  return;
+}
+
+size_t PreprocessPlugin::getWorkspaceSize(
+  const PluginTensorDesc * inputs, int32_t nbInputs, const PluginTensorDesc * outputs,
+  int32_t nbOutputs) const noexcept
+{
+  return 0;
+}
+
+int32_t PreprocessPlugin::enqueue(
+  const PluginTensorDesc * inputDesc, const PluginTensorDesc * outputDesc,
+  const void * const * inputs, void * const * outputs, void * workspace,
+  cudaStream_t stream) noexcept
+{
+  int n_img = inputDesc[0].dims.d[0];
+  int src_img_h = inputDesc[0].dims.d[2];
+  int src_img_w = inputDesc[0].dims.d[3] * 4;
+
+  int dst_img_h = outputDesc[0].dims.d[2];
+  int dst_img_w = outputDesc[0].dims.d[3];
+
+  int src_row_step = src_img_w;
+  int dst_row_step = dst_img_w;
+
+  int src_img_step = src_img_w * src_img_h * 3;
+  int dst_img_step = dst_img_w * dst_img_h * 3;
+
+  float offset_h = m_.crop_h / m_.resize_radio;
+  float offset_w = m_.crop_w / m_.resize_radio;
+
+  dim3 grid(DIVUP(dst_img_h * dst_img_w * n_img, NUM_THREADS));
+  dim3 block(NUM_THREADS);
+
+  switch (int(outputDesc[0].type)) {
+    case int(DataType::kFLOAT):
+      // float
+      preprocess_kernel<<<grid, block, 0, stream>>>(
+        reinterpret_cast<const uint8_t *>(inputs[0]), reinterpret_cast<float *>(outputs[0]),
+        src_row_step, dst_row_step, src_img_step, dst_img_step, src_img_h, src_img_w,
+        m_.resize_radio, m_.resize_radio, offset_h, offset_w,
+        reinterpret_cast<const float *>(inputs[1]), reinterpret_cast<const float *>(inputs[2]),
+        dst_img_h, dst_img_w, n_img);
+      break;
+    case int(DataType::kHALF):
+      // half
+      preprocess_kernel<<<grid, block, 0, stream>>>(
+        reinterpret_cast<const uint8_t *>(inputs[0]), reinterpret_cast<__half *>(outputs[0]),
+        src_row_step, dst_row_step, src_img_step, dst_img_step, src_img_h, src_img_w,
+        m_.resize_radio, m_.resize_radio, offset_h, offset_w,
+        reinterpret_cast<const float *>(inputs[1]), reinterpret_cast<const float *>(inputs[2]),
+        dst_img_h, dst_img_w, n_img);
+
+      break;
+    default:  // should NOT be here
+      RCLCPP_ERROR(rclcpp::get_logger("PreprocessPlugin"), "\tUnsupport datatype!");
+  }
+  return 0;
+}
+
+void PreprocessPlugin::destroy() noexcept
+{
+  delete this;
+  return;
+}
+
+int32_t PreprocessPlugin::initialize() noexcept
+{
+  return 0;
+}
+
+void PreprocessPlugin::terminate() noexcept
+{
+  return;
+}
+
+size_t PreprocessPlugin::getSerializationSize() const noexcept
+{
+  return sizeof(m_);
+}
+
+void PreprocessPlugin::serialize(void * buffer) const noexcept
+{
+  memcpy(buffer, &m_, sizeof(m_));
+  return;
+}
+
+void PreprocessPlugin::setPluginNamespace(const char * pluginNamespace) noexcept
+{
+  namespace_ = pluginNamespace;
+  return;
+}
+
+const char * PreprocessPlugin::getPluginNamespace() const noexcept
+{
+  return namespace_.c_str();
+}
+
+const char * PreprocessPlugin::getPluginType() const noexcept
+{
+  return PRE_PLUGIN_NAME;
+}
+
+const char * PreprocessPlugin::getPluginVersion() const noexcept
+{
+  return PRE_PLUGIN_VERSION;
+}
+
+void PreprocessPlugin::attachToContext(
+  cudnnContext * contextCudnn, cublasContext * contextCublas, IGpuAllocator * gpuAllocator) noexcept
+{
+  return;
+}
+
+void PreprocessPlugin::detachFromContext() noexcept
+{
+  return;
+}
+
+// class PreprocessPluginCreator
+PluginFieldCollection PreprocessPluginCreator::fc_{};
+std::vector<PluginField> PreprocessPluginCreator::attr_;
+
+PreprocessPluginCreator::PreprocessPluginCreator()
+{
+  attr_.clear();
+  attr_.emplace_back(PluginField("crop_h", nullptr, PluginFieldType::kINT32, 1));
+  attr_.emplace_back(PluginField("crop_w", nullptr, PluginFieldType::kINT32, 1));
+  attr_.emplace_back(PluginField("resize_radio", nullptr, PluginFieldType::kFLOAT32, 1));
+
+  fc_.nbFields = attr_.size();
+  fc_.fields = attr_.data();
+}
+
+PreprocessPluginCreator::~PreprocessPluginCreator()
+{
+}
+
+IPluginV2DynamicExt * PreprocessPluginCreator::createPlugin(
+  const char * name, const PluginFieldCollection * fc) noexcept
+{
+  const PluginField * fields = fc->fields;
+
+  int crop_h = -1;
+  int crop_w = -1;
+  float resize_radio = 0.f;
+
+  for (int i = 0; i < fc->nbFields; ++i) {
+    if (std::string(fc->fields[i].name) == std::string("crop_h")) {
+      crop_h = *reinterpret_cast<const int *>(fc->fields[i].data);
+    } else if (std::string(fc->fields[i].name) == std::string("crop_w")) {
+      crop_w = *reinterpret_cast<const int *>(fc->fields[i].data);
+    } else if (std::string(fc->fields[i].name) == std::string("resize_radio")) {
+      resize_radio = *reinterpret_cast<const float *>(fc->fields[i].data);
+    }
+  }
+  PreprocessPlugin * pObj = new PreprocessPlugin(name, crop_h, crop_w, resize_radio);
+  pObj->setPluginNamespace(namespace_.c_str());
+  return pObj;
+}
+
+IPluginV2DynamicExt * PreprocessPluginCreator::deserializePlugin(
+  const char * name, const void * serialData, size_t serialLength) noexcept
+{
+  PreprocessPlugin * pObj = new PreprocessPlugin(name, serialData, serialLength);
+  pObj->setPluginNamespace(namespace_.c_str());
+  return pObj;
+}
+
+void PreprocessPluginCreator::setPluginNamespace(const char * pluginNamespace) noexcept
+{
+  namespace_ = pluginNamespace;
+  return;
+}
+
+const char * PreprocessPluginCreator::getPluginNamespace() const noexcept
+{
+  return namespace_.c_str();
+}
+
+const char * PreprocessPluginCreator::getPluginName() const noexcept
+{
+  return PRE_PLUGIN_NAME;
+}
+
+const char * PreprocessPluginCreator::getPluginVersion() const noexcept
+{
+  return PRE_PLUGIN_VERSION;
+}
+
+const PluginFieldCollection * PreprocessPluginCreator::getFieldNames() noexcept
+{
+  return &fc_;
+}
+
+REGISTER_TENSORRT_PLUGIN(PreprocessPluginCreator);
+
+}  // namespace nvinfer1