From 82fa6cada0a2738d9aecb4186d56bc3e9a626a41 Mon Sep 17 00:00:00 2001
From: co63oc <4617245+co63oc@users.noreply.github.com>
Date: Tue, 6 Jan 2026 11:12:50 +0800
Subject: [PATCH] optimize phi::CPUPlace in paddle/fluid/inference/

---
 .../passes/convert_to_mixed_precision.cc      |  4 +-
 .../analysis/passes/ir_graph_build_pass.cc    |  2 +-
 .../ir_params_sync_among_devices_pass.cc      |  6 +-
 .../passes/save_optimized_model_pass.cc       |  2 +-
 paddle/fluid/inference/io.cc                  |  2 +-
 .../tensorrt/convert/batch_norm_op.cc         | 14 ++--
 .../inference/tensorrt/convert/dropout_op.cc  |  4 +-
 .../tensorrt/convert/fill_constant_op.cc      |  4 +-
 .../inference/tensorrt/convert/gelu_op.cc     |  4 +-
 .../tensorrt/convert/leaky_relu_op.cc         |  2 +-
 .../inference/tensorrt/convert/op_converter.h |  4 +-
 .../tensorrt/convert/test_io_converter.cc     |  4 +-
 .../tensorrt/convert/test_op_converter.cc     |  2 +-
 .../inference/tensorrt/convert/ut_helper.h    |  4 +-
 paddle/fluid/inference/tensorrt/engine.cc     | 64 ++++++++-----------
 paddle/fluid/inference/tensorrt/engine.h      |  4 +-
 .../inference/tensorrt/pir/generic_plugin.cu  |  4 +-
 .../tensorrt/plugin/custom_generic_plugin.cu  |  2 +-
 .../plugin/deformable_conv_op_plugin.cu       |  2 +-
 .../elementwiseadd_transpose_op_plugin.cu     |  4 +-
 .../plugin/fused_token_prune_op_plugin.cu     |  2 +-
 .../tensorrt/plugin/generic_plugin.cu         |  4 +-
 .../plugin/instance_norm_op_plugin.cu         |  8 +--
 .../tensorrt/plugin/layer_norm_op_plugin.cu   |  8 +--
 .../multihead_matmul_roformer_plugin.cu       | 16 ++---
 .../tensorrt/plugin/qkv_to_context_plugin.cu  | 22 +++----
 .../plugin/trans_layernorm_op_plugin.cu       |  2 +-
 ...transformer_input_output_convert_plugin.cu |  2 +-
 28 files changed, 97 insertions(+), 105 deletions(-)

diff --git a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc
index d7bba0ada52926..548dff58bba9a8 100644
--- a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc
+++ b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc
@@ -74,7 +74,7 @@ ConvertToMixedPrecisionPass::ConvertToMixedPrecisionPass(
 }
 
 void ConvertToMixedPrecisionPass::LoadModel() {
-  framework::Executor exe{phi::CPUPlace{}};
+  framework::Executor exe{CPUPlace{}};
   // If we did not find the provided weight path,
   // we assume that the model to be converted only has a model file and no
   // params file, we believe this situation is reasonable. In this case, weight
@@ -177,7 +177,7 @@ void ConvertToMixedPrecisionPass::SaveMixedModel() {
     op->SetAttr("file_path", save_params_path);
     op->CheckAttrs();
 
-    framework::Executor exe(phi::CPUPlace{});
+    framework::Executor exe(CPUPlace{});
     exe.Run(save_program, &scope_, 0, true, true);
   };
 
diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
index 9397794cd818b6..54d0fd92a03aa7 100644
--- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
@@ -42,7 +42,7 @@ void IrGraphBuildPass::RunImpl(Argument *argument) {
   // so that the parameters will on the same device, or they will keep copying
   // between difference devices.
   phi::Place place;
-  place = phi::CPUPlace();
+  place = CPUPlace();
 
   if (argument->model_dir_valid()) {
     auto program =
diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
index 2ce0290d47835a..93f097da216749 100644
--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
@@ -53,7 +53,7 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) {
       true,
       common::errors::PreconditionNotMet(
           "The gpu_device_id field should be valid"));
-  phi::Place place = phi::GPUPlace(argument->gpu_device_id());
+  phi::Place place = GPUPlace(argument->gpu_device_id());
   auto *scope = argument->scope_ptr();
   std::vector<std::string> all_vars = scope->LocalVarNames();
 
@@ -130,7 +130,7 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) {
         dst_ptr = dev_ctx->Alloc(t, t->dtype());
         phi::memory_utils::Copy(place,
                                 dst_ptr,
-                                phi::CPUPlace(),
+                                CPUPlace(),
                                 src_ptr,
                                 t->numel() * phi::SizeOf(t->dtype()),
                                 stream);
@@ -269,7 +269,7 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
     CopyParamsToXpu(argument);
   }
 #endif
-  paddle::memory::Release(phi::CPUPlace());
+  paddle::memory::Release(CPUPlace());
 }
 
 std::string IrParamsSyncAmongDevicesPass::repr() const {
diff --git a/paddle/fluid/inference/analysis/passes/save_optimized_model_pass.cc b/paddle/fluid/inference/analysis/passes/save_optimized_model_pass.cc
index c8d8153cfa0f3d..f5a012a9aac06a 100644
--- a/paddle/fluid/inference/analysis/passes/save_optimized_model_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/save_optimized_model_pass.cc
@@ -91,7 +91,7 @@ void SaveOptimizedModelPass::SaveOptimizedModel(Argument* argument) {
     op->SetAttr("file_path", save_params_path);
     op->CheckAttrs();
 
-    framework::Executor exe(phi::CPUPlace{});
+    framework::Executor exe(CPUPlace{});
     exe.Run(save_program, &scope, 0, true, true);
   };
   // TODO(shentanyue01): Setting hardware and version identification for
diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc
index 0cb825058670b2..7d12d94bf019ee 100644
--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
@@ -273,7 +273,7 @@ void SaveVars(const framework::Scope& scope,
   op->SetAttr("file_path", dirname + "/param");
   op->CheckAttrs();
 
-  phi::CPUPlace place;
+  CPUPlace place;
   framework::Executor exe(place);
   exe.Run(prog, const_cast<framework::Scope*>(&scope), 0, true, true);
 }
diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
index e72f0075f0ef8b..c9f8139de8ede5 100644
--- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
@@ -70,7 +70,7 @@ class BatchNormOpConverter : public OpConverter {
     scale_tensor.Resize(Scale_t->dims());
     variance_tensor.Resize(Variance_t->dims());
 
-    phi::CPUPlace cpu_place;
+    CPUPlace cpu_place;
     // copy data from gpu to cpu
     paddle::framework::TensorCopySync((*Bias_t), cpu_place, &bias_tensor);
     paddle::framework::TensorCopySync((*Mean_t), cpu_place, &mean_tensor);
@@ -78,10 +78,10 @@ class BatchNormOpConverter : public OpConverter {
     paddle::framework::TensorCopySync(
        (*Variance_t), cpu_place, &variance_tensor);
 
-    auto* bias_data = bias_tensor.mutable_data<float>(phi::CPUPlace());
-    auto* mean_data = mean_tensor.mutable_data<float>(phi::CPUPlace());
-    auto* scale_data = scale_tensor.mutable_data<float>(phi::CPUPlace());
-    auto* variance_data = variance_tensor.mutable_data<float>(phi::CPUPlace());
+    auto* bias_data = bias_tensor.mutable_data<float>(CPUPlace());
+    auto* mean_data = mean_tensor.mutable_data<float>(CPUPlace());
+    auto* scale_data = scale_tensor.mutable_data<float>(CPUPlace());
+    auto* variance_data = variance_tensor.mutable_data<float>(CPUPlace());
 
     std::unique_ptr<phi::DenseTensor> combine_scale_tensor(
         new phi::DenseTensor());
@@ -92,9 +92,9 @@ class BatchNormOpConverter : public OpConverter {
     combine_bias_tensor->Resize(bias_tensor.dims());
 
     auto* combine_scale_data =
-        combine_scale_tensor->mutable_data<float>(phi::CPUPlace());
+        combine_scale_tensor->mutable_data<float>(CPUPlace());
     auto* combine_bias_data =
-        combine_bias_tensor->mutable_data<float>(phi::CPUPlace());
+        combine_bias_tensor->mutable_data<float>(CPUPlace());
 
     size_t ele_num = combine_scale_tensor->memory_size() / sizeof(float);
 
diff --git a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc
index 5bd89a353e4aab..dddd34b67fa898 100644
--- a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc
@@ -45,10 +45,10 @@ class DropoutOpConverter : public OpConverter {
       return;
     }
 
-    phi::CPUPlace cpu_place;
+    CPUPlace cpu_place;
     std::unique_ptr<phi::DenseTensor> weight_tensor(new phi::DenseTensor());
     weight_tensor->Resize(common::make_ddim({1}));
-    auto* weight_data = weight_tensor->mutable_data<float>(phi::CPUPlace());
+    auto* weight_data = weight_tensor->mutable_data<float>(CPUPlace());
     weight_data[0] = 1 - dropout_prob;
 
     TensorRTEngine::Weight scale_weights{
diff --git a/paddle/fluid/inference/tensorrt/convert/fill_constant_op.cc b/paddle/fluid/inference/tensorrt/convert/fill_constant_op.cc
index 02c11b5dce6d15..19a03681e8afc3 100644
--- a/paddle/fluid/inference/tensorrt/convert/fill_constant_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/fill_constant_op.cc
@@ -95,13 +95,13 @@ class FillConstantOpConverter : public OpConverter {
     void* trt_data = nullptr;
     size_t trt_num;
     if (dtype == 2 || dtype == 3) {  // int,int64
-      auto* tmp_ptr = out_tensor->mutable_data<int>(phi::CPUPlace());
+      auto* tmp_ptr = out_tensor->mutable_data<int>(CPUPlace());
       for (int64_t i = 0; i < out_tensor->numel(); i++)
         tmp_ptr[i] = std::stoi(str_value);
       trt_dtype = nvinfer1::DataType::kINT32;
       trt_data = static_cast<void*>(tmp_ptr);
     } else if (dtype == 5) {  // float
-      auto* tmp_ptr = out_tensor->mutable_data<float>(phi::CPUPlace());
+      auto* tmp_ptr = out_tensor->mutable_data<float>(CPUPlace());
       for (int64_t i = 0; i < out_tensor->numel(); i++)
         tmp_ptr[i] = std::stof(str_value);
       trt_data = static_cast<void*>(tmp_ptr);
diff --git a/paddle/fluid/inference/tensorrt/convert/gelu_op.cc b/paddle/fluid/inference/tensorrt/convert/gelu_op.cc
index ea4933e6a7ce4e..27d8b90da7d679 100644
--- a/paddle/fluid/inference/tensorrt/convert/gelu_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/gelu_op.cc
@@ -45,7 +45,7 @@ class GeluOpConverter : public OpConverter {
     auto create_weights = [&](float data, std::string type) -> float* {
       std::unique_ptr<phi::DenseTensor> tmp_tensor(new phi::DenseTensor());
       tmp_tensor->Resize({1});
-      auto* tmp_data = tmp_tensor->mutable_data<float>(phi::CPUPlace());
+      auto* tmp_data = tmp_tensor->mutable_data<float>(CPUPlace());
       tmp_data[0] = data;
       engine_->SetWeights(out_name + "_gelu_op_" + type,
                           std::move(tmp_tensor));
@@ -146,7 +146,7 @@ class GeluOpConverter : public OpConverter {
      auto create_weights = [&](float data, std::string type) -> float* {
        std::unique_ptr<phi::DenseTensor> tmp_tensor(new phi::DenseTensor());
        tmp_tensor->Resize({1});
-       auto* tmp_data = tmp_tensor->mutable_data<float>(phi::CPUPlace());
+       auto* tmp_data = tmp_tensor->mutable_data<float>(CPUPlace());
        tmp_data[0] = data;
        engine_->SetWeights(out_name + "_gelu_op_" + type,
                            std::move(tmp_tensor));
diff --git a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc
index 5ce403aa7cc228..b76ab435254b5f 100644
--- a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc
@@ -47,7 +47,7 @@ class LeakyReluOpConverter : public OpConverter {
       engine_->SetTensorDynamicRange(input, in_scale);
     }
 #else
-    phi::CPUPlace place;
+    CPUPlace place;
     std::unique_ptr<phi::DenseTensor> alpha_tensor(new phi::DenseTensor());
     alpha_tensor->Resize(common::make_ddim({2}));
     float* alpha_data = alpha_tensor->mutable_data<float>(place);
diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h
index 260b89be715c61..6384fe514cfc63 100644
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -704,7 +704,7 @@ class OpConverter {
        shape.d, shape.d + shape.nbDims, 1, std::multiplies<int>());
    std::unique_ptr<phi::DenseTensor> tmp_tensor(new phi::DenseTensor());
    tmp_tensor->Resize({data_size});
-    auto* tmp_data = tmp_tensor->mutable_data<T>(phi::CPUPlace());
+    auto* tmp_data = tmp_tensor->mutable_data<T>(CPUPlace());
    for (int i = 0; i < data_size; i++) {
      tmp_data[i] = data[i];
    }
@@ -740,7 +740,7 @@ class OpConverter {
    std::unique_ptr<phi::DenseTensor> tmp_tensor(new phi::DenseTensor());
    int data_size = data.size();
    tmp_tensor->Resize({data_size});
-    auto* tmp_data = tmp_tensor->mutable_data<T>(phi::CPUPlace());
+    auto* tmp_data = tmp_tensor->mutable_data<T>(CPUPlace());
    for (int i = 0; i < data_size; i++) {
      tmp_data[i] = data[i];
    }
diff --git a/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc
index 8f7d89a7dbef35..ef4cb1f1ed5a3b 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc
@@ -61,13 +61,13 @@ void IOConverterTester(const phi::DeviceContext& ctx) {
 }
 
 TEST(EngineIOConverterTester, DefaultCPU) {
-  phi::CPUPlace place;
+  CPUPlace place;
   phi::CPUContext ctx(place);
   IOConverterTester(ctx);
 }
 
 TEST(EngineIOConverterTester, DefaultGPU) {
-  phi::GPUPlace place;
+  GPUPlace place;
   phi::GPUContext ctx(place);
   IOConverterTester(ctx);
 }
diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
index 8f928dd6d9cf81..308bc6e78ed126 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
@@ -58,7 +58,7 @@ TEST(OpConverter, ConvertBlock) {
   auto* x = scope.Var("conv2d-Y");
   auto* x_tensor = x->GetMutable<phi::DenseTensor>();
   x_tensor->Resize(common::make_ddim(dim_vec));
-  x_tensor->mutable_data<float>(phi::GPUPlace(0));
+  x_tensor->mutable_data<float>(GPUPlace(0));
 
   OpTeller::Global().SetOpConverterType(conv2d_op, OpConverterType::Default);
   OpConverter converter;
diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
index d12e45e46e6f14..38d5e72241f446 100644
--- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h
+++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
@@ -56,7 +56,7 @@ void RandomizeTensor(phi::DenseTensor* tensor,
      common::errors::PermissionDenied("RandomizeTensor only can be used for "
                                       "tensor which dims is not zero."));
 
-  phi::CPUPlace cpu_place;
+  CPUPlace cpu_place;
   phi::DenseTensor temp_tensor;
   temp_tensor.Resize(dims);
   auto* temp_data = temp_tensor.mutable_data<float>(cpu_place);
@@ -250,7 +250,7 @@ class TRTConvertValidation {
   framework::Scope& scope() { return scope_; }
 
 private:
-  phi::GPUPlace place_;
+  GPUPlace place_;
   std::unique_ptr<TensorRTEngine> engine_;
   cudaStream_t stream_;
   std::unique_ptr<framework::OpDesc> op_;
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index 7ce32bc55e0cfe..61730a6b95c689 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -131,7 +131,7 @@ void TensorRTEngine::Execute(int batch_size,
        inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
            .GetContextMemory(
                predictor_id_per_thread,
-                phi::GPUPlace(device_id()),
+                GPUPlace(device_id()),
                phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
    infer_context->setDeviceMemory(context_memory);
  }
@@ -669,7 +669,7 @@ TensorRTEngine::Weight TensorRTEngine::GetFp16TrtWeight(
   std::string name_suffix = std::to_string(name_suffix_counter);
   std::string splitter = "__";
   std::string name_with_suffix = name + splitter + name_suffix;
-  phi::CPUPlace cpu_place;
+  CPUPlace cpu_place;
   PADDLE_ENFORCE_EQ(weight_map.count(name_with_suffix),
                     0,
                     common::errors::AlreadyExists(
@@ -686,12 +686,11 @@ TensorRTEngine::Weight TensorRTEngine::GetFp16TrtWeight(
   if (weight_tensor.dtype() == phi::DataType::BFLOAT16) {
     phi::DenseTensor bf16_tensor;
     bf16_tensor.clear();
-    paddle::framework::TensorCopySync(
-        weight_tensor, phi::CPUPlace(), &bf16_tensor);
+    paddle::framework::TensorCopySync(weight_tensor, CPUPlace(), &bf16_tensor);
     weight_map[name_with_suffix]->set_type(phi::DataType::FLOAT16);
     auto *fp16_data =
-        weight_map[name_with_suffix]->mutable_data<float16>(phi::CPUPlace());
-    auto *bf16_data = bf16_tensor.mutable_data<bfloat16>(phi::CPUPlace());
+        weight_map[name_with_suffix]->mutable_data<float16>(CPUPlace());
+    auto *bf16_data = bf16_tensor.mutable_data<bfloat16>(CPUPlace());
     for (int i = 0; i < weight_tensor.numel(); i++) {
       fp16_data[i] = static_cast<float16>(bf16_data[i]);
     }
@@ -700,12 +699,11 @@ TensorRTEngine::Weight TensorRTEngine::GetFp16TrtWeight(
   } else if (weight_tensor.dtype() == phi::DataType::FLOAT32) {
     phi::DenseTensor fp32_tensor;
     fp32_tensor.clear();
-    paddle::framework::TensorCopySync(
-        weight_tensor, phi::CPUPlace(), &fp32_tensor);
+    paddle::framework::TensorCopySync(weight_tensor, CPUPlace(), &fp32_tensor);
     weight_map[name_with_suffix]->set_type(phi::DataType::FLOAT16);
     auto *fp16_data =
-        weight_map[name_with_suffix]->mutable_data<float16>(phi::CPUPlace());
-    auto *fp32_data = fp32_tensor.mutable_data<float>(phi::CPUPlace());
+        weight_map[name_with_suffix]->mutable_data<float16>(CPUPlace());
+    auto *fp32_data = fp32_tensor.mutable_data<float>(CPUPlace());
     for (int i = 0; i < weight_tensor.numel(); i++) {
       fp16_data[i] = static_cast<float16>(fp32_data[i]);
     }
@@ -714,12 +712,11 @@ TensorRTEngine::Weight TensorRTEngine::GetFp16TrtWeight(
   } else if (weight_tensor.dtype() == phi::DataType::INT64) {
     phi::DenseTensor int64_tensor;
     int64_tensor.clear();
-    paddle::framework::TensorCopySync(
-        weight_tensor, phi::CPUPlace(), &int64_tensor);
+    paddle::framework::TensorCopySync(weight_tensor, CPUPlace(), &int64_tensor);
     weight_map[name_with_suffix]->set_type(phi::DataType::INT32);
     auto *int32_data =
-        weight_map[name_with_suffix]->mutable_data<int32_t>(phi::CPUPlace());
-    auto *int64_data = int64_tensor.mutable_data<int64_t>(phi::CPUPlace());
+        weight_map[name_with_suffix]->mutable_data<int32_t>(CPUPlace());
+    auto *int64_data = int64_tensor.mutable_data<int64_t>(CPUPlace());
     for (int i = 0; i < weight_tensor.numel(); i++) {
       int32_data[i] = int64_data[i];
     }
@@ -742,7 +739,7 @@ TensorRTEngine::Weight TensorRTEngine::GetFp32TrtWeight(
   std::string name_suffix = std::to_string(name_suffix_counter);
   std::string splitter = "__";
   std::string name_with_suffix = name + splitter + name_suffix;
-  phi::CPUPlace cpu_place;
+  CPUPlace cpu_place;
   PADDLE_ENFORCE_EQ(weight_map.count(name_with_suffix),
                     0,
                     common::errors::AlreadyExists(
@@ -759,12 +756,11 @@ TensorRTEngine::Weight TensorRTEngine::GetFp32TrtWeight(
   if (weight_tensor.dtype() == phi::DataType::BFLOAT16) {
     phi::DenseTensor bf16_tensor;
     bf16_tensor.clear();
-    paddle::framework::TensorCopySync(
-        weight_tensor, phi::CPUPlace(), &bf16_tensor);
+    paddle::framework::TensorCopySync(weight_tensor, CPUPlace(), &bf16_tensor);
     weight_map[name_with_suffix]->set_type(phi::DataType::FLOAT32);
     auto *fp32_data =
-        weight_map[name_with_suffix]->mutable_data<float>(phi::CPUPlace());
-    auto *bf16_data = bf16_tensor.mutable_data<bfloat16>(phi::CPUPlace());
+        weight_map[name_with_suffix]->mutable_data<float>(CPUPlace());
+    auto *bf16_data = bf16_tensor.mutable_data<bfloat16>(CPUPlace());
     for (int i = 0; i < weight_tensor.numel(); i++) {
       fp32_data[i] = static_cast<float>(bf16_data[i]);
     }
@@ -773,12 +769,11 @@ TensorRTEngine::Weight TensorRTEngine::GetFp32TrtWeight(
   } else if (weight_tensor.dtype() == phi::DataType::FLOAT16) {
     phi::DenseTensor fp16_tensor;
     fp16_tensor.clear();
-    paddle::framework::TensorCopySync(
-        weight_tensor, phi::CPUPlace(), &fp16_tensor);
+    paddle::framework::TensorCopySync(weight_tensor, CPUPlace(), &fp16_tensor);
     weight_map[name_with_suffix]->set_type(phi::DataType::FLOAT32);
     auto *fp32_data =
-        weight_map[name_with_suffix]->mutable_data<float>(phi::CPUPlace());
-    auto *fp16_data = fp16_tensor.mutable_data<float16>(phi::CPUPlace());
+        weight_map[name_with_suffix]->mutable_data<float>(CPUPlace());
+    auto *fp16_data = fp16_tensor.mutable_data<float16>(CPUPlace());
     for (int i = 0; i < weight_tensor.numel(); i++) {
       fp32_data[i] = static_cast<float>(fp16_data[i]);
     }
@@ -787,12 +782,11 @@ TensorRTEngine::Weight TensorRTEngine::GetFp32TrtWeight(
   } else if (weight_tensor.dtype() == phi::DataType::INT64) {
     phi::DenseTensor int64_tensor;
     int64_tensor.clear();
-    paddle::framework::TensorCopySync(
-        weight_tensor, phi::CPUPlace(), &int64_tensor);
+    paddle::framework::TensorCopySync(weight_tensor, CPUPlace(), &int64_tensor);
     weight_map[name_with_suffix]->set_type(phi::DataType::INT32);
     auto *int32_data =
-        weight_map[name_with_suffix]->mutable_data<int32_t>(phi::CPUPlace());
-    auto *int64_data = int64_tensor.mutable_data<int64_t>(phi::CPUPlace());
+        weight_map[name_with_suffix]->mutable_data<int32_t>(CPUPlace());
+    auto *int64_data = int64_tensor.mutable_data<int64_t>(CPUPlace());
     for (int i = 0; i < weight_tensor.numel(); i++) {
       int32_data[i] = int64_data[i];
     }
@@ -814,7 +808,7 @@ TensorRTEngine::Weight TensorRTEngine::GetTrtWeight(
   std::string name_suffix = std::to_string(name_suffix_counter);
   std::string splitter = "__";
   std::string name_with_suffix = name + splitter + name_suffix;
-  phi::CPUPlace cpu_place;
+  CPUPlace cpu_place;
   PADDLE_ENFORCE_EQ(weight_map.count(name_with_suffix),
                     0,
                     common::errors::AlreadyExists(
@@ -835,12 +829,11 @@ TensorRTEngine::Weight TensorRTEngine::GetTrtWeight(
   if (weight_tensor.dtype() == phi::DataType::BFLOAT16) {
     phi::DenseTensor bf16_tensor;
     bf16_tensor.clear();
-    paddle::framework::TensorCopySync(
-        weight_tensor, phi::CPUPlace(), &bf16_tensor);
+    paddle::framework::TensorCopySync(weight_tensor, CPUPlace(), &bf16_tensor);
     weight_map[name_with_suffix]->set_type(phi::DataType::FLOAT32);
     auto *fp32_data =
-        weight_map[name_with_suffix]->mutable_data<float>(phi::CPUPlace());
-    auto *bf16_data = bf16_tensor.mutable_data<bfloat16>(phi::CPUPlace());
+        weight_map[name_with_suffix]->mutable_data<float>(CPUPlace());
+    auto *bf16_data = bf16_tensor.mutable_data<bfloat16>(CPUPlace());
     for (int i = 0; i < weight_tensor.numel(); i++) {
       fp32_data[i] = static_cast<float>(bf16_data[i]);
     }
@@ -849,12 +842,11 @@ TensorRTEngine::Weight TensorRTEngine::GetTrtWeight(
   } else if (weight_tensor.dtype() == phi::DataType::INT64) {
     phi::DenseTensor int64_tensor;
     int64_tensor.clear();
-    paddle::framework::TensorCopySync(
-        weight_tensor, phi::CPUPlace(), &int64_tensor);
+    paddle::framework::TensorCopySync(weight_tensor, CPUPlace(), &int64_tensor);
     weight_map[name_with_suffix]->set_type(phi::DataType::INT32);
     auto *int32_data =
-        weight_map[name_with_suffix]->mutable_data<int32_t>(phi::CPUPlace());
-    auto *int64_data = int64_tensor.mutable_data<int64_t>(phi::CPUPlace());
+        weight_map[name_with_suffix]->mutable_data<int32_t>(CPUPlace());
+    auto *int64_data = int64_tensor.mutable_data<int64_t>(CPUPlace());
     for (int i = 0; i < weight_tensor.numel(); i++) {
       int32_data[i] = int64_data[i];
     }
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index 6d2c0af53b3004..a94f3db0e34e58 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -698,7 +698,7 @@ class PADDLE_API TRTEngineManager {
   }
 
   void* GetContextMemory(PredictorID predictor_id,
-                         const phi::GPUPlace& place,
+                         const GPUPlace& place,
                          const phi::Stream& stream) {
     std::lock_guard<std::mutex> lock(mutex_);
     static auto alignment = GetAlignmentSize(place);
@@ -719,7 +719,7 @@ class PADDLE_API TRTEngineManager {
   }
 
  private:
-  size_t GetAlignmentSize(const phi::GPUPlace& place) {
+  size_t GetAlignmentSize(const GPUPlace& place) {
    const auto& prop = platform::GetDeviceProperties(place.GetDeviceId());
    return prop.textureAlignment;
  }
diff --git a/paddle/fluid/inference/tensorrt/pir/generic_plugin.cu b/paddle/fluid/inference/tensorrt/pir/generic_plugin.cu
index ca4049adac3432..f648beb3fec996 100644
--- a/paddle/fluid/inference/tensorrt/pir/generic_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/pir/generic_plugin.cu
@@ -493,7 +493,7 @@ int GenericPlugin::initialize() TRT_NOEXCEPT {
          op_name_.c_str()));
 
  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
-  phi::GPUPlace place(phi::backends::gpu::GetCurrentDeviceId());
+  GPUPlace place(phi::backends::gpu::GetCurrentDeviceId());
  auto* dev_ctx = static_cast<phi::GPUContext*>(pool.Get(place));
 
  std::vector<phi::DataType> precision_types{phi::DataType::FLOAT32,
@@ -573,7 +573,7 @@ int GenericPlugin::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
                           void* const* outputs,
                           void* workspace,
                           cudaStream_t stream) TRT_NOEXCEPT {
-  phi::GPUPlace place(phi::backends::gpu::GetCurrentDeviceId());
+  GPUPlace place(phi::backends::gpu::GetCurrentDeviceId());
  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
  // TODO(inference): generic plugin do not support INT8 precision now.
  auto nvType2PhiType =
diff --git a/paddle/fluid/inference/tensorrt/plugin/custom_generic_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/custom_generic_plugin.cu
index 7522d847c93124..0fb37e1c458b1a 100644
--- a/paddle/fluid/inference/tensorrt/plugin/custom_generic_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/custom_generic_plugin.cu
@@ -477,7 +477,7 @@ int CustomGenericPlugin::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
                                 void* const* outputs,
                                 void* workspace,
                                 cudaStream_t stream) TRT_NOEXCEPT {
-  phi::GPUPlace place(platform::GetCurrentDeviceId());
+  GPUPlace place(platform::GetCurrentDeviceId());
  // TODO(inference): custom generic plugin do not support INT8 precision now.
  auto protoType2PhiType =
      [&](GenerateCustomGenericPluginDataType proto_type,
diff --git a/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu
index 9bbc96bf9379e6..91d43f729d7068 100644
--- a/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu
@@ -559,7 +559,7 @@ void gemm_impl_new(int m,
                    const T* beta,
                    T* C) {
   auto* device_ctx = static_cast<phi::GPUContext*>(
-      phi::DeviceContextPool::Instance().Get(phi::GPUPlace(0)));
+      phi::DeviceContextPool::Instance().Get(GPUPlace(0)));
   const phi::GPUContext& dev_ctx = *device_ctx;
 
   typedef typename CUDATypeTraits<T>::TYPE run_type;
diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwiseadd_transpose_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/elementwiseadd_transpose_op_plugin.cu
index 117b492fa232bf..dad0bc195f3526 100644
--- a/paddle/fluid/inference/tensorrt/plugin/elementwiseadd_transpose_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/elementwiseadd_transpose_op_plugin.cu
@@ -127,7 +127,7 @@ void ElementwiseAddTransposePluginDynamic::configurePlugin(
   }
   ele_out_tensor_.Resize(common::make_ddim(x_shape));
   phi::DeviceContextPool &pool = phi::DeviceContextPool::Instance();
-  phi::GPUPlace place(platform::GetCurrentDeviceId());
+  GPUPlace place(platform::GetCurrentDeviceId());
   auto *device_context = static_cast<phi::GPUContext *>(pool.Get(place));
   const phi::GPUContext &dev_ctx = *device_context;
 
@@ -171,7 +171,7 @@ int ElementwiseAddTransposePluginDynamic::enqueue(
     void *workspace,
     cudaStream_t stream) TRT_NOEXCEPT {
   phi::DeviceContextPool &pool = phi::DeviceContextPool::Instance();
-  phi::GPUPlace place(platform::GetCurrentDeviceId());
+  GPUPlace place(platform::GetCurrentDeviceId());
   auto *device_context = static_cast<phi::GPUContext *>(pool.Get(place));
   const phi::GPUContext &dev_ctx = *device_context;
diff --git a/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu
index 6c4695afd66a63..628a1062fb52e4 100644
--- a/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu
@@ -389,7 +389,7 @@ int FusedTokenPrunePluginDynamic::enqueue(
        NULL, temp_storage_bytes, pruned_token_lengths_, output3, B + 1);
 
    // Allocate temporary storage
-    phi::GPUPlace place(platform::GetCurrentDeviceId());
+    GPUPlace place(platform::GetCurrentDeviceId());
    auto d_temp_storage = phi::memory_utils::Alloc(place, temp_storage_bytes);
 
    // Run exclusive prefix sum
diff --git a/paddle/fluid/inference/tensorrt/plugin/generic_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/generic_plugin.cu
index 67f149d5431a91..d35be103ef7131 100644
--- a/paddle/fluid/inference/tensorrt/plugin/generic_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/generic_plugin.cu
@@ -504,7 +504,7 @@ int GenericPlugin::initialize() TRT_NOEXCEPT {
          op_type.c_str()));
 
  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
-  phi::GPUPlace place(platform::GetCurrentDeviceId());
+  GPUPlace place(platform::GetCurrentDeviceId());
  auto* dev_ctx = static_cast<phi::GPUContext*>(pool.Get(place));
 
  std::vector<phi::DataType> precision_types{phi::DataType::FLOAT32,
@@ -595,7 +595,7 @@ int GenericPlugin::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
                           void* const* outputs,
                           void* workspace,
                           cudaStream_t stream) TRT_NOEXCEPT {
-  phi::GPUPlace place(platform::GetCurrentDeviceId());
+  GPUPlace place(platform::GetCurrentDeviceId());
  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
  // TODO(inference): generic plugin do not support INT8 precision now.
  auto nvType2PhiType =
diff --git a/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu
index 197a828d12af28..2f6594cb68dac7 100644
--- a/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu
@@ -74,8 +74,8 @@ int InstanceNormPlugin::enqueue(int batch_size,
   bias_t.Resize(common::make_ddim({batch_size, c}));
   int device_id;
   cudaGetDevice(&device_id);
-  float *scale_d = scale_t.mutable_data<float>(phi::GPUPlace(device_id));
-  float *bias_d = bias_t.mutable_data<float>(phi::GPUPlace(device_id));
+  float *scale_d = scale_t.mutable_data<float>(GPUPlace(device_id));
+  float *bias_d = bias_t.mutable_data<float>(GPUPlace(device_id));
 
   for (int i = 0; i < batch_size; i++) {
     cudaMemcpyAsync(scale_d + i * c,
@@ -150,8 +150,8 @@ int InstanceNormPluginEnqueue(const nvinfer1::PluginTensorDesc *inputDesc,
   bias_t.Resize(common::make_ddim({n, c}));
   int device_id;
   cudaGetDevice(&device_id);
-  float *scale_d = scale_t.mutable_data<float>(phi::GPUPlace(device_id));
-  float *bias_d = bias_t.mutable_data<float>(phi::GPUPlace(device_id));
+  float *scale_d = scale_t.mutable_data<float>(GPUPlace(device_id));
+  float *bias_d = bias_t.mutable_data<float>(GPUPlace(device_id));
 
   for (int i = 0; i < n; i++) {
     cudaMemcpyAsync(scale_d + i * c,
diff --git a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu
index cf57ee90260e5e..d750857df27095 100644
--- a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu
@@ -124,8 +124,8 @@ int LayerNormPlugin::enqueue(int batch_size,
   cudaGetDevice(&device_id);
   mean_t.Resize(common::make_ddim({batched_mean_shape}));
   variance_t.Resize(common::make_ddim({batched_variance_shape}));
-  float *mean_d = mean_t.mutable_data<float>(phi::GPUPlace(device_id));
-  float *variance_d = variance_t.mutable_data<float>(phi::GPUPlace(device_id));
+  float *mean_d = mean_t.mutable_data<float>(GPUPlace(device_id));
+  float *variance_d = variance_t.mutable_data<float>(GPUPlace(device_id));
   auto input_type = getDataType();
   if (input_type == nvinfer1::DataType::kFLOAT) {
     VLOG(1) << "TRT Plugin DataType selected. LayerNorm-->fp32";
@@ -325,8 +325,8 @@ int LayerNormPluginDynamic::enqueue(
   cudaGetDevice(&device_id);
   mean_t.Resize(common::make_ddim(mean_shape_));
   variance_t.Resize(common::make_ddim(variance_shape_));
-  float *mean_d = mean_t.mutable_data<float>(phi::GPUPlace(device_id));
-  float *variance_d = variance_t.mutable_data<float>(phi::GPUPlace(device_id));
+  float *mean_d = mean_t.mutable_data<float>(GPUPlace(device_id));
+  float *variance_d = variance_t.mutable_data<float>(GPUPlace(device_id));
   auto input_type = input_desc[0].type;
   if (input_type == nvinfer1::DataType::kFLOAT) {
     VLOG(1) << "TRT Plugin DataType selected. LayerNorm-->fp32";
diff --git a/paddle/fluid/inference/tensorrt/plugin/multihead_matmul_roformer_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/multihead_matmul_roformer_plugin.cu
index e115e525130252..155b1522f64ff7 100644
--- a/paddle/fluid/inference/tensorrt/plugin/multihead_matmul_roformer_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/multihead_matmul_roformer_plugin.cu
@@ -190,10 +190,10 @@ int MultiheadMatmulRoformerPlugin::enqueue(
   if (input_type == nvinfer1::DataType::kFLOAT) {
     VLOG(1) << "TRT Plugin DataType selected. RoformerQkvToContext-->fp32";
     auto *multihead_temp_data =
-        multihead_temp_tensor.mutable_data<float>(phi::GPUPlace(device_id));
+        multihead_temp_tensor.mutable_data<float>(GPUPlace(device_id));
     auto *temp_roformer_data =
         temp_roformer_tensor.mutable_data<float>(  // NOLINT
-            phi::GPUPlace(device_id));
+            GPUPlace(device_id));
     auto *tmp_roformer_ptr = reinterpret_cast<float *>(temp_roformer_data);
     auto *qkptr = multihead_temp_data;
     auto *tptr = multihead_temp_data + scratch_size;
@@ -205,7 +205,7 @@ int MultiheadMatmulRoformerPlugin::enqueue(
     if (ProductDim(input_desc[3].dims) == (batch * seq_len)) {
       temp_qk_bias_tensor.Resize({batch, head_number_, seq_len, seq_len});
       auto *temp_qk_bias =
-          temp_qk_bias_tensor.mutable_data<float>(phi::GPUPlace(device_id));
+          temp_qk_bias_tensor.mutable_data<float>(GPUPlace(device_id));
       int grid = batch * head_number_ * seq_len;
       int block = round_up(seq_len);
       broadcast<<<grid, block, 0, stream>>>(
@@ -242,7 +242,7 @@ int MultiheadMatmulRoformerPlugin::enqueue(
                                  head_size_);  // k
 
     auto *device_ctx = static_cast<phi::GPUContext *>(
-        phi::DeviceContextPool::Instance().Get(phi::GPUPlace(device_id)));
+        phi::DeviceContextPool::Instance().Get(GPUPlace(device_id)));
     const phi::GPUContext &dev_ctx = *device_ctx;
 
     phi::funcs::MultiheadGPUComputeFunctor<float> multihead_compute_func;
@@ -268,11 +268,11 @@ int MultiheadMatmulRoformerPlugin::enqueue(
     VLOG(1) << "TRT Plugin DataType selected. QkvToContext-->fp16";
     auto *multihead_temp_data =
         multihead_temp_tensor.mutable_data<int16_t>(  // NOLINT
-            phi::GPUPlace(device_id));
+            GPUPlace(device_id));
     auto *temp_roformer_data =
         temp_roformer_tensor.mutable_data<int16_t>(  // NOLINT
-            phi::GPUPlace(device_id));
+            GPUPlace(device_id));
     half *tmp_roformer_ptr = reinterpret_cast<half *>(temp_roformer_data);
     half *qkptr = reinterpret_cast<half *>(multihead_temp_data);
     half *tptr = qkptr + scratch_size;
@@ -284,7 +284,7 @@ int MultiheadMatmulRoformerPlugin::enqueue(
     if (ProductDim(input_desc[3].dims) == (batch * seq_len)) {
       temp_qk_bias_tensor.Resize({batch, head_number_, seq_len, seq_len});
       auto *temp_qk_bias = reinterpret_cast<half *>(
-          temp_qk_bias_tensor.mutable_data<int16_t>(phi::GPUPlace(device_id)));
+          temp_qk_bias_tensor.mutable_data<int16_t>(GPUPlace(device_id)));
       int grid = batch * head_number_ * seq_len;
       int block = round_up(seq_len);
       broadcast<<<grid, block, 0, stream>>>(
@@ -304,7 +304,7 @@ int MultiheadMatmulRoformerPlugin::enqueue(
                     cudaMemcpyDeviceToDevice);
 
     auto *device_ctx = static_cast<phi::GPUContext *>(
-        phi::DeviceContextPool::Instance().Get(phi::GPUPlace(device_id)));
+        phi::DeviceContextPool::Instance().Get(GPUPlace(device_id)));
 
     int n_q = seq_len * head_number_ * head_size_ * batch;
     constexpr int threads = 128;
diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu
index 3e135f7c3db943..28cab8d310d344 100644
--- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu
@@ -196,7 +196,7 @@ void QkvToContextPluginDynamic::configurePlugin(
   int device_id = 0;
   cudaGetDevice(&device_id);
   auto *device_ctx = static_cast<phi::GPUContext *>(
-      phi::DeviceContextPool::Instance().Get(phi::GPUPlace(device_id)));
+      phi::DeviceContextPool::Instance().Get(GPUPlace(device_id)));
   const phi::GPUContext &dev_ctx = *device_ctx;
   auto stream = dev_ctx.stream();
   tensor_.Resize({batch, seq_len, seq_len, head_number_});
@@ -204,12 +204,12 @@ void QkvToContextPluginDynamic::configurePlugin(
     tensor_.Resize({batch, seq_len, seq_len, 1});
     int blocks = batch * 1 * seq_len;
     mask_half_ = reinterpret_cast<half *>(
-        tensor_.mutable_data<int16_t>(phi::GPUPlace(device_id)));
+        tensor_.mutable_data<int16_t>(GPUPlace(device_id)));
     reset_qk_bias<<<blocks, 1024, 0, stream>>>(
         mask_half_, real_seq_len, seq_len);
   } else if (in[0].desc.type == nvinfer1::DataType::kFLOAT) {
     fake_qk_bias_ = reinterpret_cast<float *>(
-        tensor_.mutable_data<int32_t>(phi::GPUPlace(device_id)));
+        tensor_.mutable_data<int32_t>(GPUPlace(device_id)));
     int64_t size = sizeof(int32_t) * batch * seq_len * seq_len * head_number_;
 #ifdef PADDLE_WITH_HIP
     PADDLE_ENFORCE_GPU_SUCCESS(
@@ -336,7 +336,7 @@ int QkvToContextPluginDynamic::enqueue(
   if (input_type == nvinfer1::DataType::kFLOAT) {
     VLOG(1) << "TRT Plugin DataType selected. QkvToContext-->fp32";
     auto *multihead_temp_data =
-        multihead_temp_tensor.mutable_data<float>(phi::GPUPlace(device_id));
+        multihead_temp_tensor.mutable_data<float>(GPUPlace(device_id));
     auto *qkptr = multihead_temp_data;
     auto *tptr = multihead_temp_data + scratch_size;
 
@@ -347,7 +347,7 @@ int QkvToContextPluginDynamic::enqueue(
     if (ProductDim(input_desc[1].dims) == (batch * seq_len)) {
       temp_qk_bias_tensor.Resize({batch, head_number_, seq_len, seq_len});
       auto *temp_qk_bias =
-          temp_qk_bias_tensor.mutable_data<float>(phi::GPUPlace(device_id));
+          temp_qk_bias_tensor.mutable_data<float>(GPUPlace(device_id));
       int grid = batch * head_number_ * seq_len;
       int block = round_up(seq_len);
       broadcast<<<grid, block, 0, stream>>>(
@@ -361,7 +361,7 @@ int QkvToContextPluginDynamic::enqueue(
     if (ProductDim(input_desc[1].dims) == (seq_len * seq_len)) {
       temp_qk_bias_tensor.Resize({batch, head_number_, seq_len, seq_len});
       auto *temp_qk_bias = reinterpret_cast<float *>(
-          temp_qk_bias_tensor.mutable_data<float>(phi::GPUPlace(device_id)));
+          temp_qk_bias_tensor.mutable_data<float>(GPUPlace(device_id)));
       int grid = batch * head_number_ * seq_len;
       int block = round_up(seq_len);
       broadcast_batch_head_number<<<grid, block, 0, stream>>>(
@@ -382,7 +382,7 @@ int QkvToContextPluginDynamic::enqueue(
        batch, seq_len, head_size_, head_number_, input0_data, tptr, stream);
 
    auto *device_ctx = static_cast<phi::GPUContext *>(
-        phi::DeviceContextPool::Instance().Get(phi::GPUPlace(device_id)));
+        phi::DeviceContextPool::Instance().Get(GPUPlace(device_id)));
    const phi::GPUContext &dev_ctx = *device_ctx;
 
    phi::funcs::MultiheadGPUComputeFunctor<float> multihead_compute_func;
@@ -418,7 +418,7 @@ int QkvToContextPluginDynamic::enqueue(
    }
    auto *multihead_temp_data =
        multihead_temp_tensor.mutable_data<int16_t>(  // NOLINT
-            phi::GPUPlace(device_id));
+            GPUPlace(device_id));
    half *qkptr = reinterpret_cast<half *>(multihead_temp_data);
    half *tptr = qkptr + scratch_size;
 
@@ -430,7 +430,7 @@ int QkvToContextPluginDynamic::enqueue(
    if (ProductDim(input_desc[1].dims) == (batch * seq_len)) {
      temp_qk_bias_tensor.Resize({batch, head_number_, seq_len, seq_len});
      auto *temp_qk_bias = reinterpret_cast<half *>(
-          temp_qk_bias_tensor.mutable_data<int16_t>(phi::GPUPlace(device_id)));
+          temp_qk_bias_tensor.mutable_data<int16_t>(GPUPlace(device_id)));
      int grid = batch * head_number_ * seq_len;
      int block = round_up(seq_len);
      broadcast<<<grid, block, 0, stream>>>(
@@ -444,7 +444,7 @@ int QkvToContextPluginDynamic::enqueue(
    if (ProductDim(input_desc[1].dims) == (seq_len * seq_len)) {
      temp_qk_bias_tensor.Resize({batch, head_number_, seq_len, seq_len});
      auto *temp_qk_bias = reinterpret_cast<half *>(
-          temp_qk_bias_tensor.mutable_data<int16_t>(phi::GPUPlace(device_id)));
+          temp_qk_bias_tensor.mutable_data<int16_t>(GPUPlace(device_id)));
      int grid = batch * head_number_ * seq_len;
      int block = round_up(seq_len);
      broadcast_batch_head_number<<<grid, block, 0, stream>>>(
@@ -481,7 +481,7 @@ int QkvToContextPluginDynamic::enqueue(
    }
 
    auto *device_ctx = static_cast<phi::GPUContext *>(
-        phi::DeviceContextPool::Instance().Get(phi::GPUPlace(device_id)));
+        phi::DeviceContextPool::Instance().Get(GPUPlace(device_id)));
 
    int n_q = seq_len * head_number_ * head_size_ * batch;
    constexpr int threads = 128;
diff --git a/paddle/fluid/inference/tensorrt/plugin/trans_layernorm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/trans_layernorm_op_plugin.cu
index 151d00f9e45494..63c37e0f092749 100644
--- a/paddle/fluid/inference/tensorrt/plugin/trans_layernorm_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/trans_layernorm_op_plugin.cu
@@ -363,7 +363,7 @@ int TransLayerNormPluginDynamic::enqueue(
   auto input_type = input_desc[0].type;
 
   phi::DeviceContextPool &pool = phi::DeviceContextPool::Instance();
-  phi::GPUPlace place(platform::GetCurrentDeviceId());
+  GPUPlace place(platform::GetCurrentDeviceId());
   auto *device_context = static_cast<phi::GPUContext *>(pool.Get(place));
   const phi::GPUContext &dev_ctx = *device_context;
 
diff --git a/paddle/fluid/inference/tensorrt/plugin/transformer_input_output_convert_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/transformer_input_output_convert_plugin.cu
index 9ee2cf83f248f4..a5a9afb98bd05b 100644
--- a/paddle/fluid/inference/tensorrt/plugin/transformer_input_output_convert_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/transformer_input_output_convert_plugin.cu
@@ -186,7 +186,7 @@ int TransformerInputConvertPlugin::enqueue(
   cub::DeviceScan::ExclusiveSum(
       NULL, temp_storage_bytes, input1, output2, B + 1);
   // Allocate temporary storage
-  phi::GPUPlace place(platform::GetCurrentDeviceId());
+  GPUPlace place(platform::GetCurrentDeviceId());
   auto d_temp_storage = phi::memory_utils::Alloc(place, temp_storage_bytes);
   // Run exclusive prefix sum
   cub::DeviceScan::ExclusiveSum(