diff --git a/paddle/fluid/distributed/collective/process_group_bkcl.cc b/paddle/fluid/distributed/collective/process_group_bkcl.cc
index 0c505e2da670f9..d799dee8a4144c 100644
--- a/paddle/fluid/distributed/collective/process_group_bkcl.cc
+++ b/paddle/fluid/distributed/collective/process_group_bkcl.cc
@@ -499,7 +499,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::AllToAll(
   phi::XPUPlace place = in_tensor.place();
 #if defined(PADDLE_WITH_FLAGCX)
   auto allocator_cpu = std::unique_ptr<phi::Allocator>(
-      new paddle::experimental::DefaultAllocator(phi::CPUPlace()));
+      new paddle::experimental::DefaultAllocator(CPUPlace()));
 #endif
   auto allocator = std::unique_ptr<phi::Allocator>(
       new paddle::experimental::DefaultAllocator(place));
@@ -517,49 +517,49 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::AllToAll(
 #endif
 #if defined(PADDLE_WITH_FLAGCX)
-  memory::Copy(phi::CPUPlace(),
+  memory::Copy(CPUPlace(),
                in_size_tensor.data(),
-               phi::CPUPlace(),
+               CPUPlace(),
                in_numel_vec.data(),
                in_size_tensor.numel() * sizeof(int64_t));
-  memory::Copy(phi::CPUPlace(),
+  memory::Copy(CPUPlace(),
                in_offset_tensor.data(),
-               phi::CPUPlace(),
+               CPUPlace(),
                in_offset_vec.data(),
                in_offset_tensor.numel() * sizeof(int64_t));
-  memory::Copy(phi::CPUPlace(),
+  memory::Copy(CPUPlace(),
                out_size_tensor.data(),
-               phi::CPUPlace(),
+               CPUPlace(),
                out_numel_vec.data(),
                out_size_tensor.numel() * sizeof(int64_t));
-  memory::Copy(phi::CPUPlace(),
+  memory::Copy(CPUPlace(),
                out_offset_tensor.data(),
-               phi::CPUPlace(),
+               CPUPlace(),
                out_offset_vec.data(),
                out_offset_tensor.numel() * sizeof(int64_t));
 #else
   memory::Copy(place,
                in_size_tensor.data(),
-               phi::CPUPlace(),
+               CPUPlace(),
                in_numel_vec.data(),
                in_size_tensor.numel() * sizeof(int64_t));
   memory::Copy(place,
                in_offset_tensor.data(),
-               phi::CPUPlace(),
+               CPUPlace(),
                in_offset_vec.data(),
                in_offset_tensor.numel() * sizeof(int64_t));
   memory::Copy(place,
                out_size_tensor.data(),
-               phi::CPUPlace(),
+               CPUPlace(),
                out_numel_vec.data(),
                out_size_tensor.numel() * sizeof(int64_t));
   memory::Copy(place,
                out_offset_tensor.data(),
-               phi::CPUPlace(),
+               CPUPlace(),
                out_offset_vec.data(),
                out_offset_tensor.numel() * sizeof(int64_t));
 #endif
@@ -679,7 +679,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::AllToAll(
   phi::XPUPlace place = in_tensors[0].place();
 #if defined(PADDLE_WITH_FLAGCX)
   auto allocator_cpu = std::unique_ptr<phi::Allocator>(
-      new paddle::experimental::DefaultAllocator(phi::CPUPlace()));
+      new paddle::experimental::DefaultAllocator(CPUPlace()));
 #endif
   auto allocator = std::unique_ptr<phi::Allocator>(
       new paddle::experimental::DefaultAllocator(place));
@@ -713,51 +713,51 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::AllToAll(
                               &concated_in_tensor);
   }
 #if defined(PADDLE_WITH_FLAGCX)
-  memory::Copy(phi::CPUPlace(),
+  memory::Copy(CPUPlace(),
                in_size_tensor.data(),
-               phi::CPUPlace(),
+               CPUPlace(),
                in_numel_vec.data(),
                in_size_tensor.numel() * sizeof(int64_t));
-  memory::Copy(phi::CPUPlace(),
+  memory::Copy(CPUPlace(),
                in_offset_tensor.data(),
-               phi::CPUPlace(),
+               CPUPlace(),
                in_offset_vec.data(),
                in_offset_tensor.numel() * sizeof(int64_t));
-  memory::Copy(phi::CPUPlace(),
+  memory::Copy(CPUPlace(),
                out_size_tensor.data(),
-               phi::CPUPlace(),
+               CPUPlace(),
                out_numel_vec.data(),
                out_size_tensor.numel() * sizeof(int64_t));
-  memory::Copy(phi::CPUPlace(),
+  memory::Copy(CPUPlace(),
                out_offset_tensor.data(),
-               phi::CPUPlace(),
+               CPUPlace(),
                out_offset_vec.data(),
                out_offset_tensor.numel() * sizeof(int64_t));
 #else
   memory::Copy(place,
                in_size_tensor.data(),
-               phi::CPUPlace(),
+               CPUPlace(),
                in_numel_vec.data(),
                in_size_tensor.numel() * sizeof(int64_t));
   memory::Copy(place,
                in_offset_tensor.data(),
-               phi::CPUPlace(),
+               CPUPlace(),
                in_offset_vec.data(),
                in_offset_tensor.numel() * sizeof(int64_t));
   memory::Copy(place,
                out_size_tensor.data(),
-               phi::CPUPlace(),
+               CPUPlace(),
                out_numel_vec.data(),
                out_size_tensor.numel() * sizeof(int64_t));
   memory::Copy(place,
                out_offset_tensor.data(),
-               phi::CPUPlace(),
+               CPUPlace(),
                out_offset_vec.data(),
                out_offset_tensor.numel() * sizeof(int64_t));
 #endif
diff --git a/paddle/fluid/distributed/collective/process_group_flagcx.cc b/paddle/fluid/distributed/collective/process_group_flagcx.cc
index 81ea61549d9d54..87f3a14d0116b7 100644
--- a/paddle/fluid/distributed/collective/process_group_flagcx.cc
+++ b/paddle/fluid/distributed/collective/process_group_flagcx.cc
@@ -427,7 +427,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupFlagcx::Barrier(
       0,
       common::errors::PreconditionNotMet(
           "The barrier device id must greater or equal than 0."));
-  phi::GPUPlace place(opts.device_id);
+  GPUPlace place(opts.device_id);
   auto allocator = std::unique_ptr<phi::Allocator>(
       new paddle::experimental::DefaultAllocator(place));
   phi::DenseTensorMeta meta(phi::DataType::FLOAT32, phi::DDim{1});
@@ -816,7 +816,7 @@ void ProcessGroupFlagcx::SyncCalcStream(const Place& place,
 void ProcessGroupFlagcx::EagerConnect() {
   const auto deviceId = phi::backends::gpu::GetCurrentDeviceId();
-  const auto& place = phi::GPUPlace(deviceId);
+  const auto& place = GPUPlace(deviceId);
   const auto key = GetKeyFromPlace(place);
   platform::CUDADeviceGuard cuda_guard(place);
@@ -831,7 +831,7 @@ void ProcessGroupFlagcx::EagerConnect() {
 void ProcessGroupFlagcx::EagerConnectRingExchange() {
   std::vector<std::pair<int, int>> peers;
-  const auto& place = phi::GPUPlace(phi::backends::gpu::GetCurrentDeviceId());
+  const auto& place = GPUPlace(phi::backends::gpu::GetCurrentDeviceId());
   for (int rank = 0; rank < size_; rank++) {
     auto peer_rank = rank + 1 >= size_ ? 0 : rank + 1;
diff --git a/paddle/fluid/distributed/collective/process_group_nccl.cc b/paddle/fluid/distributed/collective/process_group_nccl.cc
index ddabc9a495f49b..6ef4029977b6ee 100644
--- a/paddle/fluid/distributed/collective/process_group_nccl.cc
+++ b/paddle/fluid/distributed/collective/process_group_nccl.cc
@@ -488,7 +488,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Barrier(
       0,
       common::errors::PreconditionNotMet(
          "The barrier device id must greater or equal than 0."));
-  phi::GPUPlace place(opts.device_id);
+  GPUPlace place(opts.device_id);
   auto allocator = std::unique_ptr<phi::Allocator>(
       new paddle::experimental::DefaultAllocator(place));
   phi::DenseTensorMeta meta(phi::DataType::FLOAT32, phi::DDim{1});
@@ -923,18 +923,18 @@ void ProcessGroupNCCL::CreateNCCLEnvCache(
   // gather global ranks in current group
   size_t gpu_global_rank_size = sizeof(int);
   auto gpu_global_rank = phi::memory_utils::Alloc(
-      phi::GPUPlace(phi::backends::gpu::GetCurrentDeviceId()),
+      GPUPlace(phi::backends::gpu::GetCurrentDeviceId()),
       gpu_global_rank_size);
-  phi::memory_utils::Copy(phi::GPUPlace(),
+  phi::memory_utils::Copy(GPUPlace(),
                           gpu_global_rank->ptr(),
-                          phi::CPUPlace(),
+                          CPUPlace(),
                           &global_rank_,
                           gpu_global_rank_size);
   size_t gpu_global_ranks_size = num_ranks * sizeof(int);
   auto gpu_global_ranks = phi::memory_utils::Alloc(
-      phi::GPUPlace(phi::backends::gpu::GetCurrentDeviceId()),
+      GPUPlace(phi::backends::gpu::GetCurrentDeviceId()),
       gpu_global_ranks_size);
   NCCL_CHECK(phi::dynload::ncclAllGather(gpu_global_rank->ptr(),
@@ -945,9 +945,9 @@ void ProcessGroupNCCL::CreateNCCLEnvCache(
                                          comm_ctx->stream()));
   std::vector<int> global_ranks(num_ranks);
-  phi::memory_utils::Copy(phi::CPUPlace(),
+  phi::memory_utils::Copy(CPUPlace(),
                           global_ranks.data(),
-                          phi::GPUPlace(),
+                          GPUPlace(),
                           gpu_global_ranks->ptr(),
                           gpu_global_ranks_size);
@@ -1032,7 +1032,7 @@ void ProcessGroupNCCL::SyncCalcStream(const Place& place,
 void ProcessGroupNCCL::EagerConnect() {
   const auto deviceId = phi::backends::gpu::GetCurrentDeviceId();
-  const auto& place = phi::GPUPlace(deviceId);
+  const auto& place = GPUPlace(deviceId);
   const auto key = GetKeyFromPlace(place);
   platform::CUDADeviceGuard cuda_guard(place);
@@ -1049,7 +1049,7 @@ void ProcessGroupNCCL::EagerConnect() {
 void ProcessGroupNCCL::EagerConnectRingExchange(
     std::shared_ptr nccl_config_ptr) {
   std::vector<std::pair<int, int>> peers;
-  const auto& place = phi::GPUPlace(phi::backends::gpu::GetCurrentDeviceId());
+  const auto& place = GPUPlace(phi::backends::gpu::GetCurrentDeviceId());
   for (int rank = 0; rank < size_; rank++) {
     auto peer_rank = rank + 1 >= size_ ? 0 : rank + 1;
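
Note on why the rename compiles (not part of the diff): dropping the phi:: qualifier only works if unqualified CPUPlace/GPUPlace names are already visible in these translation units, presumably because the place header re-exports the phi types into the paddle namespace that encloses paddle::distributed. The standalone sketch below uses mock types and a hypothetical Barrier helper to illustrate that name lookup; it is an assumption about the surrounding headers, not code from this PR.

// Minimal sketch with mock types (hypothetical, not the Paddle sources).
#include <iostream>

namespace phi {
struct CPUPlace {};
struct GPUPlace {
  explicit GPUPlace(int id = 0) : device(id) {}
  int device;
};
}  // namespace phi

namespace paddle {
// Assumed re-export of the phi place types into the paddle namespace,
// which is what makes the unqualified spelling in the diff resolve.
using CPUPlace = phi::CPUPlace;
using GPUPlace = phi::GPUPlace;

namespace distributed {
void Barrier(int device_id) {
  // Unqualified name: found in the enclosing paddle namespace via the alias,
  // so this is the same phi::GPUPlace as before, only spelled shorter.
  GPUPlace place(device_id);
  std::cout << "barrier on device " << place.device << "\n";
}
}  // namespace distributed
}  // namespace paddle

int main() { paddle::distributed::Barrier(0); }

If the aliases exist, the diff is a pure spelling change with no behavioral effect.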
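
A second standalone sketch, with a mock memory::Copy, shows the argument order the hunks above rely on: destination place, destination pointer, source place, source pointer, byte count. Under PADDLE_WITH_FLAGCX the size/offset staging tensors are allocated on the host, so both places are CPUPlace(); in the non-FLAGCX branch the destination is the XPU place and only the source stays on the CPU. The mock only implements the host-to-host flavour and is not the Paddle helper.

// Minimal sketch (mock Copy, hypothetical host-only dispatch).
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

struct CPUPlace {};

namespace memory {
// Host-to-host flavour only; the real helper dispatches on the place types.
inline void Copy(CPUPlace, void* dst, CPUPlace, const void* src, std::size_t num) {
  std::memcpy(dst, src, num);
}
}  // namespace memory

int main() {
  std::vector<int64_t> in_numel_vec = {2, 3, 5};            // per-rank counts
  std::vector<int64_t> in_size_host(in_numel_vec.size());   // stand-in for in_size_tensor
  memory::Copy(CPUPlace(),                                  // destination place
               in_size_host.data(),                         // destination pointer
               CPUPlace(),                                  // source place
               in_numel_vec.data(),                         // source pointer
               in_size_host.size() * sizeof(int64_t));      // byte count
  std::cout << in_size_host[0] << " " << in_size_host[2] << "\n";  // prints "2 5"
}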