PaddlePaddle · luotao1 · Jan 6, 2026 · Jan 5, 2026
diff --git a/paddle/phi/kernels/gpu/add_n_kernel.cu b/paddle/phi/kernels/gpu/add_n_kernel.cu
@@ -189,7 +189,7 @@ void AddNKernel(const Context &dev_ctx,
           dev_ctx.GetPlace(), in_data.size() * sizeof(void *));
       memory_utils::Copy(dev_ctx.GetPlace(),
                          tmp_in_array->ptr(),
-                         phi::CPUPlace(),
+                         CPUPlace(),
                          reinterpret_cast<void *>(in_data.data()),
                          in_data.size() * sizeof(void *),
                          dev_ctx.stream());
@@ -280,7 +280,7 @@ void AddNKernel(const Context &dev_ctx,
 
       memory_utils::Copy(dev_ctx.GetPlace(),
                          tmp_sr_in_out_array->ptr(),
-                         phi::CPUPlace(),
+                         CPUPlace(),
                          reinterpret_cast<void *>(sr_in_out_data.data()),
                          sr_in_out_data.size() * sizeof(T *),
                          dev_ctx.stream());
@@ -301,7 +301,7 @@ void AddNKernel(const Context &dev_ctx,
 
     memory_utils::Copy(dev_ctx.GetPlace(),
                        tmp_in_array->ptr(),
-                       phi::CPUPlace(),
+                       CPUPlace(),
                        reinterpret_cast<void *>(in_data.data()),
                        in_data.size() * sizeof(T *),
                        dev_ctx.stream());

diff --git a/paddle/phi/kernels/gpu/amp_kernel.cu b/paddle/phi/kernels/gpu/amp_kernel.cu
@@ -155,7 +155,7 @@ class LazyZeros<phi::GPUContext, T> {
     size_t xs_size = xs.size();
     if (xs_size == 0) return;
 
-    const auto& cpu_place = phi::CPUPlace();
+    const auto& cpu_place = CPUPlace();
     // alloc each tensor's start index and copy to device
     auto h_in_starts_mem =
         phi::memory_utils::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t));
@@ -284,7 +284,7 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
   size_t xs_size = xs.size();
   if (xs_size == 0) return;
 
-  const auto& cpu_place = phi::CPUPlace();
+  const auto& cpu_place = CPUPlace();
   // calculate each tensor's start index and copy to device
   auto h_starts_tensor =
       phi::memory_utils::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t));

diff --git a/paddle/phi/kernels/gpu/assign_pos_kernel.cu b/paddle/phi/kernels/gpu/assign_pos_kernel.cu
@@ -62,11 +62,11 @@ void AssignPosKernel(const Context& dev_ctx,
 
   DenseTensor cpu_eff_num_len;
   int64_t cpu_eff_num_len_data = 0;
-  bool is_cpu_place = eff_num_len_ptr->place() == phi::CPUPlace();
+  bool is_cpu_place = eff_num_len_ptr->place() == CPUPlace();
   if (is_cpu_place) {
     cpu_eff_num_len_data = eff_num_len_ptr->data<T>()[0];
   } else {
-    Copy(dev_ctx, eff_num_len, phi::CPUPlace(), false, &cpu_eff_num_len);
+    Copy(dev_ctx, eff_num_len, CPUPlace(), false, &cpu_eff_num_len);
     cpu_eff_num_len_data = cpu_eff_num_len.data<T>()[0];
   }
 

diff --git a/paddle/phi/kernels/gpu/average_accumulates_kernel.cu b/paddle/phi/kernels/gpu/average_accumulates_kernel.cu
@@ -30,19 +30,19 @@ void GetAccumulators<phi::GPUContext>(const phi::GPUContext& dev_ctx,
                                       int64_t* old_num_accumulates) {
   auto stream = dev_ctx.stream();
   auto cuda_place = in_old_num_accumulates.place();
-  memory_utils::Copy(phi::CPUPlace(),
+  memory_utils::Copy(CPUPlace(),
                      old_num_accumulates,
                      cuda_place,
                      in_old_num_accumulates.data<int64_t>(),
                      sizeof(int64_t),
                      stream);
-  memory_utils::Copy(phi::CPUPlace(),
+  memory_utils::Copy(CPUPlace(),
                      num_accumulates,
                      cuda_place,
                      in_num_accumulates.data<int64_t>(),
                      sizeof(int64_t),
                      stream);
-  memory_utils::Copy(phi::CPUPlace(),
+  memory_utils::Copy(CPUPlace(),
                      num_updates,
                      cuda_place,
                      in_num_updates.data<int64_t>(),
@@ -70,21 +70,21 @@ void SetAccumulators<phi::GPUContext>(const phi::GPUContext& dev_ctx,
   auto cuda_place = out_old_num_accumulates->place();
   memory_utils::Copy(dev_ctx.GetPlace(),
                      out_num_accumulates_ptr,
-                     phi::CPUPlace(),
+                     CPUPlace(),
                      &num_accumulates,
                      sizeof(int64_t),
                      stream);
 
   memory_utils::Copy(dev_ctx.GetPlace(),
                      out_old_num_accumulates_ptr,
-                     phi::CPUPlace(),
+                     CPUPlace(),
                      &old_num_accumulates,
                      sizeof(int64_t),
                      stream);
 
   memory_utils::Copy(cuda_place,
                      out_num_updates_ptr,
-                     phi::CPUPlace(),
+                     CPUPlace(),
                      &num_updates,
                      sizeof(int64_t),
                      stream);

diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu
@@ -937,7 +937,7 @@ void BatchNormKernel(const Context &dev_ctx,
     // if (dev_ctx.HasInput("MomentumTensor")) {
     //   const auto *mom_tensor = MomentumTensor;
     //   DenseTensor mom_cpu;
-    //   paddle::framework::TensorCopySync(*mom_tensor, phi::CPUPlace(),
+    //   paddle::framework::TensorCopySync(*mom_tensor, CPUPlace(),
     //                                     &mom_cpu);
     //   momentum = mom_cpu.data<float>()[0];
     // }

diff --git a/paddle/phi/kernels/gpu/bincount_kernel.cu b/paddle/phi/kernels/gpu/bincount_kernel.cu
@@ -125,7 +125,7 @@ void BincountCUDAInner(const Context& dev_ctx,
       <<<num_blocks, PADDLE_CUDA_NUM_THREADS, 0, dev_ctx.stream()>>>(
           input_data, input_numel, input_min_max_data, input_min_max_data + 1);
 
-  Copy(dev_ctx, input_min_max_t, phi::CPUPlace(), true, &input_min_max_cpu);
+  Copy(dev_ctx, input_min_max_t, CPUPlace(), true, &input_min_max_cpu);
 
   InputT input_min = input_min_max_cpu.data<InputT>()[0];
 

diff --git a/paddle/phi/kernels/gpu/box_coder_kernel.cu b/paddle/phi/kernels/gpu/box_coder_kernel.cu
@@ -216,7 +216,7 @@ void BoxCoderKernel(const Context &dev_ctx,
       bytes,
       phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
   float *dev_var_data = reinterpret_cast<float *>(dev_var->ptr());
-  auto cplace = phi::CPUPlace();
+  auto cplace = CPUPlace();
   const auto gplace = dev_ctx.GetPlace();
   memory_utils::Copy(
       gplace, dev_var_data, cplace, &variance[0], bytes, dev_ctx.stream());

diff --git a/paddle/phi/kernels/gpu/check_numerics_kernel.cu b/paddle/phi/kernels/gpu/check_numerics_kernel.cu
@@ -349,10 +349,9 @@ static void PrintStack(const phi::GPUContext& dev_ctx,
                        const std::string& op_type,
                        const std::string& var_name,
                        int dev_id) {
-  auto cpu_stats =
-      phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(int64_t) * 3);
+  auto cpu_stats = phi::memory_utils::Alloc(CPUPlace(), sizeof(int64_t) * 3);
   int64_t* cpu_stats_ptr = reinterpret_cast<int64_t*>(cpu_stats->ptr());
-  phi::memory_utils::Copy(phi::CPUPlace(),
+  phi::memory_utils::Copy(CPUPlace(),
                           cpu_stats_ptr,
                           stats.place(),
                           stats.data(),
@@ -381,11 +380,11 @@ static void WriteToOutputDir(const phi::GPUContext& dev_ctx,
   // Copy stats and values from GPU to CPU.
   DenseTensor cpu_stats;
   cpu_stats.Resize({static_cast<int64_t>(3)});
-  Copy(dev_ctx, stats, phi::CPUPlace(), false, &cpu_stats);
+  Copy(dev_ctx, stats, CPUPlace(), false, &cpu_stats);
 
   DenseTensor cpu_values;
   cpu_values.Resize({static_cast<int64_t>(3)});
-  Copy(dev_ctx, values, phi::CPUPlace(), false, &cpu_values);
+  Copy(dev_ctx, values, CPUPlace(), false, &cpu_values);
   dev_ctx.Wait();
 
   int dev_id = tensor.place().device;

diff --git a/paddle/phi/kernels/gpu/cholesky_kernel.cu b/paddle/phi/kernels/gpu/cholesky_kernel.cu
@@ -123,7 +123,7 @@ FUNC_WITH_TYPES(POTRF_INSTANCE);
         workspace_device_size,                                            \
         phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));  \
     auto workspace_host =                                                 \
-        phi::memory_utils::Alloc(phi::CPUPlace(), workspace_host_size);   \
+        phi::memory_utils::Alloc(CPUPlace(), workspace_host_size);        \
     PADDLE_ENFORCE_GPU_SUCCESS(                                           \
         dynload::cusolverDnXpotrf(handle,                                 \
                                   params,                                 \

diff --git a/paddle/phi/kernels/gpu/class_center_sample_kernel.cu b/paddle/phi/kernels/gpu/class_center_sample_kernel.cu
@@ -561,7 +561,7 @@ void ClassCenterSampleKernel(const Context& dev_ctx,
   // step 14: Get sampled class center for output
   Copy<Context>(dev_ctx,
                 num_classes_per_device,
-                phi::CPUPlace(),
+                CPUPlace(),
                 true,
                 &num_classes_per_device);
   T actual_num_samples = num_classes_per_device.data<T>()[rank + 1];

diff --git a/paddle/phi/kernels/gpu/collect_fpn_proposals_kernel.cu b/paddle/phi/kernels/gpu/collect_fpn_proposals_kernel.cu
@@ -95,7 +95,7 @@ void GPUCollectFpnProposalsOpKernel(
     auto score_in = score_ins[i];
     if (multi_rois_num.size() > 0) {
       DenseTensor temp;
-      Copy(dev_ctx, *multi_rois_num[i], phi::CPUPlace(), true, &temp);
+      Copy(dev_ctx, *multi_rois_num[i], CPUPlace(), true, &temp);
       const int* length_in = temp.data<int>();
       lod_size = multi_rois_num[i]->numel();
       for (size_t n = 0; n < lod_size; ++n) {
@@ -240,7 +240,7 @@ void GPUCollectFpnProposalsOpKernel(
   GetLengthLoD<<<blocks, threads, 0, dev_ctx.stream()>>>(
       real_post_num, out_id_data, length_lod_data);
   std::vector<int> length_lod_cpu(lod_size);
-  phi::memory_utils::Copy(phi::CPUPlace(),
+  phi::memory_utils::Copy(CPUPlace(),
                           length_lod_cpu.data(),
                           place,
                           length_lod_data,

diff --git a/paddle/phi/kernels/gpu/cumprod_grad_kernel.cu b/paddle/phi/kernels/gpu/cumprod_grad_kernel.cu
@@ -238,7 +238,7 @@ bool CumprodGradCompatible(const Context &dev_ctx,
   bool has_zero = false;
 #ifdef PADDLE_WITH_CUDA
   DenseTensor any_zero_cpu;
-  phi::Copy(dev_ctx, any_zero, phi::CPUPlace(), true, &any_zero_cpu);
+  phi::Copy(dev_ctx, any_zero, CPUPlace(), true, &any_zero_cpu);
   has_zero = *any_zero_cpu.data<bool>();
 #else
   has_zero = *any_zero.data<bool>();

diff --git a/paddle/phi/kernels/gpu/determinant_kernel.cu b/paddle/phi/kernels/gpu/determinant_kernel.cu
@@ -168,7 +168,7 @@ struct DeterminantCudaFunctor<phi::dtype::complex<T>, Context> {
         phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
     memory_utils::Copy(dev_ctx.GetPlace(),
                        tmp_gpu_ptrs_data->ptr(),
-                       phi::CPUPlace(),
+                       CPUPlace(),
                        static_cast<void*>(cpu_ptrs.data()),
                        cpu_ptrs.size() * sizeof(phi::dtype::complex<T>*),
                        dev_ctx.stream());

diff --git a/paddle/phi/kernels/gpu/distribute_fpn_proposals_kernel.cu b/paddle/phi/kernels/gpu/distribute_fpn_proposals_kernel.cu
@@ -227,7 +227,7 @@ void DistributeFpnProposalsKernel(
   size_t start = 0;
 
   std::vector<int> sub_lod_list_cpu(lod_size * num_level);
-  memory_utils::Copy(phi::CPUPlace(),
+  memory_utils::Copy(CPUPlace(),
                      sub_lod_list_cpu.data(),
                      place,
                      sub_lod_list_data,

diff --git a/paddle/phi/kernels/gpu/edit_distance_kernel.cu b/paddle/phi/kernels/gpu/edit_distance_kernel.cu
@@ -104,16 +104,8 @@ void EditDistanceKernel(const Context& dev_ctx,
   if (use_length) {
     DenseTensor hyp_length_cpu;
     DenseTensor ref_length_cpu;
-    Copy(dev_ctx,
-         *(hypslength.get_ptr()),
-         phi::CPUPlace(),
-         false,
-         &hyp_length_cpu);
-    Copy(dev_ctx,
-         *(refslength.get_ptr()),
-         phi::CPUPlace(),
-         false,
-         &ref_length_cpu);
+    Copy(dev_ctx, *(hypslength.get_ptr()), CPUPlace(), false, &hyp_length_cpu);
+    Copy(dev_ctx, *(refslength.get_ptr()), CPUPlace(), false, &ref_length_cpu);
 
     for (auto i = 0; i < batch_size; i++) {
       hyp_lod[i + 1] = hyp_lod[i] + hyp_length_cpu.data<int64_t>()[i];

diff --git a/paddle/phi/kernels/gpu/eig_grad_kernel.cu b/paddle/phi/kernels/gpu/eig_grad_kernel.cu
@@ -211,7 +211,7 @@ void SolveLinearSystemGPU<phi::dtype::complex<float>>(
   }
 
   std::vector<int> h_info(batch_count, 0);
-  phi::memory_utils::Copy(phi::CPUPlace(),
+  phi::memory_utils::Copy(CPUPlace(),
                           h_info.data(),
                           dev_ctx.GetPlace(),
                           d_info,
@@ -392,7 +392,7 @@ void SolveLinearSystemGPU<phi::dtype::complex<double>>(
   }
 
   std::vector<int> h_info(batch_count, 0);
-  phi::memory_utils::Copy(phi::CPUPlace(),
+  phi::memory_utils::Copy(CPUPlace(),
                           h_info.data(),
                           dev_ctx.GetPlace(),
                           d_info,
@@ -565,13 +565,13 @@ void SolveLinearSystemGPU<phi::dtype::complex<float>>(
   }
 
   // Check error info
-  phi::CPUPlace cpu_place;
+  CPUPlace cpu_place;
   phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
   auto* cpu_ctx = static_cast<phi::CPUContext*>(pool.Get(cpu_place));
 
   std::vector<rocblas_int> h_info(batch_count, 0);
   phi::memory_utils::Copy(
-      phi::CPUPlace(),
+      CPUPlace(),
       h_info.data(),
       dev_ctx.GetPlace(),
       d_info,
@@ -741,13 +741,13 @@ void SolveLinearSystemGPU<phi::dtype::complex<double>>(
         X_row,
         rhs_cols));  // X_row ldc = rhs_cols (row-major leading dimension)
   }
-  phi::CPUPlace cpu_place;
+  CPUPlace cpu_place;
   phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
   auto* cpu_ctx = static_cast<phi::CPUContext*>(pool.Get(cpu_place));
 
   std::vector<rocblas_int> h_info(batch_count, 0);
   phi::memory_utils::Copy(
-      phi::CPUPlace(),
+      CPUPlace(),
       h_info.data(),
       dev_ctx.GetPlace(),
       d_info,
@@ -799,7 +799,7 @@ void ComputeBackwardForComplexInputGPU(const DenseTensor& L,
   DenseTensor VhgV = phi::Matmul<T>(dev_ctx, Vh, gV_safe);
   DenseTensor diag_real = phi::Real<T>(dev_ctx, VhgV);
 
-  auto cpu_place = phi::CPUPlace();
+  auto cpu_place = CPUPlace();
   phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
   auto* cpu_ctx = static_cast<phi::CPUContext*>(pool.Get(cpu_place));
 
@@ -812,7 +812,7 @@ void ComputeBackwardForComplexInputGPU(const DenseTensor& L,
 
   DenseTensor diag_res;
   dev_ctx.template Alloc<T>(&diag_res);
-  Copy(dev_ctx, diag_res_cpu, phi::GPUPlace(), false, &diag_res);
+  Copy(dev_ctx, diag_res_cpu, GPUPlace(), false, &diag_res);
 
   DenseTensor diag_unsqueezed = phi::funcs::Unsqueeze(diag_res, -2);
 

diff --git a/paddle/phi/kernels/gpu/eig_kernel.cu b/paddle/phi/kernels/gpu/eig_kernel.cu
@@ -32,7 +32,7 @@ void EigKernel(const Context& dev_ctx,
     return;
   }
 
-  auto cpu_place = phi::CPUPlace();
+  auto cpu_place = CPUPlace();
   phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
   auto* cpu_ctx = static_cast<phi::CPUContext*>(pool.Get(cpu_place));
 
@@ -108,8 +108,8 @@ void EigKernel(const Context& dev_ctx,
   }
 
   // copy result from cpu to gpu tensor
-  Copy(dev_ctx, out_w_cpu, phi::GPUPlace(), false, out_w);
-  Copy(dev_ctx, out_v_cpu, phi::GPUPlace(), false, out_v);
+  Copy(dev_ctx, out_w_cpu, GPUPlace(), false, out_w);
+  Copy(dev_ctx, out_v_cpu, GPUPlace(), false, out_v);
 }
 
 }  // namespace phi

diff --git a/paddle/phi/kernels/gpu/generate_proposals_kernel.cu b/paddle/phi/kernels/gpu/generate_proposals_kernel.cu
@@ -522,7 +522,7 @@ void GenerateProposalsKernel(const Context &dev_ctx,
   T *rpn_roi_probs_data = rpn_roi_probs->data<T>();
 
   auto place = dev_ctx.GetPlace();
-  auto cpu_place = phi::CPUPlace();
+  auto cpu_place = CPUPlace();
 
   int64_t num_proposals = 0;
   std::vector<size_t> offset(1, 0);

diff --git a/paddle/phi/kernels/gpu/global_gather_kernel.cu b/paddle/phi/kernels/gpu/global_gather_kernel.cu
@@ -63,7 +63,7 @@ struct GlobalGatherFunctor<phi::GPUContext, T> {
       cpu_local_count_data = local_count->data<int64_t>();
       local_count_len = local_count->numel();
     } else {
-      Copy(dev_ctx, *local_count, phi::CPUPlace(), true, &cpu_local_count);
+      Copy(dev_ctx, *local_count, CPUPlace(), true, &cpu_local_count);
       cpu_local_count_data = cpu_local_count.data<int64_t>();
       local_count_len = cpu_local_count.numel();
     }
@@ -72,7 +72,7 @@ struct GlobalGatherFunctor<phi::GPUContext, T> {
     if (global_count->place().GetType() == AllocationType::CPU) {
       cpu_global_count_data = global_count->data<int64_t>();
     } else {
-      Copy(dev_ctx, *global_count, phi::CPUPlace(), true, &cpu_global_count);
+      Copy(dev_ctx, *global_count, CPUPlace(), true, &cpu_global_count);
       cpu_global_count_data = cpu_global_count.data<int64_t>();
     }
 

diff --git a/paddle/phi/kernels/gpu/global_scatter_kernel.cu b/paddle/phi/kernels/gpu/global_scatter_kernel.cu
@@ -61,7 +61,7 @@ struct GlobalScatterFunctor<phi::GPUContext, T> {
     if (local_count->place().GetType() == AllocationType::CPU) {
       cpu_local_count_data = local_count->data<int64_t>();
     } else {
-      Copy(dev_ctx, *local_count, phi::CPUPlace(), true, &cpu_local_count);
+      Copy(dev_ctx, *local_count, CPUPlace(), true, &cpu_local_count);
       cpu_local_count_data = cpu_local_count.data<int64_t>();
     }
     auto global_count_len = 0;
@@ -70,7 +70,7 @@ struct GlobalScatterFunctor<phi::GPUContext, T> {
       cpu_global_count_data = global_count->data<int64_t>();
       global_count_len = global_count->numel();
     } else {
-      Copy(dev_ctx, *global_count, phi::CPUPlace(), true, &cpu_global_count);
+      Copy(dev_ctx, *global_count, CPUPlace(), true, &cpu_global_count);
       cpu_global_count_data = cpu_global_count.data<int64_t>();
       global_count_len = cpu_global_count.numel();
     }