PaddlePaddle · luotao1 · Jan 7, 2026 · Jan 6, 2026
diff --git a/paddle/phi/kernels/xpu/amp_kernel.cc b/paddle/phi/kernels/xpu/amp_kernel.cc
@@ -54,7 +54,7 @@ void UpdateLossScalingKernel(const Context& dev_ctx,
   const bool* found_inf_data = found_infinite.data<bool>();
   bool cpu_found_inf_data = false;
   if (found_infinite.place().GetType() == AllocationType::XPU) {
-    memory_utils::Copy(phi::CPUPlace(),
+    memory_utils::Copy(CPUPlace(),
                        static_cast<void*>(&cpu_found_inf_data),
                        found_infinite.place(),
                        static_cast<const void*>(found_inf_data),
@@ -94,7 +94,7 @@ void UpdateLossScalingKernel(const Context& dev_ctx,
   int cpu_good_in_data;
   MPDType cpu_pre_loss_scaling_data;
   if (in_bad_steps.place().GetType() == AllocationType::XPU) {
-    memory_utils::Copy(phi::CPUPlace(),
+    memory_utils::Copy(CPUPlace(),
                        static_cast<void*>(&cpu_bad_in_data),
                        in_bad_steps.place(),
                        static_cast<const void*>(bad_in_data),
@@ -104,7 +104,7 @@ void UpdateLossScalingKernel(const Context& dev_ctx,
   }
 
   if (in_good_steps.place().GetType() == AllocationType::XPU) {
-    memory_utils::Copy(phi::CPUPlace(),
+    memory_utils::Copy(CPUPlace(),
                        static_cast<void*>(&cpu_good_in_data),
                        in_good_steps.place(),
                        static_cast<const void*>(good_in_data),
@@ -114,7 +114,7 @@ void UpdateLossScalingKernel(const Context& dev_ctx,
   }
 
   if (prev_loss_scaling.place().GetType() == AllocationType::XPU) {
-    memory_utils::Copy(phi::CPUPlace(),
+    memory_utils::Copy(CPUPlace(),
                        static_cast<void*>(&cpu_pre_loss_scaling_data),
                        prev_loss_scaling.place(),
                        static_cast<const void*>(pre_loss_scaling_data),
@@ -151,17 +151,17 @@ void UpdateLossScalingKernel(const Context& dev_ctx,
   // copy to device
   memory_utils::Copy(dev_ctx.GetPlace(),
                      bad_out_data,
-                     phi::CPUPlace(),
+                     CPUPlace(),
                      &cpu_bad_out_data,
                      sizeof(int));
   memory_utils::Copy(dev_ctx.GetPlace(),
                      good_out_data,
-                     phi::CPUPlace(),
+                     CPUPlace(),
                      &cpu_good_out_data,
                      sizeof(int));
   memory_utils::Copy(dev_ctx.GetPlace(),
                      updated_loss_scaling_data,
-                     phi::CPUPlace(),
+                     CPUPlace(),
                      &cpu_updated_loss_scaling_data,
                      sizeof(MPDType));
 }
@@ -186,7 +186,7 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
   bool has_inf_nans = false;
   MPDType cpu_scale_data;
   if (scale.place().GetType() == AllocationType::XPU) {
-    memory_utils::Copy(phi::CPUPlace(),
+    memory_utils::Copy(CPUPlace(),
                        static_cast<void*>(&cpu_scale_data),
                        scale.place(),
                        static_cast<const void*>(scale_data),
@@ -221,7 +221,7 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
           inf_nan_check_ptr + i);
       PADDLE_ENFORCE_XDNN_SUCCESS(r, "check_finite_unscale");
     }
-    memory_utils::Copy(phi::CPUPlace(),
+    memory_utils::Copy(CPUPlace(),
                        cpu_found_tensor.data<bool>(),
                        dev_ctx.GetPlace(),
                        inf_nan_check.data<bool>(),
@@ -249,7 +249,7 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
             inf_nan_check.data<bool>(),
             x->numel());
         PADDLE_ENFORCE_XDNN_SUCCESS(r, "check_nan_or_inf");
-        memory_utils::Copy(phi::CPUPlace(),
+        memory_utils::Copy(CPUPlace(),
                            &has_inf_nans,
                            dev_ctx.GetPlace(),
                            inf_nan_check.data<bool>(),
@@ -303,7 +303,7 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
   }
   memory_utils::Copy(dev_ctx.GetPlace(),
                      found_inf_data,
-                     phi::CPUPlace(),
+                     CPUPlace(),
                      &cpu_found_inf_data,
                      sizeof(bool));
 }

diff --git a/paddle/phi/kernels/xpu/distribute_fpn_proposals_kernel.cc b/paddle/phi/kernels/xpu/distribute_fpn_proposals_kernel.cc
@@ -26,7 +26,7 @@ static void Sort(const XPUContext& dev_ctx,
                  DenseTensor* index_out) {
   auto* value_data = value.data<T>();
   auto place = dev_ctx.GetPlace();
-  auto cpu_place = phi::CPUPlace();
+  auto cpu_place = CPUPlace();
 
   DenseTensor scores_slice_cpu;
   scores_slice_cpu.Resize({value.numel()});

diff --git a/paddle/phi/kernels/xpu/dropout_kernel.cc b/paddle/phi/kernels/xpu/dropout_kernel.cc
@@ -49,7 +49,7 @@ void DropoutRawKernel(const Context& dev_ctx,
     int seed_data = 0;
     if (seed_tensor.get_ptr() != nullptr) {
       if ((seed_tensor->place()).GetType() == AllocationType::XPU) {
-        memory_utils::Copy(phi::CPUPlace(),
+        memory_utils::Copy(CPUPlace(),
                            &seed_data,
                            seed_tensor->place(),
                            seed_tensor->data<int>(),

diff --git a/paddle/phi/kernels/xpu/eig_kernel.cc b/paddle/phi/kernels/xpu/eig_kernel.cc
@@ -32,7 +32,7 @@ void EigKernel(const Context& dev_ctx,
     return;
   }
 
-  auto cpu_place = phi::CPUPlace();
+  auto cpu_place = CPUPlace();
   phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
   auto* cpu_ctx = static_cast<phi::CPUContext*>(pool.Get(cpu_place));
 

diff --git a/paddle/phi/kernels/xpu/flash_attn_utils.h b/paddle/phi/kernels/xpu/flash_attn_utils.h
@@ -80,7 +80,7 @@ static void GenerateRNGState(
     const int64_t num_heads) {
   if (fixed_seed_offset.get_ptr()) {
     if ((fixed_seed_offset->place()).GetType() == AllocationType::XPU) {
-      memory_utils::Copy(phi::CPUPlace(),
+      memory_utils::Copy(CPUPlace(),
                          seed_offset_data,
                          fixed_seed_offset->place(),
                          fixed_seed_offset->data<int64_t>(),

diff --git a/paddle/phi/kernels/xpu/generate_proposals_kernel.cc b/paddle/phi/kernels/xpu/generate_proposals_kernel.cc
@@ -31,7 +31,7 @@ static void SortDescending(const XPUContext& dev_ctx,
                            int pre_nms_top_n) {
   auto* value_data = value.data<T>();
   auto place = dev_ctx.GetPlace();
-  auto cpu_place = phi::CPUPlace();
+  auto cpu_place = CPUPlace();
 
   DenseTensor scores_slice_cpu;
   scores_slice_cpu.Resize({value.numel()});
@@ -180,11 +180,8 @@ std::pair<DenseTensor, DenseTensor> ProposalForOneImage(
 
   int keep_num;
   const auto xpu_place = dev_ctx.GetPlace();
-  memory_utils::Copy(phi::CPUPlace(),
-                     &keep_num,
-                     xpu_place,
-                     keep_num_t.data<int>(),
-                     sizeof(int));
+  memory_utils::Copy(
+      CPUPlace(), &keep_num, xpu_place, keep_num_t.data<int>(), sizeof(int));
   keep_index.Resize({keep_num});
 
   DenseTensor scores_filter, proposals_filter;
@@ -354,7 +351,7 @@ void GenerateProposalsKernel(const Context& dev_ctx,
   tmp_variances.Resize(common::make_ddim({tmp_variances.numel() / 4, 4}));
 
   auto place = dev_ctx.GetPlace();
-  auto cpu_place = phi::CPUPlace();
+  auto cpu_place = CPUPlace();
 
   int num_proposals = 0;
   std::vector<size_t> offset(1, 0);

diff --git a/paddle/phi/kernels/xpu/increment_kernel.cc b/paddle/phi/kernels/xpu/increment_kernel.cc
@@ -44,7 +44,7 @@ void IncrementKernel(const Context& dev_ctx,
   T* value_xpu = RAII_GUARD.alloc_l3_or_gm<T>(1);
   memory_utils::Copy(dev_ctx.GetPlace(),
                      value_xpu,
-                     phi::CPUPlace(),
+                     CPUPlace(),
                      reinterpret_cast<void*>(&value_as_t),
                      sizeof(T));
 

diff --git a/paddle/phi/kernels/xpu/lamb_kernel.cc b/paddle/phi/kernels/xpu/lamb_kernel.cc
@@ -62,7 +62,7 @@ void LambKernel(const Context& dev_ctx,
       cpu_skip_update = *(skip_update->data<bool>());
     } else {
       const bool* skip_update_flag = skip_update->data<bool>();
-      memory_utils::Copy(phi::CPUPlace(),
+      memory_utils::Copy(CPUPlace(),
                          static_cast<void*>(&cpu_skip_update),
                          dev_ctx.GetPlace(),
                          static_cast<const void*>(skip_update_flag),

diff --git a/paddle/phi/kernels/xpu/masked_select_kernel.cc b/paddle/phi/kernels/xpu/masked_select_kernel.cc
@@ -58,7 +58,7 @@ void MaskedSelectKernel(const Context& dev_ctx,
       xpu::nonzero_count(
           dev_ctx.x_context(), mask_data, out_size, mask.numel()),
       "nonzero_count ");
-  memory_utils::Copy(phi::CPUPlace(),
+  memory_utils::Copy(CPUPlace(),
                      static_cast<void*>(&out_size_cpu),
                      mask.place(),
                      static_cast<void*>(out_size),

diff --git a/paddle/phi/kernels/xpu/mean_all_grad_kernel.cc b/paddle/phi/kernels/xpu/mean_all_grad_kernel.cc
@@ -54,7 +54,7 @@ void MeanAllGradKernel(const Context& dev_ctx,
   const T* dy = OG->data<T>();
   T dy0_value;
   xpu_wait(dev_ctx.x_context()->xpu_stream);
-  memory_utils::Copy(phi::CPUPlace(), &dy0_value, OG->place(), dy, sizeof(T));
+  memory_utils::Copy(CPUPlace(), &dy0_value, OG->place(), dy, sizeof(T));
   float dy0_fp32 = static_cast<float>(dy0_value);
   dy0_fp32 = dy0_fp32 / static_cast<float>(IG->numel());
 

diff --git a/paddle/phi/kernels/xpu/multiclass_nms3_kernel.cc b/paddle/phi/kernels/xpu/multiclass_nms3_kernel.cc
@@ -157,7 +157,7 @@ void MultiClassNMSKernel(const Context& dev_ctx,
       std::vector<T> temp_value(out_dim, 0.0f);
       memory_utils::Copy(dev_ctx.GetPlace(),
                          out_ptr,
-                         phi::CPUPlace(),
+                         CPUPlace(),
                          temp_value.data(),
                          1 * out_dim * sizeof(T));
 
@@ -167,7 +167,7 @@ void MultiClassNMSKernel(const Context& dev_ctx,
       std::vector<int> temp_idx(1, 0);
       memory_utils::Copy(dev_ctx.GetPlace(),
                          out_index_ptr,
-                         phi::CPUPlace(),
+                         CPUPlace(),
                          temp_idx.data(),
                          1 * sizeof(int));
     } else {
@@ -182,7 +182,7 @@ void MultiClassNMSKernel(const Context& dev_ctx,
     T* out_ptr = out->template data<T>();
     memory_utils::Copy(dev_ctx.GetPlace(),
                        out_ptr,
-                       phi::CPUPlace(),
+                       CPUPlace(),
                        outs_vec_.data(),
                        num_kept * out_dim * sizeof(T));
     if (return_index) {
@@ -191,7 +191,7 @@ void MultiClassNMSKernel(const Context& dev_ctx,
       int* out_index_ptr = index->template data<int>();
       memory_utils::Copy(dev_ctx.GetPlace(),
                          out_index_ptr,
-                         phi::CPUPlace(),
+                         CPUPlace(),
                          out_index_vec_.data(),
                          num_kept * sizeof(int));
     }

diff --git a/paddle/phi/kernels/xpu/nonzero_kernel.cc b/paddle/phi/kernels/xpu/nonzero_kernel.cc
@@ -47,7 +47,7 @@ void NonZeroKernel(const Context& dev_ctx,
   PADDLE_ENFORCE_XDNN_SUCCESS(ret, "nonzero_count");
 
   int64_t true_num_cpu;
-  memory_utils::Copy(phi::CPUPlace(),
+  memory_utils::Copy(CPUPlace(),
                      static_cast<void*>(&true_num_cpu),
                      dev_ctx.GetPlace(),
                      static_cast<void*>(true_num),

diff --git a/paddle/phi/kernels/xpu/randint_kernel.cc b/paddle/phi/kernels/xpu/randint_kernel.cc
@@ -49,7 +49,7 @@ void RandintKernel(const Context& dev_ctx,
   }
   memory_utils::Copy(dev_ctx.GetPlace(),
                      data,
-                     phi::CPUPlace(),
+                     CPUPlace(),
                      reinterpret_cast<void*>(data_cpu.get()),
                      size * sizeof(T));
 }

diff --git a/paddle/phi/kernels/xpu/roi_align_grad_kernel.cc b/paddle/phi/kernels/xpu/roi_align_grad_kernel.cc
@@ -49,7 +49,7 @@ void RoiAlignGradKernel(const Context& dev_ctx,
   }
   DenseTensor roi_batch_id_list;
   roi_batch_id_list.Resize({rois_num});
-  auto cplace = phi::CPUPlace();
+  auto cplace = CPUPlace();
   auto xplace = dev_ctx.GetPlace();
 
   int rois_batch_size = 0;

diff --git a/paddle/phi/kernels/xpu/roi_align_kernel.cc b/paddle/phi/kernels/xpu/roi_align_kernel.cc
@@ -48,7 +48,7 @@ void RoiAlignKernel(const Context& dev_ctx,
 
   DenseTensor roi_batch_id_list;
   roi_batch_id_list.Resize({rois_num});
-  auto cplace = phi::CPUPlace();
+  auto cplace = CPUPlace();
   int* roi_batch_id_data = dev_ctx.template HostAlloc<int>(&roi_batch_id_list);
   auto xplace = dev_ctx.GetPlace();
   int rois_batch_size = 0;

diff --git a/paddle/phi/kernels/xpu/scatter_kernel.cc b/paddle/phi/kernels/xpu/scatter_kernel.cc
@@ -91,7 +91,7 @@ void ScatterKernel(const Context &dev_ctx,
   int64_t dim1 = common::product(common::slice_ddim(x_dims, 1, x_dims.size()));
 
   DenseTensor indices_cpu(index.type());
-  phi::Copy(dev_ctx, index, phi::CPUPlace(), true, &indices_cpu);
+  phi::Copy(dev_ctx, index, CPUPlace(), true, &indices_cpu);
 
   int r = 0;
   if (index_type == phi::DataType::INT32) {

diff --git a/paddle/phi/kernels/xpu/set_value_kernel.cc b/paddle/phi/kernels/xpu/set_value_kernel.cc
@@ -412,7 +412,7 @@ void SetValueKernel(const Context& dev_ctx,
       reinterpret_cast<T*>(RAII_GUARD.alloc_l3_or_gm<XPUType>(values_size));
   memory_utils::Copy(dev_ctx.GetPlace(),
                      value_data,
-                     phi::CPUPlace(),
+                     CPUPlace(),
                      value_data_uint8_cpu,
                      values_length);
   auto value_dims = common::make_ddim(shape);

diff --git a/paddle/phi/kernels/xpu/truncated_gaussian_random_kernel.cc b/paddle/phi/kernels/xpu/truncated_gaussian_random_kernel.cc
@@ -56,7 +56,7 @@ void TruncatedGaussianRandomKernel(const Context& dev_ctx,
 
   memory_utils::Copy(dev_ctx.GetPlace(),
                      data,
-                     phi::CPUPlace(),
+                     CPUPlace(),
                      reinterpret_cast<void*>(data_cpu.get()),
                      size * sizeof(T));
 }

diff --git a/paddle/phi/kernels/xpu/uniform_inplace_grad_kernel.cc b/paddle/phi/kernels/xpu/uniform_inplace_grad_kernel.cc
@@ -37,7 +37,7 @@ void XPUUniformRandomInplaceGradKernel(const Context& dev_ctx,
     }
     phi::memory_utils::Copy(dev_ctx.GetPlace(),
                             data,
-                            phi::CPUPlace(),
+                            CPUPlace(),
                             reinterpret_cast<void*>(data_cpu.get()),
                             size * sizeof(T));
   }

diff --git a/paddle/phi/kernels/xpu/uniform_inplace_kernel.cc b/paddle/phi/kernels/xpu/uniform_inplace_kernel.cc
@@ -60,7 +60,7 @@ void XPUUniformRandomInplaceKernel(const Context& dev_ctx,
   }
   phi::memory_utils::Copy(dev_ctx.GetPlace(),
                           data,
-                          phi::CPUPlace(),
+                          CPUPlace(),
                           reinterpret_cast<void*>(data_cpu.get()),
                           size * sizeof(T));
 }

diff --git a/paddle/phi/kernels/xpu/unique_kernel.cc b/paddle/phi/kernels/xpu/unique_kernel.cc
@@ -57,7 +57,7 @@ void XPUFlattenUniqueKernelImpl(const Context& dev_ctx,
         nullptr,
         false);
     PADDLE_ENFORCE_XDNN_SUCCESS(r, "unique_count");
-    memory_utils::Copy(phi::CPUPlace(),
+    memory_utils::Copy(CPUPlace(),
                        &unique_len_cpu,
                        dev_ctx.GetPlace(),
                        unique_len_xpu,
@@ -218,7 +218,7 @@ void XPUDimUniqueKernelImpl(const Context& dev_ctx,
   std::vector<IndexT> inverse_cpu(axis_len);
   std::vector<IndexT> counts_cpu;
   std::vector<IndexT> ori_idx_cpu(axis_len);
-  memory_utils::Copy(phi::CPUPlace(),
+  memory_utils::Copy(CPUPlace(),
                      ori_idx_cpu.data(),
                      dev_ctx.GetPlace(),
                      ori_idx_xpu,
@@ -241,7 +241,7 @@ void XPUDimUniqueKernelImpl(const Context& dev_ctx,
                               {1});
     PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_all");
 
-    memory_utils::Copy(phi::CPUPlace(),
+    memory_utils::Copy(CPUPlace(),
                        adj_identical_cpu_data,
                        dev_ctx.GetPlace(),
                        adj_identical_xpu,
@@ -271,7 +271,7 @@ void XPUDimUniqueKernelImpl(const Context& dev_ctx,
       RAII_GUARD.alloc_l3_or_gm<XPUType>(unique_len * slice_size);
   memory_utils::Copy(dev_ctx.GetPlace(),
                      unique_axis_idx_xpu,
-                     phi::CPUPlace(),
+                     CPUPlace(),
                      unique_axis.data(),
                      unique_len * sizeof(IndexT));
   r = xpu::paddle_gather<XPUType, IndexT>(dev_ctx.x_context(),
@@ -304,7 +304,7 @@ void XPUDimUniqueKernelImpl(const Context& dev_ctx,
     auto* indices_data = dev_ctx.template Alloc<IndexT>(indices);
     memory_utils::Copy(dev_ctx.GetPlace(),
                        indices_data,
-                       phi::CPUPlace(),
+                       CPUPlace(),
                        indices_cpu.data(),
                        sizeof(IndexT) * unique_len);
   }
@@ -314,7 +314,7 @@ void XPUDimUniqueKernelImpl(const Context& dev_ctx,
     auto* reverse_data = dev_ctx.template Alloc<IndexT>(index);
     memory_utils::Copy(dev_ctx.GetPlace(),
                        reverse_data,
-                       phi::CPUPlace(),
+                       CPUPlace(),
                        inverse_cpu.data(),
                        sizeof(IndexT) * axis_len);
   }
@@ -324,7 +324,7 @@ void XPUDimUniqueKernelImpl(const Context& dev_ctx,
     auto* counts_data = dev_ctx.template Alloc<IndexT>(counts);
     memory_utils::Copy(dev_ctx.GetPlace(),
                        counts_data,
-                       phi::CPUPlace(),
+                       CPUPlace(),
                        counts_cpu.data(),
                        sizeof(IndexT) * unique_len);
   }