diff --git a/paddle/fluid/distributed/collective/process_group_bkcl.cc b/paddle/fluid/distributed/collective/process_group_bkcl.cc
index 0c505e2da670f9..d799dee8a4144c 100644
--- a/paddle/fluid/distributed/collective/process_group_bkcl.cc
+++ b/paddle/fluid/distributed/collective/process_group_bkcl.cc
@@ -499,7 +499,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::AllToAll(
   phi::XPUPlace place = in_tensor.place();
 #if defined(PADDLE_WITH_FLAGCX)
   auto allocator_cpu = std::unique_ptr<phi::Allocator>(
-      new paddle::experimental::DefaultAllocator(phi::CPUPlace()));
+      new paddle::experimental::DefaultAllocator(CPUPlace()));
 #endif
   auto allocator = std::unique_ptr<phi::Allocator>(
       new paddle::experimental::DefaultAllocator(place));
@@ -517,49 +517,49 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::AllToAll(
 #endif
 #if defined(PADDLE_WITH_FLAGCX)
-  memory::Copy(phi::CPUPlace(),
+  memory::Copy(CPUPlace(),
                in_size_tensor.data(),
-               phi::CPUPlace(),
+               CPUPlace(),
                in_numel_vec.data(),
                in_size_tensor.numel() * sizeof(int64_t));
-  memory::Copy(phi::CPUPlace(),
+  memory::Copy(CPUPlace(),
                in_offset_tensor.data(),
-               phi::CPUPlace(),
+               CPUPlace(),
                in_offset_vec.data(),
                in_offset_tensor.numel() * sizeof(int64_t));
-  memory::Copy(phi::CPUPlace(),
+  memory::Copy(CPUPlace(),
                out_size_tensor.data(),
-               phi::CPUPlace(),
+               CPUPlace(),
                out_numel_vec.data(),
                out_size_tensor.numel() * sizeof(int64_t));
-  memory::Copy(phi::CPUPlace(),
+  memory::Copy(CPUPlace(),
                out_offset_tensor.data(),
-               phi::CPUPlace(),
+               CPUPlace(),
                out_offset_vec.data(),
                out_offset_tensor.numel() * sizeof(int64_t));
 #else
   memory::Copy(place,
                in_size_tensor.data(),
-               phi::CPUPlace(),
+               CPUPlace(),
                in_numel_vec.data(),
                in_size_tensor.numel() * sizeof(int64_t));
   memory::Copy(place,
                in_offset_tensor.data(),
-               phi::CPUPlace(),
+               CPUPlace(),
                in_offset_vec.data(),
                in_offset_tensor.numel() * sizeof(int64_t));
   memory::Copy(place,
                out_size_tensor.data(),
-               phi::CPUPlace(),
+               CPUPlace(),
                out_numel_vec.data(),
                out_size_tensor.numel() * sizeof(int64_t));
   memory::Copy(place,
                out_offset_tensor.data(),
-               phi::CPUPlace(),
+               CPUPlace(),
                out_offset_vec.data(),
                out_offset_tensor.numel() * sizeof(int64_t));
 #endif
@@ -679,7 +679,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::AllToAll(
   phi::XPUPlace place = in_tensors[0].place();
 #if defined(PADDLE_WITH_FLAGCX)
   auto allocator_cpu = std::unique_ptr<phi::Allocator>(
-      new paddle::experimental::DefaultAllocator(phi::CPUPlace()));
+      new paddle::experimental::DefaultAllocator(CPUPlace()));
 #endif
   auto allocator = std::unique_ptr<phi::Allocator>(
       new paddle::experimental::DefaultAllocator(place));
@@ -713,51 +713,51 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::AllToAll(
                               &concated_in_tensor);
   }
 #if defined(PADDLE_WITH_FLAGCX)
-  memory::Copy(phi::CPUPlace(),
+  memory::Copy(CPUPlace(),
                in_size_tensor.data(),
-               phi::CPUPlace(),
+               CPUPlace(),
                in_numel_vec.data(),
                in_size_tensor.numel() * sizeof(int64_t));
-  memory::Copy(phi::CPUPlace(),
+  memory::Copy(CPUPlace(),
                in_offset_tensor.data(),
-               phi::CPUPlace(),
+               CPUPlace(),
                in_offset_vec.data(),
                in_offset_tensor.numel() * sizeof(int64_t));
-  memory::Copy(phi::CPUPlace(),
+  memory::Copy(CPUPlace(),
                out_size_tensor.data(),
-               phi::CPUPlace(),
+               CPUPlace(),
                out_numel_vec.data(),
                out_size_tensor.numel() * sizeof(int64_t));
-  memory::Copy(phi::CPUPlace(),
+  memory::Copy(CPUPlace(),
                out_offset_tensor.data(),
-               phi::CPUPlace(),
+               CPUPlace(),
                out_offset_vec.data(),
                out_offset_tensor.numel() * sizeof(int64_t));
 #else
   memory::Copy(place,
                in_size_tensor.data(),
-               phi::CPUPlace(),
+               CPUPlace(),
                in_numel_vec.data(),
                in_size_tensor.numel() * sizeof(int64_t));
   memory::Copy(place,
                in_offset_tensor.data(),
-               phi::CPUPlace(),
+               CPUPlace(),
                in_offset_vec.data(),
                in_offset_tensor.numel() * sizeof(int64_t));
   memory::Copy(place,
                out_size_tensor.data(),
-               phi::CPUPlace(),
+               CPUPlace(),
                out_numel_vec.data(),
                out_size_tensor.numel() * sizeof(int64_t));
   memory::Copy(place,
                out_offset_tensor.data(),
-               phi::CPUPlace(),
+               CPUPlace(),
                out_offset_vec.data(),
                out_offset_tensor.numel() * sizeof(int64_t));
 #endif
diff --git a/paddle/fluid/distributed/collective/process_group_flagcx.cc b/paddle/fluid/distributed/collective/process_group_flagcx.cc
index 81ea61549d9d54..87f3a14d0116b7 100644
--- a/paddle/fluid/distributed/collective/process_group_flagcx.cc
+++ b/paddle/fluid/distributed/collective/process_group_flagcx.cc
@@ -427,7 +427,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupFlagcx::Barrier(
       0,
       common::errors::PreconditionNotMet(
           "The barrier device id must greater or equal than 0."));
-  phi::GPUPlace place(opts.device_id);
+  GPUPlace place(opts.device_id);
   auto allocator = std::unique_ptr<phi::Allocator>(
       new paddle::experimental::DefaultAllocator(place));
   phi::DenseTensorMeta meta(phi::DataType::FLOAT32, phi::DDim{1});
@@ -816,7 +816,7 @@ void ProcessGroupFlagcx::SyncCalcStream(const Place& place,
 void ProcessGroupFlagcx::EagerConnect() {
   const auto deviceId = phi::backends::gpu::GetCurrentDeviceId();
-  const auto& place = phi::GPUPlace(deviceId);
+  const auto& place = GPUPlace(deviceId);
   const auto key = GetKeyFromPlace(place);
   platform::CUDADeviceGuard cuda_guard(place);
@@ -831,7 +831,7 @@ void ProcessGroupFlagcx::EagerConnect() {
 void ProcessGroupFlagcx::EagerConnectRingExchange() {
   std::vector<std::pair<int, int>> peers;
-  const auto& place = phi::GPUPlace(phi::backends::gpu::GetCurrentDeviceId());
+  const auto& place = GPUPlace(phi::backends::gpu::GetCurrentDeviceId());
   for (int rank = 0; rank < size_; rank++) {
     auto peer_rank = rank + 1 >= size_ ? 0 : rank + 1;
diff --git a/paddle/fluid/distributed/collective/process_group_nccl.cc b/paddle/fluid/distributed/collective/process_group_nccl.cc
index ddabc9a495f49b..6ef4029977b6ee 100644
--- a/paddle/fluid/distributed/collective/process_group_nccl.cc
+++ b/paddle/fluid/distributed/collective/process_group_nccl.cc
@@ -488,7 +488,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Barrier(
       0,
       common::errors::PreconditionNotMet(
          "The barrier device id must greater or equal than 0."));
-  phi::GPUPlace place(opts.device_id);
+  GPUPlace place(opts.device_id);
   auto allocator = std::unique_ptr<phi::Allocator>(
       new paddle::experimental::DefaultAllocator(place));
   phi::DenseTensorMeta meta(phi::DataType::FLOAT32, phi::DDim{1});
@@ -923,18 +923,18 @@ void ProcessGroupNCCL::CreateNCCLEnvCache(
   // gather global ranks in current group
   size_t gpu_global_rank_size = sizeof(int);
   auto gpu_global_rank = phi::memory_utils::Alloc(
-      phi::GPUPlace(phi::backends::gpu::GetCurrentDeviceId()),
+      GPUPlace(phi::backends::gpu::GetCurrentDeviceId()),
       gpu_global_rank_size);
-  phi::memory_utils::Copy(phi::GPUPlace(),
+  phi::memory_utils::Copy(GPUPlace(),
                           gpu_global_rank->ptr(),
-                          phi::CPUPlace(),
+                          CPUPlace(),
                           &global_rank_,
                           gpu_global_rank_size);
   size_t gpu_global_ranks_size = num_ranks * sizeof(int);
   auto gpu_global_ranks = phi::memory_utils::Alloc(
-      phi::GPUPlace(phi::backends::gpu::GetCurrentDeviceId()),
+      GPUPlace(phi::backends::gpu::GetCurrentDeviceId()),
       gpu_global_ranks_size);
   NCCL_CHECK(phi::dynload::ncclAllGather(gpu_global_rank->ptr(),
@@ -945,9 +945,9 @@ void ProcessGroupNCCL::CreateNCCLEnvCache(
                                          comm_ctx->stream()));
   std::vector<int> global_ranks(num_ranks);
-  phi::memory_utils::Copy(phi::CPUPlace(),
+  phi::memory_utils::Copy(CPUPlace(),
                           global_ranks.data(),
-                          phi::GPUPlace(),
+                          GPUPlace(),
                           gpu_global_ranks->ptr(),
                           gpu_global_ranks_size);
@@ -1032,7 +1032,7 @@ void ProcessGroupNCCL::SyncCalcStream(const Place& place,
 void ProcessGroupNCCL::EagerConnect() {
   const auto deviceId = phi::backends::gpu::GetCurrentDeviceId();
-  const auto& place = phi::GPUPlace(deviceId);
+  const auto& place = GPUPlace(deviceId);
   const auto key = GetKeyFromPlace(place);
   platform::CUDADeviceGuard cuda_guard(place);
@@ -1049,7 +1049,7 @@ void ProcessGroupNCCL::EagerConnect() {
 void ProcessGroupNCCL::EagerConnectRingExchange(
     std::shared_ptr nccl_config_ptr) {
   std::vector<std::pair<int, int>> peers;
-  const auto& place = phi::GPUPlace(phi::backends::gpu::GetCurrentDeviceId());
+  const auto& place = GPUPlace(phi::backends::gpu::GetCurrentDeviceId());
   for (int rank = 0; rank < size_; rank++) {
     auto peer_rank = rank + 1 >= size_ ? 0 : rank + 1;
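
Note on why the rename compiles (not part of the diff): dropping the phi:: qualifier only works if unqualified CPUPlace/GPUPlace names are already visible in these translation units, presumably because the place header re-exports the phi types into the paddle namespace that encloses paddle::distributed. The standalone sketch below uses mock types and a hypothetical Barrier helper to illustrate that name lookup; it is an assumption about the surrounding headers, not code from this PR.

// Minimal sketch with mock types (hypothetical, not the Paddle sources).
#include <iostream>

namespace phi {
struct CPUPlace {};
struct GPUPlace {
  explicit GPUPlace(int id = 0) : device(id) {}
  int device;
};
}  // namespace phi

namespace paddle {
// Assumed re-export of the phi place types into the paddle namespace,
// which is what makes the unqualified spelling in the diff resolve.
using CPUPlace = phi::CPUPlace;
using GPUPlace = phi::GPUPlace;

namespace distributed {
void Barrier(int device_id) {
  // Unqualified name: found in the enclosing paddle namespace via the alias,
  // so this is the same phi::GPUPlace as before, only spelled shorter.
  GPUPlace place(device_id);
  std::cout << "barrier on device " << place.device << "\n";
}
}  // namespace distributed
}  // namespace paddle

int main() { paddle::distributed::Barrier(0); }

If the aliases exist, the diff is a pure spelling change with no behavioral effect.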
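
A second standalone sketch, with a mock memory::Copy, shows the argument order the hunks above rely on: destination place, destination pointer, source place, source pointer, byte count. Under PADDLE_WITH_FLAGCX the size/offset staging tensors are allocated on the host, so both places are CPUPlace(); in the non-FLAGCX branch the destination is the XPU place and only the source stays on the CPU. The mock only implements the host-to-host flavour and is not the Paddle helper.

// Minimal sketch (mock Copy, hypothetical host-only dispatch).
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

struct CPUPlace {};

namespace memory {
// Host-to-host flavour only; the real helper dispatches on the place types.
inline void Copy(CPUPlace, void* dst, CPUPlace, const void* src, std::size_t num) {
  std::memcpy(dst, src, num);
}
}  // namespace memory

int main() {
  std::vector<int64_t> in_numel_vec = {2, 3, 5};            // per-rank counts
  std::vector<int64_t> in_size_host(in_numel_vec.size());   // stand-in for in_size_tensor
  memory::Copy(CPUPlace(),                                  // destination place
               in_size_host.data(),                         // destination pointer
               CPUPlace(),                                  // source place
               in_numel_vec.data(),                         // source pointer
               in_size_host.size() * sizeof(int64_t));      // byte count
  std::cout << in_size_host[0] << " " << in_size_host[2] << "\n";  // prints "2 5"
}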