paddle/fluid/distributed/collective/process_group_bkcl.cc (52 changes: 26 additions & 26 deletions)
@@ -499,7 +499,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::AllToAll(
 phi::XPUPlace place = in_tensor.place();
 #if defined(PADDLE_WITH_FLAGCX)
 auto allocator_cpu = std::unique_ptr<phi::Allocator>(
-new paddle::experimental::DefaultAllocator(phi::CPUPlace()));
+new paddle::experimental::DefaultAllocator(CPUPlace()));
 #endif
 auto allocator = std::unique_ptr<phi::Allocator>(
 new paddle::experimental::DefaultAllocator(place));
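Note on why the shorter spelling compiles: inside `paddle::distributed`, an unqualified `CPUPlace` / `GPUPlace` can only resolve if an enclosing namespace already exposes the `phi` place types under those names. The sketch below is an assumption about that mechanism (the aliases themselves would come from Paddle's place header, not from this PR), shown only to illustrate the lookup rule:

```cpp
// Illustrative only: unqualified name lookup walks the enclosing namespaces,
// so an alias in namespace paddle makes CPUPlace()/GPUPlace() valid here.
namespace phi {
struct CPUPlace {};
struct GPUPlace {
  explicit GPUPlace(int device_id = 0) : id(device_id) {}
  int id;
};
}  // namespace phi

namespace paddle {
using CPUPlace = ::phi::CPUPlace;  // assumed alias, mirroring Paddle's place header
using GPUPlace = ::phi::GPUPlace;  // assumed alias
namespace distributed {
inline void AliasLookupExample() {
  CPUPlace cpu;     // resolves via paddle::CPUPlace
  GPUPlace gpu(0);  // resolves via paddle::GPUPlace
  (void)cpu;
  (void)gpu;
}
}  // namespace distributed
}  // namespace paddle
```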
@@ -517,49 +517,49 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::AllToAll(
 #endif

 #if defined(PADDLE_WITH_FLAGCX)
-memory::Copy(phi::CPUPlace(),
+memory::Copy(CPUPlace(),
 in_size_tensor.data(),
-phi::CPUPlace(),
+CPUPlace(),
 in_numel_vec.data(),
 in_size_tensor.numel() * sizeof(int64_t));
-memory::Copy(phi::CPUPlace(),
+memory::Copy(CPUPlace(),
 in_offset_tensor.data(),
-phi::CPUPlace(),
+CPUPlace(),
 in_offset_vec.data(),
 in_offset_tensor.numel() * sizeof(int64_t));
-memory::Copy(phi::CPUPlace(),
+memory::Copy(CPUPlace(),
 out_size_tensor.data(),
-phi::CPUPlace(),
+CPUPlace(),
 out_numel_vec.data(),
 out_size_tensor.numel() * sizeof(int64_t));
-memory::Copy(phi::CPUPlace(),
+memory::Copy(CPUPlace(),
 out_offset_tensor.data(),
-phi::CPUPlace(),
+CPUPlace(),
 out_offset_vec.data(),
 out_offset_tensor.numel() * sizeof(int64_t));
 #else

 memory::Copy(place,
 in_size_tensor.data(),
-phi::CPUPlace(),
+CPUPlace(),
 in_numel_vec.data(),
 in_size_tensor.numel() * sizeof(int64_t));

 memory::Copy(place,
 in_offset_tensor.data(),
-phi::CPUPlace(),
+CPUPlace(),
 in_offset_vec.data(),
 in_offset_tensor.numel() * sizeof(int64_t));

 memory::Copy(place,
 out_size_tensor.data(),
-phi::CPUPlace(),
+CPUPlace(),
 out_numel_vec.data(),
 out_size_tensor.numel() * sizeof(int64_t));

 memory::Copy(place,
 out_offset_tensor.data(),
-phi::CPUPlace(),
+CPUPlace(),
 out_offset_vec.data(),
 out_offset_tensor.numel() * sizeof(int64_t));
 #endif
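The hunk above only changes how the places are spelled; the call shape stays the five-argument `memory::Copy(dst_place, dst_ptr, src_place, src_ptr, num_bytes)`. A minimal host-to-host sketch of that shape follows — buffer names and sizes are made up and the include path is assumed; under `PADDLE_WITH_FLAGCX` both places are CPU as shown, while the `#else` branch passes the XPU `place` as the destination instead:

```cpp
// Minimal sketch of the copy shape used above (hypothetical buffers).
#include <cstdint>
#include <vector>

#include "paddle/fluid/memory/memcpy.h"  // include path assumed

void StageSizesOnHost() {
  std::vector<int64_t> in_numel_vec = {4, 4, 8};     // per-rank element counts (made up)
  std::vector<int64_t> staged(in_numel_vec.size());  // stands in for in_size_tensor's buffer
  paddle::memory::Copy(phi::CPUPlace(),              // destination place
                       staged.data(),                // destination pointer
                       phi::CPUPlace(),              // source place
                       in_numel_vec.data(),          // source pointer
                       staged.size() * sizeof(int64_t));  // byte count
}
```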
@@ -679,7 +679,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::AllToAll(
 phi::XPUPlace place = in_tensors[0].place();
 #if defined(PADDLE_WITH_FLAGCX)
 auto allocator_cpu = std::unique_ptr<phi::Allocator>(
-new paddle::experimental::DefaultAllocator(phi::CPUPlace()));
+new paddle::experimental::DefaultAllocator(CPUPlace()));
 #endif
 auto allocator = std::unique_ptr<phi::Allocator>(
 new paddle::experimental::DefaultAllocator(place));
@@ -713,51 +713,51 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::AllToAll(
 &concated_in_tensor);
 }
 #if defined(PADDLE_WITH_FLAGCX)
-memory::Copy(phi::CPUPlace(),
+memory::Copy(CPUPlace(),
 in_size_tensor.data(),
-phi::CPUPlace(),
+CPUPlace(),
 in_numel_vec.data(),
 in_size_tensor.numel() * sizeof(int64_t));

-memory::Copy(phi::CPUPlace(),
+memory::Copy(CPUPlace(),
 in_offset_tensor.data(),
-phi::CPUPlace(),
+CPUPlace(),
 in_offset_vec.data(),
 in_offset_tensor.numel() * sizeof(int64_t));

-memory::Copy(phi::CPUPlace(),
+memory::Copy(CPUPlace(),
 out_size_tensor.data(),
-phi::CPUPlace(),
+CPUPlace(),
 out_numel_vec.data(),
 out_size_tensor.numel() * sizeof(int64_t));

-memory::Copy(phi::CPUPlace(),
+memory::Copy(CPUPlace(),
 out_offset_tensor.data(),
-phi::CPUPlace(),
+CPUPlace(),
 out_offset_vec.data(),
 out_offset_tensor.numel() * sizeof(int64_t));
 #else
 memory::Copy(place,
 in_size_tensor.data(),
-phi::CPUPlace(),
+CPUPlace(),
 in_numel_vec.data(),
 in_size_tensor.numel() * sizeof(int64_t));

 memory::Copy(place,
 in_offset_tensor.data(),
-phi::CPUPlace(),
+CPUPlace(),
 in_offset_vec.data(),
 in_offset_tensor.numel() * sizeof(int64_t));

 memory::Copy(place,
 out_size_tensor.data(),
-phi::CPUPlace(),
+CPUPlace(),
 out_numel_vec.data(),
 out_size_tensor.numel() * sizeof(int64_t));

 memory::Copy(place,
 out_offset_tensor.data(),
-phi::CPUPlace(),
+CPUPlace(),
 out_offset_vec.data(),
 out_offset_tensor.numel() * sizeof(int64_t));
 #endif
paddle/fluid/distributed/collective/process_group_flagcx.cc (6 changes: 3 additions & 3 deletions)
@@ -427,7 +427,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupFlagcx::Barrier(
 0,
 common::errors::PreconditionNotMet(
 "The barrier device id must greater or equal than 0."));
-phi::GPUPlace place(opts.device_id);
+GPUPlace place(opts.device_id);
 auto allocator = std::unique_ptr<phi::Allocator>(
 new paddle::experimental::DefaultAllocator(place));
 phi::DenseTensorMeta meta(phi::DataType::FLOAT32, phi::DDim{1});
@@ -816,7 +816,7 @@ void ProcessGroupFlagcx::SyncCalcStream(const Place& place,

 void ProcessGroupFlagcx::EagerConnect() {
 const auto deviceId = phi::backends::gpu::GetCurrentDeviceId();
-const auto& place = phi::GPUPlace(deviceId);
+const auto& place = GPUPlace(deviceId);
 const auto key = GetKeyFromPlace(place);

 platform::CUDADeviceGuard cuda_guard(place);
@@ -831,7 +831,7 @@ void ProcessGroupFlagcx::EagerConnect() {

 void ProcessGroupFlagcx::EagerConnectRingExchange() {
 std::vector<std::pair<int, int>> peers;
-const auto& place = phi::GPUPlace(phi::backends::gpu::GetCurrentDeviceId());
+const auto& place = GPUPlace(phi::backends::gpu::GetCurrentDeviceId());

 for (int rank = 0; rank < size_; rank++) {
 auto peer_rank = rank + 1 >= size_ ? 0 : rank + 1;
paddle/fluid/distributed/collective/process_group_nccl.cc (18 changes: 9 additions & 9 deletions)
@@ -488,7 +488,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Barrier(
 0,
 common::errors::PreconditionNotMet(
 "The barrier device id must greater or equal than 0."));
-phi::GPUPlace place(opts.device_id);
+GPUPlace place(opts.device_id);
 auto allocator = std::unique_ptr<phi::Allocator>(
 new paddle::experimental::DefaultAllocator(place));
 phi::DenseTensorMeta meta(phi::DataType::FLOAT32, phi::DDim{1});
@@ -923,18 +923,18 @@ void ProcessGroupNCCL::CreateNCCLEnvCache(
 // gather global ranks in current group
 size_t gpu_global_rank_size = sizeof(int);
 auto gpu_global_rank = phi::memory_utils::Alloc(
-phi::GPUPlace(phi::backends::gpu::GetCurrentDeviceId()),
+GPUPlace(phi::backends::gpu::GetCurrentDeviceId()),
 gpu_global_rank_size);

-phi::memory_utils::Copy(phi::GPUPlace(),
+phi::memory_utils::Copy(GPUPlace(),
 gpu_global_rank->ptr(),
-phi::CPUPlace(),
+CPUPlace(),
 &global_rank_,
 gpu_global_rank_size);

 size_t gpu_global_ranks_size = num_ranks * sizeof(int);
 auto gpu_global_ranks = phi::memory_utils::Alloc(
-phi::GPUPlace(phi::backends::gpu::GetCurrentDeviceId()),
+GPUPlace(phi::backends::gpu::GetCurrentDeviceId()),
 gpu_global_ranks_size);

 NCCL_CHECK(phi::dynload::ncclAllGather(gpu_global_rank->ptr(),
@@ -945,9 +945,9 @@ void ProcessGroupNCCL::CreateNCCLEnvCache(
 comm_ctx->stream()));

 std::vector<int> global_ranks(num_ranks);
-phi::memory_utils::Copy(phi::CPUPlace(),
+phi::memory_utils::Copy(CPUPlace(),
 global_ranks.data(),
-phi::GPUPlace(),
+GPUPlace(),
 gpu_global_ranks->ptr(),
 gpu_global_ranks_size);
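A condensed, standalone view of the round trip in the two hunks above — stage the local rank on the device, all-gather one `int` per rank, then copy the gathered list back to the host. The helper name, its parameters, the explicit stream synchronization, and the CUDA-build assumption are illustrative only, not part of this PR:

```cpp
// Illustrative helper (names and parameters assumed); mirrors the calls shown above.
#include <vector>
// Framework headers omitted; this assumes what process_group_nccl.cc already includes.

std::vector<int> GatherGlobalRanks(int global_rank,
                                   int num_ranks,
                                   ncclComm_t comm,
                                   cudaStream_t stream) {  // hipStream_t under ROCm
  auto dev_place = phi::GPUPlace(phi::backends::gpu::GetCurrentDeviceId());
  // Stage this process's global rank in device memory.
  auto send = phi::memory_utils::Alloc(dev_place, sizeof(int));
  phi::memory_utils::Copy(
      phi::GPUPlace(), send->ptr(), phi::CPUPlace(), &global_rank, sizeof(int));
  // Exchange one int per rank on the given stream.
  auto recv = phi::memory_utils::Alloc(dev_place, num_ranks * sizeof(int));
  NCCL_CHECK(phi::dynload::ncclAllGather(
      send->ptr(), recv->ptr(), 1, ncclInt, comm, stream));
  // Ensure the all-gather has finished before the synchronous copy back.
  cudaStreamSynchronize(stream);
  std::vector<int> global_ranks(num_ranks);
  phi::memory_utils::Copy(phi::CPUPlace(),
                          global_ranks.data(),
                          phi::GPUPlace(),
                          recv->ptr(),
                          num_ranks * sizeof(int));
  return global_ranks;
}
```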
@@ -1032,7 +1032,7 @@ void ProcessGroupNCCL::SyncCalcStream(const Place& place,

 void ProcessGroupNCCL::EagerConnect() {
 const auto deviceId = phi::backends::gpu::GetCurrentDeviceId();
-const auto& place = phi::GPUPlace(deviceId);
+const auto& place = GPUPlace(deviceId);
 const auto key = GetKeyFromPlace(place);

 platform::CUDADeviceGuard cuda_guard(place);
@@ -1049,7 +1049,7 @@ void ProcessGroupNCCL::EagerConnect() {
 void ProcessGroupNCCL::EagerConnectRingExchange(
 std::shared_ptr<phi::distributed::NCCLConfig> nccl_config_ptr) {
 std::vector<std::pair<int, int>> peers;
-const auto& place = phi::GPUPlace(phi::backends::gpu::GetCurrentDeviceId());
+const auto& place = GPUPlace(phi::backends::gpu::GetCurrentDeviceId());

 for (int rank = 0; rank < size_; rank++) {
 auto peer_rank = rank + 1 >= size_ ? 0 : rank + 1;
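The truncated loop above computes each rank's ring neighbor: the next rank, with the last rank wrapping back to 0. A standalone sketch of that ordering follows (the pairing into `peers` is an assumption, since the hunk ends before the push):

```cpp
// Standalone illustration of the ring ordering; names are not from the PR.
#include <iostream>
#include <utility>
#include <vector>

std::vector<std::pair<int, int>> BuildRingPeers(int world_size) {
  std::vector<std::pair<int, int>> peers;
  for (int rank = 0; rank < world_size; ++rank) {
    const int peer_rank = rank + 1 >= world_size ? 0 : rank + 1;  // wrap around
    peers.emplace_back(rank, peer_rank);
  }
  return peers;
}

int main() {
  for (const auto& p : BuildRingPeers(4)) {
    std::cout << p.first << " -> " << p.second << "\n";  // 0 -> 1, 1 -> 2, 2 -> 3, 3 -> 0
  }
  return 0;
}
```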