6 changes: 3 additions & 3 deletions paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h
@@ -202,12 +202,12 @@ __global__ void KeLstmBackward(Op op,
   if (is_batch) {
     if (value.prev_state_value) {
       if (grad.check_ig_grad)
-        phi::CudaAtomicAdd(grad.check_ig_grad + frame_idx, r_checkIGrad);
+        CudaAtomicAdd(grad.check_ig_grad + frame_idx, r_checkIGrad);
       if (grad.check_fg_grad)
-        phi::CudaAtomicAdd(grad.check_fg_grad + frame_idx, r_checkFGrad);
+        CudaAtomicAdd(grad.check_fg_grad + frame_idx, r_checkFGrad);
     }
     if (grad.check_og_grad)
-      phi::CudaAtomicAdd(grad.check_og_grad + frame_idx, r_checkOGrad);
+      CudaAtomicAdd(grad.check_og_grad + frame_idx, r_checkOGrad);
   } else {
     if (value.prev_state_value) {
       if (grad.check_ig_grad) grad.check_ig_grad[frame_idx] += r_checkIGrad;
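A note on the change itself: every hunk in this PR drops the explicit phi:: qualifier from calls to CudaAtomicAdd. These kernels are all defined inside namespace phi, and CudaAtomicAdd is declared in that same namespace (phi's GPU primitives header), so, assuming nothing else shadows the name, unqualified lookup resolves to the same overload and the generated code is unchanged. A minimal, self-contained sketch of the idea follows; apart from CudaAtomicAdd itself, the names are illustrative, and the float overload shown is only a stand-in for the real overload set.

// sketch.cu, illustrative only
#include <cuda_runtime.h>

namespace phi {

// Stand-in for the real phi::CudaAtomicAdd overloads.
__device__ inline float CudaAtomicAdd(float* address, float val) {
  return atomicAdd(address, val);
}

// Inside namespace phi, the unqualified call below finds phi::CudaAtomicAdd,
// exactly as the qualified spelling phi::CudaAtomicAdd(...) did before.
__global__ void ExampleKernel(float* out, const float* in, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    CudaAtomicAdd(out, in[i]);
  }
}

}  // namespace phi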
2 changes: 1 addition & 1 deletion paddle/phi/kernels/funcs/gather.cu.h
@@ -206,7 +206,7 @@ __global__ void GatherGradGPUKernel(const T* input,
     int64_t out_index =
         inner_dim_index * (outer_dim_size * out_index_dim_size) +
         index[index_dim_index] * outer_dim_size + out_dim_index;
-    phi::CudaAtomicAdd(out + out_index, *(input + idx));
+    CudaAtomicAdd(out + out_index, *(input + idx));
  }
}
14 changes: 7 additions & 7 deletions paddle/phi/kernels/funcs/gather_scatter_functor.cu
@@ -37,7 +37,7 @@ class ReduceAdd {
   template <typename tensor_t>
   __device__ void operator()(tensor_t* __restrict__ self_data,
                              const tensor_t* __restrict__ src_data) const {
-    phi::CudaAtomicAdd(self_data, *src_data);
+    CudaAtomicAdd(self_data, *src_data);
   }
 };
 static ReduceAdd reduce_add;
@@ -317,7 +317,7 @@ __global__ void GatherScatterGPUKernel(
     reduce_op(static_cast<tensor_t*>(self_data + replace_index_self),
               static_cast<const tensor_t*>(src_data + replace_index_src));
     if (atomic_cnt_buffer) {
-      phi::CudaAtomicAdd(atomic_cnt_buffer + replace_index_self, 1);
+      CudaAtomicAdd(atomic_cnt_buffer + replace_index_self, 1);
     }
   }

@@ -856,26 +856,26 @@ __global__ void ScatterGradPrePassKernel(
     // as the 2nd param for compute offset
     COMPUTE_OFFSET_DOUBLE_OUTPUT(replace_index_value, replace_index, tid, 2, 1)
     if (value_data[replace_index_value] == out_data[replace_index])
-      phi::CudaAtomicAdd(aux_buffer + replace_index, 1);
+      CudaAtomicAdd(aux_buffer + replace_index, 1);
   } else if constexpr (dispatch == GradDispatchTag::MeanInputGrad) {
     COMPUTE_OFFSET_SINGLE_OUTPUT(replace_index, 1, tid, 2)
     atomicMax(aux_buffer + replace_index, tid);
-    phi::CudaAtomicAdd(aux_buffer + grad_numel + replace_index, 1);
+    CudaAtomicAdd(aux_buffer + grad_numel + replace_index, 1);
   } else if constexpr (dispatch == GradDispatchTag::ValueGrad) {
     COMPUTE_OFFSET_SINGLE_OUTPUT(replace_index_self, 2, tid, 3)
     atomicMax(aux_buffer + replace_index_self, tid);
   } else if constexpr (dispatch == GradDispatchTag::MeanValueGrad) {
     COMPUTE_OFFSET_SINGLE_OUTPUT(replace_index_self, 2, tid, 3)
-    phi::CudaAtomicAdd(aux_buffer + replace_index_self, 1);
+    CudaAtomicAdd(aux_buffer + replace_index_self, 1);
   } else if constexpr (dispatch == GradDispatchTag::MinMaxValueGrad) {
     COMPUTE_OFFSET_DOUBLE_OUTPUT(
         replace_index_grad, replace_index_self, tid, 1, 2)
     grad_data[replace_index_grad] = 0;
     if (include_self &&
         x_data[replace_index_self] == out_data[replace_index_self])
-      phi::CudaAtomicAdd(aux_buffer + replace_index_self, 1);
+      CudaAtomicAdd(aux_buffer + replace_index_self, 1);
     if (value_data[replace_index_grad] == out_data[replace_index_self])
-      phi::CudaAtomicAdd(aux_buffer + replace_index_self, 1);
+      CudaAtomicAdd(aux_buffer + replace_index_self, 1);
   }
 }

2 changes: 1 addition & 1 deletion paddle/phi/kernels/funcs/im2col.cu
@@ -493,7 +493,7 @@ __global__ void col2imOCF(const T* col_data,

         if (height_offset >= 0 && height_offset < im_height &&
             width_offset >= 0 && width_offset < im_width) {
-          phi::CudaAtomicAdd(im_data + im_offset, col_data[col_offset]);
+          CudaAtomicAdd(im_data + im_offset, col_data[col_offset]);
         }
       }
     }
2 changes: 1 addition & 1 deletion paddle/phi/kernels/funcs/math/cos_sim_functor.cu
@@ -46,7 +46,7 @@ __global__ void CosSimDyKernel(const T* x_norm,
     for (size_t i = 0; i < cols; ++i) {
       T dy_data = dz_data * (x_data[i] * reciprocal_xy_norm_prod -
                              z_data * y[i] * reciprocal_y_norm_square);
-      phi::CudaAtomicAdd(dy + i, dy_data);
+      CudaAtomicAdd(dy + i, dy_data);
     }
   }
 }
10 changes: 5 additions & 5 deletions paddle/phi/kernels/funcs/pooling.cu
@@ -486,7 +486,7 @@ __global__ void KernelMaxPool2DGrad(const IndexT nthreads,

     if (maxIndex != -1) {
       // atomic add
-      phi::CudaAtomicAdd(input_grad + maxIndex, output_grad[index]);
+      CudaAtomicAdd(input_grad + maxIndex, output_grad[index]);
     }
   }
 }
@@ -1480,7 +1480,7 @@ __global__ void KernelMaxPool3DGrad(const IndexT nthreads,
     }
     if (maxIdx != -1) {
       // atomic add
-      phi::CudaAtomicAdd(input_grad + maxIdx, output_grad[index]);
+      CudaAtomicAdd(input_grad + maxIdx, output_grad[index]);
     }
   }
 }
@@ -2552,7 +2552,7 @@ __global__ void KernelMaxPool3DWithIdxGrad(
                           w_offset;
     IndexT max_index = mask[output_index];
     if (max_index != -1) {
-      phi::CudaAtomicAdd(
+      CudaAtomicAdd(
           &input_grad[nc_offset * input_depth * input_height * input_width +
                       max_index],
          output_grad[output_index]);
@@ -2928,7 +2928,7 @@ __global__ void FractionalKernelMaxPool2dGrad(

     IndexT max_index = mask_data[output_index];
     if (max_index != -1) {
-      phi::CudaAtomicAdd(
+      CudaAtomicAdd(
          &input_grad[nc_offset * input_height * input_width + max_index],
          output_grad[output_index]);
     }
@@ -3306,7 +3306,7 @@ __global__ void FractionalKernelMaxPool3dGrad(
                           w_offset;
     IndexT max_index = mask[output_index];
     if (max_index != -1) {
-      phi::CudaAtomicAdd(
+      CudaAtomicAdd(
          &input_grad[nc_offset * input_depth * input_height * input_width +
                      max_index],
          output_grad[output_index]);
4 changes: 2 additions & 2 deletions paddle/phi/kernels/funcs/scatter.cu.h
@@ -95,7 +95,7 @@ __global__ void ScatterCUDAKernel(const T* params,
       VecType* dst = reinterpret_cast<VecType*>(output + out_i);
       *dst = *src;
     } else {
-      phi::CudaAtomicAdd(output + out_i, *(params + i));
+      CudaAtomicAdd(output + out_i, *(params + i));
     }
   }
 }
@@ -149,7 +149,7 @@ __global__ void ScatterNdCUDAKernel(const T* update,

 #pragma unroll
     for (int k = 0; k < VecSize; ++k) {
-      phi::CudaAtomicAdd(&(dst->val[k]), src->val[k]);
+      CudaAtomicAdd(&(dst->val[k]), src->val[k]);
     }
   }
 }
13 changes: 6 additions & 7 deletions paddle/phi/kernels/funcs/segment_pooling.cu
@@ -61,7 +61,7 @@ __global__ void SegmentSumIdsKernel(const Index* segment_ids,
       }
       if (j > 0) {
         if (last_segment_id == first_segment_id) {
-          phi::CudaAtomicAdd(summed_ids + last_segment_id, sum);
+          CudaAtomicAdd(summed_ids + last_segment_id, sum);
         } else {
           *(summed_ids + last_segment_id) = sum;
         }
@@ -71,7 +71,7 @@
       sum += T(1);
       last_segment_id = current_segment_id;
     }
-    phi::CudaAtomicAdd(summed_ids + last_segment_id, sum);
+    CudaAtomicAdd(summed_ids + last_segment_id, sum);
   }
 }

@@ -112,8 +112,8 @@ __global__ void SegmentMeanKernel(const Index* segment_ids,
             last_segment_id * inner_dim_size + segment_offset;

         if (last_segment_id == first_segment_id) {
-          phi::CudaAtomicAdd(output + output_index,
-                             sum / *(summed_ids + last_segment_id));
+          CudaAtomicAdd(output + output_index,
+                        sum / *(summed_ids + last_segment_id));
         } else {
           *(output + output_index) = sum / *(summed_ids + last_segment_id);
         }
@@ -124,8 +124,7 @@ __global__ void SegmentMeanKernel(const Index* segment_ids,
       last_segment_id = current_segment_id;
     }
     Index output_index = last_segment_id * inner_dim_size + segment_offset;
-    phi::CudaAtomicAdd(output + output_index,
-                       sum / *(summed_ids + last_segment_id));
+    CudaAtomicAdd(output + output_index, sum / *(summed_ids + last_segment_id));
   }
 }

@@ -236,7 +235,7 @@ class SumPool {
   DEVICE inline T initial() { return static_cast<T>(0); }
   DEVICE inline void compute(const T& x, T* y) { *y = *y + x; }
   DEVICE inline T atomic(T* address, const T val) {
-    return phi::CudaAtomicAdd(address, val);
+    return CudaAtomicAdd(address, val);
   }
 };

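As an aside: the reason SegmentSumIdsKernel and SegmentMeanKernel mix plain stores with CudaAtomicAdd is that each thread walks a contiguous stripe of sorted segment ids, so only the segments that can straddle a stripe boundary need atomics. A simplified, hypothetical segment-sum sketch of that pattern (names and data layout are made up for illustration):

__global__ void SegmentSumSketch(const int* segment_ids,  // sorted, size n
                                 const float* values,
                                 float* out,               // zero-initialized
                                 int n,
                                 int chunk) {
  int begin = (blockIdx.x * blockDim.x + threadIdx.x) * chunk;
  if (begin >= n) return;
  int end = min(begin + chunk, n);

  int first_id = segment_ids[begin];
  int last_id = first_id;
  float sum = 0.f;
  for (int i = begin; i < end; ++i) {
    int id = segment_ids[i];
    if (id != last_id) {
      if (last_id == first_id) {
        // The first segment may also receive contributions from the previous
        // thread's chunk, so it must be accumulated atomically.
        atomicAdd(out + last_id, sum);
      } else {
        // A segment fully contained in this chunk: a plain store is safe.
        out[last_id] = sum;
      }
      sum = 0.f;
      last_id = id;
    }
    sum += values[i];
  }
  // The trailing segment may continue into the next thread's chunk.
  atomicAdd(out + last_id, sum);
}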
6 changes: 3 additions & 3 deletions paddle/phi/kernels/funcs/selected_rows_functor.cu
@@ -127,7 +127,7 @@ __global__ void SelectedRowsAddTensorKernel(const T* selected_rows,
     // Since index in rows of SelectedRows can be duplicate, we can not use
     // tensor_out[index] += selected_rows[index]; Instead, we have to use
     // AtomicAdd to avoid concurrent write error.
-    phi::CudaAtomicAdd(tensor_out + index, selected_rows[index]);
+    CudaAtomicAdd(tensor_out + index, selected_rows[index]);
   }
 }
 }  // namespace
@@ -279,7 +279,7 @@ __global__ void SelectedRowsAddToTensorKernel(const T* selected_rows,
   for (int64_t index = tid; index < row_numel; index += block_size) {
     // Since index in rows of SelectedRows can be duplicate, we have to use
     // Atomic Operation to avoid concurrent write error.
-    phi::CudaAtomicAdd(tensor_out + index, selected_rows[index]);
+    CudaAtomicAdd(tensor_out + index, selected_rows[index]);
   }
 }
 }  // namespace
@@ -362,7 +362,7 @@ __global__ void MergeAddKernel(const T* input,
   input += ty * row_numel;
   out += out_idx * row_numel;
   for (int64_t index = tid; index < row_numel; index += block_size) {
-    phi::CudaAtomicAdd(out + index, input[index]);
+    CudaAtomicAdd(out + index, input[index]);
   }
 }

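As the comment in SelectedRowsAddTensorKernel notes, the crux for all of these kernels is that two SelectedRows entries can carry the same row id, so a plain read-modify-write would lose updates. A tiny hypothetical sketch of the hazard and the atomic fix (atomicAdd standing in for CudaAtomicAdd; all names are illustrative):

__global__ void AddDuplicateRowsSketch(const float* rows_data,  // [num_rows, width]
                                       const int64_t* row_ids,  // [num_rows]
                                       float* dense_out,        // [height, width]
                                       int64_t width) {
  int64_t row = blockIdx.x;  // one block per input row
  const float* src = rows_data + row * width;
  // Several blocks may map to the same destination row when row ids repeat.
  float* dst = dense_out + row_ids[row] * width;
  for (int64_t i = threadIdx.x; i < width; i += blockDim.x) {
    // A plain `dst[i] += src[i]` here could lose updates across blocks.
    atomicAdd(dst + i, src[i]);
  }
}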
2 changes: 1 addition & 1 deletion paddle/phi/kernels/funcs/top_k_function_cuda.h
@@ -767,7 +767,7 @@ __device__ void RadixCountUsingMask(const T* input,
   if (GetLaneId() == 0) {
 #pragma unroll
     for (uint32_t i = 0; i < RadixSize; ++i) {
-      phi::CudaAtomicAdd(&shared_mem[i], counts[i]);
+      CudaAtomicAdd(&shared_mem[i], counts[i]);
     }
   }

8 changes: 4 additions & 4 deletions paddle/phi/kernels/gpu/adagrad_kernel.cu
@@ -125,7 +125,7 @@ __global__ void MergeGradKernel(const T* grad,
   grad += ty * row_numel;
   grad_merge += grad_merge_idx * row_numel;
   for (int64_t index = tid; index < row_numel; index += block_size) {
-    phi::CudaAtomicAdd(grad_merge + index, grad[index]);
+    CudaAtomicAdd(grad_merge + index, grad[index]);
   }
 }

@@ -147,9 +147,9 @@ __global__ void SparseAdagradFunctorKernel(const T* grad,
   for (int64_t index = tid; index < row_numel; index += block_size) {
     // Since index in rows of SelectedRows can be duplicate, we have to use
     // Atomic Operation to avoid concurrent write error.
-    phi::CudaAtomicAdd(param + index,
-                       -1.0 * learning_rate[0] * grad[index] /
-                           (sqrt(moment[index]) + epsilon));
+    CudaAtomicAdd(param + index,
+                  -1.0 * learning_rate[0] * grad[index] /
+                      (sqrt(moment[index]) + epsilon));
   }
 }

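For reference, the atomic update in SparseAdagradFunctorKernel is the element-wise Adagrad step, roughly param[i] -= lr * grad[i] / (sqrt(moment[i]) + epsilon), applied atomically because duplicate rows may touch the same parameter element. A one-function host-side sketch of that arithmetic, with illustrative names and with moment assumed to already hold the accumulated squared gradients:

#include <cmath>

// Per-element Adagrad step; moment holds the accumulated squared gradients.
float AdagradStep(float param, float grad, float moment, float lr, float epsilon) {
  return param - lr * grad / (std::sqrt(moment) + epsilon);
}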
2 changes: 1 addition & 1 deletion paddle/phi/kernels/gpu/assign_pos_kernel.cu
@@ -36,7 +36,7 @@ __global__ void AssignPos(T* cum_count,
   CUDA_KERNEL_LOOP(i, limit) {
     int number_idx = numbers[i];
     if (number_idx > -1) {
-      int p = phi::CudaAtomicAdd(cum_count + number_idx, -1);
+      int p = CudaAtomicAdd(cum_count + number_idx, -1);
       out[p - 1] = i;
     }
   }
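One detail worth noting in AssignPos: CudaAtomicAdd returns the value stored before the addition, so atomically decrementing a per-bucket cursor hands every thread a distinct slot. A minimal sketch of that claim-a-slot pattern with made-up names (atomicAdd standing in for CudaAtomicAdd):

__global__ void ClaimSlotSketch(const int* bucket_of,  // bucket id per element
                                int* cursor,           // preloaded with end offsets
                                int* out,              // compacted element indices
                                int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    int b = bucket_of[i];
    if (b >= 0) {
      int old = atomicAdd(cursor + b, -1);  // value held before the decrement
      out[old - 1] = i;                     // a distinct slot for every claimant
    }
  }
}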
4 changes: 2 additions & 2 deletions paddle/phi/kernels/gpu/auc_kernel.cu
@@ -72,9 +72,9 @@ __global__ void AddDataKernel(const int64_t *label_data,
                       "The predict data must gather or equal 0.");
     uint32_t binIdx = static_cast<uint32_t>(predict_data * num_thresholds);
     if (label_data[i]) {
-      phi::CudaAtomicAdd(pos + cur_step_begin + binIdx, 1);
+      CudaAtomicAdd(pos + cur_step_begin + binIdx, 1);
     } else {
-      phi::CudaAtomicAdd(neg + cur_step_begin + binIdx, 1);
+      CudaAtomicAdd(neg + cur_step_begin + binIdx, 1);
     }
   }
 }
4 changes: 2 additions & 2 deletions paddle/phi/kernels/gpu/bincount_kernel.cu
@@ -79,9 +79,9 @@ __global__ void KernelBincount(const InputT* input,
   for (int64_t i = global_tid; i < total_elements; i += stride) {
     InputT index = input[i];
     if (!has_weights) {
-      phi::CudaAtomicAdd(&output[index], 1L);
+      CudaAtomicAdd(&output[index], 1L);
     } else {
-      phi::CudaAtomicAdd(&output[index], static_cast<OutT>(weights[i]));
+      CudaAtomicAdd(&output[index], static_cast<OutT>(weights[i]));
     }
   }
 }
2 changes: 1 addition & 1 deletion paddle/phi/kernels/gpu/c_embedding_grad_kernel.cu
@@ -51,7 +51,7 @@ __global__ void CEmbeddingGrad(T* table,
     auto id = ids[row];
     if (id >= start_idx && id < end_idx) {
       auto real_idx = id - start_idx;
-      phi::CudaAtomicAdd(&table[real_idx * columns + col], output[i]);
+      CudaAtomicAdd(&table[real_idx * columns + col], output[i]);
     }
   }
 }
4 changes: 1 addition & 3 deletions paddle/phi/kernels/gpu/collect_fpn_proposals_kernel.cu
@@ -42,9 +42,7 @@ static inline int NumBlocks(const int N) {
 static __global__ void GetLengthLoD(const int nthreads,
                                     const int* batch_ids,
                                     int* length_lod) {
-  CUDA_KERNEL_LOOP(i, nthreads) {
-    phi::CudaAtomicAdd(length_lod + batch_ids[i], 1);
-  }
+  CUDA_KERNEL_LOOP(i, nthreads) { CudaAtomicAdd(length_lod + batch_ids[i], 1); }
 }

 template <typename T, typename Context>
3 changes: 1 addition & 2 deletions paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
@@ -107,8 +107,7 @@ __global__ void ModulatedDeformableCol2imGpuKernel(
                                          height,
                                          width);

-          phi::CudaAtomicAdd(grad_im + cur_bottom_grad_pos,
-                             weight * cur_top_grad);
+          CudaAtomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
        }
      }
    }
6 changes: 3 additions & 3 deletions paddle/phi/kernels/gpu/depthwise_conv.h
@@ -1026,7 +1026,7 @@ __device__ __forceinline__ void NoReturnAtomicAdd(T* tensor,
                                                   T value) {
#if (defined(PADDLE_WITH_HIP) || \
     (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)))
-  phi::CudaAtomicAdd(tensor + index, value);
+  CudaAtomicAdd(tensor + index, value);
#else
  // Check if 32 bit aligned
  __half* target_addr = reinterpret_cast<__half*>(tensor + index);
@@ -1059,7 +1059,7 @@ __device__ __forceinline__ void NoReturnAtomicAdd(T* tensor,
                                                   T value) {
#if (defined(PADDLE_WITH_HIP) || \
     (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)))
-  phi::CudaAtomicAdd(tensor + index, value);
+  CudaAtomicAdd(tensor + index, value);
#else
  // Check if 32 bit aligned
  __nv_bfloat16* target_addr = reinterpret_cast<__nv_bfloat16*>(tensor + index);
@@ -1093,7 +1093,7 @@ __device__ __forceinline__ void NoReturnAtomicAdd(T* tensor,
                                                   index_t index,
                                                   const index_t numel,
                                                   T value) {
-  phi::CudaAtomicAdd(tensor + index, value);
+  CudaAtomicAdd(tensor + index, value);
}

template <typename T, bool fuse_relu_before_conv>
2 changes: 1 addition & 1 deletion paddle/phi/kernels/gpu/distribute_fpn_proposals_kernel.cu
@@ -71,7 +71,7 @@ __global__ void GPUDistFpnProposalsHelper(const int nthreads,
     tgt_lvl = min(max_level, max(tgt_lvl, min_level));
     target_lvls[i] = tgt_lvl;
     // compute number of rois in the same batch and same target level
-    phi::CudaAtomicAdd(
+    CudaAtomicAdd(
         sub_lod_list + (tgt_lvl - min_level) * lod_size + roi_batch_ind, 1);
   }
 }
3 changes: 1 addition & 2 deletions paddle/phi/kernels/gpu/embedding_grad_add_to_kernel.cu
@@ -47,8 +47,7 @@ __global__ void EmbeddingGradAddTo(T* main_grad_out,
     const phi::bfloat16* token_out_grad = out_grad + idy * token_length;
     T* token_main_grad = main_grad_out + id * token_length;
     for (int64_t i = idx; i < token_length; i += blockDim.x) {
-      phi::CudaAtomicAdd(&token_main_grad[i],
-                         static_cast<T>(token_out_grad[i]));
+      CudaAtomicAdd(&token_main_grad[i], static_cast<T>(token_out_grad[i]));
     }
     idy += blockDim.y * gridDim.x;
   }
2 changes: 1 addition & 1 deletion paddle/phi/kernels/gpu/embedding_grad_kernel.cu
@@ -60,7 +60,7 @@ __global__ void EmbeddingGrad(T* table,
     phi::VectorizedAtomicAddPerBlock(D, idx, blockDim.x, out, tab);
 #else
     for (int64_t i = idx; i < D; i += blockDim.x) {
-      phi::CudaAtomicAdd(&tab[i], out[i]);
+      CudaAtomicAdd(&tab[i], out[i]);
     }
 #endif
     idy += blockDim.y * gridDim.x;