Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
179 changes: 12 additions & 167 deletions patches/paddle-corex.patch
Original file line number Diff line number Diff line change
Expand Up @@ -898,55 +898,6 @@ index 85615d0dd3..a48ac51308 100644
common::errors::Unavailable("softmax_with_cross_entropy operator's "
"CUDA kernel only runs on GPU device."));
const T* loss_grad_data = loss_grad.data<T>();
diff --git a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu
index 9d382a8763..bb9512af53 100644
--- a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu
+++ b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu
@@ -84,7 +84,7 @@ __global__ void CrossEntropySoftLabel(T* loss,

const int kThreadPerBlock = 512;
const int kBatchPerBlock = 1;
- const int kWarpSize = 32; // (dim < 32) ? dim : 32;
+ const int kWarpSize = 64; // (dim < 32) ? dim : 32;
const int kBatchSize = 1;
const int kThreadPerBatch = kThreadPerBlock / kBatchPerBlock;
const int kWarpPerBatch = kThreadPerBatch / kWarpSize;
@@ -543,7 +543,7 @@ __global__ void WarpSoftmaxForwardSoftLabel(T* loss,
const bool LogMode = true;

constexpr int kDimCeil = 1 << Log2Elements;
- constexpr int kWarpSize = (kDimCeil < 32) ? kDimCeil : 32;
+ constexpr int kWarpSize = (kDimCeil < 64) ? kDimCeil : 64;
constexpr int kVSize = sizeof(VecT) / sizeof(T);
constexpr int kIterations = kDimCeil / kWarpSize;
constexpr int kIterationsV =
@@ -743,7 +743,7 @@ static void SoftmaxWithCrossEntropySoftLabel(const GPUContext& dev_ctx,
auto stream = dev_ctx.stream();

if (D == 1 && dim <= max_dim) {
- int kWarpSize = (kDimCeil < 32) ? kDimCeil : 32;
+ int kWarpSize = (kDimCeil < 64) ? kDimCeil : 64;
int batches_per_warp = (kDimCeil <= 128) ? 2 : 1;

// use 128 threads per block to maximize gpu utilization
@@ -841,7 +841,7 @@ __global__ void WarpSoftmaxForward(T* loss,
const int element_count,
const int ignore_index) {
constexpr int kDimCeil = 1 << Log2Elements;
- constexpr int kWarpSize = (kDimCeil < 32) ? kDimCeil : 32;
+ constexpr int kWarpSize = (kDimCeil < 64) ? kDimCeil : 64;
constexpr int kVSize = sizeof(VecT) / sizeof(T);
constexpr int kIterations = kDimCeil / kWarpSize;
constexpr int kIterationsV =
@@ -1089,7 +1089,7 @@ void SwitchWarpSoftmaxForward(T* loss,
// use 128 threads per block to maximimize gpu utilization
const int log2_elements = static_cast<int>(Log2Ceil(element_count));
const int kDimCeil = 1 << log2_elements;
- int kWarpSize = (kDimCeil < 32) ? kDimCeil : 32;
+ int kWarpSize = (kDimCeil < 64) ? kDimCeil : 64;
int batches_per_warp = (kDimCeil <= 128) ? 2 : 1;
constexpr int threads_per_block = 128;
int warps_per_block = (threads_per_block / kWarpSize);
diff --git a/paddle/phi/kernels/gpu/elementwise_grad.h b/paddle/phi/kernels/gpu/elementwise_grad.h
index 2d9a3493a6..460be448db 100644
--- a/paddle/phi/kernels/gpu/elementwise_grad.h
Expand Down Expand Up @@ -1166,7 +1117,7 @@ index 085845dfb3..defe09ec84 100644
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType(
cdesc.desc(), CUDNN_DEFAULT_MATH));
diff --git a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h
index be6ee4f854..87ad910d79 100644
index 43c92298f1..6811e4f3e0 100644
--- a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h
+++ b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h
@@ -29,6 +29,9 @@ limitations under the License. */
Expand All @@ -1179,93 +1130,25 @@ index be6ee4f854..87ad910d79 100644

COMMON_DECLARE_bool(use_accuracy_compatible_kernel);

@@ -100,7 +103,7 @@ inline int CalcBlockSize(int vec_size, uint64_t dim_size) {
}

while (block_size < (max_block_size)) block_size *= 2;
- block_size = std::max(block_size, static_cast<uint64_t>(32));
+ block_size = std::max(block_size, static_cast<uint64_t>(64));
return block_size;
}

@@ -132,37 +135,37 @@ __device__ __forceinline__ void WarpReduceMax(T* sum) {

template <typename T>
__inline__ __device__ void BlockReduceMax(T* val) {
- static __shared__ T shared[32];
- int lane = threadIdx.x & 0x1f;
- int wid = threadIdx.x >> 5;
+ static __shared__ T shared[64];
+ int lane = threadIdx.x & 0x3f;
+ int wid = threadIdx.x >> 6;

- WarpReduceMax<T, 1, 32>(val);
+ WarpReduceMax<T, 1, 64>(val);

if (lane == 0) shared[wid] = *val;

@@ -170,7 +173,7 @@ __inline__ __device__ void BlockReduceMax(T* val) {
__syncthreads();

- int block_span = (blockDim.x + warpSize - 1) >> 5;
int block_span = (blockDim.x + warpSize - 1) >> PADDLE_WARP_SHIFT;
- *val = (lane < block_span) ? shared[lane] : -1e10f;
- WarpReduceMax<T, 1, 32>(val);
+ int block_span = (blockDim.x + warpSize - 1) >> 6;
+ *val = (lane < block_span) ? shared[lane] : std::numeric_limits<T>::lowest();
+ WarpReduceMax<T, 1, 64>(val);
WarpReduceMax<T, 1, PADDLE_WARP_SIZE>(val);
}

template <typename T>
__inline__ __device__ void BlockReduceSum(T* val) {
- static __shared__ T shared[32];
- int lane = threadIdx.x & 0x1f;
- int wid = threadIdx.x >> 5;
+ static __shared__ T shared[64];
+ int lane = threadIdx.x & 0x3f;
+ int wid = threadIdx.x >> 6;

- WarpReduceSum<T, 1, 32>(val);
+ WarpReduceSum<T, 1, 64>(val);

@@ -206,7 +209,7 @@ __inline__ __device__ void BlockReduceMaxDown(T* val) {
__syncthreads();
if (lane == 0) shared[wid] = *val;

__syncthreads();

- int block_span = (blockDim.x + warpSize - 1) >> 5;
+ int block_span = (blockDim.x + warpSize - 1) >> 6;
*val = (lane < block_span) ? shared[lane] : static_cast<T>(0.0f);
- WarpReduceSum<T, 1, 32>(val);
+ WarpReduceSum<T, 1, 64>(val);
}

template <typename Tx, typename Ty = Tx>
@@ -531,11 +534,11 @@ __global__ void WarpSoftmaxForward(T* softmax,
const IndexType stride,
const IndexType element_count) {
constexpr IndexType kDimCeil = 1 << Log2Elements;
- constexpr IndexType kWarpSize = (kDimCeil < 32) ? kDimCeil : 32;
+ constexpr IndexType kWarpSize = (kDimCeil < 64) ? kDimCeil : 64;
constexpr IndexType kVSize = sizeof(VecT) / sizeof(T);
constexpr IndexType kLoops = kDimCeil / kWarpSize;
constexpr IndexType kLoopsV = (kLoops >= kVSize) ? (kLoops / kVSize) : 1;
- constexpr IndexType kBatchSize = (kDimCeil <= 32) ? 2 : 1;
+ constexpr IndexType kBatchSize = (kDimCeil <= 64) ? 2 : 1;
IndexType first_batch =
(static_cast<IndexType>(blockDim.y) * blockIdx.x + threadIdx.y) *
kBatchSize;
@@ -652,9 +655,9 @@ __global__ void WarpSoftmaxBackward(T* dst,
IndexType element_count) {
constexpr IndexType kVSize = sizeof(VecT) / sizeof(T);
constexpr IndexType kDimCeil = 1 << Log2Elements;
- constexpr IndexType kWarpSize = (kDimCeil < 32) ? kDimCeil : 32;
+ constexpr IndexType kWarpSize = (kDimCeil < 64) ? kDimCeil : 64;
constexpr IndexType kLoops = kDimCeil / kWarpSize;
- constexpr IndexType kBatchSize = (kDimCeil <= 128) ? 2 : 1;
+ constexpr IndexType kBatchSize = (kDimCeil <= 256) ? 2 : 1;
constexpr IndexType kLoopsV = (kLoops >= kVSize) ? (kLoops / kVSize) : 1;
IndexType element_count_v = element_count / kVSize;
IndexType first_batch =
@@ -865,6 +868,10 @@ static void GetGridDim(int64_t high_dim,
int block_span = (blockDim.x + warpSize - 1) >> PADDLE_WARP_SHIFT;
- *val = (lane < block_span) ? shared[lane] : -1e10f;
+ *val = (lane < block_span) ? shared[lane] : std::numeric_limits<T>::lowest();
if (wid == 0) {
WarpReduceMaxDown<T, 1, PADDLE_WARP_SIZE>(val);
}
@@ -953,6 +956,10 @@ static void GetGridDim(int64_t high_dim,
grid_x = std::min(grid_x, max_num_blocks);
int64_t grid_y = (max_num_blocks + grid_x - 1) / grid_x;
grid_y = std::min(grid_y, high_dim);
Expand All @@ -1276,44 +1159,6 @@ index be6ee4f854..87ad910d79 100644
grid->x = grid_x;
grid->y = grid_y;
}
@@ -873,7 +880,7 @@ static void GetBlockDim(int64_t mid_dim, int64_t low_dim, dim3* block) {
constexpr int max_num_threads = 1024;
int64_t block_x = int64_t(1) << Log2Ceil(low_dim);
int64_t block_y = int64_t(1) << Log2Ceil(mid_dim);
- block->x = std::min<int64_t>(block_x, 32);
+ block->x = std::min<int64_t>(block_x, 64);
block->y = std::min<int64_t>(block_y, max_num_threads / block->x);
block->x = std::min<int64_t>(block_x, max_num_threads / block->y);
}
@@ -2663,11 +2670,11 @@ void SoftmaxForwardCUDAKernelDriverImpl(const GPUContext& dev_ctx,
D > std::numeric_limits<int32_t>::max()) {
int dim_log2 = static_cast<int>(Log2Ceil(dim));
IndexType dim_ceil = 1 << dim_log2;
- int warp_size = (dim_ceil < 32) ? dim_ceil : 32;
- int batches_per_warp = (dim_ceil <= 32) ? 2 : 1;
+ int warp_size = (dim_ceil < 64) ? dim_ceil : 64;
+ int batches_per_warp = (dim_ceil <= 64) ? 2 : 1;

// use 128 threads per block to maximize gpu utilization
- constexpr int threads_per_block = 128;
+ constexpr int threads_per_block = 256;

int warps_per_block = (threads_per_block / warp_size);
int batches_per_block = warps_per_block * batches_per_warp;
@@ -2802,10 +2809,10 @@ void SoftmaxBackwardCUDAKernelDriverImpl(const GPUContext& dev_ctx,
D > std::numeric_limits<int32_t>::max()) {
int dim_log2 = Log2Ceil(dim);
IndexType dim_ceil = 1 << dim_log2;
- int warp_size = (dim_ceil < 32) ? dim_ceil : 32;
- int batches_per_warp = (dim_ceil <= 128) ? 2 : 1;
+ int warp_size = (dim_ceil < 64) ? dim_ceil : 64;
+ int batches_per_warp = (dim_ceil <= 256) ? 2 : 1;

- constexpr int threads_per_block = 128;
+ constexpr int threads_per_block = 256;

int warps_per_block = (threads_per_block / warp_size);
int batches_per_block = warps_per_block * batches_per_warp;
diff --git a/paddle/phi/kernels/gpudnn/softmax_kernel.cu b/paddle/phi/kernels/gpudnn/softmax_kernel.cu
index 2972c2bd85..0bb78d97b5 100644
--- a/paddle/phi/kernels/gpudnn/softmax_kernel.cu
Expand Down
Loading