PaddlePaddle · YqGe585 · Mar 16, 2026
diff --git a/patches/paddle-corex.patch b/patches/paddle-corex.patch
@@ -898,55 +898,6 @@ index 85615d0dd3..a48ac51308 100644
        common::errors::Unavailable("softmax_with_cross_entropy operator's "
                                    "CUDA kernel only runs on GPU device."));
    const T* loss_grad_data = loss_grad.data<T>();
-diff --git a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu
-index 9d382a8763..bb9512af53 100644
---- a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu
-+++ b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu
-@@ -84,7 +84,7 @@ __global__ void CrossEntropySoftLabel(T* loss,
-
-   const int kThreadPerBlock = 512;
-   const int kBatchPerBlock = 1;
--  const int kWarpSize = 32;  // (dim < 32) ? dim : 32;
-+  const int kWarpSize = 64;  // (dim < 32) ? dim : 32;
-   const int kBatchSize = 1;
-   const int kThreadPerBatch = kThreadPerBlock / kBatchPerBlock;
-   const int kWarpPerBatch = kThreadPerBatch / kWarpSize;
-@@ -543,7 +543,7 @@ __global__ void WarpSoftmaxForwardSoftLabel(T* loss,
-   const bool LogMode = true;
-
-   constexpr int kDimCeil = 1 << Log2Elements;
--  constexpr int kWarpSize = (kDimCeil < 32) ? kDimCeil : 32;
-+  constexpr int kWarpSize = (kDimCeil < 64) ? kDimCeil : 64;
-   constexpr int kVSize = sizeof(VecT) / sizeof(T);
-   constexpr int kIterations = kDimCeil / kWarpSize;
-   constexpr int kIterationsV =
-@@ -743,7 +743,7 @@ static void SoftmaxWithCrossEntropySoftLabel(const GPUContext& dev_ctx,
-   auto stream = dev_ctx.stream();
-
-   if (D == 1 && dim <= max_dim) {
--    int kWarpSize = (kDimCeil < 32) ? kDimCeil : 32;
-+    int kWarpSize = (kDimCeil < 64) ? kDimCeil : 64;
-     int batches_per_warp = (kDimCeil <= 128) ? 2 : 1;
-
-     // use 128 threads per block to maximize gpu utilization
-@@ -841,7 +841,7 @@ __global__ void WarpSoftmaxForward(T* loss,
-                                    const int element_count,
-                                    const int ignore_index) {
-   constexpr int kDimCeil = 1 << Log2Elements;
--  constexpr int kWarpSize = (kDimCeil < 32) ? kDimCeil : 32;
-+  constexpr int kWarpSize = (kDimCeil < 64) ? kDimCeil : 64;
-   constexpr int kVSize = sizeof(VecT) / sizeof(T);
-   constexpr int kIterations = kDimCeil / kWarpSize;
-   constexpr int kIterationsV =
-@@ -1089,7 +1089,7 @@ void SwitchWarpSoftmaxForward(T* loss,
-   // use 128 threads per block to maximimize gpu utilization
-   const int log2_elements = static_cast<int>(Log2Ceil(element_count));
-   const int kDimCeil = 1 << log2_elements;
--  int kWarpSize = (kDimCeil < 32) ? kDimCeil : 32;
-+  int kWarpSize = (kDimCeil < 64) ? kDimCeil : 64;
-   int batches_per_warp = (kDimCeil <= 128) ? 2 : 1;
-   constexpr int threads_per_block = 128;
-   int warps_per_block = (threads_per_block / kWarpSize);
 diff --git a/paddle/phi/kernels/gpu/elementwise_grad.h b/paddle/phi/kernels/gpu/elementwise_grad.h
 index 2d9a3493a6..460be448db 100644
 --- a/paddle/phi/kernels/gpu/elementwise_grad.h
@@ -1166,7 +1117,7 @@ index 085845dfb3..defe09ec84 100644
        PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType(
            cdesc.desc(), CUDNN_DEFAULT_MATH));
 diff --git a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h
-index be6ee4f854..87ad910d79 100644
+index 43c92298f1..6811e4f3e0 100644
 --- a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h
 +++ b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h
 @@ -29,6 +29,9 @@ limitations under the License. */
@@ -1179,93 +1130,25 @@ index be6ee4f854..87ad910d79 100644
 
  COMMON_DECLARE_bool(use_accuracy_compatible_kernel);
 
-@@ -100,7 +103,7 @@ inline int CalcBlockSize(int vec_size, uint64_t dim_size) {
-   }
-
-   while (block_size < (max_block_size)) block_size *= 2;
--  block_size = std::max(block_size, static_cast<uint64_t>(32));
-+  block_size = std::max(block_size, static_cast<uint64_t>(64));
-   return block_size;
- }
-
-@@ -132,37 +135,37 @@ __device__ __forceinline__ void WarpReduceMax(T* sum) {
-
- template <typename T>
- __inline__ __device__ void BlockReduceMax(T* val) {
--  static __shared__ T shared[32];
--  int lane = threadIdx.x & 0x1f;
--  int wid = threadIdx.x >> 5;
-+  static __shared__ T shared[64];
-+  int lane = threadIdx.x & 0x3f;
-+  int wid = threadIdx.x >> 6;
-
--  WarpReduceMax<T, 1, 32>(val);
-+  WarpReduceMax<T, 1, 64>(val);
-
-   if (lane == 0) shared[wid] = *val;
-
+@@ -170,7 +173,7 @@ __inline__ __device__ void BlockReduceMax(T* val) {
    __syncthreads();
 
--  int block_span = (blockDim.x + warpSize - 1) >> 5;
+   int block_span = (blockDim.x + warpSize - 1) >> PADDLE_WARP_SHIFT;
 -  *val = (lane < block_span) ? shared[lane] : -1e10f;
--  WarpReduceMax<T, 1, 32>(val);
-+  int block_span = (blockDim.x + warpSize - 1) >> 6;
 +  *val = (lane < block_span) ? shared[lane] : std::numeric_limits<T>::lowest();
-+  WarpReduceMax<T, 1, 64>(val);
+   WarpReduceMax<T, 1, PADDLE_WARP_SIZE>(val);
  }
 
- template <typename T>
- __inline__ __device__ void BlockReduceSum(T* val) {
--  static __shared__ T shared[32];
--  int lane = threadIdx.x & 0x1f;
--  int wid = threadIdx.x >> 5;
-+  static __shared__ T shared[64];
-+  int lane = threadIdx.x & 0x3f;
-+  int wid = threadIdx.x >> 6;
-
--  WarpReduceSum<T, 1, 32>(val);
-+  WarpReduceSum<T, 1, 64>(val);
-
+@@ -206,7 +209,7 @@ __inline__ __device__ void BlockReduceMaxDown(T* val) {
    __syncthreads();
-   if (lane == 0) shared[wid] = *val;
-
-   __syncthreads();
-
--  int block_span = (blockDim.x + warpSize - 1) >> 5;
-+  int block_span = (blockDim.x + warpSize - 1) >> 6;
-   *val = (lane < block_span) ? shared[lane] : static_cast<T>(0.0f);
--  WarpReduceSum<T, 1, 32>(val);
-+  WarpReduceSum<T, 1, 64>(val);
- }
 
- template <typename Tx, typename Ty = Tx>
-@@ -531,11 +534,11 @@ __global__ void WarpSoftmaxForward(T* softmax,
-                                    const IndexType stride,
-                                    const IndexType element_count) {
-   constexpr IndexType kDimCeil = 1 << Log2Elements;
--  constexpr IndexType kWarpSize = (kDimCeil < 32) ? kDimCeil : 32;
-+  constexpr IndexType kWarpSize = (kDimCeil < 64) ? kDimCeil : 64;
-   constexpr IndexType kVSize = sizeof(VecT) / sizeof(T);
-   constexpr IndexType kLoops = kDimCeil / kWarpSize;
-   constexpr IndexType kLoopsV = (kLoops >= kVSize) ? (kLoops / kVSize) : 1;
--  constexpr IndexType kBatchSize = (kDimCeil <= 32) ? 2 : 1;
-+  constexpr IndexType kBatchSize = (kDimCeil <= 64) ? 2 : 1;
-   IndexType first_batch =
-       (static_cast<IndexType>(blockDim.y) * blockIdx.x + threadIdx.y) *
-       kBatchSize;
-@@ -652,9 +655,9 @@ __global__ void WarpSoftmaxBackward(T* dst,
-                                     IndexType element_count) {
-   constexpr IndexType kVSize = sizeof(VecT) / sizeof(T);
-   constexpr IndexType kDimCeil = 1 << Log2Elements;
--  constexpr IndexType kWarpSize = (kDimCeil < 32) ? kDimCeil : 32;
-+  constexpr IndexType kWarpSize = (kDimCeil < 64) ? kDimCeil : 64;
-   constexpr IndexType kLoops = kDimCeil / kWarpSize;
--  constexpr IndexType kBatchSize = (kDimCeil <= 128) ? 2 : 1;
-+  constexpr IndexType kBatchSize = (kDimCeil <= 256) ? 2 : 1;
-   constexpr IndexType kLoopsV = (kLoops >= kVSize) ? (kLoops / kVSize) : 1;
-   IndexType element_count_v = element_count / kVSize;
-   IndexType first_batch =
-@@ -865,6 +868,10 @@ static void GetGridDim(int64_t high_dim,
+   int block_span = (blockDim.x + warpSize - 1) >> PADDLE_WARP_SHIFT;
+-  *val = (lane < block_span) ? shared[lane] : -1e10f;
++  *val = (lane < block_span) ? shared[lane] : std::numeric_limits<T>::lowest();
+   if (wid == 0) {
+     WarpReduceMaxDown<T, 1, PADDLE_WARP_SIZE>(val);
+   }
+@@ -953,6 +956,10 @@ static void GetGridDim(int64_t high_dim,
    grid_x = std::min(grid_x, max_num_blocks);
    int64_t grid_y = (max_num_blocks + grid_x - 1) / grid_x;
    grid_y = std::min(grid_y, high_dim);
@@ -1276,44 +1159,6 @@ index be6ee4f854..87ad910d79 100644
    grid->x = grid_x;
    grid->y = grid_y;
  }
-@@ -873,7 +880,7 @@ static void GetBlockDim(int64_t mid_dim, int64_t low_dim, dim3* block) {
-   constexpr int max_num_threads = 1024;
-   int64_t block_x = int64_t(1) << Log2Ceil(low_dim);
-   int64_t block_y = int64_t(1) << Log2Ceil(mid_dim);
--  block->x = std::min<int64_t>(block_x, 32);
-+  block->x = std::min<int64_t>(block_x, 64);
-   block->y = std::min<int64_t>(block_y, max_num_threads / block->x);
-   block->x = std::min<int64_t>(block_x, max_num_threads / block->y);
- }
-@@ -2663,11 +2670,11 @@ void SoftmaxForwardCUDAKernelDriverImpl(const GPUContext& dev_ctx,
-         D > std::numeric_limits<int32_t>::max()) {
-       int dim_log2 = static_cast<int>(Log2Ceil(dim));
-       IndexType dim_ceil = 1 << dim_log2;
--      int warp_size = (dim_ceil < 32) ? dim_ceil : 32;
--      int batches_per_warp = (dim_ceil <= 32) ? 2 : 1;
-+      int warp_size = (dim_ceil < 64) ? dim_ceil : 64;
-+      int batches_per_warp = (dim_ceil <= 64) ? 2 : 1;
-
-       // use 128 threads per block to maximize gpu utilization
--      constexpr int threads_per_block = 128;
-+      constexpr int threads_per_block = 256;
-
-       int warps_per_block = (threads_per_block / warp_size);
-       int batches_per_block = warps_per_block * batches_per_warp;
-@@ -2802,10 +2809,10 @@ void SoftmaxBackwardCUDAKernelDriverImpl(const GPUContext& dev_ctx,
-         D > std::numeric_limits<int32_t>::max()) {
-       int dim_log2 = Log2Ceil(dim);
-       IndexType dim_ceil = 1 << dim_log2;
--      int warp_size = (dim_ceil < 32) ? dim_ceil : 32;
--      int batches_per_warp = (dim_ceil <= 128) ? 2 : 1;
-+      int warp_size = (dim_ceil < 64) ? dim_ceil : 64;
-+      int batches_per_warp = (dim_ceil <= 256) ? 2 : 1;
-
--      constexpr int threads_per_block = 128;
-+      constexpr int threads_per_block = 256;
-
-       int warps_per_block = (threads_per_block / warp_size);
-       int batches_per_block = warps_per_block * batches_per_warp;
 diff --git a/paddle/phi/kernels/gpudnn/softmax_kernel.cu b/paddle/phi/kernels/gpudnn/softmax_kernel.cu
 index 2972c2bd85..0bb78d97b5 100644
 --- a/paddle/phi/kernels/gpudnn/softmax_kernel.cu