55
66_LOGGER = AiterTritonLogger ()
77
@triton.jit
def _softmax_kernel(
    output_ptr,
    input_ptr,
    input_row_stride,
    output_row_stride,
    n_rows,
    n_cols,
    BLOCK_SIZE: tl.constexpr,
):
    """Numerically-stable row-wise softmax; one program instance per row.

    Rows are independent, so the grid parallelizes across rows. BLOCK_SIZE
    is chosen by the launcher as the next power of two >= n_cols, so an
    entire row fits in a single block.
    """
    row = tl.program_id(0)
    # Guard against a grid larger than the number of rows.
    if row >= n_rows:
        return

    # The stride is the element count between consecutive rows.
    col_idx = tl.arange(0, BLOCK_SIZE)
    in_bounds = col_idx < n_cols
    # Masked load: lanes past n_cols read -inf so they contribute
    # exp(-inf) == 0 to the reduction below.
    vals = tl.load(
        input_ptr + row * input_row_stride + col_idx,
        mask=in_bounds,
        other=-float("inf"),
    )

    # Subtract the row max before exponentiating for numerical stability.
    shifted = vals - tl.max(vals, axis=0)
    # Note: Triton's exp is fast but approximate (think __expf in CUDA).
    exps = tl.exp(shifted)
    probs = exps / tl.sum(exps, axis=0)

    # Write the normalized row back to DRAM, masking the tail lanes.
    tl.store(
        output_ptr + row * output_row_stride + col_idx,
        probs,
        mask=in_bounds,
    )
840
941@triton .jit
1042def _softmax_kernel_online (
@@ -16,7 +48,6 @@ def _softmax_kernel_online(
1648 n_cols ,
1749 BLOCK_SIZE : tl .constexpr ,
1850):
19-
2051 row_start = tl .program_id (0 )
2152 row_idx = row_start
2253
@@ -54,7 +85,6 @@ def _softmax_kernel_online(
5485 output_ptrs = output_row_start_ptr + col_offsets
5586 tl .store (output_ptrs , softmax_output , mask = mask )
5687
57-
5888def softmax (x ):
5989 """
6090 Computes the row-wise softmax of a 2D input tensor.
@@ -73,17 +103,20 @@ def softmax(x):
73103 n_rows , n_cols = x .shape
74104
75105 MAX_FUSED_SIZE = 65536 // x .element_size ()
106+ print ("MAX_FUSED_SIZE: " , MAX_FUSED_SIZE )
76107 BLOCK_SIZE = min (MAX_FUSED_SIZE , triton .next_power_of_2 (n_cols ))
108+ print ("BLOCK_SIZE: " , BLOCK_SIZE )
77109 y = torch .empty_like (x )
78110
79- waves_per_eu = 2
111+ waves_per_eu = 4 # 2
80112 num_warps = 8
81113 num_stages = 2
82114
83115 num_programs = n_rows
84116
85117 grid = lambda meta : (num_programs ,) # noqa: E731
86118 _softmax_kernel_online [grid ](
119+ # _softmax_kernel[grid](
87120 y ,
88121 x ,
89122 x .stride (0 ),
0 commit comments