Draft
44 commits
c35c8ac
add initial implementation of projection mapping
anhminhnguyenhoang Jan 16, 2026
2a8325c
Refactor mHC kernel and wrapper to include sigmoid activation in proj…
waqahmed-amd-fi Jan 19, 2026
f11244d
Add Sinkhorn-Knopp log-domain kernel implementation
anhminhnguyenhoang Jan 20, 2026
87e5839
clean up sinkhorn-knopp tests
anhminhnguyenhoang Jan 20, 2026
152bb21
review invalid test case
anhminhnguyenhoang Jan 20, 2026
18a62a1
Refactor mHC kernel and wrapper to implement equations 14-18 as fused…
waqahmed-amd-fi Jan 20, 2026
36e36d6
Fix H dims
waqahmed-amd-fi Jan 20, 2026
6156c13
fix test_mhc_output_range
waqahmed-amd-fi Jan 20, 2026
80b8d34
Refactor test cases in mHC and Sinkhorn-Knopp implementations and sim…
anhminhnguyenhoang Jan 20, 2026
4801d63
Fix issues (#1878)
waqahmed-amd-fi Jan 21, 2026
d952829
optimization to load x_tile once, reducing memory bandwidth usage
waqahmed-amd-fi Jan 21, 2026
30741ca
Update mHC implementation to apply Sinkhorn-Knopp (Equation 19) to ma…
waqahmed-amd-fi Jan 21, 2026
686711f
Refactor mHC implementation to separate projection (phi) matrices int…
waqahmed-amd-fi Jan 21, 2026
831572b
Enhance mHC fused kernel to implement stream-aware processing
anhminhnguyenhoang Jan 22, 2026
ad198b3
Refactor mHC implementation
anhminhnguyenhoang Jan 22, 2026
05810eb
Adjust tolerance levels in mHC tests based on input size to improve a…
anhminhnguyenhoang Jan 22, 2026
7d333a8
Add benchmark scripts for mHC kernel performance evaluation
waqahmed-amd-fi Jan 22, 2026
46d023a
add modes to bench
waqahmed-amd-fi Jan 23, 2026
e8f4464
- Add naive configs for fused mHC and Sinkhorn-Knopp kernels
anhminhnguyenhoang Jan 23, 2026
68a2df8
switch to using exp2/log2 for sinkhorn-knopp for optimization
anhminhnguyenhoang Jan 23, 2026
df24377
Sort benchmark configurations by hidden dimension and refine FLOPs ca…
waqahmed-amd-fi Jan 26, 2026
a4a1793
Refactor Sinkhorn-Knopp kernel to support batch processing
anhminhnguyenhoang Jan 26, 2026
23609ed
better tuned configs
anhminhnguyenhoang Jan 26, 2026
514f3f5
Refactor mHC fused kernel for improved arithmetic operations and clar…
waqahmed-amd-fi Jan 27, 2026
10b122a
Add split-K support to mHC kernel by new split and reduce kernels to …
anhminhnguyenhoang Jan 27, 2026
9f6aba3
add better config with split reduce usage
anhminhnguyenhoang Jan 27, 2026
833c47a
Apply optim in mhc_fused to split reduce kernels, rename functions fo…
anhminhnguyenhoang Jan 27, 2026
70490ad
Add json config loading
anhminhnguyenhoang Jan 28, 2026
b01c00a
Add tuned JSON configuration files for fused mhc kernels
anhminhnguyenhoang Jan 28, 2026
42c944d
initial implementation of zero-iteration Sinkhorn-Knopp (mHC-Lite). …
waqahmed-amd-fi Jan 28, 2026
190122e
optimized zero-iteration Sinkhorn-Knopp (mHC-Lite)
waqahmed-amd-fi Jan 29, 2026
507d95a
Removed Unused Projection Code (Wrapper Function)
waqahmed-amd-fi Jan 29, 2026
c2ccafc
add config loading bug fix due to caching and better tuned configs
anhminhnguyenhoang Jan 29, 2026
7862631
2D grid parallelization. Key improvements:
waqahmed-amd-fi Jan 29, 2026
cd84075
remove _sinkhorn_knopp_lite, and implement mHC_lite i.e., non-iterati…
waqahmed-amd-fi Jan 29, 2026
3e85b9d
add config loading bug fix due to caching and better tuned configs
anhminhnguyenhoang Jan 29, 2026
5374eb6
add mhc-lite
anhminhnguyenhoang Jan 30, 2026
4f4c272
revised mHC and mHC-Lite description for clarity
waqahmed-amd-fi Jan 30, 2026
57c75ac
update comments and replace if-else with assert check
waqahmed-amd-fi Jan 30, 2026
68c76b0
revised _mhc_lite_fused_split_kernel kernel
waqahmed-amd-fi Jan 30, 2026
604426d
revised _mhc_lite_fused_reduce_kernel
waqahmed-amd-fi Jan 30, 2026
e52ebdc
add mhc-lite bench mode
anhminhnguyenhoang Jan 30, 2026
e70f3dd
integrate mhc-lite into mhc_fused
anhminhnguyenhoang Jan 30, 2026
41b4908
update config loading for mode
anhminhnguyenhoang Jan 30, 2026
17 changes: 13 additions & 4 deletions aiter/ops/triton/fusions/mhc.py
@@ -34,17 +34,18 @@ def mhc(
out_res: Optional[torch.Tensor] = None,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Compute mHC projection mapping with all three streams (equations 14-18).
Compute mHC projection mapping with all three streams (equations 14-19).

This function implements:
- Eq 14: H̃ = x̃φ (matrix multiplication)
- Eq 15: r = ||x̃||₂ / √(nC) (RMS normalization)
- Eq 16: [H^pre, H^post, H^res] = 1/r [α^pre·H̃^pre, α^post·H̃^post, α^res·H̃^res] + b
- Eq 17: H^pre = σ(H^pre) - sigmoid activation for pre-stream
- Eq 18: H^post = 2σ(H^post) - scaled sigmoid activation for post-stream
- H^res: identity (no activation, ready for Eq 19: Sinkhorn-Knopp)
- Eq 19: H^res = Sinkhorn(H^res) - project residual stream onto doubly stochastic
manifold (identity activation followed by iterative row/column normalization)

All operations are fused in a single Triton kernel for optimal performance.
All operations are fused in optimized Triton kernels for maximum performance.

Args:
x: Input tensor with shape (M, nC) where M is batch/sequence length and
@@ -66,7 +67,7 @@
Tuple of three tensors (H_pre, H_post, H_res):
- H_pre: (M, n) - manifold projection with sigmoid activation (H^{pre} ∈ ℝ^{M×n})
- H_post: (M, n) - post-processing with scaled sigmoid (H^{post} ∈ ℝ^{M×n})
- H_res: (M, n²) - residual connection, identity activation (H^{res} ∈ ℝ^{M×n²})
- H_res: (M, n²) - doubly stochastic residual connection (H^{res} ∈ ℝ^{M×n²})

Shape requirements:
- x: (M, nC) where nC = n * C (flattened streams)
@@ -82,8 +83,11 @@
>>> phi = torch.randn(nC, N_total, dtype=torch.bfloat16, device='cuda')
>>> bias = torch.randn(N_total, dtype=torch.float32, device='cuda')
>>> alpha_pre, alpha_post, alpha_res = 1.0, 1.5, 0.8
>>>
>>> # Full mHC with Sinkhorn-Knopp (Eq 14-19)
>>> H_pre, H_post, H_res = mhc(x, phi, alpha_pre, alpha_post, alpha_res, bias, n)
>>> H_pre.shape, H_post.shape, H_res.shape # (32, 4), (32, 4), (32, 16)
>>> # H_res is doubly stochastic: rows and columns sum to 1
"""
_LOGGER.info(
f"MHC: x={tuple(x.shape)} phi={tuple(phi.shape)} alpha_pre={alpha_pre} alpha_post={alpha_post} alpha_res={alpha_res}"
@@ -170,6 +174,11 @@ def mhc(
BLOCK_K=BLOCK_K,
)

# Apply Sinkhorn-Knopp (Equation 19) to make H_res doubly stochastic
# Reshape H_res from (M, n²) to (M, n, n) for Sinkhorn kernel
H_res_3d = out_res.view(M, n, n)
sinkhorn_knopp(H_res_3d, out=H_res_3d)

return out_pre, out_post, out_res


51 changes: 37 additions & 14 deletions op_tests/triton_tests/fusions/test_mhc.py
@@ -68,11 +68,13 @@ def test_mhc_correctness(M, n, C, dtype):
atol=1e-2,
rtol=1e-2,
)
# Relaxed tolerance for H_res due to Sinkhorn-Knopp iterative algorithm
# which amplifies small numerical differences, especially with bfloat16
torch.testing.assert_close(
H_res_triton.to(torch.float32),
H_res_torch.to(torch.float32),
atol=1e-2,
rtol=1e-2,
atol=5e-2,
rtol=5e-2,
)

Review thread on this change:
Comment: Did you run into test failures because of this for similar tests, such that you needed to relax the tolerance?
Reply: Yes, mainly because Sinkhorn-Knopp is an iterative process and returns larger differences with only 10 iterations. Maybe we can try 20 for better results?


Expand Down Expand Up @@ -111,11 +113,12 @@ def test_mhc_preallocated_output(M, n, C):
atol=1e-2,
rtol=1e-2,
)
# Relaxed tolerance for H_res due to Sinkhorn-Knopp iterative algorithm
torch.testing.assert_close(
out_res.to(torch.float32),
H_res_torch.to(torch.float32),
atol=1e-2,
rtol=1e-2,
atol=5e-2,
rtol=5e-2,
)


@@ -135,11 +138,15 @@ def test_mhc_different_epsilon(eps, M, n, C):
H_pre_triton, H_post_triton, H_res_triton = mhc(x, phi, alpha_pre, alpha_post, alpha_res, bias, n_streams, eps=eps)

for torch_out, triton_out in [(H_pre_torch, H_pre_triton), (H_post_torch, H_post_triton), (H_res_torch, H_res_triton)]:
# Use relaxed tolerance for H_res due to Sinkhorn-Knopp
is_res = torch_out is H_res_torch
atol = 5e-2 if is_res else 1e-2
rtol = 5e-2 if is_res else 1e-2
torch.testing.assert_close(
triton_out.to(torch.float32),
torch_out.to(torch.float32),
atol=1e-2,
rtol=1e-2,
atol=atol,
rtol=rtol,
)


@@ -164,11 +171,15 @@ def test_mhc_different_alpha(alpha_scale):
H_pre_triton, H_post_triton, H_res_triton = mhc(x, phi, alpha_pre, alpha_post, alpha_res, bias, n_streams)

for torch_out, triton_out in [(H_pre_torch, H_pre_triton), (H_post_torch, H_post_triton), (H_res_torch, H_res_triton)]:
# Use relaxed tolerance for H_res due to Sinkhorn-Knopp
is_res = torch_out is H_res_torch
atol = 5e-2 if is_res else 1e-2
rtol = 5e-2 if is_res else 1e-2
torch.testing.assert_close(
triton_out.to(torch.float32),
torch_out.to(torch.float32),
atol=1e-2,
rtol=1e-2,
atol=atol,
rtol=rtol,
)


@@ -193,11 +204,15 @@ def test_mhc_zero_input():
H_pre_triton, H_post_triton, H_res_triton = mhc(x, phi, alpha_pre, alpha_post, alpha_res, bias, n)

for torch_out, triton_out in [(H_pre_torch, H_pre_triton), (H_post_torch, H_post_triton), (H_res_torch, H_res_triton)]:
# Use relaxed tolerance for H_res due to Sinkhorn-Knopp
is_res = torch_out is H_res_torch
atol = 5e-2 if is_res else 1e-2
rtol = 5e-2 if is_res else 1e-2
torch.testing.assert_close(
triton_out.to(torch.float32),
torch_out.to(torch.float32),
atol=1e-2,
rtol=1e-2,
atol=atol,
rtol=rtol,
)


@@ -222,11 +237,15 @@ def test_mhc_large_values():
H_pre_triton, H_post_triton, H_res_triton = mhc(x, phi, alpha_pre, alpha_post, alpha_res, bias, n)

for torch_out, triton_out in [(H_pre_torch, H_pre_triton), (H_post_torch, H_post_triton), (H_res_torch, H_res_triton)]:
# Use even more relaxed tolerance for large values + Sinkhorn-Knopp
is_res = torch_out is H_res_torch
atol = 0.2 if is_res else 0.1
rtol = 0.1 if is_res else 0.05
torch.testing.assert_close(
triton_out.to(torch.float32),
torch_out.to(torch.float32),
atol=0.1,
rtol=0.05,
atol=atol,
rtol=rtol,
)


@@ -247,11 +266,15 @@ def test_mhc_small_shapes(M, n, C, dtype):
H_pre_triton, H_post_triton, H_res_triton = mhc(x, phi, alpha_pre, alpha_post, alpha_res, bias, n_streams)

for torch_out, triton_out in [(H_pre_torch, H_pre_triton), (H_post_torch, H_post_triton), (H_res_torch, H_res_triton)]:
# Use relaxed tolerance for H_res due to Sinkhorn-Knopp
is_res = torch_out is H_res_torch
atol = 5e-2 if is_res else 1e-2
rtol = 5e-2 if is_res else 1e-2
torch.testing.assert_close(
triton_out.to(torch.float32),
torch_out.to(torch.float32),
atol=1e-2,
rtol=1e-2,
atol=atol,
rtol=rtol,
)


17 changes: 9 additions & 8 deletions op_tests/triton_tests/utils/mhc_ref.py
@@ -42,7 +42,7 @@ def mhc_torch(
eps: float = 1e-6,
) -> torch.Tensor:
"""
PyTorch reference implementation of mHC projection mapping (Eq 14-18).
PyTorch reference implementation of mHC projection mapping (Eq 14-19).

This serves as ground truth for validating the Triton kernel implementation.

Expand All @@ -52,7 +52,7 @@ def mhc_torch(
- Eq 16: [H^pre, H^post, H^res] = 1/r [α^pre·H̃^pre, α^post·H̃^post, α^res·H̃^res] + b (scaling)
- Eq 17: H^pre = σ(H^pre) (sigmoid activation for pre-stream)
- Eq 18: H^post = 2σ(H^post) (scaled sigmoid activation for post-stream)
- H^res: identity (no activation, ready for Eq 19: Sinkhorn-Knopp)
- Eq 19: H^res = Sinkhorn(H^res) (project residual stream onto doubly stochastic manifold)

Args:
x: Input x_l with shape (M, nC) - flattened n-stream residual
@@ -52,7 +52,7 @@
Tuple of three tensors (H_pre, H_post, H_res):
- H_pre: (M, n) manifold projection with sigmoid
- H_post: (M, n) post-processing with 2*sigmoid
- H_res: (M, n²) residual connection (identity)
- H_res: (M, n²) doubly stochastic residual connection
"""
x_f32 = x.to(torch.float32)
nC = x.shape[1]

# Eq 15: r = ||x̃||₂ / √(nC)
mean_sq = torch.mean(x_f32 ** 2, dim=-1, keepdim=True)
@@ -83,7 +82,6 @@
H_tilde = x_norm @ phi_f32

# Split into three streams
n_squared = n * n
H_tilde_pre = H_tilde[:, :n] # n coefficients (H^{pre} ∈ ℝ^{1×n})
H_tilde_post = H_tilde[:, n:2*n] # n coefficients (H^{post} ∈ ℝ^{1×n})
H_tilde_res = H_tilde[:, 2*n:] # n² coefficients (H^{res} ∈ ℝ^{n×n})
@@ -108,9 +106,12 @@
# H^post = 2σ(H^post)
H_post = 2.0 * torch.sigmoid(H_post)

# H^res: identity activation (no change)
# Preserves values for subsequent Sinkhorn-Knopp normalization (Eq 19)
# H_res stays as is
# Eq 19: Apply Sinkhorn-Knopp to H^res for doubly stochastic constraint
# Reshape H_res from (M, n²) to (M, n, n) for Sinkhorn algorithm
M = H_res.shape[0]
H_res_3d = H_res.view(M, n, n)
H_res_ds = sinkhorn_knopp_log_domain_torch(H_res_3d)
H_res = H_res_ds.view(M, -1) # Reshape back to (M, n²)

# Return three separate streams
return H_pre.to(x.dtype), H_post.to(x.dtype), H_res.to(x.dtype)