
Commit a5b8244

Merge branch 'main' into dev/jberchtold/fix-layernorm-distributed-tests
2 parents: 5e7e3b7 + cae1c43

26 files changed: +797 -559 lines changed


benchmarks/linear/benchmark_grouped_linear.py

Lines changed: 1 addition & 1 deletion
@@ -247,7 +247,7 @@ def run_benchmark_linear(mkns, recipe_name, use_bias, num_gemms=4):
     num_gemms_list = [8]
 
     if args.profile:
-        mkns = [(4096, 4096, 4096)]
+        mkns = [(4096 * 8, 4096, 4096)]
         # in profile mode, only run one recipe specified in args.recipe
         assert args.recipe != "all", (
             "In profile mode, only one recipe can be specified, please specify the recipe as"

tests/jax/conftest.py

Lines changed: 53 additions & 0 deletions
@@ -5,6 +5,8 @@
 import os
 import jax
 import pytest
+from collections import defaultdict
+import time
 
 
 import transformer_engine.jax
@@ -32,3 +34,54 @@ def enable_fused_attn_after_hopper():
     yield
     if "NVTE_FUSED_ATTN" in os.environ:
         del os.environ["NVTE_FUSED_ATTN"]
+
+
+class TestTimingPlugin:
+    """
+    Plugin to measure test execution time. Enable test timing by setting NVTE_JAX_TEST_TIMING=1
+    in the environment.
+    """
+
+    def __init__(self):
+        self.test_timings = defaultdict(list)
+
+    @pytest.hookimpl(tryfirst=True)
+    def pytest_runtest_setup(self, item):
+        item._timing_start = time.time()
+
+    @pytest.hookimpl(trylast=True)
+    def pytest_runtest_teardown(self, item, nextitem):
+        if hasattr(item, "_timing_start"):
+            duration = time.time() - item._timing_start
+
+            # Extract base function name without parameters
+            test_name = item.name
+            if "[" in test_name:
+                base_name = test_name.split("[")[0]
+            else:
+                base_name = test_name
+
+            self.test_timings[base_name].append(duration)
+
+    def pytest_sessionfinish(self, session, exitstatus):
+        print("\n" + "=" * 80)
+        print("TEST RUNTIME SUMMARY (grouped by function)")
+        print("=" * 80)
+
+        total_overall = 0
+        for test_name, durations in sorted(self.test_timings.items()):
+            total_time = sum(durations)
+            count = len(durations)
+            avg_time = total_time / count if count > 0 else 0
+            total_overall += total_time
+
+            print(f"{test_name:<60} | {count:3}x | {total_time:7.2f}s | avg: {avg_time:6.2f}s")
+
+        print("=" * 80)
+        print(f"{'TOTAL RUNTIME':<60} | {'':>3} | {total_overall:7.2f}s |")
+        print("=" * 80)
+
+
+def pytest_configure(config):
+    if os.getenv("NVTE_JAX_TEST_TIMING", "0") == "1":
+        config.pluginmanager.register(TestTimingPlugin(), "test_timing")
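
Note: the plugin above is opt-in. A minimal usage sketch (not part of this commit), assuming the standard pytest entry point; the test file chosen below is only an illustrative example:

# Hedged usage sketch: NVTE_JAX_TEST_TIMING=1 must be set before pytest starts
# so that pytest_configure registers TestTimingPlugin and the summary prints
# at session end.
import os
import pytest

os.environ["NVTE_JAX_TEST_TIMING"] = "1"
pytest.main(["tests/jax/test_distributed_softmax.py", "-q"])

Equivalently, from a shell: NVTE_JAX_TEST_TIMING=1 pytest tests/jax/test_distributed_softmax.py -q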

tests/jax/test_distributed_layernorm_mlp.py

Lines changed: 0 additions & 2 deletions
@@ -333,7 +333,6 @@ def _test_layernorm_mlp(
         with fp8_autocast(enabled=use_fp8, fp8_recipe=fp8_recipe):
             ln_mlp_single = LayerNormMLP(
                 layernorm_type=layernorm_type,
-                transpose_batch_sequence=False,  # input: [batch, seqlen, hidden]
                 intermediate_dim=INTERMEDIATE,
                 activations=activation_type,
                 use_bias=use_bias,
@@ -352,7 +351,6 @@
         ):
             ln_mlp_sharded = LayerNormMLP(
                 layernorm_type=layernorm_type,
-                transpose_batch_sequence=False,
                 intermediate_dim=INTERMEDIATE,
                 activations=activation_type,
                 scale_axes=LN_SCALE_AXES,

tests/jax/test_distributed_softmax.py

Lines changed: 4 additions & 4 deletions
@@ -135,7 +135,7 @@ def impl_test_softmax(
         )
 
     @pytest.mark.parametrize("device_count,mesh_shape,mesh_axes,mesh_resource", generate_configs())
-    @pytest.mark.parametrize("data_shape", [[32, 12, 128, 128], [64, 16, 1024, 1024]])
+    @pytest.mark.parametrize("data_shape", [[32, 12, 128, 128], [8, 8, 1024, 1024]])
     @pytest.mark.parametrize(
         "softmax_type",
         [SoftmaxType.SCALED, SoftmaxType.SCALED_MASKED, SoftmaxType.SCALED_UPPER_TRIANG_MASKED],
@@ -168,14 +168,14 @@ def test_softmax(
             dtype,
             bad_sharding,
             broadcast_batch_mask,
-            use_shardy=False,
+            use_shardy=True,
         )
 
     @pytest.mark.parametrize("device_count,mesh_shape,mesh_axes,mesh_resource", generate_configs())
     @pytest.mark.parametrize("softmax_type", [SoftmaxType.SCALED, SoftmaxType.SCALED_MASKED])
    @pytest.mark.parametrize("bad_sharding", [False, True])
     @pytest.mark.parametrize("broadcast_batch_mask", [False, True])
-    def test_softmax_shardy(
+    def test_softmax_gspmd(
         self,
         device_count,
         mesh_shape,
@@ -196,5 +196,5 @@ def test_softmax_shardy(
             dtype=DTYPES[0],
             bad_sharding=bad_sharding,
             broadcast_batch_mask=broadcast_batch_mask,
-            use_shardy=True,
+            use_shardy=False,
         )
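
Note on the renames above: after this change, test_softmax exercises use_shardy=True by default, while the renamed test_softmax_gspmd keeps coverage for the legacy path with use_shardy=False. As a hedged sketch only, this is what such a flag commonly toggles on the JAX side (assumption: the shared test helper flips JAX's jax_use_shardy_partitioner config; the actual plumbing lives in the test utilities, not in this diff):

# Hedged sketch, not code from this commit.
import jax

def select_partitioner(use_shardy: bool) -> None:
    # True  -> Shardy partitioner (default test_softmax path after this change)
    # False -> legacy GSPMD partitioner (test_softmax_gspmd path)
    jax.config.update("jax_use_shardy_partitioner", use_shardy)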

tests/pytorch/test_float8blockwisetensor.py

Lines changed: 1 addition & 1 deletion
@@ -219,7 +219,7 @@ def test_quantize_dequantize_compact_format(
             rowwise=True,
             columnwise=dq_columnwise,
             block_scaling_dim=block_scaling_dim,
-            all_gather_usage=True,
+            all_gather_usage=(block_scaling_dim == 1),
         )
         self._test_quantize_dequantize(
             quantizer=quantizer,

transformer_engine/common/common.cu

Lines changed: 1 addition & 0 deletions
@@ -138,6 +138,7 @@ void create_2D_tensor_map(CUtensorMap &tensorMap, const SimpleTensor &tensor,
                           const uint64_t globalY, const uint64_t globalX, const uint32_t shmemY,
                           const uint32_t shmemX, const uint32_t stride_elems,
                           const uint32_t offset_elems, const size_t type_num_bits) {
+  cuda_driver::ensure_context_exists();
   // Get a function pointer to the cuTensorMapEncodeTiled driver API
   // Note: PFN_cuTensorMapEncodeTiled is not defined in cuda13
   static PFN_cuTensorMapEncodeTiled_v12000 cuDriverTensorMapEncodeTiled = []() {

transformer_engine/common/include/transformer_engine/swizzle.h

Lines changed: 14 additions & 0 deletions
@@ -30,6 +30,20 @@ extern "C" {
  */
 void nvte_swizzle_scaling_factors(const NVTETensor input, NVTETensor output, cudaStream_t stream);
 
+/*! \brief Swizzling scaling factors into the required interleaved layout for GEMM
+ *
+ *  \param[in]     inputs   Input tensors with non-swizzled scale_inv.
+ *  \param[in,out] outputs  Output tensors which host the swizzled scale_inv.
+ *  \param[in]     stream   CUDA stream used for the operation.
+ *
+ *  Requirements:
+ *  - scale_inv is stored in row-major order.
+ *  - scale_inv size is padded to 128x4 for row-scale and 4x128 for col-scale.
+ *  - data is quantized along the K-dimension, i.e. the 1D-scaling block lies along the K-dimension.
+ */
+void nvte_multi_tensor_swizzle_scaling_factors(const NVTETensor* inputs, NVTETensor* outputs,
+                                               const size_t num_tensors, cudaStream_t stream);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
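
The padding requirement in the new doc comment determines the expected scale_inv shapes. As a hedged illustration only (plain Python arithmetic, not a binding to this C API), row-wise scales round up to multiples of 128x4 and column-wise scales to multiples of 4x128:

import math

def padded_scale_inv_shape(rows: int, cols: int, rowwise: bool = True) -> tuple:
    """Round a scale_inv shape up to the alignment described above (illustrative helper)."""
    align_r, align_c = (128, 4) if rowwise else (4, 128)
    return (math.ceil(rows / align_r) * align_r, math.ceil(cols / align_c) * align_c)

print(padded_scale_inv_shape(100, 3))                 # -> (128, 4)
print(padded_scale_inv_shape(3, 100, rowwise=False))  # -> (4, 128)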
