@@ -63,9 +63,7 @@ def generate_inputs_outputs(self) -> Tuple[List[torch.Tensor], List[torch.Tensor
             ref_C_list.append(ref_C)
         return A_list, B_list, C_list, ref_C_list
 
-    def test_grouped_gemm(
-        self, atol=1e-2, rtol=1e-2, check_accuracy=True, check_performance=False
-    ):
+    def test_grouped_gemm(self, atol=1e-2, rtol=1e-2, check_accuracy=True, check_performance=False):
 
         WARM_ITERS = 10
         ITERS = 1000
@@ -91,11 +89,14 @@ def test_grouped_gemm(
                 get_multi_stream_cublas_workspace(),
                 layout=layout,
                 m_splits=self.m_splits,
-                accumulate=self.accumulate
+                accumulate=self.accumulate,
             )
             torch.cuda.synchronize()
 
-            print(f'\n === Accuracy Testing with Layout:{layout} GemmType:{os.getenv("NVTE_USE_CUTLASS_GROUPGEMM", "0")}')
+            print(
+                "\n === Accuracy Testing with"
+                f" Layout:{layout} GemmType:{os.getenv('NVTE_USE_CUTLASS_GROUPGEMM', '0')}"
+            )
             if check_accuracy:
 
                 alpha = 1.0
@@ -140,7 +141,7 @@ def test_grouped_gemm(
                 get_multi_stream_cublas_workspace(),
                 layout=layout,
                 m_splits=self.m_splits,
-                accumulate=self.accumulate
+                accumulate=self.accumulate,
             )
 
             torch.cuda.synchronize()
@@ -154,7 +155,7 @@ def test_grouped_gemm(
                 get_multi_stream_cublas_workspace(),
                 layout=layout,
                 m_splits=self.m_splits,
-                accumulate=self.accumulate
+                accumulate=self.accumulate,
             )
             torch.cuda.synchronize()
             end_time = time.perf_counter()
@@ -193,12 +194,12 @@ def run_grouped_gemm(group_config, check_performance, transa, transb, accumulate
                 [4096, 768, 2048],
                 [4096, 768, 2048],
                 [4096, 768, 2048],
-                [4096, 768, 2048]
+                [4096, 768, 2048],
             ],
             "accumulate": False,
             "check_performance": True,
             "transa": False,
-            "transb": True
+            "transb": True,
         },
         {
             "group_config": [
@@ -217,16 +218,16 @@ def run_grouped_gemm(group_config, check_performance, transa, transb, accumulate
                 [2048, 768, 2048],
                 [2048, 768, 2048],
                 [2048, 768, 2048],
-                [2048, 768, 2048]
+                [2048, 768, 2048],
             ],
             "accumulate": False,
             "check_performance": True,
             "transa": False,
-            "transb": True
-        }
+            "transb": True,
+        },
     ]
 }
-
+
 for i, case in enumerate(config_data["configs"]):
     group_config = [tuple(x) for x in case["group_config"]]
     accumulate = case.get("accumulate", False)