Commit dc2532a

apply to more modules
Signed-off-by: zhongboz <[email protected]>
1 parent: a831adb

File tree: 5 files changed, +75 -23 lines


benchmarks/linear/benchmark_linear_cpu_overhead.py

Lines changed: 46 additions & 12 deletions
@@ -6,7 +6,13 @@
 import transformer_engine.pytorch as te
 
 
-def run_once(module: torch.nn.Module, args: List[torch.Tensor], iters = 2000, use_te: bool = True, is_first_microbatch: bool = True):
+def run_once(
+    module: torch.nn.Module,
+    args: List[torch.Tensor],
+    iters=2000,
+    use_te: bool = True,
+    is_first_microbatch: bool = True,
+):
     if use_te:
         for _ in range(iters):
             module(*args, is_first_microbatch=is_first_microbatch)
@@ -45,36 +51,64 @@ def speedometer(
         gpu_times.append(gpu_elapsed)
         cpu_times.append(cpu_elapsed)
         print(
-            f"Round {round_idx+1}/{num_rounds}: GPU {gpu_elapsed/timing_iters*1000:.2f} µs, CPU {cpu_elapsed/timing_iters*1000:.2f} µs"
+            f"Round {round_idx+1}/{num_rounds}: GPU {gpu_elapsed/timing_iters*1000:.2f} µs, CPU"
+            f" {cpu_elapsed/timing_iters*1000:.2f} µs"
         )
-    print(f"Average GPU time over {num_rounds} rounds: {sum(gpu_times)/(num_rounds*timing_iters)*1000:.2f} µs")
-    print(f"Average CPU time over {num_rounds} rounds: {sum(cpu_times)/(num_rounds*timing_iters)*1000:.2f} µs")
+    print(
+        f"Average GPU time over {num_rounds} rounds:"
+        f" {sum(gpu_times)/(num_rounds*timing_iters)*1000:.2f} µs"
+    )
+    print(
+        f"Average CPU time over {num_rounds} rounds:"
+        f" {sum(cpu_times)/(num_rounds*timing_iters)*1000:.2f} µs"
+    )
 
     return sum(gpu_times) / num_rounds
 
 
 def main():
-    parser = argparse.ArgumentParser(description="Benchmark torch.nn.Linear performance and CPU overhead.")
+    parser = argparse.ArgumentParser(
+        description="Benchmark torch.nn.Linear performance and CPU overhead."
+    )
     parser.add_argument("--hidden_size", type=int, default=3072, help="Hidden size")
     parser.add_argument("--seq_length", type=int, default=2048, help="Sequence length")
     parser.add_argument("--warmup", type=int, default=500, help="Number of warmup iterations")
-    parser.add_argument("--timing_iters", type=int, default=2000, help="Number of timing iterations per round")
+    parser.add_argument(
+        "--timing_iters", type=int, default=2000, help="Number of timing iterations per round"
+    )
    parser.add_argument("--num_rounds", type=int, default=5, help="Number of timing rounds")
     parser.add_argument(
-        "--backend", type=str, choices=["torch", "te"], default="te", help="Linear backend: torch or te"
+        "--backend",
+        type=str,
+        choices=["torch", "te"],
+        default="te",
+        help="Linear backend: torch or te",
    )
     args = parser.parse_args()
 
-    x = torch.randn((args.seq_length, args.hidden_size), dtype=torch.bfloat16, device="cuda", requires_grad=True)
+    x = torch.randn(
+        (args.seq_length, args.hidden_size), dtype=torch.bfloat16, device="cuda", requires_grad=True
+    )
     use_te = True
     if args.backend == "torch":
-        model = torch.nn.Linear(args.hidden_size, args.hidden_size, bias=False).to(torch.bfloat16).cuda()
+        model = (
+            torch.nn.Linear(args.hidden_size, args.hidden_size, bias=False)
+            .to(torch.bfloat16)
+            .cuda()
+        )
         use_te = False
     else:
-        model = te.Linear(args.hidden_size, args.hidden_size, bias=False, device="cuda").to(torch.bfloat16)
+        model = te.Linear(args.hidden_size, args.hidden_size, bias=False, device="cuda").to(
+            torch.bfloat16
+        )
     with torch.no_grad():
         avg_gpu_time_per_round = speedometer(
-            model, [x], timing_iters=args.timing_iters, warmup_iters=args.warmup, num_rounds=args.num_rounds, use_te=use_te
+            model,
+            [x],
+            timing_iters=args.timing_iters,
+            warmup_iters=args.warmup,
+            num_rounds=args.num_rounds,
+            use_te=use_te,
         )
 
     total_ops = 2 * args.hidden_size * args.hidden_size * args.seq_length * args.timing_iters
@@ -84,4 +118,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    main()
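
The diff shows run_once and speedometer's reporting, but not the timing loop itself. As a rough illustration of how gpu_elapsed and cpu_elapsed could be produced per round (a hypothetical sketch, not the file's actual implementation: it assumes CUDA events for device time and time.perf_counter for host launch time, both returning milliseconds so the µs conversions in the prints above work out):

    import time
    import torch

    def time_one_round(module, args, timing_iters=2000):
        # Hypothetical helper returning (gpu_elapsed, cpu_elapsed) in ms.
        start_evt = torch.cuda.Event(enable_timing=True)
        end_evt = torch.cuda.Event(enable_timing=True)
        torch.cuda.synchronize()
        start_evt.record()
        cpu_start = time.perf_counter()
        for _ in range(timing_iters):
            module(*args)
        # Host time spent launching the work, measured before synchronizing.
        cpu_elapsed = (time.perf_counter() - cpu_start) * 1000
        end_evt.record()
        torch.cuda.synchronize()
        gpu_elapsed = start_evt.elapsed_time(end_evt)  # CUDA events report ms
        return gpu_elapsed, cpu_elapsed

Based on the argument parser above, a typical invocation would be:

    python benchmarks/linear/benchmark_linear_cpu_overhead.py --backend te --timing_iters 2000 --num_rounds 5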

transformer_engine/pytorch/module/grouped_linear.py

Lines changed: 8 additions & 3 deletions
@@ -748,9 +748,14 @@ def forward(
         if skip_fp8_weight_update is not None:
             is_first_microbatch = False
 
-        with torch.cuda.device(
-            getattr(self, list(self.named_parameters())[0][0]).device
-        ), self.prepare_forward(inp, num_gemms=self.num_gemms) as inp:
+        if is_first_microbatch is None or is_first_microbatch:
+            device_ctx = torch.cuda.device(
+                getattr(self, list(self.named_parameters())[0][0]).device
+            )
+        else:
+            device_ctx = contextlib.nullcontext()
+
+        with device_ctx, self.prepare_forward(inp, num_gemms=self.num_gemms) as inp:
             weight_tensors = self._get_weight_tensors()
             bias_tensors = [getattr(self, f"bias{i}") for i in range(self.num_gemms)]
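
This is the same guard that linear.py already uses (see the last file below): entering torch.cuda.device requires looking up a parameter's device via named_parameters() on every forward call, which is pure CPU overhead once the module is warmed up. The commit therefore enters the context only when is_first_microbatch is None (no microbatching) or True, and substitutes a no-op contextlib.nullcontext() on later microbatches. A minimal standalone sketch of the pattern (a toy module for illustration, not Transformer Engine code):

    import contextlib
    import torch

    class TinyLinear(torch.nn.Module):
        # Toy module mimicking the device-context handling in this commit.
        def __init__(self, hidden_size):
            super().__init__()
            # The parameter is a direct attribute, so the getattr lookup
            # below resolves the name returned by named_parameters().
            self.weight = torch.nn.Parameter(
                torch.randn(hidden_size, hidden_size, device="cuda")
            )

        def forward(self, inp, is_first_microbatch=None):
            if is_first_microbatch is None or is_first_microbatch:
                # First microbatch (or no microbatching): pay the lookup
                # cost and pin the CUDA device context.
                device_ctx = torch.cuda.device(
                    getattr(self, list(self.named_parameters())[0][0]).device
                )
            else:
                # Later microbatches: a no-op context, skipping the
                # per-call CPU work.
                device_ctx = contextlib.nullcontext()
            with device_ctx:
                return inp @ self.weight.t()

The underlying assumption is that parameters do not move between devices after the first microbatch, so skipping the context later changes only host-side cost, not behavior.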

transformer_engine/pytorch/module/layernorm_linear.py

Lines changed: 10 additions & 4 deletions
@@ -1502,10 +1502,16 @@ def forward(
         ).is_fp8_ubuf():
             fp8_grad = True
 
-        with torch.cuda.device(
-            getattr(self, list(self.named_parameters())[0][0]).device
-        ), self.prepare_forward(
-            inp, allow_non_contiguous=False  # removed .contiguous from inside the layer
+        if is_first_microbatch is None or is_first_microbatch:
+            device_ctx = torch.cuda.device(
+                getattr(self, list(self.named_parameters())[0][0]).device
+            )
+        else:
+            device_ctx = contextlib.nullcontext()
+
+        with device_ctx, self.prepare_forward(
+            inp,
+            allow_non_contiguous=False,  # removed .contiguous from inside the layer
         ) as inp:
 
             # Get concatenated weight and bias tensors

transformer_engine/pytorch/module/layernorm_mlp.py

Lines changed: 8 additions & 3 deletions
@@ -1764,9 +1764,14 @@ def forward(
         if get_ub("fc2_fprop", FP8GlobalStateManager.is_fp8_enabled()).is_fp8_ubuf():
             fp8_output = True
 
-        with torch.cuda.device(
-            getattr(self, list(self.named_parameters())[0][0]).device
-        ), self.prepare_forward(inp, num_gemms=2) as inp:
+        if is_first_microbatch is None or is_first_microbatch:
+            device_ctx = torch.cuda.device(
+                getattr(self, list(self.named_parameters())[0][0]).device
+            )
+        else:
+            device_ctx = contextlib.nullcontext()
+
+        with device_ctx, self.prepare_forward(inp, num_gemms=2) as inp:
 
             quantizers = (
                 self._get_quantizers(fp8_output)

transformer_engine/pytorch/module/linear.py

Lines changed: 3 additions & 1 deletion
@@ -1389,7 +1389,9 @@ def forward(
             fp8_grad = True
 
         if is_first_microbatch is None or is_first_microbatch:
-            device_ctx = torch.cuda.device(getattr(self, list(self.named_parameters())[0][0]).device)
+            device_ctx = torch.cuda.device(
+                getattr(self, list(self.named_parameters())[0][0]).device
+            )
         else:
             device_ctx = contextlib.nullcontext()
 
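
linear.py already had the conditional device context (the if/else appears as unchanged context above); this hunk only reflows the torch.cuda.device(...) call across lines. For completeness, a hedged sketch of how a caller might drive the flag during microbatched execution (the loop and sizes are hypothetical; the te.Linear usage follows the benchmark above):

    import torch
    import transformer_engine.pytorch as te

    model = te.Linear(1024, 1024, bias=False, device="cuda")
    microbatches = [torch.randn(128, 1024, device="cuda") for _ in range(4)]

    for mb_idx, mb in enumerate(microbatches):
        # Only the first microbatch of a step pays the device-context
        # lookup; with FP8 enabled, it is also when cached weight copies
        # are refreshed.
        out = model(mb, is_first_microbatch=(mb_idx == 0))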
