
Commit 4105934

PoC of the changes
Signed-off-by: Przemek Tredak <[email protected]>
1 parent 3ff0b8d commit 4105934

File tree: 7 files changed (+378, -348 lines)

tests/pytorch/attention/test_attention.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -2751,7 +2751,7 @@ def forward(
         cu_seqlens,
         max_s,
     ) -> torch.Tensor:
-        with self.prepare_forward(inp, num_gemms=3) as inp:
+        with self.prepare_forward_ctx(inp, num_gemms=3) as inp:
             out = _custom_mha_fp8.apply(
                 inp,
                 self.qkv_weight,
```

transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -1000,7 +1000,7 @@ def forward(
             cases. It is ignored for other backends and when context parallelism is enabled.
         """
 
-        with self.prepare_forward(
+        with self.prepare_forward_ctx(
             query_layer,
             num_gemms=3,
             allow_non_contiguous=True,
```

transformer_engine/pytorch/module/base.py

Lines changed: 62 additions & 37 deletions

```diff
@@ -49,6 +49,7 @@
     is_non_tn_fp8_gemm_supported,
     torch_get_autocast_gpu_dtype,
     get_nvtx_range_context,
+    _nvtx_enabled,
 )
 from ..tensor.storage.float8_blockwise_tensor_storage import Float8BlockwiseQTensorStorage
 from ...common.recipe import DelayedScaling, Recipe
@@ -640,16 +641,20 @@ def __init__(self) -> None:
         "fp8_parameters",
     }
 
+    def fast_set_attr(self, name: str, value: Any) -> None:
+        self.__dict__[name] = value
+
     def __setattr__(self, name: str, value: Any) -> None:
         if name in TransformerEngineBaseModule._fast_setattr_names:
             # torch.nn.Module has a custom __setattr__ that handles
             # modules, parameters, and buffers. This is unnecessary
             # overhead when setting plain attrs.
-            self.__dict__[name] = value
+            self.fast_set_attr(name, value)
         else:
             # Default case
             super().__setattr__(name, value)
 
+
     def adjust_amax_history_length(self, length: int, fwd: Optional[bool] = None) -> None:
         """
         Delayed scaling only.
```
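
The new fast_set_attr helper exposes the existing __dict__ shortcut as a reusable method, so hot paths later in this commit can bypass torch.nn.Module.__setattr__, which handles module, parameter, and buffer registration on every attribute assignment and is pure overhead for plain attributes. A minimal standalone sketch of the pattern, using an illustrative toy class rather than the real TransformerEngineBaseModule:

```python
import torch


class FastAttrModule(torch.nn.Module):
    """Toy module illustrating the __setattr__ bypass used in this commit."""

    # Plain Python attributes that never hold Parameters, buffers, or submodules.
    _fast_setattr_names = {"activation_dtype", "fp8", "fp8_calibration"}

    def fast_set_attr(self, name, value):
        # Write straight into the instance dict, skipping
        # torch.nn.Module.__setattr__'s registry bookkeeping.
        self.__dict__[name] = value

    def __setattr__(self, name, value):
        if name in FastAttrModule._fast_setattr_names:
            self.fast_set_attr(name, value)
        else:
            super().__setattr__(name, value)


m = FastAttrModule()
m.fp8 = True                      # fast path: plain dict write
m.linear = torch.nn.Linear(4, 4)  # normal path: registered as a submodule
print(m.fp8, dict(m.named_modules()).keys())
```

Calling the helper directly, as the hunks below do, also skips the per-assignment membership check inside __setattr__ itself.
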
```diff
@@ -926,7 +931,7 @@ def set_activation_dtype(self, inp: torch.Tensor) -> None:
         """Get activation data type for AMP."""
         # Native AMP (`torch.autocast`) gets highest priority
         if torch.is_autocast_enabled():
-            self.activation_dtype = torch_get_autocast_gpu_dtype()
+            self.fast_set_attr("activation_dtype", torch_get_autocast_gpu_dtype())
             return
 
         # All checks after this have already been performed once, thus skip
@@ -941,7 +946,7 @@ def set_activation_dtype(self, inp: torch.Tensor) -> None:
                     "Data types for parameters must match when outside of autocasted region. "
                     f" Found input dtype: {dtype} and {name!r} dtype: {param.dtype}"
                 )
-        self.activation_dtype = dtype
+        self.fast_set_attr("activation_dtype", dtype)
 
     def set_tensor_parallel_group(self, tp_group: Union[dist_group_type, None]) -> None:
         """
@@ -970,48 +975,54 @@ def _get_fp8_params(self) -> Union[List[torch.Tensor], None]:
     # assume FP8 execution.
     def init_fp8_metadata(self, num_gemms: int = 1) -> None:
         """Initialize fp8 related metadata and tensors during fprop."""
-        _original_recipe = self.fp8_meta.get("recipe", None)
+        meta = self.fp8_meta
 
-        self.fp8_parameters = FP8GlobalStateManager.with_fp8_parameters()
-        self.fp8 = FP8GlobalStateManager.is_fp8_enabled()
-        self.fp8_calibration = FP8GlobalStateManager.is_fp8_calibration()
-        fp8_enabled = self.fp8 or self.fp8_calibration
-        self.fp8_meta["fp8_checkpoint"] = self.fp8 or self.fp8_calibration
+        fp8 = FP8GlobalStateManager.is_fp8_enabled()
+        fp8_parameters = FP8GlobalStateManager.with_fp8_parameters()
+        fp8_calibration = FP8GlobalStateManager.is_fp8_calibration()
+        self.fast_set_attr("fp8_parameters", fp8_parameters)
+        self.fast_set_attr("fp8", fp8)
+        self.fast_set_attr("fp8_calibration", fp8_calibration)
+        fp8_enabled = fp8 or fp8_calibration
+        meta["fp8_checkpoint"] = fp8_enabled
 
-        if self.fp8_parameters or fp8_enabled:
+        _original_recipe = None
+
+        if fp8_parameters or fp8_enabled:
+            _original_recipe = meta.get("recipe", None)
             if (
                 self.fp8_initialized
-                and FP8GlobalStateManager.get_fp8_recipe() == self.fp8_meta["recipe"]
+                and FP8GlobalStateManager.get_fp8_recipe() == _original_recipe
             ):
                 # FP8 init has already been run and recipe is the same, don't do anything.
                 return
-            self.fp8_meta["recipe"] = FP8GlobalStateManager.get_fp8_recipe()
+            meta["recipe"] = FP8GlobalStateManager.get_fp8_recipe()
         else:
             # If fp8 isn't enabled, turn off and return.
-            self.fp8_initialized = False
+            self.fast_set_attr("fp8_initialized", False)
             return
 
-        if self.fp8_parameters and not self.fp8_initialized:
-            self.fp8_meta["num_gemms"] = num_gemms
-            self.init_fp8_meta_tensors(self.fp8_meta["recipe"])
+        if fp8_parameters and not self.fp8_initialized:
+            meta["num_gemms"] = num_gemms
+            self.init_fp8_meta_tensors(meta["recipe"])
 
         if fp8_enabled:
             # Set FP8 and other FP8 metadata
-            self.fp8_meta["num_gemms"] = num_gemms
-            self.fp8_meta["fp8_group"] = FP8GlobalStateManager.get_fp8_group()
+            meta["num_gemms"] = num_gemms
+            meta["fp8_group"] = FP8GlobalStateManager.get_fp8_group()
 
             # Set FP8_MAX per tensor according to recipe
-            if hasattr(self.fp8_meta["recipe"], "fp8_format"):
-                self.fp8_meta["fp8_max_fwd"] = self.fp8_meta["recipe"].fp8_format.value.max_fwd
-                self.fp8_meta["fp8_max_bwd"] = self.fp8_meta["recipe"].fp8_format.value.max_bwd
+            if hasattr(meta["recipe"], "fp8_format"):
+                meta["fp8_max_fwd"] = meta["recipe"].fp8_format.value.max_fwd
+                meta["fp8_max_bwd"] = meta["recipe"].fp8_format.value.max_bwd
 
             # Allocate scales and amaxes
-            self.init_fp8_meta_tensors(self.fp8_meta["recipe"])
+            self.init_fp8_meta_tensors(meta["recipe"])
             self.fp8_initialized = True
 
-            self.fp8_meta["recipe"] = FP8GlobalStateManager.get_fp8_recipe()
+            meta["recipe"] = FP8GlobalStateManager.get_fp8_recipe()
 
-        _current_recipe = self.fp8_meta["recipe"]
+        _current_recipe = meta["recipe"]
         if _original_recipe is not None and not (
             issubclass(_current_recipe.__class__, _original_recipe.__class__)
             or issubclass(_original_recipe.__class__, _current_recipe.__class__)
@@ -1024,22 +1035,17 @@ def init_fp8_metadata(self, num_gemms: int = 1) -> None:
             # Clear cached workspaces as they were created with the old recipe/quantizer type
             self._fp8_workspaces.clear()
 
-    @contextmanager
     def prepare_forward(
         self,
         inp: torch.Tensor,
         num_gemms: int = 1,
         allow_non_contiguous: bool = False,
         allow_different_data_and_param_types: bool = False,
-    ) -> Generator[torch.Tensor, None, None]:
-        """Checks and prep for FWD.
-        The context manager is needed because there isn't a way for a module to know
-        if it's the last FP8 module in the forward autocast. It is useful
-        to setup the forward aggregated amax reduction for every module
-        just in case. The autocast exit will pick up the most recent one.
+    ) -> torch.Tensor:
+        """Checks and prepare for FWD execution.
         """
-        self.allow_different_data_and_param_types = allow_different_data_and_param_types
-        self.forwarded_at_least_once = True
+        self.fast_set_attr("allow_different_data_and_param_types", allow_different_data_and_param_types)
+        self.fast_set_attr("forwarded_at_least_once", True)
 
         # Activation recomputation is used and this is the second forward phase.
         if self.fp8 and in_fp8_activation_recompute_phase():
@@ -1070,13 +1076,32 @@ def prepare_forward(
         if self.training and is_fp8_activation_recompute_enabled():
             FP8GlobalStateManager.copy_forward_fp8_meta_tensors_for_recompute(self.fp8_meta)
 
-        with get_nvtx_range_context(self.__class__.__name__ + " forward"):
-            if not allow_non_contiguous and not inp.is_contiguous():
-                inp = inp.contiguous()
-            yield inp
+        # with get_nvtx_range_context(self.__class__.__name__ + " forward"):
+        if _nvtx_enabled():
+            torch.cuda.nvtx.range_push(self.__class__.__name__ + " forward")
+        if not allow_non_contiguous and not inp.is_contiguous():
+            inp = inp.contiguous()
+        return inp
 
+    def end_forward(self):
+        delayed_scaling_recipe = self.fp8 and self.fp8_meta["recipe"].delayed()
         if delayed_scaling_recipe and self.fp8 and in_fp8_activation_recompute_phase():
             FP8GlobalStateManager.restore_fp8_meta_tensors(self.fp8_meta)
+        if _nvtx_enabled():
+            torch.cuda.nvtx.range_pop()
+
+    @contextmanager
+    def prepare_forward_ctx(
+        self,
+        inp: torch.Tensor,
+        num_gemms: int = 1,
+        allow_non_contiguous: bool = False,
+        allow_different_data_and_param_types: bool = False,
+    ) -> Generator[torch.Tensor, None, None]:
+        yield self.prepare_forward(inp, num_gemms,
+                                   allow_non_contiguous,
+                                   allow_different_data_and_param_types)
+        self.end_forward()
 
     def set_nccl_overlap_warning_if_tp(self) -> None:
         """When using TP, the NCCL communication needs to be scheduled
```

transformer_engine/pytorch/module/grouped_linear.py

Lines changed: 56 additions & 54 deletions

```diff
@@ -787,60 +787,62 @@ def forward(
 
         is_grad_enabled = torch.is_grad_enabled()
 
-        with self.prepare_forward(inp, num_gemms=self.num_gemms) as inp:
-            weight_tensors = self._get_weight_tensors()
-            bias_tensors = [getattr(self, f"bias{i}") for i in range(self.num_gemms)]
-
-            quantizers = self._get_quantizers() if not debug else self._get_debug_quantizers()
-
-            if debug:
-                if self.no_debug_features_active(list(chain(*quantizers))):
-                    debug = False
-                    quantizers = self._get_quantizers()
-
-                if isinstance(weight_tensors, QuantizedTensorStorage):
-                    raise RuntimeError("FP8 weights are not supported in debug mode.")
-
-            (
-                input_quantizers,
-                weight_quantizers,
-                output_quantizers,
-                grad_input_quantizers,
-                grad_weight_quantizers,
-                grad_output_quantizers,
-            ) = quantizers
-
-            if is_grad_enabled:
-                linear_fn = _GroupedLinear.apply
-                autograd_ctx = []
-            else:
-                linear_fn = _GroupedLinear.forward
-                autograd_ctx = [None]
-
-            non_tensor_args = (
-                m_splits,
-                self.apply_bias,
-                is_first_microbatch,
-                self.fp8,
-                self.fp8_calibration,
-                self.wgrad_store,
-                input_quantizers,
-                weight_quantizers,
-                output_quantizers,
-                grad_input_quantizers,
-                grad_weight_quantizers,
-                grad_output_quantizers,
-                self.fuse_wgrad_accumulation,
-                is_cpu_offload_enabled(),
-                self.sequence_parallel,
-                self.activation_dtype,
-                is_grad_enabled,
-                self,
-                None,  # skip_fp8_weight_update
-                self.save_original_input,
-                debug,
-            )
-            out = linear_fn(*autograd_ctx, inp, non_tensor_args, *weight_tensors, *bias_tensors)
+        inp = self.prepare_forward(inp, num_gemms=self.num_gemms)
+        weight_tensors = self._get_weight_tensors()
+        bias_tensors = [getattr(self, f"bias{i}") for i in range(self.num_gemms)]
+
+        quantizers = self._get_quantizers() if not debug else self._get_debug_quantizers()
+
+        if debug:
+            if self.no_debug_features_active(list(chain(*quantizers))):
+                debug = False
+                quantizers = self._get_quantizers()
+
+            if isinstance(weight_tensors, QuantizedTensorStorage):
+                raise RuntimeError("FP8 weights are not supported in debug mode.")
+
+        (
+            input_quantizers,
+            weight_quantizers,
+            output_quantizers,
+            grad_input_quantizers,
+            grad_weight_quantizers,
+            grad_output_quantizers,
+        ) = quantizers
+
+        if is_grad_enabled:
+            linear_fn = _GroupedLinear.apply
+            autograd_ctx = []
+        else:
+            linear_fn = _GroupedLinear.forward
+            autograd_ctx = [None]
+
+        non_tensor_args = (
+            m_splits,
+            self.apply_bias,
+            is_first_microbatch,
+            self.fp8,
+            self.fp8_calibration,
+            self.wgrad_store,
+            input_quantizers,
+            weight_quantizers,
+            output_quantizers,
+            grad_input_quantizers,
+            grad_weight_quantizers,
+            grad_output_quantizers,
+            self.fuse_wgrad_accumulation,
+            is_cpu_offload_enabled(),
+            self.sequence_parallel,
+            self.activation_dtype,
+            is_grad_enabled,
+            self,
+            None,  # skip_fp8_weight_update
+            self.save_original_input,
+            debug,
+        )
+        out = linear_fn(*autograd_ctx, inp, non_tensor_args, *weight_tensors, *bias_tensors)
+
+        self.end_forward()
 
         if self.return_bias:
             return out, [cast_if_needed(b, self.activation_dtype) for b in bias_tensors]
```

0 commit comments