NVIDIA · sanandaraj5597 · Aug 3, 2025 · Aug 3, 2025 · Aug 3, 2025 · Aug 3, 2025
diff --git a/transformer_engine/pytorch/attention/dot_product_attention/backends.py b/transformer_engine/pytorch/attention/dot_product_attention/backends.py
@@ -1078,7 +1078,6 @@ def forward(
             else:
                 tensor_list = [q, k, v, out_save]
 
-            qkv_layout = "sbhd_sbhd_sbhd"
             mark_activation_offload(*tensor_list)
             mark_activation_offload(*aux_ctx_tensors)
 
@@ -1111,7 +1110,31 @@ def forward(
         ctx.attn_scale = attn_scale
         ctx.dropout_p = dropout_p
         ctx.fast_zero_fill = fast_zero_fill
-        ctx.qkv_layout = qkv_layout
+
+        from transformer_engine.pytorch.cpu_offload import (
+            CPUOffloadedLayer,
+        )
+
+        # If an interleaved tensor is offloaded, the reloaded tensor will be
+        # non-interleaved, so the QKV layout must be expanded for backward
+        # (e.g. "sb3hd" becomes "sbhd_sbhd_sbhd")
+        if CPUOffloadedLayer and CPUOffloadEnabled:
+            reload_layout = ""
+            split_list = qkv_layout.split("_")
+            for split in split_list:
+                temp_layout = ""
+                rep_count = 1
+                for s in split:
+                    if s.isalpha():
+                        temp_layout = temp_layout + s
+                    else:
+                        rep_count = int(s)
+                for i in range(rep_count):
+                    reload_layout = reload_layout + temp_layout + "_"
+            ctx.qkv_layout = reload_layout[:-1]
+        else:
+            ctx.qkv_layout = qkv_layout
+
         ctx.attn_bias_type = attn_bias_type
         ctx.attn_mask_type = attn_mask_type
         ctx.window_size = window_size

diff --git a/transformer_engine/pytorch/cpu_offload.py b/transformer_engine/pytorch/cpu_offload.py
@@ -16,6 +16,7 @@
 __all__ = ["get_cpu_offload_context"]
 
 CPUOffloadEnabled = False
+CPUOffloadedLayer = False
 
 
 def mark_activation_offload(*tensors):
@@ -408,6 +409,11 @@ def tensor_push(self, tensor: torch.Tensor, **kwargs) -> Any:
                         tensor.clear()
                     else:
                         self.tensor_tag_to_buf[tensor_tag] = t
+
+                    # Flag this layer as offloaded so attention can tell it
+                    # apart from non-offloaded layers: only an offloaded
+                    # layer's QKV layout needs to be modified for reloading
+                    CPUOffloadedLayer = True
         else:
             tensor_tag = (-1, self.torch_tensor_count)
             self.torch_tensor_count += 1
@@ -528,6 +534,9 @@ def synchronize_on_group_commit_forward(self, current_group):
             # Increment the offload group count to keep track
             self.offloaded_group_count += 1
 
+        if current_group == (self.num_offload_group - 1):
+            CPUOffloadedLayer = False
+
         if not self.double_buffer_created:
             # Creating second copy of double buffer for tensors that are offloaded
             if current_group == (self.num_layers - 1):