@@ -54,7 +54,10 @@ def __init__(
5454 self ,
5555 group : ProcessGroup ,
5656 device : Union [int , str , torch .device ],
57- max_size = 8192 * 1024 * 8 * 2 , # In allreduce 2stage writemode, use 2x tmp buffer
57+ max_size = 8192
58+ * 1024
59+ * 8
60+ * 2 , # In allreduce 2stage writemode, use 2x tmp buffer
5861 enable_register_for_capturing : bool = True ,
5962 ) -> None :
6063 """
@@ -160,7 +163,9 @@ def __init__(
160163 self .input_buffer = torch .empty (max_size , dtype = torch .uint8 , device = self .device )
161164 # This is a pre-registered IPC buffer for output. In eager mode, kernel
162165 # writes results to this buffer, then it's copied to the actual output
163- self .output_buffer = torch .empty (max_size , dtype = torch .uint8 , device = self .device )
166+ self .output_buffer = torch .empty (
167+ max_size , dtype = torch .uint8 , device = self .device
168+ )
164169 # This is a buffer for storing the tuples of pointers pointing to
165170 # IPC buffers from all ranks. Each registered tuple has size of
166171 # 8*world_size bytes where world_size is at most 8. Allocating 8MB
@@ -247,7 +252,7 @@ def _gather_ipc_meta(self, shard_data):
def register_input_buffer(self, inp: torch.Tensor):
    """Register *inp* as a pre-shared IPC input buffer on every rank."""
    # Exchange this tensor's IPC handles/offsets across ranks, then hand
    # them to the native registration op with the communicator pointer.
    ipc_meta = self._get_ipc_meta(inp)
    ops.register_input_buffer(self._ptr, inp, *ipc_meta)
def register_output_buffer(self, out: torch.Tensor):
    """Register *out* as a pre-shared IPC output buffer on every rank."""
    # Same exchange as for input buffers: gather handles/offsets from all
    # ranks and register them with the native communicator.
    ipc_meta = self._get_ipc_meta(out)
    ops.register_output_buffer(self._ptr, out, *ipc_meta)
@@ -316,7 +321,7 @@ def custom_all_reduce(
316321 use_new = use_new ,
317322 open_fp8_quant = open_fp8_quant ,
318323 registered_input = self .enable_register_for_capturing ,
319- registered_output = self .enable_register_for_capturing
324+ registered_output = self .enable_register_for_capturing ,
320325 )
321326 else :
322327 # if warm up, mimic the allocation pattern
@@ -332,7 +337,7 @@ def custom_all_reduce(
332337 use_new = use_new ,
333338 open_fp8_quant = open_fp8_quant ,
334339 registered_input = False ,
335- registered_output = False
340+ registered_output = False ,
336341 )
337342
338343 def reduce_scatter (
@@ -361,31 +366,50 @@ def custom_reduce_scatter(
361366 else :
362367 return self .reduce_scatter (input , output , registered = False )
363368
364- def all_gather_reg (self , inp : torch .Tensor , out : torch .Tensor = None ):
369+ def _allgather_out_shape (self , inp : torch .Tensor , dim : int ):
370+ ndim = inp .dim ()
371+ if dim == 0 :
372+ return (inp .shape [0 ] * self .world_size ,) + inp .shape [1 :]
373+ if dim == - 1 or dim == ndim - 1 :
374+ return inp .shape [:- 1 ] + (inp .shape [- 1 ] * self .world_size ,)
375+ print (
376+ f"[aiter] allgather does not support dim={ dim } , falling back to 1-D output"
377+ )
378+ return (inp .numel () * self .world_size ,)
379+
def all_gather_reg(self, inp: torch.Tensor, out: torch.Tensor = None, dim: int = 0):
    """All-gather *inp* along *dim* using pre-registered IPC buffers.

    Allocates *out* with the gathered shape when the caller does not
    supply one, and returns the gathered tensor.
    """
    if out is None:
        gathered_shape = self._allgather_out_shape(inp, dim)
        out = inp.new_empty(gathered_shape)
    # The last-dim extent and the gather dim are forwarded to the native
    # kernel — presumably to derive per-rank strides; confirm against the
    # ops.all_gather_reg signature.
    ops.all_gather_reg(self._ptr, inp, out, inp.shape[-1], dim)
    return out
def all_gather_unreg(self, inp: torch.Tensor, out: torch.Tensor = None, dim: int = 0):
    """All-gather an unregistered *inp* along *dim*.

    The pre-registered ``self.input_buffer`` is handed to the native op
    as staging space, so *inp* itself does not need to be registered.
    Allocates *out* when not supplied and returns the gathered tensor.
    """
    if out is None:
        gathered_shape = self._allgather_out_shape(inp, dim)
        out = inp.new_empty(gathered_shape)
    ops.all_gather_unreg(self._ptr, inp, self.input_buffer, out, inp.shape[-1], dim)
    return out
379401
def custom_all_gather(self, inp: torch.Tensor, dim: int = 0) -> Optional[torch.Tensor]:
    """Dispatch an all-gather to the registered or unregistered path.

    Inside HIP-graph capture the registered-buffer kernel is used;
    outside capture the staging-buffer path is used. Returns the
    gathered tensor (world_size x larger along *dim*).
    """
    if self._IS_CAPTURING:
        if torch.cuda.is_current_stream_capturing():
            return self.all_gather_reg(inp, dim=dim)
        # Capture flag set but the stream is not actually capturing
        # (warm-up / error path). Mimic the real allocation pattern:
        # the dummy must have the *gathered* shape — the previous
        # zeros_like(inp) was world_size x too small along dim and
        # broke downstream shape expectations.
        print("allgather capture hipgraph error")
        return torch.zeros(
            self._allgather_out_shape(inp, dim), dtype=inp.dtype, device=inp.device
        )
    return self.all_gather_unreg(inp, dim=dim)
389413
390414 def fused_ar_rms (
391415 self ,
@@ -422,7 +446,9 @@ def fused_ar_rms(
422446 if out is None :
423447 out = torch .empty (inp .shape , dtype = fp8 , device = inp .device )
424448 if scale_out is None :
425- scale_out = torch .empty (inp .shape [:- 1 ] + (1 ,), dtype = torch .float32 , device = inp .device )
449+ scale_out = torch .empty (
450+ inp .shape [:- 1 ] + (1 ,), dtype = torch .float32 , device = inp .device
451+ )
426452 ops .fused_allreduce_rmsnorm_quant (
427453 self ._ptr ,
428454 inp ,
@@ -451,15 +477,25 @@ def custom_fused_ar_rms(
451477 if self ._IS_CAPTURING :
452478 if torch .cuda .is_current_stream_capturing ():
453479 return self .fused_ar_rms (
454- input , residual_inp , w = weight , eps = eps , registered = True , use_1stage = use_1stage ,
480+ input ,
481+ residual_inp ,
482+ w = weight ,
483+ eps = eps ,
484+ registered = True ,
485+ use_1stage = use_1stage ,
455486 )
456487 else :
457488 return torch .zeros_like (input ), torch .zeros_like (input )
458489 else :
459490 return self .fused_ar_rms (
460- input , residual_inp , w = weight , eps = eps , registered = False , use_1stage = use_1stage ,
491+ input ,
492+ residual_inp ,
493+ w = weight ,
494+ eps = eps ,
495+ registered = False ,
496+ use_1stage = use_1stage ,
461497 )
462-
498+
463499 def custom_fused_ar_rms_quant (
464500 self ,
465501 input : torch .Tensor ,
@@ -474,15 +510,29 @@ def custom_fused_ar_rms_quant(
474510 if self ._IS_CAPTURING :
475511 if torch .cuda .is_current_stream_capturing ():
476512 return self .fused_ar_rms (
477- input , residual_inp , w = weight , eps = eps , registered = True , use_1stage = use_1stage , post_per_token_quant = True ,
513+ input ,
514+ residual_inp ,
515+ w = weight ,
516+ eps = eps ,
517+ registered = True ,
518+ use_1stage = use_1stage ,
519+ post_per_token_quant = True ,
478520 )
479521 else :
480522 dummy_out = torch .zeros (input .shape , dtype = fp8 , device = input .device )
481- dummy_scale_out = torch .zeros (input .shape [:- 1 ] + (1 ,), dtype = torch .float32 , device = input .device )
523+ dummy_scale_out = torch .zeros (
524+ input .shape [:- 1 ] + (1 ,), dtype = torch .float32 , device = input .device
525+ )
482526 return dummy_out , torch .zeros_like (input ), dummy_scale_out
483527 else :
484528 return self .fused_ar_rms (
485- input , residual_inp , w = weight , eps = eps , registered = False , use_1stage = use_1stage , post_per_token_quant = True ,
529+ input ,
530+ residual_inp ,
531+ w = weight ,
532+ eps = eps ,
533+ registered = False ,
534+ use_1stage = use_1stage ,
535+ post_per_token_quant = True ,
486536 )
487537
488538 def close (self ):
0 commit comments