Improvements and fixes to gradient accumulation #993

Open · wants to merge 3 commits into base: main
73 changes: 52 additions & 21 deletions axlearn/common/gradient_accumulation.py
@@ -13,7 +13,7 @@
from axlearn.common.config import ConfigOr, maybe_instantiate
from axlearn.common.metrics import MetricAccumulator
from axlearn.common.update_transformation import ForwardFn, ForwardOutputs
from axlearn.common.utils import Nested, Tensor, input_partition_spec, with_sharding_constraint
from axlearn.common.utils import Nested, Tensor


def _compute_minibatch_size(input_batch: Nested[Tensor], *, steps: int) -> int:
@@ -55,41 +55,29 @@ def _make_scan_minibatch_inputs(
*,
forward_key: Tensor,
param_noise_key: Tensor,
minibatch_size: int,
minibatch_index: int,
) -> tuple[Nested[Tensor], Tensor, Tensor]:
"""Creates minibatch inputs from inputs.

This is a utility function that is only meant to be called from
within a scan function body and is meant to slice the inputs
into `minibatch_size` sized slices to run the ForwardFn on.

Note that this only preserves the input sharding if the `input_partition_spec`
returns the correct partition spec to shard the input slices with.
within a scan function body and is meant to return sliced minibatches
to run the ForwardFn on.

Args:
inputs: Same pytree as ForwardFn inputs.
forward_key: The `forward_key` from the ForwardFn inputs.
param_noise_key: The `param_noise_key` from the ForwardFn inputs.
minibatch_size: Size of the minibatch.
minibatch_index: Current scan minibatch index.

Returns:
A tuple of minibatch inputs which are of the same structure as `inputs`
and new (carry) forward_key and param_noise_key.
"""
minibatch_input = with_sharding_constraint(
jax.tree.map(
lambda x: jax.lax.dynamic_slice_in_dim(
x,
start_index=minibatch_index * minibatch_size,
slice_size=minibatch_size,
axis=0,
),
inputs["input_batch"],
),
input_partition_spec(),
Contributor:

To me, it seems rather a hack than a proper solution; that is, if we want to have a different input_partition_spec() than the default one, then we need this?

Contributor Author:

Sorry, I missed the default case; added it.

I think the partition spec below is good as a default, but the ability to change the PartitionSpec might be good to have. What do you think?

(None, 1): PartitionSpec(("data", "expert", "fsdp")),
(None, 2): PartitionSpec(("data", "expert", "fsdp"), "seq"),
minibatch_input = jax.tree.map(
lambda x: x[minibatch_index],
inputs["input_batch"],
)

Contributor (@apghml, Feb 27, 2025):

Suppose we have a global input batch of size 100 running on 10 chips (so a per-chip size of 10) and we want to switch to doing 10 grad accumulation steps, each with a global batch size of 10 (1 per chip per accumulation step).

Suppose that the input is originally sharded evenly across the chips (first 10 on the first chip, second 10 on the second chip, etc.). Then when we take the first slice of 10 for the first grad accumulation step, won't all these examples be on the same chip? Will that cause a problem? (E.g., if we worry XLA might not automatically reshard the examples across chips?)

Maybe we should reshard the batch axis only?

Contributor:

+1 on the potential design problem here. Can you double check and ensure that axis=0 is confirmed to be the batch axis?

Contributor Author (@apoorvtintin, Mar 5, 2025):

We can completely avoid the batch reshards using a reshape + transpose. I added it to the PR; let me know if it addresses your concerns.

Using the same example as @apghml: rather than using the first 10 examples available in the global batch array for the first iteration, we construct the minibatch using the first example from every device, that is, minibatch 0 => [0, 10, 20, ...], minibatch 1 => [1, 11, 21, ...]. This is achieved using the reshape and transpose.

Essentially the logic here is to ensure each device uses local batches, avoiding extra reshards. This also scales well across multiple nodes, as each node only runs a local reshape + transpose, and higher per-device batch sizes are also supported.

This should address the concerns around input batch reshards; let me know if there are any remaining concerns.

@kelvin-zou I can't think of a way to get the size of a specific axis at runtime, but I do believe JAX should give an informative error if batch size % batch axis size != 0.

Contributor:

Thanks for the explanation. Can you add a test that fails without this fix?
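As a toy illustration of the staggered selection described above (plain NumPy, not the PR's code): with 12 examples, 3 accumulation steps, and a global minibatch size of 4, minibatch i takes the i-th local example from each contiguous shard.

import numpy as np

global_batch = np.arange(12)                      # example indices 0..11
minibatch_size, steps = 4, 3

x = global_batch.reshape(minibatch_size, steps)   # [minibatch_size, steps]
x = x.transpose(1, 0)                             # [steps, minibatch_size]

# With 4 devices each holding 3 consecutive examples ([0-2], [3-5], ...),
# each minibatch takes exactly one example from every device, so the batch
# axis never needs to be resharded between accumulation steps.
print(x[0])  # minibatch 0 -> [0 3 6 9]
print(x[1])  # minibatch 1 -> [1 4 7 10]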

next_forward_key, forward_key = jax.random.split(forward_key)
next_param_noise_key, param_noise_key = jax.random.split(param_noise_key)

@@ -172,12 +160,56 @@ def fwd_helper(
otherwise None.
"""
minibatch_size = _compute_minibatch_size(inputs["input_batch"], steps=steps)

def reshape_for_scan(x: Tensor):
"""Helper function that adds a minibatch dimension while evenly dividing
batches across gradient accumulation iterations.

Input dimension is [GBS, seq]; this is first reshaped to [MBS, steps, seq],
then transposed to [steps, MBS, seq]. This ensures that batches are picked
up from the global batch in a staggered pattern.

Contributor: Replace the acronyms with full names?

The main benefit is that this avoids the extra communication incurred by
resharding for every minibatch.

Args:
x: Tensor to be reshaped.

Returns:
The reshaped tensor.
"""
if x.shape[0] % minibatch_size != 0:
raise ValueError(
f"minibatch_size {minibatch_size} does not evenly divide "
f"global batch size of {x.shape[0]}"
)

x = x.reshape(minibatch_size, -1, *x.shape[1:])
# Set up transpose to swap the first two dimensions.
dims = list(range(x.ndim))
dims[0], dims[1] = dims[1], dims[0]
return x.transpose(dims)
Contributor (on lines +188 to +191): Could we replace these three lines with one line if we use jnp.moveaxis?

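A sketch of that suggestion, assuming only the first two axes need swapping (the helper name below is hypothetical):

import jax.numpy as jnp

def reshape_for_scan_moveaxis(x, minibatch_size: int):
    x = x.reshape(minibatch_size, -1, *x.shape[1:])
    # Move axis 0 to position 1, i.e. swap the minibatch and steps dimensions.
    return jnp.moveaxis(x, 0, 1)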

inputs["input_batch"] = jax.tree_map(reshape_for_scan, inputs["input_batch"])

# Create a sample minibatch for the carry buffer creation below
Contributor:

Could you explain in more detail why this is needed?

Contributor:

+1

Contributor Author (@apoorvtintin, Mar 4, 2025):

I saw broadcasting errors coming from the scan body (example below); JAX complained that the carry buffer shape and the output of the minibatch step are incompatible.

Example error where acc=4 and the full batch size is 32:
TypeError: add got incompatible shapes for broadcasting: (32, 4096, 3072), (8, 4096, 3072).

The carry buffer initialization uses the full batch while creating the buffer, which does not match the output of the minibatch step, since that uses the shapes of the minibatch.

The simple fix for this is to use a minibatch sample when creating the carry buffer, ensuring its shapes match those of the minibatch step.

Let me know if I missed something.

Contributor:

Do we know why this issue wasn't causing errors before?

Contributor Author (@apoorvtintin, Mar 4, 2025):

The unit test uses a toy model which does not have any metric/output that relies on batch size, which is why it does not catch this issue. I dug a bit deeper and found that for fuji models, output_collection/module_outputs/decoder/transformer/layer3/output carries the batch dimension in its output; see below.

path (GetAttrKey(name='output_collection'), GetAttrKey(name='module_outputs'), DictKey(key='decoder'), DictKey(key='transformer'), DictKey(key='layer3'), DictKey(key='output')) shape (32, 4096, 3072)
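To illustrate the shape mismatch described above, here is a minimal standalone sketch with toy shapes; it is not the axlearn code.

import jax
import jax.numpy as jnp

def scan_step(carry, _):
    # Hypothetical per-minibatch output whose leading dim is the minibatch size (8).
    minibatch_out = jnp.ones((8, 4))
    return carry + minibatch_out, None

# A carry built from full-batch shapes (leading dim 32) does not match the
# per-step output, so the add inside the body fails while tracing, e.g.
# "add got incompatible shapes for broadcasting: (32, 4), (8, 4)".
# bad_carry = jnp.zeros((32, 4))
# jax.lax.scan(scan_step, bad_carry, xs=None, length=4)

# Building the carry from a sample minibatch keeps the shapes consistent.
good_carry = jnp.zeros((8, 4))
final_carry, _ = jax.lax.scan(scan_step, good_carry, xs=None, length=4)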

(
sample_minibatch_inputs,
_,
_,
) = _make_scan_minibatch_inputs(
inputs,
forward_key=inputs["forward_key"],
param_noise_key=inputs["param_noise_key"],
minibatch_index=0,
)

# Carry initialization for the lax.scan procedure. Since we are passing a
# `MetricAccumulator` into the carry, and carry input/output shapes must match,
# we need to initialize the `MetricAccumulator` summary with the right PyTree
# structure.
_, primal_output_shape = jax.eval_shape(
original_func_positional_args, model_params, inputs
original_func_positional_args, model_params, sample_minibatch_inputs
)
init_primal_out = jax.tree.map(jnp.zeros_like, primal_output_shape)
init_accumulator = maybe_instantiate(metric_accumulator)
@@ -211,7 +243,6 @@ def scan_body(
inputs,
forward_key=forward_key,
param_noise_key=param_noise_key,
minibatch_size=minibatch_size,
minibatch_index=minibatch_index,
)
minibatch_args = (model_params, minibatch_inputs)
1 change: 1 addition & 0 deletions axlearn/common/gradient_accumulation_test.py
@@ -1,5 +1,6 @@
# Copyright © 2024 Apple Inc.
"""Test module for gradient_accumulation.py"""

import chex
import jax
import jax.numpy as jnp
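Regarding the reviewer's earlier request for a regression test: a minimal pure-JAX sketch of the basic equivalence property such a test could build on is shown below. It does not use axlearn's test utilities and does not exercise the sharding behavior itself; all names and shapes are illustrative.

import jax
import jax.numpy as jnp

def loss_fn(w, x, y):
    # Mean squared error over the batch.
    return jnp.mean((x @ w - y) ** 2)

w = jnp.ones((3,))
x = jax.random.normal(jax.random.PRNGKey(0), (8, 3))
y = jnp.zeros((8,))

full_grad = jax.grad(loss_fn)(w, x, y)

# Accumulate gradients over 4 minibatches of 2 and average; for a mean loss
# this should match the full-batch gradient.
steps = 4
xs, ys = x.reshape(steps, -1, 3), y.reshape(steps, -1)
acc_grad = sum(jax.grad(loss_fn)(w, xs[i], ys[i]) for i in range(steps)) / steps

assert jnp.allclose(full_grad, acc_grad, atol=1e-6)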
3 changes: 3 additions & 0 deletions axlearn/experiments/text/gpt/fuji.py
@@ -352,6 +352,9 @@ def get_trainer_kwargs(
),
*trn2_config.module_modifications,
*trn2_config.partition_spec_modifications,
GradientAccumulationModifier.default_config().set(
grad_acc_steps=4,
),
],
),
),