Improvements and fixes to gradient accumulation #993

Open · wants to merge 3 commits into main · Changes from 1 commit
65 changes: 50 additions & 15 deletions axlearn/common/gradient_accumulation.py
@@ -8,12 +8,14 @@
import jax
import numpy as np
from jax import numpy as jnp
from jax.sharding import PartitionSpec

from axlearn.common import utils
from axlearn.common.config import ConfigOr, maybe_instantiate
from axlearn.common.input_base import InputPartitionFn, partition_by_path_rank
from axlearn.common.metrics import MetricAccumulator
from axlearn.common.update_transformation import ForwardFn, ForwardOutputs
-from axlearn.common.utils import Nested, Tensor, input_partition_spec, with_sharding_constraint
+from axlearn.common.utils import Nested, Tensor


def _compute_minibatch_size(input_batch: Nested[Tensor], *, steps: int) -> int:
@@ -57,39 +59,38 @@ def _make_scan_minibatch_inputs(
param_noise_key: Tensor,
minibatch_size: int,
minibatch_index: int,
minibatch_partitioner: Optional[InputPartitionFn],
Contributor:

Echoing Kelvin's comment, could you explain concretely why we need this functionality? If it's just something that might be useful, maybe we can wait until we are certain that we will need it?

Contributor Author (@apoorvtintin, Feb 19, 2025):

In the case where gradient accumulation is not enabled, the inputs to the graph are sharded per the policy in input_partitioner. This ensures the batch dimension is sharded on the data, expert, and fsdp axes while the sequence dimension is replicated on the model axis.

Gradient accumulation wraps the train step in a scan loop. The input_partitioner shards the input batch correctly at first, but inside the gradient accumulation wrapper the input batches are resharded/overridden by the function _make_scan_minibatch_inputs and sharded along all available axes, which is probably unexpected and inefficient. Minibatches should follow the same PartitionSpec as input batches.

The addition of the minibatch_partitioner allows the minibatches to use the same sharding/PartitionSpec that input_partitioner provides for the input batches when gradient accumulation is not used.
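[Editor's note] For illustration, a minibatch partitioner with the same specs this PR installs as the default could be constructed as below; this is a sketch using names from this diff, not part of the change itself.

from axlearn.common.config import config_for_function
from axlearn.common.input_base import partition_by_path_rank
from axlearn.common.utils import PartitionSpec

# Batch dim sharded on data/expert/fsdp; sequence dim sharded on seq.
minibatch_partitioner = config_for_function(partition_by_path_rank).set(
    path_rank_to_partition={
        (None, 1): PartitionSpec(("data", "expert", "fsdp")),
        (None, 2): PartitionSpec(("data", "expert", "fsdp"), "seq"),
    }
)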

Contributor:

If we just preserve the sharding the input already has, would that also address the concern about the input sharding being changed?

Contributor Author:

Yeah, preserving the sharding of the input and not applying a sharding constraint to the minibatches would address the concern as well.

) -> tuple[Nested[Tensor], Tensor, Tensor]:
"""Creates minibatch inputs from inputs.

This is a utility function that is only meant to be called from
within a scan function body and is meant to slice the inputs
into `minibatch_size` sized slices to run the ForwardFn on.

    Note that this only preserves the input sharding if the `minibatch_partitioner`
    applies the correct partition spec to shard the input slices with.

Args:
inputs: Same pytree as ForwardFn inputs.
        forward_key: The `forward_key` from the ForwardFn inputs.
        param_noise_key: The `param_noise_key` from the ForwardFn inputs.
minibatch_size: Size of the minibatch.
minibatch_index: Current scan minibatch index.
minibatch_partitioner: Applies sharding constraints
on each minibatch created.

Returns:
        A tuple of minibatch inputs which is of the same structure as `inputs`
and new (carry) forward_key and param_noise_key.
"""
-    minibatch_input = with_sharding_constraint(
-        jax.tree.map(
-            lambda x: jax.lax.dynamic_slice_in_dim(
-                x,
-                start_index=minibatch_index * minibatch_size,
-                slice_size=minibatch_size,
-                axis=0,
-            ),
-            inputs["input_batch"],
-        ),
-        input_partition_spec(),
-    )
+    minibatch_input = jax.tree.map(
+        lambda x: jax.lax.dynamic_slice_in_dim(
+            x,
+            start_index=minibatch_index * minibatch_size,
+            slice_size=minibatch_size,
+            axis=0,
+        ),
Contributor:

To me, this seems more like a hack than a proper solution. That is, if we want a different input_partition_spec() than the default one, then we need this?

Contributor Author:

Sorry, I missed the default case; added it.

I think the partition spec below is good as a default, but the ability to change the PartitionSpec might be good to have. What do you think?

(None, 1): PartitionSpec(("data", "expert", "fsdp")),
(None, 2): PartitionSpec(("data", "expert", "fsdp"), "seq"), 

inputs["input_batch"],
Contributor (@apghml, Feb 27, 2025):

Suppose we have a global input batch of size 100 running on 10 chips (so a per chip size of 10) and we want to switch to doing 10 grad accumulation steps each with a global batch size of 10 (1 per chip per accumulation step).

Suppose that the input is originally sharded evenly across the chips (first 10 on first chip, second 10 on second chip, etc). Then when we get the first slice of 10 for the first grad accumulation step, won't all these examples be on the same chip? Will that cause a problem? (E.g., if we worry XLA might not automatically reshard the examples across chips?)

Maybe we should reshard the batch axis only?

Contributor:

+1 on the potential design problem here. Can you double check and ensure that axis=0 is confirmed to be batch size?

Contributor Author (@apoorvtintin, Mar 5, 2025):

We can completely avoid the batch reshards using a reshape + transpose. I added it to the PR; let me know if it addresses your concerns.

Using the same example as @apghml:

Suppose we have a global input batch of size 100 running on 10 chips (so a per chip size of 10) and we want to switch to doing 10 grad accumulation steps each with a global batch size of 10 (1 per chip per accumulation step).
Suppose that the input is originally sharded evenly across the chips (first 10 on first chip, second 10 on second chip, etc). Then when we get the first slice of 10 for the first grad accumulation step, won't all these examples be on the same chip? Will that cause a problem? (E.g., if we worry XLA might not automatically reshard the examples across chips?)

Rather than using the first 10 examples available in the global batch array for the first iteration, we construct the minibatch using the first example from every device: minibatch 0 => [0, 10, 20, ...], minibatch 1 => [1, 11, 21, ...]. This is achieved using the reshape and transpose.

Essentially, the logic here ensures each device uses local batches, avoiding extra reshards.
This also scales well across multiple nodes, since each node only runs a local reshape + transpose, and higher per-device batch sizes are supported as well.

This should address the concerns around input batch reshards; let me know if there are still more concerns.

+1 on the potential design problem here. Can you double check and ensure that axis=0 is confirmed to be batch size?

@kelvin-zou I can't think of a way to get the size of a specific axis at runtime, but I do believe JAX should give an informative error if batch size % batch axis size != 0.
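[Editor's note] To make the reordering concrete, here is a toy sketch of the index arithmetic described above, with sizes taken from the example; this is illustrative, not the PR's code.

import jax.numpy as jnp

# 100 examples, 10 accumulation steps -> 10 minibatches of 10. With the
# batch laid out contiguously per chip (chip k holds examples [10k, 10k+9]),
# reshape + transpose makes minibatch i = example i from every chip.
global_batch = jnp.arange(100)
steps, minibatch_size = 10, 10
minibatches = global_batch.reshape(minibatch_size, steps).transpose()
assert minibatches.shape == (steps, minibatch_size)
assert minibatches[0].tolist() == [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]
assert minibatches[1].tolist() == [1, 11, 21, 31, 41, 51, 61, 71, 81, 91]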

Contributor:

Thanks for the explanation. Can you add a test that fails without this fix?

+    )

+    minibatch_input = minibatch_partitioner(minibatch_input)
next_forward_key, forward_key = jax.random.split(forward_key)
next_param_noise_key, param_noise_key = jax.random.split(param_noise_key)

@@ -106,6 +107,7 @@ def with_minibatch_steps(
steps: int,
metric_accumulator: ConfigOr[MetricAccumulator],
grad_dtype: Optional[jnp.dtype] = None,
minibatch_partitioner: Optional[ConfigOr[InputPartitionFn]] = None,
) -> Callable[[ForwardFn], ForwardFn]:
"""Decorate a ForwardFn to accumulate gradients over minibatch steps.

@@ -134,16 +136,32 @@

TODO(cemkoc): Investigate the slight difference in loss curves when decorated.

    A minibatch_partitioner is used to partition the minibatch inputs to the original_func.
    If minibatch_partitioner is None, a default partitioner is used which partitions the
    minibatch along (("data", "expert", "fsdp"), "seq"); otherwise, the minibatch_partitioner
    passed in is used.

Args:
steps: Number of gradient accumulation steps.
metric_accumulator: A `MetricAccumulator` to accumulate minibatch summaries from the
forward output.
grad_dtype: Optional dtype to cast the grads back to after accumulating in fp32.
minibatch_partitioner: If not None, contains config for a partitioner that applies
additional sharding constraints on each minibatch created.

Returns:
Decorated ForwardFn.
"""

# Default partitioner for minibatches.
if not minibatch_partitioner:
minibatch_partitioner = partition_by_path_rank(
path_rank_to_partition={
Contributor:

Can we default this to the same sharding the input is already using along all non-batch axes?

Contributor Author (@apoorvtintin, Feb 19, 2025):

Just confirming I read this correctly: do we want to default to input_partition_specs from utils.py like before, and not what the input_partitioner sets?

Or is the ask to use partition_by_path_rank to replicate what input_partition_specs was doing?

Contributor:

Not exactly. I was envisioning that for all axes other than axis 0, we default to whatever sharding the input already has. For axis 0, ideally we could also keep whatever sharding the input already has too, although I'm not sure that would work with logical batching.

Contributor Author:

For axis 0, ideally we could also keep whatever sharding the input already has too, although I'm not sure that would work with logical batching

I think preserving the sharding of the input would be perfect; logical batching already inserts the correct sharding constraint after squeezing out the padded batches.

(None, 1): PartitionSpec(("data", "expert", "fsdp")),
(None, 2): PartitionSpec(("data", "expert", "fsdp"), "seq"),
}
)

def decorator(fn: ForwardFn) -> ForwardFn:
# We define a positional arg only version of the original function
# that is passed because jax.value_and_grad does not accept
@@ -171,13 +189,29 @@ def fwd_helper(
and second is the accumulated grads (if `compute_grad` is True)
otherwise None.
"""
partitioner = maybe_instantiate(minibatch_partitioner)
minibatch_size = _compute_minibatch_size(inputs["input_batch"], steps=steps)

# Create a sample minibatch for the carry buffer creation below
Contributor:

Could you explain in more detail why this is needed?

Contributor:

+1

Contributor Author (@apoorvtintin, Mar 4, 2025):

I saw broadcasting errors coming from the scan body (example below); JAX complained that the carry buffer shape and the output of the minibatch step are incompatible.

P.S. Below is the error where acc=4 and the full batch size is 32:
TypeError: add got incompatible shapes for broadcasting: (32, 4096, 3072), (8, 4096, 3072).

The carry buffer initialization uses the full batch while creating the buffer, which does not match the output of the minibatch step, since that uses the minibatch shapes.

The simple fix is to use a sample minibatch when creating the carry buffer, ensuring its shapes match the output of the minibatch step.

Let me know if I missed something.
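[Editor's note] A minimal repro of the mismatch with toy shapes (not the trainer's real pytrees): lax.scan requires the carry entering and leaving the body to have identical shapes, so a carry built from full-batch shapes cannot absorb minibatch-shaped outputs.

import jax
import jax.numpy as jnp

full = jnp.zeros((32, 4))  # full-batch-shaped output (wrong carry init)
mini = jnp.zeros((8, 4))   # minibatch-shaped output per scan step (acc=4)

def body(carry, _):
    return carry + mini, None  # carry must match the per-step output shape

# Initializing the carry from a minibatch-shaped sample works:
carry, _ = jax.lax.scan(body, jnp.zeros_like(mini), xs=None, length=4)
# Initializing it from the full batch instead raises, e.g.:
# TypeError: add got incompatible shapes for broadcasting: (32, 4), (8, 4).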

Contributor:

Do we know why this issue wasn't causing errors before?

Contributor Author (@apoorvtintin, Mar 4, 2025):

The unit test uses a toy model which does not have any metric/output that relies on batch size, which is why it does not catch this issue. I dug a bit deeper and found that for fuji models, output_collection/module_outputs/decoder/transformer/layer3/output carries the batch dimension in its output - ref below.

path (GetAttrKey(name='output_collection'), GetAttrKey(name='module_outputs'), DictKey(key='decoder'), DictKey(key='transformer'), DictKey(key='layer3'), DictKey(key='output')) shape (32, 4096, 3072)

(
sample_minibatch_inputs,
_,
_,
) = _make_scan_minibatch_inputs(
inputs,
forward_key=inputs["forward_key"],
param_noise_key=inputs["param_noise_key"],
minibatch_size=minibatch_size,
minibatch_index=0,
minibatch_partitioner=partitioner,
)

            # Carry initialization for the lax.scan procedure. Since we are passing a
            # `MetricAccumulator` into the carry, and carry input/output shapes must match,
            # we need to initialize the `MetricAccumulator` summary with the right PyTree
            # structure.
_, primal_output_shape = jax.eval_shape(
-                original_func_positional_args, model_params, inputs
+                original_func_positional_args, model_params, sample_minibatch_inputs
)
init_primal_out = jax.tree.map(jnp.zeros_like, primal_output_shape)
init_accumulator = maybe_instantiate(metric_accumulator)
Expand Down Expand Up @@ -213,6 +247,7 @@ def scan_body(
param_noise_key=param_noise_key,
minibatch_size=minibatch_size,
minibatch_index=minibatch_index,
minibatch_partitioner=partitioner,
)
minibatch_args = (model_params, minibatch_inputs)

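[Editor's note] Before the tests below, a self-contained toy sketch of the scan-based accumulation pattern this file implements; illustrative only, since the real decorator additionally handles metric accumulation, RNG splitting, and minibatch sharding.

import jax
import jax.numpy as jnp

def accumulate_grads(loss_fn, params, batch, steps):
    """Averages gradients over `steps` contiguous minibatches via lax.scan."""
    minibatch_size = batch.shape[0] // steps

    def body(acc, idx):
        # Slice minibatch `idx` out of the full batch along axis 0.
        mb = jax.lax.dynamic_slice_in_dim(
            batch, idx * minibatch_size, minibatch_size, axis=0
        )
        grads = jax.grad(loss_fn)(params, mb)
        return jax.tree.map(jnp.add, acc, grads), None

    zero = jax.tree.map(jnp.zeros_like, params)
    acc, _ = jax.lax.scan(body, zero, jnp.arange(steps))
    return jax.tree.map(lambda g: g / steps, acc)

# Example: mean-squared loss of a linear model on an all-ones batch.
params = {"w": jnp.ones((4,))}
batch = jnp.ones((8, 4))
loss = lambda p, x: jnp.mean((x @ p["w"]) ** 2)
grads = accumulate_grads(loss, params, batch, steps=2)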
161 changes: 161 additions & 0 deletions axlearn/common/gradient_accumulation_test.py
@@ -1,14 +1,175 @@
# Copyright © 2024 Apple Inc.
"""Test module for gradient_accumulation.py"""
from typing import Callable

import chex
import jax
import jax.numpy as jnp
import numpy as np
import pytest
from absl.testing import absltest, parameterized
from jax.experimental.pjit import pjit

from axlearn.common import gradient_accumulation, test_utils
from axlearn.common.config import config_for_function
from axlearn.common.input_base import partition_by_path_rank
from axlearn.common.metrics import MetricAccumulator, WeightedScalar
from axlearn.common.module import new_output_collection
from axlearn.common.update_transformation import ForwardOutputs
from axlearn.common.utils import Nested, PartitionSpec, Tensor, tree_paths


class TestMinibatchPartitioner(test_utils.TestCase):
"""Test `with_minibatch_steps` decorator argument minibatch_partitioner."""

def create_dummy_inputs(self, steps):
# Multiply by accumulation steps
self.batch_size = 4 * steps
self.seq_len = 8
self.params = dict(
w=jnp.asarray([0.0, 2.0, 2.0, -3.0]),
b=jnp.asarray([0.0, -1.0, 0.0, 0.0]),
)

self.input_batch = {
"input_ids": jnp.ones((self.batch_size, self.seq_len), dtype=jnp.int32),
"target_labels": jnp.ones((self.batch_size, self.seq_len), dtype=jnp.int32),
"target_num_bytes": jnp.ones((self.batch_size,), dtype=jnp.int32),
}
forward_key, param_noise_key = jax.random.split(jax.random.PRNGKey(0), 2)
self.inputs = dict(
input_batch=self.input_batch,
forward_key=forward_key,
param_noise_key=param_noise_key,
)

def create_loss_fn(self, expected_minibatch_sharding):
"""Simple ForwardFn with a check for minibatch sharding."""

def _check_equal_sharding(input_batch: Nested[Tensor], expected: dict):
"""Checks if sharding for input_batch matches expected."""

def callback_sharding(
*,
input_batch: Nested[Tensor],
callback: Callable[[str, jax.sharding.Sharding], None],
):
"""Invokes callback with the sharding.

The callback is invoked with (path: str, sharding: Sharding).
"""

def check_sharding(path, value):
jax.debug.inspect_array_sharding(
value, callback=lambda sharding: callback(path, sharding)
)

                jax.tree.map(check_sharding, tree_paths(input_batch), input_batch)
return input_batch

callback = lambda path, sharding: self.assertEqual(expected[path], sharding.spec)

callback_sharding(
input_batch=input_batch,
callback=callback,
)

def loss_fn(*, model_params, inputs) -> ForwardOutputs:
"""Simple ForwardFn."""
_check_equal_sharding(
input_batch=inputs["input_batch"],
expected=expected_minibatch_sharding,
)
loss = -jax.nn.log_softmax(model_params["w"] + model_params["b"])[1]
output_collection = new_output_collection()
output_collection.state_updates["w"] = model_params["w"] + 1
output_collection.state_updates["loss"] = WeightedScalar(loss, 1)
return ForwardOutputs(loss=loss, aux={}, output_collection=output_collection)

return loss_fn

@pytest.mark.skipif(
jax.device_count() != 4 or jax.process_count() != 1,
reason=(
"Incorrect device & process count for mesh.\n"
"Use XLA_FLAGS=--xla_force_host_platform_device_count=4 to run locally."
),
)
@parameterized.named_parameters(
("one_step", 1), # no accumulation
("two_steps", 2),
("four_steps", 4),
)
def test_minibatch_partitioner_default(self, steps):
"""Tests grad accumulation with minibatch steps and default minibatch partitioner."""

# pylint: disable=too-many-function-args
with jax.sharding.Mesh(
devices=np.array(jax.devices()).reshape(1, 2, 1, 2)[..., None],
axis_names=("expert", "data", "fsdp", "seq", "model"),
):
self.create_dummy_inputs(steps)
loss_fn = self.create_loss_fn(
expected_minibatch_sharding={
"input_ids": PartitionSpec(("data"), "seq"),
"target_labels": PartitionSpec(("data"), "seq"),
"target_num_bytes": PartitionSpec(("data")),
},
)

loss_fn = gradient_accumulation.with_minibatch_steps(
steps=steps,
metric_accumulator=MetricAccumulator.default_config(),
minibatch_partitioner=None,
)(loss_fn)

pjit(loss_fn, in_shardings=None).lower(
model_params=self.params, inputs=self.inputs
).compile()

@pytest.mark.skipif(
jax.device_count() != 4 or jax.process_count() != 1,
reason=(
"Incorrect device & process count for mesh.\n"
"Use XLA_FLAGS=--xla_force_host_platform_device_count=4 to run locally."
),
)
@parameterized.named_parameters(
("one_step", 1), # no accumulation
("two_steps", 2),
("four_steps", 4),
)
def test_minibatch_partitioner_non_default(self, steps):
"""Tests grad accumulation with minibatch steps and a custom minibatch partitioner."""

with jax.sharding.Mesh(
devices=np.array(jax.devices()).reshape(2, 2)[..., None],
axis_names=("data", "seq", "model"),
):
self.create_dummy_inputs(steps)
loss_fn = self.create_loss_fn(
expected_minibatch_sharding={
"input_ids": PartitionSpec(("data", "seq")),
"target_labels": PartitionSpec(("data", "seq")),
"target_num_bytes": PartitionSpec(("data", "seq")),
},
)

loss_fn = gradient_accumulation.with_minibatch_steps(
steps=steps,
metric_accumulator=MetricAccumulator.default_config(),
minibatch_partitioner=config_for_function(partition_by_path_rank).set(
path_rank_to_partition={
                        # Shard batch dim on all available axes
(None, 1): PartitionSpec(("data", "seq")),
(None, 2): PartitionSpec(("data", "seq"), None),
}
),
)(loss_fn)

pjit(loss_fn, in_shardings=None).lower(
model_params=self.params, inputs=self.inputs
).compile()


class TestMinibatchSteps(test_utils.TestCase):
8 changes: 7 additions & 1 deletion axlearn/common/trainer_config_modifier.py
@@ -2,7 +2,7 @@

"""Defines trainer config modifiers, which will be used in model definitions."""

-from typing import Dict, Sequence, Union
+from typing import Dict, Optional, Sequence, Union

from axlearn.common import config
from axlearn.common.base_layer import RematSpec
@@ -16,6 +16,7 @@
maybe_instantiate,
)
from axlearn.common.gradient_accumulation import with_minibatch_steps
from axlearn.common.input_base import InputPartitionFn
from axlearn.common.metrics import MetricAccumulator
from axlearn.common.trainer import SpmdTrainer
from axlearn.common.utils import HybridMeshShape, MeshShape, PartitionSpec
@@ -29,18 +30,22 @@ class Config(ConfigModifier.Config):
"""Configure GradientAccumulationModifier.

Attributes:
            grad_acc_steps: The number of steps to accumulate the gradients from mini-batches.
metric_accumulator: The metric accumulator to export the metrics.
            minibatch_partitioner: Constrains each minibatch to a PartitionSpec.
        """
"""

grad_acc_steps: Required[int] = REQUIRED
metric_accumulator: MetricAccumulator.Config = MetricAccumulator.default_config()
minibatch_partitioner: Optional[ConfigOr[InputPartitionFn]] = None

def __init__(self, cfg: Config):
super().__init__(cfg)
cfg = self.config
self._grad_acc_steps = cfg.grad_acc_steps
self._metric_accumulator = cfg.metric_accumulator
self._minibatch_partitioner = cfg.minibatch_partitioner

def __call__(self, cfg: SpmdTrainer.Config) -> SpmdTrainer.Config:
"""Overwrite the forward_fn_transformation to accumulate gradients for grad_acc_steps steps.
@@ -63,6 +68,7 @@ def __call__(self, cfg: SpmdTrainer.Config) -> SpmdTrainer.Config:
).set(
steps=self._grad_acc_steps,
metric_accumulator=self._metric_accumulator,
minibatch_partitioner=self._minibatch_partitioner,
)
return cfg

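[Editor's note] Putting the pieces together, a hedged configuration sketch: the partitioner spec is illustrative, borrowed from the defaults above, and grad_acc_steps=4 matches the fuji.py change below.

from axlearn.common.config import config_for_function
from axlearn.common.input_base import partition_by_path_rank
from axlearn.common.trainer_config_modifier import GradientAccumulationModifier
from axlearn.common.utils import PartitionSpec

# Configure the modifier with an explicit minibatch partitioner.
grad_acc = GradientAccumulationModifier.default_config().set(
    grad_acc_steps=4,
    minibatch_partitioner=config_for_function(partition_by_path_rank).set(
        path_rank_to_partition={
            (None, 1): PartitionSpec(("data", "expert", "fsdp")),
            (None, 2): PartitionSpec(("data", "expert", "fsdp"), "seq"),
        }
    ),
)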
3 changes: 3 additions & 0 deletions axlearn/experiments/text/gpt/fuji.py
@@ -352,6 +352,9 @@ def get_trainer_kwargs(
),
*trn2_config.module_modifications,
*trn2_config.partition_spec_modifications,
GradientAccumulationModifier.default_config().set(
grad_acc_steps=4,
),
],
),
),