
Commit 68e2630

feat(archon): improve pipeline parallelism memory handling
Add reshard_after_forward_policy config, MoE-aware donated_buffer management, output chunk memory optimization, and comprehensive PP memory guide documentation.

Key changes:
- Add reshard_after_forward_policy config for FSDP forward resharding control
- Add is_moe_model_config utility; only disable donated_buffer for MoE models
- Add _NullOutputChunks to free logits during PP training step
- Fix microbatch validation to use num_total_stages instead of pp_size
- Add PP Memory Guide appendix to archon.md
- Replace handling_oom.md PP section with seealso cross-reference
1 parent f1fc2d3 commit 68e2630

9 files changed

Lines changed: 302 additions & 28 deletions


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@

 # opencode
 .opencode/sessions/
+.sisyphus/

 # Ruff
 .ruff_cache/

AGENTS.md

Lines changed: 1 addition & 0 deletions
@@ -9,6 +9,7 @@
 ```bash
 # Environment
 uv sync --extra cuda # dependencies (or `uv sync` without CUDA)
+source .venv/bin/activate # activate venv BEFORE pre-commit or git commit if venv exists
 pre-commit install # formatting hooks (Ruff, mdformat, clang-format, nbstripout, autoflake)
 pre-commit run --all-files # lint + format everything

areal/api/cli_args.py

Lines changed: 18 additions & 0 deletions
@@ -489,6 +489,18 @@ class ArchonEngineConfig:
         },
     )

+    # FSDP reshard policy after forward pass
+    reshard_after_forward_policy: str = field(
+        default="default",
+        metadata={
+            "help": "FSDP reshard policy after forward pass. "
+            "'default': reshard when pipeline parallelism is off; keep unsharded when on to avoid repeated all-gather per microbatch. "
+            "'always': always reshard after forward (saves memory). "
+            "'never': never reshard after forward.",
+            "choices": ["default", "always", "never"],
+        },
+    )
+
     # Deterministic mode
     use_deterministic_algorithms: bool = field(
         default=False,
@@ -515,6 +527,12 @@ def __post_init__(self):
                 f"pp_last_stage_less_layers must be >= 0, "
                 f"got {self.pp_last_stage_less_layers}"
             )
+        valid_reshard_policies = ("default", "always", "never")
+        if self.reshard_after_forward_policy not in valid_reshard_policies:
+            raise ValueError(
+                f"reshard_after_forward_policy must be one of {valid_reshard_policies}, "
+                f"got '{self.reshard_after_forward_policy}'"
+            )


 # These configurations are used by Megatron Bridge to build Megatron models.
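For illustration, a minimal sketch of setting the new option from Python. This assumes the remaining `ArchonEngineConfig` fields have defaults (as the dataclass pattern above suggests); it only exercises the new field and its `__post_init__` check.

```python
from areal.api.cli_args import ArchonEngineConfig

# Trade extra all-gathers per microbatch for lower peak memory.
cfg = ArchonEngineConfig(reshard_after_forward_policy="always")

# Invalid values are rejected at construction time by __post_init__.
try:
    ArchonEngineConfig(reshard_after_forward_policy="sometimes")
except ValueError as err:
    print(err)  # reshard_after_forward_policy must be one of ('default', 'always', 'never'), ...
```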

areal/experimental/engine/archon_engine.py

Lines changed: 38 additions & 12 deletions
@@ -74,6 +74,7 @@
     ulysses_gather_output,
     ulysses_slice_inputs,
 )
+from areal.experimental.models.archon.utils import is_moe_model_config
 from areal.infra.dist_rollout import DistRolloutCoordinator
 from areal.infra.platforms import current_platform
 from areal.models.tree_attn.functional import (
@@ -292,13 +293,18 @@ def initialize(self, addr: str | None, ft_spec: FinetuneSpec, *args, **kwargs):
         ac_config = self._build_ac_config()
         enable_compile = self.config.archon.enable_compile

+        # NOTE: Upgrading PyTorch may resolve these in the future.
         # Zero-bubble schedules (InterleavedZeroBubble, ZBVZeroBubble, DualPipeV)
-        # use split backward (I/W steps). This is incompatible with:
-        # 1. torch.compile - donated buffer optimization assumes a single
-        #    backward pass (retain_graph=False).
-        # 2. Op-level selective AC - its per-op cache (storage.pop) is consumed
+        # use split backward (I/W steps) with retain_graph=True between them.
+        # This is incompatible with:
+        # 1. torch.compile - disabled unconditionally for zero-bubble.
+        # 2. donated_buffer (MoE only) - MoE models have internally compiled
+        #    ops (via AOTAutograd) whose backward uses donated buffers. These
+        #    are freed after backward, conflicting with retain_graph=True.
+        #    Dense models have no such ops and are unaffected.
+        # 3. Op-level selective AC - its per-op cache (storage.pop) is consumed
         #    by the I step, leaving nothing for the W step recompute.
-        # 3. memory_budget AC - it depends on torch.compile.
+        # 4. memory_budget AC - it depends on torch.compile.
         # Full AC / layer-level selective AC use standard checkpoint_wrapper
         # whose gid-based recompute supports multiple backward passes.
         schedule_class = get_schedule_class(self.config.archon.pp_schedule)
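The constraint documented above hinges on split backward reusing the autograd graph. A toy PyTorch illustration (not AReaL code) of separating the "I" (input-grad) and "W" (weight-grad) steps, which is only legal if the first pass keeps the graph alive:

```python
import torch

x = torch.randn(4, requires_grad=True)   # stands in for activations/inputs
w = torch.randn(4, requires_grad=True)   # stands in for weights
loss = (x * w).sum()

# "I" step: input gradients only; retain_graph=True keeps buffers for the W step.
(grad_x,) = torch.autograd.grad(loss, (x,), retain_graph=True)
# "W" step: weight gradients from the same graph, i.e. a second backward pass.
(grad_w,) = torch.autograd.grad(loss, (w,))
```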
@@ -316,6 +322,22 @@ def initialize(self, addr: str | None, ft_spec: FinetuneSpec, *args, **kwargs):
             )
             enable_compile = False

+            # NOTE: Upgrading PyTorch may resolve this in the future.
+            # MoE models have internally compiled ops (via AOTAutograd)
+            # whose backward uses donated buffers - these conflict with
+            # retain_graph=True in split backward. Dense models have no
+            # such ops and are unaffected.
+            if is_moe_model_config(self.model_config):
+                import torch._functorch.config as functorch_config
+
+                if getattr(functorch_config, "donated_buffer", False):
+                    self.logger.info(
+                        f"{schedule_name} requires donated_buffer=False "
+                        "for MoE models (internally compiled ops conflict "
+                        "with retain_graph=True in split backward). Disabling."
+                    )
+                    functorch_config.donated_buffer = False
+
         if ac_config is not None and (
             (
                 ac_config.mode == "selective"
@@ -899,7 +921,7 @@ def _apply_pipeline_parallelism(
             reduce_dtype=torch.float32,
             loss_parallel=True,
             cpu_offload=self.config.archon.offload_params,
-            reshard_after_forward_policy="default",
+            reshard_after_forward_policy=self.config.archon.reshard_after_forward_policy,
             ac_config=ac_config,
             enable_compile=enable_compile,
         )
@@ -938,7 +960,7 @@ def _apply_parallelism(
             reduce_dtype=torch.float32,
             loss_parallel=True,
             cpu_offload=self.config.archon.offload_params,
-            reshard_after_forward_policy="default",
+            reshard_after_forward_policy=self.config.archon.reshard_after_forward_policy,
             ac_config=ac_config,
             enable_compile=enable_compile,
         )
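How the three policy strings translate into FSDP's per-module reshard decision follows the help text added in cli_args.py. A hypothetical resolution helper, written only to summarize that behavior (it is an assumption for illustration, not the repo's actual implementation), might look like:

```python
def resolve_reshard_after_forward(policy: str, pp_enabled: bool) -> bool:
    """Map the config string to FSDP's reshard-after-forward flag (illustrative only)."""
    if policy == "always":
        return True   # cheapest on memory, pays an all-gather per forward
    if policy == "never":
        return False  # keeps params unsharded: fastest, highest memory
    # "default": reshard only when pipeline parallelism is off, so PP
    # microbatches do not repeat the all-gather on every forward pass.
    return not pp_enabled
```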
@@ -1239,16 +1261,20 @@ def _prepare_mb_list(self, input_: dict[str, Any]) -> MicroBatchList:

         input_ = amend_position_ids(input_)

-        # Pipeline parallelism requires n_microbatches >= pp_stages
+        # Pipeline parallelism requires n_microbatches >= num_total_stages
         if self.parallel_dims.pp_enabled:
             pp_size = self.parallel_dims.pp
+            stages_per_rank = len(self.pp_stages)
+            num_total_stages = pp_size * stages_per_rank
             n_seqs = input_["attention_mask"].shape[0]
-            if n_seqs < pp_size:
+            if n_seqs < num_total_stages:
                 raise RuntimeError(
-                    f"Pipeline parallelism requires at least {pp_size} sequences, "
-                    f"but got {n_seqs}. Increase batch size or reduce PP degree."
+                    f"Pipeline parallelism requires at least {num_total_stages} "
+                    f"sequences (pp_size={pp_size} * stages_per_rank="
+                    f"{stages_per_rank}), but got {n_seqs}. "
+                    f"Increase batch size or reduce PP degree/stages."
                 )
-            min_n_mbs = pp_size
+            min_n_mbs = num_total_stages
         mb_spec = MicroBatchSpec.new(
             self.config.mb_spec,
             n_mbs=max(min_n_mbs, self.config.mb_spec.n_mbs or 1),
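A quick worked example of the tightened check (numbers chosen arbitrarily): with pp_size=4 and an interleaved schedule placing two stages on each rank, a batch must carry at least 8 sequences.

```python
pp_size = 4            # ranks along the pipeline-parallel dimension
stages_per_rank = 2    # e.g. an interleaved schedule; len(self.pp_stages) in the code above
num_total_stages = pp_size * stages_per_rank   # 8

n_seqs = 6             # hypothetical batch size
if n_seqs < num_total_stages:
    print(f"Too few sequences: need >= {num_total_stages}, got {n_seqs}")
```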

areal/experimental/engine/archon_runner.py

Lines changed: 25 additions & 1 deletion
@@ -20,6 +20,11 @@
 logger = logging.getLogger("ArchonRunner")


+class _NullOutputChunks(list):
+    def append(self, item: Any) -> None:
+        pass
+
+
 class ForwardBackwardRunner(ABC):
     """Abstract base for forward/backward execution strategies."""

@@ -216,9 +221,11 @@ def _run_eval(
         if not self.has_last_stage:
             return None
         output_stage = self._get_output_stage()
-        return self._process_outputs(
+        results = self._process_outputs(
             output_stage.output_chunks, contexts, process_output_fn
         )
+        output_stage.output_chunks.clear()
+        return results

     def _run_train(
         self,
@@ -232,7 +239,24 @@
         pp_loss_fn = self._create_loss_fn(contexts, process_output_fn)
         schedule = self._create_schedule(n_microbatches, loss_fn=pp_loss_fn)
         self._patch_skip_output_merge(schedule)
+
+        # NOTE: Upgrading PyTorch may resolve this in the future.
+        # Replace output_chunks with a null list so
+        # forward_one_chunk's `output_chunks.append(output)` becomes a no-op.
+        # (torch/distributed/pipelining/schedules.py)
+        # This lets each microbatch's logits be freed right after its backward,
+        # instead of holding all N sets of logits until step() returns.
+        output_stage = None
+        if self.has_last_stage:
+            output_stage = self._get_output_stage()
+            output_stage.output_chunks = _NullOutputChunks()
+
         schedule.step(*args, target=batched_target, **batched_kwargs)
+
+        # Restore normal list so subsequent eval() calls on the same
+        # stage can read output_chunks normally.
+        if output_stage is not None:
+            output_stage.output_chunks = []
         return []

     def _create_loss_fn(
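To make the output-chunk trick concrete, here is a standalone toy mirroring the class added above (redefined locally, not imported from the repo): anything "appended" is dropped immediately, so per-microbatch logits are never pinned by the list until step() finishes.

```python
class _NullOutputChunks(list):
    def append(self, item) -> None:  # swallow outputs instead of retaining them
        pass

chunks = _NullOutputChunks()
chunks.append("logits for microbatch 0")   # no-op: nothing is stored
chunks.append("logits for microbatch 1")
print(len(chunks))  # 0 -- the only reference is the producer's, which it soon drops
```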

areal/experimental/models/archon/utils.py

Lines changed: 20 additions & 0 deletions
@@ -137,9 +137,29 @@ def validate_ep_constraints(
         )


+def is_moe_model_config(model_config: object) -> bool:
+    """Check if a HuggingFace PretrainedConfig represents a Mixture-of-Experts model.
+
+    Inspects common HF config attributes (num_experts, num_local_experts)
+    to determine whether the model uses MoE layers.
+
+    Args:
+        model_config: A HuggingFace PretrainedConfig (or any object with
+            num_experts / num_local_experts attributes).
+
+    Returns:
+        True if the config indicates an MoE model with more than one expert.
+    """
+    num_experts = getattr(model_config, "num_experts", None)
+    if num_experts is None:
+        num_experts = getattr(model_config, "num_local_experts", None)
+    return num_experts is not None and num_experts > 1
+
+
 __all__ = [
     "ModelArgsProtocol",
     "MoEModelArgsProtocol",
+    "is_moe_model_config",
     "validate_cp_constraints",
     "validate_tp_constraints",
     "validate_ep_constraints",

docs/best_practices/handling_oom.md

Lines changed: 8 additions & 0 deletions
@@ -144,6 +144,14 @@ allocation_mode: sglang:d4+archon:d2p2e2
 We recommend pipeline and expert parallelism over tensor/context parallelism. Check
 [Allocation Mode Reference](../reference/alloc_mode.md) for more details.

+```{seealso}
+Pipeline parallelism introduces unique memory challenges (microbatch warmup accumulation,
+zero-bubble `retain_graph` overhead, FSDP resharding trade-offs, gradient accumulation
+costs, and per-rank memory budgeting). See the
+[Archon PP Memory Guide](../tutorial/archon.md#appendix-pipeline-parallelism-memory-guide)
+for a comprehensive walkthrough.
+```
+
 ### 4. Switch to a Lightweight Optimizer

 AReaL supports different optimizers depending on the training engine.
