Skip to content

Commit 9a57828

Browse files
haocizhang and pytorchmergebot
authored and committed
Implemented flexible PP schedule (pytorch#129597)
Enabled some cases to work where num_microbatches % pp_size != 0. Using the flex_pp schedule, we will have num_rounds = max(1, n_microbatches // pp_group_size) and it works as long as n_microbatches % num_rounds is 0. As a few examples, support pp_group_size = 4, n_microbatches = 10. We will have num_rounds = 2 and n_microbatches % 2 is 0. pp_group_size = 4, n_microbatches = 3. We will have num_rounds = 1 and n_microbatches % 1 is 0. Moved over from PiPPy (pytorch/PiPPy#1129) Tested using the config in (1), schedule looks like the following graph: ``` =========== ALL_RANK_ACTIONS =========== Rank 0 Rank 1 Rank 2 Rank 3 Step 00: F0_s0 None None None Step 01: F1_s0 F0_s1 None None Step 02: F2_s0 F1_s1 F0_s2 None Step 03: F3_s0 F2_s1 F1_s2 F0_s3 Step 04: F4_s0 F3_s1 F2_s2 F1_s3 Step 05: F0_s4 F4_s1 F3_s2 F2_s3 Step 06: F1_s4 F0_s5 F4_s2 F3_s3 Step 07: F2_s4 F1_s5 F0_s6 F4_s3 Step 08: F3_s4 F2_s5 F1_s6 F0_s7 Step 09: F4_s4 F3_s5 None B0_s7 Step 10: F5_s0 None F2_s6 F1_s7 Step 11: None None B0_s6 B1_s7 Step 12: None F4_s5 F3_s6 F2_s7 Step 13: None B0_s5 B1_s6 B2_s7 Step 14: F6_s0 F5_s1 F4_s6 F3_s7 Step 15: B0_s4 B1_s5 B2_s6 B3_s7 Step 16: F7_s0 F6_s1 F5_s2 F4_s7 Step 17: B1_s4 B2_s5 B3_s6 B4_s7 Step 18: F8_s0 F7_s1 F6_s2 F5_s3 Step 19: B2_s4 B3_s5 B4_s6 B0_s3 Step 20: F9_s0 F8_s1 F7_s2 F6_s3 Step 21: B3_s4 B4_s5 B0_s2 B1_s3 Step 22: F5_s4 F9_s1 F8_s2 F7_s3 Step 23: B4_s4 B0_s1 B1_s2 B2_s3 Step 24: F6_s4 F5_s5 F9_s2 F8_s3 Step 25: B0_s0 B1_s1 B2_s2 B3_s3 Step 26: F7_s4 F6_s5 F5_s6 F9_s3 Step 27: B1_s0 B2_s1 B3_s2 B4_s3 Step 28: F8_s4 F7_s5 F6_s6 F5_s7 Step 29: B2_s0 B3_s1 B4_s2 B5_s7 Step 30: F9_s4 F8_s5 F7_s6 F6_s7 Step 31: B3_s0 B4_s1 B5_s6 B6_s7 Step 32: None F9_s5 F8_s6 F7_s7 Step 33: B4_s0 B5_s5 B6_s6 B7_s7 Step 34: None None F9_s6 F8_s7 Step 35: B5_s4 B6_s5 B7_s6 B8_s7 Step 36: None None None F9_s7 Step 37: B6_s4 B7_s5 B8_s6 B9_s7 Step 38: None None None None Step 39: B7_s4 B8_s5 B9_s6 B5_s3 Step 40: None None None None Step 41: B8_s4 B9_s5 B5_s2 B6_s3 Step 42: 
None None None None Step 43: B9_s4 B5_s1 B6_s2 B7_s3 Step 44: None None None None Step 45: B5_s0 B6_s1 B7_s2 B8_s3 Step 46: None None None None Step 47: B6_s0 B7_s1 B8_s2 B9_s3 Step 48: None None None Step 49: B7_s0 B8_s1 B9_s2 Step 50: None None Step 51: B8_s0 B9_s1 Step 52: None ``` Pull Request resolved: pytorch#129597 Approved by: https://github.com/H-Huang
1 parent ac5f655 commit 9a57828

File tree

4 files changed

+211
-69
lines changed

4 files changed

+211
-69
lines changed

docs/source/distributed.pipelining.rst

+4-1
Original file line numberDiff line numberDiff line change
@@ -414,7 +414,8 @@ You can implement your own pipeline schedule by extending one of the following t
414414
``PipelineScheduleMulti`` is for schedules that assigns multiple stages per rank.
415415

416416
For example, ``ScheduleGPipe`` and ``Schedule1F1B`` are subclasses of ``PipelineScheduleSingle``.
417-
Whereas, ``ScheduleInterleaved1F1B`` and ``ScheduleLoopedBFS`` are subclasses of ``PipelineScheduleMulti``.
417+
Whereas, ``ScheduleFlexibleInterleaved1F1B``, ``ScheduleInterleaved1F1B`` and ``ScheduleLoopedBFS``
418+
are subclasses of ``PipelineScheduleMulti``.
418419

419420

420421
API Reference
@@ -472,6 +473,8 @@ Pipeline Schedules
472473

473474
.. autoclass:: Schedule1F1B
474475

476+
.. autoclass:: ScheduleFlexibleInterleaved1F1B
477+
475478
.. autoclass:: ScheduleInterleaved1F1B
476479

477480
.. autoclass:: ScheduleLoopedBFS

test/distributed/pipelining/test_schedule.py

+16-1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
pipeline,
1818
PipelineStage,
1919
Schedule1F1B,
20+
ScheduleFlexibleInterleaved1F1B,
2021
ScheduleGPipe,
2122
ScheduleInterleaved1F1B,
2223
ScheduleLoopedBFS,
@@ -754,7 +755,10 @@ def _validate_pipeline_order(
754755
if len(error_msg) != 0:
755756
self.fail(f"Error at timestep {timestep}: " + ",".join(error_msg))
756757

757-
@parametrize("ScheduleClass", [ScheduleInterleaved1F1B, ScheduleLoopedBFS])
758+
@parametrize(
759+
"ScheduleClass",
760+
[ScheduleFlexibleInterleaved1F1B, ScheduleInterleaved1F1B, ScheduleLoopedBFS],
761+
)
758762
def test_pipeline_order(self, ScheduleClass):
759763
# Define a list of test cases with varying num_local_stages, num_microbatches, and group_size
760764
# These should succeed since num_microbatches % group_size == 0
@@ -783,13 +787,24 @@ def test_pipeline_order(self, ScheduleClass):
783787
# odd group_sizes
784788
(4, 6, 3),
785789
(4, 10, 5),
790+
# n_mb non divisible by group_size
791+
(2, 3, 4),
792+
(2, 4, 4),
793+
(2, 10, 4),
794+
(2, 15, 4),
786795
]
787796
for num_local_stages, num_microbatches, group_size in test_cases:
788797
with self.subTest(
789798
num_local_stages=num_local_stages,
790799
num_microbatches=num_microbatches,
791800
group_size=group_size,
792801
):
802+
only_run_in_flex_pp = num_microbatches % group_size != 0
803+
if only_run_in_flex_pp and not issubclass(
804+
ScheduleClass, ScheduleFlexibleInterleaved1F1B
805+
):
806+
continue
807+
793808
print(f"{num_local_stages=} {num_microbatches=} {group_size=}")
794809
num_stages = num_local_stages * group_size
795810
stages = [

torch/distributed/pipelining/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from ._IR import Pipe, pipe_split, pipeline, SplitPoint
33
from .schedules import (
44
Schedule1F1B,
5+
ScheduleFlexibleInterleaved1F1B,
56
ScheduleGPipe,
67
ScheduleInterleaved1F1B,
78
ScheduleLoopedBFS,
@@ -17,6 +18,7 @@
1718
"PipelineStage",
1819
"build_stage",
1920
"Schedule1F1B",
21+
"ScheduleFlexibleInterleaved1F1B",
2022
"ScheduleGPipe",
2123
"ScheduleInterleaved1F1B",
2224
"ScheduleLoopedBFS",

torch/distributed/pipelining/schedules.py

+189-67
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
"PipelineScheduleSingle",
2222
"PipelineScheduleMulti",
2323
"Schedule1F1B",
24+
"ScheduleFlexibleInterleaved1F1B",
2425
"ScheduleGPipe",
2526
"ScheduleInterleaved1F1B",
2627
"ScheduleLoopedBFS",
@@ -955,6 +956,82 @@ def _calculate_single_rank_operations(self, rank):
955956
return rank_ops
956957

957958

959+
def _get_1f1b_rank_ops(
960+
n_local_stages,
961+
pp_group_size,
962+
warmup_ops,
963+
fwd_bwd_ops,
964+
cooldown_ops,
965+
rank,
966+
forward_stage_index,
967+
backward_stage_index,
968+
):
969+
# All stages start with handling microbatch 0
970+
fwd_stage_mb_index: Dict[int, int] = defaultdict(int)
971+
bwd_stage_mb_index: Dict[int, int] = defaultdict(int)
972+
# Store the list of operations used for that rank
973+
rank_ops: List[Optional[_Action]] = []
974+
# Pre-padding, rank starts with no-ops based on the warmup.
975+
for _ in range(rank):
976+
rank_ops.append(None)
977+
# These are used to calculate the number of slots to fill with no-ops, to account for the delay in warmup
978+
# when we want to wait for the backward to trickle back up and start 1f1b to align all ranks.
979+
# Formula:
980+
# pre-padding + warmup_ops + post_warmup_ops = earliest time step of first backward
981+
# post_warmup_ops = [earliest time step of first backward] - (warmup_ops + pre-padding)
982+
# earliest time step of first backward = [local_stages * group_size + 2 * (group_size - 1 - rank)]
983+
# warmup_ops = calculated above
984+
post_warmup_ops = (
985+
n_local_stages * pp_group_size + 2 * (pp_group_size - 1 - rank)
986+
) - (warmup_ops + rank)
987+
988+
total_ops = warmup_ops + fwd_bwd_ops + cooldown_ops
989+
990+
for op in range(total_ops):
991+
# Warmup phase
992+
if op < warmup_ops:
993+
fwd_stage_index = forward_stage_index(op)
994+
# This will assign the current microbatch index and update it as well
995+
fwd_stage_mb_index[fwd_stage_index] = (
996+
mb_index := fwd_stage_mb_index[fwd_stage_index]
997+
) + 1
998+
rank_ops.append(
999+
_Action(_ComputationType.FORWARD, mb_index, fwd_stage_index)
1000+
)
1001+
if op == warmup_ops - 1:
1002+
# This is the last step in the warmup phase, so we need to wait for the backward to trickle back up
1003+
rank_ops.extend([None] * post_warmup_ops)
1004+
# 1F1B Phase (forward and backward)
1005+
elif warmup_ops <= op < warmup_ops + fwd_bwd_ops:
1006+
fwd_stage_index = forward_stage_index(op)
1007+
fwd_stage_mb_index[fwd_stage_index] = (
1008+
fwd_mb_index := fwd_stage_mb_index[fwd_stage_index]
1009+
) + 1
1010+
rank_ops.append(
1011+
_Action(_ComputationType.FORWARD, fwd_mb_index, fwd_stage_index)
1012+
)
1013+
bwd_stage_index = backward_stage_index(op)
1014+
bwd_stage_mb_index[bwd_stage_index] = (
1015+
bwd_mb_index := bwd_stage_mb_index[bwd_stage_index]
1016+
) + 1
1017+
rank_ops.append(
1018+
_Action(_ComputationType.BACKWARD, bwd_mb_index, bwd_stage_index)
1019+
)
1020+
# Cooldown phase
1021+
else:
1022+
# During cooldown phase, we need steps to align with 1f1b happening in other ranks
1023+
# TODO: we don't need to always append, after all 1f1b are finished we can stop appending None
1024+
rank_ops.append(None)
1025+
bwd_stage_index = backward_stage_index(op)
1026+
bwd_stage_mb_index[bwd_stage_index] = (
1027+
bwd_mb_index := bwd_stage_mb_index[bwd_stage_index]
1028+
) + 1
1029+
rank_ops.append(
1030+
_Action(_ComputationType.BACKWARD, bwd_mb_index, bwd_stage_index)
1031+
)
1032+
return rank_ops
1033+
1034+
9581035
class ScheduleInterleaved1F1B(PipelineScheduleMulti):
9591036
"""
9601037
The Interleaved 1F1B schedule.
@@ -1046,74 +1123,119 @@ def backward_stage_index(step):
10461123
)
10471124
return (local_index * self.pp_group_size) + rank
10481125

1049-
# Dictionary for tracking {stage index : current microbatch index}
1050-
# All stages start with handling microbatch 0
1051-
fwd_stage_mb_index: Dict[int, int] = defaultdict(int)
1052-
bwd_stage_mb_index: Dict[int, int] = defaultdict(int)
1126+
return _get_1f1b_rank_ops(
1127+
self.n_local_stages,
1128+
self.pp_group_size,
1129+
warmup_ops,
1130+
fwd_bwd_ops,
1131+
cooldown_ops,
1132+
rank,
1133+
forward_stage_index,
1134+
backward_stage_index,
1135+
)
10531136

1054-
# Store the list of operations used for that rank
1055-
rank_ops: List[Optional[_Action]] = []
1056-
# Pre-padding, rank starts with no-ops based on the warmup.
1057-
for _ in range(rank):
1058-
rank_ops.append(None)
10591137

1060-
# These are used to calculate the number of slots to fill with no-ops, to account for the delay in warmup
1061-
# when we want to wait for the backward to trickle back up and start 1f1b to align all ranks.
1062-
# Formula:
1063-
# pre-padding + warmup_ops + post_warmup_ops = earliest time step of first backward
1064-
# post_warmup_ops = [earliest time step of first backward] - (warmup_ops + pre-padding)
1065-
# earliest time step of first backward = [local_stages * group_size + 2 * (group_size - 1 - rank)]
1066-
# warmup_ops = calculated above
1067-
post_warmup_ops = (
1068-
self.n_local_stages * self.pp_group_size
1069-
+ 2 * (self.pp_group_size - 1 - rank)
1070-
) - (warmup_ops + rank)
1071-
1072-
for op in range(total_ops):
1073-
# Warmup phase
1074-
if op < warmup_ops:
1075-
fwd_stage_index = forward_stage_index(op)
1076-
# This will assign the current microbatch index and update it as well
1077-
fwd_stage_mb_index[fwd_stage_index] = (
1078-
mb_index := fwd_stage_mb_index[fwd_stage_index]
1079-
) + 1
1080-
rank_ops.append(
1081-
_Action(_ComputationType.FORWARD, mb_index, fwd_stage_index)
1082-
)
1083-
if op == warmup_ops - 1:
1084-
# This is the last step in the warmup phase, so we need to wait for the backward to trickle back up
1085-
rank_ops.extend([None] * post_warmup_ops)
1086-
# 1F1B Phase (forward and backward)
1087-
elif warmup_ops <= op < warmup_ops + fwd_bwd_ops:
1088-
fwd_stage_index = forward_stage_index(op)
1089-
fwd_stage_mb_index[fwd_stage_index] = (
1090-
fwd_mb_index := fwd_stage_mb_index[fwd_stage_index]
1091-
) + 1
1092-
rank_ops.append(
1093-
_Action(_ComputationType.FORWARD, fwd_mb_index, fwd_stage_index)
1094-
)
1138+
class ScheduleFlexibleInterleaved1F1B(PipelineScheduleMulti):
1139+
"""
1140+
The Flexible Interleaved 1F1B schedule.
10951141
1096-
bwd_stage_index = backward_stage_index(op)
1097-
bwd_stage_mb_index[bwd_stage_index] = (
1098-
bwd_mb_index := bwd_stage_mb_index[bwd_stage_index]
1099-
) + 1
1100-
rank_ops.append(
1101-
_Action(_ComputationType.BACKWARD, bwd_mb_index, bwd_stage_index)
1102-
)
1103-
# Cooldown phase
1104-
else:
1105-
# During cooldown phase, we need steps to align with 1f1b happening in other ranks
1106-
# TODO: we don't need to always append, after all 1f1b are finished we can stop appending None
1107-
rank_ops.append(None)
1108-
bwd_stage_index = backward_stage_index(op)
1109-
bwd_stage_mb_index[bwd_stage_index] = (
1110-
bwd_mb_index := bwd_stage_mb_index[bwd_stage_index]
1111-
) + 1
1112-
rank_ops.append(
1113-
_Action(_ComputationType.BACKWARD, bwd_mb_index, bwd_stage_index)
1114-
)
1142+
This schedule is mostly similar to the interleaved 1F1B schedule.
1143+
It differs by being relaxing the requirement of num_microbatch % pp_size == 0.
1144+
Using the flex_pp schedule, we will have num_rounds = max(1, n_microbatches // pp_group_size) and
1145+
it works as long as n_microbatches % num_rounds is 0. As a few examples, support
11151146
1116-
# Post padding
1117-
for _ in range(self.pp_group_size - rank - 1):
1118-
rank_ops.append(None)
1119-
return rank_ops
1147+
1. pp_group_size = 4, n_microbatches = 10. We will have num_rounds = 2 and n_microbatches % 2 is 0.
1148+
2. pp_group_size = 4, n_microbatches = 3. We will have num_rounds = 1 and n_microbatches % 1 is 0.
1149+
"""
1150+
1151+
def __init__(
1152+
self,
1153+
stages: List[_PipelineStageBase],
1154+
n_microbatches: int,
1155+
loss_fn: Optional[Callable] = None,
1156+
args_chunk_spec: Optional[Tuple[TensorChunkSpec, ...]] = None,
1157+
kwargs_chunk_spec: Optional[Dict[str, TensorChunkSpec]] = None,
1158+
output_merge_spec: Optional[Union[Dict[str, Any], Tuple[Any]]] = None,
1159+
):
1160+
self.pp_group_size = stages[0].group_size
1161+
super().__init__(
1162+
stages=stages,
1163+
n_microbatches=n_microbatches,
1164+
loss_fn=loss_fn,
1165+
args_chunk_spec=args_chunk_spec,
1166+
kwargs_chunk_spec=kwargs_chunk_spec,
1167+
output_merge_spec=output_merge_spec,
1168+
)
1169+
self.n_local_stages = len(stages)
1170+
self.rank = stages[0].group_rank
1171+
self.number_of_rounds = max(1, n_microbatches // self.pp_group_size)
1172+
self.microbatches_per_round = n_microbatches // self.number_of_rounds
1173+
if n_microbatches % self.number_of_rounds != 0:
1174+
raise ValueError(
1175+
"Flexible Interleaved 1F1B requires the number of microbatches to be a "
1176+
f"multiple of the number of rounds ({self.number_of_rounds}), "
1177+
f"but got {n_microbatches}."
1178+
)
1179+
# 1. Create the pipeline_order (all ranks do this calculation)
1180+
# This will be used to keep track of the current state of the entire pipeline
1181+
# pipeline_order[rank] = [Action(computation_type, microbatch_index, stage_index), ...]
1182+
self.pipeline_order: Dict[int, List[Optional[_Action]]] = {}
1183+
for rank in range(self.pp_group_size):
1184+
rank_ops = self._calculate_single_rank_operations(rank)
1185+
self.pipeline_order[rank] = rank_ops
1186+
1187+
def _calculate_single_rank_operations(self, rank) -> List[Optional[_Action]]:
1188+
def get_rank_warmup_ops(rank):
1189+
# Warms up operations for last stage
1190+
warmups_ops_last_stage = (
1191+
self.n_local_stages - 1
1192+
) * self.microbatches_per_round
1193+
# Increment warmup operations by 2 for each hop away from the last stage
1194+
warmup_ops = warmups_ops_last_stage + 2 * ((self.pp_group_size - 1) - rank)
1195+
# We cannot have more warmup operations than there are number of microbatches, so cap it there
1196+
return min(warmup_ops, self._n_microbatches * self.n_local_stages)
1197+
1198+
warmup_ops = get_rank_warmup_ops(rank)
1199+
microbatch_ops = self.n_local_stages * self._n_microbatches
1200+
# fwd_bwd_ops should encompass the remaining forwards
1201+
fwd_bwd_ops = microbatch_ops - warmup_ops
1202+
# cooldown_ops should encompass the remaining backwards
1203+
cooldown_ops = microbatch_ops - fwd_bwd_ops
1204+
# total ops encompass both forward and backward ops
1205+
total_ops = warmup_ops + fwd_bwd_ops + cooldown_ops
1206+
# warmup_ops + fwd_bwd_ops * 2 + cooldown_ops == microbatch_ops * 2
1207+
logger.debug(
1208+
"rank %s, warmup_ops %s, 1f1b %s, cooldown_ops %s total_ops %s",
1209+
rank,
1210+
warmup_ops,
1211+
fwd_bwd_ops,
1212+
cooldown_ops,
1213+
total_ops,
1214+
)
1215+
1216+
# Calculates the stage index based on step and pp_group_size
1217+
1218+
def forward_stage_index(step):
1219+
# Get the local index from 0 to n_local_stages-1
1220+
local_index = (step // self.microbatches_per_round) % self.n_local_stages
1221+
return (local_index * self.pp_group_size) + rank
1222+
1223+
def backward_stage_index(step):
1224+
local_index = (
1225+
self.n_local_stages
1226+
- 1
1227+
- ((step - warmup_ops) // self.microbatches_per_round)
1228+
% self.n_local_stages
1229+
)
1230+
return (local_index * self.pp_group_size) + rank
1231+
1232+
return _get_1f1b_rank_ops(
1233+
self.n_local_stages,
1234+
self.pp_group_size,
1235+
warmup_ops,
1236+
fwd_bwd_ops,
1237+
cooldown_ops,
1238+
rank,
1239+
forward_stage_index,
1240+
backward_stage_index,
1241+
)

0 commit comments

Comments
 (0)