
Commit c0b4e18

mamtsing authored and quic-mamta committed
address comments
Signed-off-by: Mamta Singh <[email protected]>
1 parent d3e3029 commit c0b4e18

6 files changed: +79 −73 lines changed


QEfficient/cloud/finetune.py

Lines changed: 15 additions & 25 deletions
@@ -26,9 +26,9 @@
     generate_peft_config,
     update_config,
 )
-from QEfficient.finetune.utils.dataset_utils import get_dataloader
+from QEfficient.finetune.utils.dataset_utils import get_dataloader, get_longest_seq_length
 from QEfficient.finetune.utils.device_map import get_device_map
-from QEfficient.finetune.utils.helper import Task_Mode, get_longest_seq_length
+from QEfficient.finetune.utils.helper import Task_Mode
 from QEfficient.finetune.utils.logging_utils import logger
 from QEfficient.finetune.utils.parser import get_finetune_parser
 from QEfficient.finetune.utils.train_utils import print_model_size, print_trainable_parameters, train
@@ -67,7 +67,7 @@ def setup_distributed_training(train_config: TrainConfig) -> None:
     assert torch_device.index is None, f"DDP requires only device type, got: {torch_device}"
     dist_backend_map = {"cpu": "gloo", "qaic": "qccl", "cuda": "gloo"}
     dist.init_process_group(backend=dist_backend_map[torch_device.type])
-    if train_config.enable_pp:
+    if train_config.num_pp_stages > 1:
         assert dist.get_world_size() * train_config.num_pp_stages == getattr(torch, torch_device.type).device_count(), (
             "Total available devices should be multiple of number of pipeline stages."
         )
@@ -97,7 +97,7 @@ def load_model_and_tokenizer(
     """Load the pre-trained model and tokenizer from Hugging Face.

     Args:
-        config (TrainConfig): Training configuration object containing model and tokenizer names.
+        train_config (TrainConfig): Training configuration object containing model and tokenizer names.
         dataset_config (Any): A dataclass object representing dataset configuration.
         kwargs: Additional arguments to override PEFT config.

@@ -135,26 +135,14 @@ def load_model_and_tokenizer(
         if param.requires_grad:
             param.data = param.data.to(torch.float32)
     else:
-        if train_config.enable_pp:
-            if train_config.enable_ddp:
-                device_map = get_device_map(train_config.model_name, train_config.num_pp_stages, rank=dist.get_rank())
-            else:
-                device_map = "auto"
-            model = AutoModelForCausalLM.from_pretrained(
-                pretrained_model_path,
-                use_cache=False,
-                attn_implementation="sdpa",
-                torch_dtype=torch.float16,
-                device_map=device_map,
-            )
-        else:
-            model = AutoModelForCausalLM.from_pretrained(
-                pretrained_model_path,
-                use_cache=False,
-                attn_implementation="sdpa",
-                torch_dtype=torch.float16,
-            )
-
+        device_map = get_device_map(train_config)
+        model = AutoModelForCausalLM.from_pretrained(
+            pretrained_model_path,
+            use_cache=False,
+            attn_implementation="sdpa",
+            torch_dtype=torch.float16,
+            device_map=device_map,
+        )
     tokenizer = AutoTokenizer.from_pretrained(
         train_config.model_name if train_config.tokenizer_name is None else train_config.tokenizer_name
     )
@@ -307,7 +295,7 @@ def main(**kwargs) -> None:
             f"passed context length is {train_config.context_length} and overall model's context length is "
             f"{model.config.max_position_embeddings}"
         )
-    if not train_config.enable_pp:
+    if train_config.num_pp_stages == 1:
         model.to(train_config.device)
     optimizer = optim.AdamW(
         model.parameters(),
@@ -320,6 +308,8 @@ def main(**kwargs) -> None:
         for name, param in model.named_parameters():
             if not param.requires_grad:
                 ignore_names.add(name)
+        # Adding params in ignore list will enforce DDP to ignore them during synchronization,
+        # which will further reduce the tensor exchange across devices.
        torch.nn.parallel.DistributedDataParallel._set_params_and_buffers_to_ignore_for_model(model, ignore_names)
        model = nn.parallel.DistributedDataParallel(model)
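
With this change, pipeline parallelism in QEfficient/cloud/finetune.py is driven entirely by num_pp_stages: any value above 1 implies PP, and model loading always routes through get_device_map(train_config). Below is a minimal sketch of the resulting load path, assuming (as in this commit) that get_device_map returns None for a single stage, "auto" for PP without DDP, and a per-rank map for PP with DDP; load_base_model is a hypothetical wrapper name, not a function in the repository.

# Minimal sketch of the refactored load path (load_base_model is hypothetical).
import torch
from transformers import AutoModelForCausalLM

from QEfficient.finetune.utils.device_map import get_device_map


def load_base_model(train_config, pretrained_model_path):
    # None -> single device, "auto" -> HF places layers, dict -> per-rank PP placement.
    device_map = get_device_map(train_config)
    return AutoModelForCausalLM.from_pretrained(
        pretrained_model_path,
        use_cache=False,
        attn_implementation="sdpa",
        torch_dtype=torch.float16,
        device_map=device_map,
    )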

QEfficient/finetune/configs/training.py

Lines changed: 0 additions & 2 deletions
@@ -50,7 +50,6 @@ class TrainConfig:
         convergence_counter (int): Steps to check convergence (default: 5).
         convergence_loss (float): Loss threshold for convergence (default: 1e-4).
         use_profiler (bool): Enable profiling (default: False).
-        enable_pp (bool): Enable training with pipeline parallelism (default: False).
         num_pp_stages (int): Number of stages in which model is split layerwise when training using pipeline (default: 1).
         enable_ddp (bool): Enable distributed data parallel (default: False).
         enable_sorting_for_ddp (bool): Sort data for DDP (default: True).
@@ -101,7 +100,6 @@ class TrainConfig:
     # profiler_dir: str = "PATH/to/save/profiler/results" # will be used if using profiler

     # dist-related
-    enable_pp: bool = False
     num_pp_stages: int = 1
     enable_ddp: bool = False
     enable_sorting_for_ddp: bool = True
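
Since TrainConfig no longer carries an enable_pp field, turning pipeline parallelism on is purely a matter of raising num_pp_stages above its default of 1. A hedged sketch of overriding the dataclass follows; it assumes all other fields keep their defaults and that the import path mirrors the file path above.

# Hedged sketch: PP is implied by num_pp_stages > 1; the old enable_pp flag is gone.
from QEfficient.finetune.configs.training import TrainConfig

train_config = TrainConfig(num_pp_stages=2, enable_ddp=True)
assert not hasattr(train_config, "enable_pp")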

QEfficient/finetune/utils/dataset_utils.py

Lines changed: 11 additions & 0 deletions
@@ -4,6 +4,9 @@
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
+
+from typing import Dict, List, Tuple
+
 import datasets
 import torch
 import torch.distributed as dist
@@ -116,3 +119,11 @@ def get_dataloader(tokenizer, dataset_config, train_config, split: str = "train"
         **dl_kwargs,
     )
     return dataloader
+
+
+def get_longest_seq_length(data: List[Dict]) -> Tuple[int, int]:
+    # find out the minimum max_seq_length required during fine-tuning (saves memory!)
+    lengths = [len(d["input_ids"]) for d in data]
+    longest_seq_length = max(lengths)
+    longest_seq_ix = lengths.index(longest_seq_length)
+    return longest_seq_length, longest_seq_ix
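
get_longest_seq_length now lives next to the dataloader utilities and simply scans already-tokenized samples for the longest input_ids entry, so callers can size the maximum sequence length tightly. A small illustrative usage with made-up token ids:

# Illustrative usage of get_longest_seq_length on made-up tokenized samples.
from QEfficient.finetune.utils.dataset_utils import get_longest_seq_length

samples = [
    {"input_ids": [101, 7592, 102]},             # length 3
    {"input_ids": [101, 7592, 2088, 999, 102]},  # length 5 (longest)
    {"input_ids": [101, 102]},                   # length 2
]
longest_len, longest_ix = get_longest_seq_length(samples)
print(longest_len, longest_ix)  # 5 1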

QEfficient/finetune/utils/device_map.py

Lines changed: 53 additions & 31 deletions
@@ -5,20 +5,40 @@
 #
 # -----------------------------------------------------------------------------

-import math
+import os

+import numpy as np
 from transformers import AutoConfig

 from QEfficient.utils._utils import get_num_layers_from_config


-def get_device_map(model_name, num_pp_stages, rank):
-    """Returns device map for model layers based number of pipeline stages and given process rank.
+def get_device_map(train_config):
+    """Returns device map for the given model.

     Args:
-        model_name (str): model name to get the device map for.
-        num_pp_stages (int): number of stages in pipeline
-        rank (int): process rank
+        train_config (TrainConfig): Training configuration object contaning model name and number of pipeline stages etc.
+
+    Returns:
+        Dict: A dictionary of layers and corresponding device id.
+    """
+
+    if train_config.num_pp_stages > 1:
+        if train_config.enable_ddp:
+            device_map = custom_device_map(train_config)
+        else:
+            device_map = "auto"
+    else:
+        device_map = None
+
+    return device_map
+
+
+def custom_device_map(train_config):
+    """Returns custom device map for model layers based number of pipeline stages and given process rank.
+
+    Args:
+        train_config (TrainConfig): Training configuration object contaning model name and number of pipeline stages etc.

     Returns:
         Dict: A dictionary of layers and corresponding device id.
@@ -33,30 +53,31 @@ def get_device_map(model_name, num_pp_stages, rank):
     PP (Pipeline Parallelism): Each copy of the model is split into 2 stages
     DDP (Distributed Data Parallel): 2 model copies run in parallel

-    |-------------------------------------------------------------------------------
-    | Process Rank | Assigned Device IDs | Model Component                          |
-    |-------------------------------------------------------------------------------
-    | Rank 0       | 0                   | model.embed_tokens                       |
-    |              |                     | model.lm_head                            |
-    |              |                     | model.layers.0 - model.layers.7          |
-    |-------------------------------------------------------------------------------
-    | Rank 0       | 1                   | model.norm                               |
-    |              |                     | model.rotary_emb                         |
-    |              |                     | model.layers.8 - model.layers.15         |
-    |-------------------------------------------------------------------------------
-    | Rank 1       | 2                   | model.embed_tokens                       |
-    |              |                     | model.lm_head                            |
-    |              |                     | model.layers.0 - model.layers.7          |
-    |-------------------------------------------------------------------------------
-    | Rank 1       | 3                   | model.norm                               |
-    |              |                     | model.rotary_emb                         |
-    |              |                     | model.layers.8 - model.layers.15         |
-    |-------------------------------------------------------------------------------
+    |--------------------------------------------------------------------------|
+    | Process Rank | Assigned Device IDs | Model Component                     |
+    |--------------------------------------------------------------------------|
+    | Rank 0       | 0                   | model.embed_tokens                  |
+    |              |                     | model.lm_head                       |
+    |              |                     | model.layers.0 - model.layers.7     |
+    |--------------------------------------------------------------------------|
+    | Rank 0       | 1                   | model.norm                          |
+    |              |                     | model.rotary_emb                    |
+    |              |                     | model.layers.8 - model.layers.15    |
+    |--------------------------------------------------------------------------|
+    | Rank 1       | 2                   | model.embed_tokens                  |
+    |              |                     | model.lm_head                       |
+    |              |                     | model.layers.0 - model.layers.7     |
+    |--------------------------------------------------------------------------|
+    | Rank 1       | 3                   | model.norm                          |
+    |              |                     | model.rotary_emb                    |
+    |              |                     | model.layers.8 - model.layers.15    |
+    |--------------------------------------------------------------------------|
     """

-    config = AutoConfig.from_pretrained(model_name)
+    config = AutoConfig.from_pretrained(train_config.model_name)
     num_layers = get_num_layers_from_config(config)
-
+    num_pp_stages = train_config.num_pp_stages
+    rank = int(os.getenv("LOCAL_RANK", 0))
     first_device = rank * num_pp_stages
     last_device = rank * num_pp_stages + (num_pp_stages - 1)

@@ -71,11 +92,12 @@ def get_device_map(model_name, num_pp_stages, rank):
         "model.norm": last_device,
         "model.rotary_emb": last_device,
     }
+    n_layer_per_stage = np.ceil(num_layers / num_pp_stages)

-    n_layer_per_stage = math.ceil(num_layers / num_pp_stages)
+    pp_stage_ids = np.arange(num_pp_stages)
+    pp_device_map = np.repeat(pp_stage_ids, n_layer_per_stage)

-    for j in range(num_pp_stages):
-        for i in range(n_layer_per_stage * j, min(n_layer_per_stage * (j + 1), num_layers)):
-            device_map[f"model.layers.{i}"] = first_device + j
+    for i in range(num_layers):
+        device_map[f"model.layers.{i}"] = pp_device_map[i] + rank * num_pp_stages

     return device_map
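
The rewritten loop assigns layers to pipeline stages with numpy instead of nested loops: np.repeat spreads each stage id over ceil(num_layers / num_pp_stages) layers, and the rank offset (rank * num_pp_stages) shifts the whole map onto that rank's devices. A standalone sketch of the same arithmetic for the 16-layer, 2-stage, 2-rank setup from the docstring (int() casts added here for clarity, not taken from the commit):

# Standalone sketch of the layer-to-device arithmetic for rank 1 of the
# 16-layer / 2-stage / 2-rank example in the docstring above.
import numpy as np

num_layers, num_pp_stages, rank = 16, 2, 1
n_layer_per_stage = int(np.ceil(num_layers / num_pp_stages))  # 8
pp_stage_ids = np.arange(num_pp_stages)                       # [0, 1]
pp_device_map = np.repeat(pp_stage_ids, n_layer_per_stage)    # eight 0s, then eight 1s

device_map = {
    f"model.layers.{i}": int(pp_device_map[i]) + rank * num_pp_stages
    for i in range(num_layers)
}
# layers 0-7 land on device 2, layers 8-15 on device 3, matching the table above.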

QEfficient/finetune/utils/helper.py

Lines changed: 0 additions & 9 deletions
@@ -9,7 +9,6 @@
 import os
 from contextlib import nullcontext
 from enum import Enum
-from typing import Dict, List, Tuple

 import torch

@@ -82,14 +81,6 @@ def get_op_verifier_ctx(
     )


-def get_longest_seq_length(data: List[Dict]) -> Tuple[int, int]:
-    # find out the minimum max_seq_length required during fine-tuning (saves memory!)
-    lengths = [len(d["input_ids"]) for d in data]
-    longest_seq_length = max(lengths)
-    longest_seq_ix = lengths.index(longest_seq_length)
-    return longest_seq_length, longest_seq_ix
-
-
 def save_to_json(
     output_filename,
     train_step_loss,

QEfficient/finetune/utils/parser.py

Lines changed: 0 additions & 6 deletions
@@ -262,12 +262,6 @@ def get_finetune_parser():
         action="store_true",
         help="Enable distributed data parallel training. This will load the replicas of model on given number of devices and train the model. This should be used using torchrun interface. Please check docs for exact usage.",
     )
-    parser.add_argument(
-        "--enable_pp",
-        "--enable-pp",
-        action="store_true",
-        help="Enable pipeline parallel training. This will split the of model layerwise in given number of stages and train the model.",
-    )
     parser.add_argument(
         "--num_pp_stages",
         "--num-pp-stages",
