
Commit d87a249: address comments

Signed-off-by: Mamta Singh <[email protected]>
Parent: d3e3029

4 files changed (+81, -66 lines)

QEfficient/cloud/finetune.py

Lines changed: 15 additions & 25 deletions
```diff
@@ -26,9 +26,9 @@
     generate_peft_config,
     update_config,
 )
-from QEfficient.finetune.utils.dataset_utils import get_dataloader
+from QEfficient.finetune.utils.dataset_utils import get_dataloader, get_longest_seq_length
 from QEfficient.finetune.utils.device_map import get_device_map
-from QEfficient.finetune.utils.helper import Task_Mode, get_longest_seq_length
+from QEfficient.finetune.utils.helper import Task_Mode
 from QEfficient.finetune.utils.logging_utils import logger
 from QEfficient.finetune.utils.parser import get_finetune_parser
 from QEfficient.finetune.utils.train_utils import print_model_size, print_trainable_parameters, train
@@ -67,7 +67,7 @@ def setup_distributed_training(train_config: TrainConfig) -> None:
     assert torch_device.index is None, f"DDP requires only device type, got: {torch_device}"
     dist_backend_map = {"cpu": "gloo", "qaic": "qccl", "cuda": "gloo"}
     dist.init_process_group(backend=dist_backend_map[torch_device.type])
-    if train_config.enable_pp:
+    if train_config.num_pp_stages > 1:
         assert dist.get_world_size() * train_config.num_pp_stages == getattr(torch, torch_device.type).device_count(), (
             "Total available devices should be multiple of number of pipeline stages."
         )
@@ -97,7 +97,7 @@ def load_model_and_tokenizer(
     """Load the pre-trained model and tokenizer from Hugging Face.
 
     Args:
-        config (TrainConfig): Training configuration object containing model and tokenizer names.
+        train_config (TrainConfig): Training configuration object containing model and tokenizer names.
         dataset_config (Any): A dataclass object representing dataset configuration.
         kwargs: Additional arguments to override PEFT config.
 
@@ -135,26 +135,14 @@ def load_model_and_tokenizer(
             if param.requires_grad:
                 param.data = param.data.to(torch.float32)
     else:
-        if train_config.enable_pp:
-            if train_config.enable_ddp:
-                device_map = get_device_map(train_config.model_name, train_config.num_pp_stages, rank=dist.get_rank())
-            else:
-                device_map = "auto"
-            model = AutoModelForCausalLM.from_pretrained(
-                pretrained_model_path,
-                use_cache=False,
-                attn_implementation="sdpa",
-                torch_dtype=torch.float16,
-                device_map=device_map,
-            )
-        else:
-            model = AutoModelForCausalLM.from_pretrained(
-                pretrained_model_path,
-                use_cache=False,
-                attn_implementation="sdpa",
-                torch_dtype=torch.float16,
-            )
-
+        device_map = get_device_map(train_config)
+        model = AutoModelForCausalLM.from_pretrained(
+            pretrained_model_path,
+            use_cache=False,
+            attn_implementation="sdpa",
+            torch_dtype=torch.float16,
+            device_map=device_map,
+        )
     tokenizer = AutoTokenizer.from_pretrained(
         train_config.model_name if train_config.tokenizer_name is None else train_config.tokenizer_name
     )
@@ -307,7 +295,7 @@ def main(**kwargs) -> None:
             f"passed context length is {train_config.context_length} and overall model's context length is "
             f"{model.config.max_position_embeddings}"
         )
-    if not train_config.enable_pp:
+    if train_config.num_pp_stages == 1:
         model.to(train_config.device)
     optimizer = optim.AdamW(
         model.parameters(),
@@ -320,6 +308,8 @@ def main(**kwargs) -> None:
         for name, param in model.named_parameters():
             if not param.requires_grad:
                 ignore_names.add(name)
+        # Adding params in ignore list will enforce DDP to ignore them during synchronization,
+        # which will further reduce the tensor exchange across devices.
        torch.nn.parallel.DistributedDataParallel._set_params_and_buffers_to_ignore_for_model(model, ignore_names)
         model = nn.parallel.DistributedDataParallel(model)
```

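The interesting change in finetune.py is the ignore-list registered before wrapping the model in DDP: parameter names that never receive gradients are handed to DistributedDataParallel so they are left out of gradient synchronization. Below is a minimal, standalone sketch of that pattern; the toy nn.Sequential model and the freezing rule are illustrative stand-ins, not code from this commit.

```python
# Sketch only: demonstrates the DDP ignore-list pattern used above on a toy model.
import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(16, 16), nn.Linear(16, 4))

# Freeze everything except the last layer, mimicking PEFT-style fine-tuning
# where most base-model weights stay frozen.
for name, param in model.named_parameters():
    param.requires_grad = name.startswith("1.")

# Collect the names of parameters that will never produce gradients.
ignore_names = {name for name, param in model.named_parameters() if not param.requires_grad}

# Registering these names makes DDP skip the tensors when building its
# gradient buckets, so frozen weights are never all-reduced across ranks.
torch.nn.parallel.DistributedDataParallel._set_params_and_buffers_to_ignore_for_model(model, ignore_names)

# In the training script the model is then wrapped (this needs an initialized
# process group, so it is left commented out here):
# model = nn.parallel.DistributedDataParallel(model)
```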
QEfficient/finetune/utils/dataset_utils.py

Lines changed: 11 additions & 0 deletions
```diff
@@ -4,6 +4,9 @@
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
+
+from typing import Dict, List, Tuple
+
 import datasets
 import torch
 import torch.distributed as dist
@@ -116,3 +119,11 @@ def get_dataloader(tokenizer, dataset_config, train_config, split: str = "train"
         **dl_kwargs,
     )
     return dataloader
+
+
+def get_longest_seq_length(data: List[Dict]) -> Tuple[int, int]:
+    # find out the minimum max_seq_length required during fine-tuning (saves memory!)
+    lengths = [len(d["input_ids"]) for d in data]
+    longest_seq_length = max(lengths)
+    longest_seq_ix = lengths.index(longest_seq_length)
+    return longest_seq_length, longest_seq_ix
```

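For reference, the relocated helper returns the longest tokenized sequence and its index, which the training flow uses to size max_seq_length no larger than the data requires. A quick self-contained sketch on made-up samples (the token ids below are illustrative only):

```python
# Sketch only: toy tokenized samples to show what get_longest_seq_length returns.
from typing import Dict, List, Tuple


def get_longest_seq_length(data: List[Dict]) -> Tuple[int, int]:
    # Same logic as the helper moved into dataset_utils.py by this commit.
    lengths = [len(d["input_ids"]) for d in data]
    longest_seq_length = max(lengths)
    longest_seq_ix = lengths.index(longest_seq_length)
    return longest_seq_length, longest_seq_ix


samples = [
    {"input_ids": [101, 2054, 2003, 102]},              # 4 tokens
    {"input_ids": [101, 2023, 2003, 1037, 2742, 102]},  # 6 tokens
    {"input_ids": [101, 102]},                           # 2 tokens
]

longest_len, longest_ix = get_longest_seq_length(samples)
print(longest_len, longest_ix)  # 6 1 -> pad/allocate for 6 tokens rather than a larger fixed context
```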
QEfficient/finetune/utils/device_map.py

Lines changed: 55 additions & 32 deletions
```diff
@@ -5,20 +5,40 @@
 #
 # -----------------------------------------------------------------------------
 
-import math
+import os
 
+import numpy as np
 from transformers import AutoConfig
 
 from QEfficient.utils._utils import get_num_layers_from_config
 
 
-def get_device_map(model_name, num_pp_stages, rank):
-    """Returns device map for model layers based number of pipeline stages and given process rank.
+def get_device_map(train_config):
+    """Returns device map for the given model.
 
     Args:
-        model_name (str): model name to get the device map for.
-        num_pp_stages (int): number of stages in pipeline
-        rank (int): process rank
+        train_config (TrainConfig): Training configuration object contaning model name and number of pipeline stages etc.
+
+    Returns:
+        Dict: A dictionary of layers and corresponding device id.
+    """
+
+    if train_config.num_pp_stages > 1:
+        if train_config.enable_ddp:
+            device_map = custom_device_map(train_config)
+        else:
+            device_map = "auto"
+    else:
+        device_map = None
+
+    return device_map
+
+
+def custom_device_map(train_config):
+    """Returns custom device map for model layers based number of pipeline stages and given process rank.
+
+    Args:
+        train_config (TrainConfig): Training configuration object contaning model name and number of pipeline stages etc.
 
     Returns:
         Dict: A dictionary of layers and corresponding device id.
@@ -33,32 +53,34 @@ def get_device_map(model_name, num_pp_stages, rank):
     PP (Pipeline Parallelism): Each copy of the model is split into 2 stages
     DDP (Distributed Data Parallel): 2 model copies run in parallel
 
-    |-------------------------------------------------------------------------------
-    | Process Rank | Assigned Device IDs | Model Component                   |
-    |-------------------------------------------------------------------------------
-    | Rank 0       | 0                   | model.embed_tokens                |
-    |              |                     | model.lm_head                     |
-    |              |                     | model.layers.0 - model.layers.7   |
-    |-------------------------------------------------------------------------------
-    | Rank 0       | 1                   | model.norm                        |
-    |              |                     | model.rotary_emb                  |
-    |              |                     | model.layers.8 - model.layers.15  |
-    |-------------------------------------------------------------------------------
-    | Rank 1       | 2                   | model.embed_tokens                |
-    |              |                     | model.lm_head                     |
-    |              |                     | model.layers.0 - model.layers.7   |
-    |-------------------------------------------------------------------------------
-    | Rank 1       | 3                   | model.norm                        |
-    |              |                     | model.rotary_emb                  |
-    |              |                     | model.layers.8 - model.layers.15  |
-    |-------------------------------------------------------------------------------
+    |--------------------------------------------------------------------------|
+    | Process Rank | Assigned Device IDs | Model Component                     |
+    |--------------------------------------------------------------------------|
+    | Rank 0       | 0                   | model.embed_tokens                  |
+    |              |                     | model.lm_head                       |
+    |              |                     | model.layers.0 - model.layers.7     |
+    |--------------------------------------------------------------------------|
+    | Rank 0       | 1                   | model.norm                          |
+    |              |                     | model.rotary_emb                    |
+    |              |                     | model.layers.8 - model.layers.15    |
+    |--------------------------------------------------------------------------|
+    | Rank 1       | 2                   | model.embed_tokens                  |
+    |              |                     | model.lm_head                       |
+    |              |                     | model.layers.0 - model.layers.7     |
+    |--------------------------------------------------------------------------|
+    | Rank 1       | 3                   | model.norm                          |
+    |              |                     | model.rotary_emb                    |
+    |              |                     | model.layers.8 - model.layers.15    |
+    |--------------------------------------------------------------------------|
     """
 
-    config = AutoConfig.from_pretrained(model_name)
+    config = AutoConfig.from_pretrained(train_config.model_name)
     num_layers = get_num_layers_from_config(config)
 
-    first_device = rank * num_pp_stages
-    last_device = rank * num_pp_stages + (num_pp_stages - 1)
+    n = train_config.num_pp_stages
+    rank = int(os.getenv("LOCAL_RANK", 0))
+    first_device = rank * n
+    last_device = rank * n + (n - 1)
 
     if config.tie_word_embeddings:
         lm_head_device = first_device
@@ -71,11 +93,12 @@ def get_device_map(model_name, num_pp_stages, rank):
         "model.norm": last_device,
         "model.rotary_emb": last_device,
     }
+    n_layer_per_stage = np.ceil(num_layers / n)
 
-    n_layer_per_stage = math.ceil(num_layers / num_pp_stages)
+    pp_stage_ids = np.arange(n)
+    pp_device_map = np.repeat(pp_stage_ids, n_layer_per_stage)
 
-    for j in range(num_pp_stages):
-        for i in range(n_layer_per_stage * j, min(n_layer_per_stage * (j + 1), num_layers)):
-            device_map[f"model.layers.{i}"] = first_device + j
+    for i in range(num_layers):
+        device_map[f"model.layers.{i}"] = pp_device_map[i] + rank * n
 
     return device_map
```

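To see what the vectorised mapping produces, here is a standalone sketch that mirrors the arithmetic of custom_device_map for the docstring scenario (16 decoder layers, 2 pipeline stages, DDP rank 1). The hard-coded values stand in for the TrainConfig and AutoConfig lookups done by the real helper, and only the per-layer assignment is reproduced.

```python
# Sketch only: reproduces the layer-to-device arithmetic from custom_device_map().
import numpy as np

num_layers = 16  # stand-in for get_num_layers_from_config(config)
n = 2            # stand-in for train_config.num_pp_stages
rank = 1         # stand-in for int(os.getenv("LOCAL_RANK", 0))

first_device = rank * n           # 2 -> embeddings/lm_head go here in the real helper
last_device = rank * n + (n - 1)  # 3 -> norm/rotary_emb go here in the real helper

n_layer_per_stage = int(np.ceil(num_layers / n))            # 8 (cast to int for np.repeat)
pp_stage_ids = np.arange(n)                                 # [0, 1]
pp_device_map = np.repeat(pp_stage_ids, n_layer_per_stage)  # eight 0s followed by eight 1s

device_map = {}
for i in range(num_layers):
    device_map[f"model.layers.{i}"] = pp_device_map[i] + rank * n

print(device_map["model.layers.0"], device_map["model.layers.7"])   # 2 2
print(device_map["model.layers.8"], device_map["model.layers.15"])  # 3 3
```

Layers 0-7 land on device 2 and layers 8-15 on device 3, matching the Rank 1 rows of the docstring table.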
QEfficient/finetune/utils/helper.py

Lines changed: 0 additions & 9 deletions
```diff
@@ -9,7 +9,6 @@
 import os
 from contextlib import nullcontext
 from enum import Enum
-from typing import Dict, List, Tuple
 
 import torch
 
@@ -82,14 +81,6 @@ def get_op_verifier_ctx(
     )
 
 
-def get_longest_seq_length(data: List[Dict]) -> Tuple[int, int]:
-    # find out the minimum max_seq_length required during fine-tuning (saves memory!)
-    lengths = [len(d["input_ids"]) for d in data]
-    longest_seq_length = max(lengths)
-    longest_seq_ix = lengths.index(longest_seq_length)
-    return longest_seq_length, longest_seq_ix
-
-
 def save_to_json(
     output_filename,
     train_step_loss,
```
