
Commit a67091a

Merge branch 'main' into pp_ddp

Signed-off-by: Mamta Singh <[email protected]>

2 parents: 95efa6e + 50b1404

File tree

9 files changed: +38 additions, −95 deletions

QEfficient/cloud/finetune.py

Lines changed: 7 additions & 4 deletions
@@ -28,10 +28,10 @@
 )
 from QEfficient.finetune.utils.dataset_utils import get_dataloader
 from QEfficient.finetune.utils.device_map import get_device_map
-from QEfficient.finetune.utils.helper import get_longest_seq_length, print_model_size, print_trainable_parameters
+from QEfficient.finetune.utils.helper import get_longest_seq_length
 from QEfficient.finetune.utils.logging_utils import logger
 from QEfficient.finetune.utils.parser import get_finetune_parser
-from QEfficient.finetune.utils.train_utils import train
+from QEfficient.finetune.utils.train_utils import print_model_size, print_trainable_parameters, train
 from QEfficient.utils._utils import hf_download

 # Try importing QAIC-specific module, proceed without it if unavailable
@@ -85,7 +85,7 @@ def setup_seeds(seed: int) -> None:
     Notes:
         - Sets seeds for PyTorch, Python's random module, and NumPy.
     """
-    # torch.use_deterministic_algorithms(True)
+    torch.use_deterministic_algorithms(True)
     torch.manual_seed(seed)
     random.seed(seed)
     np.random.seed(seed)
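Uncommenting torch.use_deterministic_algorithms(True) makes PyTorch raise when an op has no deterministic implementation, so the change trades some throughput and op coverage for reproducibility. A minimal, hedged sketch of a reproducibility setup along the same lines (the warn_only relaxation is an illustrative variation, not what this patch enables):

import random

import numpy as np
import torch


def setup_seeds_sketch(seed: int) -> None:
    # Ask PyTorch for deterministic kernels; warn instead of raising when an
    # op has no deterministic counterpart (illustrative relaxation).
    torch.use_deterministic_algorithms(True, warn_only=True)
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)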
@@ -114,7 +114,10 @@ def load_model_and_tokenizer(
         - Sets pad_token_id to eos_token_id if not defined in the tokenizer.
     """
     logger.log_rank_zero(f"Loading HuggingFace model for {train_config.model_name}")
-    pretrained_model_path = hf_download(train_config.model_name)
+    pretrained_model_path = hf_download(
+        train_config.model_name,
+        ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.msgpack", "*.h5", "*.pth"],
+    )
     if train_config.task_type == "seq_classification":
         model = AutoModelForSequenceClassification.from_pretrained(
             pretrained_model_path,
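The new ignore_patterns argument keeps the downloaded snapshot limited to the files fine-tuning actually needs. A hedged sketch of the idea, assuming hf_download ultimately delegates to huggingface_hub.snapshot_download (the wrapper and the repo id below are illustrative, not the repository's implementation):

from huggingface_hub import snapshot_download


def hf_download_sketch(repo_id: str, ignore_patterns=None) -> str:
    """Return a local directory containing the filtered model snapshot."""
    return snapshot_download(
        repo_id=repo_id,
        ignore_patterns=ignore_patterns or [],  # glob patterns to skip, e.g. "*.onnx"
    )


# Skip docs and alternative checkpoint formats, mirroring the patch above.
local_path = hf_download_sketch(
    "org/some-model",  # illustrative repo id
    ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.msgpack", "*.h5", "*.pth"],
)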

QEfficient/finetune/configs/dataset_config.py

Lines changed: 0 additions & 7 deletions
@@ -8,13 +8,6 @@
 from dataclasses import dataclass


-@dataclass
-class samsum_dataset:
-    dataset: str = "samsum_dataset"
-    train_split: str = "train"
-    test_split: str = "validation"
-
-
 @dataclass
 class grammar_dataset:
     dataset: str = "grammar_dataset"
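With samsum_dataset removed, registering a dataset config still follows the dataclass pattern visible above. A hedged sketch for a hypothetical new dataset (the name and splits are illustrative, not part of this commit):

from dataclasses import dataclass


@dataclass
class my_new_dataset:  # hypothetical entry, mirroring grammar_dataset above
    dataset: str = "my_new_dataset"
    train_split: str = "train"
    test_split: str = "test"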

QEfficient/finetune/configs/training.py

Lines changed: 2 additions & 2 deletions
@@ -34,12 +34,12 @@ class TrainConfig:
         weight_decay (float): Weight decay for optimizer (default: 0.0).
         gamma (float): Learning rate decay factor (default: 0.85).
         seed (int): Random seed for reproducibility (default: 42).
-        dataset (str): Dataset name for training (default: "samsum_dataset").
+        dataset (str): Dataset name for training (default: "alpaca_dataset").
         task_type (str): Type of task for which the finetuning is to be done. Options: "generation" and "seq_classification". (default: "generation")
         use_peft (bool): Whether to use PEFT (default: True).
         peft_method (str): Parameter-efficient fine-tuning method (default: "lora").
         from_peft_checkpoint (str): Path to PEFT checkpoint (default: "").
-        output_dir (str): Directory to save outputs (default: "meta-llama-samsum").
+        output_dir (str): Directory to save outputs (default: "training_results").
         save_model (bool): Save the trained model (default: True).
         save_metrics (bool): Save training metrics (default: True).
         intermediate_step_save (int): Steps between intermediate saves (default: 1000).
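Only the docstring is shown in this hunk, but the two touched defaults imply matching field changes on the dataclass itself. A hedged sketch of how those fields might read after the change (fields trimmed; only dataset and output_dir come from this diff, the rest are illustrative):

from dataclasses import dataclass


@dataclass
class TrainConfigSketch:
    dataset: str = "alpaca_dataset"       # previously "samsum_dataset"
    output_dir: str = "training_results"  # previously "meta-llama-samsum"
    seed: int = 42
    use_peft: bool = True
    peft_method: str = "lora"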

QEfficient/finetune/dataset/dataset_config.py

Lines changed: 0 additions & 4 deletions
@@ -21,14 +21,10 @@
 from QEfficient.finetune.dataset.imdb_dataset import (
     get_preprocessed_imdb as get_imdb_dataset,
 )
-from QEfficient.finetune.dataset.samsum_dataset import (
-    get_preprocessed_samsum as get_samsum_dataset,
-)

 DATASET_PREPROC = {
     "alpaca_dataset": partial(get_alpaca_dataset),
     "grammar_dataset": get_grammar_dataset,
-    "samsum_dataset": get_samsum_dataset,
     "gsm8k_dataset": get_gsm8k_dataset,
     "custom_dataset": get_custom_dataset,
     "imdb_dataset": get_imdb_dataset,

QEfficient/finetune/dataset/samsum_dataset.py

Lines changed: 0 additions & 48 deletions
This file was deleted.

QEfficient/finetune/utils/helper.py

Lines changed: 0 additions & 26 deletions
@@ -12,8 +12,6 @@

 import torch

-from QEfficient.finetune.utils.logging_utils import logger
-
 try:
     import torch_qaic.debug as qaic_debug  # noqa: F401
 except ImportError as e:
@@ -73,30 +71,6 @@ def get_longest_seq_length(data: List[Dict]) -> Tuple[int, int]:
     return longest_seq_length, longest_seq_ix


-def print_model_size(model) -> None:
-    """
-    Print the number of trainable parameters.
-
-    Args:
-        model: PyTorch model.
-    """
-    total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
-    logger.log_rank_zero(f"Model has {total_params / 1e6} Million params.")
-
-
-def print_trainable_parameters(model) -> None:
-    """
-    Print the number of trainable parameters, all params and percentage of trainable params.
-
-    Args:
-        model: The PyTorch model.
-    """
-    trainable_params, all_param = model.get_nb_trainable_parameters()
-    logger.log_rank_zero(
-        f"Trainable params: {trainable_params:,d} || all params: {all_param:,d} || trainable%: {100 * trainable_params / all_param:.4f}"
-    )
-
-
 def save_to_json(
     output_filename,
     train_step_loss,

QEfficient/finetune/utils/train_utils.py

Lines changed: 23 additions & 2 deletions
@@ -202,14 +202,13 @@ def train(

         total_loss += loss.detach().float()
         if is_rank_zero():
+            tensorboard_updates.add_scalars("loss", {"train": loss}, total_train_steps)
             if loss <= train_config.convergence_loss:
                 loss_0_counter += 1
             else:
                 loss_0_counter = torch.tensor([0]).to(device)
         if train_config.enable_ddp:
             dist.broadcast(loss_0_counter, src=0)
-        if is_rank_zero():
-            tensorboard_updates.add_scalars("loss", {"train": loss}, total_train_steps)

         if train_config.save_metrics:
             train_step_loss.append(loss.detach().float().item())
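The reorder hoists the TensorBoard write into the existing rank-zero block, so the train loss is logged once per step before the convergence bookkeeping. For reference, a hedged sketch of rank-zero-only scalar logging, assuming tensorboard_updates is a torch.utils.tensorboard.SummaryWriter (the helper name below is illustrative):

import torch.distributed as dist
from torch.utils.tensorboard import SummaryWriter


def log_train_loss(writer: SummaryWriter, loss: float, step: int) -> None:
    # Only rank 0 owns the writer in DDP runs; other ranks skip logging.
    if not dist.is_initialized() or dist.get_rank() == 0:
        writer.add_scalars("loss", {"train": loss}, step)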
@@ -451,3 +450,25 @@ def evaluation(model, train_config, eval_dataloader, device):
     logger.log_rank_zero(f"{eval_metric.detach().cpu()=} {eval_epoch_loss.detach().cpu()=}")

     return eval_epoch_loss, eval_metric, val_step_loss, val_step_metric
+
+
+def print_model_size(model) -> None:
+    """
+    Print the number of trainable parameters.
+
+    Args:
+        model: PyTorch model.
+    """
+    total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+    logger.log_rank_zero(f"Model has {total_params / 1e6} Million params.")
+
+
+def print_trainable_parameters(model) -> None:
+    """
+    Print the number of trainable parameters, all params and percentage of trainable params.
+
+    Args:
+        model: The PyTorch model.
+    """
+    trainable_params, all_param = model.get_nb_trainable_parameters()
+    logger.log_rank_zero(
+        f"Trainable params: {trainable_params:,d} || all params: {all_param:,d} || trainable%: {100 * trainable_params / all_param:.4f}"
+    )
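get_nb_trainable_parameters() is a method exposed by peft's PeftModel, so print_trainable_parameters assumes the model has already been wrapped with a PEFT adapter; print_model_size only needs a plain torch module. A hedged usage sketch (the base checkpoint and LoRA settings are illustrative):

from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

base_model = AutoModelForCausalLM.from_pretrained("gpt2")  # illustrative checkpoint
peft_model = get_peft_model(base_model, LoraConfig(r=8, lora_alpha=16))

print_model_size(peft_model)            # trainable params, in millions
print_trainable_parameters(peft_model)  # trainable vs. all params, with percentage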

QEfficient/transformers/models/falcon/modeling_falcon.py

Lines changed: 5 additions & 1 deletion
@@ -183,7 +183,11 @@ def forward(
     ):
         residual = hidden_states

-        attention_layernorm_out = self.input_layernorm(hidden_states)
+        if self.config.new_decoder_architecture:
+            attention_layernorm_out = self.ln_attn(hidden_states)
+            mlp_layernorm_out = self.ln_mlp(hidden_states)
+        else:
+            attention_layernorm_out = self.input_layernorm(hidden_states)

         # Self attention.
         attn_outputs = self.self_attention(
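Checkpoints with config.new_decoder_architecture (Falcon-40B-style) normalize the same hidden states twice, ln_attn feeding the attention branch and ln_mlp feeding the MLP branch, and run both branches in parallel before summing them into the residual; older checkpoints keep the single input_layernorm path. A toy, hedged sketch of that layout (not the patched layer itself):

import torch
from torch import nn


class ParallelBlockSketch(nn.Module):
    """Toy parallel attention/MLP block with dual layernorms."""

    def __init__(self, dim: int, n_heads: int):
        super().__init__()
        self.ln_attn = nn.LayerNorm(dim)  # feeds the attention branch
        self.ln_mlp = nn.LayerNorm(dim)   # feeds the MLP branch
        self.attn = nn.MultiheadAttention(dim, n_heads, batch_first=True)
        self.mlp = nn.Sequential(nn.Linear(dim, 4 * dim), nn.GELU(), nn.Linear(4 * dim, dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        residual = x
        attn_in = self.ln_attn(x)
        attn_out, _ = self.attn(attn_in, attn_in, attn_in)
        mlp_out = self.mlp(self.ln_mlp(x))    # parallel branch reads the same x
        return residual + attn_out + mlp_out  # both branches summed into the residual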

docs/source/finetune.md

Lines changed: 1 addition & 1 deletion
@@ -84,7 +84,7 @@ To run fine tuning for any user specific dataset, prepare the dataset using the
 3. Inside the newly created efficient-transformers/dataset/custom_dataset.py, define a function named 'get_custom_dataset'.
 4. get_custom_dataset() should have following 4 parameters: dataset_config, tokenizer, split, context_length.
 5. Inside get_custom_dataset(), user needs to apply prompt and tokenize the dataset accordingly. Please refer the below template on how to define get_custom_dataset().
-6. For examples, please refer python files present in [dataset](https://github.com/quic/efficient-transformers/tree/main/QEfficient/finetune/dataset). In case of Samsum dataset, get_preprocessed_samsum() of efficient-transformers/QEfficient/finetune/dataset/samsum_dataset.py is called.
+6. For examples, please refer python files present in [dataset](https://github.com/quic/efficient-transformers/tree/main/QEfficient/finetune/dataset).
 7. In [dataset_config.py](https://github.com/quic/efficient-transformers/blob/main/QEfficient/finetune/configs/dataset_config.py), for custom_dataset class, pass the appropriate value for train_split and test_split. As an alternative, these values can be passed as command line arguments as well with the finetune command. For example "--train_split train".
 8. While running fine tuning, pass argument "--dataset custom_dataset" to finetune on custom dataset.
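Step 5 of the doc points at a template for get_custom_dataset() with the four documented parameters; a minimal, hedged sketch of the expected shape (the dataset, prompt format, and column names are illustrative, not the template shipped with the repository):

from datasets import load_dataset


def get_custom_dataset(dataset_config, tokenizer, split, context_length=None):
    ds = load_dataset("yelp_review_full", split=split)  # illustrative dataset

    def apply_prompt_and_tokenize(example):
        # Apply a prompt, then tokenize; truncate only when a context length is given.
        prompt = f"Review: {example['text']}\nSentiment: {example['label']}"
        return tokenizer(
            prompt,
            max_length=context_length,
            truncation=context_length is not None,
        )

    return ds.map(apply_prompt_and_tokenize, remove_columns=ds.column_names)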
