
Commit d405a80

mamtsingquic-mamta authored and committed

update branch

Signed-off-by: Mamta Singh <[email protected]>

1 parent 740f7c2 commit d405a80

File tree: 10 files changed (+165, -102 lines)

QEfficient/cloud/finetune.py

Lines changed: 27 additions & 18 deletions
@@ -17,7 +17,7 @@
 import torch.utils.data
 from peft import PeftModel, get_peft_model
 from torch.optim.lr_scheduler import StepLR
-from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModel, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer
 
 from QEfficient.finetune.configs.training import TrainConfig
 from QEfficient.finetune.utils.config_utils import (
@@ -26,18 +26,22 @@
     update_config,
 )
 from QEfficient.finetune.utils.dataset_utils import get_dataloader
+from QEfficient.finetune.utils.logging_utils import logger
 from QEfficient.finetune.utils.parser import get_finetune_parser
-from QEfficient.finetune.utils.train_utils import get_longest_seq_length, print_model_size, train
-from QEfficient.utils._utils import login_and_download_hf_lm
+from QEfficient.finetune.utils.train_utils import (
+    get_longest_seq_length,
+    print_model_size,
+    print_trainable_parameters,
+    train,
+)
+from QEfficient.utils._utils import hf_download
 
 # Try importing QAIC-specific module, proceed without it if unavailable
 try:
     import torch_qaic  # noqa: F401
 except ImportError as e:
-    print(f"Warning: {e}. Proceeding without QAIC modules.")
-
+    logger.log_rank_zero(f"{e}. Moving ahead without these qaic modules.")
 
-from transformers import AutoModelForSequenceClassification
 
 # Suppress all warnings
 warnings.filterwarnings("ignore")
@@ -106,7 +110,8 @@ def load_model_and_tokenizer(
     - Resizes model embeddings if tokenizer vocab size exceeds model embedding size.
     - Sets pad_token_id to eos_token_id if not defined in the tokenizer.
     """
-    pretrained_model_path = login_and_download_hf_lm(train_config.model_name)
+    logger.log_rank_zero(f"Loading HuggingFace model for {train_config.model_name}")
+    pretrained_model_path = hf_download(train_config.model_name)
     if train_config.task_type == "seq_classification":
         model = AutoModelForSequenceClassification.from_pretrained(
             pretrained_model_path,
@@ -116,7 +121,7 @@ def load_model_and_tokenizer(
         )
 
         if not hasattr(model, "base_model_prefix"):
-            raise RuntimeError("Given huggingface model does not have 'base_model_prefix' attribute.")
+            logger.raise_runtimeerror("Given huggingface model does not have 'base_model_prefix' attribute.")
 
         for param in getattr(model, model.base_model_prefix).parameters():
             param.requires_grad = False
@@ -141,11 +146,10 @@ def load_model_and_tokenizer(
     # If there is a mismatch between tokenizer vocab size and embedding matrix,
    # throw a warning and then expand the embedding matrix
     if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
-        print("WARNING: Resizing embedding matrix to match tokenizer vocab size.")
+        logger.log_rank_zero("Resizing the embedding matrix to match the tokenizer vocab size.", logger.WARNING)
         model.resize_token_embeddings(len(tokenizer))
 
-    # FIXME (Meet): Cover below line inside the logger once it is implemented.
-    print_model_size(model, train_config)
+    print_model_size(model)
 
     # Note: Need to call this before calling PeftModel.from_pretrained or get_peft_model.
     # Because, both makes model.is_gradient_checkpointing = True which is used in peft library to
@@ -157,7 +161,9 @@ def load_model_and_tokenizer(
         if hasattr(model, "supports_gradient_checkpointing") and model.supports_gradient_checkpointing:
             model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"preserve_rng_state": False})
         else:
-            raise RuntimeError("Given model doesn't support gradient checkpointing. Please disable it and run it.")
+            logger.raise_runtimeerror(
+                "Given model doesn't support gradient checkpointing. Please disable it and run it."
+            )
 
     model = apply_peft(model, train_config, peft_config_file, **kwargs)
 
@@ -192,7 +198,7 @@ def apply_peft(
     else:
         peft_config = generate_peft_config(train_config, peft_config_file, **kwargs)
         model = get_peft_model(model, peft_config)
-        model.print_trainable_parameters()
+        print_trainable_parameters(model)
 
     return model
 
@@ -217,25 +223,25 @@ def setup_dataloaders(
         - Length of longest sequence in the dataset.
 
     Raises:
-        ValueError: If validation is enabled but the validation set is too small.
+        RuntimeError: If validation is enabled but the validation set is too small.
 
     Notes:
         - Applies a custom data collator if provided by get_custom_data_collator.
         - Configures DataLoader kwargs using get_dataloader_kwargs for train and val splits.
     """
 
     train_dataloader = get_dataloader(tokenizer, dataset_config, train_config, split="train")
-    print(f"--> Num of Training Set Batches loaded = {len(train_dataloader)}")
+    logger.log_rank_zero(f"Number of Training Set Batches loaded = {len(train_dataloader)}")
 
     eval_dataloader = None
     if train_config.run_validation:
         eval_dataloader = get_dataloader(tokenizer, dataset_config, train_config, split="val")
         if len(eval_dataloader) == 0:
-            raise ValueError(
+            logger.raise_runtimeerror(
                 f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. ({len(eval_dataloader)=})"
             )
         else:
-            print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}")
+            logger.log_rank_zero(f"Number of Validation Set Batches loaded = {len(eval_dataloader)}")
 
     longest_seq_length, _ = get_longest_seq_length(
         torch.utils.data.ConcatDataset([train_dataloader.dataset, eval_dataloader.dataset])
@@ -274,13 +280,16 @@ def main(peft_config_file: str = None, **kwargs) -> None:
     dataset_config = generate_dataset_config(train_config.dataset)
     update_config(dataset_config, **kwargs)
 
+    logger.prepare_dump_logs(train_config.dump_logs)
+    logger.setLevel(train_config.log_level)
+
     setup_distributed_training(train_config)
     setup_seeds(train_config.seed)
     model, tokenizer = load_model_and_tokenizer(train_config, dataset_config, peft_config_file, **kwargs)
 
     # Create DataLoaders for the training and validation dataset
     train_dataloader, eval_dataloader, longest_seq_length = setup_dataloaders(train_config, dataset_config, tokenizer)
-    print(
+    logger.log_rank_zero(
         f"The longest sequence length in the train data is {longest_seq_length}, "
         f"passed context length is {train_config.context_length} and overall model's context length is "
         f"{model.config.max_position_embeddings}"

QEfficient/finetune/configs/training.py

Lines changed: 4 additions & 0 deletions
@@ -5,6 +5,7 @@
 #
 # -----------------------------------------------------------------------------
 
+import logging
 from dataclasses import dataclass
 
 
@@ -96,3 +97,6 @@ class TrainConfig:
 
     dump_root_dir: str = "mismatches/step_"
     opByOpVerifier: bool = False
+
+    dump_logs: bool = True
+    log_level: str = logging.INFO
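
These two new fields are consumed in main() of QEfficient/cloud/finetune.py (see the hunk above). A rough illustration of that wiring with defaults only and no CLI overrides, assuming the package is installed and ROOT_DIR is writable:

    import logging

    from QEfficient.finetune.configs.training import TrainConfig
    from QEfficient.finetune.utils.logging_utils import logger

    train_config = TrainConfig()                      # dump_logs=True, log_level=logging.INFO by default
    logger.prepare_dump_logs(train_config.dump_logs)  # attach a timestamped file handler under <ROOT_DIR>/logs
    logger.setLevel(train_config.log_level)           # standard logging levels, e.g. logging.DEBUG for more detail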

QEfficient/finetune/dataset/custom_dataset.py

Lines changed: 10 additions & 9 deletions
@@ -8,6 +8,8 @@
 import importlib
 from pathlib import Path
 
+from QEfficient.finetune.utils.logging_utils import logger
+
 
 def load_module_from_py_file(py_file: str) -> object:
     """
@@ -30,20 +32,19 @@ def get_custom_dataset(dataset_config, tokenizer, split: str, context_length=None):
     module_path, func_name = dataset_config.file, "get_custom_dataset"
 
     if not module_path.endswith(".py"):
-        raise ValueError(f"Dataset file {module_path} is not a .py file.")
+        logger.raise_runtimeerror(f"Dataset file {module_path} is not a .py file.")
 
     module_path = Path(module_path)
     if not module_path.is_file():
-        raise FileNotFoundError(f"Dataset py file {module_path.as_posix()} does not exist or is not a file.")
+        logger.raise_runtimeerror(f"Dataset py file {module_path.as_posix()} does not exist or is not a file.")
 
     module = load_module_from_py_file(module_path.as_posix())
     try:
         return getattr(module, func_name)(dataset_config, tokenizer, split, context_length)
-    except AttributeError as e:
-        print(
+    except AttributeError:
+        logger.raise_runtimeerror(
             f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()})."
         )
-        raise e
 
 
 def get_data_collator(dataset_processer, dataset_config):
@@ -53,16 +54,16 @@ def get_data_collator(dataset_processer, dataset_config):
     module_path, func_name = dataset_config.file, "get_data_collator"
 
     if not module_path.endswith(".py"):
-        raise ValueError(f"Dataset file {module_path} is not a .py file.")
+        logger.raise_runtimeerror(f"Dataset file {module_path} is not a .py file.")
 
     module_path = Path(module_path)
     if not module_path.is_file():
-        raise FileNotFoundError(f"Dataset py file {module_path.as_posix()} does not exist or is not a file.")
+        logger.raise_runtimeerror(f"Dataset py file {module_path.as_posix()} does not exist or is not a file.")
 
     module = load_module_from_py_file(module_path.as_posix())
     try:
         return getattr(module, func_name)(dataset_processer)
     except AttributeError:
-        print(f"Can not find the custom data_collator in the dataset.py file ({module_path.as_posix()}).")
-        print("Using the default data_collator instead.")
+        logger.info(f"Can not find the custom data_collator in the dataset.py file ({module_path.as_posix()}).")
+        logger.info("Using the default data_collator instead.")
         return None
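
For context, the loader above only assumes that the user-supplied .py file exposes get_custom_dataset and, optionally, get_data_collator, with the call signatures shown in the try blocks. A hypothetical skeleton of such a file (the function bodies are illustrative, not from the repository):

    # my_custom_dataset.py -- hypothetical file referenced by dataset_config.file


    def get_custom_dataset(dataset_config, tokenizer, split, context_length=None):
        # Return any torch-compatible dataset for the requested split.
        texts = ["an example training sentence"] if split == "train" else ["an example eval sentence"]
        return [tokenizer(text, truncation=True, max_length=context_length) for text in texts]


    def get_data_collator(dataset_processer):
        # Optional hook: returning None (or omitting this function) keeps the default collator.
        return None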

QEfficient/finetune/dataset/grammar_dataset.py

Lines changed: 7 additions & 5 deletions
@@ -10,6 +10,8 @@
 from datasets import load_dataset
 from torch.utils.data import Dataset
 
+from QEfficient.finetune.utils.logging_utils import logger
+
 
 class grammar(Dataset):
     def __init__(self, tokenizer, csv_name=None, context_length=None):
@@ -20,8 +22,8 @@ def __init__(self, tokenizer, csv_name=None, context_length=None):
                 delimiter=",",
             )
         except Exception as e:
-            print(
-                "Loading of grammar dataset failed! Please see [here](https://github.com/meta-llama/llama-recipes/blob/main/src/llama_recipes/datasets/grammar_dataset/grammar_dataset_process.ipynb) for details on how to download the dataset."
+            logger.raise_runtimeerror(
+                "Loading of grammar dataset failed! Please check (https://github.com/meta-llama/llama-recipes/blob/main/src/llama_recipes/datasets/grammar_dataset/grammar_dataset_process.ipynb) for details on how to download the dataset."
             )
             raise e
 
@@ -36,7 +38,7 @@ def convert_to_features(self, example_batch):
         # Create prompt and tokenize contexts and questions
 
         if self.print_text:
-            print("Input Text: ", self.clean_text(example_batch["text"]))
+            logger.info("Input Text: ", self.clean_text(example_batch["text"]))
 
         input_ = example_batch["input"]
         target_ = example_batch["target"]
@@ -71,9 +73,9 @@ def get_dataset(dataset_config, tokenizer, csv_name=None, context_length=None):
     """cover function for handling loading the working dataset"""
     """dataset loading"""
     currPath = Path.cwd() / "datasets_grammar" / "grammar_train.csv"
-    print(f"Loading dataset {currPath}")
+    logger.info(f"Loading dataset {currPath}")
     csv_name = str(currPath)
-    print(csv_name)
+    logger.info(csv_name)
     dataset = grammar(tokenizer=tokenizer, csv_name=csv_name, context_length=context_length)
 
     return dataset

QEfficient/finetune/eval.py

Lines changed: 6 additions & 6 deletions
@@ -19,13 +19,14 @@
 from utils.train_utils import evaluation, print_model_size
 
 from QEfficient.finetune.configs.training import TrainConfig
+from QEfficient.finetune.utils.logging_utils import logger
 
 try:
     import torch_qaic  # noqa: F401
 
     device = "qaic:0"
 except ImportError as e:
-    print(f"Warning: {e}. Moving ahead without these qaic modules.")
+    logger.warning(f"{e}. Moving ahead without these qaic modules.")
     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
 # Suppress all warnings
@@ -77,25 +78,24 @@ def main(**kwargs):
     # If there is a mismatch between tokenizer vocab size and embedding matrix,
     # throw a warning and then expand the embedding matrix
     if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
-        print("WARNING: Resizing the embedding matrix to match the tokenizer vocab size.")
+        logger.warning("Resizing the embedding matrix to match the tokenizer vocab size.")
         model.resize_token_embeddings(len(tokenizer))
 
-    print_model_size(model, train_config)
+    print_model_size(model)
 
     if train_config.run_validation:
         # TODO: vbaddi enable packing later in entire infra.
         # if train_config.batching_strategy == "packing":
         #     dataset_val = ConcatDataset(dataset_val, chunk_size=train_config.context_length)
 
         eval_dataloader = get_dataloader(tokenizer, dataset_config, train_config, split="test")
-
-        print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}")
+        logger.log_rank_zero(f"Num of Validation Set Batches loaded = {len(eval_dataloader)}")
         if len(eval_dataloader) == 0:
             raise ValueError(
                 f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. ({len(eval_dataloader)=})"
             )
         else:
-            print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}")
+            logger.log_rank_zero(f"Num of Validation Set Batches loaded = {len(eval_dataloader)}")
 
     model.to(device)
     _ = evaluation(model, train_config, eval_dataloader, None, tokenizer, device)

QEfficient/finetune/utils/config_utils.py

Lines changed: 2 additions & 2 deletions
@@ -18,6 +18,7 @@
 from QEfficient.finetune.configs.peft_config import LoraConfig
 from QEfficient.finetune.configs.training import TrainConfig
 from QEfficient.finetune.dataset.dataset_config import DATASET_PREPROC
+from QEfficient.finetune.utils.logging_utils import logger
 
 
 def update_config(config, **kwargs):
@@ -46,8 +47,7 @@ def update_config(config, **kwargs):
                        raise ValueError(f"Config '{config_name}' does not have parameter: '{param_name}'")
            else:
                config_type = type(config).__name__
-                # FIXME (Meet): Once logger is available put this in debug level.
-                print(f"[WARNING]: Unknown parameter '{k}' for config type '{config_type}'")
+                logger.debug(f"Unknown parameter '{k}' for config type '{config_type}'")
 
 
 def generate_peft_config(train_config: TrainConfig, peft_config_file: str = None, **kwargs) -> Any:

QEfficient/finetune/utils/dataset_utils.py

Lines changed: 2 additions & 1 deletion
@@ -11,6 +11,7 @@
 
 from QEfficient.finetune.data.sampler import DistributedLengthBasedBatchSampler
 from QEfficient.finetune.dataset.dataset_config import DATALOADER_COLLATE_FUNC, DATASET_PREPROC
+from QEfficient.finetune.utils.logging_utils import logger
 
 
 def get_preprocessed_dataset(
@@ -72,7 +73,7 @@ def get_dataloader(tokenizer, dataset_config, train_config, split: str = "train"):
        print("custom_data_collator is used")
        dl_kwargs["collate_fn"] = custom_data_collator
 
-    print(f"length of dataset_{split}", len(dataset))
+    logger.log_rank_zero(f"Length of {split} dataset is {len(dataset)}")
 
     # Create data loader
     dataloader = torch.utils.data.DataLoader(

QEfficient/finetune/utils/logging_utils.py

Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+import logging
+import os
+from datetime import datetime
+
+import torch.distributed as dist
+
+from QEfficient.utils.constants import ROOT_DIR
+
+
+class FTLogger:
+    def __init__(self, level=logging.DEBUG):
+        self.logger = logging.getLogger("QEfficient")
+        if not getattr(self.logger, "_custom_methods_added", False):
+            self._bind_custom_methods()
+            self.logger._custom_methods_added = True  # Prevent adding handlers/methods twice
+
+    def _bind_custom_methods(self):
+        def raise_runtimeerror(message):
+            self.logger.error(message)
+            raise RuntimeError(message)
+
+        def log_rank_zero(msg: str, level: int = logging.INFO):
+            rank = dist.get_rank() if dist.is_available() and dist.is_initialized() else 0
+            if rank != 0:
+                return
+            self.logger.log(level, msg, stacklevel=2)
+
+        def prepare_dump_logs(dump_logs=False, level=logging.INFO):
+            if dump_logs:
+                logs_path = os.path.join(ROOT_DIR, "logs")
+                if not os.path.exists(logs_path):
+                    os.makedirs(logs_path, exist_ok=True)
+                file_name = f"log-file-{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}" + ".txt"
+                log_file = os.path.join(logs_path, file_name)
+
+                fh = logging.FileHandler(log_file)
+                fh.setLevel(level)
+                formatter = logging.Formatter("%(levelname)s - %(name)s - %(message)s")
+                fh.setFormatter(formatter)
+                self.logger.addHandler(fh)
+
+        self.logger.raise_runtimeerror = raise_runtimeerror
+        self.logger.log_rank_zero = log_rank_zero
+        self.logger.prepare_dump_logs = prepare_dump_logs
+
+    def get_logger(self):
+        return self.logger
+
+
+logger = FTLogger().get_logger()
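
Because FTLogger binds its helpers onto the shared "QEfficient" logger, every module that imports logger (or fetches it by name) sees the same raise_runtimeerror, log_rank_zero, and prepare_dump_logs methods. A minimal usage sketch, assuming the package is importable and ROOT_DIR is writable (the messages are illustrative):

    import logging

    from QEfficient.finetune.utils.logging_utils import logger

    assert logging.getLogger("QEfficient") is logger      # same underlying logger everywhere

    logger.prepare_dump_logs(True)                         # writes <ROOT_DIR>/logs/log-file-<timestamp>.txt
    logger.log_rank_zero("starting fine-tuning")           # silently skipped on non-zero ranks
    logger.log_rank_zero("low eval batch count", logging.WARNING)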
