
Commit 4d46105

mamtsing authored and quic-mamta committed
modify error handling
Signed-off-by: Mamta Singh <[email protected]>
1 parent 1e1519b commit 4d46105

File tree

9 files changed: +56, -50 lines


QEfficient/cloud/finetune.py

Lines changed: 7 additions & 7 deletions
@@ -121,7 +121,7 @@ def load_model_and_tokenizer(
     )
 
     if not hasattr(model, "base_model_prefix"):
-        logger.raise_runtimeerror("Given huggingface model does not have 'base_model_prefix' attribute.")
+        logger.raise_error("Given huggingface model does not have 'base_model_prefix' attribute.", RuntimeError)
 
     for param in getattr(model, model.base_model_prefix).parameters():
         param.requires_grad = False
@@ -161,8 +161,8 @@ def load_model_and_tokenizer(
     if hasattr(model, "supports_gradient_checkpointing") and model.supports_gradient_checkpointing:
         model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"preserve_rng_state": False})
     else:
-        logger.raise_runtimeerror(
-            "Given model doesn't support gradient checkpointing. Please disable it and run it."
+        logger.raise_error(
+            "Given model doesn't support gradient checkpointing. Please disable it and run it.", RuntimeError
         )
 
     model = apply_peft(model, train_config, peft_config_file, **kwargs)
@@ -237,8 +237,9 @@ def setup_dataloaders(
     if train_config.run_validation:
         eval_dataloader = get_dataloader(tokenizer, dataset_config, train_config, split="val")
         if len(eval_dataloader) == 0:
-            logger.raise_runtimeerror(
-                f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. ({len(eval_dataloader)=})"
+            logger.raise_error(
+                f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. ({len(eval_dataloader)=})",
+                ValueError,
             )
         else:
             logger.log_rank_zero(f"Number of Validation Set Batches loaded = {len(eval_dataloader)}")
@@ -280,8 +281,7 @@ def main(peft_config_file: str = None, **kwargs) -> None:
     dataset_config = generate_dataset_config(train_config.dataset)
     update_config(dataset_config, **kwargs)
 
-    logger.prepare_dump_logs(train_config.dump_logs)
-    logger.setLevel(train_config.log_level)
+    logger.prepare_dump_logs(train_config.output_dir, train_config.dump_logs, train_config.log_level)
 
     setup_distributed_training(train_config)
     setup_seeds(train_config.seed)

QEfficient/finetune/configs/training.py

Lines changed: 0 additions & 1 deletion
@@ -95,7 +95,6 @@ class TrainConfig:
     use_profiler: bool = False  # Enable pytorch profiler, can not be used with flop counter at the same time.
     # profiler_dir: str = "PATH/to/save/profiler/results" # will be used if using profiler
 
-    dump_root_dir: str = "mismatches/step_"
     opByOpVerifier: bool = False
 
     dump_logs: bool = True

QEfficient/finetune/dataset/custom_dataset.py

Lines changed: 12 additions & 7 deletions
@@ -32,18 +32,21 @@ def get_custom_dataset(dataset_config, tokenizer, split: str, context_length=Non
     module_path, func_name = dataset_config.file, "get_custom_dataset"
 
     if not module_path.endswith(".py"):
-        logger.raise_runtimeerror(f"Dataset file {module_path} is not a .py file.")
+        logger.raise_error(f"Dataset file {module_path} is not a .py file.", ValueError)
 
     module_path = Path(module_path)
     if not module_path.is_file():
-        logger.raise_runtimeerror(f"Dataset py file {module_path.as_posix()} does not exist or is not a file.")
+        logger.raise_error(
+            f"Dataset py file {module_path.as_posix()} does not exist or is not a file.", FileNotFoundError
+        )
 
     module = load_module_from_py_file(module_path.as_posix())
     try:
         return getattr(module, func_name)(dataset_config, tokenizer, split, context_length)
-    except AttributeError:
-        logger.raise_runtimeerror(
-            f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()})."
+    except AttributeError as e:
+        logger.raise_error(
+            f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()}).",
+            e,
         )
 
 
@@ -54,11 +57,13 @@ def get_data_collator(dataset_processer, dataset_config):
     module_path, func_name = dataset_config.file, "get_data_collator"
 
     if not module_path.endswith(".py"):
-        logger.raise_runtimeerror(f"Dataset file {module_path} is not a .py file.")
+        logger.raise_error(f"Dataset file {module_path} is not a .py file.", ValueError)
 
     module_path = Path(module_path)
     if not module_path.is_file():
-        logger.raise_runtimeerror(f"Dataset py file {module_path.as_posix()} does not exist or is not a file.")
+        logger.raise_error(
+            f"Dataset py file {module_path.as_posix()} does not exist or is not a file.", FileNotFoundError
+        )
 
     module = load_module_from_py_file(module_path.as_posix())
     try:

QEfficient/finetune/dataset/grammar_dataset.py

Lines changed: 3 additions & 3 deletions
@@ -22,10 +22,10 @@ def __init__(self, tokenizer, csv_name=None, context_length=None):
                 delimiter=",",
             )
         except Exception as e:
-            logger.raise_runtimeerror(
-                "Loading of grammar dataset failed! Please check (https://github.com/meta-llama/llama-recipes/blob/main/src/llama_recipes/datasets/grammar_dataset/grammar_dataset_process.ipynb) for details on how to download the dataset."
+            logger.raise_error(
+                "Loading of grammar dataset failed! Please check (https://github.com/meta-llama/llama-recipes/blob/main/src/llama_recipes/datasets/grammar_dataset/grammar_dataset_process.ipynb) for details on how to download the dataset.",
+                e,
             )
-            raise e
 
         self.context_length = context_length
         self.tokenizer = tokenizer

QEfficient/finetune/utils/config_utils.py

Lines changed: 15 additions & 11 deletions
@@ -44,7 +44,9 @@ def update_config(config, **kwargs):
                 if hasattr(config, param_name):
                     setattr(config, param_name, v)
                 else:
-                    raise ValueError(f"Config '{config_name}' does not have parameter: '{param_name}'")
+                    logger.raise_error(
+                        f"Config '{config_name}' does not have parameter: '{param_name}'", ValueError
+                    )
         else:
             config_type = type(config).__name__
             logger.debug(f"Unknown parameter '{k}' for config type '{config_type}'")
@@ -70,7 +72,7 @@ def generate_peft_config(train_config: TrainConfig, peft_config_file: str = None
     else:
         config_map = {"lora": (LoraConfig, PeftLoraConfig)}
         if train_config.peft_method not in config_map:
-            raise RuntimeError(f"Peft config not found: {train_config.peft_method}")
+            logger.raise_error(f"Peft config not found: {train_config.peft_method}", RuntimeError)
 
         config_cls, peft_config_cls = config_map[train_config.peft_method]
         if config_cls is None:
@@ -119,7 +121,7 @@ def validate_config(config_data: Dict[str, Any], config_type: str = "lora") -> N
        - Ensures types match expected values (int, float, list, etc.).
     """
     if config_type.lower() != "lora":
-        raise ValueError(f"Unsupported config_type: {config_type}. Only 'lora' is supported.")
+        logger.raise_error(f"Unsupported config_type: {config_type}. Only 'lora' is supported.", ValueError)
 
     required_fields = {
         "r": int,
@@ -136,26 +138,28 @@ def validate_config(config_data: Dict[str, Any], config_type: str = "lora") -> N
     # Check for missing required fields
     missing_fields = [field for field in required_fields if field not in config_data]
     if missing_fields:
-        raise ValueError(f"Missing required fields in {config_type} config: {missing_fields}")
+        logger.raise_error(f"Missing required fields in {config_type} config: {missing_fields}", ValueError)
 
     # Validate types of required fields
     for field, expected_type in required_fields.items():
         if not isinstance(config_data[field], expected_type):
-            raise ValueError(
+            logger.raise_error(
                 f"Field '{field}' in {config_type} config must be of type {expected_type.__name__}, "
-                f"got {type(config_data[field]).__name__}"
+                f"got {type(config_data[field]).__name__}",
+                ValueError,
             )
 
     # Validate target_modules contains strings
     if not all(isinstance(mod, str) for mod in config_data["target_modules"]):
-        raise ValueError("All elements in 'target_modules' must be strings")
+        logger.raise_error("All elements in 'target_modules' must be strings", ValueError)
 
     # Validate types of optional fields if present
     for field, expected_type in optional_fields.items():
         if field in config_data and not isinstance(config_data[field], expected_type):
-            raise ValueError(
+            logger.raise_error(
                 f"Field '{field}' in {config_type} config must be of type {expected_type.__name__}, "
-                f"got {type(config_data[field]).__name__}"
+                f"got {type(config_data[field]).__name__}",
+                ValueError,
            )
 
 
@@ -173,12 +177,12 @@ def load_config_file(config_path: str) -> Dict[str, Any]:
        ValueError: If the file format is unsupported.
     """
     if not os.path.exists(config_path):
-        raise FileNotFoundError(f"Config file not found: {config_path}")
+        logger.raise_error(f"Config file not found: {config_path}", FileNotFoundError)
 
     with open(config_path, "r") as f:
         if config_path.endswith(".yaml") or config_path.endswith(".yml"):
             return yaml.safe_load(f)
         elif config_path.endswith(".json"):
             return json.load(f)
         else:
-            raise ValueError("Unsupported config file format. Use .yaml, .yml, or .json")
+            logger.raise_error("Unsupported config file format. Use .yaml, .yml, or .json", ValueError)

QEfficient/finetune/utils/dataset_utils.py

Lines changed: 4 additions & 3 deletions
@@ -18,7 +18,7 @@ def get_preprocessed_dataset(
     tokenizer, dataset_config, split: str = "train", context_length: int = None
 ) -> torch.utils.data.Dataset:
     if dataset_config.dataset not in DATASET_PREPROC:
-        raise NotImplementedError(f"{dataset_config.dataset} is not (yet) implemented")
+        logger.raise_error(f"{dataset_config.dataset} is not (yet) implemented", NotImplementedError)
 
     def get_split():
         return dataset_config.train_split if split == "train" else dataset_config.test_split
@@ -39,8 +39,9 @@ def get_dataloader_kwargs(train_config, dataset, dataset_processer, split):
     if train_config.enable_ddp:
         if train_config.enable_sorting_for_ddp:
             if train_config.context_length:
-                raise ValueError(
-                    "Sorting cannot be done with padding, Please disable sorting or pass context_length as None to disable padding"
+                logger.raise_error(
+                    "Sorting cannot be done with padding, Please disable sorting or pass context_length as None to disable padding",
+                    ValueError,
                 )
             else:
                 kwargs["batch_sampler"] = DistributedLengthBasedBatchSampler(

QEfficient/finetune/utils/logging_utils.py

Lines changed: 7 additions & 7 deletions
@@ -10,29 +10,29 @@
 from datetime import datetime
 
 from QEfficient.finetune.utils.helper import is_rank_zero
-from QEfficient.utils.constants import ROOT_DIR
 
 
 class FTLogger:
-    def __init__(self, level=logging.DEBUG):
+    def __init__(self):
         self.logger = logging.getLogger("QEfficient")
         if not getattr(self.logger, "_custom_methods_added", False):
             self._bind_custom_methods()
             self.logger._custom_methods_added = True  # Prevent adding handlers/methods twice
 
     def _bind_custom_methods(self):
-        def raise_runtimeerror(message):
+        def raise_error(message, errortype=RuntimeError):
             self.logger.error(message)
-            raise RuntimeError(message)
+            raise errortype(message)
 
         def log_rank_zero(msg: str, level: int = logging.INFO):
             if not is_rank_zero:
                 return
             self.logger.log(level, msg, stacklevel=2)
 
-        def prepare_dump_logs(dump_logs=False, level=logging.INFO):
+        def prepare_dump_logs(output_path, dump_logs=False, level=logging.INFO):
+            self.logger.setLevel(level)
             if dump_logs:
-                logs_path = os.path.join(ROOT_DIR, "logs")
+                logs_path = os.path.join(output_path, "logs")
                 if not os.path.exists(logs_path):
                     os.makedirs(logs_path, exist_ok=True)
                 file_name = f"log-file-{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}" + ".txt"
@@ -44,7 +44,7 @@ def prepare_dump_logs(dump_logs=False, level=logging.INFO):
             fh.setFormatter(formatter)
             self.logger.addHandler(fh)
 
-        self.logger.raise_runtimeerror = raise_runtimeerror
+        self.logger.raise_error = raise_error
         self.logger.log_rank_zero = log_rank_zero
         self.logger.prepare_dump_logs = prepare_dump_logs
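For context, a minimal standalone sketch of the behavior implied by the updated FTLogger above (illustrative only, not the project's actual module): raise_error logs the message through the shared "QEfficient" logger and then raises the caller-supplied exception type, while prepare_dump_logs now both sets the log level and, when dumping is enabled, writes the log file under <output_path>/logs.

    import logging

    logger = logging.getLogger("QEfficient")

    def raise_error(message, errortype=RuntimeError):
        # Log the failure first, then raise the requested exception type with the same message.
        logger.error(message)
        raise errortype(message)

    # Call sites choose the exception that fits the failure, for example:
    #   raise_error("Config file not found: lora.yaml", FileNotFoundError)   # hypothetical path
    #   raise_error("Unsupported config file format. Use .yaml, .yml, or .json", ValueError)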

QEfficient/finetune/utils/parser.py

Lines changed: 5 additions & 9 deletions
@@ -254,18 +254,14 @@ def get_finetune_parser():
         action="store_true",
         help="Enable distributed data parallel training. This will load the replicas of model on given number of devices and train the model. This should be used using torchrun interface. Please check docs for exact usage.",
     )
-    parser.add_argument(
-        "--dump_root_dir",
-        "--dump-root-dir",
-        required=False,
-        type=str,
-        default="mismatches/step_",
-        help="Directory for mismatch dumps by opByOpVerifier",
-    )
     parser.add_argument(
         "--opByOpVerifier",
         action="store_true",
-        help="Enable operation-by-operation verification w.r.t reference device(cpu). It is a context manager interface that captures and verifies each operator against reference device. In case results of test & reference do not match under given tolerances, a standalone unittest is generated at dump_root_dir.",
+        help=argparse.SUPPRESS,
+        # This is for debugging purpose only.
+        # Enables operation-by-operation verification w.r.t reference device(cpu).
+        # It is a context manager interface that captures and verifies each operator against reference device.
+        # In case results of test & reference do not match under given tolerances, a standalone unittest is generated at dump_root_dir.
    )
 
    return parser
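As background on the change above: passing argparse.SUPPRESS as the help value is standard argparse behavior that keeps an option functional while hiding it from --help output, which is why the old help text was demoted to source comments. A small self-contained sketch (the --verbose flag is made up for illustration):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--verbose", action="store_true", help="Shown in --help output")
    parser.add_argument("--opByOpVerifier", action="store_true", help=argparse.SUPPRESS)  # hidden from --help

    args = parser.parse_args(["--opByOpVerifier"])
    print(args.opByOpVerifier)  # True: the flag still parses, it just is not advertised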

QEfficient/finetune/utils/train_utils.py

Lines changed: 3 additions & 2 deletions
@@ -85,8 +85,9 @@ def train(
     max_steps_reached = False  # Flag to indicate max training steps reached
 
     tensorboard_updates = None
+    tensorboard_log_dir = train_config.output_dir + "/runs/" + f"{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"
     if is_rank_zero():
-        tensorboard_updates = SummaryWriter()
+        tensorboard_updates = SummaryWriter(log_dir=tensorboard_log_dir)
 
     device_type = torch.device(device).type
 
@@ -181,7 +182,7 @@ def train(
                         atol=1e-1,
                         use_ref_output_on_mismatch=True,
                         filter_config=qaic_debug.DispatchFilterConfig.default(device),
-                        dump_root_dir=train_config.dump_root_dir + str(step),
+                        dump_root_dir=train_config.output_dir + "/mismatches/step_" + str(step),
                     ) as verifier:
                         model_outputs = model(**batch)
                         loss = model_outputs.loss  # Forward call
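Taken together with the logging and config changes above, every run artifact is now rooted at train_config.output_dir rather than the repository root or the current working directory. An illustrative snippet of the resulting locations (the output_dir value is hypothetical; the path fragments come from the diffs in this commit):

    import os
    from datetime import datetime

    output_dir = "finetune_results"  # hypothetical value of train_config.output_dir
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    log_file_dir = os.path.join(output_dir, "logs")            # text logs (logging_utils.py)
    tensorboard_dir = output_dir + "/runs/" + timestamp        # TensorBoard events (train_utils.py)
    mismatch_dir = output_dir + "/mismatches/step_" + str(0)   # opByOpVerifier dumps (train_utils.py)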
