
Commit 516ac3e

mamtsing authored and quic-mamta committed
Merge branch 'main' into use_logger
Signed-off-by: Mamta Singh <[email protected]>
2 parents 20000b6 + 3aaa2d8 commit 516ac3e

File tree

7 files changed: +108 -69 lines changed

QEfficient/finetune/utils/helper.py

Lines changed: 40 additions & 0 deletions
@@ -5,6 +5,15 @@
 #
 # -----------------------------------------------------------------------------
 import os
+from contextlib import nullcontext
+
+import torch
+
+try:
+    import torch_qaic.debug as qaic_debug  # noqa: F401
+except ImportError as e:
+    print(f"Warning: {e}. Moving ahead without these qaic modules.")
+
 
 TASK_TYPE = ["generation", "seq_classification"]
 PEFT_METHOD = ["lora"]
@@ -18,3 +27,34 @@ def is_rank_zero():
 
 def get_num_ddp_devices():
     return int(os.getenv("WORLD_SIZE", 1))
+
+
+def get_autocast_ctx(use_autocast, device_type, dtype=torch.float16):
+    return torch.autocast(device_type=device_type, dtype=dtype) if use_autocast else nullcontext()
+
+
+def get_op_verifier_ctx(
+    use_op_by_op_verifier,
+    train_device,
+    dump_dir,
+    step,
+    ref_device="cpu",
+    ref_dtype=torch.float32,
+    atol=1e-1,
+    rtol=1e-5,
+    use_ref_output_on_mismatch=True,
+):
+    if not use_op_by_op_verifier:
+        return nullcontext()
+
+    filter_config = qaic_debug.DispatchFilterConfig.default(train_device)
+    dump_dir = dump_dir + "/mismatches/step_" + str(step)
+    return qaic_debug.OpByOpVerifierMode(
+        ref_device=ref_device,
+        ref_dtype=ref_dtype,
+        atol=atol,
+        rtol=rtol,
+        use_ref_output_on_mismatch=use_ref_output_on_mismatch,
+        filter_config=filter_config,
+        dump_root_dir=dump_dir,
+    )
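Both helpers return `nullcontext()` when their feature flag is off, so call sites can enter them unconditionally. A minimal usage sketch under assumptions: `model`, `dataloader`, and the device strings below are illustrative stand-ins, not part of this commit.

```python
from functools import partial

import torch

from QEfficient.finetune.utils.helper import get_autocast_ctx, get_op_verifier_ctx

# Bind everything except `step`, mirroring how train_utils.py uses the helper.
autocast_ctx = get_autocast_ctx(True, "qaic", dtype=torch.float16)
op_verifier_ctx = partial(get_op_verifier_ctx, True, "qaic:0", "./output")

for step, batch in enumerate(dataloader):  # assumed training loop
    with autocast_ctx, op_verifier_ctx(step) as verifier:
        loss = model(**batch).loss
    if verifier is not None:  # nullcontext() binds None when the verifier is off
        print(verifier.get_perop_mismatch_count())
```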

QEfficient/finetune/utils/logging_utils.py

Lines changed: 2 additions & 3 deletions
@@ -25,9 +25,8 @@ def raise_error(message, errortype=RuntimeError):
         raise errortype(message)
 
     def log_rank_zero(msg: str, level: int = logging.INFO):
-        if not is_rank_zero:
-            return
-        self.logger.log(level, msg, stacklevel=2)
+        if is_rank_zero():
+            self.logger.log(level, msg, stacklevel=2)
 
     def prepare_for_logs(output_path, dump_logs=False, level=logging.INFO):
         self.logger.setLevel(level)
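The bug fixed here is the classic missing-parentheses guard: `is_rank_zero` without a call is a bare function object, which is always truthy, so `not is_rank_zero` was always `False` and `log_rank_zero` logged on every rank instead of only rank zero. A standalone sketch of the truthiness trap (the stub below is illustrative, not the repo's implementation):

```python
def is_rank_zero():
    return False  # pretend this process is a non-zero rank

# Bug: a bare function reference is always truthy, so this guard never fires.
if not is_rank_zero:
    print("unreachable")

print(bool(is_rank_zero))  # True  -> truthiness of the function object itself
print(is_rank_zero())      # False -> the actual rank check
```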

QEfficient/finetune/utils/plot_metrics.py

Lines changed: 2 additions & 2 deletions
@@ -75,8 +75,8 @@ def plot_metrics(file_path):
     with open(file_path, "r") as f:
         try:
             data = json.load(f)
-        except json.JSONDecodeError:
-            logger.raise_error("Invalid JSON file.", json.JSONDecodeError)
+        except json.JSONDecodeError as e:
+            logger.raise_error("Invalid JSON file.", e)
             return
 
     directory = os.path.dirname(file_path)

QEfficient/finetune/utils/train_utils.py

Lines changed: 39 additions & 61 deletions
@@ -8,8 +8,8 @@
 import json
 import os
 import time
-from contextlib import nullcontext
 from datetime import datetime
+from functools import partial
 from typing import Dict, List, Tuple
 
 import torch
@@ -19,7 +19,7 @@
 from tqdm import tqdm
 
 from QEfficient.finetune.configs.training import TrainConfig
-from QEfficient.finetune.utils.helper import is_rank_zero
+from QEfficient.finetune.utils.helper import get_autocast_ctx, get_op_verifier_ctx, is_rank_zero
 from QEfficient.finetune.utils.logging_utils import logger
 
 try:
@@ -85,8 +85,8 @@ def train(
     max_steps_reached = False  # Flag to indicate max training steps reached
 
     tensorboard_updates = None
-    tensorboard_log_dir = train_config.output_dir + "/runs/" + f"{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"
     if is_rank_zero():
+        tensorboard_log_dir = train_config.output_dir + "/runs/" + f"{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"
         tensorboard_updates = SummaryWriter(log_dir=tensorboard_log_dir)
 
     device_type = torch.device(device).type
@@ -110,6 +110,9 @@ def train(
         num_classes = model.classifier.out_features
         acc_helper = torchmetrics.classification.MulticlassAccuracy(num_classes=num_classes).to(device)
 
+    autocast_ctx = get_autocast_ctx(train_config.use_autocast, device_type, dtype=torch.float16)
+    op_verifier_ctx = partial(get_op_verifier_ctx, train_config.opByOpVerifier, device, train_config.output_dir)
+
     # Start the training loop
     for epoch in range(train_config.num_epochs):
        if loss_0_counter.item() == train_config.convergence_counter:
@@ -168,60 +171,38 @@ def train(
                 break
             batch = {k: v.to(device) for k, v in batch.items()}  # move the batch elements to qaic device
 
-            with (
-                torch.autocast(device_type=device_type, dtype=torch.float16)
-                if train_config.use_autocast
-                else nullcontext()
-            ):
-                # an additional condition can be put here to avoid opByOpVerifier getting triggered for each step
-                if train_config.opByOpVerifier:
-                    with qaic_debug.OpByOpVerifierMode(
-                        ref_device="cpu",
-                        ref_dtype=torch.float32,
-                        # adjust atol & rtol this as required
-                        atol=1e-1,
-                        use_ref_output_on_mismatch=True,
-                        filter_config=qaic_debug.DispatchFilterConfig.default(device),
-                        dump_root_dir=train_config.output_dir + "/mismatches/step_" + str(step),
-                    ) as verifier:
-                        model_outputs = model(**batch)
-                        loss = model_outputs.loss  # Forward call
-                        if (batch["labels"] != -100).sum() == 0:
-                            loss = loss.nan_to_num(nan=0.0)
-                            num_dummy_samples += train_config.train_batch_size
-                        else:
-                            num_dummy_samples_per_batch = (
-                                (torch.sum(batch["labels"] == -100, dim=1) == batch["labels"].shape[1]).sum().item()
-                            )
-                            if num_dummy_samples_per_batch > 0:
-                                num_dummy_samples += num_dummy_samples_per_batch
-                                loss = loss * train_config.train_batch_size / num_dummy_samples_per_batch
-
-                        if train_config.task_type == "seq_classification":
-                            logits = model_outputs.logits
-                            labels = batch["labels"][:, 0]
-                            preds = torch.nn.functional.softmax(logits, dim=-1)
-                            acc_helper.forward(preds, labels)
-                    logger.info("Mismatches detected:", verifier.get_perop_mismatch_count())
+            is_optimizer_step = (step + 1) % train_config.gradient_accumulation_steps == 0 or step == len(
+                train_dataloader
+            ) - 1
+            if train_config.enable_ddp:
+                # Below block derived from : https://github.com/karpathy/nanoGPT/blob/93a43d9a5c22450bbf06e78da2cb6eeef084b717/train.py#L293
+                # in DDP training we only need to sync gradients at the last micro step.
+                # the official way to do this is with model.no_sync() context manager, but
+                # using too many context managers may bloat the code and forces us to repeat code
+                # looking at the source of that context manager, it just toggles this variable
+                model.require_backward_grad_sync = is_optimizer_step
+
+            with autocast_ctx, op_verifier_ctx(step) as verifier:
+                model_outputs = model(**batch)
+                loss = model_outputs.loss  # Forward call
+                if (batch["labels"] != -100).sum() == 0:
+                    loss = loss.nan_to_num(nan=0.0)
+                    num_dummy_samples += train_config.train_batch_size
                 else:
-                    model_outputs = model(**batch)
-                    loss = model_outputs.loss  # Forward call
-                    if (batch["labels"] != -100).sum() == 0:
-                        loss = loss.nan_to_num(nan=0.0)
-                        num_dummy_samples += train_config.train_batch_size
-                    else:
-                        num_dummy_samples_per_batch = (
-                            (torch.sum(batch["labels"] == -100, dim=1) == batch["labels"].shape[1]).sum().item()
-                        )
-                        if num_dummy_samples_per_batch > 0:
-                            num_dummy_samples += num_dummy_samples_per_batch
-                            loss = loss * train_config.train_batch_size / num_dummy_samples_per_batch
+                    num_dummy_samples_per_batch = (
+                        (torch.sum(batch["labels"] == -100, dim=1) == batch["labels"].shape[1]).sum().item()
+                    )
+                    if num_dummy_samples_per_batch > 0:
+                        num_dummy_samples += num_dummy_samples_per_batch
+                        loss = loss * train_config.train_batch_size / num_dummy_samples_per_batch
 
-                if train_config.task_type == "seq_classification":
-                    logits = model_outputs.logits
-                    labels = batch["labels"][:, 0]
-                    preds = torch.nn.functional.softmax(logits, dim=-1)
-                    acc_helper.forward(preds, labels)
+                if train_config.task_type == "seq_classification":
+                    logits = model_outputs.logits
+                    labels = batch["labels"][:, 0]
+                    preds = torch.nn.functional.softmax(logits, dim=-1)
+                    acc_helper.forward(preds, labels)
+                if train_config.opByOpVerifier:
+                    logger.info("Mismatches detected:", verifier.get_perop_mismatch_count())
 
             total_loss += loss.detach().float()
             if is_rank_zero():
@@ -258,7 +239,7 @@ def train(
             else:
                 loss.backward()  # backward pass
 
-            if (step + 1) % train_config.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+            if is_optimizer_step:
                 if train_config.grad_scaler:
                     scaler.step(optimizer)
                     scaler.update()
@@ -440,6 +421,7 @@ def evaluation_helper(model, train_config, eval_dataloader, device):
     device_type = torch.device(device).type
 
     num_dummy_samples = 0
+    autocast_ctx = get_autocast_ctx(train_config.use_autocast, device_type, dtype=torch.float16)
     for step, batch in enumerate(tqdm(eval_dataloader, colour="green", desc="evaluating Epoch", dynamic_ncols=True)):
         # stop when the maximum number of eval steps is reached
         if train_config.max_eval_step > 0 and step > train_config.max_eval_step:
@@ -450,11 +432,7 @@ def evaluation_helper(model, train_config, eval_dataloader, device):
         # Ensure no gradients are computed for this scope to save memory
         with torch.no_grad():
             # Forward pass and compute loss
-            with (
-                torch.autocast(device_type=device_type, dtype=torch.float16)
-                if train_config.use_autocast
-                else nullcontext()
-            ):
+            with autocast_ctx:
                 outputs = model(**batch)
                 loss = outputs.loss
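The `is_optimizer_step` flag also replaces DDP's `model.no_sync()` wrapper: as the nanoGPT-derived comment notes, that context manager only toggles `require_backward_grad_sync`, so setting the attribute directly defers gradient all-reduce until the last micro-step. A self-contained sketch of the pattern, assuming an already-initialized process group and a DDP-wrapped `model`; the division by `grad_accum_steps` is the conventional accumulation scaling, not this repo's exact normalization:

```python
from torch.nn.parallel import DistributedDataParallel as DDP


def run_micro_steps(model: DDP, dataloader, optimizer, grad_accum_steps: int = 4):
    for step, batch in enumerate(dataloader):
        is_optimizer_step = (step + 1) % grad_accum_steps == 0 or step == len(dataloader) - 1
        # no_sync() just flips this attribute; setting it directly skips the
        # gradient all-reduce on every micro-step except the last one.
        model.require_backward_grad_sync = is_optimizer_step
        loss = model(**batch).loss / grad_accum_steps
        loss.backward()
        if is_optimizer_step:
            optimizer.step()
            optimizer.zero_grad()
```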
QEfficient/generation/text_generation_inference.py

Lines changed: 1 addition & 1 deletion
@@ -60,7 +60,7 @@ def __repr__(self):
         return f"Average Prefill time a.k.a TTFT is= {round(self.perf_metrics.prefill_time, 2)} sec\
         \nDecode is= {round(self.perf_metrics.decode_perf * self.batch_size, 2)} tokens/sec\
         \nTotal is= {round(self.perf_metrics.total_perf * self.batch_size, 2)} tokens/sec\
-        \nTotal (E2E) inference time is= {round(self.perf_metrics.total_time, 2)} tokens/sec"
+        \nTotal (E2E) inference time is= {round(self.perf_metrics.total_time, 2)} sec"
 
 
 @dataclass

QEfficient/utils/generate_qnn_network_specialization_config.py

Lines changed: 1 addition & 1 deletion
@@ -166,8 +166,8 @@ def generate_data_format_config(
     for output in onnx_model.graph.output:
         if "past_key" in output.name or "past_value" in output.name:
             kv_nodes.append(output.name)
-    kv_overrides = {}
 
+    kv_overrides = {}
     kv_overrides["graphs"] = [
         {
             "graph_name": model_dlc_name + "_configuration_1",

docs/source/quick_start.md

Lines changed: 23 additions & 1 deletion
@@ -94,7 +94,7 @@ python -m QEfficient.cloud.execute --model_name gpt2 --qpc_path qeff_models/gpt2
 You can run the finetune with set of predefined existing datasets on QAIC using the eager pipeline
 
 ```bash
-python -m QEfficient.cloud.finetune --device qaic:0 --use-peft --output_dir ./meta-sam --num_epochs 2 --context_length 256
+python -m QEfficient.cloud.finetune --device qaic:0 --use-peft --output_dir ./meta-sam --num_epochs 2 --context_length 256
 ```
 For more details on finetune, checkout the subsection.
 
@@ -138,6 +138,28 @@ Users can compile a model with QNN SDK by following the steps below:
 * Enabled QNN by passing enable_qnn flag, add --enable_qnn in the cli command.
 * An optional config file can be passed to override the default parameters.
 
+**Default Parameters**
+
+QNN Converter Stage:
+
+"--float_bias_bitwidth 32 --float_bitwidth 16 --preserve_io_datatype --onnx_skip_simplification --target_backend AIC"
+
+QNN Context Binary Stage:
+
+LOG_LEVEL = "error"
+COMPILER_COMPILATION_TARGET = "hardware"
+COMPILER_CONVERT_TO_FP16 = True
+COMPILER_DO_DDR_TO_MULTICAST = True
+COMPILER_HARDWARE_VERSION = "2.0"
+COMPILER_PERF_WARNINGS = False
+COMPILER_PRINT_DDR_STATS = False
+COMPILER_PRINT_PERF_METRICS = False
+COMPILER_RETAINED_STATE = True
+COMPILER_STAT_LEVEL = 10
+COMPILER_STATS_BATCH_SIZE = 1
+COMPILER_TIME_PASSES = False
+
+
 **CLI Inference Command**
 
 Without QNN Config
