-
Notifications
You must be signed in to change notification settings - Fork 43
[QEff Finetune]: Use logger in place of print statements in finetuning scripts #371
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,6 +8,8 @@ | |
import importlib | ||
from pathlib import Path | ||
|
||
from QEfficient.utils.logging_utils import logger | ||
|
||
|
||
def load_module_from_py_file(py_file: str) -> object: | ||
""" | ||
|
@@ -40,7 +42,7 @@ def get_custom_dataset(dataset_config, tokenizer, split: str): | |
try: | ||
return getattr(module, func_name)(dataset_config, tokenizer, split) | ||
except AttributeError as e: | ||
print( | ||
logger.error( | ||
f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()})." | ||
) | ||
raise e | ||
|
@@ -63,6 +65,6 @@ def get_data_collator(dataset_processer, dataset_config): | |
try: | ||
return getattr(module, func_name)(dataset_processer) | ||
except AttributeError: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is for all the cases like L62. Whenever we are raising some exceptions, it is good to log the raised exception, otherwise dumped file logs would represent different picture than the console logs. We can override the logger to have a custom log function as below.
What do you say? |
||
print(f"Can not find the custom data_collator in the dataset.py file ({module_path.as_posix()}).") | ||
print("Using the default data_collator instead.") | ||
logger.info(f"Can not find the custom data_collator in the dataset.py file ({module_path.as_posix()}).") | ||
logger.info("Using the default data_collator instead.") | ||
return None |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,6 +10,8 @@ | |
from datasets import load_dataset | ||
from torch.utils.data import Dataset | ||
|
||
from QEfficient.utils.logging_utils import logger | ||
|
||
|
||
class grammar(Dataset): | ||
def __init__(self, tokenizer, csv_name=None, context_length=None): | ||
|
@@ -20,7 +22,7 @@ def __init__(self, tokenizer, csv_name=None, context_length=None): | |
delimiter=",", | ||
) | ||
except Exception as e: | ||
print( | ||
logger.error( | ||
"Loading of grammar dataset failed! Please see [here](https://github.com/meta-llama/llama-recipes/blob/main/src/llama_recipes/datasets/grammar_dataset/grammar_dataset_process.ipynb) for details on how to download the dataset." | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the content is incorrect. |
||
) | ||
raise e | ||
|
@@ -36,7 +38,7 @@ def convert_to_features(self, example_batch): | |
# Create prompt and tokenize contexts and questions | ||
|
||
if self.print_text: | ||
print("Input Text: ", self.clean_text(example_batch["text"])) | ||
logger.info("Input Text: ", self.clean_text(example_batch["text"])) | ||
|
||
input_ = example_batch["input"] | ||
target_ = example_batch["target"] | ||
|
@@ -71,9 +73,9 @@ def get_dataset(dataset_config, tokenizer, csv_name=None, context_length=None): | |
"""cover function for handling loading the working dataset""" | ||
"""dataset loading""" | ||
currPath = Path.cwd() / "datasets_grammar" / "grammar_train.csv" | ||
print(f"Loading dataset {currPath}") | ||
logger.info(f"Loading dataset {currPath}") | ||
csv_name = str(currPath) | ||
print(csv_name) | ||
logger.info(csv_name) | ||
dataset = grammar(tokenizer=tokenizer, csv_name=csv_name, context_length=context_length) | ||
|
||
return dataset |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,6 +25,7 @@ | |
from QEfficient.finetune.configs.training import TrainConfig | ||
from QEfficient.finetune.data.sampler import DistributedLengthBasedBatchSampler | ||
from QEfficient.finetune.dataset.dataset_config import DATASET_PREPROC | ||
from QEfficient.utils.logging_utils import logger | ||
|
||
|
||
def update_config(config, **kwargs): | ||
|
@@ -53,8 +54,7 @@ def update_config(config, **kwargs): | |
raise ValueError(f"Config '{config_name}' does not have parameter: '{param_name}'") | ||
else: | ||
config_type = type(config).__name__ | ||
# FIXME (Meet): Once logger is available put this in debug level. | ||
print(f"[WARNING]: Unknown parameter '{k}' for config type '{config_type}'") | ||
logger.warning(f"Unknown parameter '{k}' for config type '{config_type}'") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. shall we use logger.debug? |
||
|
||
|
||
def generate_peft_config(train_config: TrainConfig, peft_config_file: str = None, **kwargs) -> Any: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,6 +19,7 @@ | |
from tqdm import tqdm | ||
|
||
from QEfficient.finetune.configs.training import TrainConfig | ||
from QEfficient.utils.logging_utils import logger | ||
|
||
try: | ||
import torch_qaic # noqa: F401 | ||
|
@@ -27,7 +28,7 @@ | |
import torch_qaic.utils as qaic_utils # noqa: F401 | ||
from torch.qaic.amp import GradScaler as QAicGradScaler | ||
except ImportError as e: | ||
print(f"Warning: {e}. Moving ahead without these qaic modules.") | ||
logger.warning(f"{e}. Moving ahead without these qaic modules.") | ||
|
||
from torch.amp import GradScaler | ||
|
||
|
@@ -113,26 +114,26 @@ def train( | |
for epoch in range(train_config.num_epochs): | ||
if loss_0_counter.item() == train_config.convergence_counter: | ||
if train_config.enable_ddp: | ||
print( | ||
logger.info( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. One important change is required. |
||
f"Not proceeding with epoch {epoch + 1} on device {local_rank} since loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps." | ||
) | ||
break | ||
else: | ||
print( | ||
logger.info( | ||
f"Not proceeding with epoch {epoch + 1} since loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps." | ||
) | ||
break | ||
|
||
if train_config.use_peft and train_config.from_peft_checkpoint: | ||
intermediate_epoch = int(train_config.from_peft_checkpoint.split("/")[-2].split("_")[-1]) - 1 | ||
if epoch < intermediate_epoch: | ||
print(f"Skipping epoch {epoch + 1} since fine tuning has already completed for it.") | ||
logger.info(f"Skipping epoch {epoch + 1} since fine tuning has already completed for it.") | ||
# to bring the count of train_step in sync with where it left off | ||
total_train_steps += len(train_dataloader) | ||
continue | ||
|
||
print(f"Starting epoch {epoch + 1}/{train_config.num_epochs}") | ||
print(f"train_config.max_train_step: {train_config.max_train_step}") | ||
logger.info(f"Starting epoch {epoch + 1}/{train_config.num_epochs}") | ||
logger.info(f"train_config.max_train_step: {train_config.max_train_step}") | ||
# stop when the maximum number of training steps is reached | ||
if max_steps_reached: | ||
break | ||
|
@@ -159,7 +160,7 @@ def train( | |
# to bring the count of train_step in sync with where it left off | ||
if epoch == intermediate_epoch and step == 0: | ||
total_train_steps += intermediate_step | ||
print( | ||
logger.info( | ||
f"skipping first {intermediate_step} steps for epoch {epoch + 1}, since fine tuning has already completed for them." | ||
) | ||
if epoch == intermediate_epoch and step < intermediate_step: | ||
|
@@ -194,7 +195,7 @@ def train( | |
labels = batch["labels"][:, 0] | ||
preds = torch.nn.functional.softmax(logits, dim=-1) | ||
acc_helper.forward(preds, labels) | ||
print("Mismatches detected:", verifier.get_perop_mismatch_count()) | ||
logger.info("Mismatches detected:", verifier.get_perop_mismatch_count()) | ||
else: | ||
model_outputs = model(**batch) | ||
loss = model_outputs.loss # Forward call | ||
|
@@ -276,13 +277,13 @@ def train( | |
) | ||
if train_config.enable_ddp: | ||
if loss_0_counter.item() == train_config.convergence_counter: | ||
print( | ||
logger.info( | ||
f"Loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps. Hence, stopping the fine tuning on device {local_rank}." | ||
) | ||
break | ||
else: | ||
if loss_0_counter.item() == train_config.convergence_counter: | ||
print( | ||
logger.info( | ||
f"Loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps. Hence, stopping the fine tuning." | ||
) | ||
break | ||
|
@@ -344,15 +345,15 @@ def train( | |
if train_config.run_validation: | ||
if eval_epoch_loss < best_val_loss: | ||
best_val_loss = eval_epoch_loss | ||
print(f"best eval loss on epoch {epoch + 1} is {best_val_loss}") | ||
logger.info(f"best eval loss on epoch {epoch + 1} is {best_val_loss}") | ||
val_loss.append(float(eval_epoch_loss)) | ||
val_metric.append(float(eval_metric)) | ||
if train_config.task_type == "seq_classification": | ||
print( | ||
logger.info( | ||
f"Epoch {epoch + 1}: train_acc={metric_val:.4f}, train_epoch_loss={train_epoch_loss:.4f}, epoch time {epoch_end_time}s" | ||
) | ||
else: | ||
print( | ||
logger.info( | ||
f"Epoch {epoch + 1}: train_metric={metric_val:.4f}, train_epoch_loss={train_epoch_loss:.4f}, epoch time {epoch_end_time}s" | ||
) | ||
|
||
|
@@ -456,7 +457,7 @@ def evaluation_helper(model, train_config, eval_dataloader, device): | |
eval_metric = torch.exp(eval_epoch_loss) | ||
|
||
# Print evaluation metrics | ||
print(f" {eval_metric.detach().cpu()=} {eval_epoch_loss.detach().cpu()=}") | ||
logger.info(f" {eval_metric.detach().cpu()=} {eval_epoch_loss.detach().cpu()=}") | ||
|
||
return eval_epoch_loss, eval_metric, val_step_loss, val_step_metric | ||
|
||
|
@@ -486,9 +487,9 @@ def print_model_size(model, config) -> None: | |
model_name (str): Name of the model. | ||
""" | ||
|
||
print(f"--> Model {config.model_name}") | ||
logger.info(f"--> Model {config.model_name}") | ||
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad) | ||
print(f"\n--> {config.model_name} has {total_params / 1e6} Million params\n") | ||
logger.info(f"\n--> {config.model_name} has {total_params / 1e6} Million params\n") | ||
|
||
|
||
def save_to_json( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As we discussed offline, please set the logger level to INFO in this file before using.
Also add the file handler to the logger.
What i suggest in this case is better to have a function defined in finetuning code, which takes the logger from "from QEfficient.utils.logging_utils import logger" and applies the log level and assigns the file handler. Then that function would return the finetuning logger. Now all the files in finetuning code can call that function to get the updated instance of the logger for all the finetuning use cases. Let us brainstorm on these lines.