
[QEff Finetune]: Use logger in place of print statements in finetuning scripts #371


Draft · wants to merge 2 commits into base: main
15 changes: 8 additions & 7 deletions QEfficient/cloud/finetune.py
@@ -33,12 +33,13 @@
)
from QEfficient.finetune.utils.train_utils import get_longest_seq_length, print_model_size, train
from QEfficient.utils._utils import login_and_download_hf_lm
from QEfficient.utils.logging_utils import logger

# Try importing QAIC-specific module, proceed without it if unavailable
try:
import torch_qaic # noqa: F401
except ImportError as e:
print(f"Warning: {e}. Proceeding without QAIC modules.")
logger.warning(f"{e}. Moving ahead without these qaic modules.")
Contributor
As we discussed offline, please set the logger level to INFO in this file before using it, and also attach a file handler to the logger.

What I suggest here is to define a function in the finetuning code that takes the logger from "from QEfficient.utils.logging_utils import logger", applies the log level, and attaches the file handler, then returns the finetuning logger. All files in the finetuning code can then call that function to get the configured logger instance for every finetuning use case. Let us brainstorm along these lines.
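A minimal sketch of such a helper (the function name, log-file path, and format string are illustrative assumptions, not part of this PR):

import logging

from QEfficient.utils.logging_utils import logger


def get_finetune_logger(log_file="finetune.log"):
    """Return the shared QEfficient logger configured for finetuning runs."""
    logger.setLevel(logging.INFO)
    # Attach the file handler only once so repeated calls do not duplicate it.
    if not any(isinstance(h, logging.FileHandler) for h in logger.handlers):
        fh = logging.FileHandler(log_file)
        fh.setFormatter(logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s"))
        logger.addHandler(fh)
    return logger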



from transformers import AutoModelForSequenceClassification
@@ -144,7 +145,7 @@ def load_model_and_tokenizer(
# If there is a mismatch between tokenizer vocab size and embedding matrix,
# throw a warning and then expand the embedding matrix
if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
print("WARNING: Resizing embedding matrix to match tokenizer vocab size.")
logger.warning("Resizing the embedding matrix to match the tokenizer vocab size.")
model.resize_token_embeddings(len(tokenizer))

# FIXME (Meet): Cover below line inside the logger once it is implemented.
@@ -244,12 +245,12 @@ def setup_dataloaders(
# )
##
train_dl_kwargs = get_dataloader_kwargs(train_config, dataset_train, dataset_processer, "train")
print("length of dataset_train", len(dataset_train))
logger.info("length of dataset_train", len(dataset_train))

# FIXME (Meet): Add custom data collator registration from the outside by the user.
custom_data_collator = get_custom_data_collator(dataset_processer, dataset_config)
if custom_data_collator:
print("custom_data_collator is used")
logger.info("custom_data_collator is used")
train_dl_kwargs["collate_fn"] = custom_data_collator

# Create DataLoaders for the training and validation dataset
@@ -259,7 +260,7 @@
pin_memory=True,
**train_dl_kwargs,
)
print(f"--> Num of Training Set Batches loaded = {len(train_dataloader)}")
logger.info(f"--> Num of Training Set Batches loaded = {len(train_dataloader)}")

eval_dataloader = None
if train_config.run_validation:
@@ -283,7 +284,7 @@
f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. ({len(eval_dataloader)=})"
)
else:
print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}")
logger.info(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}")

longest_seq_length, _ = get_longest_seq_length(
torch.utils.data.ConcatDataset([train_dataloader.dataset, eval_dataloader.dataset])
@@ -327,7 +328,7 @@ def main(peft_config_file: str = None, **kwargs) -> None:

# Create DataLoaders for the training and validation dataset
train_dataloader, eval_dataloader, longest_seq_length = setup_dataloaders(train_config, dataset_config, tokenizer)
print(
logger.info(
f"The longest sequence length in the train data is {longest_seq_length}, "
f"passed context length is {train_config.context_length} and overall model's context length is "
f"{model.config.max_position_embeddings}"
8 changes: 5 additions & 3 deletions QEfficient/finetune/dataset/custom_dataset.py
@@ -8,6 +8,8 @@
import importlib
from pathlib import Path

from QEfficient.utils.logging_utils import logger


def load_module_from_py_file(py_file: str) -> object:
"""
@@ -40,7 +42,7 @@ def get_custom_dataset(dataset_config, tokenizer, split: str):
try:
return getattr(module, func_name)(dataset_config, tokenizer, split)
except AttributeError as e:
print(
logger.error(
f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()})."
)
raise e
@@ -63,6 +65,6 @@ def get_data_collator(dataset_processer, dataset_config):
try:
return getattr(module, func_name)(dataset_processer)
except AttributeError:
Contributor
This applies to all the cases like L62.

Whenever we raise an exception, it is good to log it as well; otherwise the dumped file logs would paint a different picture than the console logs.

We can subclass the logger to add a custom log-and-raise method, as below.

 import logging

 class CustomLogger(logging.Logger):
     def raise_runtimeerror(self, message):
         self.error(message)
         raise RuntimeError(message)

 logging.setLoggerClass(CustomLogger)
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.DEBUG, filename='app.log', filemode='a',
                     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

What do you say?
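For illustration, the dataset loader could then call the custom method instead of logging and raising separately (sketch only, assuming the CustomLogger above is adopted; note it raises RuntimeError rather than re-raising the original AttributeError):

try:
    return getattr(module, func_name)(dataset_config, tokenizer, split)
except AttributeError:
    logger.raise_runtimeerror(
        f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()})."
    )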

print(f"Can not find the custom data_collator in the dataset.py file ({module_path.as_posix()}).")
print("Using the default data_collator instead.")
logger.info(f"Can not find the custom data_collator in the dataset.py file ({module_path.as_posix()}).")
logger.info("Using the default data_collator instead.")
return None
10 changes: 6 additions & 4 deletions QEfficient/finetune/dataset/grammar_dataset.py
@@ -10,6 +10,8 @@
from datasets import load_dataset
from torch.utils.data import Dataset

from QEfficient.utils.logging_utils import logger


class grammar(Dataset):
def __init__(self, tokenizer, csv_name=None, context_length=None):
@@ -20,7 +22,7 @@ def __init__(self, tokenizer, csv_name=None, context_length=None):
delimiter=",",
)
except Exception as e:
print(
logger.error(
"Loading of grammar dataset failed! Please see [here](https://github.com/meta-llama/llama-recipes/blob/main/src/llama_recipes/datasets/grammar_dataset/grammar_dataset_process.ipynb) for details on how to download the dataset."
Contributor
I think the message content is incorrect. The "[here](url)" style link is suitable for markdown, but it is not needed in console output; print the URL directly instead.
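For example, the message could carry the URL directly (a sketch of the suggested wording, not part of this PR):

logger.error(
    "Loading of grammar dataset failed! Please see "
    "https://github.com/meta-llama/llama-recipes/blob/main/src/llama_recipes/datasets/grammar_dataset/grammar_dataset_process.ipynb "
    "for details on how to download the dataset."
)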

)
raise e
@@ -36,7 +38,7 @@ def convert_to_features(self, example_batch):
# Create prompt and tokenize contexts and questions

if self.print_text:
print("Input Text: ", self.clean_text(example_batch["text"]))
logger.info("Input Text: ", self.clean_text(example_batch["text"]))

input_ = example_batch["input"]
target_ = example_batch["target"]
@@ -71,9 +73,9 @@ def get_dataset(dataset_config, tokenizer, csv_name=None, context_length=None):
"""cover function for handling loading the working dataset"""
"""dataset loading"""
currPath = Path.cwd() / "datasets_grammar" / "grammar_train.csv"
print(f"Loading dataset {currPath}")
logger.info(f"Loading dataset {currPath}")
csv_name = str(currPath)
print(csv_name)
logger.info(csv_name)
dataset = grammar(tokenizer=tokenizer, csv_name=csv_name, context_length=context_length)

return dataset
9 changes: 5 additions & 4 deletions QEfficient/finetune/eval.py
@@ -25,13 +25,14 @@
from utils.train_utils import evaluation, print_model_size

from QEfficient.finetune.configs.training import TrainConfig
from QEfficient.utils.logging_utils import logger

try:
import torch_qaic # noqa: F401

device = "qaic:0"
except ImportError as e:
print(f"Warning: {e}. Moving ahead without these qaic modules.")
logger.warning(f"{e}. Moving ahead without these qaic modules.")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Suppress all warnings
@@ -77,7 +78,7 @@ def main(**kwargs):
# If there is a mismatch between tokenizer vocab size and embedding matrix,
# throw a warning and then expand the embedding matrix
if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
print("WARNING: Resizing the embedding matrix to match the tokenizer vocab size.")
logger.warning("Resizing the embedding matrix to match the tokenizer vocab size.")
model.resize_token_embeddings(len(tokenizer))

print_model_size(model, train_config)
@@ -108,13 +109,13 @@
pin_memory=True,
**val_dl_kwargs,
)
print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}")
logger.info(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}")
if len(eval_dataloader) == 0:
raise ValueError(
f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. ({len(eval_dataloader)=})"
)
else:
print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}")
logger.info(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}")

model.to(device)
_ = evaluation(model, train_config, eval_dataloader, None, tokenizer, device)
4 changes: 2 additions & 2 deletions QEfficient/finetune/utils/config_utils.py
@@ -25,6 +25,7 @@
from QEfficient.finetune.configs.training import TrainConfig
from QEfficient.finetune.data.sampler import DistributedLengthBasedBatchSampler
from QEfficient.finetune.dataset.dataset_config import DATASET_PREPROC
from QEfficient.utils.logging_utils import logger


def update_config(config, **kwargs):
@@ -53,8 +54,7 @@ def update_config(config, **kwargs):
raise ValueError(f"Config '{config_name}' does not have parameter: '{param_name}'")
else:
config_type = type(config).__name__
# FIXME (Meet): Once logger is available put this in debug level.
print(f"[WARNING]: Unknown parameter '{k}' for config type '{config_type}'")
logger.warning(f"Unknown parameter '{k}' for config type '{config_type}'")
Contributor
Shall we use logger.debug here?



def generate_peft_config(train_config: TrainConfig, peft_config_file: str = None, **kwargs) -> Any:
6 changes: 4 additions & 2 deletions QEfficient/finetune/utils/plot_metrics.py
@@ -11,6 +11,8 @@

import matplotlib.pyplot as plt

from QEfficient.utils.logging_utils import logger


def plot_metric(data, metric_name, x_label, y_label, title, colors):
plt.figure(figsize=(7, 6))
@@ -67,14 +69,14 @@ def plot_metrics_by_step(data, metric_name, x_label, y_label, colors):

def plot_metrics(file_path):
if not os.path.exists(file_path):
print(f"File {file_path} does not exist.")
logger.error(f"File {file_path} does not exist.")
return

with open(file_path, "r") as f:
try:
data = json.load(f)
except json.JSONDecodeError:
print("Invalid JSON file.")
logger.error("Invalid JSON file.")
return

directory = os.path.dirname(file_path)
33 changes: 17 additions & 16 deletions QEfficient/finetune/utils/train_utils.py
@@ -19,6 +19,7 @@
from tqdm import tqdm

from QEfficient.finetune.configs.training import TrainConfig
from QEfficient.utils.logging_utils import logger

try:
import torch_qaic # noqa: F401
@@ -27,7 +28,7 @@
import torch_qaic.utils as qaic_utils # noqa: F401
from torch.qaic.amp import GradScaler as QAicGradScaler
except ImportError as e:
print(f"Warning: {e}. Moving ahead without these qaic modules.")
logger.warning(f"{e}. Moving ahead without these qaic modules.")

from torch.amp import GradScaler

@@ -113,26 +114,26 @@ def train(
for epoch in range(train_config.num_epochs):
if loss_0_counter.item() == train_config.convergence_counter:
if train_config.enable_ddp:
print(
logger.info(
Contributor
One important change is required: in the DDP case we should log to the console only for rank == 0. Otherwise, a 48x or 64x DDP run will flood the console with output, which is not user friendly.
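A minimal sketch of one way to gate console output on the rank (the helper name and the use of the LOCAL_RANK environment variable are assumptions, not part of this PR):

import logging
import os


def add_console_handler_for_rank0(logger):
    """Attach a console handler only on rank 0 so large DDP runs stay quiet."""
    if int(os.environ.get("LOCAL_RANK", 0)) == 0:
        logger.addHandler(logging.StreamHandler())
    else:
        logger.addHandler(logging.NullHandler())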

f"Not proceeding with epoch {epoch + 1} on device {local_rank} since loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps."
)
break
else:
print(
logger.info(
f"Not proceeding with epoch {epoch + 1} since loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps."
)
break

if train_config.use_peft and train_config.from_peft_checkpoint:
intermediate_epoch = int(train_config.from_peft_checkpoint.split("/")[-2].split("_")[-1]) - 1
if epoch < intermediate_epoch:
print(f"Skipping epoch {epoch + 1} since fine tuning has already completed for it.")
logger.info(f"Skipping epoch {epoch + 1} since fine tuning has already completed for it.")
# to bring the count of train_step in sync with where it left off
total_train_steps += len(train_dataloader)
continue

print(f"Starting epoch {epoch + 1}/{train_config.num_epochs}")
print(f"train_config.max_train_step: {train_config.max_train_step}")
logger.info(f"Starting epoch {epoch + 1}/{train_config.num_epochs}")
logger.info(f"train_config.max_train_step: {train_config.max_train_step}")
# stop when the maximum number of training steps is reached
if max_steps_reached:
break
@@ -159,7 +160,7 @@
# to bring the count of train_step in sync with where it left off
if epoch == intermediate_epoch and step == 0:
total_train_steps += intermediate_step
print(
logger.info(
f"skipping first {intermediate_step} steps for epoch {epoch + 1}, since fine tuning has already completed for them."
)
if epoch == intermediate_epoch and step < intermediate_step:
@@ -194,7 +195,7 @@
labels = batch["labels"][:, 0]
preds = torch.nn.functional.softmax(logits, dim=-1)
acc_helper.forward(preds, labels)
print("Mismatches detected:", verifier.get_perop_mismatch_count())
logger.info("Mismatches detected:", verifier.get_perop_mismatch_count())
else:
model_outputs = model(**batch)
loss = model_outputs.loss # Forward call
@@ -276,13 +277,13 @@
)
if train_config.enable_ddp:
if loss_0_counter.item() == train_config.convergence_counter:
print(
logger.info(
f"Loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps. Hence, stopping the fine tuning on device {local_rank}."
)
break
else:
if loss_0_counter.item() == train_config.convergence_counter:
print(
logger.info(
f"Loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps. Hence, stopping the fine tuning."
)
break
@@ -344,15 +345,15 @@
if train_config.run_validation:
if eval_epoch_loss < best_val_loss:
best_val_loss = eval_epoch_loss
print(f"best eval loss on epoch {epoch + 1} is {best_val_loss}")
logger.info(f"best eval loss on epoch {epoch + 1} is {best_val_loss}")
val_loss.append(float(eval_epoch_loss))
val_metric.append(float(eval_metric))
if train_config.task_type == "seq_classification":
print(
logger.info(
f"Epoch {epoch + 1}: train_acc={metric_val:.4f}, train_epoch_loss={train_epoch_loss:.4f}, epoch time {epoch_end_time}s"
)
else:
print(
logger.info(
f"Epoch {epoch + 1}: train_metric={metric_val:.4f}, train_epoch_loss={train_epoch_loss:.4f}, epoch time {epoch_end_time}s"
)

@@ -456,7 +457,7 @@ def evaluation_helper(model, train_config, eval_dataloader, device):
eval_metric = torch.exp(eval_epoch_loss)

# Print evaluation metrics
print(f" {eval_metric.detach().cpu()=} {eval_epoch_loss.detach().cpu()=}")
logger.info(f" {eval_metric.detach().cpu()=} {eval_epoch_loss.detach().cpu()=}")

return eval_epoch_loss, eval_metric, val_step_loss, val_step_metric

@@ -486,9 +487,9 @@ def print_model_size(model, config) -> None:
model_name (str): Name of the model.
"""

print(f"--> Model {config.model_name}")
logger.info(f"--> Model {config.model_name}")
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"\n--> {config.model_name} has {total_params / 1e6} Million params\n")
logger.info(f"\n--> {config.model_name} has {total_params / 1e6} Million params\n")


def save_to_json(