diff --git a/QEfficient/cloud/finetune.py b/QEfficient/cloud/finetune.py
index c440e73c0..ea36ca93f 100644
--- a/QEfficient/cloud/finetune.py
+++ b/QEfficient/cloud/finetune.py
@@ -9,7 +9,6 @@
 import warnings
 from typing import Any, Dict, Optional, Union
 
-import fire
 import numpy as np
 import torch
 import torch.distributed as dist
@@ -31,6 +30,7 @@
     get_custom_data_collator,
     get_preprocessed_dataset,
 )
+from QEfficient.finetune.utils.parser import get_finetune_parser
 from QEfficient.finetune.utils.train_utils import get_longest_seq_length, print_model_size, train
 from QEfficient.utils._utils import login_and_download_hf_lm
 
@@ -354,4 +354,6 @@ def main(peft_config_file: str = None, **kwargs) -> None:
 
 
 if __name__ == "__main__":
-    fire.Fire(main)
+    parser = get_finetune_parser()
+    args = parser.parse_args()
+    main(**args.__dict__)
diff --git a/QEfficient/finetune/configs/training.py b/QEfficient/finetune/configs/training.py
index 69b083b6a..32889954d 100644
--- a/QEfficient/finetune/configs/training.py
+++ b/QEfficient/finetune/configs/training.py
@@ -16,7 +16,8 @@ class TrainConfig:
         model_name (str): Name of the pre-trained model to fine-tune (default: "meta-llama/Llama-3.2-1B").
         tokenizer_name (str): Name of the tokenizer (defaults to model_name if None).
         run_validation (bool): Whether to run validation during training (default: True).
-        batch_size_training (int): Batch size for training (default: 1).
+        train_batch_size (int): Batch size for training (default: 1).
+        val_batch_size (int): Batch size for validation (default: 1).
         context_length (Optional[int]): Maximum sequence length for inputs (default: None).
         gradient_accumulation_steps (int): Steps for gradient accumulation (default: 4).
         gradient checkpointing (bool): Enable gradient checkpointing to save the memory by compromising the speed. (default: False).
@@ -29,9 +30,10 @@ class TrainConfig:
         weight_decay (float): Weight decay for optimizer (default: 0.0).
         gamma (float): Learning rate decay factor (default: 0.85).
         seed (int): Random seed for reproducibility (default: 42).
+        use_autocast (bool): Use autocast for mixed precision (default: True).
+        dataset (str): Dataset name for training (default: "samsum_dataset").
         use_fp16 (bool): Use mixed precision training (default: True).
         use_autocast (bool): Use autocast for mixed precision (default: True).
-        val_batch_size (int): Batch size for validation (default: 1).
         dataset (str): Dataset name for training (default: "samsum_dataset").
         task_type (str): Type of task for which the finetuning is to be done. Options: "generation" and "seq_classification". (default: "generation")
         peft_method (str): Parameter-efficient fine-tuning method (default: "lora").
@@ -39,7 +41,6 @@ class TrainConfig:
         from_peft_checkpoint (str): Path to PEFT checkpoint (default: "").
         output_dir (str): Directory to save outputs (default: "meta-llama-samsum").
         num_freeze_layers (int): Number of layers to freeze (default: 1).
-        one_qaic (bool): Use single QAIC device (default: False).
         save_model (bool): Save the trained model (default: True).
         save_metrics (bool): Save training metrics (default: True).
         intermediate_step_save (int): Steps between intermediate saves (default: 1000).
@@ -58,7 +59,8 @@ class TrainConfig:
     model_name: str = "meta-llama/Llama-3.2-1B"
     tokenizer_name: str = None  # if not passed as an argument, it uses the value of model_name
     run_validation: bool = True
-    batch_size_training: int = 1
+    train_batch_size: int = 1
+    val_batch_size: int = 1
     context_length: int = None
     gradient_accumulation_steps: int = 4
     gradient_checkpointing: bool = False
@@ -71,9 +73,7 @@ class TrainConfig:
     weight_decay: float = 0.0
     gamma: float = 0.85  # multiplicatively decay the learning rate by gamma after each epoch
     seed: int = 42
-    use_fp16: bool = True
     use_autocast: bool = True
-    val_batch_size: int = 1
     dataset = "samsum_dataset"
     task_type = "generation"  # "generation" / "seq_classification"
     peft_method: str = "lora"
@@ -81,7 +81,6 @@ class TrainConfig:
     from_peft_checkpoint: str = ""  # if not empty and use_peft=True, will load the peft checkpoint and resume the fine-tuning on that checkpoint
     output_dir: str = "meta-llama-samsum"
     num_freeze_layers: int = 1
-    one_qaic: bool = False
    save_model: bool = True
     save_metrics: bool = True  # saves training metrics to a json file for later plotting
     intermediate_step_save: int = 1000
diff --git a/QEfficient/finetune/utils/config_utils.py b/QEfficient/finetune/utils/config_utils.py
index c5c7fe615..6eb9984f2 100644
--- a/QEfficient/finetune/utils/config_utils.py
+++ b/QEfficient/finetune/utils/config_utils.py
@@ -117,7 +117,7 @@ def generate_dataset_config(dataset_name: str) -> Any:
 
 def get_dataloader_kwargs(train_config, dataset, dataset_processer, mode):
     kwargs = {}
-    batch_size = train_config.batch_size_training if mode == "train" else train_config.val_batch_size
+    batch_size = train_config.train_batch_size if mode == "train" else train_config.val_batch_size
     if train_config.enable_ddp:
         if train_config.enable_sorting_for_ddp:
             if train_config.context_length:
diff --git a/QEfficient/finetune/utils/dataset_utils.py b/QEfficient/finetune/utils/dataset_utils.py
index b3447e0db..c1c110380 100644
--- a/QEfficient/finetune/utils/dataset_utils.py
+++ b/QEfficient/finetune/utils/dataset_utils.py
@@ -9,7 +9,6 @@
 
 # from QEfficient.finetune.data.concatenator import ConcatDataset
 from QEfficient.finetune.dataset.dataset_config import DATALOADER_COLLATE_FUNC, DATASET_PREPROC
-from QEfficient.finetune.utils.config_utils import get_dataloader_kwargs
 
 
 def get_preprocessed_dataset(
@@ -29,20 +28,3 @@ def get_custom_data_collator(dataset_processer, dataset_config) -> torch.utils.d
         return None
     return DATALOADER_COLLATE_FUNC[dataset_config.dataset](dataset_processer, dataset_config)
-
-
-def get_dataloader(tokenizer, dataset_config, train_config, split: str = "train"):
-    dataset = get_preprocessed_dataset(tokenizer, dataset_config, split)
-    dl_kwargs = get_dataloader_kwargs(train_config, dataset, tokenizer, split)
-
-    # if split == "train" and train_config.batching_strategy == "packing":
-    #     dataset = ConcatDataset(dataset, chunk_size=train_config.context_length)
-
-    # Create data loader
-    dataloader = torch.utils.data.DataLoader(
-        dataset,
-        num_workers=train_config.num_workers_dataloader,
-        pin_memory=True,
-        **dl_kwargs,
-    )
-    return dataloader
diff --git a/QEfficient/finetune/utils/parser.py b/QEfficient/finetune/utils/parser.py
new file mode 100644
index 000000000..5690adbb8
--- /dev/null
+++ b/QEfficient/finetune/utils/parser.py
@@ -0,0 +1,242 @@
+import argparse
+
+
+def get_finetune_parser():
+    parser = argparse.ArgumentParser(
+        description="Finetune command: the model is downloaded from Hugging Face, fine-tuned on Cloud AI 100, and the resulting weights are saved."
+    )
+    parser.add_argument(
+        "--model-name",
+        "--model_name",
+        required=False,
+        type=str,
+        default="meta-llama/Llama-3.2-1B",
+        help="Name of the pre-trained model to fine-tune",
+    )
+    parser.add_argument(
+        "--tokenizer-name",
+        "--tokenizer_name",
+        required=False,
+        type=str,
+        default=None,
+        help="Name of the tokenizer; if not passed as an argument, the value of model_name is used",
+    )
+    parser.add_argument(
+        "--run-validation",
+        "--run_validation",
+        required=False,
+        type=bool,
+        default=True,
+        help="Whether to run validation during training",
+    )
+    parser.add_argument(
+        "--train-batch-size", "--train_batch_size", required=False, type=int, default=1, help="Batch size for training"
+    )
+    parser.add_argument(
+        "--val-batch-size", "--val_batch_size", required=False, type=int, default=1, help="Batch size for validation"
+    )
+    parser.add_argument(
+        "--context-length",
+        "--context_length",
+        required=False,
+        type=int,
+        default=None,
+        help="Maximum sequence length for inputs",
+    )
+    parser.add_argument(
+        "--gradient-accumulation-steps",
+        "--gradient_accumulation_steps",
+        required=False,
+        type=int,
+        default=4,
+        help="Steps for gradient accumulation",
+    )
+    parser.add_argument(
+        "--gradient-checkpointing",
+        "--gradient_checkpointing",
+        required=False,
+        type=bool,
+        default=False,
+        help="Use gradient checkpointing",
+    )
+    parser.add_argument(
+        "--num-epochs", "--num_epochs", required=False, type=int, default=1, help="Number of training epochs"
+    )
+    parser.add_argument(
+        "--max-train-step",
+        "--max_train_step",
+        required=False,
+        type=int,
+        default=0,
+        help="Maximum number of training steps; unlimited if 0",
+    )
+    parser.add_argument(
+        "--max-eval-step",
+        "--max_eval_step",
+        required=False,
+        type=int,
+        default=0,
+        help="Maximum number of evaluation steps; unlimited if 0",
+    )
+    parser.add_argument("--device", required=False, type=str, default="qaic", help="Device to train on")
+    parser.add_argument(
+        "--num-workers-dataloader",
+        "--num_workers_dataloader",
+        required=False,
+        type=int,
+        default=1,
+        help="Number of workers for the dataloader",
+    )
+    parser.add_argument("--lr", required=False, type=float, default=3e-4, help="Learning rate")
+    parser.add_argument(
+        "--weight-decay", "--weight_decay", required=False, type=float, default=0.0, help="Weight decay for optimizer"
+    )
+    parser.add_argument(
+        "--gamma",
+        required=False,
+        type=float,
+        default=0.85,
+        help="Learning rate decay factor; multiplicatively decays the learning rate by gamma after each epoch",
+    )
+    parser.add_argument("--seed", required=False, type=int, default=42, help="Random seed for reproducibility")
+    parser.add_argument(
+        "--use-autocast",
+        "--use_autocast",
+        required=False,
+        type=bool,
+        default=True,
+        help="Use autocast for mixed precision",
+    )
+
+    parser.add_argument(
+        "--dataset", required=False, type=str, default="samsum_dataset", help="Dataset name for finetuning"
+    )
+    parser.add_argument(
+        "--task-type",
+        "--task_type",
+        required=False,
+        type=str,
+        default="generation",
+        help="Task type: generation/seq_classification",
+    )
+    parser.add_argument(
+        "--peft-method",
+        "--peft_method",
+        required=False,
+        type=str,
+        default="lora",
+        help="Parameter-efficient fine-tuning method",
+    )
+    parser.add_argument(
+        "--use-peft",
+        "--use_peft",
+        required=False,
+        type=bool,
+        default=True,
+        help="Whether to use PEFT (parameter-efficient fine-tuning)",
+    )
+    parser.add_argument(
+        "--from-peft-checkpoint",
+        "--from_peft_checkpoint",
+        required=False,
+        type=str,
+        default="",
+        help="Path to the PEFT checkpoint to load and resume fine-tuning from",
+    )
+    parser.add_argument(
+        "--output-dir",
+        "--output_dir",
+        required=False,
+        type=str,
+        default="meta-llama-samsum",
+        help="Directory to save outputs",
+    )
+    parser.add_argument(
+        "--num-freeze-layers",
+        "--num_freeze_layers",
+        required=False,
+        type=int,
+        default=1,
+        help="Number of layers to freeze",
+    )
+    parser.add_argument(
+        "--save-model", "--save_model", required=False, type=bool, default=True, help="Save the trained model"
+    )
+    parser.add_argument(
+        "--save-metrics",
+        "--save_metrics",
+        required=False,
+        type=bool,
+        default=True,
+        help="Save training metrics to a json file for later plotting",
+    )
+    parser.add_argument(
+        "--intermediate-step-save",
+        "--intermediate_step_save",
+        required=False,
+        type=int,
+        default=1000,
+        help="Steps between intermediate saves",
+    )
+    parser.add_argument(
+        "--batching-strategy",
+        "--batching_strategy",
+        required=False,
+        type=str,
+        default="packing",
+        help="Batching strategy",
+    )
+    parser.add_argument(
+        "--enable-sorting-for-ddp",
+        "--enable_sorting_for_ddp",
+        required=False,
+        type=bool,
+        default=True,
+        help="Sort data for DDP",
+    )
+    parser.add_argument(
+        "--convergence-counter",
+        "--convergence_counter",
+        required=False,
+        type=int,
+        default=5,
+        help="Number of steps used to check convergence; must be >= 1. Fine-tuning stops when loss <= convergence_loss (defined below) for convergence_counter consecutive steps",
+    )
+    parser.add_argument(
+        "--convergence-loss",
+        "--convergence_loss",
+        required=False,
+        type=float,
+        default=1e-4,
+        help="Loss threshold for convergence; if the loss is <= convergence_loss for convergence_counter consecutive steps, fine-tuning stops",
+    )
+    parser.add_argument(
+        "--use-profiler", "--use_profiler", required=False, type=bool, default=False, help="Enable profiling"
+    )
+    parser.add_argument(
+        "--enable-ddp", "--enable_ddp", required=False, type=bool, default=False, help="Enable distributed data parallel"
+    )
+    parser.add_argument(
+        "--dist-backend",
+        "--dist_backend",
+        required=False,
+        type=str,
+        default="cpu:gloo,qaic:qccl,cuda:gloo",
+        help="Backend for distributed training",
+    )
+    parser.add_argument(
+        "--grad-scaler", "--grad_scaler", required=False, type=bool, default=True, help="Use gradient scaler"
+    )
+    parser.add_argument(
+        "--dump-root-dir",
+        "--dump_root_dir",
+        required=False,
+        type=str,
+        default="meta-llama-samsum-mismatches/step_",
+        help="Directory for mismatch dumps",
+    )
+    parser.add_argument(
+        "--opByOpVerifier", required=False, type=bool, default=True, help="Enable operation-by-operation verification"
+    )
+
+    return parser
diff --git a/QEfficient/finetune/utils/train_utils.py b/QEfficient/finetune/utils/train_utils.py
index 8693ae32d..02035b388 100644
--- a/QEfficient/finetune/utils/train_utils.py
+++ b/QEfficient/finetune/utils/train_utils.py
@@ -469,14 +469,6 @@ def get_longest_seq_length(data: List[Dict]) -> Tuple[int, int]:
     return longest_seq_length, longest_seq_ix
 
 
-def get_parameter_dtypes(model):
-    """Get the data types of model parameters"""
-    parameter_dtypes = {}
-    for name, parameter in model.named_parameters():
-        parameter_dtypes[name] = parameter.dtype
-    return parameter_dtypes
-
-
 def print_model_size(model, config) -> None:
     """
     Print model name, the number of trainable parameters and initialization time.
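As a quick illustration of the new CLI entry point (a minimal sketch; the flag values below are only examples), the `__main__` block of `QEfficient/cloud/finetune.py` above boils down to the following:

```python
# Minimal sketch of the new argparse wiring (flag values are illustrative).
from QEfficient.finetune.utils.parser import get_finetune_parser

parser = get_finetune_parser()
# Both spellings of each flag are registered, e.g. --train-batch-size and
# --train_batch_size; argparse derives the destination from the first long
# option, so the value lands on args.train_batch_size either way.
args = parser.parse_args(["--model-name", "meta-llama/Llama-3.2-1B", "--train-batch-size", "2"])
print(args.model_name, args.train_batch_size, args.val_batch_size)  # meta-llama/Llama-3.2-1B 2 1

# finetune.py then forwards every parsed value to main() as keyword arguments:
# main(**args.__dict__)
```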
diff --git a/docs/source/finetune.md b/docs/source/finetune.md
index 40df4401c..e5a8bc475 100644
--- a/docs/source/finetune.md
+++ b/docs/source/finetune.md
@@ -13,6 +13,7 @@ For torch_qaic, assuming QEfficient is already installed,
 ```bash
 pip install /opt/qti-aic/integrations/torch_qaic/py310/torch_qaic-0.1.0-cp310-cp310-linux_x86_64.whl
 ```
+If the qeff-env environment inside the Docker image is used, then the torch_qaic and accelerate packages are already installed.
 
 ## Finetuning
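For code that previously read `train_config.batch_size_training`, a minimal sketch of the renamed fields on `TrainConfig` after this patch (assuming the package is installed):

```python
# Minimal sketch: batch_size_training is replaced by train_batch_size and
# val_batch_size on TrainConfig (see QEfficient/finetune/configs/training.py).
from QEfficient.finetune.configs.training import TrainConfig

cfg = TrainConfig()
print(cfg.train_batch_size, cfg.val_batch_size)  # 1 1 with the defaults above
# use_fp16 and one_qaic are removed; mixed precision is controlled by use_autocast.
print(cfg.use_autocast)  # True
```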