
Commit 5818728

Updated handling of custom dataset in FT. Updated finetune.md readme accordingly.
Signed-off-by: meetkuma <[email protected]>
1 parent: eff9472

File tree: 7 files changed, +149 -64 lines

QEfficient/cloud/finetune.py
Lines changed: 13 additions & 11 deletions

```diff
@@ -20,6 +20,7 @@
 from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
 
 from QEfficient.finetune.configs.training import TrainConfig
+from QEfficient.finetune.utils.helper import parse_unk_args
 from QEfficient.finetune.utils.config_utils import (
     generate_dataset_config,
     generate_peft_config,
@@ -85,7 +86,7 @@ def setup_seeds(seed: int) -> None:
 
 
 def load_model_and_tokenizer(
-    train_config: TrainConfig, dataset_config: Any, peft_config_file: str, **kwargs
+    train_config: TrainConfig, dataset_config: Any, peft_config_file: Optional[str] = None, **kwargs
 ) -> tuple[AutoModelForCausalLM, AutoTokenizer]:
     """Load the pre-trained model and tokenizer from Hugging Face.
 
@@ -111,7 +112,7 @@ def load_model_and_tokenizer(
         model = AutoModelForSequenceClassification.from_pretrained(
             pretrained_model_path,
             num_labels=dataset_config.num_labels,
-            attn_implementation="sdpa",
+            attn_implementation="eager",
             torch_dtype=torch.float16,
         )
 
@@ -128,7 +129,7 @@ def load_model_and_tokenizer(
         model = AutoModelForCausalLM.from_pretrained(
             pretrained_model_path,
             use_cache=False,
-            attn_implementation="sdpa",
+            attn_implementation="eager",
             torch_dtype=torch.float16,
         )
 
@@ -246,13 +247,13 @@ def setup_dataloaders(
     return train_dataloader, eval_dataloader, longest_seq_length
 
 
-def main(peft_config_file: str = None, **kwargs) -> None:
+def main(**kwargs) -> None:
     """
     Fine-tune a model on QAIC hardware with configurable training and LoRA parameters.
 
     Args:
-        peft_config_file (str, optional): Path to YAML/JSON file containing PEFT (LoRA) config. Defaults to None.
-        kwargs: Additional arguments to override TrainConfig.
+        kwargs: Keyword arguments fetched from CLI to override train config,
+            dataset config and peft config params.
 
     Example:
         .. code-block:: bash
@@ -268,14 +269,14 @@ def main(peft_config_file: str = None, **kwargs) -> None:
            --model_name "meta-llama/Llama-3.2-1B" \\
            --lr 5e-4
     """
-    # TODO:Remove TrainConfig() and update_config() as all params are passed in kwargs by parser
     train_config = TrainConfig()
     update_config(train_config, **kwargs)
-    dataset_config = generate_dataset_config(train_config.dataset)
-    update_config(dataset_config, **kwargs)
+    dataset_config_file = kwargs.pop("dataset_config", None)
+    dataset_config = generate_dataset_config(train_config.dataset, dataset_config_file)
 
     setup_distributed_training(train_config)
     setup_seeds(train_config.seed)
+    peft_config_file = kwargs.pop("peft_config_file", None)
     model, tokenizer = load_model_and_tokenizer(train_config, dataset_config, peft_config_file, **kwargs)
 
     # Create DataLoaders for the training and validation dataset
@@ -308,6 +309,7 @@ def main(peft_config_file: str = None, **kwargs) -> None:
 
 if __name__ == "__main__":
     parser = get_finetune_parser()
-    args = parser.parse_args()
+    args, unk_args = parser.parse_known_args()
+    unk_args_dict = parse_unk_args(unk_args)
     args_dict = vars(args)
-    main(**args_dict)
+    main(**args_dict, **unk_args_dict)
```

QEfficient/finetune/configs/dataset_config.py
Lines changed: 0 additions & 2 deletions

```diff
@@ -48,7 +48,5 @@ class imdb_dataset:
 @dataclass
 class custom_dataset:
     dataset: str = "custom_dataset"
-    file: str = "dataset/custom_dataset.py"
     train_split: str = "train"
     test_split: str = "validation"
-    data_path: str = ""
```

QEfficient/finetune/dataset/custom_dataset.py
Lines changed: 36 additions & 13 deletions

```diff
@@ -24,45 +24,68 @@ def load_module_from_py_file(py_file: str) -> object:
 
 
 def get_custom_dataset(dataset_config, tokenizer, split: str, context_length=None):
-    if ":" in dataset_config.file:
-        module_path, func_name = dataset_config.file.split(":")
+    if not hasattr(dataset_config, "preproc_file"):
+        raise RuntimeError("Can not find preproc_file key in dataset_config file.")
+
+    if ":" in dataset_config.preproc_file:
+        module_path, func_name = dataset_config.preproc_file.split(":")
     else:
-        module_path, func_name = dataset_config.file, "get_custom_dataset"
+        module_path, func_name = dataset_config.preproc_file, "get_custom_dataset"
+        print(
+            f"Using '{func_name}' function from "
+            f"{dataset_config.preproc_file} as preprocessing function in "
+            "dataset preprocessing."
+        )
 
     if not module_path.endswith(".py"):
-        raise ValueError(f"Dataset file {module_path} is not a .py file.")
+        raise ValueError(f"Custom dataset preprocessing file {module_path} is not a .py file.")
 
     module_path = Path(module_path)
     if not module_path.is_file():
-        raise FileNotFoundError(f"Dataset py file {module_path.as_posix()} does not exist or is not a file.")
+        raise FileNotFoundError(f"Custom dataset file {module_path.as_posix()} does not exist or is not a file.")
 
     module = load_module_from_py_file(module_path.as_posix())
     try:
         return getattr(module, func_name)(dataset_config, tokenizer, split, context_length)
     except AttributeError as e:
         print(
-            f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()})."
+            f"For custom dataset preprocessing, the method ({func_name}) is not "
+            f"present in the file ({module_path.as_posix()})."
         )
         raise e
 
 
 def get_data_collator(dataset_processer, dataset_config):
-    if ":" in dataset_config.file:
-        module_path, func_name = dataset_config.file.split(":")
+    if not hasattr(dataset_config, "collate_file"):
+        print(
+            f"Can not find collate_file key in dataset_config file. Using the default data collator function instead."
+        )
+        return None
+
+    if ":" in dataset_config.collate_file:
+        module_path, func_name = dataset_config.collate_file.split(":")
     else:
-        module_path, func_name = dataset_config.file, "get_data_collator"
+        module_path, func_name = dataset_config.collate_file, "get_data_collator"
+        print(
+            f"Using '{func_name}' function from {dataset_config.collate_file} as collate_fn in dataset preprocessing."
+        )
 
     if not module_path.endswith(".py"):
-        raise ValueError(f"Dataset file {module_path} is not a .py file.")
+        raise ValueError(f"Custom dataset collate file {module_path} is not a .py file.")
 
     module_path = Path(module_path)
     if not module_path.is_file():
-        raise FileNotFoundError(f"Dataset py file {module_path.as_posix()} does not exist or is not a file.")
+        raise FileNotFoundError(
+            f"Custom dataset collate file {module_path.as_posix()} does not exist or is not a file."
+        )
 
     module = load_module_from_py_file(module_path.as_posix())
     try:
         return getattr(module, func_name)(dataset_processer)
     except AttributeError:
-        print(f"Can not find the custom data_collator in the dataset.py file ({module_path.as_posix()}).")
-        print("Using the default data_collator instead.")
+        print(
+            f"Can not find the function {func_name} in file "
+            f"({module_path.as_posix()}). Using the default data collator "
+            "function instead."
+        )
         return None
```
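
The `"file.py:function"` convention used by `preproc_file` and `collate_file` can be pictured in isolation. The sketch below is not from the commit; `resolve_callable_spec` and the file names are hypothetical, and it only mirrors the split-and-fallback logic shown in the diff above:

```python
# Illustrative helper (hypothetical): mirrors how get_custom_dataset() and
# get_data_collator() split a "path.py:func" spec and fall back to a default
# function name when no ":" is present.
def resolve_callable_spec(spec: str, default_func: str) -> tuple:
    if ":" in spec:
        module_path, func_name = spec.split(":")
        return module_path, func_name
    return spec, default_func


print(resolve_callable_spec("disc_preproc.py:get_preprocessed_disc", "get_custom_dataset"))
# ('disc_preproc.py', 'get_preprocessed_disc')
print(resolve_callable_spec("disc_preproc.py", "get_custom_dataset"))
# ('disc_preproc.py', 'get_custom_dataset')
```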

QEfficient/finetune/utils/config_utils.py
Lines changed: 16 additions & 4 deletions

```diff
@@ -9,12 +9,13 @@
 import json
 import os
 from dataclasses import asdict
-from typing import Any, Dict
+from typing import Any, Dict, Optional
+from collections import namedtuple
 
 import yaml
 from peft import LoraConfig as PeftLoraConfig
 
-import QEfficient.finetune.configs.dataset_config as datasets
+import QEfficient.finetune.configs.dataset_config as qeff_datasets
 from QEfficient.finetune.configs.peft_config import LoraConfig
 from QEfficient.finetune.configs.training import TrainConfig
 from QEfficient.finetune.dataset.dataset_config import DATASET_PREPROC
@@ -84,11 +85,14 @@ def generate_peft_config(train_config: TrainConfig, peft_config_file: str = None
     return peft_config
 
 
-def generate_dataset_config(dataset_name: str) -> Any:
+def generate_dataset_config(dataset_name: str, custom_dataset_config: Optional[str] = None) -> Any:
     """Generate a dataset configuration based on the specified dataset.
 
     Args:
         dataset_name (str): Name of the dataset to be used for finetuning.
+        custom_dataset_config (str): Dataset config json file for custom dataset.
+            This file contains dataset specific arguments to be used in dataset
+            preprocessing step.
 
     Returns:
         Any: A dataset configuration object.
@@ -99,7 +103,15 @@ def generate_dataset_config(dataset_name: str) -> Any:
     supported_datasets = DATASET_PREPROC.keys()
     assert dataset_name in supported_datasets, f"Given dataset '{dataset_name}' is not supported."
     # FIXME (Meet): Replace below logic by creating using auto registry of datasets.
-    dataset_config = {k: v for k, v in inspect.getmembers(datasets)}[dataset_name]()
+    dataset_config = {k: v for k, v in inspect.getmembers(qeff_datasets)}[dataset_name]()
+    if dataset_name == "custom_dataset":
+        custom_dataset_dict = asdict(dataset_config)
+        custom_dataset_dict_override = load_config_file(custom_dataset_config)
+        # Override existing and add new params to dataset_config.
+        custom_dataset_dict.update(custom_dataset_dict_override)
+
+        custom_dataset_class = namedtuple("custom_dataset", custom_dataset_dict.keys())
+        dataset_config = custom_dataset_class(**custom_dataset_dict)
     return dataset_config
```
QEfficient/finetune/utils/helper.py
Lines changed: 7 additions & 0 deletions

```diff
@@ -9,3 +9,10 @@
 PEFT_METHOD = ["lora"]
 DEVICE = ["qaic", "cpu", "cuda"]
 BATCHING_STRATEGY = ["padding", "packing"]
+
+
+def parse_unk_args(unk_args_str):
+    if len(unk_args_str) % 2 != 0:
+        raise RuntimeError("Unknown arguments must be in pairs")
+    unk_args_dict = {unk_args_str[i].replace("--", ""): unk_args_str[i + 1] for i in range(0, len(unk_args_str), 2)}
+    return unk_args_dict
```
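
A quick sketch of what `parse_unk_args` produces when fed the tokens left over by `parse_known_args`; the flag names and values below are made up for illustration:

```python
from QEfficient.finetune.utils.helper import parse_unk_args

# Tokens argparse did not recognize, e.g. dataset-specific overrides.
unk_args = ["--disc_style", "sarcasm_more", "--test_split_ratio", "0.15"]

overrides = parse_unk_args(unk_args)
print(overrides)  # {'disc_style': 'sarcasm_more', 'test_split_ratio': '0.15'}
# Values stay strings; an odd number of tokens raises RuntimeError.
```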

QEfficient/finetune/utils/parser.py
Lines changed: 14 additions & 0 deletions

```diff
@@ -42,6 +42,20 @@ def get_finetune_parser():
         default=None,
         help="Name of the tokenizer,if not passed as an argument, it uses the value of model_name",
     )
+    parser.add_argument(
+        "--peft_config_file",
+        "--peft-config-file",
+        type=str,
+        default=None,
+        help="Path of PEFT config json file to override the PEFT config params such as lora_r, lora_alpha etc.",
+    )
+    parser.add_argument(
+        "--custom_dataset_config",
+        "--custom-dataset-config",
+        type=str,
+        default=None,
+        help="Path of custom dataset config json file to override the custom dataset params such as test_split_ratio, test_split etc.",
+    )
     parser.add_argument(
         "--run_validation",
         "--run-validation",
```

docs/source/finetune.md
Lines changed: 63 additions & 34 deletions

```diff
@@ -75,38 +75,67 @@ tensorboard --logdir runs/<file> --bind_all
 1) Gradient accumulation: By default, gradient accumulation happens for 4 steps. To update this value, command line argument gradient_accumulation_steps has to be passed. (Example: '--gradient_accumulation_steps 8')
 2) Gradient Checkpointing: By default, gradient checkpointing is disabled. To enable it, command line argument gradient_accumulation_steps has to be passed.
 
-## Fine-Tuning on custom dataset
 
-To run fine tuning for any user specific dataset, prepare the dataset using the following steps:
-
-1. Create a directory named 'dataset' inside efficient-transformers.
-2. Inside this directory, create a file named 'custom_dataset.py'.
-3. Inside the newly created efficient-transformers/dataset/custom_dataset.py, define a function named 'get_custom_dataset'.
-4. get_custom_dataset() should have following 4 parameters: dataset_config, tokenizer, split, context_length.
-5. Inside get_custom_dataset(), user needs to apply prompt and tokenize the dataset accordingly. Please refer the below template on how to define get_custom_dataset().
-6. For examples, please refer python files present in [dataset](https://github.com/quic/efficient-transformers/tree/main/QEfficient/finetune/dataset). In case of Samsum dataset, get_preprocessed_samsum() of efficient-transformers/QEfficient/finetune/dataset/samsum_dataset.py is called.
-7. In [dataset_config.py](https://github.com/quic/efficient-transformers/blob/main/QEfficient/finetune/configs/dataset_config.py), for custom_dataset class, pass the appropriate value for train_split and test_split. As an alternative, these values can be passed as command line arguments as well with the finetune command. For example "--train_split train".
-8. While running fine tuning, pass argument "--dataset custom_dataset" to finetune on custom dataset.
-
-Template for get_custom_dataset() to be defined inside efficient-transformers/dataset/custom_dataset.py is as follows:
-
-```python
-def get_custom_dataset(dataset_config, tokenizer, split, context_length=None):
-
-    # load dataset
-    # based on split, retrieve only the specific portion of the dataset (train or eval) either here or at the last
-
-    def apply_prompt_template():
-        # transform the passed datapoint by applying the prompt on it
-
-    def tokenize():
-        # tokenize the passed datapoint
-
-    # define the prompt
-    # call apply_prompt_template() for each data point:
-    # dataset = dataset.map(apply_prompt_template, <other args>)
-    # call tokenize() for each data point:
-    # dataset = dataset.map(tokenize, <other args>)
-
-    return dataset
-```
+### 🔧 Steps to Fine-Tune with a Custom Dataset
+
+1. **Launching Fine-Tuning with a Custom Dataset**
+   Use the following command-line arguments to begin fine-tuning:
+   ```
+   --dataset custom_dataset --dataset_config data_config.json
+   ```
+   The `data_config.json` file contains essential parameters used during dataset preprocessing.
+
+2. **Specifying the Preprocessing Function**
+   - In `data_config.json`, include a `"preproc_file"` key to define the path to your preprocessing Python file.
+   - To specify a custom function within that file, use the format `"filename.py:function_name"`.
+     _Example:_
+     ```json
+     "preproc_file": "disc_preproc.py:get_preprocessed_disc"
+     ```
+   - Your preprocessing function must follow this structure:
+     ```python
+     def get_custom_dataset(dataset_config, tokenizer, split, context_length=None):
+         def apply_prompt_template():
+             # Apply prompt formatting to each datapoint
+
+         def tokenize():
+             # Tokenize the formatted datapoint
+
+         # Apply functions to dataset using map
+         dataset = dataset.map(apply_prompt_template, ...)
+         dataset = dataset.map(tokenize, ...)
+
+         return dataset
+     ```
+
+3. **Custom Collate Function for Batching**
+   - When using a batch size greater than 1, you may override the default collate behavior by including a `"collate_file"` key in `data_config.json`.
+   - Use the same `"file.py:function"` format. If omitted, the default Hugging Face `DataCollatorForSeq2Seq` is used, which pads sequences to the longest length in the batch.
+   - A custom collate function must have the following signature:
+     ```python
+     def get_data_collator(tokenizer):
+         # Define and return a custom collate_fn here
+     ```
+
+4. **Passing Additional Configuration Parameters**
+   You can add custom arguments in `data_config.json`, which will be accessible via the `dataset_config` argument inside your `get_custom_dataset()` function.
+
+5. **Example `data_config.json` File**
+   ```json
+   {
+       "train_split": "train",
+       "test_split": "test",
+       "test_split_ratio": 0.15,
+       "preproc_file": "disc_preproc.py:get_preprocessed_disc",
+       "collate_file": "disc_preproc.py:get_collate_fn_disc",
+       "disc_style": "sarcasm_more"
+   }
+   ```
+
+6. **Implementing Custom Preprocessing Logic**
+   Within your dataset loader function, define `apply_prompt_template()` to manipulate raw data into desired prompt format, and `tokenize()` to convert it into token IDs using the tokenizer.
+
+7. **Reference for Dataset Utilities**
+   You can refer to existing implementations in the [dataset directory of this repository](https://github.com/quic/efficient-transformers/tree/main/QEfficient/finetune/dataset).
+
+---
```
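
As a concrete illustration of the layout the new documentation describes, a minimal preprocessing module might look like the sketch below. This is not part of the commit: the dataset id, column names, and prompt wording are placeholders, and only the function signatures follow the documented contract (a `get_custom_dataset`-style preprocessing function plus an optional collate factory):

```python
# disc_preproc.py (illustrative sketch): referenced from data_config.json via
#   "preproc_file": "disc_preproc.py:get_preprocessed_disc"
#   "collate_file": "disc_preproc.py:get_collate_fn_disc"
from datasets import load_dataset
from transformers import DataCollatorForSeq2Seq


def get_preprocessed_disc(dataset_config, tokenizer, split, context_length=None):
    # Hypothetical dataset id and columns; replace with your own data source.
    dataset = load_dataset("user/disc-dataset", split=split)

    def apply_prompt_template(example):
        # Extra keys from data_config.json (e.g. disc_style) show up on dataset_config.
        example["text"] = f"[{dataset_config.disc_style}] {example['input']}\n{example['output']}"
        return example

    def tokenize(example):
        return tokenizer(example["text"], truncation=True, max_length=context_length)

    dataset = dataset.map(apply_prompt_template)
    dataset = dataset.map(tokenize, remove_columns=dataset.column_names)
    return dataset


def get_collate_fn_disc(tokenizer):
    # Same behaviour as the default: pad each batch to its longest sequence.
    return DataCollatorForSeq2Seq(tokenizer)
```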
