[QEff. Finetune] Updated handling of custom dataset in FT. Updated finetune.md readme file. #520

Open · wants to merge 5 commits into base: main
5 changes: 2 additions & 3 deletions QEfficient/cloud/finetune.py
@@ -285,11 +285,10 @@ def main(**kwargs) -> None:
--model_name "meta-llama/Llama-3.2-1B" \\
--lr 5e-4
"""
# TODO: Remove TrainConfig() and update_config() as all params are passed in kwargs by the parser
train_config = TrainConfig()
update_config(train_config, **kwargs)
dataset_config = generate_dataset_config(train_config.dataset)
update_config(dataset_config, **kwargs)
custom_dataset_config_file = kwargs.pop("custom_dataset_config", None)
dataset_config = generate_dataset_config(train_config.dataset, custom_dataset_config_file)

logger.prepare_for_logs(train_config.output_dir, train_config.dump_logs, train_config.log_level)

2 changes: 0 additions & 2 deletions QEfficient/finetune/configs/dataset_config.py
@@ -41,7 +41,5 @@ class imdb_dataset:
@dataclass
class custom_dataset:
dataset: str = "custom_dataset"
file: str = "dataset/custom_dataset.py"
train_split: str = "train"
test_split: str = "validation"
data_path: str = ""
17 changes: 17 additions & 0 deletions QEfficient/finetune/configs/sample_peft_config.json
@@ -0,0 +1,17 @@
{
"r": 32,
"lora_alpha": 64,
"target_modules": [
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"up_proj",
"down_proj",
"gate_proj"
],
"bias": "none",
"task_type": "CAUSAL_LM",
"lora_dropout": 0.05,
"inference_mode": false
}
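
For reference, these keys line up with the keyword arguments of peft's LoraConfig (which config_utils.py imports as PeftLoraConfig). A minimal sketch of loading the sample file this way, assuming it is read from the path added in this PR:

import json
from peft import LoraConfig

# Load the sample JSON and map its keys onto LoraConfig keyword arguments.
with open("QEfficient/finetune/configs/sample_peft_config.json") as f:
    peft_kwargs = json.load(f)

lora_config = LoraConfig(**peft_kwargs)
print(lora_config.r, lora_config.target_modules)
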
49 changes: 36 additions & 13 deletions QEfficient/finetune/dataset/custom_dataset.py
@@ -6,6 +6,7 @@
# -----------------------------------------------------------------------------

import importlib
import logging
from pathlib import Path

from QEfficient.finetune.utils.logging_utils import logger
@@ -26,51 +27,73 @@ def load_module_from_py_file(py_file: str) -> object:


def get_custom_dataset(dataset_config, tokenizer, split: str, context_length=None):
if ":" in dataset_config.file:
module_path, func_name = dataset_config.file.split(":")
if not hasattr(dataset_config, "preproc_file"):
logger.raise_error("Can not find preproc_file key in dataset_config file.", RuntimeError)

if ":" in dataset_config.preproc_file:
module_path, func_name = dataset_config.preproc_file.split(":")
else:
module_path, func_name = dataset_config.file, "get_custom_dataset"
module_path, func_name = dataset_config.preproc_file, "get_custom_dataset"
logger.log_rank_zero(
f"Using '{func_name}' function from "
f"{dataset_config.preproc_file} as preprocessing function in "
"dataset preprocessing.",
logging.WARNING,
)

if not module_path.endswith(".py"):
logger.raise_error(f"Dataset file {module_path} is not a .py file.", ValueError)
logger.raise_error(f"Custom dataset preprocessing file {module_path} is not a .py file.", ValueError)

module_path = Path(module_path)
if not module_path.is_file():
logger.raise_error(
f"Dataset py file {module_path.as_posix()} does not exist or is not a file.", FileNotFoundError
f"Custom dataset file {module_path.as_posix()} does not exist or is not a file.", FileNotFoundError
)

module = load_module_from_py_file(module_path.as_posix())
try:
return getattr(module, func_name)(dataset_config, tokenizer, split, context_length)
except AttributeError:
logger.raise_error(
f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()}).",
f"For custom dataset preprocessing, the method ({func_name}) is not "
f"present in the file ({module_path.as_posix()}).",
AttributeError,
)


def get_data_collator(dataset_processer, dataset_config):
if ":" in dataset_config.file:
module_path, func_name = dataset_config.file.split(":")
if not hasattr(dataset_config, "collate_file"):
logger.log_rank_zero(
"Can not find collate_file key in dataset_config file. Using the default data collator function instead.",
logging.WARNING,
)
return None

if ":" in dataset_config.collate_file:
module_path, func_name = dataset_config.collate_file.split(":")
else:
module_path, func_name = dataset_config.file, "get_data_collator"
module_path, func_name = dataset_config.collate_file, "get_data_collator"
logger.log_rank_zero(
f"Using '{func_name}' function from {dataset_config.collate_file} as collate_fn in dataset preprocessing.",
logging.WARNING,
)

if not module_path.endswith(".py"):
logger.raise_error(f"Dataset file {module_path} is not a .py file.", ValueError)
logger.raise_error(f"Custom dataset collate file {module_path} is not a .py file.", ValueError)

module_path = Path(module_path)
if not module_path.is_file():
logger.raise_error(
f"Dataset py file {module_path.as_posix()} does not exist or is not a file.", FileNotFoundError
f"Custom dataset collate file {module_path.as_posix()} does not exist or is not a file.", FileNotFoundError
)

module = load_module_from_py_file(module_path.as_posix())
try:
return getattr(module, func_name)(dataset_processer)
except AttributeError:
logger.log_rank_zero(
f"Can not find the custom data_collator in the dataset.py file ({module_path.as_posix()})."
f"Can not find the function {func_name} in file "
f"({module_path.as_posix()}). Using the default data collator "
"function instead."
)
logger.log_rank_zero("Using the default data_collator instead.")
return None
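
Taken together, get_custom_dataset() and get_data_collator() above expect the user's .py file to expose functions with these call signatures. A hedged sketch of such a file (the function names match the defaults used when no ':func_name' suffix is given; the dataset and tokenization are purely illustrative):

# my_preproc.py -- hypothetical custom dataset file; only the signatures are
# dictated by the loaders above.
import datasets
from transformers.data import DataCollatorForSeq2Seq


def get_custom_dataset(dataset_config, tokenizer, split, context_length=None):
    # dataset_config is the namedtuple built from the --custom_dataset_config JSON,
    # so any extra keys in that JSON are available here as attributes.
    ds = datasets.load_dataset("imdb", split=split)

    def tokenize(sample):
        return tokenizer(sample["text"], truncation=True, max_length=context_length)

    return ds.map(tokenize, remove_columns=list(ds.features))


def get_data_collator(tokenizer):
    # Optional: if 'collate_file' is not set, the default collator is used instead.
    return DataCollatorForSeq2Seq(tokenizer)
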
87 changes: 87 additions & 0 deletions QEfficient/finetune/dataset/custom_dataset/disc_preproc.py
@@ -0,0 +1,87 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------


import datasets
from transformers.data import DataCollatorForSeq2Seq


def get_data_collator(tokenizer):
return DataCollatorForSeq2Seq(tokenizer)


def get_preprocessed_disc(dataset_config, tokenizer, split, context_length=None):
dataset = datasets.load_dataset("hallisky/DiSC")

# Considering 'train' split as this dataset has only one split.
dataset = dataset["train"]

test_split_ratio = dataset_config.test_split_ratio
disc_style = dataset_config.disc_style

# Only collect the samples for a given style.
available_styles = set(dataset["category"])
if disc_style not in available_styles:
raise RuntimeError(f"For DiSC dataset the provided disc_stype '{disc_style}' is not supported.")
Contributor review comment: A small typo here, "disc_stype" should be "disc_style".


dataset = dataset.filter(lambda example: example["category"] == disc_style)

# Shuffle the dataset before splitting
dataset = dataset.shuffle(seed=42)

# Split the data in train and test split.
total_samples = len(dataset)
test_size = int(total_samples * test_split_ratio)
train_size = total_samples - test_size

if split == "test":
indices = range(train_size, total_samples)
else:
indices = range(0, train_size)

dataset = dataset.select(indices)

if tokenizer.pad_token is None:
tokenizer.add_special_tokens({"pad_token": "[PAD]"})

# Below is the template of the DiSC dataset.
# <bos>### Original:{original} \n ### Rewrite: {rewrite} <eos>
template = "### Original:{original} \n ### Rewrite: "

def apply_prompt_template(sample):
return {
"input": template.format(original=sample["original"]),
"label": sample["generation"],
}

dataset = dataset.map(apply_prompt_template, remove_columns=list(dataset.features))

def tokenize_add_label(sample):
input = tokenizer.encode(
tokenizer.bos_token + sample["input"],
add_special_tokens=False,
max_length=context_length,
pad_to_max_length=True,
)
label = tokenizer.encode(
sample["label"] + tokenizer.pad_token + tokenizer.eos_token,
add_special_tokens=False,
max_length=context_length,
pad_to_max_length=True,
)

sample = {
"input_ids": (input + label),
"attention_mask": [1] * (len(input) + len(label)),
"labels": [-100] * len(input) + label,
}

return sample

dataset = dataset.map(tokenize_add_label, remove_columns=list(dataset.features))

return dataset
@@ -0,0 +1,7 @@
{
"train_split": "train",
"test_split": "test",
"test_split_ratio": 0.15,
"preproc_file": "./QEfficient/finetune/dataset/custom_dataset/disc_preproc.py:get_preprocessed_disc",
"disc_style": "sarcasm_more"
}
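
As a rough usage sketch, the preprocessing function can also be exercised directly, outside the finetune CLI. The namedtuple below mirrors how generate_dataset_config() exposes the JSON keys above as attributes; the tokenizer name is only illustrative and the import path assumes the package layout introduced in this PR:

from collections import namedtuple

from transformers import AutoTokenizer

from QEfficient.finetune.dataset.custom_dataset.disc_preproc import get_preprocessed_disc

# Minimal stand-in for the dataset_config namedtuple built from the JSON above.
DiscConfig = namedtuple("custom_dataset", ["test_split_ratio", "disc_style"])
cfg = DiscConfig(test_split_ratio=0.15, disc_style="sarcasm_more")

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
train_ds = get_preprocessed_disc(cfg, tokenizer, split="train", context_length=512)
print(train_ds[0].keys())  # input_ids, attention_mask, labels
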
25 changes: 21 additions & 4 deletions QEfficient/finetune/utils/config_utils.py
@@ -8,13 +8,14 @@
import inspect
import json
import os
from collections import namedtuple
from dataclasses import asdict
from typing import Any, Dict
from typing import Any, Dict, Optional

import yaml
from peft import LoraConfig as PeftLoraConfig

import QEfficient.finetune.configs.dataset_config as datasets
import QEfficient.finetune.configs.dataset_config as qeff_datasets
from QEfficient.finetune.configs.peft_config import LoraConfig
from QEfficient.finetune.configs.training import TrainConfig
from QEfficient.finetune.dataset.dataset_config import DATASET_PREPROC
@@ -86,11 +87,14 @@ def generate_peft_config(train_config: TrainConfig, **kwargs) -> Any:
return peft_config


def generate_dataset_config(dataset_name: str) -> Any:
def generate_dataset_config(dataset_name: str, custom_dataset_config: Optional[str] = None) -> Any:
"""Generate a dataset configuration based on the specified dataset.

Args:
dataset_name (str): Name of the dataset to be used for finetuning.
custom_dataset_config (str, optional): Dataset config JSON file for the custom dataset.
This file contains dataset-specific arguments to be used in the
dataset preprocessing step.

Returns:
Any: A dataset configuration object.
@@ -101,7 +105,20 @@ def generate_dataset_config(dataset_name: str) -> Any:
supported_datasets = DATASET_PREPROC.keys()
assert dataset_name in supported_datasets, f"Given dataset '{dataset_name}' is not supported."
# FIXME (Meet): Replace below logic by creating using auto registry of datasets.
dataset_config = {k: v for k, v in inspect.getmembers(datasets)}[dataset_name]()
dataset_config = {k: v for k, v in inspect.getmembers(qeff_datasets)}[dataset_name]()
if dataset_name == "custom_dataset":
if custom_dataset_config is None:
logger.raise_error(
"For 'custom_dataset', please provide dataset config file via 'custom_dataset_config' flag.",
RuntimeError,
)
custom_dataset_dict = asdict(dataset_config)
custom_dataset_dict_override = load_config_file(custom_dataset_config)
# Override existing and add new params to dataset_config.
custom_dataset_dict.update(custom_dataset_dict_override)

custom_dataset_class = namedtuple("custom_dataset", custom_dataset_dict.keys())
dataset_config = custom_dataset_class(**custom_dataset_dict)
return dataset_config


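
A toy illustration of the merge performed in generate_dataset_config() above: the custom_dataset dataclass supplies the defaults, the JSON file both overrides them and adds new keys, and the result is exposed as a namedtuple so every key becomes an attribute (the override values below are illustrative):

from collections import namedtuple
from dataclasses import asdict, dataclass


@dataclass
class custom_dataset:
    dataset: str = "custom_dataset"
    data_path: str = ""


defaults = asdict(custom_dataset())
overrides = {"train_split": "train", "test_split_ratio": 0.15, "disc_style": "sarcasm_more"}
defaults.update(overrides)  # override existing and add new params

dataset_config = namedtuple("custom_dataset", defaults.keys())(**defaults)
print(dataset_config.disc_style)  # extra JSON keys surface as attributes
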
5 changes: 3 additions & 2 deletions QEfficient/finetune/utils/dataset_utils.py
@@ -64,8 +64,9 @@ def get_dataloader_kwargs(train_config, dataset, dataset_processer, split):
kwargs["drop_last"] = False
else:
kwargs["batch_size"] = batch_size
kwargs["drop_last"] = False
kwargs["collate_fn"] = DataCollatorForSeq2Seq(dataset_processer)
kwargs["drop_last"] = True
# TODO: Replace the hard-coded -100 with a variable or tokenizer.pad_token_id.
kwargs["collate_fn"] = DataCollatorForSeq2Seq(dataset_processer, label_pad_token_id=-100)
return kwargs


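
For context on the label_pad_token_id change: DataCollatorForSeq2Seq pads the labels column with that value, and -100 is the index PyTorch's cross-entropy loss ignores by default, so padded label positions do not contribute to the loss. A small sketch, with an illustrative tokenizer:

from transformers import AutoTokenizer, DataCollatorForSeq2Seq

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

collator = DataCollatorForSeq2Seq(tokenizer, label_pad_token_id=-100)

features = [
    {"input_ids": [1, 2, 3], "attention_mask": [1, 1, 1], "labels": [4, 5]},
    {"input_ids": [1, 2], "attention_mask": [1, 1], "labels": [4, 5, 6, 7]},
]
batch = collator(features)
print(batch["labels"])  # shorter label rows are padded with -100 and ignored by the loss
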
7 changes: 7 additions & 0 deletions QEfficient/finetune/utils/parser.py
@@ -43,6 +43,13 @@ def get_finetune_parser():
default=None,
help="Name of the tokenizer,if not passed as an argument, it uses the value of model_name",
)
parser.add_argument(
"--custom_dataset_config",
"--custom-dataset-config",
type=str,
default=None,
help="Path of custom dataset config json file to override the custom dataset params such as test_split_ratio, test_split etc.",
)
parser.add_argument(
"--run_validation",
"--run-validation",
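
A hedged sketch of how the new flag surfaces through the parser. Only --custom_dataset_config is defined in this diff; the JSON file name is an assumption, and any required flags not shown here would also need to be passed:

from QEfficient.finetune.utils.parser import get_finetune_parser

parser = get_finetune_parser()
# Hypothetical invocation for illustration only.
args = parser.parse_args(
    [
        "--model_name", "meta-llama/Llama-3.2-1B",
        "--custom_dataset_config", "disc_dataset_config.json",
    ]
)
print(args.custom_dataset_config)
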