clickprompt #106

Open · wants to merge 1 commit into master
14 changes: 14 additions & 0 deletions research/huawei-noah/ClickPrompt/README.md
@@ -0,0 +1,14 @@
# ClickPrompt: CTR Models are Strong Prompt Generators for Adapting Language Models to CTR Prediction

## Introduction
This is the MindSpore implementation of the paper [ClickPrompt: CTR Models are Strong Prompt Generators for Adapting Language Models to CTR Prediction](https://arxiv.org/abs/2310.09234).

## Requirements
- Python==3.9
- MindSpore==2.2.1
- mindformers==0.8.0
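
As an optional sanity check (a sketch, not part of the original repo), the pinned versions can be verified before running, assuming the packages are installed under the distribution names `mindspore` and `mindformers`:

~~~python
# Optional version check; expects MindSpore 2.2.1 and mindformers 0.8.0.
from importlib.metadata import version

for pkg in ("mindspore", "mindformers"):
    print(pkg, version(pkg))
~~~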

## Run Code
~~~bash
python -u main.py
~~~

73,903 changes: 73,903 additions & 0 deletions research/huawei-noah/ClickPrompt/data/ml-1m/proc_data/test.csv


665,111 changes: 665,111 additions & 0 deletions research/huawei-noah/ClickPrompt/data/ml-1m/proc_data/train.csv


106 changes: 106 additions & 0 deletions research/huawei-noah/ClickPrompt/dataset.py
@@ -0,0 +1,106 @@
import json
import h5py
import random
import pandas as pd
import numpy as np
from typing import Optional, Union, List, Dict, Tuple, Any

from datasets import Dataset

def load_csv_as_df(file_path):
dataset = pd.read_csv(file_path, dtype={'Zipcode': 'str'})
dataset['labels'] = dataset['Label']
dataset['Film genre'] = dataset['First genre']
fields = ["User ID", "Gender", "Age", "Job", "Zipcode", "Movie ID", "Title", "Film genre", "labels"]
dataset = dataset[fields]
return dataset


class PLM4CTRDataset(Dataset):
""" PLM4CTR Dataset
    The PLM4CTRDataset overrides the _getitem function of the Dataset class to include the templating step.
"""
def _post_setups(
self,
tokenizer,
shuffle_fields: bool,
meta_data_dir: str,
h5_data_dir: str,
mode: str,
model_fusion: str,
do_mlm_only: str,
):
""" Set up the parameters
Args:
tokenizer: Tokenizer from HuggingFace
shuffle_fields: Whether to shuffle the fields for lossless augmentation
meta_data_dir: The data path for meta CTR data
h5_data_dir: The data path for CTR data
mode: `train`/`test`
model_fusion: Method to fuse CTR & NLP model for prediction
do_mlm_only: Whether to do MLM pretraining only
"""
self.tokenizer = tokenizer
self.shuffle_fields = shuffle_fields
self.meta_data_dir = meta_data_dir
self.h5_data_dir = h5_data_dir
self.mode = mode
self.model_fusion = model_fusion
self.do_mlm_only = do_mlm_only

self.get_meta_data()
self.get_h5_data(mode)

def get_meta_data(self):
meta_data = json.load(open(self.meta_data_dir, 'r'))
self.field_names = meta_data['field_names']
self.feature_count = meta_data['feature_count']
self.feature_dict = meta_data['feature_dict']
self.feature_offset = meta_data['feature_offset']
self.num_fields = len(self.field_names)
self.input_size = sum(self.feature_count)

def get_h5_data(self, mode):
assert mode in ["train", "valid", "test"]
with h5py.File(self.h5_data_dir, 'r') as f:
mode_name = mode if mode != "valid" else "train"
self.ctr_X = f[f"{mode_name} data"][:]
self.ctr_Y = f[f"{mode_name} label"][:]
if mode == "train" and not self.do_mlm_only: # The validation set is also used for mlm pretraining.
self.ctr_X = self.ctr_X[:len(self.ctr_X) // 9 * 8]
self.ctr_Y = self.ctr_Y[:len(self.ctr_Y) // 9 * 8]
if mode == "valid":
self.ctr_X = self.ctr_X[len(self.ctr_X) // 9 * 8:]
self.ctr_Y = self.ctr_Y[len(self.ctr_Y) // 9 * 8:]
offset = np.array(self.feature_offset).reshape(1, self.num_fields)
assert self.__len__() == len(self.ctr_X)
assert self.__len__() == len(self.ctr_Y)
self.ctr_X += offset

def _getitem(self, key: Union[int, slice, str], decoded: bool = True, **kwargs) -> Union[Dict, List]:
""" Get Item from Tabular Data
Get one instance of the tabular data, permuted, converted to text and tokenized.
"""
if self.model_fusion == "ctr_only":
return self.ctr_X[key], self.ctr_Y[key]

row = self._data.fast_slice(key, 1)

shuffle_fields = list(row.column_names)
shuffle_fields.remove("labels")
if self.shuffle_fields:
random.shuffle(shuffle_fields)

shuffled_text = " ".join(
["%s is %s." % (field, str(row[field].to_pylist()[0]).strip()) for field in shuffle_fields]
)

tokenized_output = self.tokenizer(
shuffled_text,
padding="max_length",
max_length=100,
return_tensors="ms"
)

return self.ctr_X[key], self.ctr_Y[key], tokenized_output["input_ids"], tokenized_output["attention_mask"]
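
For illustration only (not part of this PR), the templating step in `PLM4CTRDataset._getitem` turns one tabular row into a sentence of `"<field> is <value>."` clauses before tokenization. The row below is made up; the field names follow `load_csv_as_df`:

~~~python
# Hypothetical example of the text template built in _getitem (values are invented).
row = {
    "User ID": 1, "Gender": "F", "Age": 1, "Job": 10, "Zipcode": "48067",
    "Movie ID": 661, "Title": "James and the Giant Peach (1996)", "Film genre": "Animation",
}
# Same pattern as _getitem: "<field> is <value>." clauses joined with spaces.
text = " ".join("%s is %s." % (field, str(value).strip()) for field, value in row.items())
print(text)
# -> User ID is 1. Gender is F. Age is 1. Job is 10. Zipcode is 48067. Movie ID is 661. ...
~~~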

112 changes: 112 additions & 0 deletions research/huawei-noah/ClickPrompt/main.py
@@ -0,0 +1,112 @@
import os
import re
import time
import pickle
import pandas as pd
import numpy as np
import mindspore as ms
from sklearn.metrics import roc_auc_score, log_loss
from mindspore import context, load_checkpoint, load_param_into_net
from mindspore.train import Model
from mindspore.dataset import GeneratorDataset
from mindformers import MindFormerConfig, LlamaConfig, TransformerOpParallelConfig, AutoTokenizer, LlamaForCausalLM, pipeline
from mindformers import init_context, ContextConfig, ParallelContextConfig

from dataset import load_csv_as_df, PLM4CTRDataset
from model import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, DeepCrossModel, DCN_LLaMA

context.set_context(mode=context.PYNATIVE_MODE, device_target="GPU")

tokenizer = AutoTokenizer.from_pretrained("llama2_7b")

llama_model = LlamaForCausalLM.from_pretrained("llama2_7b")

data_args = {
"train_file": "data/ml-1m/proc_data/train.csv",
"test_file": "data/ml-1m/proc_data/test.csv",
"h5": "data/ml-1m/proc_data/ctr.h5",
"h5_meta": "data/ml-1m/proc_data/ctr-meta.json",
}
train_df = load_csv_as_df(data_args["train_file"])
test_df = load_csv_as_df(data_args["test_file"])
total_datasets = {
"train": PLM4CTRDataset.from_pandas(train_df[:len(train_df) // 9 * 8]),
"valid": PLM4CTRDataset.from_pandas(train_df[len(train_df) // 9 * 8:]),
"test": PLM4CTRDataset.from_pandas(test_df),
}
for split_name in ["train", "valid", "test"]:
total_datasets[split_name]._post_setups(
tokenizer=tokenizer,
shuffle_fields=False,
meta_data_dir=data_args["h5_meta"],
h5_data_dir=data_args["h5"],
mode=split_name,
model_fusion="prefix",
do_mlm_only=False,
)


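# Build the DCN-style CTR model, fuse it with LLaMA, and wrap it with loss, training-step, and sigmoid-prediction heads.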
ctr_model = DeepCrossModel()
raw_net = DCN_LLaMA(ctr_model, llama_model)
loss_net = NetWithLossClass(raw_net)
train_net = TrainStepWrap(loss_net)
eval_net = PredictWithSigmoid(raw_net)

train_net.set_train()
train_net.network.network.llama.set_train(False)
model = Model(train_net)
train_dataset = GeneratorDataset(
source=total_datasets["train"],
column_names=["batch_ids", "label", "token", "attention_mask"],
shuffle=True
)
train_dataset = train_dataset.batch(128)

test_dataset = GeneratorDataset(
source=total_datasets["test"],
column_names=["batch_ids", "label", "token", "attention_mask"]
)
test_dataset = test_dataset.batch(128)

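# Evaluate the sigmoid-output network on a dataset; report AUC, log-loss, and wall-clock time.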
def evaluate(model, dataset):
batch_num = dataset.get_dataset_size()
batch_size = dataset.get_batch_size()
print('eval batch num', batch_num, 'batch size', batch_size)
eval_data = dataset.create_tuple_iterator()
begin_time = time.time()
pred_list, label_list = [], []

for _ in range(batch_num):
data = next(eval_data)
preds = model(*data)
pred_list.extend(preds.asnumpy().tolist())
label_list.extend(data[1].asnumpy().tolist())

eval_time = time.time() - begin_time
auc = roc_auc_score(y_true=label_list, y_score=pred_list)
logloss = log_loss(y_true=label_list, y_pred=pred_list)
return auc, logloss, eval_time

best_auc = 0
save_path = "./checkpoints"
patience, _patience = 0, 2  # early-stopping counter and its limit
# training
for epoch in range(5):
begin_time = time.time()
model.train(1, train_dataset)
train_time = time.time() - begin_time
eval_auc, eval_ll, eval_time = evaluate(eval_net, test_dataset)
print("EPOCH %d , train time: %.5f, test time: %.5f, auc: %.5f, "
"logloss: %.5f" % (epoch, train_time, eval_time, eval_auc, eval_ll))

if eval_auc > best_auc:
best_auc = eval_auc
ms.save_checkpoint(eval_net, save_path)
print('model save in', save_path)
patience = 0
else:
patience += 1
if patience >= _patience:
break
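
A minimal follow-up sketch (not part of this PR): restore the best checkpoint saved above with MindSpore's `load_checkpoint`/`load_param_into_net` (already imported at the top of main.py) and re-run evaluation. The checkpoint file name below is an assumption; point it at whatever file `ms.save_checkpoint` actually wrote for `save_path`.

~~~python
# Sketch: reload the best parameters into the evaluation network and re-evaluate.
ckpt_path = "./checkpoints.ckpt"            # assumed name; match the file on disk
param_dict = load_checkpoint(ckpt_path)     # read the saved parameter dict
load_param_into_net(eval_net, param_dict)   # load it into the sigmoid eval net
test_auc, test_ll, _ = evaluate(eval_net, test_dataset)
print("restored model: auc %.5f, logloss %.5f" % (test_auc, test_ll))
~~~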