From a4d0b552b6e83113ea019ec7acd15a743739af23 Mon Sep 17 00:00:00 2001 From: wangshuhe Date: Fri, 2 Apr 2021 13:54:53 +0800 Subject: [PATCH 1/7] feature and object --- .gitignore | 3 +- mmi_model/feature/config.py | 20 ++ mmi_model/feature/data.py | 39 ++++ mmi_model/feature/model.py | 66 ++++++ mmi_model/feature/optim.py | 26 +++ mmi_model/feature/test.py | 41 ++++ mmi_model/feature/train.py | 91 +++++++++ mmi_model/feature/utils.py | 80 ++++++++ mmi_model/feature/vocab.py | 27 +++ mmi_model/object/config.py | 21 ++ mmi_model/object/data.py | 48 +++++ mmi_model/object/model.py | 71 +++++++ mmi_model/object/optim.py | 26 +++ mmi_model/object/test.py | 41 ++++ mmi_model/object/train.py | 91 +++++++++ mmi_model/object/utils.py | 72 +++++++ mmi_model/object/vocab.py | 27 +++ mmi_model/preprocess/char2id.py | 30 +++ mmi_model/preprocess/get_dict.py | 23 +++ shuhe_mmi_feature_fairseq/MSELoss.py | 54 +++++ shuhe_mmi_feature_fairseq/feature/train.sh | 36 ++++ .../image_mmi_transformer.py | 190 ++++++++++++++++++ .../mmi_text_and_feature_dataset.py | 143 +++++++++++++ .../mmi_video_dialogue_task.py | 112 +++++++++++ 24 files changed, 1377 insertions(+), 1 deletion(-) create mode 100644 mmi_model/feature/config.py create mode 100644 mmi_model/feature/data.py create mode 100644 mmi_model/feature/model.py create mode 100644 mmi_model/feature/optim.py create mode 100644 mmi_model/feature/test.py create mode 100644 mmi_model/feature/train.py create mode 100644 mmi_model/feature/utils.py create mode 100644 mmi_model/feature/vocab.py create mode 100644 mmi_model/object/config.py create mode 100644 mmi_model/object/data.py create mode 100644 mmi_model/object/model.py create mode 100644 mmi_model/object/optim.py create mode 100644 mmi_model/object/test.py create mode 100644 mmi_model/object/train.py create mode 100644 mmi_model/object/utils.py create mode 100644 mmi_model/object/vocab.py create mode 100644 mmi_model/preprocess/char2id.py create mode 100644 mmi_model/preprocess/get_dict.py create mode 100644 shuhe_mmi_feature_fairseq/MSELoss.py create mode 100644 shuhe_mmi_feature_fairseq/feature/train.sh create mode 100644 shuhe_mmi_feature_fairseq/image_mmi_transformer.py create mode 100644 shuhe_mmi_feature_fairseq/mmi_text_and_feature_dataset.py create mode 100644 shuhe_mmi_feature_fairseq/mmi_video_dialogue_task.py diff --git a/.gitignore b/.gitignore index feb7284..eae9ca6 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ # Byte-compiled / optimized / DLL files +result/ __pycache__/ *.py[cod] *$py.class @@ -139,4 +140,4 @@ cython_debug/ .idea/ # shuhe -shuhe/ \ No newline at end of file +shuhe/ diff --git a/mmi_model/feature/config.py b/mmi_model/feature/config.py new file mode 100644 index 0000000..fc4986e --- /dev/null +++ b/mmi_model/feature/config.py @@ -0,0 +1,20 @@ +cuda = True + +dict_path = "/data/wangshuhe/test_mmi/mmi.dict" +data_dir = "/data/wangshuhe/test_mmi" +save_path = "/home/wangshuhe/shuhework/OpenViDial/mmi_model/feature/result" +model_path = "/home/wangshuhe/shuhework/OpenViDial/mmi_model/feature/result" + +train_batch_size = 96 +feature_dim = 1000 +d_model = 512 +nhead = 8 +dim_feedforward = 2048 +layer = 6 +dropout = 0.1 +wram_up = 4000 +max_epoch = 20 + +dev_batch_size = 64 + +test_batch_size = 32 \ No newline at end of file diff --git a/mmi_model/feature/data.py b/mmi_model/feature/data.py new file mode 100644 index 0000000..8085c1a --- /dev/null +++ b/mmi_model/feature/data.py @@ -0,0 +1,39 @@ +# encoding: utf-8 +import numpy as np +from torch.utils.data import Dataset 
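For reference, a minimal sketch (not part of the patch) of the on-disk layout that the FeatureDataset below assumes: per-dialogue sentence counts in `{split}.sent_num.npy`, per-dialogue start offsets in `{split}.offsets.npy`, and a flat float32 memmap with one feature row per sentence. The toy sizes and file names here are hypothetical; the real paths come from the helpers in utils.py.

import numpy as np

sent_num = np.array([3, 2])                  # sentences per dialogue (toy values)
offsets  = np.array([0, 3])                  # index of each dialogue's first sentence
total    = int(sent_num[-1] + offsets[-1])   # total feature rows, as computed in data.py
dim      = 1000                              # config.feature_dim

feats = np.memmap("train.features.mmap", dtype="float32", mode="w+", shape=(total, dim))
feats[:] = 0.0                               # one 1000-d vector per sentence
feats.flush()
np.save("train.sent_num.npy", sent_num)
np.save("train.offsets.npy", offsets)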
+from utils import warmup_mmap_file, feature_file, sent_num_file, offsets_file, read_sents + + +class FeatureDataset(Dataset): + """Load Feature dataset""" + def __init__(self, data_dir, split="train"): + self.data_dir = data_dir + self.dim = 1000 + self.sents = read_sents(data_dir, split) + if (split == 'train' or split == 'valid'): + self.sent_num = np.load(sent_num_file(data_dir, split)) + self.offsets = np.load(offsets_file(data_dir, split)) + self.total_num = self.sent_num[-1] + self.offsets[-1] + self.pair_id = self.get_train_dialogue(data_dir) + else: + self.total_num = len(self.sents) - 1 + self.pair_id = [i for i in range(1, len(self.sents))] + warmup_mmap_file(feature_file(data_dir, split)) + self.features = np.memmap(feature_file(data_dir, split), dtype='float32', mode='r', + shape=(self.total_num, self.dim)) + + def __getitem__(self, item): + return self.sents[self.pair_id[item]], self.features[self.pair_id[item]-1] + + def __len__(self): + return len(self.pair_id) + + def get_train_dialogue(self, data_dir): + tmp = [] + start_ = 0 + for dialogue_id in range(self.sent_num.shape[0]): + num = int(self.sent_num[dialogue_id]) + for i in range(1, num): + tmp.append(start_+i) + start_ += num + return tmp \ No newline at end of file diff --git a/mmi_model/feature/model.py b/mmi_model/feature/model.py new file mode 100644 index 0000000..97f80d9 --- /dev/null +++ b/mmi_model/feature/model.py @@ -0,0 +1,66 @@ +import torch +import torch.nn as nn +import math +import config + +class MMI(nn.Module): + + def __init__(self, vocab, device): + super(MMI, self).__init__() + self.device = device + self.vocab = vocab + self.embeddings = nn.Embedding(num_embeddings=len(vocab), embedding_dim=config.d_model, padding_idx=self.vocab['']) + self.encoder_layer = nn.TransformerEncoderLayer(d_model=config.d_model, nhead=config.nhead, dim_feedforward=config.dim_feedforward, dropout=config.dropout) + self.encoder_norm = nn.LayerNorm(config.d_model) + self.encoder = nn.TransformerEncoder(encoder_layer=self.encoder_layer, num_layers=config.layer, norm=self.encoder_norm) + self.final = nn.Linear(in_features=config.d_model, out_features=config.feature_dim, bias=True) + self.Dropout = nn.Dropout(config.dropout) + + def forward(self, src_text, text_len, src_image): + text_tensor = torch.tensor(src_text, dtype=torch.long, device=self.device).t() + image_tensor = torch.tensor(src_image, dtype=torch.float, device=self.device) + text_feature, text_mask = self.encode(text_tensor) + text_feature = self.final(text_feature) + text_feature = text_feature.permute(1, 0, 2) # sen_len * batch_size * feature -> batch_size * sen_len * feature + image_tensor = torch.unsqueeze(image_tensor, dim=-1) + output = torch.nn.functional.sigmoid(torch.matmul(text_feature, image_tensor).squeeze(dim=-1)) * text_mask # batch_size * sen_len + len_tensor = torch.tensor(text_len, dtype=torch.float, device=self.device) + return output.sum(dim=-1)/len_tensor + + def encode(self, src_tensor): + S = src_tensor.shape[0] + N = src_tensor.shape[1] + padding_mask = (src_tensor == self.vocab['']).bool().t().to(self.device) + embed_tensor = self.Dropout(self.embeddings(src_tensor).to(self.device)+self.get_position(S, N)) + output = self.encoder(embed_tensor, src_key_padding_mask=padding_mask) + return output, padding_mask # sen_len * batch_size * feature_size, batch_size * sen_len + + def get_position(self, sen_len, batch_size): + pre_PE = [] + for i in range(sen_len): + shuhe = [] + for j in range(config.d_model): + if (j % 2 == 0): + 
shuhe.append(math.sin(i/math.pow(10000, j/config.d_model))) + else: + shuhe.append(math.cos(i/math.pow(10000, (j-1)/config.d_model))) + pre_PE.append(shuhe) + pre_PE = torch.tensor(pre_PE, dtype=torch.float, device=self.device) + pre_PE = pre_PE.reshape(pre_PE.shape[0], 1, pre_PE.shape[1]) + pre_PE = pre_PE.expand(pre_PE.shape[0], batch_size, pre_PE.shape[2]) + return pre_PE + + def save(self, model_path): + params = { + 'vocab': self.vocab, + 'device': self.device, + 'state_dict': self.state_dict() + } + torch.save(params, model_path) + + @staticmethod + def load(model_path): + params = torch.load(model_path, map_location=lambda storage, loc: storage) + model = MMI(params['vocab'], params['device']) + model.load_state_dict(params['state_dict']) + return model \ No newline at end of file diff --git a/mmi_model/feature/optim.py b/mmi_model/feature/optim.py new file mode 100644 index 0000000..1a2f005 --- /dev/null +++ b/mmi_model/feature/optim.py @@ -0,0 +1,26 @@ +import math + +class Optim(): + + def __init__(self, optimizer, d_model, warm_up_step): + self.optimizer = optimizer + self.d_model = d_model + self.warm_up_step = warm_up_step + self.n_current_step = 0 + self.init_lr = math.pow(self.d_model, -0.5) + + def step_and_updata_lr(self): + self.updata_lr() + self.optimizer.step() + + def get_lr(self): + return min(math.pow(self.n_current_step, -0.5), math.pow(self.warm_up_step, -1.5)*self.n_current_step) + + def updata_lr(self): + self.n_current_step += 1 + lr = self.init_lr * self.get_lr() + for para in self.optimizer.param_groups: + para['lr'] = lr + + def zero_grad(self): + self.optimizer.zero_grad() \ No newline at end of file diff --git a/mmi_model/feature/test.py b/mmi_model/feature/test.py new file mode 100644 index 0000000..10f297d --- /dev/null +++ b/mmi_model/feature/test.py @@ -0,0 +1,41 @@ +import argparse +import torch +from tqdm import tqdm +import sys +import os +from torch.utils.data import DataLoader +from data import FeatureDataset +from utils import get_batch, padding +from model import MMI +import toch.nn as nn +import math + +os.environ['CUDA_VISIBLE_DEVICES'] = '3' + +def test(): + print(f"load test data from [{config.data_dir}]", file=sys.stderr) + test_data = FeatureDataset(config.data_dir, split='test') + test_data_loader = DataLoader(dataset=test_data, batch_size=config.test_batch_size, shuffle=False, collate_fn=get_batch) + model = MMI.load(config.model_path) + if (config.cuda): + model = model.to(torch.device("cuda:0")) + sum_loss = 0 + with torch.no_grad(): + max_iter = int(math.ceil(len(test_data)/config.test_batch_size)) + with tqdm(total=max_iter, desc="test") as pbar: + for batch_text, text_len, batch_image in test_data_loader: + batch_size = len(batch_text) + batch_text = padding(batch_text, model.vocab.word2id['']) + loss = model(batch_text, text_len, batch_image) + target = torch.ones(batch_size, dtype=torch.float, device=model.device) + loss = nn.functional.mse_loss(loss, target, reduction='mean') + pbar.set_postfix({"avg_loss": '{%.3f}' % (loss.item())}) + pbar.update(1) + sum_loss += loss + print(f"loss of test : {sum_loss.item()}") + +def main(): + test() + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/mmi_model/feature/train.py b/mmi_model/feature/train.py new file mode 100644 index 0000000..0583d4c --- /dev/null +++ b/mmi_model/feature/train.py @@ -0,0 +1,91 @@ +import config +import torch +import torch.nn as nn +from model import MMI +import math +from tqdm import tqdm +import sys +import os +from optim import 
Optim +from data import FeatureDataset +from torch.utils.data import DataLoader +from vocab import Vocab +from utils import get_batch, padding + +os.environ['CUDA_VISIBLE_DEVICES'] = '0' + +def evaluate(model, dev_data, dev_loader): + flag = model.training + model.eval() + sum_loss = 0 + with torch.no_grad(): + max_iter = int(math.ceil(len(dev_data)/config.dev_batch_size)) + with tqdm(total=max_iter, desc="validation") as pbar: + for batch_text, text_len, batch_image in dev_loader: + batch_size = len(batch_text) + batch_text = padding(batch_text, model.vocab.word2id['']) + loss = model(batch_text, text_len, batch_image) + target = torch.ones(batch_size, dtype=torch.float, device=model.device) + loss = nn.functional.mse_loss(loss, target, reduction='mean') + pbar.set_postfix({"avg_loss": '{%.3f}' % (loss.item())}) + pbar.update(1) + sum_loss += loss + if (flag): + model.train() + return sum_loss.item() + +def train(): + torch.manual_seed(1) + if (config.cuda): + torch.cuda.manual_seed(1) + + vocab = Vocab(config.dict_path) + train_data = FeatureDataset(config.data_dir, split='train') + dev_data = FeatureDataset(config.data_dir, split='valid') + train_loader = DataLoader(dataset=train_data, batch_size=config.train_batch_size, shuffle=True, collate_fn=get_batch) + dev_loader = DataLoader(dataset=dev_data, batch_size=config.dev_batch_size, shuffle=True, collate_fn=get_batch) + device = torch.device("cuda:0" if config.cuda else "cpu") + + model = MMI(vocab, device) + model = model.to(device) + model.train() + + optimizer = Optim(torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-9), config.feature_dim, config.wram_up) + + epoch = 0 + history_vlaid = [] + print("begin training!", file=sys.stderr) + while (True): + epoch += 1 + max_iter = int(math.ceil(len(train_data)/config.train_batch_size)) + with tqdm(total=max_iter, desc="train") as pbar: + for batch_text, text_len, batch_image in train_loader: + batch_text = padding(batch_text, vocab.word2id['']) + optimizer.zero_grad() + batch_size = len(batch_text) + loss = model(batch_text, text_len, batch_image) + target = torch.ones(batch_size, dtype=torch.float, device=device) + loss = nn.functional.mse_loss(loss, target, reduction='mean') + loss.backward() + optimizer.step_and_updata_lr() + pbar.set_postfix({"epoch": epoch, "avg_loss": '{%.3f}' % (loss.item())}) + pbar.update(1) + if (epoch % config.valid_iter == 0): + print("now begin validation ...", file=sys.stderr) + eval_loss = evaluate(model, dev_data, dev_loader) + print(eval_loss) + flag = len(history_vlaid) == 0 or eval_loss < min(history_vlaid) + if (flag): + print(f"current model is the best! 
save to [{config.save_path}]", file=sys.stderr) + history_vlaid.append(eval_loss) + model.save(os.path.join(config.save_path, f"{epoch}_{eval_loss}_checkpoint.pth")) + torch.save(optimizer.optimizer.state_dict(), os.path.join(config.save_path, f"{epoch}_{eval_loss}_optimizer.optim")) + if (epoch == config.max_epoch): + print("reach the maximum number of epochs!", file=sys.stderr) + return + +def main(): + train() + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/mmi_model/feature/utils.py b/mmi_model/feature/utils.py new file mode 100644 index 0000000..5b679fd --- /dev/null +++ b/mmi_model/feature/utils.py @@ -0,0 +1,80 @@ +# encoding: utf-8 +""" +@author: Yuxian Meng +@contact: yuxian_meng@shannonai.com + +@version: 1.0 +@file: path_utils +@time: 2020/11/14 12:13 +@desc: + +""" +import os + +def sent_num_file(data_dir, split): + return os.path.join(data_dir, f"{split}.sent_num.npy") + +def offsets_file(data_dir, split): + return os.path.join(data_dir, f"{split}.offsets.npy") + +def feature_file(data_dir, split): + return os.path.join(data_dir, f"{split}.features.mmap") + +def object_file(data_dir, split, truncate=0): + return os.path.join(data_dir, f"{split}.objects.mmap")+(f".{truncate}" if truncate else "") + +def object_mask_file(data_dir, split, truncate=0): + return os.path.join(data_dir, f"{split}.objects_mask.mmap")+(f".{truncate}" if truncate else "") + +def src_file(data_dir, split): + return os.path.join(data_dir, f"{split}.src.txt") + +def nmt_src_file(data_dir, split): + return os.path.join(data_dir, f"{split}.src-tgt.src") + +def nmt_tgt_file(data_dir, split): + return os.path.join(data_dir, f"{split}.src-tgt.tgt") + +def text_bin_file(data_dir, split): + return os.path.join(data_dir, split) + +def img_file(data_dir, group_idx, sent_idx): + return os.path.join(data_dir, f"img_dir{group_idx}", f"{sent_idx}.jpg") + +def warmup_mmap_file(path): + with open(path, 'rb') as stream: + while stream.read(10 * 1024 * 1024): + pass + +def padding(sents, pad_word): + ''' + sents: list[list[int]] + ''' + max_ = max(len(sen) for sen in sents) + padding_sents = [[pad_word for j in range(max_)] for i in range(len(sents))] + for i in range(len(sents)): + padding_sents[i][0:len(sents[i])] = sents[i][:] + return padding_sents + +def read_sents(path, split): + output = [] + if (split == 'test'): + output.append([0]) + with open(os.path.join(path, split+'.mmi'), "r") as f: + for line in f: + line = line.strip().split() + for i in range(len(line)): + line[i] = int(line[i]) + output.append(line) + f.close() + return output + +def get_batch(sample): + batch_text = [] + batch_image = [] + text_len = [] + for text_sample, image_sample in sample: + batch_text.append(text_sample) + batch_image.append(image_sample) + text_len.append(len(text_sample)) + return batch_text, text_len, batch_image \ No newline at end of file diff --git a/mmi_model/feature/vocab.py b/mmi_model/feature/vocab.py new file mode 100644 index 0000000..23bc699 --- /dev/null +++ b/mmi_model/feature/vocab.py @@ -0,0 +1,27 @@ +class Vocab(object): + + def __init__(self, file): + self.word2id = dict() + word_cnt = 0 + with open(file, "r") as f: + for line in f: + line = line.strip() + self.word2id[line] = word_cnt + word_cnt += 1 + f.close() + self.id2word = dict() + for key, value in self.word2id.items(): + self.id2word[value] = key + + def __getitem__(self, word): + return self.word2id[word] + + def __len__(self): + return len(self.word2id) + + def __contains__(self, word): + return word in self.word2id + 
+ def id2word(self, id): + return self.id2word[id] + \ No newline at end of file diff --git a/mmi_model/object/config.py b/mmi_model/object/config.py new file mode 100644 index 0000000..79691e3 --- /dev/null +++ b/mmi_model/object/config.py @@ -0,0 +1,21 @@ +cuda = True +max_obj = 20 + +dict_path = "/data/wangshuhe/test_mmi/mmi.dict" +data_dir = "/data/wangshuhe/test_mmi" +save_path = "/home/wangshuhe/shuhework/OpenViDial/mmi_model/feature/result" +model_path = "" + +train_batch_size = 32 +feature_dim = 2048 +d_model = 512 +nhead = 8 +dim_feedforward = 2048 +layer = 6 +dropout = 0.1 +wram_up = 4000 +max_epoch = 20 + +dev_batch_size = 32 + +test_batch_size = 32 \ No newline at end of file diff --git a/mmi_model/object/data.py b/mmi_model/object/data.py new file mode 100644 index 0000000..85baeeb --- /dev/null +++ b/mmi_model/object/data.py @@ -0,0 +1,48 @@ +# encoding: utf-8 +import numpy as np +import config +from torch.utils.data import Dataset +from utils import sent_num_file, offsets_file, object_file, object_mask_file, warmup_mmap_file, read_sents + +class ObjectDataset(Dataset): + """Load Object dataset""" + def __init__(self, data_dir, split="train"): + self.data_dir = data_dir + self.dim = 2048 # todo add x,y,w,h + self.max_obj = config.max_obj # max-obj when getting item + self.sents = read_sents(data_dir, split) + if (split == 'train' or split == 'valid'): + self.sent_num = np.load(sent_num_file(data_dir, split)) + self.offsets = np.load(offsets_file(data_dir, split)) + self.total_sent_num = self.offsets[-1] + self.sent_num[-1] + self.pair_id = self.get_train_dialogue(data_dir) + else: + self.total_sent_num = len(self.sents) - 1 + self.pair_id = [i for i in range(1, len(self.sents))] + warmup_mmap_file(object_file(data_dir, split)) + self.objects = np.memmap(object_file(data_dir, split), dtype=np.float32, mode='r', + shape=(self.total_sent_num, self.max_obj, self.dim)) + warmup_mmap_file(object_mask_file(data_dir, split)) + self.objects_mask = np.memmap(object_mask_file(data_dir, split), dtype=np.bool, mode='r', + shape=(self.total_sent_num, self.max_obj)) + + def __getitem__(self, item): + """ + Returns: + 1. object features, [self.max_object, self.dim] + 2. 
object_mask, [self.max_object], 0 means no object + """ + return self.sents[self.pair_id[item]], self.objects[self.pair_id[item]-1][: self.max_obj], self.objects_mask[self.pair_id[item]-1][: self.max_obj] + + def __len__(self): + return len(self.pair_id) + + def get_train_dialogue(self, data_dir): + tmp = [] + start_ = 0 + for dialogue_id in range(self.sent_num.shape[0]): + num = int(self.sent_num[dialogue_id]) + for i in range(1, num): + tmp.append(start_+i) + start_ += num + return tmp diff --git a/mmi_model/object/model.py b/mmi_model/object/model.py new file mode 100644 index 0000000..1c1dbca --- /dev/null +++ b/mmi_model/object/model.py @@ -0,0 +1,71 @@ +import torch +import torch.nn as nn +import math +import config + +class MMI(nn.Module): + + def __init__(self, vocab, device): + super(MMI, self).__init__() + self.device = device + self.vocab = vocab + self.embeddings = nn.Embedding(num_embeddings=len(vocab), embedding_dim=config.d_model, padding_idx=self.vocab['']) + self.encoder_layer = nn.TransformerEncoderLayer(d_model=config.d_model, nhead=config.nhead, dim_feedforward=config.dim_feedforward, dropout=config.dropout) + self.encoder_norm = nn.LayerNorm(config.d_model) + self.encoder = nn.TransformerEncoder(encoder_layer=self.encoder_layer, num_layers=config.layer, norm=self.encoder_norm) + self.final = nn.Linear(in_features=config.d_model, out_features=config.feature_dim, bias=True) + self.Dropout = nn.Dropout(config.dropout) + + def forward(self, src_text, text_len, src_image, src_image_mask): + text_tensor = torch.tensor(src_text, dtype=torch.long, device=self.device).t() + image_tensor = torch.tensor(src_image, dtype=torch.float, device=self.device) + image_mask_tensor = torch.tensor(src_image_mask, dtype=torch.float, device=self.device) + text_feature, text_mask = self.encode(text_tensor) + text_feature = self.final(text_feature) + text_feature = text_feature.permute(1, 0, 2) # sen_len * batch_size * feature -> batch_size * sen_len * feature + image_tensor = image_tensor.permute(0, 2, 1) # batch_size * max_obj * feature -> batch_size * feature * max_obj + output = torch.nn.functional.sigmoid(torch.matmul(text_feature, image_tensor)) # batch_size * sen_len * max_obj + image_out_mask_tensor = torch.unsqueeze(image_mask_tensor, dim=1) + image_out_mask_tensor = image_out_mask_tensor.expand(image_out_mask_tensor.shape[0], output.shape[1], image_out_mask_tensor[2]) # batch_size * sen_len * max_obj + output = output * image_out_mask_tensor + output = output.sum(dim=-1) * text_mask # batch_size * sen_len + len_tensor = torch.tensor(text_len, dtype=torch.float, device=self.device)+image_mask_tensor.sum(dim=-1) + return output.sum(dim=-1)/len_tensor + + def encode(self, src_tensor): + S = src_tensor.shape[0] + N = src_tensor.shape[1] + padding_mask = (src_tensor == self.vocab['']).bool().t().to(self.device) + embed_tensor = self.Dropout(self.embeddings(src_tensor).to(self.device)+self.get_position(S, N)) + output = self.encoder(embed_tensor, src_key_padding_mask=padding_mask) + return output, padding_mask # sen_len * batch_size * feature_size, batch_size * sen_len + + def get_position(self, sen_len, batch_size): + pre_PE = [] + for i in range(sen_len): + shuhe = [] + for j in range(config.d_model): + if (j % 2 == 0): + shuhe.append(math.sin(i/math.pow(10000, j/config.d_model))) + else: + shuhe.append(math.cos(i/math.pow(10000, (j-1)/config.d_model))) + pre_PE.append(shuhe) + pre_PE = torch.tensor(pre_PE, dtype=torch.float, device=self.device) + pre_PE = pre_PE.reshape(pre_PE.shape[0], 
1, pre_PE.shape[1]) + pre_PE = pre_PE.expand(pre_PE.shape[0], batch_size, pre_PE.shape[2]) + return pre_PE + + def save(self, model_path): + params = { + 'vocab': self.vocab, + 'device': self.device, + 'state_dict': self.state_dict() + } + torch.save(params, model_path) + + @staticmethod + def load(model_path): + params = torch.load(model_path, map_location=lambda storage, loc: storage) + model = MMI(params['vocab'], params['device']) + model.load_state_dict(params['state_dict']) + return model \ No newline at end of file diff --git a/mmi_model/object/optim.py b/mmi_model/object/optim.py new file mode 100644 index 0000000..1a2f005 --- /dev/null +++ b/mmi_model/object/optim.py @@ -0,0 +1,26 @@ +import math + +class Optim(): + + def __init__(self, optimizer, d_model, warm_up_step): + self.optimizer = optimizer + self.d_model = d_model + self.warm_up_step = warm_up_step + self.n_current_step = 0 + self.init_lr = math.pow(self.d_model, -0.5) + + def step_and_updata_lr(self): + self.updata_lr() + self.optimizer.step() + + def get_lr(self): + return min(math.pow(self.n_current_step, -0.5), math.pow(self.warm_up_step, -1.5)*self.n_current_step) + + def updata_lr(self): + self.n_current_step += 1 + lr = self.init_lr * self.get_lr() + for para in self.optimizer.param_groups: + para['lr'] = lr + + def zero_grad(self): + self.optimizer.zero_grad() \ No newline at end of file diff --git a/mmi_model/object/test.py b/mmi_model/object/test.py new file mode 100644 index 0000000..f73a49a --- /dev/null +++ b/mmi_model/object/test.py @@ -0,0 +1,41 @@ +import config +import torch +from tqdm import tqdm +import sys +import os +from torch.utils.data import DataLoader +from data import FeatureDataset +from utils import get_batch, padding +from model import MMI +import toch.nn as nn +import math + +os.environ['CUDA_VISIBLE_DEVICES'] = '3' + +def test(): + print(f"load test data from [{config.data_dir}]", file=sys.stderr) + test_data = FeatureDataset(config.data_dir, split='test') + test_data_loader = DataLoader(dataset=test_data, batch_size=config.test_batch_size, shuffle=False, collate_fn=get_batch) + model = MMI.load(config.model_path) + if (config.cuda): + model = model.to(torch.device("cuda:0")) + sum_loss = 0 + with torch.no_grad(): + max_iter = int(math.ceil(len(test_data)/config.test_batch_size)) + with tqdm(total=max_iter, desc="test") as pbar: + for batch_text, text_len, batch_image, batch_image_mask in test_data_loader: + batch_size = len(batch_text) + batch_text = padding(batch_text, model.vocab.word2id['']) + loss = model(batch_text, text_len, batch_image, batch_image_mask) + target = torch.ones(batch_size, dtype=torch.float, device=model.device) + loss = nn.functional.mse_loss(loss, target, reduction='mean') + pbar.set_postfix({"avg_loss": '{%.3f}' % (loss.item())}) + pbar.update(1) + sum_loss += loss + print(f"loss of test : {sum_loss.item()}") + +def main(): + test() + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/mmi_model/object/train.py b/mmi_model/object/train.py new file mode 100644 index 0000000..ff6f0e7 --- /dev/null +++ b/mmi_model/object/train.py @@ -0,0 +1,91 @@ +import config +import torch +import torch.nn as nn +from model import MMI +import math +from tqdm import tqdm +import sys +import os +from optim import Optim +from data import FeatureDataset +from torch.utils.data import DataLoader +from vocab import Vocab +from utils import get_batch, padding + +os.environ['CUDA_VISIBLE_DEVICES'] = '2' + +def evaluate(model, dev_data, dev_loader): + flag = 
model.training + model.eval() + sum_loss = 0 + with torch.no_grad(): + max_iter = int(math.ceil(len(dev_data)/config.dev_batch_size)) + with tqdm(total=max_iter, desc="validation") as pbar: + for batch_text, text_len, batch_image, batch_image_mask in dev_loader: + batch_size = len(batch_text) + batch_text = padding(batch_text, model.vocab.word2id['']) + loss = model(batch_text, text_len, batch_image, batch_image_mask) + target = torch.ones(batch_size, dtype=torch.float, device=model.device) + loss = nn.functional.mse_loss(loss, target, reduction='mean') + pbar.set_postfix({"avg_loss": '{%.3f}' % (loss.item())}) + pbar.update(1) + sum_loss += loss + if (flag): + model.train() + return sum_loss.item() + +def train(): + torch.manual_seed(1) + if (config.cuda): + torch.cuda.manual_seed(1) + + vocab = Vocab(config.dict_path) + train_data = FeatureDataset(config.data_dir, split='train') + dev_data = FeatureDataset(config.data_dir, split='valid') + train_loader = DataLoader(dataset=train_data, batch_size=config.train_batch_size, shuffle=True, collate_fn=get_batch) + dev_loader = DataLoader(dataset=dev_data, batch_size=config.dev_batch_size, shuffle=True, collate_fn=get_batch) + device = torch.device("cuda:0" if config.cuda else "cpu") + + model = MMI(vocab, device) + model = model.to(device) + model.train() + + optimizer = Optim(torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-9), config.feature_dim, config.wram_up) + + epoch = 0 + history_vlaid = [] + print("begin training!", file=sys.stderr) + while (True): + epoch += 1 + max_iter = int(math.ceil(len(train_data)/config.train_batch_size)) + with tqdm(total=max_iter, desc="train") as pbar: + for batch_text, text_len, batch_image, batch_image_mask in train_loader: + batch_text = padding(batch_text, vocab.word2id['']) + optimizer.zero_grad() + batch_size = len(batch_text) + loss = model(batch_text, text_len, batch_image, batch_image_mask) + target = torch.ones(batch_size, dtype=torch.float, device=device) + loss = nn.functional.mse_loss(loss, target, reduction='mean') + loss.backward() + optimizer.step_and_updata_lr() + pbar.set_postfix({"epoch": epoch, "avg_loss": '{%.3f}' % (loss.item())}) + pbar.update(1) + if (epoch % config.valid_iter == 0): + print("now begin validation ...", file=sys.stderr) + eval_loss = evaluate(model, dev_data, dev_loader) + print(eval_loss) + flag = len(history_vlaid) == 0 or eval_loss < min(history_vlaid) + if (flag): + print(f"current model is the best! 
save to [{config.save_path}]", file=sys.stderr) + history_vlaid.append(eval_loss) + model.save(os.path.join(config.save_path, f"{epoch}_{eval_loss}_checkpoint.pth")) + torch.save(optimizer.optimizer.state_dict(), os.path.join(config.save_path, f"{epoch}_{eval_loss}_optimizer.optim")) + if (epoch == config.max_epoch): + print("reach the maximum number of epochs!", file=sys.stderr) + return + +def main(): + train() + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/mmi_model/object/utils.py b/mmi_model/object/utils.py new file mode 100644 index 0000000..91117f2 --- /dev/null +++ b/mmi_model/object/utils.py @@ -0,0 +1,72 @@ +# encoding: utf-8 +import os + +def sent_num_file(data_dir, split): + return os.path.join(data_dir, f"{split}.sent_num.npy") + +def offsets_file(data_dir, split): + return os.path.join(data_dir, f"{split}.offsets.npy") + +def feature_file(data_dir, split): + return os.path.join(data_dir, f"{split}.features.mmap") + +def object_file(data_dir, split, truncate=0): + return os.path.join(data_dir, f"{split}.objects.mmap")+(f".{truncate}" if truncate else "") + +def object_mask_file(data_dir, split, truncate=0): + return os.path.join(data_dir, f"{split}.objects_mask.mmap")+(f".{truncate}" if truncate else "") + +def src_file(data_dir, split): + return os.path.join(data_dir, f"{split}.src.txt") + +def nmt_src_file(data_dir, split): + return os.path.join(data_dir, f"{split}.src-tgt.src") + +def nmt_tgt_file(data_dir, split): + return os.path.join(data_dir, f"{split}.src-tgt.tgt") + +def text_bin_file(data_dir, split): + return os.path.join(data_dir, split) + +def img_file(data_dir, group_idx, sent_idx): + return os.path.join(data_dir, f"img_dir{group_idx}", f"{sent_idx}.jpg") + +def warmup_mmap_file(path): + with open(path, 'rb') as stream: + while stream.read(10 * 1024 * 1024): + pass + +def padding(sents, pad_word): + ''' + sents: list[list[int]] + ''' + max_ = max(len(sen) for sen in sents) + padding_sents = [[pad_word for j in range(max_)] for i in range(len(sents))] + for i in range(len(sents)): + padding_sents[i][0:len(sents[i])] = sents[i][:] + return padding_sents + +def read_sents(path, split): + output = [] + if (split == 'test'): + output.append([0]) + with open(os.path.join(path, split+'.mmi'), "r") as f: + for line in f: + line = line.strip().split() + for i in range(len(line)): + line[i] = int(line[i]) + output.append(line) + f.close() + return output + +def get_batch(sample): + batch_text = [] + batch_image = [] + batch_image_mask = [] + text_len = [] + for text_sample, image_sample, image_sample_mask in sample: + batch_text.append(text_sample) + batch_image.append(image_sample) + batch_image_mask.append(image_sample_mask) + text_len.append(text_sample) + return batch_text, text_len, batch_image, batch_image_mask \ No newline at end of file diff --git a/mmi_model/object/vocab.py b/mmi_model/object/vocab.py new file mode 100644 index 0000000..23bc699 --- /dev/null +++ b/mmi_model/object/vocab.py @@ -0,0 +1,27 @@ +class Vocab(object): + + def __init__(self, file): + self.word2id = dict() + word_cnt = 0 + with open(file, "r") as f: + for line in f: + line = line.strip() + self.word2id[line] = word_cnt + word_cnt += 1 + f.close() + self.id2word = dict() + for key, value in self.word2id.items(): + self.id2word[value] = key + + def __getitem__(self, word): + return self.word2id[word] + + def __len__(self): + return len(self.word2id) + + def __contains__(self, word): + return word in self.word2id + + def id2word(self, id): + return 
self.id2word[id] + \ No newline at end of file diff --git a/mmi_model/preprocess/char2id.py b/mmi_model/preprocess/char2id.py new file mode 100644 index 0000000..e57b680 --- /dev/null +++ b/mmi_model/preprocess/char2id.py @@ -0,0 +1,30 @@ +import os + +dict_dir = "/data/wangshuhe/test_mmi/mmi.dict" +data_dir = "/data/wangshuhe/test_mmi" + +print("read dict ...") +word_dict = {} +word_cnt = 0 +with open(dict_dir, "r") as f: + for line in f: + line = line.strip() + word_dict[line] = word_cnt + word_cnt += 1 + f.close() + +print("preprocess data ...") +for sub_name in ['train', 'valid', 'test']: + print(f"{sub_name} ...") + with open(os.path.join(data_dir, sub_name+'.src.txt'), "r") as read_file, open(os.path.join(data_dir, sub_name+'.mmi'), "w") as write_file: + for line in read_file: + line = line.strip().split() + new_line = "" + for word in line: + if (word not in word_dict): + new_line += str(word_dict['']) + " " + else: + new_line += str(word_dict[word]) + " " + write_file.write(new_line+'\n') + read_file.close() + write_file.close() \ No newline at end of file diff --git a/mmi_model/preprocess/get_dict.py b/mmi_model/preprocess/get_dict.py new file mode 100644 index 0000000..4d088a6 --- /dev/null +++ b/mmi_model/preprocess/get_dict.py @@ -0,0 +1,23 @@ +word = {} +word[''] = 0 +word[''] = 1 +word_cnt = 2 + +input_path = "/data/wangshuhe/test_mmi/train.src.txt" +output_path = "/data/wangshuhe/test_mmi/mmi.dict" + +print("read ...") +with open(input_path, "r") as f: + for line in f: + line = line.strip().split() + for sub_word in line: + if (sub_word not in word): + word[sub_word] = word_cnt + word_cnt += 1 + f.close() + +print("write ...") +with open(output_path, "w") as f: + for key, value in word.items(): + f.write(key+'\n') + f.close() \ No newline at end of file diff --git a/shuhe_mmi_feature_fairseq/MSELoss.py b/shuhe_mmi_feature_fairseq/MSELoss.py new file mode 100644 index 0000000..d2445b7 --- /dev/null +++ b/shuhe_mmi_feature_fairseq/MSELoss.py @@ -0,0 +1,54 @@ +import torch +from fairseq import metrics +from fairseq.criterions import FairseqCriterion, register_criterion + + +@register_criterion("mse-loss") +class MSELoss(FairseqCriterion): + """ + Implementation for the MSELoss. + """ + + def __init__(self, task): + super().__init__(task) + self.loss = torch.nn.MSELoss(reduction='mean') + + def forward(self, model, sample, reduce=True): + """Compute the loss for the given sample. + + Returns a tuple with three elements: + 1) the loss + 2) the sample size, which is used as the denominator for the gradient + 3) logging outputs to display while training + """ + sample_size = sample['nsentences'] + loss = model(**sample["net_input"]) + target = torch.ones(sample['net_input']['src_tokens'].shape[0]).cuda() + loss = self.loss(loss, target) + + logging_output = { + "loss": loss, + "ntokens": sample_size, + "nsentences": sample_size, + "sample_size": sample_size, + } + return loss, sample_size, logging_output + + @staticmethod + def reduce_metrics(logging_outputs) -> None: + """Aggregate logging outputs from data parallel training.""" + loss_sum = sum(log.get("loss", 0) for log in logging_outputs) + sample_size = sum(log.get("sample_size", 0) for log in logging_outputs) + + metrics.log_scalar( + "loss", loss_sum / sample_size, sample_size, round=3 + ) + + @staticmethod + def logging_outputs_can_be_summed() -> bool: + """ + Whether the logging outputs returned by `forward` can be summed + across workers prior to calling `reduce_metrics`. 
Setting this + to True will improves distributed training speed. + """ + return True diff --git a/shuhe_mmi_feature_fairseq/feature/train.sh b/shuhe_mmi_feature_fairseq/feature/train.sh new file mode 100644 index 0000000..68f78ae --- /dev/null +++ b/shuhe_mmi_feature_fairseq/feature/train.sh @@ -0,0 +1,36 @@ +# Note that fairseq may use all gpus on your machine and the actual batch-size is times by n_gpus. +# If you use multiple gpus, batch_size should be devided by number of gpus. + +# hyper-params +LR=3e-4 +DROPOUT=0.1 +LAYER=3 +WARMUP=6000 + +# directory to save models +MODEL_DIR="/home/wangshuhe/shuhework/OpenViDial/mmi_test" +# data directory +DATA_DIR="/data/wangshuhe/test_mmi" +TYPE="features" + +CUDA_VISIBLE_DEVICES=2 fairseq-train \ + --save-dir $MODEL_DIR \ + --user-dir video_dialogue_model \ + --task mmi-video-dialogue \ + --img-type $TYPE \ + --data-dir $DATA_DIR \ + --arch baseline-mmi-img-transformer \ + --encoder-layers $LAYER \ + --encoder-embed-dim 1000 \ + --dropout $DROPOUT \ + --optimizer adam \ + --max-tokens 100000 \ + --batch-size 60 \ + --adam-betas "(0.9,0.999)" \ + --reset-optimizer \ + --criterion mse-loss \ + --lr $LR \ + --lr-scheduler inverse_sqrt --warmup-init-lr 1e-07 --warmup-updates $WARMUP \ + --max-epoch 20 \ + --keep-last-epochs 5 \ + --ddp-backend=no_c10d \ No newline at end of file diff --git a/shuhe_mmi_feature_fairseq/image_mmi_transformer.py b/shuhe_mmi_feature_fairseq/image_mmi_transformer.py new file mode 100644 index 0000000..731ab51 --- /dev/null +++ b/shuhe_mmi_feature_fairseq/image_mmi_transformer.py @@ -0,0 +1,190 @@ +# encoding: utf-8 +""" +@author: Yuxian Meng +@contact: yuxian_meng@shannonai.com + +@version: 1.0 +@file: transformer_encoder +@time: 2020/11/18 11:35 +@desc: Transformer encoder with src-tokens and img-features as inputs + +""" + +from typing import Optional + +import torch +from fairseq.models import ( + register_model, + register_model_architecture, +) +from fairseq.models.transformer import ( + TransformerModel, + TransformerEncoder, + EncoderOut, + base_architecture as transformer_base_architecture +) + +DEFAULT_MAX_SOURCE_POSITIONS = 1024 +DEFAULT_MAX_TARGET_POSITIONS = 1024 + + +@register_model("mmi-img-transformer") +class MMIImageTransformerModel(TransformerModel): + """ + Transformer model from `"Attention Is All You Need" (Vaswani, et al, 2017) + `_. + + Args: + encoder (TransformerEncoder): the encoder + decoder (TransformerDecoder): the decoder + + The Transformer model provides the following named architectures and + command-line arguments: + + .. argparse:: + :ref: fairseq.models.transformer_parser + :prog: + """ + + def __init__(self, args, encoder, decoder): + super().__init__(args, encoder, decoder) + + @staticmethod + def add_args(parser): + """Add model-specific arguments to the parser.""" + TransformerModel.add_args(parser) + parser.add_argument('--img-dim', type=int, metavar='N', default=1000, + help='image feature dimension') + + def forward(self, src_tokens, src_mask, src_imgs, src_lengths, prev_output_tokens, **kwargs): + """ + Run the forward pass for an encoder-decoder model. + + First feed a batch of source tokens through the encoder. 
Then, feed the + encoder output and previous decoder outputs (i.e., teacher forcing) to + the decoder to produce the next outputs:: + + encoder_out = self.encoder(src_tokens, src_lengths) + return self.decoder(prev_output_tokens, encoder_out) + + Args: + src_tokens (LongTensor): tokens in the source language of shape + `(batch * sent_num, src_len)` + src_imgs (FloatTensor): images features in the source sentences + `(batch * img_num, dim)` + src_lengths (LongTensor): source sentence lengths of shape `(batch)` + prev_output_tokens (LongTensor): previous decoder outputs of shape + `(batch, tgt_len)`, for teacher forcing + + Returns: + output_: image_feature * text_feature, shape `(batch, sent_len)` + """ + encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs) + x = encoder_out.encoder_out.transpose(0, 1) # T * B * C -> B * T * C + src_imgs = torch.unsqueeze(src_imgs, dim=-1) + output_ = torch.nn.functional.sigmoid(torch.matmul(x, src_imgs).squeeze(dim=-1)) * src_mask # B * T + return output_.sum(dim=-1) + + @classmethod + def build_encoder(cls, args, src_dict, embed_tokens): + return MMIImageTransformerEncoder(args, src_dict, embed_tokens) + + +class MMIImageTransformerEncoder(TransformerEncoder): + """ + Transformer encoder consisting of *args.encoder_layers* layers. Each layer + is a :class:`TransformerEncoderLayer`. + + Args: + args (argparse.Namespace): parsed command-line arguments + dictionary (~fairseq.data.Dictionary): encoding dictionary + embed_tokens (torch.nn.Embedding): input embedding + """ + + def __init__(self, args, dictionary, embed_tokens): + super().__init__(args, dictionary, embed_tokens) + + self.img_dim = args.img_dim + + def forward_embedding( + self, src_tokens, token_embedding: Optional[torch.Tensor] = None + ): + # embed tokens and positions + if token_embedding is None: + token_embedding = self.embed_tokens(src_tokens) + x = embed = self.embed_scale * token_embedding + + if self.embed_positions is not None: + x = embed + self.embed_positions(src_tokens) + if self.layernorm_embedding is not None: + x = self.layernorm_embedding(x) + x = self.dropout_module(x) + if self.quant_noise is not None: + x = self.quant_noise(x) + return x, embed + + def forward( + self, + src_tokens, + src_lengths, + cls_input=None, + return_all_hiddens=False, + token_embeddings: Optional[torch.Tensor] = None, + ): + """ + Args: + src_tokens (LongTensor): tokens in the source language of shape + `(batch, src_len)` + src_lengths (torch.LongTensor): lengths of each source sentence of + shape `(batch)` + return_all_hiddens (bool, optional): also return all of the + intermediate hidden states (default: False). + token_embeddings (torch.Tensor, optional): precomputed embeddings + default `None` will recompute embeddings + + Returns: + namedtuple: + - **encoder_out** (Tensor): the last encoder layer's output of + shape `(src_len, batch, embed_dim)` + - **encoder_padding_mask** (ByteTensor): the positions of + padding elements of shape `(batch, src_len)` + - **encoder_embedding** (Tensor): the (scaled) embedding lookup + of shape `(batch, src_len, embed_dim)` + - **encoder_states** (List[Tensor]): all intermediate + hidden states of shape `(src_len, batch, embed_dim)`. + Only populated if *return_all_hiddens* is True. 
+ """ + x, encoder_embedding = self.forward_embedding(src_tokens, token_embeddings) + + # B x T x C -> T x B x C + x = x.transpose(0, 1) + + # compute padding mask + encoder_padding_mask = src_tokens.eq(self.padding_idx) + + encoder_states = [] if return_all_hiddens else None + + # encoder layers + for layer in self.layers: + x = layer(x, encoder_padding_mask) + if return_all_hiddens: + assert encoder_states is not None + encoder_states.append(x) + + if self.layer_norm is not None: + x = self.layer_norm(x) + + return EncoderOut( + encoder_out=x, # T x B x C + encoder_padding_mask=encoder_padding_mask, # B x T + encoder_embedding=encoder_embedding, # B x T x C + encoder_states=encoder_states, # List[T x B x C] + src_tokens=None, + src_lengths=None, + ) + + +@register_model_architecture('mmi-img-transformer', 'baseline-mmi-img-transformer') +def base_architecture(args): + transformer_base_architecture(args) + diff --git a/shuhe_mmi_feature_fairseq/mmi_text_and_feature_dataset.py b/shuhe_mmi_feature_fairseq/mmi_text_and_feature_dataset.py new file mode 100644 index 0000000..253bd74 --- /dev/null +++ b/shuhe_mmi_feature_fairseq/mmi_text_and_feature_dataset.py @@ -0,0 +1,143 @@ +# encoding: utf-8 +""" +@author: Yuxian Meng +@contact: yuxian_meng@shannonai.com + +@version: 1.0 +@file: text_and_image_dataset +@time: 2020/11/14 15:26 +@desc: Combine Text and Image Datasets + +""" + +import numpy as np +import torch +from fairseq.data.fairseq_dataset import FairseqDataset +from video_dialogue_model.data.feature_dataset import FeatureDataset +from fairseq.data import data_utils + + +class MMITextImageDataset(FairseqDataset): + def __init__(self, image_dataset: FeatureDataset, text_dataset, vocab_dict, span_idxs, shuffle=False): + self.img_dataset = image_dataset + self.text_dataset = text_dataset + self.vocab_dict = vocab_dict + self.span_idxs = span_idxs + self.shuffle = shuffle + + def __getitem__(self, index): + ''' + group_idx, start_idx, end_idx = self.span_idxs[index].tolist() + source_imgs = np.stack([self.img_dataset[idx] for idx in range(start_idx, end_idx)]) # n * dim + source_texts = [self.text_dataset[idx] for idx in range(start_idx+1, end_idx+1)] # n * sent_len + target = self.text_dataset[end_idx] # will not be computed + ''' + group_idx, start_idx, end_idx = self.span_idxs[index].tolist() + source_imgs = [self.img_dataset[start_idx]] # dim + source_texts = self.text_dataset[end_idx] # sent_len + target = self.text_dataset[end_idx] # will not be computed + + return { + 'id': index, + 'source_imgs': torch.FloatTensor(source_imgs), + 'source_texts': source_texts, + 'target': torch.LongTensor(target) + } + + def __len__(self): + return len(self.span_idxs) + + def num_tokens(self, index): + """Return the number of tokens in a sample. This value is used to + enforce ``--max-tokens`` during batching.""" + ''' + group_idx, start_idx, end_idx = self.span_idxs[index].tolist() + sum_tokens = 0 + for i in range(start_idx+1, end_idx+1): + sum_tokens += len(self.text_dataset[i]) + ''' + group_idx, start_idx, end_idx = self.span_idxs[index].tolist() + sum_tokens = len(self.text_dataset[end_idx]) + #sum_tokens += end_idx - start_idx + 1 + return sum_tokens + + def size(self, index): + """Return an example's size as a float or tuple. 
This value is used when + filtering a dataset with ``--max-positions``.""" + return self.num_tokens(index) + + def ordered_indices(self): + if self.shuffle: + indices = np.random.permutation(len(self)) + else: + indices = np.arange(len(self)) + # todo 添加bucket + # # Inspired by LanguagePairDataset.ordered_indices + # indices = indices[np.argsort(self.cap_ds.sizes[indices], kind='mergesort')] + # return indices[np.argsort(self.img_ds.sizes[indices], kind='mergesort')] + return indices + + def collater(self, samples): + """Merge a list of samples to form a mini-batch.""" + if len(samples) == 0: + return {} + indices = [] + source_imgs = [] + source_texts = [] + source_lengths = [] + targets = [] + + target_ntokens = 0 + num_sentences = 0 + + for sample in samples: + index = sample['id'] + indices.append(index) + + source_imgs.append(sample['source_imgs']) + source_texts.append(torch.LongTensor(sample['source_texts'])) + source_lengths.append(len(sample['source_texts'])) + + targets.append(sample['target']) + target_ntokens += len(sample["target"]) + num_sentences = len(sample) + + indices = torch.tensor(indices, dtype=torch.long) + + pad_imgs = None + for imgs in source_imgs: + if (pad_imgs is None): + pad_imgs = imgs + else: + pad_imgs = torch.cat((pad_imgs, imgs)) + + source_texts_batch = data_utils.collate_tokens(source_texts, + pad_idx=self.vocab_dict.pad(), + eos_idx=self.vocab_dict.eos(), + move_eos_to_beginning=False) + pad_text_batch = (source_texts_batch != self.vocab_dict.pad()).float() + eos_text_batch = (source_texts_batch != self.vocab_dict.eos()).float() + pad_text_batch = pad_text_batch * eos_text_batch + + target_batch = data_utils.collate_tokens(targets, + pad_idx=self.vocab_dict.pad(), + eos_idx=self.vocab_dict.eos(), + move_eos_to_beginning=False) + prev_target_batch = data_utils.collate_tokens(targets, + pad_idx=self.vocab_dict.pad(), + eos_idx=self.vocab_dict.eos(), + move_eos_to_beginning=True) + + return { + 'id': indices, + 'net_input': { + 'src_tokens': source_texts_batch, + 'src_mask': pad_text_batch, + 'src_imgs': pad_imgs, + 'src_lengths': source_lengths, + 'prev_output_tokens': prev_target_batch, + }, + 'target': target_batch, + 'ntokens': target_ntokens, + 'nsentences': num_sentences, + } diff --git a/shuhe_mmi_feature_fairseq/mmi_video_dialogue_task.py b/shuhe_mmi_feature_fairseq/mmi_video_dialogue_task.py new file mode 100644 index 0000000..c143ef4 --- /dev/null +++ b/shuhe_mmi_feature_fairseq/mmi_video_dialogue_task.py @@ -0,0 +1,112 @@ +import os +import numpy as np + +import torch +from fairseq.data import Dictionary, data_utils +from video_dialogue_model.data.utils import text_bin_file +from fairseq.tasks import register_task, FairseqTask +from video_dialogue_model.data.feature_dataset import FeatureDataset +from video_dialogue_model.data.mmi_text_and_feature_dataset import MMITextImageDataset +#from video_dialogue_model.data.text_and_object_dataset import TextObjectDataset +#from video_dialogue_model.data.object_dataset import ObjectDataset + + +@register_task('mmi-video-dialogue') +class MMIVideoDialogueTask(FairseqTask): + @staticmethod + def add_args(parser): + parser.add_argument('--data-dir', default='output', + help='data directory') + parser.add_argument('--max-obj', type=int, default=20, + help='max objects per sentence') + parser.add_argument('--img-type', type=str, default="objects", choices=["features", "objects"], + help='image feature types') + + @classmethod + def setup_task(cls, args, **kwargs): + vocab_dict_file = 
os.path.join(args.data_dir, 'dict.txt') + vocab_dict = Dictionary.load(vocab_dict_file) + + return MMIVideoDialogueTask(args, vocab_dict) + + def __init__(self, args, vocab_dict): + super().__init__(args) + self.args = args + self.vocab_dict = vocab_dict + + def load_feature_dataset(self, split, **kwargs): + features_dataset = FeatureDataset(self.args.data_dir, split) + span_idxs = self.get_span_info(sent_num=features_dataset.sent_num) + + text_file = text_bin_file(self.args.data_dir, split) # os.path.join(self.args.data_dir, split) + text_dataset = data_utils.load_indexed_dataset(text_file, self.vocab_dict) + + self.datasets[split] = MMITextImageDataset(text_dataset=text_dataset, + image_dataset=features_dataset, + vocab_dict=self.vocab_dict, + span_idxs=span_idxs, + shuffle=True if split == "train" else False) + ''' + def load_text_object_dataset(self, split, **kwargs): + objects_dataset = ObjectDataset(self.args.data_dir, split, max_obj=self.args.max_obj) + span_idxs = self.item2span_idxs(sent_num=objects_dataset.sent_num, + max_src_sent=self.args.max_src_sent) + + text_file = text_bin_file(self.args.data_dir, split) # os.path.join(self.args.data_dir, split) + text_dataset = data_utils.load_indexed_dataset(text_file, self.vocab_dict) + + self.datasets[split] = TextObjectDataset(text_dataset=text_dataset, + image_dataset=objects_dataset, + vocab_dict=self.vocab_dict, + span_idxs=span_idxs, + shuffle=True if split == "train" else False) + ''' + def load_dataset(self, split, **kwargs): + if self.args.img_type == "features": + return self.load_feature_dataset(split, **kwargs) + return self.load_feature_dataset(split, **kwargs) + + @staticmethod + def get_span_info(sent_num: np.array) -> np.array: + """ + compute each src/tgt span of dataset. + For example, if we got [[0,1,2], [3,4]] as source texts, + then return [[0, 0, 2], [1, 3, 4]] + """ + span_idxs = [] + start_idx = 0 + #span_value = 10 + for group_idx in range(sent_num.shape[0]): + num = int(sent_num[group_idx]) + end_ = start_idx + 1 + while (end_ <= start_idx+num-1): + span_idxs.append((group_idx, end_-1, end_)) + end_ += 1 + ''' + if (num == 1): + start_idx += num + continue + start_ = start_idx + end_ = min(start_idx+num-1, start_idx+20-1) + while (end_ <= start_idx+num-1): + span_idxs.append((group_idx, start_, end_)) + start_ = end_ + end_ += span_value + ''' + start_idx += num + return np.array(span_idxs) + + @property + def source_dictionary(self): + return self.vocab_dict + + @property + def target_dictionary(self): + return self.vocab_dict + + def inference_step( + self, models, sample, prefix_tokens=None, constraints=None + ): + with torch.no_grad(): + for model in models: + return model(**sample["net_input"]) From 2b1c340cc05af0f70825f7cb9dddcf49a566ca1c Mon Sep 17 00:00:00 2001 From: wangshuhe Date: Wed, 7 Apr 2021 01:46:47 +0800 Subject: [PATCH 2/7] mmi-fairseq --- .gitignore | 1 + mmi_fairseq/__init__.py | 1 + .../MSELoss.py => mmi_fairseq/feature/Loss.py | 17 +- mmi_fairseq/feature/__init__.py | 5 + mmi_fairseq/feature/feature_dataset.py | 24 +++ mmi_fairseq/feature/generate.py | 164 ++++++++++++++++++ .../feature}/image_mmi_transformer.py | 30 +++- .../feature}/mmi_text_and_feature_dataset.py | 36 ++-- .../feature}/mmi_video_dialogue_task.py | 31 ++-- mmi_fairseq/feature/test.sh | 17 ++ .../feature/train.sh | 10 +- mmi_fairseq/feature/utils.py | 58 +++++++ 12 files changed, 337 insertions(+), 57 deletions(-) create mode 100644 mmi_fairseq/__init__.py rename shuhe_mmi_feature_fairseq/MSELoss.py => 
mmi_fairseq/feature/Loss.py (78%) create mode 100644 mmi_fairseq/feature/__init__.py create mode 100644 mmi_fairseq/feature/feature_dataset.py create mode 100644 mmi_fairseq/feature/generate.py rename {shuhe_mmi_feature_fairseq => mmi_fairseq/feature}/image_mmi_transformer.py (80%) rename {shuhe_mmi_feature_fairseq => mmi_fairseq/feature}/mmi_text_and_feature_dataset.py (82%) rename {shuhe_mmi_feature_fairseq => mmi_fairseq/feature}/mmi_video_dialogue_task.py (84%) create mode 100644 mmi_fairseq/feature/test.sh rename {shuhe_mmi_feature_fairseq => mmi_fairseq}/feature/train.sh (84%) create mode 100644 mmi_fairseq/feature/utils.py diff --git a/.gitignore b/.gitignore index eae9ca6..b2e7f7e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ # Byte-compiled / optimized / DLL files +mmi_test/ result/ __pycache__/ *.py[cod] diff --git a/mmi_fairseq/__init__.py b/mmi_fairseq/__init__.py new file mode 100644 index 0000000..5464068 --- /dev/null +++ b/mmi_fairseq/__init__.py @@ -0,0 +1 @@ +from mmi_fairseq.feature import * diff --git a/shuhe_mmi_feature_fairseq/MSELoss.py b/mmi_fairseq/feature/Loss.py similarity index 78% rename from shuhe_mmi_feature_fairseq/MSELoss.py rename to mmi_fairseq/feature/Loss.py index d2445b7..1aa2857 100644 --- a/shuhe_mmi_feature_fairseq/MSELoss.py +++ b/mmi_fairseq/feature/Loss.py @@ -3,15 +3,14 @@ from fairseq.criterions import FairseqCriterion, register_criterion -@register_criterion("mse-loss") -class MSELoss(FairseqCriterion): +@register_criterion("base-loss") +class Loss(FairseqCriterion): """ - Implementation for the MSELoss. + Implementation for the Loss. """ def __init__(self, task): super().__init__(task) - self.loss = torch.nn.MSELoss(reduction='mean') def forward(self, model, sample, reduce=True): """Compute the loss for the given sample. 
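The hunk below replaces the earlier MSE-against-ones objective with a binary cross-entropy over the model's sigmoid scores and 0/1 labels, loss = -(label*log(p) + (1-label)*log(1-p)). A minimal equivalent sketch with hypothetical stand-in tensors, clamping p so that log(0) cannot produce NaNs:

import torch
import torch.nn.functional as F

scores = torch.rand(8)                        # stand-in for the model's sigmoid outputs
labels = torch.randint(0, 2, (8,)).float()    # stand-in for the 0/1 labels
eps = 1e-7
p = scores.clamp(eps, 1 - eps)

loss_manual  = -(labels * torch.log(p) + (1 - labels) * torch.log(1 - p)).sum() / len(p)
loss_builtin = F.binary_cross_entropy(p, labels, reduction="sum") / len(p)   # same value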
@@ -22,9 +21,10 @@ def forward(self, model, sample, reduce=True): 3) logging outputs to display while training """ sample_size = sample['nsentences'] - loss = model(**sample["net_input"]) - target = torch.ones(sample['net_input']['src_tokens'].shape[0]).cuda() - loss = self.loss(loss, target) + loss, label = model(**sample["net_input"]) + loss = -(label*torch.log(loss) + (1-label)*torch.log(1-loss)) + #print(loss) + loss = loss.sum(dim=-1)/sample_size logging_output = { "loss": loss, @@ -40,8 +40,9 @@ def reduce_metrics(logging_outputs) -> None: loss_sum = sum(log.get("loss", 0) for log in logging_outputs) sample_size = sum(log.get("sample_size", 0) for log in logging_outputs) + metrics.log_scalar( - "loss", loss_sum / sample_size, sample_size, round=3 + "loss", loss_sum, sample_size, round=3 ) @staticmethod diff --git a/mmi_fairseq/feature/__init__.py b/mmi_fairseq/feature/__init__.py new file mode 100644 index 0000000..6f4316f --- /dev/null +++ b/mmi_fairseq/feature/__init__.py @@ -0,0 +1,5 @@ +from .image_mmi_transformer import MMIImageTransformerModel +from .Loss import Loss +from .feature_dataset import FeatureDataset +from .mmi_text_and_feature_dataset import MMITextImageDataset +from .mmi_video_dialogue_task import MMIVideoDialogueTask diff --git a/mmi_fairseq/feature/feature_dataset.py b/mmi_fairseq/feature/feature_dataset.py new file mode 100644 index 0000000..ccd0560 --- /dev/null +++ b/mmi_fairseq/feature/feature_dataset.py @@ -0,0 +1,24 @@ +# encoding: utf-8 + +import numpy as np +from torch.utils.data import Dataset +from mmi_fairseq.feature.utils import sent_num_file, offsets_file, feature_file, warmup_mmap_file + + +class FeatureDataset(Dataset): + """Load Feature dataset""" + def __init__(self, data_dir, split="train"): + self.data_dir = data_dir + self.sent_num = np.load(sent_num_file(data_dir, split)) + self.offsets = np.load(offsets_file(data_dir, split)) + self.dim = 1000 + self.total_num = self.offsets[-1] + self.sent_num[-1] + warmup_mmap_file(feature_file(data_dir, split)) + self.features = np.memmap(feature_file(data_dir, split), dtype='float32', mode='r', + shape=(self.total_num, self.dim)) + + def __getitem__(self, item): + return self.features[item] + + def __len__(self): + return self.total_num \ No newline at end of file diff --git a/mmi_fairseq/feature/generate.py b/mmi_fairseq/feature/generate.py new file mode 100644 index 0000000..69c0b4b --- /dev/null +++ b/mmi_fairseq/feature/generate.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python3 -u +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +""" +Translate pre-processed data with a trained model. + +We basically copy this file from `fairseq.generate.py` and +change this file a little bit to generate attention scores and other stuff. +""" + +import ast +import logging +import os +import sys +from itertools import chain + +import numpy as np +import torch +from fairseq import checkpoint_utils, options, tasks, utils +from fairseq.logging import progress_bar + + +def main(args): + assert args.path is not None, "--path required for generation!" 
+ assert ( + not args.sampling or args.nbest == args.beam + ), "--sampling requires --nbest to be equal to --beam" + assert ( + args.replace_unk is None or args.dataset_impl == "raw" + ), "--replace-unk requires a raw text dataset (--dataset-impl=raw)" + + if args.results_path is not None: + os.makedirs(args.results_path, exist_ok=True) + output_path = os.path.join( + args.results_path, "generate-{}.txt".format(args.gen_subset) + ) + with open(output_path, "w", buffering=1, encoding="utf-8") as h: + return _main(args, h) + else: + return _main(args, sys.stdout) + + +def get_symbols_to_strip_from_output(generator): + if hasattr(generator, "symbols_to_strip_from_output"): + return generator.symbols_to_strip_from_output + else: + return {generator.eos} + + +def _main(args, output_file): + logging.basicConfig( + format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=os.environ.get("LOGLEVEL", "INFO").upper(), + stream=output_file, + ) + logger = logging.getLogger("fairseq_cli.generate") + + utils.import_user_module(args) + + if args.max_tokens is None and args.batch_size is None: + args.max_tokens = 12000 + logger.info(args) + + # Fix seed for stochastic decoding + if args.seed is not None and not args.no_seed_provided: + np.random.seed(args.seed) + utils.set_torch_seed(args.seed) + + use_cuda = torch.cuda.is_available() and not args.cpu + + # Load dataset splits + task = tasks.setup_task(args) + task.load_dataset(args.gen_subset) + + overrides = ast.literal_eval(args.model_overrides) + + # Load ensemble + logger.info("loading model(s) from {}".format(args.path)) + models, _model_args = checkpoint_utils.load_model_ensemble( + utils.split_paths(args.path), + arg_overrides=overrides, + task=task, + suffix=getattr(args, "checkpoint_suffix", ""), + strict=(args.checkpoint_shard_count == 1), + num_shards=args.checkpoint_shard_count, + ) + + if args.lm_path is not None: + overrides["data"] = args.data + + try: + lms, _ = checkpoint_utils.load_model_ensemble( + [args.lm_path], + arg_overrides=overrides, + task=None, + ) + except: + logger.warning( + f"Failed to load language model! 
Please make sure that the language model dict is the same " + f"as target dict and is located in the data dir ({args.data})" + ) + raise + + assert len(lms) == 1 + else: + lms = [None] + + # Optimize ensemble for generation + for model in chain(models, lms): + if model is None: + continue + if args.fp16: + model.half() + if use_cuda and not args.pipeline_model_parallel: + model.cuda() + model.prepare_for_inference_(args) + + # Load dataset (possibly sharded) + itr = task.get_batch_iterator( + dataset=task.dataset(args.gen_subset), + max_tokens=args.max_tokens, + max_sentences=args.batch_size, + max_positions=utils.resolve_max_positions( + task.max_positions(), *[model.max_positions() for model in models] + ), + ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test, + required_batch_size_multiple=args.required_batch_size_multiple, + num_shards=args.num_shards, + shard_id=args.shard_id, + num_workers=args.num_workers, + data_buffer_size=args.data_buffer_size, + ).next_epoch_itr(shuffle=False) + progress = progress_bar.progress_bar( + itr, + log_format=args.log_format, + log_interval=args.log_interval, + default_log_format=("tqdm" if not args.no_progress_bar else "none"), + ) + + output = [] + for sample in progress: + sample = utils.move_to_cuda(sample) if use_cuda else sample + if "net_input" not in sample: + continue + + loss, _ = task.inference_step( + models, + sample + ) + print(loss) + output.append(loss) + return output + +def cli_main(): + parser = options.get_generation_parser() + parser.add_argument("--print-attention", action="store_true", help="print attention matrix as jsonline") + args = options.parse_args_and_arch(parser) + main(args) + +if __name__ == "__main__": + cli_main() \ No newline at end of file diff --git a/shuhe_mmi_feature_fairseq/image_mmi_transformer.py b/mmi_fairseq/feature/image_mmi_transformer.py similarity index 80% rename from shuhe_mmi_feature_fairseq/image_mmi_transformer.py rename to mmi_fairseq/feature/image_mmi_transformer.py index 731ab51..a8b4d4b 100644 --- a/shuhe_mmi_feature_fairseq/image_mmi_transformer.py +++ b/mmi_fairseq/feature/image_mmi_transformer.py @@ -13,6 +13,7 @@ from typing import Optional import torch +import torch.nn as nn from fairseq.models import ( register_model, register_model_architecture, @@ -27,7 +28,6 @@ DEFAULT_MAX_SOURCE_POSITIONS = 1024 DEFAULT_MAX_TARGET_POSITIONS = 1024 - @register_model("mmi-img-transformer") class MMIImageTransformerModel(TransformerModel): """ @@ -48,6 +48,9 @@ class MMIImageTransformerModel(TransformerModel): def __init__(self, args, encoder, decoder): super().__init__(args, encoder, decoder) + self.final = nn.Linear(in_features=args.encoder_embed_dim+args.img_dim, out_features=1, bias=True) + #self.final = nn.Linear(in_features=args.encoder_embed_dim, out_features=args.img_dim, bias=True) + #self.cos = nn.CosineSimilarity(dim=2) @staticmethod def add_args(parser): @@ -56,7 +59,7 @@ def add_args(parser): parser.add_argument('--img-dim', type=int, metavar='N', default=1000, help='image feature dimension') - def forward(self, src_tokens, src_mask, src_imgs, src_lengths, prev_output_tokens, **kwargs): + def forward(self, src_tokens, src_label, src_imgs, src_lengths, prev_output_tokens, **kwargs): """ Run the forward pass for an encoder-decoder model. 
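The reworked forward pass in the next hunk scores how well an image matches its sentence: each encoder state is concatenated with the broadcast image feature, passed through the newly added linear layer plus a sigmoid to get a per-token match probability, zeroed at padding positions, and averaged over the sentence length. A minimal sketch of that scoring head with toy shapes; final stands in for the nn.Linear added in __init__:

import torch
import torch.nn as nn

T, B, C, img_dim = 5, 2, 512, 1000
final = nn.Linear(C + img_dim, 1)                       # the extra scoring layer

x = torch.randn(T, B, C).transpose(0, 1)                # encoder states, B x T x C
src_imgs = torch.randn(B, img_dim).unsqueeze(1).expand(B, T, img_dim)
pad_mask = torch.zeros(B, T)                            # 1 where the token is padding
src_lengths = torch.full((B,), float(T))

per_token = torch.sigmoid(final(torch.cat((x, src_imgs), dim=-1)).squeeze(-1))  # B x T
per_token = per_token * (1 - pad_mask)                  # ignore padded positions
score = per_token.sum(dim=-1) / src_lengths             # average match probability per sentence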
@@ -69,9 +72,9 @@ def forward(self, src_tokens, src_mask, src_imgs, src_lengths, prev_output_token Args: src_tokens (LongTensor): tokens in the source language of shape - `(batch * sent_num, src_len)` + `(batch, src_len)` src_imgs (FloatTensor): images features in the source sentences - `(batch * img_num, dim)` + `(batch, dim)` src_lengths (LongTensor): source sentence lengths of shape `(batch)` prev_output_tokens (LongTensor): previous decoder outputs of shape `(batch, tgt_len)`, for teacher forcing @@ -80,10 +83,23 @@ def forward(self, src_tokens, src_mask, src_imgs, src_lengths, prev_output_token output_: image_feature * text_feature, shape `(batch, sent_len)` """ encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs) + ''' + x = self.final(encoder_out.encoder_out).transpose(0, 1) # T * B * C -> B * T * C + src_imgs = torch.unsqueeze(src_imgs, dim=1) + src_imgs = src_imgs.expand(x.shape[0], x.shape[1], x.shape[2]) + #print(src_label) + #print(self.cos(x, src_imgs)) + #output_ = torch.nn.functional.sigmoid(torch.matmul(x, src_imgs).squeeze(dim=-1)) * (1-encoder_out.encoder_padding_mask.float()) # B * T + output_ = torch.nn.functional.sigmoid(self.cos(x, src_imgs)) * (1-encoder_out.encoder_padding_mask.float()) # B * T + #print(output_.sum(dim=-1)/src_lengths) + #print(output_) + return output_.sum(dim=-1)/src_lengths, src_label + ''' x = encoder_out.encoder_out.transpose(0, 1) # T * B * C -> B * T * C - src_imgs = torch.unsqueeze(src_imgs, dim=-1) - output_ = torch.nn.functional.sigmoid(torch.matmul(x, src_imgs).squeeze(dim=-1)) * src_mask # B * T - return output_.sum(dim=-1) + src_imgs = torch.unsqueeze(src_imgs, dim=1) + src_imgs = src_imgs.expand(x.shape[0], x.shape[1], src_imgs.shape[2]) + feature = torch.nn.functional.sigmoid(self.final(torch.cat((x, src_imgs), dim=-1)).squeeze(dim=-1)) * (1-encoder_out.encoder_padding_mask.float()) # B * T + return feature.sum(dim=-1)/src_lengths, src_label @classmethod def build_encoder(cls, args, src_dict, embed_tokens): diff --git a/shuhe_mmi_feature_fairseq/mmi_text_and_feature_dataset.py b/mmi_fairseq/feature/mmi_text_and_feature_dataset.py similarity index 82% rename from shuhe_mmi_feature_fairseq/mmi_text_and_feature_dataset.py rename to mmi_fairseq/feature/mmi_text_and_feature_dataset.py index 253bd74..00f760f 100644 --- a/shuhe_mmi_feature_fairseq/mmi_text_and_feature_dataset.py +++ b/mmi_fairseq/feature/mmi_text_and_feature_dataset.py @@ -13,7 +13,7 @@ import numpy as np import torch from fairseq.data.fairseq_dataset import FairseqDataset -from video_dialogue_model.data.feature_dataset import FeatureDataset +from mmi_fairseq.feature.feature_dataset import FeatureDataset from fairseq.data import data_utils @@ -32,14 +32,15 @@ def __getitem__(self, index): source_texts = [self.text_dataset[idx] for idx in range(start_idx+1, end_idx+1)] # n * sent_len target = self.text_dataset[end_idx] # will not be computed ''' - group_idx, start_idx, end_idx = self.span_idxs[index].tolist() - source_imgs = [self.img_dataset[start_idx]] # dim + is_true, start_idx, end_idx = self.span_idxs[index].tolist() + source_imgs = self.img_dataset[start_idx] # dim source_texts = self.text_dataset[end_idx] # sent_len target = self.text_dataset[end_idx] # will not be computed return { 'id': index, - 'source_imgs': torch.FloatTensor(source_imgs), + 'is_true': is_true, + 'source_imgs': source_imgs, 'source_texts': source_texts, 'target': torch.LongTensor(target) } @@ -56,9 +57,8 @@ def num_tokens(self, index): for i in range(start_idx+1, end_idx+1): 
sum_tokens += len(self.text_dataset[i]) ''' - group_idx, start_idx, end_idx = self.span_idxs[index].tolist() - sum_tokens = len(self.text_dataset[end_idx]) - #sum_tokens += end_idx - start_idx + 1 + is_true, start_idx, end_idx = self.span_idxs[index].tolist() + sum_tokens = len(self.text_dataset[start_idx]) return sum_tokens def size(self, index): @@ -85,6 +85,7 @@ def collater(self, samples): source_imgs = [] source_texts = [] source_lengths = [] + source_label = [] targets = [] target_ntokens = 0 @@ -97,6 +98,7 @@ def collater(self, samples): source_imgs.append(sample['source_imgs']) source_texts.append(torch.LongTensor(sample['source_texts'])) source_lengths.append(len(sample['source_texts'])) + source_label.append(sample['is_true']) targets.append(sample['target']) target_ntokens += len(sample["target"]) @@ -104,20 +106,16 @@ def collater(self, samples): indices = torch.tensor(indices, dtype=torch.long) - pad_imgs = None - for imgs in source_imgs: - if (pad_imgs is None): - pad_imgs = imgs - else: - pad_imgs = torch.cat((pad_imgs, imgs)) + source_label_tensor = torch.tensor(source_label, dtype=torch.float) + + source_lengths_tensor = torch.tensor(source_lengths, dtype=torch.long) + + image_tensor = torch.tensor(source_imgs, dtype=torch.float) source_texts_batch = data_utils.collate_tokens(source_texts, pad_idx=self.vocab_dict.pad(), eos_idx=self.vocab_dict.eos(), move_eos_to_beginning=False) - pad_text_batch = (source_texts_batch != self.vocab_dict.pad()).float() - eos_text_batch = (source_texts_batch != self.vocab_dict.eos()).float() - pad_text_batch = pad_text_batch * eos_text_batch target_batch = data_utils.collate_tokens(targets, pad_idx=self.vocab_dict.pad(), @@ -132,9 +130,9 @@ def collater(self, samples): 'id': indices, 'net_input': { 'src_tokens': source_texts_batch, - 'src_mask': pad_text_batch, - 'src_imgs': pad_imgs, - 'src_lengths': source_lengths, + 'src_label': source_label_tensor, + 'src_imgs': image_tensor, + 'src_lengths': source_lengths_tensor, 'prev_output_tokens': prev_target_batch, }, 'target': target_batch, diff --git a/shuhe_mmi_feature_fairseq/mmi_video_dialogue_task.py b/mmi_fairseq/feature/mmi_video_dialogue_task.py similarity index 84% rename from shuhe_mmi_feature_fairseq/mmi_video_dialogue_task.py rename to mmi_fairseq/feature/mmi_video_dialogue_task.py index c143ef4..77e115a 100644 --- a/shuhe_mmi_feature_fairseq/mmi_video_dialogue_task.py +++ b/mmi_fairseq/feature/mmi_video_dialogue_task.py @@ -1,12 +1,13 @@ import os import numpy as np +import random import torch from fairseq.data import Dictionary, data_utils -from video_dialogue_model.data.utils import text_bin_file +from mmi_fairseq.feature.utils import text_bin_file from fairseq.tasks import register_task, FairseqTask -from video_dialogue_model.data.feature_dataset import FeatureDataset -from video_dialogue_model.data.mmi_text_and_feature_dataset import MMITextImageDataset +from mmi_fairseq.feature.feature_dataset import FeatureDataset +from mmi_fairseq.feature.mmi_text_and_feature_dataset import MMITextImageDataset #from video_dialogue_model.data.text_and_object_dataset import TextObjectDataset #from video_dialogue_model.data.object_dataset import ObjectDataset @@ -36,7 +37,7 @@ def __init__(self, args, vocab_dict): def load_feature_dataset(self, split, **kwargs): features_dataset = FeatureDataset(self.args.data_dir, split) - span_idxs = self.get_span_info(sent_num=features_dataset.sent_num) + span_idxs = self.get_span_info(sent_num=features_dataset.sent_num, split=split) text_file = 
text_bin_file(self.args.data_dir, split) # os.path.join(self.args.data_dir, split) text_dataset = data_utils.load_indexed_dataset(text_file, self.vocab_dict) @@ -67,32 +68,26 @@ def load_dataset(self, split, **kwargs): return self.load_feature_dataset(split, **kwargs) @staticmethod - def get_span_info(sent_num: np.array) -> np.array: + def get_span_info(sent_num: np.array, split) -> np.array: """ compute each src/tgt span of dataset. For example, if we got [[0,1,2], [3,4]] as source texts, then return [[0, 0, 2], [1, 3, 4]] """ + max_num = sum(int(sent_num[group_idx]) for group_idx in range(sent_num.shape[0])) span_idxs = [] start_idx = 0 - #span_value = 10 for group_idx in range(sent_num.shape[0]): num = int(sent_num[group_idx]) end_ = start_idx + 1 while (end_ <= start_idx+num-1): - span_idxs.append((group_idx, end_-1, end_)) + span_idxs.append((1, end_-1, end_)) + if (split != 'test'): + neg_idx = random.randint(0, max_num-1) + while (neg_idx == end_): + neg_idx = random.randint(0, max_num-1) + span_idxs.append((0, end_-1, neg_idx)) end_ += 1 - ''' - if (num == 1): - start_idx += num - continue - start_ = start_idx - end_ = min(start_idx+num-1, start_idx+20-1) - while (end_ <= start_idx+num-1): - span_idxs.append((group_idx, start_, end_)) - start_ = end_ - end_ += span_value - ''' start_idx += num return np.array(span_idxs) diff --git a/mmi_fairseq/feature/test.sh b/mmi_fairseq/feature/test.sh new file mode 100644 index 0000000..3920e5a --- /dev/null +++ b/mmi_fairseq/feature/test.sh @@ -0,0 +1,17 @@ +DATA_DIR="/data/wangshuhe/test_mmi" +TYPE="features" +MODEL_PATH="/home/wangshuhe/shuhework/OpenViDial/mmi_test/checkpoint_best.pt" +NBEST=10 +BEAM=10 +SUBSET="test" + + +CUDA_VISIBLE_DEVICES=2 python ./mmi_fairseq/feature/generate.py \ + --user-dir mmi_fairseq \ + --task mmi-video-dialogue \ + --img-type $TYPE \ + --data-dir $DATA_DIR \ + --path $MODEL_PATH \ + --batch-size 5 \ + --remove-bpe \ + --gen-subset $SUBSET \ No newline at end of file diff --git a/shuhe_mmi_feature_fairseq/feature/train.sh b/mmi_fairseq/feature/train.sh similarity index 84% rename from shuhe_mmi_feature_fairseq/feature/train.sh rename to mmi_fairseq/feature/train.sh index 68f78ae..464e716 100644 --- a/shuhe_mmi_feature_fairseq/feature/train.sh +++ b/mmi_fairseq/feature/train.sh @@ -13,22 +13,22 @@ MODEL_DIR="/home/wangshuhe/shuhework/OpenViDial/mmi_test" DATA_DIR="/data/wangshuhe/test_mmi" TYPE="features" -CUDA_VISIBLE_DEVICES=2 fairseq-train \ +CUDA_VISIBLE_DEVICES=3 fairseq-train \ --save-dir $MODEL_DIR \ - --user-dir video_dialogue_model \ + --user-dir mmi_fairseq \ --task mmi-video-dialogue \ --img-type $TYPE \ --data-dir $DATA_DIR \ --arch baseline-mmi-img-transformer \ --encoder-layers $LAYER \ - --encoder-embed-dim 1000 \ + --encoder-embed-dim 512 \ --dropout $DROPOUT \ --optimizer adam \ --max-tokens 100000 \ - --batch-size 60 \ + --batch-size 5 \ --adam-betas "(0.9,0.999)" \ --reset-optimizer \ - --criterion mse-loss \ + --criterion base-loss \ --lr $LR \ --lr-scheduler inverse_sqrt --warmup-init-lr 1e-07 --warmup-updates $WARMUP \ --max-epoch 20 \ diff --git a/mmi_fairseq/feature/utils.py b/mmi_fairseq/feature/utils.py new file mode 100644 index 0000000..a2882a0 --- /dev/null +++ b/mmi_fairseq/feature/utils.py @@ -0,0 +1,58 @@ +# encoding: utf-8 +""" +@author: Yuxian Meng +@contact: yuxian_meng@shannonai.com + +@version: 1.0 +@file: path_utils +@time: 2020/11/14 12:13 +@desc: + +""" +import os + + +def sent_num_file(data_dir, split): + return os.path.join(data_dir, f"{split}.sent_num.npy") + + +def 
offsets_file(data_dir, split): + return os.path.join(data_dir, f"{split}.offsets.npy") + + +def feature_file(data_dir, split): + return os.path.join(data_dir, f"{split}.features.mmap") + + +def object_file(data_dir, split, truncate=0): + return os.path.join(data_dir, f"{split}.objects.mmap")+(f".{truncate}" if truncate else "") + + +def object_mask_file(data_dir, split, truncate=0): + return os.path.join(data_dir, f"{split}.objects_mask.mmap")+(f".{truncate}" if truncate else "") + + +def src_file(data_dir, split): + return os.path.join(data_dir, f"{split}.src.txt") + + +def nmt_src_file(data_dir, split): + return os.path.join(data_dir, f"{split}.src-tgt.src") + + +def nmt_tgt_file(data_dir, split): + return os.path.join(data_dir, f"{split}.src-tgt.tgt") + + +def text_bin_file(data_dir, split): + return os.path.join(data_dir, split) + + +def img_file(data_dir, group_idx, sent_idx): + return os.path.join(data_dir, f"img_dir{group_idx}", f"{sent_idx}.jpg") + + +def warmup_mmap_file(path): + with open(path, 'rb') as stream: + while stream.read(10 * 1024 * 1024): + pass From dc2d73c718798142f15d8e28c8ba4787f46dae31 Mon Sep 17 00:00:00 2001 From: wangshuhe Date: Wed, 7 Apr 2021 21:59:18 +0800 Subject: [PATCH 3/7] add object mmi --- mmi_fairseq/feature/__init__.py | 10 +- mmi_fairseq/feature/data/__init__.py | 5 + .../feature/{ => data}/feature_dataset.py | 2 +- .../mmi_text_and_feature_dataset.py | 4 +- .../data/mmi_text_and_object_dataset.py | 142 ++++++++++++ mmi_fairseq/feature/data/object_dataset.py | 30 +++ mmi_fairseq/feature/{ => data}/utils.py | 0 mmi_fairseq/feature/{ => loss}/Loss.py | 1 + mmi_fairseq/feature/loss/__init__.py | 1 + mmi_fairseq/feature/model/__init__.py | 2 + .../{ => model}/image_mmi_transformer.py | 0 .../feature/model/object_mmi_transformer.py | 212 ++++++++++++++++++ mmi_fairseq/feature/{ => scrtpts}/generate.py | 0 .../{test.sh => scrtpts/test_image.sh} | 0 .../{train.sh => scrtpts/train_image.sh} | 2 +- mmi_fairseq/feature/scrtpts/train_object.sh | 36 +++ mmi_fairseq/feature/tasks/__init__.py | 1 + .../{ => tasks}/mmi_video_dialogue_task.py | 25 ++- {mmi_model => old_mmi}/feature/config.py | 0 {mmi_model => old_mmi}/feature/data.py | 0 {mmi_model => old_mmi}/feature/model.py | 0 {mmi_model => old_mmi}/feature/optim.py | 0 {mmi_model => old_mmi}/feature/test.py | 0 {mmi_model => old_mmi}/feature/train.py | 0 {mmi_model => old_mmi}/feature/utils.py | 0 {mmi_model => old_mmi}/feature/vocab.py | 0 {mmi_model => old_mmi}/object/config.py | 0 {mmi_model => old_mmi}/object/data.py | 0 {mmi_model => old_mmi}/object/model.py | 0 {mmi_model => old_mmi}/object/optim.py | 0 {mmi_model => old_mmi}/object/test.py | 0 {mmi_model => old_mmi}/object/train.py | 0 {mmi_model => old_mmi}/object/utils.py | 0 {mmi_model => old_mmi}/object/vocab.py | 0 {mmi_model => old_mmi}/preprocess/char2id.py | 0 {mmi_model => old_mmi}/preprocess/get_dict.py | 0 36 files changed, 453 insertions(+), 20 deletions(-) create mode 100644 mmi_fairseq/feature/data/__init__.py rename mmi_fairseq/feature/{ => data}/feature_dataset.py (88%) rename mmi_fairseq/feature/{ => data}/mmi_text_and_feature_dataset.py (98%) create mode 100644 mmi_fairseq/feature/data/mmi_text_and_object_dataset.py create mode 100644 mmi_fairseq/feature/data/object_dataset.py rename mmi_fairseq/feature/{ => data}/utils.py (100%) rename mmi_fairseq/feature/{ => loss}/Loss.py (97%) create mode 100644 mmi_fairseq/feature/loss/__init__.py create mode 100644 mmi_fairseq/feature/model/__init__.py rename mmi_fairseq/feature/{ => 
model}/image_mmi_transformer.py (100%) create mode 100644 mmi_fairseq/feature/model/object_mmi_transformer.py rename mmi_fairseq/feature/{ => scrtpts}/generate.py (100%) rename mmi_fairseq/feature/{test.sh => scrtpts/test_image.sh} (100%) rename mmi_fairseq/feature/{train.sh => scrtpts/train_image.sh} (97%) create mode 100644 mmi_fairseq/feature/scrtpts/train_object.sh create mode 100644 mmi_fairseq/feature/tasks/__init__.py rename mmi_fairseq/feature/{ => tasks}/mmi_video_dialogue_task.py (84%) rename {mmi_model => old_mmi}/feature/config.py (100%) rename {mmi_model => old_mmi}/feature/data.py (100%) rename {mmi_model => old_mmi}/feature/model.py (100%) rename {mmi_model => old_mmi}/feature/optim.py (100%) rename {mmi_model => old_mmi}/feature/test.py (100%) rename {mmi_model => old_mmi}/feature/train.py (100%) rename {mmi_model => old_mmi}/feature/utils.py (100%) rename {mmi_model => old_mmi}/feature/vocab.py (100%) rename {mmi_model => old_mmi}/object/config.py (100%) rename {mmi_model => old_mmi}/object/data.py (100%) rename {mmi_model => old_mmi}/object/model.py (100%) rename {mmi_model => old_mmi}/object/optim.py (100%) rename {mmi_model => old_mmi}/object/test.py (100%) rename {mmi_model => old_mmi}/object/train.py (100%) rename {mmi_model => old_mmi}/object/utils.py (100%) rename {mmi_model => old_mmi}/object/vocab.py (100%) rename {mmi_model => old_mmi}/preprocess/char2id.py (100%) rename {mmi_model => old_mmi}/preprocess/get_dict.py (100%) diff --git a/mmi_fairseq/feature/__init__.py b/mmi_fairseq/feature/__init__.py index 6f4316f..7a73a88 100644 --- a/mmi_fairseq/feature/__init__.py +++ b/mmi_fairseq/feature/__init__.py @@ -1,5 +1,5 @@ -from .image_mmi_transformer import MMIImageTransformerModel -from .Loss import Loss -from .feature_dataset import FeatureDataset -from .mmi_text_and_feature_dataset import MMITextImageDataset -from .mmi_video_dialogue_task import MMIVideoDialogueTask +from .data import * +from .loss import * +from .model import * +from .tasks import * +from .scrtpts import * diff --git a/mmi_fairseq/feature/data/__init__.py b/mmi_fairseq/feature/data/__init__.py new file mode 100644 index 0000000..98b68f7 --- /dev/null +++ b/mmi_fairseq/feature/data/__init__.py @@ -0,0 +1,5 @@ +from .feature_dataset import FeatureDataset +from .mmi_text_and_feature_dataset import MMITextImageDataset +from .object_dataset import ObjectDataset +from .mmi_text_and_object_dataset import MMITextObjectDataset +from .utils import * \ No newline at end of file diff --git a/mmi_fairseq/feature/feature_dataset.py b/mmi_fairseq/feature/data/feature_dataset.py similarity index 88% rename from mmi_fairseq/feature/feature_dataset.py rename to mmi_fairseq/feature/data/feature_dataset.py index ccd0560..0b89956 100644 --- a/mmi_fairseq/feature/feature_dataset.py +++ b/mmi_fairseq/feature/data/feature_dataset.py @@ -2,7 +2,7 @@ import numpy as np from torch.utils.data import Dataset -from mmi_fairseq.feature.utils import sent_num_file, offsets_file, feature_file, warmup_mmap_file +from mmi_fairseq.feature.data.utils import sent_num_file, offsets_file, feature_file, warmup_mmap_file class FeatureDataset(Dataset): diff --git a/mmi_fairseq/feature/mmi_text_and_feature_dataset.py b/mmi_fairseq/feature/data/mmi_text_and_feature_dataset.py similarity index 98% rename from mmi_fairseq/feature/mmi_text_and_feature_dataset.py rename to mmi_fairseq/feature/data/mmi_text_and_feature_dataset.py index 00f760f..0bde599 100644 --- a/mmi_fairseq/feature/mmi_text_and_feature_dataset.py +++ 
b/mmi_fairseq/feature/data/mmi_text_and_feature_dataset.py @@ -13,7 +13,7 @@ import numpy as np import torch from fairseq.data.fairseq_dataset import FairseqDataset -from mmi_fairseq.feature.feature_dataset import FeatureDataset +from mmi_fairseq.feature.data.feature_dataset import FeatureDataset from fairseq.data import data_utils @@ -81,6 +81,7 @@ def collater(self, samples): """Merge a list of samples to form a mini-batch.""" if len(samples) == 0: return {} + indices = [] source_imgs = [] source_texts = [] @@ -89,7 +90,6 @@ def collater(self, samples): targets = [] target_ntokens = 0 - num_sentences = 0 for sample in samples: index = sample['id'] diff --git a/mmi_fairseq/feature/data/mmi_text_and_object_dataset.py b/mmi_fairseq/feature/data/mmi_text_and_object_dataset.py new file mode 100644 index 0000000..a27a66d --- /dev/null +++ b/mmi_fairseq/feature/data/mmi_text_and_object_dataset.py @@ -0,0 +1,142 @@ +# encoding: utf-8 +""" +@author: Yuxian Meng +@contact: yuxian_meng@shannonai.com + +@version: 1.0 +@file: text_and_image_dataset +@time: 2020/11/14 15:26 +@desc: Combine Text and Object Datasets + +""" + +import numpy as np +import torch +from fairseq.data.fairseq_dataset import FairseqDataset +from mmi_fairseq.feature.data.object_dataset import ObjectDataset +from fairseq.data import data_utils + + +class MMITextObjectDataset(FairseqDataset): + """ + A combine of text dataset and object dataset + """ + def __init__(self, image_dataset: ObjectDataset, text_dataset, vocab_dict, span_idxs, shuffle=False): + self.img_dataset = image_dataset + self.text_dataset = text_dataset + self.vocab_dict = vocab_dict + self.span_idxs = span_idxs + self.shuffle = shuffle + self.max_obj = image_dataset.max_obj + + def __getitem__(self, index): + # todo: try to add [bos] at the beginning of text sequence to separate objects/texts + is_true, start_idx, end_idx = self.span_idxs[index].tolist() + objects, objects_mask = self.img_dataset[start_idx] # max_obj * dim, max_obj + source_texts = self.text_dataset[end_idx] # sent_len + target = self.text_dataset[end_idx] # will not be computed + + return { + 'id': index, + 'is_true': is_true, + 'objects': objects, + 'objects_mask': objects_mask, + 'source_texts': source_texts, + 'target': torch.LongTensor(target) + } + + def __len__(self): + return len(self.span_idxs) + + def num_tokens(self, index): + """Return the number of tokens in a sample. This value is used to + enforce ``--max-tokens`` during batching.""" + is_true, start_idx, end_idx = self.span_idxs[index].tolist() + sum_tokens = len(self.text_dataset[start_idx]) + return sum_tokens + + def size(self, index): + """Return an example's size as a float or tuple. 
This value is used when + filtering a dataset with ``--max-positions``.""" + return self.num_tokens(index) + + def ordered_indices(self): + if self.shuffle: + indices = np.random.permutation(len(self)) + else: + indices = np.arange(len(self)) + # todo 添加bucket + # # Inspired by LanguagePairDataset.ordered_indices + # indices = indices[np.argsort(self.cap_ds.sizes[indices], kind='mergesort')] + # return indices[np.argsort(self.img_ds.sizes[indices], kind='mergesort')] + return indices + + def collater(self, samples): + """Merge a list of samples to form a mini-batch.""" + if len(samples) == 0: + return {} + + indices = [] + source_objects = [] + objects_mask = [] + source_texts = [] + source_lengths = [] + source_label = [] + targets = [] + + target_ntokens = 0 + + for sample in samples: + index = sample['id'] + indices.append(index) + + source_objects.append(sample["objects"]) + objects_mask.append(sample["objects_mask"]) + source_texts.append(torch.LongTensor(sample['source_texts'])) + source_lengths.append(len(sample['source_texts'])) + source_label.append(sample['is_true']) + + targets.append(sample['target']) + target_ntokens += len(sample["target"]) + num_sentences = len(samples) + + indices = torch.tensor(indices, dtype=torch.long) + + source_label_tensor = torch.tensor(source_label, dtype=torch.float) + + source_lengths_tensor = torch.tensor(source_lengths, dtype=torch.long) + + image_tensor = torch.tensor(source_objects, dtype=torch.float) + + mask_tensor = torch.tensor(objects_mask, dtype=torch.float) + + + + source_texts_batch = data_utils.collate_tokens(source_texts, + pad_idx=self.vocab_dict.pad(), + eos_idx=self.vocab_dict.eos(), + move_eos_to_beginning=False) + + target_batch = data_utils.collate_tokens(targets, + pad_idx=self.vocab_dict.pad(), + eos_idx=self.vocab_dict.eos(), + move_eos_to_beginning=False) + prev_target_batch = data_utils.collate_tokens(targets, + pad_idx=self.vocab_dict.pad(), + eos_idx=self.vocab_dict.eos(), + move_eos_to_beginning=True) + + return { + 'id': indices, + 'net_input': { + 'src_tokens': source_texts_batch, + 'src_label': source_label_tensor, + 'objs': image_tensor, + 'objs_mask': mask_tensor, + 'src_lengths': source_lengths_tensor, + 'prev_output_tokens': prev_target_batch, + }, + 'target': target_batch, + 'ntokens': target_ntokens, + 'nsentences': num_sentences, + } diff --git a/mmi_fairseq/feature/data/object_dataset.py b/mmi_fairseq/feature/data/object_dataset.py new file mode 100644 index 0000000..bdd6619 --- /dev/null +++ b/mmi_fairseq/feature/data/object_dataset.py @@ -0,0 +1,30 @@ +# encoding: utf-8 + +import numpy as np +from torch.utils.data import Dataset + +from mmi_fairseq.feature.data.utils import sent_num_file, offsets_file, object_file, object_mask_file, warmup_mmap_file + +class ObjectDataset(Dataset): + MAX_OBJ = 20 # max-obj in mmap file + """Load Object dataset""" + def __init__(self, data_dir, split="train", max_obj=20): + self.data_dir = data_dir + self.sent_num = np.load(sent_num_file(data_dir, split)) + self.offsets = np.load(offsets_file(data_dir, split)) + self.total_sent_num = self.offsets[-1] + self.sent_num[-1] + self.dim = 2048 # todo add x,y,w,h + self.max_obj = max_obj # max-obj when getting item + warmup_mmap_file(object_file(data_dir, split, 0)) + print(self.total_sent_num, self.MAX_OBJ, self.dim) + self.objects = np.memmap(object_file(data_dir, split, 0), dtype=np.float32, mode='r', + shape=(self.total_sent_num, self.MAX_OBJ, self.dim)) + warmup_mmap_file(object_mask_file(data_dir, split, 0)) + 
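FeatureDataset and this ObjectDataset rely on the same flat storage layout: {split}.sent_num.npy stores the number of sentences per dialogue group, {split}.offsets.npy stores the index of each group's first sentence, and the .mmap file holds one row per sentence, so the row count is offsets[-1] + sent_num[-1]. A toy sketch of producing such files for the image-feature variant, with illustrative sizes and file names:

import numpy as np

sent_num = np.array([3, 2])                 # two dialogue groups with 3 and 2 sentences
offsets = np.array([0, 3])                  # index of the first sentence of each group
total, dim = offsets[-1] + sent_num[-1], 1000

np.save("train.sent_num.npy", sent_num)
np.save("train.offsets.npy", offsets)
features = np.memmap("train.features.mmap", dtype="float32", mode="w+", shape=(total, dim))
features[:] = np.random.rand(total, dim)    # one feature vector per sentence
features.flush()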
self.objects_mask = np.memmap(object_mask_file(data_dir, split, 0), dtype=np.bool, mode='r', + shape=(self.total_sent_num, self.MAX_OBJ)) + + def __getitem__(self, item): + return self.objects[item][: self.max_obj], self.objects_mask[item][: self.max_obj] + + def __len__(self): + return self.total_sent_num \ No newline at end of file diff --git a/mmi_fairseq/feature/utils.py b/mmi_fairseq/feature/data/utils.py similarity index 100% rename from mmi_fairseq/feature/utils.py rename to mmi_fairseq/feature/data/utils.py diff --git a/mmi_fairseq/feature/Loss.py b/mmi_fairseq/feature/loss/Loss.py similarity index 97% rename from mmi_fairseq/feature/Loss.py rename to mmi_fairseq/feature/loss/Loss.py index 1aa2857..24abdb6 100644 --- a/mmi_fairseq/feature/Loss.py +++ b/mmi_fairseq/feature/loss/Loss.py @@ -21,6 +21,7 @@ def forward(self, model, sample, reduce=True): 3) logging outputs to display while training """ sample_size = sample['nsentences'] + #print(sample["net_input"]) loss, label = model(**sample["net_input"]) loss = -(label*torch.log(loss) + (1-label)*torch.log(1-loss)) #print(loss) diff --git a/mmi_fairseq/feature/loss/__init__.py b/mmi_fairseq/feature/loss/__init__.py new file mode 100644 index 0000000..c3049fa --- /dev/null +++ b/mmi_fairseq/feature/loss/__init__.py @@ -0,0 +1 @@ +from .Loss import Loss \ No newline at end of file diff --git a/mmi_fairseq/feature/model/__init__.py b/mmi_fairseq/feature/model/__init__.py new file mode 100644 index 0000000..d9a2b96 --- /dev/null +++ b/mmi_fairseq/feature/model/__init__.py @@ -0,0 +1,2 @@ +from .image_mmi_transformer import MMIImageTransformerModel +from .object_mmi_transformer import MMIObjectTransformerModel \ No newline at end of file diff --git a/mmi_fairseq/feature/image_mmi_transformer.py b/mmi_fairseq/feature/model/image_mmi_transformer.py similarity index 100% rename from mmi_fairseq/feature/image_mmi_transformer.py rename to mmi_fairseq/feature/model/image_mmi_transformer.py diff --git a/mmi_fairseq/feature/model/object_mmi_transformer.py b/mmi_fairseq/feature/model/object_mmi_transformer.py new file mode 100644 index 0000000..a85b98c --- /dev/null +++ b/mmi_fairseq/feature/model/object_mmi_transformer.py @@ -0,0 +1,212 @@ +# encoding: utf-8 +""" +@author: Yuxian Meng +@contact: yuxian_meng@shannonai.com + +@version: 1.0 +@file: transformer_encoder +@time: 2020/11/18 11:35 +@desc: Transformer encoder with src-tokens and img-features as inputs + +""" + +from typing import Optional + +import torch +import torch.nn as nn +from fairseq.models import ( + register_model, + register_model_architecture, +) +from fairseq.models.transformer import ( + TransformerModel, + TransformerEncoder, + EncoderOut, + base_architecture as transformer_base_architecture +) + +DEFAULT_MAX_SOURCE_POSITIONS = 1024 +DEFAULT_MAX_TARGET_POSITIONS = 1024 + +@register_model("mmi-obj-transformer") +class MMIObjectTransformerModel(TransformerModel): + """ + Transformer model from `"Attention Is All You Need" (Vaswani, et al, 2017) + `_. + + Args: + encoder (TransformerEncoder): the encoder + decoder (TransformerDecoder): the decoder + + The Transformer model provides the following named architectures and + command-line arguments: + + .. 
argparse:: + :ref: fairseq.models.transformer_parser + :prog: + """ + + def __init__(self, args, encoder, decoder): + super().__init__(args, encoder, decoder) + self.final = nn.Linear(in_features=args.encoder_embed_dim+args.img_dim, out_features=1, bias=True) + #self.final = nn.Linear(in_features=args.encoder_embed_dim, out_features=args.img_dim, bias=True) + #self.cos = nn.CosineSimilarity(dim=2) + + @staticmethod + def add_args(parser): + """Add model-specific arguments to the parser.""" + TransformerModel.add_args(parser) + parser.add_argument('--img-dim', type=int, metavar='N', default=2048, + help='image feature dimension') + + def forward(self, src_tokens, src_label, objs, objs_mask, src_lengths, prev_output_tokens, **kwargs): + """ + Run the forward pass for an encoder-decoder model. + + First feed a batch of source tokens through the encoder. Then, feed the + encoder output and previous decoder outputs (i.e., teacher forcing) to + the decoder to produce the next outputs:: + + encoder_out = self.encoder(src_tokens, src_lengths) + return self.decoder(prev_output_tokens, encoder_out) + + Args: + src_tokens (LongTensor): tokens in the source language of shape + `(batch, src_len)` + src_label (LongTensor): positive example or negative example + `(batch)` + objs (FloatTensor): images features in the source sentences + `(batch, max_obj, feature_dim)` + objs_mask (FloatTensor): mask file `(batch, max_obj)` + src_lengths (LongTensor): source sentence lengths of shape `(batch)` + prev_output_tokens (LongTensor): previous decoder outputs of shape + `(batch, tgt_len)`, for teacher forcing + + Returns: + output_: image_feature * text_feature, shape `(batch, sent_len)` + """ + encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs) + ''' + x = self.final(encoder_out.encoder_out).transpose(0, 1) # T * B * C -> B * T * C + src_imgs = torch.unsqueeze(src_imgs, dim=1) + src_imgs = src_imgs.expand(x.shape[0], x.shape[1], x.shape[2]) + #print(src_label) + #print(self.cos(x, src_imgs)) + #output_ = torch.nn.functional.sigmoid(torch.matmul(x, src_imgs).squeeze(dim=-1)) * (1-encoder_out.encoder_padding_mask.float()) # B * T + output_ = torch.nn.functional.sigmoid(self.cos(x, src_imgs)) * (1-encoder_out.encoder_padding_mask.float()) # B * T + #print(output_.sum(dim=-1)/src_lengths) + #print(output_) + return output_.sum(dim=-1)/src_lengths, src_label + ''' + x = encoder_out.encoder_out.transpose(0, 1) # T * B * C -> B * T * C + object_mask = objs_mask.unsqueeze(dim=-1) # B * max_obj * feature_dim + objs = objs * object_mask # B * max_obj * feature_dim + src_imgs = objs.sum(dim=1)/objs_mask.sum(dim=-1).unsqueeze(dim=-1) # B * feature_dim + src_imgs = torch.unsqueeze(src_imgs, dim=1) + src_imgs = src_imgs.expand(x.shape[0], x.shape[1], src_imgs.shape[2]) + feature = torch.nn.functional.sigmoid(self.final(torch.cat((x, src_imgs), dim=-1)).squeeze(dim=-1)) * (1-encoder_out.encoder_padding_mask.float()) # B * T + return feature.sum(dim=-1)/src_lengths, src_label + + @classmethod + def build_encoder(cls, args, src_dict, embed_tokens): + return MMIObjectTransformerEncoder(args, src_dict, embed_tokens) + + +class MMIObjectTransformerEncoder(TransformerEncoder): + """ + Transformer encoder consisting of *args.encoder_layers* layers. Each layer + is a :class:`TransformerEncoderLayer`. 
+ + Args: + args (argparse.Namespace): parsed command-line arguments + dictionary (~fairseq.data.Dictionary): encoding dictionary + embed_tokens (torch.nn.Embedding): input embedding + """ + + def __init__(self, args, dictionary, embed_tokens): + super().__init__(args, dictionary, embed_tokens) + + self.img_dim = args.img_dim + + def forward_embedding( + self, src_tokens, token_embedding: Optional[torch.Tensor] = None + ): + # embed tokens and positions + if token_embedding is None: + token_embedding = self.embed_tokens(src_tokens) + x = embed = self.embed_scale * token_embedding + + if self.embed_positions is not None: + x = embed + self.embed_positions(src_tokens) + if self.layernorm_embedding is not None: + x = self.layernorm_embedding(x) + x = self.dropout_module(x) + if self.quant_noise is not None: + x = self.quant_noise(x) + return x, embed + + def forward( + self, + src_tokens, + src_lengths, + cls_input=None, + return_all_hiddens=False, + token_embeddings: Optional[torch.Tensor] = None, + ): + """ + Args: + src_tokens (LongTensor): tokens in the source language of shape + `(batch, src_len)` + src_lengths (torch.LongTensor): lengths of each source sentence of + shape `(batch)` + return_all_hiddens (bool, optional): also return all of the + intermediate hidden states (default: False). + token_embeddings (torch.Tensor, optional): precomputed embeddings + default `None` will recompute embeddings + + Returns: + namedtuple: + - **encoder_out** (Tensor): the last encoder layer's output of + shape `(src_len, batch, embed_dim)` + - **encoder_padding_mask** (ByteTensor): the positions of + padding elements of shape `(batch, src_len)` + - **encoder_embedding** (Tensor): the (scaled) embedding lookup + of shape `(batch, src_len, embed_dim)` + - **encoder_states** (List[Tensor]): all intermediate + hidden states of shape `(src_len, batch, embed_dim)`. + Only populated if *return_all_hiddens* is True. 
+ """ + x, encoder_embedding = self.forward_embedding(src_tokens, token_embeddings) + + # B x T x C -> T x B x C + x = x.transpose(0, 1) + + # compute padding mask + encoder_padding_mask = src_tokens.eq(self.padding_idx) + + encoder_states = [] if return_all_hiddens else None + + # encoder layers + for layer in self.layers: + x = layer(x, encoder_padding_mask) + if return_all_hiddens: + assert encoder_states is not None + encoder_states.append(x) + + if self.layer_norm is not None: + x = self.layer_norm(x) + + return EncoderOut( + encoder_out=x, # T x B x C + encoder_padding_mask=encoder_padding_mask, # B x T + encoder_embedding=encoder_embedding, # B x T x C + encoder_states=encoder_states, # List[T x B x C] + src_tokens=None, + src_lengths=None, + ) + + +@register_model_architecture('mmi-obj-transformer', 'baseline-mmi-obj-transformer') +def base_architecture(args): + transformer_base_architecture(args) + diff --git a/mmi_fairseq/feature/generate.py b/mmi_fairseq/feature/scrtpts/generate.py similarity index 100% rename from mmi_fairseq/feature/generate.py rename to mmi_fairseq/feature/scrtpts/generate.py diff --git a/mmi_fairseq/feature/test.sh b/mmi_fairseq/feature/scrtpts/test_image.sh similarity index 100% rename from mmi_fairseq/feature/test.sh rename to mmi_fairseq/feature/scrtpts/test_image.sh diff --git a/mmi_fairseq/feature/train.sh b/mmi_fairseq/feature/scrtpts/train_image.sh similarity index 97% rename from mmi_fairseq/feature/train.sh rename to mmi_fairseq/feature/scrtpts/train_image.sh index 464e716..5b5aeb2 100644 --- a/mmi_fairseq/feature/train.sh +++ b/mmi_fairseq/feature/scrtpts/train_image.sh @@ -25,7 +25,7 @@ CUDA_VISIBLE_DEVICES=3 fairseq-train \ --dropout $DROPOUT \ --optimizer adam \ --max-tokens 100000 \ - --batch-size 5 \ + --batch-size 150 \ --adam-betas "(0.9,0.999)" \ --reset-optimizer \ --criterion base-loss \ diff --git a/mmi_fairseq/feature/scrtpts/train_object.sh b/mmi_fairseq/feature/scrtpts/train_object.sh new file mode 100644 index 0000000..4324687 --- /dev/null +++ b/mmi_fairseq/feature/scrtpts/train_object.sh @@ -0,0 +1,36 @@ +# Note that fairseq may use all gpus on your machine and the actual batch-size is times by n_gpus. +# If you use multiple gpus, batch_size should be devided by number of gpus. 
+ +# hyper-params +LR=3e-4 +DROPOUT=0.1 +LAYER=3 +WARMUP=6000 + +# directory to save models +MODEL_DIR="/home/wangshuhe/shuhework/OpenViDial/mmi_test" +# data directory +DATA_DIR="/data/wangshuhe/test_mmi" +TYPE="objects" + +CUDA_VISIBLE_DEVICES=3 fairseq-train \ + --save-dir $MODEL_DIR \ + --user-dir mmi_fairseq \ + --task mmi-video-dialogue \ + --img-type $TYPE \ + --data-dir $DATA_DIR \ + --arch baseline-mmi-obj-transformer \ + --encoder-layers $LAYER \ + --encoder-embed-dim 512 \ + --dropout $DROPOUT \ + --optimizer adam \ + --max-tokens 100000 \ + --batch-size 150 \ + --adam-betas "(0.9,0.999)" \ + --reset-optimizer \ + --criterion base-loss \ + --lr $LR \ + --lr-scheduler inverse_sqrt --warmup-init-lr 1e-07 --warmup-updates $WARMUP \ + --max-epoch 20 \ + --keep-last-epochs 5 \ + --ddp-backend=no_c10d \ No newline at end of file diff --git a/mmi_fairseq/feature/tasks/__init__.py b/mmi_fairseq/feature/tasks/__init__.py new file mode 100644 index 0000000..ea407ec --- /dev/null +++ b/mmi_fairseq/feature/tasks/__init__.py @@ -0,0 +1 @@ +from .mmi_video_dialogue_task import MMIVideoDialogueTask \ No newline at end of file diff --git a/mmi_fairseq/feature/mmi_video_dialogue_task.py b/mmi_fairseq/feature/tasks/mmi_video_dialogue_task.py similarity index 84% rename from mmi_fairseq/feature/mmi_video_dialogue_task.py rename to mmi_fairseq/feature/tasks/mmi_video_dialogue_task.py index 77e115a..0ad2d71 100644 --- a/mmi_fairseq/feature/mmi_video_dialogue_task.py +++ b/mmi_fairseq/feature/tasks/mmi_video_dialogue_task.py @@ -4,12 +4,12 @@ import random import torch from fairseq.data import Dictionary, data_utils -from mmi_fairseq.feature.utils import text_bin_file +from mmi_fairseq.feature.data.utils import text_bin_file from fairseq.tasks import register_task, FairseqTask -from mmi_fairseq.feature.feature_dataset import FeatureDataset -from mmi_fairseq.feature.mmi_text_and_feature_dataset import MMITextImageDataset -#from video_dialogue_model.data.text_and_object_dataset import TextObjectDataset -#from video_dialogue_model.data.object_dataset import ObjectDataset +from mmi_fairseq.feature.data.feature_dataset import FeatureDataset +from mmi_fairseq.feature.data.mmi_text_and_feature_dataset import MMITextImageDataset +from mmi_fairseq.feature.data.object_dataset import ObjectDataset +from mmi_fairseq.feature.data.mmi_text_and_object_dataset import MMITextObjectDataset @register_task('mmi-video-dialogue') @@ -47,25 +47,24 @@ def load_feature_dataset(self, split, **kwargs): vocab_dict=self.vocab_dict, span_idxs=span_idxs, shuffle=True if split == "train" else False) - ''' + def load_text_object_dataset(self, split, **kwargs): objects_dataset = ObjectDataset(self.args.data_dir, split, max_obj=self.args.max_obj) - span_idxs = self.item2span_idxs(sent_num=objects_dataset.sent_num, - max_src_sent=self.args.max_src_sent) + span_idxs = self.get_span_info(sent_num=objects_dataset.sent_num, split=split) text_file = text_bin_file(self.args.data_dir, split) # os.path.join(self.args.data_dir, split) text_dataset = data_utils.load_indexed_dataset(text_file, self.vocab_dict) - self.datasets[split] = TextObjectDataset(text_dataset=text_dataset, + self.datasets[split] = MMITextObjectDataset(text_dataset=text_dataset, image_dataset=objects_dataset, vocab_dict=self.vocab_dict, span_idxs=span_idxs, shuffle=True if split == "train" else False) - ''' + def load_dataset(self, split, **kwargs): if self.args.img_type == "features": return self.load_feature_dataset(split, **kwargs) - return 
self.load_feature_dataset(split, **kwargs) + return self.load_text_object_dataset(split, **kwargs) @staticmethod def get_span_info(sent_num: np.array, split) -> np.array: @@ -74,7 +73,11 @@ def get_span_info(sent_num: np.array, split) -> np.array: For example, if we got [[0,1,2], [3,4]] as source texts, then return [[0, 0, 2], [1, 3, 4]] """ + ''' + 测试时还需要改一下,现在没有结合反向文本 + ''' max_num = sum(int(sent_num[group_idx]) for group_idx in range(sent_num.shape[0])) + print(max_num) span_idxs = [] start_idx = 0 for group_idx in range(sent_num.shape[0]): diff --git a/mmi_model/feature/config.py b/old_mmi/feature/config.py similarity index 100% rename from mmi_model/feature/config.py rename to old_mmi/feature/config.py diff --git a/mmi_model/feature/data.py b/old_mmi/feature/data.py similarity index 100% rename from mmi_model/feature/data.py rename to old_mmi/feature/data.py diff --git a/mmi_model/feature/model.py b/old_mmi/feature/model.py similarity index 100% rename from mmi_model/feature/model.py rename to old_mmi/feature/model.py diff --git a/mmi_model/feature/optim.py b/old_mmi/feature/optim.py similarity index 100% rename from mmi_model/feature/optim.py rename to old_mmi/feature/optim.py diff --git a/mmi_model/feature/test.py b/old_mmi/feature/test.py similarity index 100% rename from mmi_model/feature/test.py rename to old_mmi/feature/test.py diff --git a/mmi_model/feature/train.py b/old_mmi/feature/train.py similarity index 100% rename from mmi_model/feature/train.py rename to old_mmi/feature/train.py diff --git a/mmi_model/feature/utils.py b/old_mmi/feature/utils.py similarity index 100% rename from mmi_model/feature/utils.py rename to old_mmi/feature/utils.py diff --git a/mmi_model/feature/vocab.py b/old_mmi/feature/vocab.py similarity index 100% rename from mmi_model/feature/vocab.py rename to old_mmi/feature/vocab.py diff --git a/mmi_model/object/config.py b/old_mmi/object/config.py similarity index 100% rename from mmi_model/object/config.py rename to old_mmi/object/config.py diff --git a/mmi_model/object/data.py b/old_mmi/object/data.py similarity index 100% rename from mmi_model/object/data.py rename to old_mmi/object/data.py diff --git a/mmi_model/object/model.py b/old_mmi/object/model.py similarity index 100% rename from mmi_model/object/model.py rename to old_mmi/object/model.py diff --git a/mmi_model/object/optim.py b/old_mmi/object/optim.py similarity index 100% rename from mmi_model/object/optim.py rename to old_mmi/object/optim.py diff --git a/mmi_model/object/test.py b/old_mmi/object/test.py similarity index 100% rename from mmi_model/object/test.py rename to old_mmi/object/test.py diff --git a/mmi_model/object/train.py b/old_mmi/object/train.py similarity index 100% rename from mmi_model/object/train.py rename to old_mmi/object/train.py diff --git a/mmi_model/object/utils.py b/old_mmi/object/utils.py similarity index 100% rename from mmi_model/object/utils.py rename to old_mmi/object/utils.py diff --git a/mmi_model/object/vocab.py b/old_mmi/object/vocab.py similarity index 100% rename from mmi_model/object/vocab.py rename to old_mmi/object/vocab.py diff --git a/mmi_model/preprocess/char2id.py b/old_mmi/preprocess/char2id.py similarity index 100% rename from mmi_model/preprocess/char2id.py rename to old_mmi/preprocess/char2id.py diff --git a/mmi_model/preprocess/get_dict.py b/old_mmi/preprocess/get_dict.py similarity index 100% rename from mmi_model/preprocess/get_dict.py rename to old_mmi/preprocess/get_dict.py From f530b1e0d99eb999fc7f7785b856b600c462cbfc Mon Sep 17 00:00:00 
2001 From: wangshuhe Date: Sat, 10 Apr 2021 15:52:44 +0800 Subject: [PATCH 4/7] feature loss --- mmi_fairseq/feature/loss/Loss.py | 2 +- mmi_fairseq/feature/model/image_mmi_transformer.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/mmi_fairseq/feature/loss/Loss.py b/mmi_fairseq/feature/loss/Loss.py index 24abdb6..07ec6a7 100644 --- a/mmi_fairseq/feature/loss/Loss.py +++ b/mmi_fairseq/feature/loss/Loss.py @@ -23,7 +23,7 @@ def forward(self, model, sample, reduce=True): sample_size = sample['nsentences'] #print(sample["net_input"]) loss, label = model(**sample["net_input"]) - loss = -(label*torch.log(loss) + (1-label)*torch.log(1-loss)) + loss = -(label*loss + (1-label)*torch.log(1-torch.exp(loss))) #print(loss) loss = loss.sum(dim=-1)/sample_size diff --git a/mmi_fairseq/feature/model/image_mmi_transformer.py b/mmi_fairseq/feature/model/image_mmi_transformer.py index a8b4d4b..4d1d43d 100644 --- a/mmi_fairseq/feature/model/image_mmi_transformer.py +++ b/mmi_fairseq/feature/model/image_mmi_transformer.py @@ -98,7 +98,8 @@ def forward(self, src_tokens, src_label, src_imgs, src_lengths, prev_output_toke x = encoder_out.encoder_out.transpose(0, 1) # T * B * C -> B * T * C src_imgs = torch.unsqueeze(src_imgs, dim=1) src_imgs = src_imgs.expand(x.shape[0], x.shape[1], src_imgs.shape[2]) - feature = torch.nn.functional.sigmoid(self.final(torch.cat((x, src_imgs), dim=-1)).squeeze(dim=-1)) * (1-encoder_out.encoder_padding_mask.float()) # B * T + feature = torch.nn.functional.sigmoid(self.final(torch.cat((x, src_imgs), dim=-1)).squeeze(dim=-1)) * (1-encoder_out.encoder_padding_mask.float()) # B * T + feature = torch.log(feature) return feature.sum(dim=-1)/src_lengths, src_label @classmethod From 750ae5595fb4ab78fc260d6eb0d0e09ed11db543 Mon Sep 17 00:00:00 2001 From: wangshuhe Date: Sat, 10 Apr 2021 16:36:27 +0800 Subject: [PATCH 5/7] change loss --- mmi_fairseq/feature/data/mmi_text_and_feature_dataset.py | 3 +++ mmi_fairseq/feature/model/image_mmi_transformer.py | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/mmi_fairseq/feature/data/mmi_text_and_feature_dataset.py b/mmi_fairseq/feature/data/mmi_text_and_feature_dataset.py index 0bde599..d7f5663 100644 --- a/mmi_fairseq/feature/data/mmi_text_and_feature_dataset.py +++ b/mmi_fairseq/feature/data/mmi_text_and_feature_dataset.py @@ -116,6 +116,8 @@ def collater(self, samples): pad_idx=self.vocab_dict.pad(), eos_idx=self.vocab_dict.eos(), move_eos_to_beginning=False) + + mask_ones = torch.ones((source_texts_batch.shape[0], source_texts_batch.shape[1]), dtype=torch.float) # B * T target_batch = data_utils.collate_tokens(targets, pad_idx=self.vocab_dict.pad(), @@ -130,6 +132,7 @@ def collater(self, samples): 'id': indices, 'net_input': { 'src_tokens': source_texts_batch, + 'mask_ones': mask_ones, 'src_label': source_label_tensor, 'src_imgs': image_tensor, 'src_lengths': source_lengths_tensor, diff --git a/mmi_fairseq/feature/model/image_mmi_transformer.py b/mmi_fairseq/feature/model/image_mmi_transformer.py index 4d1d43d..ad4c345 100644 --- a/mmi_fairseq/feature/model/image_mmi_transformer.py +++ b/mmi_fairseq/feature/model/image_mmi_transformer.py @@ -59,7 +59,7 @@ def add_args(parser): parser.add_argument('--img-dim', type=int, metavar='N', default=1000, help='image feature dimension') - def forward(self, src_tokens, src_label, src_imgs, src_lengths, prev_output_tokens, **kwargs): + def forward(self, src_tokens, mask_ones, src_label, src_imgs, src_lengths, prev_output_tokens, **kwargs): """ Run the forward 
pass for an encoder-decoder model. @@ -99,6 +99,7 @@ def forward(self, src_tokens, src_label, src_imgs, src_lengths, prev_output_toke src_imgs = torch.unsqueeze(src_imgs, dim=1) src_imgs = src_imgs.expand(x.shape[0], x.shape[1], src_imgs.shape[2]) feature = torch.nn.functional.sigmoid(self.final(torch.cat((x, src_imgs), dim=-1)).squeeze(dim=-1)) * (1-encoder_out.encoder_padding_mask.float()) # B * T + feature = feature + encoder_out.encoder_padding_mask.float()*mask_ones # B * T feature = torch.log(feature) return feature.sum(dim=-1)/src_lengths, src_label From 3317e5916e6ea7ca83e1837d95dce0641a029191 Mon Sep 17 00:00:00 2001 From: wangshuhe Date: Mon, 12 Apr 2021 23:57:29 +0800 Subject: [PATCH 6/7] add generate --- .../data/mmi_text_and_object_dataset.py | 3 + .../feature/model/object_mmi_transformer.py | 6 +- .../feature/scrtpts/combine_new_test.py | 47 ++++++++++ mmi_fairseq/feature/scrtpts/generate.py | 9 +- .../feature/scrtpts/mmi_feature_generate.sh | 94 +++++++++++++++++++ scripts/mmi/combine_bidirectional_score.py | 30 +++++- 6 files changed, 183 insertions(+), 6 deletions(-) create mode 100644 mmi_fairseq/feature/scrtpts/combine_new_test.py create mode 100644 mmi_fairseq/feature/scrtpts/mmi_feature_generate.sh diff --git a/mmi_fairseq/feature/data/mmi_text_and_object_dataset.py b/mmi_fairseq/feature/data/mmi_text_and_object_dataset.py index a27a66d..f85a78f 100644 --- a/mmi_fairseq/feature/data/mmi_text_and_object_dataset.py +++ b/mmi_fairseq/feature/data/mmi_text_and_object_dataset.py @@ -117,6 +117,8 @@ def collater(self, samples): eos_idx=self.vocab_dict.eos(), move_eos_to_beginning=False) + mask_ones = torch.ones((source_texts_batch.shape[0], source_texts_batch.shape[1]), dtype=torch.float) # B * T + target_batch = data_utils.collate_tokens(targets, pad_idx=self.vocab_dict.pad(), eos_idx=self.vocab_dict.eos(), @@ -130,6 +132,7 @@ def collater(self, samples): 'id': indices, 'net_input': { 'src_tokens': source_texts_batch, + 'mask_ones': mask_ones, 'src_label': source_label_tensor, 'objs': image_tensor, 'objs_mask': mask_tensor, diff --git a/mmi_fairseq/feature/model/object_mmi_transformer.py b/mmi_fairseq/feature/model/object_mmi_transformer.py index a85b98c..07471f3 100644 --- a/mmi_fairseq/feature/model/object_mmi_transformer.py +++ b/mmi_fairseq/feature/model/object_mmi_transformer.py @@ -59,7 +59,7 @@ def add_args(parser): parser.add_argument('--img-dim', type=int, metavar='N', default=2048, help='image feature dimension') - def forward(self, src_tokens, src_label, objs, objs_mask, src_lengths, prev_output_tokens, **kwargs): + def forward(self, src_tokens, mask_ones, src_label, objs, objs_mask, src_lengths, prev_output_tokens, **kwargs): """ Run the forward pass for an encoder-decoder model. 
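Patches 4 to 6 move the sentence-level match score into log space: the per-token sigmoid output is zeroed at padding positions, the padded positions are then reset to 1 through the all-ones mask_ones tensor so that their log is exactly 0, and the token-level log scores are summed and length-normalized. A small sketch of that masking-then-log step with toy tensors; scores stands in for the sigmoid output of the scoring layer:

import torch

scores = torch.tensor([[0.9, 0.8, 0.7],
                       [0.6, 0.5, 0.4]])            # sigmoid outputs, B x T
pad_mask = torch.tensor([[0., 0., 1.],
                         [0., 0., 0.]])             # 1 where the token is padding
src_lengths = torch.tensor([2., 3.])
mask_ones = torch.ones_like(scores)

feature = scores * (1 - pad_mask)                   # zero out padded positions
feature = feature + pad_mask * mask_ones            # padded positions become exactly 1
log_score = torch.log(feature).sum(dim=-1) / src_lengths   # padding contributes log(1) = 0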
@@ -104,7 +104,9 @@ def forward(self, src_tokens, src_label, objs, objs_mask, src_lengths, prev_outp src_imgs = objs.sum(dim=1)/objs_mask.sum(dim=-1).unsqueeze(dim=-1) # B * feature_dim src_imgs = torch.unsqueeze(src_imgs, dim=1) src_imgs = src_imgs.expand(x.shape[0], x.shape[1], src_imgs.shape[2]) - feature = torch.nn.functional.sigmoid(self.final(torch.cat((x, src_imgs), dim=-1)).squeeze(dim=-1)) * (1-encoder_out.encoder_padding_mask.float()) # B * T + feature = torch.nn.functional.sigmoid(self.final(torch.cat((x, src_imgs), dim=-1)).squeeze(dim=-1)) * (1-encoder_out.encoder_padding_mask.float()) # B * T + feature = feature + encoder_out.encoder_padding_mask.float()*mask_ones # B * T + feature = torch.log(feature) return feature.sum(dim=-1)/src_lengths, src_label @classmethod diff --git a/mmi_fairseq/feature/scrtpts/combine_new_test.py b/mmi_fairseq/feature/scrtpts/combine_new_test.py new file mode 100644 index 0000000..b47fbc8 --- /dev/null +++ b/mmi_fairseq/feature/scrtpts/combine_new_test.py @@ -0,0 +1,47 @@ +# encoding: utf-8 + +import argparse +import numpy as np +import os + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--src-dir", type=str, help="origin test dir") + parser.add_argument("--nbest-file", type=str, help="nbest file generated by generate.py") + parser.add_argument("--target-dir", type=str, help="target dir to store") + args = parser.parse_args() + + sent_num = np.load(os.path.join(args.src_dir, "test.sent_num.npy")) + src_file = [] + nbest_file = [] + + print(f"reading src-file from {args.src_dir} and {args.nbest_file}") + with open(os.path.join(args.src_dir, "test.src.txt"), "r") as f: + for line in f: + src_file.append(f) + f.close() + + with open(args.nbest_file, "r") as f: + for line in f: + nbest_file.append(f) + f.close() + + start_ = 0 + n_cnt = 0 + for group_idx in range(sent_num.shape[0]): + num = int(sent_num[group_idx]) + i = 1 + while (i <= start_+num-1): + src_file[i] = nbest_file[n_cnt] + n_cnt += 1 + + print(f"writing to {args.target_dir}") + with open(args.target_dir, "w") as f: + for line in nbest_file: + f.write(line) + f.close() + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/mmi_fairseq/feature/scrtpts/generate.py b/mmi_fairseq/feature/scrtpts/generate.py index 69c0b4b..03613bc 100644 --- a/mmi_fairseq/feature/scrtpts/generate.py +++ b/mmi_fairseq/feature/scrtpts/generate.py @@ -150,13 +150,18 @@ def _main(args, output_file): models, sample ) - print(loss) output.append(loss) - return output + + with open(args.score_target_file, "w") as f: + for loss in output: + for i in range(loss.shap[0]): + f.write(str(loss[i].float())) + f.close() def cli_main(): parser = options.get_generation_parser() parser.add_argument("--print-attention", action="store_true", help="print attention matrix as jsonline") + parser.add_argument("--score-target-file", type=str, help="target file to store score") args = options.parse_args_and_arch(parser) main(args) diff --git a/mmi_fairseq/feature/scrtpts/mmi_feature_generate.sh b/mmi_fairseq/feature/scrtpts/mmi_feature_generate.sh new file mode 100644 index 0000000..fa7626b --- /dev/null +++ b/mmi_fairseq/feature/scrtpts/mmi_feature_generate.sh @@ -0,0 +1,94 @@ +export CUDA_VISIBLE_DEVICES=0 + +# 1. 
normal generation with nbest list + +DATA_DIR="/userhome/yuxian/data/video/preprocessed_data" +MODEL_DIR="/userhome/shuhe/movie_plus/pre_feature/OpenViDial/feature_result" +TYPE="features" +MODEL_PATH="${MODEL_DIR}/checkpoint_best.pt" +NBEST=10 +BEAM=10 +SUBSET="test" +NBEST_FILE="${MODEL_DIR}/${SUBSET}_gen.out.${NBEST}best" + + +python ./train/generate.py \ + --user-dir video_dialogue_model \ + --task video-dialogue \ + --img-type $TYPE \ + --data-dir $DATA_DIR \ + --path $MODEL_PATH \ + --beam $BEAM \ + --batch-size 16 \ + --remove-bpe \ + --gen-subset $SUBSET \ + --nbest $NBEST \ + --quiet \ + >$NBEST_FILE 2>&1 & tail -f $NBEST_FILE 2>&1 + +# 2. split nbest to different directorys +NBEST_DIR="${MODEL_DIR}/${SUBSET}_best${NBEST}" +python ./scripts/mmi/split_nbest.py \ +--nbest-file $NBEST_FILE \ +--target-dir $NBEST_DIR \ +--nbest $NBEST + +echo "copy ..." +cp $DATA_DIR/test.features.mmap $NBEST_DIR/ + +# 3. score backwardly +codes_file="/data/yuxian/datasets/new-video/preprocessed_data/codes.30000.bpe" +dict_file="/data/yuxian/datasets/new-video/preprocessed_data/dict.txt" +backward_model="/data/yuxian/datasets/new-video/mmi_text/checkpoint_best.pt" + +for sub_dir in $(ls ${NBEST_DIR}); do + sub_dir="${NBEST_DIR}/${sub_dir}" + echo "compute backward score of ${sub_dir}" + python ./mmi_fairseq/feature/scrtpts/combine_new_test.py \ + --src-dir $DATA_DIR \ + --nbest-file $sub_dir/src-tgt.src \ + --target-dir $sub_dir/test_feature.src.txt + + subword-nmt apply-bpe -c ${codes_file} < $sub_dir/test_feature.src.txt > $sub_dir/test_feature_bpe.src + + fairseq-preprocess --source-lang src --srcdict $dict_file \ + --only-source \ + --testpref "${sub_dir}/test_feature_bpe.src" \ + --workers 8 --destdir $NBEST_DIR + + # backward generation + out_file="${sub_dir}/gen.out" + python ./mmi_fairseq/feature/generate.py \ + --user-dir mmi_fairseq \ + --task mmi-video-dialogue \ + --img-type $TYPE \ + --data-dir $NBEST_DIR \ + --path $backward_model \ + --batch-size 32 \ + --gen-subset "test" \ + --score-target-file $sub_dir/scores.backward + +done + +# 4. weight average score.forward and score.backward for MMI generation +ALPHA1=0.4 +ALPHA2=0.3 +ALPHA3=0.3 +BIRECTION_OUTPUT="${NBEST_DIR}/bidirection${ALPHA}.out" +python scripts/mmi/combine_bidirectional_score.py \ + --nbest-dir=/userhome/shuhe/movie_plus/pre_feature/OpenViDial/mmi_small_text \ + --nbest-dir-feature $NBEST \ + --type feature \ + --output-file=$BIRECTION_OUTPUT \ + --alpha $ALPHA1 \ + --alpha-2 $ALPHA2 \ + --alpha-3 $ALPHA3 + +# 5. 
grep reference from output-file and score +SYS_OUTPUT=${BIRECTION_OUTPUT} +REFERENCE="${MODEL_DIR}/${SUBSET}_gen.ref" +grep ^T $NBEST_FILE | cut -f2- > $REFERENCE + +fairseq-score \ +-s $SYS_OUTPUT \ +-r $REFERENCE \ No newline at end of file diff --git a/scripts/mmi/combine_bidirectional_score.py b/scripts/mmi/combine_bidirectional_score.py index 5d8f136..4ad96ed 100644 --- a/scripts/mmi/combine_bidirectional_score.py +++ b/scripts/mmi/combine_bidirectional_score.py @@ -41,16 +41,31 @@ def load_scores(sub_dirs: List[str], split="forward") -> np.array: return np.array(scores) -def combine_score(forward_score, backward_score, alpha=1): +def combine_score_only_text(forward_score, backward_score, alpha=1): return forward_score + alpha * backward_score +def combine_score_feature(forward_score, text_score, feature_score, alpha=0, alpha_2=0, alpha_3=0): + return alpha*forward_score + alpha_2*text_score + alpha_3*feature_score + +def combine_score_object(forward_score, text_score, feature_score, object_score, alpha=0, alpha_2=0, alpha_3=0, alpha_4=0): + return alpha*forward_score + alpha_2*text_score + alpha_3*feature_score + alpha_4*object_score + def main(): parser = argparse.ArgumentParser() parser.add_argument("--nbest-dir", type=str, help="nbest directory, which should contain rank1, .. rankn subdir") + parser.add_argument("--nbest-dir-feature", type=str, default=None) + parser.add_argument("--nbest-dir-object", type=str, default=None) + parser.add_argument("--type", type=str, default="text") parser.add_argument("--output-file", type=str, help="selected prediction from nbest list by forward+backward") parser.add_argument("--alpha", type=float, default=1.0, help="default weight of backward score") + parser.add_argument("--alpha-2", type=float, default=0, + help="default weight of backward score of text") + parser.add_argument("--alpha-3", type=float, default=0, + help="default weight of backward score of feature") + parser.add_argument("--alpha-4", type=float, default=0, + help="default weight of backward score of object") args = parser.parse_args() base_dir = args.nbest_dir @@ -58,7 +73,18 @@ def main(): forward_scores = load_scores(sub_dirs, split="forward") backward_scores = load_scores(sub_dirs, split="backward") - bidirection_scores = combine_score(forward_scores, backward_scores, args.alpha) # nbest, nsents + if (args.type == 'text'): + bidirection_scores = combine_score_only_text(forward_scores, backward_scores, args.alpha) + elif (args.type == 'feature'): + feature_dir = find_sub_dirs(args.nbest_dir_feature) + backward_feature_scores = load_scores(feature_dir, split="backward") + bidirection_scores = combine_score_feature(forward_scores, backward_scores, backward_feature_scores, args.alpha, args.alpha_2, args.alpha_3) + elif (args.type == 'object'): + feature_dir = find_sub_dirs(args.nbest_dir_feature) + object_dir = find_sub_dirs(args.nbest_dir_object) + backward_feature_scores = load_scores(feature_dir, split="backward") + backward_object_scores = load_scores(object_dir, split="backward") + bidirection_scores = combine_score_object(forward_scores, backward_scores, backward_feature_scores, backward_object_scores, args.alpha, args.alpha_2, args.alpha_3, args.alpha_4) best_idx = np.argmax(bidirection_scores, axis=0) print(f"compute {best_idx.shape[0]} bidirectional scores") From 511c16a4e5a0df706fdbf4465498514475aab042 Mon Sep 17 00:00:00 2001 From: wangshuhe Date: Tue, 20 Apr 2021 22:36:38 +0800 Subject: [PATCH 7/7] fix mmi bug --- .../feature/scrtpts/combine_new_test.py | 7 +- 
mmi_fairseq/feature/scrtpts/generate.py | 5 +- .../feature/scrtpts/mmi_feature_generate.sh | 80 +++++-- .../feature/scrtpts/mmi_object_generate.sh | 201 ++++++++++++++++++ mmi_fairseq/feature/scrtpts/train_image.sh | 3 +- scripts/mmi/combine_bidirectional_score.py | 182 +++++++++++++++- .../data/text_and_object_dataset.py | 15 +- .../model/object_transformer.py | 17 +- .../tasks/video_dialogue_task.py | 2 + 9 files changed, 484 insertions(+), 28 deletions(-) create mode 100644 mmi_fairseq/feature/scrtpts/mmi_object_generate.sh diff --git a/mmi_fairseq/feature/scrtpts/combine_new_test.py b/mmi_fairseq/feature/scrtpts/combine_new_test.py index b47fbc8..ae2f8c4 100644 --- a/mmi_fairseq/feature/scrtpts/combine_new_test.py +++ b/mmi_fairseq/feature/scrtpts/combine_new_test.py @@ -19,12 +19,12 @@ def main(): print(f"reading src-file from {args.src_dir} and {args.nbest_file}") with open(os.path.join(args.src_dir, "test.src.txt"), "r") as f: for line in f: - src_file.append(f) + src_file.append(line) f.close() with open(args.nbest_file, "r") as f: for line in f: - nbest_file.append(f) + nbest_file.append(line) f.close() start_ = 0 @@ -34,11 +34,12 @@ def main(): i = 1 while (i <= start_+num-1): src_file[i] = nbest_file[n_cnt] + i += 1 n_cnt += 1 print(f"writing to {args.target_dir}") with open(args.target_dir, "w") as f: - for line in nbest_file: + for line in src_file: f.write(line) f.close() diff --git a/mmi_fairseq/feature/scrtpts/generate.py b/mmi_fairseq/feature/scrtpts/generate.py index 03613bc..97a952c 100644 --- a/mmi_fairseq/feature/scrtpts/generate.py +++ b/mmi_fairseq/feature/scrtpts/generate.py @@ -20,6 +20,7 @@ import torch from fairseq import checkpoint_utils, options, tasks, utils from fairseq.logging import progress_bar +import math def main(args): @@ -154,8 +155,8 @@ def _main(args, output_file): with open(args.score_target_file, "w") as f: for loss in output: - for i in range(loss.shap[0]): - f.write(str(loss[i].float())) + for i in range(loss.shape[0]): + f.write(str(float(loss[i])/math.log(2))+'\n') f.close() def cli_main(): diff --git a/mmi_fairseq/feature/scrtpts/mmi_feature_generate.sh b/mmi_fairseq/feature/scrtpts/mmi_feature_generate.sh index fa7626b..002e955 100644 --- a/mmi_fairseq/feature/scrtpts/mmi_feature_generate.sh +++ b/mmi_fairseq/feature/scrtpts/mmi_feature_generate.sh @@ -23,7 +23,6 @@ python ./train/generate.py \ --remove-bpe \ --gen-subset $SUBSET \ --nbest $NBEST \ - --quiet \ >$NBEST_FILE 2>&1 & tail -f $NBEST_FILE 2>&1 # 2. split nbest to different directorys @@ -35,14 +34,19 @@ python ./scripts/mmi/split_nbest.py \ echo "copy ..." cp $DATA_DIR/test.features.mmap $NBEST_DIR/ +cp $DATA_DIR/test.offsets.npy $NBEST_DIR/ +cp $DATA_DIR/test.sent_num.npy $NBEST_DIR/ # 3. 
score backwardly -codes_file="/data/yuxian/datasets/new-video/preprocessed_data/codes.30000.bpe" -dict_file="/data/yuxian/datasets/new-video/preprocessed_data/dict.txt" -backward_model="/data/yuxian/datasets/new-video/mmi_text/checkpoint_best.pt" +codes_file="/userhome/yuxian/data/video/preprocessed_data/codes.30000.bpe" +dict_file="/userhome/yuxian/data/video/preprocessed_data/dict.txt" +backward_model="/userhome/shuhe/movie_plus/pre_feature/OpenViDial/mmi_small_feature/checkpoint_best.pt" for sub_dir in $(ls ${NBEST_DIR}); do sub_dir="${NBEST_DIR}/${sub_dir}" + if [ -f "$sub_dir" ]; then + continue + fi echo "compute backward score of ${sub_dir}" python ./mmi_fairseq/feature/scrtpts/combine_new_test.py \ --src-dir $DATA_DIR \ @@ -53,12 +57,16 @@ for sub_dir in $(ls ${NBEST_DIR}); do fairseq-preprocess --source-lang src --srcdict $dict_file \ --only-source \ - --testpref "${sub_dir}/test_feature_bpe.src" \ + --testpref "${sub_dir}/test_feature_bpe" \ --workers 8 --destdir $NBEST_DIR + + mv $NBEST_DIR/dict.src.txt $NBEST_DIR/dict.txt + mv $NBEST_DIR/test.src-None.src.bin $NBEST_DIR/test.bin + mv $NBEST_DIR/test.src-None.src.idx $NBEST_DIR/test.idx # backward generation out_file="${sub_dir}/gen.out" - python ./mmi_fairseq/feature/generate.py \ + python ./mmi_fairseq/feature/scrtpts/generate.py \ --user-dir mmi_fairseq \ --task mmi-video-dialogue \ --img-type $TYPE \ @@ -70,21 +78,65 @@ for sub_dir in $(ls ${NBEST_DIR}); do done -# 4. weight average score.forward and score.backward for MMI generation -ALPHA1=0.4 -ALPHA2=0.3 -ALPHA3=0.3 +# 4. text backward score +TEXT_NBEST_DIR="${MODEL_DIR}/${SUBSET}_best_text${NBEST}" +python scripts/mmi/text_only/split_nbest.py \ +--nbest-file $NBEST_FILE \ +--target-dir $TEXT_NBEST_DIR \ +--nbest $NBEST + +test_backward_model="/userhome/shuhe/movie_plus/pre_feature/OpenViDial/mmi_small_text/checkpoint_best.pt" + +for sub_dir in $(ls ${TEXT_NBEST_DIR}); do + sub_dir="${TEXT_NBEST_DIR}/${sub_dir}" + echo "compute text backward score of ${sub_dir}" + # apply bpe + for suffix in "src" "tgt"; do + fin="${sub_dir}/src-tgt.${suffix}" + fout="${sub_dir}/bpe.src-tgt.${suffix}" + echo "apply_bpe to ${fin} ..." + subword-nmt apply-bpe -c ${codes_file} < $fin > $fout + done + + # binarize + rm $sub_dir/dict* + fairseq-preprocess --source-lang src --target-lang tgt --srcdict $dict_file --tgtdict $dict_file \ + --testpref "${sub_dir}/bpe.src-tgt" \ + --workers 8 --destdir $sub_dir + + # backward generation + out_file="${sub_dir}/gen.out" + fairseq-generate \ + "${sub_dir}" \ + --score-reference \ + --batch-size 32 \ + --remove-bpe \ + --path $test_backward_model \ + --gen-subset "test" \ + > $out_file + + # extract backward score file + text_backward_score="${sub_dir}/scores.backward" + grep ^H "${out_file}" | cut -f 2 >"${text_backward_score}" + +done + + +# 5. weight average score.forward and score.backward for MMI generation +ALPHA1=0.7 +ALPHA2=0.05 +ALPHA3=0.25 BIRECTION_OUTPUT="${NBEST_DIR}/bidirection${ALPHA}.out" python scripts/mmi/combine_bidirectional_score.py \ - --nbest-dir=/userhome/shuhe/movie_plus/pre_feature/OpenViDial/mmi_small_text \ - --nbest-dir-feature $NBEST \ - --type feature \ + --nbest-dir=$TEXT_NBEST_DIR \ + --nbest-dir-feature=$NBEST_DIR \ + --type "feature" \ --output-file=$BIRECTION_OUTPUT \ --alpha $ALPHA1 \ --alpha-2 $ALPHA2 \ --alpha-3 $ALPHA3 -# 5. grep reference from output-file and score +# 6. 
grep reference from output-file and score SYS_OUTPUT=${BIRECTION_OUTPUT} REFERENCE="${MODEL_DIR}/${SUBSET}_gen.ref" grep ^T $NBEST_FILE | cut -f2- > $REFERENCE diff --git a/mmi_fairseq/feature/scrtpts/mmi_object_generate.sh b/mmi_fairseq/feature/scrtpts/mmi_object_generate.sh new file mode 100644 index 0000000..7916718 --- /dev/null +++ b/mmi_fairseq/feature/scrtpts/mmi_object_generate.sh @@ -0,0 +1,201 @@ +export CUDA_VISIBLE_DEVICES=0 + +# 1. normal generation with nbest list + +DATA_DIR="/userhome/yuxian/data/video/preprocessed_data" +MODEL_DIR="/userhome/shuhe/movie_plus/pre_feature/OpenViDial/object_result" +TYPE="objects" +MODEL_PATH="${MODEL_DIR}/checkpoint_best.pt" +NBEST=10 +BEAM=10 +SUBSET="test" +NBEST_FILE="${MODEL_DIR}/${SUBSET}_gen.out.${NBEST}best" + + +python ./train/generate.py \ + --user-dir video_dialogue_model \ + --task video-dialogue \ + --img-type $TYPE \ + --data-dir $DATA_DIR \ + --path $MODEL_PATH \ + --beam $BEAM \ + --batch-size 16 \ + --remove-bpe \ + --gen-subset $SUBSET \ + --nbest $NBEST \ + >$NBEST_FILE 2>&1 & tail -f $NBEST_FILE 2>&1 + +# 2. split nbest to different directorys +NBEST_DIR="${MODEL_DIR}/${SUBSET}_best${NBEST}" +python ./scripts/mmi/text_only/split_nbest.py \ +--nbest-file $NBEST_FILE \ +--target-dir $NBEST_DIR \ +--nbest $NBEST + +echo "copy objects ..." +cp $DATA_DIR/test.objects.mmap.20 $NBEST_DIR/test.objects.mmap +cp $DATA_DIR/test.objects_mask.mmap.20 $NBEST_DIR/test.objects_mask.mmap +cp $DATA_DIR/test.offsets.npy $NBEST_DIR/ +cp $DATA_DIR/test.sent_num.npy $NBEST_DIR/ + +# 3. score backwardly +codes_file="/userhome/yuxian/data/video/preprocessed_data/codes.30000.bpe" +dict_file="/userhome/yuxian/data/video/preprocessed_data/dict.txt" +backward_model="/userhome/shuhe/movie_plus/pre_feature/OpenViDial/mmi_small_object/checkpoint_best.pt" + +for sub_dir in $(ls ${NBEST_DIR}); do + sub_dir="${NBEST_DIR}/${sub_dir}" + if [ -f "$sub_dir" ]; then + continue + fi + echo "compute backward score of ${sub_dir}" + python ./mmi_fairseq/feature/scrtpts/combine_new_test.py \ + --src-dir $DATA_DIR \ + --nbest-file $sub_dir/src-tgt.src \ + --target-dir $sub_dir/test_object.src.txt + + subword-nmt apply-bpe -c ${codes_file} < $sub_dir/test_object.src.txt > $sub_dir/test_object_bpe.src + + fairseq-preprocess --source-lang src --srcdict $dict_file \ + --only-source \ + --testpref "${sub_dir}/test_object_bpe" \ + --workers 8 --destdir $NBEST_DIR + + mv $NBEST_DIR/dict.src.txt $NBEST_DIR/dict.txt + mv $NBEST_DIR/test.src-None.src.bin $NBEST_DIR/test.bin + mv $NBEST_DIR/test.src-None.src.idx $NBEST_DIR/test.idx + + # backward generation + out_file="${sub_dir}/gen.out" + python ./mmi_fairseq/feature/scrtpts/generate.py \ + --user-dir mmi_fairseq \ + --task mmi-video-dialogue \ + --img-type $TYPE \ + --data-dir $NBEST_DIR \ + --path $backward_model \ + --batch-size 32 \ + --gen-subset "test" \ + --num-workers 32 \ + --score-target-file $sub_dir/scores.backward + +done + +FEATURE_NBEST_DIR="${MODEL_DIR}/${SUBSET}_best_feature${NBEST}" +python scripts/mmi/text_only/split_nbest.py \ +--nbest-file $NBEST_FILE \ +--target-dir $FEATURE_NBEST_DIR \ +--nbest $NBEST + +echo "copy feature ..." +cp $DATA_DIR/test.features.mmap $FEATURE_NBEST_DIR/ +cp $DATA_DIR/test.offsets.npy $FEATURE_NBEST_DIR/ +cp $DATA_DIR/test.sent_num.npy $FEATURE_NBEST_DIR/ + +# 4. 
feature score backwardly +feature_backward_model="/userhome/shuhe/movie_plus/pre_feature/OpenViDial/mmi_small_feature/checkpoint_best.pt" + +for sub_dir in $(ls ${FEATURE_NBEST_DIR}); do + sub_dir="${FEATURE_NBEST_DIR}/${sub_dir}" + if [ -f "$sub_dir" ]; then + continue + fi + echo "compute backward score of ${sub_dir}" + python ./mmi_fairseq/feature/scrtpts/combine_new_test.py \ + --src-dir $DATA_DIR \ + --nbest-file $sub_dir/src-tgt.src \ + --target-dir $sub_dir/test_feature.src.txt + + subword-nmt apply-bpe -c ${codes_file} < $sub_dir/test_feature.src.txt > $sub_dir/test_feature_bpe.src + + fairseq-preprocess --source-lang src --srcdict $dict_file \ + --only-source \ + --testpref "${sub_dir}/test_feature_bpe" \ + --workers 8 --destdir $FEATURE_NBEST_DIR + + mv $FEATURE_NBEST_DIR/dict.src.txt $FEATURE_NBEST_DIR/dict.txt + mv $FEATURE_NBEST_DIR/test.src-None.src.bin $FEATURE_NBEST_DIR/test.bin + mv $FEATURE_NBEST_DIR/test.src-None.src.idx $FEATURE_NBEST_DIR/test.idx + + # backward generation + out_file="${sub_dir}/gen.out" + python ./mmi_fairseq/feature/scrtpts/generate.py \ + --user-dir mmi_fairseq \ + --task mmi-video-dialogue \ + --img-type "features" \ + --data-dir $FEATURE_NBEST_DIR \ + --path $feature_backward_model \ + --batch-size 32 \ + --gen-subset "test" \ + --score-target-file $sub_dir/scores.backward + +done + +# 5. text backward score +TEXT_NBEST_DIR="${MODEL_DIR}/${SUBSET}_best_text${NBEST}" +python scripts/mmi/text_only/split_nbest.py \ +--nbest-file $NBEST_FILE \ +--target-dir $TEXT_NBEST_DIR \ +--nbest $NBEST + +test_backward_model="/userhome/shuhe/movie_plus/pre_feature/OpenViDial/mmi_small_text/checkpoint_best.pt" + +for sub_dir in $(ls ${TEXT_NBEST_DIR}); do + sub_dir="${TEXT_NBEST_DIR}/${sub_dir}" + echo "compute text backward score of ${sub_dir}" + # apply bpe + for suffix in "src" "tgt"; do + fin="${sub_dir}/src-tgt.${suffix}" + fout="${sub_dir}/bpe.src-tgt.${suffix}" + echo "apply_bpe to ${fin} ..." + subword-nmt apply-bpe -c ${codes_file} < $fin > $fout + done + + # binarize + rm $sub_dir/dict* + fairseq-preprocess --source-lang src --target-lang tgt --srcdict $dict_file --tgtdict $dict_file \ + --testpref "${sub_dir}/bpe.src-tgt" \ + --workers 8 --destdir $sub_dir + + # backward generation + out_file="${sub_dir}/gen.out" + fairseq-generate \ + "${sub_dir}" \ + --score-reference \ + --batch-size 32 \ + --remove-bpe \ + --path $test_backward_model \ + --gen-subset "test" \ + > $out_file + + # extract backward score file + text_backward_score="${sub_dir}/scores.backward" + grep ^H "${out_file}" | cut -f 2 >"${text_backward_score}" + +done + + +# 6. weight average score.forward and score.backward for MMI generation +ALPHA1=0.4 +ALPHA2=0.1 +ALPHA3=0.1 +ALPHA4=0.4 +BIRECTION_OUTPUT="${NBEST_DIR}/bidirection${ALPHA}.out" +python scripts/mmi/text_only/combine_bidirectional_score.py \ + --nbest-dir=$TEXT_NBEST_DIR \ + --nbest-dir-feature=$FEATURE_NBEST_DIR \ + --nbest-dir-object=$NBEST_DIR \ + --type "object" \ + --output-file=$BIRECTION_OUTPUT \ + --alpha $ALPHA1 \ + --alpha-2 $ALPHA2 \ + --alpha-3 $ALPHA3 \ + --alpha-4 $ALPHA4 + +# 7. 
grep reference from output-file and score +SYS_OUTPUT=${BIRECTION_OUTPUT} +REFERENCE="${MODEL_DIR}/${SUBSET}_gen.ref" +grep ^T $NBEST_FILE | cut -f2- > $REFERENCE + +fairseq-score \ +-s $SYS_OUTPUT \ +-r $REFERENCE \ No newline at end of file diff --git a/mmi_fairseq/feature/scrtpts/train_image.sh b/mmi_fairseq/feature/scrtpts/train_image.sh index 5b5aeb2..1e1af8a 100644 --- a/mmi_fairseq/feature/scrtpts/train_image.sh +++ b/mmi_fairseq/feature/scrtpts/train_image.sh @@ -24,8 +24,7 @@ CUDA_VISIBLE_DEVICES=3 fairseq-train \ --encoder-embed-dim 512 \ --dropout $DROPOUT \ --optimizer adam \ - --max-tokens 100000 \ - --batch-size 150 \ + --batch-size 256 \ --adam-betas "(0.9,0.999)" \ --reset-optimizer \ --criterion base-loss \ diff --git a/scripts/mmi/combine_bidirectional_score.py b/scripts/mmi/combine_bidirectional_score.py index 4ad96ed..b92690c 100644 --- a/scripts/mmi/combine_bidirectional_score.py +++ b/scripts/mmi/combine_bidirectional_score.py @@ -16,6 +16,10 @@ from typing import List import argparse +from fairseq.data import dictionary +from fairseq.scoring import bleu +import multiprocessing +import time def find_sub_dirs(base_dir: str) -> List[str]: """find all rank-i subdirs under base_dir""" @@ -42,7 +46,7 @@ def load_scores(sub_dirs: List[str], split="forward") -> np.array: def combine_score_only_text(forward_score, backward_score, alpha=1): - return forward_score + alpha * backward_score + return (1-alpha) * forward_score + alpha * backward_score def combine_score_feature(forward_score, text_score, feature_score, alpha=0, alpha_2=0, alpha_3=0): return alpha*forward_score + alpha_2*text_score + alpha_3*feature_score @@ -97,5 +101,179 @@ def main(): print(f"Wrote final output to {args.output_file}") +def get_bleu(src, ref): + dict = dictionary.Dictionary() + + def readlines(fd): + for line in fd.readlines(): + yield line + + with open(ref) as fdref: + scorer = bleu.Scorer( + bleu.BleuConfig( + pad=dict.pad(), + eos=dict.eos(), + unk=dict.unk(), + ) + ) + for sys_tok, ref_tok in zip(readlines(src), readlines(fdref)): + sys_tok = dict.encode_line(sys_tok) + ref_tok = dict.encode_line(ref_tok) + scorer.add(ref_tok, sys_tok) + return scorer.score() + +def find_text(alpha, forward_scores, backward_scores, sub_dirs, queue): + bidirection_scores = combine_score_only_text(forward_scores, backward_scores, alpha) + best_idx = np.argmax(bidirection_scores, axis=0) + + pred_files = [open(os.path.join(sub_dir, "src-tgt.src")) for sub_dir in sub_dirs] + + output_file = os.path.join("/userhome/shuhe/movie_plus/pre_feature/OpenViDial/text_ori_result/test_best10/tmp", str(alpha)+".out") + + with open(output_file, "w") as fout: + for sent_idx, lines in enumerate(zip(*pred_files)): + fout.write(lines[best_idx[sent_idx]]) + fout.close() + + ref = "/userhome/shuhe/movie_plus/pre_feature/OpenViDial/text_ori_result/test_gen.ref" + + queue.put({ + 'bleu': get_bleu(output_file, ref), + 'alpha': str(alpha) + }) + + os.remove(output_file) + + +def find_feature(forward_scores, backward_scores, backward_feature_scores, alpha, alpha_2, alpha_3, sub_dirs, queue): + bidirection_scores = combine_score_feature(forward_scores, backward_scores, backward_feature_scores, alpha, alpha_2, alpha_3) + best_idx = np.argmax(bidirection_scores, axis=0) + + pred_files = [open(os.path.join(sub_dir, "src-tgt.src")) for sub_dir in sub_dirs] + + output_file = os.path.join("/userhome/shuhe/movie_plus/pre_feature/OpenViDial/feature_result/test_best10/tmp", str(alpha)+"_"+str(alpha_2)+"_"+str(alpha_3)+".out") + + with 
open(output_file, "w") as fout: + for sent_idx, lines in enumerate(zip(*pred_files)): + fout.write(lines[best_idx[sent_idx]]) + fout.close() + + ref = "/userhome/shuhe/movie_plus/pre_feature/OpenViDial/feature_result/test_gen.ref" + + queue.put({ + 'bleu': get_bleu(output_file, ref), + 'alpha': str(alpha)+"_"+str(alpha_2)+"_"+str(alpha_3), + }) + + os.remove(output_file) + +def find_object(forward_scores, backward_scores, backward_feature_scores, backward_object_scores, alpha, alpha_2, alpha_3, alpha_4, sub_dirs, queue): + bidirection_scores = combine_score_object(forward_scores, backward_scores, backward_feature_scores, backward_object_scores, alpha, alpha_2, alpha_3, alpha_4) + best_idx = np.argmax(bidirection_scores, axis=0) + + pred_files = [open(os.path.join(sub_dir, "src-tgt.src")) for sub_dir in sub_dirs] + + output_file = os.path.join("/userhome/shuhe/movie_plus/pre_feature/OpenViDial/object_result_with_feature/test_best10/tmp", str(alpha)+"_"+str(alpha_2)+"_"+str(alpha_3)+"_"+str(alpha_4)+".out") + + with open(output_file, "w") as fout: + for sent_idx, lines in enumerate(zip(*pred_files)): + fout.write(lines[best_idx[sent_idx]]) + fout.close() + + ref = "/userhome/shuhe/movie_plus/pre_feature/OpenViDial/object_result_with_feature/test_gen.ref" + + queue.put({ + 'bleu': get_bleu(output_file, ref), + 'alpha': str(alpha)+"_"+str(alpha_2)+"_"+str(alpha_3)+"_"+str(alpha_4) + }) + + os.remove(output_file) + +def find_(type): + pool = multiprocessing.Pool(32) + + # 创建队列 + queue = multiprocessing.Manager().Queue() + cnt = 0 + + base_dir = "/userhome/shuhe/movie_plus/pre_feature/OpenViDial/object_result_with_feature/test_best_text10" + sub_dirs = find_sub_dirs(base_dir) + + forward_scores = load_scores(sub_dirs, split="forward") + backward_scores = load_scores(sub_dirs, split="backward") + + if (type == 'text'): + alpha = 0 + while (alpha <= 0.99): + pool.apply_async(find_text, args=(alpha, forward_scores, backward_scores, sub_dirs, queue,)) + cnt += 1 + alpha += 0.01 + + elif (type == 'feature'): + nbest_dir_feature = "/userhome/shuhe/movie_plus/pre_feature/OpenViDial/feature_result/test_best10" + feature_dir = find_sub_dirs(nbest_dir_feature) + backward_feature_scores = load_scores(feature_dir, split="backward") + + alpha = 0.01 + while (alpha <= 1): + alpha_2 = 0 + while (alpha_2 <= 1): + alpha_3 = 0 + while (alpha_3 <= 1): + if (alpha+alpha_2+alpha_3 != 1): + alpha_3 += 0.01 + continue + pool.apply_async(find_feature, args=(forward_scores, backward_scores, backward_feature_scores, alpha, alpha_2, alpha_3, sub_dirs, queue,)) + cnt += 1 + alpha_3 += 0.01 + alpha_2 += 0.01 + alpha += 0.01 + + elif (type == 'object'): + nbest_dir_feature = "/userhome/shuhe/movie_plus/pre_feature/OpenViDial/object_result_with_feature/test_best_feature10" + feature_dir = find_sub_dirs(nbest_dir_feature) + nbest_dir_object = "/userhome/shuhe/movie_plus/pre_feature/OpenViDial/object_result_with_feature/test_best10" + object_dir = find_sub_dirs(nbest_dir_object) + backward_feature_scores = load_scores(feature_dir, split="backward") + backward_object_scores = load_scores(object_dir, split="backward") + + alpha = 0.01 + while (alpha <= 1): + alpha_2 = 0 + while (alpha_2 <= 1): + alpha_3 = 0 + while (alpha_3 <= 1): + alpha_4 = 0 + while (alpha_4 <= 1): + if (alpha+alpha_2+alpha_3+alpha_4 != 1): + alpha_4 += 0.01 + continue + pool.apply_async(find_object, args=(forward_scores, backward_scores, backward_feature_scores, backward_object_scores, alpha, alpha_2, alpha_3, alpha_4, sub_dirs, queue,)) + cnt += 1 + alpha_4 
+= 0.01 + alpha_3 += 0.01 + alpha_2 += 0.01 + alpha += 0.01 + + pool.close() + count = 0 + max_bleu = 0 + final_alpha = None + while True: + value = queue.get() + count += 1 + if (max_bleu < value['bleu']): + final_alpha = value['alpha'] + # 格式化输出时两个%输出一个%,不换行,每次定位到行首,实现覆盖 + print("\r now : %.2f %%" % (count * 100 / cnt), end="") + if (count == cnt): + print(max_bleu) + print(final_alpha) + break + pool.join() + end = time.time() + print(end) + if __name__ == '__main__': - main() + #main() + find_("object") diff --git a/video_dialogue_model/data/text_and_object_dataset.py b/video_dialogue_model/data/text_and_object_dataset.py index f6c452f..1cdcbbd 100644 --- a/video_dialogue_model/data/text_and_object_dataset.py +++ b/video_dialogue_model/data/text_and_object_dataset.py @@ -14,6 +14,7 @@ import torch from fairseq.data.fairseq_dataset import FairseqDataset from video_dialogue_model.data.object_dataset import ObjectDataset +from video_dialogue_model.data.feature_dataset import FeatureDataset from fairseq.data import data_utils @@ -21,8 +22,9 @@ class TextObjectDataset(FairseqDataset): """ A combine of text dataset and object dataset """ - def __init__(self, image_dataset: ObjectDataset, text_dataset, vocab_dict, span_idxs, shuffle=False): + def __init__(self, image_dataset: ObjectDataset, feature_dataset: FeatureDataset, text_dataset, vocab_dict, span_idxs, shuffle=False): self.img_dataset = image_dataset + self.feature_dataset = feature_dataset self.text_dataset = text_dataset self.vocab_dict = vocab_dict self.span_idxs = span_idxs @@ -39,10 +41,13 @@ def __getitem__(self, index): source_texts = np.concatenate([self.text_dataset[idx] for idx in offsets[:-1]]) # L target = self.text_dataset[offsets[-1]] + source_imgs = np.stack([self.feature_dataset[idx] for idx in offsets]) # n * dim + return { 'id': index, 'objects': torch.FloatTensor(objects), 'objects_mask': torch.FloatTensor(objects_mask), + 'source_imgs': torch.FloatTensor(source_imgs), 'source_texts': torch.LongTensor(source_texts), 'target': torch.LongTensor(target) } @@ -87,6 +92,7 @@ def collater(self, samples): indices = [] source_objects = [] objects_mask = [] + source_imgs = [] source_texts = [] source_lengths = [] targets = [] @@ -96,6 +102,8 @@ def collater(self, samples): for sample in samples: index = sample['id'] indices.append(index) + + source_imgs.append(sample['source_imgs']) sent_num, max_object, rcnn_dim = sample["objects"].shape source_objects.append(sample['objects']) # [sent_num, max_obj, dim] objects_mask.append(sample['objects_mask']) # [sent_num, max_obj] @@ -110,6 +118,10 @@ def collater(self, samples): indices = torch.tensor(indices, dtype=torch.long) max_sent = max(x.size(0) for x in source_objects) + pad_imgs = torch.zeros([num_sentences, max_sent, self.feature_dataset.dim], dtype=torch.float) + for idx, imgs in enumerate(source_imgs): + pad_imgs[idx][: imgs.size(0)] = imgs + pad_objects = torch.zeros([num_sentences, max_sent, self.max_obj, self.img_dataset.dim], dtype=torch.float) pad_mask_objs = torch.zeros([num_sentences, max_sent, self.max_obj], dtype=torch.bool) for idx, objs in enumerate(source_objects): @@ -135,6 +147,7 @@ def collater(self, samples): 'id': indices, 'net_input': { 'src_tokens': source_texts_batch, + 'src_imgs': pad_imgs, 'objs': pad_objects, 'objs_mask': pad_mask_objs, 'src_lengths': source_lengths, diff --git a/video_dialogue_model/model/object_transformer.py b/video_dialogue_model/model/object_transformer.py index f6054c5..f0ef162 100644 --- 
a/video_dialogue_model/model/object_transformer.py +++ b/video_dialogue_model/model/object_transformer.py @@ -51,7 +51,7 @@ class ObjTransformerModel(TransformerModel): def __init__(self, args, encoder, decoder): super().__init__(args, encoder, decoder) - def forward(self, src_tokens, objs, objs_mask, src_lengths, prev_output_tokens, **kwargs): + def forward(self, src_tokens, src_imgs, objs, objs_mask, src_lengths, prev_output_tokens, **kwargs): """ Run the forward pass for an encoder-decoder model. @@ -78,7 +78,7 @@ def forward(self, src_tokens, objs, objs_mask, src_lengths, prev_output_tokens, - the decoder's output of shape `(batch, tgt_len, vocab)` - a dictionary with any model-specific outputs """ - encoder_out = self.encoder(src_tokens, objs, objs_mask, src_lengths=src_lengths, **kwargs) + encoder_out = self.encoder(src_tokens, src_imgs=src_imgs, objs=objs, objs_mask=objs_mask, src_lengths=src_lengths, **kwargs) decoder_out = self.decoder(prev_output_tokens, encoder_out=encoder_out, **kwargs) return decoder_out @@ -112,9 +112,10 @@ def __init__(self, args, dictionary, embed_tokens): self.token_type_embedding = nn.Embedding(2, args.encoder_embed_dim) # image/token self.image_proj = nn.Linear(2048, args.encoder_embed_dim) + self.fuse_img_token = nn.Linear(embed_tokens.embedding_dim + 1000, embed_tokens.embedding_dim) def forward_embedding( - self, src_tokens, objs, token_embedding: Optional[torch.Tensor] = None + self, src_tokens, objs, src_imgs=None, token_embedding: Optional[torch.Tensor] = None ): bsz, token_length = src_tokens.size() _, sent_num, max_obj, dim = objs.size() @@ -123,6 +124,13 @@ def forward_embedding( if token_embedding is None: token_embedding = self.embed_tokens(src_tokens) x = embed = self.embed_scale * token_embedding + + token_img_idxs = torch.cumsum((src_tokens == self.dictionary.eos_index).long(), dim=1).unsqueeze(-1).expand([-1, -1, 1000]) + # [B, T, C'] f[b][t][c] = src_imgs[b][token_img_idxs[b][t][c]][c] + token_img_features = torch.gather(src_imgs, 1, token_img_idxs) + # [B, T, C] + x = self.fuse_img_token(torch.cat([x, token_img_features], dim=-1)) + if self.embed_positions is not None: x = embed + self.embed_positions(src_tokens) x += self.token_type_embedding(torch.ones_like(src_tokens)) @@ -151,6 +159,7 @@ def forward_embedding( def forward( self, src_tokens, + src_imgs, objs, objs_mask, src_lengths, @@ -186,7 +195,7 @@ def forward( Only populated if *return_all_hiddens* is True. 
""" bsz = src_tokens.size(0) - x, encoder_embedding = self.forward_embedding(src_tokens, objs, token_embeddings) + x, encoder_embedding = self.forward_embedding(src_tokens, objs, src_imgs, token_embeddings) # B x T x C -> T x B x C x = x.transpose(0, 1) diff --git a/video_dialogue_model/tasks/video_dialogue_task.py b/video_dialogue_model/tasks/video_dialogue_task.py index a8d6261..b9513ae 100644 --- a/video_dialogue_model/tasks/video_dialogue_task.py +++ b/video_dialogue_model/tasks/video_dialogue_task.py @@ -51,6 +51,7 @@ def load_text_image_dataset(self, split, **kwargs): ) def load_text_object_dataset(self, split, **kwargs): + features_dataset = FeatureDataset(self.args.data_dir, split) objects_dataset = ObjectDataset(self.args.data_dir, split, max_obj=self.args.max_obj) span_idxs = self.item2span_idxs(sent_num=objects_dataset.sent_num, max_src_sent=self.args.max_src_sent) @@ -60,6 +61,7 @@ def load_text_object_dataset(self, split, **kwargs): self.datasets[split] = TextObjectDataset(text_dataset=text_dataset, image_dataset=objects_dataset, + feature_dataset=features_dataset, vocab_dict=self.vocab_dict, span_idxs=span_idxs, shuffle=True if split == "train" else False)