From 2d4ecf4f2ee9dd90ba01ff5d86e8e3913b704723 Mon Sep 17 00:00:00 2001
From: Amanda Doucette
Date: Thu, 21 Mar 2019 15:02:09 -0400
Subject: [PATCH] make AlignCNN work on CPU

---
 src/python/entity_align/model/AlignCNN.py   | 271 ++++++++++++++------
 src/python/entity_align/train/TrainModel.py |  90 +++++--
 2 files changed, 259 insertions(+), 102 deletions(-)

diff --git a/src/python/entity_align/model/AlignCNN.py b/src/python/entity_align/model/AlignCNN.py
index 4981670..509d19e 100644
--- a/src/python/entity_align/model/AlignCNN.py
+++ b/src/python/entity_align/model/AlignCNN.py
@@ -18,24 +18,38 @@ import torch.nn as nn
 from torch.nn import BCEWithLogitsLoss
 from torch.autograd import Variable
+import os
 
-#This model corresponds to AlignCNN in our paper
-#First, strings converted to list of character embeddings
-#Then, lstm runs over character embeddings
-#lstm embeddings at last time stamp matrix multiplied
-#Finally, cnn detects features in that matrix and outputs similarity score
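+# Device shim: move a tensor to the GPU only when CUDA_VISIBLE_DEVICES is
+# set; otherwise leave it on the CPU. (Assumes the variable is only set when
+# a usable GPU exists -- an empty value would still route through .cuda().)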
+def cuda(x):
+    return x.cuda() if "CUDA_VISIBLE_DEVICES" in os.environ else x
+
+
+# This model corresponds to AlignCNN in our paper
+# First, strings converted to list of character embeddings
+# Then, lstm runs over character embeddings
+# lstm embeddings at last time stamp matrix multiplied
+# Finally, cnn detects features in that matrix and outputs similarity score
 class AlignCNN(torch.nn.Module):
-    def __init__(self,config,vocab):
+    def __init__(self, config, vocab):
         super(AlignCNN, self).__init__()
+        self.config = config
         self.vocab = vocab
 
         # Character embeddings
-        self.embedding = nn.Embedding(vocab.size+1, config.embedding_dim, padding_idx=0)
+        self.embedding = nn.Embedding(
+            vocab.size + 1, config.embedding_dim, padding_idx=0
+        )
 
         # Sequence encoder of strings (LSTM)
-        self.rnn = nn.LSTM(config.embedding_dim, config.rnn_hidden_size, 1, bidirectional = config.bidirectional, batch_first = True)
+        self.rnn = nn.LSTM(
+            config.embedding_dim,
+            config.rnn_hidden_size,
+            1,
+            bidirectional=config.bidirectional,
+            batch_first=True,
+        )
 
         if self.config.bidirectional:
             self.num_directions = 2
@@ -43,14 +57,41 @@ def __init__(self,config,vocab):
             self.num_directions = 1
 
         # Variables for initial states of LSTM (these are different for train and dev because dev might be of different batch sizes)
-        self.h0 = Variable(torch.zeros(self.num_directions, config.batch_size, config.rnn_hidden_size).cuda(), requires_grad=False)
-        self.c0 = Variable(torch.zeros(self.num_directions, config.batch_size, config.rnn_hidden_size).cuda(), requires_grad=False)
-        self.h0_dev = Variable(torch.zeros(self.num_directions, config.dev_batch_size, config.rnn_hidden_size).cuda(), requires_grad=False)
-        self.c0_dev = Variable(torch.zeros(self.num_directions, config.dev_batch_size, config.rnn_hidden_size).cuda(), requires_grad=False)
-
+        self.h0 = Variable(
+            cuda(
+                torch.zeros(
+                    self.num_directions, config.batch_size, config.rnn_hidden_size
+                )
+            ),
+            requires_grad=False,
+        )
+        self.c0 = Variable(
+            cuda(
+                torch.zeros(
+                    self.num_directions, config.batch_size, config.rnn_hidden_size
+                )
+            ),
+            requires_grad=False,
+        )
+        self.h0_dev = Variable(
+            cuda(
+                torch.zeros(
+                    self.num_directions, config.dev_batch_size, config.rnn_hidden_size
+                )
+            ),
+            requires_grad=False,
+        )
+        self.c0_dev = Variable(
+            cuda(
+                torch.zeros(
+                    self.num_directions, config.dev_batch_size, config.rnn_hidden_size
+                )
+            ),
+            requires_grad=False,
+        )
 
         # Define the CNN used to score the alignment matrix
-        pool_output_height = int(np.floor(config.max_string_len/2.0))
+        pool_output_height = int(np.floor(config.max_string_len / 2.0))
 
         # Select # of layers / increasing or decreasing filter size based on config
         if config.num_layers == 4:
@@ -58,54 +99,104 @@ def __init__(self,config,vocab):
             self.relu = nn.ReLU()
             if config.increasing == True:
                 convlyr = nn.Conv2d(1, config.filter_count, 3, padding=1, stride=1)
-                convlyr2 = nn.Conv2d(config.filter_count, config.filter_count2, 5, padding=2, stride=1)
-                convlyr3 = nn.Conv2d(config.filter_count2, config.filter_count3, 5, padding=2, stride=1)
-                convlyr4 = nn.Conv2d(config.filter_count3, config.filter_count4, 7, padding=3, stride=1)
+                convlyr2 = nn.Conv2d(
+                    config.filter_count, config.filter_count2, 5, padding=2, stride=1
+                )
+                convlyr3 = nn.Conv2d(
+                    config.filter_count2, config.filter_count3, 5, padding=2, stride=1
+                )
+                convlyr4 = nn.Conv2d(
+                    config.filter_count3, config.filter_count4, 7, padding=3, stride=1
+                )
             else:
                 convlyr = nn.Conv2d(1, config.filter_count, 7, padding=3, stride=1)
-                convlyr2 = nn.Conv2d(config.filter_count, config.filter_count2, 5, padding=2, stride=1)
-                convlyr3 = nn.Conv2d(config.filter_count2, config.filter_count3, 5, padding=2, stride=1)
-                convlyr4 = nn.Conv2d(config.filter_count3, config.filter_count4, 3, padding=1, stride=1)
-            self.add_module("cnn2",convlyr2)
-            self.add_module("cnn3",convlyr3)
-            self.add_module("cnn4",convlyr4)
-            self.align_weights = nn.Parameter(torch.randn(config.filter_count3, pool_output_height, pool_output_height).cuda(),requires_grad=True)
+                convlyr2 = nn.Conv2d(
+                    config.filter_count, config.filter_count2, 5, padding=2, stride=1
+                )
+                convlyr3 = nn.Conv2d(
+                    config.filter_count2, config.filter_count3, 5, padding=2, stride=1
+                )
+                convlyr4 = nn.Conv2d(
+                    config.filter_count3, config.filter_count4, 3, padding=1, stride=1
+                )
+            self.add_module("cnn2", convlyr2)
+            self.add_module("cnn3", convlyr3)
+            self.add_module("cnn4", convlyr4)
+            self.align_weights = nn.Parameter(
+                cuda(
+                    torch.randn(
+                        config.filter_count3, pool_output_height, pool_output_height
+                    )
+                ),
+                requires_grad=True,
+            )
         elif config.num_layers == 3:
             self.num_layers = 3
             self.relu = nn.ReLU()
             if config.increasing == True:
                 convlyr = nn.Conv2d(1, config.filter_count, 5, padding=2, stride=1)
-                convlyr2 = nn.Conv2d(config.filter_count, config.filter_count2, 5, padding=2, stride=1)
-                convlyr3 = nn.Conv2d(config.filter_count2, config.filter_count3, 7, padding=3, stride=1)
+                convlyr2 = nn.Conv2d(
+                    config.filter_count, config.filter_count2, 5, padding=2, stride=1
+                )
+                convlyr3 = nn.Conv2d(
+                    config.filter_count2, config.filter_count3, 7, padding=3, stride=1
+                )
             else:
                 convlyr = nn.Conv2d(1, config.filter_count, 7, padding=3, stride=1)
-                convlyr2 = nn.Conv2d(config.filter_count, config.filter_count2, 5, padding=2, stride=1)
-                convlyr3 = nn.Conv2d(config.filter_count2, config.filter_count3, 5, padding=2, stride=1)
-            self.add_module("cnn2",convlyr2)
-            self.add_module("cnn3",convlyr3)
-            self.align_weights = nn.Parameter(torch.randn(config.filter_count3, pool_output_height, pool_output_height).cuda(),requires_grad=True)
+                convlyr2 = nn.Conv2d(
+                    config.filter_count, config.filter_count2, 5, padding=2, stride=1
+                )
+                convlyr3 = nn.Conv2d(
+                    config.filter_count2, config.filter_count3, 5, padding=2, stride=1
+                )
+            self.add_module("cnn2", convlyr2)
+            self.add_module("cnn3", convlyr3)
+            self.align_weights = nn.Parameter(
+                cuda(
+                    torch.randn(
+                        config.filter_count3, pool_output_height, pool_output_height
+                    )
+                ),
+                requires_grad=True,
+            )
         elif config.num_layers == 2:
             self.num_layers = 2
             self.relu = nn.ReLU()
             convlyr = nn.Conv2d(1, config.filter_count, 5, padding=2, stride=1)
-            convlyr2 = nn.Conv2d(config.filter_count, config.filter_count2, 3, padding=1, stride=1)
-            self.add_module("cnn2",convlyr2)
-            self.align_weights = nn.Parameter(torch.randn(config.filter_count2, pool_output_height, pool_output_height).cuda(),requires_grad=True)
+            convlyr2 = nn.Conv2d(
+                config.filter_count, config.filter_count2, 3, padding=1, stride=1
+            )
+            self.add_module("cnn2", convlyr2)
+            self.align_weights = nn.Parameter(
+                cuda(
+                    torch.randn(
+                        config.filter_count2, pool_output_height, pool_output_height
+                    )
+                ),
+                requires_grad=True,
+            )
         else:
             self.num_layers = 1
             convlyr = nn.Conv2d(1, config.filter_count, 7, padding=3, stride=1)
-            self.align_weights = nn.Parameter(torch.randn(config.filter_count, pool_output_height, pool_output_height).cuda(),requires_grad=True)
-        self.add_module("cnn",convlyr)
+            self.align_weights = nn.Parameter(
+                cuda(
+                    torch.randn(
+                        config.filter_count, pool_output_height, pool_output_height
+                    )
+                ),
+                requires_grad=True,
+            )
+        self.add_module("cnn", convlyr)
 
         # Define pooling
         self.pool = nn.MaxPool2d((2, 2), stride=2)
 
         # Vector of ones (used for loss)
-        self.ones = Variable(torch.ones(config.batch_size, 1).cuda())
+        self.ones = Variable(cuda(torch.ones(config.batch_size, 1)))
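+        # Target for BCEWithLogitsLoss in compute_loss: driving
+        # sigmoid(pos_score - neg_score) toward 1 yields the BPR ranking loss.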
 
         # Loss
         self.loss = BCEWithLogitsLoss()
 
-    def compute_loss(self,source,pos,neg, source_len,pos_len,neg_len):
+    def compute_loss(self, source, pos, neg, source_len, pos_len, neg_len):
         """ Compute the loss (BPR) for a batch of examples
         :param source: Entity mentions
         :param pos: True aliases of the Mentions
@@ -115,13 +206,14 @@ def compute_loss(self,source,pos,neg, source_len,pos_len,neg_len):
         :param neg_len: lengths of negatives
         :return:
         """
-        source_embed, src_mask = self.embed(source,source_len)
-        pos_embed, pos_mask = self.embed(pos,pos_len)
-        neg_embed, neg_mask = self.embed(neg,neg_len)
+        source_embed, src_mask = self.embed(source, source_len)
+        pos_embed, pos_mask = self.embed(pos, pos_len)
+        neg_embed, neg_mask = self.embed(neg, neg_len)
         loss = self.loss(
-            self.score_pair_train(source_embed , pos_embed, src_mask, pos_mask)
-            - self.score_pair_train(source_embed , neg_embed, src_mask, neg_mask),
-            self.ones)
+            self.score_pair_train(source_embed, pos_embed, src_mask, pos_mask)
+            - self.score_pair_train(source_embed, neg_embed, src_mask, neg_mask),
+            self.ones,
+        )
         return loss
 
@@ -136,9 +228,9 @@ def print_mm(self, src, tgt, src_len, tgt_len):
         """
         source_embed, source_mask = self.embed_dev(src, src_len)
         target_embed, target_mask = self.embed_dev(tgt, tgt_len)
-        return torch.bmm(source_embed,torch.transpose(target_embed, 2, 1))
+        return torch.bmm(source_embed, torch.transpose(target_embed, 2, 1))
 
-    def score_pair_train(self,src,tgt, src_mask, tgt_mask):
+    def score_pair_train(self, src, tgt, src_mask, tgt_mask):
         """
         :param src: Batchsize by Max_String_Length
         :param tgt: Batchsize by Max_String_Length
@@ -146,12 +238,12 @@ def score_pair_train(self,src,tgt, src_mask, tgt_mask):
         :param tgt_mask: Batchsize by Max_String_Length, binary mask corresponding to length of underlying str
         :return: Batchsize by 1
         """
-        multpld = torch.bmm(src,torch.transpose(tgt, 2, 1))
+        multpld = torch.bmm(src, torch.transpose(tgt, 2, 1))
         src_mask = src_mask.unsqueeze(dim=2)
         tgt_mask = tgt_mask.unsqueeze(dim=1)
         mat_mask = torch.bmm(src_mask, tgt_mask)
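+        # mat_mask is the batched outer product of the two length masks, so it
+        # zeroes every alignment score that involves a padding position.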
         multpld = torch.mul(multpld, mat_mask)
-        convd = self.cnn(multpld.unsqueeze(1)) #need num channels
+        convd = self.cnn(multpld.unsqueeze(1))  # need num channels
         if self.num_layers > 1:
             convd = self.relu(convd)
             convd = self.cnn2(convd)
@@ -162,34 +254,38 @@ def score_pair_train(self,src,tgt, src_mask, tgt_mask):
             convd = self.relu(convd)
             convd = self.cnn4(convd)
         convd_after_pooling = self.pool(convd)
-        #print(convd_after_pooling.size())
-        #print(self.align_weights.size())
-        output = torch.sum(self.align_weights.expand_as(convd_after_pooling) * convd_after_pooling, dim=3,keepdim=True)
-        output = torch.sum(output, dim=2,keepdim=True)
+        # print(convd_after_pooling.size())
+        # print(self.align_weights.size())
+        output = torch.sum(
+            self.align_weights.expand_as(convd_after_pooling) * convd_after_pooling,
+            dim=3,
+            keepdim=True,
+        )
+        output = torch.sum(output, dim=2, keepdim=True)
         output = torch.squeeze(output, dim=3)
         output = torch.squeeze(output, dim=2)
-        output = torch.sum(output, dim=1,keepdim=True)
+        output = torch.sum(output, dim=1, keepdim=True)
         return output
 
-    def embed(self,string_mat, string_len):
+    def embed(self, string_mat, string_len):
         """
         :param string_mat: Batch_size by max_string_len
         :return: batch_size by embedding dim
         """
-        string_mat = torch.from_numpy(string_mat).cuda()
-        mask = Variable(torch.cuda.ByteTensor((string_mat > 0)).float())
+        string_mat = cuda(torch.from_numpy(string_mat))
+        mask = Variable((string_mat > 0).float())
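+        # (string_mat > 0) already yields a ByteTensor on string_mat's device,
+        # so no device-specific ByteTensor constructor is needed here.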
         embed_token = self.embedding(Variable(string_mat))
         final_emb, final_hn_cn = self.rnn(embed_token, (self.h0, self.c0))
         return final_emb, mask
 
-    def embed_dev(self, string_mat, string_len, print_embed=False, batch_size = None):
+    def embed_dev(self, string_mat, string_len, print_embed=False, batch_size=None):
         """
         :param string_mat: Batch_size by max_string_len
         :return: batch_size by embedding dim
         """
-        string_mat = torch.from_numpy(string_mat).cuda()
-        mask = Variable(torch.cuda.ByteTensor((string_mat > 0)).float())
+        string_mat = cuda(torch.from_numpy(string_mat))
+        mask = Variable((string_mat > 0).float())
         if not batch_size:
             this_batch_size = self.config.dev_batch_size
             this_h0 = self.h0_dev
@@ -197,28 +293,51 @@ def embed_dev(self, string_mat, string_len, print_embed=False, batch_size = None
         else:
             print("irregular batch size {}".format(batch_size))
             this_batch_size = batch_size
-            this_h0 = Variable(torch.zeros(self.num_directions, batch_size,
-                                           self.config.rnn_hidden_size).cuda(),
-                               requires_grad=False)
-            this_c0 = Variable(torch.zeros(self.num_directions, batch_size,
-                                           self.config.rnn_hidden_size).cuda(),
-                               requires_grad=False)
+            this_h0 = Variable(
+                cuda(
+                    torch.zeros(
+                        self.num_directions, batch_size, self.config.rnn_hidden_size
+                    )
+                ),
+                requires_grad=False,
+            )
+            this_c0 = Variable(
+                cuda(
+                    torch.zeros(
+                        self.num_directions, batch_size, self.config.rnn_hidden_size
+                    )
+                ),
+                requires_grad=False,
+            )
 
         embed_token = self.embedding(Variable(string_mat))
-        if print_embed==True:
+        if print_embed == True:
             return embed_token
         final_emb, final_hn_cn = self.rnn(embed_token, (this_h0, this_c0))
         return final_emb, mask
 
-    def score_dev_test_batch(self,batch_queries,
-                             batch_query_lengths,
-                             batch_targets,
-                             batch_target_lengths,
-                             batch_size):
+    def score_dev_test_batch(
+        self,
+        batch_queries,
+        batch_query_lengths,
+        batch_targets,
+        batch_target_lengths,
+        batch_size,
+    ):
         if batch_size == self.config.dev_batch_size:
-            source_embed,source_mask = self.embed_dev(batch_queries, batch_query_lengths)
-            target_embed,target_mask = self.embed_dev(batch_targets, batch_target_lengths)
+            source_embed, source_mask = self.embed_dev(
+                batch_queries, batch_query_lengths
+            )
+            target_embed, target_mask = self.embed_dev(
+                batch_targets, batch_target_lengths
+            )
         else:
-            source_embed,source_mask = self.embed_dev(batch_queries, batch_query_lengths,batch_size=batch_size)
-            target_embed,target_mask = self.embed_dev(batch_targets, batch_target_lengths,batch_size=batch_size)
-        scores = self.score_pair_train(source_embed, target_embed,source_mask,target_mask)
+            source_embed, source_mask = self.embed_dev(
+                batch_queries, batch_query_lengths, batch_size=batch_size
+            )
+            target_embed, target_mask = self.embed_dev(
+                batch_targets, batch_target_lengths, batch_size=batch_size
+            )
+        scores = self.score_pair_train(
+            source_embed, target_embed, source_mask, target_mask
+        )
         return scores

diff --git a/src/python/entity_align/train/TrainModel.py b/src/python/entity_align/train/TrainModel.py
index bc5cbe2..3ae58a1 100644
--- a/src/python/entity_align/train/TrainModel.py
+++ b/src/python/entity_align/train/TrainModel.py
@@ -34,19 +34,21 @@
 from entity_align.utils.DevTestBatcher import DevBatcher
 from entity_align.utils.Util import save_dict_to_json
 
-def train_model(config,dataset_name,model_name):
+
+def train_model(config, dataset_name, model_name):
     """ Train based on the given config, model / dataset
-    
+
     :param config: config object
     :param dataset_name: name of dataset
     :param model_name: name of model
-    :return: 
+    :return:
     """
     config.dataset_name = dataset_name
     now = datetime.datetime.now()
     config.model_name = model_name
-    ts = "{:04d}-{:02d}-{:02d}-{:02d}-{:02d}-{:02d}".format(now.year, now.month, now.day, now.hour, now.minute,
-                                                            now.second)
+    ts = "{:04d}-{:02d}-{:02d}-{:02d}-{:02d}-{:02d}".format(
+        now.year, now.month, now.day, now.hour, now.minute, now.second
+    )
     config.experiment_out_dir = os.path.join("exp_out", dataset_name, model_name, ts)
 
     # Load vocab
@@ -59,14 +61,16 @@ def train_model(config,dataset_name,model_name):
     # save the config to outdir
     config.save_config(output_dir)
     # save the vocab to out dir
-    copyfile(config.vocab_file, os.path.join(output_dir, 'vocab.tsv'))
+    copyfile(config.vocab_file, os.path.join(output_dir, "vocab.tsv"))
     # save the source code.
-    copytree(os.path.join(os.environ['SED_ROOT'], 'src'), os.path.join(output_dir, 'src'))
+    copytree(
+        os.path.join(os.environ["SED_ROOT"], "src"), os.path.join(output_dir, "src")
+    )
 
     torch.manual_seed(config.random_seed)
 
     # Set up batcher
-    batcher = Batcher(config, vocab, 'train')
+    batcher = Batcher(config, vocab, "train")
 
     model = None
     # Set up Model
@@ -82,9 +86,12 @@ def train_model(config,dataset_name,model_name):
         print("Unknown model")
         sys.exit(1)
 
-    model.cuda()
-    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=config.learning_rate,
-                           weight_decay=config.l2penalty)
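+    # Mirror of the cuda() shim in AlignCNN.py: parameters move to the GPU
+    # only when CUDA_VISIBLE_DEVICES is set, matching where the model's own
+    # tensors are created.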
+    model.cuda() if "CUDA_VISIBLE_DEVICES" in os.environ else model.cpu()
+    optimizer = optim.Adam(
+        filter(lambda p: p.requires_grad, model.parameters()),
+        lr=config.learning_rate,
+        weight_decay=config.l2penalty,
+    )
 
     # Stats
     best_map = 0
@@ -108,45 +115,76 @@ def train_model(config,dataset_name,model_name):
             # print("p-n:{}".format(model.print_loss(source,pos,neg,source_len,pos_len,neg_len)))
             this_loss = loss.cpu().data.numpy()[0]
             sum_loss += this_loss
-            print("Processed {} batches, Loss of batch {}: {}. Average loss: {}".format(counter, counter, this_loss,
-                                                                                        sum_loss / (counter / 100)))
+            print(
+                "Processed {} batches, Loss of batch {}: {}. Average loss: {}".format(
+                    counter, counter, this_loss, sum_loss / (counter / 100)
+                )
+            )
             sys.stdout.flush()
 
         if counter % config.eval_every == 0:
             dev_batcher = DevBatcher(config, vocab)
-            prediction_filename = os.path.join(output_dir, 'dev.predictions.{}.tsv').format(counter)
+            prediction_filename = os.path.join(
+                output_dir, "dev.predictions.{}.tsv"
+            ).format(counter)
             write_predictions(model, dev_batcher, prediction_filename)
             scores = ""
             map_score = float(eval_map_file(prediction_filename))
             hits_at_1 = float(eval_hits_at_k_file(prediction_filename, 1))
             hits_at_10 = float(eval_hits_at_k_file(prediction_filename, 10))
             hits_at_50 = float(eval_hits_at_k_file(prediction_filename, 50))
-            scores += "{}\t{}\t{}\tMAP\t{}\n".format(config.model_name, config.dataset_name, counter, map_score)
-            scores += "{}\t{}\t{}\tHits@1\t{}\n".format(config.model_name, config.dataset_name, counter, hits_at_1)
-            scores += "{}\t{}\t{}\tHits@10\t{}\n".format(config.model_name, config.dataset_name, counter, hits_at_10)
-            scores += "{}\t{}\t{}\tHits@50\t{}\n".format(config.model_name, config.dataset_name, counter, hits_at_50)
+            scores += "{}\t{}\t{}\tMAP\t{}\n".format(
+                config.model_name, config.dataset_name, counter, map_score
+            )
+            scores += "{}\t{}\t{}\tHits@1\t{}\n".format(
+                config.model_name, config.dataset_name, counter, hits_at_1
+            )
+            scores += "{}\t{}\t{}\tHits@10\t{}\n".format(
+                config.model_name, config.dataset_name, counter, hits_at_10
+            )
+            scores += "{}\t{}\t{}\tHits@50\t{}\n".format(
+                config.model_name, config.dataset_name, counter, hits_at_50
+            )
             print(scores)
-            score_obj = {"samples": counter, "map": map_score, "hits_at_1": hits_at_1, "hits_at_10": hits_at_10, "hits_at_50": hits_at_50,
-                         "config": config.__dict__}
+            score_obj = {
+                "samples": counter,
+                "map": map_score,
+                "hits_at_1": hits_at_1,
+                "hits_at_10": hits_at_10,
+                "hits_at_50": hits_at_50,
+                "config": config.__dict__,
+            }
             print(score_obj)
-            save_dict_to_json(score_obj, os.path.join(output_dir, 'dev.scores.{}.json'.format(counter)))
-            with open(os.path.join(output_dir, 'dev.scores.{}.tsv'.format(counter)), 'w') as fout:
+            save_dict_to_json(
+                score_obj,
+                os.path.join(output_dir, "dev.scores.{}.json".format(counter)),
+            )
+            with open(
+                os.path.join(output_dir, "dev.scores.{}.tsv".format(counter)), "w"
+            ) as fout:
                 fout.write(scores)
             if map_score > best_map:
                 print("New best MAP!")
                 print("Saving Model.....")
-                torch.save(model, os.path.join(output_dir,
-                                               'model_{}_{}_{}.torch'.format(config.model_name, config.dataset_name,
-                                                                             counter)))
+                torch.save(
+                    model,
+                    os.path.join(
+                        output_dir,
+                        "model_{}_{}_{}.torch".format(
+                            config.model_name, config.dataset_name, counter
+                        ),
+                    ),
+                )
                 best_map = map_score
             sys.stdout.flush()
 
         if counter == config.num_minibatches:
             break
 
+
 if __name__ == "__main__":
     # Set up the config
     config = Config(sys.argv[1])
     dataset_name = sys.argv[2]
     model_name = sys.argv[3]
-    train_model(config,dataset_name,model_name)
+    train_model(config, dataset_name, model_name)
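--
Reviewer note, not part of the patch: gating on "CUDA_VISIBLE_DEVICES" in
os.environ treats any set value as "GPU present", including an empty string,
in which case .cuda() would still be selected and fail. A minimal sketch of
the same shim keyed on torch.cuda.is_available() instead (helper name and
call sites unchanged; this is an assumption about intent, not tested here):

    import torch

    def cuda(x):
        # Move x to the GPU only when a CUDA device is actually usable;
        # this also covers CUDA_VISIBLE_DEVICES="" (set but empty).
        return x.cuda() if torch.cuda.is_available() else x

Either way the call sites stay a single code path, e.g.:

    h0 = Variable(cuda(torch.zeros(num_directions, batch_size, hidden_size)),
                  requires_grad=False)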