From 2d4ecf4f2ee9dd90ba01ff5d86e8e3913b704723 Mon Sep 17 00:00:00 2001
From: Amanda Doucette
Date: Thu, 21 Mar 2019 15:02:09 -0400
Subject: [PATCH] make AlignCNN work on CPU

---
 src/python/entity_align/model/AlignCNN.py   | 271 ++++++++++++++------
 src/python/entity_align/train/TrainModel.py |  90 +++++--
 2 files changed, 259 insertions(+), 102 deletions(-)

diff --git a/src/python/entity_align/model/AlignCNN.py b/src/python/entity_align/model/AlignCNN.py
index 4981670..509d19e 100644
--- a/src/python/entity_align/model/AlignCNN.py
+++ b/src/python/entity_align/model/AlignCNN.py
@@ -18,24 +18,38 @@ import torch.nn as nn
 from torch.nn import BCEWithLogitsLoss
 from torch.autograd import Variable
+import os
 
-#This model corresponds to AlignCNN in our paper
-#First, strings converted to list of character embeddings
-#Then, lstm runs over character embeddings
-#lstm embeddings at last time stamp matrix multiplied
-#Finally, cnn detects features in that matrix and outputs similarity score
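+# Device shim: move a tensor to the GPU only when CUDA_VISIBLE_DEVICES is
+# set; otherwise leave it on the CPU. (Assumes the variable is only set when
+# a usable GPU exists -- an empty value would still route through .cuda().)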
+def cuda(x):
+    return x.cuda() if "CUDA_VISIBLE_DEVICES" in os.environ else x
+
+
+# This model corresponds to AlignCNN in our paper
+# First, strings converted to list of character embeddings
+# Then, lstm runs over character embeddings
+# lstm embeddings at last time stamp matrix multiplied
+# Finally, cnn detects features in that matrix and outputs similarity score
 class AlignCNN(torch.nn.Module):
-    def __init__(self,config,vocab):
+    def __init__(self, config, vocab):
         super(AlignCNN, self).__init__()
+        self.config = config
         self.vocab = vocab
 
         # Character embeddings
-        self.embedding = nn.Embedding(vocab.size+1, config.embedding_dim, padding_idx=0)
+        self.embedding = nn.Embedding(
+            vocab.size + 1, config.embedding_dim, padding_idx=0
+        )
 
         # Sequence encoder of strings (LSTM)
-        self.rnn = nn.LSTM(config.embedding_dim, config.rnn_hidden_size, 1, bidirectional = config.bidirectional, batch_first = True)
+        self.rnn = nn.LSTM(
+            config.embedding_dim,
+            config.rnn_hidden_size,
+            1,
+            bidirectional=config.bidirectional,
+            batch_first=True,
+        )
 
         if self.config.bidirectional:
             self.num_directions = 2
@@ -43,14 +57,41 @@ def __init__(self,config,vocab):
             self.num_directions = 1
 
         # Variables for initial states of LSTM (these are different for train and dev because dev might be of different batch sizes)
-        self.h0 = Variable(torch.zeros(self.num_directions, config.batch_size, config.rnn_hidden_size).cuda(), requires_grad=False)
-        self.c0 = Variable(torch.zeros(self.num_directions, config.batch_size, config.rnn_hidden_size).cuda(), requires_grad=False)
-        self.h0_dev = Variable(torch.zeros(self.num_directions, config.dev_batch_size, config.rnn_hidden_size).cuda(), requires_grad=False)
-        self.c0_dev = Variable(torch.zeros(self.num_directions, config.dev_batch_size, config.rnn_hidden_size).cuda(), requires_grad=False)
-
+        self.h0 = Variable(
+            cuda(
+                torch.zeros(
+                    self.num_directions, config.batch_size, config.rnn_hidden_size
+                )
+            ),
+            requires_grad=False,
+        )
+        self.c0 = Variable(
+            cuda(
+                torch.zeros(
+                    self.num_directions, config.batch_size, config.rnn_hidden_size
+                )
+            ),
+            requires_grad=False,
+        )
+        self.h0_dev = Variable(
+            cuda(
+                torch.zeros(
+                    self.num_directions, config.dev_batch_size, config.rnn_hidden_size
+                )
+            ),
+            requires_grad=False,
+        )
+        self.c0_dev = Variable(
+            cuda(
+                torch.zeros(
+                    self.num_directions, config.dev_batch_size, config.rnn_hidden_size
+                )
+            ),
+            requires_grad=False,
+        )
 
         # Define the CNN used to score the alignment matrix
-        pool_output_height = int(np.floor(config.max_string_len/2.0))
+        pool_output_height = int(np.floor(config.max_string_len / 2.0))
 
         # Select # of layers / increasing or decreasing filter size based on config
         if config.num_layers == 4:
@@ -58,54 +99,104 @@ def __init__(self,config,vocab):
             self.relu = nn.ReLU()
             if config.increasing == True:
                 convlyr = nn.Conv2d(1, config.filter_count, 3, padding=1, stride=1)
-                convlyr2 = nn.Conv2d(config.filter_count, config.filter_count2, 5, padding=2, stride=1)
-                convlyr3 = nn.Conv2d(config.filter_count2, config.filter_count3, 5, padding=2, stride=1)
-                convlyr4 = nn.Conv2d(config.filter_count3, config.filter_count4, 7, padding=3, stride=1)
+                convlyr2 = nn.Conv2d(
+                    config.filter_count, config.filter_count2, 5, padding=2, stride=1
+                )
+                convlyr3 = nn.Conv2d(
+                    config.filter_count2, config.filter_count3, 5, padding=2, stride=1
+                )
+                convlyr4 = nn.Conv2d(
+                    config.filter_count3, config.filter_count4, 7, padding=3, stride=1
+                )
             else:
                 convlyr = nn.Conv2d(1, config.filter_count, 7, padding=3, stride=1)
-                convlyr2 = nn.Conv2d(config.filter_count, config.filter_count2, 5, padding=2, stride=1)
-                convlyr3 = nn.Conv2d(config.filter_count2, config.filter_count3, 5, padding=2, stride=1)
-                convlyr4 = nn.Conv2d(config.filter_count3, config.filter_count4, 3, padding=1, stride=1)
-            self.add_module("cnn2",convlyr2)
-            self.add_module("cnn3",convlyr3)
-            self.add_module("cnn4",convlyr4)
-            self.align_weights = nn.Parameter(torch.randn(config.filter_count3, pool_output_height, pool_output_height).cuda(),requires_grad=True)
+                convlyr2 = nn.Conv2d(
+                    config.filter_count, config.filter_count2, 5, padding=2, stride=1
+                )
+                convlyr3 = nn.Conv2d(
+                    config.filter_count2, config.filter_count3, 5, padding=2, stride=1
+                )
+                convlyr4 = nn.Conv2d(
+                    config.filter_count3, config.filter_count4, 3, padding=1, stride=1
+                )
+            self.add_module("cnn2", convlyr2)
+            self.add_module("cnn3", convlyr3)
+            self.add_module("cnn4", convlyr4)
+            self.align_weights = nn.Parameter(
+                cuda(
+                    torch.randn(
+                        config.filter_count3, pool_output_height, pool_output_height
+                    )
+                ),
+                requires_grad=True,
+            )
         elif config.num_layers == 3:
             self.num_layers = 3
             self.relu = nn.ReLU()
             if config.increasing == True:
                 convlyr = nn.Conv2d(1, config.filter_count, 5, padding=2, stride=1)
-                convlyr2 = nn.Conv2d(config.filter_count, config.filter_count2, 5, padding=2, stride=1)
-                convlyr3 = nn.Conv2d(config.filter_count2, config.filter_count3, 7, padding=3, stride=1)
+                convlyr2 = nn.Conv2d(
+                    config.filter_count, config.filter_count2, 5, padding=2, stride=1
+                )
+                convlyr3 = nn.Conv2d(
+                    config.filter_count2, config.filter_count3, 7, padding=3, stride=1
+                )
             else:
                 convlyr = nn.Conv2d(1, config.filter_count, 7, padding=3, stride=1)
-                convlyr2 = nn.Conv2d(config.filter_count, config.filter_count2, 5, padding=2, stride=1)
-                convlyr3 = nn.Conv2d(config.filter_count2, config.filter_count3, 5, padding=2, stride=1)
-            self.add_module("cnn2",convlyr2)
-            self.add_module("cnn3",convlyr3)
-            self.align_weights = nn.Parameter(torch.randn(config.filter_count3, pool_output_height, pool_output_height).cuda(),requires_grad=True)
+                convlyr2 = nn.Conv2d(
+                    config.filter_count, config.filter_count2, 5, padding=2, stride=1
+                )
+                convlyr3 = nn.Conv2d(
+                    config.filter_count2, config.filter_count3, 5, padding=2, stride=1
+                )
+            self.add_module("cnn2", convlyr2)
+            self.add_module("cnn3", convlyr3)
+            self.align_weights = nn.Parameter(
+                cuda(
+                    torch.randn(
+                        config.filter_count3, pool_output_height, pool_output_height
+                    )
+                ),
+                requires_grad=True,
+            )
         elif config.num_layers == 2:
             self.num_layers = 2
             self.relu = nn.ReLU()
             convlyr = nn.Conv2d(1, config.filter_count, 5, padding=2, stride=1)
-            convlyr2 = nn.Conv2d(config.filter_count, config.filter_count2, 3, padding=1, stride=1)
-            self.add_module("cnn2",convlyr2)
-            self.align_weights = nn.Parameter(torch.randn(config.filter_count2, pool_output_height, pool_output_height).cuda(),requires_grad=True)
+            convlyr2 = nn.Conv2d(
+                config.filter_count, config.filter_count2, 3, padding=1, stride=1
+            )
+            self.add_module("cnn2", convlyr2)
+            self.align_weights = nn.Parameter(
+                cuda(
+                    torch.randn(
+                        config.filter_count2, pool_output_height, pool_output_height
+                    )
+                ),
+                requires_grad=True,
+            )
         else:
             self.num_layers = 1
             convlyr = nn.Conv2d(1, config.filter_count, 7, padding=3, stride=1)
-            self.align_weights = nn.Parameter(torch.randn(config.filter_count, pool_output_height, pool_output_height).cuda(),requires_grad=True)
-        self.add_module("cnn",convlyr)
+            self.align_weights = nn.Parameter(
+                cuda(
+                    torch.randn(
+                        config.filter_count, pool_output_height, pool_output_height
+                    )
+                ),
+                requires_grad=True,
+            )
+        self.add_module("cnn", convlyr)
 
         # Define pooling
         self.pool = nn.MaxPool2d((2, 2), stride=2)
 
         # Vector of ones (used for loss)
-        self.ones = Variable(torch.ones(config.batch_size, 1).cuda())
+        self.ones = Variable(cuda(torch.ones(config.batch_size, 1)))
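+        # Target for BCEWithLogitsLoss in compute_loss: driving
+        # sigmoid(pos_score - neg_score) toward 1 yields the BPR ranking loss.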
 
         # Loss
         self.loss = BCEWithLogitsLoss()
 
-    def compute_loss(self,source,pos,neg, source_len,pos_len,neg_len):
+    def compute_loss(self, source, pos, neg, source_len, pos_len, neg_len):
         """ Compute the loss (BPR) for a batch of examples
         :param source: Entity mentions
         :param pos: True aliases of the Mentions
@@ -115,13 +206,14 @@ def compute_loss(self,source,pos,neg, source_len,pos_len,neg_len):
         :param neg_len: lengths of negatives
         :return:
         """
-        source_embed, src_mask = self.embed(source,source_len)
-        pos_embed, pos_mask = self.embed(pos,pos_len)
-        neg_embed, neg_mask = self.embed(neg,neg_len)
+        source_embed, src_mask = self.embed(source, source_len)
+        pos_embed, pos_mask = self.embed(pos, pos_len)
+        neg_embed, neg_mask = self.embed(neg, neg_len)
         loss = self.loss(
-            self.score_pair_train(source_embed , pos_embed, src_mask, pos_mask)
-            - self.score_pair_train(source_embed , neg_embed, src_mask, neg_mask),
-            self.ones)
+            self.score_pair_train(source_embed, pos_embed, src_mask, pos_mask)
+            - self.score_pair_train(source_embed, neg_embed, src_mask, neg_mask),
+            self.ones,
+        )
         return loss
 
@@ -136,9 +228,9 @@ def print_mm(self, src, tgt, src_len, tgt_len):
         """
         source_embed, source_mask = self.embed_dev(src, src_len)
         target_embed, target_mask = self.embed_dev(tgt, tgt_len)
-        return torch.bmm(source_embed,torch.transpose(target_embed, 2, 1))
+        return torch.bmm(source_embed, torch.transpose(target_embed, 2, 1))
 
-    def score_pair_train(self,src,tgt, src_mask, tgt_mask):
+    def score_pair_train(self, src, tgt, src_mask, tgt_mask):
         """
         :param src: Batchsize by Max_String_Length
         :param tgt: Batchsize by Max_String_Length
@@ -146,12 +238,12 @@ def score_pair_train(self,src,tgt, src_mask, tgt_mask):
         :param tgt_mask: Batchsize by Max_String_Length, binary mask corresponding to length of underlying str
         :return: Batchsize by 1
         """
-        multpld = torch.bmm(src,torch.transpose(tgt, 2, 1))
+        multpld = torch.bmm(src, torch.transpose(tgt, 2, 1))
         src_mask = src_mask.unsqueeze(dim=2)
         tgt_mask = tgt_mask.unsqueeze(dim=1)
         mat_mask = torch.bmm(src_mask, tgt_mask)
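+        # mat_mask is the batched outer product of the two length masks, so it
+        # zeroes every alignment score that involves a padding position.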
         multpld = torch.mul(multpld, mat_mask)
-        convd = self.cnn(multpld.unsqueeze(1)) #need num channels
+        convd = self.cnn(multpld.unsqueeze(1))  # need num channels
         if self.num_layers > 1:
             convd = self.relu(convd)
             convd = self.cnn2(convd)
@@ -162,34 +254,38 @@ def score_pair_train(self,src,tgt, src_mask, tgt_mask):
             convd = self.relu(convd)
             convd = self.cnn4(convd)
         convd_after_pooling = self.pool(convd)
-        #print(convd_after_pooling.size())
-        #print(self.align_weights.size())
-        output = torch.sum(self.align_weights.expand_as(convd_after_pooling) * convd_after_pooling, dim=3,keepdim=True)
-        output = torch.sum(output, dim=2,keepdim=True)
+        # print(convd_after_pooling.size())
+        # print(self.align_weights.size())
+        output = torch.sum(
+            self.align_weights.expand_as(convd_after_pooling) * convd_after_pooling,
+            dim=3,
+            keepdim=True,
+        )
+        output = torch.sum(output, dim=2, keepdim=True)
         output = torch.squeeze(output, dim=3)
         output = torch.squeeze(output, dim=2)
-        output = torch.sum(output, dim=1,keepdim=True)
+        output = torch.sum(output, dim=1, keepdim=True)
         return output
 
-    def embed(self,string_mat, string_len):
+    def embed(self, string_mat, string_len):
         """
         :param string_mat: Batch_size by max_string_len
         :return: batch_size by embedding dim
         """
-        string_mat = torch.from_numpy(string_mat).cuda()
-        mask = Variable(torch.cuda.ByteTensor((string_mat > 0)).float())
+        string_mat = cuda(torch.from_numpy(string_mat))
+        mask = Variable((string_mat > 0).float())
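+        # (string_mat > 0) already yields a ByteTensor on string_mat's device,
+        # so no device-specific ByteTensor constructor is needed here.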
         embed_token = self.embedding(Variable(string_mat))
         final_emb, final_hn_cn = self.rnn(embed_token, (self.h0, self.c0))
         return final_emb, mask
 
-    def embed_dev(self, string_mat, string_len, print_embed=False, batch_size = None):
+    def embed_dev(self, string_mat, string_len, print_embed=False, batch_size=None):
         """
         :param string_mat: Batch_size by max_string_len
         :return: batch_size by embedding dim
         """
-        string_mat = torch.from_numpy(string_mat).cuda()
-        mask = Variable(torch.cuda.ByteTensor((string_mat > 0)).float())
+        string_mat = cuda(torch.from_numpy(string_mat))
+        mask = Variable((string_mat > 0).float())
         if not batch_size:
             this_batch_size = self.config.dev_batch_size
             this_h0 = self.h0_dev
@@ -197,28 +293,51 @@ def embed_dev(self, string_mat, string_len, print_embed=False, batch_size = None
         else:
             print("irregular batch size {}".format(batch_size))
             this_batch_size = batch_size
-            this_h0 = Variable(torch.zeros(self.num_directions, batch_size,
-                                           self.config.rnn_hidden_size).cuda(),
-                               requires_grad=False)
-            this_c0 = Variable(torch.zeros(self.num_directions, batch_size,
-                                           self.config.rnn_hidden_size).cuda(),
-                               requires_grad=False)
+            this_h0 = Variable(
+                cuda(
+                    torch.zeros(
+                        self.num_directions, batch_size, self.config.rnn_hidden_size
+                    )
+                ),
+                requires_grad=False,
+            )
+            this_c0 = Variable(
+                cuda(
+                    torch.zeros(
+                        self.num_directions, batch_size, self.config.rnn_hidden_size
+                    )
+                ),
+                requires_grad=False,
+            )
 
         embed_token = self.embedding(Variable(string_mat))
-        if print_embed==True:
+        if print_embed == True:
             return embed_token
         final_emb, final_hn_cn = self.rnn(embed_token, (this_h0, this_c0))
         return final_emb, mask
 
-    def score_dev_test_batch(self,batch_queries,
-                             batch_query_lengths,
-                             batch_targets,
-                             batch_target_lengths,
-                             batch_size):
+    def score_dev_test_batch(
+        self,
+        batch_queries,
+        batch_query_lengths,
+        batch_targets,
+        batch_target_lengths,
+        batch_size,
+    ):
         if batch_size == self.config.dev_batch_size:
-            source_embed,source_mask = self.embed_dev(batch_queries, batch_query_lengths)
-            target_embed,target_mask = self.embed_dev(batch_targets, batch_target_lengths)
+            source_embed, source_mask = self.embed_dev(
+                batch_queries, batch_query_lengths
+            )
+            target_embed, target_mask = self.embed_dev(
+                batch_targets, batch_target_lengths
+            )
         else:
-            source_embed,source_mask = self.embed_dev(batch_queries, batch_query_lengths,batch_size=batch_size)
-            target_embed,target_mask = self.embed_dev(batch_targets, batch_target_lengths,batch_size=batch_size)
-        scores = self.score_pair_train(source_embed, target_embed,source_mask,target_mask)
+            source_embed, source_mask = self.embed_dev(
+                batch_queries, batch_query_lengths, batch_size=batch_size
+            )
+            target_embed, target_mask = self.embed_dev(
+                batch_targets, batch_target_lengths, batch_size=batch_size
+            )
+        scores = self.score_pair_train(
+            source_embed, target_embed, source_mask, target_mask
+        )
         return scores

diff --git a/src/python/entity_align/train/TrainModel.py b/src/python/entity_align/train/TrainModel.py
index bc5cbe2..3ae58a1 100644
--- a/src/python/entity_align/train/TrainModel.py
+++ b/src/python/entity_align/train/TrainModel.py
@@ -34,19 +34,21 @@
 from entity_align.utils.DevTestBatcher import DevBatcher
 from entity_align.utils.Util import save_dict_to_json
 
-def train_model(config,dataset_name,model_name):
+
+def train_model(config, dataset_name, model_name):
     """ Train based on the given config, model / dataset
-    
+
     :param config: config object
     :param dataset_name: name of dataset
     :param model_name: name of model
-    :return: 
+    :return:
     """
     config.dataset_name = dataset_name
     now = datetime.datetime.now()
     config.model_name = model_name
-    ts = "{:04d}-{:02d}-{:02d}-{:02d}-{:02d}-{:02d}".format(now.year, now.month, now.day, now.hour, now.minute,
-                                                            now.second)
+    ts = "{:04d}-{:02d}-{:02d}-{:02d}-{:02d}-{:02d}".format(
+        now.year, now.month, now.day, now.hour, now.minute, now.second
+    )
     config.experiment_out_dir = os.path.join("exp_out", dataset_name, model_name, ts)
 
     # Load vocab
@@ -59,14 +61,16 @@ def train_model(config,dataset_name,model_name):
     # save the config to outdir
     config.save_config(output_dir)
     # save the vocab to out dir
-    copyfile(config.vocab_file, os.path.join(output_dir, 'vocab.tsv'))
+    copyfile(config.vocab_file, os.path.join(output_dir, "vocab.tsv"))
     # save the source code.
-    copytree(os.path.join(os.environ['SED_ROOT'], 'src'), os.path.join(output_dir, 'src'))
+    copytree(
+        os.path.join(os.environ["SED_ROOT"], "src"), os.path.join(output_dir, "src")
+    )
 
     torch.manual_seed(config.random_seed)
 
     # Set up batcher
-    batcher = Batcher(config, vocab, 'train')
+    batcher = Batcher(config, vocab, "train")
 
     model = None
     # Set up Model
@@ -82,9 +86,12 @@ def train_model(config,dataset_name,model_name):
         print("Unknown model")
         sys.exit(1)
 
-    model.cuda()
-    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=config.learning_rate,
-                           weight_decay=config.l2penalty)
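+    # Mirror of the cuda() shim in AlignCNN.py: parameters move to the GPU
+    # only when CUDA_VISIBLE_DEVICES is set, matching where the model's own
+    # tensors are created.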
+    model.cuda() if "CUDA_VISIBLE_DEVICES" in os.environ else model.cpu()
+    optimizer = optim.Adam(
+        filter(lambda p: p.requires_grad, model.parameters()),
+        lr=config.learning_rate,
+        weight_decay=config.l2penalty,
+    )
 
     # Stats
     best_map = 0
@@ -108,45 +115,76 @@ def train_model(config,dataset_name,model_name):
             # print("p-n:{}".format(model.print_loss(source,pos,neg,source_len,pos_len,neg_len)))
             this_loss = loss.cpu().data.numpy()[0]
             sum_loss += this_loss
-            print("Processed {} batches, Loss of batch {}: {}. Average loss: {}".format(counter, counter, this_loss,
-                                                                                        sum_loss / (counter / 100)))
+            print(
+                "Processed {} batches, Loss of batch {}: {}. Average loss: {}".format(
+                    counter, counter, this_loss, sum_loss / (counter / 100)
+                )
+            )
             sys.stdout.flush()
 
         if counter % config.eval_every == 0:
             dev_batcher = DevBatcher(config, vocab)
-            prediction_filename = os.path.join(output_dir, 'dev.predictions.{}.tsv').format(counter)
+            prediction_filename = os.path.join(
+                output_dir, "dev.predictions.{}.tsv"
+            ).format(counter)
             write_predictions(model, dev_batcher, prediction_filename)
             scores = ""
             map_score = float(eval_map_file(prediction_filename))
             hits_at_1 = float(eval_hits_at_k_file(prediction_filename, 1))
             hits_at_10 = float(eval_hits_at_k_file(prediction_filename, 10))
             hits_at_50 = float(eval_hits_at_k_file(prediction_filename, 50))
-            scores += "{}\t{}\t{}\tMAP\t{}\n".format(config.model_name, config.dataset_name, counter, map_score)
-            scores += "{}\t{}\t{}\tHits@1\t{}\n".format(config.model_name, config.dataset_name, counter, hits_at_1)
-            scores += "{}\t{}\t{}\tHits@10\t{}\n".format(config.model_name, config.dataset_name, counter, hits_at_10)
-            scores += "{}\t{}\t{}\tHits@50\t{}\n".format(config.model_name, config.dataset_name, counter, hits_at_50)
+            scores += "{}\t{}\t{}\tMAP\t{}\n".format(
+                config.model_name, config.dataset_name, counter, map_score
+            )
+            scores += "{}\t{}\t{}\tHits@1\t{}\n".format(
+                config.model_name, config.dataset_name, counter, hits_at_1
+            )
+            scores += "{}\t{}\t{}\tHits@10\t{}\n".format(
+                config.model_name, config.dataset_name, counter, hits_at_10
+            )
+            scores += "{}\t{}\t{}\tHits@50\t{}\n".format(
+                config.model_name, config.dataset_name, counter, hits_at_50
+            )
             print(scores)
-            score_obj = {"samples": counter, "map": map_score, "hits_at_1": hits_at_1, "hits_at_10": hits_at_10, "hits_at_50": hits_at_50,
-                         "config": config.__dict__}
+            score_obj = {
+                "samples": counter,
+                "map": map_score,
+                "hits_at_1": hits_at_1,
+                "hits_at_10": hits_at_10,
+                "hits_at_50": hits_at_50,
+                "config": config.__dict__,
+            }
             print(score_obj)
-            save_dict_to_json(score_obj, os.path.join(output_dir, 'dev.scores.{}.json'.format(counter)))
-            with open(os.path.join(output_dir, 'dev.scores.{}.tsv'.format(counter)), 'w') as fout:
+            save_dict_to_json(
+                score_obj,
+                os.path.join(output_dir, "dev.scores.{}.json".format(counter)),
+            )
+            with open(
+                os.path.join(output_dir, "dev.scores.{}.tsv".format(counter)), "w"
+            ) as fout:
                 fout.write(scores)
             if map_score > best_map:
                 print("New best MAP!")
                 print("Saving Model.....")
-                torch.save(model, os.path.join(output_dir,
-                                               'model_{}_{}_{}.torch'.format(config.model_name, config.dataset_name,
-                                                                             counter)))
+                torch.save(
+                    model,
+                    os.path.join(
+                        output_dir,
+                        "model_{}_{}_{}.torch".format(
+                            config.model_name, config.dataset_name, counter
+                        ),
+                    ),
+                )
                 best_map = map_score
             sys.stdout.flush()
 
         if counter == config.num_minibatches:
             break
 
+
 if __name__ == "__main__":
     # Set up the config
     config = Config(sys.argv[1])
     dataset_name = sys.argv[2]
     model_name = sys.argv[3]
-    train_model(config,dataset_name,model_name)
+    train_model(config, dataset_name, model_name)
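--
Reviewer note, not part of the patch: gating on "CUDA_VISIBLE_DEVICES" in
os.environ treats any set value as "GPU present", including an empty string,
in which case .cuda() would still be selected and fail. A minimal sketch of
the same shim keyed on torch.cuda.is_available() instead (helper name and
call sites unchanged; this is an assumption about intent, not tested here):

    import torch

    def cuda(x):
        # Move x to the GPU only when a CUDA device is actually usable;
        # this also covers CUDA_VISIBLE_DEVICES="" (set but empty).
        return x.cuda() if torch.cuda.is_available() else x

Either way the call sites stay a single code path, e.g.:

    h0 = Variable(cuda(torch.zeros(num_directions, batch_size, hidden_size)),
                  requires_grad=False)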