
Commit 200aa5f

Author: DavidHuji
Commit message: cosmetics
1 parent 4c1be2d commit 200aa5f

2 files changed (+53, -22 lines)


README.md

+34 -3
@@ -1,7 +1,7 @@
 # CapDec: Text-Only Training for Image Captioning using Noise-Injected CLIP
 
 
-## Official implementation for the paper ["CapDec: Text-Only Training for Image Captioning using Noise-Injected CLIP"](https://arxiv.org/abs/2211.00575) (EMNLP 2022).
+## Official implementation for the paper ["CapDec: Text-Only Training for Image Captioning using Noise-Injected CLIP"](https://arxiv.org/abs/2211.00575), EMNLP 2022 (findings).
 ![alt text](https://github.com/DavidHuji/CapDec/blob/main/fig1.png)
 
 ## Description
@@ -46,9 +46,40 @@ python embeddings_generator.py -h
 python train.py --data clip_embeddings_of_last_stage.pkl --out_dir ./coco_train/
 ```
 
-**There are a few interesting configurable parameters for training as follows:**
+**There are a few interesting configurable parameters for training, listed below.
+You can view them by running `python train.py --help`.**
 ```
-output of train.py -h
+optional arguments:
+  -h, --help            show this help message and exit
+  --data DATA           path to CLIP embeddings of captions generated by the attached embeddings_generator script
+  --val_pt VAL_PT       path to CLIP embeddings of the validation set
+  --pretrain_weights PRETRAIN_WEIGHTS
+                        path to pretrained weights; if not specified, trains from scratch
+  --out_dir OUT_DIR     path to output directory
+  --add_modality_offset
+                        train with the modality offset pre-calculated at others/CLIP_embeddings_centers_info.pkl
+  --prefix PREFIX       prefix for saved filenames
+  --noise_variance NOISE_VARIANCE
+                        noise variance
+  --uniform_noise       use uniform noise instead of Gaussian
+  --dont_norm           do not normalize CLIP embeddings
+  --lr LR               learning rate
+  --epochs EPOCHS       number of epochs
+  --save_every SAVE_EVERY
+                        save every n epochs
+  --prefix_length PREFIX_LENGTH
+                        prefix length
+  --prefix_length_clip PREFIX_LENGTH_CLIP
+                        prefix length for CLIP
+  --bs BS               batch size
+  --only_prefix         train only the mapper between CLIP and GPT, while GPT is frozen
+  --mapping_type MAPPING_TYPE
+                        type of architecture between CLIP and GPT (mlp/transformer)
+  --num_layers NUM_LAYERS
+                        number of layers in the mapper
+  --is_not_rn           choose the CLIP backbone: False for RN, True for ViT
+  --use_image_embedding_as_clipcap
+                        use image embedding as in ClipCap
 ```
 
 # Evaluation
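
For reference, several of these flags can be combined in a single training run. A sketch of such an invocation, where the embeddings filename, output directory, and noise-variance value are illustrative placeholders rather than prescribed settings:

```
python train.py \
  --data clip_embeddings_of_last_stage.pkl \
  --out_dir ./coco_train/ \
  --noise_variance 0.016 \
  --only_prefix \
  --epochs 10 \
  --bs 34
```

Here `--only_prefix` freezes GPT and trains only the CLIP-to-GPT mapper, the lighter-weight of the two training modes described in the help text above.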

train.py

+19 -19
@@ -394,26 +394,26 @@ def train(dataset: ClipCocoDataset, model: ClipCaptionModel, args, warmup_steps:
 
 def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument('--data', default='clip_embedding.pkl')
-    parser.add_argument('--val_pt', default='')
-    parser.add_argument('--pretrain_weights', default='')
-    parser.add_argument('--out_dir', default='./checkpoints')
-    parser.add_argument('--add_modality_offset', dest='add_modality_offset', action='store_true', default=False)
+    parser.add_argument('--data', default='clip_embedding.pkl', help='path to CLIP embeddings of captions generated by the attached embeddings_generator script')
+    parser.add_argument('--val_pt', default='', help='path to CLIP embeddings of the validation set')
+    parser.add_argument('--pretrain_weights', default='', help='path to pretrained weights; if not specified, trains from scratch')
+    parser.add_argument('--out_dir', default='./checkpoints', help='path to output directory')
+    parser.add_argument('--add_modality_offset', dest='add_modality_offset', action='store_true', default=False, help='train with the modality offset pre-calculated at others/CLIP_embeddings_centers_info.pkl')
     parser.add_argument('--prefix', default='coco_prefix', help='prefix for saved filenames')
-    parser.add_argument('--noise_variance', type=float, default=0.0)
-    parser.add_argument('--uniform_noise', dest='uniform_noise', action='store_true', default=False)
-    parser.add_argument('--dont_norm', dest='dont_norm', action='store_true', default=False)
-    parser.add_argument('--lr', type=float, default=2e-5)
-    parser.add_argument('--epochs', type=int, default=10)
-    parser.add_argument('--save_every', type=int, default=1)
-    parser.add_argument('--prefix_length', type=int, default=40)
-    parser.add_argument('--prefix_length_clip', type=int, default=40)
-    parser.add_argument('--bs', type=int, default=34)
-    parser.add_argument('--only_prefix', dest='only_prefix', action='store_true', default=False)
-    parser.add_argument('--mapping_type', type=str, default='transformer', help='mlp/transformer')
-    parser.add_argument('--num_layers', type=int, default=8)
-    parser.add_argument('--is_not_rn', dest='is_not_rn', action='store_true', default=False)
-    parser.add_argument('--use_image_embedding_as_clipcap', dest='use_image_embedding_as_clipcap', action='store_true', default=False)
+    parser.add_argument('--noise_variance', type=float, default=0.0, help='noise variance')
+    parser.add_argument('--uniform_noise', dest='uniform_noise', action='store_true', default=False, help='use uniform noise instead of Gaussian')
+    parser.add_argument('--dont_norm', dest='dont_norm', action='store_true', default=False, help='do not normalize CLIP embeddings')
+    parser.add_argument('--lr', type=float, default=2e-5, help='learning rate')
+    parser.add_argument('--epochs', type=int, default=10, help='number of epochs')
+    parser.add_argument('--save_every', type=int, default=1, help='save every n epochs')
+    parser.add_argument('--prefix_length', type=int, default=40, help='prefix length')
+    parser.add_argument('--prefix_length_clip', type=int, default=40, help='prefix length for CLIP')
+    parser.add_argument('--bs', type=int, default=34, help='batch size')
+    parser.add_argument('--only_prefix', dest='only_prefix', action='store_true', default=False, help='train only the mapper between CLIP and GPT, while GPT is frozen')
+    parser.add_argument('--mapping_type', type=str, default='transformer', help='type of architecture between CLIP and GPT (mlp/transformer)')
+    parser.add_argument('--num_layers', type=int, default=8, help='number of layers in the mapper')
+    parser.add_argument('--is_not_rn', dest='is_not_rn', action='store_true', default=False, help='choose the CLIP backbone: False for RN, True for ViT')
+    parser.add_argument('--use_image_embedding_as_clipcap', dest='use_image_embedding_as_clipcap', action='store_true', default=False, help='use image embedding as in ClipCap')
     args = parser.parse_args()
     if args.data == 'COCO':
         args.bs = 30
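
The `--noise_variance` and `--uniform_noise` flags drive CapDec's central idea: corrupting the CLIP text embedding with zero-mean noise during training so that the decoder becomes robust to the gap between text and image embeddings at inference time. Below is a minimal sketch of such an injection step, assuming the flag is interpreted as a variance (so the standard deviation is its square root); the function name and the uniform-noise scaling are illustrative assumptions, not the repository's exact code:

```
import torch

def inject_noise(x: torch.Tensor, variance: float = 0.016, uniform: bool = False) -> torch.Tensor:
    """Add zero-mean noise to a (normalized) CLIP text embedding. Illustrative sketch."""
    if variance <= 0.0:
        return x  # --noise_variance 0.0 disables the injection entirely
    std = variance ** 0.5
    if uniform:
        # U(-a, a) has variance a^2 / 3, so a = std * sqrt(3) matches the
        # Gaussian variance (a scaling assumption, not taken from the repo).
        a = std * (3.0 ** 0.5)
        noise = torch.empty_like(x).uniform_(-a, a)
    else:
        noise = torch.randn_like(x) * std
    return x + noise
```

During training, each caption embedding would pass through a step like this before the mapper, e.g. `prefix = mapper(inject_noise(clip_text_embedding, args.noise_variance, args.uniform_noise))`.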
