
Commit 200aa5f

Author: DavidHuji
Commit message: cosmetics
1 parent 4c1be2d commit 200aa5f

2 files changed (+53, -22 lines)


README.md

+34 -3
@@ -1,7 +1,7 @@
 # CapDec: Text-Only Training for Image Captioning using Noise-Injected CLIP
 
 
-## Official implementation for the paper ["CapDec: Text-Only Training for Image Captioning using Noise-Injected CLIP"](https://arxiv.org/abs/2211.00575) (EMNLP 2022).
+## Official implementation for the paper ["CapDec: Text-Only Training for Image Captioning using Noise-Injected CLIP"](https://arxiv.org/abs/2211.00575), EMNLP 2022 (findings).
 ![alt text](https://github.com/DavidHuji/CapDec/blob/main/fig1.png)
 
 ## Description
@@ -46,9 +46,40 @@ python embeddings_generator.py -h
 python train.py --data clip_embeddings_of_last_stage.pkl --out_dir ./coco_train/
 ```
 
-**There are a few interesting configurable parameters for training as follows:**
+**There are a few interesting configurable parameters for training, listed below.
+You can view them by running `python train.py --help`.**
 ```
-output of train.py -h
+optional arguments:
+  -h, --help            show this help message and exit
+  --data DATA           path to CLIP embeddings of captions generated by the attached embeddings_generator script
+  --val_pt VAL_PT       path to CLIP embeddings of the validation set
+  --pretrain_weights PRETRAIN_WEIGHTS
+                        path to pretrained weights; if not specified, trains from scratch
+  --out_dir OUT_DIR     path to output directory
+  --add_modality_offset
+                        train with the modality offset pre-calculated at others/CLIP_embeddings_centers_info.pkl
+  --prefix PREFIX       prefix for saved filenames
+  --noise_variance NOISE_VARIANCE
+                        noise variance
+  --uniform_noise       use uniform noise instead of Gaussian
+  --dont_norm           do not normalize CLIP embeddings
+  --lr LR               learning rate
+  --epochs EPOCHS       number of epochs
+  --save_every SAVE_EVERY
+                        save every n epochs
+  --prefix_length PREFIX_LENGTH
+                        prefix length
+  --prefix_length_clip PREFIX_LENGTH_CLIP
+                        prefix length for CLIP
+  --bs BS               batch size
+  --only_prefix         train only the mapper between CLIP and GPT, while GPT is frozen
+  --mapping_type MAPPING_TYPE
+                        type of architecture between CLIP and GPT (mlp/transformer)
+  --num_layers NUM_LAYERS
+                        number of layers in the mapper
+  --is_not_rn           choose the CLIP backbone: False for RN, True for ViT
+  --use_image_embedding_as_clipcap
+                        use image embedding as in ClipCap
 ```
 
 # Evaluation
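
For reference, several of these flags can be combined in a single training run. A sketch of such an invocation, where the embeddings filename, output directory, and noise-variance value are illustrative placeholders rather than prescribed settings:

```
python train.py \
  --data clip_embeddings_of_last_stage.pkl \
  --out_dir ./coco_train/ \
  --noise_variance 0.016 \
  --only_prefix \
  --epochs 10 \
  --bs 34
```

Here `--only_prefix` freezes GPT and trains only the CLIP-to-GPT mapper, the lighter-weight of the two training modes described in the help text above.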

train.py

+19 -19
@@ -394,26 +394,26 @@ def train(dataset: ClipCocoDataset, model: ClipCaptionModel, args, warmup_steps:
 
 def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument('--data', default='clip_embedding.pkl')
-    parser.add_argument('--val_pt', default='')
-    parser.add_argument('--pretrain_weights', default='')
-    parser.add_argument('--out_dir', default='./checkpoints')
-    parser.add_argument('--add_modality_offset', dest='add_modality_offset', action='store_true', default=False)
+    parser.add_argument('--data', default='clip_embedding.pkl', help='path to CLIP embeddings of captions generated by the attached embeddings_generator script')
+    parser.add_argument('--val_pt', default='', help='path to CLIP embeddings of the validation set')
+    parser.add_argument('--pretrain_weights', default='', help='path to pretrained weights; if not specified, trains from scratch')
+    parser.add_argument('--out_dir', default='./checkpoints', help='path to output directory')
+    parser.add_argument('--add_modality_offset', dest='add_modality_offset', action='store_true', default=False, help='train with the modality offset pre-calculated at others/CLIP_embeddings_centers_info.pkl')
     parser.add_argument('--prefix', default='coco_prefix', help='prefix for saved filenames')
-    parser.add_argument('--noise_variance', type=float, default=0.0)
-    parser.add_argument('--uniform_noise', dest='uniform_noise', action='store_true', default=False)
-    parser.add_argument('--dont_norm', dest='dont_norm', action='store_true', default=False)
-    parser.add_argument('--lr', type=float, default=2e-5)
-    parser.add_argument('--epochs', type=int, default=10)
-    parser.add_argument('--save_every', type=int, default=1)
-    parser.add_argument('--prefix_length', type=int, default=40)
-    parser.add_argument('--prefix_length_clip', type=int, default=40)
-    parser.add_argument('--bs', type=int, default=34)
-    parser.add_argument('--only_prefix', dest='only_prefix', action='store_true', default=False)
-    parser.add_argument('--mapping_type', type=str, default='transformer', help='mlp/transformer')
-    parser.add_argument('--num_layers', type=int, default=8)
-    parser.add_argument('--is_not_rn', dest='is_not_rn', action='store_true', default=False)
-    parser.add_argument('--use_image_embedding_as_clipcap', dest='use_image_embedding_as_clipcap', action='store_true', default=False)
+    parser.add_argument('--noise_variance', type=float, default=0.0, help='noise variance')
+    parser.add_argument('--uniform_noise', dest='uniform_noise', action='store_true', default=False, help='use uniform noise instead of Gaussian')
+    parser.add_argument('--dont_norm', dest='dont_norm', action='store_true', default=False, help='do not normalize CLIP embeddings')
+    parser.add_argument('--lr', type=float, default=2e-5, help='learning rate')
+    parser.add_argument('--epochs', type=int, default=10, help='number of epochs')
+    parser.add_argument('--save_every', type=int, default=1, help='save every n epochs')
+    parser.add_argument('--prefix_length', type=int, default=40, help='prefix length')
+    parser.add_argument('--prefix_length_clip', type=int, default=40, help='prefix length for CLIP')
+    parser.add_argument('--bs', type=int, default=34, help='batch size')
+    parser.add_argument('--only_prefix', dest='only_prefix', action='store_true', default=False, help='train only the mapper between CLIP and GPT, while GPT is frozen')
+    parser.add_argument('--mapping_type', type=str, default='transformer', help='type of architecture between CLIP and GPT (mlp/transformer)')
+    parser.add_argument('--num_layers', type=int, default=8, help='number of layers in the mapper')
+    parser.add_argument('--is_not_rn', dest='is_not_rn', action='store_true', default=False, help='choose the CLIP backbone: False for RN, True for ViT')
+    parser.add_argument('--use_image_embedding_as_clipcap', dest='use_image_embedding_as_clipcap', action='store_true', default=False, help='use image embedding as in ClipCap')
     args = parser.parse_args()
     if args.data == 'COCO':
         args.bs = 30
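
The `--noise_variance` and `--uniform_noise` flags drive CapDec's central idea: corrupting the CLIP text embedding with zero-mean noise during training so that the decoder becomes robust to the gap between text and image embeddings at inference time. Below is a minimal sketch of such an injection step, assuming the flag is interpreted as a variance (so the standard deviation is its square root); the function name and the uniform-noise scaling are illustrative assumptions, not the repository's exact code:

```
import torch

def inject_noise(x: torch.Tensor, variance: float = 0.016, uniform: bool = False) -> torch.Tensor:
    """Add zero-mean noise to a (normalized) CLIP text embedding. Illustrative sketch."""
    if variance <= 0.0:
        return x  # --noise_variance 0.0 disables the injection entirely
    std = variance ** 0.5
    if uniform:
        # U(-a, a) has variance a^2 / 3, so a = std * sqrt(3) matches the
        # Gaussian variance (a scaling assumption, not taken from the repo).
        a = std * (3.0 ** 0.5)
        noise = torch.empty_like(x).uniform_(-a, a)
    else:
        noise = torch.randn_like(x) * std
    return x + noise
```

During training, each caption embedding would pass through a step like this before the mapper, e.g. `prefix = mapper(inject_noise(clip_text_embedding, args.noise_variance, args.uniform_noise))`.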
