-
Notifications
You must be signed in to change notification settings - Fork 2.5k
Validation datasets support during training #785
base: main
Are you sure you want to change the base?
Changes from 9 commits
4b0ef2d
7016daa
9908d79
b422778
ef4669c
3e2edba
63b550d
e2c50a6
989816d
3c469a0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -29,3 +29,8 @@ dist/ | |
| /datasets | ||
| /models | ||
| /output | ||
|
|
||
| /inference | ||
| last_checkpoint | ||
| log.txt | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,20 +9,21 @@ | |
|
|
||
| from . import datasets as D | ||
| from . import samplers | ||
| from .dataset_mode import DatasetMode | ||
|
|
||
| from .collate_batch import BatchCollator | ||
| from .transforms import build_transforms | ||
|
|
||
|
|
||
| def build_dataset(dataset_list, transforms, dataset_catalog, is_train=True): | ||
| def build_dataset(dataset_list, transforms, dataset_catalog, mode=DatasetMode.TRAIN): | ||
| """ | ||
| Arguments: | ||
| dataset_list (list[str]): Contains the names of the datasets, i.e., | ||
| coco_2014_train, coco_2014_val, etc | ||
| transforms (callable): transforms to apply to each (image, target) sample | ||
| dataset_catalog (DatasetCatalog): contains the information on how to | ||
| construct a dataset. | ||
| is_train (bool): whether to setup the dataset for training or testing | ||
| mode (DatasetMode): whether to setup the dataset for training, validation, or testing | ||
| """ | ||
| if not isinstance(dataset_list, (list, tuple)): | ||
| raise RuntimeError( | ||
|
|
@@ -36,16 +37,16 @@ def build_dataset(dataset_list, transforms, dataset_catalog, is_train=True): | |
| # for COCODataset, we want to remove images without annotations | ||
| # during training | ||
| if data["factory"] == "COCODataset": | ||
| args["remove_images_without_annotations"] = is_train | ||
| args["remove_images_without_annotations"] = mode != DatasetMode.TEST | ||
| if data["factory"] == "PascalVOCDataset": | ||
| args["use_difficult"] = not is_train | ||
| args["use_difficult"] = mode == DatasetMode.TEST | ||
| args["transforms"] = transforms | ||
| # make dataset from factory | ||
| dataset = factory(**args) | ||
| datasets.append(dataset) | ||
|
|
||
| # for testing, return a list of datasets | ||
| if not is_train: | ||
| if mode != DatasetMode.TEST: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Even though not really the best thing to do, I believe in most cases we simply evaluate on the test dataset after N iterations, so I think that we can remove the
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I added another boolean flag instead to control how the data loader is created: |
||
| return datasets | ||
|
|
||
| # for training, concatenate all datasets into a single one | ||
|
|
@@ -104,9 +105,9 @@ def make_batch_data_sampler( | |
| return batch_sampler | ||
|
|
||
|
|
||
| def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0): | ||
| def make_data_loader(cfg, mode=DatasetMode.TRAIN, is_distributed=False, start_iter=0): | ||
| num_gpus = get_world_size() | ||
| if is_train: | ||
| if mode == DatasetMode.TRAIN: | ||
| images_per_batch = cfg.SOLVER.IMS_PER_BATCH | ||
| assert ( | ||
| images_per_batch % num_gpus == 0 | ||
|
|
@@ -115,6 +116,7 @@ def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0): | |
| images_per_gpu = images_per_batch // num_gpus | ||
| shuffle = True | ||
| num_iters = cfg.SOLVER.MAX_ITER | ||
| dataset_list = cfg.DATASETS.TRAIN | ||
| else: | ||
| images_per_batch = cfg.TEST.IMS_PER_BATCH | ||
| assert ( | ||
|
|
@@ -125,6 +127,7 @@ def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0): | |
| shuffle = False if not is_distributed else True | ||
| num_iters = None | ||
| start_iter = 0 | ||
| dataset_list = cfg.DATASETS.TEST if mode == DatasetMode.TEST else cfg.DATASETS.VAL | ||
|
|
||
| if images_per_gpu > 1: | ||
| logger = logging.getLogger(__name__) | ||
|
|
@@ -148,10 +151,9 @@ def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0): | |
| "maskrcnn_benchmark.config.paths_catalog", cfg.PATHS_CATALOG, True | ||
| ) | ||
| DatasetCatalog = paths_catalog.DatasetCatalog | ||
| dataset_list = cfg.DATASETS.TRAIN if is_train else cfg.DATASETS.TEST | ||
|
|
||
| transforms = build_transforms(cfg, is_train) | ||
| datasets = build_dataset(dataset_list, transforms, DatasetCatalog, is_train) | ||
| transforms = build_transforms(cfg, mode) | ||
| datasets = build_dataset(dataset_list, transforms, DatasetCatalog, mode) | ||
|
|
||
| data_loaders = [] | ||
| for dataset in datasets: | ||
|
|
@@ -168,8 +170,8 @@ def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0): | |
| collate_fn=collator, | ||
| ) | ||
| data_loaders.append(data_loader) | ||
| if is_train: | ||
| # during training, a single (possibly concatenated) data_loader is returned | ||
| if mode != DatasetMode.TEST: | ||
| # during training and validation, a single (possibly concatenated) data_loader is returned | ||
| assert len(data_loaders) == 1 | ||
| return data_loaders[0] | ||
| return data_loaders | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,8 @@ | ||
| # Author: Petr Vytovtov <p.vytovtov@partner.samsung.com> | ||
| from enum import Enum | ||
|
|
||
|
|
||
| class DatasetMode(Enum): | ||
| TRAIN = 1 | ||
| VALID = 2 | ||
| TEST = 3 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,7 +6,7 @@ | |
| import torch | ||
| import torch.distributed as dist | ||
|
|
||
| from maskrcnn_benchmark.utils.comm import get_world_size | ||
| from maskrcnn_benchmark.utils.comm import get_world_size, synchronize | ||
| from maskrcnn_benchmark.utils.metric_logger import MetricLogger | ||
|
|
||
| from apex import amp | ||
|
|
@@ -45,6 +45,7 @@ def do_train( | |
| device, | ||
| checkpoint_period, | ||
| arguments, | ||
| data_loader_val=None, | ||
| ): | ||
| logger = logging.getLogger("maskrcnn_benchmark.trainer") | ||
| logger.info("Start training") | ||
|
|
@@ -107,6 +108,37 @@ def do_train( | |
| ) | ||
| if iteration % checkpoint_period == 0: | ||
| checkpointer.save("model_{:07d}".format(iteration), **arguments) | ||
| if data_loader_val is not None: | ||
| meters_val = MetricLogger(delimiter=" ") | ||
| synchronize() | ||
| with torch.no_grad(): | ||
| for idx_val, (images_val, targets_val, _) in enumerate(data_loader_val): | ||
| images_val = images_val.to(device) | ||
| targets_val = [target.to(device) for target in targets_val] | ||
| loss_dict = model(images_val, targets_val) | ||
| losses = sum(loss for loss in loss_dict.values()) | ||
| loss_dict_reduced = reduce_loss_dict(loss_dict) | ||
| losses_reduced = sum(loss for loss in loss_dict_reduced.values()) | ||
| meters_val.update(loss=losses_reduced, **loss_dict_reduced) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If I understand it correctly, you only evaluate the loss here, while a generally more useful metric is to report the mAP, as we do for testing.
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I added inference metric calculation in addition to loss calculation. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This line records the batch's loss on the validation set using the model at the current training iteration, right?
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, global average loss is needed and |
||
| synchronize() | ||
| logger.info( | ||
| meters_val.delimiter.join( | ||
| [ | ||
| "[Validation]: ", | ||
| "eta: {eta}", | ||
| "iter: {iter}", | ||
| "{meters}", | ||
| "lr: {lr:.6f}", | ||
| "max mem: {memory:.0f}", | ||
| ] | ||
| ).format( | ||
| eta=eta_string, | ||
| iter=iteration, | ||
| meters=str(meters), | ||
|
||
| lr=optimizer.param_groups[0]["lr"], | ||
| memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, | ||
| ) | ||
| ) | ||
| if iteration == max_iter: | ||
| checkpointer.save("model_final", **arguments) | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you revert this change? All the models have been trained using the new
coco_2017_train dataset, which corresponds to coco_2014_train + coco_2014_valminusminival. If you want to evaluate at every N iterations, you could do it on the coco_2014_minival? There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've reverted it and created another config file where the number of iterations for validation specified:
https://github.com/facebookresearch/maskrcnn-benchmark/pull/828/files#diff-4dd26a63ac00a49aeb10985800d7f21c