From bf6b8f7e39670b7622039c944d571e6630a4423b Mon Sep 17 00:00:00 2001 From: Eric Jinks Date: Tue, 9 Jul 2019 11:22:37 +1000 Subject: [PATCH 01/16] Validate at each checkpoint (PR 828) --- maskrcnn_benchmark/config/defaults.py | 2 + maskrcnn_benchmark/data/build.py | 8 ++- maskrcnn_benchmark/engine/trainer.py | 82 ++++++++++++++++++++++++++- tools/train_net.py | 23 ++++++-- 4 files changed, 105 insertions(+), 10 deletions(-) diff --git a/maskrcnn_benchmark/config/defaults.py b/maskrcnn_benchmark/config/defaults.py index fc750fd4f..4058deb89 100644 --- a/maskrcnn_benchmark/config/defaults.py +++ b/maskrcnn_benchmark/config/defaults.py @@ -395,6 +395,8 @@ _C.SOLVER.WARMUP_METHOD = "linear" _C.SOLVER.CHECKPOINT_PERIOD = 2500 +# Validate every 2500 +_C.SOLVER.TEST_PERIOD = 2500 # Number of images per batch # This is global, so if we have 8 GPUs and IMS_PER_BATCH = 16, each GPU will diff --git a/maskrcnn_benchmark/data/build.py b/maskrcnn_benchmark/data/build.py index d2895fd7e..7ff222c81 100644 --- a/maskrcnn_benchmark/data/build.py +++ b/maskrcnn_benchmark/data/build.py @@ -104,7 +104,9 @@ def make_batch_data_sampler( return batch_sampler -def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0): +def make_data_loader( + cfg, is_train=True, is_distributed=False, start_iter=0, is_for_period=False +): num_gpus = get_world_size() if is_train: images_per_batch = cfg.SOLVER.IMS_PER_BATCH @@ -151,7 +153,7 @@ def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0): dataset_list = cfg.DATASETS.TRAIN if is_train else cfg.DATASETS.TEST transforms = build_transforms(cfg, is_train) - datasets = build_dataset(dataset_list, transforms, DatasetCatalog, is_train) + datasets = build_dataset(dataset_list, transforms, DatasetCatalog, is_trainis_train or is_for_period)) data_loaders = [] for dataset in datasets: @@ -168,7 +170,7 @@ def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0): collate_fn=collator, ) data_loaders.append(data_loader) - if is_train: + if is_train or is_for_period: # during training, a single (possibly concatenated) data_loader is returned assert len(data_loaders) == 1 return data_loaders[0] diff --git a/maskrcnn_benchmark/engine/trainer.py b/maskrcnn_benchmark/engine/trainer.py index 38a9e524b..9d7e6d893 100644 --- a/maskrcnn_benchmark/engine/trainer.py +++ b/maskrcnn_benchmark/engine/trainer.py @@ -1,13 +1,19 @@ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import datetime import logging +import os import time import torch import torch.distributed as dist +from tqdm import tqdm -from maskrcnn_benchmark.utils.comm import get_world_size +from maskrcnn_benchmark.data import make_data_loader +from maskrcnn_benchmark.utils.comm import get_world_size, synchronize from maskrcnn_benchmark.utils.metric_logger import MetricLogger +from maskrcnn_benchmark.engine.inference import inference + +from apex import amp def reduce_loss_dict(loss_dict): @@ -36,13 +42,16 @@ def reduce_loss_dict(loss_dict): def do_train( + cfg, model, data_loader, + data_loader_val, optimizer, scheduler, checkpointer, device, checkpoint_period, + test_period, arguments, ): logger = logging.getLogger("maskrcnn_benchmark.trainer") @@ -53,6 +62,14 @@ def do_train( model.train() start_training_time = time.time() end = time.time() + + iou_types = ("bbox",) + if cfg.MODEL.MASK_ON: + iou_types = iou_types + ("segm",) + if cfg.MODEL.KEYPOINT_ON: + iou_types = iou_types + ("keypoints",) + dataset_names = cfg.DATASETS.TEST + for iteration, (images, targets, _) in enumerate(data_loader, start_iter): data_time = time.time() - end iteration = iteration + 1 @@ -73,7 +90,10 @@ def do_train( meters.update(loss=losses_reduced, **loss_dict_reduced) optimizer.zero_grad() - losses.backward() + # Note: If mixed precision is not used, this ends up doing nothing + # Otherwise apply loss scaling for mixed-precision recipe + with amp.scale_loss(losses, optimizer) as scaled_losses: + scaled_losses.backward() optimizer.step() batch_time = time.time() - end @@ -103,6 +123,64 @@ def do_train( ) if iteration % checkpoint_period == 0: checkpointer.save("model_{:07d}".format(iteration), **arguments) + if ( + data_loader_val is not None + and test_period > 0 + and iteration % test_period == 0 + ): + meters_val = MetricLogger(delimiter=" ") + synchronize() + _ = inference( # The result can be used for additional logging, e. g. for TensorBoard + model, + # The method changes the segmentation mask format in a data loader, + # so every time a new data loader is created: + make_data_loader( + cfg, + is_train=False, + is_distributed=(get_world_size() > 1), + is_for_period=True, + ), + dataset_name="[Validation]", + iou_types=iou_types, + box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, + device=cfg.MODEL.DEVICE, + expected_results=cfg.TEST.EXPECTED_RESULTS, + expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, + output_folder=None, + ) + synchronize() + model.train() + with torch.no_grad(): + # Should be one image for each GPU: + for iteration_val, (images_val, targets_val, _) in enumerate( + tqdm(data_loader_val) + ): + images_val = images_val.to(device) + targets_val = [target.to(device) for target in targets_val] + loss_dict = model(images_val, targets_val) + losses = sum(loss for loss in loss_dict.values()) + loss_dict_reduced = reduce_loss_dict(loss_dict) + losses_reduced = sum(loss for loss in loss_dict_reduced.values()) + meters_val.update(loss=losses_reduced, **loss_dict_reduced) + synchronize() + logger.info( + meters_val.delimiter.join( + [ + "[Validation]: ", + "eta: {eta}", + "iter: {iter}", + "{meters}", + "lr: {lr:.6f}", + "max mem: {memory:.0f}", + ] + ).format( + eta=eta_string, + iter=iteration, + meters=str(meters_val), + lr=optimizer.param_groups[0]["lr"], + memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, + ) + ) if iteration == max_iter: checkpointer.save("model_final", **arguments) diff --git a/tools/train_net.py b/tools/train_net.py index e4f95f015..269393f21 100644 --- a/tools/train_net.py +++ b/tools/train_net.py @@ -36,7 +36,9 @@ def train(cfg, local_rank, distributed): if distributed: model = torch.nn.parallel.DistributedDataParallel( - model, device_ids=[local_rank], output_device=local_rank, + model, + device_ids=[local_rank], + output_device=local_rank, # this should be removed if we update BatchNorm stats broadcast_buffers=False, ) @@ -60,16 +62,27 @@ def train(cfg, local_rank, distributed): start_iter=arguments["iteration"], ) + test_period = cfg.SOLVER.TEST_PERIOD + if test_period > 0: + data_loader_val = make_data_loader( + cfg, is_train=False, is_distributed=distributed, is_for_period=True + ) + else: + data_loader_val = None + checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD do_train( + cfg, model, data_loader, + data_loader_val, optimizer, scheduler, checkpointer, device, checkpoint_period, + test_period, arguments, ) @@ -93,7 +106,9 @@ def run_test(cfg, model, distributed): mkdir(output_folder) output_folders[idx] = output_folder data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed) - for output_folder, dataset_name, data_loader_val in zip(output_folders, dataset_names, data_loaders_val): + for output_folder, dataset_name, data_loader_val in zip( + output_folders, dataset_names, data_loaders_val + ): inference( model, data_loader_val, @@ -138,9 +153,7 @@ def main(): if args.distributed: torch.cuda.set_device(args.local_rank) - torch.distributed.init_process_group( - backend="nccl", init_method="env://" - ) + torch.distributed.init_process_group(backend="nccl", init_method="env://") synchronize() cfg.merge_from_file(args.config_file) From 0bcf3aa80b165d0baea0cc629b433fe487f09793 Mon Sep 17 00:00:00 2001 From: Eric Jinks Date: Tue, 9 Jul 2019 12:50:08 +1000 Subject: [PATCH 02/16] Fix missing bracket --- maskrcnn_benchmark/data/build.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/maskrcnn_benchmark/data/build.py b/maskrcnn_benchmark/data/build.py index 7ff222c81..190a024ba 100644 --- a/maskrcnn_benchmark/data/build.py +++ b/maskrcnn_benchmark/data/build.py @@ -153,7 +153,9 @@ def make_data_loader( dataset_list = cfg.DATASETS.TRAIN if is_train else cfg.DATASETS.TEST transforms = build_transforms(cfg, is_train) - datasets = build_dataset(dataset_list, transforms, DatasetCatalog, is_trainis_train or is_for_period)) + datasets = build_dataset( + dataset_list, transforms, DatasetCatalog, is_trainis_train or is_for_period + ) data_loaders = [] for dataset in datasets: From fbb93bce85da0176fa258034e99aff99e43e9310 Mon Sep 17 00:00:00 2001 From: Eric Jinks Date: Tue, 9 Jul 2019 12:50:52 +1000 Subject: [PATCH 03/16] Fix typo --- maskrcnn_benchmark/data/build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/maskrcnn_benchmark/data/build.py b/maskrcnn_benchmark/data/build.py index 190a024ba..cf0c8c2c2 100644 --- a/maskrcnn_benchmark/data/build.py +++ b/maskrcnn_benchmark/data/build.py @@ -154,7 +154,7 @@ def make_data_loader( transforms = build_transforms(cfg, is_train) datasets = build_dataset( - dataset_list, transforms, DatasetCatalog, is_trainis_train or is_for_period + dataset_list, transforms, DatasetCatalog, is_train or is_for_period ) data_loaders = [] From be5a259ed8fd8ac96ee1959a44685df76aac8db6 Mon Sep 17 00:00:00 2001 From: Eric Jinks Date: Tue, 9 Jul 2019 12:54:59 +1000 Subject: [PATCH 04/16] RM failing redundant code --- maskrcnn_benchmark/engine/trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/maskrcnn_benchmark/engine/trainer.py b/maskrcnn_benchmark/engine/trainer.py index 9d7e6d893..1fb751f36 100644 --- a/maskrcnn_benchmark/engine/trainer.py +++ b/maskrcnn_benchmark/engine/trainer.py @@ -92,8 +92,8 @@ def do_train( optimizer.zero_grad() # Note: If mixed precision is not used, this ends up doing nothing # Otherwise apply loss scaling for mixed-precision recipe - with amp.scale_loss(losses, optimizer) as scaled_losses: - scaled_losses.backward() + # with amp.scale_loss(losses, optimizer) as scaled_losses: + # scaled_losses.backward() optimizer.step() batch_time = time.time() - end From 372ba2a44f173b508abb1265c59158f2432e95db Mon Sep 17 00:00:00 2001 From: Eric Jinks Date: Tue, 9 Jul 2019 13:29:38 +1000 Subject: [PATCH 05/16] add val output_folder --- maskrcnn_benchmark/engine/trainer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/maskrcnn_benchmark/engine/trainer.py b/maskrcnn_benchmark/engine/trainer.py index 1fb751f36..642fedb61 100644 --- a/maskrcnn_benchmark/engine/trainer.py +++ b/maskrcnn_benchmark/engine/trainer.py @@ -130,6 +130,7 @@ def do_train( ): meters_val = MetricLogger(delimiter=" ") synchronize() + output_folder = os.path.join(cfg.OUTPUT_DIR, "validation", iteration) _ = inference( # The result can be used for additional logging, e. g. for TensorBoard model, # The method changes the segmentation mask format in a data loader, @@ -146,7 +147,7 @@ def do_train( device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, - output_folder=None, + output_folder=output_folder, ) synchronize() model.train() From 865c00dfed01bc23b07e197c72a6183ec9d420c7 Mon Sep 17 00:00:00 2001 From: Eric Jinks Date: Tue, 9 Jul 2019 13:34:22 +1000 Subject: [PATCH 06/16] Fix int passed to path.join --- maskrcnn_benchmark/engine/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/maskrcnn_benchmark/engine/trainer.py b/maskrcnn_benchmark/engine/trainer.py index 642fedb61..f807dbbc2 100644 --- a/maskrcnn_benchmark/engine/trainer.py +++ b/maskrcnn_benchmark/engine/trainer.py @@ -130,7 +130,7 @@ def do_train( ): meters_val = MetricLogger(delimiter=" ") synchronize() - output_folder = os.path.join(cfg.OUTPUT_DIR, "validation", iteration) + output_folder = os.path.join(cfg.OUTPUT_DIR, "validation", str(iteration)) _ = inference( # The result can be used for additional logging, e. g. for TensorBoard model, # The method changes the segmentation mask format in a data loader, From a04959d7f99b2cef762d28ce5c70f3feca7abb5d Mon Sep 17 00:00:00 2001 From: Eric Jinks Date: Tue, 9 Jul 2019 13:57:24 +1000 Subject: [PATCH 07/16] os.mkdir --- maskrcnn_benchmark/engine/trainer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/maskrcnn_benchmark/engine/trainer.py b/maskrcnn_benchmark/engine/trainer.py index f807dbbc2..93fc89b74 100644 --- a/maskrcnn_benchmark/engine/trainer.py +++ b/maskrcnn_benchmark/engine/trainer.py @@ -131,6 +131,7 @@ def do_train( meters_val = MetricLogger(delimiter=" ") synchronize() output_folder = os.path.join(cfg.OUTPUT_DIR, "validation", str(iteration)) + os.mkdir(output_folder) _ = inference( # The result can be used for additional logging, e. g. for TensorBoard model, # The method changes the segmentation mask format in a data loader, From 62e7390856134ce6f3387c0e05a4ccc35b090a1b Mon Sep 17 00:00:00 2001 From: Eric Jinks Date: Tue, 9 Jul 2019 14:01:57 +1000 Subject: [PATCH 08/16] Recursive makedirs --- maskrcnn_benchmark/engine/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/maskrcnn_benchmark/engine/trainer.py b/maskrcnn_benchmark/engine/trainer.py index 93fc89b74..adecb04f7 100644 --- a/maskrcnn_benchmark/engine/trainer.py +++ b/maskrcnn_benchmark/engine/trainer.py @@ -131,7 +131,7 @@ def do_train( meters_val = MetricLogger(delimiter=" ") synchronize() output_folder = os.path.join(cfg.OUTPUT_DIR, "validation", str(iteration)) - os.mkdir(output_folder) + os.makedirs(output_folder. exist_ok=True) _ = inference( # The result can be used for additional logging, e. g. for TensorBoard model, # The method changes the segmentation mask format in a data loader, From 79f093353c25ace755107be9a593a1aaf76f2755 Mon Sep 17 00:00:00 2001 From: Eric Jinks Date: Tue, 9 Jul 2019 14:02:42 +1000 Subject: [PATCH 09/16] typo --- maskrcnn_benchmark/engine/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/maskrcnn_benchmark/engine/trainer.py b/maskrcnn_benchmark/engine/trainer.py index adecb04f7..545b00de2 100644 --- a/maskrcnn_benchmark/engine/trainer.py +++ b/maskrcnn_benchmark/engine/trainer.py @@ -131,7 +131,7 @@ def do_train( meters_val = MetricLogger(delimiter=" ") synchronize() output_folder = os.path.join(cfg.OUTPUT_DIR, "validation", str(iteration)) - os.makedirs(output_folder. exist_ok=True) + os.makedirs(output_folder, exist_ok=True) _ = inference( # The result can be used for additional logging, e. g. for TensorBoard model, # The method changes the segmentation mask format in a data loader, From 49bd631aeb5e1fb436f6a7d800f55b50685d77d9 Mon Sep 17 00:00:00 2001 From: Eric Jinks Date: Thu, 11 Jul 2019 08:07:19 +1000 Subject: [PATCH 10/16] debug print --- maskrcnn_benchmark/engine/trainer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/maskrcnn_benchmark/engine/trainer.py b/maskrcnn_benchmark/engine/trainer.py index 545b00de2..be2f0967e 100644 --- a/maskrcnn_benchmark/engine/trainer.py +++ b/maskrcnn_benchmark/engine/trainer.py @@ -151,6 +151,7 @@ def do_train( output_folder=output_folder, ) synchronize() + print("model.train()") model.train() with torch.no_grad(): # Should be one image for each GPU: From fa4d875b7b6b87d99be150b5b1d3472de5a1b07b Mon Sep 17 00:00:00 2001 From: Eric Jinks Date: Thu, 11 Jul 2019 08:15:37 +1000 Subject: [PATCH 11/16] Debug --- maskrcnn_benchmark/engine/trainer.py | 1 - tools/train_net.py | 3 +++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/maskrcnn_benchmark/engine/trainer.py b/maskrcnn_benchmark/engine/trainer.py index be2f0967e..545b00de2 100644 --- a/maskrcnn_benchmark/engine/trainer.py +++ b/maskrcnn_benchmark/engine/trainer.py @@ -151,7 +151,6 @@ def do_train( output_folder=output_folder, ) synchronize() - print("model.train()") model.train() with torch.no_grad(): # Should be one image for each GPU: diff --git a/tools/train_net.py b/tools/train_net.py index 269393f21..9f951198e 100644 --- a/tools/train_net.py +++ b/tools/train_net.py @@ -72,6 +72,9 @@ def train(cfg, local_rank, distributed): checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD + print("data_loader_val") + print(data_loader_val) + do_train( cfg, model, From d4e0af99eeaaabc97e514d4abcabe42985a7b905 Mon Sep 17 00:00:00 2001 From: Eric Jinks Date: Thu, 11 Jul 2019 08:20:31 +1000 Subject: [PATCH 12/16] Skip data_loader_val if running_evaluation --- tools/train_net.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/train_net.py b/tools/train_net.py index 9f951198e..e5e9d130c 100644 --- a/tools/train_net.py +++ b/tools/train_net.py @@ -26,7 +26,7 @@ from maskrcnn_benchmark.utils.miscellaneous import mkdir -def train(cfg, local_rank, distributed): +def train(cfg, local_rank, distributed, running_evaluation): model = build_detection_model(cfg) device = torch.device(cfg.MODEL.DEVICE) model.to(device) @@ -63,7 +63,7 @@ def train(cfg, local_rank, distributed): ) test_period = cfg.SOLVER.TEST_PERIOD - if test_period > 0: + if test_period > 0 and not running_evaluation: data_loader_val = make_data_loader( cfg, is_train=False, is_distributed=distributed, is_for_period=True ) @@ -148,6 +148,7 @@ def main(): default=None, nargs=argparse.REMAINDER, ) + parser.add_argument("--running_evaluation", default=None) args = parser.parse_args() @@ -180,7 +181,7 @@ def main(): logger.info(config_str) logger.info("Running with config:\n{}".format(cfg)) - model = train(cfg, args.local_rank, args.distributed) + model = train(cfg, args.local_rank, args.distributed, args.running_evaluation) if not args.skip_test: run_test(cfg, model, args.distributed) From 08e262b40c4617b7c58b79e9a06253dc91b960bd Mon Sep 17 00:00:00 2001 From: Eric Jinks Date: Thu, 11 Jul 2019 08:21:19 +1000 Subject: [PATCH 13/16] Add debug --- tools/train_net.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/train_net.py b/tools/train_net.py index e5e9d130c..705df83bf 100644 --- a/tools/train_net.py +++ b/tools/train_net.py @@ -63,6 +63,8 @@ def train(cfg, local_rank, distributed, running_evaluation): ) test_period = cfg.SOLVER.TEST_PERIOD + print("running_evaluation") + print(running_evaluation) if test_period > 0 and not running_evaluation: data_loader_val = make_data_loader( cfg, is_train=False, is_distributed=distributed, is_for_period=True From 35e615c8328a1b4559f9b77d2ffe1f234a7bb7a5 Mon Sep 17 00:00:00 2001 From: Eric Jinks Date: Thu, 11 Jul 2019 08:26:57 +1000 Subject: [PATCH 14/16] Add removed func call --- maskrcnn_benchmark/engine/trainer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/maskrcnn_benchmark/engine/trainer.py b/maskrcnn_benchmark/engine/trainer.py index 545b00de2..9c576c489 100644 --- a/maskrcnn_benchmark/engine/trainer.py +++ b/maskrcnn_benchmark/engine/trainer.py @@ -90,6 +90,7 @@ def do_train( meters.update(loss=losses_reduced, **loss_dict_reduced) optimizer.zero_grad() + losses.backward() # Note: If mixed precision is not used, this ends up doing nothing # Otherwise apply loss scaling for mixed-precision recipe # with amp.scale_loss(losses, optimizer) as scaled_losses: From 1c3887c6b1f675eaa44cceec179e8a08f0f5c9a7 Mon Sep 17 00:00:00 2001 From: Eric Jinks Date: Thu, 11 Jul 2019 08:34:57 +1000 Subject: [PATCH 15/16] RM , args.running_evaluation --- tools/train_net.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tools/train_net.py b/tools/train_net.py index 705df83bf..9f951198e 100644 --- a/tools/train_net.py +++ b/tools/train_net.py @@ -26,7 +26,7 @@ from maskrcnn_benchmark.utils.miscellaneous import mkdir -def train(cfg, local_rank, distributed, running_evaluation): +def train(cfg, local_rank, distributed): model = build_detection_model(cfg) device = torch.device(cfg.MODEL.DEVICE) model.to(device) @@ -63,9 +63,7 @@ def train(cfg, local_rank, distributed, running_evaluation): ) test_period = cfg.SOLVER.TEST_PERIOD - print("running_evaluation") - print(running_evaluation) - if test_period > 0 and not running_evaluation: + if test_period > 0: data_loader_val = make_data_loader( cfg, is_train=False, is_distributed=distributed, is_for_period=True ) @@ -150,7 +148,6 @@ def main(): default=None, nargs=argparse.REMAINDER, ) - parser.add_argument("--running_evaluation", default=None) args = parser.parse_args() @@ -183,7 +180,7 @@ def main(): logger.info(config_str) logger.info("Running with config:\n{}".format(cfg)) - model = train(cfg, args.local_rank, args.distributed, args.running_evaluation) + model = train(cfg, args.local_rank, args.distributed) if not args.skip_test: run_test(cfg, model, args.distributed) From 385867bbd65bd782ce533c98aa2ae074b386bf42 Mon Sep 17 00:00:00 2001 From: Eric Jinks Date: Thu, 11 Jul 2019 09:00:42 +1000 Subject: [PATCH 16/16] RM train from test_period --- maskrcnn_benchmark/engine/trainer.py | 64 ++++++++++++++-------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/maskrcnn_benchmark/engine/trainer.py b/maskrcnn_benchmark/engine/trainer.py index 9c576c489..66bcd13c5 100644 --- a/maskrcnn_benchmark/engine/trainer.py +++ b/maskrcnn_benchmark/engine/trainer.py @@ -152,38 +152,38 @@ def do_train( output_folder=output_folder, ) synchronize() - model.train() - with torch.no_grad(): - # Should be one image for each GPU: - for iteration_val, (images_val, targets_val, _) in enumerate( - tqdm(data_loader_val) - ): - images_val = images_val.to(device) - targets_val = [target.to(device) for target in targets_val] - loss_dict = model(images_val, targets_val) - losses = sum(loss for loss in loss_dict.values()) - loss_dict_reduced = reduce_loss_dict(loss_dict) - losses_reduced = sum(loss for loss in loss_dict_reduced.values()) - meters_val.update(loss=losses_reduced, **loss_dict_reduced) - synchronize() - logger.info( - meters_val.delimiter.join( - [ - "[Validation]: ", - "eta: {eta}", - "iter: {iter}", - "{meters}", - "lr: {lr:.6f}", - "max mem: {memory:.0f}", - ] - ).format( - eta=eta_string, - iter=iteration, - meters=str(meters_val), - lr=optimizer.param_groups[0]["lr"], - memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, - ) - ) + # model.train() + # with torch.no_grad(): + # # Should be one image for each GPU: + # for iteration_val, (images_val, targets_val, _) in enumerate( + # tqdm(data_loader_val) + # ): + # images_val = images_val.to(device) + # targets_val = [target.to(device) for target in targets_val] + # loss_dict = model(images_val, targets_val) + # losses = sum(loss for loss in loss_dict.values()) + # loss_dict_reduced = reduce_loss_dict(loss_dict) + # losses_reduced = sum(loss for loss in loss_dict_reduced.values()) + # meters_val.update(loss=losses_reduced, **loss_dict_reduced) + # synchronize() + # logger.info( + # meters_val.delimiter.join( + # [ + # "[Validation]: ", + # "eta: {eta}", + # "iter: {iter}", + # "{meters}", + # "lr: {lr:.6f}", + # "max mem: {memory:.0f}", + # ] + # ).format( + # eta=eta_string, + # iter=iteration, + # meters=str(meters_val), + # lr=optimizer.param_groups[0]["lr"], + # memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, + # ) + # ) if iteration == max_iter: checkpointer.save("model_final", **arguments)