
Commit 38efc1b

feat: ✨ ClearML training loss logging (#1844)
1 parent 3db0300 commit 38efc1b

10 files changed, +378 -108 lines changed

references/classification/train_pytorch_character.py (+49, -15)
@@ -110,11 +110,16 @@ def record_lr(
     return lr_recorder[: len(loss_recorder)], loss_recorder


-def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=False):
+def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=False, clearml_log=False):
     if amp:
         scaler = torch.cuda.amp.GradScaler()

     model.train()
+    if clearml_log:
+        from clearml import Logger
+
+        logger = Logger.current_logger()
+
     # Iterate over the batches of the dataset
     pbar = tqdm(train_loader, position=1)
     for images, targets in pbar:
@@ -141,6 +146,12 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a
         scheduler.step()

         pbar.set_description(f"Training loss: {train_loss.item():.6}")
+        if clearml_log:
+            global iteration
+            logger.report_scalar(
+                title="Training Loss", series="train_loss", value=train_loss.item(), iteration=iteration
+            )
+            iteration += 1


 @torch.no_grad()
@@ -318,35 +329,48 @@ def main(args):
     current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
     exp_name = f"{args.arch}_{current_time}" if args.name is None else args.name

+    config = {
+        "learning_rate": args.lr,
+        "epochs": args.epochs,
+        "weight_decay": args.weight_decay,
+        "batch_size": args.batch_size,
+        "architecture": args.arch,
+        "input_size": args.input_size,
+        "optimizer": args.optim,
+        "framework": "pytorch",
+        "vocab": args.vocab,
+        "scheduler": args.sched,
+        "pretrained": args.pretrained,
+    }
+
     # W&B
     if args.wb:
         import wandb

         run = wandb.init(
             name=exp_name,
             project="character-classification",
-            config={
-                "learning_rate": args.lr,
-                "epochs": args.epochs,
-                "weight_decay": args.weight_decay,
-                "batch_size": args.batch_size,
-                "architecture": args.arch,
-                "input_size": args.input_size,
-                "optimizer": args.optim,
-                "framework": "pytorch",
-                "vocab": args.vocab,
-                "scheduler": args.sched,
-                "pretrained": args.pretrained,
-            },
+            config=config,
         )

+    # ClearML
+    if args.clearml:
+        from clearml import Task
+
+        task = Task.init(project_name="docTR/character-classification", task_name=exp_name, reuse_last_task_id=False)
+        task.upload_artifact("config", config)
+        global iteration
+        iteration = 0
+
     # Create loss queue
     min_loss = np.inf
     # Training loop
     if args.early_stop:
         early_stopper = EarlyStopper(patience=args.early_stop_epochs, min_delta=args.early_stop_delta)
     for epoch in range(args.epochs):
-        fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler)
+        fit_one_epoch(
+            model, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp, clearml_log=args.clearml
+        )

         # Validation loop at the end of each epoch
         val_loss, acc = evaluate(model, val_loader, batch_transforms)
@@ -361,6 +385,15 @@ def main(args):
             "val_loss": val_loss,
             "acc": acc,
         })
+
+        # ClearML
+        if args.clearml:
+            from clearml import Logger
+
+            logger = Logger.current_logger()
+            logger.report_scalar(title="Validation Loss", series="val_loss", value=val_loss, iteration=epoch)
+            logger.report_scalar(title="Accuracy", series="acc", value=acc, iteration=epoch)
+
         if args.early_stop and early_stopper.early_stop(val_loss):
             print("Training halted early due to reaching patience limit.")
             break
@@ -420,6 +453,7 @@ def parse_args():
         "--show-samples", dest="show_samples", action="store_true", help="Display unormalized training samples"
     )
     parser.add_argument("--wb", dest="wb", action="store_true", help="Log to Weights & Biases")
+    parser.add_argument("--clearml", dest="clearml", action="store_true", help="Log to ClearML")
     parser.add_argument("--push-to-hub", dest="push_to_hub", action="store_true", help="Push to Huggingface Hub")
     parser.add_argument(
         "--pretrained",

references/classification/train_pytorch_orientation.py (+49, -15)
@@ -121,11 +121,16 @@ def record_lr(
     return lr_recorder[: len(loss_recorder)], loss_recorder


-def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=False):
+def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=False, clearml_log=False):
     if amp:
         scaler = torch.cuda.amp.GradScaler()

     model.train()
+    if clearml_log:
+        from clearml import Logger
+
+        logger = Logger.current_logger()
+
     # Iterate over the batches of the dataset
     pbar = tqdm(train_loader, position=1)
     for images, targets in pbar:
@@ -152,6 +157,12 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a
         scheduler.step()

         pbar.set_description(f"Training loss: {train_loss.item():.6}")
+        if clearml_log:
+            global iteration
+            logger.report_scalar(
+                title="Training Loss", series="train_loss", value=train_loss.item(), iteration=iteration
+            )
+            iteration += 1


 @torch.no_grad()
@@ -324,35 +335,48 @@ def main(args):
     current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
     exp_name = f"{args.arch}_{current_time}" if args.name is None else args.name

+    config = {
+        "learning_rate": args.lr,
+        "epochs": args.epochs,
+        "weight_decay": args.weight_decay,
+        "batch_size": args.batch_size,
+        "architecture": args.arch,
+        "input_size": input_size,
+        "optimizer": args.optim,
+        "framework": "pytorch",
+        "classes": CLASSES,
+        "scheduler": args.sched,
+        "pretrained": args.pretrained,
+    }
+
     # W&B
     if args.wb:
         import wandb

         run = wandb.init(
             name=exp_name,
             project="orientation-classification",
-            config={
-                "learning_rate": args.lr,
-                "epochs": args.epochs,
-                "weight_decay": args.weight_decay,
-                "batch_size": args.batch_size,
-                "architecture": args.arch,
-                "input_size": input_size,
-                "optimizer": args.optim,
-                "framework": "pytorch",
-                "classes": CLASSES,
-                "scheduler": args.sched,
-                "pretrained": args.pretrained,
-            },
+            config=config,
         )

+    # ClearML
+    if args.clearml:
+        from clearml import Task
+
+        task = Task.init(project_name="docTR/orientation-classification", task_name=exp_name, reuse_last_task_id=False)
+        task.upload_artifact("config", config)
+        global iteration
+        iteration = 0
+
     # Create loss queue
     min_loss = np.inf
     # Training loop
     if args.early_stop:
         early_stopper = EarlyStopper(patience=args.early_stop_epochs, min_delta=args.early_stop_delta)
     for epoch in range(args.epochs):
-        fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler)
+        fit_one_epoch(
+            model, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp, clearml_log=args.clearml
+        )

         # Validation loop at the end of each epoch
         val_loss, acc = evaluate(model, val_loader, batch_transforms)
@@ -367,6 +391,15 @@ def main(args):
             "val_loss": val_loss,
             "acc": acc,
         })
+
+        # ClearML
+        if args.clearml:
+            from clearml import Logger
+
+            logger = Logger.current_logger()
+            logger.report_scalar(title="Validation Loss", series="val_loss", value=val_loss, iteration=epoch)
+            logger.report_scalar(title="Accuracy", series="acc", value=acc, iteration=epoch)
+
         if args.early_stop and early_stopper.early_stop(val_loss):
             print("Training halted early due to reaching patience limit.")
             break
@@ -410,6 +443,7 @@ def parse_args():
         "--show-samples", dest="show_samples", action="store_true", help="Display unormalized training samples"
     )
     parser.add_argument("--wb", dest="wb", action="store_true", help="Log to Weights & Biases")
+    parser.add_argument("--clearml", dest="clearml", action="store_true", help="Log to ClearML")
     parser.add_argument("--push-to-hub", dest="push_to_hub", action="store_true", help="Push to Huggingface Hub")
     parser.add_argument(
         "--pretrained",

references/classification/train_tensorflow_character.py (+16, -2)
@@ -96,7 +96,12 @@ def apply_grads(optimizer, grads, model):
     optimizer.apply_gradients(zip(grads, model.trainable_weights))


-def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False):
+def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False, clearml_log=False):
+    if clearml_log:
+        from clearml import Logger
+
+        logger = Logger.current_logger()
+
     # Iterate over the batches of the dataset
     pbar = tqdm(train_loader, position=1)
     for images, targets in pbar:
@@ -111,6 +116,12 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False):
         apply_grads(optimizer, grads, model)

         pbar.set_description(f"Training loss: {train_loss.numpy().mean():.6}")
+        if clearml_log:
+            global iteration
+            logger.report_scalar(
+                title="Training Loss", series="train_loss", value=train_loss.numpy().mean(), iteration=iteration
+            )
+            iteration += 1


 def evaluate(model, val_loader, batch_transforms):
@@ -315,6 +326,8 @@ def main(args):

         task = Task.init(project_name="docTR/character-classification", task_name=exp_name, reuse_last_task_id=False)
         task.upload_artifact("config", config)
+        global iteration
+        iteration = 0

     # Create loss queue
     min_loss = np.inf
@@ -323,7 +336,7 @@
     if args.early_stop:
         early_stopper = EarlyStopper(patience=args.early_stop_epochs, min_delta=args.early_stop_delta)
     for epoch in range(args.epochs):
-        fit_one_epoch(model, train_loader, batch_transforms, optimizer, args.amp)
+        fit_one_epoch(model, train_loader, batch_transforms, optimizer, args.amp, args.clearml)

         # Validation loop at the end of each epoch
         val_loss, acc = evaluate(model, val_loader, batch_transforms)
@@ -346,6 +359,7 @@ def main(args):
             logger = Logger.current_logger()
             logger.report_scalar(title="Validation Loss", series="val_loss", value=val_loss, iteration=epoch)
             logger.report_scalar(title="Accuracy", series="acc", value=acc, iteration=epoch)
+
         if args.early_stop and early_stopper.early_stop(val_loss):
             print("Training halted early due to reaching patience limit.")
             break
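Note that report_scalar is given a plain Python number in both frameworks: the PyTorch scripts call train_loss.item(), while the TensorFlow script above uses train_loss.numpy().mean(). A small illustrative sketch of that conversion, with NumPy standing in for the framework loss tensor:

import numpy as np

train_loss = np.array([0.42, 0.40, 0.39])  # stand-in for a per-sample loss tensor
value = float(train_loss.mean())           # plain float, suitable for logger.report_scalar(value=...)
print(f"Training loss: {value:.6}")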

references/classification/train_tensorflow_orientation.py (+16, -2)
@@ -110,7 +110,12 @@ def apply_grads(optimizer, grads, model):
     optimizer.apply_gradients(zip(grads, model.trainable_weights))


-def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False):
+def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False, clearml_log=False):
+    if clearml_log:
+        from clearml import Logger
+
+        logger = Logger.current_logger()
+
     # Iterate over the batches of the dataset
     pbar = tqdm(train_loader, position=1)
     for images, targets in pbar:
@@ -125,6 +130,12 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False):
         apply_grads(optimizer, grads, model)

         pbar.set_description(f"Training loss: {train_loss.numpy().mean():.6}")
+        if clearml_log:
+            global iteration
+            logger.report_scalar(
+                title="Training Loss", series="train_loss", value=train_loss.numpy().mean(), iteration=iteration
+            )
+            iteration += 1


 def evaluate(model, val_loader, batch_transforms):
@@ -324,6 +335,8 @@ def main(args):

         task = Task.init(project_name="docTR/orientation-classification", task_name=exp_name, reuse_last_task_id=False)
         task.upload_artifact("config", config)
+        global iteration
+        iteration = 0

     # Create loss queue
     min_loss = np.inf
@@ -332,7 +345,7 @@
     if args.early_stop:
         early_stopper = EarlyStopper(patience=args.early_stop_epochs, min_delta=args.early_stop_delta)
     for epoch in range(args.epochs):
-        fit_one_epoch(model, train_loader, batch_transforms, optimizer, args.amp)
+        fit_one_epoch(model, train_loader, batch_transforms, optimizer, args.amp, args.clearml)

         # Validation loop at the end of each epoch
         val_loss, acc = evaluate(model, val_loader, batch_transforms)
@@ -355,6 +368,7 @@ def main(args):
             logger = Logger.current_logger()
             logger.report_scalar(title="Validation Loss", series="val_loss", value=val_loss, iteration=epoch)
             logger.report_scalar(title="Accuracy", series="acc", value=acc, iteration=epoch)
+
         if args.early_stop and early_stopper.early_stop(val_loss):
             print("Training halted early due to reaching patience limit.")
             break
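Finally, enabling the integration is opt-in through a single boolean flag in each parse_args(). A minimal sketch of the argparse wiring added above, with the scripts' other arguments omitted:

import argparse

parser = argparse.ArgumentParser(description="docTR training script (excerpt)")
parser.add_argument("--wb", dest="wb", action="store_true", help="Log to Weights & Biases")
parser.add_argument("--clearml", dest="clearml", action="store_true", help="Log to ClearML")

args = parser.parse_args(["--clearml"])
print(args.clearml)  # True -> turns on Task.init() and the report_scalar calls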
