Add clearml logging (#51)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Authored by Ayush Chaurasia, committed by GitHub
parent 512a225ce8
commit 298287298d
Changed files (lines changed):
  1. ultralytics/yolo/engine/trainer.py (108)
  2. ultralytics/yolo/engine/validator.py (5)
  3. ultralytics/yolo/utils/callbacks/__init__.py (1)
  4. ultralytics/yolo/utils/callbacks/base.py (9)
  5. ultralytics/yolo/utils/callbacks/clearml.py (45)
  6. ultralytics/yolo/utils/loggers/__init__.py (1)
  7. ultralytics/yolo/v8/segment/train.py (5)

ultralytics/yolo/engine/trainer.py
@@ -1,10 +1,6 @@
 """
 Simple training loop; Boilerplate that could apply to any arbitrary neural network,
 """
-# TODOs
-# 1. finish _set_model_attributes
-# 2. allow num_class update for both pretrained and csv_loaded models
-# 3. save
 import os
 import time
@@ -24,7 +20,7 @@ from torch.nn.parallel import DistributedDataParallel as DDP
 from tqdm import tqdm

 import ultralytics.yolo.utils as utils
-import ultralytics.yolo.utils.loggers as loggers
+import ultralytics.yolo.utils.callbacks as callbacks
 from ultralytics.yolo.data.utils import check_dataset, check_dataset_yaml
 from ultralytics.yolo.utils import LOGGER, ROOT, TQDM_BAR_FORMAT
 from ultralytics.yolo.utils.checks import print_args
@@ -73,8 +69,9 @@ class BaseTrainer:
         self.fitness = None
         self.loss = None

-        for callback, func in loggers.default_callbacks.items():
+        for callback, func in callbacks.default_callbacks.items():
             self.add_callback(callback, func)
+        callbacks.add_integration_callbacks(self)

     def _get_config(self, config: Union[str, DictConfig], overrides: Union[str, Dict] = {}):
         """
@@ -146,7 +143,6 @@ class BaseTrainer:
         self.test_loader = self.get_dataloader(self.testset, batch_size=self.args.batch_size * 2, rank=-1)
         self.validator = self.get_validator()
-        print("created testloader :", rank)
         self.console.info(self.progress_string())
         self.ema = ModelEMA(self.model)

     def _do_train(self, rank=-1, world_size=1):
@@ -155,7 +151,7 @@ class BaseTrainer:
         else:
             self.model = self.model.to(self.device)

-        # callback hook. before_train
+        self.trigger_callbacks("before_train")
         self._setup_train(rank)

         self.epoch = 1
@@ -163,22 +159,22 @@ class BaseTrainer:
         self.epoch_time_start = time.time()
         self.train_time_start = time.time()
         for epoch in range(self.args.epochs):
-            # callback hook. on_epoch_start
+            self.trigger_callbacks("on_epoch_start")
             self.model.train()
             pbar = enumerate(self.train_loader)
             if rank in {-1, 0}:
                 pbar = tqdm(enumerate(self.train_loader), total=len(self.train_loader), bar_format=TQDM_BAR_FORMAT)
-            tloss = None
+            self.tloss = None
             for i, batch in pbar:
                 # img, label (classification)/ img, targets, paths, _, masks(detection)
-                # callback hook. on_batch_start
+                self.trigger_callbacks("on_batch_start")
                 # forward
                 batch = self.preprocess_batch(batch)

                 # TODO: warmup, multiscale
                 preds = self.model(batch["img"])
                 self.loss, self.loss_items = self.criterion(preds, batch)
-                tloss = (tloss * i + self.loss_items) / (i + 1) if tloss is not None else self.loss_items
+                self.tloss = (self.tloss * i + self.loss_items) / (i + 1) if self.tloss is not None \
+                    else self.loss_items

                 # backward
                 self.model.zero_grad(set_to_none=True)
@@ -186,28 +182,28 @@ class BaseTrainer:
                 # optimize
                 self.optimizer_step()
-                self.trigger_callbacks('on_batch_end')

                 # log
                 mem = (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0)  # (GB)
-                loss_len = tloss.shape[0] if len(tloss.size()) else 1
-                losses = tloss if loss_len > 1 else torch.unsqueeze(tloss, 0)
+                loss_len = self.tloss.shape[0] if len(self.tloss.size()) else 1
+                losses = self.tloss if loss_len > 1 else torch.unsqueeze(self.tloss, 0)
                 if rank in {-1, 0}:
                     pbar.set_description(
                         (" {} " + "{:.3f} " * (1 + loss_len) + ' {} ').format(f'{epoch + 1}/{self.args.epochs}', mem,
                                                                               *losses, batch["img"].shape[-1]))
+                self.trigger_callbacks('on_batch_end')

             if rank in [-1, 0]:
                 # validation
-                # callback: on_val_start()
+                self.trigger_callbacks('on_val_start')
                 self.ema.update_attr(self.model, include=['yaml', 'nc', 'args', 'names', 'stride', 'class_weights'])
-                self.validate()
-                # callback: on_val_end()
+                self.metrics, self.fitness = self.validate()
+                self.trigger_callbacks('on_val_end')

                 # save model
                 if (not self.args.nosave) or (self.epoch + 1 == self.args.epochs):
                     self.save_model()
-                    # callback; on_model_save
+                    self.trigger_callbacks('on_model_save')

             self.epoch += 1
             tnow = time.time()
@@ -216,9 +212,8 @@ class BaseTrainer:
             # TODO: termination condition

-        self.log(f"\nTraining complete ({(time.time() - self.train_time_start) / 3600:.3f} hours) \
-                 \n{self.usage_help()}")
-        # callback; on_train_end
+        self.log(f"\nTraining complete ({(time.time() - self.train_time_start) / 3600:.3f} hours)")
+        self.trigger_callbacks('on_train_end')
         dist.destroy_process_group() if world_size != 1 else None

     def save_model(self):
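
Note: with the comment placeholders replaced by real trigger_callbacks() calls,
_do_train() now fires hooks in a fixed order. A small, runnable summary derived
from the hunks above (per-epoch and per-batch hooks repeat accordingly):

LIFECYCLE = [
    "before_train",
    "on_epoch_start",  # once per epoch
    "on_batch_start",  # once per batch
    "on_batch_end",    # once per batch, after the progress-bar update
    "on_val_start",    # rank -1/0 only
    "on_val_end",
    "on_model_save",   # whenever a checkpoint is written
    "on_train_end",
]
print(" -> ".join(LIFECYCLE))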
@@ -238,12 +233,6 @@ class BaseTrainer:
             torch.save(ckpt, self.best)
         del ckpt

-    def get_dataloader(self, dataset_path, batch_size=16, rank=0):
-        """
-        Returns dataloader derived from torch.data.Dataloader
-        """
-        pass
-
     def get_dataset(self, data):
         """
         Get train, val path from data dict if it exists. Returns None if data format is not recognized
@@ -259,12 +248,6 @@ class BaseTrainer:
                               weights=get_model(model) if pretrained else None,
                               data=self.data)  # model

-    def load_model(self, model_cfg, weights, data):
-        raise NotImplementedError("This task trainer doesn't support loading cfg files")
-
-    def get_validator(self):
-        pass
-
     def optimizer_step(self):
         self.scaler.unscale_(self.optimizer)  # unscale gradients
         torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=10.0)  # clip gradients
@@ -286,48 +269,55 @@ class BaseTrainer:
         # TODO: discuss validator class. Enforce that a validator metrics dict should contain
         "fitness" metric.
         """
-        self.metrics = self.validator(self)
-        self.fitness = self.metrics.get("fitness",
-                                        -self.loss.detach().cpu().numpy())  # use loss as fitness measure if not found
-        if not self.best_fitness or self.best_fitness < self.fitness:
+        metrics = self.validator(self)
+        fitness = metrics.get("fitness", -self.loss.detach().cpu().numpy())  # use loss as fitness measure if not found
+        if not self.best_fitness or self.best_fitness < fitness:
             self.best_fitness = self.fitness
+        return metrics, fitness

-    def set_model_attributes(self):
+    def log(self, text, rank=-1):
         """
-        To set or update model parameters before training.
+        Logs the given text to given ranks process if provided, otherwise logs to all ranks
+        :param text: text to log
+        :param rank: List[Int]
         """
-        pass
+        if rank in {-1, 0}:
+            self.console.info(text)

-    def build_targets(self, preds, targets):
-        pass
+    def load_model(self, model_cfg, weights, data):
+        raise NotImplementedError("This task trainer doesn't support loading cfg files")
+
+    def get_validator(self):
+        raise NotImplementedError("get_validator function not implemented in trainer")
+
+    def get_dataloader(self, dataset_path, batch_size=16, rank=0):
+        """
+        Returns dataloader derived from torch.data.Dataloader
+        """
+        raise NotImplementedError("get_dataloader function not implemented in trainer")

     def criterion(self, preds, batch):
         """
         Returns loss and individual loss items as Tensor
         """
-        pass
+        raise NotImplementedError("criterion function not implemented in trainer")

-    def progress_string(self):
+    def label_loss_items(self, loss_items):
         """
-        Returns progress string depending on task type.
+        Returns a loss dict with labelled training loss items tensor
         """
-        return ''
+        # Not needed for classification but necessary for segmentation & detection
+        return {"loss": loss_items}

-    def usage_help(self):
+    def set_model_attributes(self):
         """
-        Returns usage functionality. gets printed to the console after training.
+        To set or update model parameters before training.
         """
         pass

-    def log(self, text, rank=-1):
-        """
-        Logs the given text to given ranks process if provided, otherwise logs to all ranks
-        :param text: text to log
-        :param rank: List[Int]
-        """
-        if rank in {-1, 0}:
-            self.console.info(text)
+    def build_targets(self, preds, targets):
+        pass


 def build_optimizer(model, name='Adam', lr=0.001, momentum=0.9, decay=1e-5):
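
Note: validate() now returns (metrics, fitness) instead of mutating state, so
_do_train() can assign self.metrics and self.fitness before triggering
'on_val_end'. A hedged, runnable illustration of the fitness fallback with
placeholder numbers:

metrics = {"mAP50": 0.42}               # hypothetical validator output without a "fitness" key
loss = 1.25                             # hypothetical detached mean training loss
fitness = metrics.get("fitness", -loss)
print(fitness)                          # -1.25: lower loss means higher fitness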

ultralytics/yolo/engine/validator.py
@@ -24,6 +24,7 @@ class BaseValidator:
         self.cuda = self.device.type != 'cpu'
         self.batch_i = None
         self.training = True
+        self.loss = None

     def __call__(self, trainer=None, model=None):
         """
@@ -44,7 +45,7 @@ class BaseValidator:
         model.eval()

         dt = Profile(), Profile(), Profile(), Profile()
-        loss = 0
+        self.loss = 0
         n_batches = len(self.dataloader)
         desc = self.get_desc()
         bar = tqdm(self.dataloader, desc, n_batches, not self.training, bar_format=TQDM_BAR_FORMAT)
@@ -65,7 +66,7 @@ class BaseValidator:
             # loss
             with dt[2]:
                 if self.training:
-                    loss += trainer.criterion(preds, batch)[0]
+                    self.loss += trainer.criterion(preds, batch)[0]

             # pre-process predictions
             with dt[3]:
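
Note: promoting the local `loss` to `self.loss` is what lets the ClearML
on_val_end callback (below) read the validation loss as trainer.validator.loss.
A hedged illustration with placeholder numbers; since criterion() returns the
batch total as its first element, the attribute holds a sum over batches:

batch_losses = [1.0, 0.75, 1.25]  # hypothetical criterion(preds, batch)[0] values
loss = 0
for batch_loss in batch_losses:
    loss += batch_loss
print(loss)                       # 3.0, exposed to callbacks as trainer.validator.loss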

ultralytics/yolo/utils/callbacks/__init__.py (new file)
@@ -0,0 +1 @@
+from .base import add_integration_callbacks, default_callbacks

ultralytics/yolo/utils/callbacks/base.py
@@ -30,3 +30,12 @@ default_callbacks = {
     "on_val_start": on_val_start,
     "on_val_end": on_val_end,
     "on_model_save": on_model_save}
+
+
+def add_integration_callbacks(trainer):
+    callbacks = {}
+    from .clearml import callbacks, clearml
+    if clearml:
+        for callback, func in callbacks.items():
+            trainer.add_callback(callback, func)
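
Note: add_integration_callbacks() expects each integration module to expose a
truthy import guard plus a module-level `callbacks` dict. A hedged sketch of a
second integration following the same contract; this wandb.py module is
hypothetical and not part of this commit:

try:
    import wandb

    assert hasattr(wandb, '__version__')
except (ImportError, AssertionError):
    wandb = None


def on_batch_end(trainer):
    if wandb.run:  # only log inside an active run
        wandb.log(trainer.label_loss_items(trainer.tloss), step=trainer.epoch)


callbacks = {"on_batch_end": on_batch_end} if wandb else {}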

ultralytics/yolo/utils/callbacks/clearml.py (new file)
@@ -0,0 +1,45 @@
+try:
+    import clearml
+    from clearml import Task
+
+    assert hasattr(clearml, '__version__')
+except (ImportError, AssertionError):
+    clearml = None
+
+
+def _log_scalers(metric_dict, group="", step=0):
+    task = Task.current_task()
+    if task:
+        for k, v in metric_dict.items():
+            task.get_logger().report_scalar(group, k, v, step)
+
+
+def before_train(trainer):
+    # TODO: reuse existing task
+    task = Task.init(project_name=trainer.args.project if trainer.args.project != 'runs/train' else 'YOLOv5',
+                     task_name=trainer.args.name if trainer.args.name != 'exp' else 'Training',
+                     tags=['YOLOv5'],
+                     output_uri=True,
+                     reuse_last_task_id=False,
+                     auto_connect_frameworks={'pytorch': False})
+    task.connect(trainer.args, name='parameters')
+
+
+def on_batch_end(trainer):
+    train_loss = trainer.tloss
+    _log_scalers(trainer.label_loss_items(train_loss), "train", trainer.epoch)
+
+
+def on_val_end(trainer):
+    metrics = trainer.metrics
+    val_losses = trainer.validator.loss
+    val_loss_dict = trainer.label_loss_items(val_losses)
+    _log_scalers(val_loss_dict, "val", trainer.epoch)
+    _log_scalers(metrics, "metrics", trainer.epoch)
+
+
+callbacks = {
+    "before_train": before_train,
+    "on_val_end": on_val_end,
+    "on_batch_end": on_batch_end,}

ultralytics/yolo/utils/loggers/__init__.py
@@ -1 +0,0 @@
-from .base import default_callbacks

ultralytics/yolo/v8/segment/train.py
@@ -234,6 +234,11 @@ class SegmentationTrainer(BaseTrainer):
             loss = lbox + lobj + lcls + lseg
             return loss * bs, torch.cat((lbox, lseg, lobj, lcls)).detach()

+    def label_loss_items(self, loss_items):
+        # We should just use named tensors here in future
+        keys = ["lbox", "lseg", "lobj", "lcls"]
+        return dict(zip(keys, loss_items))
+
     def progress_string(self):
         return ('\n' + '%11s' * 7) % \
             ('Epoch', 'GPU_mem', 'box_loss', 'seg_loss', 'obj_loss', 'cls_loss', 'Size')
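
Note: this override feeds the ClearML callbacks above: the 4-element detached
loss tensor becomes a labelled dict that _log_scalers() can iterate. A hedged
illustration with placeholder values in place of the real tensor:

loss_items = [0.05, 0.02, 0.01, 0.03]  # stand-in for the detached lbox/lseg/lobj/lcls tensor
print(dict(zip(["lbox", "lseg", "lobj", "lcls"], loss_items)))
# -> {'lbox': 0.05, 'lseg': 0.02, 'lobj': 0.01, 'lcls': 0.03}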
