From 298287298d330482ce1a7bfae7fdf18565f2971c Mon Sep 17 00:00:00 2001 From: Ayush Chaurasia Date: Thu, 24 Nov 2022 19:55:03 +0530 Subject: [PATCH] Add clearml logging (#51) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- ultralytics/yolo/engine/trainer.py | 108 ++++++++---------- ultralytics/yolo/engine/validator.py | 5 +- ultralytics/yolo/utils/callbacks/__init__.py | 1 + .../yolo/utils/{loggers => callbacks}/base.py | 9 ++ ultralytics/yolo/utils/callbacks/clearml.py | 45 ++++++++ ultralytics/yolo/utils/loggers/__init__.py | 1 - ultralytics/yolo/v8/segment/train.py | 5 + 7 files changed, 112 insertions(+), 62 deletions(-) create mode 100644 ultralytics/yolo/utils/callbacks/__init__.py rename ultralytics/yolo/utils/{loggers => callbacks}/base.py (69%) create mode 100644 ultralytics/yolo/utils/callbacks/clearml.py delete mode 100644 ultralytics/yolo/utils/loggers/__init__.py diff --git a/ultralytics/yolo/engine/trainer.py b/ultralytics/yolo/engine/trainer.py index 979595756d..35392a60f3 100644 --- a/ultralytics/yolo/engine/trainer.py +++ b/ultralytics/yolo/engine/trainer.py @@ -1,10 +1,6 @@ """ Simple training loop; Boilerplate that could apply to any arbitrary neural network, """ -# TODOs -# 1. finish _set_model_attributes -# 2. allow num_class update for both pretrained and csv_loaded models -# 3. save import os import time @@ -24,7 +20,7 @@ from torch.nn.parallel import DistributedDataParallel as DDP from tqdm import tqdm import ultralytics.yolo.utils as utils -import ultralytics.yolo.utils.loggers as loggers +import ultralytics.yolo.utils.callbacks as callbacks from ultralytics.yolo.data.utils import check_dataset, check_dataset_yaml from ultralytics.yolo.utils import LOGGER, ROOT, TQDM_BAR_FORMAT from ultralytics.yolo.utils.checks import print_args @@ -73,8 +69,9 @@ class BaseTrainer: self.fitness = None self.loss = None - for callback, func in loggers.default_callbacks.items(): + for callback, func in callbacks.default_callbacks.items(): self.add_callback(callback, func) + callbacks.add_integration_callbacks(self) def _get_config(self, config: Union[str, DictConfig], overrides: Union[str, Dict] = {}): """ @@ -146,7 +143,6 @@ class BaseTrainer: self.test_loader = self.get_dataloader(self.testset, batch_size=self.args.batch_size * 2, rank=-1) self.validator = self.get_validator() print("created testloader :", rank) - self.console.info(self.progress_string()) self.ema = ModelEMA(self.model) def _do_train(self, rank=-1, world_size=1): @@ -155,7 +151,7 @@ class BaseTrainer: else: self.model = self.model.to(self.device) - # callback hook. before_train + self.trigger_callbacks("before_train") self._setup_train(rank) self.epoch = 1 @@ -163,22 +159,22 @@ class BaseTrainer: self.epoch_time_start = time.time() self.train_time_start = time.time() for epoch in range(self.args.epochs): - # callback hook. on_epoch_start + self.trigger_callbacks("on_epoch_start") self.model.train() pbar = enumerate(self.train_loader) if rank in {-1, 0}: pbar = tqdm(enumerate(self.train_loader), total=len(self.train_loader), bar_format=TQDM_BAR_FORMAT) - tloss = None + self.tloss = None for i, batch in pbar: - # img, label (classification)/ img, targets, paths, _, masks(detection) - # callback hook. 
on_batch_start + self.trigger_callbacks("on_batch_start") # forward batch = self.preprocess_batch(batch) # TODO: warmup, multiscale preds = self.model(batch["img"]) self.loss, self.loss_items = self.criterion(preds, batch) - tloss = (tloss * i + self.loss_items) / (i + 1) if tloss is not None else self.loss_items + self.tloss = (self.tloss * i + self.loss_items) / (i + 1) if self.tloss is not None \ + else self.loss_items # backward self.model.zero_grad(set_to_none=True) @@ -186,28 +182,28 @@ class BaseTrainer: # optimize self.optimizer_step() - self.trigger_callbacks('on_batch_end') # log mem = (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0) # (GB) - loss_len = tloss.shape[0] if len(tloss.size()) else 1 - losses = tloss if loss_len > 1 else torch.unsqueeze(tloss, 0) + loss_len = self.tloss.shape[0] if len(self.tloss.size()) else 1 + losses = self.tloss if loss_len > 1 else torch.unsqueeze(self.tloss, 0) if rank in {-1, 0}: pbar.set_description( (" {} " + "{:.3f} " * (1 + loss_len) + ' {} ').format(f'{epoch + 1}/{self.args.epochs}', mem, *losses, batch["img"].shape[-1])) + self.trigger_callbacks('on_batch_end') if rank in [-1, 0]: # validation - # callback: on_val_start() + self.trigger_callbacks('on_val_start') self.ema.update_attr(self.model, include=['yaml', 'nc', 'args', 'names', 'stride', 'class_weights']) - self.validate() - # callback: on_val_end() + self.metrics, self.fitness = self.validate() + self.trigger_callbacks('on_val_end') # save model if (not self.args.nosave) or (self.epoch + 1 == self.args.epochs): self.save_model() - # callback; on_model_save + self.trigger_callbacks('on_model_save') self.epoch += 1 tnow = time.time() @@ -216,9 +212,8 @@ class BaseTrainer: # TODO: termination condition - self.log(f"\nTraining complete ({(time.time() - self.train_time_start) / 3600:.3f} hours) \ - \n{self.usage_help()}") - # callback; on_train_end + self.log(f"\nTraining complete ({(time.time() - self.train_time_start) / 3600:.3f} hours)") + self.trigger_callbacks('on_train_end') dist.destroy_process_group() if world_size != 1 else None def save_model(self): @@ -238,12 +233,6 @@ class BaseTrainer: torch.save(ckpt, self.best) del ckpt - def get_dataloader(self, dataset_path, batch_size=16, rank=0): - """ - Returns dataloader derived from torch.data.Dataloader - """ - pass - def get_dataset(self, data): """ Get train, val path from data dict if it exists. Returns None if data format is not recognized @@ -259,12 +248,6 @@ class BaseTrainer: weights=get_model(model) if pretrained else None, data=self.data) # model - def load_model(self, model_cfg, weights, data): - raise NotImplementedError("This task trainer doesn't support loading cfg files") - - def get_validator(self): - pass - def optimizer_step(self): self.scaler.unscale_(self.optimizer) # unscale gradients torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=10.0) # clip gradients @@ -286,48 +269,55 @@ class BaseTrainer: # TODO: discuss validator class. Enforce that a validator metrics dict should contain "fitness" metric. 
""" - self.metrics = self.validator(self) - self.fitness = self.metrics.get("fitness", - -self.loss.detach().cpu().numpy()) # use loss as fitness measure if not found - if not self.best_fitness or self.best_fitness < self.fitness: + metrics = self.validator(self) + fitness = metrics.get("fitness", -self.loss.detach().cpu().numpy()) # use loss as fitness measure if not found + if not self.best_fitness or self.best_fitness < fitness: self.best_fitness = self.fitness + return metrics, fitness - def set_model_attributes(self): + def log(self, text, rank=-1): """ - To set or update model parameters before training. + Logs the given text to given ranks process if provided, otherwise logs to all ranks + :param text: text to log + :param rank: List[Int] + """ - pass + if rank in {-1, 0}: + self.console.info(text) - def build_targets(self, preds, targets): - pass + def load_model(self, model_cfg, weights, data): + raise NotImplementedError("This task trainer doesn't support loading cfg files") + + def get_validator(self): + raise NotImplementedError("get_validator function not implemented in trainer") + + def get_dataloader(self, dataset_path, batch_size=16, rank=0): + """ + Returns dataloader derived from torch.data.Dataloader + """ + raise NotImplementedError("get_dataloader function not implemented in trainer") def criterion(self, preds, batch): """ Returns loss and individual loss items as Tensor """ - pass + raise NotImplementedError("criterion function not implemented in trainer") - def progress_string(self): + def label_loss_items(self, loss_items): """ - Returns progress string depending on task type. + Returns a loss dict with labelled training loss items tensor """ - return '' + # Not needed for classification but necessary for segmentation & detection + return {"loss": loss_items} - def usage_help(self): + def set_model_attributes(self): """ - Returns usage functionality. gets printed to the console after training. + To set or update model parameters before training. 
""" pass - def log(self, text, rank=-1): - """ - Logs the given text to given ranks process if provided, otherwise logs to all ranks - :param text: text to log - :param rank: List[Int] - - """ - if rank in {-1, 0}: - self.console.info(text) + def build_targets(self, preds, targets): + pass def build_optimizer(model, name='Adam', lr=0.001, momentum=0.9, decay=1e-5): diff --git a/ultralytics/yolo/engine/validator.py b/ultralytics/yolo/engine/validator.py index e60f0863e2..24a840ea69 100644 --- a/ultralytics/yolo/engine/validator.py +++ b/ultralytics/yolo/engine/validator.py @@ -24,6 +24,7 @@ class BaseValidator: self.cuda = self.device.type != 'cpu' self.batch_i = None self.training = True + self.loss = None def __call__(self, trainer=None, model=None): """ @@ -44,7 +45,7 @@ class BaseValidator: model.eval() dt = Profile(), Profile(), Profile(), Profile() - loss = 0 + self.loss = 0 n_batches = len(self.dataloader) desc = self.get_desc() bar = tqdm(self.dataloader, desc, n_batches, not self.training, bar_format=TQDM_BAR_FORMAT) @@ -65,7 +66,7 @@ class BaseValidator: # loss with dt[2]: if self.training: - loss += trainer.criterion(preds, batch)[0] + self.loss += trainer.criterion(preds, batch)[0] # pre-process predictions with dt[3]: diff --git a/ultralytics/yolo/utils/callbacks/__init__.py b/ultralytics/yolo/utils/callbacks/__init__.py new file mode 100644 index 0000000000..c64adb3bbf --- /dev/null +++ b/ultralytics/yolo/utils/callbacks/__init__.py @@ -0,0 +1 @@ +from .base import add_integration_callbacks, default_callbacks diff --git a/ultralytics/yolo/utils/loggers/base.py b/ultralytics/yolo/utils/callbacks/base.py similarity index 69% rename from ultralytics/yolo/utils/loggers/base.py rename to ultralytics/yolo/utils/callbacks/base.py index 0c2d855d40..3cffa1283c 100644 --- a/ultralytics/yolo/utils/loggers/base.py +++ b/ultralytics/yolo/utils/callbacks/base.py @@ -30,3 +30,12 @@ default_callbacks = { "on_val_start": on_val_start, "on_val_end": on_val_end, "on_model_save": on_model_save} + + +def add_integration_callbacks(trainer): + callbacks = {} + + from .clearml import callbacks, clearml + if clearml: + for callback, func in callbacks.items(): + trainer.add_callback(callback, func) diff --git a/ultralytics/yolo/utils/callbacks/clearml.py b/ultralytics/yolo/utils/callbacks/clearml.py new file mode 100644 index 0000000000..8b0668a8b1 --- /dev/null +++ b/ultralytics/yolo/utils/callbacks/clearml.py @@ -0,0 +1,45 @@ +try: + import clearml + from clearml import Task + + assert hasattr(clearml, '__version__') +except (ImportError, AssertionError): + clearml = None + + +def _log_scalers(metric_dict, group="", step=0): + task = Task.current_task() + if task: + for k, v in metric_dict.items(): + task.get_logger().report_scalar(group, k, v, step) + + +def before_train(trainer): + # TODO: reuse existing task + task = Task.init(project_name=trainer.args.project if trainer.args.project != 'runs/train' else 'YOLOv5', + task_name=trainer.args.name if trainer.args.name != 'exp' else 'Training', + tags=['YOLOv5'], + output_uri=True, + reuse_last_task_id=False, + auto_connect_frameworks={'pytorch': False}) + + task.connect(trainer.args, name='parameters') + + +def on_batch_end(trainer): + train_loss = trainer.tloss + _log_scalers(trainer.label_loss_items(train_loss), "train", trainer.epoch) + + +def on_val_end(trainer): + metrics = trainer.metrics + val_losses = trainer.validator.loss + val_loss_dict = trainer.label_loss_items(val_losses) + _log_scalers(val_loss_dict, "val", trainer.epoch) + 
+    _log_scalars(metrics, "metrics", trainer.epoch)
+
+
+callbacks = {
+    "before_train": before_train,
+    "on_val_end": on_val_end,
+    "on_batch_end": on_batch_end,}
diff --git a/ultralytics/yolo/utils/loggers/__init__.py b/ultralytics/yolo/utils/loggers/__init__.py
deleted file mode 100644
index 289b43915b..0000000000
--- a/ultralytics/yolo/utils/loggers/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .base import default_callbacks
diff --git a/ultralytics/yolo/v8/segment/train.py b/ultralytics/yolo/v8/segment/train.py
index 9949629f0e..16f2ea97e7 100644
--- a/ultralytics/yolo/v8/segment/train.py
+++ b/ultralytics/yolo/v8/segment/train.py
@@ -234,6 +234,11 @@ class SegmentationTrainer(BaseTrainer):
         loss = lbox + lobj + lcls + lseg
         return loss * bs, torch.cat((lbox, lseg, lobj, lcls)).detach()
 
+    def label_loss_items(self, loss_items):
+        # We should just use named tensors here in the future
+        keys = ["lbox", "lseg", "lobj", "lcls"]
+        return dict(zip(keys, loss_items))
+
     def progress_string(self):
         return ('\n' + '%11s' * 7) % \
             ('Epoch', 'GPU_mem', 'box_loss', 'seg_loss', 'obj_loss', 'cls_loss', 'Size')
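
Usage sketch: the snippet below illustrates how the callback API introduced by
this patch is consumed. The hook names, add_callback(), trigger_callbacks() and
the convention that every callback receives the trainer as its only argument
come from the diff above; the import path, the constructor argument, the
train() entry point and log_batch_loss are illustrative assumptions, not part
of the change.

    from ultralytics.yolo.v8.segment.train import SegmentationTrainer  # assumed import path

    def log_batch_loss(trainer):  # hypothetical user callback; hooks receive the trainer
        # trainer.tloss holds the running mean of this epoch's loss items (see _do_train)
        print(f"epoch {trainer.epoch}: running loss {trainer.tloss}")

    trainer = SegmentationTrainer("defaults.yaml")  # assumed config argument
    # default_callbacks and, when clearml is installed, the ClearML callbacks are
    # registered automatically in BaseTrainer.__init__() via add_integration_callbacks()
    trainer.add_callback("on_batch_end", log_batch_loss)
    trainer.train()  # assumed entry point; hooks fire via trigger_callbacks("on_batch_end"), etc.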