|
|
|
# Ultralytics YOLO 🚀, GPL-3.0 license
|
|
|
|
"""
|
|
|
|
Train a model on a dataset
|
|
|
|
|
|
|
|
Usage:
|
|
|
|
$ yolo mode=train model=yolov8n.pt data=coco128.yaml imgsz=640 epochs=100 batch=16
|
|
|
|
"""
|
|
|
|
import os
|
|
|
|
import subprocess
|
|
|
|
import time
|
|
|
|
from collections import defaultdict
|
|
|
|
from copy import deepcopy
|
|
|
|
from datetime import datetime
|
|
|
|
from pathlib import Path
|
|
|
|
|
|
|
|
import numpy as np
|
|
|
|
import torch
|
|
|
|
import torch.distributed as dist
|
|
|
|
import torch.nn as nn
|
|
|
|
from torch.cuda import amp
|
|
|
|
from torch.nn.parallel import DistributedDataParallel as DDP
|
|
|
|
from torch.optim import lr_scheduler
|
|
|
|
from tqdm import tqdm
|
|
|
|
|
|
|
|
from ultralytics.nn.tasks import attempt_load_one_weight, attempt_load_weights
|
|
|
|
from ultralytics.yolo.cfg import get_cfg
|
|
|
|
from ultralytics.yolo.data.utils import check_cls_dataset, check_det_dataset
|
|
|
|
from ultralytics.yolo.utils import (DEFAULT_CFG, LOGGER, RANK, SETTINGS, TQDM_BAR_FORMAT, __version__, callbacks,
|
|
|
|
colorstr, emojis, yaml_save)
|
|
|
|
from ultralytics.yolo.utils.autobatch import check_train_batch_size
|
|
|
|
from ultralytics.yolo.utils.checks import check_file, check_imgsz, print_args
|
|
|
|
from ultralytics.yolo.utils.dist import ddp_cleanup, generate_ddp_command
|
|
|
|
from ultralytics.yolo.utils.files import get_latest_run, increment_path
|
|
|
|
from ultralytics.yolo.utils.torch_utils import (EarlyStopping, ModelEMA, de_parallel, init_seeds, one_cycle,
|
|
|
|
select_device, strip_optimizer)
|
|
|
|
|
|
|
|
|
|
|
|
class BaseTrainer:
|
|
|
|
"""
|
|
|
|
BaseTrainer
|
|
|
|
|
|
|
|
A base class for creating trainers.
|
|
|
|
|
|
|
|
Attributes:
|
|
|
|
args (SimpleNamespace): Configuration for the trainer.
|
|
|
|
check_resume (method): Method to check if training should be resumed from a saved checkpoint.
|
|
|
|
console (logging.Logger): Logger instance.
|
|
|
|
validator (BaseValidator): Validator instance.
|
|
|
|
model (nn.Module): Model instance.
|
|
|
|
callbacks (defaultdict): Dictionary of callbacks.
|
|
|
|
save_dir (Path): Directory to save results.
|
|
|
|
wdir (Path): Directory to save weights.
|
|
|
|
last (Path): Path to last checkpoint.
|
|
|
|
best (Path): Path to best checkpoint.
|
|
|
|
save_period (int): Save checkpoint every x epochs (disabled if < 1).
|
|
|
|
batch_size (int): Batch size for training.
|
|
|
|
epochs (int): Number of epochs to train for.
|
|
|
|
start_epoch (int): Starting epoch for training.
|
|
|
|
device (torch.device): Device to use for training.
|
|
|
|
amp (bool): Flag to enable AMP (Automatic Mixed Precision).
|
|
|
|
scaler (amp.GradScaler): Gradient scaler for AMP.
|
|
|
|
data (str): Path to data.
|
|
|
|
trainset (torch.utils.data.Dataset): Training dataset.
|
|
|
|
testset (torch.utils.data.Dataset): Testing dataset.
|
|
|
|
ema (nn.Module): EMA (Exponential Moving Average) of the model.
|
|
|
|
lf (nn.Module): Loss function.
|
|
|
|
scheduler (torch.optim.lr_scheduler._LRScheduler): Learning rate scheduler.
|
|
|
|
best_fitness (float): The best fitness value achieved.
|
|
|
|
fitness (float): Current fitness value.
|
|
|
|
loss (float): Current loss value.
|
|
|
|
tloss (float): Total loss value.
|
|
|
|
loss_names (list): List of loss names.
|
|
|
|
csv (Path): Path to results CSV file.
|
|
|
|
"""
|
|
|
|
|
|
|
|
def __init__(self, cfg=DEFAULT_CFG, overrides=None):
|
|
|
|
"""
|
|
|
|
Initializes the BaseTrainer class.
|
|
|
|
|
|
|
|
Args:
|
|
|
|
cfg (str, optional): Path to a configuration file. Defaults to DEFAULT_CFG.
|
|
|
|
overrides (dict, optional): Configuration overrides. Defaults to None.
|
|
|
|
"""
|
|
|
|
self.args = get_cfg(cfg, overrides)
|
|
|
|
self.device = select_device(self.args.device, self.args.batch)
|
|
|
|
self.check_resume()
|
|
|
|
self.console = LOGGER
|
|
|
|
self.validator = None
|
|
|
|
self.model = None
|
|
|
|
self.metrics = None
|
|
|
|
init_seeds(self.args.seed + 1 + RANK, deterministic=self.args.deterministic)
|
|
|
|
|
|
|
|
# Dirs
|
|
|
|
project = self.args.project or Path(SETTINGS['runs_dir']) / self.args.task
|
|
|
|
name = self.args.name or f'{self.args.mode}'
|
|
|
|
if hasattr(self.args, 'save_dir'):
|
|
|
|
self.save_dir = Path(self.args.save_dir)
|
|
|
|
else:
|
|
|
|
self.save_dir = Path(
|
|
|
|
increment_path(Path(project) / name, exist_ok=self.args.exist_ok if RANK in {-1, 0} else True))
|
|
|
|
self.wdir = self.save_dir / 'weights' # weights dir
|
|
|
|
if RANK in {-1, 0}:
|
|
|
|
self.wdir.mkdir(parents=True, exist_ok=True) # make dir
|
|
|
|
self.args.save_dir = str(self.save_dir)
|
|
|
|
yaml_save(self.save_dir / 'args.yaml', vars(self.args)) # save run args
|
|
|
|
self.last, self.best = self.wdir / 'last.pt', self.wdir / 'best.pt' # checkpoint paths
|
|
|
|
self.save_period = self.args.save_period
|
|
|
|
|
|
|
|
self.batch_size = self.args.batch
|
|
|
|
self.epochs = self.args.epochs
|
|
|
|
self.start_epoch = 0
|
|
|
|
if RANK == -1:
|
|
|
|
print_args(vars(self.args))
|
|
|
|
|
|
|
|
# Device
|
|
|
|
self.amp = self.device.type != 'cpu'
|
|
|
|
self.scaler = amp.GradScaler(enabled=self.amp)
|
|
|
|
if self.device.type == 'cpu':
|
|
|
|
self.args.workers = 0 # faster CPU training as time dominated by inference, not dataloading
|
|
|
|
|
|
|
|
# Model and Dataloaders.
|
|
|
|
self.model = self.args.model
|
|
|
|
try:
|
|
|
|
if self.args.task == 'classify':
|
|
|
|
self.data = check_cls_dataset(self.args.data)
|
|
|
|
elif self.args.data.endswith('.yaml') or self.args.task in ('detect', 'segment'):
|
|
|
|
self.data = check_det_dataset(self.args.data)
|
|
|
|
if 'yaml_file' in self.data:
|
|
|
|
self.args.data = self.data['yaml_file'] # for validating 'yolo train data=url.zip' usage
|
|
|
|
except Exception as e:
|
|
|
|
raise FileNotFoundError(emojis(f"Dataset '{self.args.data}' error ❌ {e}")) from e
|
|
|
|
|
|
|
|
self.trainset, self.testset = self.get_dataset(self.data)
|
|
|
|
self.ema = None
|
|
|
|
|
|
|
|
# Optimization utils init
|
|
|
|
self.lf = None
|
|
|
|
self.scheduler = None
|
|
|
|
|
|
|
|
# Epoch level metrics
|
|
|
|
self.best_fitness = None
|
|
|
|
self.fitness = None
|
|
|
|
self.loss = None
|
|
|
|
self.tloss = None
|
|
|
|
self.loss_names = ['Loss']
|
|
|
|
self.csv = self.save_dir / 'results.csv'
|
|
|
|
self.plot_idx = [0, 1, 2]
|
|
|
|
|
|
|
|
# Callbacks
|
|
|
|
self.callbacks = defaultdict(list, callbacks.default_callbacks) # add callbacks
|
|
|
|
if RANK in {0, -1}:
|
|
|
|
callbacks.add_integration_callbacks(self)
|
|
|
|
|
|
|
|
def add_callback(self, event: str, callback):
|
|
|
|
"""
|
|
|
|
Appends the given callback.
|
|
|
|
"""
|
|
|
|
self.callbacks[event].append(callback)
|
|
|
|
|
|
|
|
def set_callback(self, event: str, callback):
|
|
|
|
"""
|
|
|
|
Overrides the existing callbacks with the given callback.
|
|
|
|
"""
|
|
|
|
self.callbacks[event] = [callback]
|
|
|
|
|
|
|
|
def run_callbacks(self, event: str):
|
|
|
|
for callback in self.callbacks.get(event, []):
|
|
|
|
callback(self)
|
|
|
|
|
|
|
|
def train(self):
|
|
|
|
# Allow device='', device=None on Multi-GPU systems to default to device=0
|
|
|
|
if isinstance(self.args.device, int) or self.args.device: # i.e. device=0 or device=[0,1,2,3]
|
|
|
|
world_size = torch.cuda.device_count()
|
|
|
|
elif torch.cuda.is_available(): # i.e. device=None or device=''
|
|
|
|
world_size = 1 # default to device 0
|
|
|
|
else: # i.e. device='cpu' or 'mps'
|
|
|
|
world_size = 0
|
|
|
|
|
|
|
|
# Run subprocess if DDP training, else train normally
|
|
|
|
if world_size > 1 and 'LOCAL_RANK' not in os.environ:
|
|
|
|
cmd, file = generate_ddp_command(world_size, self) # security vulnerability in Snyk scans
|
|
|
|
try:
|
|
|
|
subprocess.run(cmd, check=True)
|
|
|
|
except Exception as e:
|
|
|
|
self.console.warning(e)
|
|
|
|
finally:
|
|
|
|
ddp_cleanup(self, file)
|
|
|
|
else:
|
|
|
|
self._do_train(RANK, world_size)
|
|
|
|
|
|
|
|
def _setup_ddp(self, rank, world_size):
|
|
|
|
# os.environ['MASTER_ADDR'] = 'localhost'
|
|
|
|
# os.environ['MASTER_PORT'] = '9020'
|
|
|
|
torch.cuda.set_device(rank)
|
|
|
|
self.device = torch.device('cuda', rank)
|
|
|
|
self.console.info(f'DDP settings: RANK {rank}, WORLD_SIZE {world_size}, DEVICE {self.device}')
|
|
|
|
dist.init_process_group('nccl' if dist.is_nccl_available() else 'gloo', rank=rank, world_size=world_size)
|
|
|
|
|
|
|
|
def _setup_train(self, rank, world_size):
|
|
|
|
"""
|
|
|
|
Builds dataloaders and optimizer on correct rank process.
|
|
|
|
"""
|
|
|
|
# model
|
|
|
|
self.run_callbacks('on_pretrain_routine_start')
|
|
|
|
ckpt = self.setup_model()
|
|
|
|
self.model = self.model.to(self.device)
|
|
|
|
self.set_model_attributes()
|
|
|
|
if world_size > 1:
|
|
|
|
self.model = DDP(self.model, device_ids=[rank])
|
|
|
|
# Check imgsz
|
|
|
|
gs = max(int(self.model.stride.max() if hasattr(self.model, 'stride') else 32), 32) # grid size (max stride)
|
|
|
|
self.args.imgsz = check_imgsz(self.args.imgsz, stride=gs, floor=gs, max_dim=1)
|
|
|
|
# Batch size
|
|
|
|
if self.batch_size == -1:
|
|
|
|
if RANK == -1: # single-GPU only, estimate best batch size
|
|
|
|
self.batch_size = check_train_batch_size(self.model, self.args.imgsz, self.amp)
|
|
|
|
else:
|
|
|
|
SyntaxError('batch=-1 to use AutoBatch is only available in Single-GPU training. '
|
|
|
|
'Please pass a valid batch size value for Multi-GPU DDP training, i.e. batch=16')
|
|
|
|
|
|
|
|
# Optimizer
|
|
|
|
self.accumulate = max(round(self.args.nbs / self.batch_size), 1) # accumulate loss before optimizing
|
|
|
|
weight_decay = self.args.weight_decay * self.batch_size * self.accumulate / self.args.nbs # scale weight_decay
|
|
|
|
self.optimizer = self.build_optimizer(model=self.model,
|
|
|
|
name=self.args.optimizer,
|
|
|
|
lr=self.args.lr0,
|
|
|
|
momentum=self.args.momentum,
|
|
|
|
decay=weight_decay)
|
|
|
|
# Scheduler
|
|
|
|
if self.args.cos_lr:
|
|
|
|
self.lf = one_cycle(1, self.args.lrf, self.epochs) # cosine 1->hyp['lrf']
|
|
|
|
else:
|
|
|
|
self.lf = lambda x: (1 - x / self.epochs) * (1.0 - self.args.lrf) + self.args.lrf # linear
|
|
|
|
self.scheduler = lr_scheduler.LambdaLR(self.optimizer, lr_lambda=self.lf)
|
|
|
|
self.stopper, self.stop = EarlyStopping(patience=self.args.patience), False
|
|
|
|
|
|
|
|
# dataloaders
|
|
|
|
batch_size = self.batch_size // world_size if world_size > 1 else self.batch_size
|
|
|
|
self.train_loader = self.get_dataloader(self.trainset, batch_size=batch_size, rank=rank, mode='train')
|
|
|
|
if rank in {0, -1}:
|
|
|
|
self.test_loader = self.get_dataloader(self.testset, batch_size=batch_size * 2, rank=-1, mode='val')
|
|
|
|
self.validator = self.get_validator()
|
|
|
|
metric_keys = self.validator.metrics.keys + self.label_loss_items(prefix='val')
|
|
|
|
self.metrics = dict(zip(metric_keys, [0] * len(metric_keys))) # TODO: init metrics for plot_results()?
|
|
|
|
self.ema = ModelEMA(self.model)
|
|
|
|
if self.args.plots:
|
|
|
|
self.plot_training_labels()
|
|
|
|
self.resume_training(ckpt)
|
|
|
|
self.scheduler.last_epoch = self.start_epoch - 1 # do not move
|
|
|
|
self.run_callbacks('on_pretrain_routine_end')
|
|
|
|
|
|
|
|
def _do_train(self, rank=-1, world_size=1):
|
|
|
|
if world_size > 1:
|
|
|
|
self._setup_ddp(rank, world_size)
|
|
|
|
|
|
|
|
self._setup_train(rank, world_size)
|
|
|
|
|
|
|
|
self.epoch_time = None
|
|
|
|
self.epoch_time_start = time.time()
|
|
|
|
self.train_time_start = time.time()
|
|
|
|
nb = len(self.train_loader) # number of batches
|
|
|
|
nw = max(round(self.args.warmup_epochs * nb), 100) # number of warmup iterations
|
|
|
|
last_opt_step = -1
|
|
|
|
self.run_callbacks('on_train_start')
|
|
|
|
self.log(f'Image sizes {self.args.imgsz} train, {self.args.imgsz} val\n'
|
|
|
|
f'Using {self.train_loader.num_workers * (world_size or 1)} dataloader workers\n'
|
|
|
|
f"Logging results to {colorstr('bold', self.save_dir)}\n"
|
|
|
|
f'Starting training for {self.epochs} epochs...')
|
|
|
|
if self.args.close_mosaic:
|
|
|
|
base_idx = (self.epochs - self.args.close_mosaic) * nb
|
|
|
|
self.plot_idx.extend([base_idx, base_idx + 1, base_idx + 2])
|
|
|
|
for epoch in range(self.start_epoch, self.epochs):
|
|
|
|
self.epoch = epoch
|
|
|
|
self.run_callbacks('on_train_epoch_start')
|
|
|
|
self.model.train()
|
|
|
|
if rank != -1:
|
|
|
|
self.train_loader.sampler.set_epoch(epoch)
|
|
|
|
pbar = enumerate(self.train_loader)
|
|
|
|
# Update dataloader attributes (optional)
|
|
|
|
if epoch == (self.epochs - self.args.close_mosaic):
|
|
|
|
self.console.info('Closing dataloader mosaic')
|
|
|
|
if hasattr(self.train_loader.dataset, 'mosaic'):
|
|
|
|
self.train_loader.dataset.mosaic = False
|
|
|
|
if hasattr(self.train_loader.dataset, 'close_mosaic'):
|
|
|
|
self.train_loader.dataset.close_mosaic(hyp=self.args)
|
|
|
|
|
|
|
|
if rank in {-1, 0}:
|
|
|
|
self.console.info(self.progress_string())
|
|
|
|
pbar = tqdm(enumerate(self.train_loader), total=nb, bar_format=TQDM_BAR_FORMAT)
|
|
|
|
self.tloss = None
|
|
|
|
self.optimizer.zero_grad()
|
|
|
|
for i, batch in pbar:
|
|
|
|
self.run_callbacks('on_train_batch_start')
|
|
|
|
# Warmup
|
|
|
|
ni = i + nb * epoch
|
|
|
|
if ni <= nw:
|
|
|
|
xi = [0, nw] # x interp
|
|
|
|
self.accumulate = max(1, np.interp(ni, xi, [1, self.args.nbs / self.batch_size]).round())
|
|
|
|
for j, x in enumerate(self.optimizer.param_groups):
|
|
|
|
# bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
|
|
|
|
x['lr'] = np.interp(
|
|
|
|
ni, xi, [self.args.warmup_bias_lr if j == 0 else 0.0, x['initial_lr'] * self.lf(epoch)])
|
|
|
|
if 'momentum' in x:
|
|
|
|
x['momentum'] = np.interp(ni, xi, [self.args.warmup_momentum, self.args.momentum])
|
|
|
|
|
|
|
|
# Forward
|
|
|
|
with torch.cuda.amp.autocast(self.amp):
|
|
|
|
batch = self.preprocess_batch(batch)
|
|
|
|
preds = self.model(batch['img'])
|
|
|
|
self.loss, self.loss_items = self.criterion(preds, batch)
|
|
|
|
if rank != -1:
|
|
|
|
self.loss *= world_size
|
|
|
|
self.tloss = (self.tloss * i + self.loss_items) / (i + 1) if self.tloss is not None \
|
|
|
|
else self.loss_items
|
|
|
|
|
|
|
|
# Backward
|
|
|
|
self.scaler.scale(self.loss).backward()
|
|
|
|
|
|
|
|
# Optimize - https://pytorch.org/docs/master/notes/amp_examples.html
|
|
|
|
if ni - last_opt_step >= self.accumulate:
|
|
|
|
self.optimizer_step()
|
|
|
|
last_opt_step = ni
|
|
|
|
|
|
|
|
# Log
|
|
|
|
mem = f'{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G' # (GB)
|
|
|
|
loss_len = self.tloss.shape[0] if len(self.tloss.size()) else 1
|
|
|
|
losses = self.tloss if loss_len > 1 else torch.unsqueeze(self.tloss, 0)
|
|
|
|
if rank in {-1, 0}:
|
|
|
|
pbar.set_description(
|
|
|
|
('%11s' * 2 + '%11.4g' * (2 + loss_len)) %
|
|
|
|
(f'{epoch + 1}/{self.epochs}', mem, *losses, batch['cls'].shape[0], batch['img'].shape[-1]))
|
|
|
|
self.run_callbacks('on_batch_end')
|
|
|
|
if self.args.plots and ni in self.plot_idx:
|
|
|
|
self.plot_training_samples(batch, ni)
|
|
|
|
|
|
|
|
self.run_callbacks('on_train_batch_end')
|
|
|
|
|
|
|
|
self.lr = {f'lr/pg{ir}': x['lr'] for ir, x in enumerate(self.optimizer.param_groups)} # for loggers
|
|
|
|
|
|
|
|
self.scheduler.step()
|
|
|
|
self.run_callbacks('on_train_epoch_end')
|
|
|
|
|
|
|
|
if rank in {-1, 0}:
|
|
|
|
|
|
|
|
# Validation
|
|
|
|
self.ema.update_attr(self.model, include=['yaml', 'nc', 'args', 'names', 'stride', 'class_weights'])
|
|
|
|
final_epoch = (epoch + 1 == self.epochs) or self.stopper.possible_stop
|
|
|
|
|
|
|
|
if self.args.val or final_epoch:
|
|
|
|
self.metrics, self.fitness = self.validate()
|
|
|
|
self.save_metrics(metrics={**self.label_loss_items(self.tloss), **self.metrics, **self.lr})
|
|
|
|
self.stop = self.stopper(epoch + 1, self.fitness)
|
|
|
|
|
|
|
|
# Save model
|
|
|
|
if self.args.save or (epoch + 1 == self.epochs):
|
|
|
|
self.save_model()
|
|
|
|
self.run_callbacks('on_model_save')
|
|
|
|
|
|
|
|
tnow = time.time()
|
|
|
|
self.epoch_time = tnow - self.epoch_time_start
|
|
|
|
self.epoch_time_start = tnow
|
|
|
|
self.run_callbacks('on_fit_epoch_end')
|
|
|
|
|
|
|
|
# Early Stopping
|
|
|
|
if RANK != -1: # if DDP training
|
|
|
|
broadcast_list = [self.stop if RANK == 0 else None]
|
|
|
|
dist.broadcast_object_list(broadcast_list, 0) # broadcast 'stop' to all ranks
|
|
|
|
if RANK != 0:
|
|
|
|
self.stop = broadcast_list[0]
|
|
|
|
if self.stop:
|
|
|
|
break # must break all DDP ranks
|
|
|
|
|
|
|
|
if rank in {-1, 0}:
|
|
|
|
# Do final val with best.pt
|
|
|
|
self.log(f'\n{epoch - self.start_epoch + 1} epochs completed in '
|
|
|
|
f'{(time.time() - self.train_time_start) / 3600:.3f} hours.')
|
|
|
|
self.final_eval()
|
|
|
|
if self.args.plots:
|
|
|
|
self.plot_metrics()
|
|
|
|
self.log(f"Results saved to {colorstr('bold', self.save_dir)}")
|
|
|
|
self.run_callbacks('on_train_end')
|
|
|
|
torch.cuda.empty_cache()
|
|
|
|
self.run_callbacks('teardown')
|
|
|
|
|
|
|
|
def save_model(self):
|
|
|
|
ckpt = {
|
|
|
|
'epoch': self.epoch,
|
|
|
|
'best_fitness': self.best_fitness,
|
|
|
|
'model': deepcopy(de_parallel(self.model)).half(),
|
|
|
|
'ema': deepcopy(self.ema.ema).half(),
|
|
|
|
'updates': self.ema.updates,
|
|
|
|
'optimizer': self.optimizer.state_dict(),
|
|
|
|
'train_args': vars(self.args), # save as dict
|
|
|
|
'date': datetime.now().isoformat(),
|
|
|
|
'version': __version__}
|
|
|
|
|
|
|
|
# Save last, best and delete
|
|
|
|
torch.save(ckpt, self.last)
|
|
|
|
if self.best_fitness == self.fitness:
|
|
|
|
torch.save(ckpt, self.best)
|
|
|
|
if (self.epoch > 0) and (self.save_period > 0) and (self.epoch % self.save_period == 0):
|
|
|
|
torch.save(ckpt, self.wdir / f'epoch{self.epoch}.pt')
|
|
|
|
del ckpt
|
|
|
|
|
|
|
|
def get_dataset(self, data):
|
|
|
|
"""
|
|
|
|
Get train, val path from data dict if it exists. Returns None if data format is not recognized.
|
|
|
|
"""
|
|
|
|
return data['train'], data.get('val') or data.get('test')
|
|
|
|
|
|
|
|
def setup_model(self):
|
|
|
|
"""
|
|
|
|
load/create/download model for any task.
|
|
|
|
"""
|
|
|
|
if isinstance(self.model, torch.nn.Module): # if model is loaded beforehand. No setup needed
|
|
|
|
return
|
|
|
|
|
|
|
|
model, weights = self.model, None
|
|
|
|
ckpt = None
|
|
|
|
if str(model).endswith('.pt'):
|
|
|
|
weights, ckpt = attempt_load_one_weight(model)
|
|
|
|
cfg = ckpt['model'].yaml
|
|
|
|
else:
|
|
|
|
cfg = model
|
|
|
|
self.model = self.get_model(cfg=cfg, weights=weights, verbose=RANK == -1) # calls Model(cfg, weights)
|
|
|
|
return ckpt
|
|
|
|
|
|
|
|
def optimizer_step(self):
|
|
|
|
self.scaler.unscale_(self.optimizer) # unscale gradients
|
|
|
|
torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=10.0) # clip gradients
|
|
|
|
self.scaler.step(self.optimizer)
|
|
|
|
self.scaler.update()
|
|
|
|
self.optimizer.zero_grad()
|
|
|
|
if self.ema:
|
|
|
|
self.ema.update(self.model)
|
|
|
|
|
|
|
|
def preprocess_batch(self, batch):
|
|
|
|
"""
|
|
|
|
Allows custom preprocessing model inputs and ground truths depending on task type.
|
|
|
|
"""
|
|
|
|
return batch
|
|
|
|
|
|
|
|
def validate(self):
|
|
|
|
"""
|
|
|
|
Runs validation on test set using self.validator. The returned dict is expected to contain "fitness" key.
|
|
|
|
"""
|
|
|
|
metrics = self.validator(self)
|
|
|
|
fitness = metrics.pop('fitness', -self.loss.detach().cpu().numpy()) # use loss as fitness measure if not found
|
|
|
|
if not self.best_fitness or self.best_fitness < fitness:
|
|
|
|
self.best_fitness = fitness
|
|
|
|
return metrics, fitness
|
|
|
|
|
|
|
|
def log(self, text, rank=-1):
|
|
|
|
"""
|
|
|
|
Logs the given text to given ranks process if provided, otherwise logs to all ranks.
|
|
|
|
|
|
|
|
Args"
|
|
|
|
text (str): text to log
|
|
|
|
rank (List[Int]): process rank
|
|
|
|
|
|
|
|
"""
|
|
|
|
if rank in {-1, 0}:
|
|
|
|
self.console.info(text)
|
|
|
|
|
|
|
|
def get_model(self, cfg=None, weights=None, verbose=True):
|
|
|
|
raise NotImplementedError("This task trainer doesn't support loading cfg files")
|
|
|
|
|
|
|
|
def get_validator(self):
|
|
|
|
raise NotImplementedError('get_validator function not implemented in trainer')
|
|
|
|
|
|
|
|
def get_dataloader(self, dataset_path, batch_size=16, rank=0, mode='train'):
|
|
|
|
"""
|
|
|
|
Returns dataloader derived from torch.data.Dataloader.
|
|
|
|
"""
|
|
|
|
raise NotImplementedError('get_dataloader function not implemented in trainer')
|
|
|
|
|
|
|
|
def criterion(self, preds, batch):
|
|
|
|
"""
|
|
|
|
Returns loss and individual loss items as Tensor.
|
|
|
|
"""
|
|
|
|
raise NotImplementedError('criterion function not implemented in trainer')
|
|
|
|
|
|
|
|
def label_loss_items(self, loss_items=None, prefix='train'):
|
|
|
|
"""
|
|
|
|
Returns a loss dict with labelled training loss items tensor
|
|
|
|
"""
|
|
|
|
# Not needed for classification but necessary for segmentation & detection
|
|
|
|
return {'loss': loss_items} if loss_items is not None else ['loss']
|
|
|
|
|
|
|
|
def set_model_attributes(self):
|
|
|
|
"""
|
|
|
|
To set or update model parameters before training.
|
|
|
|
"""
|
|
|
|
self.model.names = self.data['names']
|
|
|
|
|
|
|
|
def build_targets(self, preds, targets):
|
|
|
|
pass
|
|
|
|
|
|
|
|
def progress_string(self):
|
|
|
|
return ''
|
|
|
|
|
|
|
|
# TODO: may need to put these following functions into callback
|
|
|
|
def plot_training_samples(self, batch, ni):
|
|
|
|
pass
|
|
|
|
|
|
|
|
def plot_training_labels(self):
|
|
|
|
pass
|
|
|
|
|
|
|
|
def save_metrics(self, metrics):
|
|
|
|
keys, vals = list(metrics.keys()), list(metrics.values())
|
|
|
|
n = len(metrics) + 1 # number of cols
|
|
|
|
s = '' if self.csv.exists() else (('%23s,' * n % tuple(['epoch'] + keys)).rstrip(',') + '\n') # header
|
|
|
|
with open(self.csv, 'a') as f:
|
|
|
|
f.write(s + ('%23.5g,' * n % tuple([self.epoch] + vals)).rstrip(',') + '\n')
|
|
|
|
|
|
|
|
def plot_metrics(self):
|
|
|
|
pass
|
|
|
|
|
|
|
|
def final_eval(self):
|
|
|
|
for f in self.last, self.best:
|
|
|
|
if f.exists():
|
|
|
|
strip_optimizer(f) # strip optimizers
|
|
|
|
if f is self.best:
|
|
|
|
self.console.info(f'\nValidating {f}...')
|
|
|
|
self.metrics = self.validator(model=f)
|
|
|
|
self.metrics.pop('fitness', None)
|
|
|
|
self.run_callbacks('on_fit_epoch_end')
|
|
|
|
|
|
|
|
def check_resume(self):
|
|
|
|
resume = self.args.resume
|
|
|
|
if resume:
|
|
|
|
try:
|
|
|
|
last = Path(
|
|
|
|
check_file(resume) if isinstance(resume, (str,
|
|
|
|
Path)) and Path(resume).exists() else get_latest_run())
|
|
|
|
self.args = get_cfg(attempt_load_weights(last).args)
|
|
|
|
self.args.model, resume = str(last), True # reinstate
|
|
|
|
except Exception as e:
|
|
|
|
raise FileNotFoundError('Resume checkpoint not found. Please pass a valid checkpoint to resume from, '
|
|
|
|
"i.e. 'yolo train resume model=path/to/last.pt'") from e
|
|
|
|
self.resume = resume
|
|
|
|
|
|
|
|
def resume_training(self, ckpt):
|
|
|
|
if ckpt is None:
|
|
|
|
return
|
|
|
|
best_fitness = 0.0
|
|
|
|
start_epoch = ckpt['epoch'] + 1
|
|
|
|
if ckpt['optimizer'] is not None:
|
|
|
|
self.optimizer.load_state_dict(ckpt['optimizer']) # optimizer
|
|
|
|
best_fitness = ckpt['best_fitness']
|
|
|
|
if self.ema and ckpt.get('ema'):
|
|
|
|
self.ema.ema.load_state_dict(ckpt['ema'].float().state_dict()) # EMA
|
|
|
|
self.ema.updates = ckpt['updates']
|
|
|
|
if self.resume:
|
|
|
|
assert start_epoch > 0, \
|
|
|
|
f'{self.args.model} training to {self.epochs} epochs is finished, nothing to resume.\n' \
|
|
|
|
f"Start a new training without --resume, i.e. 'yolo task=... mode=train model={self.args.model}'"
|
|
|
|
LOGGER.info(
|
|
|
|
f'Resuming training from {self.args.model} from epoch {start_epoch + 1} to {self.epochs} total epochs')
|
|
|
|
if self.epochs < start_epoch:
|
|
|
|
LOGGER.info(
|
|
|
|
f"{self.model} has been trained for {ckpt['epoch']} epochs. Fine-tuning for {self.epochs} more epochs.")
|
|
|
|
self.epochs += ckpt['epoch'] # finetune additional epochs
|
|
|
|
self.best_fitness = best_fitness
|
|
|
|
self.start_epoch = start_epoch
|
|
|
|
if start_epoch > (self.epochs - self.args.close_mosaic):
|
|
|
|
self.console.info('Closing dataloader mosaic')
|
|
|
|
if hasattr(self.train_loader.dataset, 'mosaic'):
|
|
|
|
self.train_loader.dataset.mosaic = False
|
|
|
|
if hasattr(self.train_loader.dataset, 'close_mosaic'):
|
|
|
|
self.train_loader.dataset.close_mosaic(hyp=self.args)
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
def build_optimizer(model, name='Adam', lr=0.001, momentum=0.9, decay=1e-5):
|
|
|
|
"""
|
|
|
|
Builds an optimizer with the specified parameters and parameter groups.
|
|
|
|
|
|
|
|
Args:
|
|
|
|
model (nn.Module): model to optimize
|
|
|
|
name (str): name of the optimizer to use
|
|
|
|
lr (float): learning rate
|
|
|
|
momentum (float): momentum
|
|
|
|
decay (float): weight decay
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
optimizer (torch.optim.Optimizer): the built optimizer
|
|
|
|
"""
|
|
|
|
g = [], [], [] # optimizer parameter groups
|
|
|
|
bn = tuple(v for k, v in nn.__dict__.items() if 'Norm' in k) # normalization layers, i.e. BatchNorm2d()
|
|
|
|
for v in model.modules():
|
|
|
|
if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter): # bias (no decay)
|
|
|
|
g[2].append(v.bias)
|
|
|
|
if isinstance(v, bn): # weight (no decay)
|
|
|
|
g[1].append(v.weight)
|
|
|
|
elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter): # weight (with decay)
|
|
|
|
g[0].append(v.weight)
|
|
|
|
|
|
|
|
if name == 'Adam':
|
|
|
|
optimizer = torch.optim.Adam(g[2], lr=lr, betas=(momentum, 0.999)) # adjust beta1 to momentum
|
|
|
|
elif name == 'AdamW':
|
|
|
|
optimizer = torch.optim.AdamW(g[2], lr=lr, betas=(momentum, 0.999), weight_decay=0.0)
|
|
|
|
elif name == 'RMSProp':
|
|
|
|
optimizer = torch.optim.RMSprop(g[2], lr=lr, momentum=momentum)
|
|
|
|
elif name == 'SGD':
|
|
|
|
optimizer = torch.optim.SGD(g[2], lr=lr, momentum=momentum, nesterov=True)
|
|
|
|
else:
|
|
|
|
raise NotImplementedError(f'Optimizer {name} not implemented.')
|
|
|
|
|
|
|
|
optimizer.add_param_group({'params': g[0], 'weight_decay': decay}) # add g0 with weight_decay
|
|
|
|
optimizer.add_param_group({'params': g[1], 'weight_decay': 0.0}) # add g1 (BatchNorm2d weights)
|
|
|
|
LOGGER.info(f"{colorstr('optimizer:')} {type(optimizer).__name__}(lr={lr}) with parameter groups "
|
|
|
|
f'{len(g[1])} weight(decay=0.0), {len(g[0])} weight(decay={decay}), {len(g[2])} bias')
|
|
|
|
return optimizer
|