@@ -23,14 +23,31 @@ from torch import nn, optim
 from ultralytics.cfg import get_cfg, get_save_dir
 from ultralytics.data.utils import check_cls_dataset, check_det_dataset
 from ultralytics.nn.tasks import attempt_load_one_weight, attempt_load_weights
-from ultralytics.utils import (DEFAULT_CFG, LOGGER, RANK, TQDM, __version__, callbacks, clean_url, colorstr, emojis,
-                               yaml_save)
+from ultralytics.utils import (
+    DEFAULT_CFG,
+    LOGGER,
+    RANK,
+    TQDM,
+    __version__,
+    callbacks,
+    clean_url,
+    colorstr,
+    emojis,
+    yaml_save,
+)
 from ultralytics.utils.autobatch import check_train_batch_size
 from ultralytics.utils.checks import check_amp, check_file, check_imgsz, check_model_file_from_stem, print_args
 from ultralytics.utils.dist import ddp_cleanup, generate_ddp_command
 from ultralytics.utils.files import get_latest_run
-from ultralytics.utils.torch_utils import (EarlyStopping, ModelEMA, de_parallel, init_seeds, one_cycle, select_device,
-                                           strip_optimizer)
+from ultralytics.utils.torch_utils import (
+    EarlyStopping,
+    ModelEMA,
+    de_parallel,
+    init_seeds,
+    one_cycle,
+    select_device,
+    strip_optimizer,
+)
 
 
 class BaseTrainer:
@@ -89,12 +106,12 @@ class BaseTrainer:
         # Dirs
         self.save_dir = get_save_dir(self.args)
         self.args.name = self.save_dir.name  # update name for loggers
-        self.wdir = self.save_dir / 'weights'  # weights dir
+        self.wdir = self.save_dir / "weights"  # weights dir
         if RANK in (-1, 0):
             self.wdir.mkdir(parents=True, exist_ok=True)  # make dir
             self.args.save_dir = str(self.save_dir)
-            yaml_save(self.save_dir / 'args.yaml', vars(self.args))  # save run args
-        self.last, self.best = self.wdir / 'last.pt', self.wdir / 'best.pt'  # checkpoint paths
+            yaml_save(self.save_dir / "args.yaml", vars(self.args))  # save run args
+        self.last, self.best = self.wdir / "last.pt", self.wdir / "best.pt"  # checkpoint paths
         self.save_period = self.args.save_period
 
         self.batch_size = self.args.batch
@@ -104,18 +121,18 @@ class BaseTrainer:
             print_args(vars(self.args))
 
         # Device
-        if self.device.type in ('cpu', 'mps'):
+        if self.device.type in ("cpu", "mps"):
             self.args.workers = 0  # faster CPU training as time dominated by inference, not dataloading
 
         # Model and Dataset
         self.model = check_model_file_from_stem(self.args.model)  # add suffix, i.e. yolov8n -> yolov8n.pt
         try:
-            if self.args.task == 'classify':
+            if self.args.task == "classify":
                 self.data = check_cls_dataset(self.args.data)
-            elif self.args.data.split('.')[-1] in ('yaml', 'yml') or self.args.task in ('detect', 'segment', 'pose'):
+            elif self.args.data.split(".")[-1] in ("yaml", "yml") or self.args.task in ("detect", "segment", "pose"):
                 self.data = check_det_dataset(self.args.data)
-                if 'yaml_file' in self.data:
-                    self.args.data = self.data['yaml_file']  # for validating 'yolo train data=url.zip' usage
+                if "yaml_file" in self.data:
+                    self.args.data = self.data["yaml_file"]  # for validating 'yolo train data=url.zip' usage
         except Exception as e:
             raise RuntimeError(emojis(f"Dataset '{clean_url(self.args.data)}' error ❌ {e}")) from e
 
@@ -131,8 +148,8 @@ class BaseTrainer:
         self.fitness = None
         self.loss = None
         self.tloss = None
-        self.loss_names = ['Loss']
-        self.csv = self.save_dir / 'results.csv'
+        self.loss_names = ["Loss"]
+        self.csv = self.save_dir / "results.csv"
         self.plot_idx = [0, 1, 2]
 
         # Callbacks
@@ -156,7 +173,7 @@ class BaseTrainer:
     def train(self):
         """Allow device='', device=None on Multi-GPU systems to default to device=0."""
         if isinstance(self.args.device, str) and len(self.args.device):  # i.e. device='0' or device='0,1,2,3'
-            world_size = len(self.args.device.split(','))
+            world_size = len(self.args.device.split(","))
         elif isinstance(self.args.device, (tuple, list)):  # i.e. device=[0, 1, 2, 3] (multi-GPU from CLI is list)
             world_size = len(self.args.device)
         elif torch.cuda.is_available():  # i.e. device=None or device='' or device=number
@@ -165,14 +182,16 @@ class BaseTrainer:
             world_size = 0
 
         # Run subprocess if DDP training, else train normally
-        if world_size > 1 and 'LOCAL_RANK' not in os.environ:
+        if world_size > 1 and "LOCAL_RANK" not in os.environ:
             # Argument checks
             if self.args.rect:
                 LOGGER.warning("WARNING ⚠️ 'rect=True' is incompatible with Multi-GPU training, setting 'rect=False'")
                 self.args.rect = False
             if self.args.batch == -1:
-                LOGGER.warning("WARNING ⚠️ 'batch=-1' for AutoBatch is incompatible with Multi-GPU training, setting "
-                               "default 'batch=16'")
+                LOGGER.warning(
+                    "WARNING ⚠️ 'batch=-1' for AutoBatch is incompatible with Multi-GPU training, setting "
+                    "default 'batch=16'"
+                )
                 self.args.batch = 16
 
             # Command
@@ -199,37 +218,45 @@ class BaseTrainer:
     def _setup_ddp(self, world_size):
         """Initializes and sets the DistributedDataParallel parameters for training."""
         torch.cuda.set_device(RANK)
-        self.device = torch.device('cuda', RANK)
+        self.device = torch.device("cuda", RANK)
         # LOGGER.info(f'DDP info: RANK {RANK}, WORLD_SIZE {world_size}, DEVICE {self.device}')
-        os.environ['NCCL_BLOCKING_WAIT'] = '1'  # set to enforce timeout
+        os.environ["NCCL_BLOCKING_WAIT"] = "1"  # set to enforce timeout
         dist.init_process_group(
-            'nccl' if dist.is_nccl_available() else 'gloo',
+            "nccl" if dist.is_nccl_available() else "gloo",
             timeout=timedelta(seconds=10800),  # 3 hours
             rank=RANK,
-            world_size=world_size)
+            world_size=world_size,
+        )
 
     def _setup_train(self, world_size):
         """Builds dataloaders and optimizer on correct rank process."""
 
         # Model
-        self.run_callbacks('on_pretrain_routine_start')
+        self.run_callbacks("on_pretrain_routine_start")
         ckpt = self.setup_model()
         self.model = self.model.to(self.device)
         self.set_model_attributes()
 
         # Freeze layers
-        freeze_list = self.args.freeze if isinstance(
-            self.args.freeze, list) else range(self.args.freeze) if isinstance(self.args.freeze, int) else []
-        always_freeze_names = ['.dfl']  # always freeze these layers
-        freeze_layer_names = [f'model.{x}.' for x in freeze_list] + always_freeze_names
+        freeze_list = (
+            self.args.freeze
+            if isinstance(self.args.freeze, list)
+            else range(self.args.freeze)
+            if isinstance(self.args.freeze, int)
+            else []
+        )
+        always_freeze_names = [".dfl"]  # always freeze these layers
+        freeze_layer_names = [f"model.{x}." for x in freeze_list] + always_freeze_names
         for k, v in self.model.named_parameters():
             # v.register_hook(lambda x: torch.nan_to_num(x))  # NaN to 0 (commented for erratic training results)
             if any(x in k for x in freeze_layer_names):
                 LOGGER.info(f"Freezing layer '{k}'")
                 v.requires_grad = False
             elif not v.requires_grad:
-                LOGGER.info(f"WARNING ⚠️ setting 'requires_grad=True' for frozen layer '{k}'. "
-                            'See ultralytics.engine.trainer for customization of frozen layers.')
+                LOGGER.info(
+                    f"WARNING ⚠️ setting 'requires_grad=True' for frozen layer '{k}'. "
+                    "See ultralytics.engine.trainer for customization of frozen layers."
+                )
                 v.requires_grad = True
 
         # Check AMP
@@ -246,7 +273,7 @@ class BaseTrainer:
             self.model = nn.parallel.DistributedDataParallel(self.model, device_ids=[RANK])
 
         # Check imgsz
-        gs = max(int(self.model.stride.max() if hasattr(self.model, 'stride') else 32), 32)  # grid size (max stride)
+        gs = max(int(self.model.stride.max() if hasattr(self.model, "stride") else 32), 32)  # grid size (max stride)
         self.args.imgsz = check_imgsz(self.args.imgsz, stride=gs, floor=gs, max_dim=1)
         self.stride = gs  # for multi-scale training
 
@@ -256,15 +283,14 @@ class BaseTrainer:
 
         # Dataloaders
         batch_size = self.batch_size // max(world_size, 1)
-        self.train_loader = self.get_dataloader(self.trainset, batch_size=batch_size, rank=RANK, mode='train')
+        self.train_loader = self.get_dataloader(self.trainset, batch_size=batch_size, rank=RANK, mode="train")
         if RANK in (-1, 0):
             # NOTE: When training DOTA dataset, double batch size could get OOM cause some images got more than 2000 objects.
-            self.test_loader = self.get_dataloader(self.testset,
-                                                   batch_size=batch_size if self.args.task == 'obb' else batch_size * 2,
-                                                   rank=-1,
-                                                   mode='val')
+            self.test_loader = self.get_dataloader(
+                self.testset, batch_size=batch_size if self.args.task == "obb" else batch_size * 2, rank=-1, mode="val"
+            )
             self.validator = self.get_validator()
-            metric_keys = self.validator.metrics.keys + self.label_loss_items(prefix='val')
+            metric_keys = self.validator.metrics.keys + self.label_loss_items(prefix="val")
             self.metrics = dict(zip(metric_keys, [0] * len(metric_keys)))
             self.ema = ModelEMA(self.model)
             if self.args.plots:
@@ -274,18 +300,20 @@ class BaseTrainer:
         self.accumulate = max(round(self.args.nbs / self.batch_size), 1)  # accumulate loss before optimizing
         weight_decay = self.args.weight_decay * self.batch_size * self.accumulate / self.args.nbs  # scale weight_decay
         iterations = math.ceil(len(self.train_loader.dataset) / max(self.batch_size, self.args.nbs)) * self.epochs
-        self.optimizer = self.build_optimizer(model=self.model,
+        self.optimizer = self.build_optimizer(
+            model=self.model,
             name=self.args.optimizer,
             lr=self.args.lr0,
            momentum=self.args.momentum,
            decay=weight_decay,
-                                              iterations=iterations)
+            iterations=iterations,
+        )
         # Scheduler
         self._setup_scheduler()
         self.stopper, self.stop = EarlyStopping(patience=self.args.patience), False
         self.resume_training(ckpt)
         self.scheduler.last_epoch = self.start_epoch - 1  # do not move
-        self.run_callbacks('on_pretrain_routine_end')
+        self.run_callbacks("on_pretrain_routine_end")
 
     def _do_train(self, world_size=1):
         """Train completed, evaluate and plot if specified by arguments."""
@@ -299,19 +327,23 @@ class BaseTrainer:
         self.epoch_time = None
         self.epoch_time_start = time.time()
         self.train_time_start = time.time()
-        self.run_callbacks('on_train_start')
-        LOGGER.info(f'Image sizes {self.args.imgsz} train, {self.args.imgsz} val\n'
+        self.run_callbacks("on_train_start")
+        LOGGER.info(
+            f'Image sizes {self.args.imgsz} train, {self.args.imgsz} val\n'
             f'Using {self.train_loader.num_workers * (world_size or 1)} dataloader workers\n'
             f"Logging results to {colorstr('bold', self.save_dir)}\n"
             f'Starting training for '
-                    f'{self.args.time} hours...' if self.args.time else f'{self.epochs} epochs...')
+            f'{self.args.time} hours...'
+            if self.args.time
+            else f"{self.epochs} epochs..."
+        )
         if self.args.close_mosaic:
             base_idx = (self.epochs - self.args.close_mosaic) * nb
             self.plot_idx.extend([base_idx, base_idx + 1, base_idx + 2])
         epoch = self.epochs  # predefine for resume fully trained model edge cases
         for epoch in range(self.start_epoch, self.epochs):
             self.epoch = epoch
-            self.run_callbacks('on_train_epoch_start')
+            self.run_callbacks("on_train_epoch_start")
             self.model.train()
             if RANK != -1:
                 self.train_loader.sampler.set_epoch(epoch)
@@ -327,7 +359,7 @@ class BaseTrainer:
             self.tloss = None
             self.optimizer.zero_grad()
             for i, batch in pbar:
-                self.run_callbacks('on_train_batch_start')
+                self.run_callbacks("on_train_batch_start")
                 # Warmup
                 ni = i + nb * epoch
                 if ni <= nw:
@@ -335,10 +367,11 @@ class BaseTrainer:
                     self.accumulate = max(1, int(np.interp(ni, xi, [1, self.args.nbs / self.batch_size]).round()))
                     for j, x in enumerate(self.optimizer.param_groups):
                         # Bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
-                        x['lr'] = np.interp(
-                            ni, xi, [self.args.warmup_bias_lr if j == 0 else 0.0, x['initial_lr'] * self.lf(epoch)])
-                        if 'momentum' in x:
-                            x['momentum'] = np.interp(ni, xi, [self.args.warmup_momentum, self.args.momentum])
+                        x["lr"] = np.interp(
+                            ni, xi, [self.args.warmup_bias_lr if j == 0 else 0.0, x["initial_lr"] * self.lf(epoch)]
+                        )
+                        if "momentum" in x:
+                            x["momentum"] = np.interp(ni, xi, [self.args.warmup_momentum, self.args.momentum])
 
                 # Forward
                 with torch.cuda.amp.autocast(self.amp):
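For reference, the warmup block above ramps each parameter group linearly with `np.interp` over the first `nw` batches: bias groups fall from `warmup_bias_lr` toward the scheduled LR while other groups rise from 0, and momentum climbs to its final value. A simplified sketch with assumed hyperparameters (the real code scales the target by `x["initial_lr"] * self.lf(epoch)`; constants here are illustrative only):

import numpy as np

# hypothetical warmup settings for illustration
nw, lr0, warmup_bias_lr, warmup_momentum, momentum = 100, 0.01, 0.1, 0.8, 0.937
xi = [0, nw]  # interpolation range over the first nw batches

for ni in (0, 50, 100):  # batches seen since training start
    bias_lr = np.interp(ni, xi, [warmup_bias_lr, lr0])  # falls 0.1 -> 0.01
    other_lr = np.interp(ni, xi, [0.0, lr0])  # rises 0.0 -> 0.01
    mom = np.interp(ni, xi, [warmup_momentum, momentum])  # rises 0.8 -> 0.937
    print(f"ni={ni}: bias_lr={bias_lr:.4f}, other_lr={other_lr:.4f}, momentum={mom:.3f}")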
@@ -346,8 +379,9 @@ class BaseTrainer:
                     self.loss, self.loss_items = self.model(batch)
                     if RANK != -1:
                         self.loss *= world_size
-                    self.tloss = (self.tloss * i + self.loss_items) / (i + 1) if self.tloss is not None \
-                        else self.loss_items
+                    self.tloss = (
+                        (self.tloss * i + self.loss_items) / (i + 1) if self.tloss is not None else self.loss_items
+                    )
 
                 # Backward
                 self.scaler.scale(self.loss).backward()
@@ -368,24 +402,25 @@ class BaseTrainer:
                             break
 
                 # Log
-                mem = f'{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G'  # (GB)
+                mem = f"{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G"  # (GB)
                 loss_len = self.tloss.shape[0] if len(self.tloss.size()) else 1
                 losses = self.tloss if loss_len > 1 else torch.unsqueeze(self.tloss, 0)
                 if RANK in (-1, 0):
                     pbar.set_description(
-                        ('%11s' * 2 + '%11.4g' * (2 + loss_len)) %
-                        (f'{epoch + 1}/{self.epochs}', mem, *losses, batch['cls'].shape[0], batch['img'].shape[-1]))
-                    self.run_callbacks('on_batch_end')
+                        ("%11s" * 2 + "%11.4g" * (2 + loss_len))
+                        % (f"{epoch + 1}/{self.epochs}", mem, *losses, batch["cls"].shape[0], batch["img"].shape[-1])
+                    )
+                    self.run_callbacks("on_batch_end")
                     if self.args.plots and ni in self.plot_idx:
                         self.plot_training_samples(batch, ni)
 
-                self.run_callbacks('on_train_batch_end')
+                self.run_callbacks("on_train_batch_end")
 
-            self.lr = {f'lr/pg{ir}': x['lr'] for ir, x in enumerate(self.optimizer.param_groups)}  # for loggers
-            self.run_callbacks('on_train_epoch_end')
+            self.lr = {f"lr/pg{ir}": x["lr"] for ir, x in enumerate(self.optimizer.param_groups)}  # for loggers
+            self.run_callbacks("on_train_epoch_end")
             if RANK in (-1, 0):
                 final_epoch = epoch + 1 == self.epochs
-                self.ema.update_attr(self.model, include=['yaml', 'nc', 'args', 'names', 'stride', 'class_weights'])
+                self.ema.update_attr(self.model, include=["yaml", "nc", "args", "names", "stride", "class_weights"])
 
                 # Validation
                 if self.args.val or final_epoch or self.stopper.possible_stop or self.stop:
@@ -398,14 +433,14 @@ class BaseTrainer:
                 # Save model
                 if self.args.save or final_epoch:
                     self.save_model()
-                    self.run_callbacks('on_model_save')
+                    self.run_callbacks("on_model_save")
 
             # Scheduler
             t = time.time()
             self.epoch_time = t - self.epoch_time_start
             self.epoch_time_start = t
             with warnings.catch_warnings():
-                warnings.simplefilter('ignore')  # suppress 'Detected lr_scheduler.step() before optimizer.step()'
+                warnings.simplefilter("ignore")  # suppress 'Detected lr_scheduler.step() before optimizer.step()'
                 if self.args.time:
                     mean_epoch_time = (t - self.train_time_start) / (epoch - self.start_epoch + 1)
                     self.epochs = self.args.epochs = math.ceil(self.args.time * 3600 / mean_epoch_time)
@@ -413,7 +448,7 @@ class BaseTrainer:
                     self.scheduler.last_epoch = self.epoch  # do not move
                     self.stop |= epoch >= self.epochs  # stop if exceeded epochs
                 self.scheduler.step()
-            self.run_callbacks('on_fit_epoch_end')
+            self.run_callbacks("on_fit_epoch_end")
             torch.cuda.empty_cache()  # clear GPU memory at end of epoch, may help reduce CUDA out of memory errors
 
             # Early Stopping
@@ -426,39 +461,43 @@ class BaseTrainer:
 
         if RANK in (-1, 0):
             # Do final val with best.pt
-            LOGGER.info(f'\n{epoch - self.start_epoch + 1} epochs completed in '
-                        f'{(time.time() - self.train_time_start) / 3600:.3f} hours.')
+            LOGGER.info(
+                f"\n{epoch - self.start_epoch + 1} epochs completed in "
+                f"{(time.time() - self.train_time_start) / 3600:.3f} hours."
+            )
             self.final_eval()
             if self.args.plots:
                 self.plot_metrics()
-            self.run_callbacks('on_train_end')
+            self.run_callbacks("on_train_end")
         torch.cuda.empty_cache()
-        self.run_callbacks('teardown')
+        self.run_callbacks("teardown")
 
     def save_model(self):
         """Save model training checkpoints with additional metadata."""
         import pandas as pd  # scope for faster startup
-        metrics = {**self.metrics, **{'fitness': self.fitness}}
-        results = {k.strip(): v for k, v in pd.read_csv(self.csv).to_dict(orient='list').items()}
+
+        metrics = {**self.metrics, **{"fitness": self.fitness}}
+        results = {k.strip(): v for k, v in pd.read_csv(self.csv).to_dict(orient="list").items()}
         ckpt = {
-            'epoch': self.epoch,
-            'best_fitness': self.best_fitness,
-            'model': deepcopy(de_parallel(self.model)).half(),
-            'ema': deepcopy(self.ema.ema).half(),
-            'updates': self.ema.updates,
-            'optimizer': self.optimizer.state_dict(),
-            'train_args': vars(self.args),  # save as dict
-            'train_metrics': metrics,
-            'train_results': results,
-            'date': datetime.now().isoformat(),
-            'version': __version__}
+            "epoch": self.epoch,
+            "best_fitness": self.best_fitness,
+            "model": deepcopy(de_parallel(self.model)).half(),
+            "ema": deepcopy(self.ema.ema).half(),
+            "updates": self.ema.updates,
+            "optimizer": self.optimizer.state_dict(),
+            "train_args": vars(self.args),  # save as dict
+            "train_metrics": metrics,
+            "train_results": results,
+            "date": datetime.now().isoformat(),
+            "version": __version__,
+        }
 
         # Save last and best
         torch.save(ckpt, self.last)
         if self.best_fitness == self.fitness:
             torch.save(ckpt, self.best)
         if (self.save_period > 0) and (self.epoch > 0) and (self.epoch % self.save_period == 0):
-            torch.save(ckpt, self.wdir / f'epoch{self.epoch}.pt')
+            torch.save(ckpt, self.wdir / f"epoch{self.epoch}.pt")
 
     @staticmethod
     def get_dataset(data):
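The checkpoint dictionary written by `save_model()` above can be inspected offline with `torch.load`; a minimal sketch, assuming a hypothetical run directory (the model and EMA entries are stored in half precision alongside the optimizer state and metadata):

import torch

ckpt = torch.load("runs/detect/train/weights/last.pt", map_location="cpu")  # hypothetical path
print(ckpt["epoch"], ckpt["best_fitness"], ckpt["version"])  # metadata saved next to the weights
model = ckpt["ema"] or ckpt["model"]  # prefer EMA weights when present
state_dict = model.float().state_dict()  # stored via half(); cast back before further use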
@@ -467,7 +506,7 @@ class BaseTrainer:
 
         Returns None if data format is not recognized.
         """
-        return data['train'], data.get('val') or data.get('test')
+        return data["train"], data.get("val") or data.get("test")
 
     def setup_model(self):
         """Load/create/download model for any task."""
@@ -476,9 +515,9 @@ class BaseTrainer:
 
         model, weights = self.model, None
         ckpt = None
-        if str(model).endswith('.pt'):
+        if str(model).endswith(".pt"):
             weights, ckpt = attempt_load_one_weight(model)
-            cfg = ckpt['model'].yaml
+            cfg = ckpt["model"].yaml
         else:
             cfg = model
         self.model = self.get_model(cfg=cfg, weights=weights, verbose=RANK == -1)  # calls Model(cfg, weights)
@@ -505,7 +544,7 @@ class BaseTrainer:
         The returned dict is expected to contain "fitness" key.
         """
         metrics = self.validator(self)
-        fitness = metrics.pop('fitness', -self.loss.detach().cpu().numpy())  # use loss as fitness measure if not found
+        fitness = metrics.pop("fitness", -self.loss.detach().cpu().numpy())  # use loss as fitness measure if not found
         if not self.best_fitness or self.best_fitness < fitness:
             self.best_fitness = fitness
         return metrics, fitness
@@ -516,24 +555,24 @@ class BaseTrainer:
 
     def get_validator(self):
         """Returns a NotImplementedError when the get_validator function is called."""
-        raise NotImplementedError('get_validator function not implemented in trainer')
+        raise NotImplementedError("get_validator function not implemented in trainer")
 
-    def get_dataloader(self, dataset_path, batch_size=16, rank=0, mode='train'):
+    def get_dataloader(self, dataset_path, batch_size=16, rank=0, mode="train"):
         """Returns dataloader derived from torch.data.Dataloader."""
-        raise NotImplementedError('get_dataloader function not implemented in trainer')
+        raise NotImplementedError("get_dataloader function not implemented in trainer")
 
-    def build_dataset(self, img_path, mode='train', batch=None):
+    def build_dataset(self, img_path, mode="train", batch=None):
         """Build dataset."""
-        raise NotImplementedError('build_dataset function not implemented in trainer')
+        raise NotImplementedError("build_dataset function not implemented in trainer")
 
-    def label_loss_items(self, loss_items=None, prefix='train'):
+    def label_loss_items(self, loss_items=None, prefix="train"):
         """Returns a loss dict with labelled training loss items tensor."""
         # Not needed for classification but necessary for segmentation & detection
-        return {'loss': loss_items} if loss_items is not None else ['loss']
+        return {"loss": loss_items} if loss_items is not None else ["loss"]
 
     def set_model_attributes(self):
         """To set or update model parameters before training."""
-        self.model.names = self.data['names']
+        self.model.names = self.data["names"]
 
     def build_targets(self, preds, targets):
         """Builds target tensors for training YOLO model."""
@@ -541,7 +580,7 @@ class BaseTrainer:
 
     def progress_string(self):
         """Returns a string describing training progress."""
-        return ''
+        return ""
 
     # TODO: may need to put these following functions into callback
     def plot_training_samples(self, batch, ni):
@@ -556,9 +595,9 @@ class BaseTrainer:
         """Saves training metrics to a CSV file."""
         keys, vals = list(metrics.keys()), list(metrics.values())
         n = len(metrics) + 1  # number of cols
-        s = '' if self.csv.exists() else (('%23s,' * n % tuple(['epoch'] + keys)).rstrip(',') + '\n')  # header
-        with open(self.csv, 'a') as f:
-            f.write(s + ('%23.5g,' * n % tuple([self.epoch + 1] + vals)).rstrip(',') + '\n')
+        s = "" if self.csv.exists() else (("%23s," * n % tuple(["epoch"] + keys)).rstrip(",") + "\n")  # header
+        with open(self.csv, "a") as f:
+            f.write(s + ("%23.5g," * n % tuple([self.epoch + 1] + vals)).rstrip(",") + "\n")
 
     def plot_metrics(self):
         """Plot and display metrics visually."""
@@ -567,7 +606,7 @@ class BaseTrainer:
     def on_plot(self, name, data=None):
         """Registers plots (e.g. to be consumed in callbacks)"""
         path = Path(name)
-        self.plots[path] = {'data': data, 'timestamp': time.time()}
+        self.plots[path] = {"data": data, "timestamp": time.time()}
 
     def final_eval(self):
         """Performs final evaluation and validation for object detection YOLO model."""
@@ -575,11 +614,11 @@ class BaseTrainer:
             if f.exists():
                 strip_optimizer(f)  # strip optimizers
                 if f is self.best:
-                    LOGGER.info(f'\nValidating {f}...')
+                    LOGGER.info(f"\nValidating {f}...")
                     self.validator.args.plots = self.args.plots
                     self.metrics = self.validator(model=f)
-                    self.metrics.pop('fitness', None)
-                    self.run_callbacks('on_fit_epoch_end')
+                    self.metrics.pop("fitness", None)
+                    self.run_callbacks("on_fit_epoch_end")
 
     def check_resume(self, overrides):
         """Check if resume checkpoint exists and update arguments accordingly."""
@@ -591,19 +630,21 @@ class BaseTrainer:
 
                 # Check that resume data YAML exists, otherwise strip to force re-download of dataset
                 ckpt_args = attempt_load_weights(last).args
-                if not Path(ckpt_args['data']).exists():
-                    ckpt_args['data'] = self.args.data
+                if not Path(ckpt_args["data"]).exists():
+                    ckpt_args["data"] = self.args.data
 
                 resume = True
                 self.args = get_cfg(ckpt_args)
                 self.args.model = str(last)  # reinstate model
-                for k in 'imgsz', 'batch':  # allow arg updates to reduce memory on resume if crashed due to CUDA OOM
+                for k in "imgsz", "batch":  # allow arg updates to reduce memory on resume if crashed due to CUDA OOM
                     if k in overrides:
                         setattr(self.args, k, overrides[k])
 
             except Exception as e:
-                raise FileNotFoundError('Resume checkpoint not found. Please pass a valid checkpoint to resume from, '
-                                        "i.e. 'yolo train resume model=path/to/last.pt'") from e
+                raise FileNotFoundError(
+                    "Resume checkpoint not found. Please pass a valid checkpoint to resume from, "
+                    "i.e. 'yolo train resume model=path/to/last.pt'"
+                ) from e
             self.resume = resume
 
     def resume_training(self, ckpt):
@@ -611,23 +652,26 @@ class BaseTrainer:
         if ckpt is None:
             return
         best_fitness = 0.0
-        start_epoch = ckpt['epoch'] + 1
-        if ckpt['optimizer'] is not None:
-            self.optimizer.load_state_dict(ckpt['optimizer'])  # optimizer
-            best_fitness = ckpt['best_fitness']
-        if self.ema and ckpt.get('ema'):
-            self.ema.ema.load_state_dict(ckpt['ema'].float().state_dict())  # EMA
-            self.ema.updates = ckpt['updates']
+        start_epoch = ckpt["epoch"] + 1
+        if ckpt["optimizer"] is not None:
+            self.optimizer.load_state_dict(ckpt["optimizer"])  # optimizer
+            best_fitness = ckpt["best_fitness"]
+        if self.ema and ckpt.get("ema"):
+            self.ema.ema.load_state_dict(ckpt["ema"].float().state_dict())  # EMA
+            self.ema.updates = ckpt["updates"]
         if self.resume:
-            assert start_epoch > 0, \
-                f'{self.args.model} training to {self.epochs} epochs is finished, nothing to resume.\n' \
+            assert start_epoch > 0, (
+                f"{self.args.model} training to {self.epochs} epochs is finished, nothing to resume.\n"
                 f"Start a new training without resuming, i.e. 'yolo train model={self.args.model}'"
+            )
             LOGGER.info(
-                f'Resuming training from {self.args.model} from epoch {start_epoch + 1} to {self.epochs} total epochs')
+                f"Resuming training from {self.args.model} from epoch {start_epoch + 1} to {self.epochs} total epochs"
+            )
         if self.epochs < start_epoch:
             LOGGER.info(
-                f"{self.model} has been trained for {ckpt['epoch']} epochs. Fine-tuning for {self.epochs} more epochs.")
-            self.epochs += ckpt['epoch']  # finetune additional epochs
+                f"{self.model} has been trained for {ckpt['epoch']} epochs. Fine-tuning for {self.epochs} more epochs."
+            )
+            self.epochs += ckpt["epoch"]  # finetune additional epochs
         self.best_fitness = best_fitness
         self.start_epoch = start_epoch
         if start_epoch > (self.epochs - self.args.close_mosaic):
@@ -635,13 +679,13 @@ class BaseTrainer:
 
     def _close_dataloader_mosaic(self):
         """Update dataloaders to stop using mosaic augmentation."""
-        if hasattr(self.train_loader.dataset, 'mosaic'):
+        if hasattr(self.train_loader.dataset, "mosaic"):
             self.train_loader.dataset.mosaic = False
-        if hasattr(self.train_loader.dataset, 'close_mosaic'):
-            LOGGER.info('Closing dataloader mosaic')
+        if hasattr(self.train_loader.dataset, "close_mosaic"):
+            LOGGER.info("Closing dataloader mosaic")
             self.train_loader.dataset.close_mosaic(hyp=self.args)
 
-    def build_optimizer(self, model, name='auto', lr=0.001, momentum=0.9, decay=1e-5, iterations=1e5):
+    def build_optimizer(self, model, name="auto", lr=0.001, momentum=0.9, decay=1e-5, iterations=1e5):
         """
         Constructs an optimizer for the given model, based on the specified optimizer name, learning rate, momentum,
         weight decay, and number of iterations.
@@ -661,41 +705,45 @@ class BaseTrainer:
         """
 
         g = [], [], []  # optimizer parameter groups
-        bn = tuple(v for k, v in nn.__dict__.items() if 'Norm' in k)  # normalization layers, i.e. BatchNorm2d()
-        if name == 'auto':
-            LOGGER.info(f"{colorstr('optimizer:')} 'optimizer=auto' found, "
+        bn = tuple(v for k, v in nn.__dict__.items() if "Norm" in k)  # normalization layers, i.e. BatchNorm2d()
+        if name == "auto":
+            LOGGER.info(
+                f"{colorstr('optimizer:')} 'optimizer=auto' found, "
                 f"ignoring 'lr0={self.args.lr0}' and 'momentum={self.args.momentum}' and "
-                        f"determining best 'optimizer', 'lr0' and 'momentum' automatically... ")
-            nc = getattr(model, 'nc', 10)  # number of classes
+                f"determining best 'optimizer', 'lr0' and 'momentum' automatically... "
+            )
+            nc = getattr(model, "nc", 10)  # number of classes
             lr_fit = round(0.002 * 5 / (4 + nc), 6)  # lr0 fit equation to 6 decimal places
-            name, lr, momentum = ('SGD', 0.01, 0.9) if iterations > 10000 else ('AdamW', lr_fit, 0.9)
+            name, lr, momentum = ("SGD", 0.01, 0.9) if iterations > 10000 else ("AdamW", lr_fit, 0.9)
             self.args.warmup_bias_lr = 0.0  # no higher than 0.01 for Adam
 
         for module_name, module in model.named_modules():
             for param_name, param in module.named_parameters(recurse=False):
-                fullname = f'{module_name}.{param_name}' if module_name else param_name
-                if 'bias' in fullname:  # bias (no decay)
+                fullname = f"{module_name}.{param_name}" if module_name else param_name
+                if "bias" in fullname:  # bias (no decay)
                     g[2].append(param)
                 elif isinstance(module, bn):  # weight (no decay)
                     g[1].append(param)
                 else:  # weight (with decay)
                     g[0].append(param)
 
-        if name in ('Adam', 'Adamax', 'AdamW', 'NAdam', 'RAdam'):
+        if name in ("Adam", "Adamax", "AdamW", "NAdam", "RAdam"):
             optimizer = getattr(optim, name, optim.Adam)(g[2], lr=lr, betas=(momentum, 0.999), weight_decay=0.0)
-        elif name == 'RMSProp':
+        elif name == "RMSProp":
             optimizer = optim.RMSprop(g[2], lr=lr, momentum=momentum)
-        elif name == 'SGD':
+        elif name == "SGD":
             optimizer = optim.SGD(g[2], lr=lr, momentum=momentum, nesterov=True)
         else:
             raise NotImplementedError(
                 f"Optimizer '{name}' not found in list of available optimizers "
-                f'[Adam, AdamW, NAdam, RAdam, RMSProp, SGD, auto].'
-                'To request support for addition optimizers please visit https://github.com/ultralytics/ultralytics.')
+                f"[Adam, AdamW, NAdam, RAdam, RMSProp, SGD, auto]."
+                "To request support for addition optimizers please visit https://github.com/ultralytics/ultralytics."
+            )
 
-        optimizer.add_param_group({'params': g[0], 'weight_decay': decay})  # add g0 with weight_decay
-        optimizer.add_param_group({'params': g[1], 'weight_decay': 0.0})  # add g1 (BatchNorm2d weights)
+        optimizer.add_param_group({"params": g[0], "weight_decay": decay})  # add g0 with weight_decay
+        optimizer.add_param_group({"params": g[1], "weight_decay": 0.0})  # add g1 (BatchNorm2d weights)
         LOGGER.info(
             f"{colorstr('optimizer:')} {type(optimizer).__name__}(lr={lr}, momentum={momentum}) with parameter groups "
-            f'{len(g[1])} weight(decay=0.0), {len(g[0])} weight(decay={decay}), {len(g[2])} bias(decay=0.0)')
+            f'{len(g[1])} weight(decay=0.0), {len(g[0])} weight(decay={decay}), {len(g[2])} bias(decay=0.0)'
+        )
         return optimizer
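For context on the `optimizer=auto` branch above, the lr0 fit and the 10k-iteration threshold are easy to reproduce in isolation. A small sketch with assumed inputs (nc=80, a COCO-sized run), not part of the patch:

# hypothetical inputs for illustration only
nc = 80  # number of classes
iterations = 12500  # estimated optimizer steps over the whole run

lr_fit = round(0.002 * 5 / (4 + nc), 6)  # 0.000119 for nc=80
name, lr, momentum = ("SGD", 0.01, 0.9) if iterations > 10000 else ("AdamW", lr_fit, 0.9)
print(name, lr, momentum)  # long runs use SGD with lr0=0.01; short runs get AdamW with the fitted lr0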