|
|
|
@ -5,6 +5,7 @@ Simple training loop; Boilerplate that could apply to any arbitrary neural netwo |
|
|
|
|
|
|
|
|
|
import os |
|
|
|
|
import subprocess |
|
|
|
|
import sys |
|
|
|
|
import time |
|
|
|
|
from collections import defaultdict |
|
|
|
|
from copy import deepcopy |
|
|
|
@ -28,10 +29,10 @@ from ultralytics.yolo.utils import (DEFAULT_CFG, LOGGER, RANK, SETTINGS, TQDM_BA |
|
|
|
|
yaml_save) |
|
|
|
|
from ultralytics.yolo.utils.autobatch import check_train_batch_size |
|
|
|
|
from ultralytics.yolo.utils.checks import check_file, check_imgsz, print_args |
|
|
|
|
from ultralytics.yolo.utils.dist import ddp_cleanup, generate_ddp_command |
|
|
|
|
from ultralytics.yolo.utils.dist import ddp_cleanup, generate_ddp_file, find_free_network_port |
|
|
|
|
from ultralytics.yolo.utils.files import get_latest_run, increment_path |
|
|
|
|
from ultralytics.yolo.utils.torch_utils import (EarlyStopping, ModelEMA, de_parallel, init_seeds, one_cycle, |
|
|
|
|
select_device, strip_optimizer) |
|
|
|
|
select_device, strip_optimizer, TORCH_1_9) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class BaseTrainer: |
|
|
|
@ -174,13 +175,18 @@ class BaseTrainer: |
|
|
|
|
|
|
|
|
|
# Run subprocess if DDP training, else train normally |
|
|
|
|
if world_size > 1 and "LOCAL_RANK" not in os.environ: |
|
|
|
|
command = generate_ddp_command(world_size, self) |
|
|
|
|
# cmd, file = generate_ddp_command(world_size, self) # security vulnerability in Snyk scans |
|
|
|
|
file = generate_ddp_file(self) if sys.argv[0].endswith('yolo') else os.path.abspath(sys.argv[0]) |
|
|
|
|
torch_distributed_cmd = "torch.distributed.run" if TORCH_1_9 else "torch.distributed.launch" |
|
|
|
|
cmd = [ |
|
|
|
|
sys.executable, "-m", torch_distributed_cmd, "--nproc_per_node", f"{world_size}", "--master_port", |
|
|
|
|
f"{find_free_network_port()}", file] + sys.argv[1:] |
|
|
|
|
try: |
|
|
|
|
subprocess.run(command) |
|
|
|
|
subprocess.run(cmd, check=True) |
|
|
|
|
except Exception as e: |
|
|
|
|
self.console(e) |
|
|
|
|
self.console.warning(e) |
|
|
|
|
finally: |
|
|
|
|
ddp_cleanup(command, self) |
|
|
|
|
ddp_cleanup(self, file) |
|
|
|
|
else: |
|
|
|
|
self._do_train(int(os.getenv("RANK", -1)), world_size) |
|
|
|
|
|
|
|
|
|