You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

322 lines
12 KiB

#!/usr/bin/python3
# Copyright (c) ByteDance, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import datetime
import json
import logging
import os
import time
from collections import OrderedDict, defaultdict
from functools import partial
from pprint import pformat
import numpy as np
import torch
import detectron2.utils.comm as comm
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.config import get_cfg
from detectron2.data import MetadataCatalog
from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, hooks, launch, PeriodicWriter
from detectron2.evaluation import (
CityscapesInstanceEvaluator,
CityscapesSemSegEvaluator,
COCOEvaluator,
COCOPanopticEvaluator,
DatasetEvaluators,
LVISEvaluator,
PascalVOCDetectionEvaluator,
SemSegEvaluator,
verify_results,
)
from detectron2.layers import get_norm
from detectron2.modeling import GeneralizedRCNNWithTTA
from detectron2.modeling.roi_heads import ROI_HEADS_REGISTRY, Res5ROIHeads
from detectron2.solver.build import maybe_add_gradient_clipping
from detectron2.utils.events import EventWriter
from lr_decay import get_default_optimizer_params, lr_factor_func
# [modification] for better logging
def _ex_repr(self):
    """
    Build a compact ``ClassName(attr=value, ...)`` string from instance attributes.

    Attributes whose names start with ``__`` or that appear in the exclusion
    set below are omitted. Objects that have no instance ``__dict__`` (for
    example ``__slots__`` classes) produce ``ClassName()`` rather than letting
    ``vars`` raise ``TypeError`` from inside a ``__repr__``.
    """
    # built once per call instead of once per attribute
    hidden = frozenset((
        'trainer', 'before_train', 'after_train', 'before_step', 'after_step', 'state_dict',
        '_model', '_data_loader', 'logger',
    ))
    attrs = getattr(self, '__dict__', {})
    shown = ', '.join(
        f'{name}={value}'
        for name, value in attrs.items()
        if not name.startswith('__') and name not in hidden
    )
    return f'{type(self).__name__}({shown})'
# install the compact repr on the hook and event-writer base classes
setattr(hooks.HookBase, '__repr__', _ex_repr)
setattr(EventWriter, '__repr__', _ex_repr)
# [modification] add norm
@ROI_HEADS_REGISTRY.register()
class Res5ROIHeadsExtraNorm(Res5ROIHeads):
    """
    ``Res5ROIHeads`` variant whose res5 block ends with one additional
    normalization layer (the extra BN after the res5 stage described in the
    MOCO paper).
    """

    def _build_res5_block(self, cfg):
        """Build the parent's res5 block and append a norm layer named ``"norm"``."""
        res5, channels = super()._build_res5_block(cfg)
        # the layer type comes from cfg.MODEL.RESNETS.NORM, sized to the block's output channels
        res5.add_module("norm", get_norm(cfg.MODEL.RESNETS.NORM, channels))
        return res5, channels
class Trainer(DefaultTrainer):
    """
    ``DefaultTrainer`` subclass used by this script.

    It overrides three class-level builders:
      * ``build_optimizer`` selects the optimizer from ``cfg.SOLVER.OPTIMIZER``
        and passes ``lr_factor_func`` to ``get_default_optimizer_params``.
      * ``build_evaluator`` delegates to the module-level ``build_evaluator``.
      * ``test_with_TTA`` evaluates the model wrapped in ``GeneralizedRCNNWithTTA``.
    """

    # [modification] ratio handed to `lr_factor_func` as `dec`; `main` overwrites it
    # with cfg.SOLVER.LR_DECAY before a trainer is built
    lr_decay_ratio: float = 1.0

    @classmethod
    def build_optimizer(cls, cfg, model):
        """
        Create the optimizer for ``model``.

        On the main process, first prints the trainable parameter names grouped
        by the value ``lr_factor_func`` returns for them. Raises ``KeyError`` when
        ``cfg.SOLVER.OPTIMIZER`` is not one of ``sgd`` / ``adamw`` / ``adam``.
        """
        r50 = int(cfg.MODEL.RESNETS.DEPTH) == 50
        decay = cls.lr_decay_ratio

        if comm.is_main_process():
            # debug listing: lr factor -> names of the trainable parameters that receive it
            names_by_factor = defaultdict(list)
            for mod_name, mod in model.named_modules():
                for p_name, p in mod.named_parameters(recurse=False):
                    if p.requires_grad:
                        full_name = f"{mod_name}.{p_name}"
                        factor = lr_factor_func(full_name, is_resnet50=r50, dec=decay, debug=True)
                        names_by_factor[factor].append(full_name)
            for factor in sorted(names_by_factor):
                print(f'[{factor}] {sorted(names_by_factor[factor])}')
            print()

        params = get_default_optimizer_params(
            model,
            base_lr=cfg.SOLVER.BASE_LR,
            weight_decay_norm=cfg.SOLVER.WEIGHT_DECAY_NORM,
            bias_lr_factor=cfg.SOLVER.BIAS_LR_FACTOR,
            weight_decay_bias=cfg.SOLVER.WEIGHT_DECAY_BIAS,
            lr_factor_func=partial(lr_factor_func, is_resnet50=r50, dec=decay, debug=False)
        )

        # NOTE: both 'adamw' and 'adam' resolve to torch.optim.AdamW
        factories = {
            'sgd': partial(torch.optim.SGD, momentum=cfg.SOLVER.MOMENTUM, nesterov=cfg.SOLVER.NESTEROV),
            'adamw': torch.optim.AdamW,
            'adam': torch.optim.AdamW,
        }
        optimizer_factory = factories[cfg.SOLVER.OPTIMIZER.lower()]
        wrapped_factory = maybe_add_gradient_clipping(cfg, optimizer_factory)
        return wrapped_factory(params, lr=cfg.SOLVER.BASE_LR, weight_decay=cfg.SOLVER.WEIGHT_DECAY)

    @classmethod
    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
        """Forward to the module-level ``build_evaluator`` with the same arguments."""
        return build_evaluator(cfg, dataset_name, output_folder)

    @classmethod
    def test_with_TTA(cls, cfg, model):
        """
        Evaluate ``model`` with test-time augmentation on every dataset in
        ``cfg.DATASETS.TEST`` and return the results with ``"_TTA"`` appended
        to each key.
        """
        log = logging.getLogger("detectron2.trainer")
        # Run at the end of training; only some R-CNN models are supported.
        log.info("Running inference with test-time augmentation ...")
        tta_model = GeneralizedRCNNWithTTA(cfg, model)
        tta_dir = os.path.join(cfg.OUTPUT_DIR, "inference_TTA")
        evaluators = [
            cls.build_evaluator(cfg, name, output_folder=tta_dir)
            for name in cfg.DATASETS.TEST
        ]
        results = cls.test(cfg, tta_model, evaluators)
        return OrderedDict((key + "_TTA", value) for key, value in results.items())
def setup(args):
    """
    Build the frozen config for this run.

    Declares the two script-specific solver keys, merges ``args.config_file``
    and then the ``args.opts`` overrides, freezes the config and calls
    detectron2's ``default_setup``.
    """
    cfg = get_cfg()
    # [modification] two new keys, declared before merging so that the config
    # file and command-line opts are able to set them
    cfg.SOLVER.OPTIMIZER = 'sgd'  # SGD by default
    cfg.SOLVER.LR_DECAY = 1.0  # no lr_decay by default
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()
    default_setup(cfg, args)
    return cfg
def main(args):
    """
    Per-process entry point handed to ``launch``.

    With ``args.eval_only`` set, loads ``cfg.MODEL.WEIGHTS`` and returns the
    evaluation results (plus the TTA results when ``cfg.TEST.AUG.ENABLED``).
    Otherwise builds a ``Trainer``, registers the extra hooks and returns
    whatever ``trainer.train()`` returns.
    """
    cfg = setup(args)
    os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)

    # [modification] for implementing lr decay and for logging
    Trainer.lr_decay_ratio = cfg.SOLVER.LR_DECAY

    if args.eval_only:
        model = Trainer.build_model(cfg)
        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
            cfg.MODEL.WEIGHTS, resume=args.resume
        )
        res = Trainer.test(cfg, model)
        if cfg.TEST.AUG.ENABLED:
            res.update(Trainer.test_with_TTA(cfg, model))
        if comm.is_main_process():
            verify_results(cfg, res)
        return res

    # [modification] just skip some warnings: run a few warning-prone calls once
    # while UserWarning is filtered out, then restore the warning filters
    import warnings
    comm.synchronize()
    warnings.filterwarnings('ignore', category=UserWarning)
    # `np.int` / `np.bool` were removed in NumPy 1.24; fall back to the builtins
    # they aliased so this warm-up cannot fail with AttributeError
    np_int, np_bool = getattr(np, 'int', int), getattr(np, 'bool', bool)
    _ = np.arange(3, dtype=np_int).astype(np_bool)
    _ = np.array(torch.ones(3, dtype=torch.int32).numpy(), dtype=np_int)
    _ = np.array(torch.ones(3, dtype=torch.int64).numpy(), dtype=np_int)
    _ = np.array(torch.ones(3, dtype=torch.long).numpy(), dtype=np_int)
    _ = torch.rand(100) // 5
    _ = torch.meshgrid(torch.ones(1))
    warnings.resetwarnings()
    comm.synchronize()

    # If you'd like to do anything fancier than the standard training logic,
    # consider writing your own training loop (see plain_train_net.py) or
    # subclassing the trainer.
    trainer = Trainer(cfg)
    trainer.resume_or_load(resume=args.resume)
    for h in trainer._hooks:
        if isinstance(h, PeriodicWriter):
            h._period = 1000  # [modification] less logging

    # [modification] we add some hooks for logging
    # max(..., 1) keeps a `--num-gpus 0` run from dividing by zero
    is_local_master = comm.get_rank() % max(args.num_gpus, 1) == 0
    if comm.is_main_process():
        print(f'[default hooks] {pformat(trainer._hooks, indent=2, width=300)}')

    # entries that do not apply are None
    ex_hooks = [
        hooks.EvalHook(0, lambda: trainer.test_with_TTA(cfg, trainer.model)) if cfg.TEST.AUG.ENABLED else None,
        LogHook(cfg.TEST.EVAL_PERIOD, args.config_file, cfg.OUTPUT_DIR, is_local_master) if comm.is_main_process() else None,
    ]
    trainer.register_hooks(ex_hooks)
    if comm.is_main_process():
        print(f'[extra hooks] {pformat(ex_hooks, indent=2, width=300)}')

    return trainer.train()
# [modification] we add a hook for logging results to `cfg.OUTPUT_DIR/d2_coco_log.txt`
class LogHook(hooks.HookBase):
    """
    Hook that mirrors training progress and evaluation metrics into
    ``<output_dir>/d2_coco_log.txt``; every write replaces the file with the
    current log dict serialized as one JSON object.
    """

    def __init__(self, eval_period, config_file, output_dir, is_local_master):
        """Store the settings; progress is logged every ``eval_period // 4`` iterations."""
        # NOTE: the assignment order below is the order `vars(self)` reports
        self.eval_period = eval_period
        self.log_period = eval_period // 4
        self.log = {}
        self.is_master = comm.is_main_process()
        self.is_local_master = is_local_master
        self.config_file = config_file
        self.out_dir = output_dir
        self.log_txt_name = os.path.join(self.out_dir, 'd2_coco_log.txt')

    def __write_to_log_file(self, d):
        """Merge ``d`` into the log and rewrite the log file (local master only)."""
        if not self.is_local_master:
            return
        self.log.update(d)
        with open(self.log_txt_name, 'w') as fp:
            json.dump(self.log, fp)
            fp.write('\n')

    def update_and_write_to_local_log(self):
        """
        Copy the latest bbox/segm AP metrics from the trainer's storage into
        the log, record the 7 highest- and 7 lowest-scoring ``bbox/AP-*``
        entries as ``easy`` / ``hard``, round every AP value to 3 decimals
        and write the log file.
        """
        latest = self.trainer.storage.latest()
        metric_groups = (
            (('boxAP', 'bbox/AP'), ('bAP50', 'bbox/AP50'), ('bAP75', 'bbox/AP75')),
            (('mskAP', 'segm/AP'), ('mAP50', 'segm/AP50'), ('mAP75', 'segm/AP75')),
            (('bAP-l', 'bbox/APl'), ('bAP-m', 'bbox/APm'), ('bAP-s', 'bbox/APs')),
            (('mAP-l', 'segm/APl'), ('mAP-m', 'segm/APm'), ('mAP-s', 'segm/APs')),
        )
        for group in metric_groups:
            # read the whole group before storing any of it
            self.log.update([(log_key, latest[stat_key][0]) for log_key, stat_key in group])

        # (score, name) pairs sorted ascending by score
        scored = sorted(
            (value[0], key.split('AP-')[-1].strip())
            for key, value in latest.items()
            if key.startswith('bbox/AP-')
        )
        ranked_names = [name for _, name in scored]
        self.log['easy'] = ' | '.join(ranked_names[-7:])
        self.log['hard'] = ' | '.join(ranked_names[:7])

        for key, value in self.log.items():
            if 'AP' in key:
                self.log[key] = round(value, 3)
        self.__write_to_log_file({})

    def after_step(self):
        """Refresh metrics on evaluation iterations and ETA/progress on logging iterations."""
        upcoming = self.trainer.iter + 1
        if self.eval_period > 0 and upcoming % self.eval_period == 0:
            self.update_and_write_to_local_log()

        if self.log_period <= 0 or upcoming % self.log_period != 0:
            return
        eta = round(self.trainer.storage.latest()['eta_seconds'][0])
        self.__write_to_log_file({
            'cfg': self.config_file,
            'rema': str(datetime.timedelta(seconds=eta)),
            'fini': time.strftime("%m-%d %H:%M", time.localtime(time.time() + eta)),
            'cur_iter': f'{upcoming}/{self.trainer.max_iter}',
        })

    def after_train(self):
        """Write the final metrics and, on the main process, print a summary line."""
        self.update_and_write_to_local_log()
        last_boxAP = round(self.log['boxAP'], 3)
        last_mskAP = round(self.log['mskAP'], 3)
        # the recorded finish time is the current time minus 120 seconds
        finished_at = time.strftime("%m-%d %H:%M", time.localtime(time.time() - 120))
        self.__write_to_log_file({
            'rema': '-', 'fini': finished_at,
            'last_boxAP': last_boxAP,
            'last_mskAP': last_mskAP,
        })
        time.sleep(5)
        if self.is_master:
            print(f'\n[finished] ========== last_boxAP={last_boxAP}, last_mskAP={last_mskAP} ==========\n')
def build_evaluator(cfg, dataset_name, output_folder=None):
    """
    Create evaluator(s) for a given dataset.

    This uses the special metadata "evaluator_type" associated with each builtin dataset.
    For your own dataset, you can simply create an evaluator manually in your
    script and do not have to worry about the hacky if-else logic here.

    Args:
        cfg: config; ``cfg.OUTPUT_DIR`` is read when ``output_folder`` is None.
        dataset_name: name looked up in ``MetadataCatalog``.
        output_folder: directory passed to the evaluators; defaults to
            ``<cfg.OUTPUT_DIR>/inference``.

    Returns:
        A single evaluator, or a ``DatasetEvaluators`` wrapping several.

    Raises:
        NotImplementedError: if no evaluator matches the dataset's evaluator type.
    """
    if output_folder is None:
        output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
    evaluator_list = []
    evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type

    # the first three types accumulate, so "coco_panoptic_seg" gets three evaluators
    if evaluator_type in ["sem_seg", "coco_panoptic_seg"]:
        evaluator_list.append(
            SemSegEvaluator(
                dataset_name,
                distributed=True,
                output_dir=output_folder,
            )
        )
    if evaluator_type in ["coco", "coco_panoptic_seg"]:
        evaluator_list.append(COCOEvaluator(dataset_name, output_dir=output_folder))
    if evaluator_type == "coco_panoptic_seg":
        evaluator_list.append(COCOPanopticEvaluator(dataset_name, output_folder))

    # the remaining types each map to exactly one evaluator
    if evaluator_type == "cityscapes_instance":
        return CityscapesInstanceEvaluator(dataset_name)
    if evaluator_type == "cityscapes_sem_seg":
        return CityscapesSemSegEvaluator(dataset_name)
    elif evaluator_type == "pascal_voc":
        return PascalVOCDetectionEvaluator(dataset_name)
    elif evaluator_type == "lvis":
        return LVISEvaluator(dataset_name, output_dir=output_folder)

    if len(evaluator_list) == 0:
        raise NotImplementedError(
            "no Evaluator for the dataset {} with the type {}".format(dataset_name, evaluator_type)
        )
    elif len(evaluator_list) == 1:
        return evaluator_list[0]
    return DatasetEvaluators(evaluator_list)


# Keep this guard below every definition: `launch` can call `main` in this very
# process, and a name defined after the call would not exist yet (NameError
# once `Trainer.build_evaluator` looks up `build_evaluator`).
if __name__ == "__main__":
    args = default_argument_parser().parse_args()
    print("Command Line Args:", args)
    launch(
        main,
        args.num_gpus,
        num_machines=args.num_machines,
        machine_rank=args.machine_rank,
        dist_url=args.dist_url,
        args=(args,),
    )