#!/usr/bin/python3

# Copyright (c) ByteDance, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import datetime
import json
import logging
import os
import time
from collections import OrderedDict, defaultdict
from functools import partial
from pprint import pformat

import numpy as np
import torch
import detectron2.utils.comm as comm
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.config import get_cfg
from detectron2.data import MetadataCatalog
from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, hooks, launch, PeriodicWriter
from detectron2.evaluation import (
    CityscapesInstanceEvaluator,
    CityscapesSemSegEvaluator,
    COCOEvaluator,
    COCOPanopticEvaluator,
    DatasetEvaluators,
    LVISEvaluator,
    PascalVOCDetectionEvaluator,
    SemSegEvaluator,
    verify_results,
)
from detectron2.layers import get_norm
from detectron2.modeling import GeneralizedRCNNWithTTA
from detectron2.modeling.roi_heads import ROI_HEADS_REGISTRY, Res5ROIHeads
from detectron2.solver.build import maybe_add_gradient_clipping
from detectron2.utils.events import EventWriter

from lr_decay import get_default_optimizer_params, lr_factor_func


# [modification] for better logging
def _ex_repr(self):
    d = vars(self)
    ex = ', '.join(f'{k}={v}' for k, v in d.items() if not k.startswith('__') and k not in [
        'trainer', 'before_train', 'after_train', 'before_step', 'after_step', 'state_dict',
        '_model', '_data_loader', 'logger',
    ])
    return f'{type(self).__name__}({ex})'
hooks.HookBase.__repr__ = _ex_repr
EventWriter.__repr__ = _ex_repr
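
# With this patch, a printed hook reads like, e.g.,
#   PeriodicWriter(_writers=[...], _period=20)
# instead of `<detectron2.engine.hooks.PeriodicWriter object at 0x...>`.
# (Illustrative output only; the exact fields depend on each hook's attributes.)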


# [modification] add norm
@ROI_HEADS_REGISTRY.register()
class Res5ROIHeadsExtraNorm(Res5ROIHeads):
    """
    As described in the MoCo paper, there is an extra BN layer
    following the res5 stage.
    """

    def _build_res5_block(self, cfg):
        seq, out_channels = super()._build_res5_block(cfg)
        norm = cfg.MODEL.RESNETS.NORM
        norm = get_norm(norm, out_channels)
        seq.add_module("norm", norm)
        return seq, out_channels
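
# For example, with `MODEL.RESNETS.NORM = "BN"` (an illustrative setting),
# `get_norm` returns a BatchNorm2d over `out_channels`, so the res5 head
# ends with its conv blocks followed by one extra norm layer named "norm".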


class Trainer(DefaultTrainer):
    """
    We use the "DefaultTrainer" which contains pre-defined default logic for
    a standard training workflow. It may not work for you, especially if you
    are working on a new research project. In that case you can write your
    own training loop. You can use "tools/plain_train_net.py" as an example.
    """

    # [modification] override `build_optimizer` to support Adam(W) and layer-wise lr decay
    lr_decay_ratio: float = 1.0

    @classmethod
    def build_optimizer(cls, cfg, model):
        is_resnet50 = int(cfg.MODEL.RESNETS.DEPTH) == 50
        if comm.is_main_process():
            # debug print: group parameter names by their lr factor
            dbg = defaultdict(list)
            for module_name, module in model.named_modules():
                for module_param_name, value in module.named_parameters(recurse=False):
                    if not value.requires_grad:
                        continue
                    lrf = lr_factor_func(f"{module_name}.{module_param_name}", is_resnet50=is_resnet50, dec=cls.lr_decay_ratio, debug=True)
                    dbg[lrf].append(f"{module_name}.{module_param_name}")
            for k in sorted(dbg.keys()):
                print(f'[{k}] {sorted(dbg[k])}')
            print()

        params = get_default_optimizer_params(
            model,
            base_lr=cfg.SOLVER.BASE_LR,
            weight_decay_norm=cfg.SOLVER.WEIGHT_DECAY_NORM,
            bias_lr_factor=cfg.SOLVER.BIAS_LR_FACTOR,
            weight_decay_bias=cfg.SOLVER.WEIGHT_DECAY_BIAS,
            lr_factor_func=partial(lr_factor_func, is_resnet50=is_resnet50, dec=cls.lr_decay_ratio, debug=False),
        )

        opt_clz = {
            'sgd': partial(torch.optim.SGD, momentum=cfg.SOLVER.MOMENTUM, nesterov=cfg.SOLVER.NESTEROV),
            'adamw': torch.optim.AdamW,
            'adam': torch.optim.AdamW,  # note: 'adam' maps to AdamW as well (no plain Adam here)
        }[cfg.SOLVER.OPTIMIZER.lower()]
        return maybe_add_gradient_clipping(cfg, opt_clz)(params, lr=cfg.SOLVER.BASE_LR, weight_decay=cfg.SOLVER.WEIGHT_DECAY)
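
# A sketch of the layer-wise decay rule implemented in `lr_decay.lr_factor_func`
# (the exact layer indexing lives in that module; the formula below is the usual
# scheme, not its verbatim code): a parameter in depth-block `i` out of `N` gets
#   lr = SOLVER.BASE_LR * lr_decay_ratio ** (N - i)
# so shallower backbone layers are fine-tuned with smaller learning rates.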

    @classmethod
    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
        return build_evaluator(cfg, dataset_name, output_folder)

    @classmethod
    def test_with_TTA(cls, cfg, model):
        logger = logging.getLogger("detectron2.trainer")
        # At the end of training, run an evaluation with TTA.
        # Only supports some R-CNN models.
        logger.info("Running inference with test-time augmentation ...")
        model = GeneralizedRCNNWithTTA(cfg, model)
        evaluators = [
            cls.build_evaluator(
                cfg, name, output_folder=os.path.join(cfg.OUTPUT_DIR, "inference_TTA")
            )
            for name in cfg.DATASETS.TEST
        ]
        res = cls.test(cfg, model, evaluators)
        res = OrderedDict({k + "_TTA": v for k, v in res.items()})
        return res
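
# Note: `cls.test` returns a dict of metrics per task, so after the renaming
# above the keys read e.g. "bbox_TTA" / "segm_TTA" (illustrative key names).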


def setup(args):
    """
    Create the config and perform basic setup.
    """
    cfg = get_cfg()
    # [modification] add two new config keys; default to SGD with no lr decay
    cfg.SOLVER.OPTIMIZER, cfg.SOLVER.LR_DECAY = 'sgd', 1.0
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()
    default_setup(cfg, args)
    return cfg
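
# Example invocation (script name, config path, and override values are illustrative):
#   python train_net.py --config-file configs/coco_R_50_C4_1x.yaml --num-gpus 8 \
#       SOLVER.OPTIMIZER adamw SOLVER.LR_DECAY 0.7
# `SOLVER.OPTIMIZER` and `SOLVER.LR_DECAY` are the two keys added above; all
# other keys are standard detectron2 config options.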


def main(args):
    cfg = setup(args)
    os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)

    # [modification] for implementing lr decay and for logging
    Trainer.lr_decay_ratio = cfg.SOLVER.LR_DECAY

    if args.eval_only:
        model = Trainer.build_model(cfg)
        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
            cfg.MODEL.WEIGHTS, resume=args.resume
        )
        res = Trainer.test(cfg, model)
        if cfg.TEST.AUG.ENABLED:
            res.update(Trainer.test_with_TTA(cfg, model))
        if comm.is_main_process():
            verify_results(cfg, res)
        return res

    # [modification] pre-trigger some deprecation warnings while a filter is active,
    # so they are registered as already seen and do not spam the training log later.
    # NOTE: the `np.int` / `np.bool` aliases were removed in NumPy >= 1.24, where they
    # raise AttributeError instead of warning; the try/except keeps this best-effort
    # warm-up from crashing there.
    import warnings
    comm.synchronize()
    warnings.filterwarnings('ignore', category=UserWarning)
    try:
        _ = np.arange(3, dtype=np.int).astype(np.bool)
        _ = np.array(torch.ones(3, dtype=torch.int32).numpy(), dtype=np.int)
        _ = np.array(torch.ones(3, dtype=torch.int64).numpy(), dtype=np.int)
        _ = np.array(torch.ones(3, dtype=torch.long).numpy(), dtype=np.int)
    except AttributeError:
        pass  # NumPy >= 1.24: the deprecated aliases are already gone
    _ = torch.rand(100) // 5
    _ = torch.meshgrid(torch.ones(1))
    warnings.resetwarnings()
    comm.synchronize()

    """
    If you'd like to do anything fancier than the standard training logic,
    consider writing your own training loop (see plain_train_net.py) or
    subclassing the trainer.
    """
    trainer = Trainer(cfg)
    trainer.resume_or_load(resume=args.resume)
    for h in trainer._hooks:
        if isinstance(h, PeriodicWriter):
            h._period = 1000  # [modification] write logs less frequently

    # [modification] register some extra hooks for logging
    is_local_master = comm.get_rank() % args.num_gpus == 0
    if comm.is_main_process():
        print(f'[default hooks] {pformat(trainer._hooks, indent=2, width=300)}')
    ex_hooks = [
        hooks.EvalHook(0, lambda: trainer.test_with_TTA(cfg, trainer.model)) if cfg.TEST.AUG.ENABLED else None,
        LogHook(cfg.TEST.EVAL_PERIOD, args.config_file, cfg.OUTPUT_DIR, is_local_master) if comm.is_main_process() else None,
    ]
    trainer.register_hooks(ex_hooks)  # `register_hooks` filters out the None entries
    if comm.is_main_process():
        print(f'[extra hooks] {pformat(ex_hooks, indent=2, width=300)}')

    return trainer.train()


# [modification] a hook that logs results to `cfg.OUTPUT_DIR/d2_coco_log.txt`
class LogHook(hooks.HookBase):
    def __init__(self, eval_period, config_file, output_dir, is_local_master):
        self.eval_period = eval_period
        self.log_period = eval_period // 4
        self.log = {}

        self.is_master = comm.is_main_process()
        self.is_local_master = is_local_master

        self.config_file = config_file
        self.out_dir = output_dir
        self.log_txt_name = os.path.join(self.out_dir, 'd2_coco_log.txt')

    def __write_to_log_file(self, d):
        if self.is_local_master:
            self.log.update(d)
            # rewrite the whole file each time: it always holds one JSON line with the latest status
            with open(self.log_txt_name, 'w') as fp:
                json.dump(self.log, fp)
                fp.write('\n')

    def update_and_write_to_local_log(self):
        stat = self.trainer.storage.latest()
        self.log['boxAP'], self.log['bAP50'], self.log['bAP75'] = stat['bbox/AP'][0], stat['bbox/AP50'][0], stat['bbox/AP75'][0]
        self.log['mskAP'], self.log['mAP50'], self.log['mAP75'] = stat['segm/AP'][0], stat['segm/AP50'][0], stat['segm/AP75'][0]
        self.log['bAP-l'], self.log['bAP-m'], self.log['bAP-s'] = stat['bbox/APl'][0], stat['bbox/APm'][0], stat['bbox/APs'][0]
        self.log['mAP-l'], self.log['mAP-m'], self.log['mAP-s'] = stat['segm/APl'][0], stat['segm/APm'][0], stat['segm/APs'][0]
        # rank per-category box APs to record the easiest and hardest classes
        all_ap = sorted([(v[0], k.split('AP-')[-1].strip()) for k, v in stat.items() if k.startswith('bbox/AP-')])
        all_ap = [tu[1] for tu in all_ap]
        self.log['easy'] = ' | '.join(all_ap[-7:])
        self.log['hard'] = ' | '.join(all_ap[:7])
        for k in self.log.keys():
            if 'AP' in k:
                self.log[k] = round(self.log[k], 3)
        self.__write_to_log_file({})

    def after_step(self):
        next_iter = self.trainer.iter + 1
        if self.eval_period > 0 and next_iter % self.eval_period == 0:
            self.update_and_write_to_local_log()

        if self.log_period > 0 and next_iter % self.log_period == 0:
            stat = self.trainer.storage.latest()
            remain_secs = round(stat['eta_seconds'][0])
            d = {
                'cfg': self.config_file,
                'rema': str(datetime.timedelta(seconds=remain_secs)),
                'fini': time.strftime("%m-%d %H:%M", time.localtime(time.time() + remain_secs)),
                'cur_iter': f'{next_iter}/{self.trainer.max_iter}',
            }
            self.__write_to_log_file(d)

    def after_train(self):
        self.update_and_write_to_local_log()
        last_boxAP, last_mskAP = round(self.log['boxAP'], 3), round(self.log['mskAP'], 3)
        self.__write_to_log_file({
            'rema': '-',
            'fini': time.strftime("%m-%d %H:%M", time.localtime(time.time() - 120)),
            'last_boxAP': last_boxAP,
            'last_mskAP': last_mskAP,
        })
        time.sleep(5)
        if self.is_master:
            print(f'\n[finished] ========== last_boxAP={last_boxAP}, last_mskAP={last_mskAP} ==========\n')
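
# The status file holds a single JSON line, roughly (all values illustrative):
#   {"cfg": "my_cfg.yaml", "rema": "1:23:45", "fini": "06-01 12:34",
#    "cur_iter": "30000/90000", "boxAP": 41.2, "mskAP": 37.1, ...}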


def build_evaluator(cfg, dataset_name, output_folder=None):
    """
    Create evaluator(s) for a given dataset.
    This uses the special metadata "evaluator_type" associated with each builtin dataset.
    For your own dataset, you can simply create an evaluator manually in your
    script and do not have to worry about the hacky if-else logic here.
    """
    if output_folder is None:
        output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
    evaluator_list = []
    evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type
    if evaluator_type in ["sem_seg", "coco_panoptic_seg"]:
        evaluator_list.append(
            SemSegEvaluator(
                dataset_name,
                distributed=True,
                output_dir=output_folder,
            )
        )
    if evaluator_type in ["coco", "coco_panoptic_seg"]:
        evaluator_list.append(COCOEvaluator(dataset_name, output_dir=output_folder))
    if evaluator_type == "coco_panoptic_seg":
        evaluator_list.append(COCOPanopticEvaluator(dataset_name, output_folder))
    if evaluator_type == "cityscapes_instance":
        return CityscapesInstanceEvaluator(dataset_name)
    if evaluator_type == "cityscapes_sem_seg":
        return CityscapesSemSegEvaluator(dataset_name)
    if evaluator_type == "pascal_voc":
        return PascalVOCDetectionEvaluator(dataset_name)
    if evaluator_type == "lvis":
        return LVISEvaluator(dataset_name, output_dir=output_folder)
    if len(evaluator_list) == 0:
        raise NotImplementedError(
            "no Evaluator for the dataset {} with the type {}".format(dataset_name, evaluator_type)
        )
    if len(evaluator_list) == 1:
        return evaluator_list[0]
    return DatasetEvaluators(evaluator_list)


if __name__ == "__main__":
    args = default_argument_parser().parse_args()
    print("Command Line Args:", args)
    launch(
        main,
        args.num_gpus,
        num_machines=args.num_machines,
        machine_rank=args.machine_rank,
        dist_url=args.dist_url,
        args=(args,),
    )