Add upsample_cfg support in FPN (#2787)

* Add upsample_cfg support in FPN

* small fix

* Add multiple extra conv sources

* small logical fix

* Add neck tests for fpn

* Add neck tests for fpn

* fixed several typos

* resolved issues

* Removed extra_convs_source option

* added necks to apis.rst

* change according to comments

* reconfigured configs
Authored by Wang Xinjiang, committed via GitHub
commit 50ffa2482c (parent e903b5c109)
12 changed files (lines changed):
  1. configs/_base_/models/retinanet_r50_fpn.py (2)
  2. configs/atss/atss_r50_fpn_1x_coco.py (3)
  3. configs/fcos/fcos_r50_caffe_fpn_4x4_1x_coco.py (3)
  4. configs/fcos/fcos_r50_caffe_fpn_gn-head_4x4_1x_coco.py (3)
  5. configs/foveabox/fovea_r50_fpn_4x4_1x_coco.py (2)
  6. configs/guided_anchoring/ga_retinanet_r101_caffe_fpn_mstrain_2x.py (2)
  7. configs/libra_rcnn/libra_retinanet_r50_fpn_1x_coco.py (2)
  8. configs/reppoints/reppoints_moment_r50_fpn_1x_coco.py (2)
  9. demo/mmdet_inference_colab.ipynb (2)
  10. docs/api.rst (5)
  11. mmdet/models/necks/fpn.py (63)
  12. tests/test_necks.py (201)

configs/_base_/models/retinanet_r50_fpn.py
@@ -16,7 +16,7 @@ model = dict(
         in_channels=[256, 512, 1024, 2048],
         out_channels=256,
         start_level=1,
-        add_extra_convs=True,
+        add_extra_convs='on_input',
         num_outs=5),
     bbox_head=dict(
         type='RetinaHead',

configs/atss/atss_r50_fpn_1x_coco.py
@@ -19,8 +19,7 @@ model = dict(
         in_channels=[256, 512, 1024, 2048],
         out_channels=256,
         start_level=1,
-        add_extra_convs=True,
-        extra_convs_on_inputs=False,
+        add_extra_convs='on_output',
         num_outs=5),
     bbox_head=dict(
         type='ATSSHead',

configs/fcos/fcos_r50_caffe_fpn_4x4_1x_coco.py
@@ -20,8 +20,7 @@ model = dict(
         in_channels=[256, 512, 1024, 2048],
         out_channels=256,
         start_level=1,
-        add_extra_convs=True,
-        extra_convs_on_inputs=False,  # use P5
+        add_extra_convs='on_output',  # use P5
         num_outs=5,
         relu_before_extra_convs=True),
     bbox_head=dict(

configs/fcos/fcos_r50_caffe_fpn_gn-head_4x4_1x_coco.py
@@ -20,8 +20,7 @@ model = dict(
         in_channels=[256, 512, 1024, 2048],
         out_channels=256,
         start_level=1,
-        add_extra_convs=True,
-        extra_convs_on_inputs=False,  # use P5
+        add_extra_convs='on_output',  # use P5
         num_outs=5,
         relu_before_extra_convs=True),
     bbox_head=dict(

configs/foveabox/fovea_r50_fpn_4x4_1x_coco.py
@@ -21,7 +21,7 @@ model = dict(
         out_channels=256,
         start_level=1,
         num_outs=5,
-        add_extra_convs=True),
+        add_extra_convs='on_input'),
     bbox_head=dict(
         type='FoveaHead',
         num_classes=80,

configs/guided_anchoring/ga_retinanet_r101_caffe_fpn_mstrain_2x.py
@@ -16,7 +16,7 @@ model = dict(
         in_channels=[256, 512, 1024, 2048],
         out_channels=256,
         start_level=1,
-        add_extra_convs=True,
+        add_extra_convs='on_input',
         num_outs=5),
     bbox_head=dict(
         type='GARetinaHead',

configs/libra_rcnn/libra_retinanet_r50_fpn_1x_coco.py
@@ -7,7 +7,7 @@ model = dict(
         in_channels=[256, 512, 1024, 2048],
         out_channels=256,
         start_level=1,
-        add_extra_convs=True,
+        add_extra_convs='on_input',
         num_outs=5),
     dict(
         type='BFP',

configs/reppoints/reppoints_moment_r50_fpn_1x_coco.py
@@ -19,7 +19,7 @@ model = dict(
         in_channels=[256, 512, 1024, 2048],
         out_channels=256,
         start_level=1,
-        add_extra_convs=True,
+        add_extra_convs='on_input',
         num_outs=5),
     bbox_head=dict(
         type='RepPointsHead',
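
All of the config edits above follow the same migration rule: `add_extra_convs=True` with the default `extra_convs_on_inputs=True` becomes `add_extra_convs='on_input'`, and `add_extra_convs=True` with `extra_convs_on_inputs=False` becomes `add_extra_convs='on_output'`. A minimal before/after sketch of a neck config (keys copied from the RetinaNet and ATSS diffs above; the `neck_old`/`neck_new` names are purely illustrative):

# Old style: boolean flag plus the now-deprecated `extra_convs_on_inputs`.
neck_old = dict(
    type='FPN',
    in_channels=[256, 512, 1024, 2048],
    out_channels=256,
    start_level=1,
    add_extra_convs=True,
    extra_convs_on_inputs=False,  # extra convs fed from the FPN output (P5)
    num_outs=5)

# New style: the source of the extra convs is spelled out explicitly.
neck_new = dict(
    type='FPN',
    in_channels=[256, 512, 1024, 2048],
    out_channels=256,
    start_level=1,
    add_extra_convs='on_output',  # same behavior as neck_old
    num_outs=5)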

docs/api.rst
@@ -75,6 +75,11 @@ backbones
 .. automodule:: mmdet.models.backbones
     :members:

+necks
+^^^^^^^^^^^^
+.. automodule:: mmdet.models.necks
+    :members:
+
 dense_heads
 ^^^^^^^^^^^^
 .. automodule:: mmdet.models.dense_heads

mmdet/models/necks/fpn.py
@@ -22,10 +22,19 @@ class FPN(nn.Module):
             build the feature pyramid. Default: 0.
         end_level (int): Index of the end input backbone level (exclusive) to
             build the feature pyramid. Default: -1, which means the last level.
-        add_extra_convs (bool): Whether to add conv layers on top of the
-            original feature maps. Default: False.
-        extra_convs_on_inputs (bool): Whether to apply extra conv on
-            the original feature from the backbone. Default: False.
+        add_extra_convs (bool | str): If bool, it decides whether to add conv
+            layers on top of the original feature maps. Default to False.
+            If True, its actual mode is specified by `extra_convs_on_inputs`.
+            If str, it specifies the source feature map of the extra convs.
+            Only the following options are allowed:
+            - 'on_input': Last feature map of neck inputs (i.e. backbone
+              feature).
+            - 'on_lateral': Last feature map after lateral convs.
+            - 'on_output': The last output feature map after fpn convs.
+        extra_convs_on_inputs (bool, deprecated): Whether to apply extra convs
+            on the original feature from the backbone. If True, it is
+            equivalent to `add_extra_convs='on_input'`. If False, it is
+            equivalent to `add_extra_convs='on_output'`. Default to True.
         relu_before_extra_convs (bool): Whether to apply relu before the extra
             conv. Default: False.
         no_norm_on_lateral (bool): Whether to apply norm on lateral.
@@ -34,6 +43,8 @@ class FPN(nn.Module):
         norm_cfg (dict): Config dict for normalization layer. Default: None.
         act_cfg (str): Config dict for activation layer in ConvModule.
             Default: None.
+        upsample_cfg (dict): Config dict for interpolate layer.
+            Default: `dict(mode='nearest')`

     Example:
         >>> import torch
@@ -63,7 +74,8 @@ class FPN(nn.Module):
                  no_norm_on_lateral=False,
                  conv_cfg=None,
                  norm_cfg=None,
-                 act_cfg=None):
+                 act_cfg=None,
+                 upsample_cfg=dict(mode='nearest')):
         super(FPN, self).__init__()
         assert isinstance(in_channels, list)
         self.in_channels = in_channels
@@ -73,6 +85,7 @@ class FPN(nn.Module):
         self.relu_before_extra_convs = relu_before_extra_convs
         self.no_norm_on_lateral = no_norm_on_lateral
         self.fp16_enabled = False
+        self.upsample_cfg = upsample_cfg.copy()

         if end_level == -1:
             self.backbone_end_level = self.num_ins
@@ -85,7 +98,17 @@ class FPN(nn.Module):
         self.start_level = start_level
         self.end_level = end_level
         self.add_extra_convs = add_extra_convs
-        self.extra_convs_on_inputs = extra_convs_on_inputs
+        assert isinstance(add_extra_convs, (str, bool))
+        if isinstance(add_extra_convs, str):
+            # Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output'
+            assert add_extra_convs in ('on_input', 'on_lateral', 'on_output')
+        elif add_extra_convs:  # True
+            if extra_convs_on_inputs:
+                # For compatibility with previous release
+                # TODO: deprecate `extra_convs_on_inputs`
+                self.add_extra_convs = 'on_input'
+            else:
+                self.add_extra_convs = 'on_output'

         self.lateral_convs = nn.ModuleList()
         self.fpn_convs = nn.ModuleList()
@@ -114,9 +137,9 @@ class FPN(nn.Module):

         # add extra conv layers (e.g., RetinaNet)
         extra_levels = num_outs - self.backbone_end_level + self.start_level
-        if add_extra_convs and extra_levels >= 1:
+        if self.add_extra_convs and extra_levels >= 1:
             for i in range(extra_levels):
-                if i == 0 and self.extra_convs_on_inputs:
+                if i == 0 and self.add_extra_convs == 'on_input':
                     in_channels = self.in_channels[self.backbone_end_level - 1]
                 else:
                     in_channels = out_channels
@@ -151,9 +174,15 @@ class FPN(nn.Module):
         # build top-down path
         used_backbone_levels = len(laterals)
         for i in range(used_backbone_levels - 1, 0, -1):
-            prev_shape = laterals[i - 1].shape[2:]
-            laterals[i - 1] += F.interpolate(
-                laterals[i], size=prev_shape, mode='nearest')
+            # In some cases, fixing `scale_factor` (e.g. 2) is preferred, but
+            # it cannot co-exist with `size` in `F.interpolate`.
+            if 'scale_factor' in self.upsample_cfg:
+                laterals[i - 1] += F.interpolate(laterals[i],
+                                                 **self.upsample_cfg)
+            else:
+                prev_shape = laterals[i - 1].shape[2:]
+                laterals[i - 1] += F.interpolate(
+                    laterals[i], size=prev_shape, **self.upsample_cfg)

         # build outputs
         # part 1: from original levels
@@ -169,11 +198,15 @@ class FPN(nn.Module):
                 outs.append(F.max_pool2d(outs[-1], 1, stride=2))
             # add conv layers on top of original feature maps (RetinaNet)
             else:
-                if self.extra_convs_on_inputs:
-                    orig = inputs[self.backbone_end_level - 1]
-                    outs.append(self.fpn_convs[used_backbone_levels](orig))
+                if self.add_extra_convs == 'on_input':
+                    extra_source = inputs[self.backbone_end_level - 1]
+                elif self.add_extra_convs == 'on_lateral':
+                    extra_source = laterals[-1]
+                elif self.add_extra_convs == 'on_output':
+                    extra_source = outs[-1]
                 else:
-                    outs.append(self.fpn_convs[used_backbone_levels](outs[-1]))
+                    raise NotImplementedError
+                outs.append(self.fpn_convs[used_backbone_levels](extra_source))
                 for i in range(used_backbone_levels + 1, self.num_outs):
                     if self.relu_before_extra_convs:
                         outs.append(self.fpn_convs[i](F.relu(outs[-1])))
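
In short, `add_extra_convs` now accepts 'on_input', 'on_lateral', or 'on_output' (the boolean form is kept for backward compatibility), and `upsample_cfg` is forwarded to `F.interpolate`, with `size` filled in from the target lateral unless `scale_factor` is supplied. A minimal sketch of the new options in use; the toy channel/size values are borrowed from the tests below, and nothing in this snippet is part of the commit itself:

import torch

from mmdet.models.necks import FPN

s = 64
in_channels = [8, 16, 32, 64]
feats = [
    torch.rand(1, c, s // 2**i, s // 2**i)
    for i, c in enumerate(in_channels)
]

# Extra convs fed from the last backbone feature, RetinaNet-style.
fpn = FPN(
    in_channels=in_channels,
    out_channels=8,
    start_level=1,
    add_extra_convs='on_input',
    num_outs=5)

# Upsample by a fixed factor instead of matching the target size;
# `scale_factor` and `size` cannot co-exist in `F.interpolate`.
fpn_scale = FPN(
    in_channels=in_channels,
    out_channels=8,
    start_level=1,
    add_extra_convs='on_output',
    num_outs=5,
    upsample_cfg=dict(scale_factor=2))

for model in (fpn, fpn_scale):
    outs = model(feats)
    assert len(outs) == 5  # P3..P7 when start_level=1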

tests/test_necks.py
@@ -0,0 +1,201 @@
import pytest
import torch
from torch.nn.modules.batchnorm import _BatchNorm

from mmdet.models.necks import FPN


def test_fpn():
    """Tests FPN."""
    s = 64
    in_channels = [8, 16, 32, 64]
    feat_sizes = [s // 2**i for i in range(4)]  # [64, 32, 16, 8]
    out_channels = 8

    # `num_outs` is not equal to len(in_channels) - start_level
    with pytest.raises(AssertionError):
        FPN(in_channels=in_channels,
            out_channels=out_channels,
            start_level=1,
            num_outs=2)

    # `end_level` is larger than len(in_channels) - 1
    with pytest.raises(AssertionError):
        FPN(in_channels=in_channels,
            out_channels=out_channels,
            start_level=1,
            end_level=4,
            num_outs=2)

    # `num_outs` is not equal to end_level - start_level
    with pytest.raises(AssertionError):
        FPN(in_channels=in_channels,
            out_channels=out_channels,
            start_level=1,
            end_level=3,
            num_outs=1)

    # Invalid `add_extra_convs` option
    with pytest.raises(AssertionError):
        FPN(in_channels=in_channels,
            out_channels=out_channels,
            start_level=1,
            add_extra_convs='on_xxx',
            num_outs=5)

    fpn_model = FPN(
        in_channels=in_channels,
        out_channels=out_channels,
        start_level=1,
        add_extra_convs=True,
        num_outs=5)

    # FPN expects multiple levels of features per image
    feats = [
        torch.rand(1, in_channels[i], feat_sizes[i], feat_sizes[i])
        for i in range(len(in_channels))
    ]
    outs = fpn_model(feats)
    assert fpn_model.add_extra_convs == 'on_input'
    assert len(outs) == fpn_model.num_outs
    # With start_level=1 the first output has stride 2, so sizes start at s // 2
    for i in range(fpn_model.num_outs):
        assert outs[i].shape[1] == out_channels
        assert outs[i].shape[2] == outs[i].shape[3] == s // (2**(i + 1))

    # Tests for fpn with no extra convs (pooling is used instead)
    fpn_model = FPN(
        in_channels=in_channels,
        out_channels=out_channels,
        start_level=1,
        add_extra_convs=False,
        num_outs=5)
    outs = fpn_model(feats)
    assert len(outs) == fpn_model.num_outs
    assert not fpn_model.add_extra_convs
    for i in range(fpn_model.num_outs):
        assert outs[i].shape[1] == out_channels
        assert outs[i].shape[2] == outs[i].shape[3] == s // (2**(i + 1))

    # Tests for fpn with lateral bns
    fpn_model = FPN(
        in_channels=in_channels,
        out_channels=out_channels,
        start_level=1,
        add_extra_convs=True,
        no_norm_on_lateral=False,
        norm_cfg=dict(type='BN', requires_grad=True),
        num_outs=5)
    outs = fpn_model(feats)
    assert len(outs) == fpn_model.num_outs
    assert fpn_model.add_extra_convs == 'on_input'
    for i in range(fpn_model.num_outs):
        assert outs[i].shape[1] == out_channels
        assert outs[i].shape[2] == outs[i].shape[3] == s // (2**(i + 1))
    bn_exist = False
    for m in fpn_model.modules():
        if isinstance(m, _BatchNorm):
            bn_exist = True
    assert bn_exist

    # Bilinear upsample
    fpn_model = FPN(
        in_channels=in_channels,
        out_channels=out_channels,
        start_level=1,
        add_extra_convs=True,
        upsample_cfg=dict(mode='bilinear', align_corners=True),
        num_outs=5)
    outs = fpn_model(feats)
    assert len(outs) == fpn_model.num_outs
    assert fpn_model.add_extra_convs == 'on_input'
    for i in range(fpn_model.num_outs):
        assert outs[i].shape[1] == out_channels
        assert outs[i].shape[2] == outs[i].shape[3] == s // (2**(i + 1))

    # Scale factor instead of fixed upsample size
    fpn_model = FPN(
        in_channels=in_channels,
        out_channels=out_channels,
        start_level=1,
        add_extra_convs=True,
        upsample_cfg=dict(scale_factor=2),
        num_outs=5)
    outs = fpn_model(feats)
    assert len(outs) == fpn_model.num_outs
    for i in range(fpn_model.num_outs):
        assert outs[i].shape[1] == out_channels
        assert outs[i].shape[2] == outs[i].shape[3] == s // (2**(i + 1))

    # Extra convs source is 'on_input'
    fpn_model = FPN(
        in_channels=in_channels,
        out_channels=out_channels,
        add_extra_convs='on_input',
        start_level=1,
        num_outs=5)
    assert fpn_model.add_extra_convs == 'on_input'
    outs = fpn_model(feats)
    assert len(outs) == fpn_model.num_outs
    for i in range(fpn_model.num_outs):
        assert outs[i].shape[1] == out_channels
        assert outs[i].shape[2] == outs[i].shape[3] == s // (2**(i + 1))

    # Extra convs source is 'on_lateral'
    fpn_model = FPN(
        in_channels=in_channels,
        out_channels=out_channels,
        add_extra_convs='on_lateral',
        start_level=1,
        num_outs=5)
    assert fpn_model.add_extra_convs == 'on_lateral'
    outs = fpn_model(feats)
    assert len(outs) == fpn_model.num_outs
    for i in range(fpn_model.num_outs):
        assert outs[i].shape[1] == out_channels
        assert outs[i].shape[2] == outs[i].shape[3] == s // (2**(i + 1))

    # Extra convs source is 'on_output'
    fpn_model = FPN(
        in_channels=in_channels,
        out_channels=out_channels,
        add_extra_convs='on_output',
        start_level=1,
        num_outs=5)
    assert fpn_model.add_extra_convs == 'on_output'
    outs = fpn_model(feats)
    assert len(outs) == fpn_model.num_outs
    for i in range(fpn_model.num_outs):
        assert outs[i].shape[1] == out_channels
        assert outs[i].shape[2] == outs[i].shape[3] == s // (2**(i + 1))

    # extra_convs_on_inputs=False is equivalent to add_extra_convs='on_output'
    fpn_model = FPN(
        in_channels=in_channels,
        out_channels=out_channels,
        add_extra_convs=True,
        extra_convs_on_inputs=False,
        start_level=1,
        num_outs=5,
    )
    assert fpn_model.add_extra_convs == 'on_output'
    outs = fpn_model(feats)
    assert len(outs) == fpn_model.num_outs
    for i in range(fpn_model.num_outs):
        assert outs[i].shape[1] == out_channels
        assert outs[i].shape[2] == outs[i].shape[3] == s // (2**(i + 1))

    # extra_convs_on_inputs=True is equivalent to add_extra_convs='on_input'
    fpn_model = FPN(
        in_channels=in_channels,
        out_channels=out_channels,
        add_extra_convs=True,
        extra_convs_on_inputs=True,
        start_level=1,
        num_outs=5,
    )
    assert fpn_model.add_extra_convs == 'on_input'
    outs = fpn_model(feats)
    assert len(outs) == fpn_model.num_outs
    for i in range(fpn_model.num_outs):
        assert outs[i].shape[1] == out_channels
        assert outs[i].shape[2] == outs[i].shape[3] == s // (2**(i + 1))
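
Assuming a standard pytest setup, the new suite can be run in isolation from the repository root with `pytest tests/test_necks.py -k test_fpn`.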