[Feature]: Add `PatchEmbed` and `PatchMerging` with `AdaptivePadding` (#5952)

* add PatchEmbed

* remove patch size

* add doc

* add tests

* resolve comments

* add pad_to_stride

* resolve comments

* fix bugs

* add adap pooling

* use adap pooling

* fix docstr

* add uni test

* add more doc

* add example

* remove patch_to_stride

* rename poo

* resolve comments

* fix doc

* move padding calculation to a function
pull/5975/head
Shilong Zhang 4 years ago committed by GitHub
parent d3ddfb8b20
commit a1c47b1643
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 332
      mmdet/models/utils/transformer.py
  2. 464
      tests/test_models/test_utils/test_transformer.py

@ -1,16 +1,20 @@
# Copyright (c) OpenMMLab. All rights reserved.
import math
import warnings
from typing import Sequence
import torch
import torch.nn as nn
from mmcv.cnn import build_activation_layer, build_norm_layer, xavier_init
import torch.nn.functional as F
from mmcv.cnn import (build_activation_layer, build_conv_layer,
build_norm_layer, xavier_init)
from mmcv.cnn.bricks.registry import (TRANSFORMER_LAYER,
TRANSFORMER_LAYER_SEQUENCE)
from mmcv.cnn.bricks.transformer import (BaseTransformerLayer,
TransformerLayerSequence,
build_transformer_layer_sequence)
from mmcv.runner.base_module import BaseModule
from mmcv.utils import to_2tuple
from torch.nn.init import normal_
from mmdet.models.utils.builder import TRANSFORMER
@ -25,6 +29,332 @@ except ImportError:
from mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention
class AdaptivePadding(nn.Module):
"""Applies padding to input (if needed) so that input can get fully covered
by filter you specified. It support two modes "same" and "corner". The
"same" mode is same with "SAME" padding mode in TensorFlow, pad zero around
input. The "corner" mode would pad zero to bottom right.
Args:
kernel_size (int | tuple): Size of the kernel:
stride (int | tuple): Stride of the filter. Default: 1:
dilation (int | tuple): Spacing between kernel elements.
Default: 1
padding (str): Support "same" and "corner", "corner" mode
would pad zero to bottom right, and "same" mode would
pad zero around input. Default: "corner".
Example:
>>> kernel_size = 16
>>> stride = 16
>>> dilation = 1
>>> input = torch.rand(1, 1, 15, 17)
>>> adap_pad = AdaptivePadding(
>>> kernel_size=kernel_size,
>>> stride=stride,
>>> dilation=dilation,
>>> padding="corner")
>>> out = adap_pad(input)
>>> assert (out.shape[2], out.shape[3]) == (16, 32)
>>> input = torch.rand(1, 1, 16, 17)
>>> out = adap_pad(input)
>>> assert (out.shape[2], out.shape[3]) == (16, 32)
"""
def __init__(self, kernel_size=1, stride=1, dilation=1, padding='corner'):
super(AdaptivePadding, self).__init__()
assert padding in ('same', 'corner')
kernel_size = to_2tuple(kernel_size)
stride = to_2tuple(stride)
padding = to_2tuple(padding)
dilation = to_2tuple(dilation)
self.padding = padding
self.kernel_size = kernel_size
self.stride = stride
self.dilation = dilation
def get_pad_shape(self, input_shape):
input_h, input_w = input_shape
kernel_h, kernel_w = self.kernel_size
stride_h, stride_w = self.stride
output_h = math.ceil(input_h / stride_h)
output_w = math.ceil(input_w / stride_w)
pad_h = max((output_h - 1) * stride_h +
(kernel_h - 1) * self.dilation[0] + 1 - input_h, 0)
pad_w = max((output_w - 1) * stride_w +
(kernel_w - 1) * self.dilation[1] + 1 - input_w, 0)
return pad_h, pad_w
def forward(self, x):
pad_h, pad_w = self.get_pad_shape(x.size()[-2:])
if pad_h > 0 or pad_w > 0:
if self.padding == 'corner':
x = F.pad(x, [0, pad_w, 0, pad_h])
elif self.padding == 'same':
x = F.pad(x, [
pad_w // 2, pad_w - pad_w // 2, pad_h // 2,
pad_h - pad_h // 2
])
return x
class PatchEmbed(BaseModule):
"""Image to Patch Embedding.
We use a conv layer to implement PatchEmbed.
Args:
in_channels (int): The num of input channels. Default: 3
embed_dims (int): The dimensions of embedding. Default: 768
conv_type (str): The config dict for embedding
conv layer type selection. Default: "Conv2d.
kernel_size (int): The kernel_size of embedding conv. Default: 16.
stride (int): The slide stride of embedding conv.
Default: None (Would be set as `kernel_size`).
padding (int | tuple | string ): The padding length of
embedding conv. When it is a string, it means the mode
of adaptive padding, support "same" and "corner" now.
Default: "corner".
dilation (int): The dilation rate of embedding conv. Default: 1.
bias (bool): Bias of embed conv. Default: False.
norm_cfg (dict, optional): Config dict for normalization layer.
Default: None.
input_size (int | tuple | None): The size of input, which will be
used to calculate the out size. Only work when `dynamic_size`
is False. Default: None.
init_cfg (`mmcv.ConfigDict`, optional): The Config for initialization.
Default: None.
"""
def __init__(
self,
in_channels=3,
embed_dims=768,
conv_type='Conv2d',
kernel_size=16,
stride=16,
padding='corner',
dilation=1,
bias=False,
norm_cfg=None,
input_size=None,
init_cfg=None,
):
super(PatchEmbed, self).__init__(init_cfg=init_cfg)
self.embed_dims = embed_dims
if stride is None:
stride = kernel_size
kernel_size = to_2tuple(kernel_size)
stride = to_2tuple(stride)
dilation = to_2tuple(dilation)
if isinstance(padding, str):
self.adap_padding = AdaptivePadding(
kernel_size=kernel_size,
stride=stride,
dilation=dilation,
padding=padding)
# disable the padding of conv
padding = 0
else:
self.adap_padding = None
padding = to_2tuple(padding)
self.projection = build_conv_layer(
dict(type=conv_type),
in_channels=in_channels,
out_channels=embed_dims,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
bias=bias)
if norm_cfg is not None:
self.norm = build_norm_layer(norm_cfg, embed_dims)[1]
else:
self.norm = None
if input_size:
input_size = to_2tuple(input_size)
# `init_out_size` would be used outside to
# calculate the num_patches
# when `use_abs_pos_embed` outside
self.init_input_size = input_size
if self.adap_padding:
pad_h, pad_w = self.adap_padding.get_pad_shape(input_size)
input_h, input_w = input_size
input_h = input_h + pad_h
input_w = input_w + pad_w
input_size = (input_h, input_w)
# https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
h_out = (input_size[0] + 2 * padding[0] - dilation[0] *
(kernel_size[0] - 1) - 1) // stride[0] + 1
w_out = (input_size[1] + 2 * padding[1] - dilation[1] *
(kernel_size[1] - 1) - 1) // stride[1] + 1
self.init_out_size = (h_out, w_out)
else:
self.init_input_size = None
self.init_out_size = None
def forward(self, x):
"""
Args:
x (Tensor): Has shape (B, C, H, W). In most case, C is 3.
Returns:
tuple: Contains merged results and its spatial shape.
- x (Tensor): Has shape (B, out_h * out_w, embed_dims)
- out_size (tuple[int]): Spatial shape of x, arrange as
(out_h, out_w).
"""
if self.adap_padding:
x = self.adap_padding(x)
x = self.projection(x)
out_size = (x.shape[2], x.shape[3])
x = x.flatten(2).transpose(1, 2)
if self.norm is not None:
x = self.norm(x)
return x, out_size
class PatchMerging(BaseModule):
"""Merge patch feature map.
This layer groups feature map by kernel_size, and applies norm and linear
layers to the grouped feature map. Our implementation uses `nn.Unfold` to
merge patch, which is about 25% faster than original implementation.
Instead, we need to modify pretrained models for compatibility.
Args:
in_channels (int): The num of input channels.
to gets fully covered by filter and stride you specified..
Default: True.
out_channels (int): The num of output channels.
kernel_size (int | tuple, optional): the kernel size in the unfold
layer. Defaults to 2.
stride (int | tuple, optional): the stride of the sliding blocks in the
unfold layer. Default: None. (Would be set as `kernel_size`)
padding (int | tuple | string ): The padding length of
embedding conv. When it is a string, it means the mode
of adaptive padding, support "same" and "corner" now.
Default: "corner".
dilation (int | tuple, optional): dilation parameter in the unfold
layer. Default: 1.
bias (bool, optional): Whether to add bias in linear layer or not.
Defaults: False.
norm_cfg (dict, optional): Config dict for normalization layer.
Default: dict(type='LN').
init_cfg (dict, optional): The extra config for initialization.
Default: None.
"""
def __init__(self,
in_channels,
out_channels,
kernel_size=2,
stride=None,
padding='corner',
dilation=1,
bias=False,
norm_cfg=dict(type='LN'),
init_cfg=None):
super().__init__(init_cfg=init_cfg)
self.in_channels = in_channels
self.out_channels = out_channels
if stride:
stride = stride
else:
stride = kernel_size
kernel_size = to_2tuple(kernel_size)
stride = to_2tuple(stride)
dilation = to_2tuple(dilation)
if isinstance(padding, str):
self.adap_padding = AdaptivePadding(
kernel_size=kernel_size,
stride=stride,
dilation=dilation,
padding=padding)
# disable the padding of unfold
padding = 0
else:
self.adap_padding = None
padding = to_2tuple(padding)
self.sampler = nn.Unfold(
kernel_size=kernel_size,
dilation=dilation,
padding=padding,
stride=stride)
sample_dim = kernel_size[0] * kernel_size[1] * in_channels
if norm_cfg is not None:
self.norm = build_norm_layer(norm_cfg, sample_dim)[1]
else:
self.norm = None
self.reduction = nn.Linear(sample_dim, out_channels, bias=bias)
def forward(self, x, input_size):
"""
Args:
x (Tensor): Has shape (B, H*W, C_in).
input_size (tuple[int]): The spatial shape of x, arrange as (H, W).
Default: None.
Returns:
tuple: Contains merged results and its spatial shape.
- x (Tensor): Has shape (B, Merged_H * Merged_W, C_out)
- out_size (tuple[int]): Spatial shape of x, arrange as
(Merged_H, Merged_W).
"""
B, L, C = x.shape
assert isinstance(input_size, Sequence), f'Expect ' \
f'input_size is ' \
f'`Sequence` ' \
f'but get {input_size}'
H, W = input_size
assert L == H * W, 'input feature has wrong size'
x = x.view(B, H, W, C).permute([0, 3, 1, 2]) # B, C, H, W
# Use nn.Unfold to merge patch. About 25% faster than original method,
# but need to modify pretrained model for compatibility
if self.adap_padding:
x = self.adap_padding(x)
H, W = x.shape[-2:]
x = self.sampler(x)
# if kernel_size=2 and stride=2, x should has shape (B, 4*C, H/2*W/2)
out_h = (H + 2 * self.sampler.padding[0] - self.sampler.dilation[0] *
(self.sampler.kernel_size[0] - 1) -
1) // self.sampler.stride[0] + 1
out_w = (W + 2 * self.sampler.padding[1] - self.sampler.dilation[1] *
(self.sampler.kernel_size[1] - 1) -
1) // self.sampler.stride[1] + 1
output_size = (out_h, out_w)
x = x.transpose(1, 2) # B, H/2*W/2, 4*C
x = self.norm(x) if self.norm else x
x = self.reduction(x)
return x, output_size
def inverse_sigmoid(x, eps=1e-5):
"""Inverse function of sigmoid.

@ -1,10 +1,468 @@
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch
from mmcv.utils import ConfigDict
from mmdet.models.utils.transformer import (DetrTransformerDecoder,
DetrTransformerEncoder,
Transformer)
from mmdet.models.utils.transformer import (AdaptivePadding,
DetrTransformerDecoder,
DetrTransformerEncoder, PatchEmbed,
PatchMerging, Transformer)
def test_adaptive_padding():
for padding in ('same', 'corner'):
kernel_size = 16
stride = 16
dilation = 1
input = torch.rand(1, 1, 15, 17)
pool = AdaptivePadding(
kernel_size=kernel_size,
stride=stride,
dilation=dilation,
padding=padding)
out = pool(input)
# padding to divisible by 16
assert (out.shape[2], out.shape[3]) == (16, 32)
input = torch.rand(1, 1, 16, 17)
out = pool(input)
# padding to divisible by 16
assert (out.shape[2], out.shape[3]) == (16, 32)
kernel_size = (2, 2)
stride = (2, 2)
dilation = (1, 1)
adap_pad = AdaptivePadding(
kernel_size=kernel_size,
stride=stride,
dilation=dilation,
padding=padding)
input = torch.rand(1, 1, 11, 13)
out = adap_pad(input)
# padding to divisible by 2
assert (out.shape[2], out.shape[3]) == (12, 14)
kernel_size = (2, 2)
stride = (10, 10)
dilation = (1, 1)
adap_pad = AdaptivePadding(
kernel_size=kernel_size,
stride=stride,
dilation=dilation,
padding=padding)
input = torch.rand(1, 1, 10, 13)
out = adap_pad(input)
# no padding
assert (out.shape[2], out.shape[3]) == (10, 13)
kernel_size = (11, 11)
adap_pad = AdaptivePadding(
kernel_size=kernel_size,
stride=stride,
dilation=dilation,
padding=padding)
input = torch.rand(1, 1, 11, 13)
out = adap_pad(input)
# all padding
assert (out.shape[2], out.shape[3]) == (21, 21)
# test padding as kernel is (7,9)
input = torch.rand(1, 1, 11, 13)
stride = (3, 4)
kernel_size = (4, 5)
dilation = (2, 2)
# actually (7, 9)
adap_pad = AdaptivePadding(
kernel_size=kernel_size,
stride=stride,
dilation=dilation,
padding=padding)
dilation_out = adap_pad(input)
assert (dilation_out.shape[2], dilation_out.shape[3]) == (16, 21)
kernel_size = (7, 9)
dilation = (1, 1)
adap_pad = AdaptivePadding(
kernel_size=kernel_size,
stride=stride,
dilation=dilation,
padding=padding)
kernel79_out = adap_pad(input)
assert (kernel79_out.shape[2], kernel79_out.shape[3]) == (16, 21)
assert kernel79_out.shape == dilation_out.shape
# assert only support "same" "corner"
with pytest.raises(AssertionError):
AdaptivePadding(
kernel_size=kernel_size,
stride=stride,
dilation=dilation,
padding=1)
def test_patch_embed():
B = 2
H = 3
W = 4
C = 3
embed_dims = 10
kernel_size = 3
stride = 1
dummy_input = torch.rand(B, C, H, W)
patch_merge_1 = PatchEmbed(
in_channels=C,
embed_dims=embed_dims,
kernel_size=kernel_size,
stride=stride,
padding=0,
dilation=1,
norm_cfg=None)
x1, shape = patch_merge_1(dummy_input)
# test out shape
assert x1.shape == (2, 2, 10)
# test outsize is correct
assert shape == (1, 2)
# test L = out_h * out_w
assert shape[0] * shape[1] == x1.shape[1]
B = 2
H = 10
W = 10
C = 3
embed_dims = 10
kernel_size = 5
stride = 2
dummy_input = torch.rand(B, C, H, W)
# test dilation
patch_merge_2 = PatchEmbed(
in_channels=C,
embed_dims=embed_dims,
kernel_size=kernel_size,
stride=stride,
padding=0,
dilation=2,
norm_cfg=None,
)
x2, shape = patch_merge_2(dummy_input)
# test out shape
assert x2.shape == (2, 1, 10)
# test outsize is correct
assert shape == (1, 1)
# test L = out_h * out_w
assert shape[0] * shape[1] == x2.shape[1]
stride = 2
input_size = (10, 10)
dummy_input = torch.rand(B, C, H, W)
# test stride and norm
patch_merge_3 = PatchEmbed(
in_channels=C,
embed_dims=embed_dims,
kernel_size=kernel_size,
stride=stride,
padding=0,
dilation=2,
norm_cfg=dict(type='LN'),
input_size=input_size)
x3, shape = patch_merge_3(dummy_input)
# test out shape
assert x3.shape == (2, 1, 10)
# test outsize is correct
assert shape == (1, 1)
# test L = out_h * out_w
assert shape[0] * shape[1] == x3.shape[1]
# test thte init_out_size with nn.Unfold
assert patch_merge_3.init_out_size[1] == (input_size[0] - 2 * 4 -
1) // 2 + 1
assert patch_merge_3.init_out_size[0] == (input_size[0] - 2 * 4 -
1) // 2 + 1
H = 11
W = 12
input_size = (H, W)
dummy_input = torch.rand(B, C, H, W)
# test stride and norm
patch_merge_3 = PatchEmbed(
in_channels=C,
embed_dims=embed_dims,
kernel_size=kernel_size,
stride=stride,
padding=0,
dilation=2,
norm_cfg=dict(type='LN'),
input_size=input_size)
_, shape = patch_merge_3(dummy_input)
# when input_size equal to real input
# the out_size shoule be equal to `init_out_size`
assert shape == patch_merge_3.init_out_size
input_size = (H, W)
dummy_input = torch.rand(B, C, H, W)
# test stride and norm
patch_merge_3 = PatchEmbed(
in_channels=C,
embed_dims=embed_dims,
kernel_size=kernel_size,
stride=stride,
padding=0,
dilation=2,
norm_cfg=dict(type='LN'),
input_size=input_size)
_, shape = patch_merge_3(dummy_input)
# when input_size equal to real input
# the out_size shoule be equal to `init_out_size`
assert shape == patch_merge_3.init_out_size
# test adap padding
for padding in ('same', 'corner'):
in_c = 2
embed_dims = 3
B = 2
# test stride is 1
input_size = (5, 5)
kernel_size = (5, 5)
stride = (1, 1)
dilation = 1
bias = False
x = torch.rand(B, in_c, *input_size)
patch_embed = PatchEmbed(
in_channels=in_c,
embed_dims=embed_dims,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
bias=bias)
x_out, out_size = patch_embed(x)
assert x_out.size() == (B, 25, 3)
assert out_size == (5, 5)
assert x_out.size(1) == out_size[0] * out_size[1]
# test kernel_size == stride
input_size = (5, 5)
kernel_size = (5, 5)
stride = (5, 5)
dilation = 1
bias = False
x = torch.rand(B, in_c, *input_size)
patch_embed = PatchEmbed(
in_channels=in_c,
embed_dims=embed_dims,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
bias=bias)
x_out, out_size = patch_embed(x)
assert x_out.size() == (B, 1, 3)
assert out_size == (1, 1)
assert x_out.size(1) == out_size[0] * out_size[1]
# test kernel_size == stride
input_size = (6, 5)
kernel_size = (5, 5)
stride = (5, 5)
dilation = 1
bias = False
x = torch.rand(B, in_c, *input_size)
patch_embed = PatchEmbed(
in_channels=in_c,
embed_dims=embed_dims,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
bias=bias)
x_out, out_size = patch_embed(x)
assert x_out.size() == (B, 2, 3)
assert out_size == (2, 1)
assert x_out.size(1) == out_size[0] * out_size[1]
# test different kernel_size with diffrent stride
input_size = (6, 5)
kernel_size = (6, 2)
stride = (6, 2)
dilation = 1
bias = False
x = torch.rand(B, in_c, *input_size)
patch_embed = PatchEmbed(
in_channels=in_c,
embed_dims=embed_dims,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
bias=bias)
x_out, out_size = patch_embed(x)
assert x_out.size() == (B, 3, 3)
assert out_size == (1, 3)
assert x_out.size(1) == out_size[0] * out_size[1]
def test_patch_merging():
# Test the model with int padding
in_c = 3
out_c = 4
kernel_size = 3
stride = 3
padding = 1
dilation = 1
bias = False
# test the case `pad_to_stride` is False
patch_merge = PatchMerging(
in_channels=in_c,
out_channels=out_c,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
bias=bias)
B, L, C = 1, 100, 3
input_size = (10, 10)
x = torch.rand(B, L, C)
x_out, out_size = patch_merge(x, input_size)
assert x_out.size() == (1, 16, 4)
assert out_size == (4, 4)
# assert out size is consistent with real output
assert x_out.size(1) == out_size[0] * out_size[1]
in_c = 4
out_c = 5
kernel_size = 6
stride = 3
padding = 2
dilation = 2
bias = False
patch_merge = PatchMerging(
in_channels=in_c,
out_channels=out_c,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
bias=bias)
B, L, C = 1, 100, 4
input_size = (10, 10)
x = torch.rand(B, L, C)
x_out, out_size = patch_merge(x, input_size)
assert x_out.size() == (1, 4, 5)
assert out_size == (2, 2)
# assert out size is consistent with real output
assert x_out.size(1) == out_size[0] * out_size[1]
# Test with adaptive padding
for padding in ('same', 'corner'):
in_c = 2
out_c = 3
B = 2
# test stride is 1
input_size = (5, 5)
kernel_size = (5, 5)
stride = (1, 1)
dilation = 1
bias = False
L = input_size[0] * input_size[1]
x = torch.rand(B, L, in_c)
patch_merge = PatchMerging(
in_channels=in_c,
out_channels=out_c,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
bias=bias)
x_out, out_size = patch_merge(x, input_size)
assert x_out.size() == (B, 25, 3)
assert out_size == (5, 5)
assert x_out.size(1) == out_size[0] * out_size[1]
# test kernel_size == stride
input_size = (5, 5)
kernel_size = (5, 5)
stride = (5, 5)
dilation = 1
bias = False
L = input_size[0] * input_size[1]
x = torch.rand(B, L, in_c)
patch_merge = PatchMerging(
in_channels=in_c,
out_channels=out_c,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
bias=bias)
x_out, out_size = patch_merge(x, input_size)
assert x_out.size() == (B, 1, 3)
assert out_size == (1, 1)
assert x_out.size(1) == out_size[0] * out_size[1]
# test kernel_size == stride
input_size = (6, 5)
kernel_size = (5, 5)
stride = (5, 5)
dilation = 1
bias = False
L = input_size[0] * input_size[1]
x = torch.rand(B, L, in_c)
patch_merge = PatchMerging(
in_channels=in_c,
out_channels=out_c,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
bias=bias)
x_out, out_size = patch_merge(x, input_size)
assert x_out.size() == (B, 2, 3)
assert out_size == (2, 1)
assert x_out.size(1) == out_size[0] * out_size[1]
# test different kernel_size with diffrent stride
input_size = (6, 5)
kernel_size = (6, 2)
stride = (6, 2)
dilation = 1
bias = False
L = input_size[0] * input_size[1]
x = torch.rand(B, L, in_c)
patch_merge = PatchMerging(
in_channels=in_c,
out_channels=out_c,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
bias=bias)
x_out, out_size = patch_merge(x, input_size)
assert x_out.size() == (B, 3, 3)
assert out_size == (1, 3)
assert x_out.size(1) == out_size[0] * out_size[1]
def test_detr_transformer_dencoder_encoder_layer():

Loading…
Cancel
Save