# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" |
|
This code is the paddle implementation of MobileOne block, see: https://arxiv.org/pdf/2206.04040.pdf. |
|
Some codes are based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py |
|
Ths copyright of microsoft/Swin-Transformer is as follows: |
|
MIT License [see LICENSE for details] |
|
""" |
|
|
|
import paddle
import paddle.nn as nn
from paddle import ParamAttr
from paddle.regularizer import L2Decay
from paddle.nn.initializer import Normal, Constant

from paddlers.models.ppdet.modeling.ops import get_act_fn
from paddlers.models.ppdet.modeling.layers import ConvNormLayer


class MobileOneBlock(nn.Layer):
    def __init__(self,
                 ch_in,
                 ch_out,
                 stride,
                 kernel_size,
                 conv_num=1,
                 norm_type='bn',
                 norm_decay=0.,
                 norm_groups=32,
                 bias_on=False,
                 lr_scale=1.,
                 freeze_norm=False,
                 initializer=Normal(mean=0., std=0.01),
                 skip_quant=False,
                 act='relu'):
        super(MobileOneBlock, self).__init__()

        self.ch_in = ch_in
        self.ch_out = ch_out
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = (kernel_size - 1) // 2
        self.k = conv_num

        # Training-time branches: k depthwise kxk conv+BN branches (stage 1)
        # followed by k pointwise 1x1 conv+BN branches (stage 2).
        self.depth_conv = nn.LayerList()
        self.point_conv = nn.LayerList()
        for _ in range(self.k):
            self.depth_conv.append(
                ConvNormLayer(
                    ch_in,
                    ch_in,
                    kernel_size,
                    stride=stride,
                    groups=ch_in,
                    norm_type=norm_type,
                    norm_decay=norm_decay,
                    norm_groups=norm_groups,
                    bias_on=bias_on,
                    lr_scale=lr_scale,
                    freeze_norm=freeze_norm,
                    initializer=initializer,
                    skip_quant=skip_quant))
            self.point_conv.append(
                ConvNormLayer(
                    ch_in,
                    ch_out,
                    1,
                    stride=1,
                    groups=1,
                    norm_type=norm_type,
                    norm_decay=norm_decay,
                    norm_groups=norm_groups,
                    bias_on=bias_on,
                    lr_scale=lr_scale,
                    freeze_norm=freeze_norm,
                    initializer=initializer,
                    skip_quant=skip_quant))
        # Parallel depthwise 1x1 conv+BN branch for stage 1.
        self.rbr_1x1 = ConvNormLayer(
            ch_in,
            ch_in,
            1,
            stride=self.stride,
            groups=ch_in,
            norm_type=norm_type,
            norm_decay=norm_decay,
            norm_groups=norm_groups,
            bias_on=bias_on,
            lr_scale=lr_scale,
            freeze_norm=freeze_norm,
            initializer=initializer,
            skip_quant=skip_quant)
        # Identity (BN-only) branches, available only when a skip connection
        # is shape-compatible (ch_in == ch_out and stride == 1).
        self.rbr_identity_st1 = nn.BatchNorm2D(
            num_features=ch_in,
            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
            bias_attr=ParamAttr(regularizer=L2Decay(
                0.0))) if ch_in == ch_out and self.stride == 1 else None
        self.rbr_identity_st2 = nn.BatchNorm2D(
            num_features=ch_out,
            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
            bias_attr=ParamAttr(regularizer=L2Decay(
                0.0))) if ch_in == ch_out and self.stride == 1 else None
        self.act = get_act_fn(act) if act is None or isinstance(act, (
            str, dict)) else act
|
    def forward(self, x):
        if hasattr(self, "conv1") and hasattr(self, "conv2"):
            # Deploy mode: all branches have been fused into two plain convs.
            y = self.act(self.conv2(self.act(self.conv1(x))))
        else:
            # Stage 1: sum of k depthwise kxk branches, the depthwise 1x1
            # branch, and the identity BN branch (if present).
            if self.rbr_identity_st1 is None:
                id_out_st1 = 0
            else:
                id_out_st1 = self.rbr_identity_st1(x)

            x1_1 = 0
            for i in range(self.k):
                x1_1 += self.depth_conv[i](x)

            x1_2 = self.rbr_1x1(x)
            x1 = self.act(x1_1 + x1_2 + id_out_st1)

            # Stage 2: sum of k pointwise 1x1 branches and the identity BN branch.
            if self.rbr_identity_st2 is None:
                id_out_st2 = 0
            else:
                id_out_st2 = self.rbr_identity_st2(x1)

            x2_1 = 0
            for i in range(self.k):
                x2_1 += self.point_conv[i](x1)
            y = self.act(x2_1 + id_out_st2)

        return y
|
    def convert_to_deploy(self):
        # Create the fused inference-time convs on first call.
        if not hasattr(self, 'conv1'):
            self.conv1 = nn.Conv2D(
                in_channels=self.ch_in,
                out_channels=self.ch_in,
                kernel_size=self.kernel_size,
                stride=self.stride,
                padding=self.padding,
                groups=self.ch_in,
                bias_attr=ParamAttr(
                    initializer=Constant(value=0.), learning_rate=1.))
        if not hasattr(self, 'conv2'):
            self.conv2 = nn.Conv2D(
                in_channels=self.ch_in,
                out_channels=self.ch_out,
                kernel_size=1,
                stride=1,
                padding='SAME',
                groups=1,
                bias_attr=ParamAttr(
                    initializer=Constant(value=0.), learning_rate=1.))

        # Copy the re-parameterized weights, then drop the training-time branches.
        conv1_kernel, conv1_bias, conv2_kernel, conv2_bias = \
            self.get_equivalent_kernel_bias()
        self.conv1.weight.set_value(conv1_kernel)
        self.conv1.bias.set_value(conv1_bias)
        self.conv2.weight.set_value(conv2_kernel)
        self.conv2.bias.set_value(conv2_bias)
        self.__delattr__('depth_conv')
        self.__delattr__('point_conv')
        self.__delattr__('rbr_1x1')
        if hasattr(self, 'rbr_identity_st1'):
            self.__delattr__('rbr_identity_st1')
        if hasattr(self, 'rbr_identity_st2'):
            self.__delattr__('rbr_identity_st2')
|
    def get_equivalent_kernel_bias(self):
        # Fuse every stage-1 branch (conv+BN, or BN-only identity) into an
        # equivalent kernel/bias pair, then sum them into one depthwise conv.
        st1_kernel3x3, st1_bias3x3 = self._fuse_bn_tensor(self.depth_conv)
        st1_kernel1x1, st1_bias1x1 = self._fuse_bn_tensor(self.rbr_1x1)
        st1_kernelid, st1_biasid = self._fuse_bn_tensor(
            self.rbr_identity_st1, kernel_size=self.kernel_size)

        # Same for the stage-2 pointwise branches.
        st2_kernel1x1, st2_bias1x1 = self._fuse_bn_tensor(self.point_conv)
        st2_kernelid, st2_biasid = self._fuse_bn_tensor(
            self.rbr_identity_st2, kernel_size=1)

        conv1_kernel = st1_kernel3x3 + self._pad_1x1_to_3x3_tensor(
            st1_kernel1x1) + st1_kernelid
        conv1_bias = st1_bias3x3 + st1_bias1x1 + st1_biasid

        conv2_kernel = st2_kernel1x1 + st2_kernelid
        conv2_bias = st2_bias1x1 + st2_biasid

        return conv1_kernel, conv1_bias, conv2_kernel, conv2_bias
|
    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
        if kernel1x1 is None:
            return 0
        else:
            padding_size = (self.kernel_size - 1) // 2
            return nn.functional.pad(
                kernel1x1,
                [padding_size, padding_size, padding_size, padding_size])
|
    def _fuse_bn_tensor(self, branch, kernel_size=3):
        if branch is None:
            return 0, 0

        if isinstance(branch, nn.LayerList):
            # A list of conv+BN branches: fuse each one, then sum the results.
            fused_kernels = []
            fused_bias = []
            for block in branch:
                kernel = block.conv.weight
                running_mean = block.norm._mean
                running_var = block.norm._variance
                gamma = block.norm.weight
                beta = block.norm.bias
                eps = block.norm._epsilon

                std = (running_var + eps).sqrt()
                t = (gamma / std).reshape((-1, 1, 1, 1))

                fused_kernels.append(kernel * t)
                fused_bias.append(beta - running_mean * gamma / std)

            return sum(fused_kernels), sum(fused_bias)

        elif isinstance(branch, ConvNormLayer):
            kernel = branch.conv.weight
            running_mean = branch.norm._mean
            running_var = branch.norm._variance
            gamma = branch.norm.weight
            beta = branch.norm.bias
            eps = branch.norm._epsilon
        else:
            # BN-only identity branch: build an explicit identity kernel so the
            # branch can be expressed as a conv before folding in the BN statistics.
            assert isinstance(branch, nn.BatchNorm2D)
            input_dim = self.ch_in if kernel_size == 1 else 1
            kernel_value = paddle.zeros(
                shape=[self.ch_in, input_dim, kernel_size, kernel_size],
                dtype='float32')
            if kernel_size > 1:
                for i in range(self.ch_in):
                    kernel_value[i, i % input_dim, (kernel_size - 1) // 2, (
                        kernel_size - 1) // 2] = 1
            elif kernel_size == 1:
                for i in range(self.ch_in):
                    kernel_value[i, i % input_dim, 0, 0] = 1
            else:
                raise ValueError("Invalid kernel size received!")
            kernel = paddle.to_tensor(kernel_value, place=branch.weight.place)
            running_mean = branch._mean
            running_var = branch._variance
            gamma = branch.weight
            beta = branch.bias
            eps = branch._epsilon

        # Fold BN into the conv: W' = W * gamma / std, b' = beta - mean * gamma / std.
        std = (running_var + eps).sqrt()
        t = (gamma / std).reshape((-1, 1, 1, 1))

        return kernel * t, beta - running_mean * gamma / std
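

# A minimal, illustrative usage sketch (not part of the original module). It assumes
# PaddlePaddle and the paddlers ppdet layers imported above are available, and that
# ConvNormLayer exposes the `conv`/`norm` sublayers accessed by _fuse_bn_tensor.
# It builds one block, switches to eval mode so BatchNorm uses its running statistics
# (the fusion is only exact then), re-parameterizes it with convert_to_deploy(), and
# checks that the fused convs reproduce the multi-branch output.
if __name__ == '__main__':
    paddle.seed(0)
    block = MobileOneBlock(
        ch_in=32, ch_out=32, stride=1, kernel_size=3, conv_num=4)
    block.eval()

    x = paddle.rand([1, 32, 56, 56])
    y_train = block(x)          # multi-branch (training-time) forward
    block.convert_to_deploy()   # fuse branches into conv1 / conv2
    y_deploy = block(x)         # single-path (deploy-time) forward

    # The two outputs should agree up to floating-point error.
    print(float(paddle.abs(y_train - y_deploy).max()))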