Merge pull request #40 from Bobholamovic/update_ppseg
[Feat] Update ppseg and Add CondenseNet V2
commit ebceda8419
143 changed files with 8360 additions and 1965 deletions
@@ -0,0 +1 @@
ppseg f6c73b478cdf00f40ae69edd35bf6bce2a1687ef
@@ -0,0 +1,135 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. |
||||||
|
# |
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License"); |
||||||
|
# you may not use this file except in compliance with the License. |
||||||
|
# You may obtain a copy of the License at |
||||||
|
# |
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0 |
||||||
|
# |
||||||
|
# Unless required by applicable law or agreed to in writing, software |
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS, |
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||||
|
# See the License for the specific language governing permissions and |
||||||
|
# limitations under the License. |
||||||
|
|
||||||
|
import os |
||||||
|
import numpy as np |
||||||
|
|
||||||
|
from paddlers.models.ppseg.datasets import Dataset |
||||||
|
from paddlers.models.ppseg.cvlibs import manager |
||||||
|
from paddlers.models.ppseg.transforms import Compose |
||||||
|
|
||||||
|
|
||||||
|
@manager.DATASETS.add_component |
||||||
|
class PSSLDataset(Dataset): |
||||||
|
""" |
||||||
|
The PSSL dataset for segmentation. PSSL is short for Pseudo Semantic Segmentation Labels, where the pseudo label |
||||||
|
is computed by the Consensus explanation algorithm. |
||||||
|
|
||||||
|
The PSSL refers to "Distilling Ensemble of Explanations for Weakly-Supervised Pre-Training of Image Segmentation |
||||||
|
Models" (https://arxiv.org/abs/2207.03335). |
||||||
|
|
||||||
|
The Consensus explanation refers to "Cross-Model Consensus of Explanations and Beyond for Image Classification |
||||||
|
Models: An Empirical Study" (https://arxiv.org/abs/2109.00707). |
||||||
|
|
||||||
|
To use this dataset, we need to additionally prepare the original ImageNet dataset, which has the folder structure
||||||
|
as follows: |
||||||
|
|
||||||
|
imagenet_root |
||||||
|
| |
||||||
|
|--train |
||||||
|
| |--n01440764 |
||||||
|
| | |--n01440764_10026.JPEG |
||||||
|
| | |--... |
||||||
|
| |--nxxxxxxxx |
||||||
|
| |--... |
||||||
|
|
||||||
|
where only the "train" set is needed. |
||||||
|
|
||||||
|
The PSSL dataset has the folder structure as follows: |
||||||
|
|
||||||
|
pssl_root |
||||||
|
| |
||||||
|
|--train |
||||||
|
| |--n01440764 |
||||||
|
| | |--n01440764_10026.JPEG_eiseg.npz |
||||||
|
| | |--... |
||||||
|
| |--nxxxxxxxx |
||||||
|
| |--... |
||||||
|
| |
||||||
|
|--imagenet_lsvrc_2015_synsets.txt |
||||||
|
|--train.txt |
||||||
|
|
||||||
|
where "train.txt" and "imagenet_lsvrc_2015_synsets.txt" are included in the PSSL dataset. |
||||||
|
|
||||||
|
Args: |
||||||
|
transforms (list): Transforms for image. |
||||||
|
imagenet_root (str): The path to the original ImageNet dataset. |
||||||
|
pssl_root (str): The path to the PSSL dataset. |
||||||
|
mode (str, optional): Which part of dataset to use. It is one of ('train', 'val', 'test'). Default: 'train'.
||||||
|
edge (bool, optional): Whether to compute edge while training. Default: False. |
||||||
|
""" |
||||||
|
ignore_index = 1001 # 0~999 is target class, 1000 is bg |
||||||
|
NUM_CLASSES = 1001 # consider target class and bg |
||||||
|
|
||||||
|
def __init__(self, |
||||||
|
transforms, |
||||||
|
imagenet_root, |
||||||
|
pssl_root, |
||||||
|
mode='train', |
||||||
|
edge=False): |
||||||
|
mode = mode.lower() |
||||||
|
if mode not in ['train']: |
||||||
|
raise ValueError("mode should be 'train', but got {}.".format(mode)) |
||||||
|
if transforms is None: |
||||||
|
raise ValueError("`transforms` is necessary, but it is None.") |
||||||
|
|
||||||
|
self.transforms = Compose(transforms) |
||||||
|
self.mode = mode |
||||||
|
self.edge = edge |
||||||
|
|
||||||
|
self.num_classes = self.NUM_CLASSES |
||||||
|
self.ignore_index = self.num_classes # 1001 |
||||||
|
self.file_list = [] |
||||||
|
self.class_id_dict = {} |
||||||
|
|
||||||
|
if imagenet_root is None or not os.path.isdir(pssl_root): |
||||||
|
raise ValueError( |
||||||
|
"The dataset is not Found or the folder structure is nonconfoumance." |
||||||
|
) |
||||||
|
|
||||||
|
train_list_file = os.path.join(pssl_root, "train.txt") |
||||||
|
if not os.path.exists(train_list_file): |
||||||
|
raise ValueError("Train list file isn't exists.") |
||||||
|
for idx, line in enumerate(open(train_list_file)): |
||||||
|
# line: train/n04118776/n04118776_45912.JPEG_eiseg.npz |
||||||
|
label_path = line.strip() |
||||||
|
img_path = label_path.split('.JPEG')[0] + '.JPEG' |
||||||
|
label_path = os.path.join(pssl_root, label_path) |
||||||
|
img_path = os.path.join(imagenet_root, img_path) |
||||||
|
self.file_list.append([img_path, label_path]) |
||||||
|
|
||||||
|
# mapping class name to class id. |
||||||
|
class_id_file = os.path.join(pssl_root, |
||||||
|
"imagenet_lsvrc_2015_synsets.txt") |
||||||
|
if not os.path.exists(class_id_file): |
||||||
|
raise ValueError("Class id file isn't exists.") |
||||||
|
for idx, line in enumerate(open(class_id_file)): |
||||||
|
class_name = line.strip() |
||||||
|
self.class_id_dict[class_name] = idx |
||||||
|
|
||||||
|
def __getitem__(self, idx): |
||||||
|
image_path, label_path = self.file_list[idx] |
||||||
|
|
||||||
|
# transform label |
||||||
|
class_name = (image_path.split('/')[-1]).split('_')[0] |
||||||
|
class_id = self.class_id_dict[class_name] |
||||||
|
|
||||||
|
pssl_seg = np.load(label_path)['arr_0'] |
||||||
|
gt_semantic_seg = np.zeros_like(pssl_seg, dtype=np.int64) + 1000 |
||||||
|
# [0, 999] for imagenet classes, 1000 for background, others(-1) will be ignored during training. |
||||||
|
gt_semantic_seg[pssl_seg == 1] = class_id |
||||||
|
|
||||||
|
im, label = self.transforms(im=image_path, label=gt_semantic_seg) |
||||||
|
|
||||||
|
return im, label |
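A minimal usage sketch for the PSSLDataset added above. The import path, transform list, and directory names are illustrative assumptions (they are not part of this diff); both roots must follow the folder structures documented in the class docstring.

import paddlers.models.ppseg.transforms as T
from paddlers.models.ppseg.datasets import PSSLDataset  # assuming the class is exported by the datasets package

# Hypothetical local paths for the ImageNet train set and the PSSL pseudo labels.
dataset = PSSLDataset(
    transforms=[T.RandomHorizontalFlip(), T.Normalize()],
    imagenet_root='data/imagenet_root',
    pssl_root='data/pssl_root',
    mode='train')
im, label = dataset[0]  # transformed image and a 1001-way pseudo label map (class id or 1000 for background)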
@@ -0,0 +1,318 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
||||||
|
# |
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License"); |
||||||
|
# you may not use this file except in compliance with the License. |
||||||
|
# You may obtain a copy of the License at |
||||||
|
# |
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0 |
||||||
|
# |
||||||
|
# Unless required by applicable law or agreed to in writing, software |
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS, |
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||||
|
# See the License for the specific language governing permissions and |
||||||
|
# limitations under the License. |
||||||
|
|
||||||
|
# Code was based on https://github.com/huawei-noah/CV-Backbones/tree/master/ghostnet_pytorch |
||||||
|
|
||||||
|
import math |
||||||
|
import paddle |
||||||
|
from paddle import ParamAttr |
||||||
|
import paddle.nn as nn |
||||||
|
import paddle.nn.functional as F |
||||||
|
from paddle.nn import Conv2D, BatchNorm, AdaptiveAvgPool2D, Linear |
||||||
|
from paddle.regularizer import L2Decay |
||||||
|
from paddle.nn.initializer import Uniform, KaimingNormal |
||||||
|
|
||||||
|
from paddlers.models.ppseg.cvlibs import manager |
||||||
|
from paddlers.models.ppseg.utils import utils, logger |
||||||
|
|
||||||
|
__all__ = ["GhostNet_x0_5", "GhostNet_x1_0", "GhostNet_x1_3"] |
||||||
|
|
||||||
|
|
||||||
|
class ConvBNLayer(nn.Layer): |
||||||
|
def __init__(self, |
||||||
|
in_channels, |
||||||
|
out_channels, |
||||||
|
kernel_size, |
||||||
|
stride=1, |
||||||
|
groups=1, |
||||||
|
act="relu", |
||||||
|
name=None): |
||||||
|
super(ConvBNLayer, self).__init__() |
||||||
|
self._conv = Conv2D( |
||||||
|
in_channels=in_channels, |
||||||
|
out_channels=out_channels, |
||||||
|
kernel_size=kernel_size, |
||||||
|
stride=stride, |
||||||
|
padding=(kernel_size - 1) // 2, |
||||||
|
groups=groups, |
||||||
|
weight_attr=ParamAttr( |
||||||
|
initializer=KaimingNormal(), name=name + "_weights"), |
||||||
|
bias_attr=False) |
||||||
|
bn_name = name + "_bn" |
||||||
|
|
||||||
|
self._batch_norm = BatchNorm( |
||||||
|
num_channels=out_channels, |
||||||
|
act=act, |
||||||
|
param_attr=ParamAttr( |
||||||
|
name=bn_name + "_scale", regularizer=L2Decay(0.0)), |
||||||
|
bias_attr=ParamAttr( |
||||||
|
name=bn_name + "_offset", regularizer=L2Decay(0.0)), |
||||||
|
moving_mean_name=bn_name + "_mean", |
||||||
|
moving_variance_name=bn_name + "_variance") |
||||||
|
|
||||||
|
def forward(self, inputs): |
||||||
|
y = self._conv(inputs) |
||||||
|
y = self._batch_norm(y) |
||||||
|
return y |
||||||
|
|
||||||
|
|
||||||
|
class SEBlock(nn.Layer): |
||||||
|
def __init__(self, num_channels, reduction_ratio=4, name=None): |
||||||
|
super(SEBlock, self).__init__() |
||||||
|
self.pool2d_gap = AdaptiveAvgPool2D(1) |
||||||
|
self._num_channels = num_channels |
||||||
|
stdv = 1.0 / math.sqrt(num_channels * 1.0) |
||||||
|
med_ch = num_channels // reduction_ratio |
||||||
|
self.squeeze = Linear( |
||||||
|
num_channels, |
||||||
|
med_ch, |
||||||
|
weight_attr=ParamAttr( |
||||||
|
initializer=Uniform(-stdv, stdv), name=name + "_1_weights"), |
||||||
|
bias_attr=ParamAttr(name=name + "_1_offset")) |
||||||
|
stdv = 1.0 / math.sqrt(med_ch * 1.0) |
||||||
|
self.excitation = Linear( |
||||||
|
med_ch, |
||||||
|
num_channels, |
||||||
|
weight_attr=ParamAttr( |
||||||
|
initializer=Uniform(-stdv, stdv), name=name + "_2_weights"), |
||||||
|
bias_attr=ParamAttr(name=name + "_2_offset")) |
||||||
|
|
||||||
|
def forward(self, inputs): |
||||||
|
pool = self.pool2d_gap(inputs) |
||||||
|
pool = paddle.squeeze(pool, axis=[2, 3]) |
||||||
|
squeeze = self.squeeze(pool) |
||||||
|
squeeze = F.relu(squeeze) |
||||||
|
excitation = self.excitation(squeeze) |
||||||
|
excitation = paddle.clip(x=excitation, min=0, max=1) |
||||||
|
excitation = paddle.unsqueeze(excitation, axis=[2, 3]) |
||||||
|
out = paddle.multiply(inputs, excitation) |
||||||
|
return out |
||||||
|
|
||||||
|
|
||||||
|
class GhostModule(nn.Layer): |
||||||
|
def __init__(self, |
||||||
|
in_channels, |
||||||
|
output_channels, |
||||||
|
kernel_size=1, |
||||||
|
ratio=2, |
||||||
|
dw_size=3, |
||||||
|
stride=1, |
||||||
|
relu=True, |
||||||
|
name=None): |
||||||
|
super(GhostModule, self).__init__() |
||||||
|
init_channels = int(math.ceil(output_channels / ratio)) |
||||||
|
new_channels = int(init_channels * (ratio - 1)) |
||||||
|
self.primary_conv = ConvBNLayer( |
||||||
|
in_channels=in_channels, |
||||||
|
out_channels=init_channels, |
||||||
|
kernel_size=kernel_size, |
||||||
|
stride=stride, |
||||||
|
groups=1, |
||||||
|
act="relu" if relu else None, |
||||||
|
name=name + "_primary_conv") |
||||||
|
self.cheap_operation = ConvBNLayer( |
||||||
|
in_channels=init_channels, |
||||||
|
out_channels=new_channels, |
||||||
|
kernel_size=dw_size, |
||||||
|
stride=1, |
||||||
|
groups=init_channels, |
||||||
|
act="relu" if relu else None, |
||||||
|
name=name + "_cheap_operation") |
||||||
|
|
||||||
|
def forward(self, inputs): |
||||||
|
x = self.primary_conv(inputs) |
||||||
|
y = self.cheap_operation(x) |
||||||
|
out = paddle.concat([x, y], axis=1) |
||||||
|
return out |
||||||
|
|
||||||
|
|
||||||
|
class GhostBottleneck(nn.Layer): |
||||||
|
def __init__(self, |
||||||
|
in_channels, |
||||||
|
hidden_dim, |
||||||
|
output_channels, |
||||||
|
kernel_size, |
||||||
|
stride, |
||||||
|
use_se, |
||||||
|
name=None): |
||||||
|
super(GhostBottleneck, self).__init__() |
||||||
|
self._stride = stride |
||||||
|
self._use_se = use_se |
||||||
|
self._num_channels = in_channels |
||||||
|
self._output_channels = output_channels |
||||||
|
self.ghost_module_1 = GhostModule( |
||||||
|
in_channels=in_channels, |
||||||
|
output_channels=hidden_dim, |
||||||
|
kernel_size=1, |
||||||
|
stride=1, |
||||||
|
relu=True, |
||||||
|
name=name + "_ghost_module_1") |
||||||
|
if stride == 2: |
||||||
|
self.depthwise_conv = ConvBNLayer( |
||||||
|
in_channels=hidden_dim, |
||||||
|
out_channels=hidden_dim, |
||||||
|
kernel_size=kernel_size, |
||||||
|
stride=stride, |
||||||
|
groups=hidden_dim, |
||||||
|
act=None, |
||||||
|
name=name + |
||||||
|
"_depthwise_depthwise" # looks strange due to an old typo, will be fixed later. |
||||||
|
) |
||||||
|
if use_se: |
||||||
|
self.se_block = SEBlock(num_channels=hidden_dim, name=name + "_se") |
||||||
|
self.ghost_module_2 = GhostModule( |
||||||
|
in_channels=hidden_dim, |
||||||
|
output_channels=output_channels, |
||||||
|
kernel_size=1, |
||||||
|
relu=False, |
||||||
|
name=name + "_ghost_module_2") |
||||||
|
if stride != 1 or in_channels != output_channels: |
||||||
|
self.shortcut_depthwise = ConvBNLayer( |
||||||
|
in_channels=in_channels, |
||||||
|
out_channels=in_channels, |
||||||
|
kernel_size=kernel_size, |
||||||
|
stride=stride, |
||||||
|
groups=in_channels, |
||||||
|
act=None, |
||||||
|
name=name + |
||||||
|
"_shortcut_depthwise_depthwise" # looks strange due to an old typo, will be fixed later. |
||||||
|
) |
||||||
|
self.shortcut_conv = ConvBNLayer( |
||||||
|
in_channels=in_channels, |
||||||
|
out_channels=output_channels, |
||||||
|
kernel_size=1, |
||||||
|
stride=1, |
||||||
|
groups=1, |
||||||
|
act=None, |
||||||
|
name=name + "_shortcut_conv") |
||||||
|
|
||||||
|
def forward(self, inputs): |
||||||
|
x = self.ghost_module_1(inputs) |
||||||
|
if self._stride == 2: |
||||||
|
x = self.depthwise_conv(x) |
||||||
|
if self._use_se: |
||||||
|
x = self.se_block(x) |
||||||
|
x = self.ghost_module_2(x) |
||||||
|
if self._stride == 1 and self._num_channels == self._output_channels: |
||||||
|
shortcut = inputs |
||||||
|
else: |
||||||
|
shortcut = self.shortcut_depthwise(inputs) |
||||||
|
shortcut = self.shortcut_conv(shortcut) |
||||||
|
return paddle.add(x=x, y=shortcut) |
||||||
|
|
||||||
|
|
||||||
|
class GhostNet(nn.Layer): |
||||||
|
def __init__(self, scale, in_channels=3, pretrained=None): |
||||||
|
super(GhostNet, self).__init__() |
||||||
|
self.cfgs = [ |
||||||
|
# k, t, c, SE, s |
||||||
|
[3, 16, 16, 0, 1], |
||||||
|
[3, 48, 24, 0, 2], |
||||||
|
[3, 72, 24, 0, 1], # x4 |
||||||
|
[5, 72, 40, 1, 2], |
||||||
|
[5, 120, 40, 1, 1], # x8 |
||||||
|
[3, 240, 80, 0, 2], |
||||||
|
[3, 200, 80, 0, 1], |
||||||
|
[3, 184, 80, 0, 1], |
||||||
|
[3, 184, 80, 0, 1], |
||||||
|
[3, 480, 112, 1, 1], |
||||||
|
[3, 672, 112, 1, 1], # x16 |
||||||
|
[5, 672, 160, 1, 2], |
||||||
|
[5, 960, 160, 0, 1], |
||||||
|
[5, 960, 160, 1, 1], |
||||||
|
[5, 960, 160, 0, 1], |
||||||
|
[5, 960, 160, 1, 1] # x32 |
||||||
|
] |
||||||
|
self.scale = scale |
||||||
|
self.pretrained = pretrained |
||||||
|
|
||||||
|
output_channels = int(self._make_divisible(16 * self.scale, 4)) |
||||||
|
self.conv1 = ConvBNLayer( |
||||||
|
in_channels=in_channels, |
||||||
|
out_channels=output_channels, |
||||||
|
kernel_size=3, |
||||||
|
stride=2, |
||||||
|
groups=1, |
||||||
|
act="relu", |
||||||
|
name="conv1") |
||||||
|
|
||||||
|
# build inverted residual blocks |
||||||
|
self.out_index = [2, 4, 10, 15] |
||||||
|
self.feat_channels = [] |
||||||
|
self.ghost_bottleneck_list = [] |
||||||
|
for idx, (k, exp_size, c, use_se, s) in enumerate(self.cfgs): |
||||||
|
in_channels = output_channels |
||||||
|
output_channels = int(self._make_divisible(c * self.scale, 4)) |
||||||
|
hidden_dim = int(self._make_divisible(exp_size * self.scale, 4)) |
||||||
|
ghost_bottleneck = self.add_sublayer( |
||||||
|
name="_ghostbottleneck_" + str(idx), |
||||||
|
sublayer=GhostBottleneck( |
||||||
|
in_channels=in_channels, |
||||||
|
hidden_dim=hidden_dim, |
||||||
|
output_channels=output_channels, |
||||||
|
kernel_size=k, |
||||||
|
stride=s, |
||||||
|
use_se=use_se, |
||||||
|
name="_ghostbottleneck_" + str(idx))) |
||||||
|
self.ghost_bottleneck_list.append(ghost_bottleneck) |
||||||
|
if idx in self.out_index: |
||||||
|
self.feat_channels.append(output_channels) |
||||||
|
|
||||||
|
self.init_weight() |
||||||
|
|
||||||
|
def init_weight(self): |
||||||
|
if self.pretrained is not None: |
||||||
|
utils.load_entire_model(self, self.pretrained) |
||||||
|
|
||||||
|
def forward(self, inputs): |
||||||
|
feat_list = [] |
||||||
|
x = self.conv1(inputs) |
||||||
|
for idx, ghost_bottleneck in enumerate(self.ghost_bottleneck_list): |
||||||
|
x = ghost_bottleneck(x) |
||||||
|
if idx in self.out_index: |
||||||
|
feat_list.append(x) |
||||||
|
return feat_list |
||||||
|
|
||||||
|
def _make_divisible(self, v, divisor, min_value=None): |
||||||
|
""" |
||||||
|
This function is taken from the original tf repo. |
||||||
|
It ensures that all layers have a channel number that is divisible by 8 |
||||||
|
It can be seen here: |
||||||
|
https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py |
||||||
|
""" |
||||||
|
if min_value is None: |
||||||
|
min_value = divisor |
||||||
|
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) |
||||||
|
# Make sure that round down does not go down by more than 10%. |
||||||
|
if new_v < 0.9 * v: |
||||||
|
new_v += divisor |
||||||
|
return new_v |
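A quick hand-worked check of the rounding above (plain arithmetic, not part of the diff), shown for divisor=4 as used when building GhostNet channels:

# scale 1.3, base width 16: 16 * 1.3 = 20.8
#   int(20.8 + 4 / 2) // 4 * 4 = 22 // 4 * 4 = 20, and 20 >= 0.9 * 20.8, so 20 channels
# scale 1.3, expansion size 112: 112 * 1.3 = 145.6
#   int(145.6 + 2) // 4 * 4 = 147 // 4 * 4 = 144, and 144 >= 0.9 * 145.6, so 144 channels
# only if the rounded value fell below 90% of v would the divisor be added back once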
||||||
|
|
||||||
|
|
||||||
|
@manager.BACKBONES.add_component |
||||||
|
def GhostNet_x0_5(**kwargs): |
||||||
|
model = GhostNet(scale=0.5, **kwargs) |
||||||
|
return model |
||||||
|
|
||||||
|
|
||||||
|
@manager.BACKBONES.add_component |
||||||
|
def GhostNet_x1_0(**kwargs): |
||||||
|
model = GhostNet(scale=1.0, **kwargs) |
||||||
|
return model |
||||||
|
|
||||||
|
|
||||||
|
@manager.BACKBONES.add_component |
||||||
|
def GhostNet_x1_3(**kwargs): |
||||||
|
model = GhostNet(scale=1.3, **kwargs) |
||||||
|
return model |
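A short sketch of using the GhostNet backbone registered above (the dummy input is illustrative; the feature pyramid behaviour follows from out_index and the stride comments in self.cfgs):

import paddle
backbone = GhostNet_x1_0()                        # scale 1.0, in_channels=3, no pretrained weights
feats = backbone(paddle.randn([1, 3, 512, 512]))  # features taken after blocks 2, 4, 10 and 15
print([f.shape for f in feats])                   # four levels at strides 4/8/16/32
print(backbone.feat_channels)                     # channel counts matching those levels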
@@ -0,0 +1,974 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. |
||||||
|
# |
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License"); |
||||||
|
# you may not use this file except in compliance with the License. |
||||||
|
# You may obtain a copy of the License at |
||||||
|
# |
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0 |
||||||
|
# |
||||||
|
# Unless required by applicable law or agreed to in writing, software |
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS, |
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||||
|
# See the License for the specific language governing permissions and |
||||||
|
# limitations under the License. |
||||||
|
""" |
||||||
|
This code is based on |
||||||
|
https://github.com/HRNet/Lite-HRNet/blob/hrnet/models/backbones/litehrnet.py |
||||||
|
""" |
||||||
|
|
||||||
|
import paddle |
||||||
|
import paddle.nn as nn |
||||||
|
import paddle.nn.functional as F |
||||||
|
from numbers import Integral |
||||||
|
from paddle import ParamAttr |
||||||
|
from paddle.regularizer import L2Decay |
||||||
|
from paddle.nn.initializer import Normal, Constant |
||||||
|
|
||||||
|
from paddlers.models.ppseg.cvlibs import manager |
||||||
|
from paddlers.models.ppseg import utils |
||||||
|
|
||||||
|
__all__ = [ |
||||||
|
"Lite_HRNet_18", "Lite_HRNet_30", "Lite_HRNet_naive", |
||||||
|
"Lite_HRNet_wider_naive", "LiteHRNet" |
||||||
|
] |
||||||
|
|
||||||
|
|
||||||
|
def Conv2d(in_channels, |
||||||
|
out_channels, |
||||||
|
kernel_size, |
||||||
|
stride=1, |
||||||
|
padding=0, |
||||||
|
dilation=1, |
||||||
|
groups=1, |
||||||
|
bias=True, |
||||||
|
weight_init=Normal(std=0.001), |
||||||
|
bias_init=Constant(0.)): |
||||||
|
weight_attr = paddle.framework.ParamAttr(initializer=weight_init) |
||||||
|
if bias: |
||||||
|
bias_attr = paddle.framework.ParamAttr(initializer=bias_init) |
||||||
|
else: |
||||||
|
bias_attr = False |
||||||
|
conv = nn.Conv2D( |
||||||
|
in_channels, |
||||||
|
out_channels, |
||||||
|
kernel_size, |
||||||
|
stride, |
||||||
|
padding, |
||||||
|
dilation, |
||||||
|
groups, |
||||||
|
weight_attr=weight_attr, |
||||||
|
bias_attr=bias_attr) |
||||||
|
return conv |
||||||
|
|
||||||
|
|
||||||
|
def channel_shuffle(x, groups): |
||||||
|
x_shape = paddle.shape(x) |
||||||
|
batch_size, height, width = x_shape[0], x_shape[2], x_shape[3] |
||||||
|
num_channels = x.shape[1] |
||||||
|
channels_per_group = num_channels // groups |
||||||
|
|
||||||
|
x = paddle.reshape( |
||||||
|
x=x, shape=[batch_size, groups, channels_per_group, height, width]) |
||||||
|
x = paddle.transpose(x=x, perm=[0, 2, 1, 3, 4]) |
||||||
|
x = paddle.reshape(x=x, shape=[batch_size, num_channels, height, width]) |
||||||
|
|
||||||
|
return x |
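A tiny illustration of what channel_shuffle does (not part of the diff): with 8 channels and groups=2, the channels of the two halves are interleaved.

import paddle
x = paddle.arange(8, dtype='float32').reshape([1, 8, 1, 1])
y = channel_shuffle(x, groups=2)
# channel order changes from [0, 1, 2, 3, 4, 5, 6, 7] to [0, 4, 1, 5, 2, 6, 3, 7]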
||||||
|
|
||||||
|
|
||||||
|
class ConvNormLayer(nn.Layer): |
||||||
|
def __init__(self, |
||||||
|
ch_in, |
||||||
|
ch_out, |
||||||
|
filter_size, |
||||||
|
stride=1, |
||||||
|
groups=1, |
||||||
|
norm_type=None, |
||||||
|
norm_groups=32, |
||||||
|
norm_decay=0., |
||||||
|
freeze_norm=False, |
||||||
|
act=None): |
||||||
|
super(ConvNormLayer, self).__init__() |
||||||
|
self.act = act |
||||||
|
norm_lr = 0. if freeze_norm else 1. |
||||||
|
if norm_type is not None: |
||||||
|
assert norm_type in ['bn', 'sync_bn', 'gn'], \ |
||||||
|
"norm_type should be one of ['bn', 'sync_bn', 'gn'], but got {}".format(norm_type) |
||||||
|
param_attr = ParamAttr( |
||||||
|
initializer=Constant(1.0), |
||||||
|
learning_rate=norm_lr, |
||||||
|
regularizer=L2Decay(norm_decay), ) |
||||||
|
bias_attr = ParamAttr( |
||||||
|
learning_rate=norm_lr, regularizer=L2Decay(norm_decay)) |
||||||
|
global_stats = True if freeze_norm else None |
||||||
|
if norm_type in ['bn', 'sync_bn']: |
||||||
|
self.norm = nn.BatchNorm2D( |
||||||
|
ch_out, |
||||||
|
weight_attr=param_attr, |
||||||
|
bias_attr=bias_attr, |
||||||
|
use_global_stats=global_stats, ) |
||||||
|
elif norm_type == 'gn': |
||||||
|
self.norm = nn.GroupNorm( |
||||||
|
num_groups=norm_groups, |
||||||
|
num_channels=ch_out, |
||||||
|
weight_attr=param_attr, |
||||||
|
bias_attr=bias_attr) |
||||||
|
norm_params = self.norm.parameters() |
||||||
|
if freeze_norm: |
||||||
|
for param in norm_params: |
||||||
|
param.stop_gradient = True |
||||||
|
conv_bias_attr = False |
||||||
|
else: |
||||||
|
conv_bias_attr = True |
||||||
|
self.norm = None |
||||||
|
|
||||||
|
self.conv = nn.Conv2D( |
||||||
|
in_channels=ch_in, |
||||||
|
out_channels=ch_out, |
||||||
|
kernel_size=filter_size, |
||||||
|
stride=stride, |
||||||
|
padding=(filter_size - 1) // 2, |
||||||
|
groups=groups, |
||||||
|
weight_attr=ParamAttr(initializer=Normal( |
||||||
|
mean=0., std=0.001)), |
||||||
|
bias_attr=conv_bias_attr) |
||||||
|
|
||||||
|
def forward(self, inputs): |
||||||
|
out = self.conv(inputs) |
||||||
|
if self.norm is not None: |
||||||
|
out = self.norm(out) |
||||||
|
|
||||||
|
if self.act == 'relu': |
||||||
|
out = F.relu(out) |
||||||
|
elif self.act == 'sigmoid': |
||||||
|
out = F.sigmoid(out) |
||||||
|
return out |
||||||
|
|
||||||
|
|
||||||
|
class DepthWiseSeparableConvNormLayer(nn.Layer): |
||||||
|
def __init__(self, |
||||||
|
ch_in, |
||||||
|
ch_out, |
||||||
|
filter_size, |
||||||
|
stride=1, |
||||||
|
dw_norm_type=None, |
||||||
|
pw_norm_type=None, |
||||||
|
norm_decay=0., |
||||||
|
freeze_norm=False, |
||||||
|
dw_act=None, |
||||||
|
pw_act=None): |
||||||
|
super(DepthWiseSeparableConvNormLayer, self).__init__() |
||||||
|
self.depthwise_conv = ConvNormLayer( |
||||||
|
ch_in=ch_in, |
||||||
|
ch_out=ch_in, |
||||||
|
filter_size=filter_size, |
||||||
|
stride=stride, |
||||||
|
groups=ch_in, |
||||||
|
norm_type=dw_norm_type, |
||||||
|
act=dw_act, |
||||||
|
norm_decay=norm_decay, |
||||||
|
freeze_norm=freeze_norm, ) |
||||||
|
self.pointwise_conv = ConvNormLayer( |
||||||
|
ch_in=ch_in, |
||||||
|
ch_out=ch_out, |
||||||
|
filter_size=1, |
||||||
|
stride=1, |
||||||
|
norm_type=pw_norm_type, |
||||||
|
act=pw_act, |
||||||
|
norm_decay=norm_decay, |
||||||
|
freeze_norm=freeze_norm, ) |
||||||
|
|
||||||
|
def forward(self, x): |
||||||
|
x = self.depthwise_conv(x) |
||||||
|
x = self.pointwise_conv(x) |
||||||
|
return x |
||||||
|
|
||||||
|
|
||||||
|
class CrossResolutionWeightingModule(nn.Layer): |
||||||
|
def __init__(self, |
||||||
|
channels, |
||||||
|
ratio=16, |
||||||
|
norm_type='bn', |
||||||
|
freeze_norm=False, |
||||||
|
norm_decay=0.): |
||||||
|
super(CrossResolutionWeightingModule, self).__init__() |
||||||
|
self.channels = channels |
||||||
|
total_channel = sum(channels) |
||||||
|
self.conv1 = ConvNormLayer( |
||||||
|
ch_in=total_channel, |
||||||
|
ch_out=total_channel // ratio, |
||||||
|
filter_size=1, |
||||||
|
stride=1, |
||||||
|
norm_type=norm_type, |
||||||
|
act='relu', |
||||||
|
freeze_norm=freeze_norm, |
||||||
|
norm_decay=norm_decay) |
||||||
|
self.conv2 = ConvNormLayer( |
||||||
|
ch_in=total_channel // ratio, |
||||||
|
ch_out=total_channel, |
||||||
|
filter_size=1, |
||||||
|
stride=1, |
||||||
|
norm_type=norm_type, |
||||||
|
act='sigmoid', |
||||||
|
freeze_norm=freeze_norm, |
||||||
|
norm_decay=norm_decay) |
||||||
|
|
||||||
|
def forward(self, x): |
||||||
|
out = [] |
||||||
|
for idx, xi in enumerate(x[:-1]): |
||||||
|
kernel_size = stride = pow(2, len(x) - idx - 1) |
||||||
|
xi = F.avg_pool2d(xi, kernel_size=kernel_size, stride=stride) |
||||||
|
out.append(xi) |
||||||
|
out.append(x[-1]) |
||||||
|
|
||||||
|
out = paddle.concat(out, 1) |
||||||
|
out = self.conv1(out) |
||||||
|
out = self.conv2(out) |
||||||
|
out = paddle.split(out, self.channels, 1) |
||||||
|
out = [ |
||||||
|
s * F.interpolate( |
||||||
|
a, paddle.shape(s)[-2:], mode='nearest') for s, a in zip(x, out) |
||||||
|
] |
||||||
|
return out |
||||||
|
|
||||||
|
|
||||||
|
class SpatialWeightingModule(nn.Layer): |
||||||
|
def __init__(self, in_channel, ratio=16, freeze_norm=False, norm_decay=0.): |
||||||
|
super(SpatialWeightingModule, self).__init__() |
||||||
|
self.global_avgpooling = nn.AdaptiveAvgPool2D(1) |
||||||
|
self.conv1 = ConvNormLayer( |
||||||
|
ch_in=in_channel, |
||||||
|
ch_out=in_channel // ratio, |
||||||
|
filter_size=1, |
||||||
|
stride=1, |
||||||
|
act='relu', |
||||||
|
freeze_norm=freeze_norm, |
||||||
|
norm_decay=norm_decay) |
||||||
|
self.conv2 = ConvNormLayer( |
||||||
|
ch_in=in_channel // ratio, |
||||||
|
ch_out=in_channel, |
||||||
|
filter_size=1, |
||||||
|
stride=1, |
||||||
|
act='sigmoid', |
||||||
|
freeze_norm=freeze_norm, |
||||||
|
norm_decay=norm_decay) |
||||||
|
|
||||||
|
def forward(self, x): |
||||||
|
out = self.global_avgpooling(x) |
||||||
|
out = self.conv1(out) |
||||||
|
out = self.conv2(out) |
||||||
|
return x * out |
||||||
|
|
||||||
|
|
||||||
|
class ConditionalChannelWeightingBlock(nn.Layer): |
||||||
|
def __init__(self, |
||||||
|
in_channels, |
||||||
|
stride, |
||||||
|
reduce_ratio, |
||||||
|
norm_type='bn', |
||||||
|
freeze_norm=False, |
||||||
|
norm_decay=0.): |
||||||
|
super(ConditionalChannelWeightingBlock, self).__init__() |
||||||
|
assert stride in [1, 2] |
||||||
|
branch_channels = [channel // 2 for channel in in_channels] |
||||||
|
|
||||||
|
self.cross_resolution_weighting = CrossResolutionWeightingModule( |
||||||
|
branch_channels, |
||||||
|
ratio=reduce_ratio, |
||||||
|
norm_type=norm_type, |
||||||
|
freeze_norm=freeze_norm, |
||||||
|
norm_decay=norm_decay) |
||||||
|
self.depthwise_convs = nn.LayerList([ |
||||||
|
ConvNormLayer( |
||||||
|
channel, |
||||||
|
channel, |
||||||
|
filter_size=3, |
||||||
|
stride=stride, |
||||||
|
groups=channel, |
||||||
|
norm_type=norm_type, |
||||||
|
freeze_norm=freeze_norm, |
||||||
|
norm_decay=norm_decay) for channel in branch_channels |
||||||
|
]) |
||||||
|
|
||||||
|
self.spatial_weighting = nn.LayerList([ |
||||||
|
SpatialWeightingModule( |
||||||
|
channel, |
||||||
|
ratio=4, |
||||||
|
freeze_norm=freeze_norm, |
||||||
|
norm_decay=norm_decay) for channel in branch_channels |
||||||
|
]) |
||||||
|
|
||||||
|
def forward(self, x): |
||||||
|
x = [s.chunk(2, axis=1) for s in x] |
||||||
|
x1 = [s[0] for s in x] |
||||||
|
x2 = [s[1] for s in x] |
||||||
|
|
||||||
|
x2 = self.cross_resolution_weighting(x2) |
||||||
|
x2 = [dw(s) for s, dw in zip(x2, self.depthwise_convs)] |
||||||
|
x2 = [sw(s) for s, sw in zip(x2, self.spatial_weighting)] |
||||||
|
|
||||||
|
out = [paddle.concat([s1, s2], axis=1) for s1, s2 in zip(x1, x2)] |
||||||
|
out = [channel_shuffle(s, groups=2) for s in out] |
||||||
|
return out |
||||||
|
|
||||||
|
|
||||||
|
class ShuffleUnit(nn.Layer): |
||||||
|
def __init__(self, |
||||||
|
in_channel, |
||||||
|
out_channel, |
||||||
|
stride, |
||||||
|
norm_type='bn', |
||||||
|
freeze_norm=False, |
||||||
|
norm_decay=0.): |
||||||
|
super(ShuffleUnit, self).__init__() |
||||||
|
branch_channel = out_channel // 2 |
||||||
|
self.stride = stride |
||||||
|
if self.stride == 1: |
||||||
|
assert in_channel == branch_channel * 2, \ |
||||||
|
"when stride=1, in_channel {} should equal to branch_channel*2 {}".format(in_channel, branch_channel * 2) |
||||||
|
if stride > 1: |
||||||
|
self.branch1 = nn.Sequential( |
||||||
|
ConvNormLayer( |
||||||
|
ch_in=in_channel, |
||||||
|
ch_out=in_channel, |
||||||
|
filter_size=3, |
||||||
|
stride=self.stride, |
||||||
|
groups=in_channel, |
||||||
|
norm_type=norm_type, |
||||||
|
freeze_norm=freeze_norm, |
||||||
|
norm_decay=norm_decay), |
||||||
|
ConvNormLayer( |
||||||
|
ch_in=in_channel, |
||||||
|
ch_out=branch_channel, |
||||||
|
filter_size=1, |
||||||
|
stride=1, |
||||||
|
norm_type=norm_type, |
||||||
|
act='relu', |
||||||
|
freeze_norm=freeze_norm, |
||||||
|
norm_decay=norm_decay), ) |
||||||
|
self.branch2 = nn.Sequential( |
||||||
|
ConvNormLayer( |
||||||
|
ch_in=branch_channel if stride == 1 else in_channel, |
||||||
|
ch_out=branch_channel, |
||||||
|
filter_size=1, |
||||||
|
stride=1, |
||||||
|
norm_type=norm_type, |
||||||
|
act='relu', |
||||||
|
freeze_norm=freeze_norm, |
||||||
|
norm_decay=norm_decay), |
||||||
|
ConvNormLayer( |
||||||
|
ch_in=branch_channel, |
||||||
|
ch_out=branch_channel, |
||||||
|
filter_size=3, |
||||||
|
stride=self.stride, |
||||||
|
groups=branch_channel, |
||||||
|
norm_type=norm_type, |
||||||
|
freeze_norm=freeze_norm, |
||||||
|
norm_decay=norm_decay), |
||||||
|
ConvNormLayer( |
||||||
|
ch_in=branch_channel, |
||||||
|
ch_out=branch_channel, |
||||||
|
filter_size=1, |
||||||
|
stride=1, |
||||||
|
norm_type=norm_type, |
||||||
|
act='relu', |
||||||
|
freeze_norm=freeze_norm, |
||||||
|
norm_decay=norm_decay), ) |
||||||
|
|
||||||
|
def forward(self, x): |
||||||
|
if self.stride > 1: |
||||||
|
x1 = self.branch1(x) |
||||||
|
x2 = self.branch2(x) |
||||||
|
else: |
||||||
|
x1, x2 = x.chunk(2, axis=1) |
||||||
|
x2 = self.branch2(x2) |
||||||
|
out = paddle.concat([x1, x2], axis=1) |
||||||
|
out = channel_shuffle(out, groups=2) |
||||||
|
return out |
||||||
|
|
||||||
|
|
||||||
|
class IterativeHead(nn.Layer): |
||||||
|
def __init__(self, |
||||||
|
in_channels, |
||||||
|
norm_type='bn', |
||||||
|
freeze_norm=False, |
||||||
|
norm_decay=0.): |
||||||
|
super(IterativeHead, self).__init__() |
||||||
|
num_branches = len(in_channels) |
||||||
|
self.in_channels = in_channels[::-1] |
||||||
|
|
||||||
|
projects = [] |
||||||
|
for i in range(num_branches): |
||||||
|
if i != num_branches - 1: |
||||||
|
projects.append( |
||||||
|
DepthWiseSeparableConvNormLayer( |
||||||
|
ch_in=self.in_channels[i], |
||||||
|
ch_out=self.in_channels[i + 1], |
||||||
|
filter_size=3, |
||||||
|
stride=1, |
||||||
|
dw_act=None, |
||||||
|
pw_act='relu', |
||||||
|
dw_norm_type=norm_type, |
||||||
|
pw_norm_type=norm_type, |
||||||
|
freeze_norm=freeze_norm, |
||||||
|
norm_decay=norm_decay)) |
||||||
|
else: |
||||||
|
projects.append( |
||||||
|
DepthWiseSeparableConvNormLayer( |
||||||
|
ch_in=self.in_channels[i], |
||||||
|
ch_out=self.in_channels[i], |
||||||
|
filter_size=3, |
||||||
|
stride=1, |
||||||
|
dw_act=None, |
||||||
|
pw_act='relu', |
||||||
|
dw_norm_type=norm_type, |
||||||
|
pw_norm_type=norm_type, |
||||||
|
freeze_norm=freeze_norm, |
||||||
|
norm_decay=norm_decay)) |
||||||
|
self.projects = nn.LayerList(projects) |
||||||
|
|
||||||
|
def forward(self, x): |
||||||
|
x = x[::-1] |
||||||
|
y = [] |
||||||
|
last_x = None |
||||||
|
for i, s in enumerate(x): |
||||||
|
if last_x is not None: |
||||||
|
last_x = F.interpolate( |
||||||
|
last_x, |
||||||
|
size=paddle.shape(s)[-2:], |
||||||
|
mode='bilinear', |
||||||
|
align_corners=True) |
||||||
|
s = s + last_x |
||||||
|
s = self.projects[i](s) |
||||||
|
y.append(s) |
||||||
|
last_x = s |
||||||
|
|
||||||
|
return y[::-1] |
||||||
|
|
||||||
|
|
||||||
|
class Stem(nn.Layer): |
||||||
|
def __init__(self, |
||||||
|
in_channel, |
||||||
|
stem_channel, |
||||||
|
out_channel, |
||||||
|
expand_ratio, |
||||||
|
norm_type='bn', |
||||||
|
freeze_norm=False, |
||||||
|
norm_decay=0.): |
||||||
|
super(Stem, self).__init__() |
||||||
|
self.conv1 = ConvNormLayer( |
||||||
|
in_channel, |
||||||
|
stem_channel, |
||||||
|
filter_size=3, |
||||||
|
stride=2, |
||||||
|
norm_type=norm_type, |
||||||
|
act='relu', |
||||||
|
freeze_norm=freeze_norm, |
||||||
|
norm_decay=norm_decay) |
||||||
|
mid_channel = int(round(stem_channel * expand_ratio)) |
||||||
|
branch_channel = stem_channel // 2 |
||||||
|
if stem_channel == out_channel: |
||||||
|
inc_channel = out_channel - branch_channel |
||||||
|
else: |
||||||
|
inc_channel = out_channel - stem_channel |
||||||
|
self.branch1 = nn.Sequential( |
||||||
|
ConvNormLayer( |
||||||
|
ch_in=branch_channel, |
||||||
|
ch_out=branch_channel, |
||||||
|
filter_size=3, |
||||||
|
stride=2, |
||||||
|
groups=branch_channel, |
||||||
|
norm_type=norm_type, |
||||||
|
freeze_norm=freeze_norm, |
||||||
|
norm_decay=norm_decay), |
||||||
|
ConvNormLayer( |
||||||
|
ch_in=branch_channel, |
||||||
|
ch_out=inc_channel, |
||||||
|
filter_size=1, |
||||||
|
stride=1, |
||||||
|
norm_type=norm_type, |
||||||
|
act='relu', |
||||||
|
freeze_norm=freeze_norm, |
||||||
|
norm_decay=norm_decay), ) |
||||||
|
self.expand_conv = ConvNormLayer( |
||||||
|
ch_in=branch_channel, |
||||||
|
ch_out=mid_channel, |
||||||
|
filter_size=1, |
||||||
|
stride=1, |
||||||
|
norm_type=norm_type, |
||||||
|
act='relu', |
||||||
|
freeze_norm=freeze_norm, |
||||||
|
norm_decay=norm_decay) |
||||||
|
self.depthwise_conv = ConvNormLayer( |
||||||
|
ch_in=mid_channel, |
||||||
|
ch_out=mid_channel, |
||||||
|
filter_size=3, |
||||||
|
stride=2, |
||||||
|
groups=mid_channel, |
||||||
|
norm_type=norm_type, |
||||||
|
freeze_norm=freeze_norm, |
||||||
|
norm_decay=norm_decay) |
||||||
|
self.linear_conv = ConvNormLayer( |
||||||
|
ch_in=mid_channel, |
||||||
|
ch_out=branch_channel |
||||||
|
if stem_channel == out_channel else stem_channel, |
||||||
|
filter_size=1, |
||||||
|
stride=1, |
||||||
|
norm_type=norm_type, |
||||||
|
act='relu', |
||||||
|
freeze_norm=freeze_norm, |
||||||
|
norm_decay=norm_decay) |
||||||
|
|
||||||
|
def forward(self, x): |
||||||
|
x = self.conv1(x) |
||||||
|
x1, x2 = x.chunk(2, axis=1) |
||||||
|
x1 = self.branch1(x1) |
||||||
|
x2 = self.expand_conv(x2) |
||||||
|
x2 = self.depthwise_conv(x2) |
||||||
|
x2 = self.linear_conv(x2) |
||||||
|
out = paddle.concat([x1, x2], axis=1) |
||||||
|
out = channel_shuffle(out, groups=2) |
||||||
|
|
||||||
|
return out |
||||||
|
|
||||||
|
|
||||||
|
class LiteHRNetModule(nn.Layer): |
||||||
|
def __init__(self, |
||||||
|
num_branches, |
||||||
|
num_blocks, |
||||||
|
in_channels, |
||||||
|
reduce_ratio, |
||||||
|
module_type, |
||||||
|
multiscale_output=False, |
||||||
|
with_fuse=True, |
||||||
|
norm_type='bn', |
||||||
|
freeze_norm=False, |
||||||
|
norm_decay=0.): |
||||||
|
super(LiteHRNetModule, self).__init__() |
||||||
|
assert num_branches == len(in_channels),\ |
||||||
|
"num_branches {} should equal to num_in_channels {}".format(num_branches, len(in_channels)) |
||||||
|
assert module_type in [ |
||||||
|
'LITE', 'NAIVE' |
||||||
|
], "module_type should be one of ['LITE', 'NAIVE']" |
||||||
|
self.num_branches = num_branches |
||||||
|
self.in_channels = in_channels |
||||||
|
self.multiscale_output = multiscale_output |
||||||
|
self.with_fuse = with_fuse |
||||||
|
self.norm_type = 'bn' |
||||||
|
self.module_type = module_type |
||||||
|
|
||||||
|
if self.module_type == 'LITE': |
||||||
|
self.layers = self._make_weighting_blocks( |
||||||
|
num_blocks, |
||||||
|
reduce_ratio, |
||||||
|
freeze_norm=freeze_norm, |
||||||
|
norm_decay=norm_decay) |
||||||
|
elif self.module_type == 'NAIVE': |
||||||
|
self.layers = self._make_naive_branches( |
||||||
|
num_branches, |
||||||
|
num_blocks, |
||||||
|
freeze_norm=freeze_norm, |
||||||
|
norm_decay=norm_decay) |
||||||
|
|
||||||
|
if self.with_fuse: |
||||||
|
self.fuse_layers = self._make_fuse_layers( |
||||||
|
freeze_norm=freeze_norm, norm_decay=norm_decay) |
||||||
|
self.relu = nn.ReLU() |
||||||
|
|
||||||
|
def _make_weighting_blocks(self, |
||||||
|
num_blocks, |
||||||
|
reduce_ratio, |
||||||
|
stride=1, |
||||||
|
freeze_norm=False, |
||||||
|
norm_decay=0.): |
||||||
|
layers = [] |
||||||
|
for i in range(num_blocks): |
||||||
|
layers.append( |
||||||
|
ConditionalChannelWeightingBlock( |
||||||
|
self.in_channels, |
||||||
|
stride=stride, |
||||||
|
reduce_ratio=reduce_ratio, |
||||||
|
norm_type=self.norm_type, |
||||||
|
freeze_norm=freeze_norm, |
||||||
|
norm_decay=norm_decay)) |
||||||
|
return nn.Sequential(*layers) |
||||||
|
|
||||||
|
def _make_naive_branches(self, |
||||||
|
num_branches, |
||||||
|
num_blocks, |
||||||
|
freeze_norm=False, |
||||||
|
norm_decay=0.): |
||||||
|
branches = [] |
||||||
|
for branch_idx in range(num_branches): |
||||||
|
layers = [] |
||||||
|
for i in range(num_blocks): |
||||||
|
layers.append( |
||||||
|
ShuffleUnit( |
||||||
|
self.in_channels[branch_idx], |
||||||
|
self.in_channels[branch_idx], |
||||||
|
stride=1, |
||||||
|
norm_type=self.norm_type, |
||||||
|
freeze_norm=freeze_norm, |
||||||
|
norm_decay=norm_decay)) |
||||||
|
branches.append(nn.Sequential(*layers)) |
||||||
|
return nn.LayerList(branches) |
||||||
|
|
||||||
|
def _make_fuse_layers(self, freeze_norm=False, norm_decay=0.): |
||||||
|
if self.num_branches == 1: |
||||||
|
return None |
||||||
|
fuse_layers = [] |
||||||
|
num_out_branches = self.num_branches if self.multiscale_output else 1 |
||||||
|
for i in range(num_out_branches): |
||||||
|
fuse_layer = [] |
||||||
|
for j in range(self.num_branches): |
||||||
|
if j > i: |
||||||
|
fuse_layer.append( |
||||||
|
nn.Sequential( |
||||||
|
Conv2d( |
||||||
|
self.in_channels[j], |
||||||
|
self.in_channels[i], |
||||||
|
kernel_size=1, |
||||||
|
stride=1, |
||||||
|
padding=0, |
||||||
|
bias=False, ), |
||||||
|
nn.BatchNorm2D(self.in_channels[i]), |
||||||
|
nn.Upsample( |
||||||
|
scale_factor=2**(j - i), mode='nearest'))) |
||||||
|
elif j == i: |
||||||
|
fuse_layer.append(None) |
||||||
|
else: |
||||||
|
conv_downsamples = [] |
||||||
|
for k in range(i - j): |
||||||
|
if k == i - j - 1: |
||||||
|
conv_downsamples.append( |
||||||
|
nn.Sequential( |
||||||
|
Conv2d( |
||||||
|
self.in_channels[j], |
||||||
|
self.in_channels[j], |
||||||
|
kernel_size=3, |
||||||
|
stride=2, |
||||||
|
padding=1, |
||||||
|
groups=self.in_channels[j], |
||||||
|
bias=False, ), |
||||||
|
nn.BatchNorm2D(self.in_channels[j]), |
||||||
|
Conv2d( |
||||||
|
self.in_channels[j], |
||||||
|
self.in_channels[i], |
||||||
|
kernel_size=1, |
||||||
|
stride=1, |
||||||
|
padding=0, |
||||||
|
bias=False, ), |
||||||
|
nn.BatchNorm2D(self.in_channels[i]))) |
||||||
|
else: |
||||||
|
conv_downsamples.append( |
||||||
|
nn.Sequential( |
||||||
|
Conv2d( |
||||||
|
self.in_channels[j], |
||||||
|
self.in_channels[j], |
||||||
|
kernel_size=3, |
||||||
|
stride=2, |
||||||
|
padding=1, |
||||||
|
groups=self.in_channels[j], |
||||||
|
bias=False, ), |
||||||
|
nn.BatchNorm2D(self.in_channels[j]), |
||||||
|
Conv2d( |
||||||
|
self.in_channels[j], |
||||||
|
self.in_channels[j], |
||||||
|
kernel_size=1, |
||||||
|
stride=1, |
||||||
|
padding=0, |
||||||
|
bias=False, ), |
||||||
|
nn.BatchNorm2D(self.in_channels[j]), |
||||||
|
nn.ReLU())) |
||||||
|
|
||||||
|
fuse_layer.append(nn.Sequential(*conv_downsamples)) |
||||||
|
fuse_layers.append(nn.LayerList(fuse_layer)) |
||||||
|
|
||||||
|
return nn.LayerList(fuse_layers) |
||||||
|
|
||||||
|
def forward(self, x): |
||||||
|
if self.num_branches == 1: |
||||||
|
return [self.layers[0](x[0])] |
||||||
|
if self.module_type == 'LITE': |
||||||
|
out = self.layers(x) |
||||||
|
elif self.module_type == 'NAIVE': |
||||||
|
for i in range(self.num_branches): |
||||||
|
x[i] = self.layers[i](x[i]) |
||||||
|
out = x |
||||||
|
if self.with_fuse: |
||||||
|
out_fuse = [] |
||||||
|
for i in range(len(self.fuse_layers)): |
||||||
|
y = out[0] if i == 0 else self.fuse_layers[i][0](out[0]) |
||||||
|
for j in range(self.num_branches): |
||||||
|
if j == 0: |
||||||
|
y += y |
||||||
|
elif i == j: |
||||||
|
y += out[j] |
||||||
|
else: |
||||||
|
y += self.fuse_layers[i][j](out[j]) |
||||||
|
if i == 0: |
||||||
|
out[i] = y |
||||||
|
out_fuse.append(self.relu(y)) |
||||||
|
out = out_fuse |
||||||
|
elif not self.multiscale_output: |
||||||
|
out = [out[0]] |
||||||
|
return out |
||||||
|
|
||||||
|
|
||||||
|
class LiteHRNet(nn.Layer): |
||||||
|
""" |
||||||
|
@inproceedings{Yulitehrnet21, |
||||||
|
title={Lite-HRNet: A Lightweight High-Resolution Network}, |
||||||
|
author={Yu, Changqian and Xiao, Bin and Gao, Changxin and Yuan, Lu and Zhang, Lei and Sang, Nong and Wang, Jingdong}, |
||||||
|
booktitle={CVPR},year={2021} |
||||||
|
} |
||||||
|
|
||||||
|
Args: |
||||||
|
network_type (str): the network_type should be one of ["lite_18", "lite_30", "naive", "wider_naive"], |
||||||
|
"naive": Simply combining the shuffle block in ShuffleNet and the highresolution design pattern in HRNet. |
||||||
|
"wider_naive": Naive network with wider channels in each block. |
||||||
|
"lite_18": Lite-HRNet-18, which replaces the pointwise convolution in a shuffle block by conditional channel weighting. |
||||||
|
"lite_30": Lite-HRNet-30, with more blocks compared with Lite-HRNet-18. |
||||||
|
in_channels (int, optional): The channels of input image. Default: 3. |
||||||
|
freeze_at (int): the stage to freeze |
||||||
|
freeze_norm (bool): whether to freeze norm in HRNet |
||||||
|
norm_decay (float): weight decay for normalization layer weights |
||||||
|
return_idx (List): the stage to return |
||||||
|
""" |
||||||
|
|
||||||
|
def __init__(self, |
||||||
|
network_type, |
||||||
|
in_channels=3, |
||||||
|
freeze_at=0, |
||||||
|
freeze_norm=True, |
||||||
|
norm_decay=0., |
||||||
|
return_idx=[0, 1, 2, 3], |
||||||
|
use_head=False, |
||||||
|
pretrained=None): |
||||||
|
super(LiteHRNet, self).__init__() |
||||||
|
if isinstance(return_idx, Integral): |
||||||
|
return_idx = [return_idx] |
||||||
|
assert network_type in ["lite_18", "lite_30", "naive", "wider_naive"], \ |
||||||
|
"the network_type should be one of [lite_18, lite_30, naive, wider_naive]" |
||||||
|
assert len(return_idx) > 0, "need one or more return index" |
||||||
|
self.freeze_at = freeze_at |
||||||
|
self.freeze_norm = freeze_norm |
||||||
|
self.norm_decay = norm_decay |
||||||
|
self.return_idx = return_idx |
||||||
|
self.norm_type = 'bn' |
||||||
|
self.use_head = use_head |
||||||
|
self.pretrained = pretrained |
||||||
|
|
||||||
|
self.module_configs = { |
||||||
|
"lite_18": { |
||||||
|
"num_modules": [2, 4, 2], |
||||||
|
"num_branches": [2, 3, 4], |
||||||
|
"num_blocks": [2, 2, 2], |
||||||
|
"module_type": ["LITE", "LITE", "LITE"], |
||||||
|
"reduce_ratios": [8, 8, 8], |
||||||
|
"num_channels": [[40, 80], [40, 80, 160], [40, 80, 160, 320]], |
||||||
|
}, |
||||||
|
"lite_30": { |
||||||
|
"num_modules": [3, 8, 3], |
||||||
|
"num_branches": [2, 3, 4], |
||||||
|
"num_blocks": [2, 2, 2], |
||||||
|
"module_type": ["LITE", "LITE", "LITE"], |
||||||
|
"reduce_ratios": [8, 8, 8], |
||||||
|
"num_channels": [[40, 80], [40, 80, 160], [40, 80, 160, 320]], |
||||||
|
}, |
||||||
|
"naive": { |
||||||
|
"num_modules": [2, 4, 2], |
||||||
|
"num_branches": [2, 3, 4], |
||||||
|
"num_blocks": [2, 2, 2], |
||||||
|
"module_type": ["NAIVE", "NAIVE", "NAIVE"], |
||||||
|
"reduce_ratios": [1, 1, 1], |
||||||
|
"num_channels": [[30, 60], [30, 60, 120], [30, 60, 120, 240]], |
||||||
|
}, |
||||||
|
"wider_naive": { |
||||||
|
"num_modules": [2, 4, 2], |
||||||
|
"num_branches": [2, 3, 4], |
||||||
|
"num_blocks": [2, 2, 2], |
||||||
|
"module_type": ["NAIVE", "NAIVE", "NAIVE"], |
||||||
|
"reduce_ratios": [1, 1, 1], |
||||||
|
"num_channels": [[40, 80], [40, 80, 160], [40, 80, 160, 320]], |
||||||
|
}, |
||||||
|
} |
||||||
|
|
||||||
|
self.stages_config = self.module_configs[network_type] |
||||||
|
|
||||||
|
self.stem = Stem(in_channels, 32, 32, 1) |
||||||
|
num_channels_pre_layer = [32] |
||||||
|
for stage_idx in range(3): |
||||||
|
num_channels = self.stages_config["num_channels"][stage_idx] |
||||||
|
setattr(self, 'transition{}'.format(stage_idx), |
||||||
|
self._make_transition_layer(num_channels_pre_layer, |
||||||
|
num_channels, self.freeze_norm, |
||||||
|
self.norm_decay)) |
||||||
|
stage, num_channels_pre_layer = self._make_stage( |
||||||
|
self.stages_config, stage_idx, num_channels, True, |
||||||
|
self.freeze_norm, self.norm_decay) |
||||||
|
setattr(self, 'stage{}'.format(stage_idx), stage) |
||||||
|
|
||||||
|
num_channels = self.stages_config["num_channels"][-1] |
||||||
|
self.feat_channels = num_channels |
||||||
|
|
||||||
|
if self.use_head: |
||||||
|
self.head_layer = IterativeHead(num_channels_pre_layer, 'bn', |
||||||
|
self.freeze_norm, self.norm_decay) |
||||||
|
|
||||||
|
self.feat_channels = [num_channels[0]] |
||||||
|
for i in range(1, len(num_channels)): |
||||||
|
self.feat_channels.append(num_channels[i] // 2) |
||||||
|
|
||||||
|
self.init_weight() |
||||||
|
|
||||||
|
def init_weight(self): |
||||||
|
if self.pretrained is not None: |
||||||
|
utils.load_entire_model(self, self.pretrained) |
||||||
|
|
||||||
|
def _make_transition_layer(self, |
||||||
|
num_channels_pre_layer, |
||||||
|
num_channels_cur_layer, |
||||||
|
freeze_norm=False, |
||||||
|
norm_decay=0.): |
||||||
|
num_branches_pre = len(num_channels_pre_layer) |
||||||
|
num_branches_cur = len(num_channels_cur_layer) |
||||||
|
transition_layers = [] |
||||||
|
for i in range(num_branches_cur): |
||||||
|
if i < num_branches_pre: |
||||||
|
if num_channels_cur_layer[i] != num_channels_pre_layer[i]: |
||||||
|
transition_layers.append( |
||||||
|
nn.Sequential( |
||||||
|
Conv2d( |
||||||
|
num_channels_pre_layer[i], |
||||||
|
num_channels_pre_layer[i], |
||||||
|
kernel_size=3, |
||||||
|
stride=1, |
||||||
|
padding=1, |
||||||
|
groups=num_channels_pre_layer[i], |
||||||
|
bias=False), |
||||||
|
nn.BatchNorm2D(num_channels_pre_layer[i]), |
||||||
|
Conv2d( |
||||||
|
num_channels_pre_layer[i], |
||||||
|
num_channels_cur_layer[i], |
||||||
|
kernel_size=1, |
||||||
|
stride=1, |
||||||
|
padding=0, |
||||||
|
bias=False, ), |
||||||
|
nn.BatchNorm2D(num_channels_cur_layer[i]), |
||||||
|
nn.ReLU())) |
||||||
|
else: |
||||||
|
transition_layers.append(None) |
||||||
|
else: |
||||||
|
conv_downsamples = [] |
||||||
|
for j in range(i + 1 - num_branches_pre): |
||||||
|
conv_downsamples.append( |
||||||
|
nn.Sequential( |
||||||
|
Conv2d( |
||||||
|
num_channels_pre_layer[-1], |
||||||
|
num_channels_pre_layer[-1], |
||||||
|
groups=num_channels_pre_layer[-1], |
||||||
|
kernel_size=3, |
||||||
|
stride=2, |
||||||
|
padding=1, |
||||||
|
bias=False, ), |
||||||
|
nn.BatchNorm2D(num_channels_pre_layer[-1]), |
||||||
|
Conv2d( |
||||||
|
num_channels_pre_layer[-1], |
||||||
|
num_channels_cur_layer[i] |
||||||
|
if j == i - num_branches_pre else |
||||||
|
num_channels_pre_layer[-1], |
||||||
|
kernel_size=1, |
||||||
|
stride=1, |
||||||
|
padding=0, |
||||||
|
bias=False, ), |
||||||
|
nn.BatchNorm2D(num_channels_cur_layer[i] |
||||||
|
if j == i - num_branches_pre else |
||||||
|
num_channels_pre_layer[-1]), |
||||||
|
nn.ReLU())) |
||||||
|
transition_layers.append(nn.Sequential(*conv_downsamples)) |
||||||
|
return nn.LayerList(transition_layers) |
||||||
|
|
||||||
|
def _make_stage(self, |
||||||
|
stages_config, |
||||||
|
stage_idx, |
||||||
|
in_channels, |
||||||
|
multiscale_output, |
||||||
|
freeze_norm=False, |
||||||
|
norm_decay=0.): |
||||||
|
num_modules = stages_config["num_modules"][stage_idx] |
||||||
|
num_branches = stages_config["num_branches"][stage_idx] |
||||||
|
num_blocks = stages_config["num_blocks"][stage_idx] |
||||||
|
reduce_ratio = stages_config['reduce_ratios'][stage_idx] |
||||||
|
module_type = stages_config['module_type'][stage_idx] |
||||||
|
|
||||||
|
modules = [] |
||||||
|
for i in range(num_modules): |
||||||
|
if not multiscale_output and i == num_modules - 1: |
||||||
|
reset_multiscale_output = False |
||||||
|
else: |
||||||
|
reset_multiscale_output = True |
||||||
|
modules.append( |
||||||
|
LiteHRNetModule( |
||||||
|
num_branches, |
||||||
|
num_blocks, |
||||||
|
in_channels, |
||||||
|
reduce_ratio, |
||||||
|
module_type, |
||||||
|
multiscale_output=reset_multiscale_output, |
||||||
|
with_fuse=True, |
||||||
|
freeze_norm=freeze_norm, |
||||||
|
norm_decay=norm_decay)) |
||||||
|
in_channels = modules[-1].in_channels |
||||||
|
return nn.Sequential(*modules), in_channels |
||||||
|
|
||||||
|
def forward(self, x): |
||||||
|
x = self.stem(x) |
||||||
|
|
||||||
|
y_list = [x] |
||||||
|
for stage_idx in range(3): |
||||||
|
x_list = [] |
||||||
|
transition = getattr(self, 'transition{}'.format(stage_idx)) |
||||||
|
for j in range(self.stages_config["num_branches"][stage_idx]): |
||||||
|
if transition[j] is not None: |
||||||
|
if j >= len(y_list): |
||||||
|
x_list.append(transition[j](y_list[-1])) |
||||||
|
else: |
||||||
|
x_list.append(transition[j](y_list[j])) |
||||||
|
else: |
||||||
|
x_list.append(y_list[j]) |
||||||
|
y_list = getattr(self, 'stage{}'.format(stage_idx))(x_list) |
||||||
|
|
||||||
|
if self.use_head: |
||||||
|
y_list = self.head_layer(y_list) |
||||||
|
|
||||||
|
res = [] |
||||||
|
for i, layer in enumerate(y_list): |
||||||
|
if i == self.freeze_at: |
||||||
|
layer.stop_gradient = True |
||||||
|
if i in self.return_idx: |
||||||
|
res.append(layer) |
||||||
|
return res |
||||||
|
|
||||||
|
|
||||||
|
@manager.BACKBONES.add_component |
||||||
|
def Lite_HRNet_18(**kwargs): |
||||||
|
model = LiteHRNet(network_type="lite_18", **kwargs) |
||||||
|
return model |
||||||
|
|
||||||
|
|
||||||
|
@manager.BACKBONES.add_component |
||||||
|
def Lite_HRNet_30(**kwargs): |
||||||
|
model = LiteHRNet(network_type="lite_30", **kwargs) |
||||||
|
return model |
||||||
|
|
||||||
|
|
||||||
|
@manager.BACKBONES.add_component |
||||||
|
def Lite_HRNet_naive(**kwargs): |
||||||
|
model = LiteHRNet(network_type="naive", **kwargs) |
||||||
|
return model |
||||||
|
|
||||||
|
|
||||||
|
@manager.BACKBONES.add_component |
||||||
|
def Lite_HRNet_wider_naive(**kwargs): |
||||||
|
model = LiteHRNet(network_type="wider_naive", **kwargs) |
||||||
|
return model |
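A minimal instantiation sketch for the Lite-HRNet backbones registered above (dummy input and printed attributes are illustrative only):

import paddle
backbone = Lite_HRNet_18()                        # defaults: freeze_at=0, freeze_norm=True, return_idx=[0, 1, 2, 3]
feats = backbone(paddle.randn([1, 3, 512, 512]))  # one feature map per returned branch
print(len(feats), backbone.feat_channels)         # number of branches and their channel widths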
@@ -0,0 +1,315 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
||||||
|
# |
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License"); |
||||||
|
# you may not use this file except in compliance with the License. |
||||||
|
# You may obtain a copy of the License at |
||||||
|
# |
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0 |
||||||
|
# |
||||||
|
# Unless required by applicable law or agreed to in writing, software |
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS, |
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||||
|
# See the License for the specific language governing permissions and |
||||||
|
# limitations under the License. |
||||||
|
|
||||||
|
import paddle |
||||||
|
from paddle import ParamAttr, reshape, transpose, concat, split |
||||||
|
from paddle.nn import Layer, Conv2D, MaxPool2D, AdaptiveAvgPool2D, BatchNorm, Linear |
||||||
|
from paddle.nn.initializer import KaimingNormal |
||||||
|
from paddle.nn.functional import swish |
||||||
|
|
||||||
|
from paddlers.models.ppseg.cvlibs import manager |
||||||
|
from paddlers.models.ppseg.utils import utils, logger |
||||||
|
|
||||||
|
__all__ = [ |
||||||
|
'ShuffleNetV2_x0_25', 'ShuffleNetV2_x0_33', 'ShuffleNetV2_x0_5', |
||||||
|
'ShuffleNetV2_x1_0', 'ShuffleNetV2_x1_5', 'ShuffleNetV2_x2_0', |
||||||
|
'ShuffleNetV2_swish' |
||||||
|
] |
||||||
|
|
||||||
|
|
||||||
|
def channel_shuffle(x, groups): |
||||||
|
x_shape = paddle.shape(x) |
||||||
|
batch_size, height, width = x_shape[0], x_shape[2], x_shape[3] |
||||||
|
num_channels = x.shape[1] |
||||||
|
channels_per_group = num_channels // groups |
||||||
|
|
||||||
|
# reshape |
||||||
|
x = reshape( |
||||||
|
x=x, shape=[batch_size, groups, channels_per_group, height, width]) |
||||||
|
|
||||||
|
# transpose |
||||||
|
x = transpose(x=x, perm=[0, 2, 1, 3, 4]) |
||||||
|
|
||||||
|
# flatten |
||||||
|
x = reshape(x=x, shape=[batch_size, num_channels, height, width]) |
||||||
|
|
||||||
|
return x |
||||||
|
|
||||||
|
|
||||||
|
class ConvBNLayer(Layer): |
||||||
|
def __init__( |
||||||
|
self, |
||||||
|
in_channels, |
||||||
|
out_channels, |
||||||
|
kernel_size, |
||||||
|
stride, |
||||||
|
padding, |
||||||
|
groups=1, |
||||||
|
act=None, |
||||||
|
name=None, ): |
||||||
|
super(ConvBNLayer, self).__init__() |
||||||
|
self._conv = Conv2D( |
||||||
|
in_channels=in_channels, |
||||||
|
out_channels=out_channels, |
||||||
|
kernel_size=kernel_size, |
||||||
|
stride=stride, |
||||||
|
padding=padding, |
||||||
|
groups=groups, |
||||||
|
weight_attr=ParamAttr( |
||||||
|
initializer=KaimingNormal(), name=name + "_weights"), |
||||||
|
bias_attr=False) |
||||||
|
|
||||||
|
self._batch_norm = BatchNorm( |
||||||
|
out_channels, |
||||||
|
param_attr=ParamAttr(name=name + "_bn_scale"), |
||||||
|
bias_attr=ParamAttr(name=name + "_bn_offset"), |
||||||
|
act=act, |
||||||
|
moving_mean_name=name + "_bn_mean", |
||||||
|
moving_variance_name=name + "_bn_variance") |
||||||
|
|
||||||
|
def forward(self, inputs): |
||||||
|
y = self._conv(inputs) |
||||||
|
y = self._batch_norm(y) |
||||||
|
return y |
||||||
|
|
||||||
|
|
||||||
|
class InvertedResidual(Layer): |
||||||
|
def __init__(self, in_channels, out_channels, stride, act="relu", |
||||||
|
name=None): |
||||||
|
super(InvertedResidual, self).__init__() |
||||||
|
self._conv_pw = ConvBNLayer( |
||||||
|
in_channels=in_channels // 2, |
||||||
|
out_channels=out_channels // 2, |
||||||
|
kernel_size=1, |
||||||
|
stride=1, |
||||||
|
padding=0, |
||||||
|
groups=1, |
||||||
|
act=act, |
||||||
|
name='stage_' + name + '_conv1') |
||||||
|
self._conv_dw = ConvBNLayer( |
||||||
|
in_channels=out_channels // 2, |
||||||
|
out_channels=out_channels // 2, |
||||||
|
kernel_size=3, |
||||||
|
stride=stride, |
||||||
|
padding=1, |
||||||
|
groups=out_channels // 2, |
||||||
|
act=None, |
||||||
|
name='stage_' + name + '_conv2') |
||||||
|
self._conv_linear = ConvBNLayer( |
||||||
|
in_channels=out_channels // 2, |
||||||
|
out_channels=out_channels // 2, |
||||||
|
kernel_size=1, |
||||||
|
stride=1, |
||||||
|
padding=0, |
||||||
|
groups=1, |
||||||
|
act=act, |
||||||
|
name='stage_' + name + '_conv3') |
||||||
|
|
||||||
|
def forward(self, inputs): |
||||||
|
x1, x2 = split( |
||||||
|
inputs, |
||||||
|
num_or_sections=[inputs.shape[1] // 2, inputs.shape[1] // 2], |
||||||
|
axis=1) |
||||||
|
x2 = self._conv_pw(x2) |
||||||
|
x2 = self._conv_dw(x2) |
||||||
|
x2 = self._conv_linear(x2) |
||||||
|
out = concat([x1, x2], axis=1) |
||||||
|
return channel_shuffle(out, 2) |
||||||
|
|
||||||
|
|
||||||
|
class InvertedResidualDS(Layer): |
||||||
|
def __init__(self, in_channels, out_channels, stride, act="relu", |
||||||
|
name=None): |
||||||
|
super(InvertedResidualDS, self).__init__() |
||||||
|
|
||||||
|
# branch1 |
||||||
|
self._conv_dw_1 = ConvBNLayer( |
||||||
|
in_channels=in_channels, |
||||||
|
out_channels=in_channels, |
||||||
|
kernel_size=3, |
||||||
|
stride=stride, |
||||||
|
padding=1, |
||||||
|
groups=in_channels, |
||||||
|
act=None, |
||||||
|
name='stage_' + name + '_conv4') |
||||||
|
self._conv_linear_1 = ConvBNLayer( |
||||||
|
in_channels=in_channels, |
||||||
|
out_channels=out_channels // 2, |
||||||
|
kernel_size=1, |
||||||
|
stride=1, |
||||||
|
padding=0, |
||||||
|
groups=1, |
||||||
|
act=act, |
||||||
|
name='stage_' + name + '_conv5') |
||||||
|
# branch2 |
||||||
|
self._conv_pw_2 = ConvBNLayer( |
||||||
|
in_channels=in_channels, |
||||||
|
out_channels=out_channels // 2, |
||||||
|
kernel_size=1, |
||||||
|
stride=1, |
||||||
|
padding=0, |
||||||
|
groups=1, |
||||||
|
act=act, |
||||||
|
name='stage_' + name + '_conv1') |
||||||
|
self._conv_dw_2 = ConvBNLayer( |
||||||
|
in_channels=out_channels // 2, |
||||||
|
out_channels=out_channels // 2, |
||||||
|
kernel_size=3, |
||||||
|
stride=stride, |
||||||
|
padding=1, |
||||||
|
groups=out_channels // 2, |
||||||
|
act=None, |
||||||
|
name='stage_' + name + '_conv2') |
||||||
|
self._conv_linear_2 = ConvBNLayer( |
||||||
|
in_channels=out_channels // 2, |
||||||
|
out_channels=out_channels // 2, |
||||||
|
kernel_size=1, |
||||||
|
stride=1, |
||||||
|
padding=0, |
||||||
|
groups=1, |
||||||
|
act=act, |
||||||
|
name='stage_' + name + '_conv3') |
||||||
|
|
||||||
|
def forward(self, inputs): |
||||||
|
x1 = self._conv_dw_1(inputs) |
||||||
|
x1 = self._conv_linear_1(x1) |
||||||
|
x2 = self._conv_pw_2(inputs) |
||||||
|
x2 = self._conv_dw_2(x2) |
||||||
|
x2 = self._conv_linear_2(x2) |
||||||
|
out = concat([x1, x2], axis=1) |
||||||
|
|
||||||
|
return channel_shuffle(out, 2) |
||||||
|
|
||||||
|
|
||||||
|
class ShuffleNet(Layer): |
||||||
|
def __init__(self, scale=1.0, act="relu", in_channels=3, pretrained=None): |
||||||
|
super(ShuffleNet, self).__init__() |
||||||
|
self.scale = scale |
||||||
|
self.pretrained = pretrained |
||||||
|
stage_repeats = [4, 8, 4] |
||||||
|
|
||||||
|
if scale == 0.25: |
||||||
|
stage_out_channels = [-1, 24, 24, 48, 96, 512] |
||||||
|
elif scale == 0.33: |
||||||
|
stage_out_channels = [-1, 24, 32, 64, 128, 512] |
||||||
|
elif scale == 0.5: |
||||||
|
stage_out_channels = [-1, 24, 48, 96, 192, 1024] |
||||||
|
elif scale == 1.0: |
||||||
|
stage_out_channels = [-1, 24, 116, 232, 464, 1024] |
||||||
|
elif scale == 1.5: |
||||||
|
stage_out_channels = [-1, 24, 176, 352, 704, 1024] |
||||||
|
elif scale == 2.0: |
||||||
|
stage_out_channels = [-1, 24, 224, 488, 976, 2048] |
||||||
|
else: |
||||||
|
raise NotImplementedError("This scale size:[" + str(scale) + |
||||||
|
"] is not implemented!") |
||||||
|
|
||||||
|
self.out_index = [3, 11, 15] |
||||||
|
self.feat_channels = stage_out_channels[1:5] |
||||||
|
|
||||||
|
# 1. conv1 |
||||||
|
self._conv1 = ConvBNLayer( |
||||||
|
in_channels=in_channels, |
||||||
|
out_channels=stage_out_channels[1], |
||||||
|
kernel_size=3, |
||||||
|
stride=2, |
||||||
|
padding=1, |
||||||
|
act=act, |
||||||
|
name='stage1_conv') |
||||||
|
self._max_pool = MaxPool2D(kernel_size=3, stride=2, padding=1) |
||||||
|
|
||||||
|
# 2. bottleneck sequences |
||||||
|
self._block_list = [] |
||||||
|
for stage_id, num_repeat in enumerate(stage_repeats): |
||||||
|
for i in range(num_repeat): |
||||||
|
if i == 0: |
||||||
|
block = self.add_sublayer( |
||||||
|
name=str(stage_id + 2) + '_' + str(i + 1), |
||||||
|
sublayer=InvertedResidualDS( |
||||||
|
in_channels=stage_out_channels[stage_id + 1], |
||||||
|
out_channels=stage_out_channels[stage_id + 2], |
||||||
|
stride=2, |
||||||
|
act=act, |
||||||
|
name=str(stage_id + 2) + '_' + str(i + 1))) |
||||||
|
else: |
||||||
|
block = self.add_sublayer( |
||||||
|
name=str(stage_id + 2) + '_' + str(i + 1), |
||||||
|
sublayer=InvertedResidual( |
||||||
|
in_channels=stage_out_channels[stage_id + 2], |
||||||
|
out_channels=stage_out_channels[stage_id + 2], |
||||||
|
stride=1, |
||||||
|
act=act, |
||||||
|
name=str(stage_id + 2) + '_' + str(i + 1))) |
||||||
|
self._block_list.append(block) |
||||||
|
|
||||||
|
self.init_weight() |
||||||
|
|
||||||
|
def init_weight(self): |
||||||
|
if self.pretrained is not None: |
||||||
|
utils.load_entire_model(self, self.pretrained) |
||||||
|
|
||||||
|
def forward(self, inputs): |
||||||
|
feat_list = [] |
||||||
|
|
||||||
|
y = self._conv1(inputs) |
||||||
|
y = self._max_pool(y) |
||||||
|
feat_list.append(y) |
||||||
|
|
||||||
|
for idx, inv in enumerate(self._block_list): |
||||||
|
y = inv(y) |
||||||
|
if idx in self.out_index: |
||||||
|
feat_list.append(y) |
||||||
|
return feat_list |
||||||
|
|
||||||
|
|
||||||
|
@manager.BACKBONES.add_component |
||||||
|
def ShuffleNetV2_x0_25(**kwargs): |
||||||
|
model = ShuffleNet(scale=0.25, **kwargs) |
||||||
|
return model |
||||||
|
|
||||||
|
|
||||||
|
@manager.BACKBONES.add_component |
||||||
|
def ShuffleNetV2_x0_33(**kwargs): |
||||||
|
model = ShuffleNet(scale=0.33, **kwargs) |
||||||
|
return model |
||||||
|
|
||||||
|
|
||||||
|
@manager.BACKBONES.add_component |
||||||
|
def ShuffleNetV2_x0_5(**kwargs): |
||||||
|
model = ShuffleNet(scale=0.5, **kwargs) |
||||||
|
return model |
||||||
|
|
||||||
|
|
||||||
|
@manager.BACKBONES.add_component |
||||||
|
def ShuffleNetV2_x1_0(**kwargs): |
||||||
|
model = ShuffleNet(scale=1.0, **kwargs) |
||||||
|
return model |
||||||
|
|
||||||
|
|
||||||
|
@manager.BACKBONES.add_component |
||||||
|
def ShuffleNetV2_x1_5(**kwargs): |
||||||
|
model = ShuffleNet(scale=1.5, **kwargs) |
||||||
|
return model |
||||||
|
|
||||||
|
|
||||||
|
@manager.BACKBONES.add_component |
||||||
|
def ShuffleNetV2_x2_0(**kwargs): |
||||||
|
model = ShuffleNet(scale=2.0, **kwargs) |
||||||
|
return model |
||||||
|
|
||||||
|
|
||||||
|
@manager.BACKBONES.add_component |
||||||
|
def ShuffleNetV2_swish(**kwargs): |
||||||
|
model = ShuffleNet(scale=1.0, act="swish", **kwargs) |
||||||
|
return model |
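# A minimal usage sketch of the backbone factories above; the (1, 3, 224, 224)
# input is an assumption for illustration, any size divisible by 32 behaves the
# same way. The backbone returns four feature maps at strides 4, 8, 16 and 32.
if __name__ == "__main__":
    import paddle

    backbone = ShuffleNetV2_x1_0(in_channels=3)
    feats = backbone(paddle.rand([1, 3, 224, 224]))
    for feat, channels in zip(feats, backbone.feat_channels):
        # e.g. [1, 24, 56, 56], [1, 116, 28, 28], [1, 232, 14, 14], [1, 464, 7, 7]
        print(feat.shape, channels)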
@ -0,0 +1,716 @@ |
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. |
||||||
|
# |
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License"); |
||||||
|
# you may not use this file except in compliance with the License. |
||||||
|
# You may obtain a copy of the License at |
||||||
|
# |
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0 |
||||||
|
# |
||||||
|
# Unless required by applicable law or agreed to in writing, software |
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS, |
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||||
|
# See the License for the specific language governing permissions and |
||||||
|
# limitations under the License. |
||||||
|
""" |
||||||
|
This file refers to https://github.com/hustvl/TopFormer and https://github.com/BR-IDL/PaddleViT |
||||||
|
""" |
||||||
|
|
||||||
|
import paddle |
||||||
|
import paddle.nn as nn |
||||||
|
import paddle.nn.functional as F |
||||||
|
|
||||||
|
from paddlers.models.ppseg.cvlibs import manager |
||||||
|
from paddlers.models.ppseg import utils |
||||||
|
from paddlers.models.ppseg.models.backbones.transformer_utils import Identity, DropPath |
||||||
|
|
||||||
|
__all__ = ["TopTransformer_Base", "TopTransformer_Small", "TopTransformer_Tiny"] |
||||||
|
|
||||||
|
|
||||||
|
def make_divisible(val, divisor, min_value=None): |
||||||
|
""" |
||||||
|
This function is taken from the original tf repo. |
||||||
|
    It ensures that all layers have a channel number that is divisible by 8.
||||||
|
It can be seen here: |
||||||
|
https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py |
||||||
|
""" |
||||||
|
if min_value is None: |
||||||
|
min_value = divisor |
||||||
|
new_v = max(min_value, int(val + divisor / 2) // divisor * divisor) |
||||||
|
# Make sure that round down does not go down by more than 10%. |
||||||
|
if new_v < 0.9 * val: |
||||||
|
new_v += divisor |
||||||
|
return new_v |
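# A few worked values of make_divisible, assuming divisor=8 (the only way it is
# called below); the numbers are for illustration only:
#   make_divisible(16, 8) -> 16   (already a multiple of 8)
#   make_divisible(28, 8) -> 32   (int(28 + 4) // 8 * 8 = 32)
#   make_divisible(10, 8) -> 16   (rounds down to 8 first, then the 10% rule bumps it up)
#   make_divisible(7, 8)  -> 8    (never below min_value, which defaults to divisor)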
||||||
|
|
||||||
|
|
||||||
|
class HSigmoid(nn.Layer): |
||||||
|
def __init__(self, inplace=True): |
||||||
|
super().__init__() |
||||||
|
self.relu = nn.ReLU6() |
||||||
|
|
||||||
|
def forward(self, x): |
||||||
|
return self.relu(x + 3) / 6 |
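    # The layer above is the piecewise-linear "hard" sigmoid
    #     h_sigmoid(x) = clip(x + 3, 0, 6) / 6,
    # so for example h_sigmoid(-3) = 0, h_sigmoid(0) = 0.5 and h_sigmoid(3) = 1;
    # the sample values follow directly from the ReLU6 used in forward().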
||||||
|
|
||||||
|
|
||||||
|
class Conv2DBN(nn.Layer): |
||||||
|
def __init__(self, |
||||||
|
in_channels, |
||||||
|
out_channels, |
||||||
|
ks=1, |
||||||
|
stride=1, |
||||||
|
pad=0, |
||||||
|
dilation=1, |
||||||
|
groups=1, |
||||||
|
bn_weight_init=1, |
||||||
|
lr_mult=1.0): |
||||||
|
super().__init__() |
||||||
|
conv_weight_attr = paddle.ParamAttr(learning_rate=lr_mult) |
||||||
|
self.c = nn.Conv2D( |
||||||
|
in_channels=in_channels, |
||||||
|
out_channels=out_channels, |
||||||
|
kernel_size=ks, |
||||||
|
stride=stride, |
||||||
|
padding=pad, |
||||||
|
dilation=dilation, |
||||||
|
groups=groups, |
||||||
|
weight_attr=conv_weight_attr, |
||||||
|
bias_attr=False) |
||||||
|
bn_weight_attr = paddle.ParamAttr( |
||||||
|
initializer=nn.initializer.Constant(bn_weight_init), |
||||||
|
learning_rate=lr_mult) |
||||||
|
bn_bias_attr = paddle.ParamAttr( |
||||||
|
initializer=nn.initializer.Constant(0), learning_rate=lr_mult) |
||||||
|
self.bn = nn.BatchNorm2D( |
||||||
|
out_channels, weight_attr=bn_weight_attr, bias_attr=bn_bias_attr) |
||||||
|
|
||||||
|
def forward(self, inputs): |
||||||
|
out = self.c(inputs) |
||||||
|
out = self.bn(out) |
||||||
|
return out |
||||||
|
|
||||||
|
|
||||||
|
class ConvBNAct(nn.Layer): |
||||||
|
def __init__(self, |
||||||
|
in_channels, |
||||||
|
out_channels, |
||||||
|
kernel_size=1, |
||||||
|
stride=1, |
||||||
|
padding=0, |
||||||
|
groups=1, |
||||||
|
norm=nn.BatchNorm2D, |
||||||
|
act=None, |
||||||
|
bias_attr=False, |
||||||
|
lr_mult=1.0): |
||||||
|
super(ConvBNAct, self).__init__() |
||||||
|
param_attr = paddle.ParamAttr(learning_rate=lr_mult) |
||||||
|
self.conv = nn.Conv2D( |
||||||
|
in_channels=in_channels, |
||||||
|
out_channels=out_channels, |
||||||
|
kernel_size=kernel_size, |
||||||
|
stride=stride, |
||||||
|
padding=padding, |
||||||
|
groups=groups, |
||||||
|
weight_attr=param_attr, |
||||||
|
bias_attr=param_attr if bias_attr else False) |
||||||
|
self.act = act() if act is not None else Identity() |
||||||
|
self.bn = norm(out_channels, weight_attr=param_attr, bias_attr=param_attr) \ |
||||||
|
if norm is not None else Identity() |
||||||
|
|
||||||
|
def forward(self, x): |
||||||
|
x = self.conv(x) |
||||||
|
x = self.bn(x) |
||||||
|
x = self.act(x) |
||||||
|
return x |
||||||
|
|
||||||
|
|
||||||
|
class MLP(nn.Layer): |
||||||
|
def __init__(self, |
||||||
|
in_features, |
||||||
|
hidden_features=None, |
||||||
|
out_features=None, |
||||||
|
act_layer=nn.ReLU, |
||||||
|
drop=0., |
||||||
|
lr_mult=1.0): |
||||||
|
super().__init__() |
||||||
|
out_features = out_features or in_features |
||||||
|
hidden_features = hidden_features or in_features |
||||||
|
self.fc1 = Conv2DBN(in_features, hidden_features, lr_mult=lr_mult) |
||||||
|
param_attr = paddle.ParamAttr(learning_rate=lr_mult) |
||||||
|
self.dwconv = nn.Conv2D( |
||||||
|
hidden_features, |
||||||
|
hidden_features, |
||||||
|
3, |
||||||
|
1, |
||||||
|
1, |
||||||
|
groups=hidden_features, |
||||||
|
weight_attr=param_attr, |
||||||
|
bias_attr=param_attr) |
||||||
|
self.act = act_layer() |
||||||
|
self.fc2 = Conv2DBN(hidden_features, out_features, lr_mult=lr_mult) |
||||||
|
self.drop = nn.Dropout(drop) |
||||||
|
|
||||||
|
def forward(self, x): |
||||||
|
x = self.fc1(x) |
||||||
|
x = self.dwconv(x) |
||||||
|
x = self.act(x) |
||||||
|
x = self.drop(x) |
||||||
|
x = self.fc2(x) |
||||||
|
x = self.drop(x) |
||||||
|
return x |
||||||
|
|
||||||
|
|
||||||
|
class InvertedResidual(nn.Layer): |
||||||
|
def __init__(self, |
||||||
|
in_channels, |
||||||
|
out_channels, |
||||||
|
kernel_size, |
||||||
|
stride, |
||||||
|
expand_ratio, |
||||||
|
activations=None, |
||||||
|
lr_mult=1.0): |
||||||
|
super(InvertedResidual, self).__init__() |
||||||
|
assert stride in [1, 2], "The stride should be 1 or 2." |
||||||
|
|
||||||
|
if activations is None: |
||||||
|
activations = nn.ReLU |
||||||
|
|
||||||
|
hidden_dim = int(round(in_channels * expand_ratio)) |
||||||
|
self.use_res_connect = stride == 1 and in_channels == out_channels |
||||||
|
|
||||||
|
layers = [] |
||||||
|
if expand_ratio != 1: |
||||||
|
layers.append( |
||||||
|
Conv2DBN( |
||||||
|
in_channels, hidden_dim, ks=1, lr_mult=lr_mult)) |
||||||
|
layers.append(activations()) |
||||||
|
layers.extend([ |
||||||
|
Conv2DBN( |
||||||
|
hidden_dim, |
||||||
|
hidden_dim, |
||||||
|
ks=kernel_size, |
||||||
|
stride=stride, |
||||||
|
pad=kernel_size // 2, |
||||||
|
groups=hidden_dim, |
||||||
|
lr_mult=lr_mult), activations(), Conv2DBN( |
||||||
|
hidden_dim, out_channels, ks=1, lr_mult=lr_mult) |
||||||
|
]) |
||||||
|
self.conv = nn.Sequential(*layers) |
||||||
|
self.out_channels = out_channels |
||||||
|
|
||||||
|
def forward(self, x): |
||||||
|
if self.use_res_connect: |
||||||
|
return x + self.conv(x) |
||||||
|
else: |
||||||
|
return self.conv(x) |
||||||
|
|
||||||
|
|
||||||
|
class TokenPyramidModule(nn.Layer): |
||||||
|
def __init__(self, |
||||||
|
cfgs, |
||||||
|
out_indices, |
||||||
|
in_channels=3, |
||||||
|
inp_channel=16, |
||||||
|
activation=nn.ReLU, |
||||||
|
width_mult=1., |
||||||
|
lr_mult=1.): |
||||||
|
super().__init__() |
||||||
|
self.out_indices = out_indices |
||||||
|
|
||||||
|
self.stem = nn.Sequential( |
||||||
|
Conv2DBN( |
||||||
|
in_channels, inp_channel, 3, 2, 1, lr_mult=lr_mult), |
||||||
|
activation()) |
||||||
|
|
||||||
|
self.layers = [] |
||||||
|
for i, (k, t, c, s) in enumerate(cfgs): |
||||||
|
output_channel = make_divisible(c * width_mult, 8) |
||||||
|
exp_size = t * inp_channel |
||||||
|
exp_size = make_divisible(exp_size * width_mult, 8) |
||||||
|
layer_name = 'layer{}'.format(i + 1) |
||||||
|
layer = InvertedResidual( |
||||||
|
inp_channel, |
||||||
|
output_channel, |
||||||
|
kernel_size=k, |
||||||
|
stride=s, |
||||||
|
expand_ratio=t, |
||||||
|
activations=activation, |
||||||
|
lr_mult=lr_mult) |
||||||
|
self.add_sublayer(layer_name, layer) |
||||||
|
self.layers.append(layer_name) |
||||||
|
inp_channel = output_channel |
||||||
|
|
||||||
|
def forward(self, x): |
||||||
|
outs = [] |
||||||
|
x = self.stem(x) |
||||||
|
for i, layer_name in enumerate(self.layers): |
||||||
|
layer = getattr(self, layer_name) |
||||||
|
x = layer(x) |
||||||
|
if i in self.out_indices: |
||||||
|
outs.append(x) |
||||||
|
return outs |
||||||
|
|
||||||
|
|
||||||
|
class Attention(nn.Layer): |
||||||
|
def __init__(self, |
||||||
|
dim, |
||||||
|
key_dim, |
||||||
|
num_heads, |
||||||
|
attn_ratio=4, |
||||||
|
activation=None, |
||||||
|
lr_mult=1.0): |
||||||
|
super().__init__() |
||||||
|
self.num_heads = num_heads |
||||||
|
self.scale = key_dim**-0.5 |
||||||
|
self.key_dim = key_dim |
||||||
|
self.nh_kd = nh_kd = key_dim * num_heads |
||||||
|
self.d = int(attn_ratio * key_dim) |
||||||
|
self.dh = int(attn_ratio * key_dim) * num_heads |
||||||
|
self.attn_ratio = attn_ratio |
||||||
|
|
||||||
|
self.to_q = Conv2DBN(dim, nh_kd, 1, lr_mult=lr_mult) |
||||||
|
self.to_k = Conv2DBN(dim, nh_kd, 1, lr_mult=lr_mult) |
||||||
|
self.to_v = Conv2DBN(dim, self.dh, 1, lr_mult=lr_mult) |
||||||
|
|
||||||
|
self.proj = nn.Sequential( |
||||||
|
activation(), |
||||||
|
Conv2DBN( |
||||||
|
self.dh, dim, bn_weight_init=0, lr_mult=lr_mult)) |
||||||
|
|
||||||
|
def forward(self, x): |
||||||
|
x_shape = paddle.shape(x) |
||||||
|
H, W = x_shape[2], x_shape[3] |
||||||
|
|
||||||
|
qq = self.to_q(x).reshape( |
||||||
|
[0, self.num_heads, self.key_dim, -1]).transpose([0, 1, 3, 2]) |
||||||
|
kk = self.to_k(x).reshape([0, self.num_heads, self.key_dim, -1]) |
||||||
|
vv = self.to_v(x).reshape([0, self.num_heads, self.d, -1]).transpose( |
||||||
|
[0, 1, 3, 2]) |
||||||
|
|
||||||
|
attn = paddle.matmul(qq, kk) |
||||||
|
attn = F.softmax(attn, axis=-1) |
||||||
|
|
||||||
|
xx = paddle.matmul(attn, vv) |
||||||
|
|
||||||
|
xx = xx.transpose([0, 1, 3, 2]).reshape([0, self.dh, H, W]) |
||||||
|
xx = self.proj(xx) |
||||||
|
return xx |
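    # A minimal shape trace, assuming dim=384 (the Base configuration, where the
    # embedding dim is 32 + 64 + 128 + 160), key_dim=16, num_heads=8, attn_ratio=2
    # and a [1, 384, 7, 7] input; the spatial size is an assumption:
    #   to_q -> [1, 8, 49, 16]    (N = H * W = 49 tokens per head)
    #   to_k -> [1, 8, 16, 49]
    #   to_v -> [1, 8, 49, 32]    (d = attn_ratio * key_dim = 32)
    #   attn = softmax(q @ k)     -> [1, 8, 49, 49]
    #   attn @ v, reshaped        -> [1, 256, 7, 7]   (dh = d * num_heads)
    #   proj                      -> [1, 384, 7, 7]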
||||||
|
|
||||||
|
|
||||||
|
class Block(nn.Layer): |
||||||
|
def __init__(self, |
||||||
|
dim, |
||||||
|
key_dim, |
||||||
|
num_heads, |
||||||
|
mlp_ratios=4., |
||||||
|
attn_ratio=2., |
||||||
|
drop=0., |
||||||
|
drop_path=0., |
||||||
|
act_layer=nn.ReLU, |
||||||
|
lr_mult=1.0): |
||||||
|
super().__init__() |
||||||
|
self.dim = dim |
||||||
|
self.num_heads = num_heads |
||||||
|
self.mlp_ratios = mlp_ratios |
||||||
|
|
||||||
|
self.attn = Attention( |
||||||
|
dim, |
||||||
|
key_dim=key_dim, |
||||||
|
num_heads=num_heads, |
||||||
|
attn_ratio=attn_ratio, |
||||||
|
activation=act_layer, |
||||||
|
lr_mult=lr_mult) |
||||||
|
|
||||||
|
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here |
||||||
|
self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity() |
||||||
|
mlp_hidden_dim = int(dim * mlp_ratios) |
||||||
|
self.mlp = MLP(in_features=dim, |
||||||
|
hidden_features=mlp_hidden_dim, |
||||||
|
act_layer=act_layer, |
||||||
|
drop=drop, |
||||||
|
lr_mult=lr_mult) |
||||||
|
|
||||||
|
def forward(self, x): |
||||||
|
h = x |
||||||
|
x = self.attn(x) |
||||||
|
x = self.drop_path(x) |
||||||
|
x = h + x |
||||||
|
|
||||||
|
h = x |
||||||
|
x = self.mlp(x) |
||||||
|
x = self.drop_path(x) |
||||||
|
x = x + h |
||||||
|
return x |
||||||
|
|
||||||
|
|
||||||
|
class BasicLayer(nn.Layer): |
||||||
|
def __init__(self, |
||||||
|
block_num, |
||||||
|
embedding_dim, |
||||||
|
key_dim, |
||||||
|
num_heads, |
||||||
|
mlp_ratios=4., |
||||||
|
attn_ratio=2., |
||||||
|
drop=0., |
||||||
|
attn_drop=0., |
||||||
|
drop_path=0., |
||||||
|
act_layer=None, |
||||||
|
lr_mult=1.0): |
||||||
|
super().__init__() |
||||||
|
self.block_num = block_num |
||||||
|
|
||||||
|
self.transformer_blocks = nn.LayerList() |
||||||
|
for i in range(self.block_num): |
||||||
|
self.transformer_blocks.append( |
||||||
|
Block( |
||||||
|
embedding_dim, |
||||||
|
key_dim=key_dim, |
||||||
|
num_heads=num_heads, |
||||||
|
mlp_ratios=mlp_ratios, |
||||||
|
attn_ratio=attn_ratio, |
||||||
|
drop=drop, |
||||||
|
drop_path=drop_path[i] |
||||||
|
if isinstance(drop_path, list) else drop_path, |
||||||
|
act_layer=act_layer, |
||||||
|
lr_mult=lr_mult)) |
||||||
|
|
||||||
|
def forward(self, x): |
||||||
|
# token * N |
||||||
|
for i in range(self.block_num): |
||||||
|
x = self.transformer_blocks[i](x) |
||||||
|
return x |
||||||
|
|
||||||
|
|
||||||
|
class PyramidPoolAgg(nn.Layer): |
||||||
|
def __init__(self, stride): |
||||||
|
super().__init__() |
||||||
|
self.stride = stride |
||||||
|
self.tmp = Identity() # avoid the error of paddle.flops |
||||||
|
|
||||||
|
def forward(self, inputs): |
||||||
|
''' |
||||||
|
        # F.adaptive_avg_pool2d does not support (H, W) being Tensors,
        # so exporting the inference model would raise an error.
||||||
|
_, _, H, W = inputs[-1].shape |
||||||
|
H = (H - 1) // self.stride + 1 |
||||||
|
W = (W - 1) // self.stride + 1 |
||||||
|
return paddle.concat( |
||||||
|
[F.adaptive_avg_pool2d(inp, (H, W)) for inp in inputs], axis=1) |
||||||
|
''' |
||||||
|
out = [] |
||||||
|
ks = 2**len(inputs) |
||||||
|
stride = self.stride**len(inputs) |
||||||
|
for x in inputs: |
||||||
|
x = F.avg_pool2d(x, int(ks), int(stride)) |
||||||
|
ks /= 2 |
||||||
|
stride /= 2 |
||||||
|
out.append(x) |
||||||
|
out = paddle.concat(out, axis=1) |
||||||
|
return out |
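    # A worked example of the pooling schedule above, assuming the usual 4-level
    # token pyramid and stride=2 (the c2t_stride default): len(inputs) = 4, so ks
    # and stride both start at 2**4 = 16 and halve per level:
    #   level 0 (1/4  resolution): avg_pool2d(x, 16, 16)
    #   level 1 (1/8  resolution): avg_pool2d(x,  8,  8)
    #   level 2 (1/16 resolution): avg_pool2d(x,  4,  4)
    #   level 3 (1/32 resolution): avg_pool2d(x,  2,  2)
    # so every level ends up at 1/64 of the input resolution before the concat.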
||||||
|
|
||||||
|
|
||||||
|
class InjectionMultiSum(nn.Layer): |
||||||
|
def __init__(self, in_channels, out_channels, activations=None, |
||||||
|
lr_mult=1.0): |
||||||
|
super(InjectionMultiSum, self).__init__() |
||||||
|
|
||||||
|
self.local_embedding = ConvBNAct( |
||||||
|
in_channels, out_channels, kernel_size=1, lr_mult=lr_mult) |
||||||
|
self.global_embedding = ConvBNAct( |
||||||
|
in_channels, out_channels, kernel_size=1, lr_mult=lr_mult) |
||||||
|
self.global_act = ConvBNAct( |
||||||
|
in_channels, out_channels, kernel_size=1, lr_mult=lr_mult) |
||||||
|
self.act = HSigmoid() |
||||||
|
|
||||||
|
def forward(self, x_low, x_global): |
||||||
|
xl_hw = paddle.shape(x_low)[2:] |
||||||
|
local_feat = self.local_embedding(x_low) |
||||||
|
|
||||||
|
global_act = self.global_act(x_global) |
||||||
|
sig_act = F.interpolate( |
||||||
|
self.act(global_act), xl_hw, mode='bilinear', align_corners=False) |
||||||
|
|
||||||
|
global_feat = self.global_embedding(x_global) |
||||||
|
global_feat = F.interpolate( |
||||||
|
global_feat, xl_hw, mode='bilinear', align_corners=False) |
||||||
|
|
||||||
|
out = local_feat * sig_act + global_feat |
||||||
|
return out |
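    # In effect the forward above computes
    #   out = local_embedding(x_low) * upsample(h_sigmoid(global_act(x_global)))
    #         + upsample(global_embedding(x_global)),
    # i.e. the high-resolution tokens are gated by a sigmoid attention map derived
    # from the global semantics, then summed with the upsampled global features.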
||||||
|
|
||||||
|
|
||||||
|
class InjectionMultiSumCBR(nn.Layer): |
||||||
|
def __init__(self, in_channels, out_channels, activations=None): |
||||||
|
''' |
||||||
|
local_embedding: conv-bn-relu |
||||||
|
global_embedding: conv-bn-relu |
||||||
|
global_act: conv |
||||||
|
''' |
||||||
|
super(InjectionMultiSumCBR, self).__init__() |
||||||
|
|
||||||
|
self.local_embedding = ConvBNAct( |
||||||
|
in_channels, out_channels, kernel_size=1) |
||||||
|
self.global_embedding = ConvBNAct( |
||||||
|
in_channels, out_channels, kernel_size=1) |
||||||
|
self.global_act = ConvBNAct( |
||||||
|
in_channels, out_channels, kernel_size=1, norm=None, act=None) |
||||||
|
self.act = HSigmoid() |
||||||
|
|
||||||
|
def forward(self, x_low, x_global): |
||||||
|
        xl_hw = paddle.shape(x_low)[2:]
||||||
|
local_feat = self.local_embedding(x_low) |
||||||
|
# kernel |
||||||
|
global_act = self.global_act(x_global) |
||||||
|
global_act = F.interpolate( |
||||||
|
self.act(global_act), xl_hw, mode='bilinear', align_corners=False) |
||||||
|
# feat_h |
||||||
|
global_feat = self.global_embedding(x_global) |
||||||
|
global_feat = F.interpolate( |
||||||
|
global_feat, xl_hw, mode='bilinear', align_corners=False) |
||||||
|
out = local_feat * global_act + global_feat |
||||||
|
return out |
||||||
|
|
||||||
|
|
||||||
|
class FuseBlockSum(nn.Layer): |
||||||
|
def __init__(self, in_channels, out_channels, activations=None): |
||||||
|
super(FuseBlockSum, self).__init__() |
||||||
|
|
||||||
|
self.fuse1 = ConvBNAct( |
||||||
|
in_channels, out_channels, kernel_size=1, act=None) |
||||||
|
self.fuse2 = ConvBNAct( |
||||||
|
in_channels, out_channels, kernel_size=1, act=None) |
||||||
|
|
||||||
|
def forward(self, x_low, x_high): |
||||||
|
        xl_hw = paddle.shape(x_low)[2:]
||||||
|
inp = self.fuse1(x_low) |
||||||
|
kernel = self.fuse2(x_high) |
||||||
|
feat_h = F.interpolate( |
||||||
|
kernel, xl_hw, mode='bilinear', align_corners=False) |
||||||
|
out = inp + feat_h |
||||||
|
return out |
||||||
|
|
||||||
|
|
||||||
|
class FuseBlockMulti(nn.Layer): |
||||||
|
def __init__( |
||||||
|
self, |
||||||
|
in_channels, |
||||||
|
out_channels, |
||||||
|
stride=1, |
||||||
|
activations=None, ): |
||||||
|
super(FuseBlockMulti, self).__init__() |
||||||
|
assert stride in [1, 2], "The stride should be 1 or 2." |
||||||
|
|
||||||
|
self.fuse1 = ConvBNAct( |
||||||
|
in_channels, out_channels, kernel_size=1, act=None) |
||||||
|
self.fuse2 = ConvBNAct( |
||||||
|
in_channels, out_channels, kernel_size=1, act=None) |
||||||
|
self.act = HSigmoid() |
||||||
|
|
||||||
|
def forward(self, x_low, x_high): |
||||||
|
        xl_hw = paddle.shape(x_low)[2:]
||||||
|
inp = self.fuse1(x_low) |
||||||
|
sig_act = self.fuse2(x_high) |
||||||
|
sig_act = F.interpolate( |
||||||
|
self.act(sig_act), xl_hw, mode='bilinear', align_corners=False) |
||||||
|
out = inp * sig_act |
||||||
|
return out |
||||||
|
|
||||||
|
|
||||||
|
SIM_BLOCK = { |
||||||
|
"fuse_sum": FuseBlockSum, |
||||||
|
"fuse_multi": FuseBlockMulti, |
||||||
|
"multi_sum": InjectionMultiSum, |
||||||
|
"multi_sum_cbr": InjectionMultiSumCBR, |
||||||
|
} |
||||||
|
|
||||||
|
|
||||||
|
class TopTransformer(nn.Layer): |
||||||
|
def __init__(self, |
||||||
|
cfgs, |
||||||
|
injection_out_channels, |
||||||
|
encoder_out_indices, |
||||||
|
trans_out_indices=[1, 2, 3], |
||||||
|
depths=4, |
||||||
|
key_dim=16, |
||||||
|
num_heads=8, |
||||||
|
attn_ratios=2, |
||||||
|
mlp_ratios=2, |
||||||
|
c2t_stride=2, |
||||||
|
drop_path_rate=0., |
||||||
|
act_layer=nn.ReLU6, |
||||||
|
                 injection_type="multi_sum",
||||||
|
injection=True, |
||||||
|
lr_mult=1.0, |
||||||
|
in_channels=3, |
||||||
|
pretrained=None): |
||||||
|
super().__init__() |
||||||
|
self.feat_channels = [ |
||||||
|
c[2] for i, c in enumerate(cfgs) if i in encoder_out_indices |
||||||
|
] |
||||||
|
self.injection_out_channels = injection_out_channels |
||||||
|
self.injection = injection |
||||||
|
self.embed_dim = sum(self.feat_channels) |
||||||
|
self.trans_out_indices = trans_out_indices |
||||||
|
|
||||||
|
self.tpm = TokenPyramidModule( |
||||||
|
cfgs=cfgs, |
||||||
|
out_indices=encoder_out_indices, |
||||||
|
in_channels=in_channels, |
||||||
|
lr_mult=lr_mult) |
||||||
|
self.ppa = PyramidPoolAgg(stride=c2t_stride) |
||||||
|
|
||||||
|
dpr = [x.item() for x in \ |
||||||
|
paddle.linspace(0, drop_path_rate, depths)] |
||||||
|
self.trans = BasicLayer( |
||||||
|
block_num=depths, |
||||||
|
embedding_dim=self.embed_dim, |
||||||
|
key_dim=key_dim, |
||||||
|
num_heads=num_heads, |
||||||
|
mlp_ratios=mlp_ratios, |
||||||
|
attn_ratio=attn_ratios, |
||||||
|
drop=0, |
||||||
|
attn_drop=0, |
||||||
|
drop_path=dpr, |
||||||
|
act_layer=act_layer, |
||||||
|
lr_mult=lr_mult) |
||||||
|
|
||||||
|
self.SIM = nn.LayerList() |
||||||
|
inj_module = SIM_BLOCK[injection_type] |
||||||
|
if self.injection: |
||||||
|
for i in range(len(self.feat_channels)): |
||||||
|
if i in trans_out_indices: |
||||||
|
self.SIM.append( |
||||||
|
inj_module( |
||||||
|
self.feat_channels[i], |
||||||
|
injection_out_channels[i], |
||||||
|
activations=act_layer, |
||||||
|
lr_mult=lr_mult)) |
||||||
|
else: |
||||||
|
self.SIM.append(Identity()) |
||||||
|
|
||||||
|
self.pretrained = pretrained |
||||||
|
self.init_weight() |
||||||
|
|
||||||
|
def init_weight(self): |
||||||
|
if self.pretrained is not None: |
||||||
|
utils.load_entire_model(self, self.pretrained) |
||||||
|
|
||||||
|
    def forward(self, x):
        outputs = self.tpm(x)
        out = self.ppa(outputs)
        out = self.trans(out)

        if self.injection:
            xx = out.split(self.feat_channels, axis=1)
            results = []
            for i in range(len(self.feat_channels)):
                if i in self.trans_out_indices:
                    local_tokens = outputs[i]
                    global_semantics = xx[i]
                    out_ = self.SIM[i](local_tokens, global_semantics)
                    results.append(out_)
            return results
        else:
            outputs.append(out)
            return outputs
||||||
|
|
||||||
|
|
||||||
|
@manager.BACKBONES.add_component |
||||||
|
def TopTransformer_Base(**kwargs): |
||||||
|
cfgs = [ |
||||||
|
# k, t, c, s |
||||||
|
[3, 1, 16, 1], # 1/2 |
||||||
|
[3, 4, 32, 2], # 1/4 1 |
||||||
|
[3, 3, 32, 1], # |
||||||
|
[5, 3, 64, 2], # 1/8 3 |
||||||
|
[5, 3, 64, 1], # |
||||||
|
[3, 3, 128, 2], # 1/16 5 |
||||||
|
[3, 3, 128, 1], # |
||||||
|
[5, 6, 160, 2], # 1/32 7 |
||||||
|
[5, 6, 160, 1], # |
||||||
|
[3, 6, 160, 1], # |
||||||
|
] |
||||||
|
|
||||||
|
model = TopTransformer( |
||||||
|
cfgs=cfgs, |
||||||
|
injection_out_channels=[None, 256, 256, 256], |
||||||
|
encoder_out_indices=[2, 4, 6, 9], |
||||||
|
trans_out_indices=[1, 2, 3], |
||||||
|
depths=4, |
||||||
|
key_dim=16, |
||||||
|
num_heads=8, |
||||||
|
attn_ratios=2, |
||||||
|
mlp_ratios=2, |
||||||
|
c2t_stride=2, |
||||||
|
drop_path_rate=0., |
||||||
|
act_layer=nn.ReLU6, |
||||||
|
injection_type="multi_sum", |
||||||
|
injection=True, |
||||||
|
**kwargs) |
||||||
|
return model |
||||||
|
|
||||||
|
|
||||||
|
@manager.BACKBONES.add_component |
||||||
|
def TopTransformer_Small(**kwargs): |
||||||
|
cfgs = [ |
||||||
|
# k, t, c, s |
||||||
|
[3, 1, 16, 1], # 1/2 |
||||||
|
[3, 4, 24, 2], # 1/4 1 |
||||||
|
[3, 3, 24, 1], # |
||||||
|
[5, 3, 48, 2], # 1/8 3 |
||||||
|
[5, 3, 48, 1], # |
||||||
|
[3, 3, 96, 2], # 1/16 5 |
||||||
|
[3, 3, 96, 1], # |
||||||
|
[5, 6, 128, 2], # 1/32 7 |
||||||
|
[5, 6, 128, 1], # |
||||||
|
[3, 6, 128, 1], # |
||||||
|
] |
||||||
|
|
||||||
|
model = TopTransformer( |
||||||
|
cfgs=cfgs, |
||||||
|
injection_out_channels=[None, 192, 192, 192], |
||||||
|
encoder_out_indices=[2, 4, 6, 9], |
||||||
|
trans_out_indices=[1, 2, 3], |
||||||
|
depths=4, |
||||||
|
key_dim=16, |
||||||
|
num_heads=6, |
||||||
|
attn_ratios=2, |
||||||
|
mlp_ratios=2, |
||||||
|
c2t_stride=2, |
||||||
|
drop_path_rate=0., |
||||||
|
act_layer=nn.ReLU6, |
||||||
|
injection_type="multi_sum", |
||||||
|
injection=True, |
||||||
|
**kwargs) |
||||||
|
return model |
||||||
|
|
||||||
|
|
||||||
|
@manager.BACKBONES.add_component |
||||||
|
def TopTransformer_Tiny(**kwargs): |
||||||
|
cfgs = [ |
||||||
|
# k, t, c, s |
||||||
|
[3, 1, 16, 1], # 1/2 |
||||||
|
[3, 4, 16, 2], # 1/4 1 |
||||||
|
[3, 3, 16, 1], # |
||||||
|
[5, 3, 32, 2], # 1/8 3 |
||||||
|
[5, 3, 32, 1], # |
||||||
|
[3, 3, 64, 2], # 1/16 5 |
||||||
|
[3, 3, 64, 1], # |
||||||
|
[5, 6, 96, 2], # 1/32 7 |
||||||
|
[5, 6, 96, 1], # |
||||||
|
] |
||||||
|
|
||||||
|
model = TopTransformer( |
||||||
|
cfgs=cfgs, |
||||||
|
injection_out_channels=[None, 128, 128, 128], |
||||||
|
encoder_out_indices=[2, 4, 6, 8], |
||||||
|
trans_out_indices=[1, 2, 3], |
||||||
|
depths=4, |
||||||
|
key_dim=16, |
||||||
|
num_heads=4, |
||||||
|
attn_ratios=2, |
||||||
|
mlp_ratios=2, |
||||||
|
c2t_stride=2, |
||||||
|
drop_path_rate=0., |
||||||
|
act_layer=nn.ReLU6, |
||||||
|
injection_type="multi_sum", |
||||||
|
injection=True, |
||||||
|
**kwargs) |
||||||
|
return model |
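# A minimal usage sketch of the factories above; the (1, 3, 512, 512) input is an
# assumption for illustration. With injection=True the backbone returns one fused
# feature map per entry of trans_out_indices.
if __name__ == "__main__":
    backbone = TopTransformer_Tiny()
    feats = backbone(paddle.rand([1, 3, 512, 512]))
    for feat in feats:
        # e.g. [1, 128, 64, 64], [1, 128, 32, 32], [1, 128, 16, 16]
        print(feat.shape)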
@ -0,0 +1,174 @@ |
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. |
||||||
|
# |
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License"); |
||||||
|
# you may not use this file except in compliance with the License. |
||||||
|
# You may obtain a copy of the License at |
||||||
|
# |
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0 |
||||||
|
# |
||||||
|
# Unless required by applicable law or agreed to in writing, software |
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS, |
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||||
|
# See the License for the specific language governing permissions and |
||||||
|
# limitations under the License. |
||||||
|
|
||||||
|
import paddle |
||||||
|
import paddle.nn as nn |
||||||
|
import paddle.nn.functional as F |
||||||
|
|
||||||
|
from paddlers.models.ppseg.cvlibs import manager |
||||||
|
from paddlers.models.ppseg.models import layers |
||||||
|
from paddlers.models.ppseg.utils import utils |
||||||
|
|
||||||
|
|
||||||
|
@manager.MODELS.add_component |
||||||
|
class CCNet(nn.Layer): |
||||||
|
""" |
||||||
|
The CCNet implementation based on PaddlePaddle. |
||||||
|
|
||||||
|
The original article refers to |
||||||
|
Zilong Huang, et al. "CCNet: Criss-Cross Attention for Semantic Segmentation" |
||||||
|
(https://arxiv.org/abs/1811.11721) |
||||||
|
|
||||||
|
Args: |
||||||
|
num_classes (int): The unique number of target classes. |
||||||
|
        backbone (paddle.nn.Layer): Backbone network, currently supporting Resnet18_vd/Resnet34_vd/Resnet50_vd/Resnet101_vd.
||||||
|
backbone_indices (tuple, list, optional): Two values in the tuple indicate the indices of output of backbone. Default: (2, 3). |
||||||
|
        enable_auxiliary_loss (bool, optional): A bool value that indicates whether to add an auxiliary loss. Default: True.
||||||
|
dropout_prob (float, optional): The probability of dropout. Default: 0.0. |
||||||
|
        recurrence (int, optional): The number of recurrent operations. Default: 1.
||||||
|
align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even, |
||||||
|
e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. |
||||||
|
pretrained (str, optional): The path or url of pretrained model. Default: None. |
||||||
|
""" |
||||||
|
|
||||||
|
def __init__(self, |
||||||
|
num_classes, |
||||||
|
backbone, |
||||||
|
backbone_indices=(2, 3), |
||||||
|
enable_auxiliary_loss=True, |
||||||
|
dropout_prob=0.0, |
||||||
|
recurrence=1, |
||||||
|
align_corners=False, |
||||||
|
pretrained=None): |
||||||
|
super().__init__() |
||||||
|
self.enable_auxiliary_loss = enable_auxiliary_loss |
||||||
|
self.recurrence = recurrence |
||||||
|
self.align_corners = align_corners |
||||||
|
|
||||||
|
self.backbone = backbone |
||||||
|
self.backbone_indices = backbone_indices |
||||||
|
backbone_channels = [ |
||||||
|
backbone.feat_channels[i] for i in backbone_indices |
||||||
|
] |
||||||
|
|
||||||
|
if enable_auxiliary_loss: |
||||||
|
self.aux_head = layers.AuxLayer( |
||||||
|
backbone_channels[0], |
||||||
|
512, |
||||||
|
num_classes, |
||||||
|
dropout_prob=dropout_prob) |
||||||
|
self.head = RCCAModule( |
||||||
|
backbone_channels[1], |
||||||
|
512, |
||||||
|
num_classes, |
||||||
|
dropout_prob=dropout_prob, |
||||||
|
recurrence=recurrence) |
||||||
|
        self.pretrained = pretrained
        self.init_weight()
||||||
|
|
||||||
|
def init_weight(self): |
||||||
|
if self.pretrained is not None: |
||||||
|
utils.load_entire_model(self, self.pretrained) |
||||||
|
|
||||||
|
def forward(self, x): |
||||||
|
feat_list = self.backbone(x) |
||||||
|
logit_list = [] |
||||||
|
output = self.head(feat_list[self.backbone_indices[-1]]) |
||||||
|
logit_list.append(output) |
||||||
|
if self.training and self.enable_auxiliary_loss: |
||||||
|
aux_out = self.aux_head(feat_list[self.backbone_indices[-2]]) |
||||||
|
logit_list.append(aux_out) |
||||||
|
return [ |
||||||
|
F.interpolate( |
||||||
|
logit, |
||||||
|
paddle.shape(x)[2:], |
||||||
|
mode='bilinear', |
||||||
|
align_corners=self.align_corners) for logit in logit_list |
||||||
|
] |
||||||
|
|
||||||
|
|
||||||
|
class RCCAModule(nn.Layer): |
||||||
|
def __init__(self, |
||||||
|
in_channels, |
||||||
|
out_channels, |
||||||
|
num_classes, |
||||||
|
dropout_prob=0.1, |
||||||
|
recurrence=1): |
||||||
|
super().__init__() |
||||||
|
inter_channels = in_channels // 4 |
||||||
|
self.recurrence = recurrence |
||||||
|
self.conva = layers.ConvBNLeakyReLU( |
||||||
|
in_channels, inter_channels, 3, padding=1, bias_attr=False) |
||||||
|
self.cca = CrissCrossAttention(inter_channels) |
||||||
|
self.convb = layers.ConvBNLeakyReLU( |
||||||
|
inter_channels, inter_channels, 3, padding=1, bias_attr=False) |
||||||
|
self.out = layers.AuxLayer( |
||||||
|
in_channels + inter_channels, |
||||||
|
out_channels, |
||||||
|
num_classes, |
||||||
|
dropout_prob=dropout_prob) |
||||||
|
|
||||||
|
def forward(self, x): |
||||||
|
feat = self.conva(x) |
||||||
|
for i in range(self.recurrence): |
||||||
|
feat = self.cca(feat) |
||||||
|
feat = self.convb(feat) |
||||||
|
output = self.out(paddle.concat([x, feat], axis=1)) |
||||||
|
return output |
||||||
|
|
||||||
|
|
||||||
|
class CrissCrossAttention(nn.Layer): |
||||||
|
def __init__(self, in_channels): |
||||||
|
super().__init__() |
||||||
|
self.q_conv = nn.Conv2D(in_channels, in_channels // 8, kernel_size=1) |
||||||
|
self.k_conv = nn.Conv2D(in_channels, in_channels // 8, kernel_size=1) |
||||||
|
self.v_conv = nn.Conv2D(in_channels, in_channels, kernel_size=1) |
||||||
|
self.softmax = nn.Softmax(axis=3) |
||||||
|
self.gamma = self.create_parameter( |
||||||
|
shape=(1, ), default_initializer=nn.initializer.Constant(0)) |
||||||
|
self.inf_tensor = paddle.full(shape=(1, ), fill_value=float('inf')) |
||||||
|
|
||||||
|
def forward(self, x): |
||||||
|
b, c, h, w = paddle.shape(x) |
||||||
|
proj_q = self.q_conv(x) |
||||||
|
proj_q_h = proj_q.transpose([0, 3, 1, 2]).reshape( |
||||||
|
[b * w, -1, h]).transpose([0, 2, 1]) |
||||||
|
proj_q_w = proj_q.transpose([0, 2, 1, 3]).reshape( |
||||||
|
[b * h, -1, w]).transpose([0, 2, 1]) |
||||||
|
|
||||||
|
proj_k = self.k_conv(x) |
||||||
|
proj_k_h = proj_k.transpose([0, 3, 1, 2]).reshape([b * w, -1, h]) |
||||||
|
proj_k_w = proj_k.transpose([0, 2, 1, 3]).reshape([b * h, -1, w]) |
||||||
|
|
||||||
|
proj_v = self.v_conv(x) |
||||||
|
proj_v_h = proj_v.transpose([0, 3, 1, 2]).reshape([b * w, -1, h]) |
||||||
|
proj_v_w = proj_v.transpose([0, 2, 1, 3]).reshape([b * h, -1, w]) |
||||||
|
|
||||||
|
energy_h = (paddle.bmm(proj_q_h, proj_k_h) + self.Inf(b, h, w)).reshape( |
||||||
|
[b, w, h, h]).transpose([0, 2, 1, 3]) |
||||||
|
energy_w = paddle.bmm(proj_q_w, proj_k_w).reshape([b, h, w, w]) |
||||||
|
concate = self.softmax(paddle.concat([energy_h, energy_w], axis=3)) |
||||||
|
|
||||||
|
attn_h = concate[:, :, :, 0:h].transpose([0, 2, 1, 3]).reshape( |
||||||
|
[b * w, h, h]) |
||||||
|
attn_w = concate[:, :, :, h:h + w].reshape([b * h, w, w]) |
||||||
|
out_h = paddle.bmm(proj_v_h, attn_h.transpose([0, 2, 1])).reshape( |
||||||
|
[b, w, -1, h]).transpose([0, 2, 3, 1]) |
||||||
|
out_w = paddle.bmm(proj_v_w, attn_w.transpose([0, 2, 1])).reshape( |
||||||
|
[b, h, -1, w]).transpose([0, 2, 1, 3]) |
||||||
|
return self.gamma * (out_h + out_w) + x |
||||||
|
|
||||||
|
def Inf(self, B, H, W): |
||||||
|
return -paddle.tile( |
||||||
|
paddle.diag(paddle.tile(self.inf_tensor, [H]), 0).unsqueeze(0), |
||||||
|
[B * W, 1, 1]) |
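    # Note on Inf(): paddle.diag(paddle.tile(self.inf_tensor, [H])) builds an
    # H x H matrix with inf on the diagonal; negating it and tiling it to
    # [B * W, H, H] before adding it to the vertical energies masks out each
    # pixel's attention to itself along the column path, so after the softmax
    # the center position is only attended once (through the row path) where
    # the two criss-cross paths overlap.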
@ -0,0 +1,403 @@ |
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. |
||||||
|
# |
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License"); |
||||||
|
# you may not use this file except in compliance with the License. |
||||||
|
# You may obtain a copy of the License at |
||||||
|
# |
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0 |
||||||
|
# |
||||||
|
# Unless required by applicable law or agreed to in writing, software |
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS, |
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||||
|
# See the License for the specific language governing permissions and |
||||||
|
# limitations under the License. |
||||||
|
|
||||||
|
import paddle |
||||||
|
import paddle.nn as nn |
||||||
|
import paddle.nn.functional as F |
||||||
|
|
||||||
|
from paddlers.models.ppseg.cvlibs import manager, param_init |
||||||
|
from paddlers.models.ppseg.models import layers |
||||||
|
from paddlers.models.ppseg.utils import utils |
||||||
|
|
||||||
|
|
||||||
|
class DualResNet(nn.Layer): |
||||||
|
""" |
||||||
|
The DDRNet implementation based on PaddlePaddle. |
||||||
|
|
||||||
|
The original article refers to |
||||||
|
Yuanduo Hong, Huihui Pan, Weichao Sun, et al. "Deep Dual-resolution Networks for Real-time and Accurate Semantic Segmentation of Road Scenes" |
||||||
|
(https://arxiv.org/abs/2101.06085) |
||||||
|
|
||||||
|
Args: |
||||||
|
num_classes (int): The unique number of target classes. |
||||||
|
in_channels (int, optional): Number of input channels. Default: 3. |
||||||
|
block_layers (list, tuple): The numbers of layers in different blocks. Default: [2, 2, 2, 2]. |
||||||
|
planes (int): Base channels in network. Default: 64. |
||||||
|
spp_planes (int): Branch channels for DAPPM. Default: 128. |
||||||
|
head_planes (int): Mid channels of segmentation head. Default: 128. |
||||||
|
        enable_auxiliary_loss (bool): Whether to use an auxiliary head for stage 3. Default: False.
||||||
|
pretrained (str, optional): The path or url of pretrained model. Default: None. |
||||||
|
""" |
||||||
|
|
||||||
|
def __init__(self, |
||||||
|
num_classes, |
||||||
|
in_channels=3, |
||||||
|
block_layers=[2, 2, 2, 2], |
||||||
|
planes=64, |
||||||
|
spp_planes=128, |
||||||
|
head_planes=128, |
||||||
|
enable_auxiliary_loss=False, |
||||||
|
pretrained=None): |
||||||
|
super().__init__() |
||||||
|
highres_planes = planes * 2 |
||||||
|
self.enable_auxiliary_loss = enable_auxiliary_loss |
||||||
|
self.conv1 = nn.Sequential( |
||||||
|
layers.ConvBNReLU( |
||||||
|
in_channels, planes, kernel_size=3, stride=2, padding=1), |
||||||
|
layers.ConvBNReLU( |
||||||
|
planes, planes, kernel_size=3, stride=2, padding=1), ) |
||||||
|
self.relu = nn.ReLU() |
||||||
|
self.layer1 = self._make_layers(BasicBlock, planes, planes, |
||||||
|
block_layers[0]) |
||||||
|
self.layer2 = self._make_layers( |
||||||
|
BasicBlock, planes, planes * 2, block_layers[1], stride=2) |
||||||
|
self.layer3 = self._make_layers( |
||||||
|
BasicBlock, planes * 2, planes * 4, block_layers[2], stride=2) |
||||||
|
self.layer4 = self._make_layers( |
||||||
|
BasicBlock, planes * 4, planes * 8, block_layers[3], stride=2) |
||||||
|
|
||||||
|
self.compression3 = layers.ConvBN( |
||||||
|
planes * 4, highres_planes, kernel_size=1, bias_attr=False) |
||||||
|
|
||||||
|
self.compression4 = layers.ConvBN( |
||||||
|
planes * 8, highres_planes, kernel_size=1, bias_attr=False) |
||||||
|
|
||||||
|
self.down3 = layers.ConvBN( |
||||||
|
highres_planes, |
||||||
|
planes * 4, |
||||||
|
kernel_size=3, |
||||||
|
stride=2, |
||||||
|
bias_attr=False) |
||||||
|
|
||||||
|
self.down4 = nn.Sequential( |
||||||
|
layers.ConvBNReLU( |
||||||
|
highres_planes, |
||||||
|
planes * 4, |
||||||
|
kernel_size=3, |
||||||
|
stride=2, |
||||||
|
padding=1, |
||||||
|
bias_attr=False), |
||||||
|
layers.ConvBN( |
||||||
|
planes * 4, |
||||||
|
planes * 8, |
||||||
|
kernel_size=3, |
||||||
|
stride=2, |
||||||
|
padding=1, |
||||||
|
bias_attr=False)) |
||||||
|
|
||||||
|
self.layer3_ = self._make_layers(BasicBlock, planes * 2, highres_planes, |
||||||
|
2) |
||||||
|
self.layer4_ = self._make_layers(BasicBlock, highres_planes, |
||||||
|
highres_planes, 2) |
||||||
|
self.layer5_ = self._make_layers(Bottleneck, highres_planes, |
||||||
|
highres_planes, 1) |
||||||
|
self.layer5 = self._make_layers( |
||||||
|
Bottleneck, planes * 8, planes * 8, 1, stride=2) |
||||||
|
|
||||||
|
self.spp = DAPPM(planes * 16, spp_planes, planes * 4) |
||||||
|
if self.enable_auxiliary_loss: |
||||||
|
self.aux_head = DDRNetHead(highres_planes, head_planes, num_classes) |
||||||
|
self.head = DDRNetHead(planes * 4, head_planes, num_classes) |
||||||
|
|
||||||
|
self.pretrained = pretrained |
||||||
|
self.init_weight() |
||||||
|
|
||||||
|
def init_weight(self): |
||||||
|
if self.pretrained is not None: |
||||||
|
utils.load_entire_model(self, self.pretrained) |
||||||
|
else: |
||||||
|
for m in self.sublayers(): |
||||||
|
if isinstance(m, nn.Conv2D): |
||||||
|
param_init.kaiming_normal_init(m.weight) |
||||||
|
elif isinstance(m, nn.BatchNorm2D): |
||||||
|
param_init.constant_init(m.weight, value=1) |
||||||
|
param_init.constant_init(m.bias, value=0) |
||||||
|
|
||||||
|
def _make_layers(self, block, inplanes, planes, blocks, stride=1): |
||||||
|
downsample = None |
||||||
|
if stride != 1 or inplanes != planes * block.expansion: |
||||||
|
downsample = nn.Sequential( |
||||||
|
nn.Conv2D( |
||||||
|
inplanes, |
||||||
|
planes * block.expansion, |
||||||
|
kernel_size=1, |
||||||
|
stride=stride, |
||||||
|
bias_attr=False), |
||||||
|
nn.BatchNorm2D(planes * block.expansion), ) |
||||||
|
layers = [] |
||||||
|
layers.append(block(inplanes, planes, stride, downsample)) |
||||||
|
inplanes = planes * block.expansion |
||||||
|
for i in range(1, blocks): |
||||||
|
if i == (blocks - 1): |
||||||
|
layers.append(block(inplanes, planes, stride=1, no_relu=True)) |
||||||
|
else: |
||||||
|
layers.append(block(inplanes, planes, stride=1, no_relu=False)) |
||||||
|
return nn.Sequential(*layers) |
||||||
|
|
||||||
|
def forward(self, x): |
||||||
|
n, c, h, w = paddle.shape(x) |
||||||
|
width_output = w // 8 |
||||||
|
height_output = h // 8 |
||||||
|
|
||||||
|
x = self.conv1(x) |
||||||
|
stage1_out = self.layer1(x) |
||||||
|
stage2_out = self.layer2(self.relu(stage1_out)) |
||||||
|
stage3_out = self.layer3(self.relu(stage2_out)) |
||||||
|
stage3_out_dual = self.layer3_(self.relu(stage2_out)) |
||||||
|
x = stage3_out + self.down3(self.relu(stage3_out_dual)) |
||||||
|
stage3_merge = stage3_out_dual + F.interpolate( |
||||||
|
self.compression3(self.relu(stage3_out)), |
||||||
|
size=[height_output, width_output], |
||||||
|
mode='bilinear') |
||||||
|
|
||||||
|
stage4_out = self.layer4(self.relu(x)) |
||||||
|
stage4_out_dual = self.layer4_(self.relu(stage3_merge)) |
||||||
|
|
||||||
|
x = stage4_out + self.down4(self.relu(stage4_out_dual)) |
||||||
|
stage4_merge = stage4_out_dual + F.interpolate( |
||||||
|
self.compression4(self.relu(stage4_out)), |
||||||
|
size=[height_output, width_output], |
||||||
|
mode='bilinear') |
||||||
|
|
||||||
|
stage5_out_dual = self.layer5_(self.relu(stage4_merge)) |
||||||
|
x = F.interpolate( |
||||||
|
self.spp(self.layer5(self.relu(x))), |
||||||
|
size=[height_output, width_output], |
||||||
|
mode='bilinear') |
||||||
|
|
||||||
|
output = self.head(x + stage5_out_dual) |
||||||
|
logit_list = [] |
||||||
|
logit_list.append(output) |
||||||
|
|
||||||
|
if self.enable_auxiliary_loss: |
||||||
|
aux_out = self.aux_head(stage3_merge) |
||||||
|
logit_list.append(aux_out) |
||||||
|
return [ |
||||||
|
F.interpolate( |
||||||
|
logit, [h, w], mode='bilinear') for logit in logit_list |
||||||
|
] |
||||||
|
|
||||||
|
|
||||||
|
class BasicBlock(nn.Layer): |
||||||
|
expansion = 1 |
||||||
|
|
||||||
|
def __init__(self, |
||||||
|
inplanes, |
||||||
|
planes, |
||||||
|
stride=1, |
||||||
|
downsample=None, |
||||||
|
no_relu=False): |
||||||
|
super().__init__() |
||||||
|
self.conv_bn_relu = layers.ConvBNReLU( |
||||||
|
inplanes, |
||||||
|
planes, |
||||||
|
kernel_size=3, |
||||||
|
stride=stride, |
||||||
|
padding=1, |
||||||
|
bias_attr=False) |
||||||
|
self.relu = nn.ReLU() |
||||||
|
self.conv_bn = layers.ConvBN( |
||||||
|
planes, planes, kernel_size=3, stride=1, padding=1, bias_attr=False) |
||||||
|
self.downsample = downsample |
||||||
|
self.stride = stride |
||||||
|
self.no_relu = no_relu |
||||||
|
|
||||||
|
def forward(self, x): |
||||||
|
residual = x |
||||||
|
out = self.conv_bn_relu(x) |
||||||
|
out = self.conv_bn(out) |
||||||
|
if self.downsample is not None: |
||||||
|
residual = self.downsample(x) |
||||||
|
out += residual |
||||||
|
if self.no_relu: |
||||||
|
return out |
||||||
|
else: |
||||||
|
return self.relu(out) |
||||||
|
|
||||||
|
|
||||||
|
class Bottleneck(nn.Layer): |
||||||
|
expansion = 2 |
||||||
|
|
||||||
|
def __init__(self, |
||||||
|
inplanes, |
||||||
|
planes, |
||||||
|
stride=1, |
||||||
|
downsample=None, |
||||||
|
no_relu=True): |
||||||
|
super().__init__() |
||||||
|
self.conv_bn_relu1 = layers.ConvBNReLU( |
||||||
|
inplanes, planes, kernel_size=1, bias_attr=False) |
||||||
|
self.conv_bn_relu2 = layers.ConvBNReLU( |
||||||
|
planes, |
||||||
|
planes, |
||||||
|
kernel_size=3, |
||||||
|
stride=stride, |
||||||
|
padding=1, |
||||||
|
bias_attr=False) |
||||||
|
self.conv_bn = layers.ConvBN( |
||||||
|
planes, planes * self.expansion, kernel_size=1, bias_attr=False) |
||||||
|
self.relu = nn.ReLU() |
||||||
|
self.downsample = downsample |
||||||
|
self.stride = stride |
||||||
|
self.no_relu = no_relu |
||||||
|
|
||||||
|
def forward(self, x): |
||||||
|
residual = x |
||||||
|
out = self.conv_bn_relu1(x) |
||||||
|
out = self.conv_bn_relu2(out) |
||||||
|
out = self.conv_bn(out) |
||||||
|
if self.downsample is not None: |
||||||
|
residual = self.downsample(x) |
||||||
|
out += residual |
||||||
|
if self.no_relu: |
||||||
|
return out |
||||||
|
else: |
||||||
|
return self.relu(out) |
||||||
|
|
||||||
|
|
||||||
|
class DAPPM(nn.Layer): |
||||||
|
def __init__(self, inplanes, branch_planes, outplanes): |
||||||
|
super().__init__() |
||||||
|
self.scale1 = nn.Sequential( |
||||||
|
nn.AvgPool2D( |
||||||
|
kernel_size=5, stride=2, padding=2), |
||||||
|
layers.SyncBatchNorm(inplanes), |
||||||
|
nn.ReLU(), |
||||||
|
nn.Conv2D( |
||||||
|
inplanes, branch_planes, kernel_size=1, bias_attr=False), ) |
||||||
|
self.scale2 = nn.Sequential( |
||||||
|
nn.AvgPool2D( |
||||||
|
kernel_size=9, stride=4, padding=4), |
||||||
|
layers.SyncBatchNorm(inplanes), |
||||||
|
nn.ReLU(), |
||||||
|
nn.Conv2D( |
||||||
|
inplanes, branch_planes, kernel_size=1, bias_attr=False), ) |
||||||
|
self.scale3 = nn.Sequential( |
||||||
|
nn.AvgPool2D( |
||||||
|
kernel_size=17, stride=8, padding=8), |
||||||
|
layers.SyncBatchNorm(inplanes), |
||||||
|
nn.ReLU(), |
||||||
|
nn.Conv2D( |
||||||
|
inplanes, branch_planes, kernel_size=1, bias_attr=False), ) |
||||||
|
self.scale4 = nn.Sequential( |
||||||
|
nn.AdaptiveAvgPool2D((1, 1)), |
||||||
|
layers.SyncBatchNorm(inplanes), |
||||||
|
nn.ReLU(), |
||||||
|
nn.Conv2D( |
||||||
|
inplanes, branch_planes, kernel_size=1, bias_attr=False), ) |
||||||
|
self.scale0 = nn.Sequential( |
||||||
|
layers.SyncBatchNorm(inplanes), |
||||||
|
nn.ReLU(), |
||||||
|
nn.Conv2D( |
||||||
|
inplanes, branch_planes, kernel_size=1, bias_attr=False), ) |
||||||
|
self.process1 = nn.Sequential( |
||||||
|
layers.SyncBatchNorm(branch_planes), |
||||||
|
nn.ReLU(), |
||||||
|
nn.Conv2D( |
||||||
|
branch_planes, |
||||||
|
branch_planes, |
||||||
|
kernel_size=3, |
||||||
|
padding=1, |
||||||
|
bias_attr=False), ) |
||||||
|
self.process2 = nn.Sequential( |
||||||
|
layers.SyncBatchNorm(branch_planes), |
||||||
|
nn.ReLU(), |
||||||
|
nn.Conv2D( |
||||||
|
branch_planes, |
||||||
|
branch_planes, |
||||||
|
kernel_size=3, |
||||||
|
padding=1, |
||||||
|
bias_attr=False), ) |
||||||
|
self.process3 = nn.Sequential( |
||||||
|
layers.SyncBatchNorm(branch_planes), |
||||||
|
nn.ReLU(), |
||||||
|
nn.Conv2D( |
||||||
|
branch_planes, |
||||||
|
branch_planes, |
||||||
|
kernel_size=3, |
||||||
|
padding=1, |
||||||
|
bias_attr=False), ) |
||||||
|
self.process4 = nn.Sequential( |
||||||
|
layers.SyncBatchNorm(branch_planes), |
||||||
|
nn.ReLU(), |
||||||
|
nn.Conv2D( |
||||||
|
branch_planes, |
||||||
|
branch_planes, |
||||||
|
kernel_size=3, |
||||||
|
padding=1, |
||||||
|
bias_attr=False), ) |
||||||
|
self.compression = nn.Sequential( |
||||||
|
layers.SyncBatchNorm(branch_planes * 5), |
||||||
|
nn.ReLU(), |
||||||
|
nn.Conv2D( |
||||||
|
branch_planes * 5, outplanes, kernel_size=1, bias_attr=False)) |
||||||
|
self.shortcut = nn.Sequential( |
||||||
|
layers.SyncBatchNorm(inplanes), |
||||||
|
nn.ReLU(), |
||||||
|
nn.Conv2D( |
||||||
|
inplanes, outplanes, kernel_size=1, bias_attr=False)) |
||||||
|
|
||||||
|
def forward(self, x): |
||||||
|
n, c, h, w = paddle.shape(x) |
||||||
|
x0 = self.scale0(x) |
||||||
|
x1 = self.process1( |
||||||
|
F.interpolate( |
||||||
|
self.scale1(x), size=[h, w], mode='bilinear') + x0) |
||||||
|
x2 = self.process2( |
||||||
|
F.interpolate( |
||||||
|
self.scale2(x), size=[h, w], mode='bilinear') + x1) |
||||||
|
x3 = self.process3( |
||||||
|
F.interpolate( |
||||||
|
self.scale3(x), size=[h, w], mode='bilinear') + x2) |
||||||
|
x4 = self.process4( |
||||||
|
F.interpolate( |
||||||
|
self.scale4(x), size=[h, w], mode='bilinear') + x3) |
||||||
|
|
||||||
|
out = self.compression(paddle.concat([x0, x1, x2, x3, x4], |
||||||
|
1)) + self.shortcut(x) |
||||||
|
return out |
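    # A worked example of the pyramid above, assuming DDRNet-23 (inplanes=1024,
    # branch_planes=128) and a [1, 1024, 8, 16] input, i.e. the 1/64-resolution
    # map of a 512x1024 image; all sizes are for illustration only:
    #   scale0: 1x1 conv                  -> [1, 128, 8, 16]
    #   scale1: 5x5  avg pool, stride 2   -> [1, 128, 4, 8]
    #   scale2: 9x9  avg pool, stride 4   -> [1, 128, 2, 4]
    #   scale3: 17x17 avg pool, stride 8  -> [1, 128, 1, 2]
    #   scale4: global average pool       -> [1, 128, 1, 1]
    # Each coarser branch is bilinearly resized back to 8x16, fused with the next
    # finer one through a 3x3 conv (process1..4), and the five maps are
    # concatenated (5 * 128 channels) before the 1x1 compression plus shortcut.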
||||||
|
|
||||||
|
|
||||||
|
class DDRNetHead(nn.Layer): |
||||||
|
def __init__(self, inplanes, interplanes, outplanes, scale_factor=None): |
||||||
|
super().__init__() |
||||||
|
self.bn1 = nn.BatchNorm2D(inplanes) |
||||||
|
self.relu = nn.ReLU() |
||||||
|
self.conv_bn_relu = layers.ConvBNReLU( |
||||||
|
inplanes, interplanes, kernel_size=3, padding=1, bias_attr=False) |
||||||
|
self.conv = nn.Conv2D( |
||||||
|
interplanes, outplanes, kernel_size=1, padding=0, bias_attr=True) |
||||||
|
|
||||||
|
self.scale_factor = scale_factor |
||||||
|
|
||||||
|
def forward(self, x): |
||||||
|
x = self.bn1(x) |
||||||
|
x = self.relu(x) |
||||||
|
x = self.conv_bn_relu(x) |
||||||
|
out = self.conv(x) |
||||||
|
|
||||||
|
if self.scale_factor is not None: |
||||||
|
out = F.interpolate( |
||||||
|
out, scale_factor=self.scale_factor, mode='bilinear') |
||||||
|
return out |
||||||
|
|
||||||
|
|
||||||
|
@manager.MODELS.add_component |
||||||
|
def DDRNet_23(**kwargs): |
||||||
|
return DualResNet( |
||||||
|
block_layers=[2, 2, 2, 2], |
||||||
|
planes=64, |
||||||
|
spp_planes=128, |
||||||
|
head_planes=128, |
||||||
|
**kwargs) |
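# A minimal usage sketch of DDRNet_23; the 19 classes and the (1, 3, 512, 1024)
# input are assumptions for illustration (a Cityscapes-like setting).
if __name__ == "__main__":
    model = DDRNet_23(num_classes=19)
    logits = model(paddle.rand([1, 3, 512, 1024]))
    # a single logit map, upsampled back to the input size (the auxiliary head
    # is only added when enable_auxiliary_loss=True)
    print([l.shape for l in logits])  # e.g. [[1, 19, 512, 1024]]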
@ -0,0 +1,198 @@ |
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. |
||||||
|
# |
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License"); |
||||||
|
# you may not use this file except in compliance with the License. |
||||||
|
# You may obtain a copy of the License at |
||||||
|
# |
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0 |
||||||
|
# |
||||||
|
# Unless required by applicable law or agreed to in writing, software |
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS, |
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||||
|
# See the License for the specific language governing permissions and |
||||||
|
# limitations under the License. |
||||||
|
|
||||||
|
import paddle |
||||||
|
import paddle.nn as nn |
||||||
|
import paddle.nn.functional as F |
||||||
|
|
||||||
|
from paddlers.models.ppseg.cvlibs import manager |
||||||
|
from paddlers.models.ppseg.models import layers |
||||||
|
from paddlers.models.ppseg.utils import utils |
||||||
|
|
||||||
|
|
||||||
|
@manager.MODELS.add_component |
||||||
|
class GloRe(nn.Layer): |
||||||
|
""" |
||||||
|
The GloRe implementation based on PaddlePaddle. |
||||||
|
|
||||||
|
The original article refers to: |
||||||
|
Chen, Yunpeng, et al. "Graph-Based Global Reasoning Networks" |
||||||
|
(https://arxiv.org/pdf/1811.12814.pdf) |
||||||
|
|
||||||
|
Args: |
||||||
|
num_classes (int): The unique number of target classes. |
||||||
|
        backbone (paddle.nn.Layer): Backbone network, currently supporting Resnet50/101.
        backbone_indices (tuple, optional): Two values in the tuple indicate the indices of output of backbone. Default: (2, 3).
        gru_channels (int, optional): The number of input channels in GloRe Unit. Default: 512.
        gru_num_state (int, optional): The number of states in GloRe Unit. Default: 128.
        gru_num_node (int, optional): The number of nodes in GloRe Unit. Default: 64.
        enable_auxiliary_loss (bool, optional): A bool value that indicates whether to add an auxiliary loss. Default: True.
||||||
|
align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even, |
||||||
|
e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. |
||||||
|
pretrained (str, optional): The path or url of pretrained model. Default: None. |
||||||
|
""" |
||||||
|
|
||||||
|
def __init__(self, |
||||||
|
num_classes, |
||||||
|
backbone, |
||||||
|
backbone_indices=(2, 3), |
||||||
|
gru_channels=512, |
||||||
|
gru_num_state=128, |
||||||
|
gru_num_node=64, |
||||||
|
enable_auxiliary_loss=True, |
||||||
|
align_corners=False, |
||||||
|
pretrained=None): |
||||||
|
super().__init__() |
||||||
|
|
||||||
|
self.backbone = backbone |
||||||
|
backbone_channels = [ |
||||||
|
backbone.feat_channels[i] for i in backbone_indices |
||||||
|
] |
||||||
|
|
||||||
|
self.head = GloReHead(num_classes, backbone_indices, backbone_channels, |
||||||
|
gru_channels, gru_num_state, gru_num_node, |
||||||
|
enable_auxiliary_loss) |
||||||
|
self.align_corners = align_corners |
||||||
|
self.pretrained = pretrained |
||||||
|
self.init_weight() |
||||||
|
|
||||||
|
def forward(self, x): |
||||||
|
feat_list = self.backbone(x) |
||||||
|
logit_list = self.head(feat_list) |
||||||
|
return [ |
||||||
|
F.interpolate( |
||||||
|
logit, |
||||||
|
paddle.shape(x)[2:], |
||||||
|
mode='bilinear', |
||||||
|
align_corners=self.align_corners) for logit in logit_list |
||||||
|
] |
||||||
|
|
||||||
|
def init_weight(self): |
||||||
|
if self.pretrained is not None: |
||||||
|
utils.load_entire_model(self, self.pretrained) |
||||||
|
|
||||||
|
|
||||||
|
class GloReHead(nn.Layer): |
||||||
|
def __init__(self, |
||||||
|
num_classes, |
||||||
|
backbone_indices, |
||||||
|
backbone_channels, |
||||||
|
gru_channels=512, |
||||||
|
gru_num_state=128, |
||||||
|
gru_num_node=64, |
||||||
|
enable_auxiliary_loss=True): |
||||||
|
super().__init__() |
||||||
|
|
||||||
|
in_channels = backbone_channels[1] |
||||||
|
self.conv_bn_relu = layers.ConvBNReLU( |
||||||
|
in_channels, gru_channels, 1, bias_attr=False) |
||||||
|
self.gru_module = GruModule( |
||||||
|
num_input=gru_channels, |
||||||
|
num_state=gru_num_state, |
||||||
|
num_node=gru_num_node) |
||||||
|
|
||||||
|
self.dropout = nn.Dropout(0.1) |
||||||
|
self.classifier = nn.Conv2D(512, num_classes, kernel_size=1) |
||||||
|
self.auxlayer = layers.AuxLayer( |
||||||
|
in_channels=backbone_channels[0], |
||||||
|
inter_channels=backbone_channels[0] // 4, |
||||||
|
out_channels=num_classes) |
||||||
|
|
||||||
|
self.backbone_indices = backbone_indices |
||||||
|
self.enable_auxiliary_loss = enable_auxiliary_loss |
||||||
|
|
||||||
|
def forward(self, feat_list): |
||||||
|
|
||||||
|
logit_list = [] |
||||||
|
x = feat_list[self.backbone_indices[1]] |
||||||
|
|
||||||
|
feature = self.conv_bn_relu(x) |
||||||
|
gru_output = self.gru_module(feature) |
||||||
|
output = self.dropout(gru_output) |
||||||
|
logit = self.classifier(output) |
||||||
|
logit_list.append(logit) |
||||||
|
|
||||||
|
if self.enable_auxiliary_loss: |
||||||
|
low_level_feat = feat_list[self.backbone_indices[0]] |
||||||
|
auxiliary_logit = self.auxlayer(low_level_feat) |
||||||
|
logit_list.append(auxiliary_logit) |
||||||
|
|
||||||
|
return logit_list |
||||||
|
|
||||||
|
|
||||||
|
class GCN(nn.Layer): |
||||||
|
def __init__(self, num_state, num_node, bias=False): |
||||||
|
super(GCN, self).__init__() |
||||||
|
self.conv1 = nn.Conv1D(num_node, num_node, kernel_size=1) |
||||||
|
self.relu = nn.ReLU() |
||||||
|
self.conv2 = nn.Conv1D( |
||||||
|
num_state, num_state, kernel_size=1, bias_attr=bias) |
||||||
|
|
||||||
|
def forward(self, x): |
||||||
|
h = self.conv1(paddle.transpose(x, perm=(0, 2, 1))) |
||||||
|
h = paddle.transpose(h, perm=(0, 2, 1)) |
||||||
|
h = h + x |
||||||
|
h = self.relu(self.conv2(h)) |
||||||
|
return h |
||||||
|
|
||||||
|
|
||||||
|
class GruModule(nn.Layer): |
||||||
|
def __init__(self, |
||||||
|
num_input=512, |
||||||
|
num_state=128, |
||||||
|
num_node=64, |
||||||
|
normalize=False): |
||||||
|
super(GruModule, self).__init__() |
||||||
|
self.normalize = normalize |
||||||
|
self.num_state = num_state |
||||||
|
self.num_node = num_node |
||||||
|
self.reduction_dim = nn.Conv2D(num_input, num_state, kernel_size=1) |
||||||
|
self.projection_mat = nn.Conv2D(num_input, num_node, kernel_size=1) |
||||||
|
self.gcn = GCN(num_state=self.num_state, num_node=self.num_node) |
||||||
|
self.extend_dim = nn.Conv2D( |
||||||
|
self.num_state, num_input, kernel_size=1, bias_attr=False) |
||||||
|
self.extend_bn = layers.SyncBatchNorm(num_input, epsilon=1e-4) |
||||||
|
|
||||||
|
def forward(self, input): |
||||||
|
n, c, h, w = input.shape |
||||||
|
# B, C, H, W |
||||||
|
reduction_dim = self.reduction_dim(input) |
||||||
|
# B, N, H, W |
||||||
|
mat_B = self.projection_mat(input) |
||||||
|
# B, C, H*W |
||||||
|
reshaped_reduction = paddle.reshape( |
||||||
|
reduction_dim, shape=[n, self.num_state, h * w]) |
||||||
|
# B, N, H*W |
||||||
|
reshaped_B = paddle.reshape(mat_B, shape=[n, self.num_node, h * w]) |
||||||
|
# B, N, H*W |
||||||
|
reproject = reshaped_B |
||||||
|
# B, C, N |
||||||
|
node_state_V = paddle.matmul( |
||||||
|
reshaped_reduction, paddle.transpose( |
||||||
|
reshaped_B, perm=[0, 2, 1])) |
||||||
|
|
||||||
|
if self.normalize: |
||||||
|
node_state_V = node_state_V * (1. / reshaped_reduction.shape[2]) |
||||||
|
|
||||||
|
# B, C, N |
||||||
|
gcn_out = self.gcn(node_state_V) |
||||||
|
# B, C, H*W |
||||||
|
Y = paddle.matmul(gcn_out, reproject) |
||||||
|
# B, C, H, W |
||||||
|
Y = paddle.reshape(Y, shape=[n, self.num_state, h, w]) |
||||||
|
Y_extend = self.extend_dim(Y) |
||||||
|
Y_extend = self.extend_bn(Y_extend) |
||||||
|
|
||||||
|
out = input + Y_extend |
||||||
|
return out |
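# Editor's note: a minimal, illustrative sketch of the GruModule shape flow
# (the sizes below are made up and this block is not part of the original file).
if __name__ == '__main__':
    import paddle

    gru = GruModule(num_input=512, num_state=128, num_node=64)
    feat = paddle.rand([2, 512, 32, 32])  # [N, C, H, W]
    out = gru(feat)  # project onto 64 graph nodes, reason with the GCN, project back
    print(out.shape)  # [2, 512, 32, 32], same as the input thanks to the residual add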
@ -0,0 +1,285 @@ |
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. |
||||||
|
# |
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License"); |
||||||
|
# you may not use this file except in compliance with the License. |
||||||
|
# You may obtain a copy of the License at |
||||||
|
# |
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0 |
||||||
|
# |
||||||
|
# Unless required by applicable law or agreed to in writing, software |
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS, |
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||||
|
# See the License for the specific language governing permissions and |
||||||
|
# limitations under the License. |
||||||
|
|
||||||
|
import paddle |
||||||
|
import paddle.nn as nn |
||||||
|
import paddle.nn.functional as F |
||||||
|
from paddle import ParamAttr |
||||||
|
from paddle.nn.initializer import Constant |
||||||
|
from paddlers.models.ppseg.models import layers |
||||||
|
from paddlers.models.ppseg.models.layers import tensor_fusion_helper as helper |
||||||
|
|
||||||
|
|
||||||
|
class UAFM(nn.Layer): |
||||||
|
""" |
||||||
|
The base of Unified Attention Fusion Module. |
||||||
|
Args: |
||||||
|
x_ch (int): The channel of x tensor, which is the low level feature. |
||||||
|
y_ch (int): The channel of y tensor, which is the high level feature. |
||||||
|
out_ch (int): The channel of output tensor. |
||||||
|
ksize (int, optional): The kernel size of the conv for x tensor. Default: 3. |
||||||
|
resize_mode (str, optional): The resize mode for upsampling the y tensor. Default: bilinear. |
||||||
|
""" |
||||||
|
|
||||||
|
def __init__(self, x_ch, y_ch, out_ch, ksize=3, resize_mode='bilinear'): |
||||||
|
super().__init__() |
||||||
|
|
||||||
|
self.conv_x = layers.ConvBNReLU( |
||||||
|
x_ch, y_ch, kernel_size=ksize, padding=ksize // 2, bias_attr=False) |
||||||
|
self.conv_out = layers.ConvBNReLU( |
||||||
|
y_ch, out_ch, kernel_size=3, padding=1, bias_attr=False) |
||||||
|
self.resize_mode = resize_mode |
||||||
|
|
||||||
|
def check(self, x, y): |
||||||
|
assert x.ndim == 4 and y.ndim == 4 |
||||||
|
x_h, x_w = x.shape[2:] |
||||||
|
y_h, y_w = y.shape[2:] |
||||||
|
assert x_h >= y_h and x_w >= y_w |
||||||
|
|
||||||
|
def prepare(self, x, y): |
||||||
|
x = self.prepare_x(x, y) |
||||||
|
y = self.prepare_y(x, y) |
||||||
|
return x, y |
||||||
|
|
||||||
|
def prepare_x(self, x, y): |
||||||
|
x = self.conv_x(x) |
||||||
|
return x |
||||||
|
|
||||||
|
def prepare_y(self, x, y): |
||||||
|
y_up = F.interpolate(y, paddle.shape(x)[2:], mode=self.resize_mode) |
||||||
|
return y_up |
||||||
|
|
||||||
|
def fuse(self, x, y): |
||||||
|
out = x + y |
||||||
|
out = self.conv_out(out) |
||||||
|
return out |
||||||
|
|
||||||
|
def forward(self, x, y): |
||||||
|
""" |
||||||
|
Args: |
||||||
|
x (Tensor): The low level feature. |
||||||
|
y (Tensor): The high level feature. |
||||||
|
""" |
||||||
|
self.check(x, y) |
||||||
|
x, y = self.prepare(x, y) |
||||||
|
out = self.fuse(x, y) |
||||||
|
return out |
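# Editor's note: an illustrative sketch of the base UAFM contract (not part of
# the original file): x is the larger low-level map, y the smaller high-level
# map; y is upsampled to x's size, the two are added, and the sum is re-projected.
if __name__ == '__main__':
    import paddle

    fuse = UAFM(x_ch=64, y_ch=128, out_ch=128)
    x = paddle.rand([2, 64, 64, 64])   # low level feature, larger spatial size
    y = paddle.rand([2, 128, 32, 32])  # high level feature, smaller spatial size
    print(fuse(x, y).shape)            # [2, 128, 64, 64]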
||||||
|
|
||||||
|
|
||||||
|
class UAFM_ChAtten(UAFM): |
||||||
|
""" |
||||||
|
The UAFM with channel attention, which uses mean and max values. |
||||||
|
Args: |
||||||
|
x_ch (int): The channel of x tensor, which is the low level feature. |
||||||
|
y_ch (int): The channel of y tensor, which is the high level feature. |
||||||
|
out_ch (int): The channel of output tensor. |
||||||
|
ksize (int, optional): The kernel size of the conv for x tensor. Default: 3. |
||||||
|
resize_mode (str, optional): The resize mode for upsampling the y tensor. Default: bilinear. |
||||||
|
""" |
||||||
|
|
||||||
|
def __init__(self, x_ch, y_ch, out_ch, ksize=3, resize_mode='bilinear'): |
||||||
|
super().__init__(x_ch, y_ch, out_ch, ksize, resize_mode) |
||||||
|
|
||||||
|
self.conv_xy_atten = nn.Sequential( |
||||||
|
layers.ConvBNAct( |
||||||
|
4 * y_ch, |
||||||
|
y_ch // 2, |
||||||
|
kernel_size=1, |
||||||
|
bias_attr=False, |
||||||
|
act_type="leakyrelu"), |
||||||
|
layers.ConvBN( |
||||||
|
y_ch // 2, y_ch, kernel_size=1, bias_attr=False)) |
||||||
|
|
||||||
|
def fuse(self, x, y): |
||||||
|
""" |
||||||
|
Args: |
||||||
|
x (Tensor): The low level feature. |
||||||
|
y (Tensor): The high level feature. |
||||||
|
""" |
||||||
|
atten = helper.avg_max_reduce_hw([x, y], self.training) |
||||||
|
atten = F.sigmoid(self.conv_xy_atten(atten)) |
||||||
|
|
||||||
|
out = x * atten + y * (1 - atten) |
||||||
|
out = self.conv_out(out) |
||||||
|
return out |
||||||
|
|
||||||
|
|
||||||
|
class UAFM_ChAtten_S(UAFM): |
||||||
|
""" |
||||||
|
The UAFM with channel attention, which uses mean values. |
||||||
|
Args: |
||||||
|
x_ch (int): The channel of x tensor, which is the low level feature. |
||||||
|
y_ch (int): The channel of y tensor, which is the high level feature. |
||||||
|
out_ch (int): The channel of output tensor. |
||||||
|
ksize (int, optional): The kernel size of the conv for x tensor. Default: 3. |
||||||
|
resize_mode (str, optional): The resize mode for upsampling the y tensor. Default: bilinear. |
||||||
|
""" |
||||||
|
|
||||||
|
def __init__(self, x_ch, y_ch, out_ch, ksize=3, resize_mode='bilinear'): |
||||||
|
super().__init__(x_ch, y_ch, out_ch, ksize, resize_mode) |
||||||
|
|
||||||
|
self.conv_xy_atten = nn.Sequential( |
||||||
|
layers.ConvBNAct( |
||||||
|
2 * y_ch, |
||||||
|
y_ch // 2, |
||||||
|
kernel_size=1, |
||||||
|
bias_attr=False, |
||||||
|
act_type="leakyrelu"), |
||||||
|
layers.ConvBN( |
||||||
|
y_ch // 2, y_ch, kernel_size=1, bias_attr=False)) |
||||||
|
|
||||||
|
def fuse(self, x, y): |
||||||
|
""" |
||||||
|
Args: |
||||||
|
x (Tensor): The low level feature. |
||||||
|
y (Tensor): The high level feature. |
||||||
|
""" |
||||||
|
atten = helper.avg_reduce_hw([x, y]) |
||||||
|
atten = F.sigmoid(self.conv_xy_atten(atten)) |
||||||
|
|
||||||
|
out = x * atten + y * (1 - atten) |
||||||
|
out = self.conv_out(out) |
||||||
|
return out |
||||||
|
|
||||||
|
|
||||||
|
class UAFM_SpAtten(UAFM): |
||||||
|
""" |
||||||
|
The UAFM with spatial attention, which uses mean and max values. |
||||||
|
Args: |
||||||
|
x_ch (int): The channel of x tensor, which is the low level feature. |
||||||
|
y_ch (int): The channel of y tensor, which is the high level feature. |
||||||
|
out_ch (int): The channel of output tensor. |
||||||
|
ksize (int, optional): The kernel size of the conv for x tensor. Default: 3. |
||||||
|
resize_mode (str, optional): The resize mode for upsampling the y tensor. Default: bilinear. |
||||||
|
""" |
||||||
|
|
||||||
|
def __init__(self, x_ch, y_ch, out_ch, ksize=3, resize_mode='bilinear'): |
||||||
|
super().__init__(x_ch, y_ch, out_ch, ksize, resize_mode) |
||||||
|
|
||||||
|
self.conv_xy_atten = nn.Sequential( |
||||||
|
layers.ConvBNReLU( |
||||||
|
4, 2, kernel_size=3, padding=1, bias_attr=False), |
||||||
|
layers.ConvBN( |
||||||
|
2, 1, kernel_size=3, padding=1, bias_attr=False)) |
||||||
|
self._scale = self.create_parameter( |
||||||
|
shape=[1], |
||||||
|
attr=ParamAttr(initializer=Constant(value=1.)), |
||||||
|
dtype="float32") |
||||||
|
self._scale.stop_gradient = True |
||||||
|
|
||||||
|
def fuse(self, x, y): |
||||||
|
""" |
||||||
|
Args: |
||||||
|
x (Tensor): The low level feature. |
||||||
|
y (Tensor): The high level feature. |
||||||
|
""" |
||||||
|
atten = helper.avg_max_reduce_channel([x, y]) |
||||||
|
atten = F.sigmoid(self.conv_xy_atten(atten)) |
||||||
|
|
||||||
|
out = x * atten + y * (self._scale - atten) |
||||||
|
out = self.conv_out(out) |
||||||
|
return out |
||||||
|
|
||||||
|
|
||||||
|
class UAFM_SpAtten_S(UAFM): |
||||||
|
""" |
||||||
|
The UAFM with spatial attention, which uses mean values. |
||||||
|
Args: |
||||||
|
x_ch (int): The channel of x tensor, which is the low level feature. |
||||||
|
y_ch (int): The channel of y tensor, which is the high level feature. |
||||||
|
out_ch (int): The channel of output tensor. |
||||||
|
ksize (int, optional): The kernel size of the conv for x tensor. Default: 3. |
||||||
|
resize_mode (str, optional): The resize mode for upsampling the y tensor. Default: bilinear. |
||||||
|
""" |
||||||
|
|
||||||
|
def __init__(self, x_ch, y_ch, out_ch, ksize=3, resize_mode='bilinear'): |
||||||
|
super().__init__(x_ch, y_ch, out_ch, ksize, resize_mode) |
||||||
|
|
||||||
|
self.conv_xy_atten = nn.Sequential( |
||||||
|
layers.ConvBNReLU( |
||||||
|
2, 2, kernel_size=3, padding=1, bias_attr=False), |
||||||
|
layers.ConvBN( |
||||||
|
2, 1, kernel_size=3, padding=1, bias_attr=False)) |
||||||
|
|
||||||
|
def fuse(self, x, y): |
||||||
|
""" |
||||||
|
Args: |
||||||
|
x (Tensor): The low level feature. |
||||||
|
y (Tensor): The high level feature. |
||||||
|
""" |
||||||
|
atten = helper.avg_reduce_channel([x, y]) |
||||||
|
atten = F.sigmoid(self.conv_xy_atten(atten)) |
||||||
|
|
||||||
|
out = x * atten + y * (1 - atten) |
||||||
|
out = self.conv_out(out) |
||||||
|
return out |
||||||
|
|
||||||
|
|
||||||
|
class UAFMMobile(UAFM): |
||||||
|
""" |
||||||
|
Unified Attention Fusion Module for mobile. |
||||||
|
Args: |
||||||
|
x_ch (int): The channel of x tensor, which is the low level feature. |
||||||
|
y_ch (int): The channel of y tensor, which is the high level feature. |
||||||
|
out_ch (int): The channel of output tensor. |
||||||
|
ksize (int, optional): The kernel size of the conv for x tensor. Default: 3. |
||||||
|
resize_mode (str, optional): The resize mode for upsampling the y tensor. Default: bilinear. |
||||||
|
""" |
||||||
|
|
||||||
|
def __init__(self, x_ch, y_ch, out_ch, ksize=3, resize_mode='bilinear'): |
||||||
|
super().__init__(x_ch, y_ch, out_ch, ksize, resize_mode) |
||||||
|
|
||||||
|
self.conv_x = layers.SeparableConvBNReLU( |
||||||
|
x_ch, y_ch, kernel_size=ksize, padding=ksize // 2, bias_attr=False) |
||||||
|
self.conv_out = layers.SeparableConvBNReLU( |
||||||
|
y_ch, out_ch, kernel_size=3, padding=1, bias_attr=False) |
||||||
|
|
||||||
|
|
||||||
|
class UAFMMobile_SpAtten(UAFM): |
||||||
|
""" |
||||||
|
Unified Attention Fusion Module with spatial attention for mobile. |
||||||
|
Args: |
||||||
|
x_ch (int): The channel of x tensor, which is the low level feature. |
||||||
|
y_ch (int): The channel of y tensor, which is the high level feature. |
||||||
|
out_ch (int): The channel of output tensor. |
||||||
|
ksize (int, optional): The kernel size of the conv for x tensor. Default: 3. |
||||||
|
resize_mode (str, optional): The resize mode for upsampling the y tensor. Default: bilinear. |
||||||
|
""" |
||||||
|
|
||||||
|
def __init__(self, x_ch, y_ch, out_ch, ksize=3, resize_mode='bilinear'): |
||||||
|
super().__init__(x_ch, y_ch, out_ch, ksize, resize_mode) |
||||||
|
|
||||||
|
self.conv_x = layers.SeparableConvBNReLU( |
||||||
|
x_ch, y_ch, kernel_size=ksize, padding=ksize // 2, bias_attr=False) |
||||||
|
self.conv_out = layers.SeparableConvBNReLU( |
||||||
|
y_ch, out_ch, kernel_size=3, padding=1, bias_attr=False) |
||||||
|
|
||||||
|
self.conv_xy_atten = nn.Sequential( |
||||||
|
layers.ConvBNReLU( |
||||||
|
4, 2, kernel_size=3, padding=1, bias_attr=False), |
||||||
|
layers.ConvBN( |
||||||
|
2, 1, kernel_size=3, padding=1, bias_attr=False)) |
||||||
|
|
||||||
|
def fuse(self, x, y): |
||||||
|
""" |
||||||
|
Args: |
||||||
|
x (Tensor): The low level feature. |
||||||
|
y (Tensor): The high level feature. |
||||||
|
""" |
||||||
|
atten = helper.avg_max_reduce_channel([x, y]) |
||||||
|
atten = F.sigmoid(self.conv_xy_atten(atten)) |
||||||
|
|
||||||
|
out = x * atten + y * (1 - atten) |
||||||
|
out = self.conv_out(out) |
||||||
|
return out |
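# Editor's note: an illustrative comparison of the UAFM variants above (not part
# of the original file). The ChAtten variants pool over H and W and gate with a
# per-channel tensor of shape [N, y_ch, 1, 1]; the SpAtten variants pool over
# channels and gate with a per-pixel tensor of shape [N, 1, H, W]; the Mobile
# variants swap the plain convolutions for depthwise-separable ones.
if __name__ == '__main__':
    import paddle

    x = paddle.rand([2, 32, 64, 64])  # low level feature
    y = paddle.rand([2, 64, 32, 32])  # high level feature
    for cls in (UAFM_ChAtten, UAFM_ChAtten_S, UAFM_SpAtten, UAFM_SpAtten_S,
                UAFMMobile, UAFMMobile_SpAtten):
        m = cls(x_ch=32, y_ch=64, out_ch=64)
        m.eval()
        print(cls.__name__, m(x, y).shape)  # every variant yields [2, 64, 64, 64]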
@ -0,0 +1,133 @@ |
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. |
||||||
|
# |
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License"); |
||||||
|
# you may not use this file except in compliance with the License. |
||||||
|
# You may obtain a copy of the License at |
||||||
|
# |
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0 |
||||||
|
# |
||||||
|
# Unless required by applicable law or agreed to in writing, software |
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS, |
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||||
|
# See the License for the specific language governing permissions and |
||||||
|
# limitations under the License. |
||||||
|
|
||||||
|
import paddle |
||||||
|
import paddle.nn as nn |
||||||
|
import paddle.nn.functional as F |
||||||
|
|
||||||
|
|
||||||
|
def avg_reduce_hw(x): |
||||||
|
# Reduce hw by avg |
||||||
|
# Return cat([avg_pool_0, avg_pool_1, ...]) |
||||||
|
if not isinstance(x, (list, tuple)): |
||||||
|
return F.adaptive_avg_pool2d(x, 1) |
||||||
|
elif len(x) == 1: |
||||||
|
return F.adaptive_avg_pool2d(x[0], 1) |
||||||
|
else: |
||||||
|
res = [] |
||||||
|
for xi in x: |
||||||
|
res.append(F.adaptive_avg_pool2d(xi, 1)) |
||||||
|
return paddle.concat(res, axis=1) |
||||||
|
|
||||||
|
|
||||||
|
def avg_max_reduce_hw_helper(x, is_training, use_concat=True): |
||||||
|
assert not isinstance(x, (list, tuple)) |
||||||
|
avg_pool = F.adaptive_avg_pool2d(x, 1) |
||||||
|
# TODO(pjc): when axis=[2, 3], the paddle.max API has a bug during training. |
||||||
|
if is_training: |
||||||
|
max_pool = F.adaptive_max_pool2d(x, 1) |
||||||
|
else: |
||||||
|
max_pool = paddle.max(x, axis=[2, 3], keepdim=True) |
||||||
|
|
||||||
|
if use_concat: |
||||||
|
res = paddle.concat([avg_pool, max_pool], axis=1) |
||||||
|
else: |
||||||
|
res = [avg_pool, max_pool] |
||||||
|
return res |
||||||
|
|
||||||
|
|
||||||
|
def avg_max_reduce_hw(x, is_training): |
||||||
|
# Reduce hw by avg and max |
||||||
|
# Return cat([avg_pool_0, avg_pool_1, ..., max_pool_0, max_pool_1, ...]) |
||||||
|
if not isinstance(x, (list, tuple)): |
||||||
|
return avg_max_reduce_hw_helper(x, is_training) |
||||||
|
elif len(x) == 1: |
||||||
|
return avg_max_reduce_hw_helper(x[0], is_training) |
||||||
|
else: |
||||||
|
res_avg = [] |
||||||
|
res_max = [] |
||||||
|
for xi in x: |
||||||
|
avg, max = avg_max_reduce_hw_helper(xi, is_training, False) |
||||||
|
res_avg.append(avg) |
||||||
|
res_max.append(max) |
||||||
|
res = res_avg + res_max |
||||||
|
return paddle.concat(res, axis=1) |
||||||
|
|
||||||
|
|
||||||
|
def avg_reduce_channel(x): |
||||||
|
# Reduce channel by avg |
||||||
|
# Return cat([avg_ch_0, avg_ch_1, ...]) |
||||||
|
if not isinstance(x, (list, tuple)): |
||||||
|
return paddle.mean(x, axis=1, keepdim=True) |
||||||
|
elif len(x) == 1: |
||||||
|
return paddle.mean(x[0], axis=1, keepdim=True) |
||||||
|
else: |
||||||
|
res = [] |
||||||
|
for xi in x: |
||||||
|
res.append(paddle.mean(xi, axis=1, keepdim=True)) |
||||||
|
return paddle.concat(res, axis=1) |
||||||
|
|
||||||
|
|
||||||
|
def max_reduce_channel(x): |
||||||
|
# Reduce channel by max |
||||||
|
# Return cat([max_ch_0, max_ch_1, ...]) |
||||||
|
if not isinstance(x, (list, tuple)): |
||||||
|
return paddle.max(x, axis=1, keepdim=True) |
||||||
|
elif len(x) == 1: |
||||||
|
return paddle.max(x[0], axis=1, keepdim=True) |
||||||
|
else: |
||||||
|
res = [] |
||||||
|
for xi in x: |
||||||
|
res.append(paddle.max(xi, axis=1, keepdim=True)) |
||||||
|
return paddle.concat(res, axis=1) |
||||||
|
|
||||||
|
|
||||||
|
def avg_max_reduce_channel_helper(x, use_concat=True): |
||||||
|
# Reduce channel by avg and max, only supports a single input |
||||||
|
assert not isinstance(x, (list, tuple)) |
||||||
|
mean_value = paddle.mean(x, axis=1, keepdim=True) |
||||||
|
max_value = paddle.max(x, axis=1, keepdim=True) |
||||||
|
|
||||||
|
if use_concat: |
||||||
|
res = paddle.concat([mean_value, max_value], axis=1) |
||||||
|
else: |
||||||
|
res = [mean_value, max_value] |
||||||
|
return res |
||||||
|
|
||||||
|
|
||||||
|
def avg_max_reduce_channel(x): |
||||||
|
# Reduce channel by avg and max |
||||||
|
# Return cat([avg_ch_0, max_ch_0, avg_ch_1, max_ch_1, ...]) |
||||||
|
if not isinstance(x, (list, tuple)): |
||||||
|
return avg_max_reduce_channel_helper(x) |
||||||
|
elif len(x) == 1: |
||||||
|
return avg_max_reduce_channel_helper(x[0]) |
||||||
|
else: |
||||||
|
res = [] |
||||||
|
for xi in x: |
||||||
|
res.extend(avg_max_reduce_channel_helper(xi, False)) |
||||||
|
return paddle.concat(res, axis=1) |
||||||
|
|
||||||
|
|
||||||
|
def cat_avg_max_reduce_channel(x): |
||||||
|
# Reduce channel by cat+avg+max |
||||||
|
assert isinstance(x, (list, tuple)) and len(x) > 1 |
||||||
|
|
||||||
|
x = paddle.concat(x, axis=1) |
||||||
|
|
||||||
|
mean_value = paddle.mean(x, axis=1, keepdim=True) |
||||||
|
max_value = paddle.max(x, axis=1, keepdim=True) |
||||||
|
res = paddle.concat([mean_value, max_value], axis=1) |
||||||
|
|
||||||
|
return res |
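# Editor's note: an illustrative shape reference for the reduce helpers above
# (not part of the original file; the tensor sizes are made up).
if __name__ == '__main__':
    a = paddle.rand([2, 8, 16, 16])
    b = paddle.rand([2, 4, 16, 16])
    print(avg_reduce_hw([a, b]).shape)               # [2, 12, 1, 1]
    print(avg_max_reduce_hw([a, b], True).shape)     # [2, 24, 1, 1], avg pools then max pools
    print(avg_reduce_channel([a, b]).shape)          # [2, 2, 16, 16], one mean map per input
    print(avg_max_reduce_channel([a, b]).shape)      # [2, 4, 16, 16], mean and max map per input
    print(cat_avg_max_reduce_channel([a, b]).shape)  # [2, 2, 16, 16], concat first, then mean and max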
@ -0,0 +1,162 @@ |
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. |
||||||
|
# |
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License"); |
||||||
|
# you may not use this file except in compliance with the License. |
||||||
|
# You may obtain a copy of the License at |
||||||
|
# |
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0 |
||||||
|
# |
||||||
|
# Unless required by applicable law or agreed to in writing, software |
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS, |
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||||
|
# See the License for the specific language governing permissions and |
||||||
|
# limitations under the License. |
||||||
|
|
||||||
|
from functools import partial |
||||||
|
|
||||||
|
import paddle |
||||||
|
import paddle.nn as nn |
||||||
|
import paddle.nn.functional as F |
||||||
|
|
||||||
|
from paddlers.models.ppseg import utils |
||||||
|
from paddlers.models.ppseg.models import layers |
||||||
|
from paddlers.models.ppseg.cvlibs import manager |
||||||
|
|
||||||
|
|
||||||
|
@manager.MODELS.add_component |
||||||
|
class LRASPP(nn.Layer): |
||||||
|
""" |
||||||
|
Semantic segmentation model with a light R-ASPP head. |
||||||
|
|
||||||
|
The original article refers to |
||||||
|
Howard, Andrew, et al. "Searching for mobilenetv3." |
||||||
|
(https://arxiv.org/pdf/1909.11065.pdf) |
||||||
|
|
||||||
|
Args: |
||||||
|
num_classes (int): The number of target classes. |
||||||
|
backbone(nn.Layer): Backbone network, such as stdc1net and resnet18. The backbone must |
||||||
|
have feat_channels, of which the length is 5. |
||||||
|
backbone_indices (List(int), optional): The values indicate the indices of backbone output |
||||||
|
used as the input of the LR-ASPP head. |
||||||
|
Default: [0, 1, 3]. |
||||||
|
lraspp_head_inter_chs (List(int), optional): The intermediate channels of LR-ASPP head. |
||||||
|
Default: [32, 64]. |
||||||
|
lraspp_head_out_ch (int, optional): The output channels of each ASPP branch in the LR-ASPP head. |
||||||
|
Default: 128 |
||||||
|
resize_mode (str, optional): The resize mode for the upsampling operation in the LR-ASPP head. |
||||||
|
Default: bilinear. |
||||||
|
use_gap (bool, optional): If true, use global average pooling in the LR-ASPP head; otherwise, use |
||||||
|
a 49x49 kernel for average pooling. |
||||||
|
Default: True. |
||||||
|
pretrained (str, optional): The path or url of pretrained model. Default: None. |
||||||
|
""" |
||||||
|
|
||||||
|
def __init__(self, |
||||||
|
num_classes, |
||||||
|
backbone, |
||||||
|
backbone_indices=[0, 1, 3], |
||||||
|
lraspp_head_inter_chs=[32, 64], |
||||||
|
lraspp_head_out_ch=128, |
||||||
|
resize_mode='bilinear', |
||||||
|
use_gap=True, |
||||||
|
pretrained=None): |
||||||
|
super().__init__() |
||||||
|
|
||||||
|
# backbone |
||||||
|
assert hasattr(backbone, 'feat_channels'), \ |
||||||
|
"The backbone should has feat_channels." |
||||||
|
assert len(backbone.feat_channels) >= len(backbone_indices), \ |
||||||
|
f"The length of input backbone_indices ({len(backbone_indices)}) should not be" \ |
||||||
|
f"greater than the length of feat_channels ({len(backbone.feat_channels)})." |
||||||
|
assert len(backbone.feat_channels) > max(backbone_indices), \ |
||||||
|
f"The max value ({max(backbone_indices)}) of backbone_indices should be " \ |
||||||
|
f"less than the length of feat_channels ({len(backbone.feat_channels)})." |
||||||
|
self.backbone = backbone |
||||||
|
|
||||||
|
assert len(backbone_indices) >= 1, "The length of backbone_indices " \ |
||||||
|
"should not be lesser than 1" |
||||||
|
|
||||||
|
# head |
||||||
|
assert len(backbone_indices) == len( |
||||||
|
lraspp_head_inter_chs |
||||||
|
) + 1, "The length of backbone_indices should be 1 greater than the length of lraspp_head_inter_chs." |
||||||
|
self.backbone_indices = backbone_indices |
||||||
|
|
||||||
|
self.lraspp_head = LRASPPHead(backbone_indices, backbone.feat_channels, |
||||||
|
lraspp_head_inter_chs, lraspp_head_out_ch, |
||||||
|
num_classes, resize_mode, use_gap) |
||||||
|
|
||||||
|
# pretrained |
||||||
|
self.pretrained = pretrained |
||||||
|
self.init_weight() |
||||||
|
|
||||||
|
def forward(self, x): |
||||||
|
x_hw = paddle.shape(x)[2:] |
||||||
|
|
||||||
|
feats_backbone = self.backbone(x) |
||||||
|
assert len(feats_backbone) >= len(self.backbone_indices), \ |
||||||
|
f"The nums of backbone feats ({len(feats_backbone)}) should be greater or " \ |
||||||
|
f"equal than the nums of backbone_indices ({len(self.backbone_indices)})" |
||||||
|
|
||||||
|
y = self.lraspp_head(feats_backbone) |
||||||
|
y = F.interpolate(y, x_hw, mode='bilinear', align_corners=False) |
||||||
|
logit_list = [y] |
||||||
|
|
||||||
|
return logit_list |
||||||
|
|
||||||
|
def init_weight(self): |
||||||
|
if self.pretrained is not None: |
||||||
|
utils.load_entire_model(self, self.pretrained) |
||||||
|
|
||||||
|
|
||||||
|
class LRASPPHead(nn.Layer): |
||||||
|
def __init__(self, |
||||||
|
indices, |
||||||
|
in_chs, |
||||||
|
mid_chs, |
||||||
|
out_ch, |
||||||
|
n_classes, |
||||||
|
resize_mode, |
||||||
|
use_gap, |
||||||
|
align_corners=False): |
||||||
|
super().__init__() |
||||||
|
|
||||||
|
self.indices = indices[-2::-1] |
||||||
|
self.in_chs = [in_chs[i] for i in indices[::-1]] |
||||||
|
self.mid_chs = mid_chs[::-1] |
||||||
|
self.convs = nn.LayerList() |
||||||
|
self.conv_ups = nn.LayerList() |
||||||
|
for in_ch, mid_ch in zip(self.in_chs[1:], self.mid_chs): |
||||||
|
self.convs.append( |
||||||
|
nn.Conv2D( |
||||||
|
in_ch, mid_ch, kernel_size=1, bias_attr=False)) |
||||||
|
self.conv_ups.append(layers.ConvBNReLU(out_ch + mid_ch, out_ch, 1)) |
||||||
|
self.conv_w = nn.Sequential( |
||||||
|
nn.AvgPool2D( |
||||||
|
kernel_size=(49, 49), stride=(16, 20)) |
||||||
|
if not use_gap else nn.AdaptiveAvgPool2D(1), |
||||||
|
nn.Conv2D( |
||||||
|
self.in_chs[0], out_ch, 1, bias_attr=False), |
||||||
|
nn.Sigmoid()) |
||||||
|
self.conv_v = layers.ConvBNReLU(self.in_chs[0], out_ch, 1) |
||||||
|
self.conv_t = nn.Conv2D(out_ch, out_ch, kernel_size=1, bias_attr=False) |
||||||
|
self.conv_out = nn.Conv2D( |
||||||
|
out_ch, n_classes, kernel_size=1, bias_attr=False) |
||||||
|
|
||||||
|
self.interp = partial( |
||||||
|
F.interpolate, mode=resize_mode, align_corners=align_corners) |
||||||
|
|
||||||
|
def forward(self, in_feat_list): |
||||||
|
x = in_feat_list[-1] |
||||||
|
|
||||||
|
x = self.conv_v(x) * self.interp(self.conv_w(x), paddle.shape(x)[2:]) |
||||||
|
y = self.conv_t(x) |
||||||
|
|
||||||
|
for idx, conv, conv_up in zip(self.indices, self.convs, self.conv_ups): |
||||||
|
feat = in_feat_list[idx] |
||||||
|
y = self.interp(y, paddle.shape(feat)[2:]) |
||||||
|
y = paddle.concat([y, conv(feat)], axis=1) |
||||||
|
y = conv_up(y) |
||||||
|
|
||||||
|
y = self.conv_out(y) |
||||||
|
return y |
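# Editor's note: an illustrative sketch of what LRASPPHead consumes (not part of
# the original file). The channel counts mimic a hypothetical backbone with
# feat_channels = [16, 24, 32, 64, 96]; only the entries selected by `indices`
# are used, and the last feature in the list must be the deepest one.
if __name__ == '__main__':
    head = LRASPPHead(
        indices=[0, 1, 3],
        in_chs=[16, 24, 32, 64, 96],
        mid_chs=[32, 64],
        out_ch=128,
        n_classes=19,
        resize_mode='bilinear',
        use_gap=True)
    head.eval()
    feats = [
        paddle.rand([1, 16, 64, 64]),  # stride 4
        paddle.rand([1, 24, 32, 32]),  # stride 8
        paddle.rand([1, 32, 16, 16]),  # stride 16, unused by indices [0, 1, 3]
        paddle.rand([1, 64, 8, 8]),    # stride 32, consumed first by the head
    ]
    print(head(feats).shape)           # [1, 19, 64, 64], i.e. back at stride 4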
@ -0,0 +1,289 @@ |
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. |
||||||
|
# |
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License"); |
||||||
|
# you may not use this file except in compliance with the License. |
||||||
|
# You may obtain a copy of the License at |
||||||
|
# |
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0 |
||||||
|
# |
||||||
|
# Unless required by applicable law or agreed to in writing, software |
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS, |
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||||
|
# See the License for the specific language governing permissions and |
||||||
|
# limitations under the License. |
||||||
|
|
||||||
|
import paddle |
||||||
|
import paddle.nn as nn |
||||||
|
import paddle.nn.functional as F |
||||||
|
|
||||||
|
from paddlers.models.ppseg import utils |
||||||
|
from paddlers.models.ppseg.models import layers |
||||||
|
from paddlers.models.ppseg.cvlibs import manager |
||||||
|
|
||||||
|
|
||||||
|
@manager.MODELS.add_component |
||||||
|
class MobileSeg(nn.Layer): |
||||||
|
""" |
||||||
|
The semantic segmentation models for mobile devices. |
||||||
|
|
||||||
|
Args: |
||||||
|
num_classes (int): The number of target classes. |
||||||
|
backbone(nn.Layer): Backbone network, such as stdc1net and resnet18. The backbone must |
||||||
|
have feat_channels, of which the length is 5. |
||||||
|
backbone_indices (List(int), optional): The values indicate the indices of output of backbone. |
||||||
|
Default: [1, 2, 3]. |
||||||
|
cm_bin_sizes (List(int), optional): The bin size of context module. Default: [1, 2]. |
||||||
|
cm_out_ch (int, optional): The output channel of the last context module. Default: 64. |
||||||
|
arm_type (str, optional): The type of attention refinement module. Default: UAFMMobile. |
||||||
|
arm_out_chs (List(int), optional): The out channels of each arm module. Default: [32, 48, 64]. |
||||||
|
seg_head_inter_chs (List(int), optional): The intermediate channels of segmentation head. |
||||||
|
Default: [32, 32, 32]. |
||||||
|
resize_mode (str, optional): The resize mode for the upsampling operation in decoder. |
||||||
|
Default: bilinear. |
||||||
|
use_last_fuse (bool, optional): Whether to fuse the multi-scale outputs at the end. Default: False. |
||||||
|
pretrained (str, optional): The path or url of pretrained model. Default: None. |
||||||
|
""" |
||||||
|
|
||||||
|
def __init__(self, |
||||||
|
num_classes, |
||||||
|
backbone, |
||||||
|
backbone_indices=[1, 2, 3], |
||||||
|
cm_bin_sizes=[1, 2], |
||||||
|
cm_out_ch=64, |
||||||
|
arm_type='UAFMMobile', |
||||||
|
arm_out_chs=[32, 48, 64], |
||||||
|
seg_head_inter_chs=[32, 32, 32], |
||||||
|
resize_mode='bilinear', |
||||||
|
use_last_fuse=False, |
||||||
|
pretrained=None): |
||||||
|
super().__init__() |
||||||
|
|
||||||
|
# backbone |
||||||
|
assert hasattr(backbone, 'feat_channels'), \ |
||||||
|
"The backbone should has feat_channels." |
||||||
|
assert len(backbone.feat_channels) >= len(backbone_indices), \ |
||||||
|
f"The length of input backbone_indices ({len(backbone_indices)}) should not be" \ |
||||||
|
f"greater than the length of feat_channels ({len(backbone.feat_channels)})." |
||||||
|
assert len(backbone.feat_channels) > max(backbone_indices), \ |
||||||
|
f"The max value ({max(backbone_indices)}) of backbone_indices should be " \ |
||||||
|
f"less than the length of feat_channels ({len(backbone.feat_channels)})." |
||||||
|
self.backbone = backbone |
||||||
|
|
||||||
|
assert len(backbone_indices) >= 1, "The length of backbone_indices " \ |
||||||
|
"should not be lesser than 1" |
||||||
|
self.backbone_indices = backbone_indices # [..., x16_id, x32_id] |
||||||
|
backbone_out_chs = [backbone.feat_channels[i] for i in backbone_indices] |
||||||
|
|
||||||
|
# head |
||||||
|
if len(arm_out_chs) == 1: |
||||||
|
arm_out_chs = arm_out_chs * len(backbone_indices) |
||||||
|
assert len(arm_out_chs) == len(backbone_indices), "The length of " \ |
||||||
|
"arm_out_chs and backbone_indices should be equal" |
||||||
|
|
||||||
|
self.ppseg_head = MobileSegHead(backbone_out_chs, arm_out_chs, |
||||||
|
cm_bin_sizes, cm_out_ch, arm_type, |
||||||
|
resize_mode, use_last_fuse) |
||||||
|
|
||||||
|
if len(seg_head_inter_chs) == 1: |
||||||
|
seg_head_inter_chs = seg_head_inter_chs * len(backbone_indices) |
||||||
|
assert len(seg_head_inter_chs) == len(backbone_indices), "The length of " \ |
||||||
|
"seg_head_inter_chs and backbone_indices should be equal" |
||||||
|
self.seg_heads = nn.LayerList() # [..., head_16, head32] |
||||||
|
for in_ch, mid_ch in zip(arm_out_chs, seg_head_inter_chs): |
||||||
|
self.seg_heads.append(SegHead(in_ch, mid_ch, num_classes)) |
||||||
|
|
||||||
|
# pretrained |
||||||
|
self.pretrained = pretrained |
||||||
|
self.init_weight() |
||||||
|
|
||||||
|
def forward(self, x): |
||||||
|
x_hw = paddle.shape(x)[2:] |
||||||
|
|
||||||
|
feats_backbone = self.backbone(x) # [x4, x8, x16, x32] |
||||||
|
assert len(feats_backbone) >= len(self.backbone_indices), \ |
||||||
|
f"The nums of backbone feats ({len(feats_backbone)}) should be greater or " \ |
||||||
|
f"equal than the nums of backbone_indices ({len(self.backbone_indices)})" |
||||||
|
|
||||||
|
feats_selected = [feats_backbone[i] for i in self.backbone_indices] |
||||||
|
feats_head = self.ppseg_head(feats_selected) # [..., x8, x16, x32] |
||||||
|
|
||||||
|
if self.training: |
||||||
|
logit_list = [] |
||||||
|
for x, seg_head in zip(feats_head, self.seg_heads): |
||||||
|
x = seg_head(x) |
||||||
|
logit_list.append(x) |
||||||
|
logit_list = [ |
||||||
|
F.interpolate( |
||||||
|
x, x_hw, mode='bilinear', align_corners=False) |
||||||
|
for x in logit_list |
||||||
|
] |
||||||
|
else: |
||||||
|
x = self.seg_heads[0](feats_head[0]) |
||||||
|
x = F.interpolate(x, x_hw, mode='bilinear', align_corners=False) |
||||||
|
logit_list = [x] |
||||||
|
|
||||||
|
return logit_list |
||||||
|
|
||||||
|
def init_weight(self): |
||||||
|
if self.pretrained is not None: |
||||||
|
utils.load_entire_model(self, self.pretrained) |
||||||
|
|
||||||
|
|
||||||
|
class MobileSegHead(nn.Layer): |
||||||
|
""" |
||||||
|
The head of MobileSeg. |
||||||
|
|
||||||
|
Args: |
||||||
|
backbone_out_chs (List(int)): The channels of the output tensors in the backbone. |
||||||
|
arm_out_chs (List(int)): The out channels of each arm module. |
||||||
|
cm_bin_sizes (List(int)): The bin size of context module. |
||||||
|
cm_out_ch (int): The output channel of the last context module. |
||||||
|
arm_type (str): The type of attention refinement module. |
||||||
|
resize_mode (str): The resize mode for the upsampling operation in decoder. |
||||||
|
""" |
||||||
|
|
||||||
|
def __init__(self, backbone_out_chs, arm_out_chs, cm_bin_sizes, cm_out_ch, |
||||||
|
arm_type, resize_mode, use_last_fuse): |
||||||
|
super().__init__() |
||||||
|
|
||||||
|
self.cm = MobileContextModule(backbone_out_chs[-1], cm_out_ch, |
||||||
|
cm_out_ch, cm_bin_sizes) |
||||||
|
|
||||||
|
assert hasattr(layers, arm_type), \ |
||||||
|
"Not support arm_type ({})".format(arm_type) |
||||||
|
arm_class = getattr(layers, arm_type) |
||||||
|
|
||||||
|
self.arm_list = nn.LayerList() # [..., arm8, arm16, arm32] |
||||||
|
for i in range(len(backbone_out_chs)): |
||||||
|
low_chs = backbone_out_chs[i] |
||||||
|
high_ch = cm_out_ch if i == len( |
||||||
|
backbone_out_chs) - 1 else arm_out_chs[i + 1] |
||||||
|
out_ch = arm_out_chs[i] |
||||||
|
arm = arm_class( |
||||||
|
low_chs, high_ch, out_ch, ksize=3, resize_mode=resize_mode) |
||||||
|
self.arm_list.append(arm) |
||||||
|
|
||||||
|
self.use_last_fuse = use_last_fuse |
||||||
|
if self.use_last_fuse: |
||||||
|
self.fuse_convs = nn.LayerList() |
||||||
|
for i in range(1, len(arm_out_chs)): |
||||||
|
conv = layers.SeparableConvBNReLU( |
||||||
|
arm_out_chs[i], |
||||||
|
arm_out_chs[0], |
||||||
|
kernel_size=3, |
||||||
|
bias_attr=False) |
||||||
|
self.fuse_convs.append(conv) |
||||||
|
self.last_conv = layers.SeparableConvBNReLU( |
||||||
|
len(arm_out_chs) * arm_out_chs[0], |
||||||
|
arm_out_chs[0], |
||||||
|
kernel_size=3, |
||||||
|
bias_attr=False) |
||||||
|
|
||||||
|
def forward(self, in_feat_list): |
||||||
|
""" |
||||||
|
Args: |
||||||
|
in_feat_list (List(Tensor)): Such as [x2, x4, x8, x16, x32]. |
||||||
|
x2, x4 and x8 are optional. |
||||||
|
Returns: |
||||||
|
out_feat_list (List(Tensor)): Such as [x2, x4, x8, x16, x32]. |
||||||
|
x2, x4 and x8 are optional. |
||||||
|
The lengths of in_feat_list and out_feat_list are the same. |
||||||
|
""" |
||||||
|
|
||||||
|
high_feat = self.cm(in_feat_list[-1]) |
||||||
|
out_feat_list = [] |
||||||
|
|
||||||
|
for i in reversed(range(len(in_feat_list))): |
||||||
|
low_feat = in_feat_list[i] |
||||||
|
arm = self.arm_list[i] |
||||||
|
high_feat = arm(low_feat, high_feat) |
||||||
|
out_feat_list.insert(0, high_feat) |
||||||
|
|
||||||
|
if self.use_last_fuse: |
||||||
|
x_list = [out_feat_list[0]] |
||||||
|
size = paddle.shape(out_feat_list[0])[2:] |
||||||
|
for i, (x, conv |
||||||
|
) in enumerate(zip(out_feat_list[1:], self.fuse_convs)): |
||||||
|
x = conv(x) |
||||||
|
x = F.interpolate( |
||||||
|
x, size=size, mode='bilinear', align_corners=False) |
||||||
|
x_list.append(x) |
||||||
|
x = paddle.concat(x_list, axis=1) |
||||||
|
x = self.last_conv(x) |
||||||
|
out_feat_list[0] = x |
||||||
|
|
||||||
|
return out_feat_list |
||||||
|
|
||||||
|
|
||||||
|
class MobileContextModule(nn.Layer): |
||||||
|
""" |
||||||
|
Context Module for Mobile Model. |
||||||
|
|
||||||
|
Args: |
||||||
|
in_channels (int): The number of input channels to pyramid pooling module. |
||||||
|
inter_channels (int): The number of inter channels to pyramid pooling module. |
||||||
|
out_channels (int): The number of output channels after pyramid pooling module. |
||||||
|
bin_sizes (tuple): The output sizes of the pooled feature maps. |
||||||
|
align_corners (bool): An argument of F.interpolate. It should be set to False |
||||||
|
when the output size of feature is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. |
||||||
|
""" |
||||||
|
|
||||||
|
def __init__(self, |
||||||
|
in_channels, |
||||||
|
inter_channels, |
||||||
|
out_channels, |
||||||
|
bin_sizes, |
||||||
|
align_corners=False): |
||||||
|
super().__init__() |
||||||
|
|
||||||
|
self.stages = nn.LayerList([ |
||||||
|
self._make_stage(in_channels, inter_channels, size) |
||||||
|
for size in bin_sizes |
||||||
|
]) |
||||||
|
|
||||||
|
self.conv_out = layers.SeparableConvBNReLU( |
||||||
|
in_channels=inter_channels, |
||||||
|
out_channels=out_channels, |
||||||
|
kernel_size=3, |
||||||
|
bias_attr=False) |
||||||
|
|
||||||
|
self.align_corners = align_corners |
||||||
|
|
||||||
|
def _make_stage(self, in_channels, out_channels, size): |
||||||
|
prior = nn.AdaptiveAvgPool2D(output_size=size) |
||||||
|
conv = layers.ConvBNReLU( |
||||||
|
in_channels=in_channels, out_channels=out_channels, kernel_size=1) |
||||||
|
return nn.Sequential(prior, conv) |
||||||
|
|
||||||
|
def forward(self, input): |
||||||
|
out = None |
||||||
|
input_shape = paddle.shape(input)[2:] |
||||||
|
|
||||||
|
for stage in self.stages: |
||||||
|
x = stage(input) |
||||||
|
x = F.interpolate( |
||||||
|
x, |
||||||
|
input_shape, |
||||||
|
mode='bilinear', |
||||||
|
align_corners=self.align_corners) |
||||||
|
if out is None: |
||||||
|
out = x |
||||||
|
else: |
||||||
|
out += x |
||||||
|
|
||||||
|
out = self.conv_out(out) |
||||||
|
return out |
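# Editor's note: an illustrative sketch of the context module above (not part of
# the original file). Each bin pools the input to a fixed grid, a 1x1 conv remaps
# the channels, and the upsampled results are summed before a final separable
# conv; the PPContextModule used by PPLiteSeg follows the same recipe.
if __name__ == '__main__':
    import paddle

    cm = MobileContextModule(
        in_channels=96, inter_channels=64, out_channels=64, bin_sizes=[1, 2])
    cm.eval()
    feat = paddle.rand([2, 96, 16, 16])
    print(cm(feat).shape)  # [2, 64, 16, 16]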
||||||
|
|
||||||
|
|
||||||
|
class SegHead(nn.Layer): |
||||||
|
def __init__(self, in_chan, mid_chan, n_classes): |
||||||
|
super().__init__() |
||||||
|
self.conv = layers.SeparableConvBNReLU( |
||||||
|
in_chan, mid_chan, kernel_size=3, bias_attr=False) |
||||||
|
self.conv_out = nn.Conv2D( |
||||||
|
mid_chan, n_classes, kernel_size=1, bias_attr=False) |
||||||
|
|
||||||
|
def forward(self, x): |
||||||
|
x = self.conv(x) |
||||||
|
x = self.conv_out(x) |
||||||
|
return x |
@ -0,0 +1,273 @@ |
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. |
||||||
|
# |
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License"); |
||||||
|
# you may not use this file except in compliance with the License. |
||||||
|
# You may obtain a copy of the License at |
||||||
|
# |
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0 |
||||||
|
# |
||||||
|
# Unless required by applicable law or agreed to in writing, software |
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS, |
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||||
|
# See the License for the specific language governing permissions and |
||||||
|
# limitations under the License. |
||||||
|
|
||||||
|
import paddle |
||||||
|
import paddle.nn as nn |
||||||
|
import paddle.nn.functional as F |
||||||
|
|
||||||
|
from paddlers.models.ppseg import utils |
||||||
|
from paddlers.models.ppseg.models import layers |
||||||
|
from paddlers.models.ppseg.cvlibs import manager |
||||||
|
from paddlers.models.ppseg.utils import utils |
||||||
|
|
||||||
|
|
||||||
|
@manager.MODELS.add_component |
||||||
|
class PPLiteSeg(nn.Layer): |
||||||
|
""" |
||||||
|
The PP_LiteSeg implementation based on PaddlePaddle. |
||||||
|
|
||||||
|
The original article refers to "Juncai Peng, Yi Liu, Shiyu Tang, Yuying Hao, Lutao Chu, |
||||||
|
Guowei Chen, Zewu Wu, Zeyu Chen, Zhiliang Yu, Yuning Du, Qingqing Dang,Baohua Lai, |
||||||
|
Qiwen Liu, Xiaoguang Hu, Dianhai Yu, Yanjun Ma. PP-LiteSeg: A Superior Real-Time Semantic |
||||||
|
Segmentation Model. https://arxiv.org/abs/2204.02681". |
||||||
|
|
||||||
|
Args: |
||||||
|
num_classes (int): The number of target classes. |
||||||
|
backbone(nn.Layer): Backbone network, such as stdc1net and resnet18. The backbone must |
||||||
|
have feat_channels, of which the length is 5. |
||||||
|
backbone_indices (List(int), optional): The values indicate the indices of output of backbone. |
||||||
|
Default: [2, 3, 4]. |
||||||
|
arm_type (str, optional): The type of attention refinement module. Default: UAFM_SpAtten. |
||||||
|
cm_bin_sizes (List(int), optional): The bin size of context module. Default: [1,2,4]. |
||||||
|
cm_out_ch (int, optional): The output channel of the last context module. Default: 128. |
||||||
|
arm_out_chs (List(int), optional): The out channels of each arm module. Default: [64, 96, 128]. |
||||||
|
seg_head_inter_chs (List(int), optional): The intermediate channels of segmentation head. |
||||||
|
Default: [64, 64, 64]. |
||||||
|
resize_mode (str, optional): The resize mode for the upsampling operation in decoder. |
||||||
|
Default: bilinear. |
||||||
|
pretrained (str, optional): The path or url of pretrained model. Default: None. |
||||||
|
|
||||||
|
""" |
||||||
|
|
||||||
|
def __init__(self, |
||||||
|
num_classes, |
||||||
|
backbone, |
||||||
|
backbone_indices=[2, 3, 4], |
||||||
|
arm_type='UAFM_SpAtten', |
||||||
|
cm_bin_sizes=[1, 2, 4], |
||||||
|
cm_out_ch=128, |
||||||
|
arm_out_chs=[64, 96, 128], |
||||||
|
seg_head_inter_chs=[64, 64, 64], |
||||||
|
resize_mode='bilinear', |
||||||
|
pretrained=None): |
||||||
|
super().__init__() |
||||||
|
|
||||||
|
# backbone |
||||||
|
assert hasattr(backbone, 'feat_channels'), \ |
||||||
|
"The backbone should has feat_channels." |
||||||
|
assert len(backbone.feat_channels) >= len(backbone_indices), \ |
||||||
|
f"The length of input backbone_indices ({len(backbone_indices)}) should not be" \ |
||||||
|
f"greater than the length of feat_channels ({len(backbone.feat_channels)})." |
||||||
|
assert len(backbone.feat_channels) > max(backbone_indices), \ |
||||||
|
f"The max value ({max(backbone_indices)}) of backbone_indices should be " \ |
||||||
|
f"less than the length of feat_channels ({len(backbone.feat_channels)})." |
||||||
|
self.backbone = backbone |
||||||
|
|
||||||
|
assert len(backbone_indices) > 1, "The length of backbone_indices " \ |
||||||
|
"should be greater than 1" |
||||||
|
self.backbone_indices = backbone_indices # [..., x16_id, x32_id] |
||||||
|
backbone_out_chs = [backbone.feat_channels[i] for i in backbone_indices] |
||||||
|
|
||||||
|
# head |
||||||
|
if len(arm_out_chs) == 1: |
||||||
|
arm_out_chs = arm_out_chs * len(backbone_indices) |
||||||
|
assert len(arm_out_chs) == len(backbone_indices), "The length of " \ |
||||||
|
"arm_out_chs and backbone_indices should be equal" |
||||||
|
|
||||||
|
self.ppseg_head = PPLiteSegHead(backbone_out_chs, arm_out_chs, |
||||||
|
cm_bin_sizes, cm_out_ch, arm_type, |
||||||
|
resize_mode) |
||||||
|
|
||||||
|
if len(seg_head_inter_chs) == 1: |
||||||
|
seg_head_inter_chs = seg_head_inter_chs * len(backbone_indices) |
||||||
|
assert len(seg_head_inter_chs) == len(backbone_indices), "The length of " \ |
||||||
|
"seg_head_inter_chs and backbone_indices should be equal" |
||||||
|
self.seg_heads = nn.LayerList() # [..., head_16, head32] |
||||||
|
for in_ch, mid_ch in zip(arm_out_chs, seg_head_inter_chs): |
||||||
|
self.seg_heads.append(SegHead(in_ch, mid_ch, num_classes)) |
||||||
|
|
||||||
|
# pretrained |
||||||
|
self.pretrained = pretrained |
||||||
|
self.init_weight() |
||||||
|
|
||||||
|
def forward(self, x): |
||||||
|
x_hw = paddle.shape(x)[2:] |
||||||
|
|
||||||
|
feats_backbone = self.backbone(x) # [x2, x4, x8, x16, x32] |
||||||
|
assert len(feats_backbone) >= len(self.backbone_indices), \ |
||||||
|
f"The nums of backbone feats ({len(feats_backbone)}) should be greater or " \ |
||||||
|
f"equal than the nums of backbone_indices ({len(self.backbone_indices)})" |
||||||
|
|
||||||
|
feats_selected = [feats_backbone[i] for i in self.backbone_indices] |
||||||
|
|
||||||
|
feats_head = self.ppseg_head(feats_selected) # [..., x8, x16, x32] |
||||||
|
|
||||||
|
if self.training: |
||||||
|
logit_list = [] |
||||||
|
|
||||||
|
for x, seg_head in zip(feats_head, self.seg_heads): |
||||||
|
x = seg_head(x) |
||||||
|
logit_list.append(x) |
||||||
|
|
||||||
|
logit_list = [ |
||||||
|
F.interpolate( |
||||||
|
x, x_hw, mode='bilinear', align_corners=False) |
||||||
|
for x in logit_list |
||||||
|
] |
||||||
|
else: |
||||||
|
x = self.seg_heads[0](feats_head[0]) |
||||||
|
x = F.interpolate(x, x_hw, mode='bilinear', align_corners=False) |
||||||
|
logit_list = [x] |
||||||
|
|
||||||
|
return logit_list |
||||||
|
|
||||||
|
def init_weight(self): |
||||||
|
if self.pretrained is not None: |
||||||
|
utils.load_entire_model(self, self.pretrained) |
||||||
|
|
||||||
|
|
||||||
|
class PPLiteSegHead(nn.Layer): |
||||||
|
""" |
||||||
|
The head of PPLiteSeg. |
||||||
|
|
||||||
|
Args: |
||||||
|
backbone_out_chs (List(int)): The channels of the output tensors in the backbone. |
||||||
|
arm_out_chs (List(int)): The out channels of each arm module. |
||||||
|
cm_bin_sizes (List(int)): The bin size of context module. |
||||||
|
cm_out_ch (int): The output channel of the last context module. |
||||||
|
arm_type (str): The type of attention refinement module. |
||||||
|
resize_mode (str): The resize mode for the upsampling operation in decoder. |
||||||
|
""" |
||||||
|
|
||||||
|
def __init__(self, backbone_out_chs, arm_out_chs, cm_bin_sizes, cm_out_ch, |
||||||
|
arm_type, resize_mode): |
||||||
|
super().__init__() |
||||||
|
|
||||||
|
self.cm = PPContextModule(backbone_out_chs[-1], cm_out_ch, cm_out_ch, |
||||||
|
cm_bin_sizes) |
||||||
|
|
||||||
|
assert hasattr(layers, arm_type), \ |
||||||
|
"Not support arm_type ({})".format(arm_type) |
||||||
|
arm_class = getattr(layers, arm_type) |
||||||
|
|
||||||
|
self.arm_list = nn.LayerList() # [..., arm8, arm16, arm32] |
||||||
|
for i in range(len(backbone_out_chs)): |
||||||
|
low_chs = backbone_out_chs[i] |
||||||
|
high_ch = cm_out_ch if i == len( |
||||||
|
backbone_out_chs) - 1 else arm_out_chs[i + 1] |
||||||
|
out_ch = arm_out_chs[i] |
||||||
|
arm = arm_class( |
||||||
|
low_chs, high_ch, out_ch, ksize=3, resize_mode=resize_mode) |
||||||
|
self.arm_list.append(arm) |
||||||
|
|
||||||
|
def forward(self, in_feat_list): |
||||||
|
""" |
||||||
|
Args: |
||||||
|
in_feat_list (List(Tensor)): Such as [x2, x4, x8, x16, x32]. |
||||||
|
x2, x4 and x8 are optional. |
||||||
|
Returns: |
||||||
|
out_feat_list (List(Tensor)): Such as [x2, x4, x8, x16, x32]. |
||||||
|
x2, x4 and x8 are optional. |
||||||
|
The lengths of in_feat_list and out_feat_list are the same. |
||||||
|
""" |
||||||
|
|
||||||
|
high_feat = self.cm(in_feat_list[-1]) |
||||||
|
out_feat_list = [] |
||||||
|
|
||||||
|
for i in reversed(range(len(in_feat_list))): |
||||||
|
low_feat = in_feat_list[i] |
||||||
|
arm = self.arm_list[i] |
||||||
|
high_feat = arm(low_feat, high_feat) |
||||||
|
out_feat_list.insert(0, high_feat) |
||||||
|
|
||||||
|
return out_feat_list |
||||||
|
|
||||||
|
|
||||||
|
class PPContextModule(nn.Layer): |
||||||
|
""" |
||||||
|
Simple Context module. |
||||||
|
|
||||||
|
Args: |
||||||
|
in_channels (int): The number of input channels to pyramid pooling module. |
||||||
|
inter_channels (int): The number of inter channels to pyramid pooling module. |
||||||
|
out_channels (int): The number of output channels after pyramid pooling module. |
||||||
|
bin_sizes (tuple): The output sizes of the pooled feature maps. |
||||||
|
align_corners (bool): An argument of F.interpolate. It should be set to False |
||||||
|
when the output size of feature is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. |
||||||
|
""" |
||||||
|
|
||||||
|
def __init__(self, |
||||||
|
in_channels, |
||||||
|
inter_channels, |
||||||
|
out_channels, |
||||||
|
bin_sizes, |
||||||
|
align_corners=False): |
||||||
|
super().__init__() |
||||||
|
|
||||||
|
self.stages = nn.LayerList([ |
||||||
|
self._make_stage(in_channels, inter_channels, size) |
||||||
|
for size in bin_sizes |
||||||
|
]) |
||||||
|
|
||||||
|
self.conv_out = layers.ConvBNReLU( |
||||||
|
in_channels=inter_channels, |
||||||
|
out_channels=out_channels, |
||||||
|
kernel_size=3, |
||||||
|
padding=1) |
||||||
|
|
||||||
|
self.align_corners = align_corners |
||||||
|
|
||||||
|
def _make_stage(self, in_channels, out_channels, size): |
||||||
|
prior = nn.AdaptiveAvgPool2D(output_size=size) |
||||||
|
conv = layers.ConvBNReLU( |
||||||
|
in_channels=in_channels, out_channels=out_channels, kernel_size=1) |
||||||
|
return nn.Sequential(prior, conv) |
||||||
|
|
||||||
|
def forward(self, input): |
||||||
|
out = None |
||||||
|
input_shape = paddle.shape(input)[2:] |
||||||
|
|
||||||
|
for stage in self.stages: |
||||||
|
x = stage(input) |
||||||
|
x = F.interpolate( |
||||||
|
x, |
||||||
|
input_shape, |
||||||
|
mode='bilinear', |
||||||
|
align_corners=self.align_corners) |
||||||
|
if out is None: |
||||||
|
out = x |
||||||
|
else: |
||||||
|
out += x |
||||||
|
|
||||||
|
out = self.conv_out(out) |
||||||
|
return out |
||||||
|
|
||||||
|
|
||||||
|
class SegHead(nn.Layer): |
||||||
|
def __init__(self, in_chan, mid_chan, n_classes): |
||||||
|
super().__init__() |
||||||
|
self.conv = layers.ConvBNReLU( |
||||||
|
in_chan, |
||||||
|
mid_chan, |
||||||
|
kernel_size=3, |
||||||
|
stride=1, |
||||||
|
padding=1, |
||||||
|
bias_attr=False) |
||||||
|
self.conv_out = nn.Conv2D( |
||||||
|
mid_chan, n_classes, kernel_size=1, bias_attr=False) |
||||||
|
|
||||||
|
def forward(self, x): |
||||||
|
x = self.conv(x) |
||||||
|
x = self.conv_out(x) |
||||||
|
return x |
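# Editor's note: an illustrative end-to-end sketch (not part of the original
# file). _ToyBackbone is a made-up stand-in for the STDC-style backbones that
# PP-LiteSeg normally uses; it only needs a feat_channels attribute and a
# forward that returns the per-stage feature maps.
if __name__ == '__main__':
    import paddle
    import paddle.nn as nn

    class _ToyBackbone(nn.Layer):
        feat_channels = [16, 32, 64, 128, 256]

        def __init__(self):
            super().__init__()
            chs = [3] + self.feat_channels
            self.stages = nn.LayerList([
                nn.Conv2D(chs[i], chs[i + 1], 3, stride=2, padding=1)
                for i in range(5)
            ])

        def forward(self, x):
            feats = []
            for stage in self.stages:
                x = stage(x)
                feats.append(x)
            return feats  # [x2, x4, x8, x16, x32]

    model = PPLiteSeg(num_classes=19, backbone=_ToyBackbone())
    model.eval()
    logits = model(paddle.rand([1, 3, 512, 512]))
    print(logits[0].shape)  # [1, 19, 512, 512]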
@ -0,0 +1,449 @@ |
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. |
||||||
|
# |
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License"); |
||||||
|
# you may not use this file except in compliance with the License. |
||||||
|
# You may obtain a copy of the License at |
||||||
|
# |
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0 |
||||||
|
# |
||||||
|
# Unless required by applicable law or agreed to in writing, software |
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS, |
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||||
|
# See the License for the specific language governing permissions and |
||||||
|
# limitations under the License. |
||||||
|
|
||||||
|
# Refer to the original implementation: https://github.com/clovaai/c3_sinet/blob/master/models/SINet.py |
||||||
|
|
||||||
|
import paddle |
||||||
|
import paddle.nn as nn |
||||||
|
import paddle.nn.functional as F |
||||||
|
|
||||||
|
from paddlers.models.ppseg.models import layers |
||||||
|
from paddlers.models.ppseg.cvlibs import manager |
||||||
|
from paddlers.models.ppseg.utils import utils |
||||||
|
|
||||||
|
CFG = [[[3, 1], [5, 1]], [[3, 1], [3, 1]], [[3, 1], [5, 1]], [[3, 1], [3, 1]], |
||||||
|
[[5, 1], [3, 2]], [[5, 2], [3, 4]], [[3, 1], [3, 1]], [[5, 1], [5, 1]], |
||||||
|
[[3, 2], [3, 4]], [[3, 1], [5, 2]]] |
||||||
|
|
||||||
|
|
||||||
|
@manager.MODELS.add_component |
||||||
|
class SINet(nn.Layer): |
||||||
|
""" |
||||||
|
The SINet implementation based on PaddlePaddle. |
||||||
|
|
||||||
|
The original article refers to |
||||||
|
Hyojin Park, Lars Lowe Sjösund, YoungJoon Yoo, Nicolas Monet, Jihwan Bang, Nojun Kwak |
||||||
|
"SINet: Extreme Lightweight Portrait Segmentation Networks with Spatial Squeeze Modules |
||||||
|
and Information Blocking Decoder", (https://arxiv.org/abs/1911.09099). |
||||||
|
|
||||||
|
Args: |
||||||
|
num_classes (int): The number of target classes. |
||||||
|
config (List, optional): The config for SINet. Default: CFG. |
||||||
|
stage2_blocks (int, optional): The num of blocks in stage2. Default: 2. |
||||||
|
stage3_blocks (int, optional): The num of blocks in stage3. Default: 8. |
||||||
|
in_channels (int, optional): The channels of input image. Default: 3. |
||||||
|
pretrained (str, optional): The path or url of pretrained model. Default: None. |
||||||
|
""" |
||||||
|
|
||||||
|
def __init__(self, |
||||||
|
num_classes=2, |
||||||
|
config=CFG, |
||||||
|
stage2_blocks=2, |
||||||
|
stage3_blocks=8, |
||||||
|
in_channels=3, |
||||||
|
pretrained=None): |
||||||
|
super().__init__() |
||||||
|
dim1 = 16 |
||||||
|
dim2 = 48 |
||||||
|
dim3 = 96 |
||||||
|
|
||||||
|
self.encoder = SINetEncoder(config, in_channels, num_classes, |
||||||
|
stage2_blocks, stage3_blocks) |
||||||
|
|
||||||
|
self.up = nn.UpsamplingBilinear2D(scale_factor=2) |
||||||
|
self.bn_3 = nn.BatchNorm(num_classes) |
||||||
|
|
||||||
|
self.level2_C = CBR(dim2, num_classes, 1, 1) |
||||||
|
self.bn_2 = nn.BatchNorm(num_classes) |
||||||
|
|
||||||
|
self.classifier = nn.Sequential( |
||||||
|
nn.UpsamplingBilinear2D(scale_factor=2), |
||||||
|
nn.Conv2D( |
||||||
|
num_classes, num_classes, 3, 1, 1, bias_attr=False)) |
||||||
|
|
||||||
|
self.pretrained = pretrained |
||||||
|
self.init_weight() |
||||||
|
|
||||||
|
def init_weight(self): |
||||||
|
if self.pretrained is not None: |
||||||
|
utils.load_entire_model(self, self.pretrained) |
||||||
|
|
||||||
|
def forward(self, input): |
||||||
|
output1 = self.encoder.level1(input) # x2 |
||||||
|
|
||||||
|
output2_0 = self.encoder.level2_0(output1) # x4 |
||||||
|
for i, layer in enumerate(self.encoder.level2): |
||||||
|
if i == 0: |
||||||
|
output2 = layer(output2_0) |
||||||
|
else: |
||||||
|
output2 = layer(output2) |
||||||
|
output2_cat = self.encoder.BR2(paddle.concat([output2_0, output2], 1)) |
||||||
|
|
||||||
|
output3_0 = self.encoder.level3_0(output2_cat) # x8 |
||||||
|
for i, layer in enumerate(self.encoder.level3): |
||||||
|
if i == 0: |
||||||
|
output3 = layer(output3_0) |
||||||
|
else: |
||||||
|
output3 = layer(output3) |
||||||
|
output3_cat = self.encoder.BR3(paddle.concat([output3_0, output3], 1)) |
||||||
|
enc_final = self.encoder.classifier(output3_cat) # x8 |
||||||
|
|
||||||
|
dec_stage1 = self.bn_3(self.up(enc_final)) # x4 |
||||||
|
stage1_confidence = paddle.max(F.softmax(dec_stage1), axis=1) |
||||||
|
stage1_gate = (1 - stage1_confidence).unsqueeze(1) |
||||||
|
|
||||||
|
dec_stage2_0 = self.level2_C(output2) # x4 |
||||||
|
dec_stage2 = self.bn_2( |
||||||
|
self.up(dec_stage2_0 * stage1_gate + dec_stage1)) # x2 |
||||||
|
|
||||||
|
out = self.classifier(dec_stage2) # x |
||||||
|
|
||||||
|
return [out] |
||||||
|
|
||||||
|
|
||||||
|
def channel_shuffle(x, groups): |
||||||
|
x_shape = paddle.shape(x) |
||||||
|
batch_size, height, width = x_shape[0], x_shape[2], x_shape[3] |
||||||
|
num_channels = x.shape[1] |
||||||
|
channels_per_group = num_channels // groups |
||||||
|
|
||||||
|
# reshape |
||||||
|
x = paddle.reshape( |
||||||
|
x=x, shape=[batch_size, groups, channels_per_group, height, width]) |
||||||
|
|
||||||
|
# transpose |
||||||
|
x = paddle.transpose(x=x, perm=[0, 2, 1, 3, 4]) |
||||||
|
|
||||||
|
# flatten |
||||||
|
x = paddle.reshape(x=x, shape=[batch_size, num_channels, height, width]) |
||||||
|
|
||||||
|
return x |
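# Editor's note: a tiny worked example of channel_shuffle (not part of the
# original file): with 6 channels and groups=2, the channel order
# [0, 1, 2, 3, 4, 5] becomes [0, 3, 1, 4, 2, 5].
if __name__ == '__main__':
    x = paddle.arange(6, dtype='float32').reshape([1, 6, 1, 1])
    y = channel_shuffle(x, groups=2)
    print(y.flatten().tolist())  # [0.0, 3.0, 1.0, 4.0, 2.0, 5.0]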
||||||
|
|
||||||
|
|
||||||
|
class CBR(nn.Layer): |
||||||
|
''' |
||||||
|
This class defines the convolution layer with batch normalization and PReLU activation |
||||||
|
''' |
||||||
|
|
||||||
|
def __init__(self, nIn, nOut, kSize, stride=1): |
||||||
|
super().__init__() |
||||||
|
padding = int((kSize - 1) / 2) |
||||||
|
|
||||||
|
self.conv = nn.Conv2D( |
||||||
|
nIn, |
||||||
|
nOut, (kSize, kSize), |
||||||
|
stride=stride, |
||||||
|
padding=(padding, padding), |
||||||
|
bias_attr=False) |
||||||
|
self.bn = nn.BatchNorm(nOut) |
||||||
|
self.act = nn.PReLU(nOut) |
||||||
|
|
||||||
|
def forward(self, input): |
||||||
|
output = self.conv(input) |
||||||
|
output = self.bn(output) |
||||||
|
output = self.act(output) |
||||||
|
return output |
||||||
|
|
||||||
|
|
||||||
|
class SeparableCBR(nn.Layer): |
||||||
|
''' |
||||||
|
This class defines the depthwise separable convolution layer with batch normalization and PReLU activation |
||||||
|
''' |
||||||
|
|
||||||
|
def __init__(self, nIn, nOut, kSize, stride=1): |
||||||
|
super().__init__() |
||||||
|
padding = int((kSize - 1) / 2) |
||||||
|
|
||||||
|
self.conv = nn.Sequential( |
||||||
|
nn.Conv2D( |
||||||
|
nIn, |
||||||
|
nIn, (kSize, kSize), |
||||||
|
stride=stride, |
||||||
|
padding=(padding, padding), |
||||||
|
groups=nIn, |
||||||
|
bias_attr=False), |
||||||
|
nn.Conv2D( |
||||||
|
nIn, nOut, kernel_size=1, stride=1, bias_attr=False), ) |
||||||
|
self.bn = nn.BatchNorm(nOut) |
||||||
|
self.act = nn.PReLU(nOut) |
||||||
|
|
||||||
|
def forward(self, input): |
||||||
|
output = self.conv(input) |
||||||
|
output = self.bn(output) |
||||||
|
output = self.act(output) |
||||||
|
return output |
||||||
|
|
||||||
|
|
||||||
|
class SqueezeBlock(nn.Layer): |
||||||
|
def __init__(self, exp_size, divide=4.0): |
||||||
|
super(SqueezeBlock, self).__init__() |
||||||
|
|
||||||
|
if divide > 1: |
||||||
|
self.dense = nn.Sequential( |
||||||
|
nn.Linear(exp_size, int(exp_size / divide)), |
||||||
|
nn.PReLU(int(exp_size / divide)), |
||||||
|
nn.Linear(int(exp_size / divide), exp_size), |
||||||
|
nn.PReLU(exp_size), ) |
||||||
|
else: |
||||||
|
self.dense = nn.Sequential( |
||||||
|
nn.Linear(exp_size, exp_size), nn.PReLU(exp_size)) |
||||||
|
|
||||||
|
def forward(self, x): |
||||||
|
alpha = F.adaptive_avg_pool2d(x, [1, 1]) |
||||||
|
alpha = paddle.squeeze(alpha, axis=[2, 3]) |
||||||
|
alpha = self.dense(alpha) |
||||||
|
alpha = paddle.unsqueeze(alpha, axis=[2, 3]) |
||||||
|
out = x * alpha |
||||||
|
return out |
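SqueezeBlock is a squeeze-and-excitation gate: global average pooling squeezes each channel to a scalar, a small bottleneck MLP produces per-channel weights, and the input is rescaled channel-wise. A minimal usage sketch with illustrative sizes:

# Hedged sketch: the output keeps the input shape, only the per-channel scaling changes.
se = SqueezeBlock(exp_size=16, divide=4.0)   # bottleneck MLP: 16 -> 4 -> 16
x = paddle.rand([1, 16, 8, 8])
y = se(x)
print(y.shape)  # [1, 16, 8, 8]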
||||||
|
|
||||||
|
|
||||||
|
class SESeparableCBR(nn.Layer): |
||||||
|
''' |
||||||
|
This class defines the depthwise separable convolution layer with squeeze-and-excitation, batch normalization and PReLU activation |
||||||
|
''' |
||||||
|
|
||||||
|
def __init__(self, nIn, nOut, kSize, stride=1, divide=2.0): |
||||||
|
super().__init__() |
||||||
|
padding = int((kSize - 1) / 2) |
||||||
|
|
||||||
|
self.conv = nn.Sequential( |
||||||
|
nn.Conv2D( |
||||||
|
nIn, |
||||||
|
nIn, (kSize, kSize), |
||||||
|
stride=stride, |
||||||
|
padding=(padding, padding), |
||||||
|
groups=nIn, |
||||||
|
bias_attr=False), |
||||||
|
SqueezeBlock( |
||||||
|
nIn, divide=divide), |
||||||
|
nn.Conv2D( |
||||||
|
nIn, nOut, kernel_size=1, stride=1, bias_attr=False), ) |
||||||
|
|
||||||
|
self.bn = nn.BatchNorm(nOut) |
||||||
|
self.act = nn.PReLU(nOut) |
||||||
|
|
||||||
|
def forward(self, input): |
||||||
|
output = self.conv(input) |
||||||
|
output = self.bn(output) |
||||||
|
output = self.act(output) |
||||||
|
return output |
||||||
|
|
||||||
|
|
||||||
|
class BR(nn.Layer): |
||||||
|
''' |
||||||
|
This class groups the batch normalization and PReLU activation |
||||||
|
''' |
||||||
|
|
||||||
|
def __init__(self, nOut): |
||||||
|
super().__init__() |
||||||
|
self.bn = nn.BatchNorm(nOut) |
||||||
|
self.act = nn.PReLU(nOut) |
||||||
|
|
||||||
|
def forward(self, input): |
||||||
|
output = self.bn(input) |
||||||
|
output = self.act(output) |
||||||
|
return output |
||||||
|
|
||||||
|
|
||||||
|
class CB(nn.Layer): |
||||||
|
''' |
||||||
|
This class groups the convolution and batch normalization |
||||||
|
''' |
||||||
|
|
||||||
|
def __init__(self, nIn, nOut, kSize, stride=1): |
||||||
|
super().__init__() |
||||||
|
padding = int((kSize - 1) / 2) |
||||||
|
self.conv = nn.Conv2D( |
||||||
|
nIn, |
||||||
|
nOut, (kSize, kSize), |
||||||
|
stride=stride, |
||||||
|
padding=(padding, padding), |
||||||
|
bias_attr=False) |
||||||
|
self.bn = nn.BatchNorm(nOut) |
||||||
|
|
||||||
|
def forward(self, input): |
||||||
|
output = self.conv(input) |
||||||
|
output = self.bn(output) |
||||||
|
return output |
||||||
|
|
||||||
|
|
||||||
|
class C(nn.Layer): |
||||||
|
''' |
||||||
|
This class is for a convolutional layer. |
||||||
|
''' |
||||||
|
|
||||||
|
def __init__(self, nIn, nOut, kSize, stride=1, group=1): |
||||||
|
super().__init__() |
||||||
|
padding = int((kSize - 1) / 2) |
||||||
|
self.conv = nn.Conv2D( |
||||||
|
nIn, |
||||||
|
nOut, (kSize, kSize), |
||||||
|
stride=stride, |
||||||
|
padding=(padding, padding), |
||||||
|
bias_attr=False, |
||||||
|
groups=group) |
||||||
|
|
||||||
|
def forward(self, input): |
||||||
|
output = self.conv(input) |
||||||
|
return output |
||||||
|
|
||||||
|
|
||||||
|
class S2block(nn.Layer): |
||||||
|
''' |
||||||
|
This class defines a depthwise convolution applied at an optionally reduced resolution (average pooling, then bilinear upsampling), followed by a pointwise convolution. |
||||||
|
''' |
||||||
|
|
||||||
|
def __init__(self, nIn, nOut, kSize, avgsize): |
||||||
|
super().__init__() |
||||||
|
|
||||||
|
self.resolution_down = False |
||||||
|
if avgsize > 1: |
||||||
|
self.resolution_down = True |
||||||
|
self.down_res = nn.AvgPool2D(avgsize, avgsize) |
||||||
|
self.up_res = nn.UpsamplingBilinear2D(scale_factor=avgsize) |
||||||
|
self.avgsize = avgsize |
||||||
|
|
||||||
|
padding = int((kSize - 1) / 2) |
||||||
|
self.conv = nn.Sequential( |
||||||
|
nn.Conv2D( |
||||||
|
nIn, |
||||||
|
nIn, |
||||||
|
kernel_size=(kSize, kSize), |
||||||
|
stride=1, |
||||||
|
padding=(padding, padding), |
||||||
|
groups=nIn, |
||||||
|
bias_attr=False), |
||||||
|
nn.BatchNorm(nIn)) |
||||||
|
|
||||||
|
self.act_conv1x1 = nn.Sequential( |
||||||
|
nn.PReLU(nIn), |
||||||
|
nn.Conv2D( |
||||||
|
nIn, nOut, kernel_size=1, stride=1, bias_attr=False), ) |
||||||
|
|
||||||
|
self.bn = nn.BatchNorm(nOut) |
||||||
|
|
||||||
|
def forward(self, input): |
||||||
|
if self.resolution_down: |
||||||
|
input = self.down_res(input) |
||||||
|
output = self.conv(input) |
||||||
|
|
||||||
|
output = self.act_conv1x1(output) |
||||||
|
if self.resolution_down: |
||||||
|
output = self.up_res(output) |
||||||
|
return self.bn(output) |
||||||
|
|
||||||
|
|
||||||
|
class S2module(nn.Layer): |
||||||
|
''' |
||||||
|
This class defines the ESP block, which is based on the following principle |
||||||
|
Reduce ---> Split ---> Transform --> Merge |
||||||
|
''' |
||||||
|
|
||||||
|
def __init__(self, nIn, nOut, add=True, config=[[3, 1], [5, 1]]): |
||||||
|
super().__init__() |
||||||
|
|
||||||
|
group_n = len(config) |
||||||
|
assert group_n == 2 |
||||||
|
n = int(nOut / group_n) |
||||||
|
n1 = nOut - group_n * n |
||||||
|
|
||||||
|
self.c1 = C(nIn, n, 1, 1, group=group_n) |
||||||
|
# self.c1 = C(nIn, n, 1, 1) |
||||||
|
|
||||||
|
for i in range(group_n): |
||||||
|
if i == 0: |
||||||
|
self.layer_0 = S2block( |
||||||
|
n, n + n1, kSize=config[i][0], avgsize=config[i][1]) |
||||||
|
else: |
||||||
|
self.layer_1 = S2block( |
||||||
|
n, n, kSize=config[i][0], avgsize=config[i][1]) |
||||||
|
|
||||||
|
self.BR = BR(nOut) |
||||||
|
self.add = add |
||||||
|
self.group_n = group_n |
||||||
|
|
||||||
|
def forward(self, input): |
||||||
|
output1 = self.c1(input) |
||||||
|
output1 = channel_shuffle(output1, self.group_n) |
||||||
|
res_0 = self.layer_0(output1) |
||||||
|
res_1 = self.layer_1(output1) |
||||||
|
combine = paddle.concat([res_0, res_1], 1) |
||||||
|
|
||||||
|
if self.add: |
||||||
|
combine = input + combine |
||||||
|
output = self.BR(combine) |
||||||
|
return output |
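Tying the forward pass above back to the Reduce -> Split -> Transform -> Merge principle: the grouped 1x1 conv (c1) reduces channels, the shuffled feature feeds two S2blocks with different kernel sizes and pooling rates (transform), and their outputs are concatenated and optionally added to the input (merge). A usage sketch with an illustrative configuration, not one taken from this diff:

# Hedged sketch; the [kSize, avgsize] pairs below are placeholders.
m = S2module(48, 48, add=True, config=[[3, 1], [5, 2]])
x = paddle.rand([1, 48, 56, 56])
y = m(x)
print(y.shape)  # [1, 48, 56, 56], same resolution and channel count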
||||||
|
|
||||||
|
|
||||||
|
class SINetEncoder(nn.Layer): |
||||||
|
def __init__(self, |
||||||
|
config, |
||||||
|
in_channels=3, |
||||||
|
num_classes=2, |
||||||
|
stage2_blocks=2, |
||||||
|
stage3_blocks=8): |
||||||
|
super().__init__() |
||||||
|
assert stage2_blocks == 2 |
||||||
|
dim1 = 16 |
||||||
|
dim2 = 48 |
||||||
|
dim3 = 96 |
||||||
|
|
||||||
|
self.level1 = CBR(in_channels, 12, 3, 2) |
||||||
|
|
||||||
|
self.level2_0 = SESeparableCBR(12, dim1, 3, 2, divide=1) |
||||||
|
|
||||||
|
self.level2 = nn.LayerList() |
||||||
|
for i in range(0, stage2_blocks): |
||||||
|
if i == 0: |
||||||
|
self.level2.append( |
||||||
|
S2module( |
||||||
|
dim1, dim2, config=config[i], add=False)) |
||||||
|
else: |
||||||
|
self.level2.append(S2module(dim2, dim2, config=config[i])) |
||||||
|
self.BR2 = BR(dim2 + dim1) |
||||||
|
|
||||||
|
self.level3_0 = SESeparableCBR(dim2 + dim1, dim2, 3, 2, divide=2) |
||||||
|
self.level3 = nn.LayerList() |
||||||
|
for i in range(0, stage3_blocks): |
||||||
|
if i == 0: |
||||||
|
self.level3.append( |
||||||
|
S2module( |
||||||
|
dim2, dim3, config=config[2 + i], add=False)) |
||||||
|
else: |
||||||
|
self.level3.append(S2module(dim3, dim3, config=config[2 + i])) |
||||||
|
self.BR3 = BR(dim3 + dim2) |
||||||
|
|
||||||
|
self.classifier = C(dim3 + dim2, num_classes, 1, 1) |
||||||
|
|
||||||
|
def forward(self, input): |
||||||
|
output1 = self.level1(input) # x2 |
||||||
|
|
||||||
|
output2_0 = self.level2_0(output1) # x4 |
||||||
|
for i, layer in enumerate(self.level2): |
||||||
|
if i == 0: |
||||||
|
output2 = layer(output2_0) |
||||||
|
else: |
||||||
|
output2 = layer(output2) |
||||||
|
|
||||||
|
output3_0 = self.level3_0( |
||||||
|
self.BR2(paddle.concat([output2_0, output2], 1))) # x8 |
||||||
|
for i, layer in enumerate(self.level3): |
||||||
|
if i == 0: |
||||||
|
output3 = layer(output3_0) |
||||||
|
else: |
||||||
|
output3 = layer(output3) |
||||||
|
|
||||||
|
output3_cat = self.BR3(paddle.concat([output3_0, output3], 1)) |
||||||
|
classifier = self.classifier(output3_cat) |
||||||
|
return classifier |
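A rough end-to-end shape check for the encoder, assuming a 10-entry configuration (2 S2modules for level 2 plus 8 for level 3); the real configuration is defined where SINet is instantiated and is not shown in this hunk:

# Hedged sketch: the config values are placeholders with the expected structure.
cfg = [[[3, 1], [5, 1]] for _ in range(10)]   # one [kSize, avgsize] pair per S2block branch
encoder = SINetEncoder(cfg, in_channels=3, num_classes=2)
x = paddle.rand([1, 3, 224, 224])
out = encoder(x)
print(out.shape)  # [1, 2, 28, 28], i.e. 1/8 of the input resolution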
@ -0,0 +1,155 @@ |
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. |
||||||
|
# |
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License"); |
||||||
|
# you may not use this file except in compliance with the License. |
||||||
|
# You may obtain a copy of the License at |
||||||
|
# |
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0 |
||||||
|
# |
||||||
|
# Unless required by applicable law or agreed to in writing, software |
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS, |
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||||
|
# See the License for the specific language governing permissions and |
||||||
|
# limitations under the License. |
||||||
|
|
||||||
|
import warnings |
||||||
|
|
||||||
|
import paddle |
||||||
|
import paddle.nn as nn |
||||||
|
import paddle.nn.functional as F |
||||||
|
|
||||||
|
from paddlers.models.ppseg.cvlibs import manager |
||||||
|
from paddlers.models.ppseg.models import layers |
||||||
|
from paddlers.models.ppseg.utils import utils |
||||||
|
from paddlers.models.ppseg.models.backbones.top_transformer import ConvBNAct |
||||||
|
|
||||||
|
|
||||||
|
@manager.MODELS.add_component |
||||||
|
class TopFormer(nn.Layer): |
||||||
|
""" |
||||||
|
The Token Pyramid Transformer(TopFormer) implementation based on PaddlePaddle. |
||||||
|
|
||||||
|
The original article refers to |
||||||
|
Zhang, Wenqiang, Zilong Huang, Guozhong Luo, Tao Chen, Xinggang Wang, Wenyu Liu, Gang Yu, |
||||||
|
and Chunhua Shen. "TopFormer: Token Pyramid Transformer for Mobile Semantic Segmentation." |
||||||
|
In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition, |
||||||
|
pp. 12083-12093. 2022. |
||||||
|
|
||||||
|
This model refers to https://github.com/hustvl/TopFormer. |
||||||
|
|
||||||
|
Args: |
||||||
|
num_classes (int, optional): The unique number of target classes. |
||||||
|
backbone(nn.Layer): Backbone network. |
||||||
|
head_use_dw (bool, optional): Whether the head uses depthwise convolutions. Default: False. |
||||||
|
align_corners (bool, optional): Set the align_corners in resizing. Default: False. |
||||||
|
pretrained (str, optional): The path or url of pretrained model. Default: None. |
||||||
|
""" |
||||||
|
|
||||||
|
def __init__(self, |
||||||
|
num_classes, |
||||||
|
backbone, |
||||||
|
head_use_dw=False, |
||||||
|
align_corners=False, |
||||||
|
pretrained=None): |
||||||
|
super().__init__() |
||||||
|
self.backbone = backbone |
||||||
|
|
||||||
|
head_in_channels = [ |
||||||
|
i for i in backbone.injection_out_channels if i is not None |
||||||
|
] |
||||||
|
self.decode_head = TopFormerHead( |
||||||
|
num_classes=num_classes, |
||||||
|
in_channels=head_in_channels, |
||||||
|
use_dw=head_use_dw, |
||||||
|
align_corners=align_corners) |
||||||
|
|
||||||
|
self.align_corners = align_corners |
||||||
|
self.pretrained = pretrained |
||||||
|
self.init_weight() |
||||||
|
|
||||||
|
def init_weight(self): |
||||||
|
if self.pretrained is not None: |
||||||
|
utils.load_entire_model(self, self.pretrained) |
||||||
|
|
||||||
|
def forward(self, x): |
||||||
|
x_hw = paddle.shape(x)[2:] |
||||||
|
x = self.backbone(x) # len=3, 1/8,1/16,1/32 |
||||||
|
x = self.decode_head(x) |
||||||
|
x = F.interpolate( |
||||||
|
x, x_hw, mode='bilinear', align_corners=self.align_corners) |
||||||
|
|
||||||
|
return [x] |
||||||
|
|
||||||
|
|
||||||
|
class TopFormerHead(nn.Layer): |
||||||
|
def __init__(self, |
||||||
|
num_classes, |
||||||
|
in_channels, |
||||||
|
in_index=[0, 1, 2], |
||||||
|
in_transform='multiple_select', |
||||||
|
use_dw=False, |
||||||
|
dropout_ratio=0.1, |
||||||
|
align_corners=False): |
||||||
|
super().__init__() |
||||||
|
|
||||||
|
self.in_index = in_index |
||||||
|
self.in_transform = in_transform |
||||||
|
self.align_corners = align_corners |
||||||
|
|
||||||
|
self._init_inputs(in_channels, in_index, in_transform) |
||||||
|
self.linear_fuse = ConvBNAct( |
||||||
|
in_channels=self.last_channels, |
||||||
|
out_channels=self.last_channels, |
||||||
|
kernel_size=1, |
||||||
|
stride=1, |
||||||
|
groups=self.last_channels if use_dw else 1, |
||||||
|
act=nn.ReLU) |
||||||
|
self.dropout = nn.Dropout2D(dropout_ratio) |
||||||
|
self.conv_seg = nn.Conv2D( |
||||||
|
self.last_channels, num_classes, kernel_size=1) |
||||||
|
|
||||||
|
def _init_inputs(self, in_channels, in_index, in_transform): |
||||||
|
assert in_transform in [None, 'resize_concat', 'multiple_select'] |
||||||
|
if in_transform is not None: |
||||||
|
assert len(in_channels) == len(in_index) |
||||||
|
if in_transform == 'resize_concat': |
||||||
|
self.last_channels = sum(in_channels) |
||||||
|
else: |
||||||
|
self.last_channels = in_channels[0] |
||||||
|
else: |
||||||
|
assert isinstance(in_channels, int) |
||||||
|
assert isinstance(in_index, int) |
||||||
|
self.last_channels = in_channels |
||||||
|
|
||||||
|
def _transform_inputs(self, inputs): |
||||||
|
if self.in_transform == 'resize_concat': |
||||||
|
inputs = [inputs[i] for i in self.in_index] |
||||||
|
inputs = [ |
||||||
|
F.interpolate( |
||||||
|
x, |
||||||
|
size=paddle.shape(inputs[0])[2:], |
||||||
|
mode='bilinear', |
||||||
|
align_corners=self.align_corners) for x in inputs |
||||||
|
] |
||||||
|
inputs = paddle.concat(inputs, axis=1) |
||||||
|
elif self.in_transform == 'multiple_select': |
||||||
|
inputs_tmp = [inputs[i] for i in self.in_index] |
||||||
|
inputs = inputs_tmp[0] |
||||||
|
for x in inputs_tmp[1:]: |
||||||
|
x = F.interpolate( |
||||||
|
x, |
||||||
|
size=paddle.shape(inputs)[2:], |
||||||
|
mode='bilinear', |
||||||
|
align_corners=self.align_corners) |
||||||
|
inputs += x |
||||||
|
else: |
||||||
|
inputs = inputs[self.in_index] |
||||||
|
|
||||||
|
return inputs |
||||||
|
|
||||||
|
def forward(self, x): |
||||||
|
x = self._transform_inputs(x) |
||||||
|
x = self.linear_fuse(x) |
||||||
|
x = self.dropout(x) |
||||||
|
x = self.conv_seg(x) |
||||||
|
return x |
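With the default 'multiple_select' transform, the head resizes every selected pyramid level to the spatial size of the first one and sums them before the 1x1 fuse and segmentation convolutions. A small stand-alone sketch of that fusion, with illustrative channel and spatial sizes:

# Hedged sketch mirroring the 'multiple_select' branch of _transform_inputs.
feats = [paddle.rand([1, 64, 32, 32]),   # 1/8
         paddle.rand([1, 64, 16, 16]),   # 1/16
         paddle.rand([1, 64, 8, 8])]     # 1/32
fused = feats[0]
for f in feats[1:]:
    fused = fused + F.interpolate(f, size=fused.shape[2:], mode='bilinear')
print(fused.shape)  # [1, 64, 32, 32]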
@ -0,0 +1,173 @@ |
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. |
||||||
|
# |
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License"); |
||||||
|
# you may not use this file except in compliance with the License. |
||||||
|
# You may obtain a copy of the License at |
||||||
|
# |
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0 |
||||||
|
# |
||||||
|
# Unless required by applicable law or agreed to in writing, software |
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS, |
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||||
|
# See the License for the specific language governing permissions and |
||||||
|
# limitations under the License. |
||||||
|
|
||||||
|
import paddle |
||||||
|
import paddle.nn as nn |
||||||
|
import paddle.nn.functional as F |
||||||
|
|
||||||
|
from paddlers.models.ppseg import utils |
||||||
|
from paddlers.models.ppseg.cvlibs import manager |
||||||
|
from paddlers.models.ppseg.models import layers |
||||||
|
|
||||||
|
|
||||||
|
@manager.MODELS.add_component |
||||||
|
class UPerNet(nn.Layer): |
||||||
|
""" |
||||||
|
The UPerNet implementation based on PaddlePaddle. |
||||||
|
|
||||||
|
The original article refers to |
||||||
|
Tete Xiao, et al. "Unified Perceptual Parsing for Scene Understanding" |
||||||
|
(https://arxiv.org/abs/1807.10221). |
||||||
|
|
||||||
|
Args: |
||||||
|
num_classes (int): The unique number of target classes. |
||||||
|
backbone (Paddle.nn.Layer): Backbone network, currently supports ResNet50/101. |
||||||
|
backbone_indices (tuple): Four values in the tuple indicate the indices of output of backbone. |
||||||
|
channels (int): The channels of inter layers. Default: 512. |
||||||
|
enable_auxiliary_loss (bool, optional): A bool value that indicates whether to add an auxiliary loss. Default: False. |
||||||
|
align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even, |
||||||
|
e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. |
||||||
|
dropout_prob (float): Dropout ratio for the UPerNet head. Default: 0.1. |
||||||
|
pretrained (str, optional): The path or url of pretrained model. Default: None. |
||||||
|
""" |
||||||
|
|
||||||
|
def __init__(self, |
||||||
|
num_classes, |
||||||
|
backbone, |
||||||
|
backbone_indices, |
||||||
|
channels=512, |
||||||
|
enable_auxiliary_loss=False, |
||||||
|
align_corners=False, |
||||||
|
dropout_prob=0.1, |
||||||
|
pretrained=None): |
||||||
|
super().__init__() |
||||||
|
self.backbone = backbone |
||||||
|
self.backbone_indices = backbone_indices |
||||||
|
self.in_channels = [ |
||||||
|
self.backbone.feat_channels[i] for i in backbone_indices |
||||||
|
] |
||||||
|
self.align_corners = align_corners |
||||||
|
self.pretrained = pretrained |
||||||
|
self.enable_auxiliary_loss = enable_auxiliary_loss |
||||||
|
|
||||||
|
fpn_inplanes = [ |
||||||
|
self.backbone.feat_channels[i] for i in backbone_indices |
||||||
|
] |
||||||
|
self.head = UPerNetHead( |
||||||
|
num_classes=num_classes, |
||||||
|
fpn_inplanes=fpn_inplanes, |
||||||
|
dropout_prob=dropout_prob, |
||||||
|
channels=channels, |
||||||
|
enable_auxiliary_loss=self.enable_auxiliary_loss) |
||||||
|
self.init_weight() |
||||||
|
|
||||||
|
def forward(self, x): |
||||||
|
feats = self.backbone(x) |
||||||
|
feats = [feats[i] for i in self.backbone_indices] |
||||||
|
logit_list = self.head(feats) |
||||||
|
logit_list = [ |
||||||
|
F.interpolate( |
||||||
|
logit, |
||||||
|
paddle.shape(x)[2:], |
||||||
|
mode='bilinear', |
||||||
|
align_corners=self.align_corners) for logit in logit_list |
||||||
|
] |
||||||
|
return logit_list |
||||||
|
|
||||||
|
def init_weight(self): |
||||||
|
if self.pretrained is not None: |
||||||
|
utils.load_entire_model(self, self.pretrained) |
||||||
|
|
||||||
|
|
||||||
|
class UPerNetHead(nn.Layer): |
||||||
|
def __init__(self, |
||||||
|
num_classes, |
||||||
|
fpn_inplanes, |
||||||
|
channels, |
||||||
|
dropout_prob=0.1, |
||||||
|
enable_auxiliary_loss=False, |
||||||
|
align_corners=True): |
||||||
|
super(UPerNetHead, self).__init__() |
||||||
|
self.align_corners = align_corners |
||||||
|
self.ppm = layers.PPModule( |
||||||
|
in_channels=fpn_inplanes[-1], |
||||||
|
out_channels=channels, |
||||||
|
bin_sizes=(1, 2, 3, 6), |
||||||
|
dim_reduction=True, |
||||||
|
align_corners=True) |
||||||
|
self.enable_auxiliary_loss = enable_auxiliary_loss |
||||||
|
self.lateral_convs = nn.LayerList() |
||||||
|
self.fpn_convs = nn.LayerList() |
||||||
|
|
||||||
|
for fpn_inplane in fpn_inplanes[:-1]: |
||||||
|
self.lateral_convs.append( |
||||||
|
layers.ConvBNReLU(fpn_inplane, channels, 1)) |
||||||
|
self.fpn_convs.append( |
||||||
|
layers.ConvBNReLU( |
||||||
|
channels, channels, 3, bias_attr=False)) |
||||||
|
|
||||||
|
if self.enable_auxiliary_loss: |
||||||
|
self.aux_head = layers.AuxLayer( |
||||||
|
fpn_inplanes[2], |
||||||
|
fpn_inplanes[2], |
||||||
|
num_classes, |
||||||
|
dropout_prob=dropout_prob) |
||||||
|
|
||||||
|
self.fpn_bottleneck = layers.ConvBNReLU( |
||||||
|
len(fpn_inplanes) * channels, channels, 3, padding=1) |
||||||
|
|
||||||
|
self.conv_last = nn.Sequential( |
||||||
|
layers.ConvBNReLU( |
||||||
|
len(fpn_inplanes) * channels, channels, 3, bias_attr=False), |
||||||
|
nn.Conv2D( |
||||||
|
channels, num_classes, kernel_size=1)) |
||||||
|
self.conv_seg = nn.Conv2D(channels, num_classes, kernel_size=1) |
||||||
|
|
||||||
|
def forward(self, inputs): |
||||||
|
laterals = [] |
||||||
|
for i, lateral_conv in enumerate(self.lateral_convs): |
||||||
|
laterals.append(lateral_conv(inputs[i])) |
||||||
|
|
||||||
|
laterals.append(self.ppm(inputs[-1])) |
||||||
|
fpn_levels = len(laterals) |
||||||
|
for i in range(fpn_levels - 1, 0, -1): |
||||||
|
prev_shape = paddle.shape(laterals[i - 1]) |
||||||
|
laterals[i - 1] = laterals[i - 1] + F.interpolate( |
||||||
|
laterals[i], |
||||||
|
size=prev_shape[2:], |
||||||
|
mode='bilinear', |
||||||
|
align_corners=self.align_corners) |
||||||
|
|
||||||
|
fpn_outs = [] |
||||||
|
for i in range(fpn_levels - 1): |
||||||
|
fpn_outs.append(self.fpn_convs[i](laterals[i])) |
||||||
|
fpn_outs.append(laterals[-1]) |
||||||
|
|
||||||
|
for i in range(fpn_levels - 1, 0, -1): |
||||||
|
fpn_outs[i] = F.interpolate( |
||||||
|
fpn_outs[i], |
||||||
|
size=paddle.shape(fpn_outs[0])[2:], |
||||||
|
mode='bilinear', |
||||||
|
align_corners=self.align_corners) |
||||||
|
fuse_out = paddle.concat(fpn_outs, axis=1) |
||||||
|
x = self.fpn_bottleneck(fuse_out) |
||||||
|
|
||||||
|
x = self.conv_seg(x) |
||||||
|
logits_list = [x] |
||||||
|
if self.enable_auxiliary_loss: |
||||||
|
aux_out = self.aux_head(inputs[2]) |
||||||
|
logits_list.append(aux_out) |
||||||
|
return logits_list |
||||||
|
else: |
||||||
|
return logits_list |
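The head above is a standard FPN-style decoder: PPM on the coarsest feature, lateral 1x1 convs, a top-down pass that adds each upsampled coarser level to the next finer one, and finally all levels are resized to the finest resolution and concatenated. A minimal sketch of the top-down step only, with illustrative sizes:

# Hedged sketch of the top-down fusion loop in UPerNetHead.forward.
laterals = [paddle.rand([1, 512, 64, 64]),
            paddle.rand([1, 512, 32, 32]),
            paddle.rand([1, 512, 16, 16])]
for i in range(len(laterals) - 1, 0, -1):
    laterals[i - 1] = laterals[i - 1] + F.interpolate(
        laterals[i], size=laterals[i - 1].shape[2:], mode='bilinear')
print([tuple(t.shape) for t in laterals])  # finest map now carries coarse context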
File diff suppressed because it is too large
@ -1,59 +0,0 @@ |
|||||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. |
|
||||||
# |
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License"); |
|
||||||
# you may not use this file except in compliance with the License. |
|
||||||
# You may obtain a copy of the License at |
|
||||||
# |
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0 |
|
||||||
# |
|
||||||
# Unless required by applicable law or agreed to in writing, software |
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS, |
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
||||||
# See the License for the specific language governing permissions and |
|
||||||
# limitations under the License. |
|
||||||
|
|
||||||
import numpy as np |
|
||||||
|
|
||||||
|
|
||||||
def config_check(cfg, train_dataset=None, val_dataset=None): |
|
||||||
""" |
|
||||||
To check config。 |
|
||||||
|
|
||||||
Args: |
|
||||||
cfg (paddleseg.cvlibs.Config): An object of paddleseg.cvlibs.Config. |
|
||||||
train_dataset (paddle.io.Dataset): Used to read and process training datasets. |
|
||||||
val_dataset (paddle.io.Dataset, optional): Used to read and process validation datasets. |
|
||||||
""" |
|
||||||
|
|
||||||
num_classes_check(cfg, train_dataset, val_dataset) |
|
||||||
|
|
||||||
|
|
||||||
def num_classes_check(cfg, train_dataset, val_dataset): |
|
||||||
"""" |
|
||||||
Check that the num_classes in model, train_dataset and val_dataset is consistent. |
|
||||||
""" |
|
||||||
num_classes_set = set() |
|
||||||
if train_dataset and hasattr(train_dataset, 'num_classes'): |
|
||||||
num_classes_set.add(train_dataset.num_classes) |
|
||||||
if val_dataset and hasattr(val_dataset, 'num_classes'): |
|
||||||
num_classes_set.add(val_dataset.num_classes) |
|
||||||
if cfg.dic.get('model', None) and cfg.dic['model'].get('num_classes', None): |
|
||||||
num_classes_set.add(cfg.dic['model'].get('num_classes')) |
|
||||||
if (not cfg.train_dataset) and (not cfg.val_dataset): |
|
||||||
raise ValueError( |
|
||||||
'One of `train_dataset` or `val_dataset should be given, but there are none.' |
|
||||||
) |
|
||||||
if len(num_classes_set) == 0: |
|
||||||
raise ValueError( |
|
||||||
'`num_classes` is not found. Please set it in model, train_dataset or val_dataset' |
|
||||||
) |
|
||||||
elif len(num_classes_set) > 1: |
|
||||||
raise ValueError( |
|
||||||
'`num_classes` is not consistent: {}. Please set it consistently in model or train_dataset or val_dataset' |
|
||||||
.format(num_classes_set)) |
|
||||||
else: |
|
||||||
num_classes = num_classes_set.pop() |
|
||||||
if train_dataset: |
|
||||||
train_dataset.num_classes = num_classes |
|
||||||
if val_dataset: |
|
||||||
val_dataset.num_classes = num_classes |
|
@ -1,442 +1,442 @@ |
|||||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. |
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. |
||||||
# |
# |
||||||
# Licensed under the Apache License, Version 2.0 (the "License"); |
# Licensed under the Apache License, Version 2.0 (the "License"); |
||||||
# you may not use this file except in compliance with the License. |
# you may not use this file except in compliance with the License. |
||||||
# You may obtain a copy of the License at |
# You may obtain a copy of the License at |
||||||
# |
# |
||||||
# http://www.apache.org/licenses/LICENSE-2.0 |
# http://www.apache.org/licenses/LICENSE-2.0 |
||||||
# |
# |
||||||
# Unless required by applicable law or agreed to in writing, software |
# Unless required by applicable law or agreed to in writing, software |
||||||
# distributed under the License is distributed on an "AS IS" BASIS, |
# distributed under the License is distributed on an "AS IS" BASIS, |
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||||
# See the License for the specific language governing permissions and |
# See the License for the specific language governing permissions and |
||||||
# limitations under the License. |
# limitations under the License. |
||||||
""" |
""" |
||||||
This code is based on https://github.com/AgentMaker/Paddle-Image-Models |
This code is based on https://github.com/AgentMaker/Paddle-Image-Models |
||||||
The copyright of AgentMaker/Paddle-Image-Models is as follows: |
The copyright of AgentMaker/Paddle-Image-Models is as follows: |
||||||
Apache License [see LICENSE for details] |
Apache License [see LICENSE for details] |
||||||
""" |
""" |
||||||
|
|
||||||
import paddle |
import paddle |
||||||
import paddle.nn as nn |
import paddle.nn as nn |
||||||
|
|
||||||
__all__ = ["CondenseNetV2_a", "CondenseNetV2_b", "CondenseNetV2_c"] |
__all__ = ["CondenseNetV2_A", "CondenseNetV2_B", "CondenseNetV2_C"] |
||||||
|
|
||||||
|
|
||||||
class SELayer(nn.Layer): |
class SELayer(nn.Layer): |
||||||
def __init__(self, inplanes, reduction=16): |
def __init__(self, inplanes, reduction=16): |
||||||
super(SELayer, self).__init__() |
super(SELayer, self).__init__() |
||||||
self.avg_pool = nn.AdaptiveAvgPool2D(1) |
self.avg_pool = nn.AdaptiveAvgPool2D(1) |
||||||
self.fc = nn.Sequential( |
self.fc = nn.Sequential( |
||||||
nn.Linear( |
nn.Linear( |
||||||
inplanes, inplanes // reduction, bias_attr=False), |
inplanes, inplanes // reduction, bias_attr=False), |
||||||
nn.ReLU(), |
nn.ReLU(), |
||||||
nn.Linear( |
nn.Linear( |
||||||
inplanes // reduction, inplanes, bias_attr=False), |
inplanes // reduction, inplanes, bias_attr=False), |
||||||
nn.Sigmoid(), ) |
nn.Sigmoid(), ) |
||||||
|
|
||||||
def forward(self, x): |
def forward(self, x): |
||||||
b, c, _, _ = x.shape |
b, c, _, _ = x.shape |
||||||
y = self.avg_pool(x).reshape((b, c)) |
y = self.avg_pool(x).reshape((b, c)) |
||||||
y = self.fc(y).reshape((b, c, 1, 1)) |
y = self.fc(y).reshape((b, c, 1, 1)) |
||||||
return x * paddle.expand(y, shape=x.shape) |
return x * paddle.expand(y, shape=x.shape) |
||||||
|
|
||||||
|
|
||||||
class HS(nn.Layer): |
class HS(nn.Layer): |
||||||
def __init__(self): |
def __init__(self): |
||||||
super(HS, self).__init__() |
super(HS, self).__init__() |
||||||
self.relu6 = nn.ReLU6() |
self.relu6 = nn.ReLU6() |
||||||
|
|
||||||
def forward(self, inputs): |
def forward(self, inputs): |
||||||
return inputs * self.relu6(inputs + 3) / 6 |
return inputs * self.relu6(inputs + 3) / 6 |
||||||
|
|
||||||
|
|
||||||
class Conv(nn.Sequential): |
class Conv(nn.Sequential): |
||||||
def __init__( |
def __init__( |
||||||
self, |
self, |
||||||
in_channels, |
in_channels, |
||||||
out_channels, |
out_channels, |
||||||
kernel_size, |
kernel_size, |
||||||
stride=1, |
stride=1, |
||||||
padding=0, |
padding=0, |
||||||
groups=1, |
groups=1, |
||||||
activation="ReLU", |
activation="ReLU", |
||||||
bn_momentum=0.9, ): |
bn_momentum=0.9, ): |
||||||
super(Conv, self).__init__() |
super(Conv, self).__init__() |
||||||
self.add_sublayer( |
self.add_sublayer( |
||||||
"norm", nn.BatchNorm2D( |
"norm", nn.BatchNorm2D( |
||||||
in_channels, momentum=bn_momentum)) |
in_channels, momentum=bn_momentum)) |
||||||
if activation == "ReLU": |
if activation == "ReLU": |
||||||
self.add_sublayer("activation", nn.ReLU()) |
self.add_sublayer("activation", nn.ReLU()) |
||||||
elif activation == "HS": |
elif activation == "HS": |
||||||
self.add_sublayer("activation", HS()) |
self.add_sublayer("activation", HS()) |
||||||
else: |
else: |
||||||
raise NotImplementedError |
raise NotImplementedError |
||||||
self.add_sublayer( |
self.add_sublayer( |
||||||
"conv", |
"conv", |
||||||
nn.Conv2D( |
nn.Conv2D( |
||||||
in_channels, |
in_channels, |
||||||
out_channels, |
out_channels, |
||||||
kernel_size=kernel_size, |
kernel_size=kernel_size, |
||||||
stride=stride, |
stride=stride, |
||||||
padding=padding, |
padding=padding, |
||||||
bias_attr=False, |
bias_attr=False, |
||||||
groups=groups, ), ) |
groups=groups, ), ) |
||||||
|
|
||||||
|
|
||||||
def ShuffleLayer(x, groups): |
def ShuffleLayer(x, groups): |
||||||
batchsize, num_channels, height, width = x.shape |
batchsize, num_channels, height, width = x.shape |
||||||
channels_per_group = num_channels // groups |
channels_per_group = num_channels // groups |
||||||
# Reshape |
# Reshape |
||||||
x = x.reshape((batchsize, groups, channels_per_group, height, width)) |
x = x.reshape((batchsize, groups, channels_per_group, height, width)) |
||||||
# Transpose |
# Transpose |
||||||
x = x.transpose((0, 2, 1, 3, 4)) |
x = x.transpose((0, 2, 1, 3, 4)) |
||||||
# Reshape |
# Reshape |
||||||
x = x.reshape((batchsize, groups * channels_per_group, height, width)) |
x = x.reshape((batchsize, groups * channels_per_group, height, width)) |
||||||
return x |
return x |
||||||
|
|
||||||
|
|
||||||
def ShuffleLayerTrans(x, groups): |
def ShuffleLayerTrans(x, groups): |
||||||
batchsize, num_channels, height, width = x.shape |
batchsize, num_channels, height, width = x.shape |
||||||
channels_per_group = num_channels // groups |
channels_per_group = num_channels // groups |
||||||
# Reshape |
# Reshape |
||||||
x = x.reshape((batchsize, channels_per_group, groups, height, width)) |
x = x.reshape((batchsize, channels_per_group, groups, height, width)) |
||||||
# Transpose |
# Transpose |
||||||
x = x.transpose((0, 2, 1, 3, 4)) |
x = x.transpose((0, 2, 1, 3, 4)) |
||||||
# Reshape |
# Reshape |
||||||
x = x.reshape((batchsize, channels_per_group * groups, height, width)) |
x = x.reshape((batchsize, channels_per_group * groups, height, width)) |
||||||
return x |
return x |
||||||
|
|
||||||
|
|
||||||
class CondenseLGC(nn.Layer): |
class CondenseLGC(nn.Layer): |
||||||
def __init__( |
def __init__( |
||||||
self, |
self, |
||||||
in_channels, |
in_channels, |
||||||
out_channels, |
out_channels, |
||||||
kernel_size, |
kernel_size, |
||||||
stride=1, |
stride=1, |
||||||
padding=0, |
padding=0, |
||||||
groups=1, |
groups=1, |
||||||
activation="ReLU", ): |
activation="ReLU", ): |
||||||
super(CondenseLGC, self).__init__() |
super(CondenseLGC, self).__init__() |
||||||
self.in_channels = in_channels |
self.in_channels = in_channels |
||||||
self.out_channels = out_channels |
self.out_channels = out_channels |
||||||
self.groups = groups |
self.groups = groups |
||||||
self.norm = nn.BatchNorm2D(self.in_channels) |
self.norm = nn.BatchNorm2D(self.in_channels) |
||||||
if activation == "ReLU": |
if activation == "ReLU": |
||||||
self.activation = nn.ReLU() |
self.activation = nn.ReLU() |
||||||
elif activation == "HS": |
elif activation == "HS": |
||||||
self.activation = HS() |
self.activation = HS() |
||||||
else: |
else: |
||||||
raise NotImplementedError |
raise NotImplementedError |
||||||
self.conv = nn.Conv2D( |
self.conv = nn.Conv2D( |
||||||
self.in_channels, |
self.in_channels, |
||||||
self.out_channels, |
self.out_channels, |
||||||
kernel_size=kernel_size, |
kernel_size=kernel_size, |
||||||
stride=stride, |
stride=stride, |
||||||
padding=padding, |
padding=padding, |
||||||
groups=self.groups, |
groups=self.groups, |
||||||
bias_attr=False, ) |
bias_attr=False, ) |
||||||
self.register_buffer( |
self.register_buffer( |
||||||
"index", paddle.zeros( |
"index", paddle.zeros( |
||||||
(self.in_channels, ), dtype="int64")) |
(self.in_channels, ), dtype="int64")) |
||||||
|
|
||||||
def forward(self, x): |
def forward(self, x): |
||||||
x = paddle.index_select(x, self.index, axis=1) |
x = paddle.index_select(x, self.index, axis=1) |
||||||
x = self.norm(x) |
x = self.norm(x) |
||||||
x = self.activation(x) |
x = self.activation(x) |
||||||
x = self.conv(x) |
x = self.conv(x) |
||||||
x = ShuffleLayer(x, self.groups) |
x = ShuffleLayer(x, self.groups) |
||||||
return x |
return x |
||||||
|
|
||||||
|
|
||||||
class CondenseSFR(nn.Layer): |
class CondenseSFR(nn.Layer): |
||||||
def __init__( |
def __init__( |
||||||
self, |
self, |
||||||
in_channels, |
in_channels, |
||||||
out_channels, |
out_channels, |
||||||
kernel_size, |
kernel_size, |
||||||
stride=1, |
stride=1, |
||||||
padding=0, |
padding=0, |
||||||
groups=1, |
groups=1, |
||||||
activation="ReLU", ): |
activation="ReLU", ): |
||||||
super(CondenseSFR, self).__init__() |
super(CondenseSFR, self).__init__() |
||||||
self.in_channels = in_channels |
self.in_channels = in_channels |
||||||
self.out_channels = out_channels |
self.out_channels = out_channels |
||||||
self.groups = groups |
self.groups = groups |
||||||
self.norm = nn.BatchNorm2D(self.in_channels) |
self.norm = nn.BatchNorm2D(self.in_channels) |
||||||
if activation == "ReLU": |
if activation == "ReLU": |
||||||
self.activation = nn.ReLU() |
self.activation = nn.ReLU() |
||||||
elif activation == "HS": |
elif activation == "HS": |
||||||
self.activation = HS() |
self.activation = HS() |
||||||
else: |
else: |
||||||
raise NotImplementedError |
raise NotImplementedError |
||||||
self.conv = nn.Conv2D( |
self.conv = nn.Conv2D( |
||||||
self.in_channels, |
self.in_channels, |
||||||
self.out_channels, |
self.out_channels, |
||||||
kernel_size=kernel_size, |
kernel_size=kernel_size, |
||||||
padding=padding, |
padding=padding, |
||||||
groups=self.groups, |
groups=self.groups, |
||||||
bias_attr=False, |
bias_attr=False, |
||||||
stride=stride, ) |
stride=stride, ) |
||||||
self.register_buffer("index", |
self.register_buffer("index", |
||||||
paddle.zeros( |
paddle.zeros( |
||||||
(self.out_channels, self.out_channels))) |
(self.out_channels, self.out_channels))) |
||||||
|
|
||||||
def forward(self, x): |
def forward(self, x): |
||||||
x = self.norm(x) |
x = self.norm(x) |
||||||
x = self.activation(x) |
x = self.activation(x) |
||||||
x = ShuffleLayerTrans(x, self.groups) |
x = ShuffleLayerTrans(x, self.groups) |
||||||
x = self.conv(x) # SIZE: N, C, H, W |
x = self.conv(x) # SIZE: N, C, H, W |
||||||
N, C, H, W = x.shape |
N, C, H, W = x.shape |
||||||
x = x.reshape((N, C, H * W)) |
x = x.reshape((N, C, H * W)) |
||||||
x = x.transpose((0, 2, 1)) # SIZE: N, HW, C |
x = x.transpose((0, 2, 1)) # SIZE: N, HW, C |
||||||
# x SIZE: N, HW, C; self.index SIZE: C, C; OUTPUT SIZE: N, HW, C |
# x SIZE: N, HW, C; self.index SIZE: C, C; OUTPUT SIZE: N, HW, C |
||||||
x = paddle.matmul(x, self.index) |
x = paddle.matmul(x, self.index) |
||||||
x = x.transpose((0, 2, 1)) # SIZE: N, C, HW |
x = x.transpose((0, 2, 1)) # SIZE: N, C, HW |
||||||
x = x.reshape((N, C, H, W)) # SIZE: N, C, H, W |
x = x.reshape((N, C, H, W)) # SIZE: N, C, H, W |
||||||
return x |
return x |
||||||
|
|
||||||
|
|
||||||
class _SFR_DenseLayer(nn.Layer): |
class _SFR_DenseLayer(nn.Layer): |
||||||
def __init__( |
def __init__( |
||||||
self, |
self, |
||||||
in_channels, |
in_channels, |
||||||
growth_rate, |
growth_rate, |
||||||
group_1x1, |
group_1x1, |
||||||
group_3x3, |
group_3x3, |
||||||
group_trans, |
group_trans, |
||||||
bottleneck, |
bottleneck, |
||||||
activation, |
activation, |
||||||
use_se=False, ): |
use_se=False, ): |
||||||
super(_SFR_DenseLayer, self).__init__() |
super(_SFR_DenseLayer, self).__init__() |
||||||
self.group_1x1 = group_1x1 |
self.group_1x1 = group_1x1 |
||||||
self.group_3x3 = group_3x3 |
self.group_3x3 = group_3x3 |
||||||
self.group_trans = group_trans |
self.group_trans = group_trans |
||||||
self.use_se = use_se |
self.use_se = use_se |
||||||
# 1x1 conv i --> b*k |
# 1x1 conv i --> b*k |
||||||
self.conv_1 = CondenseLGC( |
self.conv_1 = CondenseLGC( |
||||||
in_channels, |
in_channels, |
||||||
bottleneck * growth_rate, |
bottleneck * growth_rate, |
||||||
kernel_size=1, |
kernel_size=1, |
||||||
groups=self.group_1x1, |
groups=self.group_1x1, |
||||||
activation=activation, ) |
activation=activation, ) |
||||||
# 3x3 conv b*k --> k |
# 3x3 conv b*k --> k |
||||||
self.conv_2 = Conv( |
self.conv_2 = Conv( |
||||||
bottleneck * growth_rate, |
bottleneck * growth_rate, |
||||||
growth_rate, |
growth_rate, |
||||||
kernel_size=3, |
kernel_size=3, |
||||||
padding=1, |
padding=1, |
||||||
groups=self.group_3x3, |
groups=self.group_3x3, |
||||||
activation=activation, ) |
activation=activation, ) |
||||||
# 1x1 res conv k(8-16-32)--> i (k*l) |
# 1x1 res conv k(8-16-32)--> i (k*l) |
||||||
self.sfr = CondenseSFR( |
self.sfr = CondenseSFR( |
||||||
growth_rate, |
growth_rate, |
||||||
in_channels, |
in_channels, |
||||||
kernel_size=1, |
kernel_size=1, |
||||||
groups=self.group_trans, |
groups=self.group_trans, |
||||||
activation=activation, ) |
activation=activation, ) |
||||||
if self.use_se: |
if self.use_se: |
||||||
self.se = SELayer(inplanes=growth_rate, reduction=1) |
self.se = SELayer(inplanes=growth_rate, reduction=1) |
||||||
|
|
||||||
def forward(self, x): |
def forward(self, x): |
||||||
x_ = x |
x_ = x |
||||||
x = self.conv_1(x) |
x = self.conv_1(x) |
||||||
x = self.conv_2(x) |
x = self.conv_2(x) |
||||||
if self.use_se: |
if self.use_se: |
||||||
x = self.se(x) |
x = self.se(x) |
||||||
sfr_feature = self.sfr(x) |
sfr_feature = self.sfr(x) |
||||||
y = x_ + sfr_feature |
y = x_ + sfr_feature |
||||||
return paddle.concat([y, x], 1) |
return paddle.concat([y, x], 1) |
||||||
|
|
||||||
|
|
||||||
class _SFR_DenseBlock(nn.Sequential): |
class _SFR_DenseBlock(nn.Sequential): |
||||||
def __init__( |
def __init__( |
||||||
self, |
self, |
||||||
num_layers, |
num_layers, |
||||||
in_channels, |
in_channels, |
||||||
growth_rate, |
growth_rate, |
||||||
group_1x1, |
group_1x1, |
||||||
group_3x3, |
group_3x3, |
||||||
group_trans, |
group_trans, |
||||||
bottleneck, |
bottleneck, |
||||||
activation, |
activation, |
||||||
use_se, ): |
use_se, ): |
||||||
super(_SFR_DenseBlock, self).__init__() |
super(_SFR_DenseBlock, self).__init__() |
||||||
for i in range(num_layers): |
for i in range(num_layers): |
||||||
layer = _SFR_DenseLayer( |
layer = _SFR_DenseLayer( |
||||||
in_channels + i * growth_rate, |
in_channels + i * growth_rate, |
||||||
growth_rate, |
growth_rate, |
||||||
group_1x1, |
group_1x1, |
||||||
group_3x3, |
group_3x3, |
||||||
group_trans, |
group_trans, |
||||||
bottleneck, |
bottleneck, |
||||||
activation, |
activation, |
||||||
use_se, ) |
use_se, ) |
||||||
self.add_sublayer("denselayer_%d" % (i + 1), layer) |
self.add_sublayer("denselayer_%d" % (i + 1), layer) |
||||||
|
|
||||||
|
|
||||||
class _Transition(nn.Layer): |
class _Transition(nn.Layer): |
||||||
def __init__(self): |
def __init__(self): |
||||||
super(_Transition, self).__init__() |
super(_Transition, self).__init__() |
||||||
self.pool = nn.AvgPool2D(kernel_size=2, stride=2) |
self.pool = nn.AvgPool2D(kernel_size=2, stride=2) |
||||||
|
|
||||||
def forward(self, x): |
def forward(self, x): |
||||||
x = self.pool(x) |
x = self.pool(x) |
||||||
return x |
return x |
||||||
|
|
||||||
|
|
||||||
class CondenseNetV2(nn.Layer): |
class CondenseNetV2(nn.Layer): |
||||||
def __init__( |
def __init__( |
||||||
self, |
self, |
||||||
stages, |
stages, |
||||||
growth, |
growth, |
||||||
HS_start_block, |
HS_start_block, |
||||||
SE_start_block, |
SE_start_block, |
||||||
fc_channel, |
fc_channel, |
||||||
group_1x1, |
group_1x1, |
||||||
group_3x3, |
group_3x3, |
||||||
group_trans, |
group_trans, |
||||||
bottleneck, |
bottleneck, |
||||||
last_se_reduction, |
last_se_reduction, |
||||||
in_channels=3, |
in_channels=3, |
||||||
class_num=1000, ): |
class_num=1000, ): |
||||||
super(CondenseNetV2, self).__init__() |
super(CondenseNetV2, self).__init__() |
||||||
self.stages = stages |
self.stages = stages |
||||||
self.growth = growth |
self.growth = growth |
||||||
self.in_channels = in_channels |
self.in_channels = in_channels |
||||||
self.class_num = class_num |
self.class_num = class_num |
||||||
self.last_se_reduction = last_se_reduction |
self.last_se_reduction = last_se_reduction |
||||||
assert len(self.stages) == len(self.growth) |
assert len(self.stages) == len(self.growth) |
||||||
self.progress = 0.0 |
self.progress = 0.0 |
||||||
|
|
||||||
self.init_stride = 2 |
self.init_stride = 2 |
||||||
self.pool_size = 7 |
self.pool_size = 7 |
||||||
|
|
||||||
self.features = nn.Sequential() |
self.features = nn.Sequential() |
||||||
# Initial nChannels should be 3 |
# Initial nChannels should be 3 |
||||||
self.num_features = 2 * self.growth[0] |
self.num_features = 2 * self.growth[0] |
||||||
# Dense-block 1 (224x224) |
# Dense-block 1 (224x224) |
||||||
self.features.add_sublayer( |
self.features.add_sublayer( |
||||||
"init_conv", |
"init_conv", |
||||||
nn.Conv2D( |
nn.Conv2D( |
||||||
in_channels, |
in_channels, |
||||||
self.num_features, |
self.num_features, |
||||||
kernel_size=3, |
kernel_size=3, |
||||||
stride=self.init_stride, |
stride=self.init_stride, |
||||||
padding=1, |
padding=1, |
||||||
bias_attr=False, ), ) |
bias_attr=False, ), ) |
||||||
for i in range(len(self.stages)): |
for i in range(len(self.stages)): |
||||||
activation = "HS" if i >= HS_start_block else "ReLU" |
activation = "HS" if i >= HS_start_block else "ReLU" |
||||||
use_se = True if i >= SE_start_block else False |
use_se = True if i >= SE_start_block else False |
||||||
# Dense-block i |
# Dense-block i |
||||||
self.add_block(i, group_1x1, group_3x3, group_trans, bottleneck, |
self.add_block(i, group_1x1, group_3x3, group_trans, bottleneck, |
||||||
activation, use_se) |
activation, use_se) |
||||||
|
|
||||||
self.fc = nn.Linear(self.num_features, fc_channel) |
self.fc = nn.Linear(self.num_features, fc_channel) |
||||||
self.fc_act = HS() |
self.fc_act = HS() |
||||||
|
|
||||||
# Classifier layer |
# Classifier layer |
||||||
if class_num > 0: |
if class_num > 0: |
||||||
self.classifier = nn.Linear(fc_channel, class_num) |
self.classifier = nn.Linear(fc_channel, class_num) |
||||||
self._initialize() |
self._initialize() |
||||||
|
|
||||||
def add_block(self, i, group_1x1, group_3x3, group_trans, bottleneck, |
def add_block(self, i, group_1x1, group_3x3, group_trans, bottleneck, |
||||||
activation, use_se): |
activation, use_se): |
||||||
# Check if ith is the last one |
# Check if ith is the last one |
||||||
last = i == len(self.stages) - 1 |
last = i == len(self.stages) - 1 |
||||||
block = _SFR_DenseBlock( |
block = _SFR_DenseBlock( |
||||||
num_layers=self.stages[i], |
num_layers=self.stages[i], |
||||||
in_channels=self.num_features, |
in_channels=self.num_features, |
||||||
growth_rate=self.growth[i], |
growth_rate=self.growth[i], |
||||||
group_1x1=group_1x1, |
group_1x1=group_1x1, |
||||||
group_3x3=group_3x3, |
group_3x3=group_3x3, |
||||||
group_trans=group_trans, |
group_trans=group_trans, |
||||||
bottleneck=bottleneck, |
bottleneck=bottleneck, |
||||||
activation=activation, |
activation=activation, |
||||||
use_se=use_se, ) |
use_se=use_se, ) |
||||||
self.features.add_sublayer("denseblock_%d" % (i + 1), block) |
self.features.add_sublayer("denseblock_%d" % (i + 1), block) |
||||||
self.num_features += self.stages[i] * self.growth[i] |
self.num_features += self.stages[i] * self.growth[i] |
||||||
if not last: |
if not last: |
||||||
trans = _Transition() |
trans = _Transition() |
||||||
self.features.add_sublayer("transition_%d" % (i + 1), trans) |
self.features.add_sublayer("transition_%d" % (i + 1), trans) |
||||||
else: |
else: |
||||||
self.features.add_sublayer("norm_last", |
self.features.add_sublayer("norm_last", |
||||||
nn.BatchNorm2D(self.num_features)) |
nn.BatchNorm2D(self.num_features)) |
||||||
self.features.add_sublayer("relu_last", nn.ReLU()) |
self.features.add_sublayer("relu_last", nn.ReLU()) |
||||||
self.features.add_sublayer("pool_last", |
self.features.add_sublayer("pool_last", |
||||||
nn.AvgPool2D(self.pool_size)) |
nn.AvgPool2D(self.pool_size)) |
||||||
# if useSE: |
# if useSE: |
||||||
self.features.add_sublayer( |
self.features.add_sublayer( |
||||||
"se_last", |
"se_last", |
||||||
SELayer( |
SELayer( |
||||||
self.num_features, reduction=self.last_se_reduction)) |
self.num_features, reduction=self.last_se_reduction)) |
||||||
|
|
||||||
def forward(self, x): |
def forward(self, x): |
||||||
features = self.features(x) |
features = self.features(x) |
||||||
out = features.reshape((features.shape[0], features.shape[1] * |
out = features.reshape((features.shape[0], features.shape[1] * |
||||||
features.shape[2] * features.shape[3])) |
features.shape[2] * features.shape[3])) |
||||||
out = self.fc(out) |
out = self.fc(out) |
||||||
out = self.fc_act(out) |
out = self.fc_act(out) |
||||||
|
|
||||||
if self.class_num > 0: |
if self.class_num > 0: |
||||||
out = self.classifier(out) |
out = self.classifier(out) |
||||||
|
|
||||||
return out |
return out |
||||||
|
|
||||||
def _initialize(self): |
def _initialize(self): |
||||||
# Initialize |
# Initialize |
||||||
for m in self.sublayers(): |
for m in self.sublayers(): |
||||||
if isinstance(m, nn.Conv2D): |
if isinstance(m, nn.Conv2D): |
||||||
nn.initializer.KaimingNormal()(m.weight) |
nn.initializer.KaimingNormal()(m.weight) |
||||||
elif isinstance(m, nn.BatchNorm2D): |
elif isinstance(m, nn.BatchNorm2D): |
||||||
nn.initializer.Constant(value=1.0)(m.weight) |
nn.initializer.Constant(value=1.0)(m.weight) |
||||||
nn.initializer.Constant(value=0.0)(m.bias) |
nn.initializer.Constant(value=0.0)(m.bias) |
||||||
|
|
||||||
|
|
||||||
def CondenseNetV2_a(**kwargs): |
def CondenseNetV2_A(**kwargs): |
||||||
model = CondenseNetV2( |
model = CondenseNetV2( |
||||||
stages=[1, 1, 4, 6, 8], |
stages=[1, 1, 4, 6, 8], |
||||||
growth=[8, 8, 16, 32, 64], |
growth=[8, 8, 16, 32, 64], |
||||||
HS_start_block=2, |
HS_start_block=2, |
||||||
SE_start_block=3, |
SE_start_block=3, |
||||||
fc_channel=828, |
fc_channel=828, |
||||||
group_1x1=8, |
group_1x1=8, |
||||||
group_3x3=8, |
group_3x3=8, |
||||||
group_trans=8, |
group_trans=8, |
||||||
bottleneck=4, |
bottleneck=4, |
||||||
last_se_reduction=16, |
last_se_reduction=16, |
||||||
**kwargs) |
**kwargs) |
||||||
return model |
return model |
||||||
|
|
||||||
|
|
||||||
def CondenseNetV2_b(**kwargs): |
def CondenseNetV2_B(**kwargs): |
||||||
model = CondenseNetV2( |
model = CondenseNetV2( |
||||||
stages=[2, 4, 6, 8, 6], |
stages=[2, 4, 6, 8, 6], |
||||||
growth=[6, 12, 24, 48, 96], |
growth=[6, 12, 24, 48, 96], |
||||||
HS_start_block=2, |
HS_start_block=2, |
||||||
SE_start_block=3, |
SE_start_block=3, |
||||||
fc_channel=1024, |
fc_channel=1024, |
||||||
group_1x1=6, |
group_1x1=6, |
||||||
group_3x3=6, |
group_3x3=6, |
||||||
group_trans=6, |
group_trans=6, |
||||||
bottleneck=4, |
bottleneck=4, |
||||||
last_se_reduction=16, |
last_se_reduction=16, |
||||||
**kwargs) |
**kwargs) |
||||||
return model |
return model |
||||||
|
|
||||||
|
|
||||||
def CondenseNetV2_c(**kwargs): |
def CondenseNetV2_C(**kwargs): |
||||||
model = CondenseNetV2( |
model = CondenseNetV2( |
||||||
stages=[4, 6, 8, 10, 8], |
stages=[4, 6, 8, 10, 8], |
||||||
growth=[8, 16, 32, 64, 128], |
growth=[8, 16, 32, 64, 128], |
||||||
HS_start_block=2, |
HS_start_block=2, |
||||||
SE_start_block=3, |
SE_start_block=3, |
||||||
fc_channel=1024, |
fc_channel=1024, |
||||||
group_1x1=8, |
group_1x1=8, |
||||||
group_3x3=8, |
group_3x3=8, |
||||||
group_trans=8, |
group_trans=8, |
||||||
bottleneck=4, |
bottleneck=4, |
||||||
last_se_reduction=16, |
last_se_reduction=16, |
||||||
**kwargs) |
**kwargs) |
||||||
return model |
return model |
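This hunk only renames the CondenseNet V2 factory functions from lowercase suffixes (_a/_b/_c) to uppercase (_A/_B/_C); the architectures themselves are unchanged. A hedged usage sketch of the renamed entry point, where the class count and input size are assumptions rather than values from this diff:

# Hedged sketch: builds the smallest variant and checks the classifier output shape.
model = CondenseNetV2_A(class_num=1000)
x = paddle.rand([1, 3, 224, 224])   # pool_size=7 assumes a 224x224 input
logits = model(x)
print(logits.shape)  # [1, 1000]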
@ -1,8 +0,0 @@ |
|||||||
# Basic configurations of BIT |
|
||||||
|
|
||||||
_base_: ../_base_/airchange.yaml |
|
||||||
|
|
||||||
save_dir: ./test_tipc/output/cd/bit/ |
|
||||||
|
|
||||||
model: !Node |
|
||||||
type: BIT |
|
Some files were not shown because too many files have changed in this diff.