add segmentation task. optimize directory structure

own
chulutao 3 years ago
parent 16c85bf3c2
commit 691e5c438d
  1. docs/README.md (1)
  2. paddlers/datasets/__init__.py (1)
  3. paddlers/datasets/seg_dataset.py (91)
  4. paddlers/models/ppseg/__init__.py (17)
  5. paddlers/models/ppseg/core/__init__.py (20)
  6. paddlers/models/ppseg/core/infer.py (309)
  7. paddlers/models/ppseg/core/predict.py (150)
  8. paddlers/models/ppseg/core/train.py (326)
  9. paddlers/models/ppseg/core/val.py (199)
  10. paddlers/models/ppseg/cvlibs/__init__.py (17)
  11. paddlers/models/ppseg/cvlibs/callbacks.py (279)
  12. paddlers/models/ppseg/cvlibs/config.py (404)
  13. paddlers/models/ppseg/cvlibs/manager.py (149)
  14. paddlers/models/ppseg/cvlibs/param_init.py (120)
  15. paddlers/models/ppseg/datasets/__init__.py (29)
  16. paddlers/models/ppseg/datasets/ade.py (111)
  17. paddlers/models/ppseg/datasets/chase_db1.py (98)
  18. paddlers/models/ppseg/datasets/cityscapes.py (87)
  19. paddlers/models/ppseg/datasets/cocostuff.py (82)
  20. paddlers/models/ppseg/datasets/dataset.py (162)
  21. paddlers/models/ppseg/datasets/drive.py (96)
  22. paddlers/models/ppseg/datasets/eg1800.py (136)
  23. paddlers/models/ppseg/datasets/hrf.py (95)
  24. paddlers/models/ppseg/datasets/mini_deep_globe_road_extraction.py (95)
  25. paddlers/models/ppseg/datasets/optic_disc_seg.py (97)
  26. paddlers/models/ppseg/datasets/pascal_context.py (82)
  27. paddlers/models/ppseg/datasets/pp_humanseg14k.py (82)
  28. paddlers/models/ppseg/datasets/stare.py (95)
  29. paddlers/models/ppseg/datasets/supervisely.py (135)
  30. paddlers/models/ppseg/datasets/voc.py (112)
  31. paddlers/models/ppseg/models/__init__.py (57)
  32. paddlers/models/ppseg/models/ann.py (434)
  33. paddlers/models/ppseg/models/attention_unet.py (178)
  34. paddlers/models/ppseg/models/backbones/__init__.py (23)
  35. paddlers/models/ppseg/models/backbones/hrnet.py (837)
  36. paddlers/models/ppseg/models/backbones/mix_transformer.py (588)
  37. paddlers/models/ppseg/models/backbones/mobilenetv2.py (168)
  38. paddlers/models/ppseg/models/backbones/mobilenetv3.py (364)
  39. paddlers/models/ppseg/models/backbones/resnet_vd.py (398)
  40. paddlers/models/ppseg/models/backbones/stdcnet.py (281)
  41. paddlers/models/ppseg/models/backbones/swin_transformer.py (792)
  42. paddlers/models/ppseg/models/backbones/transformer_utils.py (83)
  43. paddlers/models/ppseg/models/backbones/vision_transformer.py (410)
  44. paddlers/models/ppseg/models/backbones/xception_deeplab.py (415)
  45. paddlers/models/ppseg/models/bisenet.py (307)
  46. paddlers/models/ppseg/models/bisenetv1.py (259)
  47. paddlers/models/ppseg/models/danet.py (218)
  48. paddlers/models/ppseg/models/decoupled_segnet.py (228)
  49. paddlers/models/ppseg/models/deeplab.py (308)
  50. paddlers/models/ppseg/models/dmnet.py (149)
  51. paddlers/models/ppseg/models/dnlnet.py (226)
  52. paddlers/models/ppseg/models/emanet.py (215)
  53. paddlers/models/ppseg/models/encnet.py (224)
  54. paddlers/models/ppseg/models/enet.py (622)
  55. paddlers/models/ppseg/models/espnet.py (477)
  56. paddlers/models/ppseg/models/espnetv1.py (308)
  57. paddlers/models/ppseg/models/fast_scnn.py (316)
  58. paddlers/models/ppseg/models/fastfcn.py (240)
  59. paddlers/models/ppseg/models/fcn.py (145)
  60. paddlers/models/ppseg/models/gcnet.py (222)
  61. paddlers/models/ppseg/models/ginet.py (291)
  62. paddlers/models/ppseg/models/gscnn.py (353)
  63. paddlers/models/ppseg/models/hardnet.py (308)
  64. paddlers/models/ppseg/models/hrnet_contrast.py (127)
  65. paddlers/models/ppseg/models/isanet.py (197)
  66. paddlers/models/ppseg/models/layers/__init__.py (20)
  67. paddlers/models/ppseg/models/layers/activation.py (73)
  68. paddlers/models/ppseg/models/layers/attention.py (146)
  69. paddlers/models/ppseg/models/layers/layer_libs.py (302)
  70. paddlers/models/ppseg/models/layers/nonlocal2d.py (154)
  71. paddlers/models/ppseg/models/layers/pyramid_pool.py (192)
  72. paddlers/models/ppseg/models/layers/wrap_functions.py (83)
  73. paddlers/models/ppseg/models/losses/__init__.py (36)
  74. paddlers/models/ppseg/models/losses/binary_cross_entropy_loss.py (174)
  75. paddlers/models/ppseg/models/losses/bootstrapped_cross_entropy.py (73)
  76. paddlers/models/ppseg/models/losses/cross_entropy_loss.py (218)
  77. paddlers/models/ppseg/models/losses/decoupledsegnet_relax_boundary_loss.py (129)
  78. paddlers/models/ppseg/models/losses/detail_aggregate_loss.py (116)
  79. paddlers/models/ppseg/models/losses/dice_loss.py (56)
  80. paddlers/models/ppseg/models/losses/edge_attention_loss.py (78)
  81. paddlers/models/ppseg/models/losses/focal_loss.py (60)
  82. paddlers/models/ppseg/models/losses/gscnn_dual_task_loss.py (141)
  83. paddlers/models/ppseg/models/losses/kl_loss.py (80)
  84. paddlers/models/ppseg/models/losses/l1_loss.py (76)
  85. paddlers/models/ppseg/models/losses/lovasz_loss.py (222)
  86. paddlers/models/ppseg/models/losses/mean_square_error_loss.py (65)
  87. paddlers/models/ppseg/models/losses/mixed_loss.py (57)
  88. paddlers/models/ppseg/models/losses/ohem_cross_entropy_loss.py (99)
  89. paddlers/models/ppseg/models/losses/ohem_edge_attention_loss.py (114)
  90. paddlers/models/ppseg/models/losses/pixel_contrast_cross_entropy_loss.py (199)
  91. paddlers/models/ppseg/models/losses/point_cross_entropy_loss.py (160)
  92. paddlers/models/ppseg/models/losses/rmi_loss.py (256)
  93. paddlers/models/ppseg/models/losses/semantic_connectivity_loss.py (175)
  94. paddlers/models/ppseg/models/losses/semantic_encode_cross_entropy_loss.py (47)
  95. paddlers/models/ppseg/models/mla_transformer.py (241)
  96. paddlers/models/ppseg/models/ocrnet.py (246)
  97. paddlers/models/ppseg/models/pfpnnet.py (201)
  98. paddlers/models/ppseg/models/pointrend.py (832)
  99. paddlers/models/ppseg/models/portraitnet.py (226)
  100. paddlers/models/ppseg/models/pphumanseg_lite.py (226)
Some files were not shown because too many files have changed in this diff.

docs/README.md
@@ -0,0 +1 @@
PaddleSeg commit fec42fd869b6f796c74cd510671595e3512bc8e9

paddlers/datasets/__init__.py
@@ -1 +1,2 @@
from .voc import VOCDetection
from .seg_dataset import SegDataset

paddlers/datasets/seg_dataset.py
@@ -0,0 +1,91 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os.path as osp
import copy
from paddle.io import Dataset
from paddlers.utils import logging, get_num_workers, get_encoding, path_normalization, is_pic
class SegDataset(Dataset):
"""读取语义分割任务数据集,并对样本进行相应的处理。
Args:
data_dir (str): 数据集所在的目录路径
file_list (str): 描述数据集图片文件和对应标注文件的文件路径文本内每行路径为相对data_dir的相对路
label_list (str): 描述数据集包含的类别信息文件路径默认值为None
transforms (paddlers.transforms): 数据集中每个样本的预处理/增强算子
num_workers (int|str): 数据集中样本在预处理过程中的线程或进程数默认为'auto'
shuffle (bool): 是否需要对数据集中样本打乱顺序默认为False
"""
def __init__(self,
data_dir,
file_list,
label_list=None,
transforms=None,
num_workers='auto',
shuffle=False):
super(SegDataset, self).__init__()
self.transforms = copy.deepcopy(transforms)
# TODO batch padding
self.batch_transforms = None
self.num_workers = get_num_workers(num_workers)
self.shuffle = shuffle
self.file_list = list()
self.labels = list()
# TODO: when label_list is not None, direct users to dataset analysis to generate the label_list;
# do not parse the label file here
if label_list is not None:
with open(label_list, encoding=get_encoding(label_list)) as f:
for line in f:
item = line.strip()
self.labels.append(item)
with open(file_list, encoding=get_encoding(file_list)) as f:
for line in f:
items = line.strip().split()
if len(items) != 2:
raise Exception(
"A space is the delimiter between the image path and the label path, " \
"so each line must contain exactly one image path and one label path " \
"without spaces, but line[{}] of file_list[{}] is malformed.".format(line, file_list))
items[0] = path_normalization(items[0])
items[1] = path_normalization(items[1])
if not is_pic(items[0]) or not is_pic(items[1]):
continue
full_path_im = osp.join(data_dir, items[0])
full_path_label = osp.join(data_dir, items[1])
if not osp.exists(full_path_im):
raise IOError('Image file {} does not exist!'.format(
full_path_im))
if not osp.exists(full_path_label):
raise IOError('Label file {} does not exist!'.format(
full_path_label))
self.file_list.append({
'image': full_path_im,
'mask': full_path_label
})
self.num_samples = len(self.file_list)
logging.info("{} samples in file {}".format(
len(self.file_list), file_list))
def __getitem__(self, idx):
sample = copy.deepcopy(self.file_list[idx])
outputs = self.transforms(sample)
return outputs
def __len__(self):
return len(self.file_list)
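A minimal usage sketch of the new SegDataset. The paths below are hypothetical, and any callable that maps the {'image', 'mask'} sample dict can stand in for a real paddlers.transforms pipeline:

from paddlers.datasets import SegDataset

def identity_transforms(sample):
    # stand-in for a real transforms pipeline; SegDataset only requires a callable
    return sample

train_dataset = SegDataset(
    data_dir='dataset',                  # hypothetical dataset root
    file_list='dataset/train_list.txt',  # each line: "images/a.jpg annotations/a.png"
    label_list='dataset/labels.txt',     # one class name per line
    transforms=identity_transforms,
    num_workers='auto',
    shuffle=True)
sample = train_dataset[0]                # transformed copy of {'image': ..., 'mask': ...}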

paddlers/models/ppseg/__init__.py
@@ -0,0 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import models, datasets, transforms
__version__ = 'develop'

paddlers/models/ppseg/core/__init__.py
@@ -0,0 +1,20 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .train import train
from .val import evaluate
from .predict import predict
from . import infer
__all__ = ['train', 'evaluate', 'predict']

paddlers/models/ppseg/core/infer.py
@@ -0,0 +1,309 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections.abc
from itertools import combinations
import numpy as np
import cv2
import paddle
import paddle.nn.functional as F
def get_reverse_list(ori_shape, transforms):
"""
Get the reverse list of transforms.
Args:
ori_shape (list): Origin shape of image.
transforms (list): List of transform.
Returns:
list: List of tuples, in two formats:
('resize', (h, w)) The image shape before resize,
('padding', (h, w)) The image shape before padding.
"""
reverse_list = []
h, w = ori_shape[0], ori_shape[1]
for op in transforms:
if op.__class__.__name__ in ['Resize']:
reverse_list.append(('resize', (h, w)))
h, w = op.target_size[0], op.target_size[1]
if op.__class__.__name__ in ['ResizeByLong']:
reverse_list.append(('resize', (h, w)))
long_edge = max(h, w)
short_edge = min(h, w)
short_edge = int(round(short_edge * op.long_size / long_edge))
long_edge = op.long_size
if h > w:
h = long_edge
w = short_edge
else:
w = long_edge
h = short_edge
if op.__class__.__name__ in ['ResizeByShort']:
reverse_list.append(('resize', (h, w)))
long_edge = max(h, w)
short_edge = min(h, w)
long_edge = int(round(long_edge * op.short_size / short_edge))
short_edge = op.short_size
if h > w:
h = long_edge
w = short_edge
else:
w = long_edge
h = short_edge
if op.__class__.__name__ in ['Padding']:
reverse_list.append(('padding', (h, w)))
w, h = op.target_size[0], op.target_size[1]
if op.__class__.__name__ in ['PaddingByAspectRatio']:
reverse_list.append(('padding', (h, w)))
ratio = w / h
if ratio == op.aspect_ratio:
pass
elif ratio > op.aspect_ratio:
h = int(w / op.aspect_ratio)
else:
w = int(h * op.aspect_ratio)
if op.__class__.__name__ in ['LimitLong']:
long_edge = max(h, w)
short_edge = min(h, w)
if ((op.max_long is not None) and (long_edge > op.max_long)):
reverse_list.append(('resize', (h, w)))
long_edge = op.max_long
short_edge = int(round(short_edge * op.max_long / long_edge))
elif ((op.min_long is not None) and (long_edge < op.min_long)):
reverse_list.append(('resize', (h, w)))
long_edge = op.min_long
short_edge = int(round(short_edge * op.min_long / long_edge))
if h > w:
h = long_edge
w = short_edge
else:
w = long_edge
h = short_edge
return reverse_list
def reverse_transform(pred, ori_shape, transforms, mode='nearest'):
"""recover pred to origin shape"""
reverse_list = get_reverse_list(ori_shape, transforms)
intTypeList = [paddle.int8, paddle.int16, paddle.int32, paddle.int64]
dtype = pred.dtype
for item in reverse_list[::-1]:
if item[0] == 'resize':
h, w = item[1][0], item[1][1]
if paddle.get_device() == 'cpu' and dtype in intTypeList:
pred = paddle.cast(pred, 'float32')
pred = F.interpolate(pred, (h, w), mode=mode)
pred = paddle.cast(pred, dtype)
else:
pred = F.interpolate(pred, (h, w), mode=mode)
elif item[0] == 'padding':
h, w = item[1][0], item[1][1]
pred = pred[:, :, 0:h, 0:w]
else:
raise Exception("Unexpected info '{}' in im_info".format(item[0]))
return pred
def flip_combination(flip_horizontal=False, flip_vertical=False):
"""
Get flip combination.
Args:
flip_horizontal (bool): Whether to flip horizontally. Default: False.
flip_vertical (bool): Whether to flip vertically. Default: False.
Returns:
list: List of tuple. The first element of tuple is whether to flip horizontally,
and the second is whether to flip vertically.
"""
flip_comb = [(False, False)]
if flip_horizontal:
flip_comb.append((True, False))
if flip_vertical:
flip_comb.append((False, True))
if flip_horizontal:
flip_comb.append((True, True))
return flip_comb
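# For example (hypothetical call), flip_combination(True, True) returns
# [(False, False), (True, False), (False, True), (True, True)],
# i.e. identity, horizontal, vertical and both flips.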
def tensor_flip(x, flip):
"""Flip tensor according directions"""
if flip[0]:
x = x[:, :, :, ::-1]
if flip[1]:
x = x[:, :, ::-1, :]
return x
def slide_inference(model, im, crop_size, stride):
"""
Infer by sliding window.
Args:
model (paddle.nn.Layer): model to get logits of image.
im (Tensor): the input image.
crop_size (tuple|list): The size of the sliding window, (w, h).
stride (tuple|list): The size of the stride, (w, h).
Return:
Tensor: The logit of input image.
"""
h_im, w_im = im.shape[-2:]
w_crop, h_crop = crop_size
w_stride, h_stride = stride
# calculate the number of sliding-window rows and columns
rows = int(np.ceil(1.0 * (h_im - h_crop) / h_stride)) + 1
cols = int(np.ceil(1.0 * (w_im - w_crop) / w_stride)) + 1
# prevent negative sliding rounds when imgs after scaling << crop_size
rows = 1 if h_im <= h_crop else rows
cols = 1 if w_im <= w_crop else cols
# TODO: 'Tensor' object does not support item assignment. If it becomes supported, use tensors for this calculation.
final_logit = None
count = np.zeros([1, 1, h_im, w_im])
for r in range(rows):
for c in range(cols):
h1 = r * h_stride
w1 = c * w_stride
h2 = min(h1 + h_crop, h_im)
w2 = min(w1 + w_crop, w_im)
h1 = max(h2 - h_crop, 0)
w1 = max(w2 - w_crop, 0)
im_crop = im[:, :, h1:h2, w1:w2]
logits = model(im_crop)
if not isinstance(logits, collections.abc.Sequence):
raise TypeError(
"The type of logits must be a collections.abc.Sequence, e.g. list or tuple, but received {}"
.format(type(logits)))
logit = logits[0].numpy()
if final_logit is None:
final_logit = np.zeros([1, logit.shape[1], h_im, w_im])
final_logit[:, :, h1:h2, w1:w2] += logit[:, :, :h2 - h1, :w2 - w1]
count[:, :, h1:h2, w1:w2] += 1
if np.sum(count == 0) != 0:
raise RuntimeError(
'Some pixels were not predicted; the stride may be greater than crop_size.'
)
final_logit = final_logit / count
final_logit = paddle.to_tensor(final_logit)
return final_logit
def inference(model,
im,
ori_shape=None,
transforms=None,
is_slide=False,
stride=None,
crop_size=None):
"""
Inference for image.
Args:
model (paddle.nn.Layer): model to get logits of image.
im (Tensor): the input image.
ori_shape (list): Origin shape of image.
transforms (list): Transforms for image.
is_slide (bool): Whether to infer by sliding window. Default: False.
crop_size (tuple|list): The size of the sliding window, (w, h). It should be provided if is_slide is True.
stride (tuple|list): The size of the stride, (w, h). It should be provided if is_slide is True.
Returns:
Tensor: If ori_shape is not None, a prediction with shape (1, 1, h, w) is returned.
If ori_shape is None, a logit with shape (1, num_classes, h, w) is returned.
"""
if hasattr(model, 'data_format') and model.data_format == 'NHWC':
im = im.transpose((0, 2, 3, 1))
if not is_slide:
logits = model(im)
if not isinstance(logits, collections.abc.Sequence):
raise TypeError(
"The type of logits must be a collections.abc.Sequence, e.g. list or tuple, but received {}"
.format(type(logits)))
logit = logits[0]
else:
logit = slide_inference(model, im, crop_size=crop_size, stride=stride)
if hasattr(model, 'data_format') and model.data_format == 'NHWC':
logit = logit.transpose((0, 3, 1, 2))
if ori_shape is not None:
logit = reverse_transform(logit, ori_shape, transforms, mode='bilinear')
pred = paddle.argmax(logit, axis=1, keepdim=True, dtype='int32')
return pred, logit
else:
return logit
def aug_inference(model,
im,
ori_shape,
transforms,
scales=1.0,
flip_horizontal=False,
flip_vertical=False,
is_slide=False,
stride=None,
crop_size=None):
"""
Infer with augmentation.
Args:
model (paddle.nn.Layer): model to get logits of image.
im (Tensor): the input image.
ori_shape (list): Origin shape of image.
transforms (list): Transforms for image.
scales (float|tuple|list): Scales for resize. Default: 1.
flip_horizontal (bool): Whether to flip horizontally. Default: False.
flip_vertical (bool): Whether to flip vertically. Default: False.
is_slide (bool): Whether to infer by sliding window. Default: False.
crop_size (tuple|list): The size of the sliding window, (w, h). It should be provided if is_slide is True.
stride (tuple|list): The size of the stride, (w, h). It should be provided if is_slide is True.
Returns:
Tensor: Prediction of image with shape (1, 1, h, w) is returned.
"""
if isinstance(scales, float):
scales = [scales]
elif not isinstance(scales, (tuple, list)):
raise TypeError(
'`scales` expects float/tuple/list type, but received {}'.format(
type(scales)))
final_logit = 0
h_input, w_input = im.shape[-2], im.shape[-1]
flip_comb = flip_combination(flip_horizontal, flip_vertical)
for scale in scales:
h = int(h_input * scale + 0.5)
w = int(w_input * scale + 0.5)
im = F.interpolate(im, (h, w), mode='bilinear')
for flip in flip_comb:
im_flip = tensor_flip(im, flip)
logit = inference(
model,
im_flip,
is_slide=is_slide,
crop_size=crop_size,
stride=stride)
logit = tensor_flip(logit, flip)
logit = F.interpolate(logit, (h_input, w_input), mode='bilinear')
logit = F.softmax(logit, axis=1)
final_logit = final_logit + logit
final_logit = reverse_transform(
final_logit, ori_shape, transforms, mode='bilinear')
pred = paddle.argmax(final_logit, axis=1, keepdim=True, dtype='int32')
return pred, final_logit
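A minimal sketch of the sliding-window path of inference(); the toy model and sizes are assumptions, not part of this commit:

import paddle
import paddle.nn as nn
from paddlers.models.ppseg.core import infer

class ToySeg(nn.Layer):
    """Toy model returning a list of logits, as inference() expects."""
    def __init__(self, num_classes=2):
        super().__init__()
        self.conv = nn.Conv2D(3, num_classes, 1)
    def forward(self, x):
        return [self.conv(x)]

model = ToySeg()
im = paddle.rand([1, 3, 64, 64])    # NCHW input
logit = infer.inference(
    model, im,
    is_slide=True,
    crop_size=(32, 32),             # (w, h) of each window
    stride=(16, 16))                # (w, h) step between windows
print(logit.shape)                  # [1, 2, 64, 64]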

paddlers/models/ppseg/core/predict.py
@@ -0,0 +1,150 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import math
import cv2
import numpy as np
import paddle
from paddlers.models.ppseg import utils
from paddlers.models.ppseg.core import infer
from paddlers.models.ppseg.utils import logger, progbar, visualize
def mkdir(path):
sub_dir = os.path.dirname(path)
if not os.path.exists(sub_dir):
os.makedirs(sub_dir)
def partition_list(arr, m):
"""split the list 'arr' into m pieces"""
n = int(math.ceil(len(arr) / float(m)))
return [arr[i:i + n] for i in range(0, len(arr), n)]
def predict(model,
model_path,
transforms,
image_list,
image_dir=None,
save_dir='output',
aug_pred=False,
scales=1.0,
flip_horizontal=True,
flip_vertical=False,
is_slide=False,
stride=None,
crop_size=None,
custom_color=None):
"""
Predict and visualize the images in image_list.
Args:
model (nn.Layer): Used to predict for input image.
model_path (str): The path of pretrained model.
transforms (transform.Compose): Preprocess for input image.
image_list (list): A list of image path to be predicted.
image_dir (str, optional): The root directory of the images predicted. Default: None.
save_dir (str, optional): The directory to save the visualized results. Default: 'output'.
aug_pred (bool, optional): Whether to use multi-scale and flip augmentation for prediction. Default: False.
scales (list|float, optional): Scales for augment. It is valid when `aug_pred` is True. Default: 1.0.
flip_horizontal (bool, optional): Whether to use horizontal flip augmentation. It is valid when `aug_pred` is True. Default: True.
flip_vertical (bool, optional): Whether to use vertical flip augmentation. It is valid when `aug_pred` is True. Default: False.
is_slide (bool, optional): Whether to predict by sliding window. Default: False.
stride (tuple|list, optional): The stride of sliding window, the first is width and the second is height.
It should be provided when `is_slide` is True.
crop_size (tuple|list, optional): The crop size of sliding window, the first is width and the second is height.
It should be provided when `is_slide` is True.
custom_color (list, optional): Save images with a custom color map. Default: None, use paddleseg's default color map.
"""
utils.utils.load_entire_model(model, model_path)
model.eval()
nranks = paddle.distributed.get_world_size()
local_rank = paddle.distributed.get_rank()
if nranks > 1:
img_lists = partition_list(image_list, nranks)
else:
img_lists = [image_list]
added_saved_dir = os.path.join(save_dir, 'added_prediction')
pred_saved_dir = os.path.join(save_dir, 'pseudo_color_prediction')
logger.info("Start to predict...")
progbar_pred = progbar.Progbar(target=len(img_lists[0]), verbose=1)
color_map = visualize.get_color_map_list(256, custom_color=custom_color)
with paddle.no_grad():
for i, im_path in enumerate(img_lists[local_rank]):
im = cv2.imread(im_path)
ori_shape = im.shape[:2]
im, _ = transforms(im)
im = im[np.newaxis, ...]
im = paddle.to_tensor(im)
if aug_pred:
pred, _ = infer.aug_inference(
model,
im,
ori_shape=ori_shape,
transforms=transforms.transforms,
scales=scales,
flip_horizontal=flip_horizontal,
flip_vertical=flip_vertical,
is_slide=is_slide,
stride=stride,
crop_size=crop_size)
else:
pred, _ = infer.inference(
model,
im,
ori_shape=ori_shape,
transforms=transforms.transforms,
is_slide=is_slide,
stride=stride,
crop_size=crop_size)
pred = paddle.squeeze(pred)
pred = pred.numpy().astype('uint8')
# get the saved name
if image_dir is not None:
im_file = im_path.replace(image_dir, '')
else:
im_file = os.path.basename(im_path)
if im_file[0] == '/' or im_file[0] == '\\':
im_file = im_file[1:]
# save added image
added_image = utils.visualize.visualize(
im_path, pred, color_map, weight=0.6)
added_image_path = os.path.join(added_saved_dir, im_file)
mkdir(added_image_path)
cv2.imwrite(added_image_path, added_image)
# save pseudo color prediction
pred_mask = utils.visualize.get_pseudo_color_map(pred, color_map)
pred_saved_path = os.path.join(
pred_saved_dir,
os.path.splitext(im_file)[0] + ".png")
mkdir(pred_saved_path)
pred_mask.save(pred_saved_path)
# pred_im = utils.visualize(im_path, pred, weight=0.0)
# pred_saved_path = os.path.join(pred_saved_dir, im_file)
# mkdir(pred_saved_path)
# cv2.imwrite(pred_saved_path, pred_im)
progbar_pred.update(i + 1)
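A hedged sketch of batch prediction with predict(); the model object, weight path and image directory are hypothetical, and the transform pipeline assumes the vendored ppseg transforms module provides Compose/Normalize as in PaddleSeg:

import glob
from paddlers.models.ppseg import transforms as T
from paddlers.models.ppseg.core import predict

transforms = T.Compose([T.Normalize()])
image_list = glob.glob('demo_images/*.jpg')           # images to predict
predict(
    model,                                            # a ppseg segmentation nn.Layer, built elsewhere
    model_path='output/best_model/model.pdparams',    # trained weights
    transforms=transforms,
    image_list=image_list,
    save_dir='output/vis')
# results are written to output/vis/added_prediction and output/vis/pseudo_color_prediction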

paddlers/models/ppseg/core/train.py
@@ -0,0 +1,326 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import time
from collections import deque
import shutil
import paddle
import paddle.nn.functional as F
from paddlers.models.ppseg.utils import (TimeAverager, calculate_eta, resume, logger,
worker_init_fn, train_profiler, op_flops_funs)
from paddlers.models.ppseg.core.val import evaluate
def check_logits_losses(logits_list, losses):
len_logits = len(logits_list)
len_losses = len(losses['types'])
if len_logits != len_losses:
raise RuntimeError(
'The length of logits_list should equal the number of loss types in the config: {} != {}.'
.format(len_logits, len_losses))
def loss_computation(logits_list, labels, losses, edges=None):
check_logits_losses(logits_list, losses)
loss_list = []
for i in range(len(logits_list)):
logits = logits_list[i]
loss_i = losses['types'][i]
coef_i = losses['coef'][i]
if loss_i.__class__.__name__ in ('BCELoss',
'FocalLoss') and loss_i.edge_label:
# Use edges as the labels, depending on the loss type.
loss_list.append(coef_i * loss_i(logits, edges))
elif loss_i.__class__.__name__ == 'MixedLoss':
mixed_loss_list = loss_i(logits, labels)
for mixed_loss in mixed_loss_list:
loss_list.append(coef_i * mixed_loss)
elif loss_i.__class__.__name__ in ("KLLoss", ):
loss_list.append(
coef_i * loss_i(logits_list[0], logits_list[1].detach()))
else:
loss_list.append(coef_i * loss_i(logits, labels))
return loss_list
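# For example, with two model outputs a hypothetical `losses` config could be
#     losses = {'types': [CrossEntropyLoss(), CrossEntropyLoss()], 'coef': [1.0, 0.4]}
# and loss_computation() would return
#     [1.0 * ce(logits[0], labels), 0.4 * ce(logits[1], labels)].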
def train(model,
train_dataset,
val_dataset=None,
optimizer=None,
save_dir='output',
iters=10000,
batch_size=2,
resume_model=None,
save_interval=1000,
log_iters=10,
num_workers=0,
use_vdl=False,
losses=None,
keep_checkpoint_max=5,
test_config=None,
precision='fp32',
profiler_options=None,
to_static_training=False):
"""
Launch training.
Args:
model (nn.Layer): A semantic segmentation model.
train_dataset (paddle.io.Dataset): Used to read and process training datasets.
val_dataset (paddle.io.Dataset, optional): Used to read and process validation datasets.
optimizer (paddle.optimizer.Optimizer): The optimizer.
save_dir (str, optional): The directory for saving the model snapshot. Default: 'output'.
iters (int, optional): How many iters to train the model. Default: 10000.
batch_size (int, optional): Mini batch size of one gpu or cpu. Default: 2.
resume_model (str, optional): The path of resume model.
save_interval (int, optional): How many iters to save a model snapshot once during training. Default: 1000.
log_iters (int, optional): Display logging information at every log_iters. Default: 10.
num_workers (int, optional): Num workers for data loader. Default: 0.
use_vdl (bool, optional): Whether to record the data to VisualDL during training. Default: False.
losses (dict, optional): A dict including 'types' and 'coef'. The length of coef should equal to 1 or len(losses['types']).
The 'types' item is a list of object of paddleseg.models.losses while the 'coef' item is a list of the relevant coefficient.
keep_checkpoint_max (int, optional): Maximum number of checkpoints to save. Default: 5.
test_config(dict, optional): Evaluation config.
precision (str, optional): Use AMP if precision='fp16'. If precision='fp32', the training is normal.
profiler_options (str, optional): The option of train profiler.
to_static_training (bool, optional): Whether to use @to_static for training.
"""
model.train()
nranks = paddle.distributed.ParallelEnv().nranks
local_rank = paddle.distributed.ParallelEnv().local_rank
start_iter = 0
if resume_model is not None:
start_iter = resume(model, optimizer, resume_model)
if not os.path.isdir(save_dir):
if os.path.exists(save_dir):
os.remove(save_dir)
os.makedirs(save_dir)
if nranks > 1:
paddle.distributed.fleet.init(is_collective=True)
optimizer = paddle.distributed.fleet.distributed_optimizer(
optimizer) # The return is Fleet object
ddp_model = paddle.distributed.fleet.distributed_model(model)
batch_sampler = paddle.io.DistributedBatchSampler(
train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
loader = paddle.io.DataLoader(
train_dataset,
batch_sampler=batch_sampler,
num_workers=num_workers,
return_list=True,
worker_init_fn=worker_init_fn,
)
# use amp
if precision == 'fp16':
logger.info('use amp to train')
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
if use_vdl:
from visualdl import LogWriter
log_writer = LogWriter(save_dir)
if to_static_training:
model = paddle.jit.to_static(model)
logger.info("Successfully to apply @to_static")
avg_loss = 0.0
avg_loss_list = []
iters_per_epoch = len(batch_sampler)
best_mean_iou = -1.0
best_model_iter = -1
reader_cost_averager = TimeAverager()
batch_cost_averager = TimeAverager()
save_models = deque()
batch_start = time.time()
iter = start_iter
while iter < iters:
for data in loader:
iter += 1
if iter > iters:
version = paddle.__version__
if version == '2.1.2':
continue
else:
break
reader_cost_averager.record(time.time() - batch_start)
images = data[0]
labels = data[1].astype('int64')
edges = None
if len(data) == 3:
edges = data[2].astype('int64')
if hasattr(model, 'data_format') and model.data_format == 'NHWC':
images = images.transpose((0, 2, 3, 1))
if precision == 'fp16':
with paddle.amp.auto_cast(
enable=True,
custom_white_list={
"elementwise_add", "batch_norm", "sync_batch_norm"
},
custom_black_list={'bilinear_interp_v2'}):
if nranks > 1:
logits_list = ddp_model(images)
else:
logits_list = model(images)
loss_list = loss_computation(
logits_list=logits_list,
labels=labels,
losses=losses,
edges=edges)
loss = sum(loss_list)
scaled = scaler.scale(loss) # scale the loss
scaled.backward() # do backward
if isinstance(optimizer, paddle.distributed.fleet.Fleet):
scaler.minimize(optimizer.user_defined_optimizer, scaled)
else:
scaler.minimize(optimizer, scaled) # update parameters
else:
if nranks > 1:
logits_list = ddp_model(images)
else:
logits_list = model(images)
loss_list = loss_computation(
logits_list=logits_list,
labels=labels,
losses=losses,
edges=edges)
loss = sum(loss_list)
loss.backward()
# If the optimizer is ReduceOnPlateau, the loss must be passed into step().
if isinstance(optimizer, paddle.optimizer.lr.ReduceOnPlateau):
optimizer.step(loss)
else:
optimizer.step()
lr = optimizer.get_lr()
# update lr
if isinstance(optimizer, paddle.distributed.fleet.Fleet):
lr_sche = optimizer.user_defined_optimizer._learning_rate
else:
lr_sche = optimizer._learning_rate
if isinstance(lr_sche, paddle.optimizer.lr.LRScheduler):
lr_sche.step()
train_profiler.add_profiler_step(profiler_options)
model.clear_gradients()
avg_loss += loss.numpy()[0]
if not avg_loss_list:
avg_loss_list = [l.numpy() for l in loss_list]
else:
for i in range(len(loss_list)):
avg_loss_list[i] += loss_list[i].numpy()
batch_cost_averager.record(
time.time() - batch_start, num_samples=batch_size)
if (iter) % log_iters == 0 and local_rank == 0:
avg_loss /= log_iters
avg_loss_list = [l[0] / log_iters for l in avg_loss_list]
remain_iters = iters - iter
avg_train_batch_cost = batch_cost_averager.get_average()
avg_train_reader_cost = reader_cost_averager.get_average()
eta = calculate_eta(remain_iters, avg_train_batch_cost)
logger.info(
"[TRAIN] epoch: {}, iter: {}/{}, loss: {:.4f}, lr: {:.6f}, batch_cost: {:.4f}, reader_cost: {:.5f}, ips: {:.4f} samples/sec | ETA {}"
.format((iter - 1) // iters_per_epoch + 1, iter, iters,
avg_loss, lr, avg_train_batch_cost,
avg_train_reader_cost,
batch_cost_averager.get_ips_average(), eta))
if use_vdl:
log_writer.add_scalar('Train/loss', avg_loss, iter)
# Record all losses if there are more than 2 losses.
if len(avg_loss_list) > 1:
avg_loss_dict = {}
for i, value in enumerate(avg_loss_list):
avg_loss_dict['loss_' + str(i)] = value
for key, value in avg_loss_dict.items():
log_tag = 'Train/' + key
log_writer.add_scalar(log_tag, value, iter)
log_writer.add_scalar('Train/lr', lr, iter)
log_writer.add_scalar('Train/batch_cost',
avg_train_batch_cost, iter)
log_writer.add_scalar('Train/reader_cost',
avg_train_reader_cost, iter)
avg_loss = 0.0
avg_loss_list = []
reader_cost_averager.reset()
batch_cost_averager.reset()
if (iter % save_interval == 0
or iter == iters) and (val_dataset is not None):
num_workers = 1 if num_workers > 0 else 0
if test_config is None:
test_config = {}
mean_iou, acc, _, _, _ = evaluate(
model, val_dataset, num_workers=num_workers, **test_config)
model.train()
if (iter % save_interval == 0 or iter == iters) and local_rank == 0:
current_save_dir = os.path.join(save_dir,
"iter_{}".format(iter))
if not os.path.isdir(current_save_dir):
os.makedirs(current_save_dir)
paddle.save(model.state_dict(),
os.path.join(current_save_dir, 'model.pdparams'))
paddle.save(optimizer.state_dict(),
os.path.join(current_save_dir, 'model.pdopt'))
save_models.append(current_save_dir)
if len(save_models) > keep_checkpoint_max > 0:
model_to_remove = save_models.popleft()
shutil.rmtree(model_to_remove)
if val_dataset is not None:
if mean_iou > best_mean_iou:
best_mean_iou = mean_iou
best_model_iter = iter
best_model_dir = os.path.join(save_dir, "best_model")
paddle.save(
model.state_dict(),
os.path.join(best_model_dir, 'model.pdparams'))
logger.info(
'[EVAL] The model with the best validation mIoU ({:.4f}) was saved at iter {}.'
.format(best_mean_iou, best_model_iter))
if use_vdl:
log_writer.add_scalar('Evaluate/mIoU', mean_iou, iter)
log_writer.add_scalar('Evaluate/Acc', acc, iter)
batch_start = time.time()
# Calculate flops.
if local_rank == 0:
_, c, h, w = images.shape
_ = paddle.flops(
model, [1, c, h, w],
custom_ops={paddle.nn.SyncBatchNorm: op_flops_funs.count_syncbn})
# Sleep for half a second to let dataloader release resources.
time.sleep(0.5)
if use_vdl:
log_writer.close()
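A minimal training sketch, assuming `model`, `train_dataset` and `val_dataset` are already built (e.g. with the SegDataset added above) and that the vendored losses package exports CrossEntropyLoss as in PaddleSeg:

import paddle
from paddlers.models.ppseg.core import train
from paddlers.models.ppseg.models.losses import CrossEntropyLoss

optimizer = paddle.optimizer.Momentum(
    learning_rate=0.01, momentum=0.9, parameters=model.parameters())
losses = {'types': [CrossEntropyLoss()], 'coef': [1.0]}
train(model,
      train_dataset,
      val_dataset=val_dataset,
      optimizer=optimizer,
      losses=losses,
      iters=10000,
      batch_size=2,
      save_interval=1000,
      save_dir='output')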

paddlers/models/ppseg/core/val.py
@@ -0,0 +1,199 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
import time
import paddle
import paddle.nn.functional as F
from paddlers.models.ppseg.utils import metrics, TimeAverager, calculate_eta, logger, progbar
from paddlers.models.ppseg.core import infer
np.set_printoptions(suppress=True)
def evaluate(model,
eval_dataset,
aug_eval=False,
scales=1.0,
flip_horizontal=False,
flip_vertical=False,
is_slide=False,
stride=None,
crop_size=None,
num_workers=0,
print_detail=True,
auc_roc=False):
"""
Launch evaluation.
Args:
model (nn.Layer): A semantic segmentation model.
eval_dataset (paddle.io.Dataset): Used to read and process validation datasets.
aug_eval (bool, optional): Whether to use multi-scale and flip augmentation for evaluation. Default: False.
scales (list|float, optional): Scales for augment. It is valid when `aug_eval` is True. Default: 1.0.
flip_horizontal (bool, optional): Whether to use horizontal flip augmentation. It is valid when `aug_eval` is True. Default: False.
flip_vertical (bool, optional): Whether to use vertical flip augmentation. It is valid when `aug_eval` is True. Default: False.
is_slide (bool, optional): Whether to evaluate by sliding window. Default: False.
stride (tuple|list, optional): The stride of sliding window, the first is width and the second is height.
It should be provided when `is_slide` is True.
crop_size (tuple|list, optional): The crop size of sliding window, the first is width and the second is height.
It should be provided when `is_slide` is True.
num_workers (int, optional): Num workers for data loader. Default: 0.
print_detail (bool, optional): Whether to print detailed information about the evaluation process. Default: True.
auc_roc (bool, optional): Whether to compute the auc_roc metric. Default: False.
Returns:
float: The mIoU of validation datasets.
float: The accuracy of validation datasets.
"""
model.eval()
nranks = paddle.distributed.ParallelEnv().nranks
local_rank = paddle.distributed.ParallelEnv().local_rank
if nranks > 1:
# Initialize parallel environment if not done.
if not paddle.distributed.parallel.parallel_helper._is_parallel_ctx_initialized(
):
paddle.distributed.init_parallel_env()
batch_sampler = paddle.io.DistributedBatchSampler(
eval_dataset, batch_size=1, shuffle=False, drop_last=False)
loader = paddle.io.DataLoader(
eval_dataset,
batch_sampler=batch_sampler,
num_workers=num_workers,
return_list=True,
)
total_iters = len(loader)
intersect_area_all = paddle.zeros([1], dtype='int64')
pred_area_all = paddle.zeros([1], dtype='int64')
label_area_all = paddle.zeros([1], dtype='int64')
logits_all = None
label_all = None
if print_detail:
logger.info(
"Start evaluating (total_samples: {}, total_iters: {})...".format(
len(eval_dataset), total_iters))
#TODO(chenguowei): fix log print error with multi-gpus
progbar_val = progbar.Progbar(
target=total_iters, verbose=1 if nranks < 2 else 2)
reader_cost_averager = TimeAverager()
batch_cost_averager = TimeAverager()
batch_start = time.time()
with paddle.no_grad():
for iter, (im, label) in enumerate(loader):
reader_cost_averager.record(time.time() - batch_start)
label = label.astype('int64')
ori_shape = label.shape[-2:]
if aug_eval:
pred, logits = infer.aug_inference(
model,
im,
ori_shape=ori_shape,
transforms=eval_dataset.transforms.transforms,
scales=scales,
flip_horizontal=flip_horizontal,
flip_vertical=flip_vertical,
is_slide=is_slide,
stride=stride,
crop_size=crop_size)
else:
pred, logits = infer.inference(
model,
im,
ori_shape=ori_shape,
transforms=eval_dataset.transforms.transforms,
is_slide=is_slide,
stride=stride,
crop_size=crop_size)
intersect_area, pred_area, label_area = metrics.calculate_area(
pred,
label,
eval_dataset.num_classes,
ignore_index=eval_dataset.ignore_index)
# Gather from all ranks
if nranks > 1:
intersect_area_list = []
pred_area_list = []
label_area_list = []
paddle.distributed.all_gather(intersect_area_list,
intersect_area)
paddle.distributed.all_gather(pred_area_list, pred_area)
paddle.distributed.all_gather(label_area_list, label_area)
# Some images have already been evaluated and should be excluded in the last iter
if (iter + 1) * nranks > len(eval_dataset):
valid = len(eval_dataset) - iter * nranks
intersect_area_list = intersect_area_list[:valid]
pred_area_list = pred_area_list[:valid]
label_area_list = label_area_list[:valid]
for i in range(len(intersect_area_list)):
intersect_area_all = intersect_area_all + intersect_area_list[
i]
pred_area_all = pred_area_all + pred_area_list[i]
label_area_all = label_area_all + label_area_list[i]
else:
intersect_area_all = intersect_area_all + intersect_area
pred_area_all = pred_area_all + pred_area
label_area_all = label_area_all + label_area
if auc_roc:
logits = F.softmax(logits, axis=1)
if logits_all is None:
logits_all = logits.numpy()
label_all = label.numpy()
else:
logits_all = np.concatenate(
[logits_all, logits.numpy()]) # (KN, C, H, W)
label_all = np.concatenate([label_all, label.numpy()])
batch_cost_averager.record(
time.time() - batch_start, num_samples=len(label))
batch_cost = batch_cost_averager.get_average()
reader_cost = reader_cost_averager.get_average()
if local_rank == 0 and print_detail:
progbar_val.update(iter + 1, [('batch_cost', batch_cost),
('reader cost', reader_cost)])
reader_cost_averager.reset()
batch_cost_averager.reset()
batch_start = time.time()
class_iou, miou = metrics.mean_iou(intersect_area_all, pred_area_all,
label_area_all)
class_acc, acc = metrics.accuracy(intersect_area_all, pred_area_all)
kappa = metrics.kappa(intersect_area_all, pred_area_all, label_area_all)
class_dice, mdice = metrics.dice(intersect_area_all, pred_area_all,
label_area_all)
if auc_roc:
auc_roc = metrics.auc_roc(
logits_all, label_all, num_classes=eval_dataset.num_classes)
auc_infor = ' Auc_roc: {:.4f}'.format(auc_roc)
if print_detail:
infor = "[EVAL] #Images: {} mIoU: {:.4f} Acc: {:.4f} Kappa: {:.4f} Dice: {:.4f}".format(
len(eval_dataset), miou, acc, kappa, mdice)
infor = infor + auc_infor if auc_roc else infor
logger.info(infor)
logger.info("[EVAL] Class IoU: \n" + str(np.round(class_iou, 4)))
logger.info("[EVAL] Class Acc: \n" + str(np.round(class_acc, 4)))
return miou, acc, class_iou, class_acc, kappa
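A short sketch of multi-scale, flipped evaluation (model and val_dataset assumed already built as above):

from paddlers.models.ppseg.core import evaluate

miou, acc, class_iou, class_acc, kappa = evaluate(
    model,
    val_dataset,                 # dataset exposing num_classes, ignore_index and transforms
    aug_eval=True,               # routes through infer.aug_inference
    scales=[0.75, 1.0, 1.25],
    flip_horizontal=True)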

paddlers/models/ppseg/cvlibs/__init__.py
@@ -0,0 +1,17 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import manager
from . import param_init
from .config import Config

paddlers/models/ppseg/cvlibs/callbacks.py
@@ -0,0 +1,279 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import time
import numpy as np
import paddle
from paddle.distributed.parallel import ParallelEnv
from visualdl import LogWriter
from paddlers.models.ppseg.utils.progbar import Progbar
import paddlers.models.ppseg.utils.logger as logger
class CallbackList(object):
"""
Container abstracting a list of callbacks.
Args:
callbacks (list[Callback]): List of `Callback` instances.
"""
def __init__(self, callbacks=None):
callbacks = callbacks or []
self.callbacks = [c for c in callbacks]
def append(self, callback):
self.callbacks.append(callback)
def set_params(self, params):
for callback in self.callbacks:
callback.set_params(params)
def set_model(self, model):
for callback in self.callbacks:
callback.set_model(model)
def set_optimizer(self, optimizer):
for callback in self.callbacks:
callback.set_optimizer(optimizer)
def on_iter_begin(self, iter, logs=None):
"""Called right before processing a batch.
"""
logs = logs or {}
for callback in self.callbacks:
callback.on_iter_begin(iter, logs)
self._t_enter_iter = time.time()
def on_iter_end(self, iter, logs=None):
"""Called at the end of a batch.
"""
logs = logs or {}
for callback in self.callbacks:
callback.on_iter_end(iter, logs)
self._t_exit_iter = time.time()
def on_train_begin(self, logs=None):
"""Called at the beginning of training.
"""
logs = logs or {}
for callback in self.callbacks:
callback.on_train_begin(logs)
def on_train_end(self, logs=None):
"""Called at the end of training.
"""
logs = logs or {}
for callback in self.callbacks:
callback.on_train_end(logs)
def __iter__(self):
return iter(self.callbacks)
class Callback(object):
"""Abstract base class used to build new callbacks.
"""
def __init__(self):
self.validation_data = None
def set_params(self, params):
self.params = params
def set_model(self, model):
self.model = model
def set_optimizer(self, optimizer):
self.optimizer = optimizer
def on_iter_begin(self, iter, logs=None):
pass
def on_iter_end(self, iter, logs=None):
pass
def on_train_begin(self, logs=None):
pass
def on_train_end(self, logs=None):
pass
class BaseLogger(Callback):
def __init__(self, period=10):
super(BaseLogger, self).__init__()
self.period = period
def _reset(self):
self.totals = {}
def on_train_begin(self, logs=None):
self.totals = {}
def on_iter_end(self, iter, logs=None):
logs = logs or {}
#(iter - 1) // iters_per_epoch + 1
for k, v in logs.items():
if k in self.totals.keys():
self.totals[k] += v
else:
self.totals[k] = v
if iter % self.period == 0 and ParallelEnv().local_rank == 0:
for k in self.totals:
logs[k] = self.totals[k] / self.period
self._reset()
class TrainLogger(Callback):
def __init__(self, log_freq=10):
self.log_freq = log_freq
def _calculate_eta(self, remaining_iters, speed):
if remaining_iters < 0:
remaining_iters = 0
remaining_time = int(remaining_iters * speed)
result = "{:0>2}:{:0>2}:{:0>2}"
arr = []
for i in range(2, -1, -1):
arr.append(int(remaining_time / 60**i))
remaining_time %= 60**i
return result.format(*arr)
def on_iter_end(self, iter, logs=None):
if iter % self.log_freq == 0 and ParallelEnv().local_rank == 0:
total_iters = self.params["total_iters"]
iters_per_epoch = self.params["iters_per_epoch"]
remaining_iters = total_iters - iter
eta = self._calculate_eta(remaining_iters, logs["batch_cost"])
current_epoch = (iter - 1) // self.params["iters_per_epoch"] + 1
loss = logs["loss"]
lr = self.optimizer.get_lr()
batch_cost = logs["batch_cost"]
reader_cost = logs["reader_cost"]
logger.info(
"[TRAIN] epoch={}, iter={}/{}, loss={:.4f}, lr={:.6f}, batch_cost={:.4f}, reader_cost={:.4f} | ETA {}"
.format(current_epoch, iter, total_iters, loss, lr, batch_cost,
reader_cost, eta))
class ProgbarLogger(Callback):
def __init__(self):
super(ProgbarLogger, self).__init__()
def on_train_begin(self, logs=None):
self.verbose = self.params["verbose"]
self.total_iters = self.params["total_iters"]
self.target = self.params["total_iters"]
self.progbar = Progbar(target=self.target, verbose=self.verbose)
self.seen = 0
self.log_values = []
def on_iter_begin(self, iter, logs=None):
#self.seen = 0
if self.seen < self.target:
self.log_values = []
def on_iter_end(self, iter, logs=None):
logs = logs or {}
self.seen += 1
for k in self.params['metrics']:
if k in logs:
self.log_values.append((k, logs[k]))
#if self.verbose and self.seen < self.target and ParallelEnv.local_rank == 0:
#print(self.log_values)
if self.seen < self.target:
self.progbar.update(self.seen, self.log_values)
class ModelCheckpoint(Callback):
def __init__(self,
save_dir,
monitor="miou",
save_best_only=False,
save_params_only=True,
mode="max",
period=1):
super(ModelCheckpoint, self).__init__()
self.monitor = monitor
self.save_dir = save_dir
self.save_best_only = save_best_only
self.save_params_only = save_params_only
self.period = period
self.iters_since_last_save = 0
if mode == "min":
self.monitor_op = np.less
self.best = np.Inf
elif mode == "max":
self.monitor_op = np.greater
self.best = -np.Inf
else:
raise RuntimeError("`mode` is neither \"min\" nor \"max\"!")
def on_train_begin(self, logs=None):
self.verbose = self.params["verbose"]
save_dir = self.save_dir
if not os.path.isdir(save_dir):
if os.path.exists(save_dir):
os.remove(save_dir)
os.makedirs(save_dir)
def on_iter_end(self, iter, logs=None):
logs = logs or {}
self.iters_since_last_save += 1
current_save_dir = os.path.join(self.save_dir, "iter_{}".format(iter))
current_save_dir = os.path.abspath(current_save_dir)
#if self.iters_since_last_save % self.period and ParallelEnv().local_rank == 0:
#self.iters_since_last_save = 0
if iter % self.period == 0 and ParallelEnv().local_rank == 0:
if self.verbose > 0:
print("iter {iter_num}: saving model to {path}".format(
iter_num=iter, path=current_save_dir))
paddle.save(self.model.state_dict(),
os.path.join(current_save_dir, 'model.pdparams'))
if not self.save_params_only:
paddle.save(self.optimizer.state_dict(),
os.path.join(current_save_dir, 'model.pdopt'))
class VisualDL(Callback):
def __init__(self, log_dir="./log", freq=1):
super(VisualDL, self).__init__()
self.log_dir = log_dir
self.freq = freq
def on_train_begin(self, logs=None):
self.writer = LogWriter(self.log_dir)
def on_iter_end(self, iter, logs=None):
logs = logs or {}
if iter % self.freq == 0 and ParallelEnv().local_rank == 0:
for k, v in logs.items():
self.writer.add_scalar("Train/{}".format(k), v, iter)
self.writer.flush()
def on_train_end(self, logs=None):
self.writer.close()
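A sketch of wiring these callbacks together; model and optimizer are assumed built elsewhere, and the params keys follow TrainLogger, ProgbarLogger and ModelCheckpoint above:

from paddlers.models.ppseg.cvlibs.callbacks import (CallbackList, TrainLogger,
                                                    ModelCheckpoint, VisualDL)

cbs = CallbackList([TrainLogger(log_freq=10),
                    ModelCheckpoint(save_dir='output', period=1000),
                    VisualDL(log_dir='output/vdl')])
cbs.set_model(model)          # model/optimizer assumed built elsewhere
cbs.set_optimizer(optimizer)
cbs.set_params({'total_iters': 10000, 'iters_per_epoch': 100,
                'verbose': 1, 'metrics': ['loss']})
cbs.on_train_begin()
# inside the training loop: cbs.on_iter_begin(it); ... ; cbs.on_iter_end(it, logs)
cbs.on_train_end()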

paddlers/models/ppseg/cvlibs/config.py
@@ -0,0 +1,404 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import codecs
import os
from typing import Any, Dict, Generic
import paddle
import yaml
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.utils import logger
class Config(object):
'''
Training configuration parsing. Only yaml/yml files are supported.
The following hyper-parameters are available in the config file:
batch_size: The number of samples per gpu.
iters: The total training steps.
train_dataset: A training data config including type/data_root/transforms/mode.
For data type, please refer to paddleseg.datasets.
For specific transforms, please refer to paddleseg.transforms.transforms.
val_dataset: A validation data config including type/data_root/transforms/mode.
optimizer: An optimizer config; currently PaddleSeg only supports sgd with momentum in the config file.
In addition, weight_decay could be set as a regularization.
learning_rate: A learning rate config. If decay is configured, the learning_rate value is the starting learning rate,
and only poly decay is supported in the config file. In addition, decay power and end_lr are tuned experimentally.
loss: A loss config. Multi-loss configs are available. The loss type order is consistent with the seg model outputs,
where the coef term indicates the weight of the corresponding loss. Note that the number of coef must be the same as the number of
model outputs; a single loss type may be given if all outputs share it, otherwise the number of
loss types must be consistent with coef.
model: A model config including type/backbone and model-dependent arguments.
For model type, please refer to paddleseg.models.
For backbone, please refer to paddleseg.models.backbones.
Args:
path (str) : The path of config file, supports yaml format only.
Examples:
from paddlers.models.ppseg.cvlibs.config import Config
# Create a cfg object with yaml file path.
cfg = Config(yaml_cfg_path)
# Parsing the argument when its property is used.
train_dataset = cfg.train_dataset
# the argument of model should be parsed after dataset,
# since the model builder uses some properties in dataset.
model = cfg.model
...
'''
def __init__(self,
path: str,
learning_rate: float = None,
batch_size: int = None,
iters: int = None):
if not path:
raise ValueError('Please specify the configuration file path.')
if not os.path.exists(path):
raise FileNotFoundError('File {} does not exist'.format(path))
self._model = None
self._losses = None
if path.endswith('yml') or path.endswith('yaml'):
self.dic = self._parse_from_yaml(path)
else:
raise RuntimeError('Config file should be in yaml format!')
self.update(
learning_rate=learning_rate, batch_size=batch_size, iters=iters)
def _update_dic(self, dic, base_dic):
"""
Update the config dict `dic`, using `base_dic` as the base.
"""
base_dic = base_dic.copy()
dic = dic.copy()
if dic.get('_inherited_', True) is False:
dic.pop('_inherited_')
return dic
for key, val in dic.items():
if isinstance(val, dict) and key in base_dic:
base_dic[key] = self._update_dic(val, base_dic[key])
else:
base_dic[key] = val
dic = base_dic
return dic
def _parse_from_yaml(self, path: str):
'''Parse a yaml file and build config'''
with codecs.open(path, 'r', 'utf-8') as file:
dic = yaml.load(file, Loader=yaml.FullLoader)
if '_base_' in dic:
cfg_dir = os.path.dirname(path)
base_path = dic.pop('_base_')
base_path = os.path.join(cfg_dir, base_path)
base_dic = self._parse_from_yaml(base_path)
dic = self._update_dic(dic, base_dic)
return dic
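# For example (hypothetical files), given base.yml:
#     batch_size: 4
#     iters: 1000
# and child.yml:
#     _base_: base.yml
#     batch_size: 8
# _parse_from_yaml('child.yml') yields {'batch_size': 8, 'iters': 1000};
# putting `_inherited_: False` in child.yml discards base.yml entirely.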
def update(self,
learning_rate: float = None,
batch_size: int = None,
iters: int = None):
'''Update config'''
if learning_rate:
if 'lr_scheduler' in self.dic:
self.dic['lr_scheduler']['learning_rate'] = learning_rate
else:
self.dic['learning_rate']['value'] = learning_rate
if batch_size:
self.dic['batch_size'] = batch_size
if iters:
self.dic['iters'] = iters
@property
def batch_size(self) -> int:
return self.dic.get('batch_size', 1)
@property
def iters(self) -> int:
iters = self.dic.get('iters')
if not iters:
raise RuntimeError('No iters specified in the configuration file.')
return iters
@property
def lr_scheduler(self) -> paddle.optimizer.lr.LRScheduler:
if 'lr_scheduler' not in self.dic:
raise RuntimeError(
'No `lr_scheduler` specified in the configuration file.')
params = self.dic.get('lr_scheduler')
lr_type = params.pop('type')
if lr_type == 'PolynomialDecay':
params.setdefault('decay_steps', self.iters)
params.setdefault('end_lr', 0)
params.setdefault('power', 0.9)
return getattr(paddle.optimizer.lr, lr_type)(**params)
@property
def learning_rate(self) -> paddle.optimizer.lr.LRScheduler:
logger.warning(
'''`learning_rate` in configuration file will be deprecated, please use `lr_scheduler` instead. E.g
lr_scheduler:
type: PolynomialDecay
learning_rate: 0.01''')
_learning_rate = self.dic.get('learning_rate', {})
if isinstance(_learning_rate, float):
return _learning_rate
_learning_rate = self.dic.get('learning_rate', {}).get('value')
if not _learning_rate:
raise RuntimeError(
'No learning rate specified in the configuration file.')
args = self.decay_args
decay_type = args.pop('type')
if decay_type == 'poly':
lr = _learning_rate
return paddle.optimizer.lr.PolynomialDecay(lr, **args)
elif decay_type == 'piecewise':
values = _learning_rate
return paddle.optimizer.lr.PiecewiseDecay(values=values, **args)
elif decay_type == 'stepdecay':
lr = _learning_rate
return paddle.optimizer.lr.StepDecay(lr, **args)
else:
raise RuntimeError('Only poly, piecewise and stepdecay decay types are supported.')
@property
def optimizer(self) -> paddle.optimizer.Optimizer:
if 'lr_scheduler' in self.dic:
lr = self.lr_scheduler
else:
lr = self.learning_rate
args = self.optimizer_args
optimizer_type = args.pop('type')
if optimizer_type == 'sgd':
return paddle.optimizer.Momentum(
lr, parameters=self.model.parameters(), **args)
elif optimizer_type == 'adam':
return paddle.optimizer.Adam(
lr, parameters=self.model.parameters(), **args)
elif optimizer_type in paddle.optimizer.__all__:
return getattr(paddle.optimizer, optimizer_type)(
lr, parameters=self.model.parameters(), **args)
raise RuntimeError('Unknown optimizer type {}.'.format(optimizer_type))
@property
def optimizer_args(self) -> dict:
args = self.dic.get('optimizer', {}).copy()
if args['type'] == 'sgd':
args.setdefault('momentum', 0.9)
return args
@property
def decay_args(self) -> dict:
args = self.dic.get('learning_rate', {}).get('decay', {
'type': 'poly',
'power': 0.9
}).copy()
if args['type'] == 'poly':
args.setdefault('decay_steps', self.iters)
args.setdefault('end_lr', 0)
return args
@property
def loss(self) -> dict:
if self._losses is None:
self._losses = self._prepare_loss('loss')
return self._losses
@property
def distill_loss(self) -> dict:
if not hasattr(self, '_distill_losses'):
self._distill_losses = self._prepare_loss('distill_loss')
return self._distill_losses
def _prepare_loss(self, loss_name):
"""
Parse the loss parameters and load the loss layers.
Args:
loss_name (str): The root name of loss in the yaml file.
Returns:
dict: A dict including the loss parameters and layers.
"""
args = self.dic.get(loss_name, {}).copy()
if 'types' in args and 'coef' in args:
len_types = len(args['types'])
len_coef = len(args['coef'])
if len_types != len_coef:
if len_types == 1:
args['types'] = args['types'] * len_coef
else:
                    raise ValueError(
                        'In the loss config, the length of `types` should equal that of `coef`, or `types` should have length 1, but they are {} and {}.'
                        .format(len_types, len_coef))
        else:
            raise ValueError(
                'Loss config should contain the keys "types" and "coef".')
losses = dict()
for key, val in args.items():
if key == 'types':
losses['types'] = []
for item in args['types']:
if item['type'] != 'MixedLoss':
if 'ignore_index' in item:
assert item['ignore_index'] == self.train_dataset.ignore_index, 'If ignore_index of loss is set, '\
'the ignore_index of loss and train_dataset must be the same. \nCurrently, loss ignore_index = {}, '\
'train_dataset ignore_index = {}. \nIt is recommended not to set loss ignore_index, so it is consistent with '\
'train_dataset by default.'.format(item['ignore_index'], self.train_dataset.ignore_index)
item['ignore_index'] = \
self.train_dataset.ignore_index
losses['types'].append(self._load_object(item))
else:
losses[key] = val
if len(losses['coef']) != len(losses['types']):
            raise RuntimeError(
                'The lengths of `coef` and `types` in the loss config should be equal: {} != {}.'
                .format(len(losses['coef']), len(losses['types'])))
return losses
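        # Illustrative config snippet (values are examples): `types` and
        # `coef` must have the same length, or `types` may hold a single
        # entry that is broadcast to len(coef):
        #
        #     loss:
        #       types:
        #         - type: CrossEntropyLoss
        #       coef: [1, 0.4]   # expands `types` to two CrossEntropyLoss entries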
@property
def model(self) -> paddle.nn.Layer:
        model_cfg = self.dic.get('model', {}).copy()
        if not model_cfg:
            raise RuntimeError('No model specified in the configuration file.')
        if 'num_classes' not in model_cfg:
num_classes = None
if self.train_dataset_config:
if hasattr(self.train_dataset_class, 'NUM_CLASSES'):
num_classes = self.train_dataset_class.NUM_CLASSES
elif hasattr(self.train_dataset, 'num_classes'):
num_classes = self.train_dataset.num_classes
elif self.val_dataset_config:
if hasattr(self.val_dataset_class, 'NUM_CLASSES'):
num_classes = self.val_dataset_class.NUM_CLASSES
elif hasattr(self.val_dataset, 'num_classes'):
num_classes = self.val_dataset.num_classes
if num_classes is not None:
model_cfg['num_classes'] = num_classes
if not self._model:
self._model = self._load_object(model_cfg)
return self._model
@property
def train_dataset_config(self) -> Dict:
return self.dic.get('train_dataset', {}).copy()
@property
def val_dataset_config(self) -> Dict:
return self.dic.get('val_dataset', {}).copy()
@property
def train_dataset_class(self) -> Generic:
dataset_type = self.train_dataset_config['type']
return self._load_component(dataset_type)
@property
def val_dataset_class(self) -> Generic:
dataset_type = self.val_dataset_config['type']
return self._load_component(dataset_type)
@property
def train_dataset(self) -> paddle.io.Dataset:
_train_dataset = self.train_dataset_config
if not _train_dataset:
return None
return self._load_object(_train_dataset)
@property
def val_dataset(self) -> paddle.io.Dataset:
_val_dataset = self.val_dataset_config
if not _val_dataset:
return None
return self._load_object(_val_dataset)
def _load_component(self, com_name: str) -> Any:
com_list = [
manager.MODELS, manager.BACKBONES, manager.DATASETS,
manager.TRANSFORMS, manager.LOSSES
]
for com in com_list:
if com_name in com.components_dict:
return com[com_name]
else:
            raise RuntimeError(
                'The specified component {} was not found.'.format(com_name))
def _load_object(self, cfg: dict) -> Any:
cfg = cfg.copy()
if 'type' not in cfg:
raise RuntimeError('No object information in {}.'.format(cfg))
component = self._load_component(cfg.pop('type'))
params = {}
for key, val in cfg.items():
if self._is_meta_type(val):
params[key] = self._load_object(val)
elif isinstance(val, list):
params[key] = [
self._load_object(item)
if self._is_meta_type(item) else item for item in val
]
else:
params[key] = val
return component(**params)
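        # Illustrative sketch: _load_object() recursively instantiates every
        # dict that carries a `type` key, so nested components resolve
        # naturally, e.g.
        #
        #     model:
        #       type: FCN
        #       backbone:
        #         type: HRNet_W18   # itself a registered component
        #
        # is loaded as FCN(backbone=HRNet_W18(), ...).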
@property
def test_config(self) -> Dict:
return self.dic.get('test_config', {})
@property
def export_config(self) -> Dict:
return self.dic.get('export', {})
@property
def to_static_training(self) -> bool:
'''Whether to use @to_static for training'''
return self.dic.get('to_static_training', False)
def _is_meta_type(self, item: Any) -> bool:
return isinstance(item, dict) and 'type' in item
def __str__(self) -> str:
return yaml.dump(self.dic)

@@ -0,0 +1,149 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
from collections.abc import Sequence
import warnings
class ComponentManager:
"""
Implement a manager class to add the new component properly.
The component can be added as either class or function type.
Args:
name (str): The name of component.
Returns:
A callable object of ComponentManager.
Examples 1:
from paddlers.models.ppseg.cvlibs.manager import ComponentManager
model_manager = ComponentManager()
class AlexNet: ...
class ResNet: ...
model_manager.add_component(AlexNet)
model_manager.add_component(ResNet)
        # Or pass a sequence of components at once:
model_manager.add_component([AlexNet, ResNet])
print(model_manager.components_dict)
# {'AlexNet': <class '__main__.AlexNet'>, 'ResNet': <class '__main__.ResNet'>}
Examples 2:
        # Or, an easier way: use it as a Python decorator by adding it above the class declaration.
from paddlers.models.ppseg.cvlibs.manager import ComponentManager
model_manager = ComponentManager()
@model_manager.add_component
class AlexNet: ...
@model_manager.add_component
class ResNet: ...
print(model_manager.components_dict)
# {'AlexNet': <class '__main__.AlexNet'>, 'ResNet': <class '__main__.ResNet'>}
"""
def __init__(self, name=None):
self._components_dict = dict()
self._name = name
def __len__(self):
return len(self._components_dict)
def __repr__(self):
name_str = self._name if self._name else self.__class__.__name__
return "{}:{}".format(name_str, list(self._components_dict.keys()))
def __getitem__(self, item):
if item not in self._components_dict.keys():
            raise KeyError("{} does not exist in available {}".format(
                item, self))
return self._components_dict[item]
@property
def components_dict(self):
return self._components_dict
@property
def name(self):
return self._name
def _add_single_component(self, component):
"""
Add a single component into the corresponding manager.
Args:
component (function|class): A new component.
Raises:
TypeError: When `component` is neither class nor function.
KeyError: When `component` was added already.
"""
# Currently only support class or function type
if not (inspect.isclass(component) or inspect.isfunction(component)):
raise TypeError(
"Expect class/function type, but received {}".format(
type(component)))
# Obtain the internal name of the component
component_name = component.__name__
# Check whether the component was added already
if component_name in self._components_dict.keys():
            warnings.warn(
                "{} already exists! It is now overridden by {}.".format(
                    component_name, component))
self._components_dict[component_name] = component
else:
# Take the internal name of the component as its key
self._components_dict[component_name] = component
def add_component(self, components):
"""
Add component(s) into the corresponding manager.
Args:
components (function|class|list|tuple): Support four types of components.
Returns:
components (function|class|list|tuple): Same with input components.
"""
# Check whether the type is a sequence
if isinstance(components, Sequence):
for component in components:
self._add_single_component(component)
else:
component = components
self._add_single_component(component)
return components
MODELS = ComponentManager("models")
BACKBONES = ComponentManager("backbones")
DATASETS = ComponentManager("datasets")
TRANSFORMS = ComponentManager("transforms")
LOSSES = ComponentManager("losses")
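# Minimal usage sketch (hypothetical component name): registering a class with
# one of these global managers makes it resolvable from YAML configs by name.
#
#     from paddlers.models.ppseg.cvlibs import manager
#
#     @manager.DATASETS.add_component
#     class MyRemoteSensingDataset: ...
#
#     cls = manager.DATASETS['MyRemoteSensingDataset']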

@@ -0,0 +1,120 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.nn as nn
def constant_init(param, **kwargs):
"""
Initialize the `param` with constants.
Args:
param (Tensor): Tensor that needs to be initialized.
Examples:
from paddlers.models.ppseg.cvlibs import param_init
import paddle.nn as nn
linear = nn.Linear(2, 4)
param_init.constant_init(linear.weight, value=2.0)
print(linear.weight.numpy())
# result is [[2. 2. 2. 2.], [2. 2. 2. 2.]]
"""
initializer = nn.initializer.Constant(**kwargs)
initializer(param, param.block)
def normal_init(param, **kwargs):
"""
Initialize the `param` with a Normal distribution.
Args:
param (Tensor): Tensor that needs to be initialized.
Examples:
from paddlers.models.ppseg.cvlibs import param_init
import paddle.nn as nn
linear = nn.Linear(2, 4)
param_init.normal_init(linear.weight, loc=0.0, scale=1.0)
"""
initializer = nn.initializer.Normal(**kwargs)
initializer(param, param.block)
def kaiming_normal_init(param, **kwargs):
r"""
Initialize the input tensor with Kaiming Normal initialization.
This function implements the `param` initialization from the paper
`Delving Deep into Rectifiers: Surpassing Human-Level Performance on
ImageNet Classification <https://arxiv.org/abs/1502.01852>`
by Kaiming He, Xiangyu Zhang, Shaoqing Ren and Jian Sun. This is a
robust initialization method that particularly considers the rectifier
nonlinearities. In case of Uniform distribution, the range is [-x, x], where
.. math::
x = \sqrt{\\frac{6.0}{fan\_in}}
In case of Normal distribution, the mean is 0 and the standard deviation
is
.. math::
\sqrt{\\frac{2.0}{fan\_in}}
Args:
param (Tensor): Tensor that needs to be initialized.
Examples:
from paddlers.models.ppseg.cvlibs import param_init
import paddle.nn as nn
linear = nn.Linear(2, 4)
# uniform is used to decide whether to use uniform or normal distribution
param_init.kaiming_normal_init(linear.weight)
"""
initializer = nn.initializer.KaimingNormal(**kwargs)
initializer(param, param.block)
def kaiming_uniform(param, **kwargs):
r"""Implements the Kaiming Uniform initializer
This class implements the weight initialization from the paper
`Delving Deep into Rectifiers: Surpassing Human-Level Performance on
ImageNet Classification <https://arxiv.org/abs/1502.01852>`_
by Kaiming He, Xiangyu Zhang, Shaoqing Ren and Jian Sun. This is a
robust initialization method that particularly considers the rectifier
nonlinearities.
    In the case of a Uniform distribution, the range is [-x, x], where
    .. math::
        x = \sqrt{\frac{6.0}{fan\_in}}
Args:
param (Tensor): Tensor that needs to be initialized.
Examples:
from paddlers.models.ppseg.cvlibs import param_init
import paddle.nn as nn
linear = nn.Linear(2, 4)
param_init.kaiming_uniform(linear.weight)
"""
initializer = nn.initializer.KaimingUniform(**kwargs)
initializer(param, param.block)
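# Minimal usage sketch (illustrative, not part of the module): these helpers
# are typically applied over a model's sublayers, e.g.
#
#     import paddle.nn as nn
#     for layer in model.sublayers():
#         if isinstance(layer, nn.Conv2D):
#             kaiming_normal_init(layer.weight)
#         elif isinstance(layer, nn.BatchNorm2D):
#             constant_init(layer.weight, value=1.0)
#             constant_init(layer.bias, value=0.0)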

@@ -0,0 +1,29 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .dataset import Dataset
from .cityscapes import Cityscapes
from .voc import PascalVOC
from .ade import ADE20K
from .optic_disc_seg import OpticDiscSeg
from .pascal_context import PascalContext
from .mini_deep_globe_road_extraction import MiniDeepGlobeRoadExtraction
from .eg1800 import EG1800
from .supervisely import SUPERVISELY
from .cocostuff import CocoStuff
from .stare import STARE
from .drive import DRIVE
from .hrf import HRF
from .chase_db1 import CHASEDB1
from .pp_humanseg14k import PPHumanSeg14K

@@ -0,0 +1,111 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
from PIL import Image
from paddlers.models.ppseg.datasets import Dataset
from paddlers.models.ppseg.utils.download import download_file_and_uncompress
from paddlers.models.ppseg.utils import seg_env
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.transforms import Compose
import paddlers.models.ppseg.transforms.functional as F
URL = "http://data.csail.mit.edu/places/ADEchallenge/ADEChallengeData2016.zip"
@manager.DATASETS.add_component
class ADE20K(Dataset):
"""
ADE20K dataset `http://sceneparsing.csail.mit.edu/`.
Args:
transforms (list): A list of image transformations.
        dataset_root (str, optional): The ADE20K dataset directory. Default: None.
mode (str, optional): A subset of the entire dataset. It should be one of ('train', 'val'). Default: 'train'.
edge (bool, optional): Whether to compute edge while training. Default: False
"""
NUM_CLASSES = 150
def __init__(self, transforms, dataset_root=None, mode='train', edge=False):
self.dataset_root = dataset_root
self.transforms = Compose(transforms)
mode = mode.lower()
self.mode = mode
self.file_list = list()
self.num_classes = self.NUM_CLASSES
self.ignore_index = 255
self.edge = edge
if mode not in ['train', 'val']:
raise ValueError(
"`mode` should be one of ('train', 'val') in ADE20K dataset, but got {}."
.format(mode))
if self.transforms is None:
raise ValueError("`transforms` is necessary, but it is None.")
if self.dataset_root is None:
self.dataset_root = download_file_and_uncompress(
url=URL,
savepath=seg_env.DATA_HOME,
extrapath=seg_env.DATA_HOME,
extraname='ADEChallengeData2016')
elif not os.path.exists(self.dataset_root):
self.dataset_root = os.path.normpath(self.dataset_root)
savepath, extraname = self.dataset_root.rsplit(
sep=os.path.sep, maxsplit=1)
self.dataset_root = download_file_and_uncompress(
url=URL,
savepath=savepath,
extrapath=savepath,
extraname=extraname)
if mode == 'train':
img_dir = os.path.join(self.dataset_root, 'images/training')
label_dir = os.path.join(self.dataset_root, 'annotations/training')
elif mode == 'val':
img_dir = os.path.join(self.dataset_root, 'images/validation')
label_dir = os.path.join(self.dataset_root,
'annotations/validation')
img_files = os.listdir(img_dir)
label_files = [i.replace('.jpg', '.png') for i in img_files]
for i in range(len(img_files)):
img_path = os.path.join(img_dir, img_files[i])
label_path = os.path.join(label_dir, label_files[i])
self.file_list.append([img_path, label_path])
def __getitem__(self, idx):
image_path, label_path = self.file_list[idx]
if self.mode == 'val':
im, _ = self.transforms(im=image_path)
label = np.asarray(Image.open(label_path))
            # Class 0 ('unlabeled') is ignored: after subtracting 1 it wraps
            # around to 255, because the label dtype is uint8.
label = label - 1
label = label[np.newaxis, :, :]
return im, label
else:
im, label = self.transforms(im=image_path, label=label_path)
label = label - 1
            # Recover the ignore pixels added by the transforms
label[label == 254] = 255
if self.edge:
edge_mask = F.mask_to_binary_edge(
label, radius=2, num_classes=self.num_classes)
return im, label, edge_mask
else:
return im, label
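        # Note on the label shift above: with a uint8 label array, 0 - 1 wraps
        # around, e.g.
        #     np.uint8(0) - np.uint8(1)  ->  255
        # so the 'unlabeled' class 0 lands on ignore_index (255), while the
        # valid classes 1..150 shift to 0..149.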

@@ -0,0 +1,98 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from paddlers.models.ppseg.utils.download import download_file_and_uncompress
from paddlers.models.ppseg.utils import seg_env
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.transforms import Compose
from paddlers.models.ppseg.datasets import Dataset
URL = 'https://bj.bcebos.com/paddleseg/dataset/chase_db1/chase_db1.zip'
@manager.DATASETS.add_component
class CHASEDB1(Dataset):
"""
    CHASE_DB1 is a dataset for retinal vessel segmentation. It contains 28
    color retina images of size 999×960 pixels, collected from the left and
    right eyes of 14 school children. Each image is annotated by two
    independent human experts, and we use the labels from the first expert.
    (https://blogs.kingston.ac.uk/retinal/chasedb1/)
    Args:
        transforms (list): Transforms for image.
        dataset_root (str): The dataset directory. Default: None.
        edge (bool): Whether to extract edge information in the output. Default: False.
        mode (str, optional): Which part of the dataset to use. It is one of ('train', 'val', 'test'). Default: 'train'.
"""
NUM_CLASSES = 2
def __init__(self,
dataset_root=None,
transforms=None,
edge=False,
mode='train'):
self.dataset_root = dataset_root
self.transforms = Compose(transforms)
mode = mode.lower()
self.mode = mode
self.edge = edge
self.file_list = list()
self.num_classes = self.NUM_CLASSES
self.ignore_index = 255 # labels only have 1/0, thus ignore_index is not necessary
if mode not in ['train', 'val', 'test']:
raise ValueError(
"`mode` should be 'train', 'val' or 'test', but got {}.".format(
mode))
if self.transforms is None:
raise ValueError("`transforms` is necessary, but it is None.")
if self.dataset_root is None:
self.dataset_root = download_file_and_uncompress(
url=URL,
savepath=seg_env.DATA_HOME,
extrapath=seg_env.DATA_HOME)
elif not os.path.exists(self.dataset_root):
self.dataset_root = os.path.normpath(self.dataset_root)
savepath, extraname = self.dataset_root.rsplit(
sep=os.path.sep, maxsplit=1)
self.dataset_root = download_file_and_uncompress(
url=URL,
savepath=savepath,
extrapath=savepath,
extraname=extraname)
        if mode == 'train':
            file_path = os.path.join(self.dataset_root, 'train_list.txt')
        elif mode == 'val':
            file_path = os.path.join(self.dataset_root, 'val_list.txt')
        else:
            # 'test' mode would otherwise leave file_path undefined; the file
            # name here is an assumption mirroring the train/val lists.
            file_path = os.path.join(self.dataset_root, 'test_list.txt')
with open(file_path, 'r') as f:
for line in f:
items = line.strip().split()
if len(items) != 2:
if mode == 'train' or mode == 'val':
raise Exception(
"File list format incorrect! It should be"
" image_name label_name\\n")
image_path = os.path.join(self.dataset_root, items[0])
grt_path = None
else:
image_path = os.path.join(self.dataset_root, items[0])
grt_path = os.path.join(self.dataset_root, items[1])
self.file_list.append([image_path, grt_path])

@@ -0,0 +1,87 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import glob
from paddlers.models.ppseg.datasets import Dataset
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.transforms import Compose
@manager.DATASETS.add_component
class Cityscapes(Dataset):
"""
Cityscapes dataset `https://www.cityscapes-dataset.com/`.
    The folder structure is as follows:
cityscapes
|
|--leftImg8bit
| |--train
| |--val
| |--test
|
|--gtFine
| |--train
| |--val
| |--test
    Make sure there are *_labelTrainIds.png files in the gtFine directory. If not, please run convert_cityscapes.py in tools first.
Args:
transforms (list): Transforms for image.
dataset_root (str): Cityscapes dataset directory.
        mode (str, optional): Which part of the dataset to use. It is one of ('train', 'val', 'test'). Default: 'train'.
edge (bool, optional): Whether to compute edge while training. Default: False
"""
NUM_CLASSES = 19
def __init__(self, transforms, dataset_root, mode='train', edge=False):
self.dataset_root = dataset_root
self.transforms = Compose(transforms)
self.file_list = list()
mode = mode.lower()
self.mode = mode
self.num_classes = self.NUM_CLASSES
self.ignore_index = 255
self.edge = edge
if mode not in ['train', 'val', 'test']:
raise ValueError(
"mode should be 'train', 'val' or 'test', but got {}.".format(
mode))
if self.transforms is None:
raise ValueError("`transforms` is necessary, but it is None.")
        if self.dataset_root is None:
            raise ValueError("`dataset_root` is necessary, but it is None.")
        img_dir = os.path.join(self.dataset_root, 'leftImg8bit')
        label_dir = os.path.join(self.dataset_root, 'gtFine')
        if not os.path.isdir(self.dataset_root) or not os.path.isdir(
                img_dir) or not os.path.isdir(label_dir):
            raise ValueError(
                "The dataset is not found or the folder structure is nonconformant."
            )
label_files = sorted(
glob.glob(
os.path.join(label_dir, mode, '*',
'*_gtFine_labelTrainIds.png')))
img_files = sorted(
glob.glob(os.path.join(img_dir, mode, '*', '*_leftImg8bit.png')))
self.file_list = [[
img_path, label_path
] for img_path, label_path in zip(img_files, label_files)]
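        # Note: the pairing above relies on the two sorted glob results
        # aligning one-to-one; every *_leftImg8bit.png needs a previously
        # generated *_gtFine_labelTrainIds.png counterpart, e.g.
        #     leftImg8bit/train/aachen/aachen_000000_000019_leftImg8bit.png
        #     gtFine/train/aachen/aachen_000000_000019_gtFine_labelTrainIds.png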

@@ -0,0 +1,82 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import glob
from paddlers.models.ppseg.datasets import Dataset
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.transforms import Compose
@manager.DATASETS.add_component
class CocoStuff(Dataset):
"""
COCO-Stuff dataset `https://github.com/nightrome/cocostuff`.
    The folder structure is as follows:
cocostuff
|
|--images
| |--train2017
| |--val2017
|
|--annotations
| |--train2017
| |--val2017
Args:
transforms (list): Transforms for image.
        dataset_root (str): The COCO-Stuff dataset directory.
        mode (str, optional): Which part of the dataset to use. It is one of ('train', 'val'). Default: 'train'.
edge (bool, optional): Whether to compute edge while training. Default: False
"""
NUM_CLASSES = 171
def __init__(self, transforms, dataset_root, mode='train', edge=False):
self.dataset_root = dataset_root
self.transforms = Compose(transforms)
self.file_list = list()
mode = mode.lower()
self.mode = mode
self.num_classes = self.NUM_CLASSES
self.ignore_index = 255
self.edge = edge
if mode not in ['train', 'val']:
            raise ValueError(
                "mode should be 'train' or 'val', but got {}.".format(mode))
if self.transforms is None:
raise ValueError("`transforms` is necessary, but it is None.")
        if self.dataset_root is None:
            raise ValueError("`dataset_root` is necessary, but it is None.")
        img_dir = os.path.join(self.dataset_root, 'images')
        label_dir = os.path.join(self.dataset_root, 'annotations')
        if not os.path.isdir(self.dataset_root) or not os.path.isdir(
                img_dir) or not os.path.isdir(label_dir):
            raise ValueError(
                "The dataset is not found or the folder structure is nonconformant."
            )
label_files = sorted(
glob.glob(os.path.join(label_dir, mode + '2017', '*.png')))
img_files = sorted(
glob.glob(os.path.join(img_dir, mode + '2017', '*.jpg')))
self.file_list = [[
img_path, label_path
] for img_path, label_path in zip(img_files, label_files)]

@@ -0,0 +1,162 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import paddle
import numpy as np
from PIL import Image
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.transforms import Compose
import paddlers.models.ppseg.transforms.functional as F
@manager.DATASETS.add_component
class Dataset(paddle.io.Dataset):
"""
Pass in a custom dataset that conforms to the format.
Args:
transforms (list): Transforms for image.
dataset_root (str): The dataset directory.
num_classes (int): Number of classes.
        mode (str, optional): Which part of the dataset to use. It is one of ('train', 'val', 'test'). Default: 'train'.
        train_path (str, optional): The train dataset file. When mode is 'train', train_path is necessary.
            The contents of the train_path file are as follows:
                image1.jpg ground_truth1.png
                image2.jpg ground_truth2.png
        val_path (str, optional): The evaluation dataset file. When mode is 'val', val_path is necessary.
            Its contents are in the same format as train_path.
        test_path (str, optional): The test dataset file. When mode is 'test', test_path is necessary.
            The annotation paths are optional in the test_path file.
        separator (str, optional): The separator of the dataset list. Default: ' '.
        edge (bool, optional): Whether to compute edge while training. Default: False.
Examples:
import paddlers.models.ppseg.transforms as T
from paddlers.models.ppseg.datasets import Dataset
transforms = [T.RandomPaddingCrop(crop_size=(512,512)), T.Normalize()]
dataset_root = 'dataset_root_path'
train_path = 'train_path'
num_classes = 2
dataset = Dataset(transforms = transforms,
dataset_root = dataset_root,
num_classes = 2,
train_path = train_path,
mode = 'train')
"""
def __init__(self,
transforms,
dataset_root,
num_classes,
mode='train',
train_path=None,
val_path=None,
test_path=None,
separator=' ',
ignore_index=255,
edge=False):
self.dataset_root = dataset_root
self.transforms = Compose(transforms)
self.file_list = list()
self.mode = mode.lower()
self.num_classes = num_classes
self.ignore_index = ignore_index
self.edge = edge
if self.mode not in ['train', 'val', 'test']:
raise ValueError(
"mode should be 'train', 'val' or 'test', but got {}.".format(
self.mode))
if self.transforms is None:
raise ValueError("`transforms` is necessary, but it is None.")
if not os.path.exists(self.dataset_root):
            raise FileNotFoundError(
                '`dataset_root` does not exist: {}.'.format(self.dataset_root))
if self.mode == 'train':
if train_path is None:
raise ValueError(
'When `mode` is "train", `train_path` is necessary, but it is None.'
)
elif not os.path.exists(train_path):
raise FileNotFoundError(
'`train_path` is not found: {}'.format(train_path))
else:
file_path = train_path
elif self.mode == 'val':
if val_path is None:
raise ValueError(
'When `mode` is "val", `val_path` is necessary, but it is None.'
)
elif not os.path.exists(val_path):
raise FileNotFoundError(
'`val_path` is not found: {}'.format(val_path))
else:
file_path = val_path
else:
if test_path is None:
raise ValueError(
'When `mode` is "test", `test_path` is necessary, but it is None.'
)
elif not os.path.exists(test_path):
raise FileNotFoundError(
'`test_path` is not found: {}'.format(test_path))
else:
file_path = test_path
with open(file_path, 'r') as f:
for line in f:
items = line.strip().split(separator)
if len(items) != 2:
if self.mode == 'train' or self.mode == 'val':
raise ValueError(
"File list format incorrect! In training or evaluation task it should be"
" image_name{}label_name\\n".format(separator))
image_path = os.path.join(self.dataset_root, items[0])
label_path = None
else:
image_path = os.path.join(self.dataset_root, items[0])
label_path = os.path.join(self.dataset_root, items[1])
self.file_list.append([image_path, label_path])
def __getitem__(self, idx):
image_path, label_path = self.file_list[idx]
if self.mode == 'test':
im, _ = self.transforms(im=image_path)
im = im[np.newaxis, ...]
return im, image_path
elif self.mode == 'val':
im, _ = self.transforms(im=image_path)
label = np.asarray(Image.open(label_path))
label = label[np.newaxis, :, :]
return im, label
else:
im, label = self.transforms(im=image_path, label=label_path)
if self.edge:
edge_mask = F.mask_to_binary_edge(
label, radius=2, num_classes=self.num_classes)
return im, label, edge_mask
else:
return im, label
def __len__(self):
return len(self.file_list)
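# Illustrative list-file content (paths are relative to dataset_root; the
# separator defaults to a single space):
#
#     JPEGImages/0001.jpg Annotations/0001.png
#     JPEGImages/0002.jpg Annotations/0002.png
#
# In 'test' mode a line may contain only the image path; the label path then
# defaults to None.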

@@ -0,0 +1,96 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from paddlers.models.ppseg.utils.download import download_file_and_uncompress
from paddlers.models.ppseg.utils import seg_env
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.transforms import Compose
from paddlers.models.ppseg.datasets import Dataset
URL = 'https://bj.bcebos.com/paddleseg/dataset/drive/drive.zip'
@manager.DATASETS.add_component
class DRIVE(Dataset):
"""
    The Digital Retinal Images for Vessel Extraction (DRIVE) dataset is a dataset for retinal vessel segmentation.
    It consists of 40 JPEG color fundus images of size (584, 565), including 7 cases with abnormal pathology.
    (http://www.isi.uu.nl/Research/Databases/DRIVE/)
    Args:
        transforms (list): Transforms for image.
        dataset_root (str): The dataset directory. Default: None.
        edge (bool): Whether to extract edge information in the output. Default: False.
        mode (str, optional): Which part of the dataset to use. It is one of ('train', 'val', 'test'). Default: 'train'.
"""
NUM_CLASSES = 2
def __init__(self,
dataset_root=None,
transforms=None,
edge=False,
mode='train'):
self.dataset_root = dataset_root
self.transforms = Compose(transforms)
mode = mode.lower()
self.mode = mode
self.edge = edge
self.file_list = list()
self.num_classes = self.NUM_CLASSES
self.ignore_index = 255 # labels only have 1/0, thus ignore_index is not necessary
if mode not in ['train', 'val', 'test']:
raise ValueError(
"`mode` should be 'train', 'val' or 'test', but got {}.".format(
mode))
if self.transforms is None:
raise ValueError("`transforms` is necessary, but it is None.")
if self.dataset_root is None:
self.dataset_root = download_file_and_uncompress(
url=URL,
savepath=seg_env.DATA_HOME,
extrapath=seg_env.DATA_HOME)
elif not os.path.exists(self.dataset_root):
self.dataset_root = os.path.normpath(self.dataset_root)
savepath, extraname = self.dataset_root.rsplit(
sep=os.path.sep, maxsplit=1)
self.dataset_root = download_file_and_uncompress(
url=URL,
savepath=savepath,
extrapath=savepath,
extraname=extraname)
        if mode == 'train':
            file_path = os.path.join(self.dataset_root, 'train_list.txt')
        elif mode == 'val':
            file_path = os.path.join(self.dataset_root, 'val_list.txt')
        else:
            # 'test' mode would otherwise leave file_path undefined; the file
            # name here is an assumption mirroring the train/val lists.
            file_path = os.path.join(self.dataset_root, 'test_list.txt')
with open(file_path, 'r') as f:
for line in f:
items = line.strip().split()
if len(items) != 2:
if mode == 'train' or mode == 'val':
raise Exception(
"File list format incorrect! It should be"
" image_name label_name\\n")
image_path = os.path.join(self.dataset_root, items[0])
grt_path = None
else:
image_path = os.path.join(self.dataset_root, items[0])
grt_path = os.path.join(self.dataset_root, items[1])
self.file_list.append([image_path, grt_path])

@@ -0,0 +1,136 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import copy
import cv2
import numpy as np
from paddlers.models.ppseg.datasets import Dataset
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.transforms import Compose
from paddlers.models.ppseg.utils.download import download_file_and_uncompress
from paddlers.models.ppseg.utils import seg_env
import paddlers.models.ppseg.transforms.functional as F
URL = "https://paddleseg.bj.bcebos.com/dataset/EG1800.zip"
@manager.DATASETS.add_component
class EG1800(Dataset):
"""
EG1800 dataset `http://xiaoyongshen.me/webpage_portrait/index.html`.
Args:
common_transforms (list): A list of common image transformations for two inputs of portrait net.
transforms1 (list): A list of image transformations for the first input of portrait net.
transforms2 (list): A list of image transformations for the second input of portrait net.
dataset_root (str, optional): The EG1800 dataset directory. Default: None.
mode (str, optional): A subset of the entire dataset. It should be one of ('train', 'val'). Default: 'train'.
edge (bool, optional): Whether to compute edge while training. Default: False
"""
NUM_CLASSES = 2
def __init__(self,
common_transforms,
transforms1,
transforms2,
dataset_root=None,
mode='train',
edge=False):
self.dataset_root = dataset_root
self.common_transforms = Compose(common_transforms)
self.transforms = self.common_transforms
if transforms1 is not None:
self.transforms1 = Compose(transforms1, to_rgb=False)
if transforms2 is not None:
self.transforms2 = Compose(transforms2, to_rgb=False)
mode = mode.lower()
self.ignore_index = 255
self.mode = mode
self.num_classes = self.NUM_CLASSES
self.input_width = 224
self.input_height = 224
if self.dataset_root is None:
self.dataset_root = download_file_and_uncompress(
url=URL,
savepath=seg_env.DATA_HOME,
extrapath=seg_env.DATA_HOME)
elif not os.path.exists(self.dataset_root):
self.dataset_root = os.path.normpath(self.dataset_root)
savepath, extraname = self.dataset_root.rsplit(
sep=os.path.sep, maxsplit=1)
self.dataset_root = download_file_and_uncompress(
url=URL,
savepath=savepath,
extrapath=savepath,
extraname=extraname)
        # `self.dataset_root` is used below because it may have been updated
        # by the download above.
        if mode == 'train':
            path = os.path.join(self.dataset_root, 'eg1800_train.txt')
        else:
            path = os.path.join(self.dataset_root, 'eg1800_test.txt')
        with open(path, 'r') as f:
            files = f.readlines()
        img_files = [
            os.path.join(self.dataset_root, 'Images', file).strip()
            for file in files
        ]
        label_files = [
            os.path.join(self.dataset_root, 'Labels', file).strip()
            for file in files
        ]
        self.file_list = [[
            img_path, label_path
        ] for img_path, label_path in zip(img_files, label_files)]
def __getitem__(self, item):
image_path, label_path = self.file_list[item]
im = cv2.imread(image_path)
label = cv2.imread(label_path, 0)
label[label > 1] = 0
if self.mode == "val":
common_im, label = self.common_transforms(im=im, label=label)
im = np.float32(common_im[::-1, :, :]) # RGB => BGR
im_aug = copy.deepcopy(im)
else:
common_im, label = self.common_transforms(im=im, label=label)
common_im = np.transpose(common_im, [1, 2, 0])
# add augmentation
im, _ = self.transforms1(common_im)
im_aug, _ = self.transforms2(common_im)
im = np.float32(im[::-1, :, :]) # RGB => BGR
im_aug = np.float32(im_aug[::-1, :, :]) # RGB => BGR
label = cv2.resize(
np.uint8(label), (self.input_width, self.input_height),
interpolation=cv2.INTER_NEAREST)
# add mask blur
label = np.uint8(cv2.blur(label, (5, 5)))
label[label >= 0.5] = 1
label[label < 0.5] = 0
edge_mask = F.mask_to_binary_edge(
label, radius=4, num_classes=self.num_classes)
edge_mask = np.transpose(edge_mask, [1, 2, 0]).squeeze(axis=-1)
im = np.concatenate([im_aug, im])
if self.mode == "train":
return im, label, edge_mask
else:
return im, label
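        # Note: both branches stack the two views along the channel axis
        # (identical copies in 'val' mode, differently augmented views in
        # 'train' mode), so the returned image has 2 x 3 = 6 channels, as
        # expected by the two-branch inputs of portrait net.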

@@ -0,0 +1,95 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from paddlers.models.ppseg.utils.download import download_file_and_uncompress
from paddlers.models.ppseg.utils import seg_env
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.transforms import Compose
from paddlers.models.ppseg.datasets import Dataset
URL = 'https://bj.bcebos.com/paddleseg/dataset/hrf/hrf.zip'
@manager.DATASETS.add_component
class HRF(Dataset):
"""
    The HRF dataset is a dataset for retinal vessel segmentation. It comprises 45 images organized as 15 subsets;
    each subset contains one healthy fundus image, one image of a patient with diabetic retinopathy, and one glaucoma image.
    The image size is 3,304 x 2,336 pixels, with a training/testing split of 21/24 images.
    (https://doi.org/10.1155/2013/154860)
    Args:
        transforms (list): Transforms for image.
        dataset_root (str): The dataset directory. Default: None.
        edge (bool): Whether to extract edge information in the output. Default: False.
        mode (str, optional): Which part of the dataset to use. It is one of ('train', 'val', 'test'). Default: 'train'.
"""
NUM_CLASSES = 2
def __init__(self,
dataset_root=None,
transforms=None,
edge=False,
mode='train'):
self.dataset_root = dataset_root
self.transforms = Compose(transforms)
mode = mode.lower()
self.mode = mode
self.edge = edge
self.file_list = list()
self.num_classes = self.NUM_CLASSES
self.ignore_index = 255
if mode not in ['train', 'val', 'test']:
raise ValueError(
"`mode` should be 'train', 'val' or 'test', but got {}.".format(
mode))
if self.transforms is None:
raise ValueError("`transforms` is necessary, but it is None.")
if self.dataset_root is None:
self.dataset_root = download_file_and_uncompress(
url=URL,
savepath=seg_env.DATA_HOME,
extrapath=seg_env.DATA_HOME)
elif not os.path.exists(self.dataset_root):
self.dataset_root = os.path.normpath(self.dataset_root)
savepath, extraname = self.dataset_root.rsplit(
sep=os.path.sep, maxsplit=1)
self.dataset_root = download_file_and_uncompress(
url=URL,
savepath=savepath,
extrapath=savepath,
extraname=extraname)
        if mode == 'train':
            file_path = os.path.join(self.dataset_root, 'train_list.txt')
        elif mode == 'val':
            file_path = os.path.join(self.dataset_root, 'val_list.txt')
        else:
            # 'test' mode would otherwise leave file_path undefined; the file
            # name here is an assumption mirroring the train/val lists.
            file_path = os.path.join(self.dataset_root, 'test_list.txt')
with open(file_path, 'r') as f:
for line in f:
items = line.strip().split()
if len(items) != 2:
if mode == 'train' or mode == 'val':
raise Exception(
"File list format incorrect! It should be"
" image_name label_name\\n")
image_path = os.path.join(self.dataset_root, items[0])
grt_path = None
else:
image_path = os.path.join(self.dataset_root, items[0])
grt_path = os.path.join(self.dataset_root, items[1])
self.file_list.append([image_path, grt_path])

@@ -0,0 +1,95 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from .dataset import Dataset
from paddlers.models.ppseg.utils.download import download_file_and_uncompress
from paddlers.models.ppseg.utils import seg_env
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.transforms import Compose
URL = "https://paddleseg.bj.bcebos.com/dataset/MiniDeepGlobeRoadExtraction.zip"
@manager.DATASETS.add_component
class MiniDeepGlobeRoadExtraction(Dataset):
"""
    MiniDeepGlobeRoadExtraction dataset is extracted from the DeepGlobe CVPR 2018 challenge (http://deepglobe.org/).
There are 800 images in the training set and 200 images in the validation set.
Args:
dataset_root (str, optional): The dataset directory. Default: None.
transforms (list, optional): Transforms for image. Default: None.
mode (str, optional): Which part of dataset to use. It is one of ('train', 'val'). Default: 'train'.
edge (bool, optional): Whether to compute edge while training. Default: False.
"""
NUM_CLASSES = 2
def __init__(self,
dataset_root=None,
transforms=None,
mode='train',
edge=False):
self.dataset_root = dataset_root
self.transforms = Compose(transforms)
mode = mode.lower()
self.mode = mode
self.file_list = list()
self.num_classes = self.NUM_CLASSES
self.ignore_index = 255
self.edge = edge
if mode not in ['train', 'val']:
raise ValueError(
"`mode` should be 'train' or 'val', but got {}.".format(mode))
if self.transforms is None:
raise ValueError("`transforms` is necessary, but it is None.")
if self.dataset_root is None:
self.dataset_root = download_file_and_uncompress(
url=URL,
savepath=seg_env.DATA_HOME,
extrapath=seg_env.DATA_HOME)
elif not os.path.exists(self.dataset_root):
self.dataset_root = os.path.normpath(self.dataset_root)
savepath, extraname = self.dataset_root.rsplit(
sep=os.path.sep, maxsplit=1)
self.dataset_root = download_file_and_uncompress(
url=URL,
savepath=savepath,
extrapath=savepath,
extraname=extraname)
if mode == 'train':
file_path = os.path.join(self.dataset_root, 'train.txt')
else:
file_path = os.path.join(self.dataset_root, 'val.txt')
with open(file_path, 'r') as f:
for line in f:
items = line.strip().split('|')
if len(items) != 2:
if mode == 'train' or mode == 'val':
raise Exception(
"File list format incorrect! It should be"
" image_name|label_name\\n")
image_path = os.path.join(self.dataset_root, items[0])
grt_path = None
else:
image_path = os.path.join(self.dataset_root, items[0])
grt_path = os.path.join(self.dataset_root, items[1])
self.file_list.append([image_path, grt_path])

@@ -0,0 +1,97 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from .dataset import Dataset
from paddlers.models.ppseg.utils.download import download_file_and_uncompress
from paddlers.models.ppseg.utils import seg_env
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.transforms import Compose
URL = "https://paddleseg.bj.bcebos.com/dataset/optic_disc_seg.zip"
@manager.DATASETS.add_component
class OpticDiscSeg(Dataset):
"""
    OpticDiscSeg dataset is extracted from iChallenge-AMD
    (https://ai.baidu.com/broad/subordinate?dataset=amd).
    Args:
        transforms (list): Transforms for image.
        dataset_root (str): The dataset directory. Default: None.
        mode (str, optional): Which part of the dataset to use. It is one of ('train', 'val', 'test'). Default: 'train'.
        edge (bool, optional): Whether to compute edge while training. Default: False.
"""
NUM_CLASSES = 2
def __init__(self,
dataset_root=None,
transforms=None,
mode='train',
edge=False):
self.dataset_root = dataset_root
self.transforms = Compose(transforms)
mode = mode.lower()
self.mode = mode
self.file_list = list()
self.num_classes = self.NUM_CLASSES
self.ignore_index = 255
self.edge = edge
if mode not in ['train', 'val', 'test']:
raise ValueError(
"`mode` should be 'train', 'val' or 'test', but got {}.".format(
mode))
if self.transforms is None:
raise ValueError("`transforms` is necessary, but it is None.")
if self.dataset_root is None:
self.dataset_root = download_file_and_uncompress(
url=URL,
savepath=seg_env.DATA_HOME,
extrapath=seg_env.DATA_HOME)
elif not os.path.exists(self.dataset_root):
self.dataset_root = os.path.normpath(self.dataset_root)
savepath, extraname = self.dataset_root.rsplit(
sep=os.path.sep, maxsplit=1)
self.dataset_root = download_file_and_uncompress(
url=URL,
savepath=savepath,
extrapath=savepath,
extraname=extraname)
if mode == 'train':
file_path = os.path.join(self.dataset_root, 'train_list.txt')
elif mode == 'val':
file_path = os.path.join(self.dataset_root, 'val_list.txt')
else:
file_path = os.path.join(self.dataset_root, 'test_list.txt')
with open(file_path, 'r') as f:
for line in f:
items = line.strip().split()
if len(items) != 2:
if mode == 'train' or mode == 'val':
raise Exception(
"File list format incorrect! It should be"
" image_name label_name\\n")
image_path = os.path.join(self.dataset_root, items[0])
grt_path = None
else:
image_path = os.path.join(self.dataset_root, items[0])
grt_path = os.path.join(self.dataset_root, items[1])
self.file_list.append([image_path, grt_path])

@@ -0,0 +1,82 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from PIL import Image
from paddlers.models.ppseg.datasets import Dataset
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.transforms import Compose
@manager.DATASETS.add_component
class PascalContext(Dataset):
"""
PascalVOC2010 dataset `http://host.robots.ox.ac.uk/pascal/VOC/`.
    If you want to use the Pascal Context dataset, please run convert_voc2010.py in tools first.
    Args:
        transforms (list): Transforms for image.
        dataset_root (str): The dataset directory. Default: None.
        mode (str): Which part of the dataset to use. It is one of ('train', 'trainval', 'val'). Default: 'train'.
        edge (bool, optional): Whether to compute edge while training. Default: False.
"""
NUM_CLASSES = 60
def __init__(self, transforms=None, dataset_root=None, mode='train', edge=False):
self.dataset_root = dataset_root
self.transforms = Compose(transforms)
mode = mode.lower()
self.mode = mode
self.file_list = list()
self.num_classes = self.NUM_CLASSES
self.ignore_index = 255
self.edge = edge
if mode not in ['train', 'trainval', 'val']:
raise ValueError(
"`mode` should be one of ('train', 'trainval', 'val') in PascalContext dataset, but got {}."
.format(mode))
if self.transforms is None:
raise ValueError("`transforms` is necessary, but it is None.")
        if self.dataset_root is None:
            raise ValueError("`dataset_root` is necessary, but it is None.")
image_set_dir = os.path.join(self.dataset_root, 'ImageSets',
'Segmentation')
if mode == 'train':
file_path = os.path.join(image_set_dir, 'train_context.txt')
elif mode == 'val':
file_path = os.path.join(image_set_dir, 'val_context.txt')
elif mode == 'trainval':
file_path = os.path.join(image_set_dir, 'trainval_context.txt')
if not os.path.exists(file_path):
            raise RuntimeError(
                "PASCAL-Context annotations are not ready. "
                "Please make sure voc_context.py has been properly run.")
img_dir = os.path.join(self.dataset_root, 'JPEGImages')
label_dir = os.path.join(self.dataset_root, 'Context')
with open(file_path, 'r') as f:
for line in f:
line = line.strip()
image_path = os.path.join(img_dir, ''.join([line, '.jpg']))
label_path = os.path.join(label_dir, ''.join([line, '.png']))
self.file_list.append([image_path, label_path])

@@ -0,0 +1,82 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from .dataset import Dataset
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.transforms import Compose
@manager.DATASETS.add_component
class PPHumanSeg14K(Dataset):
"""
This is the PP-HumanSeg14K Dataset.
This dataset was introduced in the work:
Chu, Lutao, et al. "PP-HumanSeg: Connectivity-Aware Portrait Segmentation with a Large-Scale Teleconferencing Video Dataset." Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision. 2022.
    This dataset is divided into a training set, a validation set and a test set. The training set includes 8770 images, the validation set includes 2431 images, and the test set includes 2482 images.
    Args:
        dataset_root (str, optional): The dataset directory. Default: None.
        transforms (list, optional): Transforms for image. Default: None.
        mode (str, optional): Which part of the dataset to use. It is one of ('train', 'val', 'test'). Default: 'train'.
        edge (bool, optional): Whether to compute edge while training. Default: False.
"""
NUM_CLASSES = 2
def __init__(self,
dataset_root=None,
transforms=None,
mode='train',
edge=False):
self.dataset_root = dataset_root
self.transforms = Compose(transforms)
mode = mode.lower()
self.mode = mode
self.file_list = list()
self.num_classes = self.NUM_CLASSES
self.ignore_index = 255
self.edge = edge
if mode not in ['train', 'val', 'test']:
raise ValueError(
"`mode` should be 'train', 'val' or 'test', but got {}.".format(
mode))
if self.transforms is None:
raise ValueError("`transforms` is necessary, but it is None.")
if mode == 'train':
file_path = os.path.join(self.dataset_root, 'train.txt')
elif mode == 'val':
file_path = os.path.join(self.dataset_root, 'val.txt')
else:
file_path = os.path.join(self.dataset_root, 'test.txt')
with open(file_path, 'r') as f:
for line in f:
items = line.strip().split(' ')
if len(items) != 2:
if mode == 'train' or mode == 'val':
raise Exception(
"File list format incorrect! It should be"
" image_name label_name\\n")
image_path = os.path.join(self.dataset_root, items[0])
grt_path = None
else:
image_path = os.path.join(self.dataset_root, items[0])
grt_path = os.path.join(self.dataset_root, items[1])
self.file_list.append([image_path, grt_path])

@@ -0,0 +1,95 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from paddlers.models.ppseg.utils.download import download_file_and_uncompress
from paddlers.models.ppseg.utils import seg_env
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.transforms import Compose
from paddlers.models.ppseg.datasets import Dataset
URL = 'https://bj.bcebos.com/paddleseg/dataset/stare/stare.zip'
@manager.DATASETS.add_component
class STARE(Dataset):
"""
    The STARE dataset is processed from the STARE (STructured Analysis of the Retina) project.
    (https://cecas.clemson.edu/~ahoover/stare/)
    Args:
        transforms (list): Transforms for image.
        dataset_root (str): The dataset directory. Default: None.
        edge (bool): Whether to extract edge information in the output. Default: False.
        mode (str, optional): Which part of the dataset to use. It is one of ('train', 'val', 'test'). Default: 'train'.
"""
NUM_CLASSES = 2
def __init__(self,
dataset_root=None,
transforms=None,
edge=False,
mode='train'):
self.dataset_root = dataset_root
self.transforms = Compose(transforms)
mode = mode.lower()
self.mode = mode
self.edge = edge
self.file_list = list()
self.num_classes = self.NUM_CLASSES
self.ignore_index = 255
if mode not in ['train', 'val', 'test']:
raise ValueError(
"`mode` should be 'train', 'val' or 'test', but got {}.".format(
mode))
if self.transforms is None:
raise ValueError("`transforms` is necessary, but it is None.")
if self.dataset_root is None:
self.dataset_root = download_file_and_uncompress(
url=URL,
savepath=seg_env.DATA_HOME,
extrapath=seg_env.DATA_HOME)
elif not os.path.exists(self.dataset_root):
self.dataset_root = os.path.normpath(self.dataset_root)
            savepath, extraname = self.dataset_root.rsplit(
                sep=os.path.sep, maxsplit=1)
self.dataset_root = download_file_and_uncompress(
url=URL,
savepath=savepath,
extrapath=savepath,
extraname=extraname)
        if mode == 'train':
            file_path = os.path.join(self.dataset_root, 'train_list.txt')
        elif mode == 'val':
            file_path = os.path.join(self.dataset_root, 'val_list.txt')
        else:
            # 'test' mode would otherwise leave file_path undefined; the file
            # name here is an assumption mirroring the train/val lists.
            file_path = os.path.join(self.dataset_root, 'test_list.txt')
with open(file_path, 'r') as f:
for line in f:
items = line.strip().split()
if len(items) != 2:
if mode == 'train' or mode == 'val':
raise Exception(
"File list format incorrect! It should be"
" image_name label_name\\n")
image_path = os.path.join(self.dataset_root, items[0])
grt_path = None
else:
image_path = os.path.join(self.dataset_root, items[0])
grt_path = os.path.join(self.dataset_root, items[1])
self.file_list.append([image_path, grt_path])

@@ -0,0 +1,135 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import copy
import cv2
import numpy as np
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.transforms import Compose
from paddlers.models.ppseg.datasets import Dataset
from paddlers.models.ppseg.utils.download import download_file_and_uncompress
from paddlers.models.ppseg.utils import seg_env
import paddlers.models.ppseg.transforms.functional as F
URL = "https://paddleseg.bj.bcebos.com/dataset/Supervisely_face.zip"
@manager.DATASETS.add_component
class SUPERVISELY(Dataset):
"""
Supervise.ly dataset `https://supervise.ly/`.
Args:
common_transforms (list): A list of common image transformations for two inputs of portrait net.
transforms1 (list): A list of image transformations for the first input of portrait net.
transforms2 (list): A list of image transformations for the second input of portrait net.
dataset_root (str, optional): The Supervise.ly dataset directory. Default: None.
mode (str, optional): A subset of the entire dataset. It should be one of ('train', 'val'). Default: 'train'.
edge (bool, optional): Whether to compute edge while training. Default: False
"""
NUM_CLASSES = 2
def __init__(self,
common_transforms,
transforms1,
transforms2,
dataset_root=None,
mode='train',
edge=False):
self.dataset_root = dataset_root
self.common_transforms = Compose(common_transforms)
self.transforms = self.common_transforms
if transforms1 is not None:
self.transforms1 = Compose(transforms1, to_rgb=False)
if transforms2 is not None:
self.transforms2 = Compose(transforms2, to_rgb=False)
        mode = mode.lower()
        if mode not in ['train', 'val']:
            raise ValueError(
                "`mode` should be 'train' or 'val', but got {}.".format(mode))
        self.ignore_index = 255
        self.mode = mode
self.num_classes = self.NUM_CLASSES
self.input_width = 224
self.input_height = 224
if self.dataset_root is None:
self.dataset_root = download_file_and_uncompress(
url=URL,
savepath=seg_env.DATA_HOME,
extrapath=seg_env.DATA_HOME)
elif not os.path.exists(self.dataset_root):
self.dataset_root = os.path.normpath(self.dataset_root)
savepath, extraname = self.dataset_root.rsplit(
sep=os.path.sep, maxsplit=1)
self.dataset_root = download_file_and_uncompress(
url=URL,
savepath=savepath,
extrapath=savepath,
extraname=extraname)
        if mode == 'train':
            path = os.path.join(self.dataset_root,
                                'supervisely_face_train_easy.txt')
        else:
            path = os.path.join(self.dataset_root,
                                'supervisely_face_test_easy.txt')
        with open(path, 'r') as f:
            files = f.readlines()
        files = ["/".join(file.split('/')[1:]) for file in files]
        # Use self.dataset_root here: the `dataset_root` argument stays None
        # when the dataset has just been downloaded.
        img_files = [
            os.path.join(self.dataset_root, file).strip() for file in files
        ]
        label_files = [
            os.path.join(self.dataset_root,
                         file.replace('/img/', '/ann/')).strip()
            for file in files
        ]
self.file_list = [[
img_path, label_path
] for img_path, label_path in zip(img_files, label_files)]
def __getitem__(self, item):
image_path, label_path = self.file_list[item]
im = cv2.imread(image_path)
label = cv2.imread(label_path, 0)
label[label > 0] = 1
if self.mode == "val":
common_im, label = self.common_transforms(im=im, label=label)
im = np.float32(common_im[::-1, :, :]) # RGB => BGR
im_aug = copy.deepcopy(im)
else:
common_im, label = self.common_transforms(im=im, label=label)
common_im = np.transpose(common_im, [1, 2, 0])
# add augmentation
im, _ = self.transforms1(common_im)
im_aug, _ = self.transforms2(common_im)
im = np.float32(im[::-1, :, :]) # RGB => BGR
im_aug = np.float32(im_aug[::-1, :, :]) # RGB => BGR
label = cv2.resize(
np.uint8(label), (self.input_width, self.input_height),
interpolation=cv2.INTER_NEAREST)
# add mask blur
label = np.uint8(cv2.blur(label, (5, 5)))
label[label >= 0.5] = 1
label[label < 0.5] = 0
edge_mask = F.mask_to_binary_edge(
label, radius=4, num_classes=self.num_classes)
edge_mask = np.transpose(edge_mask, [1, 2, 0]).squeeze(axis=-1)
im = np.concatenate([im_aug, im])
if self.mode == "train":
return im, label, edge_mask
else:
return im, label
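# Usage sketch (editor's addition; hedged -- the transform choices are
# illustrative and assumed from paddlers.models.ppseg.transforms):
if __name__ == '__main__':
    import paddlers.models.ppseg.transforms as T

    dataset = SUPERVISELY(
        common_transforms=[T.Resize(target_size=(224, 224)), T.Normalize()],
        transforms1=[T.Normalize()],
        transforms2=[T.Normalize()],
        mode='train')
    im, label, edge = dataset[0]
    # In train mode `im` stacks the two augmented views along the channel
    # axis (6 x 224 x 224), the two-branch input PortraitNet expects.
    print(im.shape, label.shape, edge.shape)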

@@ -0,0 +1,112 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from paddlers.models.ppseg.datasets import Dataset
from paddlers.models.ppseg.utils.download import download_file_and_uncompress
from paddlers.models.ppseg.utils import seg_env
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.transforms import Compose
URL = "http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar"
@manager.DATASETS.add_component
class PascalVOC(Dataset):
"""
PascalVOC2012 dataset `http://host.robots.ox.ac.uk/pascal/VOC/`.
If you want to augment the dataset, please run the voc_augment.py in tools.
Args:
        transforms (list): Transforms for image.
        dataset_root (str, optional): The dataset directory. Default: None.
        mode (str, optional): Which part of dataset to use. It is one of ('train', 'trainval', 'trainaug', 'val').
            If you want to set mode to 'trainaug', please make sure the dataset has been augmented. Default: 'train'.
        edge (bool, optional): Whether to compute edge while training. Default: False.
"""
NUM_CLASSES = 21
def __init__(self, transforms, dataset_root=None, mode='train', edge=False):
self.dataset_root = dataset_root
self.transforms = Compose(transforms)
mode = mode.lower()
self.mode = mode
self.file_list = list()
self.num_classes = self.NUM_CLASSES
self.ignore_index = 255
self.edge = edge
if mode not in ['train', 'trainval', 'trainaug', 'val']:
raise ValueError(
"`mode` should be one of ('train', 'trainval', 'trainaug', 'val') in PascalVOC dataset, but got {}."
.format(mode))
if self.transforms is None:
raise ValueError("`transforms` is necessary, but it is None.")
if self.dataset_root is None:
self.dataset_root = download_file_and_uncompress(
url=URL,
savepath=seg_env.DATA_HOME,
extrapath=seg_env.DATA_HOME,
extraname='VOCdevkit')
elif not os.path.exists(self.dataset_root):
self.dataset_root = os.path.normpath(self.dataset_root)
savepath, extraname = self.dataset_root.rsplit(
sep=os.path.sep, maxsplit=1)
self.dataset_root = download_file_and_uncompress(
url=URL,
savepath=savepath,
extrapath=savepath,
extraname=extraname)
image_set_dir = os.path.join(self.dataset_root, 'VOC2012', 'ImageSets',
'Segmentation')
if mode == 'train':
file_path = os.path.join(image_set_dir, 'train.txt')
elif mode == 'val':
file_path = os.path.join(image_set_dir, 'val.txt')
elif mode == 'trainval':
file_path = os.path.join(image_set_dir, 'trainval.txt')
elif mode == 'trainaug':
file_path = os.path.join(image_set_dir, 'train.txt')
file_path_aug = os.path.join(image_set_dir, 'aug.txt')
if not os.path.exists(file_path_aug):
                raise RuntimeError(
                    "When `mode` is 'trainaug', the Pascal VOC dataset should "
                    "be augmented first. Please make sure voc_augment.py has "
                    "been properly run when using this mode.")
img_dir = os.path.join(self.dataset_root, 'VOC2012', 'JPEGImages')
label_dir = os.path.join(self.dataset_root, 'VOC2012',
'SegmentationClass')
label_dir_aug = os.path.join(self.dataset_root, 'VOC2012',
'SegmentationClassAug')
with open(file_path, 'r') as f:
for line in f:
line = line.strip()
image_path = os.path.join(img_dir, ''.join([line, '.jpg']))
label_path = os.path.join(label_dir, ''.join([line, '.png']))
self.file_list.append([image_path, label_path])
if mode == 'trainaug':
with open(file_path_aug, 'r') as f:
for line in f:
line = line.strip()
image_path = os.path.join(img_dir, ''.join([line, '.jpg']))
label_path = os.path.join(label_dir_aug,
''.join([line, '.png']))
self.file_list.append([image_path, label_path])
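# Usage sketch (editor's addition; hedged -- the transforms are illustrative
# and assumed from paddlers.models.ppseg.transforms):
if __name__ == '__main__':
    import paddlers.models.ppseg.transforms as T

    # With dataset_root=None, VOCtrainval_11-May-2012.tar is fetched and
    # unpacked as VOCdevkit under seg_env.DATA_HOME.
    voc = PascalVOC(
        transforms=[T.RandomHorizontalFlip(), T.Normalize()], mode='train')
    print(len(voc.file_list))  # 1464 images in the official train split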

@@ -0,0 +1,57 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .backbones import *
from .losses import *
from .ann import *
from .bisenet import *
from .danet import *
from .deeplab import *
from .fast_scnn import *
from .fcn import *
from .gcnet import *
from .ocrnet import *
from .pspnet import *
from .gscnn import GSCNN
from .unet import UNet
from .hardnet import HarDNet
from .u2net import U2Net, U2Netp
from .attention_unet import AttentionUNet
from .unet_plusplus import UNetPlusPlus
from .unet_3plus import UNet3Plus
from .decoupled_segnet import DecoupledSegNet
from .emanet import *
from .isanet import *
from .dnlnet import *
from .setr import *
from .sfnet import *
from .pphumanseg_lite import *
from .mla_transformer import MLATransformer
from .portraitnet import PortraitNet
from .stdcseg import STDCSeg
from .segformer import SegFormer
from .pointrend import PointRend
from .ginet import GINet
from .segmenter import *
from .segnet import SegNet
from .encnet import ENCNet
from .hrnet_contrast import HRNetW48Contrast
from .espnet import ESPNetV2
from .dmnet import DMNet
from .espnetv1 import ESPNetV1
from .enet import ENet
from .bisenetv1 import BiseNetV1
from .fastfcn import FastFCN
from .pfpnnet import PFPNNet
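# Registry note (editor's addition): importing this package runs the
# @manager.MODELS.add_component decorators, so any model above can be resolved
# by class name at config-parsing time. A minimal sketch, assuming the
# components_dict accessor of PaddleSeg's ComponentManager:
#
#     from paddlers.models.ppseg.cvlibs import manager
#     import paddlers.models.ppseg.models  # populates manager.MODELS
#     net = manager.MODELS.components_dict['UNet'](num_classes=2)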

@@ -0,0 +1,434 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.models import layers
from paddlers.models.ppseg.utils import utils
@manager.MODELS.add_component
class ANN(nn.Layer):
"""
The ANN implementation based on PaddlePaddle.
The original article refers to
    Zhen Zhu, et al. "Asymmetric Non-local Neural Networks for Semantic Segmentation"
(https://arxiv.org/pdf/1908.07678.pdf).
Args:
num_classes (int): The unique number of target classes.
        backbone (Paddle.nn.Layer): Backbone network, currently supports ResNet50/101.
backbone_indices (tuple, optional): Two values in the tuple indicate the indices of output of backbone.
key_value_channels (int, optional): The key and value channels of self-attention map in both AFNB and APNB modules.
Default: 256.
inter_channels (int, optional): Both input and output channels of APNB modules. Default: 512.
psp_size (tuple, optional): The out size of pooled feature maps. Default: (1, 3, 6, 8).
enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True.
align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
pretrained (str, optional): The path or url of pretrained model. Default: None.
"""
def __init__(self,
num_classes,
backbone,
backbone_indices=(2, 3),
key_value_channels=256,
inter_channels=512,
psp_size=(1, 3, 6, 8),
enable_auxiliary_loss=True,
align_corners=False,
pretrained=None):
super().__init__()
self.backbone = backbone
backbone_channels = [
backbone.feat_channels[i] for i in backbone_indices
]
self.head = ANNHead(num_classes, backbone_indices, backbone_channels,
key_value_channels, inter_channels, psp_size,
enable_auxiliary_loss)
self.align_corners = align_corners
self.pretrained = pretrained
self.init_weight()
def forward(self, x):
feat_list = self.backbone(x)
logit_list = self.head(feat_list)
return [
F.interpolate(
logit,
paddle.shape(x)[2:],
mode='bilinear',
align_corners=self.align_corners) for logit in logit_list
]
def init_weight(self):
if self.pretrained is not None:
utils.load_entire_model(self, self.pretrained)
class ANNHead(nn.Layer):
"""
The ANNHead implementation.
It mainly consists of AFNB and APNB modules.
Args:
num_classes (int): The unique number of target classes.
backbone_indices (tuple): Two values in the tuple indicate the indices of output of backbone.
The first index will be taken as low-level features; the second one will be
taken as high-level features in AFNB module. Usually backbone consists of four
downsampling stage, such as ResNet, and return an output of each stage. If it is (2, 3),
it means taking feature map of the third stage and the fourth stage in backbone.
backbone_channels (tuple): The same length with "backbone_indices". It indicates the channels of corresponding index.
key_value_channels (int): The key and value channels of self-attention map in both AFNB and APNB modules.
inter_channels (int): Both input and output channels of APNB modules.
psp_size (tuple): The out size of pooled feature maps.
enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True.
"""
def __init__(self,
num_classes,
backbone_indices,
backbone_channels,
key_value_channels,
inter_channels,
psp_size,
enable_auxiliary_loss=True):
super().__init__()
low_in_channels = backbone_channels[0]
high_in_channels = backbone_channels[1]
self.fusion = AFNB(
low_in_channels=low_in_channels,
high_in_channels=high_in_channels,
out_channels=high_in_channels,
key_channels=key_value_channels,
value_channels=key_value_channels,
dropout_prob=0.05,
repeat_sizes=([1]),
psp_size=psp_size)
self.context = nn.Sequential(
layers.ConvBNReLU(
in_channels=high_in_channels,
out_channels=inter_channels,
kernel_size=3,
padding=1),
APNB(
in_channels=inter_channels,
out_channels=inter_channels,
key_channels=key_value_channels,
value_channels=key_value_channels,
dropout_prob=0.05,
repeat_sizes=([1]),
psp_size=psp_size))
self.cls = nn.Conv2D(
in_channels=inter_channels, out_channels=num_classes, kernel_size=1)
self.auxlayer = layers.AuxLayer(
in_channels=low_in_channels,
inter_channels=low_in_channels // 2,
out_channels=num_classes,
dropout_prob=0.05)
self.backbone_indices = backbone_indices
self.enable_auxiliary_loss = enable_auxiliary_loss
def forward(self, feat_list):
logit_list = []
low_level_x = feat_list[self.backbone_indices[0]]
high_level_x = feat_list[self.backbone_indices[1]]
x = self.fusion(low_level_x, high_level_x)
x = self.context(x)
logit = self.cls(x)
logit_list.append(logit)
if self.enable_auxiliary_loss:
auxiliary_logit = self.auxlayer(low_level_x)
logit_list.append(auxiliary_logit)
return logit_list
class AFNB(nn.Layer):
"""
Asymmetric Fusion Non-local Block.
Args:
low_in_channels (int): Low-level-feature channels.
high_in_channels (int): High-level-feature channels.
out_channels (int): Out channels of AFNB module.
key_channels (int): The key channels in self-attention block.
value_channels (int): The value channels in self-attention block.
dropout_prob (float): The dropout rate of output.
repeat_sizes (tuple, optional): The number of AFNB modules. Default: ([1]).
        psp_size (tuple, optional): The out size of pooled feature maps. Default: (1, 3, 6, 8).
"""
def __init__(self,
low_in_channels,
high_in_channels,
out_channels,
key_channels,
value_channels,
dropout_prob,
repeat_sizes=([1]),
psp_size=(1, 3, 6, 8)):
super().__init__()
self.psp_size = psp_size
self.stages = nn.LayerList([
SelfAttentionBlock_AFNB(low_in_channels, high_in_channels,
key_channels, value_channels, out_channels,
size) for size in repeat_sizes
])
self.conv_bn = layers.ConvBN(
in_channels=out_channels + high_in_channels,
out_channels=out_channels,
kernel_size=1)
self.dropout = nn.Dropout(p=dropout_prob)
def forward(self, low_feats, high_feats):
priors = [stage(low_feats, high_feats) for stage in self.stages]
context = priors[0]
for i in range(1, len(priors)):
context += priors[i]
output = self.conv_bn(paddle.concat([context, high_feats], axis=1))
output = self.dropout(output)
return output
class APNB(nn.Layer):
"""
Asymmetric Pyramid Non-local Block.
Args:
in_channels (int): The input channels of APNB module.
out_channels (int): Out channels of APNB module.
key_channels (int): The key channels in self-attention block.
value_channels (int): The value channels in self-attention block.
dropout_prob (float): The dropout rate of output.
repeat_sizes (tuple, optional): The number of AFNB modules. Default: ([1]).
psp_size (tuple, optional): The out size of pooled feature maps. Default: (1, 3, 6, 8).
"""
def __init__(self,
in_channels,
out_channels,
key_channels,
value_channels,
dropout_prob,
repeat_sizes=([1]),
psp_size=(1, 3, 6, 8)):
super().__init__()
self.psp_size = psp_size
self.stages = nn.LayerList([
SelfAttentionBlock_APNB(in_channels, out_channels, key_channels,
value_channels, size)
for size in repeat_sizes
])
self.conv_bn = layers.ConvBNReLU(
in_channels=in_channels * 2,
out_channels=out_channels,
kernel_size=1)
self.dropout = nn.Dropout(p=dropout_prob)
def forward(self, x):
priors = [stage(x) for stage in self.stages]
context = priors[0]
for i in range(1, len(priors)):
context += priors[i]
output = self.conv_bn(paddle.concat([context, x], axis=1))
output = self.dropout(output)
return output
def _pp_module(x, psp_size):
n, c, h, w = x.shape
priors = []
for size in psp_size:
feat = F.adaptive_avg_pool2d(x, size)
feat = paddle.reshape(feat, shape=(0, c, -1))
priors.append(feat)
center = paddle.concat(priors, axis=-1)
return center
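# Editor's note: _pp_module is the pyramid-pooling trick from the ANN paper.
# Adaptive average pooling to each size in psp_size, then flattening, shrinks
# the key/value sequence from H*W positions to sum(s * s for s in psp_size);
# for the default (1, 3, 6, 8) that is 1 + 9 + 36 + 64 = 110, so the attention
# map is N x 110 instead of N x N -- the "asymmetric" part of the block names.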
class SelfAttentionBlock_AFNB(nn.Layer):
"""
Self-Attention Block for AFNB module.
Args:
low_in_channels (int): Low-level-feature channels.
high_in_channels (int): High-level-feature channels.
key_channels (int): The key channels in self-attention block.
value_channels (int): The value channels in self-attention block.
out_channels (int, optional): Out channels of AFNB module. Default: None.
scale (int, optional): Pooling size. Default: 1.
psp_size (tuple, optional): The out size of pooled feature maps. Default: (1, 3, 6, 8).
"""
def __init__(self,
low_in_channels,
high_in_channels,
key_channels,
value_channels,
out_channels=None,
scale=1,
psp_size=(1, 3, 6, 8)):
super().__init__()
self.scale = scale
self.in_channels = low_in_channels
self.out_channels = out_channels
self.key_channels = key_channels
self.value_channels = value_channels
        if out_channels is None:
            self.out_channels = high_in_channels
self.pool = nn.MaxPool2D(scale)
self.f_key = layers.ConvBNReLU(
in_channels=low_in_channels,
out_channels=key_channels,
kernel_size=1)
self.f_query = layers.ConvBNReLU(
in_channels=high_in_channels,
out_channels=key_channels,
kernel_size=1)
self.f_value = nn.Conv2D(
in_channels=low_in_channels,
out_channels=value_channels,
kernel_size=1)
        self.W = nn.Conv2D(
            in_channels=value_channels,
            out_channels=self.out_channels,  # falls back to high_in_channels when out_channels is None
            kernel_size=1)
self.psp_size = psp_size
def forward(self, low_feats, high_feats):
batch_size, _, h, w = high_feats.shape
value = self.f_value(low_feats)
value = _pp_module(value, self.psp_size)
value = paddle.transpose(value, (0, 2, 1))
query = self.f_query(high_feats)
query = paddle.reshape(query, shape=(0, self.key_channels, -1))
query = paddle.transpose(query, perm=(0, 2, 1))
key = self.f_key(low_feats)
key = _pp_module(key, self.psp_size)
sim_map = paddle.matmul(query, key)
sim_map = (self.key_channels**-.5) * sim_map
sim_map = F.softmax(sim_map, axis=-1)
context = paddle.matmul(sim_map, value)
context = paddle.transpose(context, perm=(0, 2, 1))
hf_shape = paddle.shape(high_feats)
context = paddle.reshape(
context, shape=[0, self.value_channels, hf_shape[2], hf_shape[3]])
context = self.W(context)
return context
class SelfAttentionBlock_APNB(nn.Layer):
"""
Self-Attention Block for APNB module.
Args:
in_channels (int): The input channels of APNB module.
out_channels (int): The out channels of APNB module.
key_channels (int): The key channels in self-attention block.
value_channels (int): The value channels in self-attention block.
scale (int, optional): Pooling size. Default: 1.
psp_size (tuple, optional): The out size of pooled feature maps. Default: (1, 3, 6, 8).
"""
def __init__(self,
in_channels,
out_channels,
key_channels,
value_channels,
scale=1,
psp_size=(1, 3, 6, 8)):
super().__init__()
self.scale = scale
self.in_channels = in_channels
self.out_channels = out_channels
self.key_channels = key_channels
self.value_channels = value_channels
self.pool = nn.MaxPool2D(scale)
self.f_key = layers.ConvBNReLU(
in_channels=self.in_channels,
out_channels=self.key_channels,
kernel_size=1)
self.f_query = self.f_key
self.f_value = nn.Conv2D(
in_channels=self.in_channels,
out_channels=self.value_channels,
kernel_size=1)
self.W = nn.Conv2D(
in_channels=self.value_channels,
out_channels=self.out_channels,
kernel_size=1)
self.psp_size = psp_size
def forward(self, x):
batch_size, _, h, w = x.shape
if self.scale > 1:
x = self.pool(x)
value = self.f_value(x)
value = _pp_module(value, self.psp_size)
value = paddle.transpose(value, perm=(0, 2, 1))
query = self.f_query(x)
query = paddle.reshape(query, shape=(0, self.key_channels, -1))
query = paddle.transpose(query, perm=(0, 2, 1))
key = self.f_key(x)
key = _pp_module(key, self.psp_size)
sim_map = paddle.matmul(query, key)
sim_map = (self.key_channels**-.5) * sim_map
sim_map = F.softmax(sim_map, axis=-1)
context = paddle.matmul(sim_map, value)
context = paddle.transpose(context, perm=(0, 2, 1))
x_shape = paddle.shape(x)
context = paddle.reshape(
context, shape=[0, self.value_channels, x_shape[2], x_shape[3]])
context = self.W(context)
return context
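# Assembly sketch (editor's addition; hedged -- ResNet50_vd is assumed from
# paddlers.models.ppseg.models.backbones, as in upstream PaddleSeg):
if __name__ == '__main__':
    from paddlers.models.ppseg.models.backbones import ResNet50_vd

    model = ANN(num_classes=19, backbone=ResNet50_vd())
    logits = model(paddle.rand([1, 3, 512, 512]))
    # Two logits when enable_auxiliary_loss=True: the main head and the
    # auxiliary head, both upsampled to the input size.
    print([tuple(l.shape) for l in logits])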

@@ -0,0 +1,178 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.models import layers
from paddlers.models.ppseg import utils
import numpy as np
@manager.MODELS.add_component
class AttentionUNet(nn.Layer):
"""
The Attention-UNet implementation based on PaddlePaddle.
    As mentioned in the original paper, the authors propose a novel attention gate (AG)
    that automatically learns to focus on target structures of varying shapes and sizes.
    Models trained with AGs implicitly learn to suppress irrelevant regions in an input image while
    highlighting salient features useful for a specific task.
    The original article refers to
    Oktay, O., et al. "Attention U-Net: Learning where to look for the pancreas."
(https://arxiv.org/pdf/1804.03999.pdf).
Args:
num_classes (int): The unique number of target classes.
pretrained (str, optional): The path or url of pretrained model. Default: None.
"""
def __init__(self, num_classes, pretrained=None):
super().__init__()
n_channels = 3
self.encoder = Encoder(n_channels, [64, 128, 256, 512])
filters = np.array([64, 128, 256, 512, 1024])
self.up5 = UpConv(ch_in=filters[4], ch_out=filters[3])
self.att5 = AttentionBlock(
F_g=filters[3], F_l=filters[3], F_out=filters[2])
self.up_conv5 = ConvBlock(ch_in=filters[4], ch_out=filters[3])
self.up4 = UpConv(ch_in=filters[3], ch_out=filters[2])
self.att4 = AttentionBlock(
F_g=filters[2], F_l=filters[2], F_out=filters[1])
self.up_conv4 = ConvBlock(ch_in=filters[3], ch_out=filters[2])
self.up3 = UpConv(ch_in=filters[2], ch_out=filters[1])
self.att3 = AttentionBlock(
F_g=filters[1], F_l=filters[1], F_out=filters[0])
self.up_conv3 = ConvBlock(ch_in=filters[2], ch_out=filters[1])
self.up2 = UpConv(ch_in=filters[1], ch_out=filters[0])
self.att2 = AttentionBlock(
F_g=filters[0], F_l=filters[0], F_out=filters[0] // 2)
self.up_conv2 = ConvBlock(ch_in=filters[1], ch_out=filters[0])
self.conv_1x1 = nn.Conv2D(
filters[0], num_classes, kernel_size=1, stride=1, padding=0)
self.pretrained = pretrained
self.init_weight()
def forward(self, x):
x5, (x1, x2, x3, x4) = self.encoder(x)
d5 = self.up5(x5)
x4 = self.att5(g=d5, x=x4)
d5 = paddle.concat([x4, d5], axis=1)
d5 = self.up_conv5(d5)
d4 = self.up4(d5)
x3 = self.att4(g=d4, x=x3)
d4 = paddle.concat((x3, d4), axis=1)
d4 = self.up_conv4(d4)
d3 = self.up3(d4)
x2 = self.att3(g=d3, x=x2)
d3 = paddle.concat((x2, d3), axis=1)
d3 = self.up_conv3(d3)
d2 = self.up2(d3)
x1 = self.att2(g=d2, x=x1)
d2 = paddle.concat((x1, d2), axis=1)
d2 = self.up_conv2(d2)
logit = self.conv_1x1(d2)
logit_list = [logit]
return logit_list
def init_weight(self):
if self.pretrained is not None:
utils.load_entire_model(self, self.pretrained)
class AttentionBlock(nn.Layer):
def __init__(self, F_g, F_l, F_out):
super().__init__()
self.W_g = nn.Sequential(
nn.Conv2D(F_g, F_out, kernel_size=1, stride=1, padding=0),
nn.BatchNorm2D(F_out))
self.W_x = nn.Sequential(
nn.Conv2D(F_l, F_out, kernel_size=1, stride=1, padding=0),
nn.BatchNorm2D(F_out))
self.psi = nn.Sequential(
nn.Conv2D(F_out, 1, kernel_size=1, stride=1, padding=0),
nn.BatchNorm2D(1), nn.Sigmoid())
self.relu = nn.ReLU()
def forward(self, g, x):
g1 = self.W_g(g)
x1 = self.W_x(x)
psi = self.relu(g1 + x1)
psi = self.psi(psi)
res = x * psi
return res
class UpConv(nn.Layer):
def __init__(self, ch_in, ch_out):
super().__init__()
self.up = nn.Sequential(
nn.Upsample(scale_factor=2, mode="bilinear"),
nn.Conv2D(ch_in, ch_out, kernel_size=3, stride=1, padding=1),
nn.BatchNorm2D(ch_out), nn.ReLU())
def forward(self, x):
return self.up(x)
class Encoder(nn.Layer):
def __init__(self, input_channels, filters):
super().__init__()
self.double_conv = nn.Sequential(
layers.ConvBNReLU(input_channels, 64, 3),
layers.ConvBNReLU(64, 64, 3))
down_channels = filters
self.down_sample_list = nn.LayerList([
self.down_sampling(channel, channel * 2)
for channel in down_channels
])
def down_sampling(self, in_channels, out_channels):
modules = []
modules.append(nn.MaxPool2D(kernel_size=2, stride=2))
modules.append(layers.ConvBNReLU(in_channels, out_channels, 3))
modules.append(layers.ConvBNReLU(out_channels, out_channels, 3))
return nn.Sequential(*modules)
def forward(self, x):
short_cuts = []
x = self.double_conv(x)
for down_sample in self.down_sample_list:
short_cuts.append(x)
x = down_sample(x)
return x, short_cuts
class ConvBlock(nn.Layer):
def __init__(self, ch_in, ch_out):
super(ConvBlock, self).__init__()
self.conv = nn.Sequential(
nn.Conv2D(ch_in, ch_out, kernel_size=3, stride=1, padding=1),
nn.BatchNorm2D(ch_out), nn.ReLU(),
nn.Conv2D(ch_out, ch_out, kernel_size=3, stride=1, padding=1),
nn.BatchNorm2D(ch_out), nn.ReLU())
def forward(self, x):
return self.conv(x)
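# Shape-check sketch (editor's addition; the input is fixed to 3 channels by
# n_channels above):
if __name__ == '__main__':
    model = AttentionUNet(num_classes=2)
    logit = model(paddle.rand([1, 3, 256, 256]))[0]
    print(tuple(logit.shape))  # (1, 2, 256, 256): four 2x down steps, four 2x up steps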

@@ -0,0 +1,23 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .hrnet import *
from .resnet_vd import *
from .xception_deeplab import *
from .mobilenetv3 import *
from .vision_transformer import *
from .swin_transformer import *
from .mobilenetv2 import *
from .mix_transformer import *
from .stdcnet import *

@@ -0,0 +1,837 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppseg.cvlibs import manager, param_init
from paddlers.models.ppseg.models import layers
from paddlers.models.ppseg.utils import utils
__all__ = [
"HRNet_W18_Small_V1", "HRNet_W18_Small_V2", "HRNet_W18", "HRNet_W30",
"HRNet_W32", "HRNet_W40", "HRNet_W44", "HRNet_W48", "HRNet_W60", "HRNet_W64"
]
class HRNet(nn.Layer):
"""
The HRNet implementation based on PaddlePaddle.
The original article refers to
    Jingdong Wang, et al. "HRNet: Deep High-Resolution Representation Learning for Visual Recognition"
(https://arxiv.org/pdf/1908.07919.pdf).
Args:
        pretrained (str, optional): The path or url of pretrained model. Default: None.
        stage1_num_modules (int, optional): Number of modules for stage1. Default: 1.
        stage1_num_blocks (list, optional): Number of blocks per module for stage1. Default: (4,).
        stage1_num_channels (list, optional): Number of channels per branch for stage1. Default: (64,).
        stage2_num_modules (int, optional): Number of modules for stage2. Default: 1.
        stage2_num_blocks (list, optional): Number of blocks per module for stage2. Default: (4, 4).
        stage2_num_channels (list, optional): Number of channels per branch for stage2. Default: (18, 36).
        stage3_num_modules (int, optional): Number of modules for stage3. Default: 4.
        stage3_num_blocks (list, optional): Number of blocks per module for stage3. Default: (4, 4, 4).
        stage3_num_channels (list, optional): Number of channels per branch for stage3. Default: (18, 36, 72).
        stage4_num_modules (int, optional): Number of modules for stage4. Default: 3.
        stage4_num_blocks (list, optional): Number of blocks per module for stage4. Default: (4, 4, 4, 4).
        stage4_num_channels (list, optional): Number of channels per branch for stage4. Default: (18, 36, 72, 144).
        has_se (bool, optional): Whether to use Squeeze-and-Excitation module. Default: False.
        padding_same (bool, optional): Whether to use 'same' padding in conv layers. Default: True.
        align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
            e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
"""
def __init__(self,
pretrained=None,
stage1_num_modules=1,
stage1_num_blocks=(4, ),
stage1_num_channels=(64, ),
stage2_num_modules=1,
stage2_num_blocks=(4, 4),
stage2_num_channels=(18, 36),
stage3_num_modules=4,
stage3_num_blocks=(4, 4, 4),
stage3_num_channels=(18, 36, 72),
stage4_num_modules=3,
stage4_num_blocks=(4, 4, 4, 4),
stage4_num_channels=(18, 36, 72, 144),
has_se=False,
align_corners=False,
padding_same=True):
super(HRNet, self).__init__()
self.pretrained = pretrained
self.stage1_num_modules = stage1_num_modules
self.stage1_num_blocks = stage1_num_blocks
self.stage1_num_channels = stage1_num_channels
self.stage2_num_modules = stage2_num_modules
self.stage2_num_blocks = stage2_num_blocks
self.stage2_num_channels = stage2_num_channels
self.stage3_num_modules = stage3_num_modules
self.stage3_num_blocks = stage3_num_blocks
self.stage3_num_channels = stage3_num_channels
self.stage4_num_modules = stage4_num_modules
self.stage4_num_blocks = stage4_num_blocks
self.stage4_num_channels = stage4_num_channels
self.has_se = has_se
self.align_corners = align_corners
self.feat_channels = [sum(stage4_num_channels)]
self.conv_layer1_1 = layers.ConvBNReLU(
in_channels=3,
out_channels=64,
kernel_size=3,
stride=2,
padding=1 if not padding_same else 'same',
bias_attr=False)
self.conv_layer1_2 = layers.ConvBNReLU(
in_channels=64,
out_channels=64,
kernel_size=3,
stride=2,
padding=1 if not padding_same else 'same',
bias_attr=False)
self.la1 = Layer1(
num_channels=64,
num_blocks=self.stage1_num_blocks[0],
num_filters=self.stage1_num_channels[0],
has_se=has_se,
name="layer2",
padding_same=padding_same)
self.tr1 = TransitionLayer(
in_channels=[self.stage1_num_channels[0] * 4],
out_channels=self.stage2_num_channels,
name="tr1",
padding_same=padding_same)
self.st2 = Stage(
num_channels=self.stage2_num_channels,
num_modules=self.stage2_num_modules,
num_blocks=self.stage2_num_blocks,
num_filters=self.stage2_num_channels,
has_se=self.has_se,
name="st2",
align_corners=align_corners,
padding_same=padding_same)
self.tr2 = TransitionLayer(
in_channels=self.stage2_num_channels,
out_channels=self.stage3_num_channels,
name="tr2",
padding_same=padding_same)
self.st3 = Stage(
num_channels=self.stage3_num_channels,
num_modules=self.stage3_num_modules,
num_blocks=self.stage3_num_blocks,
num_filters=self.stage3_num_channels,
has_se=self.has_se,
name="st3",
align_corners=align_corners,
padding_same=padding_same)
self.tr3 = TransitionLayer(
in_channels=self.stage3_num_channels,
out_channels=self.stage4_num_channels,
name="tr3",
padding_same=padding_same)
self.st4 = Stage(
num_channels=self.stage4_num_channels,
num_modules=self.stage4_num_modules,
num_blocks=self.stage4_num_blocks,
num_filters=self.stage4_num_channels,
has_se=self.has_se,
name="st4",
align_corners=align_corners,
padding_same=padding_same)
self.init_weight()
def forward(self, x):
conv1 = self.conv_layer1_1(x)
conv2 = self.conv_layer1_2(conv1)
la1 = self.la1(conv2)
tr1 = self.tr1([la1])
st2 = self.st2(tr1)
tr2 = self.tr2(st2)
st3 = self.st3(tr2)
tr3 = self.tr3(st3)
st4 = self.st4(tr3)
size = paddle.shape(st4[0])[2:]
x1 = F.interpolate(
st4[1], size, mode='bilinear', align_corners=self.align_corners)
x2 = F.interpolate(
st4[2], size, mode='bilinear', align_corners=self.align_corners)
x3 = F.interpolate(
st4[3], size, mode='bilinear', align_corners=self.align_corners)
x = paddle.concat([st4[0], x1, x2, x3], axis=1)
return [x]
def init_weight(self):
for layer in self.sublayers():
if isinstance(layer, nn.Conv2D):
param_init.normal_init(layer.weight, std=0.001)
elif isinstance(layer, (nn.BatchNorm, nn.SyncBatchNorm)):
param_init.constant_init(layer.weight, value=1.0)
param_init.constant_init(layer.bias, value=0.0)
if self.pretrained is not None:
utils.load_pretrained_model(self, self.pretrained)
class Layer1(nn.Layer):
def __init__(self,
num_channels,
num_filters,
num_blocks,
has_se=False,
name=None,
padding_same=True):
super(Layer1, self).__init__()
self.bottleneck_block_list = []
for i in range(num_blocks):
bottleneck_block = self.add_sublayer(
"bb_{}_{}".format(name, i + 1),
BottleneckBlock(
num_channels=num_channels if i == 0 else num_filters * 4,
num_filters=num_filters,
has_se=has_se,
stride=1,
downsample=True if i == 0 else False,
name=name + '_' + str(i + 1),
padding_same=padding_same))
self.bottleneck_block_list.append(bottleneck_block)
def forward(self, x):
conv = x
for block_func in self.bottleneck_block_list:
conv = block_func(conv)
return conv
class TransitionLayer(nn.Layer):
def __init__(self, in_channels, out_channels, name=None, padding_same=True):
super(TransitionLayer, self).__init__()
num_in = len(in_channels)
num_out = len(out_channels)
self.conv_bn_func_list = []
for i in range(num_out):
residual = None
if i < num_in:
if in_channels[i] != out_channels[i]:
residual = self.add_sublayer(
"transition_{}_layer_{}".format(name, i + 1),
layers.ConvBNReLU(
in_channels=in_channels[i],
out_channels=out_channels[i],
kernel_size=3,
padding=1 if not padding_same else 'same',
bias_attr=False))
else:
residual = self.add_sublayer(
"transition_{}_layer_{}".format(name, i + 1),
layers.ConvBNReLU(
in_channels=in_channels[-1],
out_channels=out_channels[i],
kernel_size=3,
stride=2,
padding=1 if not padding_same else 'same',
bias_attr=False))
self.conv_bn_func_list.append(residual)
def forward(self, x):
outs = []
for idx, conv_bn_func in enumerate(self.conv_bn_func_list):
if conv_bn_func is None:
outs.append(x[idx])
else:
if idx < len(x):
outs.append(conv_bn_func(x[idx]))
else:
outs.append(conv_bn_func(x[-1]))
return outs
class Branches(nn.Layer):
def __init__(self,
num_blocks,
in_channels,
out_channels,
has_se=False,
name=None,
padding_same=True):
super(Branches, self).__init__()
self.basic_block_list = []
for i in range(len(out_channels)):
self.basic_block_list.append([])
for j in range(num_blocks[i]):
in_ch = in_channels[i] if j == 0 else out_channels[i]
basic_block_func = self.add_sublayer(
"bb_{}_branch_layer_{}_{}".format(name, i + 1, j + 1),
BasicBlock(
num_channels=in_ch,
num_filters=out_channels[i],
has_se=has_se,
name=name + '_branch_layer_' + str(i + 1) + '_' +
str(j + 1),
padding_same=padding_same))
self.basic_block_list[i].append(basic_block_func)
def forward(self, x):
outs = []
for idx, input in enumerate(x):
conv = input
for basic_block_func in self.basic_block_list[idx]:
conv = basic_block_func(conv)
outs.append(conv)
return outs
class BottleneckBlock(nn.Layer):
def __init__(self,
num_channels,
num_filters,
has_se,
stride=1,
downsample=False,
name=None,
padding_same=True):
super(BottleneckBlock, self).__init__()
self.has_se = has_se
self.downsample = downsample
self.conv1 = layers.ConvBNReLU(
in_channels=num_channels,
out_channels=num_filters,
kernel_size=1,
bias_attr=False)
self.conv2 = layers.ConvBNReLU(
in_channels=num_filters,
out_channels=num_filters,
kernel_size=3,
stride=stride,
padding=1 if not padding_same else 'same',
bias_attr=False)
self.conv3 = layers.ConvBN(
in_channels=num_filters,
out_channels=num_filters * 4,
kernel_size=1,
bias_attr=False)
if self.downsample:
self.conv_down = layers.ConvBN(
in_channels=num_channels,
out_channels=num_filters * 4,
kernel_size=1,
bias_attr=False)
if self.has_se:
self.se = SELayer(
num_channels=num_filters * 4,
num_filters=num_filters * 4,
reduction_ratio=16,
name=name + '_fc')
self.add = layers.Add()
self.relu = layers.Activation("relu")
def forward(self, x):
residual = x
conv1 = self.conv1(x)
conv2 = self.conv2(conv1)
conv3 = self.conv3(conv2)
if self.downsample:
residual = self.conv_down(x)
if self.has_se:
conv3 = self.se(conv3)
y = self.add(conv3, residual)
y = self.relu(y)
return y
class BasicBlock(nn.Layer):
def __init__(self,
num_channels,
num_filters,
stride=1,
has_se=False,
downsample=False,
name=None,
padding_same=True):
super(BasicBlock, self).__init__()
self.has_se = has_se
self.downsample = downsample
self.conv1 = layers.ConvBNReLU(
in_channels=num_channels,
out_channels=num_filters,
kernel_size=3,
stride=stride,
padding=1 if not padding_same else 'same',
bias_attr=False)
self.conv2 = layers.ConvBN(
in_channels=num_filters,
out_channels=num_filters,
kernel_size=3,
padding=1 if not padding_same else 'same',
bias_attr=False)
if self.downsample:
self.conv_down = layers.ConvBNReLU(
in_channels=num_channels,
out_channels=num_filters,
kernel_size=1,
bias_attr=False)
if self.has_se:
self.se = SELayer(
num_channels=num_filters,
num_filters=num_filters,
reduction_ratio=16,
name=name + '_fc')
self.add = layers.Add()
self.relu = layers.Activation("relu")
def forward(self, x):
residual = x
conv1 = self.conv1(x)
conv2 = self.conv2(conv1)
if self.downsample:
residual = self.conv_down(x)
if self.has_se:
conv2 = self.se(conv2)
y = self.add(conv2, residual)
y = self.relu(y)
return y
class SELayer(nn.Layer):
def __init__(self, num_channels, num_filters, reduction_ratio, name=None):
super(SELayer, self).__init__()
self.pool2d_gap = nn.AdaptiveAvgPool2D(1)
self._num_channels = num_channels
med_ch = int(num_channels / reduction_ratio)
stdv = 1.0 / math.sqrt(num_channels * 1.0)
self.squeeze = nn.Linear(
num_channels,
med_ch,
weight_attr=paddle.ParamAttr(
initializer=nn.initializer.Uniform(-stdv, stdv)))
stdv = 1.0 / math.sqrt(med_ch * 1.0)
self.excitation = nn.Linear(
med_ch,
num_filters,
weight_attr=paddle.ParamAttr(
initializer=nn.initializer.Uniform(-stdv, stdv)))
def forward(self, x):
pool = self.pool2d_gap(x)
pool = paddle.reshape(pool, shape=[-1, self._num_channels])
squeeze = self.squeeze(pool)
squeeze = F.relu(squeeze)
excitation = self.excitation(squeeze)
excitation = F.sigmoid(excitation)
excitation = paddle.reshape(
excitation, shape=[-1, self._num_channels, 1, 1])
out = x * excitation
return out
class Stage(nn.Layer):
def __init__(self,
num_channels,
num_modules,
num_blocks,
num_filters,
has_se=False,
multi_scale_output=True,
name=None,
align_corners=False,
padding_same=True):
super(Stage, self).__init__()
self._num_modules = num_modules
self.stage_func_list = []
for i in range(num_modules):
if i == num_modules - 1 and not multi_scale_output:
stage_func = self.add_sublayer(
"stage_{}_{}".format(name, i + 1),
HighResolutionModule(
num_channels=num_channels,
num_blocks=num_blocks,
num_filters=num_filters,
has_se=has_se,
multi_scale_output=False,
name=name + '_' + str(i + 1),
align_corners=align_corners,
padding_same=padding_same))
else:
stage_func = self.add_sublayer(
"stage_{}_{}".format(name, i + 1),
HighResolutionModule(
num_channels=num_channels,
num_blocks=num_blocks,
num_filters=num_filters,
has_se=has_se,
name=name + '_' + str(i + 1),
align_corners=align_corners,
padding_same=padding_same))
self.stage_func_list.append(stage_func)
def forward(self, x):
out = x
for idx in range(self._num_modules):
out = self.stage_func_list[idx](out)
return out
class HighResolutionModule(nn.Layer):
def __init__(self,
num_channels,
num_blocks,
num_filters,
has_se=False,
multi_scale_output=True,
name=None,
align_corners=False,
padding_same=True):
super(HighResolutionModule, self).__init__()
self.branches_func = Branches(
num_blocks=num_blocks,
in_channels=num_channels,
out_channels=num_filters,
has_se=has_se,
name=name,
padding_same=padding_same)
self.fuse_func = FuseLayers(
in_channels=num_filters,
out_channels=num_filters,
multi_scale_output=multi_scale_output,
name=name,
align_corners=align_corners,
padding_same=padding_same)
def forward(self, x):
out = self.branches_func(x)
out = self.fuse_func(out)
return out
class FuseLayers(nn.Layer):
def __init__(self,
in_channels,
out_channels,
multi_scale_output=True,
name=None,
align_corners=False,
padding_same=True):
super(FuseLayers, self).__init__()
self._actual_ch = len(in_channels) if multi_scale_output else 1
self._in_channels = in_channels
self.align_corners = align_corners
self.residual_func_list = []
for i in range(self._actual_ch):
for j in range(len(in_channels)):
if j > i:
residual_func = self.add_sublayer(
"residual_{}_layer_{}_{}".format(name, i + 1, j + 1),
layers.ConvBN(
in_channels=in_channels[j],
out_channels=out_channels[i],
kernel_size=1,
bias_attr=False))
self.residual_func_list.append(residual_func)
elif j < i:
pre_num_filters = in_channels[j]
for k in range(i - j):
if k == i - j - 1:
residual_func = self.add_sublayer(
"residual_{}_layer_{}_{}_{}".format(
name, i + 1, j + 1, k + 1),
layers.ConvBN(
in_channels=pre_num_filters,
out_channels=out_channels[i],
kernel_size=3,
stride=2,
padding=1 if not padding_same else 'same',
bias_attr=False))
pre_num_filters = out_channels[i]
else:
residual_func = self.add_sublayer(
"residual_{}_layer_{}_{}_{}".format(
name, i + 1, j + 1, k + 1),
layers.ConvBNReLU(
in_channels=pre_num_filters,
out_channels=out_channels[j],
kernel_size=3,
stride=2,
padding=1 if not padding_same else 'same',
bias_attr=False))
pre_num_filters = out_channels[j]
self.residual_func_list.append(residual_func)
def forward(self, x):
outs = []
residual_func_idx = 0
for i in range(self._actual_ch):
residual = x[i]
residual_shape = paddle.shape(residual)[-2:]
for j in range(len(self._in_channels)):
if j > i:
y = self.residual_func_list[residual_func_idx](x[j])
residual_func_idx += 1
y = F.interpolate(
y,
residual_shape,
mode='bilinear',
align_corners=self.align_corners)
residual = residual + y
elif j < i:
y = x[j]
for k in range(i - j):
y = self.residual_func_list[residual_func_idx](y)
residual_func_idx += 1
residual = residual + y
residual = F.relu(residual)
outs.append(residual)
return outs
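# Editor's note: FuseLayers merges branch j into branch i in two ways -- a 1x1
# ConvBN followed by bilinear upsampling when j > i (coarser into finer), or a
# chain of (i - j) stride-2 3x3 convolutions when j < i (finer into coarser),
# where only the last conv in the chain switches to out_channels[i].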
@manager.BACKBONES.add_component
def HRNet_W18_Small_V1(**kwargs):
model = HRNet(
stage1_num_modules=1,
stage1_num_blocks=[1],
stage1_num_channels=[32],
stage2_num_modules=1,
stage2_num_blocks=[2, 2],
stage2_num_channels=[16, 32],
stage3_num_modules=1,
stage3_num_blocks=[2, 2, 2],
stage3_num_channels=[16, 32, 64],
stage4_num_modules=1,
stage4_num_blocks=[2, 2, 2, 2],
stage4_num_channels=[16, 32, 64, 128],
**kwargs)
return model
@manager.BACKBONES.add_component
def HRNet_W18_Small_V2(**kwargs):
model = HRNet(
stage1_num_modules=1,
stage1_num_blocks=[2],
stage1_num_channels=[64],
stage2_num_modules=1,
stage2_num_blocks=[2, 2],
stage2_num_channels=[18, 36],
stage3_num_modules=3,
stage3_num_blocks=[2, 2, 2],
stage3_num_channels=[18, 36, 72],
stage4_num_modules=2,
stage4_num_blocks=[2, 2, 2, 2],
stage4_num_channels=[18, 36, 72, 144],
**kwargs)
return model
@manager.BACKBONES.add_component
def HRNet_W18(**kwargs):
model = HRNet(
stage1_num_modules=1,
stage1_num_blocks=[4],
stage1_num_channels=[64],
stage2_num_modules=1,
stage2_num_blocks=[4, 4],
stage2_num_channels=[18, 36],
stage3_num_modules=4,
stage3_num_blocks=[4, 4, 4],
stage3_num_channels=[18, 36, 72],
stage4_num_modules=3,
stage4_num_blocks=[4, 4, 4, 4],
stage4_num_channels=[18, 36, 72, 144],
**kwargs)
return model
@manager.BACKBONES.add_component
def HRNet_W30(**kwargs):
model = HRNet(
stage1_num_modules=1,
stage1_num_blocks=[4],
stage1_num_channels=[64],
stage2_num_modules=1,
stage2_num_blocks=[4, 4],
stage2_num_channels=[30, 60],
stage3_num_modules=4,
stage3_num_blocks=[4, 4, 4],
stage3_num_channels=[30, 60, 120],
stage4_num_modules=3,
stage4_num_blocks=[4, 4, 4, 4],
stage4_num_channels=[30, 60, 120, 240],
**kwargs)
return model
@manager.BACKBONES.add_component
def HRNet_W32(**kwargs):
model = HRNet(
stage1_num_modules=1,
stage1_num_blocks=[4],
stage1_num_channels=[64],
stage2_num_modules=1,
stage2_num_blocks=[4, 4],
stage2_num_channels=[32, 64],
stage3_num_modules=4,
stage3_num_blocks=[4, 4, 4],
stage3_num_channels=[32, 64, 128],
stage4_num_modules=3,
stage4_num_blocks=[4, 4, 4, 4],
stage4_num_channels=[32, 64, 128, 256],
**kwargs)
return model
@manager.BACKBONES.add_component
def HRNet_W40(**kwargs):
model = HRNet(
stage1_num_modules=1,
stage1_num_blocks=[4],
stage1_num_channels=[64],
stage2_num_modules=1,
stage2_num_blocks=[4, 4],
stage2_num_channels=[40, 80],
stage3_num_modules=4,
stage3_num_blocks=[4, 4, 4],
stage3_num_channels=[40, 80, 160],
stage4_num_modules=3,
stage4_num_blocks=[4, 4, 4, 4],
stage4_num_channels=[40, 80, 160, 320],
**kwargs)
return model
@manager.BACKBONES.add_component
def HRNet_W44(**kwargs):
model = HRNet(
stage1_num_modules=1,
stage1_num_blocks=[4],
stage1_num_channels=[64],
stage2_num_modules=1,
stage2_num_blocks=[4, 4],
stage2_num_channels=[44, 88],
stage3_num_modules=4,
stage3_num_blocks=[4, 4, 4],
stage3_num_channels=[44, 88, 176],
stage4_num_modules=3,
stage4_num_blocks=[4, 4, 4, 4],
stage4_num_channels=[44, 88, 176, 352],
**kwargs)
return model
@manager.BACKBONES.add_component
def HRNet_W48(**kwargs):
model = HRNet(
stage1_num_modules=1,
stage1_num_blocks=[4],
stage1_num_channels=[64],
stage2_num_modules=1,
stage2_num_blocks=[4, 4],
stage2_num_channels=[48, 96],
stage3_num_modules=4,
stage3_num_blocks=[4, 4, 4],
stage3_num_channels=[48, 96, 192],
stage4_num_modules=3,
stage4_num_blocks=[4, 4, 4, 4],
stage4_num_channels=[48, 96, 192, 384],
**kwargs)
return model
@manager.BACKBONES.add_component
def HRNet_W60(**kwargs):
model = HRNet(
stage1_num_modules=1,
stage1_num_blocks=[4],
stage1_num_channels=[64],
stage2_num_modules=1,
stage2_num_blocks=[4, 4],
stage2_num_channels=[60, 120],
stage3_num_modules=4,
stage3_num_blocks=[4, 4, 4],
stage3_num_channels=[60, 120, 240],
stage4_num_modules=3,
stage4_num_blocks=[4, 4, 4, 4],
stage4_num_channels=[60, 120, 240, 480],
**kwargs)
return model
@manager.BACKBONES.add_component
def HRNet_W64(**kwargs):
model = HRNet(
stage1_num_modules=1,
stage1_num_blocks=[4],
stage1_num_channels=[64],
stage2_num_modules=1,
stage2_num_blocks=[4, 4],
stage2_num_channels=[64, 128],
stage3_num_modules=4,
stage3_num_blocks=[4, 4, 4],
stage3_num_channels=[64, 128, 256],
stage4_num_modules=3,
stage4_num_blocks=[4, 4, 4, 4],
stage4_num_channels=[64, 128, 256, 512],
**kwargs)
return model
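# Usage sketch (editor's addition): every factory returns a backbone whose
# single output concatenates all four branches at 1/4 input resolution, so
# feat_channels == [sum(stage4_num_channels)].
if __name__ == '__main__':
    backbone = HRNet_W48()  # feat_channels == [48 + 96 + 192 + 384] == [720]
    feats = backbone(paddle.rand([1, 3, 512, 512]))
    print(backbone.feat_channels, tuple(feats[0].shape))  # [720] (1, 720, 128, 128)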

@@ -0,0 +1,588 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from functools import partial
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
import paddle.nn.initializer as paddle_init
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.utils import utils
from paddlers.models.ppseg.models.backbones.transformer_utils import *
class Mlp(nn.Layer):
def __init__(self,
in_features,
hidden_features=None,
out_features=None,
act_layer=nn.GELU,
drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.dwconv = DWConv(hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout(drop)
self.apply(self._init_weights)
def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight)
if isinstance(m, nn.Linear) and m.bias is not None:
zeros_(m.bias)
elif isinstance(m, nn.LayerNorm):
zeros_(m.bias)
ones_(m.weight)
elif isinstance(m, nn.Conv2D):
fan_out = m._kernel_size[0] * m._kernel_size[1] * m._out_channels
fan_out //= m._groups
paddle_init.Normal(0, math.sqrt(2.0 / fan_out))(m.weight)
if m.bias is not None:
zeros_(m.bias)
def forward(self, x, H, W):
x = self.fc1(x)
x = self.dwconv(x, H, W)
x = self.act(x)
x = self.drop(x)
x = self.fc2(x)
x = self.drop(x)
return x
class Attention(nn.Layer):
def __init__(self,
dim,
num_heads=8,
qkv_bias=False,
qk_scale=None,
attn_drop=0.,
proj_drop=0.,
sr_ratio=1):
super().__init__()
assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}."
self.dim = dim
self.num_heads = num_heads
head_dim = dim // num_heads
self.scale = qk_scale or head_dim**-0.5
self.q = nn.Linear(dim, dim, bias_attr=qkv_bias)
self.kv = nn.Linear(dim, dim * 2, bias_attr=qkv_bias)
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)
self.sr_ratio = sr_ratio
if sr_ratio > 1:
self.sr = nn.Conv2D(dim, dim, kernel_size=sr_ratio, stride=sr_ratio)
self.norm = nn.LayerNorm(dim)
self.apply(self._init_weights)
def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight)
if isinstance(m, nn.Linear) and m.bias is not None:
zeros_(m.bias)
elif isinstance(m, nn.LayerNorm):
zeros_(m.bias)
ones_(m.weight)
elif isinstance(m, nn.Conv2D):
fan_out = m._kernel_size[0] * m._kernel_size[1] * m._out_channels
fan_out //= m._groups
paddle_init.Normal(0, math.sqrt(2.0 / fan_out))(m.weight)
if m.bias is not None:
zeros_(m.bias)
def forward(self, x, H, W):
x_shape = paddle.shape(x)
B, N = x_shape[0], x_shape[1]
C = self.dim
q = self.q(x).reshape([B, N, self.num_heads,
C // self.num_heads]).transpose([0, 2, 1, 3])
if self.sr_ratio > 1:
x_ = x.transpose([0, 2, 1]).reshape([B, C, H, W])
x_ = self.sr(x_).reshape([B, C, -1]).transpose([0, 2, 1])
x_ = self.norm(x_)
kv = self.kv(x_).reshape(
[B, -1, 2, self.num_heads,
C // self.num_heads]).transpose([2, 0, 3, 1, 4])
else:
kv = self.kv(x).reshape(
[B, -1, 2, self.num_heads,
C // self.num_heads]).transpose([2, 0, 3, 1, 4])
k, v = kv[0], kv[1]
attn = (q @ k.transpose([0, 1, 3, 2])) * self.scale
attn = F.softmax(attn, axis=-1)
attn = self.attn_drop(attn)
x = (attn @ v).transpose([0, 2, 1, 3]).reshape([B, N, C])
x = self.proj(x)
x = self.proj_drop(x)
return x
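# Editor's note: sr_ratio implements SegFormer's spatial-reduction attention.
# When sr_ratio > 1, keys and values are computed on a feature map shrunk by a
# strided sr_ratio x sr_ratio convolution, so the similarity matrix has shape
# N x (N / sr_ratio**2) rather than N x N. With sr_ratios=(8, 4, 2, 1), the
# high-resolution first stage reduces its key/value count 64x, which keeps
# early stages tractable.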
class Block(nn.Layer):
def __init__(self,
dim,
num_heads,
mlp_ratio=4.,
qkv_bias=False,
qk_scale=None,
drop=0.,
attn_drop=0.,
drop_path=0.,
act_layer=nn.GELU,
norm_layer=nn.LayerNorm,
sr_ratio=1):
super().__init__()
self.norm1 = norm_layer(dim)
self.attn = Attention(
dim,
num_heads=num_heads,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
attn_drop=attn_drop,
proj_drop=drop,
sr_ratio=sr_ratio)
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
self.norm2 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(
in_features=dim,
hidden_features=mlp_hidden_dim,
act_layer=act_layer,
drop=drop)
self.apply(self._init_weights)
def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight)
if isinstance(m, nn.Linear) and m.bias is not None:
zeros_(m.bias)
elif isinstance(m, nn.LayerNorm):
zeros_(m.bias)
ones_(m.weight)
elif isinstance(m, nn.Conv2D):
fan_out = m._kernel_size[0] * m._kernel_size[1] * m._out_channels
fan_out //= m._groups
paddle_init.Normal(0, math.sqrt(2.0 / fan_out))(m.weight)
if m.bias is not None:
zeros_(m.bias)
def forward(self, x, H, W):
x = x + self.drop_path(self.attn(self.norm1(x), H, W))
x = x + self.drop_path(self.mlp(self.norm2(x), H, W))
return x
class OverlapPatchEmbed(nn.Layer):
""" Image to Patch Embedding
"""
def __init__(self,
img_size=224,
patch_size=7,
stride=4,
in_chans=3,
embed_dim=768):
super().__init__()
img_size = to_2tuple(img_size)
patch_size = to_2tuple(patch_size)
self.img_size = img_size
self.patch_size = patch_size
self.H, self.W = img_size[0] // patch_size[0], img_size[
1] // patch_size[1]
self.num_patches = self.H * self.W
self.proj = nn.Conv2D(
in_chans,
embed_dim,
kernel_size=patch_size,
stride=stride,
padding=(patch_size[0] // 2, patch_size[1] // 2))
self.norm = nn.LayerNorm(embed_dim)
self.apply(self._init_weights)
def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight)
if isinstance(m, nn.Linear) and m.bias is not None:
zeros_(m.bias)
elif isinstance(m, nn.LayerNorm):
zeros_(m.bias)
ones_(m.weight)
elif isinstance(m, nn.Conv2D):
fan_out = m._kernel_size[0] * m._kernel_size[1] * m._out_channels
fan_out //= m._groups
paddle_init.Normal(0, math.sqrt(2.0 / fan_out))(m.weight)
if m.bias is not None:
zeros_(m.bias)
def forward(self, x):
x = self.proj(x)
x_shape = paddle.shape(x)
H, W = x_shape[2], x_shape[3]
x = x.flatten(2).transpose([0, 2, 1])
x = self.norm(x)
return x, H, W
class MixVisionTransformer(nn.Layer):
def __init__(self,
img_size=224,
patch_size=16,
in_chans=3,
num_classes=1000,
embed_dims=[64, 128, 256, 512],
num_heads=[1, 2, 4, 8],
mlp_ratios=[4, 4, 4, 4],
qkv_bias=False,
qk_scale=None,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.,
norm_layer=nn.LayerNorm,
depths=[3, 4, 6, 3],
sr_ratios=[8, 4, 2, 1],
pretrained=None):
super().__init__()
self.num_classes = num_classes
self.depths = depths
self.feat_channels = embed_dims[:]
# patch_embed
self.patch_embed1 = OverlapPatchEmbed(
img_size=img_size,
patch_size=7,
stride=4,
in_chans=in_chans,
embed_dim=embed_dims[0])
self.patch_embed2 = OverlapPatchEmbed(
img_size=img_size // 4,
patch_size=3,
stride=2,
in_chans=embed_dims[0],
embed_dim=embed_dims[1])
self.patch_embed3 = OverlapPatchEmbed(
img_size=img_size // 8,
patch_size=3,
stride=2,
in_chans=embed_dims[1],
embed_dim=embed_dims[2])
self.patch_embed4 = OverlapPatchEmbed(
img_size=img_size // 16,
patch_size=3,
stride=2,
in_chans=embed_dims[2],
embed_dim=embed_dims[3])
# transformer encoder
dpr = [
x.numpy() for x in paddle.linspace(0, drop_path_rate, sum(depths))
] # stochastic depth decay rule
cur = 0
self.block1 = nn.LayerList([
Block(
dim=embed_dims[0],
num_heads=num_heads[0],
mlp_ratio=mlp_ratios[0],
qkv_bias=qkv_bias,
qk_scale=qk_scale,
drop=drop_rate,
attn_drop=attn_drop_rate,
drop_path=dpr[cur + i],
norm_layer=norm_layer,
sr_ratio=sr_ratios[0]) for i in range(depths[0])
])
self.norm1 = norm_layer(embed_dims[0])
cur += depths[0]
self.block2 = nn.LayerList([
Block(
dim=embed_dims[1],
num_heads=num_heads[1],
mlp_ratio=mlp_ratios[1],
qkv_bias=qkv_bias,
qk_scale=qk_scale,
drop=drop_rate,
attn_drop=attn_drop_rate,
drop_path=dpr[cur + i],
norm_layer=norm_layer,
sr_ratio=sr_ratios[1]) for i in range(depths[1])
])
self.norm2 = norm_layer(embed_dims[1])
cur += depths[1]
self.block3 = nn.LayerList([
Block(
dim=embed_dims[2],
num_heads=num_heads[2],
mlp_ratio=mlp_ratios[2],
qkv_bias=qkv_bias,
qk_scale=qk_scale,
drop=drop_rate,
attn_drop=attn_drop_rate,
drop_path=dpr[cur + i],
norm_layer=norm_layer,
sr_ratio=sr_ratios[2]) for i in range(depths[2])
])
self.norm3 = norm_layer(embed_dims[2])
cur += depths[2]
self.block4 = nn.LayerList([
Block(
dim=embed_dims[3],
num_heads=num_heads[3],
mlp_ratio=mlp_ratios[3],
qkv_bias=qkv_bias,
qk_scale=qk_scale,
drop=drop_rate,
attn_drop=attn_drop_rate,
drop_path=dpr[cur + i],
norm_layer=norm_layer,
sr_ratio=sr_ratios[3]) for i in range(depths[3])
])
self.norm4 = norm_layer(embed_dims[3])
self.pretrained = pretrained
self.init_weight()
def init_weight(self):
if self.pretrained is not None:
utils.load_pretrained_model(self, self.pretrained)
else:
self.apply(self._init_weights)
def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight)
if isinstance(m, nn.Linear) and m.bias is not None:
zeros_(m.bias)
elif isinstance(m, nn.LayerNorm):
zeros_(m.bias)
ones_(m.weight)
elif isinstance(m, nn.Conv2D):
fan_out = m._kernel_size[0] * m._kernel_size[1] * m._out_channels
fan_out //= m._groups
paddle_init.Normal(0, math.sqrt(2.0 / fan_out))(m.weight)
if m.bias is not None:
zeros_(m.bias)
def reset_drop_path(self, drop_path_rate):
dpr = [
x.item()
for x in paddle.linspace(0, drop_path_rate, sum(self.depths))
]
cur = 0
for i in range(self.depths[0]):
self.block1[i].drop_path.drop_prob = dpr[cur + i]
cur += self.depths[0]
for i in range(self.depths[1]):
self.block2[i].drop_path.drop_prob = dpr[cur + i]
cur += self.depths[1]
for i in range(self.depths[2]):
self.block3[i].drop_path.drop_prob = dpr[cur + i]
cur += self.depths[2]
for i in range(self.depths[3]):
self.block4[i].drop_path.drop_prob = dpr[cur + i]
def freeze_patch_emb(self):
self.patch_embed1.requires_grad = False
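# NOTE: get_classifier/reset_classifier below assume a classification head
# (`self.head`, `self.embed_dim`) that this segmentation backbone never
# defines; they appear to be kept only for API compatibility with the
# original classification model.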
def get_classifier(self):
return self.head
def reset_classifier(self, num_classes, global_pool=''):
self.num_classes = num_classes
self.head = nn.Linear(
self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
def forward_features(self, x):
B = paddle.shape(x)[0]
outs = []
# stage 1
x, H, W = self.patch_embed1(x)
for i, blk in enumerate(self.block1):
x = blk(x, H, W)
x = self.norm1(x)
x = x.reshape([B, H, W, self.feat_channels[0]]).transpose([0, 3, 1, 2])
outs.append(x)
# stage 2
x, H, W = self.patch_embed2(x)
for i, blk in enumerate(self.block2):
x = blk(x, H, W)
x = self.norm2(x)
x = x.reshape([B, H, W, self.feat_channels[1]]).transpose([0, 3, 1, 2])
outs.append(x)
# stage 3
x, H, W = self.patch_embed3(x)
for i, blk in enumerate(self.block3):
x = blk(x, H, W)
x = self.norm3(x)
x = x.reshape([B, H, W, self.feat_channels[2]]).transpose([0, 3, 1, 2])
outs.append(x)
# stage 4
x, H, W = self.patch_embed4(x)
for i, blk in enumerate(self.block4):
x = blk(x, H, W)
x = self.norm4(x)
x = x.reshape([B, H, W, self.feat_channels[3]]).transpose([0, 3, 1, 2])
outs.append(x)
return outs
def forward(self, x):
x = self.forward_features(x)
# x = self.head(x)
return x
class DWConv(nn.Layer):
def __init__(self, dim=768):
super(DWConv, self).__init__()
self.dim = dim
self.dwconv = nn.Conv2D(dim, dim, 3, 1, 1, bias_attr=True, groups=dim)
def forward(self, x, H, W):
x_shape = paddle.shape(x)
B, N = x_shape[0], x_shape[1]
x = x.transpose([0, 2, 1]).reshape([B, self.dim, H, W])
x = self.dwconv(x)
x = x.flatten(2).transpose([0, 2, 1])
return x
@manager.BACKBONES.add_component
def MixVisionTransformer_B0(**kwargs):
return MixVisionTransformer(
patch_size=4,
embed_dims=[32, 64, 160, 256],
num_heads=[1, 2, 5, 8],
mlp_ratios=[4, 4, 4, 4],
qkv_bias=True,
norm_layer=partial(nn.LayerNorm, epsilon=1e-6),
depths=[2, 2, 2, 2],
sr_ratios=[8, 4, 2, 1],
drop_rate=0.0,
drop_path_rate=0.1,
**kwargs)
@manager.BACKBONES.add_component
def MixVisionTransformer_B1(**kwargs):
return MixVisionTransformer(
patch_size=4,
embed_dims=[64, 128, 320, 512],
num_heads=[1, 2, 5, 8],
mlp_ratios=[4, 4, 4, 4],
qkv_bias=True,
norm_layer=partial(nn.LayerNorm, epsilon=1e-6),
depths=[2, 2, 2, 2],
sr_ratios=[8, 4, 2, 1],
drop_rate=0.0,
drop_path_rate=0.1,
**kwargs)
@manager.BACKBONES.add_component
def MixVisionTransformer_B2(**kwargs):
return MixVisionTransformer(
patch_size=4,
embed_dims=[64, 128, 320, 512],
num_heads=[1, 2, 5, 8],
mlp_ratios=[4, 4, 4, 4],
qkv_bias=True,
norm_layer=partial(nn.LayerNorm, epsilon=1e-6),
depths=[3, 4, 6, 3],
sr_ratios=[8, 4, 2, 1],
drop_rate=0.0,
drop_path_rate=0.1,
**kwargs)
@manager.BACKBONES.add_component
def MixVisionTransformer_B3(**kwargs):
return MixVisionTransformer(
patch_size=4,
embed_dims=[64, 128, 320, 512],
num_heads=[1, 2, 5, 8],
mlp_ratios=[4, 4, 4, 4],
qkv_bias=True,
norm_layer=partial(nn.LayerNorm, epsilon=1e-6),
depths=[3, 4, 18, 3],
sr_ratios=[8, 4, 2, 1],
drop_rate=0.0,
drop_path_rate=0.1,
**kwargs)
@manager.BACKBONES.add_component
def MixVisionTransformer_B4(**kwargs):
return MixVisionTransformer(
patch_size=4,
embed_dims=[64, 128, 320, 512],
num_heads=[1, 2, 5, 8],
mlp_ratios=[4, 4, 4, 4],
qkv_bias=True,
norm_layer=partial(nn.LayerNorm, epsilon=1e-6),
depths=[3, 8, 27, 3],
sr_ratios=[8, 4, 2, 1],
drop_rate=0.0,
drop_path_rate=0.1,
**kwargs)
@manager.BACKBONES.add_component
def MixVisionTransformer_B5(**kwargs):
return MixVisionTransformer(
patch_size=4,
embed_dims=[64, 128, 320, 512],
num_heads=[1, 2, 5, 8],
mlp_ratios=[4, 4, 4, 4],
qkv_bias=True,
norm_layer=partial(nn.LayerNorm, epsilon=1e-6),
depths=[3, 6, 40, 3],
sr_ratios=[8, 4, 2, 1],
drop_rate=0.0,
drop_path_rate=0.1,
**kwargs)
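# Quick shape check (an illustrative sketch, assuming paddle is available):
# each MixVisionTransformer variant returns one feature map per stage at
# strides 4/8/16/32 with the configured embed_dims.
if __name__ == "__main__":
    model = MixVisionTransformer_B0()
    feats = model(paddle.randn([1, 3, 224, 224]))
    for feat in feats:
        # Expected: [1, 32, 56, 56], [1, 64, 28, 28], [1, 160, 14, 14], [1, 256, 7, 7]
        print(feat.shape)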

@@ -0,0 +1,168 @@
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.nn as nn
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg import utils
@manager.BACKBONES.add_component
class MobileNetV2(nn.Layer):
"""
The MobileNetV2 implementation based on PaddlePaddle.
The original article refers to
Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen
"MobileNetV2: Inverted Residuals and Linear Bottlenecks"
(https://arxiv.org/abs/1801.04381).
Args:
channel_ratio (float, optional): The ratio to scale channel numbers by. Default: 1.0.
min_channel (int, optional): The minimum number of channels after scaling. Default: 16.
pretrained (str, optional): The path or url of pretrained model. Default: None.
"""
def __init__(self, channel_ratio=1.0, min_channel=16, pretrained=None):
super(MobileNetV2, self).__init__()
self.channel_ratio = channel_ratio
self.min_channel = min_channel
self.pretrained = pretrained
self.stage0 = conv_bn(3, self.depth(32), 3, 2)
self.stage1 = InvertedResidual(self.depth(32), self.depth(16), 1, 1)
self.stage2 = nn.Sequential(
InvertedResidual(self.depth(16), self.depth(24), 2, 6),
InvertedResidual(self.depth(24), self.depth(24), 1, 6),
)
self.stage3 = nn.Sequential(
InvertedResidual(self.depth(24), self.depth(32), 2, 6),
InvertedResidual(self.depth(32), self.depth(32), 1, 6),
InvertedResidual(self.depth(32), self.depth(32), 1, 6),
)
self.stage4 = nn.Sequential(
InvertedResidual(self.depth(32), self.depth(64), 2, 6),
InvertedResidual(self.depth(64), self.depth(64), 1, 6),
InvertedResidual(self.depth(64), self.depth(64), 1, 6),
InvertedResidual(self.depth(64), self.depth(64), 1, 6),
)
self.stage5 = nn.Sequential(
InvertedResidual(self.depth(64), self.depth(96), 1, 6),
InvertedResidual(self.depth(96), self.depth(96), 1, 6),
InvertedResidual(self.depth(96), self.depth(96), 1, 6),
)
self.stage6 = nn.Sequential(
InvertedResidual(self.depth(96), self.depth(160), 2, 6),
InvertedResidual(self.depth(160), self.depth(160), 1, 6),
InvertedResidual(self.depth(160), self.depth(160), 1, 6),
)
self.stage7 = InvertedResidual(self.depth(160), self.depth(320), 1, 6)
self.init_weight()
def depth(self, channels):
min_channel = min(channels, self.min_channel)
return max(min_channel, int(channels * self.channel_ratio))
def forward(self, x):
feat_list = []
feature_1_2 = self.stage0(x)
feature_1_2 = self.stage1(feature_1_2)
feature_1_4 = self.stage2(feature_1_2)
feature_1_8 = self.stage3(feature_1_4)
feature_1_16 = self.stage4(feature_1_8)
feature_1_16 = self.stage5(feature_1_16)
feature_1_32 = self.stage6(feature_1_16)
feature_1_32 = self.stage7(feature_1_32)
feat_list.append(feature_1_4)
feat_list.append(feature_1_8)
feat_list.append(feature_1_16)
feat_list.append(feature_1_32)
return feat_list
def init_weight(self):
if self.pretrained is not None:
utils.load_entire_model(self, self.pretrained)
def conv_bn(inp, oup, kernel, stride):
return nn.Sequential(
nn.Conv2D(
in_channels=inp,
out_channels=oup,
kernel_size=kernel,
stride=stride,
padding=(kernel - 1) // 2,
bias_attr=False),
nn.BatchNorm2D(num_features=oup, epsilon=1e-05, momentum=0.1),
nn.ReLU())
class InvertedResidual(nn.Layer):
def __init__(self, inp, oup, stride, expand_ratio, dilation=1):
super(InvertedResidual, self).__init__()
self.stride = stride
assert stride in [1, 2]
self.use_res_connect = self.stride == 1 and inp == oup
self.conv = nn.Sequential(
nn.Conv2D(
inp,
inp * expand_ratio,
kernel_size=1,
stride=1,
padding=0,
dilation=1,
groups=1,
bias_attr=False),
nn.BatchNorm2D(
num_features=inp * expand_ratio, epsilon=1e-05, momentum=0.1),
nn.ReLU(),
nn.Conv2D(
inp * expand_ratio,
inp * expand_ratio,
kernel_size=3,
stride=stride,
padding=dilation,
dilation=dilation,
groups=inp * expand_ratio,
bias_attr=False),
nn.BatchNorm2D(
num_features=inp * expand_ratio, epsilon=1e-05, momentum=0.1),
nn.ReLU(),
nn.Conv2D(
inp * expand_ratio,
oup,
kernel_size=1,
stride=1,
padding=0,
dilation=1,
groups=1,
bias_attr=False),
nn.BatchNorm2D(num_features=oup, epsilon=1e-05, momentum=0.1),
)
def forward(self, x):
if self.use_res_connect:
return x + self.conv(x)
else:
return self.conv(x)
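# Quick shape check (an illustrative sketch, assuming paddle is available):
# the backbone returns four feature maps at strides 4, 8, 16 and 32.
if __name__ == "__main__":
    import paddle  # this module itself only imports paddle.nn
    net = MobileNetV2(channel_ratio=1.0)
    feats = net(paddle.randn([1, 3, 224, 224]))
    for feat in feats:
        # Expected: [1, 24, 56, 56], [1, 32, 28, 28], [1, 96, 14, 14], [1, 320, 7, 7]
        print(feat.shape)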

@@ -0,0 +1,364 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.utils import utils
from paddlers.models.ppseg.models import layers
__all__ = [
"MobileNetV3_small_x0_35", "MobileNetV3_small_x0_5",
"MobileNetV3_small_x0_75", "MobileNetV3_small_x1_0",
"MobileNetV3_small_x1_25", "MobileNetV3_large_x0_35",
"MobileNetV3_large_x0_5", "MobileNetV3_large_x0_75",
"MobileNetV3_large_x1_0", "MobileNetV3_large_x1_25"
]
def make_divisible(v, divisor=8, min_value=None):
if min_value is None:
min_value = divisor
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
if new_v < 0.9 * v:
new_v += divisor
return new_v
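# Worked examples (illustrative), with the default divisor of 8:
#   make_divisible(5.6)  -> 8   (raised to the minimum of one divisor)
#   make_divisible(39.2) -> 40  (rounded to the nearest multiple of 8)
#   make_divisible(91.2) -> 88  (88 >= 0.9 * 91.2, so no upward correction)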
class MobileNetV3(nn.Layer):
"""
The MobileNetV3 implementation based on PaddlePaddle.
The original article refers to
Andrew Howard, et al. "Searching for MobileNetV3"
(https://arxiv.org/pdf/1905.02244.pdf).
Args:
pretrained (str, optional): The path of pretrained model.
scale (float, optional): The scale of channels. Default: 1.0.
model_name (str, optional): Model name. It determines the type of MobileNetV3. The value is 'small' or 'large'. Default: 'small'.
output_stride (int, optional): The stride of output features compared to input images. The value should be one of (2, 4, 8, 16, 32). Default: None.
"""
def __init__(self,
pretrained=None,
scale=1.0,
model_name="small",
output_stride=None):
super(MobileNetV3, self).__init__()
inplanes = 16
if model_name == "large":
self.cfg = [
# k, exp, c, se, nl, s,
[3, 16, 16, False, "relu", 1],
[3, 64, 24, False, "relu", 2],
[3, 72, 24, False, "relu", 1], # output 1 -> out_index=2
[5, 72, 40, True, "relu", 2],
[5, 120, 40, True, "relu", 1],
[5, 120, 40, True, "relu", 1], # output 2 -> out_index=5
[3, 240, 80, False, "hard_swish", 2],
[3, 200, 80, False, "hard_swish", 1],
[3, 184, 80, False, "hard_swish", 1],
[3, 184, 80, False, "hard_swish", 1],
[3, 480, 112, True, "hard_swish", 1],
[3, 672, 112, True, "hard_swish",
1], # output 3 -> out_index=11
[5, 672, 160, True, "hard_swish", 2],
[5, 960, 160, True, "hard_swish", 1],
[5, 960, 160, True, "hard_swish",
1], # output 4 -> out_index=14
]
self.out_indices = [2, 5, 11, 14]
self.feat_channels = [
make_divisible(i * scale) for i in [24, 40, 112, 160]
]
self.cls_ch_squeeze = 960
self.cls_ch_expand = 1280
elif model_name == "small":
self.cfg = [
# k, exp, c, se, nl, s,
[3, 16, 16, True, "relu", 2], # output 1 -> out_index=0
[3, 72, 24, False, "relu", 2],
[3, 88, 24, False, "relu", 1], # output 2 -> out_index=3
[5, 96, 40, True, "hard_swish", 2],
[5, 240, 40, True, "hard_swish", 1],
[5, 240, 40, True, "hard_swish", 1],
[5, 120, 48, True, "hard_swish", 1],
[5, 144, 48, True, "hard_swish", 1], # output 3 -> out_index=7
[5, 288, 96, True, "hard_swish", 2],
[5, 576, 96, True, "hard_swish", 1],
[5, 576, 96, True, "hard_swish", 1], # output 4 -> out_index=10
]
self.out_indices = [0, 3, 7, 10]
self.feat_channels = [
make_divisible(i * scale) for i in [16, 24, 48, 96]
]
self.cls_ch_squeeze = 576
self.cls_ch_expand = 1280
else:
raise NotImplementedError(
"mode[{}_model] is not implemented!".format(model_name))
###################################################
# modify stride and dilation based on output_stride
self.dilation_cfg = [1] * len(self.cfg)
self.modify_bottle_params(output_stride=output_stride)
###################################################
self.conv1 = ConvBNLayer(
in_c=3,
out_c=make_divisible(inplanes * scale),
filter_size=3,
stride=2,
padding=1,
num_groups=1,
if_act=True,
act="hard_swish")
self.block_list = []
inplanes = make_divisible(inplanes * scale)
for i, (k, exp, c, se, nl, s) in enumerate(self.cfg):
######################################
# add dilation rate
dilation_rate = self.dilation_cfg[i]
######################################
self.block_list.append(
ResidualUnit(
in_c=inplanes,
mid_c=make_divisible(scale * exp),
out_c=make_divisible(scale * c),
filter_size=k,
stride=s,
dilation=dilation_rate,
use_se=se,
act=nl,
name="conv" + str(i + 2)))
self.add_sublayer(
sublayer=self.block_list[-1], name="conv" + str(i + 2))
inplanes = make_divisible(scale * c)
self.pretrained = pretrained
self.init_weight()
def modify_bottle_params(self, output_stride=None):
if output_stride is not None and output_stride % 2 != 0:
raise ValueError("output stride must to be even number")
if output_stride is not None:
stride = 2
rate = 1
for i, _cfg in enumerate(self.cfg):
stride = stride * _cfg[-1]
if stride > output_stride:
rate = rate * _cfg[-1]
self.cfg[i][-1] = 1
self.dilation_cfg[i] = rate
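# Worked example (illustrative): with output_stride=16 and the "large" cfg,
# the cumulative stride (including the stride-2 stem) first exceeds 16 at cfg
# index 12, so that block's stride is reset to 1 and it and the following
# blocks run with dilation 2, keeping the final feature map at 1/16 input
# resolution.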
def forward(self, inputs, label=None):
x = self.conv1(inputs)
# A feature list saves each downsampling feature.
feat_list = []
for i, block in enumerate(self.block_list):
x = block(x)
if i in self.out_indices:
feat_list.append(x)
return feat_list
def init_weight(self):
if self.pretrained is not None:
utils.load_pretrained_model(self, self.pretrained)
class ConvBNLayer(nn.Layer):
def __init__(self,
in_c,
out_c,
filter_size,
stride,
padding,
dilation=1,
num_groups=1,
if_act=True,
act=None):
super(ConvBNLayer, self).__init__()
self.if_act = if_act
self.act = act
self.conv = nn.Conv2D(
in_channels=in_c,
out_channels=out_c,
kernel_size=filter_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=num_groups,
bias_attr=False)
self.bn = layers.SyncBatchNorm(
num_features=out_c,
weight_attr=paddle.ParamAttr(
regularizer=paddle.regularizer.L2Decay(0.0)),
bias_attr=paddle.ParamAttr(
regularizer=paddle.regularizer.L2Decay(0.0)))
# Map cfg-style activation names (e.g. "hard_swish") to the names
# expected by layers.Activation (e.g. "hardswish").
self._act_op = layers.Activation(
    act=act.replace('_', '') if act is not None else None)
def forward(self, x):
x = self.conv(x)
x = self.bn(x)
if self.if_act:
x = self._act_op(x)
return x
class ResidualUnit(nn.Layer):
def __init__(self,
in_c,
mid_c,
out_c,
filter_size,
stride,
use_se,
dilation=1,
act=None,
name=''):
super(ResidualUnit, self).__init__()
self.if_shortcut = stride == 1 and in_c == out_c
self.if_se = use_se
self.expand_conv = ConvBNLayer(
in_c=in_c,
out_c=mid_c,
filter_size=1,
stride=1,
padding=0,
if_act=True,
act=act)
self.bottleneck_conv = ConvBNLayer(
in_c=mid_c,
out_c=mid_c,
filter_size=filter_size,
stride=stride,
padding='same',
dilation=dilation,
num_groups=mid_c,
if_act=True,
act=act)
if self.if_se:
self.mid_se = SEModule(mid_c, name=name + "_se")
self.linear_conv = ConvBNLayer(
in_c=mid_c,
out_c=out_c,
filter_size=1,
stride=1,
padding=0,
if_act=False,
act=None)
self.dilation = dilation
def forward(self, inputs):
x = self.expand_conv(inputs)
x = self.bottleneck_conv(x)
if self.if_se:
x = self.mid_se(x)
x = self.linear_conv(x)
if self.if_shortcut:
x = inputs + x
return x
class SEModule(nn.Layer):
def __init__(self, channel, reduction=4, name=""):
super(SEModule, self).__init__()
self.avg_pool = nn.AdaptiveAvgPool2D(1)
self.conv1 = nn.Conv2D(
in_channels=channel,
out_channels=channel // reduction,
kernel_size=1,
stride=1,
padding=0)
self.conv2 = nn.Conv2D(
in_channels=channel // reduction,
out_channels=channel,
kernel_size=1,
stride=1,
padding=0)
def forward(self, inputs):
outputs = self.avg_pool(inputs)
outputs = self.conv1(outputs)
outputs = F.relu(outputs)
outputs = self.conv2(outputs)
outputs = F.hardsigmoid(outputs)
return paddle.multiply(x=inputs, y=outputs)
def MobileNetV3_small_x0_35(**kwargs):
model = MobileNetV3(model_name="small", scale=0.35, **kwargs)
return model
def MobileNetV3_small_x0_5(**kwargs):
model = MobileNetV3(model_name="small", scale=0.5, **kwargs)
return model
def MobileNetV3_small_x0_75(**kwargs):
model = MobileNetV3(model_name="small", scale=0.75, **kwargs)
return model
@manager.BACKBONES.add_component
def MobileNetV3_small_x1_0(**kwargs):
model = MobileNetV3(model_name="small", scale=1.0, **kwargs)
return model
def MobileNetV3_small_x1_25(**kwargs):
model = MobileNetV3(model_name="small", scale=1.25, **kwargs)
return model
def MobileNetV3_large_x0_35(**kwargs):
model = MobileNetV3(model_name="large", scale=0.35, **kwargs)
return model
def MobileNetV3_large_x0_5(**kwargs):
model = MobileNetV3(model_name="large", scale=0.5, **kwargs)
return model
def MobileNetV3_large_x0_75(**kwargs):
model = MobileNetV3(model_name="large", scale=0.75, **kwargs)
return model
@manager.BACKBONES.add_component
def MobileNetV3_large_x1_0(**kwargs):
model = MobileNetV3(model_name="large", scale=1.0, **kwargs)
return model
def MobileNetV3_large_x1_25(**kwargs):
model = MobileNetV3(model_name="large", scale=1.25, **kwargs)
return model
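# Quick usage sketch (illustrative, assuming paddle is available): with
# output_stride=16, strides past 1/16 are converted to dilations, so the last
# two returned feature maps share the same spatial size.
if __name__ == "__main__":
    net = MobileNetV3_large_x1_0(output_stride=16)
    feats = net(paddle.randn([1, 3, 224, 224]))
    print(net.feat_channels)  # [24, 40, 112, 160] at scale 1.0
    for feat in feats:
        print(feat.shape)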

@@ -0,0 +1,398 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.models import layers
from paddlers.models.ppseg.utils import utils
__all__ = [
"ResNet18_vd", "ResNet34_vd", "ResNet50_vd", "ResNet101_vd", "ResNet152_vd"
]
class ConvBNLayer(nn.Layer):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
dilation=1,
groups=1,
is_vd_mode=False,
act=None,
data_format='NCHW'):
super(ConvBNLayer, self).__init__()
if dilation != 1 and kernel_size != 3:
raise RuntimeError("When the dilation isn't 1," \
"the kernel_size should be 3.")
self.is_vd_mode = is_vd_mode
self._pool2d_avg = nn.AvgPool2D(
kernel_size=2,
stride=2,
padding=0,
ceil_mode=True,
data_format=data_format)
self._conv = nn.Conv2D(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=(kernel_size - 1) // 2 \
if dilation == 1 else dilation,
dilation=dilation,
groups=groups,
bias_attr=False,
data_format=data_format)
self._batch_norm = layers.SyncBatchNorm(
out_channels, data_format=data_format)
self._act_op = layers.Activation(act=act)
def forward(self, inputs):
if self.is_vd_mode:
inputs = self._pool2d_avg(inputs)
y = self._conv(inputs)
y = self._batch_norm(y)
y = self._act_op(y)
return y
class BottleneckBlock(nn.Layer):
def __init__(self,
in_channels,
out_channels,
stride,
shortcut=True,
if_first=False,
dilation=1,
data_format='NCHW'):
super(BottleneckBlock, self).__init__()
self.data_format = data_format
self.conv0 = ConvBNLayer(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=1,
act='relu',
data_format=data_format)
self.dilation = dilation
self.conv1 = ConvBNLayer(
in_channels=out_channels,
out_channels=out_channels,
kernel_size=3,
stride=stride,
act='relu',
dilation=dilation,
data_format=data_format)
self.conv2 = ConvBNLayer(
in_channels=out_channels,
out_channels=out_channels * 4,
kernel_size=1,
act=None,
data_format=data_format)
if not shortcut:
self.short = ConvBNLayer(
in_channels=in_channels,
out_channels=out_channels * 4,
kernel_size=1,
stride=1,
is_vd_mode=False if if_first or stride == 1 else True,
data_format=data_format)
self.shortcut = shortcut
# NOTE: Use the wrap layer for quantization training
self.add = layers.Add()
self.relu = layers.Activation(act="relu")
def forward(self, inputs):
y = self.conv0(inputs)
conv1 = self.conv1(y)
conv2 = self.conv2(conv1)
if self.shortcut:
short = inputs
else:
short = self.short(inputs)
y = self.add(short, conv2)
y = self.relu(y)
return y
class BasicBlock(nn.Layer):
def __init__(self,
in_channels,
out_channels,
stride,
dilation=1,
shortcut=True,
if_first=False,
data_format='NCHW'):
super(BasicBlock, self).__init__()
self.conv0 = ConvBNLayer(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=3,
stride=stride,
dilation=dilation,
act='relu',
data_format=data_format)
self.conv1 = ConvBNLayer(
in_channels=out_channels,
out_channels=out_channels,
kernel_size=3,
dilation=dilation,
act=None,
data_format=data_format)
if not shortcut:
self.short = ConvBNLayer(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=1,
stride=1,
is_vd_mode=False if if_first or stride == 1 else True,
data_format=data_format)
self.shortcut = shortcut
self.dilation = dilation
self.data_format = data_format
self.add = layers.Add()
self.relu = layers.Activation(act="relu")
def forward(self, inputs):
y = self.conv0(inputs)
conv1 = self.conv1(y)
if self.shortcut:
short = inputs
else:
short = self.short(inputs)
y = self.add(short, conv1)
y = self.relu(y)
return y
class ResNet_vd(nn.Layer):
"""
The ResNet_vd implementation based on PaddlePaddle.
The original article refers to
Tong He, et al. "Bag of Tricks for Image Classification with Convolutional Neural Networks"
(https://arxiv.org/pdf/1812.01187.pdf).
Args:
layers (int, optional): The layers of ResNet_vd. The supported layers are (18, 34, 50, 101, 152, 200). Default: 50.
output_stride (int, optional): The stride of output features compared to input images. It is 8 or 16. Default: 8.
multi_grid (tuple|list, optional): The grid of stage4. Default: (1, 1, 1).
pretrained (str, optional): The path of pretrained model.
"""
def __init__(self,
layers=50,
output_stride=8,
multi_grid=(1, 1, 1),
pretrained=None,
data_format='NCHW'):
super(ResNet_vd, self).__init__()
self.data_format = data_format
self.conv1_logit = None # for gscnn shape stream
self.layers = layers
supported_layers = [18, 34, 50, 101, 152, 200]
assert layers in supported_layers, \
"supported layers are {} but input layer is {}".format(
supported_layers, layers)
if layers == 18:
depth = [2, 2, 2, 2]
elif layers == 34 or layers == 50:
depth = [3, 4, 6, 3]
elif layers == 101:
depth = [3, 4, 23, 3]
elif layers == 152:
depth = [3, 8, 36, 3]
elif layers == 200:
depth = [3, 12, 48, 3]
num_channels = [64, 256, 512, 1024
] if layers >= 50 else [64, 64, 128, 256]
num_filters = [64, 128, 256, 512]
# for channels of four returned stages
self.feat_channels = [c * 4 for c in num_filters
] if layers >= 50 else num_filters
dilation_dict = None
if output_stride == 8:
dilation_dict = {2: 2, 3: 4}
elif output_stride == 16:
dilation_dict = {3: 2}
self.conv1_1 = ConvBNLayer(
in_channels=3,
out_channels=32,
kernel_size=3,
stride=2,
act='relu',
data_format=data_format)
self.conv1_2 = ConvBNLayer(
in_channels=32,
out_channels=32,
kernel_size=3,
stride=1,
act='relu',
data_format=data_format)
self.conv1_3 = ConvBNLayer(
in_channels=32,
out_channels=64,
kernel_size=3,
stride=1,
act='relu',
data_format=data_format)
self.pool2d_max = nn.MaxPool2D(
kernel_size=3, stride=2, padding=1, data_format=data_format)
# self.block_list = []
self.stage_list = []
if layers >= 50:
for block in range(len(depth)):
shortcut = False
block_list = []
for i in range(depth[block]):
if layers in [101, 152] and block == 2:
if i == 0:
conv_name = "res" + str(block + 2) + "a"
else:
conv_name = "res" + str(block + 2) + "b" + str(i)
else:
conv_name = "res" + str(block + 2) + chr(97 + i)
###############################################################################
# Add dilation rate for some segmentation tasks, if dilation_dict is not None.
dilation_rate = dilation_dict[
block] if dilation_dict and block in dilation_dict else 1
# Actually block here is 'stage', and i is 'block' in 'stage'
# At stage 4, expand the dilation_rate if multi_grid is given.
if block == 3:
dilation_rate = dilation_rate * multi_grid[i]
###############################################################################
bottleneck_block = self.add_sublayer(
'bb_%d_%d' % (block, i),
BottleneckBlock(
in_channels=num_channels[block]
if i == 0 else num_filters[block] * 4,
out_channels=num_filters[block],
stride=2 if i == 0 and block != 0
and dilation_rate == 1 else 1,
shortcut=shortcut,
if_first=block == i == 0,
dilation=dilation_rate,
data_format=data_format))
block_list.append(bottleneck_block)
shortcut = True
self.stage_list.append(block_list)
else:
for block in range(len(depth)):
shortcut = False
block_list = []
for i in range(depth[block]):
dilation_rate = dilation_dict[block] \
if dilation_dict and block in dilation_dict else 1
if block == 3:
dilation_rate = dilation_rate * multi_grid[i]
basic_block = self.add_sublayer(
'bb_%d_%d' % (block, i),
BasicBlock(
in_channels=num_channels[block]
if i == 0 else num_filters[block],
out_channels=num_filters[block],
stride=2 if i == 0 and block != 0 \
and dilation_rate == 1 else 1,
dilation=dilation_rate,
shortcut=shortcut,
if_first=block == i == 0,
data_format=data_format))
block_list.append(basic_block)
shortcut = True
self.stage_list.append(block_list)
self.pretrained = pretrained
self.init_weight()
def forward(self, inputs):
y = self.conv1_1(inputs)
y = self.conv1_2(y)
y = self.conv1_3(y)
self.conv1_logit = y.clone()
y = self.pool2d_max(y)
# A feature list saves the output feature map of each stage.
feat_list = []
for stage in self.stage_list:
for block in stage:
y = block(y)
feat_list.append(y)
return feat_list
def init_weight(self):
utils.load_pretrained_model(self, self.pretrained)
@manager.BACKBONES.add_component
def ResNet18_vd(**args):
model = ResNet_vd(layers=18, **args)
return model
def ResNet34_vd(**args):
model = ResNet_vd(layers=34, **args)
return model
@manager.BACKBONES.add_component
def ResNet50_vd(**args):
model = ResNet_vd(layers=50, **args)
return model
@manager.BACKBONES.add_component
def ResNet101_vd(**args):
model = ResNet_vd(layers=101, **args)
return model
def ResNet152_vd(**args):
model = ResNet_vd(layers=152, **args)
return model
def ResNet200_vd(**args):
model = ResNet_vd(layers=200, **args)
return model
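# Quick usage sketch (illustrative, assuming paddle is available): with the
# default output_stride=8, stages 3 and 4 use dilations 2 and 4 instead of
# striding, so the last three feature maps all sit at 1/8 resolution.
if __name__ == "__main__":
    net = ResNet50_vd(output_stride=8)
    feats = net(paddle.randn([1, 3, 224, 224]))
    print(net.feat_channels)  # [256, 512, 1024, 2048]
    for feat in feats:
        print(feat.shape)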

@@ -0,0 +1,281 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle
import paddle.nn as nn
from paddlers.models.ppseg.utils import utils
from paddlers.models.ppseg.cvlibs import manager, param_init
from paddlers.models.ppseg.models.layers.layer_libs import SyncBatchNorm
__all__ = ["STDC1", "STDC2"]
class STDCNet(nn.Layer):
"""
The STDCNet implementation based on PaddlePaddle.
The original article refers to Meituan
Fan, Mingyuan, et al. "Rethinking BiSeNet For Real-time Semantic Segmentation."
(https://arxiv.org/abs/2104.13188)
Args:
base (int, optional): Base channels. Default: 64.
layers (list, optional): Numbers of STDC blocks in stage3/4/5. Default: [4, 5, 3].
block_num (int, optional): Number of sub-blocks in a STDC block. Default: 4.
type (str, optional): Feature fusion method, "cat" or "add". Default: "cat".
num_classes (int, optional): Class number for image classification. Default: 1000.
dropout (float, optional): Dropout ratio. If > 0, dropout is applied. Default: 0.20.
use_conv_last (bool, optional): Whether to use the last ConvBNReLU layer. Default: False.
pretrained (str, optional): The path of pretrained model. Default: None.
"""
def __init__(self,
base=64,
layers=[4, 5, 3],
block_num=4,
type="cat",
num_classes=1000,
dropout=0.20,
use_conv_last=False,
pretrained=None):
super(STDCNet, self).__init__()
if type == "cat":
block = CatBottleneck
elif type == "add":
block = AddBottleneck
self.use_conv_last = use_conv_last
self.features = self._make_layers(base, layers, block_num, block)
self.conv_last = ConvBNRelu(base * 16, max(1024, base * 16), 1, 1)
if layers == [4, 5, 3]:  # STDC1446 (STDC2)
self.x2 = nn.Sequential(self.features[:1])
self.x4 = nn.Sequential(self.features[1:2])
self.x8 = nn.Sequential(self.features[2:6])
self.x16 = nn.Sequential(self.features[6:11])
self.x32 = nn.Sequential(self.features[11:])
elif layers == [2, 2, 2]:  # STDC813 (STDC1)
self.x2 = nn.Sequential(self.features[:1])
self.x4 = nn.Sequential(self.features[1:2])
self.x8 = nn.Sequential(self.features[2:4])
self.x16 = nn.Sequential(self.features[4:6])
self.x32 = nn.Sequential(self.features[6:])
else:
raise NotImplementedError(
"model with layers:{} is not implemented!".format(layers))
self.pretrained = pretrained
self.init_weight()
def forward(self, x):
"""
Forward function for feature extraction.
"""
feat2 = self.x2(x)
feat4 = self.x4(feat2)
feat8 = self.x8(feat4)
feat16 = self.x16(feat8)
feat32 = self.x32(feat16)
if self.use_conv_last:
feat32 = self.conv_last(feat32)
return feat2, feat4, feat8, feat16, feat32
def _make_layers(self, base, layers, block_num, block):
features = []
features += [ConvBNRelu(3, base // 2, 3, 2)]
features += [ConvBNRelu(base // 2, base, 3, 2)]
for i, layer in enumerate(layers):
for j in range(layer):
if i == 0 and j == 0:
features.append(block(base, base * 4, block_num, 2))
elif j == 0:
features.append(
block(base * int(math.pow(2, i + 1)),
base * int(math.pow(2, i + 2)), block_num, 2))
else:
features.append(
block(base * int(math.pow(2, i + 2)),
base * int(math.pow(2, i + 2)), block_num, 1))
return nn.Sequential(*features)
def init_weight(self):
for layer in self.sublayers():
if isinstance(layer, nn.Conv2D):
param_init.normal_init(layer.weight, std=0.001)
elif isinstance(layer, (nn.BatchNorm, nn.SyncBatchNorm)):
param_init.constant_init(layer.weight, value=1.0)
param_init.constant_init(layer.bias, value=0.0)
if self.pretrained is not None:
utils.load_pretrained_model(self, self.pretrained)
class ConvBNRelu(nn.Layer):
def __init__(self, in_planes, out_planes, kernel=3, stride=1):
super(ConvBNRelu, self).__init__()
self.conv = nn.Conv2D(
in_planes,
out_planes,
kernel_size=kernel,
stride=stride,
padding=kernel // 2,
bias_attr=False)
self.bn = SyncBatchNorm(out_planes, data_format='NCHW')
self.relu = nn.ReLU()
def forward(self, x):
out = self.relu(self.bn(self.conv(x)))
return out
class AddBottleneck(nn.Layer):
def __init__(self, in_planes, out_planes, block_num=3, stride=1):
super(AddBottleneck, self).__init__()
assert block_num > 1, "block number should be larger than 1."
self.conv_list = nn.LayerList()
self.stride = stride
if stride == 2:
self.avd_layer = nn.Sequential(
nn.Conv2D(
out_planes // 2,
out_planes // 2,
kernel_size=3,
stride=2,
padding=1,
groups=out_planes // 2,
bias_attr=False),
nn.BatchNorm2D(out_planes // 2),
)
self.skip = nn.Sequential(
nn.Conv2D(
in_planes,
in_planes,
kernel_size=3,
stride=2,
padding=1,
groups=in_planes,
bias_attr=False),
nn.BatchNorm2D(in_planes),
nn.Conv2D(
in_planes, out_planes, kernel_size=1, bias_attr=False),
nn.BatchNorm2D(out_planes),
)
stride = 1
for idx in range(block_num):
if idx == 0:
self.conv_list.append(
ConvBNRelu(in_planes, out_planes // 2, kernel=1))
elif idx == 1 and block_num == 2:
self.conv_list.append(
ConvBNRelu(out_planes // 2, out_planes // 2, stride=stride))
elif idx == 1 and block_num > 2:
self.conv_list.append(
ConvBNRelu(out_planes // 2, out_planes // 4, stride=stride))
elif idx < block_num - 1:
self.conv_list.append(
ConvBNRelu(out_planes // int(math.pow(2, idx)),
out_planes // int(math.pow(2, idx + 1))))
else:
self.conv_list.append(
ConvBNRelu(out_planes // int(math.pow(2, idx)),
out_planes // int(math.pow(2, idx))))
def forward(self, x):
out_list = []
out = x
for idx, conv in enumerate(self.conv_list):
if idx == 0 and self.stride == 2:
out = self.avd_layer(conv(out))
else:
out = conv(out)
out_list.append(out)
if self.stride == 2:
x = self.skip(x)
return paddle.concat(out_list, axis=1) + x
class CatBottleneck(nn.Layer):
def __init__(self, in_planes, out_planes, block_num=3, stride=1):
super(CatBottleneck, self).__init__()
assert block_num > 1, "block number should be larger than 1."
self.conv_list = nn.LayerList()
self.stride = stride
if stride == 2:
self.avd_layer = nn.Sequential(
nn.Conv2D(
out_planes // 2,
out_planes // 2,
kernel_size=3,
stride=2,
padding=1,
groups=out_planes // 2,
bias_attr=False),
nn.BatchNorm2D(out_planes // 2),
)
self.skip = nn.AvgPool2D(kernel_size=3, stride=2, padding=1)
stride = 1
for idx in range(block_num):
if idx == 0:
self.conv_list.append(
ConvBNRelu(in_planes, out_planes // 2, kernel=1))
elif idx == 1 and block_num == 2:
self.conv_list.append(
ConvBNRelu(out_planes // 2, out_planes // 2, stride=stride))
elif idx == 1 and block_num > 2:
self.conv_list.append(
ConvBNRelu(out_planes // 2, out_planes // 4, stride=stride))
elif idx < block_num - 1:
self.conv_list.append(
ConvBNRelu(out_planes // int(math.pow(2, idx)),
out_planes // int(math.pow(2, idx + 1))))
else:
self.conv_list.append(
ConvBNRelu(out_planes // int(math.pow(2, idx)),
out_planes // int(math.pow(2, idx))))
def forward(self, x):
out_list = []
out1 = self.conv_list[0](x)
for idx, conv in enumerate(self.conv_list[1:]):
if idx == 0:
if self.stride == 2:
out = conv(self.avd_layer(out1))
else:
out = conv(out1)
else:
out = conv(out)
out_list.append(out)
if self.stride == 2:
out1 = self.skip(out1)
out_list.insert(0, out1)
out = paddle.concat(out_list, axis=1)
return out
@manager.BACKBONES.add_component
def STDC2(**kwargs):
model = STDCNet(base=64, layers=[4, 5, 3], **kwargs)
return model
@manager.BACKBONES.add_component
def STDC1(**kwargs):
model = STDCNet(base=64, layers=[2, 2, 2], **kwargs)
return model
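# Quick usage sketch (illustrative, assuming paddle is available): STDCNet
# returns five feature maps at strides 2, 4, 8, 16 and 32; STDC1 stacks
# [2, 2, 2] blocks per stage while STDC2 stacks [4, 5, 3].
if __name__ == "__main__":
    net = STDC1()
    for feat in net(paddle.randn([1, 3, 224, 224])):
        print(feat.shape)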

@@ -0,0 +1,792 @@
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
import numpy as np
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.utils import utils
from paddlers.models.ppseg.models.backbones.transformer_utils import *
class Mlp(nn.Layer):
""" Multilayer perceptron."""
def __init__(self,
in_features,
hidden_features=None,
out_features=None,
act_layer=nn.GELU,
drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout(drop)
def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.drop(x)
x = self.fc2(x)
x = self.drop(x)
return x
def window_partition(x, window_size):
"""
Args:
x: (B, H, W, C)
window_size (int): window size
Returns:
windows: (num_windows*B, window_size, window_size, C)
"""
B, H, W, C = x.shape
x = x.reshape(
[B, H // window_size, window_size, W // window_size, window_size, C])
windows = x.transpose([0, 1, 3, 2, 4,
5]).reshape([-1, window_size, window_size, C])
return windows
def window_reverse(windows, window_size, H, W):
"""
Args:
windows: (num_windows*B, window_size, window_size, C)
window_size (int): Window size
H (int): Height of image
W (int): Width of image
Returns:
x: (B, H, W, C)
"""
B = int(windows.shape[0] / (H * W / window_size / window_size))
x = windows.reshape(
[B, H // window_size, W // window_size, window_size, window_size, -1])
x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([B, H, W, -1])
return x
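# Shape note (illustrative): window_partition and window_reverse are inverses.
# For x of shape (B, H, W, C) and window_size M (with H and W divisible by M),
# partition yields (B * H/M * W/M, M, M, C), and window_reverse with the same
# M, H, W restores (B, H, W, C).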
class WindowAttention(nn.Layer):
"""
Window based multi-head self attention (W-MSA) module with relative position bias.
It supports both of shifted and non-shifted window.
Args:
dim (int): Number of input channels.
window_size (tuple[int]): The height and width of the window.
num_heads (int): Number of attention heads.
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
proj_drop (float, optional): Dropout ratio of output. Default: 0.0
"""
def __init__(self,
dim,
window_size,
num_heads,
qkv_bias=True,
qk_scale=None,
attn_drop=0.,
proj_drop=0.):
super().__init__()
self.dim = dim
self.window_size = window_size # Wh, Ww
self.num_heads = num_heads
head_dim = dim // num_heads
self.scale = qk_scale or head_dim**-0.5
# define a parameter table of relative position bias
self.relative_position_bias_table = self.create_parameter(
shape=((2 * window_size[0] - 1) * (2 * window_size[1] - 1),
num_heads),
default_initializer=zeros_)
self.add_parameter("relative_position_bias_table",
self.relative_position_bias_table)
# get pair-wise relative position index for each token inside the window
coords_h = paddle.arange(self.window_size[0])
coords_w = paddle.arange(self.window_size[1])
coords = paddle.stack(paddle.meshgrid([coords_h,
coords_w])) # 2, Wh, Ww
coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww
coords_flatten_1 = coords_flatten.unsqueeze(axis=2)
coords_flatten_2 = coords_flatten.unsqueeze(axis=1)
relative_coords = coords_flatten_1 - coords_flatten_2
relative_coords = relative_coords.transpose([1, 2, 0])
relative_coords[:, :,
0] += self.window_size[0] - 1 # shift to start from 0
relative_coords[:, :, 1] += self.window_size[1] - 1
relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
self.register_buffer("relative_position_index", relative_position_index)
self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)
trunc_normal_(self.relative_position_bias_table)
self.softmax = nn.Softmax(axis=-1)
def forward(self, x, mask=None):
"""
Args:
x: input features with shape of (num_windows*B, N, C)
mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
"""
B_, N, C = x.shape
qkv = self.qkv(x).reshape(
[B_, N, 3, self.num_heads,
C // self.num_heads]).transpose([2, 0, 3, 1, 4])
q, k, v = qkv[0], qkv[1], qkv[2]
q = q * self.scale
attn = paddle.mm(q, k.transpose([0, 1, 3, 2]))
index = self.relative_position_index.reshape([-1])
relative_position_bias = paddle.index_select(
self.relative_position_bias_table, index)
relative_position_bias = relative_position_bias.reshape([
self.window_size[0] * self.window_size[1],
self.window_size[0] * self.window_size[1], -1
]) # Wh*Ww,Wh*Ww,nH
relative_position_bias = relative_position_bias.transpose(
[2, 0, 1]) # nH, Wh*Ww, Wh*Ww
attn = attn + relative_position_bias.unsqueeze(0)
if mask is not None:
nW = mask.shape[0]
attn = attn.reshape([B_ // nW, nW, self.num_heads, N, N
]) + mask.unsqueeze(1).unsqueeze(0)
attn = attn.reshape([-1, self.num_heads, N, N])
attn = self.softmax(attn)
else:
attn = self.softmax(attn)
attn = self.attn_drop(attn)
x = paddle.mm(attn, v).transpose([0, 2, 1, 3]).reshape([B_, N, C])
x = self.proj(x)
x = self.proj_drop(x)
return x
class SwinTransformerBlock(nn.Layer):
"""
Swin Transformer Block.
Args:
dim (int): Number of input channels.
num_heads (int): Number of attention heads.
window_size (int): Window size.
shift_size (int): Shift size for SW-MSA.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
drop (float, optional): Dropout rate. Default: 0.0
attn_drop (float, optional): Attention dropout rate. Default: 0.0
drop_path (float, optional): Stochastic depth rate. Default: 0.0
act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU
norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm
"""
def __init__(self,
dim,
num_heads,
window_size=7,
shift_size=0,
mlp_ratio=4.,
qkv_bias=True,
qk_scale=None,
drop=0.,
attn_drop=0.,
drop_path=0.,
act_layer=nn.GELU,
norm_layer=nn.LayerNorm):
super().__init__()
self.dim = dim
self.num_heads = num_heads
self.window_size = window_size
self.shift_size = shift_size
self.mlp_ratio = mlp_ratio
assert 0 <= self.shift_size < self.window_size, "shift_size must be in [0, window_size)"
self.norm1 = norm_layer(dim)
self.attn = WindowAttention(
dim,
window_size=to_2tuple(self.window_size),
num_heads=num_heads,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
attn_drop=attn_drop,
proj_drop=drop)
self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
self.norm2 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(
in_features=dim,
hidden_features=mlp_hidden_dim,
act_layer=act_layer,
drop=drop)
self.H = None
self.W = None
def forward(self, x, mask_matrix):
"""
Args:
x: Input feature, tensor size (B, H*W, C).
H, W: Spatial resolution of the input feature.
mask_matrix: Attention mask for cyclic shift.
"""
B, L, C = x.shape
H, W = self.H, self.W
assert L == H * W, "input feature has wrong size"
shortcut = x
x = self.norm1(x)
x = x.reshape([B, H, W, C])
# pad feature maps to multiples of window size
pad_l = pad_t = 0
pad_r = (self.window_size - W % self.window_size) % self.window_size
pad_b = (self.window_size - H % self.window_size) % self.window_size
x = x.transpose([0, 3, 1, 2])
x = F.pad(x, [pad_l, pad_r, pad_t, pad_b])
x = x.transpose([0, 2, 3, 1])
_, Hp, Wp, _ = x.shape
# cyclic shift
if self.shift_size > 0:
shifted_x = paddle.roll(
x, shifts=(-self.shift_size, -self.shift_size), axis=(1, 2))
attn_mask = mask_matrix
else:
shifted_x = x
attn_mask = None
# partition windows
x_windows = window_partition(
shifted_x, self.window_size) # nW*B, window_size, window_size, C
x_windows = x_windows.reshape(
[-1, self.window_size * self.window_size,
C]) # nW*B, window_size*window_size, C
# W-MSA/SW-MSA
attn_windows = self.attn(
x_windows, mask=attn_mask) # nW*B, window_size*window_size, C
# merge windows
attn_windows = attn_windows.reshape(
[-1, self.window_size, self.window_size, C])
shifted_x = window_reverse(attn_windows, self.window_size, Hp,
Wp) # B H' W' C
# reverse cyclic shift
if self.shift_size > 0:
x = paddle.roll(
shifted_x,
shifts=(self.shift_size, self.shift_size),
axis=(1, 2))
else:
x = shifted_x
if pad_r > 0 or pad_b > 0:
x = x[:, :H, :W, :]
x = x.reshape([B, H * W, C])
# FFN
x = shortcut + self.drop_path(x)
x = x + self.drop_path(self.mlp(self.norm2(x)))
return x
class PatchMerging(nn.Layer):
"""
Patch Merging Layer
Args:
dim (int): Number of input channels.
norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm
"""
def __init__(self, dim, norm_layer=nn.LayerNorm):
super().__init__()
self.dim = dim
self.reduction = nn.Linear(4 * dim, 2 * dim, bias_attr=False)
self.norm = norm_layer(4 * dim)
def forward(self, x, H, W):
"""
Args:
x: Input feature, tensor size (B, H*W, C).
H, W: Spatial resolution of the input feature.
"""
B, L, C = x.shape
assert L == H * W, "input feature has wrong size"
x = x.reshape([B, H, W, C])
# padding
pad_input = (H % 2 == 1) or (W % 2 == 1)
if pad_input:
x = x.transpose([0, 3, 1, 2])
x = F.pad(x, [0, W % 2, 0, H % 2])
x = x.transpose([0, 2, 3, 1])
x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C
x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C
x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C
x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C
x = paddle.concat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C
x = x.reshape([B, -1, 4 * C]) # B H/2*W/2 4*C
x = self.norm(x)
x = self.reduction(x)
return x
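# Shape note (illustrative): PatchMerging gathers each 2x2 neighborhood of
# tokens, concatenates it to 4*C channels, then projects to 2*C, turning
# (B, H*W, C) into (B, (H/2)*(W/2), 2*C); odd H or W is zero-padded first.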
class BasicLayer(nn.Layer):
"""
A basic Swin Transformer layer for one stage.
Args:
dim (int): Number of feature channels.
depth (int): Depths of this stage.
num_heads (int): Number of attention head.
window_size (int): Local window size. Default: 7.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
drop (float, optional): Dropout rate. Default: 0.0
attn_drop (float, optional): Attention dropout rate. Default: 0.0
drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm
downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None
"""
def __init__(self,
dim,
depth,
num_heads,
window_size=7,
mlp_ratio=4.,
qkv_bias=True,
qk_scale=None,
drop=0.,
attn_drop=0.,
drop_path=0.,
norm_layer=nn.LayerNorm,
downsample=None):
super().__init__()
self.window_size = window_size
self.shift_size = window_size // 2
self.depth = depth
# build blocks
self.blocks = nn.LayerList([
SwinTransformerBlock(
dim=dim,
num_heads=num_heads,
window_size=window_size,
shift_size=0 if (i % 2 == 0) else window_size // 2,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
drop=drop,
attn_drop=attn_drop,
drop_path=drop_path[i]
if isinstance(drop_path, list) else drop_path,
norm_layer=norm_layer) for i in range(depth)
])
# patch merging layer
if downsample is not None:
self.downsample = downsample(dim=dim, norm_layer=norm_layer)
else:
self.downsample = None
def forward(self, x, H, W):
"""
Args:
x: Input feature, tensor size (B, H*W, C).
H, W: Spatial resolution of the input feature.
"""
# calculate attention mask for SW-MSA
Hp = int(np.ceil(H / self.window_size)) * self.window_size
Wp = int(np.ceil(W / self.window_size)) * self.window_size
img_mask = paddle.zeros((1, Hp, Wp, 1)) # 1 Hp Wp 1
h_slices = (slice(0, -self.window_size),
slice(-self.window_size, -self.shift_size),
slice(-self.shift_size, None))
w_slices = (slice(0, -self.window_size),
slice(-self.window_size, -self.shift_size),
slice(-self.shift_size, None))
cnt = 0
for h in h_slices:
for w in w_slices:
img_mask[:, h, w, :] = cnt
cnt += 1
mask_windows = window_partition(
img_mask, self.window_size) # nW, window_size, window_size, 1
mask_windows = mask_windows.reshape(
[-1, self.window_size * self.window_size])
attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
huns = -100.0 * paddle.ones_like(attn_mask)
attn_mask = huns * (attn_mask != 0).astype("float32")
for blk in self.blocks:
blk.H, blk.W = H, W
x = blk(x, attn_mask)
if self.downsample is not None:
x_down = self.downsample(x, H, W)
Wh, Ww = (H + 1) // 2, (W + 1) // 2
return x, H, W, x_down, Wh, Ww
else:
return x, H, W, x, H, W
class PatchEmbed(nn.Layer):
"""
Image to Patch Embedding.
Args:
patch_size (int): Patch token size. Default: 4.
in_chans (int): Number of input image channels. Default: 3.
embed_dim (int): Number of linear projection output channels. Default: 96.
norm_layer (nn.Layer, optional): Normalization layer. Default: None
"""
def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
super().__init__()
patch_size = to_2tuple(patch_size)
self.patch_size = patch_size
self.in_chans = in_chans
self.embed_dim = embed_dim
self.proj = nn.Conv2D(
in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
if norm_layer is not None:
self.norm = norm_layer(embed_dim)
else:
self.norm = None
def forward(self, x):
"""Forward function."""
# padding
_, _, H, W = x.shape
if W % self.patch_size[1] != 0:
x = F.pad(x, [0, self.patch_size[1] - W % self.patch_size[1], 0, 0])
if H % self.patch_size[0] != 0:
x = F.pad(x, [0, 0, 0, self.patch_size[0] - H % self.patch_size[0]])
x = self.proj(x) # B C Wh Ww
if self.norm is not None:
_, _, Wh, Ww = x.shape
x = x.flatten(2).transpose([0, 2, 1])
x = self.norm(x)
x = x.transpose([0, 2, 1]).reshape([-1, self.embed_dim, Wh, Ww])
return x
@manager.BACKBONES.add_component
class SwinTransformer(nn.Layer):
"""
The SwinTransformer implementation based on PaddlePaddle.
The original article refers to
Liu, Ze, et al. "Swin Transformer: Hierarchical Vision Transformer using Shifted Windows"
(https://arxiv.org/abs/2103.14030)
Args:
pretrain_img_size (int): Input image size for training the pretrained model, used in absolute position embedding. Default: 224.
patch_size (int | tuple(int)): Patch size. Default: 4.
in_chans (int): Number of input image channels. Default: 3.
embed_dim (int): Number of linear projection output channels. Default: 96.
depths (tuple[int]): Depths of each Swin Transformer stage.
num_heads (tuple[int]): Number of attention head of each stage.
window_size (int): Window size. Default: 7.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
drop_rate (float): Dropout rate.
attn_drop_rate (float): Attention dropout rate. Default: 0.
drop_path_rate (float): Stochastic depth rate. Default: 0.2.
norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm.
ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
patch_norm (bool): If True, add normalization after patch embedding. Default: True.
out_indices (Sequence[int]): Output from which stages.
frozen_stages (int): Stages to be frozen (stop grad and set eval mode). -1 means not freezing any parameters. Default: -1.
pretrained (str, optional): The path or url of pretrained model. Default: None.
"""
def __init__(self,
pretrain_img_size=224,
patch_size=4,
in_chans=3,
embed_dim=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=7,
mlp_ratio=4.,
qkv_bias=True,
qk_scale=None,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.2,
norm_layer=nn.LayerNorm,
ape=False,
patch_norm=True,
out_indices=(0, 1, 2, 3),
frozen_stages=-1,
pretrained=None):
super().__init__()
self.pretrain_img_size = pretrain_img_size
self.num_layers = len(depths)
self.embed_dim = embed_dim
self.ape = ape
self.patch_norm = patch_norm
self.out_indices = out_indices
self.frozen_stages = frozen_stages
# split image into non-overlapping patches
self.patch_embed = PatchEmbed(
patch_size=patch_size,
in_chans=in_chans,
embed_dim=embed_dim,
norm_layer=norm_layer if self.patch_norm else None)
# absolute position embedding
if self.ape:
pretrain_img_size = to_2tuple(pretrain_img_size)
patch_size = to_2tuple(patch_size)
patches_resolution = [
pretrain_img_size[0] // patch_size[0],
pretrain_img_size[1] // patch_size[1]
]
self.absolute_pos_embed = self.create_parameter(
shape=(1, embed_dim, patches_resolution[0],
patches_resolution[1]),
default_initializer=zeros_)
self.add_parameter("absolute_pos_embed", self.absolute_pos_embed)
trunc_normal_(self.absolute_pos_embed)
self.pos_drop = nn.Dropout(p=drop_rate)
# stochastic depth
dpr = np.linspace(0, drop_path_rate, sum(depths)).tolist()
# build layers
self.layers = nn.LayerList()
for i_layer in range(self.num_layers):
layer = BasicLayer(
dim=int(embed_dim * 2**i_layer),
depth=depths[i_layer],
num_heads=num_heads[i_layer],
window_size=window_size,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
drop=drop_rate,
attn_drop=attn_drop_rate,
drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
norm_layer=norm_layer,
downsample=PatchMerging if
(i_layer < self.num_layers - 1) else None)
self.layers.append(layer)
feat_channels = [int(embed_dim * 2**i) for i in range(self.num_layers)]
self.feat_channels = feat_channels
# add a norm layer for each output
for i_layer in out_indices:
layer = norm_layer(feat_channels[i_layer])
layer_name = f'norm{i_layer}'
self.add_sublayer(layer_name, layer)
self._freeze_stages()
self.pretrained = pretrained
self.init_weights(self.pretrained)
def _freeze_stages(self):
if self.frozen_stages >= 0:
self.patch_embed.eval()
for param in self.patch_embed.parameters():
param.requires_grad = False
if self.frozen_stages >= 1 and self.ape:
self.absolute_pos_embed.requires_grad = False
if self.frozen_stages >= 2:
self.pos_drop.eval()
for i in range(0, self.frozen_stages - 1):
layer = self.layers[i]
layer.eval()
for param in layer.parameters():
param.requires_grad = False
def init_weights(self, pretrained=None):
"""Initialize the weights in backbone.
Args:
pretrained (str, optional): Path to pre-trained weights.
Defaults to None.
"""
if pretrained is not None:
utils.load_pretrained_model(self, self.pretrained)
else:
for sublayer in self.sublayers():
if isinstance(sublayer, nn.Linear):
trunc_normal_(sublayer.weight)
if isinstance(sublayer,
nn.Linear) and sublayer.bias is not None:
zeros_(sublayer.bias)
elif isinstance(sublayer, nn.LayerNorm):
zeros_(sublayer.bias)
ones_(sublayer.weight)
def forward(self, x):
"""Forward function."""
x = self.patch_embed(x)
_, _, Wh, Ww = x.shape
if self.ape:
# interpolate the position embedding to the corresponding size
absolute_pos_embed = F.interpolate(
self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic')
x = (x + absolute_pos_embed).flatten(2).transpose(
    [0, 2, 1])  # B Wh*Ww C
else:
x = x.flatten(2).transpose([0, 2, 1])
x = self.pos_drop(x)
outs = []
for i in range(self.num_layers):
layer = self.layers[i]
x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
if i in self.out_indices:
norm_layer = getattr(self, f'norm{i}')
x_out = norm_layer(x_out)
out = x_out.reshape(
[-1, H, W, self.feat_channels[i]]).transpose([0, 3, 1, 2])
outs.append(out)
return tuple(outs)
def train(self):
"""Convert the model into training mode while keep layers freezed."""
super(SwinTransformer, self).train()
self._freeze_stages()
@manager.BACKBONES.add_component
def SwinTransformer_tiny_patch4_window7_224(**kwargs):
model = SwinTransformer(
pretrain_img_size=224,
embed_dim=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=7,
**kwargs)
return model
@manager.BACKBONES.add_component
def SwinTransformer_small_patch4_window7_224(**kwargs):
model = SwinTransformer(
pretrain_img_size=224,
embed_dim=96,
depths=[2, 2, 18, 2],
num_heads=[3, 6, 12, 24],
window_size=7,
**kwargs)
return model
@manager.BACKBONES.add_component
def SwinTransformer_base_patch4_window7_224(**kwargs):
model = SwinTransformer(
pretrain_img_size=224,
embed_dim=128,
depths=[2, 2, 18, 2],
num_heads=[4, 8, 16, 32],
window_size=7,
**kwargs)
return model
@manager.BACKBONES.add_component
def SwinTransformer_base_patch4_window12_384(**kwargs):
model = SwinTransformer(
pretrain_img_size=384,
embed_dim=128,
depths=[2, 2, 18, 2],
num_heads=[4, 8, 16, 32],
window_size=12,
**kwargs)
return model
@manager.BACKBONES.add_component
def SwinTransformer_large_patch4_window7_224(**kwargs):
model = SwinTransformer(
pretrain_img_size=224,
embed_dim=192,
depths=[2, 2, 18, 2],
num_heads=[6, 12, 24, 48],
window_size=7,
**kwargs)
return model
@manager.BACKBONES.add_component
def SwinTransformer_large_patch4_window12_384(**kwargs):
model = SwinTransformer(
pretrain_img_size=384,
embed_dim=192,
depths=[2, 2, 18, 2],
num_heads=[6, 12, 24, 48],
window_size=12,
**kwargs)
return model
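
# Usage sketch (editor's addition, not in the upstream file): build one of the
# registered variants and sanity-check the per-stage widths, which __init__
# derives as int(embed_dim * 2**i). Runs only when executed directly.
if __name__ == '__main__':
    import paddle
    model = SwinTransformer_tiny_patch4_window7_224()
    assert model.feat_channels == [96, 192, 384, 768]  # embed_dim=96, 4 stages
    feats = model(paddle.randn([1, 3, 224, 224]))
    # tuple of 4 feature maps at strides 4, 8, 16 and 32
    print([f.shape for f in feats])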

@@ -0,0 +1,83 @@
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.initializer as paddle_init
__all__ = [
'to_2tuple', 'DropPath', 'Identity', 'trunc_normal_', 'zeros_', 'ones_',
'init_weights'
]
def to_2tuple(x):
return tuple([x] * 2)
def drop_path(x, drop_prob=0., training=False):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ...
"""
if drop_prob == 0. or not training:
return x
keep_prob = paddle.to_tensor(1 - drop_prob)
shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1)
random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype)
random_tensor = paddle.floor(random_tensor) # binarize
output = x.divide(keep_prob) * random_tensor
return output
class DropPath(nn.Layer):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
"""
def __init__(self, drop_prob=None):
super(DropPath, self).__init__()
self.drop_prob = drop_prob
def forward(self, x):
return drop_path(x, self.drop_prob, self.training)
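
# Example (editor's addition): with drop_prob=0.25 and training=True, roughly a
# quarter of the samples in a batch have this residual branch zeroed, and the
# survivors are rescaled by 1/keep_prob so the expected activation is unchanged:
#
#     x = paddle.ones([4, 8])
#     y = drop_path(x, drop_prob=0.25, training=True)
#     # each row of y is either all 0.0 or all 1/0.75 = 1.333...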
class Identity(nn.Layer):
def __init__(self):
super(Identity, self).__init__()
def forward(self, input):
return input
trunc_normal_ = paddle_init.TruncatedNormal(std=.02)
zeros_ = paddle_init.Constant(value=0.)
ones_ = paddle_init.Constant(value=1.)
def init_weights(layer):
"""
Init the weights of transformer.
Args:
layer(nn.Layer): The layer to init weights.
Returns:
None
"""
if isinstance(layer, nn.Linear):
trunc_normal_(layer.weight)
if layer.bias is not None:
zeros_(layer.bias)
elif isinstance(layer, nn.LayerNorm):
zeros_(layer.bias)
ones_(layer.weight)
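
# Usage sketch (editor's addition): `init_weights` is designed to be applied
# recursively via `paddle.nn.Layer.apply`, e.g. from a model's `init_weight`
# method. A minimal self-contained example:
if __name__ == '__main__':
    model = nn.Sequential(nn.Linear(8, 16), nn.LayerNorm(16), nn.Linear(16, 4))
    model.apply(init_weights)
    for sub in model.sublayers():
        if isinstance(sub, nn.LayerNorm):
            # ones_ set the LayerNorm scale to 1.0, zeros_ set the bias to 0.0
            print(float(sub.weight.mean()), float(sub.bias.mean()))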

@@ -0,0 +1,410 @@
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import math
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
import numpy as np
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.utils import utils, logger
from paddlers.models.ppseg.models.backbones.transformer_utils import to_2tuple, DropPath, Identity
class Mlp(nn.Layer):
def __init__(self,
in_features,
hidden_features=None,
out_features=None,
act_layer=nn.GELU,
drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout(drop)
def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.drop(x)
x = self.fc2(x)
x = self.drop(x)
return x
class Attention(nn.Layer):
def __init__(self,
dim,
num_heads=8,
qkv_bias=False,
qk_scale=None,
attn_drop=0.,
proj_drop=0.):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
self.scale = qk_scale or head_dim**-0.5
self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)
def forward(self, x):
x_shape = paddle.shape(x)
N, C = x_shape[1], x_shape[2]
qkv = self.qkv(x).reshape((-1, N, 3, self.num_heads,
C // self.num_heads)).transpose((2, 0, 3, 1,
4))
q, k, v = qkv[0], qkv[1], qkv[2]
attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale
attn = nn.functional.softmax(attn, axis=-1)
attn = self.attn_drop(attn)
x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C))
x = self.proj(x)
x = self.proj_drop(x)
return x
class Block(nn.Layer):
def __init__(self,
dim,
num_heads,
mlp_ratio=4.,
qkv_bias=False,
qk_scale=None,
drop=0.,
attn_drop=0.,
drop_path=0.,
act_layer=nn.GELU,
norm_layer='nn.LayerNorm',
epsilon=1e-5):
super().__init__()
self.norm1 = eval(norm_layer)(dim, epsilon=epsilon)
self.attn = Attention(
dim,
num_heads=num_heads,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
attn_drop=attn_drop,
proj_drop=drop)
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
self.norm2 = eval(norm_layer)(dim, epsilon=epsilon)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(
in_features=dim,
hidden_features=mlp_hidden_dim,
act_layer=act_layer,
drop=drop)
def forward(self, x):
x = x + self.drop_path(self.attn(self.norm1(x)))
x = x + self.drop_path(self.mlp(self.norm2(x)))
return x
class PatchEmbed(nn.Layer):
""" Image to Patch Embedding
"""
def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
super().__init__()
self.img_size = to_2tuple(img_size)
self.patch_size = to_2tuple(patch_size)
self.proj = nn.Conv2D(
in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
@property
def num_patches_in_h(self):
return self.img_size[1] // self.patch_size[1]
@property
def num_patches_in_w(self):
return self.img_size[0] // self.patch_size[0]
def forward(self, x):
x = self.proj(x)
return x
@manager.BACKBONES.add_component
class VisionTransformer(nn.Layer):
""" Vision Transformer with support for patch input
"""
def __init__(self,
img_size=224,
patch_size=16,
in_chans=3,
embed_dim=768,
depth=12,
num_heads=12,
mlp_ratio=4,
qkv_bias=False,
qk_scale=None,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.,
norm_layer='nn.LayerNorm',
epsilon=1e-5,
final_norm=False,
pretrained=None,
**args):
super().__init__()
self.img_size = img_size
self.embed_dim = embed_dim
self.patch_embed = PatchEmbed(
img_size=img_size,
patch_size=patch_size,
in_chans=in_chans,
embed_dim=embed_dim)
self.pos_w = self.patch_embed.num_patches_in_w
self.pos_h = self.patch_embed.num_patches_in_h
self.pos_embed = self.create_parameter(
shape=(1, self.pos_w * self.pos_h + 1, embed_dim),
default_initializer=paddle.nn.initializer.TruncatedNormal(std=.02))
self.cls_token = self.create_parameter(
shape=(1, 1, embed_dim),
default_initializer=paddle.nn.initializer.Constant(value=0.))
self.pos_drop = nn.Dropout(p=drop_rate)
dpr = np.linspace(0, drop_path_rate, depth)
self.blocks = nn.LayerList([
Block(
dim=embed_dim,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
drop=drop_rate,
attn_drop=attn_drop_rate,
drop_path=dpr[i],
norm_layer=norm_layer,
epsilon=epsilon) for i in range(depth)
])
self.final_norm = final_norm
if self.final_norm:
self.norm = eval(norm_layer)(embed_dim, epsilon=epsilon)
self.pretrained = pretrained
self.init_weight()
    def init_weight(self):
        if self.pretrained is None:
            return
        utils.load_pretrained_model(self, self.pretrained)
# load and resize pos_embed
model_path = self.pretrained
if not os.path.exists(model_path):
model_path = utils.download_pretrained_model(model_path)
load_state_dict = paddle.load(model_path)
model_state_dict = self.state_dict()
pos_embed_name = "pos_embed"
if pos_embed_name in load_state_dict.keys():
load_pos_embed = paddle.to_tensor(
load_state_dict[pos_embed_name], dtype="float32")
if self.pos_embed.shape != load_pos_embed.shape:
pos_size = int(math.sqrt(load_pos_embed.shape[1] - 1))
model_state_dict[pos_embed_name] = self.resize_pos_embed(
load_pos_embed, (pos_size, pos_size),
(self.pos_h, self.pos_w))
self.set_dict(model_state_dict)
logger.info(
"Load pos_embed and resize it from {} to {} .".format(
load_pos_embed.shape, self.pos_embed.shape))
def resize_pos_embed(self, pos_embed, old_hw, new_hw):
"""
Resize pos_embed weight.
Args:
pos_embed (Tensor): the pos_embed weight
old_hw (list[int]): the height and width of old pos_embed
new_hw (list[int]): the height and width of new pos_embed
Returns:
Tensor: the resized pos_embed weight
"""
cls_pos_embed = pos_embed[:, :1, :]
pos_embed = pos_embed[:, 1:, :]
pos_embed = pos_embed.transpose([0, 2, 1])
pos_embed = pos_embed.reshape([1, -1, old_hw[0], old_hw[1]])
pos_embed = F.interpolate(
pos_embed, new_hw, mode='bicubic', align_corners=False)
pos_embed = pos_embed.flatten(2).transpose([0, 2, 1])
pos_embed = paddle.concat([cls_pos_embed, pos_embed], axis=1)
return pos_embed
def forward(self, x):
x = self.patch_embed(x)
x_shape = paddle.shape(x) # b * c * h * w
cls_tokens = self.cls_token.expand((x_shape[0], -1, -1))
x = x.flatten(2).transpose([0, 2, 1]) # b * hw * c
x = paddle.concat([cls_tokens, x], axis=1)
if paddle.shape(x)[1] == self.pos_embed.shape[1]:
x = x + self.pos_embed
else:
x = x + self.resize_pos_embed(self.pos_embed,
(self.pos_h, self.pos_w), x_shape[2:])
x = self.pos_drop(x)
res = []
for idx, blk in enumerate(self.blocks):
x = blk(x)
if self.final_norm and idx == len(self.blocks) - 1:
x = self.norm(x)
res.append(x[:, 1:, :])
return res, x_shape
@manager.BACKBONES.add_component
def ViT_small_patch16_224(**kwargs):
model = VisionTransformer(
patch_size=16,
embed_dim=768,
depth=8,
num_heads=8,
mlp_ratio=3,
qk_scale=768**-0.5,
**kwargs)
return model
@manager.BACKBONES.add_component
def ViT_base_patch16_224(**kwargs):
model = VisionTransformer(
patch_size=16,
embed_dim=768,
depth=12,
num_heads=12,
mlp_ratio=4,
qkv_bias=True,
epsilon=1e-6,
**kwargs)
return model
@manager.BACKBONES.add_component
def ViT_base_patch16_384(**kwargs):
model = VisionTransformer(
img_size=384,
patch_size=16,
embed_dim=768,
depth=12,
num_heads=12,
mlp_ratio=4,
qkv_bias=True,
epsilon=1e-6,
**kwargs)
return model
@manager.BACKBONES.add_component
def ViT_base_patch32_384(**kwargs):
model = VisionTransformer(
img_size=384,
patch_size=32,
embed_dim=768,
depth=12,
num_heads=12,
mlp_ratio=4,
qkv_bias=True,
epsilon=1e-6,
**kwargs)
return model
@manager.BACKBONES.add_component
def ViT_large_patch16_224(**kwargs):
model = VisionTransformer(
patch_size=16,
embed_dim=1024,
depth=24,
num_heads=16,
mlp_ratio=4,
qkv_bias=True,
epsilon=1e-6,
**kwargs)
return model
@manager.BACKBONES.add_component
def ViT_large_patch16_384(**kwargs):
model = VisionTransformer(
img_size=384,
patch_size=16,
embed_dim=1024,
depth=24,
num_heads=16,
mlp_ratio=4,
qkv_bias=True,
epsilon=1e-6,
**kwargs)
return model
@manager.BACKBONES.add_component
def ViT_large_patch32_384(**kwargs):
model = VisionTransformer(
img_size=384,
patch_size=32,
embed_dim=1024,
depth=24,
num_heads=16,
mlp_ratio=4,
qkv_bias=True,
epsilon=1e-6,
**kwargs)
return model
@manager.BACKBONES.add_component
def ViT_huge_patch16_224(**kwargs):
model = VisionTransformer(
patch_size=16,
embed_dim=1280,
depth=32,
num_heads=16,
mlp_ratio=4,
**kwargs)
return model
@manager.BACKBONES.add_component
def ViT_huge_patch32_384(**kwargs):
model = VisionTransformer(
img_size=384,
patch_size=32,
embed_dim=1280,
depth=32,
num_heads=16,
mlp_ratio=4,
**kwargs)
return model
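
# Usage sketch (editor's addition): `resize_pos_embed` is what lets a model
# pretrained at 224x224 (a 14x14 patch grid for patch_size=16) be fine-tuned at
# another resolution. Shape-only check with randomly initialized weights:
if __name__ == '__main__':
    model = ViT_base_patch16_224()      # pretrained=None -> random init
    old = model.pos_embed               # [1, 14*14 + 1, 768], +1 for cls token
    new = model.resize_pos_embed(old, (14, 14), (24, 24))
    print(old.shape, new.shape)         # [1, 197, 768] -> [1, 577, 768]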

@@ -0,0 +1,415 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.utils import utils
from paddlers.models.ppseg.models import layers
__all__ = ["Xception41_deeplab", "Xception65_deeplab", "Xception71_deeplab"]
def check_data(data, number):
    if isinstance(data, int):
        return [data] * number
    assert len(data) == number
    return data
def check_stride(s, os):
    return s <= os
def check_points(count, points):
    if points is None:
        return False
    if isinstance(points, list):
        return count in points
    return count == points
def gen_bottleneck_params(backbone='xception_65'):
if backbone == 'xception_65':
bottleneck_params = {
"entry_flow": (3, [2, 2, 2], [128, 256, 728]),
"middle_flow": (16, 1, 728),
"exit_flow": (2, [2, 1], [[728, 1024, 1024], [1536, 1536, 2048]])
}
elif backbone == 'xception_41':
bottleneck_params = {
"entry_flow": (3, [2, 2, 2], [128, 256, 728]),
"middle_flow": (8, 1, 728),
"exit_flow": (2, [2, 1], [[728, 1024, 1024], [1536, 1536, 2048]])
}
elif backbone == 'xception_71':
bottleneck_params = {
"entry_flow": (5, [2, 1, 2, 1, 2], [128, 256, 256, 728, 728]),
"middle_flow": (16, 1, 728),
"exit_flow": (2, [2, 1], [[728, 1024, 1024], [1536, 1536, 2048]])
}
else:
        raise ValueError(
            "Xception backbone only supports xception_41/xception_65/xception_71"
        )
return bottleneck_params
class ConvBNLayer(nn.Layer):
def __init__(self,
input_channels,
output_channels,
filter_size,
stride=1,
padding=0,
act=None,
name=None):
super(ConvBNLayer, self).__init__()
self._conv = nn.Conv2D(
in_channels=input_channels,
out_channels=output_channels,
kernel_size=filter_size,
stride=stride,
padding=padding,
bias_attr=False)
self._bn = layers.SyncBatchNorm(
num_features=output_channels, epsilon=1e-3, momentum=0.99)
self._act_op = layers.Activation(act=act)
def forward(self, inputs):
return self._act_op(self._bn(self._conv(inputs)))
class Seperate_Conv(nn.Layer):
def __init__(self,
input_channels,
output_channels,
stride,
filter,
dilation=1,
act=None,
name=None):
super(Seperate_Conv, self).__init__()
self._conv1 = nn.Conv2D(
in_channels=input_channels,
out_channels=input_channels,
kernel_size=filter,
stride=stride,
groups=input_channels,
padding=(filter) // 2 * dilation,
dilation=dilation,
bias_attr=False)
self._bn1 = layers.SyncBatchNorm(
input_channels, epsilon=1e-3, momentum=0.99)
self._act_op1 = layers.Activation(act=act)
self._conv2 = nn.Conv2D(
input_channels,
output_channels,
1,
stride=1,
groups=1,
padding=0,
bias_attr=False)
self._bn2 = layers.SyncBatchNorm(
output_channels, epsilon=1e-3, momentum=0.99)
self._act_op2 = layers.Activation(act=act)
def forward(self, inputs):
x = self._conv1(inputs)
x = self._bn1(x)
x = self._act_op1(x)
x = self._conv2(x)
x = self._bn2(x)
x = self._act_op2(x)
return x
class Xception_Block(nn.Layer):
def __init__(self,
input_channels,
output_channels,
strides=1,
filter_size=3,
dilation=1,
skip_conv=True,
has_skip=True,
activation_fn_in_separable_conv=False,
name=None):
super(Xception_Block, self).__init__()
repeat_number = 3
output_channels = check_data(output_channels, repeat_number)
filter_size = check_data(filter_size, repeat_number)
strides = check_data(strides, repeat_number)
self.has_skip = has_skip
self.skip_conv = skip_conv
self.activation_fn_in_separable_conv = activation_fn_in_separable_conv
if not activation_fn_in_separable_conv:
self._conv1 = Seperate_Conv(
input_channels,
output_channels[0],
stride=strides[0],
filter=filter_size[0],
dilation=dilation,
name=name + "/separable_conv1")
self._conv2 = Seperate_Conv(
output_channels[0],
output_channels[1],
stride=strides[1],
filter=filter_size[1],
dilation=dilation,
name=name + "/separable_conv2")
self._conv3 = Seperate_Conv(
output_channels[1],
output_channels[2],
stride=strides[2],
filter=filter_size[2],
dilation=dilation,
name=name + "/separable_conv3")
else:
self._conv1 = Seperate_Conv(
input_channels,
output_channels[0],
stride=strides[0],
filter=filter_size[0],
act="relu",
dilation=dilation,
name=name + "/separable_conv1")
self._conv2 = Seperate_Conv(
output_channels[0],
output_channels[1],
stride=strides[1],
filter=filter_size[1],
act="relu",
dilation=dilation,
name=name + "/separable_conv2")
self._conv3 = Seperate_Conv(
output_channels[1],
output_channels[2],
stride=strides[2],
filter=filter_size[2],
act="relu",
dilation=dilation,
name=name + "/separable_conv3")
if has_skip and skip_conv:
self._short = ConvBNLayer(
input_channels,
output_channels[-1],
1,
stride=strides[-1],
padding=0,
name=name + "/shortcut")
def forward(self, inputs):
if not self.activation_fn_in_separable_conv:
x = F.relu(inputs)
x = self._conv1(x)
x = F.relu(x)
x = self._conv2(x)
x = F.relu(x)
x = self._conv3(x)
else:
x = self._conv1(inputs)
x = self._conv2(x)
x = self._conv3(x)
if self.has_skip is False:
return x
if self.skip_conv:
skip = self._short(inputs)
else:
skip = inputs
return x + skip
class XceptionDeeplab(nn.Layer):
"""
    The Xception backbone of the DeepLabv3+ implementation based on PaddlePaddle.
    The original article refers to
    Liang-Chieh Chen, et al. "Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation"
    (https://arxiv.org/abs/1802.02611)
Args:
backbone (str): Which type of Xception_DeepLab to select. It should be one of ('xception_41', 'xception_65', 'xception_71').
pretrained (str, optional): The path of pretrained model.
output_stride (int, optional): The stride of output features compared to input images. It is 8 or 16. Default: 16.
"""
def __init__(self, backbone, pretrained=None, output_stride=16):
super(XceptionDeeplab, self).__init__()
bottleneck_params = gen_bottleneck_params(backbone)
self.backbone = backbone
self.feat_channels = [128, 2048]
self._conv1 = ConvBNLayer(
3,
32,
3,
stride=2,
padding=1,
act="relu",
name=self.backbone + "/entry_flow/conv1")
self._conv2 = ConvBNLayer(
32,
64,
3,
stride=1,
padding=1,
act="relu",
name=self.backbone + "/entry_flow/conv2")
"""
bottleneck_params = {
"entry_flow": (3, [2, 2, 2], [128, 256, 728]),
"middle_flow": (16, 1, 728),
"exit_flow": (2, [2, 1], [[728, 1024, 1024], [1536, 1536, 2048]])
}
if output_stride == 16:
entry_block3_stride = 2
middle_block_dilation = 1
exit_block_dilations = (1, 2)
elif output_stride == 8:
entry_block3_stride = 1
middle_block_dilation = 2
exit_block_dilations = (2, 4)
"""
self.block_num = bottleneck_params["entry_flow"][0]
self.strides = bottleneck_params["entry_flow"][1]
self.chns = bottleneck_params["entry_flow"][2]
self.strides = check_data(self.strides, self.block_num)
self.chns = check_data(self.chns, self.block_num)
self.entry_flow = []
self.middle_flow = []
self.stride = 2
self.output_stride = output_stride
s = self.stride
for i in range(self.block_num):
stride = self.strides[i] if check_stride(s * self.strides[i],
self.output_stride) else 1
xception_block = self.add_sublayer(
self.backbone + "/entry_flow/block" + str(i + 1),
Xception_Block(
input_channels=64 if i == 0 else self.chns[i - 1],
output_channels=self.chns[i],
                    strides=[1, 1, stride],  # use the output_stride-clamped stride
name=self.backbone + "/entry_flow/block" + str(i + 1)))
self.entry_flow.append(xception_block)
s = s * stride
self.stride = s
self.block_num = bottleneck_params["middle_flow"][0]
self.strides = bottleneck_params["middle_flow"][1]
self.chns = bottleneck_params["middle_flow"][2]
self.strides = check_data(self.strides, self.block_num)
self.chns = check_data(self.chns, self.block_num)
s = self.stride
for i in range(self.block_num):
stride = self.strides[i] if check_stride(s * self.strides[i],
self.output_stride) else 1
xception_block = self.add_sublayer(
self.backbone + "/middle_flow/block" + str(i + 1),
Xception_Block(
input_channels=728,
output_channels=728,
strides=[1, 1, self.strides[i]],
skip_conv=False,
name=self.backbone + "/middle_flow/block" + str(i + 1)))
self.middle_flow.append(xception_block)
s = s * stride
self.stride = s
self.block_num = bottleneck_params["exit_flow"][0]
self.strides = bottleneck_params["exit_flow"][1]
self.chns = bottleneck_params["exit_flow"][2]
self.strides = check_data(self.strides, self.block_num)
self.chns = check_data(self.chns, self.block_num)
s = self.stride
stride = self.strides[0] if check_stride(s * self.strides[0],
self.output_stride) else 1
self._exit_flow_1 = Xception_Block(
728,
self.chns[0], [1, 1, stride],
name=self.backbone + "/exit_flow/block1")
s = s * stride
stride = self.strides[1] if check_stride(s * self.strides[1],
self.output_stride) else 1
self._exit_flow_2 = Xception_Block(
self.chns[0][-1],
self.chns[1], [1, 1, stride],
dilation=2,
has_skip=False,
activation_fn_in_separable_conv=True,
name=self.backbone + "/exit_flow/block2")
self.pretrained = pretrained
self.init_weight()
def forward(self, inputs):
x = self._conv1(inputs)
x = self._conv2(x)
feat_list = []
for i, ef in enumerate(self.entry_flow):
x = ef(x)
if i == 0:
feat_list.append(x)
for mf in self.middle_flow:
x = mf(x)
x = self._exit_flow_1(x)
x = self._exit_flow_2(x)
feat_list.append(x)
return feat_list
def init_weight(self):
if self.pretrained is not None:
utils.load_pretrained_model(self, self.pretrained)
@manager.BACKBONES.add_component
def Xception41_deeplab(**args):
model = XceptionDeeplab('xception_41', **args)
return model
@manager.BACKBONES.add_component
def Xception65_deeplab(**args):
model = XceptionDeeplab("xception_65", **args)
return model
@manager.BACKBONES.add_component
def Xception71_deeplab(**args):
model = XceptionDeeplab("xception_71", **args)
return model
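
# Usage sketch (editor's addition): the backbone exposes two feature levels via
# feat_channels = [128, 2048]; with the default output_stride=16 the deepest
# map is 1/16 of the input resolution. Shape check with random init:
if __name__ == '__main__':
    import paddle
    model = Xception65_deeplab()
    feats = model(paddle.randn([1, 3, 512, 512]))
    # expected: [1, 128, 128, 128] (stride 4) and [1, 2048, 32, 32] (stride 16)
    print([f.shape for f in feats])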

@@ -0,0 +1,307 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppseg import utils
from paddlers.models.ppseg.cvlibs import manager, param_init
from paddlers.models.ppseg.models import layers
@manager.MODELS.add_component
class BiSeNetV2(nn.Layer):
"""
The BiSeNet V2 implementation based on PaddlePaddle.
The original article refers to
Yu, Changqian, et al. "BiSeNet V2: Bilateral Network with Guided Aggregation for Real-time Semantic Segmentation"
(https://arxiv.org/abs/2004.02147)
    Args:
        num_classes (int): The unique number of target classes.
        lambd (float, optional): A factor for controlling the size of semantic branch channels. Default: 0.25.
        align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
            e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
        pretrained (str, optional): The path or url of pretrained model. Default: None.
"""
def __init__(self,
num_classes,
lambd=0.25,
align_corners=False,
pretrained=None):
super().__init__()
C1, C2, C3 = 64, 64, 128
db_channels = (C1, C2, C3)
C1, C3, C4, C5 = int(C1 * lambd), int(C3 * lambd), 64, 128
sb_channels = (C1, C3, C4, C5)
mid_channels = 128
self.db = DetailBranch(db_channels)
self.sb = SemanticBranch(sb_channels)
self.bga = BGA(mid_channels, align_corners)
self.aux_head1 = SegHead(C1, C1, num_classes)
self.aux_head2 = SegHead(C3, C3, num_classes)
self.aux_head3 = SegHead(C4, C4, num_classes)
self.aux_head4 = SegHead(C5, C5, num_classes)
self.head = SegHead(mid_channels, mid_channels, num_classes)
self.align_corners = align_corners
self.pretrained = pretrained
self.init_weight()
def forward(self, x):
dfm = self.db(x)
feat1, feat2, feat3, feat4, sfm = self.sb(x)
logit = self.head(self.bga(dfm, sfm))
if not self.training:
logit_list = [logit]
else:
logit1 = self.aux_head1(feat1)
logit2 = self.aux_head2(feat2)
logit3 = self.aux_head3(feat3)
logit4 = self.aux_head4(feat4)
logit_list = [logit, logit1, logit2, logit3, logit4]
logit_list = [
F.interpolate(
logit,
paddle.shape(x)[2:],
mode='bilinear',
align_corners=self.align_corners) for logit in logit_list
]
return logit_list
def init_weight(self):
if self.pretrained is not None:
utils.load_entire_model(self, self.pretrained)
else:
for sublayer in self.sublayers():
if isinstance(sublayer, nn.Conv2D):
param_init.kaiming_normal_init(sublayer.weight)
elif isinstance(sublayer, (nn.BatchNorm, nn.SyncBatchNorm)):
param_init.constant_init(sublayer.weight, value=1.0)
param_init.constant_init(sublayer.bias, value=0.0)
class StemBlock(nn.Layer):
def __init__(self, in_dim, out_dim):
super(StemBlock, self).__init__()
self.conv = layers.ConvBNReLU(in_dim, out_dim, 3, stride=2)
self.left = nn.Sequential(
layers.ConvBNReLU(out_dim, out_dim // 2, 1),
layers.ConvBNReLU(out_dim // 2, out_dim, 3, stride=2))
self.right = nn.MaxPool2D(kernel_size=3, stride=2, padding=1)
self.fuse = layers.ConvBNReLU(out_dim * 2, out_dim, 3)
def forward(self, x):
x = self.conv(x)
left = self.left(x)
right = self.right(x)
concat = paddle.concat([left, right], axis=1)
return self.fuse(concat)
class ContextEmbeddingBlock(nn.Layer):
def __init__(self, in_dim, out_dim):
super(ContextEmbeddingBlock, self).__init__()
self.gap = nn.AdaptiveAvgPool2D(1)
self.bn = layers.SyncBatchNorm(in_dim)
self.conv_1x1 = layers.ConvBNReLU(in_dim, out_dim, 1)
self.add = layers.Add()
self.conv_3x3 = nn.Conv2D(out_dim, out_dim, 3, 1, 1)
def forward(self, x):
gap = self.gap(x)
bn = self.bn(gap)
conv1 = self.add(self.conv_1x1(bn), x)
return self.conv_3x3(conv1)
class GatherAndExpansionLayer1(nn.Layer):
"""Gather And Expansion Layer with stride 1"""
def __init__(self, in_dim, out_dim, expand):
super().__init__()
expand_dim = expand * in_dim
self.conv = nn.Sequential(
layers.ConvBNReLU(in_dim, in_dim, 3),
layers.DepthwiseConvBN(in_dim, expand_dim, 3),
layers.ConvBN(expand_dim, out_dim, 1))
self.relu = layers.Activation("relu")
def forward(self, x):
return self.relu(self.conv(x) + x)
class GatherAndExpansionLayer2(nn.Layer):
"""Gather And Expansion Layer with stride 2"""
def __init__(self, in_dim, out_dim, expand):
super().__init__()
expand_dim = expand * in_dim
self.branch_1 = nn.Sequential(
layers.ConvBNReLU(in_dim, in_dim, 3),
layers.DepthwiseConvBN(in_dim, expand_dim, 3, stride=2),
layers.DepthwiseConvBN(expand_dim, expand_dim, 3),
layers.ConvBN(expand_dim, out_dim, 1))
self.branch_2 = nn.Sequential(
layers.DepthwiseConvBN(in_dim, in_dim, 3, stride=2),
layers.ConvBN(in_dim, out_dim, 1))
self.relu = layers.Activation("relu")
def forward(self, x):
return self.relu(self.branch_1(x) + self.branch_2(x))
class DetailBranch(nn.Layer):
"""The detail branch of BiSeNet, which has wide channels but shallow layers."""
def __init__(self, in_channels):
super().__init__()
C1, C2, C3 = in_channels
self.convs = nn.Sequential(
# stage 1
layers.ConvBNReLU(3, C1, 3, stride=2),
layers.ConvBNReLU(C1, C1, 3),
# stage 2
layers.ConvBNReLU(C1, C2, 3, stride=2),
layers.ConvBNReLU(C2, C2, 3),
layers.ConvBNReLU(C2, C2, 3),
# stage 3
layers.ConvBNReLU(C2, C3, 3, stride=2),
layers.ConvBNReLU(C3, C3, 3),
layers.ConvBNReLU(C3, C3, 3),
)
def forward(self, x):
return self.convs(x)
class SemanticBranch(nn.Layer):
"""The semantic branch of BiSeNet, which has narrow channels but deep layers."""
def __init__(self, in_channels):
super().__init__()
C1, C3, C4, C5 = in_channels
self.stem = StemBlock(3, C1)
self.stage3 = nn.Sequential(
GatherAndExpansionLayer2(C1, C3, 6),
GatherAndExpansionLayer1(C3, C3, 6))
self.stage4 = nn.Sequential(
GatherAndExpansionLayer2(C3, C4, 6),
GatherAndExpansionLayer1(C4, C4, 6))
self.stage5_4 = nn.Sequential(
GatherAndExpansionLayer2(C4, C5, 6),
GatherAndExpansionLayer1(C5, C5, 6),
GatherAndExpansionLayer1(C5, C5, 6),
GatherAndExpansionLayer1(C5, C5, 6))
self.ce = ContextEmbeddingBlock(C5, C5)
def forward(self, x):
stage2 = self.stem(x)
stage3 = self.stage3(stage2)
stage4 = self.stage4(stage3)
stage5_4 = self.stage5_4(stage4)
fm = self.ce(stage5_4)
return stage2, stage3, stage4, stage5_4, fm
class BGA(nn.Layer):
"""The Bilateral Guided Aggregation Layer, used to fuse the semantic features and spatial features."""
def __init__(self, out_dim, align_corners):
super().__init__()
self.align_corners = align_corners
self.db_branch_keep = nn.Sequential(
layers.DepthwiseConvBN(out_dim, out_dim, 3),
nn.Conv2D(out_dim, out_dim, 1))
self.db_branch_down = nn.Sequential(
layers.ConvBN(out_dim, out_dim, 3, stride=2),
nn.AvgPool2D(kernel_size=3, stride=2, padding=1))
self.sb_branch_keep = nn.Sequential(
layers.DepthwiseConvBN(out_dim, out_dim, 3),
nn.Conv2D(out_dim, out_dim, 1), layers.Activation(act='sigmoid'))
self.sb_branch_up = layers.ConvBN(out_dim, out_dim, 3)
self.conv = layers.ConvBN(out_dim, out_dim, 3)
def forward(self, dfm, sfm):
db_feat_keep = self.db_branch_keep(dfm)
db_feat_down = self.db_branch_down(dfm)
sb_feat_keep = self.sb_branch_keep(sfm)
sb_feat_up = self.sb_branch_up(sfm)
sb_feat_up = F.interpolate(
sb_feat_up,
paddle.shape(db_feat_keep)[2:],
mode='bilinear',
align_corners=self.align_corners)
sb_feat_up = F.sigmoid(sb_feat_up)
db_feat = db_feat_keep * sb_feat_up
sb_feat = db_feat_down * sb_feat_keep
sb_feat = F.interpolate(
sb_feat,
paddle.shape(db_feat)[2:],
mode='bilinear',
align_corners=self.align_corners)
return self.conv(db_feat + sb_feat)
class SegHead(nn.Layer):
def __init__(self, in_dim, mid_dim, num_classes):
super().__init__()
self.conv_3x3 = nn.Sequential(
layers.ConvBNReLU(in_dim, mid_dim, 3), nn.Dropout(0.1))
self.conv_1x1 = nn.Conv2D(mid_dim, num_classes, 1, 1)
def forward(self, x):
conv1 = self.conv_3x3(x)
conv2 = self.conv_1x1(conv1)
return conv2
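
# Usage sketch (editor's addition): in eval mode the network returns a single
# logit map, already upsampled to the input resolution; in train mode it also
# returns four auxiliary logits from the semantic-branch SegHeads above.
if __name__ == '__main__':
    model = BiSeNetV2(num_classes=19)
    model.eval()
    out = model(paddle.randn([1, 3, 512, 1024]))
    print(len(out), out[0].shape)  # 1 [1, 19, 512, 1024]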

@@ -0,0 +1,259 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.models import layers
from paddlers.models.ppseg.utils import utils
@manager.MODELS.add_component
class BiseNetV1(nn.Layer):
"""
The BiSeNetV1 implementation based on PaddlePaddle.
The original article refers to
    Yu, Changqian, et al. "BiSeNet: Bilateral Segmentation Network for Real-time Semantic Segmentation"
    (https://paperswithcode.com/paper/bisenet-bilateral-segmentation-network-for)
    Args:
        num_classes (int): The unique number of target classes.
        backbone (paddle.nn.Layer): Backbone network, currently supports Resnet18_vd/Resnet34_vd/Resnet50_vd/Resnet101_vd.
        conv_channel (int, optional): The channel width of the context and fusion branches. Default: 128.
        pretrained (str, optional): The path or url of pretrained model. Default: None.
"""
def __init__(self,
num_classes,
backbone,
conv_channel=128,
pretrained=None):
super().__init__()
self.backbone = backbone
self.spatial_path = SpatialPath(3, 128)
self.global_context = nn.Sequential(
nn.AdaptiveAvgPool2D(1),
layers.ConvBNReLU(512, conv_channel, 1, bias_attr=False),
)
self.arms = nn.LayerList([
AttentionRefinement(512, conv_channel),
AttentionRefinement(256, conv_channel),
])
self.refines = nn.LayerList([
layers.ConvBNReLU(conv_channel,
conv_channel,
3,
stride=1,
padding=1,
bias_attr=False),
layers.ConvBNReLU(conv_channel,
conv_channel,
3,
stride=1,
padding=1,
bias_attr=False),
])
self.heads = nn.LayerList([
BiSeNetHead(conv_channel, num_classes, 8, True),
BiSeNetHead(conv_channel, num_classes, 8, True),
BiSeNetHead(conv_channel * 2, num_classes, 8, False),
])
self.ffm = FeatureFusion(conv_channel * 2, conv_channel * 2, 1)
        self.pretrained = pretrained
        self.init_weight()
def init_weight(self):
if self.pretrained is not None:
utils.load_entire_model(self, self.pretrained)
def forward(self, x):
spatial_out = self.spatial_path(x)
context_blocks = self.backbone(x)
context_blocks.reverse()
global_context = self.global_context(context_blocks[0])
global_context = F.interpolate(global_context,
size=paddle.shape(context_blocks[0])[2:],
mode='bilinear',
align_corners=True)
last_fm = global_context
pred_out = []
for i, (fm, arm, refine) in enumerate(
zip(context_blocks[:2], self.arms, self.refines)):
fm = arm(fm)
fm += last_fm
last_fm = F.interpolate(fm,
size=paddle.shape(context_blocks[i +
1])[2:],
mode='bilinear',
align_corners=True)
last_fm = refine(last_fm)
pred_out.append(last_fm)
context_out = last_fm
concate_fm = self.ffm(spatial_out, context_out)
pred_out.append(concate_fm)
output = []
if self.training:
for i, head in enumerate(self.heads):
out = head(pred_out[i])
output.append(out)
else:
out = self.heads[-1](pred_out[-1])
output.append(out)
return output
class SpatialPath(nn.Layer):
"""
SpatialPath module of BiseNetV1 model
Args:
in_channels (int): The number of input channels in spatial path module.
out_channels (int): The number of output channels in spatial path module.
"""
def __init__(self, in_channels, out_channels, inner_channel=64):
super().__init__()
self.conv_7x7 = layers.ConvBNReLU(in_channels,
inner_channel,
7,
stride=2,
padding=3,
bias_attr=False)
self.conv_3x3_1 = layers.ConvBNReLU(inner_channel,
inner_channel,
3,
stride=2,
padding=1,
bias_attr=False)
self.conv_3x3_2 = layers.ConvBNReLU(inner_channel,
inner_channel,
3,
stride=2,
padding=1,
bias_attr=False)
self.conv_1x1 = layers.ConvBNReLU(inner_channel,
out_channels,
1,
bias_attr=False)
def forward(self, x):
x = self.conv_7x7(x)
x = self.conv_3x3_1(x)
x = self.conv_3x3_2(x)
x = self.conv_1x1(x)
return x
class BiSeNetHead(nn.Layer):
"""
BiSeNet head of BiseNetV1 model
Args:
        in_channels (int): The number of input channels of the head.
        out_channels (int): The number of output channels of the head.
        scale (int, float): The scale factor of interpolation.
        is_aux (bool, optional): Whether this is an auxiliary head, which uses a wider hidden channel. Default: False.
"""
def __init__(self, in_channels, out_channels, scale, is_aux=False):
super().__init__()
inner_channel = 128 if is_aux else 64
self.conv_3x3 = layers.ConvBNReLU(in_channels,
inner_channel,
3,
stride=1,
padding=1,
bias_attr=False)
self.conv_1x1 = nn.Conv2D(inner_channel, out_channels, 1)
self.scale = scale
def forward(self, x):
x = self.conv_3x3(x)
x = self.conv_1x1(x)
if self.scale > 1:
x = F.interpolate(x,
scale_factor=self.scale,
mode='bilinear',
align_corners=True)
return x
class AttentionRefinement(nn.Layer):
"""
AttentionRefinement module of BiseNetV1 model
Args:
        in_channels (int): The number of input channels of the attention refinement module.
        out_channels (int): The number of output channels of the attention refinement module.
"""
def __init__(self, in_channels, out_channels):
super().__init__()
self.conv_3x3 = layers.ConvBNReLU(in_channels,
out_channels,
3,
stride=1,
padding=1,
bias_attr=False)
self.channel_attention = nn.Sequential(
nn.AdaptiveAvgPool2D(1),
layers.ConvBNReLU(out_channels, out_channels, 1, bias_attr=False),
nn.Sigmoid(),
)
def forward(self, x):
x = self.conv_3x3(x)
se = self.channel_attention(x)
x = x * se
return x
class FeatureFusion(nn.Layer):
"""
    FeatureFusion module of BiseNetV1 model
    Args:
        in_channels (int): The number of input channels of the feature fusion module.
        out_channels (int): The number of output channels of the feature fusion module.
        reduction (int): A factor that shrinks the convolutional channels. Default: 1.
"""
def __init__(self, in_channels, out_channels, reduction=1):
super().__init__()
self.conv_1x1 = layers.ConvBNReLU(in_channels,
out_channels,
1,
bias_attr=False)
self.channel_attention = nn.Sequential(
nn.AdaptiveAvgPool2D(1),
layers.ConvBNReLU(out_channels,
out_channels // reduction,
1,
bias_attr=False),
layers.ConvBNReLU(out_channels // reduction,
out_channels,
1,
bias_attr=False),
nn.Sigmoid(),
)
def forward(self, x1, x2):
fm = paddle.concat([x1, x2], axis=1)
fm = self.conv_1x1(fm)
fm_se = self.channel_attention(fm)
output = fm + fm * fm_se
return output
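
# Usage sketch (editor's addition): FeatureFusion concatenates the spatial-path
# and context-path features, projects them with a 1x1 conv, then reweights the
# result with squeeze-and-excitation style channel attention. Standalone check:
if __name__ == '__main__':
    ffm = FeatureFusion(in_channels=256, out_channels=256, reduction=1)
    spatial = paddle.randn([1, 128, 64, 64])
    context = paddle.randn([1, 128, 64, 64])
    print(ffm(spatial, context).shape)  # [1, 256, 64, 64]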

@@ -0,0 +1,218 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.models import layers
from paddlers.models.ppseg.utils import utils
@manager.MODELS.add_component
class DANet(nn.Layer):
"""
The DANet implementation based on PaddlePaddle.
The original article refers to
    Fu, Jun, et al. "Dual Attention Network for Scene Segmentation"
(https://arxiv.org/pdf/1809.02983.pdf)
Args:
num_classes (int): The unique number of target classes.
        backbone (paddle.nn.Layer): A backbone network.
backbone_indices (tuple): The values in the tuple indicate the indices of
output of backbone.
align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
pretrained (str, optional): The path or url of pretrained model. Default: None.
"""
def __init__(self,
num_classes,
backbone,
backbone_indices,
align_corners=False,
pretrained=None):
super().__init__()
self.backbone = backbone
self.backbone_indices = backbone_indices
in_channels = [self.backbone.feat_channels[i] for i in backbone_indices]
self.head = DAHead(num_classes=num_classes, in_channels=in_channels)
self.align_corners = align_corners
self.pretrained = pretrained
self.init_weight()
def forward(self, x):
feats = self.backbone(x)
feats = [feats[i] for i in self.backbone_indices]
logit_list = self.head(feats)
if not self.training:
logit_list = [logit_list[0]]
logit_list = [
F.interpolate(
logit,
paddle.shape(x)[2:],
mode='bilinear',
align_corners=self.align_corners,
align_mode=1) for logit in logit_list
]
return logit_list
def init_weight(self):
if self.pretrained is not None:
utils.load_entire_model(self, self.pretrained)
class DAHead(nn.Layer):
"""
The Dual attention head.
Args:
num_classes (int): The unique number of target classes.
in_channels (tuple): The number of input channels.
"""
def __init__(self, num_classes, in_channels):
super().__init__()
in_channels = in_channels[-1]
inter_channels = in_channels // 4
self.channel_conv = layers.ConvBNReLU(in_channels, inter_channels, 3)
self.position_conv = layers.ConvBNReLU(in_channels, inter_channels, 3)
self.pam = PAM(inter_channels)
self.cam = CAM(inter_channels)
self.conv1 = layers.ConvBNReLU(inter_channels, inter_channels, 3)
self.conv2 = layers.ConvBNReLU(inter_channels, inter_channels, 3)
self.aux_head = nn.Sequential(
nn.Dropout2D(0.1), nn.Conv2D(in_channels, num_classes, 1))
self.aux_head_pam = nn.Sequential(
nn.Dropout2D(0.1), nn.Conv2D(inter_channels, num_classes, 1))
self.aux_head_cam = nn.Sequential(
nn.Dropout2D(0.1), nn.Conv2D(inter_channels, num_classes, 1))
self.cls_head = nn.Sequential(
nn.Dropout2D(0.1), nn.Conv2D(inter_channels, num_classes, 1))
def forward(self, feat_list):
feats = feat_list[-1]
channel_feats = self.channel_conv(feats)
channel_feats = self.cam(channel_feats)
channel_feats = self.conv1(channel_feats)
position_feats = self.position_conv(feats)
position_feats = self.pam(position_feats)
position_feats = self.conv2(position_feats)
feats_sum = position_feats + channel_feats
logit = self.cls_head(feats_sum)
if not self.training:
return [logit]
cam_logit = self.aux_head_cam(channel_feats)
        pam_logit = self.aux_head_pam(position_feats)
aux_logit = self.aux_head(feats)
return [logit, cam_logit, pam_logit, aux_logit]
class PAM(nn.Layer):
"""Position attention module."""
def __init__(self, in_channels):
super().__init__()
mid_channels = in_channels // 8
self.mid_channels = mid_channels
self.in_channels = in_channels
self.query_conv = nn.Conv2D(in_channels, mid_channels, 1, 1)
self.key_conv = nn.Conv2D(in_channels, mid_channels, 1, 1)
self.value_conv = nn.Conv2D(in_channels, in_channels, 1, 1)
self.gamma = self.create_parameter(
shape=[1],
dtype='float32',
default_initializer=nn.initializer.Constant(0))
def forward(self, x):
x_shape = paddle.shape(x)
# query: n, h * w, c1
query = self.query_conv(x)
query = paddle.reshape(query, (0, self.mid_channels, -1))
query = paddle.transpose(query, (0, 2, 1))
# key: n, c1, h * w
key = self.key_conv(x)
key = paddle.reshape(key, (0, self.mid_channels, -1))
# sim: n, h * w, h * w
sim = paddle.bmm(query, key)
sim = F.softmax(sim, axis=-1)
value = self.value_conv(x)
value = paddle.reshape(value, (0, self.in_channels, -1))
sim = paddle.transpose(sim, (0, 2, 1))
# feat: from (n, c2, h * w) -> (n, c2, h, w)
feat = paddle.bmm(value, sim)
feat = paddle.reshape(feat,
(0, self.in_channels, x_shape[2], x_shape[3]))
out = self.gamma * feat + x
return out
class CAM(nn.Layer):
"""Channel attention module."""
def __init__(self, channels):
super().__init__()
self.channels = channels
self.gamma = self.create_parameter(
shape=[1],
dtype='float32',
default_initializer=nn.initializer.Constant(0))
def forward(self, x):
x_shape = paddle.shape(x)
# query: n, c, h * w
query = paddle.reshape(x, (0, self.channels, -1))
# key: n, h * w, c
key = paddle.reshape(x, (0, self.channels, -1))
key = paddle.transpose(key, (0, 2, 1))
# sim: n, c, c
sim = paddle.bmm(query, key)
        # The DANet authors claim this subtraction avoids gradient divergence
sim = paddle.max(
sim, axis=-1, keepdim=True).tile([1, 1, self.channels]) - sim
sim = F.softmax(sim, axis=-1)
# feat: from (n, c, h * w) to (n, c, h, w)
value = paddle.reshape(x, (0, self.channels, -1))
feat = paddle.bmm(sim, value)
feat = paddle.reshape(feat, (0, self.channels, x_shape[2], x_shape[3]))
out = self.gamma * feat + x
return out
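
# Note + usage sketch (editor's addition): the tiled row-max is constant per
# row, so softmax(max - sim) equals softmax(-sim); the logits stay
# non-positive, which the DANet reference implementation uses for numerical
# stability. Standalone shape check with random inputs:
if __name__ == '__main__':
    cam = CAM(channels=16)
    x = paddle.randn([2, 16, 8, 8])
    out = cam(x)
    # gamma is initialized to 0, so before training CAM acts as an identity
    print(out.shape, bool(paddle.allclose(out, x)))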

@@ -0,0 +1,228 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.models import layers
from paddlers.models.ppseg.utils import utils
@manager.MODELS.add_component
class DecoupledSegNet(nn.Layer):
"""
The DecoupledSegNet implementation based on PaddlePaddle.
The original article refers to
    Xiangtai Li, et al. "Improving Semantic Segmentation via Decoupled Body and Edge Supervision"
(https://arxiv.org/pdf/2007.10035.pdf)
Args:
num_classes (int): The unique number of target classes.
        backbone (paddle.nn.Layer): Backbone network, currently supports Resnet50_vd/Resnet101_vd.
backbone_indices (tuple, optional): Two values in the tuple indicate the indices of output of backbone.
Default: (0, 3).
        aspp_ratios (tuple, optional): The dilation rates used in the ASPP module.
            If output_stride=16, aspp_ratios should be set as (1, 6, 12, 18).
            If output_stride=8, aspp_ratios is (1, 12, 24, 36).
            Default: (1, 6, 12, 18).
aspp_out_channels (int, optional): The output channels of ASPP module. Default: 256.
align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
pretrained (str, optional): The path or url of pretrained model. Default: None.
"""
def __init__(self,
num_classes,
backbone,
backbone_indices=(0, 3),
aspp_ratios=(1, 6, 12, 18),
aspp_out_channels=256,
align_corners=False,
pretrained=None):
super().__init__()
self.backbone = backbone
backbone_channels = self.backbone.feat_channels
self.head = DecoupledSegNetHead(num_classes, backbone_indices,
backbone_channels, aspp_ratios,
aspp_out_channels, align_corners)
self.align_corners = align_corners
self.pretrained = pretrained
self.init_weight()
def forward(self, x):
feat_list = self.backbone(x)
logit_list = self.head(feat_list)
seg_logit, body_logit, edge_logit = [
F.interpolate(
logit,
paddle.shape(x)[2:],
mode='bilinear',
align_corners=self.align_corners) for logit in logit_list
]
if self.training:
return [seg_logit, body_logit, edge_logit, (seg_logit, edge_logit)]
return [seg_logit]
def init_weight(self):
if self.pretrained is not None:
utils.load_entire_model(self, self.pretrained)
class DecoupledSegNetHead(nn.Layer):
"""
The DecoupledSegNetHead implementation based on PaddlePaddle.
Args:
num_classes (int): The unique number of target classes.
backbone_indices (tuple): Two values in the tuple indicate the indices of output of backbone.
            the first index will be taken as a low-level feature in the Edge preservation component;
the second one will be taken as input of ASPP component.
backbone_channels (tuple): The channels of output of backbone.
        aspp_ratios (tuple): The dilation rates used in the ASPP module.
aspp_out_channels (int): The output channels of ASPP module.
align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
is even, e.g. 1024x512, otherwise it is True, e.g. 769x769.
"""
def __init__(self, num_classes, backbone_indices, backbone_channels,
aspp_ratios, aspp_out_channels, align_corners):
super().__init__()
self.backbone_indices = backbone_indices
self.align_corners = align_corners
self.aspp = layers.ASPPModule(
aspp_ratios=aspp_ratios,
in_channels=backbone_channels[backbone_indices[1]],
out_channels=aspp_out_channels,
align_corners=align_corners,
image_pooling=True)
self.bot_fine = nn.Conv2D(
backbone_channels[backbone_indices[0]], 48, 1, bias_attr=False)
# decoupled
self.squeeze_body_edge = SqueezeBodyEdge(
256, align_corners=self.align_corners)
self.edge_fusion = nn.Conv2D(256 + 48, 256, 1, bias_attr=False)
self.sigmoid_edge = nn.Sigmoid()
self.edge_out = nn.Sequential(
layers.ConvBNReLU(
in_channels=256,
out_channels=48,
kernel_size=3,
bias_attr=False), nn.Conv2D(48, 1, 1, bias_attr=False))
self.dsn_seg_body = nn.Sequential(
layers.ConvBNReLU(
in_channels=256,
out_channels=256,
kernel_size=3,
bias_attr=False), nn.Conv2D(
256, num_classes, 1, bias_attr=False))
self.final_seg = nn.Sequential(
layers.ConvBNReLU(
in_channels=512,
out_channels=256,
kernel_size=3,
bias_attr=False),
layers.ConvBNReLU(
in_channels=256,
out_channels=256,
kernel_size=3,
bias_attr=False),
nn.Conv2D(256, num_classes, kernel_size=1, bias_attr=False))
def forward(self, feat_list):
fine_fea = feat_list[self.backbone_indices[0]]
fine_size = paddle.shape(fine_fea)
x = feat_list[self.backbone_indices[1]]
aspp = self.aspp(x)
# decoupled
seg_body, seg_edge = self.squeeze_body_edge(aspp)
        # Edge preservation and edge out
fine_fea = self.bot_fine(fine_fea)
seg_edge = F.interpolate(
seg_edge,
fine_size[2:],
mode='bilinear',
align_corners=self.align_corners)
seg_edge = self.edge_fusion(paddle.concat([seg_edge, fine_fea], axis=1))
seg_edge_out = self.edge_out(seg_edge)
seg_edge_out = self.sigmoid_edge(seg_edge_out) # seg_edge output
seg_body_out = self.dsn_seg_body(seg_body) # body out
# seg_final out
seg_out = seg_edge + F.interpolate(
seg_body,
fine_size[2:],
mode='bilinear',
align_corners=self.align_corners)
aspp = F.interpolate(
aspp,
fine_size[2:],
mode='bilinear',
align_corners=self.align_corners)
seg_out = paddle.concat([aspp, seg_out], axis=1)
seg_final_out = self.final_seg(seg_out)
return [seg_final_out, seg_body_out, seg_edge_out]
class SqueezeBodyEdge(nn.Layer):
def __init__(self, inplane, align_corners=False):
super().__init__()
self.align_corners = align_corners
self.down = nn.Sequential(
layers.ConvBNReLU(
inplane, inplane, kernel_size=3, groups=inplane, stride=2),
layers.ConvBNReLU(
inplane, inplane, kernel_size=3, groups=inplane, stride=2))
self.flow_make = nn.Conv2D(
inplane * 2, 2, kernel_size=3, padding='same', bias_attr=False)
def forward(self, x):
size = paddle.shape(x)[2:]
seg_down = self.down(x)
seg_down = F.interpolate(
seg_down,
size=size,
mode='bilinear',
align_corners=self.align_corners)
flow = self.flow_make(paddle.concat([x, seg_down], axis=1))
seg_flow_warp = self.flow_warp(x, flow, size)
seg_edge = x - seg_flow_warp
return seg_flow_warp, seg_edge
def flow_warp(self, input, flow, size):
input_shape = paddle.shape(input)
norm = size[::-1].reshape([1, 1, 1, -1])
norm.stop_gradient = True
h_grid = paddle.linspace(-1.0, 1.0, size[0]).reshape([-1, 1])
h_grid = h_grid.tile([size[1]])
w_grid = paddle.linspace(-1.0, 1.0, size[1]).reshape([-1, 1])
w_grid = w_grid.tile([size[0]]).transpose([1, 0])
grid = paddle.concat([w_grid.unsqueeze(2), h_grid.unsqueeze(2)], axis=2)
        grid = grid.unsqueeze(0).tile([input_shape[0], 1, 1, 1])  # add batch dim
grid = grid + paddle.transpose(flow, (0, 2, 3, 1)) / norm
output = F.grid_sample(input, grid)
return output
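
# Usage sketch (editor's addition): SqueezeBodyEdge predicts a flow field from
# x and its downsampled-then-upsampled copy, warps x toward the low-frequency
# "body" component, and keeps the residual as the "edge" component, so
# body + edge reconstructs x exactly.
if __name__ == '__main__':
    sbe = SqueezeBodyEdge(inplane=64)
    x = paddle.randn([1, 64, 32, 32])
    body, edge = sbe(x)
    print(body.shape, edge.shape)  # both [1, 64, 32, 32]
    print(bool(paddle.allclose(body + edge, x)))  # True by construction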

@@ -0,0 +1,308 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.models import layers
from paddlers.models.ppseg.utils import utils
__all__ = ['DeepLabV3P', 'DeepLabV3']
@manager.MODELS.add_component
class DeepLabV3P(nn.Layer):
"""
The DeepLabV3Plus implementation based on PaddlePaddle.
The original article refers to
    Liang-Chieh Chen, et al. "Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation"
(https://arxiv.org/abs/1802.02611)
Args:
num_classes (int): The unique number of target classes.
        backbone (paddle.nn.Layer): Backbone network, currently supports Resnet50_vd/Resnet101_vd/Xception65.
backbone_indices (tuple, optional): Two values in the tuple indicate the indices of output of backbone.
Default: (0, 3).
        aspp_ratios (tuple, optional): The dilation rates used in the ASPP module.
If output_stride=16, aspp_ratios should be set as (1, 6, 12, 18).
If output_stride=8, aspp_ratios is (1, 12, 24, 36).
Default: (1, 6, 12, 18).
aspp_out_channels (int, optional): The output channels of ASPP module. Default: 256.
align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
pretrained (str, optional): The path or url of pretrained model. Default: None.
data_format(str, optional): Data format that specifies the layout of input. It can be "NCHW" or "NHWC". Default: "NCHW".
"""
def __init__(self,
num_classes,
backbone,
backbone_indices=(0, 3),
aspp_ratios=(1, 6, 12, 18),
aspp_out_channels=256,
align_corners=False,
pretrained=None,
data_format="NCHW"):
super().__init__()
self.backbone = backbone
backbone_channels = [
backbone.feat_channels[i] for i in backbone_indices
]
self.head = DeepLabV3PHead(
num_classes,
backbone_indices,
backbone_channels,
aspp_ratios,
aspp_out_channels,
align_corners,
data_format=data_format)
self.align_corners = align_corners
self.pretrained = pretrained
self.data_format = data_format
self.init_weight()
def forward(self, x):
feat_list = self.backbone(x)
logit_list = self.head(feat_list)
if self.data_format == 'NCHW':
ori_shape = paddle.shape(x)[2:]
else:
ori_shape = paddle.shape(x)[1:3]
return [
F.interpolate(
logit,
ori_shape,
mode='bilinear',
align_corners=self.align_corners,
data_format=self.data_format) for logit in logit_list
]
def init_weight(self):
if self.pretrained is not None:
utils.load_entire_model(self, self.pretrained)
class DeepLabV3PHead(nn.Layer):
"""
The DeepLabV3PHead implementation based on PaddlePaddle.
Args:
num_classes (int): The unique number of target classes.
backbone_indices (tuple): Two values in the tuple indicate the indices of output of backbone.
the first index will be taken as a low-level feature in Decoder component;
the second one will be taken as input of ASPP component.
            Usually the backbone consists of four downsampling stages, which return an output of
            each stage. If we set it as (0, 3), it means taking the feature map of the first
            stage in the backbone as the low-level feature used in the Decoder, and the feature
            map of the fourth stage as the input of ASPP.
        backbone_channels (tuple): The same length as "backbone_indices". It indicates the channels of the corresponding indices.
        aspp_ratios (tuple): The dilation rates used in the ASPP module.
aspp_out_channels (int): The output channels of ASPP module.
align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
is even, e.g. 1024x512, otherwise it is True, e.g. 769x769.
data_format(str, optional): Data format that specifies the layout of input. It can be "NCHW" or "NHWC". Default: "NCHW".
"""
def __init__(self,
num_classes,
backbone_indices,
backbone_channels,
aspp_ratios,
aspp_out_channels,
align_corners,
data_format='NCHW'):
super().__init__()
self.aspp = layers.ASPPModule(
aspp_ratios,
backbone_channels[1],
aspp_out_channels,
align_corners,
use_sep_conv=True,
image_pooling=True,
data_format=data_format)
self.decoder = Decoder(
num_classes,
backbone_channels[0],
align_corners,
data_format=data_format)
self.backbone_indices = backbone_indices
def forward(self, feat_list):
logit_list = []
low_level_feat = feat_list[self.backbone_indices[0]]
x = feat_list[self.backbone_indices[1]]
x = self.aspp(x)
logit = self.decoder(x, low_level_feat)
logit_list.append(logit)
return logit_list
@manager.MODELS.add_component
class DeepLabV3(nn.Layer):
"""
The DeepLabV3 implementation based on PaddlePaddle.
The original article refers to
    Liang-Chieh Chen, et al. "Rethinking Atrous Convolution for Semantic Image Segmentation"
(https://arxiv.org/pdf/1706.05587.pdf).
Args:
        Please refer to DeepLabV3P above.
"""
def __init__(self,
num_classes,
backbone,
backbone_indices=(3, ),
aspp_ratios=(1, 6, 12, 18),
aspp_out_channels=256,
align_corners=False,
pretrained=None):
super().__init__()
self.backbone = backbone
backbone_channels = [
backbone.feat_channels[i] for i in backbone_indices
]
self.head = DeepLabV3Head(num_classes, backbone_indices,
backbone_channels, aspp_ratios,
aspp_out_channels, align_corners)
self.align_corners = align_corners
self.pretrained = pretrained
self.init_weight()
def forward(self, x):
feat_list = self.backbone(x)
logit_list = self.head(feat_list)
return [
F.interpolate(
logit,
paddle.shape(x)[2:],
mode='bilinear',
align_corners=self.align_corners) for logit in logit_list
]
def init_weight(self):
if self.pretrained is not None:
utils.load_entire_model(self, self.pretrained)
class DeepLabV3Head(nn.Layer):
"""
The DeepLabV3Head implementation based on PaddlePaddle.
Args:
        Please refer to DeepLabV3PHead above.
"""
def __init__(self, num_classes, backbone_indices, backbone_channels,
aspp_ratios, aspp_out_channels, align_corners):
super().__init__()
self.aspp = layers.ASPPModule(
aspp_ratios,
backbone_channels[0],
aspp_out_channels,
align_corners,
use_sep_conv=False,
image_pooling=True)
self.cls = nn.Conv2D(
in_channels=aspp_out_channels,
out_channels=num_classes,
kernel_size=1)
self.backbone_indices = backbone_indices
def forward(self, feat_list):
logit_list = []
x = feat_list[self.backbone_indices[0]]
x = self.aspp(x)
logit = self.cls(x)
logit_list.append(logit)
return logit_list
class Decoder(nn.Layer):
"""
Decoder module of DeepLabV3P model
Args:
num_classes (int): The number of classes.
in_channels (int): The number of input channels in decoder module.
"""
def __init__(self,
num_classes,
in_channels,
align_corners,
data_format='NCHW'):
super(Decoder, self).__init__()
self.data_format = data_format
self.conv_bn_relu1 = layers.ConvBNReLU(
in_channels=in_channels,
out_channels=48,
kernel_size=1,
data_format=data_format)
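        # 304 = 256 (the expected ASPP output channels) + 48 (low-level channels
        # produced by conv_bn_relu1 above); the two are concatenated in forward().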
self.conv_bn_relu2 = layers.SeparableConvBNReLU(
in_channels=304,
out_channels=256,
kernel_size=3,
padding=1,
data_format=data_format)
self.conv_bn_relu3 = layers.SeparableConvBNReLU(
in_channels=256,
out_channels=256,
kernel_size=3,
padding=1,
data_format=data_format)
self.conv = nn.Conv2D(
in_channels=256,
out_channels=num_classes,
kernel_size=1,
data_format=data_format)
self.align_corners = align_corners
def forward(self, x, low_level_feat):
low_level_feat = self.conv_bn_relu1(low_level_feat)
if self.data_format == 'NCHW':
low_level_shape = paddle.shape(low_level_feat)[-2:]
axis = 1
else:
low_level_shape = paddle.shape(low_level_feat)[1:3]
axis = -1
x = F.interpolate(
x,
low_level_shape,
mode='bilinear',
align_corners=self.align_corners,
data_format=self.data_format)
x = paddle.concat([x, low_level_feat], axis=axis)
x = self.conv_bn_relu2(x)
x = self.conv_bn_relu3(x)
x = self.conv(x)
return x

@ -0,0 +1,149 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.models import layers
from paddlers.models.ppseg.utils import utils
@manager.MODELS.add_component
class DMNet(nn.Layer):
"""
The DMNet implementation based on PaddlePaddle.
The original article refers to
Junjun He, Zhongying Deng, Yu Qiao. "Dynamic Multi-scale Filters for Semantic Segmentation"
Args:
num_classes (int): The unique number of target classes.
        backbone (paddle.nn.Layer): Backbone network, currently supports ResNet50_vd/ResNet101_vd.
mid_channels (int): The middle channels of convolution layer. Default: 512.
filter_sizes (list, tuple): The filter size of generated convolution kernel used in Dynamic Convolutional Module. Default: [1, 3, 5, 7].
fusion (bool): Add one conv to fuse DCM output feature. Default: False.
pretrained (str, optional): The path or url of pretrained model. Default: None.
"""
def __init__(self,
num_classes,
backbone,
mid_channels=512,
filter_sizes=[1, 3, 5, 7],
fusion=False,
pretrained=None):
super().__init__()
self.backbone = backbone
self.dcm_modules = nn.LayerList()
for filter_size in filter_sizes:
self.dcm_modules.append(
DCM(filter_size, fusion, self.backbone.feat_channels[-1],
mid_channels), )
self.bottleneck = layers.ConvBNReLU(
self.backbone.feat_channels[-1] + len(filter_sizes) * mid_channels,
mid_channels,
3,
padding=1,
)
self.cls = nn.Conv2D(mid_channels, num_classes, 1)
self.fcn_head = nn.Sequential(
layers.ConvBNReLU(self.backbone.feat_channels[2],
mid_channels,
3,
padding=1),
nn.Conv2D(mid_channels, num_classes, 1),
)
self.pretrained = pretrained
self.init_weight()
def init_weight(self):
if self.pretrained is not None:
utils.load_entire_model(self, self.pretrained)
def forward(self, x):
feats = self.backbone(x)
x = feats[-1]
dcm_outs = [x]
for dcm_module in self.dcm_modules:
dcm_outs.append(dcm_module(x))
dcm_outs = paddle.concat(dcm_outs, axis=1)
x = self.bottleneck(dcm_outs)
x = self.cls(x)
x = F.interpolate(x,
scale_factor=8,
mode='bilinear',
align_corners=True)
output = [x]
if self.training:
fcn_out = self.fcn_head(feats[2])
fcn_out = F.interpolate(fcn_out,
scale_factor=8,
mode='bilinear',
align_corners=True)
output.append(fcn_out)
return output
return output
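# A minimal usage sketch (hedged: the backbone import is an illustrative
# assumption; the fixed scale_factor=8 above expects the backbone's last
# feature map at 1/8 of the input resolution, e.g. a ResNet_vd with
# output_stride=8):
#
#     from paddlers.models.ppseg.models.backbones import ResNet50_vd
#     model = DMNet(num_classes=19, backbone=ResNet50_vd(output_stride=8))
#     logits = model(paddle.rand([1, 3, 512, 512]))[0]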
class DCM(nn.Layer):
"""
Dynamic Convolutional Module used in DMNet.
Args:
filter_size (int): The filter size of generated convolution kernel used in Dynamic Convolutional Module.
fusion (bool): Add one conv to fuse DCM output feature.
in_channels (int): Input channels.
channels (int): Channels after modules, before conv_seg.
"""
def __init__(self, filter_size, fusion, in_channels, channels):
super().__init__()
self.filter_size = filter_size
self.fusion = fusion
self.channels = channels
pad = (self.filter_size - 1) // 2
if (self.filter_size - 1) % 2 == 0:
self.pad = (pad, pad, pad, pad)
else:
self.pad = (pad + 1, pad, pad + 1, pad)
self.avg_pool = nn.AdaptiveAvgPool2D(filter_size)
self.filter_gen_conv = nn.Conv2D(in_channels, channels, 1)
self.input_redu_conv = layers.ConvBNReLU(in_channels, channels, 1)
self.norm = layers.SyncBatchNorm(channels)
self.act = nn.ReLU()
if self.fusion:
self.fusion_conv = layers.ConvBNReLU(channels, channels, 1)
def forward(self, x):
generated_filter = self.filter_gen_conv(self.avg_pool(x))
x = self.input_redu_conv(x)
b, c, h, w = x.shape
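        # Apply a different generated filter to every sample in the batch with a
        # single conv call: fold the batch into the channel axis and run a
        # grouped convolution with groups = b * c, so each channel of each
        # sample is convolved with its own generated kernel.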
x = x.reshape([1, b * c, h, w])
generated_filter = generated_filter.reshape(
[b * c, 1, self.filter_size, self.filter_size])
x = F.pad(x, self.pad, mode='constant', value=0)
output = F.conv2d(x, weight=generated_filter, groups=b * c)
output = output.reshape([b, self.channels, h, w])
output = self.norm(output)
output = self.act(output)
if self.fusion:
output = self.fusion_conv(output)
return output

@ -0,0 +1,226 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppseg.models import layers
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.utils import utils
@manager.MODELS.add_component
class DNLNet(nn.Layer):
"""Disentangled Non-Local Neural Networks.
The original article refers to
Minghao Yin, et al. "Disentangled Non-Local Neural Networks"
(https://arxiv.org/abs/2006.06668)
Args:
num_classes (int): The unique number of target classes.
backbone (Paddle.nn.Layer): A backbone network.
backbone_indices (tuple): The values in the tuple indicate the indices of output of backbone.
reduction (int): Reduction factor of projection transform. Default: 2.
        use_scale (bool): Whether to scale pairwise_weight by
            sqrt(1/inter_channels). Default: True.
mode (str): The nonlocal mode. Options are 'embedded_gaussian',
'dot_product'. Default: 'embedded_gaussian'.
temperature (float): Temperature to adjust attention. Default: 0.05.
        concat_input (bool): Whether to concat the input and output of convs before classification layer. Default: True.
enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True.
align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
pretrained (str, optional): The path or url of pretrained model. Default: None.
"""
def __init__(self,
num_classes,
backbone,
backbone_indices=(2, 3),
reduction=2,
use_scale=True,
mode='embedded_gaussian',
temperature=0.05,
concat_input=True,
enable_auxiliary_loss=True,
align_corners=False,
pretrained=None):
super().__init__()
self.backbone = backbone
self.backbone_indices = backbone_indices
in_channels = [self.backbone.feat_channels[i] for i in backbone_indices]
self.head = DNLHead(num_classes, in_channels, reduction, use_scale,
mode, temperature, concat_input,
enable_auxiliary_loss)
self.align_corners = align_corners
self.pretrained = pretrained
self.init_weight()
def forward(self, x):
feats = self.backbone(x)
feats = [feats[i] for i in self.backbone_indices]
logit_list = self.head(feats)
logit_list = [
F.interpolate(
logit,
paddle.shape(x)[2:],
mode='bilinear',
align_corners=self.align_corners,
align_mode=1) for logit in logit_list
]
return logit_list
def init_weight(self):
if self.pretrained is not None:
utils.load_entire_model(self, self.pretrained)
class DNLHead(nn.Layer):
"""
The DNLNet head.
Args:
num_classes (int): The unique number of target classes.
in_channels (tuple): The number of input channels.
reduction (int): Reduction factor of projection transform. Default: 2.
        use_scale (bool): Whether to scale pairwise_weight by
            sqrt(1/inter_channels).
        mode (str): The nonlocal mode. Options are 'embedded_gaussian',
            'dot_product'. Default: 'embedded_gaussian'.
        temperature (float): Temperature to adjust attention. Default: 0.05.
        concat_input (bool): Whether to concat the input and output of convs before classification layer. Default: True.
enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True.
"""
def __init__(self,
num_classes,
in_channels,
reduction,
use_scale,
mode,
temperature,
concat_input=True,
enable_auxiliary_loss=True,
**kwargs):
super(DNLHead, self).__init__()
self.in_channels = in_channels[-1]
self.concat_input = concat_input
self.enable_auxiliary_loss = enable_auxiliary_loss
inter_channels = self.in_channels // 4
self.dnl_block = DisentangledNonLocal2D(
in_channels=inter_channels,
reduction=reduction,
use_scale=use_scale,
temperature=temperature,
mode=mode)
self.conv0 = layers.ConvBNReLU(
in_channels=self.in_channels,
out_channels=inter_channels,
kernel_size=3,
bias_attr=False)
self.conv1 = layers.ConvBNReLU(
in_channels=inter_channels,
out_channels=inter_channels,
kernel_size=3,
bias_attr=False)
self.cls = nn.Sequential(
nn.Dropout2D(p=0.1), nn.Conv2D(inter_channels, num_classes, 1))
self.aux = nn.Sequential(
layers.ConvBNReLU(
in_channels=1024,
out_channels=256,
kernel_size=3,
bias_attr=False), nn.Dropout2D(p=0.1),
nn.Conv2D(256, num_classes, 1))
if self.concat_input:
self.conv_cat = layers.ConvBNReLU(
self.in_channels + inter_channels,
inter_channels,
kernel_size=3,
bias_attr=False)
def forward(self, feat_list):
C3, C4 = feat_list
output = self.conv0(C4)
output = self.dnl_block(output)
output = self.conv1(output)
if self.concat_input:
output = self.conv_cat(paddle.concat([C4, output], axis=1))
output = self.cls(output)
if self.enable_auxiliary_loss:
auxout = self.aux(C3)
return [output, auxout]
else:
return [output]
class DisentangledNonLocal2D(layers.NonLocal2D):
"""Disentangled Non-Local Blocks.
Args:
temperature (float): Temperature to adjust attention.
"""
def __init__(self, temperature, *arg, **kwargs):
super().__init__(*arg, **kwargs)
self.temperature = temperature
self.conv_mask = nn.Conv2D(self.in_channels, 1, kernel_size=1)
def embedded_gaussian(self, theta_x, phi_x):
pairwise_weight = paddle.matmul(theta_x, phi_x)
if self.use_scale:
pairwise_weight /= theta_x.shape[-1]**0.5
pairwise_weight /= self.temperature
pairwise_weight = F.softmax(pairwise_weight, -1)
return pairwise_weight
def forward(self, x):
x_shape = paddle.shape(x)
g_x = self.g(x).reshape([0, self.inter_channels,
-1]).transpose([0, 2, 1])
if self.mode == "gaussian":
theta_x = paddle.transpose(
x.reshape([0, self.in_channels, -1]), [0, 2, 1])
            if self.sub_sample:
                phi_x = paddle.reshape(self.phi(x), [0, self.in_channels, -1])
            else:
                phi_x = paddle.reshape(x, [0, self.in_channels, -1])
elif self.mode == "concatenation":
theta_x = paddle.reshape(
self.theta(x), [0, self.inter_channels, -1, 1])
phi_x = paddle.reshape(self.phi(x), [0, self.inter_channels, 1, -1])
else:
theta_x = self.theta(x).reshape([0, self.inter_channels,
-1]).transpose([0, 2, 1])
phi_x = paddle.reshape(self.phi(x), [0, self.inter_channels, -1])
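        # Disentangling step: subtracting the means whitens theta/phi so the
        # pairwise term models pure pairwise similarity, while the unary branch
        # below (conv_mask) separately captures the salient-region term.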
theta_x -= paddle.mean(theta_x, axis=-2, keepdim=True)
phi_x -= paddle.mean(phi_x, axis=-1, keepdim=True)
pairwise_func = getattr(self, self.mode)
pairwise_weight = pairwise_func(theta_x, phi_x)
y = paddle.matmul(pairwise_weight, g_x).transpose([0, 2, 1]).reshape(
[0, self.inter_channels, x_shape[2], x_shape[3]])
unary_mask = F.softmax(
paddle.reshape(self.conv_mask(x), [0, 1, -1]), -1)
unary_x = paddle.matmul(unary_mask, g_x).transpose([0, 2, 1]).reshape(
[0, self.inter_channels, 1, 1])
output = x + self.conv_out(y + unary_x)
return output

@ -0,0 +1,215 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppseg.models import layers
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.utils import utils
@manager.MODELS.add_component
class EMANet(nn.Layer):
"""
Expectation Maximization Attention Networks for Semantic Segmentation based on PaddlePaddle.
The original article refers to
Xia Li, et al. "Expectation-Maximization Attention Networks for Semantic Segmentation"
(https://arxiv.org/abs/1907.13426)
Args:
num_classes (int): The unique number of target classes.
backbone (Paddle.nn.Layer): A backbone network.
backbone_indices (tuple): The values in the tuple indicate the indices of output of backbone.
ema_channels (int): EMA module channels.
gc_channels (int): The input channels to Global Context Block.
num_bases (int): Number of bases.
stage_num (int): The iteration number for EM.
momentum (float): The parameter for updating bases.
        concat_input (bool): Whether to concat the input and output of convs before classification layer. Default: True.
enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True.
align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
pretrained (str, optional): The path or url of pretrained model. Default: None.
"""
def __init__(self,
num_classes,
backbone,
backbone_indices=(2, 3),
ema_channels=512,
gc_channels=256,
num_bases=64,
stage_num=3,
momentum=0.1,
concat_input=True,
enable_auxiliary_loss=True,
align_corners=False,
pretrained=None):
super().__init__()
self.backbone = backbone
self.backbone_indices = backbone_indices
in_channels = [self.backbone.feat_channels[i] for i in backbone_indices]
self.head = EMAHead(num_classes, in_channels, ema_channels, gc_channels,
num_bases, stage_num, momentum, concat_input,
enable_auxiliary_loss)
self.align_corners = align_corners
self.pretrained = pretrained
self.init_weight()
def forward(self, x):
feats = self.backbone(x)
feats = [feats[i] for i in self.backbone_indices]
logit_list = self.head(feats)
logit_list = [
F.interpolate(
logit,
paddle.shape(x)[2:],
mode='bilinear',
align_corners=self.align_corners) for logit in logit_list
]
return logit_list
def init_weight(self):
if self.pretrained is not None:
utils.load_entire_model(self, self.pretrained)
class EMAHead(nn.Layer):
"""
The EMANet head.
Args:
num_classes (int): The unique number of target classes.
in_channels (tuple): The number of input channels.
ema_channels (int): EMA module channels.
gc_channels (int): The input channels to Global Context Block.
num_bases (int): Number of bases.
stage_num (int): The iteration number for EM.
momentum (float): The parameter for updating bases.
        concat_input (bool): Whether to concat the input and output of convs before classification layer. Default: True.
enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True.
"""
def __init__(self,
num_classes,
in_channels,
ema_channels,
gc_channels,
num_bases,
stage_num,
momentum,
concat_input=True,
enable_auxiliary_loss=True):
super(EMAHead, self).__init__()
self.in_channels = in_channels[-1]
self.concat_input = concat_input
self.enable_auxiliary_loss = enable_auxiliary_loss
self.emau = EMAU(ema_channels, num_bases, stage_num, momentum=momentum)
self.ema_in_conv = layers.ConvBNReLU(
in_channels=self.in_channels,
out_channels=ema_channels,
kernel_size=3)
self.ema_mid_conv = nn.Conv2D(ema_channels, ema_channels, kernel_size=1)
self.ema_out_conv = layers.ConvBNReLU(
in_channels=ema_channels, out_channels=ema_channels, kernel_size=1)
self.bottleneck = layers.ConvBNReLU(
in_channels=ema_channels, out_channels=gc_channels, kernel_size=3)
self.cls = nn.Sequential(
nn.Dropout2D(p=0.1), nn.Conv2D(gc_channels, num_classes, 1))
self.aux = nn.Sequential(
layers.ConvBNReLU(
in_channels=1024, out_channels=256, kernel_size=3),
nn.Dropout2D(p=0.1), nn.Conv2D(256, num_classes, 1))
if self.concat_input:
self.conv_cat = layers.ConvBNReLU(
self.in_channels + gc_channels, gc_channels, kernel_size=3)
def forward(self, feat_list):
C3, C4 = feat_list
feats = self.ema_in_conv(C4)
identity = feats
feats = self.ema_mid_conv(feats)
recon = self.emau(feats)
recon = F.relu(recon)
recon = self.ema_out_conv(recon)
output = F.relu(identity + recon)
output = self.bottleneck(output)
if self.concat_input:
output = self.conv_cat(paddle.concat([C4, output], axis=1))
output = self.cls(output)
if self.enable_auxiliary_loss:
auxout = self.aux(C3)
return [output, auxout]
else:
return [output]
class EMAU(nn.Layer):
'''The Expectation-Maximization Attention Unit (EMAU).
Arguments:
c (int): The input and output channel number.
k (int): The number of the bases.
stage_num (int): The iteration number for EM.
momentum (float): The parameter for updating bases.
'''
def __init__(self, c, k, stage_num=3, momentum=0.1):
super(EMAU, self).__init__()
assert stage_num >= 1
self.stage_num = stage_num
self.momentum = momentum
self.c = c
tmp_mu = self.create_parameter(
shape=[1, c, k],
default_initializer=paddle.nn.initializer.KaimingNormal(k))
mu = F.normalize(paddle.to_tensor(tmp_mu), axis=1, p=2)
self.register_buffer('mu', mu)
def forward(self, x):
x_shape = paddle.shape(x)
x = x.flatten(2)
mu = paddle.tile(self.mu, [x_shape[0], 1, 1])
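        # EM iterations (run without gradients): the E-step computes soft
        # assignments z of pixels to the k bases, and the M-step re-estimates
        # the bases mu as the normalized weighted sum of pixel features.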
with paddle.no_grad():
for i in range(self.stage_num):
x_t = paddle.transpose(x, [0, 2, 1])
z = paddle.bmm(x_t, mu)
z = F.softmax(z, axis=2)
z_ = F.normalize(z, axis=1, p=1)
mu = paddle.bmm(x, z_)
mu = F.normalize(mu, axis=1, p=2)
z_t = paddle.transpose(z, [0, 2, 1])
x = paddle.matmul(mu, z_t)
x = paddle.reshape(x, [0, self.c, x_shape[2], x_shape[3]])
if self.training:
mu = paddle.mean(mu, 0, keepdim=True)
mu = F.normalize(mu, axis=1, p=2)
mu = self.mu * (1 - self.momentum) + mu * self.momentum
if paddle.distributed.get_world_size() > 1:
mu = paddle.distributed.all_reduce(mu)
mu /= paddle.distributed.get_world_size()
self.mu = mu
return x

@ -0,0 +1,224 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.models import layers
from paddlers.models.ppseg.utils import utils
@manager.MODELS.add_component
class ENCNet(nn.Layer):
"""
The ENCNet implementation based on PaddlePaddle.
The original article refers to
Hang Zhang, Kristin Dana, et, al. "Context Encoding for Semantic Segmentation".
Args:
num_classes (int): The unique number of target classes.
backbone (Paddle.nn.Layer): A backbone network.
backbone_indices (tuple): The values in the tuple indicate the indices of
output of backbone.
num_codes (int): The number of encoded words. Default: 32.
mid_channels (int): The channels of middle layers. Default: 512.
        use_se_loss (bool): Whether to use semantic encoding loss. Default: True.
        add_lateral (bool): Whether to use lateral convolution layers. Default: False.
pretrained (str, optional): The path or url of pretrained model. Default: None.
"""
def __init__(self,
num_classes,
backbone,
backbone_indices=[1, 2, 3],
num_codes=32,
mid_channels=512,
use_se_loss=True,
add_lateral=False,
pretrained=None):
super().__init__()
self.add_lateral = add_lateral
self.num_codes = num_codes
self.backbone = backbone
self.backbone_indices = backbone_indices
in_channels = [
self.backbone.feat_channels[index] for index in backbone_indices
]
self.bottleneck = layers.ConvBNReLU(
in_channels[-1],
mid_channels,
3,
padding=1,
)
if self.add_lateral:
self.lateral_convs = nn.LayerList()
for in_ch in in_channels[:-1]:
self.lateral_convs.append(
layers.ConvBNReLU(
in_ch,
mid_channels,
1,
))
self.fusion = layers.ConvBNReLU(
len(in_channels) * mid_channels,
mid_channels,
3,
padding=1,
)
self.enc_module = EncModule(mid_channels, num_codes)
self.head = nn.Conv2D(mid_channels, num_classes, 1)
self.fcn_head = layers.AuxLayer(self.backbone.feat_channels[2],
mid_channels, num_classes)
self.use_se_loss = use_se_loss
if use_se_loss:
self.se_layer = nn.Linear(mid_channels, num_classes)
self.pretrained = pretrained
self.init_weight()
def init_weight(self):
if self.pretrained is not None:
utils.load_entire_model(self, self.pretrained)
def forward(self, inputs):
N, C, H, W = paddle.shape(inputs)
feats = self.backbone(inputs)
fcn_feat = feats[2]
feats = [feats[i] for i in self.backbone_indices]
feat = self.bottleneck(feats[-1])
if self.add_lateral:
laterals = []
for j, lateral_conv in enumerate(self.lateral_convs):
laterals.append(
F.interpolate(lateral_conv(feats[j]),
size=paddle.shape(feat)[2:],
mode='bilinear',
align_corners=False))
feat = self.fusion(paddle.concat([feat, *laterals], 1))
encode_feat, feat = self.enc_module(feat)
out = self.head(feat)
out = F.interpolate(out,
size=[H, W],
mode='bilinear',
align_corners=False)
output = [out]
if self.training:
fcn_out = self.fcn_head(fcn_feat)
fcn_out = F.interpolate(fcn_out,
size=[H, W],
mode='bilinear',
align_corners=False)
output.append(fcn_out)
if self.use_se_loss:
se_out = self.se_layer(encode_feat)
output.append(se_out)
return output
return output
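# Note on outputs: in training mode the returned list is [seg_logits,
# fcn_aux_logits], plus the per-image semantic-encoding logits when
# use_se_loss is enabled, so the loss configuration must supply one loss per
# entry, in that order.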
class Encoding(nn.Layer):
def __init__(self, channels, num_codes):
super().__init__()
self.channels, self.num_codes = channels, num_codes
std = 1 / ((channels * num_codes)**0.5)
self.codewords = self.create_parameter(
shape=(num_codes, channels),
default_initializer=nn.initializer.Uniform(-std, std),
)
self.scale = self.create_parameter(
shape=(num_codes, ),
default_initializer=nn.initializer.Uniform(-1, 0),
)
self.channels = channels
def scaled_l2(self, x, codewords, scale):
num_codes, channels = paddle.shape(codewords)
reshaped_scale = scale.reshape([1, 1, num_codes])
expanded_x = paddle.tile(x.unsqueeze(2), [1, 1, num_codes, 1])
reshaped_codewords = codewords.reshape([1, 1, num_codes, channels])
scaled_l2_norm = paddle.multiply(
reshaped_scale,
(expanded_x - reshaped_codewords).pow(2).sum(axis=3))
return scaled_l2_norm
def aggregate(self, assignment_weights, x, codewords):
num_codes, channels = paddle.shape(codewords)
reshaped_codewords = codewords.reshape([1, 1, num_codes, channels])
expanded_x = paddle.tile(x.unsqueeze(2), [1, 1, num_codes, 1])
encoded_feat = paddle.multiply(
assignment_weights.unsqueeze(3),
(expanded_x - reshaped_codewords)).sum(axis=1)
encoded_feat = paddle.reshape(encoded_feat,
[-1, self.num_codes, self.channels])
return encoded_feat
def forward(self, x):
x_dims = x.ndim
assert x_dims == 4, "The dimension of input tensor must equal 4, but got {}.".format(
x_dims)
assert paddle.shape(
x
        )[1] == self.channels, "Encoding channels error, expected {} but got {}.".format(
self.channels,
paddle.shape(x)[1])
batch_size = paddle.shape(x)[0]
x = x.reshape([batch_size, self.channels, -1]).transpose([0, 2, 1])
assignment_weights = F.softmax(self.scaled_l2(x, self.codewords,
self.scale),
axis=2)
encoded_feat = self.aggregate(assignment_weights, x, self.codewords)
return encoded_feat
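# Shape walk-through (hedged, illustrative): for x of shape [N, C, H, W] with
# num_codes=K, the flattened input is [N, H*W, C], scaled_l2 yields assignment
# weights [N, H*W, K], and aggregate returns the residual encoding [N, K, C],
# i.e. one C-dim aggregated residual per codeword.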
class EncModule(nn.Layer):
def __init__(self, in_channels, num_codes):
super().__init__()
self.encoding_project = layers.ConvBNReLU(
in_channels,
in_channels,
1,
)
self.encoding = nn.Sequential(
Encoding(channels=in_channels, num_codes=num_codes),
nn.BatchNorm1D(num_codes),
nn.ReLU(),
)
self.fc = nn.Sequential(
nn.Linear(in_channels, in_channels),
nn.Sigmoid(),
)
self.in_channels = in_channels
def forward(self, x):
encoding_projection = self.encoding_project(x)
encoding_feat = self.encoding(encoding_projection)
encoding_feat = encoding_feat.mean(axis=1)
batch_size, _, _, _ = paddle.shape(x)
gamma = self.fc(encoding_feat)
y = gamma.reshape([batch_size, self.in_channels, 1, 1])
output = F.relu(x + x * y)
return encoding_feat, output

@ -0,0 +1,622 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppseg import utils
from paddlers.models.ppseg.models import layers
from paddlers.models.ppseg.cvlibs import manager, param_init
__all__ = ['ENet']
@manager.MODELS.add_component
class ENet(nn.Layer):
"""
The ENet implementation based on PaddlePaddle.
The original article refers to
Adam Paszke, Abhishek Chaurasia, Sangpil Kim, Eugenio Culurciello, et al."ENet: A Deep Neural Network Architecture for Real-Time Semantic Segmentation"
(https://arxiv.org/abs/1606.02147).
Args:
num_classes (int): The unique number of target classes.
pretrained (str, optional): The path or url of pretrained model. Default: None.
encoder_relu (bool, optional): When ``True`` ReLU is used as the activation
function; otherwise, PReLU is used. Default: False.
decoder_relu (bool, optional): When ``True`` ReLU is used as the activation
function; otherwise, PReLU is used. Default: True.
"""
def __init__(self,
num_classes,
pretrained=None,
encoder_relu=False,
decoder_relu=True):
super(ENet, self).__init__()
self.numclasses = num_classes
self.initial_block = InitialBlock(3, 16, relu=encoder_relu)
self.downsample1_0 = DownsamplingBottleneck(16,
64,
return_indices=True,
dropout_prob=0.01,
relu=encoder_relu)
self.regular1_1 = RegularBottleneck(64,
padding=1,
dropout_prob=0.01,
relu=encoder_relu)
self.regular1_2 = RegularBottleneck(64,
padding=1,
dropout_prob=0.01,
relu=encoder_relu)
self.regular1_3 = RegularBottleneck(64,
padding=1,
dropout_prob=0.01,
relu=encoder_relu)
self.regular1_4 = RegularBottleneck(64,
padding=1,
dropout_prob=0.01,
relu=encoder_relu)
self.downsample2_0 = DownsamplingBottleneck(64,
128,
return_indices=True,
dropout_prob=0.1,
relu=encoder_relu)
self.regular2_1 = RegularBottleneck(128,
padding=1,
dropout_prob=0.1,
relu=encoder_relu)
self.dilated2_2 = RegularBottleneck(128,
dilation=2,
padding=2,
dropout_prob=0.1,
relu=encoder_relu)
self.asymmetric2_3 = RegularBottleneck(128,
kernel_size=5,
padding=2,
asymmetric=True,
dropout_prob=0.1,
relu=encoder_relu)
self.dilated2_4 = RegularBottleneck(128,
dilation=4,
padding=4,
dropout_prob=0.1,
relu=encoder_relu)
self.regular2_5 = RegularBottleneck(128,
padding=1,
dropout_prob=0.1,
relu=encoder_relu)
self.dilated2_6 = RegularBottleneck(128,
dilation=8,
padding=8,
dropout_prob=0.1,
relu=encoder_relu)
self.asymmetric2_7 = RegularBottleneck(128,
kernel_size=5,
asymmetric=True,
padding=2,
dropout_prob=0.1,
relu=encoder_relu)
self.dilated2_8 = RegularBottleneck(128,
dilation=16,
padding=16,
dropout_prob=0.1,
relu=encoder_relu)
self.regular3_0 = RegularBottleneck(128,
padding=1,
dropout_prob=0.1,
relu=encoder_relu)
self.dilated3_1 = RegularBottleneck(128,
dilation=2,
padding=2,
dropout_prob=0.1,
relu=encoder_relu)
self.asymmetric3_2 = RegularBottleneck(128,
kernel_size=5,
padding=2,
asymmetric=True,
dropout_prob=0.1,
relu=encoder_relu)
self.dilated3_3 = RegularBottleneck(128,
dilation=4,
padding=4,
dropout_prob=0.1,
relu=encoder_relu)
self.regular3_4 = RegularBottleneck(128,
padding=1,
dropout_prob=0.1,
relu=encoder_relu)
self.dilated3_5 = RegularBottleneck(128,
dilation=8,
padding=8,
dropout_prob=0.1,
relu=encoder_relu)
self.asymmetric3_6 = RegularBottleneck(128,
kernel_size=5,
asymmetric=True,
padding=2,
dropout_prob=0.1,
relu=encoder_relu)
self.dilated3_7 = RegularBottleneck(128,
dilation=16,
padding=16,
dropout_prob=0.1,
relu=encoder_relu)
self.upsample4_0 = UpsamplingBottleneck(128,
64,
dropout_prob=0.1,
relu=decoder_relu)
self.regular4_1 = RegularBottleneck(64,
padding=1,
dropout_prob=0.1,
relu=decoder_relu)
self.regular4_2 = RegularBottleneck(64,
padding=1,
dropout_prob=0.1,
relu=decoder_relu)
self.upsample5_0 = UpsamplingBottleneck(64,
16,
dropout_prob=0.1,
relu=decoder_relu)
self.regular5_1 = RegularBottleneck(16,
padding=1,
dropout_prob=0.1,
relu=decoder_relu)
self.transposed_conv = nn.Conv2DTranspose(16,
num_classes,
kernel_size=3,
stride=2,
padding=1,
bias_attr=False)
self.pretrained = pretrained
self.init_weight()
def forward(self, x):
input_size = x.shape
x = self.initial_block(x)
stage1_input_size = x.shape
x, max_indices1_0 = self.downsample1_0(x)
x = self.regular1_1(x)
x = self.regular1_2(x)
x = self.regular1_3(x)
x = self.regular1_4(x)
stage2_input_size = x.shape
x, max_indices2_0 = self.downsample2_0(x)
x = self.regular2_1(x)
x = self.dilated2_2(x)
x = self.asymmetric2_3(x)
x = self.dilated2_4(x)
x = self.regular2_5(x)
x = self.dilated2_6(x)
x = self.asymmetric2_7(x)
x = self.dilated2_8(x)
x = self.regular3_0(x)
x = self.dilated3_1(x)
x = self.asymmetric3_2(x)
x = self.dilated3_3(x)
x = self.regular3_4(x)
x = self.dilated3_5(x)
x = self.asymmetric3_6(x)
x = self.dilated3_7(x)
x = self.upsample4_0(x, max_indices2_0, output_size=stage2_input_size)
x = self.regular4_1(x)
x = self.regular4_2(x)
x = self.upsample5_0(x, max_indices1_0, output_size=stage1_input_size)
x = self.regular5_1(x)
x = self.transposed_conv(x, output_size=input_size[2:])
return [x]
def init_weight(self):
if self.pretrained is not None:
utils.load_pretrained_model(self, self.pretrained)
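# A minimal usage sketch (hedged: shapes are illustrative; spatial dims
# divisible by 8 keep the three stride-2 stages and the mirrored unpooling
# shapes consistent):
#
#     model = ENet(num_classes=19)
#     logits = model(paddle.rand([1, 3, 512, 1024]))[0]  # [1, 19, 512, 1024]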
class InitialBlock(nn.Layer):
"""
The initial block is composed of two branches:
1. a main branch which performs a regular convolution with stride 2;
2. an extension branch which performs max-pooling.
Doing both operations in parallel and concatenating their results
allows for efficient downsampling and expansion. The main branch
outputs 13 feature maps while the extension branch outputs 3, for a
total of 16 feature maps after concatenation.
Args:
in_channels (int): the number of input channels.
        out_channels (int): the number of output channels.
bias (bool, optional): Adds a learnable bias to the output if
``True``. Default: False.
relu (bool, optional): When ``True`` ReLU is used as the activation
function; otherwise, PReLU is used. Default: True.
"""
def __init__(self, in_channels, out_channels, bias=False, relu=True):
super(InitialBlock, self).__init__()
if relu:
activation = nn.ReLU
else:
activation = nn.PReLU
self.main_branch = nn.Conv2D(in_channels,
out_channels - 3,
kernel_size=3,
stride=2,
padding=1,
bias_attr=bias)
self.ext_branch = nn.MaxPool2D(3, stride=2, padding=1)
self.batch_norm = layers.SyncBatchNorm(out_channels)
self.out_activation = activation()
def forward(self, x):
main = self.main_branch(x)
ext = self.ext_branch(x)
out = paddle.concat((main, ext), 1)
out = self.batch_norm(out)
return self.out_activation(out)
class RegularBottleneck(nn.Layer):
"""
Regular bottlenecks are the main building block of ENet.
Main branch:
1. Shortcut connection.
Extension branch:
1. 1x1 convolution which decreases the number of channels by
``internal_ratio``, also called a projection;
2. regular, dilated or asymmetric convolution;
3. 1x1 convolution which increases the number of channels back to
``channels``, also called an expansion;
4. dropout as a regularizer.
Args:
channels (int): the number of input and output channels.
internal_ratio (int, optional): a scale factor applied to
``channels`` used to compute the number of
channels after the projection. eg. given ``channels`` equal to 128 and
internal_ratio equal to 2 the number of channels after the projection
is 64. Default: 4.
kernel_size (int, optional): the kernel size of the filters used in
the convolution layer described above in item 2 of the extension
branch. Default: 3.
padding (int, optional): zero-padding added to both sides of the
input. Default: 0.
dilation (int, optional): spacing between kernel elements for the
convolution described in item 2 of the extension branch. Default: 1.
asymmetric (bool, optional): flags if the convolution described in
item 2 of the extension branch is asymmetric or not. Default: False.
dropout_prob (float, optional): probability of an element to be
zeroed. Default: 0 (no dropout).
bias (bool, optional): Adds a learnable bias to the output if
``True``. Default: False.
relu (bool, optional): When ``True`` ReLU is used as the activation
function; otherwise, PReLU is used. Default: True.
"""
def __init__(self,
channels,
internal_ratio=4,
kernel_size=3,
padding=0,
dilation=1,
asymmetric=False,
dropout_prob=0,
bias=False,
relu=True):
super(RegularBottleneck, self).__init__()
if internal_ratio <= 1 or internal_ratio > channels:
raise RuntimeError(
"Value out of range. Expected value in the "
"interval [1, {0}], got internal_scale={1}.".format(
channels, internal_ratio))
internal_channels = channels // internal_ratio
if relu:
activation = nn.ReLU
else:
activation = nn.PReLU
self.ext_conv1 = nn.Sequential(
nn.Conv2D(channels,
internal_channels,
kernel_size=1,
stride=1,
bias_attr=bias), layers.SyncBatchNorm(internal_channels),
activation())
if asymmetric:
self.ext_conv2 = nn.Sequential(
nn.Conv2D(internal_channels,
internal_channels,
kernel_size=(kernel_size, 1),
stride=1,
padding=(padding, 0),
dilation=dilation,
bias_attr=bias),
layers.SyncBatchNorm(internal_channels), activation(),
nn.Conv2D(internal_channels,
internal_channels,
kernel_size=(1, kernel_size),
stride=1,
padding=(0, padding),
dilation=dilation,
bias_attr=bias),
layers.SyncBatchNorm(internal_channels), activation())
else:
self.ext_conv2 = nn.Sequential(
nn.Conv2D(internal_channels,
internal_channels,
kernel_size=kernel_size,
stride=1,
padding=padding,
dilation=dilation,
bias_attr=bias),
layers.SyncBatchNorm(internal_channels), activation())
self.ext_conv3 = nn.Sequential(
nn.Conv2D(internal_channels,
channels,
kernel_size=1,
stride=1,
bias_attr=bias), layers.SyncBatchNorm(channels),
activation())
self.ext_regul = nn.Dropout2D(p=dropout_prob)
self.out_activation = activation()
def forward(self, x):
main = x
ext = self.ext_conv1(x)
ext = self.ext_conv2(ext)
ext = self.ext_conv3(ext)
ext = self.ext_regul(ext)
out = main + ext
return self.out_activation(out)
class DownsamplingBottleneck(nn.Layer):
"""
Downsampling bottlenecks further downsample the feature map size.
Main branch:
1. max pooling with stride 2; indices are saved to be used for
unpooling later.
Extension branch:
1. 2x2 convolution with stride 2 that decreases the number of channels
by ``internal_ratio``, also called a projection;
2. regular convolution (by default, 3x3);
3. 1x1 convolution which increases the number of channels to
``out_channels``, also called an expansion;
4. dropout as a regularizer.
Args:
in_channels (int): the number of input channels.
out_channels (int): the number of output channels.
internal_ratio (int, optional): a scale factor applied to ``channels``
used to compute the number of channels after the projection. eg. given
``channels`` equal to 128 and internal_ratio equal to 2 the number of
channels after the projection is 64. Default: 4.
return_indices (bool, optional): if ``True``, will return the max
indices along with the outputs. Useful when unpooling later.
dropout_prob (float, optional): probability of an element to be
zeroed. Default: 0 (no dropout).
bias (bool, optional): Adds a learnable bias to the output if
``True``. Default: False.
relu (bool, optional): When ``True`` ReLU is used as the activation
function; otherwise, PReLU is used. Default: True.
"""
def __init__(self,
in_channels,
out_channels,
internal_ratio=4,
return_indices=False,
dropout_prob=0,
bias=False,
relu=True):
super(DownsamplingBottleneck, self).__init__()
self.return_indices = return_indices
if internal_ratio <= 1 or internal_ratio > in_channels:
raise RuntimeError(
"Value out of range. Expected value in the "
"interval [1, {0}], got internal_scale={1}. ".format(
in_channels, internal_ratio))
internal_channels = in_channels // internal_ratio
if relu:
activation = nn.ReLU
else:
activation = nn.PReLU
self.main_max1 = nn.MaxPool2D(2, stride=2, return_mask=return_indices)
self.ext_conv1 = nn.Sequential(
nn.Conv2D(in_channels,
internal_channels,
kernel_size=2,
stride=2,
bias_attr=bias), layers.SyncBatchNorm(internal_channels),
activation())
self.ext_conv2 = nn.Sequential(
nn.Conv2D(internal_channels,
internal_channels,
kernel_size=3,
stride=1,
padding=1,
bias_attr=bias), layers.SyncBatchNorm(internal_channels),
activation())
self.ext_conv3 = nn.Sequential(
nn.Conv2D(internal_channels,
out_channels,
kernel_size=1,
stride=1,
bias_attr=bias), layers.SyncBatchNorm(out_channels),
activation())
self.ext_regul = nn.Dropout2D(p=dropout_prob)
self.out_activation = activation()
def forward(self, x):
        if self.return_indices:
            main, max_indices = self.main_max1(x)
        else:
            main = self.main_max1(x)
            max_indices = None
ext = self.ext_conv1(x)
ext = self.ext_conv2(ext)
ext = self.ext_conv3(ext)
ext = self.ext_regul(ext)
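        # The max-pooled main branch keeps in_channels while the extension
        # branch outputs out_channels; zero-pad the main branch's channel axis
        # so the two can be summed element-wise.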
n, ch_ext, h, w = ext.shape
ch_main = main.shape[1]
padding = paddle.zeros((n, ch_ext - ch_main, h, w))
main = paddle.concat((main, padding), 1)
out = main + ext
return self.out_activation(out), max_indices
class UpsamplingBottleneck(nn.Layer):
"""
The upsampling bottlenecks upsample the feature map resolution using max
pooling indices stored from the corresponding downsampling bottleneck.
Main branch:
    1. 1x1 convolution with stride 1 that projects the number of channels
       to ``out_channels``;
2. max unpool layer using the max pool indices from the corresponding
downsampling max pool layer.
Extension branch:
1. 1x1 convolution with stride 1 that decreases the number of channels by
``internal_ratio``, also called a projection;
2. transposed convolution (by default, 3x3);
3. 1x1 convolution which increases the number of channels to
``out_channels``, also called an expansion;
4. dropout as a regularizer.
Args:
in_channels (int): the number of input channels.
out_channels (int): the number of output channels.
internal_ratio (int, optional): a scale factor applied to ``in_channels``
used to compute the number of channels after the projection. eg. given
``in_channels`` equal to 128 and ``internal_ratio`` equal to 2 the number
of channels after the projection is 64. Default: 4.
dropout_prob (float, optional): probability of an element to be zeroed.
Default: 0 (no dropout).
bias (bool, optional): Adds a learnable bias to the output if ``True``.
Default: False.
relu (bool, optional): When ``True`` ReLU is used as the activation
function; otherwise, PReLU is used. Default: True.
"""
def __init__(self,
in_channels,
out_channels,
internal_ratio=4,
dropout_prob=0,
bias=False,
relu=True):
super(UpsamplingBottleneck, self).__init__()
if internal_ratio <= 1 or internal_ratio > in_channels:
raise RuntimeError(
"Value out of range. Expected value in the "
"interval [1, {0}], got internal_scale={1}. ".format(
in_channels, internal_ratio))
internal_channels = in_channels // internal_ratio
if relu:
activation = nn.ReLU
else:
activation = nn.PReLU
self.main_conv1 = nn.Sequential(
nn.Conv2D(in_channels, out_channels, kernel_size=1, bias_attr=bias),
layers.SyncBatchNorm(out_channels))
self.ext_conv1 = nn.Sequential(
nn.Conv2D(in_channels,
internal_channels,
kernel_size=1,
bias_attr=bias), layers.SyncBatchNorm(internal_channels),
activation())
self.ext_tconv1 = nn.Conv2DTranspose(internal_channels,
internal_channels,
kernel_size=2,
stride=2,
bias_attr=bias)
self.ext_tconv1_bnorm = layers.SyncBatchNorm(internal_channels)
self.ext_tconv1_activation = activation()
self.ext_conv2 = nn.Sequential(
nn.Conv2D(internal_channels,
out_channels,
kernel_size=1,
bias_attr=bias), layers.SyncBatchNorm(out_channels))
self.ext_regul = nn.Dropout2D(p=dropout_prob)
self.out_activation = activation()
def forward(self, x, max_indices, output_size):
main = self.main_conv1(x)
main = F.max_unpool2d(main,
max_indices,
kernel_size=2,
output_size=output_size)
ext = self.ext_conv1(x)
ext = self.ext_tconv1(ext, output_size=output_size[2:])
ext = self.ext_tconv1_bnorm(ext)
ext = self.ext_tconv1_activation(ext)
ext = self.ext_conv2(ext)
ext = self.ext_regul(ext)
out = main + ext
return self.out_activation(out)

@ -0,0 +1,477 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppseg import utils
from paddlers.models.ppseg.cvlibs import manager, param_init
from paddlers.models.ppseg.models import layers
@manager.MODELS.add_component
class ESPNetV2(nn.Layer):
"""
The ESPNetV2 implementation based on PaddlePaddle.
The original article refers to
Sachin Mehta, Mohammad Rastegari, Linda Shapiro, and Hannaneh Hajishirzi. "ESPNetv2: A Light-weight, Power Efficient, and General Purpose Convolutional Neural Network"
(https://arxiv.org/abs/1811.11431).
Args:
num_classes (int): The unique number of target classes.
in_channels (int, optional): Number of input channels. Default: 3.
        scale (float, optional): The scale of channels, only supports scale <= 1.5 or scale == 2. Default: 1.0.
        drop_prob (float, optional): The probability of dropout. Default: 0.1.
pretrained (str, optional): The path or url of pretrained model. Default: None.
"""
def __init__(self,
num_classes,
in_channels=3,
scale=1.0,
drop_prob=0.1,
pretrained=None):
super().__init__()
self.backbone = EESPNetBackbone(in_channels, drop_prob, scale)
self.in_channels = self.backbone.out_channels
self.proj_l4_c = layers.ConvBNPReLU(self.in_channels[3],
self.in_channels[2],
1,
stride=1,
bias_attr=False)
psp_size = 2 * self.in_channels[2]
self.eesp_psp = nn.Sequential(
EESP(psp_size,
psp_size // 2,
stride=1,
branches=4,
kernel_size_maximum=7),
PSPModule(psp_size // 2, psp_size // 2),
)
self.project_l3 = nn.Sequential(
nn.Dropout2D(p=drop_prob),
nn.Conv2D(psp_size // 2, num_classes, 1, 1, bias_attr=False),
)
self.act_l3 = BNPReLU(num_classes)
self.project_l2 = layers.ConvBNPReLU(self.in_channels[1] + num_classes,
num_classes,
1,
stride=1,
bias_attr=False)
self.project_l1 = nn.Sequential(
nn.Dropout2D(p=drop_prob),
nn.Conv2D(self.in_channels[0] + num_classes,
num_classes,
1,
1,
bias_attr=False),
)
self.pretrained = pretrained
self.init_weight()
def init_weight(self):
if self.pretrained is not None:
utils.load_entire_model(self, self.pretrained)
def hierarchical_upsample(self, x, factor=3):
for i in range(factor):
x = F.interpolate(x,
scale_factor=2,
mode='bilinear',
align_corners=True)
return x
def forward(self, x):
out_l1, out_l2, out_l3, out_l4 = self.backbone(x)
out_l4_proj = self.proj_l4_c(out_l4)
l4_to_l3 = F.interpolate(out_l4_proj,
scale_factor=2,
mode='bilinear',
align_corners=True)
merged_l3 = self.eesp_psp(paddle.concat([out_l3, l4_to_l3], axis=1))
proj_merge_l3 = self.project_l3(merged_l3)
proj_merge_l3 = self.act_l3(proj_merge_l3)
l3_to_l2 = F.interpolate(proj_merge_l3,
scale_factor=2,
mode='bilinear',
align_corners=True)
merged_l2 = self.project_l2(paddle.concat([out_l2, l3_to_l2], axis=1))
l2_to_l1 = F.interpolate(merged_l2,
scale_factor=2,
mode='bilinear',
align_corners=True)
merged_l1 = self.project_l1(paddle.concat([out_l1, l2_to_l1], axis=1))
if self.training:
return [
F.interpolate(merged_l1,
scale_factor=2,
mode='bilinear',
align_corners=True),
self.hierarchical_upsample(proj_merge_l3),
]
else:
return [
F.interpolate(merged_l1,
scale_factor=2,
mode='bilinear',
align_corners=True)
]
class BNPReLU(nn.Layer):
def __init__(self, out_channels, **kwargs):
super().__init__()
if 'data_format' in kwargs:
data_format = kwargs['data_format']
else:
data_format = 'NCHW'
self._batch_norm = layers.SyncBatchNorm(out_channels,
data_format=data_format)
self._prelu = layers.Activation("prelu")
def forward(self, x):
x = self._batch_norm(x)
x = self._prelu(x)
return x
class EESP(nn.Layer):
"""
EESP block, principle: reduce -> split -> transform -> merge
Args:
in_channels (int): Number of input channels.
out_channels (int): Number of output channels.
stride (int, optional): Factor by which we should skip (useful for down-sampling). If 2, then down-samples the feature map by 2. Default: 1.
branches (int, optional): Number of branches. Default: 4.
kernel_size_maximum (int, optional): A maximum value of receptive field allowed for EESP block. Default: 7.
        down_method (str, optional): Downsampling method, only supports 'avg' and 'esp' (i.e. whether the stride is 2 or not). Default: 'esp'.
"""
def __init__(self,
in_channels,
out_channels,
stride=1,
branches=4,
kernel_size_maximum=7,
down_method='esp'):
super(EESP, self).__init__()
if out_channels % branches != 0:
raise RuntimeError(
"The out_channes for EESP should be factorized by branches, but out_channels={} cann't be factorized by branches={}"
.format(out_channels, branches))
assert down_method in [
'avg', 'esp'
], "The down_method for EESP only support 'avg' or 'esp', but got down_method={}".format(
down_method)
self.in_channels = in_channels
self.stride = stride
in_branch_channels = int(out_channels / branches)
self.group_conv_in = layers.ConvBNPReLU(in_channels,
in_branch_channels,
1,
stride=1,
groups=branches,
bias_attr=False)
map_ksize_dilation = {
3: 1,
5: 2,
7: 3,
9: 4,
11: 5,
13: 6,
15: 7,
17: 8
}
self.kernel_sizes = []
for i in range(branches):
kernel_size = 3 + 2 * i
kernel_size = kernel_size if kernel_size <= kernel_size_maximum else 3
self.kernel_sizes.append(kernel_size)
self.kernel_sizes.sort()
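        # Each branch uses a fixed 3x3 conv whose dilation d = (k - 1) // 2
        # (from the mapping above) gives an effective receptive field of
        # 2 * d + 1 = k, matching the mapped kernel size without the cost of a
        # real k x k kernel.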
self.spp_modules = nn.LayerList()
for i in range(branches):
dilation = map_ksize_dilation[self.kernel_sizes[i]]
self.spp_modules.append(
nn.Conv2D(in_branch_channels,
in_branch_channels,
kernel_size=3,
padding='same',
stride=stride,
dilation=dilation,
groups=in_branch_channels,
bias_attr=False))
self.group_conv_out = layers.ConvBN(out_channels,
out_channels,
kernel_size=1,
stride=1,
groups=branches,
bias_attr=False)
self.bn_act = BNPReLU(out_channels)
self._act = nn.PReLU()
        self.down_method = (down_method == 'avg')
@paddle.jit.not_to_static
def convert_group_x(self, group_merge, x):
if x.shape == group_merge.shape:
group_merge += x
return group_merge
def forward(self, x):
group_out = self.group_conv_in(x)
output = [self.spp_modules[0](group_out)]
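        # Hierarchical feature fusion (HFF): add each branch's output to the
        # previous branch's before concatenation to suppress the gridding
        # artifacts introduced by dilated convolutions.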
for k in range(1, len(self.spp_modules)):
output_k = self.spp_modules[k](group_out)
output_k = output_k + output[k - 1]
output.append(output_k)
group_merge = self.group_conv_out(
self.bn_act(paddle.concat(output, axis=1)))
if self.stride == 2 and self.down_method:
return group_merge
group_merge = self.convert_group_x(group_merge, x)
out = self._act(group_merge)
return out
class PSPModule(nn.Layer):
def __init__(self, in_channels, out_channels, sizes=4):
super().__init__()
self.stages = nn.LayerList([
nn.Conv2D(in_channels,
in_channels,
kernel_size=3,
stride=1,
groups=in_channels,
padding='same',
bias_attr=False) for _ in range(sizes)
])
self.project = layers.ConvBNPReLU(in_channels * (sizes + 1),
out_channels,
1,
stride=1,
bias_attr=False)
def forward(self, feats):
h, w = paddle.shape(feats)[2:4]
out = [feats]
for stage in self.stages:
feats = F.avg_pool2d(feats, kernel_size=3, stride=2, padding='same')
upsampled = F.interpolate(stage(feats),
size=[h, w],
mode='bilinear',
align_corners=True)
out.append(upsampled)
return self.project(paddle.concat(out, axis=1))
class DownSampler(nn.Layer):
"""
Down sampler.
Args:
in_channels (int): Number of input channels.
out_channels (int): Number of output channels.
branches (int, optional): Number of branches. Default: 9.
kernel_size_maximum (int, optional): A maximum value of kernel_size for EESP block. Default: 9.
shortcut (bool, optional): Use shortcut or not. Default: True.
"""
def __init__(self,
in_channels,
out_channels,
branches=4,
kernel_size_maximum=9,
shortcut=True):
super().__init__()
if out_channels < in_channels:
raise RuntimeError(
"The out_channes for DownSampler should be bigger than in_channels, but got in_channles={}, out_channels={}"
.format(in_channels, out_channels))
self.eesp = EESP(in_channels,
out_channels - in_channels,
stride=2,
branches=branches,
kernel_size_maximum=kernel_size_maximum,
down_method='avg')
self.avg = nn.AvgPool2D(kernel_size=3, padding=1, stride=2)
if shortcut:
self.shortcut_layer = nn.Sequential(
layers.ConvBNPReLU(3, 3, 3, stride=1, bias_attr=False),
layers.ConvBN(3, out_channels, 1, stride=1, bias_attr=False),
)
self._act = nn.PReLU()
def forward(self, x, inputs=None):
avg_out = self.avg(x)
eesp_out = self.eesp(x)
output = paddle.concat([avg_out, eesp_out], axis=1)
if inputs is not None:
w1 = paddle.shape(avg_out)[2]
w2 = paddle.shape(inputs)[2]
while w2 != w1:
inputs = F.avg_pool2d(inputs,
kernel_size=3,
padding=1,
stride=2)
w2 = paddle.shape(inputs)[2]
output = output + self.shortcut_layer(inputs)
return self._act(output)
class EESPNetBackbone(nn.Layer):
"""
The EESPNetBackbone implementation based on PaddlePaddle.
The original article refers to
Sachin Mehta, Mohammad Rastegari, Linda Shapiro, and Hannaneh Hajishirzi. "ESPNetv2: A Light-weight, Power Efficient, and General Purpose Convolutional Neural Network"
(https://arxiv.org/abs/1811.11431).
Args:
in_channels (int, optional): Number of input channels. Default: 3.
        drop_prob (float, optional): The probability of dropout. Default: 0.1.
        scale (float, optional): The scale of channels, only supports scale <= 1.5 or scale == 2. Default: 1.0.
"""
def __init__(self, in_channels=3, drop_prob=0.1, scale=1.0):
super().__init__()
reps = [0, 3, 7, 3]
num_level = 4 # 1/2, 1/4, 1/8, 1/16
kernel_size_limitations = [13, 11, 9, 7] # kernel size limitation
branch_list = [4] * len(
kernel_size_limitations) # branches at different levels
base_channels = 32 # first conv output channels
channels_config = [base_channels] * num_level
for i in range(num_level):
if i == 0:
channels = int(base_channels * scale)
channels = math.ceil(channels / branch_list[0]) * branch_list[0]
channels_config[
i] = base_channels if channels > base_channels else channels
else:
channels_config[i] = channels * pow(2, i)
self.level1 = layers.ConvBNPReLU(in_channels,
channels_config[0],
3,
stride=2,
bias_attr=False)
self.level2 = DownSampler(
channels_config[0],
channels_config[1],
branches=branch_list[0],
kernel_size_maximum=kernel_size_limitations[0],
shortcut=True)
self.level3_0 = DownSampler(
channels_config[1],
channels_config[2],
branches=branch_list[1],
kernel_size_maximum=kernel_size_limitations[1],
shortcut=True)
self.level3 = nn.LayerList()
for i in range(reps[1]):
self.level3.append(
EESP(channels_config[2],
channels_config[2],
stride=1,
branches=branch_list[2],
kernel_size_maximum=kernel_size_limitations[2]))
self.level4_0 = DownSampler(
channels_config[2],
channels_config[3],
branches=branch_list[2],
kernel_size_maximum=kernel_size_limitations[2],
shortcut=True)
self.level4 = nn.LayerList()
for i in range(reps[2]):
self.level4.append(
EESP(channels_config[3],
channels_config[3],
stride=1,
branches=branch_list[3],
kernel_size_maximum=kernel_size_limitations[3]))
self.out_channels = channels_config
self.init_params()
def init_params(self):
for m in self.sublayers():
if isinstance(m, nn.Conv2D):
param_init.kaiming_normal_init(m.weight)
if m.bias is not None:
param_init.constant_init(m.bias, value=0.0)
elif isinstance(m, nn.BatchNorm2D):
param_init.constant_init(m.weight, value=1.0)
param_init.constant_init(m.bias, value=0.0)
elif isinstance(m, nn.Linear):
param_init.normal_init(m.weight, std=0.001)
if m.bias is not None:
param_init.constant_init(m.bias, value=0.0)
def forward(self, x):
out_l1 = self.level1(x)
out_l2 = self.level2(out_l1, x)
out_l3 = self.level3_0(out_l2, x)
for i, layer in enumerate(self.level3):
out_l3 = layer(out_l3)
out_l4 = self.level4_0(out_l3, x)
for i, layer in enumerate(self.level4):
out_l4 = layer(out_l4)
return out_l1, out_l2, out_l3, out_l4
if __name__ == '__main__':
    import numpy as np
paddle.enable_static()
startup_prog = paddle.static.default_startup_program()
exe = paddle.static.Executor(paddle.CPUPlace())
exe.run(startup_prog)
path_prefix = "./output/model"
[inference_program, feed_target_names, fetch_targets] = (
paddle.static.load_inference_model(path_prefix, exe))
print('inference_program:', inference_program)
tensor_img = np.array(np.random.random((1, 3, 1024, 2048)), dtype=np.float32)
results = exe.run(inference_program,
feed={feed_target_names[0]: tensor_img},
fetch_list=fetch_targets)

@ -0,0 +1,308 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppseg.models import layers
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.utils import utils
@manager.MODELS.add_component
class ESPNetV1(nn.Layer):
"""
The ESPNetV1 implementation based on PaddlePaddle.
The original article refers to
    Sachin Mehta, Mohammad Rastegari, Anat Caspi, Linda Shapiro, and Hannaneh Hajishirzi. "ESPNet: Efficient Spatial Pyramid of Dilated Convolutions for Semantic Segmentation"
(https://arxiv.org/abs/1803.06815).
Args:
num_classes (int): The unique number of target classes.
in_channels (int, optional): Number of input channels. Default: 3.
level2_depth (int, optional): Depth of DilatedResidualBlock. Default: 2.
level3_depth (int, optional): Depth of DilatedResidualBlock. Default: 3.
pretrained (str, optional): The path or url of pretrained model. Default: None.
"""
def __init__(self,
num_classes,
in_channels=3,
level2_depth=2,
level3_depth=3,
pretrained=None):
super().__init__()
self.encoder = ESPNetEncoder(num_classes, in_channels, level2_depth,
level3_depth)
self.level3_up = nn.Conv2DTranspose(num_classes,
num_classes,
2,
stride=2,
padding=0,
output_padding=0,
bias_attr=False)
self.br3 = layers.SyncBatchNorm(num_classes)
self.level2_proj = nn.Conv2D(in_channels + 128,
num_classes,
1,
bias_attr=False)
self.combine_l2_l3 = nn.Sequential(
BNPReLU(2 * num_classes),
DilatedResidualBlock(2 * num_classes, num_classes, residual=False),
)
self.level2_up = nn.Sequential(
nn.Conv2DTranspose(num_classes,
num_classes,
2,
stride=2,
padding=0,
output_padding=0,
bias_attr=False),
BNPReLU(num_classes),
)
self.out_proj = layers.ConvBNPReLU(16 + in_channels + num_classes,
num_classes,
3,
padding='same',
stride=1)
self.out_up = nn.Conv2DTranspose(num_classes,
num_classes,
2,
stride=2,
padding=0,
output_padding=0,
bias_attr=False)
        self.pretrained = pretrained
        self.init_weight()
def init_weight(self):
if self.pretrained is not None:
utils.load_entire_model(self, self.pretrained)
def forward(self, x):
p1, p2, p3 = self.encoder(x)
up_p3 = self.level3_up(p3)
combine = self.combine_l2_l3(paddle.concat([up_p3, p2], axis=1))
up_p2 = self.level2_up(combine)
combine = self.out_proj(paddle.concat([up_p2, p1], axis=1))
out = self.out_up(combine)
return [out]
class BNPReLU(nn.Layer):
def __init__(self, channels):
super().__init__()
self.bn = layers.SyncBatchNorm(channels)
self.act = nn.PReLU(channels)
def forward(self, x):
x = self.bn(x)
x = self.act(x)
return x
class DownSampler(nn.Layer):
"""
Down sampler.
Args:
in_channels (int): Number of input channels.
out_channels (int): Number of output channels.
"""
def __init__(self, in_channels, out_channels):
super().__init__()
branch_channels = out_channels // 5
remain_channels = out_channels - branch_channels * 4
self.conv1 = nn.Conv2D(in_channels,
branch_channels,
3,
stride=2,
padding=1,
bias_attr=False)
self.d_conv1 = nn.Conv2D(branch_channels,
remain_channels,
3,
padding=1,
bias_attr=False)
self.d_conv2 = nn.Conv2D(branch_channels,
branch_channels,
3,
padding=2,
dilation=2,
bias_attr=False)
self.d_conv4 = nn.Conv2D(branch_channels,
branch_channels,
3,
padding=4,
dilation=4,
bias_attr=False)
self.d_conv8 = nn.Conv2D(branch_channels,
branch_channels,
3,
padding=8,
dilation=8,
bias_attr=False)
self.d_conv16 = nn.Conv2D(branch_channels,
branch_channels,
3,
padding=16,
dilation=16,
bias_attr=False)
self.bn = layers.SyncBatchNorm(out_channels)
self.act = nn.PReLU(out_channels)
def forward(self, x):
x = self.conv1(x)
d1 = self.d_conv1(x)
d2 = self.d_conv2(x)
d4 = self.d_conv4(x)
d8 = self.d_conv8(x)
d16 = self.d_conv16(x)
feat1 = d2
feat2 = feat1 + d4
feat3 = feat2 + d8
feat4 = feat3 + d16
feat = paddle.concat([d1, feat1, feat2, feat3, feat4], axis=1)
out = self.bn(feat)
out = self.act(out)
return out
class DilatedResidualBlock(nn.Layer):
'''
ESP block, principle: reduce -> split -> transform -> merge
Args:
in_channels (int): Number of input channels.
out_channels (int): Number of output channels.
residual (bool, optional): Add a residual connection through identity operation. Default: True.
'''
def __init__(self, in_channels, out_channels, residual=True):
super().__init__()
branch_channels = out_channels // 5
remain_channels = out_channels - branch_channels * 4
self.conv1 = nn.Conv2D(in_channels, branch_channels, 1, bias_attr=False)
self.d_conv1 = nn.Conv2D(branch_channels,
remain_channels,
3,
padding=1,
bias_attr=False)
self.d_conv2 = nn.Conv2D(branch_channels,
branch_channels,
3,
padding=2,
dilation=2,
bias_attr=False)
self.d_conv4 = nn.Conv2D(branch_channels,
branch_channels,
3,
padding=4,
dilation=4,
bias_attr=False)
self.d_conv8 = nn.Conv2D(branch_channels,
branch_channels,
3,
padding=8,
dilation=8,
bias_attr=False)
self.d_conv16 = nn.Conv2D(branch_channels,
branch_channels,
3,
padding=16,
dilation=16,
bias_attr=False)
self.bn = BNPReLU(out_channels)
self.residual = residual
def forward(self, x):
x_proj = self.conv1(x)
d1 = self.d_conv1(x_proj)
d2 = self.d_conv2(x_proj)
d4 = self.d_conv4(x_proj)
d8 = self.d_conv8(x_proj)
d16 = self.d_conv16(x_proj)
feat1 = d2
feat2 = feat1 + d4
feat3 = feat2 + d8
feat4 = feat3 + d16
feat = paddle.concat([d1, feat1, feat2, feat3, feat4], axis=1)
if self.residual:
feat = feat + x
out = self.bn(feat)
return out
class ESPNetEncoder(nn.Layer):
'''
The ESPNet-C implementation based on PaddlePaddle.
Args:
num_classes (int): The unique number of target classes.
in_channels (int, optional): Number of input channels. Default: 3.
level2_depth (int, optional): Depth of DilatedResidualBlock. Default: 5.
level3_depth (int, optional): Depth of DilatedResidualBlock. Default: 3.
'''
def __init__(self,
num_classes,
in_channels=3,
level2_depth=5,
level3_depth=3):
super().__init__()
self.level1 = layers.ConvBNPReLU(in_channels,
16,
3,
padding='same',
stride=2)
self.br1 = BNPReLU(in_channels + 16)
self.proj1 = layers.ConvBNPReLU(in_channels + 16, num_classes, 1)
self.level2_0 = DownSampler(in_channels + 16, 64)
self.level2 = nn.Sequential(
*[DilatedResidualBlock(64, 64) for i in range(level2_depth)])
self.br2 = BNPReLU(in_channels + 128)
self.proj2 = layers.ConvBNPReLU(in_channels + 128, num_classes, 1)
self.level3_0 = DownSampler(in_channels + 128, 128)
self.level3 = nn.Sequential(
*[DilatedResidualBlock(128, 128) for i in range(level3_depth)])
self.br3 = BNPReLU(256)
self.proj3 = layers.ConvBNPReLU(256, num_classes, 1)
def forward(self, x):
f1 = self.level1(x)
down2 = F.adaptive_avg_pool2d(x, output_size=f1.shape[2:])
feat1 = paddle.concat([f1, down2], axis=1)
feat1 = self.br1(feat1)
p1 = self.proj1(feat1)
f2_res = self.level2_0(feat1)
f2 = self.level2(f2_res)
down4 = F.adaptive_avg_pool2d(x, output_size=f2.shape[2:])
feat2 = paddle.concat([f2, f2_res, down4], axis=1)
feat2 = self.br2(feat2)
p2 = self.proj2(feat2)
f3_res = self.level3_0(feat2)
f3 = self.level3(f3_res)
feat3 = paddle.concat([f3, f3_res], axis=1)
feat3 = self.br3(feat3)
p3 = self.proj3(feat3)
return p1, p2, p3
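if __name__ == '__main__':
    # Editor's sketch (not part of the original file): a minimal smoke test
    # that builds ESPNetV1 for a hypothetical 19-class (Cityscapes-style)
    # task and checks that the logit comes back at the input resolution.
    # Note that out_proj expects 16 + in_channels + num_classes input
    # channels, which matches the 2 * num_classes concat exactly when
    # num_classes is 19.
    import numpy as np
    model = ESPNetV1(num_classes=19)
    dummy = paddle.to_tensor(
        np.random.random((1, 3, 256, 256)).astype('float32'))
    logits = model(dummy)  # forward returns a single-element list
    print(logits[0].shape)  # expected: [1, 19, 256, 256]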

@@ -0,0 +1,316 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.nn as nn
import paddle.nn.functional as F
import paddle
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.models import layers
from paddlers.models.ppseg.utils import utils
__all__ = ['FastSCNN']
@manager.MODELS.add_component
class FastSCNN(nn.Layer):
"""
The FastSCNN implementation based on PaddlePaddle.
As mentioned in the original paper, FastSCNN is a real-time segmentation algorithm (123.5fps)
even for high resolution images (1024x2048).
The original article refers to
Poudel, Rudra PK, et al. "Fast-scnn: Fast semantic segmentation network"
(https://arxiv.org/pdf/1902.04502.pdf).
Args:
num_classes (int): The unique number of target classes.
        enable_auxiliary_loss (bool, optional): A bool value indicating whether to add auxiliary loss.
            If True, auxiliary loss will be added after the LearningToDownsample module. Default: True.
        align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
            is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
pretrained (str, optional): The path or url of pretrained model. Default: None.
"""
def __init__(self,
num_classes,
enable_auxiliary_loss=True,
align_corners=False,
pretrained=None):
super().__init__()
self.learning_to_downsample = LearningToDownsample(32, 48, 64)
self.global_feature_extractor = GlobalFeatureExtractor(
in_channels=64,
block_channels=[64, 96, 128],
out_channels=128,
expansion=6,
num_blocks=[3, 3, 3],
align_corners=True)
self.feature_fusion = FeatureFusionModule(64, 128, 128, align_corners)
self.classifier = Classifier(128, num_classes)
if enable_auxiliary_loss:
self.auxlayer = layers.AuxLayer(64, 32, num_classes)
self.enable_auxiliary_loss = enable_auxiliary_loss
self.align_corners = align_corners
self.pretrained = pretrained
self.init_weight()
def forward(self, x):
logit_list = []
input_size = paddle.shape(x)[2:]
higher_res_features = self.learning_to_downsample(x)
x = self.global_feature_extractor(higher_res_features)
x = self.feature_fusion(higher_res_features, x)
logit = self.classifier(x)
logit = F.interpolate(
logit,
input_size,
mode='bilinear',
align_corners=self.align_corners)
logit_list.append(logit)
if self.enable_auxiliary_loss:
auxiliary_logit = self.auxlayer(higher_res_features)
auxiliary_logit = F.interpolate(
auxiliary_logit,
input_size,
mode='bilinear',
align_corners=self.align_corners)
logit_list.append(auxiliary_logit)
return logit_list
def init_weight(self):
if self.pretrained is not None:
utils.load_entire_model(self, self.pretrained)
class LearningToDownsample(nn.Layer):
"""
Learning to downsample module.
This module consists of three downsampling blocks (one conv and two separable conv)
Args:
dw_channels1 (int, optional): The input channels of the first sep conv. Default: 32.
dw_channels2 (int, optional): The input channels of the second sep conv. Default: 48.
out_channels (int, optional): The output channels of LearningToDownsample module. Default: 64.
"""
def __init__(self, dw_channels1=32, dw_channels2=48, out_channels=64):
super(LearningToDownsample, self).__init__()
self.conv_bn_relu = layers.ConvBNReLU(
in_channels=3, out_channels=dw_channels1, kernel_size=3, stride=2)
self.dsconv_bn_relu1 = layers.SeparableConvBNReLU(
in_channels=dw_channels1,
out_channels=dw_channels2,
kernel_size=3,
stride=2,
padding=1)
self.dsconv_bn_relu2 = layers.SeparableConvBNReLU(
in_channels=dw_channels2,
out_channels=out_channels,
kernel_size=3,
stride=2,
padding=1)
def forward(self, x):
x = self.conv_bn_relu(x)
x = self.dsconv_bn_relu1(x)
x = self.dsconv_bn_relu2(x)
return x
class GlobalFeatureExtractor(nn.Layer):
"""
Global feature extractor module.
This module consists of three InvertedBottleneck blocks (like inverted residual introduced by MobileNetV2) and
a PPModule (introduced by PSPNet).
Args:
in_channels (int): The number of input channels to the module.
block_channels (tuple): A tuple represents output channels of each bottleneck block.
        out_channels (int): The number of output channels of the module.
expansion (int): The expansion factor in bottleneck.
num_blocks (tuple): It indicates the repeat time of each bottleneck.
align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
is even, e.g. 1024x512, otherwise it is True, e.g. 769x769.
"""
def __init__(self, in_channels, block_channels, out_channels, expansion,
num_blocks, align_corners):
super(GlobalFeatureExtractor, self).__init__()
self.bottleneck1 = self._make_layer(InvertedBottleneck, in_channels,
block_channels[0], num_blocks[0],
expansion, 2)
self.bottleneck2 = self._make_layer(
InvertedBottleneck, block_channels[0], block_channels[1],
num_blocks[1], expansion, 2)
self.bottleneck3 = self._make_layer(
InvertedBottleneck, block_channels[1], block_channels[2],
num_blocks[2], expansion, 1)
self.ppm = layers.PPModule(
block_channels[2],
out_channels,
bin_sizes=(1, 2, 3, 6),
dim_reduction=True,
align_corners=align_corners)
def _make_layer(self,
block,
in_channels,
out_channels,
blocks,
expansion=6,
stride=1):
layers = []
layers.append(block(in_channels, out_channels, expansion, stride))
for _ in range(1, blocks):
layers.append(block(out_channels, out_channels, expansion, 1))
return nn.Sequential(*layers)
def forward(self, x):
x = self.bottleneck1(x)
x = self.bottleneck2(x)
x = self.bottleneck3(x)
x = self.ppm(x)
return x
class InvertedBottleneck(nn.Layer):
"""
Single Inverted bottleneck implementation.
Args:
in_channels (int): The number of input channels to bottleneck block.
out_channels (int): The number of output channels of bottleneck block.
        expansion (int, optional): The expansion factor in bottleneck. Default: 6.
        stride (int, optional): The stride used in depth-wise conv. Default: 2.
"""
def __init__(self, in_channels, out_channels, expansion=6, stride=2):
super().__init__()
self.use_shortcut = stride == 1 and in_channels == out_channels
expand_channels = in_channels * expansion
self.block = nn.Sequential(
# pw
layers.ConvBNReLU(
in_channels=in_channels,
out_channels=expand_channels,
kernel_size=1,
bias_attr=False),
# dw
layers.ConvBNReLU(
in_channels=expand_channels,
out_channels=expand_channels,
kernel_size=3,
stride=stride,
padding=1,
groups=expand_channels,
bias_attr=False),
# pw-linear
layers.ConvBN(
in_channels=expand_channels,
out_channels=out_channels,
kernel_size=1,
bias_attr=False))
def forward(self, x):
out = self.block(x)
if self.use_shortcut:
out = x + out
return out
class FeatureFusionModule(nn.Layer):
"""
Feature Fusion Module Implementation.
This module fuses high-resolution feature and low-resolution feature.
Args:
high_in_channels (int): The channels of high-resolution feature (output of LearningToDownsample).
low_in_channels (int): The channels of low-resolution feature (output of GlobalFeatureExtractor).
out_channels (int): The output channels of this module.
align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
is even, e.g. 1024x512, otherwise it is True, e.g. 769x769.
"""
def __init__(self, high_in_channels, low_in_channels, out_channels,
align_corners):
super().__init__()
# Only depth-wise conv
self.dwconv = layers.ConvBNReLU(
in_channels=low_in_channels,
out_channels=out_channels,
kernel_size=3,
padding=1,
groups=128,
bias_attr=False)
self.conv_low_res = layers.ConvBN(out_channels, out_channels, 1)
self.conv_high_res = layers.ConvBN(high_in_channels, out_channels, 1)
self.align_corners = align_corners
def forward(self, high_res_input, low_res_input):
low_res_input = F.interpolate(
low_res_input,
paddle.shape(high_res_input)[2:],
mode='bilinear',
align_corners=self.align_corners)
low_res_input = self.dwconv(low_res_input)
low_res_input = self.conv_low_res(low_res_input)
high_res_input = self.conv_high_res(high_res_input)
x = high_res_input + low_res_input
return F.relu(x)
class Classifier(nn.Layer):
"""
The Classifier module implementation.
This module consists of two depth-wise conv and one conv.
Args:
input_channels (int): The input channels to this module.
num_classes (int): The unique number of target classes.
"""
def __init__(self, input_channels, num_classes):
super().__init__()
self.dsconv1 = layers.SeparableConvBNReLU(
in_channels=input_channels,
out_channels=input_channels,
kernel_size=3,
padding=1)
self.dsconv2 = layers.SeparableConvBNReLU(
in_channels=input_channels,
out_channels=input_channels,
kernel_size=3,
padding=1)
self.conv = nn.Conv2D(
in_channels=input_channels, out_channels=num_classes, kernel_size=1)
self.dropout = nn.Dropout(p=0.1) # dropout_prob
def forward(self, x):
x = self.dsconv1(x)
x = self.dsconv2(x)
x = self.dropout(x)
x = self.conv(x)
return x
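if __name__ == '__main__':
    # Editor's sketch (not part of the original file): FastSCNN needs no
    # external backbone, so the whole model can be smoke-tested directly.
    # With enable_auxiliary_loss=True (the default) the forward pass returns
    # [main_logit, auxiliary_logit], both at input resolution.
    import numpy as np
    model = FastSCNN(num_classes=19)
    dummy = paddle.to_tensor(
        np.random.random((1, 3, 256, 512)).astype('float32'))
    logits = model(dummy)
    print([tuple(l.shape) for l in logits])  # expected: two (1, 19, 256, 512)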

@@ -0,0 +1,240 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.models import layers
from paddlers.models.ppseg.utils import utils
@manager.MODELS.add_component
class FastFCN(nn.Layer):
"""
The FastFCN implementation based on PaddlePaddle.
The original article refers to
Huikai Wu, Junge Zhang, Kaiqi Huang. "FastFCN: Rethinking Dilated Convolution in the Backbone for Semantic Segmentation".
Args:
num_classes (int): The unique number of target classes.
backbone (Paddle.nn.Layer): A backbone network.
        num_codes (int): The number of encoded words. Default: 32.
        mid_channels (int): The channels of middle layers. Default: 512.
        use_jpu (bool): Whether to use the JPU module. Default: True.
        aux_loss (bool): Whether to use the auxiliary head loss. Default: True.
        use_se_loss (bool): Whether to use the semantic encoding loss. Default: True.
        add_lateral (bool): Whether to use lateral convolution layers. Default: False.
pretrained (str, optional): The path or url of pretrained model. Default: None.
"""
def __init__(self,
num_classes,
backbone,
num_codes=32,
mid_channels=512,
use_jpu=True,
aux_loss=True,
use_se_loss=True,
add_lateral=False,
pretrained=None):
super().__init__()
self.add_lateral = add_lateral
self.num_codes = num_codes
self.backbone = backbone
self.use_jpu = use_jpu
in_channels = self.backbone.feat_channels
if use_jpu:
self.jpu_layer = layers.JPU(in_channels, mid_channels)
in_channels[-1] = mid_channels * 4
self.bottleneck = layers.ConvBNReLU(
in_channels[-1],
mid_channels,
1,
padding=0,
bias_attr=False,
)
else:
self.bottleneck = layers.ConvBNReLU(
in_channels[-1],
mid_channels,
3,
padding=1,
bias_attr=False,
)
if self.add_lateral:
self.lateral_convs = nn.LayerList([
layers.ConvBNReLU(in_channels[0],
mid_channels,
1,
bias_attr=False),
layers.ConvBNReLU(in_channels[1],
mid_channels,
1,
bias_attr=False),
])
self.fusion = layers.ConvBNReLU(
3 * mid_channels,
mid_channels,
3,
padding=1,
bias_attr=False,
)
self.enc_module = EncModule(mid_channels, num_codes)
self.cls_seg = nn.Conv2D(mid_channels, num_classes, 1)
self.aux_loss = aux_loss
if self.aux_loss:
self.fcn_head = layers.AuxLayer(in_channels[-2], mid_channels,
num_classes)
self.use_se_loss = use_se_loss
if use_se_loss:
self.se_layer = nn.Linear(mid_channels, num_classes)
self.pretrained = pretrained
self.init_weight()
def init_weight(self):
if self.pretrained is not None:
utils.load_entire_model(self, self.pretrained)
def forward(self, inputs):
imsize = paddle.shape(inputs)[2:]
feats = self.backbone(inputs)
if self.use_jpu:
feats = self.jpu_layer(*feats)
fcn_feat = feats[2]
feat = self.bottleneck(feats[-1])
if self.add_lateral:
laterals = []
for i, lateral_conv in enumerate(self.lateral_convs):
laterals.append(
F.interpolate(lateral_conv(feats[i]),
size=paddle.shape(feat)[2:],
mode='bilinear',
align_corners=False))
feat = self.fusion(paddle.concat([feat, *laterals], 1))
encode_feat, feat = self.enc_module(feat)
out = self.cls_seg(feat)
out = F.interpolate(out,
size=imsize,
mode='bilinear',
align_corners=False)
output = [out]
if self.training:
fcn_out = self.fcn_head(fcn_feat)
fcn_out = F.interpolate(fcn_out,
size=imsize,
mode='bilinear',
align_corners=False)
output.append(fcn_out)
if self.use_se_loss:
se_out = self.se_layer(encode_feat)
output.append(se_out)
return output
return output
class Encoding(nn.Layer):
def __init__(self, channels, num_codes):
super().__init__()
self.channels, self.num_codes = channels, num_codes
std = 1 / ((channels * num_codes)**0.5)
self.codewords = self.create_parameter(
shape=(num_codes, channels),
default_initializer=nn.initializer.Uniform(-std, std),
)
self.scale = self.create_parameter(
shape=(num_codes, ),
default_initializer=nn.initializer.Uniform(-1, 0),
)
def scaled_l2(self, x, codewords, scale):
num_codes, channels = paddle.shape(codewords)
reshaped_scale = scale.reshape([1, 1, num_codes])
expanded_x = paddle.tile(x.unsqueeze(2), [1, 1, num_codes, 1])
reshaped_codewords = codewords.reshape([1, 1, num_codes, channels])
scaled_l2_norm = reshaped_scale * (
expanded_x - reshaped_codewords).pow(2).sum(axis=3)
return scaled_l2_norm
def aggregate(self, assignment_weights, x, codewords):
num_codes, channels = paddle.shape(codewords)
reshaped_codewords = codewords.reshape([1, 1, num_codes, channels])
expanded_x = paddle.tile(
x.unsqueeze(2),
[1, 1, num_codes, 1],
)
encoded_feat = (assignment_weights.unsqueeze(3) *
(expanded_x - reshaped_codewords)).sum(axis=1)
return encoded_feat
def forward(self, x):
x_dims = x.ndim
assert x_dims == 4, "The dimension of input tensor must equal 4, but got {}.".format(
x_dims)
assert paddle.shape(
x
        )[1] == self.channels, "Encoding channels error, expected {} but got {}.".format(
self.channels,
paddle.shape(x)[1])
batch_size = paddle.shape(x)[0]
x = x.reshape([batch_size, self.channels, -1]).transpose([0, 2, 1])
assignment_weights = F.softmax(self.scaled_l2(x, self.codewords,
self.scale),
axis=2)
encoded_feat = self.aggregate(assignment_weights, x, self.codewords)
encoded_feat = encoded_feat.reshape([batch_size, self.num_codes, -1])
return encoded_feat
class EncModule(nn.Layer):
def __init__(self, in_channels, num_codes):
super().__init__()
self.encoding_project = layers.ConvBNReLU(
in_channels,
in_channels,
1,
)
self.encoding = nn.Sequential(
Encoding(channels=in_channels, num_codes=num_codes),
nn.BatchNorm1D(num_codes),
nn.ReLU(),
)
self.fc = nn.Sequential(
nn.Linear(in_channels, in_channels),
nn.Sigmoid(),
)
def forward(self, x):
encoding_projection = self.encoding_project(x)
encoding_feat = self.encoding(encoding_projection).mean(axis=1)
batch_size, channels, _, _ = paddle.shape(x)
gamma = self.fc(encoding_feat)
y = gamma.reshape([batch_size, channels, 1, 1])
output = F.relu(x + x * y)
return encoding_feat, output
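if __name__ == '__main__':
    # Editor's sketch (not part of the original file): exercise the Encoding
    # layer alone. Every pixel feature is softly assigned to `num_codes`
    # learned codewords and the scaled residuals are aggregated, producing a
    # [N, num_codes, channels] descriptor.
    import numpy as np
    enc = Encoding(channels=64, num_codes=32)
    feat = paddle.to_tensor(
        np.random.random((2, 64, 16, 16)).astype('float32'))
    print(enc(feat).shape)  # expected: [2, 32, 64]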

@@ -0,0 +1,145 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.nn as nn
import paddle.nn.functional as F
import paddle
from paddlers.models.ppseg import utils
from paddlers.models.ppseg.cvlibs import manager, param_init
from paddlers.models.ppseg.models import layers
@manager.MODELS.add_component
class FCN(nn.Layer):
"""
A simple implementation for FCN based on PaddlePaddle.
The original article refers to
    Evan Shelhamer, et al. "Fully Convolutional Networks for Semantic Segmentation"
(https://arxiv.org/abs/1411.4038).
Args:
num_classes (int): The unique number of target classes.
backbone (paddle.nn.Layer): Backbone networks.
backbone_indices (tuple, optional): The values in the tuple indicate the indices of output of backbone.
Default: (-1, ).
channels (int, optional): The channels between conv layer and the last layer of FCNHead.
If None, it will be the number of channels of input features. Default: None.
align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
        pretrained (str, optional): The path or url of pretrained model. Default: None.
        bias (bool, optional): Whether the convolution layers in FCNHead use bias. Default: True.
        data_format (str, optional): The data layout of the input; only 'NCHW' is supported. Default: 'NCHW'.
    """
def __init__(self,
num_classes,
backbone,
backbone_indices=(-1, ),
channels=None,
align_corners=False,
pretrained=None,
bias=True,
data_format="NCHW"):
super(FCN, self).__init__()
        if data_format != 'NCHW':
            raise ValueError("FCN only supports the NCHW data format.")
self.backbone = backbone
backbone_channels = [
backbone.feat_channels[i] for i in backbone_indices
]
self.head = FCNHead(
num_classes,
backbone_indices,
backbone_channels,
channels,
bias=bias)
self.align_corners = align_corners
self.pretrained = pretrained
self.data_format = data_format
self.init_weight()
def forward(self, x):
feat_list = self.backbone(x)
logit_list = self.head(feat_list)
return [
F.interpolate(
logit,
paddle.shape(x)[2:],
mode='bilinear',
align_corners=self.align_corners) for logit in logit_list
]
def init_weight(self):
if self.pretrained is not None:
utils.load_entire_model(self, self.pretrained)
class FCNHead(nn.Layer):
"""
A simple implementation for FCNHead based on PaddlePaddle
Args:
num_classes (int): The unique number of target classes.
backbone_indices (tuple, optional): The values in the tuple indicate the indices of output of backbone.
Default: (-1, ).
        backbone_channels (tuple, optional): The channels of the backbone features indexed by "backbone_indices". Default: (270, ).
        channels (int, optional): The channels between conv layer and the last layer of FCNHead.
            If None, it will be the number of channels of input features. Default: None.
        bias (bool, optional): Whether the convolution layers use bias. Default: True.
"""
def __init__(self,
num_classes,
backbone_indices=(-1, ),
backbone_channels=(270, ),
channels=None,
bias=True):
super(FCNHead, self).__init__()
self.num_classes = num_classes
self.backbone_indices = backbone_indices
if channels is None:
channels = backbone_channels[0]
self.conv_1 = layers.ConvBNReLU(
in_channels=backbone_channels[0],
out_channels=channels,
kernel_size=1,
stride=1,
bias_attr=bias)
self.cls = nn.Conv2D(
in_channels=channels,
out_channels=self.num_classes,
kernel_size=1,
stride=1,
bias_attr=bias)
self.init_weight()
def forward(self, feat_list):
logit_list = []
x = feat_list[self.backbone_indices[0]]
x = self.conv_1(x)
logit = self.cls(x)
logit_list.append(logit)
return logit_list
def init_weight(self):
for layer in self.sublayers():
if isinstance(layer, nn.Conv2D):
param_init.normal_init(layer.weight, std=0.001)
elif isinstance(layer, (nn.BatchNorm, nn.SyncBatchNorm)):
param_init.constant_init(layer.weight, value=1.0)
param_init.constant_init(layer.bias, value=0.0)
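if __name__ == '__main__':
    # Editor's sketch (not part of the original file): FCN expects a backbone
    # exposing `feat_channels` and returning a list of feature maps. The stub
    # backbone below is hypothetical and exists only to exercise the head
    # end to end.
    import numpy as np

    class _StubBackbone(nn.Layer):
        feat_channels = [64]

        def __init__(self):
            super().__init__()
            self.conv = nn.Conv2D(3, 64, 3, stride=4, padding=1)

        def forward(self, x):
            return [self.conv(x)]

    model = FCN(num_classes=2, backbone=_StubBackbone())
    dummy = paddle.to_tensor(
        np.random.random((1, 3, 64, 64)).astype('float32'))
    print(model(dummy)[0].shape)  # expected: [1, 2, 64, 64]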

@@ -0,0 +1,222 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.models import layers
from paddlers.models.ppseg.utils import utils
@manager.MODELS.add_component
class GCNet(nn.Layer):
"""
The GCNet implementation based on PaddlePaddle.
The original article refers to
    Cao, Yue, et al. "GCNet: Non-local networks meet squeeze-excitation networks and beyond"
(https://arxiv.org/pdf/1904.11492.pdf).
Args:
num_classes (int): The unique number of target classes.
        backbone (Paddle.nn.Layer): Backbone network, currently supports ResNet50/101.
        backbone_indices (tuple, optional): Two values in the tuple indicate the indices of output of backbone. Default: (2, 3).
gc_channels (int, optional): The input channels to Global Context Block. Default: 512.
ratio (float, optional): It indicates the ratio of attention channels and gc_channels. Default: 0.25.
enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True.
align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
pretrained (str, optional): The path or url of pretrained model. Default: None.
"""
def __init__(self,
num_classes,
backbone,
backbone_indices=(2, 3),
gc_channels=512,
ratio=0.25,
enable_auxiliary_loss=True,
align_corners=False,
pretrained=None):
super().__init__()
self.backbone = backbone
backbone_channels = [
backbone.feat_channels[i] for i in backbone_indices
]
self.head = GCNetHead(num_classes, backbone_indices, backbone_channels,
gc_channels, ratio, enable_auxiliary_loss)
self.align_corners = align_corners
self.pretrained = pretrained
self.init_weight()
def forward(self, x):
feat_list = self.backbone(x)
logit_list = self.head(feat_list)
return [
F.interpolate(
logit,
paddle.shape(x)[2:],
mode='bilinear',
align_corners=self.align_corners) for logit in logit_list
]
def init_weight(self):
if self.pretrained is not None:
utils.load_entire_model(self, self.pretrained)
class GCNetHead(nn.Layer):
"""
The GCNetHead implementation.
Args:
num_classes (int): The unique number of target classes.
backbone_indices (tuple): Two values in the tuple indicate the indices of output of backbone.
The first index will be taken as a deep-supervision feature in auxiliary layer;
the second one will be taken as input of GlobalContextBlock.
backbone_channels (tuple): The same length with "backbone_indices". It indicates the channels of corresponding index.
gc_channels (int): The input channels to Global Context Block.
ratio (float): It indicates the ratio of attention channels and gc_channels.
enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True.
"""
def __init__(self,
num_classes,
backbone_indices,
backbone_channels,
gc_channels,
ratio,
enable_auxiliary_loss=True):
super().__init__()
in_channels = backbone_channels[1]
self.conv_bn_relu1 = layers.ConvBNReLU(
in_channels=in_channels,
out_channels=gc_channels,
kernel_size=3,
padding=1)
self.gc_block = GlobalContextBlock(
gc_channels=gc_channels, in_channels=gc_channels, ratio=ratio)
self.conv_bn_relu2 = layers.ConvBNReLU(
in_channels=gc_channels,
out_channels=gc_channels,
kernel_size=3,
padding=1)
self.conv_bn_relu3 = layers.ConvBNReLU(
in_channels=in_channels + gc_channels,
out_channels=gc_channels,
kernel_size=3,
padding=1)
self.dropout = nn.Dropout(p=0.1)
self.conv = nn.Conv2D(
in_channels=gc_channels, out_channels=num_classes, kernel_size=1)
if enable_auxiliary_loss:
self.auxlayer = layers.AuxLayer(
in_channels=backbone_channels[0],
inter_channels=backbone_channels[0] // 4,
out_channels=num_classes)
self.backbone_indices = backbone_indices
self.enable_auxiliary_loss = enable_auxiliary_loss
def forward(self, feat_list):
logit_list = []
x = feat_list[self.backbone_indices[1]]
output = self.conv_bn_relu1(x)
output = self.gc_block(output)
output = self.conv_bn_relu2(output)
output = paddle.concat([x, output], axis=1)
output = self.conv_bn_relu3(output)
output = self.dropout(output)
logit = self.conv(output)
logit_list.append(logit)
if self.enable_auxiliary_loss:
low_level_feat = feat_list[self.backbone_indices[0]]
auxiliary_logit = self.auxlayer(low_level_feat)
logit_list.append(auxiliary_logit)
return logit_list
class GlobalContextBlock(nn.Layer):
"""
Global Context Block implementation.
Args:
        gc_channels (int): The channel number used when reshaping the input feature map; expected to equal in_channels.
        in_channels (int): The input channels of Global Context Block.
        ratio (float): The ratio of attention (bottleneck) channels to input channels.
"""
def __init__(self, gc_channels, in_channels, ratio):
super().__init__()
self.gc_channels = gc_channels
self.conv_mask = nn.Conv2D(
in_channels=in_channels, out_channels=1, kernel_size=1)
self.softmax = nn.Softmax(axis=2)
inter_channels = int(in_channels * ratio)
self.channel_add_conv = nn.Sequential(
nn.Conv2D(
in_channels=in_channels,
out_channels=inter_channels,
kernel_size=1),
nn.LayerNorm(normalized_shape=[inter_channels, 1, 1]), nn.ReLU(),
nn.Conv2D(
in_channels=inter_channels,
out_channels=in_channels,
kernel_size=1))
def global_context_block(self, x):
x_shape = paddle.shape(x)
# [N, C, H * W]
input_x = paddle.reshape(x, shape=[0, self.gc_channels, -1])
# [N, 1, C, H * W]
input_x = paddle.unsqueeze(input_x, axis=1)
# [N, 1, H, W]
context_mask = self.conv_mask(x)
# [N, 1, H * W]
context_mask = paddle.reshape(context_mask, shape=[0, 1, -1])
context_mask = self.softmax(context_mask)
# [N, 1, H * W, 1]
context_mask = paddle.unsqueeze(context_mask, axis=-1)
# [N, 1, C, 1]
context = paddle.matmul(input_x, context_mask)
# [N, C, 1, 1]
context = paddle.reshape(context, shape=[0, self.gc_channels, 1, 1])
return context
def forward(self, x):
context = self.global_context_block(x)
channel_add_term = self.channel_add_conv(context)
out = x + channel_add_term
return out
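if __name__ == '__main__':
    # Editor's sketch (not part of the original file): the global context
    # block is shape-preserving, so a single random tensor exercises the
    # full pool-and-recalibrate path.
    import numpy as np
    gcb = GlobalContextBlock(gc_channels=32, in_channels=32, ratio=0.25)
    feat = paddle.to_tensor(
        np.random.random((2, 32, 8, 8)).astype('float32'))
    print(gcb(feat).shape)  # expected: [2, 32, 8, 8]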

@@ -0,0 +1,291 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
from paddle.nn import functional as F
from paddlers.models.ppseg.utils import utils
from paddlers.models.ppseg.models import layers
from paddlers.models.ppseg.cvlibs import manager
@manager.MODELS.add_component
class GINet(nn.Layer):
"""
The GINet implementation based on PaddlePaddle.
The original article refers to
Wu, Tianyi, Yu Lu, Yu Zhu, Chuang Zhang, Ming Wu, Zhanyu Ma, and Guodong Guo. "GINet: Graph interaction network for scene parsing." In European Conference on Computer Vision, pp. 34-51. Springer, Cham, 2020.
(https://arxiv.org/pdf/2009.06160).
Args:
num_classes (int): The unique number of target classes.
backbone (Paddle.nn.Layer): Backbone network.
backbone_indices (tuple, optional): Values in the tuple indicate the indices of output of backbone.
        enable_auxiliary_loss (bool, optional): A bool value indicating whether to add auxiliary loss.
            If True, an auxiliary head is applied to the third feature returned by base_forward. Default: True.
        align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
            is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
        jpu (bool, optional): Whether to use the JPU unit in the base forward. Default: True.
pretrained (str, optional): The path or url of pretrained model. Default: None.
"""
def __init__(self,
num_classes,
backbone,
backbone_indices=[0, 1, 2, 3],
enable_auxiliary_loss=True,
align_corners=True,
jpu=True,
pretrained=None):
super().__init__()
self.nclass = num_classes
self.aux = enable_auxiliary_loss
self.jpu = jpu
self.backbone = backbone
self.backbone_indices = backbone_indices
self.align_corners = align_corners
self.jpu = layers.JPU([512, 1024, 2048], width=512) if jpu else None
self.head = GIHead(in_channels=2048, nclass=num_classes)
if self.aux:
self.auxlayer = layers.AuxLayer(
1024, 1024 // 4, num_classes, bias_attr=False)
self.pretrained = pretrained
self.init_weight()
def base_forward(self, x):
feat_list = self.backbone(x)
c1, c2, c3, c4 = [feat_list[i] for i in self.backbone_indices]
if self.jpu:
return self.jpu(c1, c2, c3, c4)
else:
return c1, c2, c3, c4
def forward(self, x):
_, _, h, w = paddle.shape(x)
_, _, c3, c4 = self.base_forward(x)
logit_list = []
x, _ = self.head(c4)
logit_list.append(x)
if self.aux:
auxout = self.auxlayer(c3)
logit_list.append(auxout)
return [
F.interpolate(
logit, (h, w),
mode='bilinear',
align_corners=self.align_corners) for logit in logit_list
]
def init_weight(self):
if self.pretrained is not None:
utils.load_entire_model(self, self.pretrained)
class GIHead(nn.Layer):
"""The Graph Interaction Network head."""
def __init__(self, in_channels, nclass):
super().__init__()
self.nclass = nclass
inter_channels = in_channels // 4
self.inp = paddle.zeros(shape=(nclass, 300), dtype='float32')
self.inp = paddle.create_parameter(
shape=self.inp.shape,
dtype=str(self.inp.numpy().dtype),
default_initializer=paddle.nn.initializer.Assign(self.inp))
self.inp.stop_gradient = True
self.fc1 = nn.Sequential(
nn.Linear(300, 128), nn.BatchNorm1D(128), nn.ReLU())
self.fc2 = nn.Sequential(
nn.Linear(128, 256), nn.BatchNorm1D(256), nn.ReLU())
self.conv5 = layers.ConvBNReLU(
in_channels,
inter_channels,
3,
padding=1,
bias_attr=False,
stride=1)
self.gloru = GlobalReasonUnit(
in_channels=inter_channels,
num_state=256,
num_node=84,
nclass=nclass)
self.conv6 = nn.Sequential(
nn.Dropout(0.1), nn.Conv2D(inter_channels, nclass, 1))
def forward(self, x):
B, C, H, W = paddle.shape(x)
inp = self.inp
inp = self.fc1(inp)
inp = self.fc2(inp).unsqueeze(axis=0).transpose((0, 2, 1))\
.expand((B, 256, self.nclass))
out = self.conv5(x)
out, se_out = self.gloru(out, inp)
out = self.conv6(out)
return out, se_out
class GlobalReasonUnit(nn.Layer):
"""
The original paper refers to:
Chen, Yunpeng, et al. "Graph-Based Global Reasoning Networks" (https://arxiv.org/abs/1811.12814)
"""
def __init__(self, in_channels, num_state=256, num_node=84, nclass=59):
super().__init__()
self.num_state = num_state
self.conv_theta = nn.Conv2D(
in_channels, num_node, kernel_size=1, stride=1, padding=0)
self.conv_phi = nn.Conv2D(
in_channels, num_state, kernel_size=1, stride=1, padding=0)
self.graph = GraphLayer(num_state, num_node, nclass)
self.extend_dim = nn.Conv2D(
num_state, in_channels, kernel_size=1, bias_attr=False)
self.bn = layers.SyncBatchNorm(in_channels)
def forward(self, x, inp):
B = self.conv_theta(x)
sizeB = paddle.shape(B)
B = paddle.flatten(B, 2, 3)
sizex = paddle.shape(x)
x_reduce = self.conv_phi(x)
x_reduce = paddle.flatten(x_reduce, 2, 3).transpose((0, 2, 1))
V = paddle.bmm(B, x_reduce).transpose((0, 2, 1))
V = paddle.divide(V, (sizex[2] * sizex[3]).astype('float32'))
class_node, new_V = self.graph(inp, V)
D = B.transpose((0, 2, 1))
Y = paddle.bmm(D, new_V.transpose((0, 2, 1)))
Y = Y.transpose((0, 2, 1)).reshape((sizex[0], self.num_state, \
sizex[2], -1))
Y = self.extend_dim(Y)
Y = self.bn(Y)
out = Y + x
return out, class_node
class GraphLayer(nn.Layer):
def __init__(self, num_state, num_node, num_class):
super().__init__()
self.vis_gcn = GCN(num_state, num_node)
self.word_gcn = GCN(num_state, num_class)
self.transfer = GraphTransfer(num_state)
self.gamma_vis = paddle.zeros([num_node])
self.gamma_word = paddle.zeros([num_class])
self.gamma_vis = paddle.create_parameter(
shape=paddle.shape(self.gamma_vis),
dtype=str(self.gamma_vis.numpy().dtype),
default_initializer=paddle.nn.initializer.Assign(self.gamma_vis))
self.gamma_word = paddle.create_parameter(
shape=paddle.shape(self.gamma_word),
dtype=str(self.gamma_word.numpy().dtype),
default_initializer=paddle.nn.initializer.Assign(self.gamma_word))
def forward(self, inp, vis_node):
inp = self.word_gcn(inp)
new_V = self.vis_gcn(vis_node)
class_node, vis_node = self.transfer(inp, new_V)
class_node = self.gamma_word * inp + class_node
new_V = self.gamma_vis * vis_node + new_V
return class_node, new_V
class GCN(nn.Layer):
def __init__(self, num_state=128, num_node=64, bias=False):
super().__init__()
self.conv1 = nn.Conv1D(
num_node,
num_node,
kernel_size=1,
padding=0,
stride=1,
groups=1,
)
self.relu = nn.ReLU()
self.conv2 = nn.Conv1D(
num_state,
num_state,
kernel_size=1,
padding=0,
stride=1,
groups=1,
bias_attr=bias)
def forward(self, x):
h = self.conv1(x.transpose((0, 2, 1))).transpose((0, 2, 1))
h = h + x
h = self.relu(h)
h = self.conv2(h)
return h
class GraphTransfer(nn.Layer):
"""Transfer vis graph to class node, transfer class node to vis feature"""
def __init__(self, in_dim):
super().__init__()
        self.channel_in = in_dim
self.query_conv = nn.Conv1D(
in_channels=in_dim, out_channels=in_dim // 2, kernel_size=1)
self.key_conv = nn.Conv1D(
in_channels=in_dim, out_channels=in_dim // 2, kernel_size=1)
self.value_conv_vis = nn.Conv1D(
in_channels=in_dim, out_channels=in_dim, kernel_size=1)
self.value_conv_word = nn.Conv1D(
in_channels=in_dim, out_channels=in_dim, kernel_size=1)
self.softmax_vis = nn.Softmax(axis=-1)
self.softmax_word = nn.Softmax(axis=-2)
def forward(self, word, vis_node):
m_batchsize, C, Nc = paddle.shape(word)
m_batchsize, C, Nn = paddle.shape(vis_node)
proj_query = self.query_conv(word).reshape((m_batchsize, -1, Nc))\
.transpose((0, 2, 1))
proj_key = self.key_conv(vis_node).reshape((m_batchsize, -1, Nn))
energy = paddle.bmm(proj_query, proj_key)
attention_vis = self.softmax_vis(energy).transpose((0, 2, 1))
attention_word = self.softmax_word(energy)
proj_value_vis = self.value_conv_vis(vis_node).reshape((m_batchsize, -1,
Nn))
proj_value_word = self.value_conv_word(word).reshape((m_batchsize, -1,
Nc))
class_out = paddle.bmm(proj_value_vis, attention_vis)
node_out = paddle.bmm(proj_value_word, attention_word)
return class_out, node_out
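if __name__ == '__main__':
    # Editor's sketch (not part of the original file): GraphTransfer swaps
    # information between word (class) nodes and visual nodes via cross
    # attention; both node sets keep their shapes.
    import numpy as np
    transfer = GraphTransfer(in_dim=256)
    word = paddle.to_tensor(
        np.random.random((2, 256, 59)).astype('float32'))
    vis_node = paddle.to_tensor(
        np.random.random((2, 256, 84)).astype('float32'))
    class_out, node_out = transfer(word, vis_node)
    print(class_out.shape, node_out.shape)  # expected: [2, 256, 59] [2, 256, 84]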

@@ -0,0 +1,353 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import cv2
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.models import layers
from paddlers.models.ppseg.models.backbones import resnet_vd
from paddlers.models.ppseg.models import deeplab
from paddlers.models.ppseg.utils import utils
@manager.MODELS.add_component
class GSCNN(nn.Layer):
"""
The GSCNN implementation based on PaddlePaddle.
The original article refers to
    Towaki Takikawa, et al. "Gated-SCNN: Gated Shape CNNs for Semantic Segmentation"
(https://arxiv.org/pdf/1907.05740.pdf)
Args:
num_classes (int): The unique number of target classes.
backbone (paddle.nn.Layer): Backbone network, currently support Resnet50_vd/Resnet101_vd.
        backbone_indices (tuple, optional): Four values in the tuple indicate the indices of output of backbone.
            Default: (0, 1, 2, 3).
        aspp_ratios (tuple, optional): The dilation rates used in the ASPP module.
            If output_stride=16, aspp_ratios should be set as (1, 6, 12, 18).
            If output_stride=8, aspp_ratios is (1, 12, 24, 36).
            Default: (1, 6, 12, 18).
aspp_out_channels (int, optional): The output channels of ASPP module. Default: 256.
align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
pretrained (str, optional): The path or url of pretrained model. Default: None.
"""
def __init__(self,
num_classes,
backbone,
backbone_indices=(0, 1, 2, 3),
aspp_ratios=(1, 6, 12, 18),
aspp_out_channels=256,
align_corners=False,
pretrained=None):
super().__init__()
self.backbone = backbone
backbone_channels = self.backbone.feat_channels
self.head = GSCNNHead(num_classes, backbone_indices, backbone_channels,
aspp_ratios, aspp_out_channels, align_corners)
self.align_corners = align_corners
self.pretrained = pretrained
self.init_weight()
def forward(self, x):
feat_list = self.backbone(x)
logit_list = self.head(x, feat_list, self.backbone.conv1_logit)
seg_logit, edge_logit = [
F.interpolate(
logit,
x.shape[2:],
mode='bilinear',
align_corners=self.align_corners) for logit in logit_list
]
return [seg_logit, (seg_logit, edge_logit), edge_logit, seg_logit]
def init_weight(self):
if self.pretrained is not None:
utils.load_entire_model(self, self.pretrained)
class GSCNNHead(nn.Layer):
"""
The GSCNNHead implementation based on PaddlePaddle.
Args:
num_classes (int): The unique number of target classes.
        backbone_indices (tuple): Four values in the tuple indicate the indices of output of backbone.
            The first index will be taken as a low-level feature in the Decoder component;
            the last one will be taken as input of the ASPP component; the second to fourth
            will be taken as input for the GCL component.
            Usually the backbone consists of four downsampling stages and returns an output of
            each stage. If we set it as (0, 1, 2, 3), it means taking the feature map of the first
            stage as the low-level feature used in the Decoder, the feature map of the fourth
            stage as input of ASPP, and the feature maps of the second to fourth stages as input of GCL.
        backbone_channels (tuple): The channels of output of backbone.
        aspp_ratios (tuple): The dilation rates used in the ASPP module.
aspp_out_channels (int): The output channels of ASPP module.
align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
is even, e.g. 1024x512, otherwise it is True, e.g. 769x769.
"""
def __init__(self, num_classes, backbone_indices, backbone_channels,
aspp_ratios, aspp_out_channels, align_corners):
super().__init__()
self.backbone_indices = backbone_indices
self.align_corners = align_corners
self.dsn1 = nn.Conv2D(
backbone_channels[backbone_indices[1]], 1, kernel_size=1)
self.dsn2 = nn.Conv2D(
backbone_channels[backbone_indices[2]], 1, kernel_size=1)
self.dsn3 = nn.Conv2D(
backbone_channels[backbone_indices[3]], 1, kernel_size=1)
self.res1 = resnet_vd.BasicBlock(64, 64, stride=1)
self.d1 = nn.Conv2D(64, 32, kernel_size=1)
self.gate1 = GatedSpatailConv2d(32, 32)
self.res2 = resnet_vd.BasicBlock(32, 32, stride=1)
self.d2 = nn.Conv2D(32, 16, kernel_size=1)
self.gate2 = GatedSpatailConv2d(16, 16)
self.res3 = resnet_vd.BasicBlock(16, 16, stride=1)
self.d3 = nn.Conv2D(16, 8, kernel_size=1)
self.gate3 = GatedSpatailConv2d(8, 8)
self.fuse = nn.Conv2D(8, 1, kernel_size=1, bias_attr=False)
self.cw = nn.Conv2D(2, 1, kernel_size=1, bias_attr=False)
self.aspp = ASPPModule(
aspp_ratios=aspp_ratios,
in_channels=backbone_channels[-1],
out_channels=aspp_out_channels,
align_corners=self.align_corners,
image_pooling=True)
self.decoder = deeplab.Decoder(
num_classes=num_classes,
in_channels=backbone_channels[0],
align_corners=self.align_corners)
def forward(self, x, feat_list, s_input):
input_shape = paddle.shape(x)
m1f = F.interpolate(
s_input,
input_shape[2:],
mode='bilinear',
align_corners=self.align_corners)
l1, l2, l3 = [
feat_list[self.backbone_indices[i]]
for i in range(1, len(self.backbone_indices))
]
s1 = F.interpolate(
self.dsn1(l1),
input_shape[2:],
mode='bilinear',
align_corners=self.align_corners)
s2 = F.interpolate(
self.dsn2(l2),
input_shape[2:],
mode='bilinear',
align_corners=self.align_corners)
s3 = F.interpolate(
self.dsn3(l3),
input_shape[2:],
mode='bilinear',
align_corners=self.align_corners)
# Get image gradient
im_arr = x.numpy().transpose((0, 2, 3, 1))
im_arr = ((im_arr * 0.5 + 0.5) * 255).astype(np.uint8)
canny = np.zeros((input_shape[0], 1, input_shape[2], input_shape[3]))
for i in range(input_shape[0]):
canny[i] = cv2.Canny(im_arr[i], 10, 100)
canny = canny / 255
canny = paddle.to_tensor(canny).astype('float32')
canny.stop_gradient = True
cs = self.res1(m1f)
cs = F.interpolate(
cs,
input_shape[2:],
mode='bilinear',
align_corners=self.align_corners)
cs = self.d1(cs)
cs = self.gate1(cs, s1)
cs = self.res2(cs)
cs = F.interpolate(
cs,
input_shape[2:],
mode='bilinear',
align_corners=self.align_corners)
cs = self.d2(cs)
cs = self.gate2(cs, s2)
cs = self.res3(cs)
cs = F.interpolate(
cs,
input_shape[2:],
mode='bilinear',
align_corners=self.align_corners)
cs = self.d3(cs)
cs = self.gate3(cs, s3)
cs = self.fuse(cs)
cs = F.interpolate(
cs,
input_shape[2:],
mode='bilinear',
align_corners=self.align_corners)
        edge_out = F.sigmoid(cs)  # Output of shape stream
cat = paddle.concat([edge_out, canny], axis=1)
acts = self.cw(cat)
acts = F.sigmoid(acts) # Input of fusion module
x = self.aspp(l3, acts)
low_level_feat = feat_list[self.backbone_indices[0]]
logit = self.decoder(x, low_level_feat)
logit_list = [logit, edge_out]
return logit_list
class GatedSpatailConv2d(nn.Layer):
def __init__(self,
in_channels,
out_channels,
kernel_size=1,
stride=1,
padding=0,
dilation=1,
groups=1,
bias_attr=False):
super().__init__()
self._gate_conv = nn.Sequential(
layers.SyncBatchNorm(in_channels + 1),
nn.Conv2D(in_channels + 1, in_channels + 1, kernel_size=1),
nn.ReLU(), nn.Conv2D(in_channels + 1, 1, kernel_size=1),
layers.SyncBatchNorm(1), nn.Sigmoid())
self.conv = nn.Conv2D(
in_channels,
out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
bias_attr=bias_attr)
def forward(self, input_features, gating_features):
cat = paddle.concat([input_features, gating_features], axis=1)
alphas = self._gate_conv(cat)
x = input_features * (alphas + 1)
x = self.conv(x)
return x
class ASPPModule(nn.Layer):
"""
Atrous Spatial Pyramid Pooling.
Args:
        aspp_ratios (tuple): The dilation rates used in the ASPP module.
        in_channels (int): The number of input channels.
        out_channels (int): The number of output channels.
        align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
            is even, e.g. 1024x512, otherwise it is True, e.g. 769x769.
        use_sep_conv (bool, optional): Whether to use separable conv in the ASPP module. Default: False.
        image_pooling (bool, optional): Whether to augment with image-level features. Default: False.
"""
def __init__(self,
aspp_ratios,
in_channels,
out_channels,
align_corners,
use_sep_conv=False,
image_pooling=False):
super().__init__()
self.align_corners = align_corners
self.aspp_blocks = nn.LayerList()
for ratio in aspp_ratios:
if use_sep_conv and ratio > 1:
conv_func = layers.SeparableConvBNReLU
else:
conv_func = layers.ConvBNReLU
block = conv_func(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=1 if ratio == 1 else 3,
dilation=ratio,
padding=0 if ratio == 1 else ratio)
self.aspp_blocks.append(block)
out_size = len(self.aspp_blocks)
if image_pooling:
self.global_avg_pool = nn.Sequential(
nn.AdaptiveAvgPool2D(output_size=(1, 1)),
layers.ConvBNReLU(
in_channels, out_channels, kernel_size=1, bias_attr=False))
out_size += 1
self.image_pooling = image_pooling
self.edge_conv = layers.ConvBNReLU(
1, out_channels, kernel_size=1, bias_attr=False)
out_size += 1
self.conv_bn_relu = layers.ConvBNReLU(
in_channels=out_channels * out_size,
out_channels=out_channels,
kernel_size=1)
self.dropout = nn.Dropout(p=0.1) # drop rate
def forward(self, x, edge):
outputs = []
x_shape = paddle.shape(x)
for block in self.aspp_blocks:
y = block(x)
y = F.interpolate(
y,
x_shape[2:],
mode='bilinear',
align_corners=self.align_corners)
outputs.append(y)
if self.image_pooling:
img_avg = self.global_avg_pool(x)
img_avg = F.interpolate(
img_avg,
x_shape[2:],
mode='bilinear',
align_corners=self.align_corners)
outputs.append(img_avg)
edge_features = F.interpolate(
edge,
size=x_shape[2:],
mode='bilinear',
align_corners=self.align_corners)
edge_features = self.edge_conv(edge_features)
outputs.append(edge_features)
x = paddle.concat(outputs, axis=1)
x = self.conv_bn_relu(x)
x = self.dropout(x)
return x
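if __name__ == '__main__':
    # Editor's sketch (not part of the original file): the gated convolution
    # learns a single-channel gate from the concatenated inputs and uses it
    # to modulate the features before the final conv.
    import numpy as np
    gate = GatedSpatailConv2d(16, 16)
    feats = paddle.to_tensor(
        np.random.random((1, 16, 32, 32)).astype('float32'))
    gating = paddle.to_tensor(
        np.random.random((1, 1, 32, 32)).astype('float32'))
    print(gate(feats, gating).shape)  # expected: [1, 16, 32, 32]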

@@ -0,0 +1,308 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.models import layers
from paddlers.models.ppseg.utils import utils
@manager.MODELS.add_component
class HarDNet(nn.Layer):
"""
    [Real Time] The FC-HarDNet 70 implementation based on PaddlePaddle.
The original article refers to
Chao, Ping, et al. "HarDNet: A Low Memory Traffic Network"
(https://arxiv.org/pdf/1909.00948.pdf)
Args:
num_classes (int): The unique number of target classes.
stem_channels (tuple|list, optional): The number of channels before the encoder. Default: (16, 24, 32, 48).
ch_list (tuple|list, optional): The number of channels at each block in the encoder. Default: (64, 96, 160, 224, 320).
grmul (float, optional): The channel multiplying factor in HarDBlock, which is m in the paper. Default: 1.7.
gr (tuple|list, optional): The growth rate in each HarDBlock, which is k in the paper. Default: (10, 16, 18, 24, 32).
n_layers (tuple|list, optional): The number of layers in each HarDBlock. Default: (4, 4, 8, 8, 8).
align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
pretrained (str, optional): The path or url of pretrained model. Default: None.
"""
def __init__(self,
num_classes,
stem_channels=(16, 24, 32, 48),
ch_list=(64, 96, 160, 224, 320),
grmul=1.7,
gr=(10, 16, 18, 24, 32),
n_layers=(4, 4, 8, 8, 8),
align_corners=False,
pretrained=None):
super().__init__()
self.align_corners = align_corners
self.pretrained = pretrained
encoder_blks_num = len(n_layers)
decoder_blks_num = encoder_blks_num - 1
encoder_in_channels = stem_channels[3]
self.stem = nn.Sequential(
layers.ConvBNReLU(
3, stem_channels[0], kernel_size=3, bias_attr=False),
layers.ConvBNReLU(
stem_channels[0],
stem_channels[1],
kernel_size=3,
bias_attr=False),
layers.ConvBNReLU(
stem_channels[1],
stem_channels[2],
kernel_size=3,
stride=2,
bias_attr=False),
layers.ConvBNReLU(
stem_channels[2],
stem_channels[3],
kernel_size=3,
bias_attr=False))
self.encoder = Encoder(encoder_blks_num, encoder_in_channels, ch_list,
gr, grmul, n_layers)
skip_connection_channels = self.encoder.get_skip_channels()
decoder_in_channels = self.encoder.get_out_channels()
self.decoder = Decoder(decoder_blks_num, decoder_in_channels,
skip_connection_channels, gr, grmul, n_layers,
align_corners)
self.cls_head = nn.Conv2D(
in_channels=self.decoder.get_out_channels(),
out_channels=num_classes,
kernel_size=1)
self.init_weight()
def forward(self, x):
input_shape = paddle.shape(x)[2:]
x = self.stem(x)
x, skip_connections = self.encoder(x)
x = self.decoder(x, skip_connections)
logit = self.cls_head(x)
logit = F.interpolate(
logit,
size=input_shape,
mode="bilinear",
align_corners=self.align_corners)
return [logit]
def init_weight(self):
if self.pretrained is not None:
utils.load_entire_model(self, self.pretrained)
class Encoder(nn.Layer):
"""The Encoder implementation of FC-HardDNet 70.
Args:
n_blocks (int): The number of blocks in the Encoder module.
in_channels (int): The number of input channels.
ch_list (tuple|list): The number of channels at each block in the encoder.
grmul (float): The channel multiplying factor in HarDBlock, which is m in the paper.
gr (tuple|list): The growth rate in each HarDBlock, which is k in the paper.
n_layers (tuple|list): The number of layers in each HarDBlock.
"""
def __init__(self, n_blocks, in_channels, ch_list, gr, grmul, n_layers):
super().__init__()
self.skip_connection_channels = []
self.shortcut_layers = []
self.blks = nn.LayerList()
ch = in_channels
for i in range(n_blocks):
blk = HarDBlock(ch, gr[i], grmul, n_layers[i])
ch = blk.get_out_ch()
self.skip_connection_channels.append(ch)
self.blks.append(blk)
if i < n_blocks - 1:
self.shortcut_layers.append(len(self.blks) - 1)
self.blks.append(
layers.ConvBNReLU(
ch, ch_list[i], kernel_size=1, bias_attr=False))
ch = ch_list[i]
if i < n_blocks - 1:
self.blks.append(nn.AvgPool2D(kernel_size=2, stride=2))
self.out_channels = ch
def forward(self, x):
skip_connections = []
for i in range(len(self.blks)):
x = self.blks[i](x)
if i in self.shortcut_layers:
skip_connections.append(x)
return x, skip_connections
def get_skip_channels(self):
return self.skip_connection_channels
def get_out_channels(self):
return self.out_channels
class Decoder(nn.Layer):
"""The Decoder implementation of FC-HardDNet 70.
Args:
n_blocks (int): The number of blocks in the Encoder module.
in_channels (int): The number of input channels.
skip_connection_channels (tuple|list): The channels of shortcut layers in encoder.
grmul (float): The channel multiplying factor in HarDBlock, which is m in the paper.
gr (tuple|list): The growth rate in each HarDBlock, which is k in the paper.
n_layers (tuple|list): The number of layers in each HarDBlock.
"""
def __init__(self,
n_blocks,
in_channels,
skip_connection_channels,
gr,
grmul,
n_layers,
align_corners=False):
super().__init__()
prev_block_channels = in_channels
self.n_blocks = n_blocks
self.dense_blocks_up = nn.LayerList()
self.conv1x1_up = nn.LayerList()
for i in range(n_blocks - 1, -1, -1):
cur_channels_count = prev_block_channels + skip_connection_channels[
i]
conv1x1 = layers.ConvBNReLU(
cur_channels_count,
cur_channels_count // 2,
kernel_size=1,
bias_attr=False)
blk = HarDBlock(
base_channels=cur_channels_count // 2,
growth_rate=gr[i],
grmul=grmul,
n_layers=n_layers[i])
self.conv1x1_up.append(conv1x1)
self.dense_blocks_up.append(blk)
prev_block_channels = blk.get_out_ch()
self.out_channels = prev_block_channels
self.align_corners = align_corners
def forward(self, x, skip_connections):
for i in range(self.n_blocks):
skip = skip_connections.pop()
x = F.interpolate(
x,
size=paddle.shape(skip)[2:],
mode="bilinear",
align_corners=self.align_corners)
x = paddle.concat([x, skip], axis=1)
x = self.conv1x1_up[i](x)
x = self.dense_blocks_up[i](x)
return x
def get_out_channels(self):
return self.out_channels
class HarDBlock(nn.Layer):
"""The HarDBlock implementation
Args:
base_channels (int): The base channels.
growth_rate (tuple|list): The growth rate.
grmul (float): The channel multiplying factor.
n_layers (tuple|list): The number of layers.
keepBase (bool, optional): A bool value indicates whether concatenating the first layer. Default: False.
"""
def __init__(self,
base_channels,
growth_rate,
grmul,
n_layers,
keepBase=False):
super().__init__()
self.keepBase = keepBase
self.links = []
layers_ = []
self.out_channels = 0
for i in range(n_layers):
outch, inch, link = get_link(i + 1, base_channels, growth_rate,
grmul)
self.links.append(link)
layers_.append(
layers.ConvBNReLU(inch, outch, kernel_size=3, bias_attr=False))
if (i % 2 == 0) or (i == n_layers - 1):
self.out_channels += outch
self.layers = nn.LayerList(layers_)
def forward(self, x):
layers_ = [x]
for layer in range(len(self.layers)):
link = self.links[layer]
tin = []
for i in link:
tin.append(layers_[i])
if len(tin) > 1:
x = paddle.concat(tin, axis=1)
else:
x = tin[0]
out = self.layers[layer](x)
layers_.append(out)
t = len(layers_)
out_ = []
for i in range(t):
if (i == 0 and self.keepBase) or \
(i == t - 1) or (i % 2 == 1):
out_.append(layers_[i])
out = paddle.concat(out_, 1)
return out
def get_out_ch(self):
return self.out_channels
def get_link(layer, base_ch, growth_rate, grmul):
if layer == 0:
return base_ch, 0, []
out_channels = growth_rate
link = []
for i in range(10):
dv = 2**i
if layer % dv == 0:
k = layer - dv
link.insert(0, k)
if i > 0:
out_channels *= grmul
out_channels = int(int(out_channels + 1) / 2) * 2
in_channels = 0
for i in link:
ch, _, _ = get_link(i, base_ch, growth_rate, grmul)
in_channels += ch
return out_channels, in_channels, link
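# A minimal sketch of the link rule above (values chosen for illustration, not
# taken from the shipped configs): layer 4 links back to layers 3, 2 and 0,
# and with growth rate k=14 and grmul m=1.7 its output width is
# int(int(14 * 1.7 ** 2 + 1) / 2) * 2 = 40.
if __name__ == '__main__':
    out_ch, in_ch, link = get_link(4, base_ch=48, growth_rate=14, grmul=1.7)
    print(out_ch, in_ch, link)  # 40 86 [0, 2, 3]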

@ -0,0 +1,127 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.models import layers
from paddlers.models.ppseg.utils import utils
@manager.MODELS.add_component
class HRNetW48Contrast(nn.Layer):
"""
The HRNetW48Contrast implementation based on PaddlePaddle.
The original article refers to
Wenguan Wang, Tianfei Zhou, et al. "Exploring Cross-Image Pixel Contrast for Semantic Segmentation"
(https://arxiv.org/abs/2101.11939).
Args:
in_channels (int): The number of output channels of the backbone.
num_classes (int): The unique number of target classes.
backbone (Paddle.nn.Layer): The backbone network; currently only HRNet_W48 is supported.
drop_prob (float): The probability of dropout in the classification head.
proj_dim (int): The projection dimensions.
align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
pretrained (str, optional): The path or url of pretrained model. Default: None.
"""
def __init__(self,
in_channels,
num_classes,
backbone,
drop_prob,
proj_dim,
align_corners=False,
pretrained=None):
super().__init__()
self.in_channels = in_channels
self.backbone = backbone
self.num_classes = num_classes
self.proj_dim = proj_dim
self.align_corners = align_corners
self.cls_head = nn.Sequential(
layers.ConvBNReLU(
in_channels, in_channels, kernel_size=3, stride=1, padding=1),
nn.Dropout2D(drop_prob),
nn.Conv2D(
in_channels,
num_classes,
kernel_size=1,
stride=1,
bias_attr=False),
)
self.proj_head = ProjectionHead(
dim_in=in_channels, proj_dim=self.proj_dim)
self.pretrained = pretrained
self.init_weight()
def init_weight(self):
if self.pretrained is not None:
utils.load_entire_model(self, self.pretrained)
def forward(self, x):
feats = self.backbone(x)[0]
out = self.cls_head(feats)
logit_list = []
if self.training:
emb = self.proj_head(feats)
logit_list.append(
F.interpolate(
out,
paddle.shape(x)[2:],
mode='bilinear',
align_corners=self.align_corners))
logit_list.append({'seg': out, 'embed': emb})
else:
logit_list.append(
F.interpolate(
out,
paddle.shape(x)[2:],
mode='bilinear',
align_corners=self.align_corners))
return logit_list
class ProjectionHead(nn.Layer):
"""
The projection head used by contrast learning.
Args:
dim_in (int): The dimensions of input features.
proj_dim (int, optional): The output dimensions of projection head. Default: 256.
proj (str, optional): The type of projection head, only support 'linear' and 'convmlp'. Default: 'convmlp'.
"""
def __init__(self, dim_in, proj_dim=256, proj='convmlp'):
super(ProjectionHead, self).__init__()
if proj == 'linear':
self.proj = nn.Conv2D(dim_in, proj_dim, kernel_size=1)
elif proj == 'convmlp':
self.proj = nn.Sequential(
layers.ConvBNReLU(dim_in, dim_in, kernel_size=1),
nn.Conv2D(dim_in, proj_dim, kernel_size=1),
)
else:
raise ValueError(
"The type of project head only support 'linear' and 'convmlp', but got {}."
.format(proj))
def forward(self, x):
return F.normalize(self.proj(x), p=2, axis=1)
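# A minimal usage sketch (shapes are illustrative): HRNet-W48 concatenates its
# four branches into 48 + 96 + 192 + 384 = 720 channels, so dim_in=720 is the
# usual setting. The head emits an embedding that is L2-normalized along the
# channel axis at every pixel.
if __name__ == '__main__':
    head = ProjectionHead(dim_in=720, proj_dim=256)
    emb = head(paddle.rand([1, 720, 64, 128]))
    print(emb.shape)  # [1, 256, 64, 128]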

@ -0,0 +1,197 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppseg.models import layers
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.utils import utils
@manager.MODELS.add_component
class ISANet(nn.Layer):
"""Interlaced Sparse Self-Attention for Semantic Segmentation.
The original article refers to Lang Huang, et al. "Interlaced Sparse Self-Attention for Semantic Segmentation"
(https://arxiv.org/abs/1907.12273).
Args:
num_classes (int): The unique number of target classes.
backbone (Paddle.nn.Layer): A backbone network.
backbone_indices (tuple): The values in the tuple indicate the indices of output of backbone.
isa_channels (int): The channels of ISA Module.
down_factor (tuple): Divide the height and width dimensions into (P_h, P_w) groups.
enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True.
align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
pretrained (str, optional): The path or url of pretrained model. Default: None.
"""
def __init__(self,
num_classes,
backbone,
backbone_indices=(2, 3),
isa_channels=256,
down_factor=(8, 8),
enable_auxiliary_loss=True,
align_corners=False,
pretrained=None):
super().__init__()
self.backbone = backbone
self.backbone_indices = backbone_indices
in_channels = [self.backbone.feat_channels[i] for i in backbone_indices]
self.head = ISAHead(num_classes, in_channels, isa_channels, down_factor,
enable_auxiliary_loss)
self.align_corners = align_corners
self.pretrained = pretrained
self.init_weight()
def forward(self, x):
feats = self.backbone(x)
feats = [feats[i] for i in self.backbone_indices]
logit_list = self.head(feats)
logit_list = [
F.interpolate(
logit,
paddle.shape(x)[2:],
mode='bilinear',
align_corners=self.align_corners,
align_mode=1) for logit in logit_list
]
return logit_list
def init_weight(self):
if self.pretrained is not None:
utils.load_entire_model(self, self.pretrained)
class ISAHead(nn.Layer):
"""
The ISAHead.
Args:
num_classes (int): The unique number of target classes.
in_channels (tuple): The number of input channels.
isa_channels (int): The channels of ISA Module.
down_factor (tuple): Divide the height and width dimensions into (P_h, P_w) groups.
enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True.
"""
def __init__(self, num_classes, in_channels, isa_channels, down_factor,
enable_auxiliary_loss):
super(ISAHead, self).__init__()
self.in_channels = in_channels[-1]
inter_channels = self.in_channels // 4
self.inter_channels = inter_channels
self.down_factor = down_factor
self.enable_auxiliary_loss = enable_auxiliary_loss
self.in_conv = layers.ConvBNReLU(
self.in_channels, inter_channels, 3, bias_attr=False)
self.global_relation = SelfAttentionBlock(inter_channels, isa_channels)
self.local_relation = SelfAttentionBlock(inter_channels, isa_channels)
self.out_conv = layers.ConvBNReLU(
inter_channels * 2, inter_channels, 1, bias_attr=False)
self.cls = nn.Sequential(
nn.Dropout2D(p=0.1), nn.Conv2D(inter_channels, num_classes, 1))
self.aux = nn.Sequential(
layers.ConvBNReLU(
in_channels=1024,
out_channels=256,
kernel_size=3,
bias_attr=False), nn.Dropout2D(p=0.1),
nn.Conv2D(256, num_classes, 1))
def forward(self, feat_list):
C3, C4 = feat_list
x = self.in_conv(C4)
x_shape = paddle.shape(x)
P_h, P_w = self.down_factor
Q_h, Q_w = paddle.ceil(x_shape[2] / P_h).astype('int32'), paddle.ceil(
x_shape[3] / P_w).astype('int32')
pad_h, pad_w = (Q_h * P_h - x_shape[2]).astype('int32'), (
Q_w * P_w - x_shape[3]).astype('int32')
if pad_h > 0 or pad_w > 0:
padding = paddle.concat([
pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2
],
axis=0)
feat = F.pad(x, padding)
else:
feat = x
feat = feat.reshape([0, x_shape[1], Q_h, P_h, Q_w, P_w])
feat = feat.transpose([0, 3, 5, 1, 2,
4]).reshape([-1, self.inter_channels, Q_h, Q_w])
feat = self.global_relation(feat)
feat = feat.reshape([x_shape[0], P_h, P_w, x_shape[1], Q_h, Q_w])
feat = feat.transpose([0, 4, 5, 3, 1,
2]).reshape([-1, self.inter_channels, P_h, P_w])
feat = self.local_relation(feat)
feat = feat.reshape([x_shape[0], Q_h, Q_w, x_shape[1], P_h, P_w])
feat = feat.transpose([0, 3, 1, 4, 2, 5]).reshape(
[0, self.inter_channels, P_h * Q_h, P_w * Q_w])
if pad_h > 0 or pad_w > 0:
feat = paddle.slice(
feat,
axes=[2, 3],
starts=[pad_h // 2, pad_w // 2],
ends=[pad_h // 2 + x_shape[2], pad_w // 2 + x_shape[3]])
feat = self.out_conv(paddle.concat([feat, x], axis=1))
output = self.cls(feat)
if self.enable_auxiliary_loss:
auxout = self.aux(C3)
return [output, auxout]
else:
return [output]
class SelfAttentionBlock(layers.AttentionBlock):
"""General self-attention block/non-local block.
Args:
in_channels (int): Input channels of key/query feature.
channels (int): Output channels of key/query transform.
"""
def __init__(self, in_channels, channels):
super(SelfAttentionBlock, self).__init__(
key_in_channels=in_channels,
query_in_channels=in_channels,
channels=channels,
out_channels=in_channels,
share_key_query=False,
query_downsample=None,
key_downsample=None,
key_query_num_convs=2,
key_query_norm=True,
value_out_num_convs=1,
value_out_norm=False,
matmul_norm=True,
with_out=False)
self.output_project = self.build_project(
in_channels, in_channels, num_convs=1, use_conv_module=True)
def forward(self, x):
context = super(SelfAttentionBlock, self).forward(x, x)
return self.output_project(context)
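# A minimal sketch of the interlaced attention (all sizes are illustrative):
# with down_factor=(8, 8) on a 32x64 feature map, Q_h=4 and Q_w=8, so the
# global step attends over 4x8 positions spaced 8 apart and the local step
# attends within each 8x8 window. C3 must have 1024 channels to match the
# auxiliary head.
if __name__ == '__main__':
    head = ISAHead(num_classes=19, in_channels=(1024, 2048), isa_channels=256,
                   down_factor=(8, 8), enable_auxiliary_loss=True)
    C3 = paddle.rand([1, 1024, 32, 64])
    C4 = paddle.rand([1, 2048, 32, 64])
    out, aux = head([C3, C4])
    print(out.shape, aux.shape)  # [1, 19, 32, 64] [1, 19, 32, 64]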

@ -0,0 +1,20 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .layer_libs import ConvBNReLU, ConvBN, SeparableConvBNReLU, DepthwiseConvBN, AuxLayer, SyncBatchNorm, JPU, ConvBNPReLU
from .activation import Activation
from .pyramid_pool import ASPPModule, PPModule
from .attention import AttentionBlock
from .nonlocal2d import NonLocal2D
from .wrap_functions import *

@ -0,0 +1,73 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.nn as nn
class Activation(nn.Layer):
"""
The wrapper of activations.
Args:
act (str, optional): The activation name in lowercase. It must be one of ['elu', 'gelu',
'hardshrink', 'tanh', 'hardtanh', 'prelu', 'relu', 'relu6', 'selu', 'leakyrelu', 'sigmoid',
'softmax', 'softplus', 'softshrink', 'softsign', 'tanhshrink', 'logsigmoid', 'logsoftmax',
'hsigmoid']. Default: None, meaning the identity transformation.
Returns:
A callable object of Activation.
Raises:
KeyError: When parameter `act` is not in the optional range.
Examples:
from paddlers.models.ppseg.models.common.activation import Activation
relu = Activation("relu")
print(relu)
# <class 'paddle.nn.layer.activation.ReLU'>
sigmoid = Activation("sigmoid")
print(sigmoid)
# <class 'paddle.nn.layer.activation.Sigmoid'>
not_exit_one = Activation("not_exit_one")
# KeyError: "not_exit_one does not exist in the current dict_keys(['elu', 'gelu', 'hardshrink',
# 'tanh', 'hardtanh', 'prelu', 'relu', 'relu6', 'selu', 'leakyrelu', 'sigmoid', 'softmax',
# 'softplus', 'softshrink', 'softsign', 'tanhshrink', 'logsigmoid', 'logsoftmax', 'hsigmoid'])"
"""
def __init__(self, act=None):
super(Activation, self).__init__()
self._act = act
upper_act_names = nn.layer.activation.__dict__.keys()
lower_act_names = [act.lower() for act in upper_act_names]
act_dict = dict(zip(lower_act_names, upper_act_names))
if act is not None:
if act in act_dict.keys():
act_name = act_dict[act]
self.act_func = getattr(nn.layer.activation, act_name)()
else:
raise KeyError("{} does not exist in the current {}".format(
act, act_dict.keys()))
def forward(self, x):
if self._act is not None:
return self.act_func(x)
else:
return x

@ -0,0 +1,146 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppseg.models import layers
class AttentionBlock(nn.Layer):
"""General self-attention block/non-local block.
The original article refers to https://arxiv.org/abs/1706.03762.
Args:
key_in_channels (int): Input channels of key feature.
query_in_channels (int): Input channels of query feature.
channels (int): Output channels of key/query transform.
out_channels (int): Output channels.
share_key_query (bool): Whether to share projection weights between the key
and query projections.
query_downsample (nn.Layer): Query downsample module.
key_downsample (nn.Layer): Key downsample module.
key_query_num_convs (int): Number of convs for key/query projection.
value_out_num_convs (int): Number of convs for value projection.
key_query_norm (bool): Whether to use BN for key/query projection.
value_out_norm (bool): Whether to use BN for value projection.
matmul_norm (bool): Whether to normalize the attention map by the square
root of the channels.
with_out (bool): Whether to use an output projection.
"""
def __init__(self, key_in_channels, query_in_channels, channels,
out_channels, share_key_query, query_downsample,
key_downsample, key_query_num_convs, value_out_num_convs,
key_query_norm, value_out_norm, matmul_norm, with_out):
super(AttentionBlock, self).__init__()
if share_key_query:
assert key_in_channels == query_in_channels
self.with_out = with_out
self.key_in_channels = key_in_channels
self.query_in_channels = query_in_channels
self.out_channels = out_channels
self.channels = channels
self.share_key_query = share_key_query
self.key_project = self.build_project(
key_in_channels,
channels,
num_convs=key_query_num_convs,
use_conv_module=key_query_norm)
if share_key_query:
self.query_project = self.key_project
else:
self.query_project = self.build_project(
query_in_channels,
channels,
num_convs=key_query_num_convs,
use_conv_module=key_query_norm)
self.value_project = self.build_project(
key_in_channels,
channels if self.with_out else out_channels,
num_convs=value_out_num_convs,
use_conv_module=value_out_norm)
if self.with_out:
self.out_project = self.build_project(
channels,
out_channels,
num_convs=value_out_num_convs,
use_conv_module=value_out_norm)
else:
self.out_project = None
self.query_downsample = query_downsample
self.key_downsample = key_downsample
self.matmul_norm = matmul_norm
def build_project(self, in_channels, channels, num_convs, use_conv_module):
if use_conv_module:
convs = [
layers.ConvBNReLU(
in_channels=in_channels,
out_channels=channels,
kernel_size=1,
bias_attr=False)
]
for _ in range(num_convs - 1):
convs.append(
layers.ConvBNReLU(
in_channels=channels,
out_channels=channels,
kernel_size=1,
bias_attr=False))
else:
convs = [nn.Conv2D(in_channels, channels, 1)]
for _ in range(num_convs - 1):
convs.append(nn.Conv2D(channels, channels, 1))
if len(convs) > 1:
convs = nn.Sequential(*convs)
else:
convs = convs[0]
return convs
def forward(self, query_feats, key_feats):
query_shape = paddle.shape(query_feats)
query = self.query_project(query_feats)
if self.query_downsample is not None:
query = self.query_downsample(query)
query = query.flatten(2).transpose([0, 2, 1])
key = self.key_project(key_feats)
value = self.value_project(key_feats)
if self.key_downsample is not None:
key = self.key_downsample(key)
value = self.key_downsample(value)
key = key.flatten(2)
value = value.flatten(2).transpose([0, 2, 1])
sim_map = paddle.matmul(query, key)
if self.matmul_norm:
sim_map = (self.channels**-0.5) * sim_map
sim_map = F.softmax(sim_map, axis=-1)
context = paddle.matmul(sim_map, value)
context = paddle.transpose(context, [0, 2, 1])
context = paddle.reshape(
context, [0, -1, query_shape[2], query_shape[3]])
if self.out_project is not None:
context = self.out_project(context)
return context
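# A minimal usage sketch (all arguments are illustrative): a plain
# non-local-style self-attention block where query and key are the same map.
if __name__ == '__main__':
    block = AttentionBlock(
        key_in_channels=64, query_in_channels=64, channels=32,
        out_channels=64, share_key_query=False, query_downsample=None,
        key_downsample=None, key_query_num_convs=2, value_out_num_convs=1,
        key_query_norm=True, value_out_norm=False, matmul_norm=True,
        with_out=False)
    x = paddle.rand([2, 64, 16, 16])
    print(block(x, x).shape)  # [2, 64, 16, 16]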

@ -0,0 +1,302 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppseg.models import layers
def SyncBatchNorm(*args, **kwargs):
"""In cpu environment nn.SyncBatchNorm does not have kernel so use nn.BatchNorm2D instead"""
if paddle.get_device() == 'cpu' or os.environ.get('PADDLESEG_EXPORT_STAGE'):
return nn.BatchNorm2D(*args, **kwargs)
elif paddle.distributed.ParallelEnv().nranks == 1:
return nn.BatchNorm2D(*args, **kwargs)
else:
return nn.SyncBatchNorm(*args, **kwargs)
class ConvBNReLU(nn.Layer):
def __init__(self,
in_channels,
out_channels,
kernel_size,
padding='same',
**kwargs):
super().__init__()
self._conv = nn.Conv2D(
in_channels, out_channels, kernel_size, padding=padding, **kwargs)
if 'data_format' in kwargs:
data_format = kwargs['data_format']
else:
data_format = 'NCHW'
self._batch_norm = SyncBatchNorm(out_channels, data_format=data_format)
self._relu = layers.Activation("relu")
def forward(self, x):
x = self._conv(x)
x = self._batch_norm(x)
x = self._relu(x)
return x
class ConvBN(nn.Layer):
def __init__(self,
in_channels,
out_channels,
kernel_size,
padding='same',
**kwargs):
super().__init__()
self._conv = nn.Conv2D(
in_channels, out_channels, kernel_size, padding=padding, **kwargs)
if 'data_format' in kwargs:
data_format = kwargs['data_format']
else:
data_format = 'NCHW'
self._batch_norm = SyncBatchNorm(out_channels, data_format=data_format)
def forward(self, x):
x = self._conv(x)
x = self._batch_norm(x)
return x
class ConvReLUPool(nn.Layer):
def __init__(self, in_channels, out_channels):
super().__init__()
self.conv = nn.Conv2D(
in_channels,
out_channels,
kernel_size=3,
stride=1,
padding=1,
dilation=1)
self._relu = layers.Activation("relu")
self._max_pool = nn.MaxPool2D(kernel_size=2, stride=2)
def forward(self, x):
x = self.conv(x)
x = self._relu(x)
x = self._max_pool(x)
return x
class SeparableConvBNReLU(nn.Layer):
def __init__(self,
in_channels,
out_channels,
kernel_size,
padding='same',
pointwise_bias=None,
**kwargs):
super().__init__()
self.depthwise_conv = ConvBN(
in_channels,
out_channels=in_channels,
kernel_size=kernel_size,
padding=padding,
groups=in_channels,
**kwargs)
if 'data_format' in kwargs:
data_format = kwargs['data_format']
else:
data_format = 'NCHW'
self.piontwise_conv = ConvBNReLU(
in_channels,
out_channels,
kernel_size=1,
groups=1,
data_format=data_format,
bias_attr=pointwise_bias)
def forward(self, x):
x = self.depthwise_conv(x)
x = self.piontwise_conv(x)
return x
class DepthwiseConvBN(nn.Layer):
def __init__(self,
in_channels,
out_channels,
kernel_size,
padding='same',
**kwargs):
super().__init__()
self.depthwise_conv = ConvBN(
in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
padding=padding,
groups=in_channels,
**kwargs)
def forward(self, x):
x = self.depthwise_conv(x)
return x
class AuxLayer(nn.Layer):
"""
The auxiliary layer implementation for auxiliary loss.
Args:
in_channels (int): The number of input channels.
inter_channels (int): The intermediate channels.
out_channels (int): The number of output channels, and usually it is num_classes.
dropout_prob (float, optional): The drop rate. Default: 0.1.
"""
def __init__(self,
in_channels,
inter_channels,
out_channels,
dropout_prob=0.1,
**kwargs):
super().__init__()
self.conv_bn_relu = ConvBNReLU(
in_channels=in_channels,
out_channels=inter_channels,
kernel_size=3,
padding=1,
**kwargs)
self.dropout = nn.Dropout(p=dropout_prob)
self.conv = nn.Conv2D(
in_channels=inter_channels,
out_channels=out_channels,
kernel_size=1)
def forward(self, x):
x = self.conv_bn_relu(x)
x = self.dropout(x)
x = self.conv(x)
return x
class JPU(nn.Layer):
"""
Joint Pyramid Upsampling (JPU) module of FastFCN.
The original paper refers to
Wu, Huikai, et al. "Fastfcn: Rethinking dilated convolution in the backbone for semantic segmentation." arXiv preprint arXiv:1903.11816 (2019).
"""
def __init__(self, in_channels, width=512):
super().__init__()
self.conv5 = ConvBNReLU(
in_channels[-1], width, 3, padding=1, bias_attr=False)
self.conv4 = ConvBNReLU(
in_channels[-2], width, 3, padding=1, bias_attr=False)
self.conv3 = ConvBNReLU(
in_channels[-3], width, 3, padding=1, bias_attr=False)
self.dilation1 = SeparableConvBNReLU(
3 * width,
width,
3,
padding=1,
pointwise_bias=False,
dilation=1,
bias_attr=False,
stride=1,
)
self.dilation2 = SeparableConvBNReLU(
3 * width,
width,
3,
padding=2,
pointwise_bias=False,
dilation=2,
bias_attr=False,
stride=1)
self.dilation3 = SeparableConvBNReLU(
3 * width,
width,
3,
padding=4,
pointwise_bias=False,
dilation=4,
bias_attr=False,
stride=1)
self.dilation4 = SeparableConvBNReLU(
3 * width,
width,
3,
padding=8,
pointwise_bias=False,
dilation=8,
bias_attr=False,
stride=1)
def forward(self, *inputs):
feats = [
self.conv5(inputs[-1]),
self.conv4(inputs[-2]),
self.conv3(inputs[-3])
]
size = paddle.shape(feats[-1])[2:]
feats[-2] = F.interpolate(
feats[-2], size, mode='bilinear', align_corners=True)
feats[-3] = F.interpolate(
feats[-3], size, mode='bilinear', align_corners=True)
feat = paddle.concat(feats, axis=1)
feat = paddle.concat([
self.dilation1(feat),
self.dilation2(feat),
self.dilation3(feat),
self.dilation4(feat)
],
axis=1)
return inputs[0], inputs[1], inputs[2], feat
class ConvBNPReLU(nn.Layer):
def __init__(self,
in_channels,
out_channels,
kernel_size,
padding='same',
**kwargs):
super().__init__()
self._conv = nn.Conv2D(in_channels,
out_channels,
kernel_size,
padding=padding,
**kwargs)
if 'data_format' in kwargs:
data_format = kwargs['data_format']
else:
data_format = 'NCHW'
self._batch_norm = SyncBatchNorm(out_channels, data_format=data_format)
self._prelu = layers.Activation("prelu")
def forward(self, x):
x = self._conv(x)
x = self._batch_norm(x)
x = self._prelu(x)
return x
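# A minimal usage sketch of JPU (channel sizes are illustrative): it fuses the
# last three backbone stages into one map with 4 * width channels at the
# resolution of the third-to-last input.
if __name__ == '__main__':
    jpu = JPU(in_channels=[512, 1024, 2048], width=512)
    c3 = paddle.rand([1, 512, 64, 64])
    c4 = paddle.rand([1, 1024, 32, 32])
    c5 = paddle.rand([1, 2048, 16, 16])
    outs = jpu(c3, c4, c5)
    print(outs[-1].shape)  # [1, 2048, 64, 64]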

@ -0,0 +1,154 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppseg.models import layers
class NonLocal2D(nn.Layer):
"""Basic Non-local module.
This model is the implementation of "Non-local Neural Networks"
(https://arxiv.org/abs/1711.07971)
Args:
in_channels (int): Channels of the input feature map.
reduction (int): Channel reduction ratio. Default: 2.
use_scale (bool): Whether to scale pairwise_weight by `1/sqrt(inter_channels)` when the mode is `embedded_gaussian`. Default: True.
sub_sample (bool): Whether to utilize max pooling after pairwise function. Default: False.
mode (str): Options are `gaussian`, `concatenation`, `embedded_gaussian` and `dot_product`. Default: embedded_gaussian.
"""
def __init__(self,
in_channels,
reduction=2,
use_scale=True,
sub_sample=False,
mode='embedded_gaussian'):
super(NonLocal2D, self).__init__()
self.in_channels = in_channels
self.reduction = reduction
self.use_scale = use_scale
self.sub_sample = sub_sample
self.mode = mode
if mode not in [
'gaussian', 'embedded_gaussian', 'dot_product', 'concatenation'
]:
raise ValueError(
"Mode should be in 'gaussian', 'concatenation','embedded_gaussian' or 'dot_product'."
)
self.inter_channels = max(in_channels // reduction, 1)
self.g = nn.Conv2D(
in_channels=self.in_channels,
out_channels=self.inter_channels,
kernel_size=1)
self.conv_out = layers.ConvBNReLU(
in_channels=self.inter_channels,
out_channels=self.in_channels,
kernel_size=1,
bias_attr=False)
if self.mode != "gaussian":
self.theta = nn.Conv2D(
in_channels=self.in_channels,
out_channels=self.inter_channels,
kernel_size=1)
self.phi = nn.Conv2D(
in_channels=self.in_channels,
out_channels=self.inter_channels,
kernel_size=1)
if self.mode == "concatenation":
self.concat_project = layers.ConvBNReLU(
in_channels=self.inter_channels * 2,
out_channels=1,
kernel_size=1,
bias_attr=False)
if self.sub_sample:
max_pool_layer = nn.MaxPool2D(kernel_size=(2, 2))
self.g = nn.Sequential(self.g, max_pool_layer)
if self.mode != 'gaussian':
self.phi = nn.Sequential(self.phi, max_pool_layer)
else:
self.phi = max_pool_layer
def gaussian(self, theta_x, phi_x):
pairwise_weight = paddle.matmul(theta_x, phi_x)
pairwise_weight = F.softmax(pairwise_weight, axis=-1)
return pairwise_weight
def embedded_gaussian(self, theta_x, phi_x):
pairwise_weight = paddle.matmul(theta_x, phi_x)
if self.use_scale:
pairwise_weight /= theta_x.shape[-1]**0.5
pairwise_weight = F.softmax(pairwise_weight, -1)
return pairwise_weight
def dot_product(self, theta_x, phi_x):
pairwise_weight = paddle.matmul(theta_x, phi_x)
pairwise_weight /= pairwise_weight.shape[-1]
return pairwise_weight
def concatenation(self, theta_x, phi_x):
h = theta_x.shape[2]
w = phi_x.shape[3]
theta_x = paddle.tile(theta_x, [1, 1, 1, w])
phi_x = paddle.tile(phi_x, [1, 1, h, 1])
concat_feature = paddle.concat([theta_x, phi_x], axis=1)
pairwise_weight = self.concat_project(concat_feature)
n, _, h, w = pairwise_weight.shape
pairwise_weight = paddle.reshape(pairwise_weight, [n, h, w])
pairwise_weight /= pairwise_weight.shape[-1]
return pairwise_weight
def forward(self, x):
n, c, h, w = x.shape
g_x = paddle.reshape(self.g(x), [n, self.inter_channels, -1])
g_x = paddle.transpose(g_x, [0, 2, 1])
if self.mode == 'gaussian':
theta_x = paddle.reshape(x, [n, self.inter_channels, -1])
theta_x = paddle.transpose(theta_x, [0, 2, 1])
if self.sub_sample:
phi_x = paddle.reshape(
self.phi(x), [n, self.inter_channels, -1])
else:
phi_x = paddle.reshape(x, [n, self.in_channels, -1])
elif self.mode == 'concatenation':
theta_x = paddle.reshape(
self.theta(x), [n, self.inter_channels, -1, 1])
phi_x = paddle.reshape(self.phi(x), [n, self.inter_channels, 1, -1])
else:
theta_x = paddle.reshape(
self.theta(x), [n, self.inter_channels, -1])
theta_x = paddle.transpose(theta_x, [0, 2, 1])
phi_x = paddle.reshape(self.phi(x), [n, self.inter_channels, -1])
pairwise_func = getattr(self, self.mode)
pairwise_weight = pairwise_func(theta_x, phi_x)
y = paddle.matmul(pairwise_weight, g_x)
y = paddle.transpose(y, [0, 2, 1])
y = paddle.reshape(y, [n, self.inter_channels, h, w])
output = x + self.conv_out(y)
return output
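# A minimal usage sketch (shapes are illustrative): the default
# embedded-Gaussian non-local block is residual, so it preserves the input
# shape.
if __name__ == '__main__':
    block = NonLocal2D(in_channels=64)
    y = block(paddle.rand([2, 64, 16, 16]))
    print(y.shape)  # [2, 64, 16, 16]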

@ -0,0 +1,192 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn.functional as F
from paddle import nn
from paddlers.models.ppseg.models import layers
class ASPPModule(nn.Layer):
"""
Atrous Spatial Pyramid Pooling.
Args:
aspp_ratios (tuple): The dilation rates used in the ASPP module.
in_channels (int): The number of input channels.
out_channels (int): The number of output channels.
align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
is even, e.g. 1024x512, otherwise it is True, e.g. 769x769.
use_sep_conv (bool, optional): Whether to use separable convolutions in the ASPP module. Default: False.
image_pooling (bool, optional): Whether to augment with image-level features. Default: False.
data_format (str, optional): The tensor format, 'NCHW' or 'NHWC'. Default: 'NCHW'.
"""
def __init__(self,
aspp_ratios,
in_channels,
out_channels,
align_corners,
use_sep_conv=False,
image_pooling=False,
data_format='NCHW'):
super().__init__()
self.align_corners = align_corners
self.data_format = data_format
self.aspp_blocks = nn.LayerList()
for ratio in aspp_ratios:
if use_sep_conv and ratio > 1:
conv_func = layers.SeparableConvBNReLU
else:
conv_func = layers.ConvBNReLU
block = conv_func(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=1 if ratio == 1 else 3,
dilation=ratio,
padding=0 if ratio == 1 else ratio,
data_format=data_format)
self.aspp_blocks.append(block)
out_size = len(self.aspp_blocks)
if image_pooling:
self.global_avg_pool = nn.Sequential(
nn.AdaptiveAvgPool2D(
output_size=(1, 1), data_format=data_format),
layers.ConvBNReLU(
in_channels,
out_channels,
kernel_size=1,
bias_attr=False,
data_format=data_format))
out_size += 1
self.image_pooling = image_pooling
self.conv_bn_relu = layers.ConvBNReLU(
in_channels=out_channels * out_size,
out_channels=out_channels,
kernel_size=1,
data_format=data_format)
self.dropout = nn.Dropout(p=0.1) # drop rate
def forward(self, x):
outputs = []
if self.data_format == 'NCHW':
interpolate_shape = paddle.shape(x)[2:]
axis = 1
else:
interpolate_shape = paddle.shape(x)[1:3]
axis = -1
for block in self.aspp_blocks:
y = block(x)
outputs.append(y)
if self.image_pooling:
img_avg = self.global_avg_pool(x)
img_avg = F.interpolate(
img_avg,
interpolate_shape,
mode='bilinear',
align_corners=self.align_corners,
data_format=self.data_format)
outputs.append(img_avg)
x = paddle.concat(outputs, axis=axis)
x = self.conv_bn_relu(x)
x = self.dropout(x)
return x
class PPModule(nn.Layer):
"""
Pyramid pooling module originally in PSPNet.
Args:
in_channels (int): The number of input channels to the pyramid pooling module.
out_channels (int): The number of output channels after pyramid pooling module.
bin_sizes (tuple, optional): The out size of pooled feature maps. Default: (1, 2, 3, 6).
dim_reduction (bool, optional): A bool value represents if reducing dimension after pooling. Default: True.
align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
is even, e.g. 1024x512, otherwise it is True, e.g. 769x769.
"""
def __init__(self, in_channels, out_channels, bin_sizes, dim_reduction,
align_corners):
super().__init__()
self.bin_sizes = bin_sizes
inter_channels = in_channels
if dim_reduction:
inter_channels = in_channels // len(bin_sizes)
# Use the dimension reduction after pooling described in the original implementation.
self.stages = nn.LayerList([
self._make_stage(in_channels, inter_channels, size)
for size in bin_sizes
])
self.conv_bn_relu2 = layers.ConvBNReLU(
in_channels=in_channels + inter_channels * len(bin_sizes),
out_channels=out_channels,
kernel_size=3,
padding=1)
self.align_corners = align_corners
def _make_stage(self, in_channels, out_channels, size):
"""
Create one pooling layer.
In our implementation, we adopt the same dimension reduction as the original paper that might be
slightly different with other implementations.
After pooling, the channels are reduced to 1/len(bin_sizes) immediately, while some other implementations
keep the channels to be same.
Args:
in_channels (int): The number of intput channels to pyramid pooling module.
size (int): The out size of the pooled layer.
Returns:
conv (Tensor): A tensor after Pyramid Pooling Module.
"""
prior = nn.AdaptiveAvgPool2D(output_size=(size, size))
conv = layers.ConvBNReLU(
in_channels=in_channels, out_channels=out_channels, kernel_size=1)
return nn.Sequential(prior, conv)
def forward(self, input):
cat_layers = []
for stage in self.stages:
x = stage(input)
x = F.interpolate(
x,
paddle.shape(input)[2:],
mode='bilinear',
align_corners=self.align_corners)
cat_layers.append(x)
cat_layers = [input] + cat_layers[::-1]
cat = paddle.concat(cat_layers, axis=1)
out = self.conv_bn_relu2(cat)
return out
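# A minimal usage sketch (rates and channels are illustrative): ASPP with the
# common (1, 6, 12, 18) rates keeps the spatial size and maps to out_channels.
if __name__ == '__main__':
    aspp = ASPPModule((1, 6, 12, 18), in_channels=512, out_channels=256,
                      align_corners=False, image_pooling=True)
    y = aspp(paddle.rand([2, 512, 32, 32]))
    print(y.shape)  # [2, 256, 32, 32]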

@ -0,0 +1,83 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
"""
Warp the functon api, so the normal and quantization training can use the same network.
"""
class Add(nn.Layer):
def __init__(self):
super().__init__()
def forward(self, x, y, name=None):
return paddle.add(x, y, name)
class Subtract(nn.Layer):
def __init__(self):
super().__init__()
def forward(self, x, y, name=None):
return paddle.subtract(x, y, name)
class Multiply(nn.Layer):
def __init__(self):
super().__init__()
def forward(self, x, y, name=None):
return paddle.multiply(x, y, name)
class Divide(nn.Layer):
def __init__(self):
super().__init__()
def forward(self, x, y, name=None):
return paddle.divide(x, y, name)
class Reshape(nn.Layer):
def __init__(self):
super().__init__()
def forward(self, x, shape, name=None):
return paddle.reshape(x, shape, name)
class Transpose(nn.Layer):
def __init__(self):
super().__init__()
def forward(self, x, perm, name=None):
return paddle.transpose(x, perm, name)
class Concat(nn.Layer):
def __init__(self):
super().__init__()
def forward(self, x, axis=0, name=None):
return paddle.concat(x, axis, name)
class Flatten(nn.Layer):
def __init__(self):
super().__init__()
def forward(self, x, start_axis=0, stop_axis=-1, name=None):
return paddle.flatten(x, start_axis, stop_axis, name)
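# A minimal usage sketch: the wrapped ops behave exactly like the functional
# API, but as nn.Layer instances that quantization passes can observe.
if __name__ == '__main__':
    add = Add()
    print(add(paddle.ones([2]), paddle.ones([2])))  # [2., 2.]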

@ -0,0 +1,36 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .mixed_loss import MixedLoss
from .cross_entropy_loss import CrossEntropyLoss
from .cross_entropy_loss import DistillCrossEntropyLoss
from .binary_cross_entropy_loss import BCELoss
from .lovasz_loss import LovaszSoftmaxLoss, LovaszHingeLoss
from .gscnn_dual_task_loss import DualTaskLoss
from .edge_attention_loss import EdgeAttentionLoss
from .bootstrapped_cross_entropy import BootstrappedCrossEntropyLoss
from .dice_loss import DiceLoss
from .ohem_cross_entropy_loss import OhemCrossEntropyLoss
from .decoupledsegnet_relax_boundary_loss import RelaxBoundaryLoss
from .ohem_edge_attention_loss import OhemEdgeAttentionLoss
from .l1_loss import L1Loss
from .mean_square_error_loss import MSELoss
from .focal_loss import FocalLoss
from .kl_loss import KLLoss
from .rmi_loss import RMILoss
from .detail_aggregate_loss import DetailAggregateLoss
from .point_cross_entropy_loss import PointCrossEntropyLoss
from .pixel_contrast_cross_entropy_loss import PixelContrastCrossEntropyLoss
from .semantic_encode_cross_entropy_loss import SECrossEntropyLoss
from .semantic_connectivity_loss import SemanticConnectivityLoss

@ -0,0 +1,174 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppseg.cvlibs import manager
@manager.LOSSES.add_component
class BCELoss(nn.Layer):
r"""
This operator combines the sigmoid layer and the :ref:`api_nn_loss_BCELoss` layer.
Also, it can be seen as the combination of the ``sigmoid_cross_entropy_with_logits``
layer and some reduce operations.
This measures the element-wise probability error in classification tasks
in which each class is independent.
This can be thought of as predicting labels for a data-point, where labels
are not mutually exclusive. For example, a news article can be about
politics, technology or sports at the same time or none of these.
First this operator calculates the loss as follows:
.. math::
Out = -Labels * \log(\sigma(Logit)) - (1 - Labels) * \log(1 - \sigma(Logit))
We know that :math:`\sigma(Logit) = \frac{1}{1 + e^{-Logit}}`. By substituting this we get:
.. math::
Out = Logit - Logit * Labels + \log(1 + e^{-Logit})
For stability and to prevent overflow of :math:`e^{-Logit}` when Logit < 0,
we reformulate the loss as follows:
.. math::
Out = \max(Logit, 0) - Logit * Labels + \log(1 + e^{-|Logit|})
Then, if ``weight`` or ``pos_weight`` is not None, this operator multiplies the
loss `Out` by the weight tensor. The ``weight`` tensor assigns a different
weight to every item in the batch, and ``pos_weight`` assigns a different
weight to the positive label of each class.
Finally, this operator applies the reduce operation on the loss.
If :attr:`reduction` is set to ``'none'``, the operator returns the original loss `Out`.
If :attr:`reduction` is set to ``'mean'``, the reduced mean loss is :math:`Out = MEAN(Out)`.
If :attr:`reduction` is set to ``'sum'``, the reduced sum loss is :math:`Out = SUM(Out)`.
Note that the target labels ``label`` should be numbers between 0 and 1.
Args:
weight (Tensor|str, optional): A manual rescaling weight given to the loss of each
batch element. If given, it has to be a 1D Tensor whose size is `[N, ]`,
and the data type is float32 or float64. If the type is str, it should equal 'dynamic',
in which case the weight is computed dynamically at every step.
Default: ``None``.
pos_weight (float|str, optional): A weight for positive examples. If the type is str,
it should equal 'dynamic', in which case the weight is computed dynamically at every step.
Default: ``None``.
ignore_index (int64, optional): Specifies a target value that is ignored
and does not contribute to the input gradient. Default: ``255``.
edge_label (bool, optional): Whether to use edge label. Default: False.
Shapes:
logit (Tensor): The input prediction tensor. 2-D tensor with shape: [N, *],
N is batch_size, `*` means number of additional dimensions. The ``logit``
is usually the output of Linear layer. Available dtype is float32, float64.
label (Tensor): The target labels tensor. 2-D tensor with the same shape as
``logit``. The target labels which values should be numbers between 0 and 1.
Available dtype is float32, float64.
Returns:
A callable object of BCEWithLogitsLoss.
Examples:
.. code-block:: python
import paddle
paddle.disable_static()
logit = paddle.to_tensor([5.0, 1.0, 3.0], dtype="float32")
label = paddle.to_tensor([1.0, 0.0, 1.0], dtype="float32")
bce_logit_loss = paddle.nn.BCEWithLogitsLoss()
output = bce_logit_loss(logit, label)
print(output.numpy()) # [0.45618808]
"""
def __init__(self,
weight=None,
pos_weight=None,
ignore_index=255,
edge_label=False):
super().__init__()
self.weight = weight
self.pos_weight = pos_weight
self.ignore_index = ignore_index
self.edge_label = edge_label
self.EPS = 1e-10
if self.weight is not None:
if isinstance(self.weight, str):
if self.weight != 'dynamic':
raise ValueError(
"if type of `weight` is str, it should equal to 'dynamic', but it is {}"
.format(self.weight))
elif not isinstance(self.weight, paddle.VarBase):
raise TypeError(
'The type of `weight` is wrong, it should be Tensor or str, but it is {}'
.format(type(self.weight)))
if self.pos_weight is not None:
if isinstance(self.pos_weight, str):
if self.pos_weight != 'dynamic':
raise ValueError(
"if type of `pos_weight` is str, it should equal to 'dynamic', but it is {}"
.format(self.pos_weight))
elif isinstance(self.pos_weight, float):
self.pos_weight = paddle.to_tensor(
self.pos_weight, dtype='float32')
else:
raise TypeError(
'The type of `pos_weight` is wrong, it should be float or str, but it is {}'
.format(type(self.pos_weight)))
def forward(self, logit, label):
"""
Forward computation.
Args:
logit (Tensor): Logit tensor, the data type is float32, float64. Shape is
(N, C), where C is number of classes, and if shape is more than 2D, this
is (N, C, D1, D2,..., Dk), k >= 1.
label (Tensor): Label tensor, the data type is int64. Shape is (N, C), where each
value is 0 or 1, and if shape is more than 2D, this is
(N, C, D1, D2,..., Dk), k >= 1.
"""
if len(label.shape) != len(logit.shape):
label = paddle.unsqueeze(label, 1)
mask = (label != self.ignore_index)
mask = paddle.cast(mask, 'float32')
# label.shape should equal to the logit.shape
if label.shape[1] != logit.shape[1]:
label = label.squeeze(1)
label = F.one_hot(label, logit.shape[1])
label = label.transpose((0, 3, 1, 2))
if isinstance(self.weight, str):
pos_index = (label == 1)
neg_index = (label == 0)
pos_num = paddle.sum(pos_index.astype('float32'))
neg_num = paddle.sum(neg_index.astype('float32'))
sum_num = pos_num + neg_num
weight_pos = 2 * neg_num / (sum_num + self.EPS)
weight_neg = 2 * pos_num / (sum_num + self.EPS)
weight = weight_pos * label + weight_neg * (1 - label)
else:
weight = self.weight
if isinstance(self.pos_weight, str):
pos_index = (label == 1)
neg_index = (label == 0)
pos_num = paddle.sum(pos_index.astype('float32'))
neg_num = paddle.sum(neg_index.astype('float32'))
sum_num = pos_num + neg_num
pos_weight = 2 * neg_num / (sum_num + self.EPS)
else:
pos_weight = self.pos_weight
label = label.astype('float32')
loss = paddle.nn.functional.binary_cross_entropy_with_logits(
logit,
label,
weight=weight,
reduction='none',
pos_weight=pos_weight)
loss = loss * mask
loss = paddle.mean(loss) / (paddle.mean(mask) + self.EPS)
label.stop_gradient = True
mask.stop_gradient = True
return loss
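# A minimal usage sketch (tensors are random placeholders): with
# weight='dynamic' the sparser class is re-weighted on the fly at every step.
if __name__ == '__main__':
    loss_fn = BCELoss(weight='dynamic')
    logit = paddle.rand([2, 1, 8, 8])
    label = paddle.randint(0, 2, [2, 1, 8, 8]).astype('float32')
    print(loss_fn(logit, label))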

@ -0,0 +1,73 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle import nn
import paddle.nn.functional as F
from paddlers.models.ppseg.cvlibs import manager
@manager.LOSSES.add_component
class BootstrappedCrossEntropyLoss(nn.Layer):
"""
Implements the cross entropy loss function.
Args:
min_K (int): The minimum number of pixels to be counted in the loss computation.
loss_th (float): The loss threshold. If more than min_K pixels have a loss above it,
all such pixels are counted; otherwise only the min_K largest losses are kept.
weight (tuple|list, optional): The weight for different classes. Default: None.
ignore_index (int, optional): Specifies a target value that is ignored
and does not contribute to the input gradient. Default: 255.
"""
def __init__(self, min_K, loss_th, weight=None, ignore_index=255):
super().__init__()
self.ignore_index = ignore_index
self.K = min_K
self.threshold = loss_th
if weight is not None:
weight = paddle.to_tensor(weight, dtype='float32')
self.weight = weight
def forward(self, logit, label):
n, c, h, w = logit.shape
total_loss = 0.0
if len(label.shape) != len(logit.shape):
label = paddle.unsqueeze(label, 1)
for i in range(n):
x = paddle.unsqueeze(logit[i], 0)
y = paddle.unsqueeze(label[i], 0)
x = paddle.transpose(x, (0, 2, 3, 1))
y = paddle.transpose(y, (0, 2, 3, 1))
x = paddle.reshape(x, shape=(-1, c))
y = paddle.reshape(y, shape=(-1, ))
loss = F.cross_entropy(
x,
y,
weight=self.weight,
ignore_index=self.ignore_index,
reduction="none")
sorted_loss = paddle.sort(loss, descending=True)
if sorted_loss[self.K] > self.threshold:
new_indices = paddle.nonzero(sorted_loss > self.threshold)
loss = paddle.gather(sorted_loss, new_indices)
else:
loss = sorted_loss[:self.K]
total_loss += paddle.mean(loss)
return total_loss / float(n)
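# A minimal usage sketch (shapes are illustrative): the average is taken only
# over the hardest pixels, bootstrapping the loss toward difficult regions.
if __name__ == '__main__':
    loss_fn = BootstrappedCrossEntropyLoss(min_K=256, loss_th=0.3)
    logit = paddle.rand([2, 19, 32, 32])
    label = paddle.randint(0, 19, [2, 32, 32])
    print(loss_fn(logit, label))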

@ -0,0 +1,218 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle import nn
import paddle.nn.functional as F
from paddlers.models.ppseg.cvlibs import manager
@manager.LOSSES.add_component
class CrossEntropyLoss(nn.Layer):
"""
Implements the cross entropy loss function.
Args:
weight (tuple|list|ndarray|Tensor, optional): A manual rescaling weight
given to each class. Its length must be equal to the number of classes.
Default ``None``.
ignore_index (int64, optional): Specifies a target value that is ignored
and does not contribute to the input gradient. Default ``255``.
top_k_percent_pixels (float, optional): the value lies in [0.0, 1.0].
When its value < 1.0, only compute the loss for the top k percent pixels
(e.g., the top 20% pixels). This is useful for hard pixel mining. Default ``1.0``.
data_format (str, optional): The tensor format to use, 'NCHW' or 'NHWC'. Default ``'NCHW'``.
"""
def __init__(self,
weight=None,
ignore_index=255,
top_k_percent_pixels=1.0,
data_format='NCHW'):
super(CrossEntropyLoss, self).__init__()
self.ignore_index = ignore_index
self.top_k_percent_pixels = top_k_percent_pixels
self.EPS = 1e-8
self.data_format = data_format
if weight is not None:
self.weight = paddle.to_tensor(weight, dtype='float32')
else:
self.weight = None
def forward(self, logit, label, semantic_weights=None):
"""
Forward computation.
Args:
logit (Tensor): Logit tensor, the data type is float32, float64. Shape is
(N, C), where C is number of classes, and if shape is more than 2D, this
is (N, C, D1, D2,..., Dk), k >= 1.
label (Tensor): Label tensor, the data type is int64. Shape is (N), where each
value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is
(N, D1, D2,..., Dk), k >= 1.
semantic_weights (Tensor, optional): Weights about loss for each pixels,
shape is the same as label. Default: None.
Returns:
(Tensor): The average loss.
"""
channel_axis = 1 if self.data_format == 'NCHW' else -1
if self.weight is not None and logit.shape[channel_axis] != len(
self.weight):
raise ValueError(
'The number of weights = {} must be the same as the number of classes = {}.'
.format(len(self.weight), logit.shape[channel_axis]))
if channel_axis == 1:
logit = paddle.transpose(logit, [0, 2, 3, 1])
label = label.astype('int64')
# Note: F.cross_entropy does not fully honor ignore_index here. With labels
# containing 255 and paddle <= 2.1.3, the cross_entropy OP raises an error;
# this is fixed in the paddle develop version.
loss = F.cross_entropy(
logit,
label,
ignore_index=self.ignore_index,
reduction='none',
weight=self.weight)
return self._post_process_loss(logit, label, semantic_weights, loss)
def _post_process_loss(self, logit, label, semantic_weights, loss):
"""
Consider mask and top_k to calculate the final loss.
Args:
logit (Tensor): Logit tensor, the data type is float32, float64. Shape is
(N, C), where C is number of classes, and if shape is more than 2D, this
is (N, C, D1, D2,..., Dk), k >= 1.
label (Tensor): Label tensor, the data type is int64. Shape is (N), where each
value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is
(N, D1, D2,..., Dk), k >= 1.
semantic_weights (Tensor, optional): Weights about loss for each pixels,
shape is the same as label.
loss (Tensor): Loss tensor which is the output of cross_entropy. If soft_label
is False in cross_entropy, the shape of loss should be the same as the label.
If soft_label is True in cross_entropy, the shape of loss should be
(N, D1, D2,..., Dk, 1).
Returns:
(Tensor): The average loss.
"""
mask = label != self.ignore_index
mask = paddle.cast(mask, 'float32')
label.stop_gradient = True
mask.stop_gradient = True
if loss.ndim > mask.ndim:
loss = paddle.squeeze(loss, axis=-1)
loss = loss * mask
if semantic_weights is not None:
loss = loss * semantic_weights
if self.weight is not None:
_one_hot = F.one_hot(label, logit.shape[-1])
coef = paddle.sum(_one_hot * self.weight, axis=-1)
else:
coef = paddle.ones_like(label)
if self.top_k_percent_pixels == 1.0:
avg_loss = paddle.mean(loss) / (paddle.mean(mask * coef) + self.EPS)
else:
loss = loss.reshape((-1, ))
top_k_pixels = int(self.top_k_percent_pixels * loss.numel())
loss, indices = paddle.topk(loss, top_k_pixels)
coef = coef.reshape((-1, ))
coef = paddle.gather(coef, indices)
coef.stop_gradient = True
coef = coef.astype('float32')
avg_loss = loss.mean() / (paddle.mean(coef) + self.EPS)
return avg_loss
@manager.LOSSES.add_component
class DistillCrossEntropyLoss(CrossEntropyLoss):
"""
The implementation of distill cross entropy loss.
Args:
weight (tuple|list|ndarray|Tensor, optional): A manual rescaling weight
given to each class. Its length must be equal to the number of classes.
Default ``None``.
ignore_index (int64, optional): Specifies a target value that is ignored
and does not contribute to the input gradient. Default ``255``.
top_k_percent_pixels (float, optional): the value lies in [0.0, 1.0].
When its value < 1.0, only compute the loss for the top k percent pixels
(e.g., the top 20% pixels). This is useful for hard pixel mining.
Default ``1.0``.
data_format (str, optional): The tensor format to use, 'NCHW' or 'NHWC'.
Default ``'NCHW'``.
"""
def __init__(self,
weight=None,
ignore_index=255,
top_k_percent_pixels=1.0,
data_format='NCHW'):
super().__init__(weight, ignore_index, top_k_percent_pixels,
data_format)
def forward(self,
student_logit,
teacher_logit,
label,
semantic_weights=None):
"""
Forward computation.
Args:
student_logit (Tensor): Logit tensor, the data type is float32, float64. Shape is
(N, C), where C is number of classes, and if shape is more than 2D, this
is (N, C, D1, D2,..., Dk), k >= 1.
teacher_logit (Tensor): Logit tensor, the data type is float32, float64. The shape
is the same as the student_logit.
label (Tensor): Label tensor, the data type is int64. Shape is (N), where each
value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is
(N, D1, D2,..., Dk), k >= 1.
semantic_weights (Tensor, optional): Weights about loss for each pixels,
shape is the same as label. Default: None.
"""
if student_logit.shape != teacher_logit.shape:
raise ValueError(
'The shape of student_logit = {} must be the same as the shape of teacher_logit = {}.'
.format(student_logit.shape, teacher_logit.shape))
channel_axis = 1 if self.data_format == 'NCHW' else -1
if self.weight is not None and student_logit.shape[channel_axis] != len(
self.weight):
raise ValueError(
'The number of weights = {} must be the same as the number of classes = {}.'
.format(len(self.weight), student_logit.shape[channel_axis]))
if channel_axis == 1:
student_logit = paddle.transpose(student_logit, [0, 2, 3, 1])
teacher_logit = paddle.transpose(teacher_logit, [0, 2, 3, 1])
teacher_logit = F.softmax(teacher_logit)
loss = F.cross_entropy(
student_logit,
teacher_logit,
weight=self.weight,
reduction='none',
soft_label=True)
return self._post_process_loss(student_logit, label, semantic_weights,
loss)
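# A minimal usage sketch (shapes are illustrative): with
# top_k_percent_pixels=0.2 only the hardest 20% of pixels are averaged,
# which implements hard pixel mining.
if __name__ == '__main__':
    loss_fn = CrossEntropyLoss(top_k_percent_pixels=0.2)
    logit = paddle.rand([2, 19, 32, 32])
    label = paddle.randint(0, 19, [2, 32, 32])
    print(loss_fn(logit, label))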

@ -0,0 +1,129 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle
from paddle import nn
import paddle.nn.functional as F
from paddlers.models.ppseg.cvlibs import manager
@manager.LOSSES.add_component
class RelaxBoundaryLoss(nn.Layer):
"""
    Implements the relax boundary loss function.

    Args:
        border (int, optional): The value of border to relax. Default: 1.
        calculate_weights (bool, optional): Whether to calculate weights for every class. Default: False.
        upper_bound (float, optional): The upper bound of weights if calculating weights for every class. Default: 1.0.
ignore_index (int64): Specifies a target value that is ignored
and does not contribute to the input gradient. Default: 255.
"""
def __init__(self,
border=1,
calculate_weights=False,
upper_bound=1.0,
ignore_index=255):
super(RelaxBoundaryLoss, self).__init__()
self.border = border
self.calculate_weights = calculate_weights
self.upper_bound = upper_bound
self.ignore_index = ignore_index
self.EPS = 1e-5
def relax_onehot(self, label, num_classes):
        # pad label, and map ignore_index pixels to num_classes
if len(label.shape) == 3:
label = label.unsqueeze(1)
h, w = label.shape[-2], label.shape[-1]
label = F.pad(label, [self.border] * 4, value=num_classes)
label = label.squeeze(1)
ignore_mask = (label == self.ignore_index).astype('int64')
label = label * (1 - ignore_mask) + num_classes * ignore_mask
onehot = 0
for i in range(-self.border, self.border + 1):
for j in range(-self.border, self.border + 1):
h_start, h_end = 1 + i, h + 1 + i
w_start, w_end = 1 + j, w + 1 + j
label_ = label[:, h_start:h_end, w_start:w_end]
onehot_ = F.one_hot(label_, num_classes + 1)
onehot += onehot_
onehot = (onehot > 0).astype('int64')
onehot = paddle.transpose(onehot, (0, 3, 1, 2))
return onehot
    def _calculate_weights(self, label):
        # `label` has C + 1 channels here; drop the relaxed ignore channel so
        # the returned weights align with the C class channels of the logits
        hist = paddle.sum(label[:-1], axis=(1, 2)).astype('float32') / label.sum().astype('float32')
        hist = ((hist != 0) * self.upper_bound * (1 - hist)) + 1
        return hist
def custom_nll(self,
logit,
label,
class_weights=None,
border_weights=None,
ignore_mask=None):
soft = F.softmax(logit, axis=1)
# calculate the valid soft where label is 1.
soft_label = ((soft * label[:, :-1, :, :]).sum(
1, keepdim=True)) * (label[:, :-1, :, :].astype('float32'))
soft = soft * (1 - label[:, :-1, :, :]) + soft_label
logsoft = paddle.log(soft)
        if class_weights is not None:
            logsoft = class_weights.unsqueeze((0, 2, 3)) * logsoft
logsoft = label[:, :-1, :, :] * logsoft
logsoft = logsoft.sum(1)
# border loss is divided equally
logsoft = -1 / border_weights * logsoft * (1. - ignore_mask)
n, _, h, w = label.shape
logsoft = logsoft.sum() / (n * h * w - ignore_mask.sum() + 1)
return logsoft
def forward(self, logit, label):
"""
Forward computation.
Args:
logit (Tensor): Logit tensor, the data type is float32, float64. Shape is
(N, C), where C is number of classes, and if shape is more than 2D, this
is (N, C, D1, D2,..., Dk), k >= 1.
label (Tensor): Label tensor, the data type is int64. Shape is (N), where each
value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is
(N, D1, D2,..., Dk), k >= 1.
"""
n, c, h, w = logit.shape
label.stop_gradient = True
label = self.relax_onehot(label, c)
weights = label[:, :-1, :, :].sum(1).astype('float32')
ignore_mask = (weights == 0).astype('float32')
# border is greater than 1, other is 1
border_weights = weights + ignore_mask
loss = 0
class_weights = None
for i in range(n):
            if self.calculate_weights:
                class_weights = self._calculate_weights(label[i])
loss = loss + self.custom_nll(
logit[i].unsqueeze(0),
label[i].unsqueeze(0),
class_weights=class_weights,
border_weights=border_weights,
ignore_mask=ignore_mask[i])
return loss
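
A minimal usage sketch (shapes are illustrative; `border=1` relaxes labels by one pixel on each side):

import paddle

logit = paddle.randn([2, 4, 16, 16])  # NCHW logits
label = paddle.randint(0, 4, shape=[2, 16, 16], dtype='int64')

loss_fn = RelaxBoundaryLoss(border=1)
loss = loss_fn(logit, label)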

@@ -0,0 +1,116 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppseg.cvlibs import manager
@manager.LOSSES.add_component
class DetailAggregateLoss(nn.Layer):
"""
DetailAggregateLoss's implementation based on PaddlePaddle.
    The original article refers to
Fan, Mingyuan, et al. "Rethinking BiSeNet For Real-time Semantic Segmentation."
(https://arxiv.org/abs/2104.13188)
Args:
ignore_index (int64, optional): Specifies a target value that is ignored
and does not contribute to the input gradient. Default ``255``.
"""
def __init__(self, ignore_index=255):
super(DetailAggregateLoss, self).__init__()
self.ignore_index = ignore_index
self.laplacian_kernel = paddle.to_tensor([-1, -1, -1, -1, 8, -1, -1, -1, -1], dtype='float32').reshape(
(1, 1, 3, 3))
self.fuse_kernel = paddle.create_parameter([1, 3, 1, 1], dtype='float32')
def forward(self, logits, label):
"""
Args:
logits (Tensor): Logit tensor, the data type is float32, float64. Shape is
(N, C), where C is number of classes, and if shape is more than 2D, this
is (N, C, D1, D2,..., Dk), k >= 1.
label (Tensor): Label tensor, the data type is int64. Shape is (N), where each
value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is
(N, D1, D2,..., Dk), k >= 1.
        Returns:
            Tensor: The detail aggregate loss.
"""
boundary_targets = F.conv2d(paddle.unsqueeze(label, axis=1).astype('float32'), self.laplacian_kernel,
padding=1)
boundary_targets = paddle.clip(boundary_targets, min=0)
boundary_targets = boundary_targets > 0.1
boundary_targets = boundary_targets.astype('float32')
boundary_targets_x2 = F.conv2d(paddle.unsqueeze(label, axis=1).astype('float32'), self.laplacian_kernel,
stride=2, padding=1)
boundary_targets_x2 = paddle.clip(boundary_targets_x2, min=0)
boundary_targets_x4 = F.conv2d(paddle.unsqueeze(label, axis=1).astype('float32'), self.laplacian_kernel,
stride=4, padding=1)
boundary_targets_x4 = paddle.clip(boundary_targets_x4, min=0)
boundary_targets_x8 = F.conv2d(paddle.unsqueeze(label, axis=1).astype('float32'), self.laplacian_kernel,
stride=8, padding=1)
boundary_targets_x8 = paddle.clip(boundary_targets_x8, min=0)
boundary_targets_x8_up = F.interpolate(boundary_targets_x8, boundary_targets.shape[2:], mode='nearest')
boundary_targets_x4_up = F.interpolate(boundary_targets_x4, boundary_targets.shape[2:], mode='nearest')
boundary_targets_x2_up = F.interpolate(boundary_targets_x2, boundary_targets.shape[2:], mode='nearest')
boundary_targets_x2_up = boundary_targets_x2_up > 0.1
boundary_targets_x2_up = boundary_targets_x2_up.astype('float32')
boundary_targets_x4_up = boundary_targets_x4_up > 0.1
boundary_targets_x4_up = boundary_targets_x4_up.astype('float32')
boundary_targets_x8_up = boundary_targets_x8_up > 0.1
boundary_targets_x8_up = boundary_targets_x8_up.astype('float32')
        boundary_targets_pyramids = paddle.stack(
            (boundary_targets, boundary_targets_x2_up, boundary_targets_x4_up),
            axis=1)
        boundary_targets_pyramids = paddle.squeeze(boundary_targets_pyramids, axis=2)
        boundary_targets_pyramid = F.conv2d(boundary_targets_pyramids, self.fuse_kernel)
        boundary_targets_pyramid = boundary_targets_pyramid > 0.1
        boundary_targets_pyramid = boundary_targets_pyramid.astype('float32')

        if logits.shape[-1] != boundary_targets.shape[-1]:
            logits = F.interpolate(
                logits, boundary_targets.shape[2:], mode='bilinear', align_corners=True)

        bce_loss = F.binary_cross_entropy_with_logits(logits, boundary_targets_pyramid)
        dice_loss = self.fixed_dice_loss_func(F.sigmoid(logits), boundary_targets_pyramid)
        detail_loss = bce_loss + dice_loss
label.stop_gradient = True
return detail_loss
def fixed_dice_loss_func(self, input, target):
"""
        Simplified dice loss for DetailAggregateLoss.
"""
smooth = 1.
n = input.shape[0]
iflat = paddle.reshape(input, [n, -1])
tflat = paddle.reshape(target, [n, -1])
intersection = paddle.sum((iflat * tflat), axis=1)
loss = 1 - ((2. * intersection + smooth) /
(paddle.sum(iflat, axis=1) + paddle.sum(tflat, axis=1) + smooth))
return paddle.mean(loss)
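
A minimal usage sketch; the single-channel boundary head and the 19-class label map are illustrative assumptions:

import paddle

detail_logits = paddle.randn([2, 1, 32, 32])  # boundary-head output, NCHW
label = paddle.randint(0, 19, shape=[2, 32, 32], dtype='int64')

loss_fn = DetailAggregateLoss()
loss = loss_fn(detail_logits, label)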

@@ -0,0 +1,56 @@
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle import nn
import paddle.nn.functional as F
from paddlers.models.ppseg.cvlibs import manager
@manager.LOSSES.add_component
class DiceLoss(nn.Layer):
"""
Implements the dice loss function.
Args:
ignore_index (int64): Specifies a target value that is ignored
and does not contribute to the input gradient. Default ``255``.
        smooth (float32): Laplace smoothing to smooth the dice loss and
            accelerate convergence. Following:
            https://github.com/pytorch/pytorch/issues/1249#issuecomment-337999895
"""
def __init__(self, ignore_index=255, smooth=0.):
super(DiceLoss, self).__init__()
self.ignore_index = ignore_index
self.eps = 1e-5
self.smooth = smooth
def forward(self, logits, labels):
labels = paddle.cast(labels, dtype='int32')
labels_one_hot = F.one_hot(labels, num_classes=logits.shape[1])
labels_one_hot = paddle.transpose(labels_one_hot, [0, 3, 1, 2])
labels_one_hot = paddle.cast(labels_one_hot, dtype='float32')
logits = F.softmax(logits, axis=1)
mask = (paddle.unsqueeze(labels, 1) != self.ignore_index)
logits = logits * mask
labels_one_hot = labels_one_hot * mask
dims = (0, ) + tuple(range(2, labels.ndimension() + 1))
intersection = paddle.sum(logits * labels_one_hot, dims)
cardinality = paddle.sum(logits + labels_one_hot, dims)
dice_loss = ((2. * intersection + self.smooth) /
(cardinality + self.eps + self.smooth)).mean()
return 1 - dice_loss
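
A minimal usage sketch (shapes and the smoothing value are illustrative):

import paddle

logits = paddle.randn([2, 4, 16, 16])
labels = paddle.randint(0, 4, shape=[2, 16, 16], dtype='int64')

loss_fn = DiceLoss(smooth=1.0)
loss = loss_fn(logits, labels)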

@@ -0,0 +1,78 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle import nn
import paddle.nn.functional as F
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.models import losses
@manager.LOSSES.add_component
class EdgeAttentionLoss(nn.Layer):
"""
    Implements the cross entropy loss function, computed only on the edge part.

    Args:
        edge_threshold (float): Pixels whose edge probability is greater than edge_threshold are treated as edges.
ignore_index (int64): Specifies a target value that is ignored
and does not contribute to the input gradient. Default ``255``.
"""
def __init__(self, edge_threshold=0.8, ignore_index=255):
super().__init__()
self.edge_threshold = edge_threshold
self.ignore_index = ignore_index
self.EPS = 1e-10
self.mean_mask = 1
def forward(self, logits, label):
"""
Forward computation.
Args:
            logits (tuple|list): (seg_logit, edge_logit) Tensors, the data type is float32, float64. Shape is
                (N, C), where C is number of classes, and if shape is more than 2D, this
                is (N, C, D1, D2,..., Dk), k >= 1. For edge_logit, C is 1.
label (Tensor): Label tensor, the data type is int64. Shape is (N, C), where each
value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is
(N, C, D1, D2,..., Dk), k >= 1.
"""
seg_logit, edge_logit = logits[0], logits[1]
if len(label.shape) != len(seg_logit.shape):
label = paddle.unsqueeze(label, 1)
if edge_logit.shape != label.shape:
raise ValueError(
'The shape of edge_logit should equal to the label, but they are {} != {}'
.format(edge_logit.shape, label.shape))
filler = paddle.ones_like(label) * self.ignore_index
label = paddle.where(edge_logit > self.edge_threshold, label, filler)
seg_logit = paddle.transpose(seg_logit, [0, 2, 3, 1])
label = paddle.transpose(label, [0, 2, 3, 1])
loss = F.softmax_with_cross_entropy(
seg_logit, label, ignore_index=self.ignore_index, axis=-1)
mask = label != self.ignore_index
mask = paddle.cast(mask, 'float32')
loss = loss * mask
avg_loss = paddle.mean(loss) / (paddle.mean(mask) + self.EPS)
if paddle.mean(mask) < self.mean_mask:
self.mean_mask = paddle.mean(mask)
label.stop_gradient = True
mask.stop_gradient = True
return avg_loss
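
A minimal usage sketch; note that the first argument is a (seg_logit, edge_logit) pair and edge_logit must match the unsqueezed label shape (values here are illustrative):

import paddle

seg_logit = paddle.randn([2, 4, 16, 16])
edge_logit = paddle.rand([2, 1, 16, 16])   # edge probabilities in [0, 1]
label = paddle.randint(0, 4, shape=[2, 16, 16], dtype='int64')

loss_fn = EdgeAttentionLoss(edge_threshold=0.8)
loss = loss_fn((seg_logit, edge_logit), label)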

@@ -0,0 +1,60 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppseg.cvlibs import manager
@manager.LOSSES.add_component
class FocalLoss(nn.Layer):
"""
Focal Loss.
Code referenced from:
https://github.com/clcarwin/focal_loss_pytorch/blob/master/focalloss.py
Args:
        gamma (float): The focusing parameter of focal loss. Default: 2.0.
        ignore_index (int64): Specifies a target value that is ignored
            and does not contribute to the input gradient. Default ``255``.
        edge_label (bool, optional): Whether the label is an edge label. It is accepted for interface compatibility and is not used here. Default: False.
"""
def __init__(self, gamma=2.0, ignore_index=255, edge_label=False):
super(FocalLoss, self).__init__()
self.gamma = gamma
self.ignore_index = ignore_index
self.edge_label = edge_label
def forward(self, logit, label):
logit = paddle.reshape(
logit, [logit.shape[0], logit.shape[1], -1]) # N,C,H,W => N,C,H*W
logit = paddle.transpose(logit, [0, 2, 1]) # N,C,H*W => N,H*W,C
logit = paddle.reshape(logit,
[-1, logit.shape[2]]) # N,H*W,C => N*H*W,C
label = paddle.reshape(label, [-1, 1])
range_ = paddle.arange(0, label.shape[0])
range_ = paddle.unsqueeze(range_, axis=-1)
label = paddle.cast(label, dtype='int64')
label = paddle.concat([range_, label], axis=-1)
logpt = F.log_softmax(logit)
logpt = paddle.gather_nd(logpt, label)
pt = paddle.exp(logpt.detach())
loss = -1 * (1 - pt)**self.gamma * logpt
loss = paddle.mean(loss)
return loss
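
A minimal usage sketch (shapes illustrative):

import paddle

logit = paddle.randn([2, 4, 16, 16])
label = paddle.randint(0, 4, shape=[2, 16, 16], dtype='int64')

loss_fn = FocalLoss(gamma=2.0)
loss = loss_fn(logit, label)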

@@ -0,0 +1,141 @@
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppseg.cvlibs import manager
@manager.LOSSES.add_component
class DualTaskLoss(nn.Layer):
"""
    The dual task loss implementation of GSCNN.

    Args:
        ignore_index (int64): Specifies a target value that is ignored
            and does not contribute to the input gradient. Default ``255``.
        tau (float): The temperature of the Gumbel-Softmax sampling. Default: 0.5.
"""
def __init__(self, ignore_index=255, tau=0.5):
super().__init__()
self.ignore_index = ignore_index
self.tau = tau
def _gumbel_softmax_sample(self, logit, tau=1, eps=1e-10):
"""
Draw a sample from the Gumbel-Softmax distribution
based on
https://github.com/ericjang/gumbel-softmax/blob/3c8584924603869e90ca74ac20a6a03d99a91ef9/Categorical%20VAE.ipynb
(MIT license)
"""
gumbel_noise = paddle.rand(logit.shape)
gumbel_noise = -paddle.log(eps - paddle.log(gumbel_noise + eps))
logit = logit + gumbel_noise
return F.softmax(logit / tau, axis=1)
def compute_grad_mag(self, x):
eps = 1e-6
n, c, h, w = x.shape
if h <= 1 or w <= 1:
raise ValueError(
'The width and height of tensor to compute grad must be greater than 1, but the shape is {}.'
.format(x.shape))
x = self.conv_tri(x, r=4)
kernel = [[-1, 0, 1]]
kernel = paddle.to_tensor(kernel).astype('float32')
kernel = 0.5 * kernel
kernel_x = paddle.concat([kernel.unsqueeze((0, 1))] * c, axis=0)
grad_x = F.conv2d(x, kernel_x, padding='same', groups=c)
kernel_y = paddle.concat([kernel.t().unsqueeze((0, 1))] * c, axis=0)
grad_y = F.conv2d(x, kernel_y, padding='same', groups=c)
mag = paddle.sqrt(grad_x * grad_x + grad_y * grad_y + eps)
return mag / mag.max()
def conv_tri(self, input, r):
"""
Convolves an image by a 2D triangle filter (the 1D triangle filter f is
[1:r r+1 r:-1:1]/(r+1)^2, the 2D version is simply conv2(f,f'))
"""
if r <= 1:
raise ValueError(
'`r` should be greater than 1, but it is {}.'.format(r))
kernel = [
list(range(1, r + 1)) + [r + 1] + list(reversed(range(1, r + 1)))
]
kernel = paddle.to_tensor(kernel).astype('float32')
kernel = kernel / (r + 1)**2
input_ = F.pad(input, [1, 1, 0, 0], mode='replicate')
input_ = F.pad(input_, [r, r, 0, 0], mode='reflect')
input_ = [input_[:, :, :, :r], input, input_[:, :, :, -r:]]
input_ = paddle.concat(input_, axis=3)
tem = input_.clone()
input_ = F.pad(input_, [0, 0, 1, 1], mode='replicate')
input_ = F.pad(input_, [0, 0, r, r], mode='reflect')
input_ = [input_[:, :, :r, :], tem, input_[:, :, -r:, :]]
input_ = paddle.concat(input_, axis=2)
c = input.shape[1]
kernel_x = paddle.concat([kernel.unsqueeze((0, 1))] * c, axis=0)
output = F.conv2d(input_, kernel_x, padding=0, groups=c)
kernel_y = paddle.concat([kernel.t().unsqueeze((0, 1))] * c, axis=0)
output = F.conv2d(output, kernel_y, padding=0, groups=c)
return output
def forward(self, logit, labels):
n, c, h, w = logit.shape
th = 1e-8
eps = 1e-10
if len(labels.shape) == 3:
labels = labels.unsqueeze(1)
mask = (labels != self.ignore_index)
mask.stop_gradient = True
logit = logit * mask
labels = labels * mask
if len(labels.shape) == 4:
labels = labels.squeeze(1)
labels.stop_gradient = True
labels = F.one_hot(labels, logit.shape[1]).transpose((0, 3, 1, 2))
labels.stop_gradient = True
g = self._gumbel_softmax_sample(logit, tau=self.tau)
g = self.compute_grad_mag(g)
g_hat = self.compute_grad_mag(labels)
loss = F.l1_loss(g, g_hat, reduction='none')
loss = loss * mask
g_mask = (g > th).astype('float32')
g_mask.stop_gradient = True
g_mask_sum = paddle.sum(g_mask)
loss_g = paddle.sum(loss * g_mask)
if g_mask_sum > eps:
loss_g = loss_g / g_mask_sum
g_hat_mask = (g_hat > th).astype('float32')
g_hat_mask.stop_gradient = True
g_hat_mask_sum = paddle.sum(g_hat_mask)
loss_g_hat = paddle.sum(loss * g_hat_mask)
if g_hat_mask_sum > eps:
loss_g_hat = loss_g_hat / g_hat_mask_sum
total_loss = 0.5 * loss_g + 0.5 * loss_g_hat
return total_loss
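
A minimal usage sketch (shapes illustrative; the spatial size must be greater than 1 for the gradient-magnitude computation):

import paddle

logit = paddle.randn([2, 4, 16, 16])
labels = paddle.randint(0, 4, shape=[2, 16, 16], dtype='int64')

loss_fn = DualTaskLoss(tau=0.5)
loss = loss_fn(logit, labels)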

@@ -0,0 +1,80 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppseg.cvlibs import manager
@manager.LOSSES.add_component
class KLLoss(nn.Layer):
"""
The implementation of Kullback-Leibler divergence Loss.
Refer to https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence.
Args:
ignore_index (int64): Specifies a target value that is ignored
and does not contribute to the input gradient. Default ``255``.
        temperature (float): The temperature used to soften both logits before computing the KL divergence. Default: 1.
"""
def __init__(self, ignore_index=255, temperature=1):
super().__init__()
self.ignore_index = ignore_index
self.temperature = temperature
self.kl_loss = nn.KLDivLoss(reduction="none")
self.EPS = 1e-8
def forward(self, logit_1, logit_2, label=None):
"""
Calculate the KL loss. If the label is not None, it considers the
ignore_index in label and calculates the masked loss.
Args:
logit_1 (Tensor): Logit tensor, the data type is float32 or float64.
The shape is (N, C), where C is number of classes, and if shape is
more than 2D, this is (N, C, D1, D2,..., Dk), k >= 1.
logit_2 (Tensor): Logit tensor, the data type is float32 or float64.
The shape of logit_2 and logit_1 are the same.
label (Tensor, optional): Label tensor, the data type is int64.
The shape is (N), where each value is 0 <= label[i] <= C-1, and
if shape is more than 2D, this is (N, D1, D2,..., Dk), k >= 1.
Returns:
(Tensor): The average loss.
"""
if logit_1.shape != logit_2.shape:
raise ValueError(
'The shape of logit_1 = {} must be the same as the shape of logit_2 = {}.'
.format(logit_1.shape, logit_2.shape))
logit_1 = F.log_softmax(logit_1 / self.temperature, axis=1)
logit_2 = F.softmax(logit_2 / self.temperature, axis=1)
loss = self.kl_loss(logit_1, logit_2)
loss = loss * self.temperature * self.temperature
if label is None:
avg_loss = paddle.mean(loss)
else:
mask = label != self.ignore_index
mask = paddle.cast(mask, 'float32')
mask = paddle.unsqueeze(mask, axis=1)
label.stop_gradient = True
mask.stop_gradient = True
loss = loss * mask
avg_loss = paddle.mean(loss) / (paddle.mean(mask) + self.EPS)
return avg_loss
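
A minimal usage sketch (the temperature of 4 is illustrative; passing a label masks out ignored pixels):

import paddle

logit_1 = paddle.randn([2, 4, 16, 16])  # e.g. student logits
logit_2 = paddle.randn([2, 4, 16, 16])  # e.g. teacher logits
label = paddle.randint(0, 4, shape=[2, 16, 16], dtype='int64')

loss_fn = KLLoss(temperature=4)
loss = loss_fn(logit_1, logit_2, label)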

@@ -0,0 +1,76 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle import nn
import paddle.nn.functional as F
from paddlers.models.ppseg.cvlibs import manager
@manager.LOSSES.add_component
class L1Loss(nn.L1Loss):
r"""
This interface is used to construct a callable object of the ``L1Loss`` class.
The L1Loss layer calculates the L1 Loss of ``input`` and ``label`` as follows.
If `reduction` set to ``'none'``, the loss is:
.. math::
Out = \lvert input - label\rvert
If `reduction` set to ``'mean'``, the loss is:
.. math::
Out = MEAN(\lvert input - label\rvert)
If `reduction` set to ``'sum'``, the loss is:
.. math::
Out = SUM(\lvert input - label\rvert)
Args:
        reduction (str, optional): Indicate the reduction to apply to the loss,
            the candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
If `reduction` is ``'none'``, the unreduced loss is returned;
If `reduction` is ``'mean'``, the reduced mean loss is returned.
If `reduction` is ``'sum'``, the reduced sum loss is returned.
Default is ``'mean'``.
        ignore_index (int, optional): Specifies a target value that is ignored and does not contribute to the input gradient. It is accepted for interface compatibility and is not used by this loss. Default: 255.
Shape:
input (Tensor): The input tensor. The shapes is [N, *], where N is batch size and `*` means any number of additional dimensions. It's data type should be float32, float64, int32, int64.
label (Tensor): label. The shapes is [N, *], same shape as ``input`` . It's data type should be float32, float64, int32, int64.
output (Tensor): The L1 Loss of ``input`` and ``label``.
If `reduction` is ``'none'``, the shape of output loss is [N, *], the same as ``input`` .
If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1].
Examples:
.. code-block:: python
import paddle
import numpy as np
input_data = np.array([[1.5, 0.8], [0.2, 1.3]]).astype("float32")
label_data = np.array([[1.7, 1], [0.4, 0.5]]).astype("float32")
input = paddle.to_tensor(input_data)
label = paddle.to_tensor(label_data)
l1_loss = paddle.nn.L1Loss()
output = l1_loss(input, label)
print(output.numpy())
# [0.35]
l1_loss = paddle.nn.L1Loss(reduction='sum')
output = l1_loss(input, label)
print(output.numpy())
# [1.4]
l1_loss = paddle.nn.L1Loss(reduction='none')
output = l1_loss(input, label)
print(output)
# [[0.20000005 0.19999999]
# [0.2 0.79999995]]
"""
def __init__(self, reduction='mean', ignore_index=255):
super().__init__(reduction=reduction)

@@ -0,0 +1,222 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Lovasz-Softmax and Jaccard hinge loss in PaddlePaddle"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import paddle
from paddle import nn
import paddle.nn.functional as F
from paddlers.models.ppseg.cvlibs import manager
@manager.LOSSES.add_component
class LovaszSoftmaxLoss(nn.Layer):
"""
Multi-class Lovasz-Softmax loss.
Args:
ignore_index (int64): Specifies a target value that is ignored and does not contribute to the input gradient. Default ``255``.
classes (str|list): 'all' for all, 'present' for classes present in labels, or a list of classes to average.
"""
def __init__(self, ignore_index=255, classes='present'):
super(LovaszSoftmaxLoss, self).__init__()
self.ignore_index = ignore_index
self.classes = classes
def forward(self, logits, labels):
r"""
Forward computation.
Args:
logits (Tensor): Shape is [N, C, H, W], logits at each prediction (between -\infty and +\infty).
labels (Tensor): Shape is [N, 1, H, W] or [N, H, W], ground truth labels (between 0 and C - 1).
"""
probas = F.softmax(logits, axis=1)
vprobas, vlabels = flatten_probas(probas, labels, self.ignore_index)
loss = lovasz_softmax_flat(vprobas, vlabels, classes=self.classes)
return loss
@manager.LOSSES.add_component
class LovaszHingeLoss(nn.Layer):
"""
Binary Lovasz hinge loss.
Args:
ignore_index (int64): Specifies a target value that is ignored and does not contribute to the input gradient. Default ``255``.
"""
def __init__(self, ignore_index=255):
super(LovaszHingeLoss, self).__init__()
self.ignore_index = ignore_index
def forward(self, logits, labels):
r"""
Forward computation.
Args:
logits (Tensor): Shape is [N, 1, H, W] or [N, 2, H, W], logits at each pixel (between -\infty and +\infty).
labels (Tensor): Shape is [N, 1, H, W] or [N, H, W], binary ground truth masks (0 or 1).
"""
if logits.shape[1] == 2:
logits = binary_channel_to_unary(logits)
loss = lovasz_hinge_flat(
*flatten_binary_scores(logits, labels, self.ignore_index))
return loss
def lovasz_grad(gt_sorted):
"""
Computes gradient of the Lovasz extension w.r.t sorted errors.
See Alg. 1 in paper.
"""
gts = paddle.sum(gt_sorted)
p = len(gt_sorted)
intersection = gts - paddle.cumsum(gt_sorted, axis=0)
union = gts + paddle.cumsum(1 - gt_sorted, axis=0)
jaccard = 1.0 - intersection.cast('float32') / union.cast('float32')
if p > 1: # cover 1-pixel case
jaccard[1:p] = jaccard[1:p] - jaccard[0:-1]
return jaccard
def binary_channel_to_unary(logits, eps=1e-9):
"""
Converts binary channel logits to unary channel logits for lovasz hinge loss.
"""
probas = F.softmax(logits, axis=1)
probas = probas[:, 1, :, :]
logits = paddle.log(probas + eps / (1 - probas + eps))
logits = logits.unsqueeze(1)
return logits
def lovasz_hinge_flat(logits, labels):
r"""
Binary Lovasz hinge loss.
Args:
logits (Tensor): Shape is [P], logits at each prediction (between -\infty and +\infty).
labels (Tensor): Shape is [P], binary ground truth labels (0 or 1).
"""
if len(labels) == 0:
# only void pixels, the gradients should be 0
return logits.sum() * 0.
signs = 2. * labels - 1.
signs.stop_gradient = True
errors = 1. - logits * signs
    # sort the errors in descending order and keep the permutation for the labels
    errors_sorted = paddle.sort(errors, axis=0, descending=True)
    perm = paddle.argsort(errors, axis=0, descending=True)
    errors_sorted.stop_gradient = False
gt_sorted = paddle.gather(labels, perm)
grad = lovasz_grad(gt_sorted)
grad.stop_gradient = True
loss = paddle.sum(F.relu(errors_sorted) * grad)
return loss
def flatten_binary_scores(scores, labels, ignore=None):
"""
Flattens predictions in the batch (binary case).
Remove labels according to 'ignore'.
"""
scores = paddle.reshape(scores, [-1])
labels = paddle.reshape(labels, [-1])
labels.stop_gradient = True
if ignore is None:
return scores, labels
valid = labels != ignore
valid_mask = paddle.reshape(valid, (-1, 1))
indexs = paddle.nonzero(valid_mask)
indexs.stop_gradient = True
vscores = paddle.gather(scores, indexs[:, 0])
vlabels = paddle.gather(labels, indexs[:, 0])
return vscores, vlabels
def lovasz_softmax_flat(probas, labels, classes='present'):
"""
Multi-class Lovasz-Softmax loss.
Args:
probas (Tensor): Shape is [P, C], class probabilities at each prediction (between 0 and 1).
labels (Tensor): Shape is [P], ground truth labels (between 0 and C - 1).
classes (str|list): 'all' for all, 'present' for classes present in labels, or a list of classes to average.
"""
if probas.numel() == 0:
# only void pixels, the gradients should be 0
return probas * 0.
C = probas.shape[1]
losses = []
    classes_to_sum = list(range(C)) if classes in ['all', 'present'] else classes
for c in classes_to_sum:
fg = paddle.cast(labels == c, probas.dtype) # foreground for class c
if classes == 'present' and fg.sum() == 0:
continue
fg.stop_gradient = True
if C == 1:
if len(classes_to_sum) > 1:
raise ValueError('Sigmoid output possible only with 1 class')
class_pred = probas[:, 0]
else:
class_pred = probas[:, c]
errors = paddle.abs(fg - class_pred)
        errors_sorted = paddle.sort(errors, axis=0, descending=True)
        perm = paddle.argsort(errors, axis=0, descending=True)
        errors_sorted.stop_gradient = False
fg_sorted = paddle.gather(fg, perm)
fg_sorted.stop_gradient = True
grad = lovasz_grad(fg_sorted)
grad.stop_gradient = True
loss = paddle.sum(errors_sorted * grad)
losses.append(loss)
if len(classes_to_sum) == 1:
return losses[0]
losses_tensor = paddle.stack(losses)
mean_loss = paddle.mean(losses_tensor)
return mean_loss
def flatten_probas(probas, labels, ignore=None):
"""
Flattens predictions in the batch.
"""
if len(probas.shape) == 3:
probas = paddle.unsqueeze(probas, axis=1)
C = probas.shape[1]
probas = paddle.transpose(probas, [0, 2, 3, 1])
probas = paddle.reshape(probas, [-1, C])
labels = paddle.reshape(labels, [-1])
if ignore is None:
return probas, labels
valid = labels != ignore
valid_mask = paddle.reshape(valid, [-1, 1])
indexs = paddle.nonzero(valid_mask)
indexs.stop_gradient = True
vprobas = paddle.gather(probas, indexs[:, 0])
vlabels = paddle.gather(labels, indexs[:, 0])
return vprobas, vlabels
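
A minimal usage sketch of the multi-class variant (shapes illustrative):

import paddle

logits = paddle.randn([2, 4, 16, 16])
labels = paddle.randint(0, 4, shape=[2, 16, 16], dtype='int64')

loss_fn = LovaszSoftmaxLoss(classes='present')
loss = loss_fn(logits, labels)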

@@ -0,0 +1,65 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle import nn
import paddle.nn.functional as F
from paddlers.models.ppseg.cvlibs import manager
@manager.LOSSES.add_component
class MSELoss(nn.MSELoss):
r"""
**Mean Square Error Loss**
Computes the mean square error (squared L2 norm) of given input and label.
If :attr:`reduction` is set to ``'none'``, loss is calculated as:
.. math::
Out = (input - label)^2
If :attr:`reduction` is set to ``'mean'``, loss is calculated as:
.. math::
Out = \operatorname{mean}((input - label)^2)
If :attr:`reduction` is set to ``'sum'``, loss is calculated as:
.. math::
Out = \operatorname{sum}((input - label)^2)
where `input` and `label` are `float32` tensors of same shape.
Args:
reduction (string, optional): The reduction method for the output,
could be 'none' | 'mean' | 'sum'.
If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned.
        If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned.
If :attr:`reduction` is ``'none'``, the unreduced loss is returned.
Default is ``'mean'``.
        ignore_index (int, optional): Specifies a target value that is ignored and does not contribute to the input gradient. It is accepted for interface compatibility and is not used by this loss. Default: 255.
Shape:
input (Tensor): Input tensor, the data type is float32 or float64
label (Tensor): Label tensor, the data type is float32 or float64
output (Tensor): output tensor storing the MSE loss of input and label, the data type is same as input.
Examples:
.. code-block:: python
import numpy as np
import paddle
input_data = np.array([1.5]).astype("float32")
label_data = np.array([1.7]).astype("float32")
mse_loss = paddle.nn.loss.MSELoss()
input = paddle.to_tensor(input_data)
label = paddle.to_tensor(label_data)
output = mse_loss(input, label)
print(output)
# [0.04000002]
"""
def __init__(self, reduction='mean', ignore_index=255):
super().__init__(reduction=reduction)

@@ -0,0 +1,57 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle
from paddle import nn
import paddle.nn.functional as F
from paddlers.models.ppseg.cvlibs import manager
@manager.LOSSES.add_component
class MixedLoss(nn.Layer):
"""
    Weighted computation for multiple losses.
    The advantage is that mixed-loss training can be achieved without changing the networking code.

    Args:
        losses (list[nn.Layer]): A list consisting of multiple loss classes.
        coef (list[float|int]): Weighting coefficients of the multiple losses.

    Returns:
        A callable object of MixedLoss.
"""
def __init__(self, losses, coef):
super(MixedLoss, self).__init__()
if not isinstance(losses, list):
raise TypeError('`losses` must be a list!')
if not isinstance(coef, list):
raise TypeError('`coef` must be a list!')
len_losses = len(losses)
len_coef = len(coef)
if len_losses != len_coef:
raise ValueError(
'The length of `losses` should equal to `coef`, but they are {} and {}.'
.format(len_losses, len_coef))
self.losses = losses
self.coef = coef
def forward(self, logits, labels):
loss_list = []
for i, loss in enumerate(self.losses):
output = loss(logits, labels)
loss_list.append(output * self.coef[i])
return loss_list
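
A minimal usage sketch; it assumes the sibling `CrossEntropyLoss` and `DiceLoss` components from this losses package are in scope, and the 0.8/0.2 weighting is illustrative. Note that `forward` returns a list of weighted terms, which the caller sums:

import paddle

logits = paddle.randn([2, 4, 16, 16])
labels = paddle.randint(0, 4, shape=[2, 16, 16], dtype='int64')

mixed = MixedLoss(losses=[CrossEntropyLoss(), DiceLoss()], coef=[0.8, 0.2])
loss_list = mixed(logits, labels)  # one weighted term per sub-loss
total = sum(loss_list)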

@@ -0,0 +1,99 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle import nn
import paddle.nn.functional as F
from paddlers.models.ppseg.cvlibs import manager
@manager.LOSSES.add_component
class OhemCrossEntropyLoss(nn.Layer):
"""
Implements the ohem cross entropy loss function.
Args:
thresh (float, optional): The threshold of ohem. Default: 0.7.
min_kept (int, optional): The min number to keep in loss computation. Default: 10000.
ignore_index (int64, optional): Specifies a target value that is ignored
and does not contribute to the input gradient. Default ``255``.
"""
def __init__(self, thresh=0.7, min_kept=10000, ignore_index=255):
super(OhemCrossEntropyLoss, self).__init__()
self.thresh = thresh
self.min_kept = min_kept
self.ignore_index = ignore_index
self.EPS = 1e-5
def forward(self, logit, label):
"""
Forward computation.
Args:
logit (Tensor): Logit tensor, the data type is float32, float64. Shape is
(N, C), where C is number of classes, and if shape is more than 2D, this
is (N, C, D1, D2,..., Dk), k >= 1.
label (Tensor): Label tensor, the data type is int64. Shape is (N), where each
value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is
(N, D1, D2,..., Dk), k >= 1.
"""
if len(label.shape) != len(logit.shape):
label = paddle.unsqueeze(label, 1)
# get the label after ohem
n, c, h, w = logit.shape
label = label.reshape((-1, ))
valid_mask = (label != self.ignore_index).astype('int64')
num_valid = valid_mask.sum()
label = label * valid_mask
prob = F.softmax(logit, axis=1)
prob = prob.transpose((1, 0, 2, 3)).reshape((c, -1))
if self.min_kept < num_valid and num_valid > 0:
            # push ignored positions above 1 so they are never selected
            prob = prob + (1 - valid_mask)
            # get the probability of the ground-truth class
label_onehot = F.one_hot(label, c)
label_onehot = label_onehot.transpose((1, 0))
prob = prob * label_onehot
prob = paddle.sum(prob, axis=0)
threshold = self.thresh
if self.min_kept > 0:
index = prob.argsort()
threshold_index = index[min(len(index), self.min_kept) - 1]
threshold_index = int(threshold_index.numpy()[0])
if prob[threshold_index] > self.thresh:
threshold = prob[threshold_index]
kept_mask = (prob < threshold).astype('int64')
label = label * kept_mask
valid_mask = valid_mask * kept_mask
        # mark the invalid region as ignored
label = label + (1 - valid_mask) * self.ignore_index
label = label.reshape((n, 1, h, w))
valid_mask = valid_mask.reshape((n, 1, h, w)).astype('float32')
loss = F.softmax_with_cross_entropy(
logit, label, ignore_index=self.ignore_index, axis=1)
loss = loss * valid_mask
avg_loss = paddle.mean(loss) / (paddle.mean(valid_mask) + self.EPS)
label.stop_gradient = True
valid_mask.stop_gradient = True
return avg_loss
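
A minimal usage sketch (shapes and `min_kept` are illustrative; OHEM only kicks in when the number of valid pixels exceeds `min_kept`):

import paddle

logit = paddle.randn([2, 4, 64, 64])
label = paddle.randint(0, 4, shape=[2, 64, 64], dtype='int64')

loss_fn = OhemCrossEntropyLoss(thresh=0.7, min_kept=1000)
loss = loss_fn(logit, label)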

@@ -0,0 +1,114 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle import nn
import paddle.nn.functional as F
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.models import losses
@manager.LOSSES.add_component
class OhemEdgeAttentionLoss(nn.Layer):
"""
    Implements the cross entropy loss function, computed only on the edge part.

    Args:
        edge_threshold (float, optional): Pixels whose edge probability is greater than edge_threshold are treated as edges. Default: 0.8.
thresh (float, optional): The threshold of ohem. Default: 0.7.
min_kept (int, optional): The min number to keep in loss computation. Default: 5000.
ignore_index (int64, optional): Specifies a target value that is ignored
and does not contribute to the input gradient. Default ``255``.
"""
def __init__(self,
edge_threshold=0.8,
thresh=0.7,
min_kept=5000,
ignore_index=255):
super().__init__()
self.edge_threshold = edge_threshold
self.thresh = thresh
self.min_kept = min_kept
self.ignore_index = ignore_index
self.EPS = 1e-10
def forward(self, logits, label):
"""
Forward computation.
Args:
            logits (tuple|list): (seg_logit, edge_logit) Tensors, the data type is float32, float64. Shape is
                (N, C), where C is number of classes, and if shape is more than 2D, this
                is (N, C, D1, D2,..., Dk), k >= 1. For edge_logit, C is 1.
label (Tensor): Label tensor, the data type is int64. Shape is (N, C), where each
value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is
(N, C, D1, D2,..., Dk), k >= 1.
"""
seg_logit, edge_logit = logits[0], logits[1]
if len(label.shape) != len(seg_logit.shape):
label = paddle.unsqueeze(label, 1)
if edge_logit.shape != label.shape:
raise ValueError(
'The shape of edge_logit should equal to the label, but they are {} != {}'
.format(edge_logit.shape, label.shape))
# Filter out edge
filler = paddle.ones_like(label) * self.ignore_index
label = paddle.where(edge_logit > self.edge_threshold, label, filler)
# ohem
n, c, h, w = seg_logit.shape
label = label.reshape((-1, ))
valid_mask = (label != self.ignore_index).astype('int64')
num_valid = valid_mask.sum()
label = label * valid_mask
prob = F.softmax(seg_logit, axis=1)
prob = prob.transpose((1, 0, 2, 3)).reshape((c, -1))
if self.min_kept < num_valid and num_valid > 0:
            # push ignored positions above 1 so they are never selected
            prob = prob + (1 - valid_mask)
            # get the probability of the ground-truth class
label_onehot = F.one_hot(label, c)
label_onehot = label_onehot.transpose((1, 0))
prob = prob * label_onehot
prob = paddle.sum(prob, axis=0)
threshold = self.thresh
if self.min_kept > 0:
index = prob.argsort()
threshold_index = index[min(len(index), self.min_kept) - 1]
threshold_index = int(threshold_index.numpy()[0])
if prob[threshold_index] > self.thresh:
threshold = prob[threshold_index]
kept_mask = (prob < threshold).astype('int64')
label = label * kept_mask
valid_mask = valid_mask * kept_mask
        # mark the invalid region as ignored
label = label + (1 - valid_mask) * self.ignore_index
label = label.reshape((n, 1, h, w))
valid_mask = valid_mask.reshape((n, 1, h, w)).astype('float32')
loss = F.softmax_with_cross_entropy(
seg_logit, label, ignore_index=self.ignore_index, axis=1)
loss = loss * valid_mask
avg_loss = paddle.mean(loss) / (paddle.mean(valid_mask) + self.EPS)
label.stop_gradient = True
valid_mask.stop_gradient = True
return avg_loss

@@ -0,0 +1,199 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppseg.cvlibs import manager
@manager.LOSSES.add_component
class PixelContrastCrossEntropyLoss(nn.Layer):
"""
The PixelContrastCrossEntropyLoss implementation based on PaddlePaddle.
The original article refers to
Wenguan Wang, Tianfei Zhou, et al. "Exploring Cross-Image Pixel Contrast for Semantic Segmentation"
(https://arxiv.org/abs/2101.11939).
Args:
        temperature (float, optional): Controls the numerical similarity of features. Default: 0.1.
        base_temperature (float, optional): Controls the numerical range of the contrast loss. Default: 0.07.
        ignore_index (int, optional): Specifies a target value that is ignored
            and does not contribute to the input gradient. Default 255.
        max_samples (int, optional): Max sampling anchors. Default: 1024.
        max_views (int): Number of sampled pixels per class. Default: 100.
"""
def __init__(self,
temperature=0.1,
base_temperature=0.07,
ignore_index=255,
max_samples=1024,
max_views=100):
super().__init__()
self.temperature = temperature
self.base_temperature = base_temperature
self.ignore_index = ignore_index
self.max_samples = max_samples
self.max_views = max_views
def _hard_anchor_sampling(self, X, y_hat, y):
"""
Args:
X (Tensor): reshaped feats, shape = [N, H * W, feat_channels]
y_hat (Tensor): reshaped label, shape = [N, H * W]
y (Tensor): reshaped predict, shape = [N, H * W]
"""
batch_size, feat_dim = paddle.shape(X)[0], paddle.shape(X)[-1]
classes = []
total_classes = 0
for i in range(batch_size):
current_y = y_hat[i]
current_classes = paddle.unique(current_y)
current_classes = [
x for x in current_classes if x != self.ignore_index
]
current_classes = [
x for x in current_classes
if (current_y == x).nonzero().shape[0] > self.max_views
]
classes.append(current_classes)
total_classes += len(current_classes)
n_view = self.max_samples // total_classes
n_view = min(n_view, self.max_views)
X_ = []
y_ = paddle.zeros([total_classes], dtype='float32')
X_ptr = 0
for i in range(batch_size):
this_y_hat = y_hat[i]
current_y = y[i]
current_classes = classes[i]
for cls_id in current_classes:
hard_indices = paddle.logical_and(
(this_y_hat == cls_id), (current_y != cls_id)).nonzero()
easy_indices = paddle.logical_and(
(this_y_hat == cls_id), (current_y == cls_id)).nonzero()
num_hard = hard_indices.shape[0]
num_easy = easy_indices.shape[0]
if num_hard >= n_view / 2 and num_easy >= n_view / 2:
num_hard_keep = n_view // 2
num_easy_keep = n_view - num_hard_keep
elif num_hard >= n_view / 2:
num_easy_keep = num_easy
num_hard_keep = n_view - num_easy_keep
else:
num_hard_keep = num_hard
num_easy_keep = n_view - num_hard_keep
indices = None
if num_hard > 0:
perm = paddle.randperm(num_hard)
hard_indices = hard_indices[perm[:num_hard_keep]].reshape(
(-1, hard_indices.shape[-1]))
indices = hard_indices
if num_easy > 0:
perm = paddle.randperm(num_easy)
easy_indices = easy_indices[perm[:num_easy_keep]].reshape(
(-1, easy_indices.shape[-1]))
if indices is None:
indices = easy_indices
else:
indices = paddle.concat((indices, easy_indices), axis=0)
if indices is None:
                    raise RuntimeError('hard anchor sampling returned no indices')
X_.append(paddle.index_select(X[i, :, :], indices.squeeze(1)))
y_[X_ptr] = float(cls_id)
X_ptr += 1
X_ = paddle.stack(X_, axis=0)
return X_, y_
def _contrastive(self, feats_, labels_):
"""
Args:
feats_ (Tensor): sampled pixel, shape = [total_classes, n_view, feat_dim], total_classes = batch_size * single image classes
labels_ (Tensor): label, shape = [total_classes]
"""
anchor_num, n_view = feats_.shape[0], feats_.shape[1]
labels_ = labels_.reshape((-1, 1))
mask = paddle.equal(labels_, paddle.transpose(labels_,
[1, 0])).astype('float32')
contrast_count = n_view
contrast_feature = paddle.concat(paddle.unbind(feats_, axis=1), axis=0)
anchor_feature = contrast_feature
anchor_count = contrast_count
anchor_dot_contrast = paddle.matmul(
anchor_feature, paddle.transpose(contrast_feature,
[1, 0])) / self.temperature
logits_max = paddle.max(anchor_dot_contrast, axis=1, keepdim=True)
logits = anchor_dot_contrast - logits_max
mask = paddle.tile(mask, [anchor_count, contrast_count])
neg_mask = 1 - mask
logits_mask = 1 - paddle.eye(mask.shape[0]).astype('float32')
mask = mask * logits_mask
neg_logits = paddle.exp(logits) * neg_mask
neg_logits = neg_logits.sum(1, keepdim=True)
exp_logits = paddle.exp(logits)
log_prob = logits - paddle.log(exp_logits + neg_logits)
mean_log_prob_pos = (mask * log_prob).sum(1) / mask.sum(1)
loss = -(self.temperature / self.base_temperature) * mean_log_prob_pos
loss = loss.mean()
return loss
def contrast_criterion(self, feats, labels=None, predict=None):
labels = labels.unsqueeze(1)
labels = F.interpolate(labels, feats.shape[2:], mode='nearest')
labels = labels.squeeze(1)
batch_size = feats.shape[0]
labels = labels.reshape((batch_size, -1))
predict = predict.reshape((batch_size, -1))
feats = paddle.transpose(feats, [0, 2, 3, 1])
feats = feats.reshape((feats.shape[0], -1, feats.shape[-1]))
feats_, labels_ = self._hard_anchor_sampling(feats, labels, predict)
loss = self._contrastive(feats_, labels_)
return loss
def forward(self, preds, label):
assert "seg" in preds, "The input of PixelContrastCrossEntropyLoss should include 'seg' output, but not found."
assert "embed" in preds, "The input of PixelContrastCrossEntropyLoss should include 'embed' output, but not found."
seg = preds['seg']
embedding = preds['embed']
predict = paddle.argmax(seg, axis=1)
loss = self.contrast_criterion(embedding, label, predict)
return loss
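
A minimal sketch of the expected input structure: the model must expose a 'seg' logit head and an 'embed' pixel-embedding head. The shapes, the embedding width, and the float label cast are illustrative assumptions:

import paddle

preds = {
    'seg': paddle.randn([2, 4, 32, 32]),     # segmentation logits
    'embed': paddle.randn([2, 16, 32, 32]),  # per-pixel embeddings
}
# float labels so F.interpolate inside contrast_criterion accepts them
label = paddle.randint(0, 4, shape=[2, 32, 32]).astype('float32')

loss_fn = PixelContrastCrossEntropyLoss(max_views=50)
loss = loss_fn(preds, label)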

@@ -0,0 +1,160 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle import nn
import paddle.nn.functional as F
from paddlers.models.ppseg.cvlibs import manager
@manager.LOSSES.add_component
class PointCrossEntropyLoss(nn.Layer):
"""
Implements the point cross entropy loss function.
The original article refers to
Kirillov A, Wu Y, He K, et al. "PointRend: Image Segmentation As Rendering."
(https://arxiv.org/abs/1912.08193).
Args:
weight (tuple|list|ndarray|Tensor, optional): A manual rescaling weight
given to each class. Its length must be equal to the number of classes.
Default ``None``.
ignore_index (int64, optional): Specifies a target value that is ignored
and does not contribute to the input gradient. Default ``255``.
top_k_percent_pixels (float, optional): the value lies in [0.0, 1.0]. When its value < 1.0, only compute the loss for
the top k percent pixels (e.g., the top 20% pixels). This is useful for hard pixel mining. Default ``1.0``.
data_format (str, optional): The tensor format to use, 'NCHW' or 'NHWC'. Default ``'NCHW'``.
"""
def __init__(self,
weight=None,
ignore_index=255,
top_k_percent_pixels=1.0,
data_format='NCHW',
align_corners = False):
super(PointCrossEntropyLoss, self).__init__()
if weight is not None:
weight = paddle.to_tensor(weight, dtype='float32')
self.weight = weight
self.ignore_index = ignore_index
self.top_k_percent_pixels = top_k_percent_pixels
self.EPS = 1e-8
self.data_format = data_format
self.align_corners = align_corners
def forward(self, logits, label, semantic_weights=None):
"""
Forward computation.
Args:
            logits (tuple|list): (logit, points). logit's shape is [N, C, point_num] and
                points' shape is [N, point_num, 2], where C is the number of classes.
label (Tensor): Label tensor, the data type is int64. Shape is (N), where each
value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is
(N, D1, D2,..., Dk), k >= 1.
semantic_weights (Tensor, optional): Weights about loss for each pixels, shape is the same as label. Default: None.
"""
# for loss
logit, points = logits # [N, C, point_num],[N, point_num, 2]
label = label.unsqueeze(1) # [N,1,H,W]
label = point_sample(
label.astype('float32'),
points,
mode='nearest',
align_corners=self.align_corners) # [N, 1, point_num]
label = paddle.squeeze(label,axis=1).astype('int64') # [N, xx]
channel_axis = 1 if self.data_format == 'NCHW' else -1
if self.weight is not None and logit.shape[channel_axis] != len(
self.weight):
raise ValueError(
'The number of weights = {} must be the same as the number of classes = {}.'
.format(len(self.weight), logit.shape[1]))
logit = paddle.transpose(logit, [0, 2, 1])
no_ignore_label = label
loss = F.cross_entropy(
logit,
no_ignore_label,
ignore_index=self.ignore_index,
reduction='none')
mask = label != self.ignore_index
mask = paddle.cast(mask, 'float32')
loss = loss * mask
if semantic_weights is not None:
loss = loss * semantic_weights
if self.weight is not None:
_one_hot = F.one_hot(label, logit.shape[-1])
_one_hot_weight = _one_hot * self.weight
loss = loss * _one_hot_weight.argmax(-1)
coef = paddle.sum(_one_hot_weight, axis=-1)
else:
coef = paddle.ones_like(label)
label.stop_gradient = True
mask.stop_gradient = True
if self.top_k_percent_pixels == 1.0:
avg_loss = paddle.mean(loss) / (paddle.mean(mask * coef) + self.EPS)
return avg_loss
loss = loss.reshape((-1, ))
top_k_pixels = int(self.top_k_percent_pixels * loss.numel())
loss, indices = paddle.topk(loss, top_k_pixels)
coef = coef.reshape((-1, ))
coef = paddle.gather(coef, indices)
coef.stop_gradient = True
return loss.mean() / (paddle.mean(coef) + self.EPS)
def point_sample(input, points, align_corners=False, **kwargs):
"""A wrapper around :func:`grid_sample` to support 3D point_coords tensors
Unlike :func:`torch.nn.functional.grid_sample` it assumes point_coords to
lie inside ``[0, 1] x [0, 1]`` square.
Args:
input (Tensor): Feature map, shape (N, C, H, W).
points (Tensor): Image based absolute point coordinates (normalized),
range [0, 1] x [0, 1], shape (N, P, 2) or (N, Hgrid, Wgrid, 2).
align_corners (bool): Whether align_corners. Default: False
Returns:
Tensor: Features of `point` on `input`, shape (N, C, P) or
(N, C, Hgrid, Wgrid).
"""
def denormalize(grid):
"""Denormalize input grid from range [0, 1] to [-1, 1]
Args:
grid (Tensor): The grid to be denormalize, range [0, 1].
Returns:
Tensor: Denormalized grid, range [-1, 1].
"""
return grid * 2.0 - 1.0
add_dim = False
if points.dim() == 3:
add_dim = True
points = paddle.unsqueeze(points,axis=2) # [2, 2048, 1, 2]
output = F.grid_sample(
input, denormalize(points), align_corners=align_corners, **kwargs)
if add_dim:
output = paddle.squeeze(output,axis=3)
return output
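
A minimal usage sketch; the first argument pairs per-point logits with their normalized sampling coordinates (shapes illustrative):

import paddle

logit = paddle.randn([2, 4, 128])   # N, C, point_num
points = paddle.rand([2, 128, 2])   # normalized coordinates in [0, 1]
label = paddle.randint(0, 4, shape=[2, 32, 32], dtype='int64')

loss_fn = PointCrossEntropyLoss()
loss = loss_fn((logit, points), label)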

@@ -0,0 +1,256 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""rmi loss in PaddlePaddle"""
import numpy
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppseg.cvlibs import manager
_euler_num = 2.718281828
_pi = 3.14159265
_ln_2_pi = 1.837877
_CLIP_MIN = 1e-6
_CLIP_MAX = 1.0
_POS_ALPHA = 5e-4
_IS_SUM = 1
@manager.LOSSES.add_component
class RMILoss(nn.Layer):
"""
    Implements the Region Mutual Information (RMI) loss
    (https://arxiv.org/abs/1910.12037) for semantic segmentation.

    Unlike the vanilla RMI loss, which also contains a cross entropy term, this
    implementation keeps only the RMI-related part. The motivation is to allow a
    more flexible combination of losses during training: for example, a mixed loss
    can merge RMI loss with bootstrapped cross entropy loss to combine online hard
    example mining with attention to region information.

    Args:
        num_classes (int, optional): The number of classes. Default: 19.
        rmi_radius (int, optional): The radius of the RMI region, in [1, 10]. Default: 3.
        rmi_pool_way (int, optional): The downsampling method, in [0, 1, 2, 3]. Default: 0.
        rmi_pool_size (int, optional): The downsampling kernel size; must equal rmi_pool_stride. Default: 3.
        rmi_pool_stride (int, optional): The downsampling stride. Default: 3.
        loss_weight_lambda (float, optional): The weight lambda for the loss. Default: 0.5.
        ignore_index (int64, optional): Specifies a target value that is ignored
            and does not contribute to the input gradient. Default ``255``.
"""
def __init__(self,
num_classes=19,
rmi_radius=3,
rmi_pool_way=0,
rmi_pool_size=3,
rmi_pool_stride=3,
loss_weight_lambda=0.5,
ignore_index=255):
super(RMILoss, self).__init__()
self.num_classes = num_classes
assert rmi_radius in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
self.rmi_radius = rmi_radius
assert rmi_pool_way in [0, 1, 2, 3]
self.rmi_pool_way = rmi_pool_way
assert rmi_pool_size == rmi_pool_stride
self.rmi_pool_size = rmi_pool_size
self.rmi_pool_stride = rmi_pool_stride
self.weight_lambda = loss_weight_lambda
self.half_d = self.rmi_radius * self.rmi_radius
self.d = 2 * self.half_d
self.kernel_padding = self.rmi_pool_size // 2
self.ignore_index = ignore_index
def forward(self, logits_4D, labels_4D, do_rmi=True):
"""
Forward computation.
Args:
logits (Tensor): Shape is [N, C, H, W], logits at each prediction (between -\infty and +\infty).
labels (Tensor): Shape is [N, H, W], ground truth labels (between 0 and C - 1).
"""
logits_4D = paddle.cast(logits_4D, dtype='float32')
labels_4D = paddle.cast(labels_4D, dtype='float32')
loss = self.forward_sigmoid(logits_4D, labels_4D, do_rmi=do_rmi)
return loss
def forward_sigmoid(self, logits_4D, labels_4D, do_rmi=False):
"""
Compute the RMI loss from sigmoid-activated probabilities.
Args:
logits_4D : [N, C, H, W], dtype=float32
labels_4D : [N, H, W], dtype=long
do_rmi : bool
"""
label_mask_3D = labels_4D != self.ignore_index
valid_onehot_labels_4D = paddle.cast(
F.one_hot(
paddle.cast(labels_4D, dtype='int64') * paddle.cast(
label_mask_3D, dtype='int64'),
num_classes=self.num_classes),
dtype='float32')
# label_mask_flat = paddle.cast(
# paddle.reshape(label_mask_3D, [-1]), dtype='float32')
valid_onehot_labels_4D = valid_onehot_labels_4D * paddle.unsqueeze(
label_mask_3D, axis=3)
valid_onehot_labels_4D.stop_gradient = True
probs_4D = F.sigmoid(logits_4D) * paddle.unsqueeze(
label_mask_3D, axis=1) + _CLIP_MIN
valid_onehot_labels_4D = paddle.transpose(valid_onehot_labels_4D,
[0, 3, 1, 2])
valid_onehot_labels_4D.stop_gradient = True
rmi_loss = self.rmi_lower_bound(valid_onehot_labels_4D, probs_4D)
return rmi_loss
def inverse(self, x):
return paddle.inverse(x)
def rmi_lower_bound(self, labels_4D, probs_4D):
"""
Calculate the lower bound of the region mutual information.
Args:
labels_4D : [N, C, H, W], dtype=float32
probs_4D : [N, C, H, W], dtype=float32
"""
assert labels_4D.shape == probs_4D.shape, \
'shapes do not match: {} vs {}'.format(labels_4D.shape, probs_4D.shape)
p, s = self.rmi_pool_size, self.rmi_pool_stride
if self.rmi_pool_stride > 1:
if self.rmi_pool_way == 0:
labels_4D = F.max_pool2d(
labels_4D,
kernel_size=p,
stride=s,
padding=self.kernel_padding)
probs_4D = F.max_pool2d(
probs_4D,
kernel_size=p,
stride=s,
padding=self.kernel_padding)
elif self.rmi_pool_way == 1:
labels_4D = F.avg_pool2d(
labels_4D,
kernel_size=p,
stride=s,
padding=self.kernel_padding)
probs_4D = F.avg_pool2d(
probs_4D,
kernel_size=p,
stride=s,
padding=self.kernel_padding)
elif self.rmi_pool_way == 2:
shape = labels_4D.shape
new_h, new_w = shape[2] // s, shape[3] // s
labels_4D = F.interpolate(
labels_4D, size=(new_h, new_w), mode='nearest')
probs_4D = F.interpolate(
probs_4D,
size=(new_h, new_w),
mode='bilinear',
align_corners=True)
else:
raise NotImplementedError("Pool way of RMI is not defined!")
label_shape = labels_4D.shape
n, c = label_shape[0], label_shape[1]
la_vectors, pr_vectors = self.map_get_pairs(
labels_4D, probs_4D, radius=self.rmi_radius, is_combine=0)
la_vectors = paddle.reshape(la_vectors, [n, c, self.half_d, -1])
la_vectors = paddle.cast(la_vectors, dtype='float64')
la_vectors.stop_gradient = True
pr_vectors = paddle.reshape(pr_vectors, [n, c, self.half_d, -1])
pr_vectors = paddle.cast(pr_vectors, dtype='float64')
diag_matrix = paddle.unsqueeze(
paddle.unsqueeze(paddle.eye(self.half_d), axis=0), axis=0)
la_vectors = la_vectors - paddle.mean(la_vectors, axis=3, keepdim=True)
la_cov = paddle.matmul(la_vectors,
paddle.transpose(la_vectors, [0, 1, 3, 2]))
pr_vectors = pr_vectors - paddle.mean(pr_vectors, axis=3, keepdim=True)
pr_cov = paddle.matmul(pr_vectors,
paddle.transpose(pr_vectors, [0, 1, 3, 2]))
pr_cov_inv = self.inverse(
pr_cov + paddle.cast(diag_matrix, dtype='float64') * _POS_ALPHA)
la_pr_cov = paddle.matmul(la_vectors,
paddle.transpose(pr_vectors, [0, 1, 3, 2]))
appro_var = la_cov - paddle.matmul(
paddle.matmul(la_pr_cov, pr_cov_inv),
paddle.transpose(la_pr_cov, [0, 1, 3, 2]))
rmi_now = 0.5 * self.log_det_by_cholesky(
appro_var + paddle.cast(diag_matrix, dtype='float64') * _POS_ALPHA)
rmi_per_class = paddle.cast(
paddle.mean(
paddle.reshape(rmi_now, [-1, self.num_classes]), axis=0),
dtype='float32')
rmi_per_class = paddle.divide(rmi_per_class,
paddle.to_tensor(float(self.half_d)))
rmi_loss = paddle.sum(rmi_per_class) if _IS_SUM else paddle.mean(
rmi_per_class)
return rmi_loss
def log_det_by_cholesky(self, matrix):
"""
Args:
matrix (Tensor): A positive-definite matrix of shape [N, C, D, D].
"""
chol = paddle.cholesky(matrix)
diag = paddle.diagonal(chol, offset=0, axis1=-2, axis2=-1)
chol = paddle.log(diag + 1e-8)
return 2.0 * paddle.sum(chol, axis=-1)
def map_get_pairs(self, labels_4D, probs_4D, radius=3, is_combine=True):
"""
Args:
labels_4D : labels, shape [N, C, H, W]
probs_4D : probabilities, shape [N, C, H, W]
radius : the square radius
Return:
If is_combine is True, one tensor of shape [N, C, 2 * radius * radius, H - (radius - 1), W - (radius - 1)];
otherwise a (labels, probs) pair of tensors, each of shape [N, C, radius * radius, H - (radius - 1), W - (radius - 1)].
"""
label_shape = labels_4D.shape
h, w = label_shape[2], label_shape[3]
new_h, new_w = h - (radius - 1), w - (radius - 1)
la_ns = []
pr_ns = []
for y in range(0, radius, 1):
for x in range(0, radius, 1):
la_now = labels_4D[:, :, y:y + new_h, x:x + new_w]
pr_now = probs_4D[:, :, y:y + new_h, x:x + new_w]
la_ns.append(la_now)
pr_ns.append(pr_now)
if is_combine:
pair_ns = la_ns + pr_ns
p_vectors = paddle.stack(pair_ns, axis=2)
return p_vectors
else:
la_vectors = paddle.stack(la_ns, axis=2)
pr_vectors = paddle.stack(pr_ns, axis=2)
return la_vectors, pr_vectors
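
In rmi_lower_bound above, the per-class term is 0.5 * log det(cov_y - cov_yp * inv(cov_p) * cov_yp^T), evaluated through the Cholesky-based log-determinant for numerical stability. A minimal usage sketch of the loss (shapes and class count are illustrative assumptions, not part of this commit):

import paddle

rmi = RMILoss(num_classes=19)
logits = paddle.rand([2, 19, 96, 96])             # (N, C, H, W) raw logits
labels = paddle.randint(0, 19, [2, 96, 96])       # (N, H, W) ground-truth class ids
loss = rmi(logits, labels)                        # scalar RMI lower-bound term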

@@ -0,0 +1,175 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import cv2
import numpy as np
import paddle
from paddle import nn
import paddle.nn.functional as F
from paddlers.models.ppseg.cvlibs import manager
@manager.LOSSES.add_component
class SemanticConnectivityLoss(nn.Layer):
'''
SCL (Semantic Connectivity-aware Learning) framework, which introduces an SC Loss (Semantic Connectivity-aware Loss)
to improve the quality of segmentation results from the perspective of connectivity. Multi-class segmentation is supported.
The original article refers to
Lutao Chu, Yi Liu, Zewu Wu, Shiyu Tang, Guowei Chen, Yuying Hao, Juncai Peng, Zhiliang Yu, Zeyu Chen, Baohua Lai, Haoyi Xiong.
"PP-HumanSeg: Connectivity-Aware Portrait Segmentation with a Large-Scale Teleconferencing Video Dataset"
In WACV 2022 workshop
https://arxiv.org/abs/2112.07146
Running process:
Step 1. Connected Components Calculation
Step 2. Connected Components Matching and SC Loss Calculation
'''
def __init__(self, ignore_index=255, max_pred_num_conn=10, use_argmax=True):
'''
Args:
ignore_index (int): A pixel value in the annotated image to be ignored, which does not contribute to
the input gradient. Pixels that cannot (or are hard to) be labeled can be marked with this value and
are then excluded from the loss computation. Default: 255.
max_pred_num_conn (int): Maximum number of predicted connected components. Early in training there can
be a very large number of connected components, making the computation very time-consuming. The number
of predicted components is therefore capped, and the rest do not participate in the calculation. Default: 10.
use_argmax (bool): Whether to apply argmax to the logits. Default: True.
'''
super().__init__()
self.ignore_index = ignore_index
self.max_pred_num_conn = max_pred_num_conn
self.use_argmax = use_argmax
def forward(self, logits, labels):
'''
Args:
logits (Tensor): [N, C, H, W]
labels (Tensor): [N, H, W]
'''
preds = paddle.argmax(logits, axis=1) if self.use_argmax else logits
preds_np = preds.astype('uint8').numpy()
labels_np = labels.astype('uint8').numpy()
preds = paddle.to_tensor(preds, 'float32', stop_gradient=False)
multi_class_sc_loss = paddle.zeros([preds.shape[0]])
zero = paddle.to_tensor([0.]) # for accelerating
# Traverse each image
for i in range(preds.shape[0]):
sc_loss = 0
class_num = 0
pred_i = preds[i]
preds_np_i = preds_np[i]
labels_np_i = labels_np[i]
# Traverse each class
for class_ in np.unique(labels_np_i):
if class_ == self.ignore_index:
continue
class_num += 1
# Connected Components Calculation
preds_np_class = preds_np_i == class_
labels_np_class = labels_np_i == class_
pred_num_conn, pred_conn = cv2.connectedComponents(
preds_np_class.astype(np.uint8)) # pred_conn.shape = [H,W]
label_num_conn, label_conn = cv2.connectedComponents(
labels_np_class.astype(np.uint8))
if pred_num_conn > 2 * label_num_conn:
pred_num_conn = min(pred_num_conn, self.max_pred_num_conn)
real_pred_num = pred_num_conn - 1
real_label_num = label_num_conn - 1
# Connected Components Matching and SC Loss Calculation
if real_label_num > 0 and real_pred_num > 0:
img_connectivity = compute_class_connectiveity(
pred_conn, label_conn, pred_num_conn, label_num_conn,
pred_i, real_label_num, real_pred_num, zero)
sc_loss += 1 - img_connectivity
elif real_label_num == 0 and real_pred_num == 0:
# if no connected component, SC Loss = 0, so pass
pass
else:
preds_class = pred_i == int(class_)
not_preds_class = paddle.bitwise_not(preds_class)
labels_class = paddle.to_tensor(labels_np_class)
missed_detect = labels_class * not_preds_class
missed_detect_area = paddle.sum(missed_detect).astype(
'float32')
sc_loss += missed_detect_area / missed_detect.numel() + 1
multi_class_sc_loss[
i] = sc_loss / class_num if class_num != 0 else 0
multi_class_sc_loss = paddle.mean(multi_class_sc_loss)
return multi_class_sc_loss
def compute_class_connectiveity(pred_conn, label_conn, pred_num_conn,
label_num_conn, pred, real_label_num,
real_pred_num, zero):
pred_conn = paddle.to_tensor(pred_conn)
label_conn = paddle.to_tensor(label_conn)
pred_conn = F.one_hot(pred_conn, pred_num_conn)
label_conn = F.one_hot(label_conn, label_num_conn)
ious = paddle.zeros((real_label_num, real_pred_num))
pair_conn_sum = paddle.to_tensor([0.], stop_gradient=False)
for i in range(1, label_num_conn):
label_i = label_conn[:, :, i]
pair_conn = paddle.to_tensor([0.], stop_gradient=False)
pair_conn_num = 0
for j in range(1, pred_num_conn):
pred_j_mask = pred_conn[:, :, j]
pred_j = pred_j_mask * pred
iou = compute_iou(pred_j, label_i, zero)
ious[i - 1, j - 1] = iou
if iou != 0:
pair_conn += iou
pair_conn_num += 1
if pair_conn_num != 0:
pair_conn_sum += pair_conn / pair_conn_num
lone_pred_num = 0
pred_sum = paddle.sum(ious, axis=0)
for m in range(0, real_pred_num):
if pred_sum[m] == 0:
lone_pred_num += 1
img_connectivity = pair_conn_sum / (real_label_num + lone_pred_num)
return img_connectivity
def compute_iou(pred_i, label_i, zero):
intersect_area_i = paddle.sum(pred_i * label_i)
if paddle.equal(intersect_area_i, zero):
return 0
pred_area_i = paddle.sum(pred_i)
label_area_i = paddle.sum(label_i)
union_area_i = pred_area_i + label_area_i - intersect_area_i
if paddle.equal(union_area_i, zero):
return 1
else:
return intersect_area_i / union_area_i
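
A minimal sketch of the expected call pattern (binary segmentation with illustrative shapes; note the loss internally moves predictions to NumPy for OpenCV's connected-component analysis, so it runs in dynamic-graph mode):

import paddle

scl = SemanticConnectivityLoss(ignore_index=255)
logits = paddle.rand([2, 2, 160, 160])            # (N, C, H, W) logits
labels = paddle.randint(0, 2, [2, 160, 160])      # (N, H, W) class ids
loss = scl(logits, labels)                        # scalar connectivity loss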

@@ -0,0 +1,47 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppseg.cvlibs import manager
@manager.LOSSES.add_component
class SECrossEntropyLoss(nn.Layer):
"""
The Semantic Encoding Loss implementation based on PaddlePaddle.
"""
def __init__(self, *args, **kwargs):
super(SECrossEntropyLoss, self).__init__()
def forward(self, logit, label):
if logit.ndim == 4:
logit = logit.squeeze(2).squeeze(3)
assert logit.ndim == 2, "The shape of logit should be [N, C, 1, 1] or [N, C], but the logit dim is {}.".format(
logit.ndim)
batch_size, num_classes = paddle.shape(logit)
se_label = paddle.zeros([batch_size, num_classes])
for i in range(batch_size):
hist = paddle.histogram(label[i],
bins=num_classes,
min=0,
max=num_classes - 1)
hist = hist.astype('float32') / hist.sum().astype('float32')
se_label[i] = (hist > 0).astype('float32')
loss = F.binary_cross_entropy_with_logits(logit, se_label)
return loss
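
A minimal sketch of the intended call pattern: the logits are a per-image class-encoding vector, and the target is derived from which classes appear in each label map (shapes are illustrative assumptions, not part of this commit):

import paddle

se_loss = SECrossEntropyLoss()
logit = paddle.rand([4, 21])                      # (N, C) semantic-encoding logits
label = paddle.randint(0, 21, [4, 64, 64])        # (N, H, W) segmentation labels
loss = se_loss(logit, label)                      # BCE against per-image class presence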

@@ -0,0 +1,241 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppseg.models import layers
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.utils import utils
class MLAHeads(nn.Layer):
def __init__(self, mlahead_channels=128):
super(MLAHeads, self).__init__()
self.head2 = nn.Sequential(
layers.ConvBNReLU(
mlahead_channels * 2,
mlahead_channels,
3,
padding=1,
bias_attr=False),
layers.ConvBNReLU(
mlahead_channels,
mlahead_channels,
3,
padding=1,
bias_attr=False))
self.head3 = nn.Sequential(
layers.ConvBNReLU(
mlahead_channels * 2,
mlahead_channels,
3,
padding=1,
bias_attr=False),
layers.ConvBNReLU(
mlahead_channels,
mlahead_channels,
3,
padding=1,
bias_attr=False))
self.head4 = nn.Sequential(
layers.ConvBNReLU(
mlahead_channels * 2,
mlahead_channels,
3,
padding=1,
bias_attr=False),
layers.ConvBNReLU(
mlahead_channels,
mlahead_channels,
3,
padding=1,
bias_attr=False))
self.head5 = nn.Sequential(
layers.ConvBNReLU(
mlahead_channels * 2,
mlahead_channels,
3,
padding=1,
bias_attr=False),
layers.ConvBNReLU(
mlahead_channels,
mlahead_channels,
3,
padding=1,
bias_attr=False))
def forward(self, mla_p2, mla_p3, mla_p4, mla_p5):
head2 = F.interpolate(
self.head2(mla_p2),
size=(4 * mla_p2.shape[3], 4 * mla_p2.shape[3]),
mode='bilinear',
align_corners=True)
head3 = F.interpolate(
self.head3(mla_p3),
size=(4 * mla_p3.shape[3], 4 * mla_p3.shape[3]),
mode='bilinear',
align_corners=True)
head4 = F.interpolate(
self.head4(mla_p4),
size=(4 * mla_p4.shape[3], 4 * mla_p4.shape[3]),
mode='bilinear',
align_corners=True)
head5 = F.interpolate(
self.head5(mla_p5),
size=(4 * mla_p5.shape[3], 4 * mla_p5.shape[3]),
mode='bilinear',
align_corners=True)
return paddle.concat([head2, head3, head4, head5], axis=1)
@manager.MODELS.add_component
class MLATransformer(nn.Layer):
def __init__(self,
num_classes,
in_channels,
backbone,
mlahead_channels=128,
aux_channels=256,
norm_layer=nn.BatchNorm2D,
pretrained=None,
**kwargs):
super(MLATransformer, self).__init__()
self.BatchNorm = norm_layer
self.mlahead_channels = mlahead_channels
self.num_classes = num_classes
self.in_channels = in_channels
self.backbone = backbone
self.mlahead = MLAHeads(mlahead_channels=self.mlahead_channels)
self.cls = nn.Conv2D(
4 * self.mlahead_channels, self.num_classes, 3, padding=1)
self.conv0 = layers.ConvBNReLU(
self.in_channels[0],
self.in_channels[0] * 2,
3,
padding=1,
bias_attr=False)
self.conv1 = layers.ConvBNReLU(
self.in_channels[1],
self.in_channels[1],
3,
padding=1,
bias_attr=False)
self.conv21 = layers.ConvBNReLU(
self.in_channels[2],
self.in_channels[2],
3,
padding=1,
bias_attr=False)
self.conv22 = layers.ConvBNReLU(
self.in_channels[2],
self.in_channels[2] // 2,
3,
padding=1,
bias_attr=False)
self.conv31 = layers.ConvBNReLU(
self.in_channels[3],
self.in_channels[3],
3,
padding=1,
bias_attr=False)
self.conv32 = layers.ConvBNReLU(
self.in_channels[3],
self.in_channels[3] // 2,
3,
padding=1,
bias_attr=False)
self.conv33 = layers.ConvBNReLU(
self.in_channels[3] // 2,
self.in_channels[3] // 4,
3,
padding=1,
bias_attr=False)
self.aux_head = nn.Sequential(
layers.ConvBN(
in_channels=self.in_channels[2],
out_channels=aux_channels,
kernel_size=3,
padding=1,
bias_attr=False),
nn.Conv2D(
in_channels=aux_channels,
out_channels=self.num_classes,
kernel_size=1,
))
self.pretrained = pretrained
self.init_weight()
def init_weight(self):
if self.pretrained is not None:
utils.load_entire_model(self, self.pretrained)
def forward(self, x):
inputs = self.backbone(x)
inputs0 = self.conv0(inputs[0])
inputs1 = F.interpolate(
self.conv1(inputs[1]),
size=inputs[0].shape[2:],
mode='bilinear',
align_corners=True)
inputs2 = F.interpolate(
self.conv21(inputs[2]),
scale_factor=2,
mode='bilinear',
align_corners=True)
inputs2 = F.interpolate(
self.conv22(inputs2),
size=inputs[0].shape[2:],
mode='bilinear',
align_corners=True)
inputs3 = F.interpolate(
self.conv31(inputs[3]),
scale_factor=2,
mode='bilinear',
align_corners=True)
inputs3 = F.interpolate(
self.conv32(inputs3),
scale_factor=2,
mode='bilinear',
align_corners=True)
inputs3 = F.interpolate(
self.conv33(inputs3),
size=inputs[0].shape[2:],
mode='bilinear',
align_corners=True)
inputs2 = inputs2 + inputs3
inputs1 = inputs1 + inputs2
inputs0 = inputs0 + inputs1
feats = self.mlahead(inputs0, inputs1, inputs2, inputs3)
logit = self.cls(feats)
logit_list = [logit]
if self.training:
logit_list.append(self.aux_head(inputs[2]))
logit_list = [
F.interpolate(
logit, paddle.shape(x)[2:], mode='bilinear', align_corners=True)
for logit in logit_list
]
return logit_list
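
To make the channel contract of the forward pass above concrete: the MLA heads expect in_channels[0] to equal mlahead_channels (128 by default), with deeper stages at 2x/4x/8x that width. A toy sketch with a hypothetical stand-in backbone (the backbone and all shapes are illustrative assumptions, not part of this commit):

import paddle
import paddle.nn as nn

class ToyBackbone(nn.Layer):
    """Hypothetical backbone emitting four pyramid features at strides 4/8/16/32."""
    def forward(self, x):
        n, _, h, w = x.shape
        chs, strides = (128, 256, 512, 1024), (4, 8, 16, 32)
        return [paddle.rand([n, c, h // s, w // s]) for c, s in zip(chs, strides)]

model = MLATransformer(
    num_classes=19, in_channels=[128, 256, 512, 1024], backbone=ToyBackbone())
logit = model(paddle.rand([1, 3, 256, 256]))[0]   # -> [1, 19, 256, 256]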

@@ -0,0 +1,246 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppseg import utils
from paddlers.models.ppseg.cvlibs import manager, param_init
from paddlers.models.ppseg.models import layers
@manager.MODELS.add_component
class OCRNet(nn.Layer):
"""
The OCRNet implementation based on PaddlePaddle.
The original article refers to
Yuan, Yuhui, et al. "Object-Contextual Representations for Semantic Segmentation"
(https://arxiv.org/pdf/1909.11065.pdf)
Args:
num_classes (int): The unique number of target classes.
backbone (Paddle.nn.Layer): Backbone network.
backbone_indices (tuple): A tuple indicating the indices of output of backbone.
It can contain either one or two values. If two values, the first index is taken as
a deep-supervision feature in the auxiliary layer and the second as the
input of pixel representation. If one value, it is used for both.
ocr_mid_channels (int, optional): The number of middle channels in OCRHead. Default: 512.
ocr_key_channels (int, optional): The number of key channels in ObjectAttentionBlock. Default: 256.
align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
pretrained (str, optional): The path or url of pretrained model. Default: None.
"""
def __init__(self,
num_classes,
backbone,
backbone_indices,
ocr_mid_channels=512,
ocr_key_channels=256,
align_corners=False,
pretrained=None):
super().__init__()
self.backbone = backbone
self.backbone_indices = backbone_indices
in_channels = [self.backbone.feat_channels[i] for i in backbone_indices]
self.head = OCRHead(
num_classes=num_classes,
in_channels=in_channels,
ocr_mid_channels=ocr_mid_channels,
ocr_key_channels=ocr_key_channels)
self.align_corners = align_corners
self.pretrained = pretrained
self.init_weight()
def forward(self, x):
feats = self.backbone(x)
feats = [feats[i] for i in self.backbone_indices]
logit_list = self.head(feats)
if not self.training:
logit_list = [logit_list[0]]
logit_list = [
F.interpolate(
logit,
paddle.shape(x)[2:],
mode='bilinear',
align_corners=self.align_corners) for logit in logit_list
]
return logit_list
def init_weight(self):
if self.pretrained is not None:
utils.load_entire_model(self, self.pretrained)
class OCRHead(nn.Layer):
"""
The Object contextual representation head.
Args:
num_classes(int): The unique number of target classes.
in_channels(tuple): The number of input channels.
ocr_mid_channels(int, optional): The number of middle channels in OCRHead. Default: 512.
ocr_key_channels(int, optional): The number of key channels in ObjectAttentionBlock. Default: 256.
"""
def __init__(self,
num_classes,
in_channels,
ocr_mid_channels=512,
ocr_key_channels=256):
super().__init__()
self.num_classes = num_classes
self.spatial_gather = SpatialGatherBlock(ocr_mid_channels, num_classes)
self.spatial_ocr = SpatialOCRModule(ocr_mid_channels, ocr_key_channels,
ocr_mid_channels)
self.indices = [-2, -1] if len(in_channels) > 1 else [-1, -1]
self.conv3x3_ocr = layers.ConvBNReLU(
in_channels[self.indices[1]], ocr_mid_channels, 3, padding=1)
self.cls_head = nn.Conv2D(ocr_mid_channels, self.num_classes, 1)
self.aux_head = nn.Sequential(
layers.ConvBNReLU(in_channels[self.indices[0]],
in_channels[self.indices[0]], 1),
nn.Conv2D(in_channels[self.indices[0]], self.num_classes, 1))
self.init_weight()
def forward(self, feat_list):
feat_shallow, feat_deep = feat_list[self.indices[0]], feat_list[
self.indices[1]]
soft_regions = self.aux_head(feat_shallow)
pixels = self.conv3x3_ocr(feat_deep)
object_regions = self.spatial_gather(pixels, soft_regions)
ocr = self.spatial_ocr(pixels, object_regions)
logit = self.cls_head(ocr)
return [logit, soft_regions]
def init_weight(self):
"""Initialize the parameters of model parts."""
for sublayer in self.sublayers():
if isinstance(sublayer, nn.Conv2D):
param_init.normal_init(sublayer.weight, std=0.001)
elif isinstance(sublayer, (nn.BatchNorm, nn.SyncBatchNorm)):
param_init.constant_init(sublayer.weight, value=1.0)
param_init.constant_init(sublayer.bias, value=0.0)
class SpatialGatherBlock(nn.Layer):
"""Aggregation layer to compute the pixel-region representation."""
def __init__(self, pixels_channels, regions_channels):
super().__init__()
self.pixels_channels = pixels_channels
self.regions_channels = regions_channels
def forward(self, pixels, regions):
# pixels: from (n, c, h, w) to (n, h*w, c)
pixels = paddle.reshape(pixels, (0, self.pixels_channels, -1))
pixels = paddle.transpose(pixels, (0, 2, 1))
# regions: from (n, k, h, w) to (n, k, h*w)
regions = paddle.reshape(regions, (0, self.regions_channels, -1))
regions = F.softmax(regions, axis=2)
# feats: from (n, k, c) to (n, c, k, 1)
feats = paddle.bmm(regions, pixels)
feats = paddle.transpose(feats, (0, 2, 1))
feats = paddle.unsqueeze(feats, axis=-1)
return feats
class SpatialOCRModule(nn.Layer):
"""Aggregate the global object representation to update the representation for each pixel."""
def __init__(self,
in_channels,
key_channels,
out_channels,
dropout_rate=0.1):
super().__init__()
self.attention_block = ObjectAttentionBlock(in_channels, key_channels)
self.conv1x1 = nn.Sequential(
layers.ConvBNReLU(2 * in_channels, out_channels, 1),
nn.Dropout2D(dropout_rate))
def forward(self, pixels, regions):
context = self.attention_block(pixels, regions)
feats = paddle.concat([context, pixels], axis=1)
feats = self.conv1x1(feats)
return feats
class ObjectAttentionBlock(nn.Layer):
"""A self-attention module."""
def __init__(self, in_channels, key_channels):
super().__init__()
self.in_channels = in_channels
self.key_channels = key_channels
self.f_pixel = nn.Sequential(
layers.ConvBNReLU(in_channels, key_channels, 1),
layers.ConvBNReLU(key_channels, key_channels, 1))
self.f_object = nn.Sequential(
layers.ConvBNReLU(in_channels, key_channels, 1),
layers.ConvBNReLU(key_channels, key_channels, 1))
self.f_down = layers.ConvBNReLU(in_channels, key_channels, 1)
self.f_up = layers.ConvBNReLU(key_channels, in_channels, 1)
def forward(self, x, proxy):
x_shape = paddle.shape(x)
# query : from (n, c1, h1, w1) to (n, h1*w1, key_channels)
query = self.f_pixel(x)
query = paddle.reshape(query, (0, self.key_channels, -1))
query = paddle.transpose(query, (0, 2, 1))
# key : from (n, c2, h2, w2) to (n, key_channels, h2*w2)
key = self.f_object(proxy)
key = paddle.reshape(key, (0, self.key_channels, -1))
# value : from (n, c2, h2, w2) to (n, h2*w2, key_channels)
value = self.f_down(proxy)
value = paddle.reshape(value, (0, self.key_channels, -1))
value = paddle.transpose(value, (0, 2, 1))
# sim_map (n, h1*w1, h2*w2)
sim_map = paddle.bmm(query, key)
sim_map = (self.key_channels**-.5) * sim_map
sim_map = F.softmax(sim_map, axis=-1)
# context from (n, h1*w1, key_channels) to (n , out_channels, h1, w1)
context = paddle.bmm(sim_map, value)
context = paddle.transpose(context, (0, 2, 1))
context = paddle.reshape(context,
(0, self.key_channels, x_shape[2], x_shape[3]))
context = self.f_up(context)
return context
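
A small shape walk-through of the two OCR building blocks (batch size, channel widths and the 19 region classes are illustrative assumptions): SpatialGatherBlock pools pixel features into one vector per soft region, and SpatialOCRModule attends from pixels to those region vectors to augment each pixel.

import paddle

pixels = paddle.rand([2, 512, 32, 64])            # (N, C, H, W) pixel features
regions = paddle.rand([2, 19, 32, 64])            # (N, K, H, W) soft region logits
gather = SpatialGatherBlock(pixels_channels=512, regions_channels=19)
object_regions = gather(pixels, regions)          # -> [2, 512, 19, 1]
ocr = SpatialOCRModule(in_channels=512, key_channels=256, out_channels=512)
augmented = ocr(pixels, object_regions)           # -> [2, 512, 32, 64]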

@@ -0,0 +1,201 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppseg.models import layers
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.utils import utils
@manager.MODELS.add_component
class PFPNNet(nn.Layer):
"""
The Panoptic Feature Pyramid Networks implementation based on PaddlePaddle.
The original article refers to
Alexander Kirillov, Ross Girshick, Kaiming He, Piotr Dollár, et al. "Panoptic Feature Pyramid Networks"
(https://arxiv.org/abs/1901.02446)
Args:
num_classes (int): The unique number of target classes.
backbone (Paddle.nn.Layer): Backbone network, currently supporting ResNet50/101.
backbone_indices (tuple): Four values in the tuple indicate the indices of output of backbone.
channels (int): The output channels of each scale head in PFPNHead.
enable_auxiliary_loss (bool, optional): A bool value that indicates whether to add the auxiliary loss. Default: False.
align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
e.g. 1024x512, otherwise True, e.g. 769x769. Default: False.
dropout_ratio (float, optional): The dropout ratio before the classification layer. Default: 0.1.
fpn_inplanes (list, optional): The feature channels from the backbone. Default: [256, 512, 1024, 2048].
pretrained (str, optional): The path or url of pretrained model. Default: None.
"""
def __init__(self,
num_classes,
backbone,
backbone_indices,
channels,
enable_auxiliary_loss=False,
align_corners=False,
dropout_ratio=0.1,
fpn_inplanes=[256, 512, 1024, 2048],
pretrained=None):
super(PFPNNet, self).__init__()
self.backbone = backbone
self.backbone_indices = backbone_indices
self.in_channels = [
self.backbone.feat_channels[i] for i in backbone_indices
]
self.align_corners = align_corners
self.pretrained = pretrained
self.enable_auxiliary_loss = enable_auxiliary_loss
self.head = PFPNHead(num_class=num_classes,
fpn_inplanes=fpn_inplanes,
dropout_ratio=dropout_ratio,
channels=channels,
fpn_dim=channels,
enable_auxiliary_loss=self.enable_auxiliary_loss)
self.init_weight()
def forward(self, x):
feats = self.backbone(x)
feats = [feats[i] for i in self.backbone_indices]
logit_list = self.head(feats)
return [
F.interpolate(logit,
paddle.shape(x)[2:],
mode='bilinear',
align_corners=self.align_corners)
for logit in logit_list
]
def init_weight(self):
if self.pretrained is not None:
utils.load_entire_model(self, self.pretrained)
class PFPNHead(nn.Layer):
"""
The PFPNHead implementation.
Args:
num_class (int): The unique number of target classes.
fpn_inplanes (list): The feature channels from the backbone.
channels (int): The output channels of each scale head.
dropout_ratio (float, optional): The dropout ratio before the classification layer. Default: 0.1.
fpn_dim (int, optional): The channels of the FPN lateral convolutions; should equal `channels` so the
stacked scale-head convolutions chain correctly. Default: 256.
enable_auxiliary_loss (bool, optional): A bool value that indicates whether to add the auxiliary loss. Default: False.
align_corners (bool, optional): An argument of F.interpolate. Default: False.
"""
def __init__(self,
num_class,
fpn_inplanes,
channels,
dropout_ratio=0.1,
fpn_dim=256,
enable_auxiliary_loss=False,
align_corners=False):
super(PFPNHead, self).__init__()
self.enable_auxiliary_loss = enable_auxiliary_loss
self.align_corners = align_corners
self.lateral_convs = nn.LayerList()
self.fpn_out = nn.LayerList()
for fpn_inplane in fpn_inplanes:
self.lateral_convs.append(
nn.Sequential(nn.Conv2D(fpn_inplane, fpn_dim, 1),
layers.SyncBatchNorm(fpn_dim), nn.ReLU()))
self.fpn_out.append(
nn.Sequential(
layers.ConvBNReLU(fpn_dim, fpn_dim, 3, bias_attr=False)))
self.scale_heads = nn.LayerList()
for index in range(len(fpn_inplanes)):
head_length = max(
1, int(np.log2(fpn_inplanes[index]) - np.log2(fpn_inplanes[0])))
scale_head = nn.LayerList()
for head_index in range(head_length):
scale_head.append(
layers.ConvBNReLU(
fpn_dim,
channels,
3,
padding=1,
))
if fpn_inplanes[index] != fpn_inplanes[0]:
scale_head.append(
nn.Upsample(scale_factor=2,
mode='bilinear',
align_corners=align_corners))
self.scale_heads.append(nn.Sequential(*scale_head))
if dropout_ratio:
self.dropout = nn.Dropout2D(dropout_ratio)
if self.enable_auxiliary_loss:
self.dsn = nn.Sequential(
layers.ConvBNReLU(fpn_inplanes[2],
fpn_inplanes[2],
3,
padding=1), nn.Dropout2D(dropout_ratio),
nn.Conv2D(fpn_inplanes[2], num_class, kernel_size=1))
else:
self.dropout = None
if self.enable_auxiliary_loss:
self.dsn = nn.Sequential(
layers.ConvBNReLU(fpn_inplanes[2],
fpn_inplanes[2],
3,
padding=1),
nn.Conv2D(fpn_inplanes[2], num_class, kernel_size=1))
self.conv_last = nn.Sequential(
layers.ConvBNReLU(len(fpn_inplanes) * fpn_dim,
fpn_dim,
3,
bias_attr=False),
nn.Conv2D(fpn_dim, num_class, kernel_size=1))
self.conv_seg = nn.Conv2D(channels, num_class, kernel_size=1)
def cls_seg(self, feat):
if self.dropout is not None:
feat = self.dropout(feat)
output = self.conv_seg(feat)
return output
def forward(self, conv_out):
last_out = self.lateral_convs[-1](conv_out[-1])
f = last_out
fpn_feature_list = [last_out]
for i in reversed(range(len(conv_out) - 1)):
conv_x = conv_out[i]
conv_x = self.lateral_convs[i](conv_x)
prev_shape = paddle.shape(conv_x)[2:]
f = conv_x + F.interpolate(
f, prev_shape, mode='bilinear', align_corners=True)
fpn_feature_list.append(self.fpn_out[i](f))
output_size = paddle.shape(fpn_feature_list[-1])[2:]
x = self.scale_heads[0](fpn_feature_list[-1])
for index in range(len(self.scale_heads) - 2, 0, -1):
x = x + F.interpolate(self.scale_heads[index](
fpn_feature_list[index]),
size=output_size,
mode='bilinear',
align_corners=self.align_corners)
x = self.cls_seg(x)
if self.enable_auxiliary_loss:
dsn = self.dsn(conv_out[2])
return [x, dsn]
else:
return [x]
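
A minimal sketch of PFPNHead on a toy feature pyramid (shapes are illustrative assumptions, not part of this commit; fpn_dim is set equal to channels so the stacked scale-head convolutions chain correctly):

import paddle

head = PFPNHead(num_class=19, fpn_inplanes=[256, 512, 1024, 2048],
                channels=128, fpn_dim=128)
conv_out = [paddle.rand([1, 256, 64, 64]),        # stride-4 feature
            paddle.rand([1, 512, 32, 32]),        # stride-8
            paddle.rand([1, 1024, 16, 16]),       # stride-16
            paddle.rand([1, 2048, 8, 8])]         # stride-32
logit = head(conv_out)[0]                         # -> [1, 19, 64, 64] before final upsampling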

@@ -0,0 +1,832 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppseg.cvlibs import manager
from paddlers.models.ppseg.models import layers
from paddlers.models.ppseg.utils import utils
@manager.MODELS.add_component
class PointRend(nn.Layer):
"""
The SemanticFPN-PointRend implementation based on PaddlePaddle.
The original article refers to
Kirillov A, Wu Y, He K, et al. "PointRend: Image Segmentation As Rendering."
(https://arxiv.org/abs/1912.08193).
Args:
num_classes (int): The unique number of target classes.
backbone (Paddle.nn.Layer): Backbone network, currently supporting ResNet50/101.
backbone_indices (tuple, optional): Four values in the tuple indicate the indices of output of backbone.
fpn_inplanes (list, optional): Input channels list (the feature channels from the backbone) for lateral_conv construction in FPN. Default: [256, 512, 1024, 2048].
fpn_outplanes (int, optional): The output channels in FPN. Default: 256.
point_num_fcs (int, optional): Number of fc layers in the head in PointHead. Default: 3.
point_in_channels (list, optional): Input channels of the fc block in PointHead. Default: [256].
point_out_channels (int, optional): The fc block's output channels in PointHead. Default: 256.
point_in_index (list, optional): The indices of input features to use in PointHead. Default: [0].
point_num_points (int, optional): The number of points sampled in training mode in PointHead. Default: 2048.
point_oversample_ratio (int, optional): The oversample ratio of points in training mode in PointHead:
sampled_point = num_points * oversample_ratio. Default: 3.
point_importance_sample_ratio (float, optional): The importance sample ratio used to compute num_uncertain_points in PointHead. Default: 0.75.
point_scale_factor (int, optional): The scale factor of F.interpolate in the seg-logits refinement stage during inference in PointHead. Default: 2.
point_subdivision_steps (int, optional): The number of refinement steps in the seg-logits refinement stage during inference in PointHead. Default: 2.
point_subdivision_num_points (int, optional): The number of points used to refine the seg logits during inference in PointHead. Default: 8196.
point_dropout_ratio (float, optional): If dropout_ratio > 0, apply Dropout with p = dropout_ratio before the output in PointHead. Default: 0.1.
point_coarse_pred_each_layer (bool, optional): Whether to concatenate the coarse feature with
the output of each fc layer in PointHead. Default: True.
point_conv_cfg (str): The config of Conv in PointHead. Default: 'Conv1D'.
point_input_transform (str): The feature transform method for inputs in PointHead;
see the function '_transform_inputs'. Default: 'multiple_select'.
PFN_feature_strides (list): The strides of the input feature maps in FPNHead; all strides are supposed to be powers of 2, and the first
one is of largest resolution. Default: [4, 8, 16, 32].
PFN_in_channels (list): The input features' channels list in FPNHead. Default: [256, 256, 256, 256].
PFN_channels (int, optional): The output channels of each scale_head's Conv before the Upsample block in FPNHead. Default: 128.
PFN_in_index (list): The indices of input features to use; its length should match in_channels in FPNHead. Default: [0, 1, 2, 3].
PFN_dropout_ratio (float, optional): If dropout_ratio > 0, apply Dropout with p = dropout_ratio before the output in FPNHead. Default: 0.1.
PFN_conv_cfg (str): The config of Conv. Default: 'Conv2D'.
PFN_input_transform (str): The feature transform method for inputs; see the function '_transform_inputs' in FPNHead. Default: 'multiple_select'.
align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
e.g. 1024x512, otherwise True, e.g. 769x769. Default: False.
pretrained (str, optional): The path or url of pretrained model. Default: None.
"""
def __init__(
self,
num_classes,
backbone,
backbone_indices,
fpn_inplanes=[256, 512, 1024, 2048],
fpn_outplanes=256,
point_in_channels=[256],
point_out_channels=256,
point_in_index=[0],
point_num_fcs=3,
point_num_points=2048,
point_oversample_ratio=3,
point_importance_sample_ratio=0.75,
point_scale_factor=2,
point_subdivision_steps=2,
point_subdivision_num_points=8196,
point_dropout_ratio=0,
point_coarse_pred_each_layer=True,
point_input_transform='multiple_select', # resize_concat
point_conv_cfg='Conv1D',
PFN_feature_strides=[4, 8, 16, 32],
PFN_in_channels=[256, 256, 256, 256],
PFN_channels=128,
PFN_in_index=[0, 1, 2, 3],
PFN_dropout_ratio=0,
PFN_conv_cfg='Conv2D',
PFN_input_transform='multiple_select',
align_corners=False,
pretrained=None):
super(PointRend, self).__init__()
self.backbone = backbone
self.backbone_indices = backbone_indices
self.in_channels = [
self.backbone.feat_channels[i] for i in backbone_indices
]
self.neck = FPNNeck(
fpn_inplanes=fpn_inplanes, fpn_outplanes=fpn_outplanes)
self.pointhead = PointHead(
in_channels=point_in_channels,
out_channels=point_out_channels,
num_classes=num_classes,
in_index=point_in_index,
num_fcs=point_num_fcs,
num_points=point_num_points,
oversample_ratio=point_oversample_ratio,
importance_sample_ratio=point_importance_sample_ratio,
scale_factor=point_scale_factor,
subdivision_steps=point_subdivision_steps,
subdivision_num_points=point_subdivision_num_points,
dropout_ratio=point_dropout_ratio,
align_corners=align_corners,
coarse_pred_each_layer=point_coarse_pred_each_layer,
input_transform=point_input_transform, # resize_concat
conv_cfg=point_conv_cfg)
self.fpnhead = FPNHead(
feature_strides=PFN_feature_strides,
in_channels=PFN_in_channels,
channels=PFN_channels,
num_class=num_classes,
in_index=PFN_in_index,
dropout_ratio=PFN_dropout_ratio,
conv_cfg=PFN_conv_cfg,
input_transform=PFN_input_transform,
align_corners=align_corners)
self.align_corners = align_corners
self.pretrained = pretrained
self.init_weight()
def forward(self, x):
feats = self.backbone(x)
feats = [feats[i] for i in self.backbone_indices]
fpn_feats = self.neck(feats) # [n,256,64,128]*3 & [n,256,128,256]
pfn_logits = self.fpnhead(
fpn_feats)  # coarse semantic-FPN output (decode_head[0]); e.g. 512x1024 input -> [n, 19, 64, 128]
point_logits = self.pointhead(
fpn_feats, pfn_logits)  # point-wise refinement output (decode_head[1])
if self.training:
logit_list = [
F.interpolate(
logit,
paddle.shape(x)[2:],
mode='bilinear',
align_corners=self.align_corners) for logit in pfn_logits
]
logit_list.append(point_logits)
else:
logit_list = [
F.interpolate(
logit,
paddle.shape(x)[2:],
mode='bilinear',
align_corners=self.align_corners) for logit in point_logits
]
return logit_list
def init_weight(self):
if self.pretrained is not None:
utils.load_entire_model(self, self.pretrained)
class PointHead(nn.Layer):
"""
The PointHead implementation based on PaddlePaddle.
PointHead uses a shared multi-layer perceptron (equivalent to
nn.Conv1D) to predict the logits of input points. The fine-grained feature
and the coarse feature are concatenated together for prediction.
The original article refers to:
Kirillov A , Wu Y , He K , et al "PointRend: Image Segmentation As Rendering."
(https://arxiv.org/abs/1912.08193)
Args:
num_classes (int): Number of classes for logits. Default: 19.
num_fcs (int, optional): Number of fc layers in the head. Default: 3.
in_channels (list): Input channels of the fc block. Default: [256].
out_channels (int, optional): The fc block's output channels. Default: 256.
in_index (list): The indices of input features to use. Default: [0].
num_points (int, optional): The number of points sampled in training mode. Default: 2048.
oversample_ratio (int, optional): The oversample ratio of points in training mode:
sampled_point = num_points * oversample_ratio. Default: 3.
importance_sample_ratio (float, optional): The importance sample ratio used to compute num_uncertain_points. Default: 0.75.
scale_factor (int, optional): The scale factor of F.interpolate in the seg-logits refinement stage during inference. Default: 2.
subdivision_steps (int, optional): The number of refinement steps in the seg-logits refinement stage during inference. Default: 2.
subdivision_num_points (int, optional): The number of points used to refine the seg logits during inference. Default: 8196.
dropout_ratio (float, optional): If dropout_ratio > 0, apply Dropout with p = dropout_ratio before the output. Default: 0.1.
coarse_pred_each_layer (bool, optional): Whether to concatenate the coarse feature with
the output of each fc layer. Default: True.
conv_cfg (str): The config of Conv. Default: 'Conv1D'.
input_transform (str): The feature transform method for inputs;
see the function '_transform_inputs'. Default: 'multiple_select'.
align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
e.g. 1024x512, otherwise True, e.g. 769x769. Default: False.
"""
def __init__(
self,
num_classes=19,
num_fcs=3,
in_channels=[256],
out_channels=256,
in_index=[0],
num_points=2048,
oversample_ratio=3,
importance_sample_ratio=0.75,
scale_factor=2,
subdivision_steps=2,
subdivision_num_points=8196,
dropout_ratio=0.1,
coarse_pred_each_layer=True,
conv_cfg='Conv1D',
input_transform='multiple_select', # resize_concat
align_corners=False):
super(PointHead, self).__init__()
self.in_channels = in_channels
self.channels = out_channels
self.in_index = in_index
self.num_classes = num_classes
self.num_fcs = num_fcs
self.num_points = num_points
self.oversample_ratio = oversample_ratio
self.importance_sample_ratio = importance_sample_ratio
self.scale_factor = scale_factor
self.subdivision_steps = subdivision_steps
self.subdivision_num_points = paddle.to_tensor(subdivision_num_points, dtype="int32")
self.dropout_ratio = dropout_ratio
self.coarse_pred_each_layer = coarse_pred_each_layer
self.align_corners = align_corners
self.input_transform = input_transform
fc_in_channels = sum(self.in_channels) + self.num_classes
fc_channels = self.channels
self.fcs = nn.LayerList()
for k in range(num_fcs):
fc = ConvModule(
fc_in_channels,
fc_channels,
kernel_size=1,
stride=1,
padding=0,
conv_cfg=conv_cfg,
)
self.fcs.append(fc)
fc_in_channels = fc_channels
fc_in_channels += self.num_classes if self.coarse_pred_each_layer else 0
self.fc_seg = nn.Conv1D(
fc_in_channels,
self.num_classes,
kernel_size=1,
stride=1,
padding=0)
if self.dropout_ratio > 0:
self.dropout = nn.Dropout(self.dropout_ratio)
else:
self.dropout = None
def cls_seg(self, feat):
"""Classify each pixel with fc."""
if self.dropout is not None:
feat = self.dropout(feat)
output = self.fc_seg(feat)
return output
def _get_fine_grained_point_feats(self, x, points):
"""
Sample from fine grained features.
Args:
x (list[Tensor]): Feature pyramid from the neck or backbone.
points (Tensor): Point coordinates, shape (batch_size,
num_points, 2).
Returns:
fine_grained_feats (Tensor): Sampled fine grained feature,
shape (batch_size, sum(channels of x), num_points).
"""
fine_grained_feats_list = [
point_sample(_, points, align_corners=self.align_corners) for _ in x
]
if len(fine_grained_feats_list) > 1:
fine_grained_feats = paddle.concat(fine_grained_feats_list, axis=1)
else:
fine_grained_feats = fine_grained_feats_list[0]
return fine_grained_feats
def _get_coarse_point_feats(self, prev_output, points):
"""
Sample from the coarse prediction.
Args:
prev_output (Tensor): Prediction of the previous decode head.
points (Tensor): Point coordinates, shape (batch_size,
num_points, 2).
Returns:
coarse_feats (Tensor): Sampled coarse feature, shape (batch_size,
num_classes, num_points).
"""
coarse_feats = point_sample(
prev_output, points, align_corners=self.align_corners)
return coarse_feats
def _transform_inputs(self, inputs):
"""
Transform inputs for decoder.
Args:
inputs (list[Tensor]): List of multi-level img features.
Returns:
Tensor: The transformed inputs
"""
if self.input_transform == 'resize_concat':
inputs = [inputs[i] for i in self.in_index]
upsampled_inputs = [
F.interpolate(
x,
size=paddle.shape(inputs[0])[2:],
mode='bilinear',
align_corners=self.align_corners) for x in inputs
]
inputs = paddle.concat(upsampled_inputs, axis=1)
elif self.input_transform == 'multiple_select':
inputs = [inputs[i] for i in self.in_index]
else:
inputs = inputs[self.in_index[0]]
return inputs
def get_points_train(self, seg_logits, uncertainty_func):
"""
Sample points for training.
Sample points in [0, 1] x [0, 1] coordinate space based on their
uncertainty. The uncertainties are calculated for each point using
'uncertainty_func' function that takes point's logit prediction as
input.
Args:
seg_logits (Tensor): Semantic segmentation logits, shape (
batch_size, num_classes, height, width).
uncertainty_func (callable): Uncertainty calculation function.
Returns:
point_coords (Tensor): A tensor of shape (batch_size, num_points,
2) that contains the coordinates of ``num_points`` sampled
points.
"""
num_points = self.num_points
oversample_ratio = self.oversample_ratio
importance_sample_ratio = self.importance_sample_ratio
assert oversample_ratio >= 1
assert 0 <= importance_sample_ratio <= 1
batch_size = paddle.shape(seg_logits)[0]
num_sampled = int(num_points * oversample_ratio)
point_coords = paddle.rand([batch_size, num_sampled, 2])
point_logits = point_sample(seg_logits, point_coords)
# It is crucial to calculate uncertainty based on the sampled
# prediction value for the points. Calculating uncertainties of the
# coarse predictions first and sampling them for points leads to
# incorrect results. To illustrate this: assume uncertainty func(
# logits)=-abs(logits), a sampled point between two coarse
# predictions with -1 and 1 logits has 0 logits, and therefore 0
# uncertainty value. However, if we calculate uncertainties for the
# coarse predictions first, both will have -1 uncertainty,
# and sampled point will get -1 uncertainty.
point_uncertainties = uncertainty_func(point_logits)
num_uncertain_points = int(importance_sample_ratio * num_points)
num_random_points = num_points - num_uncertain_points
idx = paddle.topk(
point_uncertainties[:, 0, :], k=num_uncertain_points, axis=1)[1]
shift = num_sampled * paddle.arange(batch_size, dtype='int64')
idx += shift.unsqueeze([-1])
idx = idx.reshape([-1])
point_coords = paddle.index_select(
point_coords.reshape([-1, 2]), idx, axis=0)
point_coords = point_coords.reshape(
[batch_size, num_uncertain_points, 2])
if num_random_points > 0:
rand_point_coords = paddle.rand([batch_size, num_random_points, 2])
point_coords = paddle.concat((point_coords, rand_point_coords),
axis=1)
return point_coords
def get_points_test(self, seg_logits, uncertainty_func):
"""
Sample points for testing.
Find ``num_points`` most uncertain points from ``uncertainty_map``.
Args:
seg_logits (Tensor): A tensor of shape (batch_size, num_classes,
height, width) for class-specific or class-agnostic prediction.
uncertainty_func (callable): Uncertainty calculation function.
Returns:
point_indices (Tensor): A tensor of shape (batch_size, num_points)
that contains indices from [0, height x width) of the most
uncertain points.
point_coords (Tensor): A tensor of shape (batch_size, num_points,
2) that contains [0, 1] x [0, 1] normalized coordinates of the
most uncertain points from the ``height x width`` grid.
"""
num_points = self.subdivision_num_points
uncertainty_map = uncertainty_func(seg_logits)
batch_size = paddle.shape(uncertainty_map)[0]
height = paddle.shape(uncertainty_map)[2]
width = paddle.shape(uncertainty_map)[3]
h_step = 1.0 / height
w_step = 1.0 / width
uncertainty_map = uncertainty_map.reshape([batch_size, height * width])
num_points = paddle.min(paddle.concat([height * width, num_points]))
point_indices = paddle.topk(uncertainty_map, num_points, axis=1)[1]
point_coords = paddle.zeros([batch_size, num_points, 2],
dtype='float32')
point_coords[:, :, 0] = w_step / 2.0 + (
point_indices % width).astype('float32') * w_step
point_coords[:, :, 1] = h_step / 2.0 + (
point_indices // width).astype('float32') * h_step
return point_indices, point_coords
def scatter_paddle(self, refined_seg_logits, point_indices, point_logits):
"""
Paddle version of scatter; equivalent to the PyTorch scatter(-1, point_indices, point_logits).
Args:
refined_seg_logits (Tensor): shape = [batch_size, channels, height * width]
point_indices (Tensor): shape = [batch_size, channels, num_points]
point_logits (Tensor): shape = [batch_size, channels, num_points]
Returns:
The scattered refined_seg_logits (Tensor).
"""
original_shape = paddle.shape(refined_seg_logits) # [batch_size, channels, height * width]
new_refined_seg_logits = refined_seg_logits.flatten(0, 1) # [N*C,H*W]
offsets = (paddle.arange(paddle.shape(new_refined_seg_logits)[0]) *
paddle.shape(new_refined_seg_logits)[1]).unsqueeze(-1) # [N*C,1]
point_indices = point_indices.flatten(0, 1) # [N*C,H*W]
new_point_indices = (point_indices + offsets).flatten()
point_logits = point_logits.flatten() # [N*C*H*W]
refined_seg_logits = paddle.scatter(
refined_seg_logits.flatten(),
new_point_indices,
point_logits,
overwrite=True)
return refined_seg_logits.reshape(shape=original_shape)
def forward_train(self, x, prev_output):
with paddle.no_grad():
points = self.get_points_train(prev_output, calculate_uncertainty)
fine_grained_point_feats = self._get_fine_grained_point_feats(
x, points) # [2, 256, 2048]
coarse_point_feats = self._get_coarse_point_feats(
prev_output, points) # [2, 19, 2048]
# forward for train
fusion_point_feats = paddle.concat(
[fine_grained_point_feats, coarse_point_feats], axis=1)
for fc in self.fcs:
fusion_point_feats = fc(fusion_point_feats)
if self.coarse_pred_each_layer:
fusion_point_feats = paddle.concat(
(fusion_point_feats, coarse_point_feats), axis=1)
point_logits = self.cls_seg(fusion_point_feats)
return [point_logits, points] # for points loss
def forward(self, inputs, prev_output):
"""
Forward function.
Args:
inputs (list[Tensor]): List of multi-level img features.
prev_output (Tensor): The output of previous decode head.
Returns:
[point_logits,points]: For points loss when in training.
[refined_seg_logits]: Output refined seg logits when in inference.
"""
prev_output = prev_output[0]
x = self._transform_inputs(inputs)
if self.training:
return self.forward_train(x, prev_output)
else:
refined_seg_logits = prev_output.clone()
for _ in range(self.subdivision_steps):
refined_seg_logits = F.interpolate(
refined_seg_logits,
scale_factor=self.scale_factor,
mode='bilinear',
align_corners=self.align_corners)
save_shape = paddle.shape(refined_seg_logits)
point_indices, points = self.get_points_test(
refined_seg_logits, calculate_uncertainty)
fine_grained_point_feats = self._get_fine_grained_point_feats(
x, points)
coarse_point_feats = self._get_coarse_point_feats(
prev_output, points)
# forward for inference
fusion_point_feats = paddle.concat(
[fine_grained_point_feats, coarse_point_feats], axis=1)
for fc in self.fcs:
fusion_point_feats = fc(fusion_point_feats)
if self.coarse_pred_each_layer:
fusion_point_feats = paddle.concat(
(fusion_point_feats, coarse_point_feats), axis=1)
point_logits = self.cls_seg(fusion_point_feats)
point_indices = paddle.unsqueeze(point_indices, axis=1)
point_indices = paddle.expand(point_indices, [-1, save_shape[1], -1])
refined_seg_logits = paddle.flatten(refined_seg_logits, 2)
refined_seg_logits = self.scatter_paddle(
refined_seg_logits, point_indices,
point_logits) # 2->height * width dim
refined_seg_logits = refined_seg_logits.reshape(save_shape)
return [refined_seg_logits]
class FPNHead(nn.Layer):
"""
This head is the implementation of Semantic FPN in paddle.
The original article refers to:
Kirillov, A. , et al. "Panoptic Feature Pyramid Networks."
(https://arxiv.org/abs/1901.02446)
Args:
num_class (int): The unique number of target classes. Default: 19.
feature_strides (list): The strides of the input feature maps; all strides are supposed to be powers of 2, and the first
one is of largest resolution. Default: [4, 8, 16, 32].
in_channels (list): The input features' channels list. Default: [256, 256, 256, 256].
channels (int, optional): The output channels of each scale_head's Conv before the Upsample block. Default: 128.
in_index (list): The indices of input features to use; its length should match in_channels. Default: [0, 1, 2, 3].
dropout_ratio (float, optional): If dropout_ratio > 0, apply Dropout with p = dropout_ratio before the output. Default: 0.1.
conv_cfg (str): The config of Conv. Default: 'Conv2D'.
input_transform (str): The feature transform method for inputs; see the function '_transform_inputs'. Default: 'multiple_select'.
align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
"""
def __init__(
self,
num_class=19,
feature_strides=[4, 8, 16, 32],
in_channels=[256, 256, 256, 256],
channels=128,
in_index=[0, 1, 2, 3],
dropout_ratio=0.1,
conv_cfg='Conv2D',
input_transform='multiple_select',
align_corners=False,
):
super(FPNHead, self).__init__()
assert len(feature_strides) == len(in_channels)
assert min(feature_strides) == feature_strides[0]
self.feature_strides = feature_strides
self.in_channels = in_channels
self.channels = channels
self.in_index = in_index
self.num_class = num_class
self.conv_cfg = conv_cfg
self.dropout_ratio = dropout_ratio
self.input_transform = input_transform
self.align_corners = align_corners
self.scale_heads = nn.LayerList()
for i in range(len(feature_strides)):
head_length = max(
1,
int(np.log2(feature_strides[i]) - np.log2(feature_strides[0])))
scale_head = []
for k in range(head_length):
scale_head.append(
ConvModule(
self.in_channels[i] if k == 0 else self.channels,
self.channels,
3,
padding=1,
conv_cfg=self.conv_cfg))
if feature_strides[i] != feature_strides[0]:
scale_head.append(
Upsample(
scale_factor=2,
mode='bilinear',
align_corners=self.align_corners))
self.scale_heads.append(nn.Sequential(*scale_head))
self.conv_seg = nn.Conv2D(self.channels, self.num_class, kernel_size=1)
if self.dropout_ratio is not None:
self.dropout = nn.Dropout2D(self.dropout_ratio)
else:
self.dropout = None
def cls_seg(self, feat):
if self.dropout is not None:
feat = self.dropout(feat)
output = self.conv_seg(feat)
return output
def _transform_inputs(self, inputs):
"""
Transform inputs for decoder.
Args:
inputs (list[Tensor]): List of multi-level img features.
Returns:
Tensor: The transformed inputs
"""
if self.input_transform == 'resize_concat':
inputs = [inputs[i] for i in self.in_index]
upsampled_inputs = [
F.interpolate(
x,
size=paddle.shape(inputs[0])[2:],
mode='bilinear',
align_corners=self.align_corners) for x in inputs
]
inputs = paddle.concat(upsampled_inputs, axis=1)
elif self.input_transform == 'multiple_select':
inputs = [inputs[i] for i in self.in_index]
else:
inputs = inputs[self.in_index[0]]
return inputs
def forward(self, inputs):
x = self._transform_inputs(inputs)
output = self.scale_heads[0](x[0])
for i in range(1, len(self.feature_strides)):
output = output + F.interpolate(
self.scale_heads[i](x[i]),
size=paddle.shape(output)[2:],
mode='bilinear',
align_corners=self.align_corners)
output = self.cls_seg(output)
return [output]
class FPNNeck(nn.Layer):
"""
The FPN Neck implementation in paddle.
Args:
fpn_inplanes (list, optional): Input channels list (the feature channels from the backbone) for lateral_conv construction. Default: [256, 512, 1024, 2048].
fpn_outplanes (int, optional): The output channels. Default: 256.
"""
def __init__(
self,
fpn_inplanes=[256, 512, 1024, 2048],
fpn_outplanes=256,
):
super(FPNNeck, self).__init__()
self.lateral_convs = []
self.fpn_out = []
# FPN head
for fpn_inplane in fpn_inplanes:
self.lateral_convs.append(
nn.Sequential(
nn.Conv2D(fpn_inplane, fpn_outplanes, 1),
layers.SyncBatchNorm(fpn_outplanes), nn.ReLU()))
self.fpn_out.append(
nn.Sequential(
layers.ConvBNReLU(
fpn_outplanes, fpn_outplanes, 3, bias_attr=False)))
self.lateral_convs = nn.LayerList(self.lateral_convs)
self.fpn_out = nn.LayerList(self.fpn_out)
def forward(self, conv_out):
last_out = self.lateral_convs[-1](conv_out[-1])
f = last_out
fpn_feature_list = [last_out]
for i in reversed(range(len(conv_out) - 1)):
conv_x = conv_out[i]
conv_x = self.lateral_convs[i](conv_x)
prev_shape = paddle.shape(conv_x)[2:]
f = conv_x + F.interpolate(
f, prev_shape, mode='bilinear', align_corners=True)
fpn_feature_list.append(self.fpn_out[i](f))
return fpn_feature_list
class ConvModule(nn.Layer):
"""
ConvModule wraps nn.Conv1D/nn.Conv2D with an optional SyncBatchNorm and a ReLU activation.
"""
def __init__(self,
in_channels,
out_channels,
kernel_size,
padding=0,
stride=1,
conv_cfg='Conv1D',
norm_cfg='None',
**kwargs):
super().__init__()
if (conv_cfg == 'Conv1D'):
self._conv = nn.Conv1D(
in_channels,
out_channels,
kernel_size,
stride=stride,
padding=padding,
**kwargs)
if (conv_cfg == 'Conv2D'):
self._conv = nn.Conv2D(
in_channels,
out_channels,
kernel_size,
stride=stride,
padding=padding,
**kwargs)
if 'data_format' in kwargs:
data_format = kwargs['data_format']
else:
data_format = 'NCHW'
if (norm_cfg != 'None'):
self._batch_norm = layers.SyncBatchNorm(
out_channels, data_format=data_format)
else:
self._batch_norm = None
def forward(self, x):
x = self._conv(x)
if self._batch_norm is not None:
x = self._batch_norm(x)
x = F.relu(x)
return x
class Upsample(nn.Layer):
"""
Upsample Module.
"""
def __init__(self,
size=None,
scale_factor=None,
mode='nearest',
align_corners=None):
super(Upsample, self).__init__()
self.size = size
if isinstance(scale_factor, tuple):
self.scale_factor = tuple(float(factor) for factor in scale_factor)
else:
self.scale_factor = float(scale_factor) if scale_factor else None
self.mode = mode
self.align_corners = align_corners
def forward(self, x):
if not self.size:
return F.interpolate(x, None, self.scale_factor, self.mode, self.align_corners)
else:
return F.interpolate(x, self.size, None, self.mode, self.align_corners)
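# Illustrative sketch (values assumed): `size` and `scale_factor` are mutually
# exclusive; when `size` is given the scale factor is ignored.
up2x = Upsample(scale_factor=2, mode='bilinear', align_corners=False)
y = up2x(paddle.rand([1, 8, 16, 16]))  # -> [1, 8, 32, 32]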
def point_sample(input, points, align_corners=False, **kwargs):
"""
A wrapper around :func:`grid_sample` to support 3D point_coords tensors.
Unlike :func:`paddle.nn.functional.grid_sample` it assumes point_coords to
lie inside the ``[0, 1] x [0, 1]`` square.
Args:
input (Tensor): Feature map, shape (N, C, H, W).
points (Tensor): Image based absolute point coordinates (normalized),
range [0, 1] x [0, 1], shape (N, P, 2) or (N, Hgrid, Wgrid, 2).
align_corners (bool): Whether align_corners. Default: False
Returns:
Tensor: Features of `point` on `input`, shape (N, C, P) or
(N, C, Hgrid, Wgrid).
"""
def denormalize(grid):
"""Denormalize input grid from range [0, 1] to [-1, 1]
Args:
grid (Tensor): The grid to be denormalized, range [0, 1].
Returns:
Tensor: Denormalized grid, range [-1, 1].
"""
return grid * 2.0 - 1.0
add_dim = False
if points.dim() == 3:
add_dim = True
points = paddle.unsqueeze(points, axis=2)
output = F.grid_sample(
input, denormalize(points), align_corners=align_corners, **kwargs)
if add_dim:
output = paddle.squeeze(output, axis=3)
return output
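# Illustrative sketch (shapes assumed): sample P=5 normalized point coordinates
# from a feature map; an (N, P, 2) input comes back as (N, C, P).
feat = paddle.rand([2, 16, 32, 32])
coords = paddle.rand([2, 5, 2])       # (N, P, 2), values in [0, 1]
sampled = point_sample(feat, coords)  # -> [2, 16, 5]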
def calculate_uncertainty(seg_logits):
"""
Estimate uncertainty based on seg logits.
For each location of the prediction ``seg_logits`` we estimate
uncertainty as the difference between top first and top second
predicted logits.
Args:
seg_logits (Tensor): Semantic segmentation logits,
shape (batch_size, num_classes, height, width).
Returns:
scores (Tensor): Uncertainty scores, with the most uncertain
locations having the highest uncertainty score, shape
(batch_size, 1, height, width).
"""
top2_scores = paddle.topk(seg_logits, k=2, axis=1)[0]
return paddle.unsqueeze(top2_scores[:, 1] - top2_scores[:, 0], axis=1)
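# Illustrative sketch (shapes assumed): scores are (second-best - best) logits,
# so they are <= 0 and the most ambiguous pixels score closest to 0.
logits = paddle.rand([4, 19, 64, 64])
scores = calculate_uncertainty(logits)  # -> [4, 1, 64, 64]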

@@ -0,0 +1,226 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.nn as nn
from paddlers.models.ppseg import utils
from paddlers.models.ppseg.cvlibs import manager
@manager.MODELS.add_component
class PortraitNet(nn.Layer):
"""
The PortraitNet implementation based on PaddlePaddle.
The original article refers to
Song-Hai Zhang, Xin Dong, Jia Li, Ruilong Li, Yong-Liang Yang
"PortraitNet: Real-time Portrait Segmentation Network for Mobile Device"
(https://www.yongliangyang.net/docs/mobilePotrait_c&g19.pdf).
Args:
num_classes (int, optional): The unique number of target classes. Default: 2.
backbone (Paddle.nn.Layer): Backbone network, currently supports MobileNetV2.
min_channel (int, optional): The minimum number of channels retained when scaling. Default: 16.
channel_ratio (float, optional): The ratio by which channel numbers are scaled. Default: 1.0.
add_edge (bool, optional): Whether to also predict an edge map. Default: False.
pretrained (str, optional): The path or url of the pretrained model. Default: None.
def __init__(self,
num_classes,
backbone,
min_channel=16,
channel_ratio=1.0,
add_edge=False,
pretrained=None):
super(PortraitNet, self).__init__()
self.backbone = backbone
self.head = PortraitNetHead(num_classes, min_channel, channel_ratio,
add_edge)
self.pretrained = pretrained
self.init_weight()
def forward(self, x):
img = x[:, :3, :, :]
img_ori = x[:, 3:, :, :]
feat_list = self.backbone(img)
logits_list = self.head(feat_list)
feat_list = self.backbone(img_ori)
logits_ori_list = self.head(feat_list)
return [
logits_list[0], logits_ori_list[0], logits_list[1],
logits_ori_list[1]
]
def init_weight(self):
if self.pretrained is not None:
utils.load_entire_model(self, self.pretrained)
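# Usage note (inferred from the forward pass above, not stated in the original
# file): the model expects a 6-channel input -- channels 0:3 hold the
# texture-augmented image and channels 3:6 the original image -- and the head
# must be built with add_edge=True for the four returned logits to be valid,
# e.g. model = PortraitNet(num_classes=2, backbone=backbone, add_edge=True).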
class PortraitNetHead(nn.Layer):
def __init__(self,
num_classes,
min_channel=16,
channel_ratio=1.0,
add_edge=False):
super().__init__()
self.min_channel = min_channel
self.channel_ratio = channel_ratio
self.add_edge = add_edge
self.deconv1 = nn.Conv2DTranspose(
self.depth(96),
self.depth(96),
groups=1,
kernel_size=4,
stride=2,
padding=1,
bias_attr=False)
self.deconv2 = nn.Conv2DTranspose(
self.depth(32),
self.depth(32),
groups=1,
kernel_size=4,
stride=2,
padding=1,
bias_attr=False)
self.deconv3 = nn.Conv2DTranspose(
self.depth(24),
self.depth(24),
groups=1,
kernel_size=4,
stride=2,
padding=1,
bias_attr=False)
self.deconv4 = nn.Conv2DTranspose(
self.depth(16),
self.depth(16),
groups=1,
kernel_size=4,
stride=2,
padding=1,
bias_attr=False)
self.deconv5 = nn.Conv2DTranspose(
self.depth(8),
self.depth(8),
groups=1,
kernel_size=4,
stride=2,
padding=1,
bias_attr=False)
self.transit1 = ResidualBlock(self.depth(320), self.depth(96))
self.transit2 = ResidualBlock(self.depth(96), self.depth(32))
self.transit3 = ResidualBlock(self.depth(32), self.depth(24))
self.transit4 = ResidualBlock(self.depth(24), self.depth(16))
self.transit5 = ResidualBlock(self.depth(16), self.depth(8))
self.pred = nn.Conv2D(
self.depth(8), num_classes, 3, 1, 1, bias_attr=False)
if self.add_edge:
self.edge = nn.Conv2D(
self.depth(8), num_classes, 3, 1, 1, bias_attr=False)
def depth(self, channels):
min_channel = min(channels, self.min_channel)
return max(min_channel, int(channels * self.channel_ratio))
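    # Worked example: with the defaults (min_channel=16, channel_ratio=1.0)
    # depth(c) == c for every c; with channel_ratio=0.5, depth(96) returns
    # max(16, 48) = 48 while depth(8) returns max(8, 4) = 8, i.e. the floor
    # is min(channels, min_channel).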
def forward(self, feat_list):
feature_1_4, feature_1_8, feature_1_16, feature_1_32 = feat_list
up_1_16 = self.deconv1(self.transit1(feature_1_32))
up_1_8 = self.deconv2(self.transit2(feature_1_16 + up_1_16))
up_1_4 = self.deconv3(self.transit3(feature_1_8 + up_1_8))
up_1_2 = self.deconv4(self.transit4(feature_1_4 + up_1_4))
up_1_1 = self.deconv5(self.transit5(up_1_2))
pred = self.pred(up_1_1)
if self.add_edge:
edge = self.edge(up_1_1)
return pred, edge
else:
return pred
class ConvDw(nn.Layer):
def __init__(self, inp, oup, kernel, stride):
super(ConvDw, self).__init__()
self.conv = nn.Sequential(
nn.Conv2D(
inp,
inp,
kernel,
stride, (kernel - 1) // 2,
groups=inp,
bias_attr=False),
nn.BatchNorm2D(num_features=inp, epsilon=1e-05, momentum=0.1),
nn.ReLU(),
nn.Conv2D(inp, oup, 1, 1, 0, bias_attr=False),
nn.BatchNorm2D(num_features=oup, epsilon=1e-05, momentum=0.1),
nn.ReLU(),
)
def forward(self, x):
return self.conv(x)
class ResidualBlock(nn.Layer):
def __init__(self, inp, oup, stride=1):
super(ResidualBlock, self).__init__()
self.block = nn.Sequential(
ConvDw(inp, oup, 3, stride=stride),
nn.Conv2D(
in_channels=oup,
out_channels=oup,
kernel_size=3,
stride=1,
padding=1,
groups=oup,
bias_attr=False),
nn.BatchNorm2D(num_features=oup, epsilon=1e-05, momentum=0.1),
nn.ReLU(),
nn.Conv2D(
in_channels=oup,
out_channels=oup,
kernel_size=1,
stride=1,
padding=0,
bias_attr=False),
nn.BatchNorm2D(num_features=oup, epsilon=1e-05, momentum=0.1),
)
if inp == oup:
self.residual = None
else:
self.residual = nn.Sequential(
nn.Conv2D(
in_channels=inp,
out_channels=oup,
kernel_size=1,
stride=1,
padding=0,
bias_attr=False),
nn.BatchNorm2D(num_features=oup, epsilon=1e-05, momentum=0.1),
)
self.relu = nn.ReLU()
def forward(self, x):
residual = x
out = self.block(x)
if self.residual is not None:
residual = self.residual(x)
out += residual
out = self.relu(out)
return out
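# Illustrative shape check (values assumed): when inp != oup the skip path is
# projected by a 1x1 conv so the residual addition stays well-defined.
block = ResidualBlock(32, 64)
y = block(paddle.rand([1, 32, 56, 56]))  # -> [1, 64, 56, 56]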

@@ -0,0 +1,226 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlers.models.ppseg.cvlibs import manager, param_init
from paddlers.models.ppseg.models import layers
from paddlers.models.ppseg.utils import utils
__all__ = ['PPHumanSegLite']
@manager.MODELS.add_component
class PPHumanSegLite(nn.Layer):
"A self-developed ultra lightweight model from paddlers.models.ppseg, is suitable for real-time scene segmentation on web or mobile terminals."
def __init__(self, num_classes, pretrained=None, align_corners=False):
super().__init__()
self.pretrained = pretrained
self.num_classes = num_classes
self.align_corners = align_corners
self.conv_bn0 = _ConvBNReLU(3, 36, 3, 2, 1)
self.conv_bn1 = _ConvBNReLU(36, 18, 1, 1, 0)
self.block1 = nn.Sequential(
InvertedResidual(36, stride=2, out_channels=72),
InvertedResidual(72, stride=1), InvertedResidual(72, stride=1),
InvertedResidual(72, stride=1))
self.block2 = nn.Sequential(
InvertedResidual(72, stride=2), InvertedResidual(144, stride=1),
InvertedResidual(144, stride=1), InvertedResidual(144, stride=1),
InvertedResidual(144, stride=1), InvertedResidual(144, stride=1),
InvertedResidual(144, stride=1), InvertedResidual(144, stride=1))
self.depthwise_separable0 = _SeparableConvBNReLU(144, 64, 3, stride=1)
self.depthwise_separable1 = _SeparableConvBNReLU(82, 64, 3, stride=1)
self.depthwise_separable2 = _SeparableConvBNReLU(
64, self.num_classes, 3, stride=1)
self.init_weight()
def forward(self, x):
# Encoder
input_shape = paddle.shape(x)[2:]
x = self.conv_bn0(x) # 1/2
shortcut = self.conv_bn1(x) # shortcut
x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1) # 1/4
x = self.block1(x) # 1/8
x = self.block2(x) # 1/16
# Decoder
x = self.depthwise_separable0(x)
shortcut_shape = paddle.shape(shortcut)[2:]
x = F.interpolate(
x,
shortcut_shape,
mode='bilinear',
align_corners=self.align_corners)
x = paddle.concat(x=[shortcut, x], axis=1)
x = self.depthwise_separable1(x)
logit = self.depthwise_separable2(x)
logit = F.interpolate(
logit,
input_shape,
mode='bilinear',
align_corners=self.align_corners)
return [logit]
def init_weight(self):
for layer in self.sublayers():
if isinstance(layer, nn.Conv2D):
param_init.normal_init(layer.weight, std=0.001)
elif isinstance(layer, (nn.BatchNorm, nn.SyncBatchNorm)):
param_init.constant_init(layer.weight, value=1.0)
param_init.constant_init(layer.bias, value=0.0)
if self.pretrained is not None:
utils.load_pretrained_model(self, self.pretrained)
class _ConvBNReLU(nn.Layer):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride,
padding,
groups=1,
**kwargs):
super().__init__()
weight_attr = paddle.ParamAttr(
learning_rate=1, initializer=nn.initializer.KaimingUniform())
self._conv = nn.Conv2D(
in_channels,
out_channels,
kernel_size,
padding=padding,
stride=stride,
groups=groups,
weight_attr=weight_attr,
bias_attr=False,
**kwargs)
self._batch_norm = layers.SyncBatchNorm(out_channels)
def forward(self, x):
x = self._conv(x)
x = self._batch_norm(x)
x = F.relu(x)
return x
class _ConvBN(nn.Layer):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride,
padding,
groups=1,
**kwargs):
super().__init__()
weight_attr = paddle.ParamAttr(
learning_rate=1, initializer=nn.initializer.KaimingUniform())
self._conv = nn.Conv2D(
in_channels,
out_channels,
kernel_size,
padding=padding,
stride=stride,
groups=groups,
weight_attr=weight_attr,
bias_attr=False,
**kwargs)
self._batch_norm = layers.SyncBatchNorm(out_channels)
def forward(self, x):
x = self._conv(x)
x = self._batch_norm(x)
return x
class _SeparableConvBNReLU(nn.Layer):
def __init__(self, in_channels, out_channels, kernel_size, **kwargs):
super().__init__()
self.depthwise_conv = _ConvBN(
in_channels,
out_channels=in_channels,
kernel_size=kernel_size,
padding=int(kernel_size / 2),
groups=in_channels,
**kwargs)
self.pointwise_conv = _ConvBNReLU(
in_channels,
out_channels,
kernel_size=1,
groups=1,
stride=1,
padding=0)
def forward(self, x):
x = self.depthwise_conv(x)
x = self.pointwise_conv(x)
return x
class InvertedResidual(nn.Layer):
def __init__(self, input_channels, stride, out_channels=None):
super().__init__()
if stride == 1:
branch_channel = int(input_channels / 2)
else:
branch_channel = input_channels
if out_channels is None:
self.in_channels = int(branch_channel)
else:
self.in_channels = int(out_channels / 2)
self._depthwise_separable_0 = _SeparableConvBNReLU(
input_channels, self.in_channels, 3, stride=stride)
self._conv = _ConvBNReLU(
branch_channel, self.in_channels, 1, stride=1, padding=0)
self._depthwise_separable_1 = _SeparableConvBNReLU(
self.in_channels, self.in_channels, 3, stride=stride)
self.stride = stride
def forward(self, input):
if self.stride == 1:
shortcut, branch = paddle.split(x=input, num_or_sections=2, axis=1)
else:
branch = input
shortcut = self._depthwise_separable_0(input)
branch_1x1 = self._conv(branch)
branch_dw1x1 = self._depthwise_separable_1(branch_1x1)
output = paddle.concat(x=[shortcut, branch_dw1x1], axis=1)
# channel shuffle
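        # Worked example (channel counts assumed): with 2 * in_channels = 8
        # maps [s0 s1 s2 s3 | b0 b1 b2 b3], reshaping to (N, 2, C/2, H, W),
        # swapping the two group axes and flattening back interleaves the
        # shortcut and branch channels as [s0 b0 s1 b1 s2 b2 s3 b3].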
out_shape = paddle.shape(output)
h, w = out_shape[2], out_shape[3]
output = paddle.reshape(x=output, shape=[0, 2, self.in_channels, h, w])
output = paddle.transpose(x=output, perm=[0, 2, 1, 3, 4])
output = paddle.reshape(x=output, shape=[0, 2 * self.in_channels, h, w])
return output
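# Illustrative end-to-end sketch (input size assumed):
model = PPHumanSegLite(num_classes=2)
logit = model(paddle.rand([1, 3, 192, 192]))[0]  # -> [1, 2, 192, 192]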
