From 691e5c438dbb004e260ae1644f02465a04e1de48 Mon Sep 17 00:00:00 2001
From: chulutao
Date: Wed, 2 Mar 2022 19:55:59 +0800
Subject: [PATCH] add segmentation task. optimize directory structure

---
 docs/README.md | 1 +
 paddlers/datasets/__init__.py | 1 +
 paddlers/datasets/seg_dataset.py | 91 ++
 paddlers/models/ppseg/__init__.py | 17 +
 paddlers/models/ppseg/core/__init__.py | 20 +
 paddlers/models/ppseg/core/infer.py | 309 ++++
 paddlers/models/ppseg/core/predict.py | 150 ++
 paddlers/models/ppseg/core/train.py | 326 +++++
 paddlers/models/ppseg/core/val.py | 199 +++
 paddlers/models/ppseg/cvlibs/__init__.py | 17 +
 paddlers/models/ppseg/cvlibs/callbacks.py | 279 ++++
 paddlers/models/ppseg/cvlibs/config.py | 404 ++++++
 paddlers/models/ppseg/cvlibs/manager.py | 149 ++
 paddlers/models/ppseg/cvlibs/param_init.py | 120 ++
 paddlers/models/ppseg/datasets/__init__.py | 29 +
 paddlers/models/ppseg/datasets/ade.py | 111 ++
 paddlers/models/ppseg/datasets/chase_db1.py | 98 ++
 paddlers/models/ppseg/datasets/cityscapes.py | 87 ++
 paddlers/models/ppseg/datasets/cocostuff.py | 82 ++
 paddlers/models/ppseg/datasets/dataset.py | 162 +++
 paddlers/models/ppseg/datasets/drive.py | 96 ++
 paddlers/models/ppseg/datasets/eg1800.py | 136 ++
 paddlers/models/ppseg/datasets/hrf.py | 95 ++
 .../mini_deep_globe_road_extraction.py | 95 ++
 .../models/ppseg/datasets/optic_disc_seg.py | 97 ++
 .../models/ppseg/datasets/pascal_context.py | 82 ++
 .../models/ppseg/datasets/pp_humanseg14k.py | 82 ++
 paddlers/models/ppseg/datasets/stare.py | 95 ++
 paddlers/models/ppseg/datasets/supervisely.py | 135 ++
 paddlers/models/ppseg/datasets/voc.py | 112 ++
 paddlers/models/ppseg/models/__init__.py | 57 +
 paddlers/models/ppseg/models/ann.py | 434 ++++++
 .../models/ppseg/models/attention_unet.py | 178 +++
 .../models/ppseg/models/backbones/__init__.py | 23 +
 .../models/ppseg/models/backbones/hrnet.py | 837 +++++++++
 .../ppseg/models/backbones/mix_transformer.py | 588 ++++++++
 .../ppseg/models/backbones/mobilenetv2.py | 168 +++
 .../ppseg/models/backbones/mobilenetv3.py | 364 +++++
 .../ppseg/models/backbones/resnet_vd.py | 398 ++++++
 .../models/ppseg/models/backbones/stdcnet.py | 281 ++++
 .../models/backbones/swin_transformer.py | 792 ++++++++++
 .../models/backbones/transformer_utils.py | 83 ++
 .../models/backbones/vision_transformer.py | 410 ++++++
 .../models/backbones/xception_deeplab.py | 415 ++++++
 paddlers/models/ppseg/models/bisenet.py | 307 ++++
 paddlers/models/ppseg/models/bisenetv1.py | 259 ++++
 paddlers/models/ppseg/models/danet.py | 218 +++
 .../models/ppseg/models/decoupled_segnet.py | 228 +++
 paddlers/models/ppseg/models/deeplab.py | 308 ++++
 paddlers/models/ppseg/models/dmnet.py | 149 ++
 paddlers/models/ppseg/models/dnlnet.py | 226 +++
 paddlers/models/ppseg/models/emanet.py | 215 +++
 paddlers/models/ppseg/models/encnet.py | 224 +++
 paddlers/models/ppseg/models/enet.py | 622 ++++++++
 paddlers/models/ppseg/models/espnet.py | 477 +++++++
 paddlers/models/ppseg/models/espnetv1.py | 308 ++++
 paddlers/models/ppseg/models/fast_scnn.py | 316 ++++
 paddlers/models/ppseg/models/fastfcn.py | 240 ++++
 paddlers/models/ppseg/models/fcn.py | 145 ++
 paddlers/models/ppseg/models/gcnet.py | 222 +++
 paddlers/models/ppseg/models/ginet.py | 291 ++++
 paddlers/models/ppseg/models/gscnn.py | 353 +++++
 paddlers/models/ppseg/models/hardnet.py | 308 ++++
 .../models/ppseg/models/hrnet_contrast.py | 127 ++
 paddlers/models/ppseg/models/isanet.py | 197 +++
 .../models/ppseg/models/layers/__init__.py | 20 +
 .../models/ppseg/models/layers/activation.py | 73 +
 .../models/ppseg/models/layers/attention.py | 146 ++
 .../models/ppseg/models/layers/layer_libs.py | 302 ++++
 .../models/ppseg/models/layers/nonlocal2d.py | 154 ++
 .../ppseg/models/layers/pyramid_pool.py | 192 +++
 .../ppseg/models/layers/wrap_functions.py | 83 ++
 .../models/ppseg/models/losses/__init__.py | 36 +
 .../losses/binary_cross_entropy_loss.py | 174 +++
 .../losses/bootstrapped_cross_entropy.py | 73 +
 .../ppseg/models/losses/cross_entropy_loss.py | 218 +++
 .../decoupledsegnet_relax_boundary_loss.py | 129 ++
 .../models/losses/detail_aggregate_loss.py | 116 ++
 .../models/ppseg/models/losses/dice_loss.py | 56 +
 .../models/losses/edge_attention_loss.py | 78 +
 .../models/ppseg/models/losses/focal_loss.py | 60 +
 .../models/losses/gscnn_dual_task_loss.py | 141 ++
 .../models/ppseg/models/losses/kl_loss.py | 80 ++
 .../models/ppseg/models/losses/l1_loss.py | 76 +
 .../models/ppseg/models/losses/lovasz_loss.py | 222 +++
 .../models/losses/mean_square_error_loss.py | 65 +
 .../models/ppseg/models/losses/mixed_loss.py | 57 +
 .../models/losses/ohem_cross_entropy_loss.py | 99 ++
 .../models/losses/ohem_edge_attention_loss.py | 114 ++
 .../pixel_contrast_cross_entropy_loss.py | 199 +++
 .../models/losses/point_cross_entropy_loss.py | 160 +++
 .../models/ppseg/models/losses/rmi_loss.py | 256 ++++
 .../losses/semantic_connectivity_loss.py | 175 +++
 .../semantic_encode_cross_entropy_loss.py | 47 +
 .../models/ppseg/models/mla_transformer.py | 241 ++++
 paddlers/models/ppseg/models/ocrnet.py | 246 ++++
 paddlers/models/ppseg/models/pfpnnet.py | 201 +++
 paddlers/models/ppseg/models/pointrend.py | 832 +++++++++++
 paddlers/models/ppseg/models/portraitnet.py | 226 +++
 .../models/ppseg/models/pphumanseg_lite.py | 226 +++
 paddlers/models/ppseg/models/pspnet.py | 147 ++
 paddlers/models/ppseg/models/segformer.py | 177 +++
 paddlers/models/ppseg/models/segmenter.py | 256 ++++
 paddlers/models/ppseg/models/segnet.py | 142 ++
 paddlers/models/ppseg/models/setr.py | 440 ++++++
 paddlers/models/ppseg/models/sfnet.py | 233 +++
 paddlers/models/ppseg/models/stdcseg.py | 216 +++
 paddlers/models/ppseg/models/u2net.py | 574 ++++++++
 paddlers/models/ppseg/models/unet.py | 155 ++
 paddlers/models/ppseg/models/unet_3plus.py | 307 ++++
 paddlers/models/ppseg/models/unet_plusplus.py | 236 +++
 paddlers/models/ppseg/transforms/__init__.py | 16 +
 .../models/ppseg/transforms/functional.py | 177 +++
 .../models/ppseg/transforms/transforms.py | 1271 +++++++++++++++++
 paddlers/models/ppseg/utils/__init__.py | 23 +
 paddlers/models/ppseg/utils/config_check.py | 59 +
 paddlers/models/ppseg/utils/download.py | 163 +++
 paddlers/models/ppseg/utils/ema.py | 102 ++
 paddlers/models/ppseg/utils/logger.py | 49 +
 paddlers/models/ppseg/utils/metrics.py | 210 +++
 paddlers/models/ppseg/utils/op_flops_funs.py | 22 +
 paddlers/models/ppseg/utils/progbar.py | 209 +++
 paddlers/models/ppseg/utils/timer.py | 53 +
 paddlers/models/ppseg/utils/train_profiler.py | 112 ++
 paddlers/models/ppseg/utils/utils.py | 175 +++
 paddlers/models/ppseg/utils/visualize.py | 105 ++
 paddlers/requirements.txt | 0
 paddlers/tasks/__init__.py | 2 +
 paddlers/tasks/base.py | 2 +-
 paddlers/tasks/segmenter.py | 768 ++++++++++
 paddlers/transforms/operators.py | 2 +-
 requirements.txt | 16 +
 .../tutorials => tutorials}/train/ppyolo.py | 0
 .../deeplabv3p_resnet50_vd.py | 58 +
 134 files changed, 26493 insertions(+), 2 deletions(-)
 create mode 100644 paddlers/datasets/seg_dataset.py
 create mode 100644 paddlers/models/ppseg/core/__init__.py
 create mode 100644 paddlers/models/ppseg/core/infer.py
 create mode 100644 paddlers/models/ppseg/core/predict.py
 create mode 100644 paddlers/models/ppseg/core/train.py
 create mode 100644 paddlers/models/ppseg/core/val.py
 create mode 100644 paddlers/models/ppseg/cvlibs/__init__.py
 create mode 100644 paddlers/models/ppseg/cvlibs/callbacks.py
 create mode 100644 paddlers/models/ppseg/cvlibs/config.py
 create mode 100644 paddlers/models/ppseg/cvlibs/manager.py
 create mode 100644 paddlers/models/ppseg/cvlibs/param_init.py
 create mode 100644 paddlers/models/ppseg/datasets/__init__.py
 create mode 100644 paddlers/models/ppseg/datasets/ade.py
 create mode 100644 paddlers/models/ppseg/datasets/chase_db1.py
 create mode 100644 paddlers/models/ppseg/datasets/cityscapes.py
 create mode 100644 paddlers/models/ppseg/datasets/cocostuff.py
 create mode 100644 paddlers/models/ppseg/datasets/dataset.py
 create mode 100644 paddlers/models/ppseg/datasets/drive.py
 create mode 100644 paddlers/models/ppseg/datasets/eg1800.py
 create mode 100644 paddlers/models/ppseg/datasets/hrf.py
 create mode 100644 paddlers/models/ppseg/datasets/mini_deep_globe_road_extraction.py
 create mode 100644 paddlers/models/ppseg/datasets/optic_disc_seg.py
 create mode 100644 paddlers/models/ppseg/datasets/pascal_context.py
 create mode 100644 paddlers/models/ppseg/datasets/pp_humanseg14k.py
 create mode 100644 paddlers/models/ppseg/datasets/stare.py
 create mode 100644 paddlers/models/ppseg/datasets/supervisely.py
 create mode 100644 paddlers/models/ppseg/datasets/voc.py
 create mode 100644 paddlers/models/ppseg/models/__init__.py
 create mode 100644 paddlers/models/ppseg/models/ann.py
 create mode 100644 paddlers/models/ppseg/models/attention_unet.py
 create mode 100644 paddlers/models/ppseg/models/backbones/__init__.py
 create mode 100644 paddlers/models/ppseg/models/backbones/hrnet.py
 create mode 100644 paddlers/models/ppseg/models/backbones/mix_transformer.py
 create mode 100644 paddlers/models/ppseg/models/backbones/mobilenetv2.py
 create mode 100644 paddlers/models/ppseg/models/backbones/mobilenetv3.py
 create mode 100644 paddlers/models/ppseg/models/backbones/resnet_vd.py
 create mode 100644 paddlers/models/ppseg/models/backbones/stdcnet.py
 create mode 100644 paddlers/models/ppseg/models/backbones/swin_transformer.py
 create mode 100644 paddlers/models/ppseg/models/backbones/transformer_utils.py
 create mode 100644 paddlers/models/ppseg/models/backbones/vision_transformer.py
 create mode 100644 paddlers/models/ppseg/models/backbones/xception_deeplab.py
 create mode 100644 paddlers/models/ppseg/models/bisenet.py
 create mode 100644 paddlers/models/ppseg/models/bisenetv1.py
 create mode 100644 paddlers/models/ppseg/models/danet.py
 create mode 100644 paddlers/models/ppseg/models/decoupled_segnet.py
 create mode 100644 paddlers/models/ppseg/models/deeplab.py
 create mode 100644 paddlers/models/ppseg/models/dmnet.py
 create mode 100644 paddlers/models/ppseg/models/dnlnet.py
 create mode 100644 paddlers/models/ppseg/models/emanet.py
 create mode 100644 paddlers/models/ppseg/models/encnet.py
 create mode 100644 paddlers/models/ppseg/models/enet.py
 create mode 100644 paddlers/models/ppseg/models/espnet.py
 create mode 100644 paddlers/models/ppseg/models/espnetv1.py
 create mode 100644 paddlers/models/ppseg/models/fast_scnn.py
 create mode 100644 paddlers/models/ppseg/models/fastfcn.py
 create mode 100644 paddlers/models/ppseg/models/fcn.py
 create mode 100644 paddlers/models/ppseg/models/gcnet.py
 create mode 100644 paddlers/models/ppseg/models/ginet.py
 create mode 100644 paddlers/models/ppseg/models/gscnn.py
 create mode 100644 paddlers/models/ppseg/models/hardnet.py
 create mode 100644 paddlers/models/ppseg/models/hrnet_contrast.py
 create mode 100644 paddlers/models/ppseg/models/isanet.py
 create mode 100644 paddlers/models/ppseg/models/layers/__init__.py
 create mode 100644 paddlers/models/ppseg/models/layers/activation.py
 create mode 100644 paddlers/models/ppseg/models/layers/attention.py
 create mode 100644 paddlers/models/ppseg/models/layers/layer_libs.py
 create mode 100644 paddlers/models/ppseg/models/layers/nonlocal2d.py
 create mode 100644 paddlers/models/ppseg/models/layers/pyramid_pool.py
 create mode 100644 paddlers/models/ppseg/models/layers/wrap_functions.py
 create mode 100644 paddlers/models/ppseg/models/losses/__init__.py
 create mode 100644 paddlers/models/ppseg/models/losses/binary_cross_entropy_loss.py
 create mode 100644 paddlers/models/ppseg/models/losses/bootstrapped_cross_entropy.py
 create mode 100644 paddlers/models/ppseg/models/losses/cross_entropy_loss.py
 create mode 100644 paddlers/models/ppseg/models/losses/decoupledsegnet_relax_boundary_loss.py
 create mode 100644 paddlers/models/ppseg/models/losses/detail_aggregate_loss.py
 create mode 100644 paddlers/models/ppseg/models/losses/dice_loss.py
 create mode 100644 paddlers/models/ppseg/models/losses/edge_attention_loss.py
 create mode 100755 paddlers/models/ppseg/models/losses/focal_loss.py
 create mode 100644 paddlers/models/ppseg/models/losses/gscnn_dual_task_loss.py
 create mode 100755 paddlers/models/ppseg/models/losses/kl_loss.py
 create mode 100644 paddlers/models/ppseg/models/losses/l1_loss.py
 create mode 100644 paddlers/models/ppseg/models/losses/lovasz_loss.py
 create mode 100644 paddlers/models/ppseg/models/losses/mean_square_error_loss.py
 create mode 100644 paddlers/models/ppseg/models/losses/mixed_loss.py
 create mode 100644 paddlers/models/ppseg/models/losses/ohem_cross_entropy_loss.py
 create mode 100644 paddlers/models/ppseg/models/losses/ohem_edge_attention_loss.py
 create mode 100644 paddlers/models/ppseg/models/losses/pixel_contrast_cross_entropy_loss.py
 create mode 100644 paddlers/models/ppseg/models/losses/point_cross_entropy_loss.py
 create mode 100644 paddlers/models/ppseg/models/losses/rmi_loss.py
 create mode 100644 paddlers/models/ppseg/models/losses/semantic_connectivity_loss.py
 create mode 100644 paddlers/models/ppseg/models/losses/semantic_encode_cross_entropy_loss.py
 create mode 100644 paddlers/models/ppseg/models/mla_transformer.py
 create mode 100644 paddlers/models/ppseg/models/ocrnet.py
 create mode 100644 paddlers/models/ppseg/models/pfpnnet.py
 create mode 100644 paddlers/models/ppseg/models/pointrend.py
 create mode 100644 paddlers/models/ppseg/models/portraitnet.py
 create mode 100644 paddlers/models/ppseg/models/pphumanseg_lite.py
 create mode 100644 paddlers/models/ppseg/models/pspnet.py
 create mode 100644 paddlers/models/ppseg/models/segformer.py
 create mode 100644 paddlers/models/ppseg/models/segmenter.py
 create mode 100644 paddlers/models/ppseg/models/segnet.py
 create mode 100644 paddlers/models/ppseg/models/setr.py
 create mode 100644 paddlers/models/ppseg/models/sfnet.py
 create mode 100644 paddlers/models/ppseg/models/stdcseg.py
 create mode 100644 paddlers/models/ppseg/models/u2net.py
 create mode 100644 paddlers/models/ppseg/models/unet.py
 create mode 100644 paddlers/models/ppseg/models/unet_3plus.py
 create mode 100644 paddlers/models/ppseg/models/unet_plusplus.py
 create mode 100644 paddlers/models/ppseg/transforms/__init__.py
 create mode 100644 paddlers/models/ppseg/transforms/functional.py
 create mode 100644 paddlers/models/ppseg/transforms/transforms.py
 create mode 100644 paddlers/models/ppseg/utils/__init__.py
 create mode 100644 paddlers/models/ppseg/utils/config_check.py
 create mode 100644 paddlers/models/ppseg/utils/download.py
 create mode 100644 paddlers/models/ppseg/utils/ema.py
 create mode 100644 paddlers/models/ppseg/utils/logger.py
 create mode 100644 paddlers/models/ppseg/utils/metrics.py
 create mode 100644 paddlers/models/ppseg/utils/op_flops_funs.py
 create mode 100644 paddlers/models/ppseg/utils/progbar.py
 create mode 100644 paddlers/models/ppseg/utils/timer.py
 create mode 100644 paddlers/models/ppseg/utils/train_profiler.py
 create mode 100644 paddlers/models/ppseg/utils/utils.py
 create mode 100644 paddlers/models/ppseg/utils/visualize.py
 delete mode 100644 paddlers/requirements.txt
 create mode 100644 paddlers/tasks/segmenter.py
 create mode 100644 requirements.txt
 rename {paddlers/tutorials => tutorials}/train/ppyolo.py (100%)
 create mode 100644 tutorials/train/semantic_segmentation/deeplabv3p_resnet50_vd.py

diff --git a/docs/README.md b/docs/README.md
index e69de29..40174d8 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -0,0 +1 @@
+PaddleSeg commit fec42fd869b6f796c74cd510671595e3512bc8e9
\ No newline at end of file
diff --git a/paddlers/datasets/__init__.py b/paddlers/datasets/__init__.py
index 4eff485..4e9e35e 100644
--- a/paddlers/datasets/__init__.py
+++ b/paddlers/datasets/__init__.py
@@ -1 +1,2 @@
 from .voc import VOCDetection
+from .seg_dataset import SegDataset
diff --git a/paddlers/datasets/seg_dataset.py b/paddlers/datasets/seg_dataset.py
new file mode 100644
index 0000000..61f0b46
--- /dev/null
+++ b/paddlers/datasets/seg_dataset.py
@@ -0,0 +1,91 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os.path as osp
+import copy
+
+from paddle.io import Dataset
+from paddlers.utils import logging, get_num_workers, get_encoding, path_normalization, is_pic
+
+
+class SegDataset(Dataset):
+    """Dataset reader for semantic segmentation tasks, which loads samples and applies the corresponding preprocessing.
+
+    Args:
+        data_dir (str): Root directory of the dataset.
+        file_list (str): Path of the file list describing the image files and the corresponding
+            annotation files (each line holds paths relative to data_dir).
+        label_list (str): Path of the file describing the classes contained in the dataset. Defaults to None.
+        transforms (paddlers.transforms): Preprocessing/augmentation operators applied to each sample.
+        num_workers (int|str): Number of threads or processes used for preprocessing. Defaults to 'auto'.
+        shuffle (bool): Whether to shuffle the samples. Defaults to False.
+    """
+
+    def __init__(self,
+                 data_dir,
+                 file_list,
+                 label_list=None,
+                 transforms=None,
+                 num_workers='auto',
+                 shuffle=False):
+        super(SegDataset, self).__init__()
+        self.transforms = copy.deepcopy(transforms)
+        # TODO batch padding
+        self.batch_transforms = None
+        self.num_workers = get_num_workers(num_workers)
+        self.shuffle = shuffle
+        self.file_list = list()
+        self.labels = list()
+
+        # TODO: when label_list is not None, direct users to dataset analysis to generate it.
+        # Do not parse the label file here.
+        if label_list is not None:
+            with open(label_list, encoding=get_encoding(label_list)) as f:
+                for line in f:
+                    item = line.strip()
+                    self.labels.append(item)
+        with open(file_list, encoding=get_encoding(file_list)) as f:
+            for line in f:
+                items = line.strip().split()
+                if len(items) > 2:
+                    raise Exception(
+                        "A space is used as the delimiter between the image and label paths, " \
+                        "so neither path may contain a space, but line[{}] of " \
+                        "file_list[{}] has a space in the image or label path.".format(line, file_list))
+                items[0] = path_normalization(items[0])
+                items[1] = path_normalization(items[1])
+                if not is_pic(items[0]) or not is_pic(items[1]):
+                    continue
+                full_path_im = osp.join(data_dir, items[0])
+                full_path_label = osp.join(data_dir, items[1])
+                if not osp.exists(full_path_im):
+                    raise IOError('Image file {} does not exist!'.format(
+                        full_path_im))
+                if not osp.exists(full_path_label):
+                    raise IOError('Label file {} does not exist!'.format(
+                        full_path_label))
+                self.file_list.append({
+                    'image': full_path_im,
+                    'mask': full_path_label
+                })
+        self.num_samples = len(self.file_list)
+        logging.info("{} samples in file {}".format(
+            len(self.file_list), file_list))
+
+    def __getitem__(self, idx):
+        sample = copy.deepcopy(self.file_list[idx])
+        outputs = self.transforms(sample)
+        return outputs
+
+    def __len__(self):
+        return len(self.file_list)
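
For orientation, a minimal usage sketch of the new SegDataset follows. The dataset paths, the file-list contents, and the Resize/Normalize operators from paddlers.transforms are assumptions for illustration only; each line of the file list holds an "image_path label_path" pair relative to data_dir:

    import paddlers.transforms as T
    from paddlers.datasets import SegDataset

    # Assumed transform pipeline; substitute the operators your task needs.
    train_transforms = T.Compose([
        T.Resize(target_size=512),
        T.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ])

    train_dataset = SegDataset(
        data_dir='dataset',                   # hypothetical dataset root
        file_list='dataset/train_list.txt',   # lines like "images/1.jpg masks/1.png"
        label_list='dataset/labels.txt',      # one class name per line
        transforms=train_transforms,
        shuffle=True)
    print(len(train_dataset))                 # number of valid samples
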
diff --git a/paddlers/models/ppseg/__init__.py b/paddlers/models/ppseg/__init__.py
index e69de29..f5d3451 100644
--- a/paddlers/models/ppseg/__init__.py
+++ b/paddlers/models/ppseg/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import models, datasets, transforms
+
+__version__ = 'develop'
diff --git a/paddlers/models/ppseg/core/__init__.py b/paddlers/models/ppseg/core/__init__.py
new file mode 100644
index 0000000..3518906
--- /dev/null
+++ b/paddlers/models/ppseg/core/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .train import train
+from .val import evaluate
+from .predict import predict
+from . import infer
+
+__all__ = ['train', 'evaluate', 'predict']
diff --git a/paddlers/models/ppseg/core/infer.py b/paddlers/models/ppseg/core/infer.py
new file mode 100644
index 0000000..a66288d
--- /dev/null
+++ b/paddlers/models/ppseg/core/infer.py
@@ -0,0 +1,309 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections.abc
+from itertools import combinations
+
+import numpy as np
+import cv2
+import paddle
+import paddle.nn.functional as F
+
+
+def get_reverse_list(ori_shape, transforms):
+    """
+    Get the reverse list of transforms.
+
+    Args:
+        ori_shape (list): Origin shape of image.
+        transforms (list): List of transforms.
+
+    Returns:
+        list: List of tuples in one of two formats:
+            ('resize', (h, w)): the image shape before resize;
+            ('padding', (h, w)): the image shape before padding.
+    """
+    reverse_list = []
+    h, w = ori_shape[0], ori_shape[1]
+    for op in transforms:
+        if op.__class__.__name__ in ['Resize']:
+            reverse_list.append(('resize', (h, w)))
+            h, w = op.target_size[0], op.target_size[1]
+        if op.__class__.__name__ in ['ResizeByLong']:
+            reverse_list.append(('resize', (h, w)))
+            long_edge = max(h, w)
+            short_edge = min(h, w)
+            short_edge = int(round(short_edge * op.long_size / long_edge))
+            long_edge = op.long_size
+            if h > w:
+                h = long_edge
+                w = short_edge
+            else:
+                w = long_edge
+                h = short_edge
+        if op.__class__.__name__ in ['ResizeByShort']:
+            reverse_list.append(('resize', (h, w)))
+            long_edge = max(h, w)
+            short_edge = min(h, w)
+            long_edge = int(round(long_edge * op.short_size / short_edge))
+            short_edge = op.short_size
+            if h > w:
+                h = long_edge
+                w = short_edge
+            else:
+                w = long_edge
+                h = short_edge
+        if op.__class__.__name__ in ['Padding']:
+            reverse_list.append(('padding', (h, w)))
+            w, h = op.target_size[0], op.target_size[1]
+        if op.__class__.__name__ in ['PaddingByAspectRatio']:
+            reverse_list.append(('padding', (h, w)))
+            ratio = w / h
+            if ratio == op.aspect_ratio:
+                pass
+            elif ratio > op.aspect_ratio:
+                h = int(w / op.aspect_ratio)
+            else:
+                w = int(h * op.aspect_ratio)
+        if op.__class__.__name__ in ['LimitLong']:
+            long_edge = max(h, w)
+            short_edge = min(h, w)
+            if ((op.max_long is not None) and (long_edge > op.max_long)):
+                reverse_list.append(('resize', (h, w)))
+                long_edge = op.max_long
+                short_edge = int(round(short_edge * op.max_long / long_edge))
+            elif ((op.min_long is not None) and (long_edge < op.min_long)):
+                reverse_list.append(('resize', (h, w)))
+                long_edge = op.min_long
+                short_edge = int(round(short_edge * op.min_long / long_edge))
+            if h > w:
+                h = long_edge
+                w = short_edge
+            else:
+                w = long_edge
+                h = short_edge
+    return reverse_list
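
A quick illustration of the contract above, using a stand-in op (a hypothetical class written only for this demo; dispatch is on __class__.__name__ and the attributes each branch reads):

    class Resize:                        # stand-in for the real transform
        def __init__(self, target_size):
            self.target_size = target_size

    # A 600x800 (h, w) image that was resized to 512x512:
    ops = [Resize(target_size=(512, 512))]
    print(get_reverse_list((600, 800), ops))   # [('resize', (600, 800))]
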
+ """ + reverse_list = [] + h, w = ori_shape[0], ori_shape[1] + for op in transforms: + if op.__class__.__name__ in ['Resize']: + reverse_list.append(('resize', (h, w))) + h, w = op.target_size[0], op.target_size[1] + if op.__class__.__name__ in ['ResizeByLong']: + reverse_list.append(('resize', (h, w))) + long_edge = max(h, w) + short_edge = min(h, w) + short_edge = int(round(short_edge * op.long_size / long_edge)) + long_edge = op.long_size + if h > w: + h = long_edge + w = short_edge + else: + w = long_edge + h = short_edge + if op.__class__.__name__ in ['ResizeByShort']: + reverse_list.append(('resize', (h, w))) + long_edge = max(h, w) + short_edge = min(h, w) + long_edge = int(round(long_edge * op.short_size / short_edge)) + short_edge = op.short_size + if h > w: + h = long_edge + w = short_edge + else: + w = long_edge + h = short_edge + if op.__class__.__name__ in ['Padding']: + reverse_list.append(('padding', (h, w))) + w, h = op.target_size[0], op.target_size[1] + if op.__class__.__name__ in ['PaddingByAspectRatio']: + reverse_list.append(('padding', (h, w))) + ratio = w / h + if ratio == op.aspect_ratio: + pass + elif ratio > op.aspect_ratio: + h = int(w / op.aspect_ratio) + else: + w = int(h * op.aspect_ratio) + if op.__class__.__name__ in ['LimitLong']: + long_edge = max(h, w) + short_edge = min(h, w) + if ((op.max_long is not None) and (long_edge > op.max_long)): + reverse_list.append(('resize', (h, w))) + long_edge = op.max_long + short_edge = int(round(short_edge * op.max_long / long_edge)) + elif ((op.min_long is not None) and (long_edge < op.min_long)): + reverse_list.append(('resize', (h, w))) + long_edge = op.min_long + short_edge = int(round(short_edge * op.min_long / long_edge)) + if h > w: + h = long_edge + w = short_edge + else: + w = long_edge + h = short_edge + return reverse_list + + +def reverse_transform(pred, ori_shape, transforms, mode='nearest'): + """recover pred to origin shape""" + reverse_list = get_reverse_list(ori_shape, transforms) + intTypeList = [paddle.int8, paddle.int16, paddle.int32, paddle.int64] + dtype = pred.dtype + for item in reverse_list[::-1]: + if item[0] == 'resize': + h, w = item[1][0], item[1][1] + if paddle.get_device() == 'cpu' and dtype in intTypeList: + pred = paddle.cast(pred, 'float32') + pred = F.interpolate(pred, (h, w), mode=mode) + pred = paddle.cast(pred, dtype) + else: + pred = F.interpolate(pred, (h, w), mode=mode) + elif item[0] == 'padding': + h, w = item[1][0], item[1][1] + pred = pred[:, :, 0:h, 0:w] + else: + raise Exception("Unexpected info '{}' in im_info".format(item[0])) + return pred + + +def flip_combination(flip_horizontal=False, flip_vertical=False): + """ + Get flip combination. + + Args: + flip_horizontal (bool): Whether to flip horizontally. Default: False. + flip_vertical (bool): Whether to flip vertically. Default: False. + + Returns: + list: List of tuple. The first element of tuple is whether to flip horizontally, + and the second is whether to flip vertically. + """ + + flip_comb = [(False, False)] + if flip_horizontal: + flip_comb.append((True, False)) + if flip_vertical: + flip_comb.append((False, True)) + if flip_horizontal: + flip_comb.append((True, True)) + return flip_comb + + +def tensor_flip(x, flip): + """Flip tensor according directions""" + if flip[0]: + x = x[:, :, :, ::-1] + if flip[1]: + x = x[:, :, ::-1, :] + return x + + +def slide_inference(model, im, crop_size, stride): + """ + Infer by sliding window. + + Args: + model (paddle.nn.Layer): model to get logits of image. 
+
+
+def slide_inference(model, im, crop_size, stride):
+    """
+    Infer by sliding window.
+
+    Args:
+        model (paddle.nn.Layer): model to get logits of image.
+        im (Tensor): the input image.
+        crop_size (tuple|list): The size of sliding window, (w, h).
+        stride (tuple|list): The size of stride, (w, h).
+
+    Return:
+        Tensor: The logit of input image.
+    """
+    h_im, w_im = im.shape[-2:]
+    w_crop, h_crop = crop_size
+    w_stride, h_stride = stride
+    # calculate the crop nums
+    rows = int(np.ceil(1.0 * (h_im - h_crop) / h_stride)) + 1
+    cols = int(np.ceil(1.0 * (w_im - w_crop) / w_stride)) + 1
+    # prevent negative sliding rounds when imgs after scaling << crop_size
+    rows = 1 if h_im <= h_crop else rows
+    cols = 1 if w_im <= w_crop else cols
+    # TODO 'Tensor' object does not support item assignment. If supported, use tensors for the calculation.
+    final_logit = None
+    count = np.zeros([1, 1, h_im, w_im])
+    for r in range(rows):
+        for c in range(cols):
+            h1 = r * h_stride
+            w1 = c * w_stride
+            h2 = min(h1 + h_crop, h_im)
+            w2 = min(w1 + w_crop, w_im)
+            h1 = max(h2 - h_crop, 0)
+            w1 = max(w2 - w_crop, 0)
+            im_crop = im[:, :, h1:h2, w1:w2]
+            logits = model(im_crop)
+            if not isinstance(logits, collections.abc.Sequence):
+                raise TypeError(
+                    "The type of logits must be one of collections.abc.Sequence, e.g. list, tuple. But received {}"
+                    .format(type(logits)))
+            logit = logits[0].numpy()
+            if final_logit is None:
+                final_logit = np.zeros([1, logit.shape[1], h_im, w_im])
+            final_logit[:, :, h1:h2, w1:w2] += logit[:, :, :h2 - h1, :w2 - w1]
+            count[:, :, h1:h2, w1:w2] += 1
+    if np.sum(count == 0) != 0:
+        raise RuntimeError(
+            'There are pixels not predicted. It is possible that the stride is greater than the crop size.'
+        )
+    final_logit = final_logit / count
+    final_logit = paddle.to_tensor(final_logit)
+    return final_logit
+
+
+def inference(model,
+              im,
+              ori_shape=None,
+              transforms=None,
+              is_slide=False,
+              stride=None,
+              crop_size=None):
+    """
+    Inference for image.
+
+    Args:
+        model (paddle.nn.Layer): model to get logits of image.
+        im (Tensor): the input image.
+        ori_shape (list): Origin shape of image.
+        transforms (list): Transforms for image.
+        is_slide (bool): Whether to infer by sliding window. Default: False.
+        crop_size (tuple|list): The size of sliding window, (w, h). It should be provided if is_slide is True.
+        stride (tuple|list): The size of stride, (w, h). It should be provided if is_slide is True.
+
+    Returns:
+        Tensor: If ori_shape is not None, a prediction with shape (1, 1, h, w) is returned.
+            If ori_shape is None, a logit with shape (1, num_classes, h, w) is returned.
+    """
+    if hasattr(model, 'data_format') and model.data_format == 'NHWC':
+        im = im.transpose((0, 2, 3, 1))
+    if not is_slide:
+        logits = model(im)
+        if not isinstance(logits, collections.abc.Sequence):
+            raise TypeError(
+                "The type of logits must be one of collections.abc.Sequence, e.g. list, tuple. But received {}"
+                .format(type(logits)))
+        logit = logits[0]
+    else:
+        logit = slide_inference(model, im, crop_size=crop_size, stride=stride)
+    if hasattr(model, 'data_format') and model.data_format == 'NHWC':
+        logit = logit.transpose((0, 3, 1, 2))
+    if ori_shape is not None:
+        logit = reverse_transform(logit, ori_shape, transforms, mode='bilinear')
+        pred = paddle.argmax(logit, axis=1, keepdim=True, dtype='int32')
+        return pred, logit
+    else:
+        return logit
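
A hedged usage sketch for sliding-window inference; the model is a placeholder, and any paddle.nn.Layer whose forward returns a list/tuple of logits fits the contract checked above:

    # Hypothetical call; crop_size and stride are (w, h).
    im = paddle.rand([1, 3, 1024, 1024])               # NCHW input
    logit = inference(model, im, is_slide=True,
                      crop_size=(512, 512), stride=(256, 256))
    # With ori_shape=None a logit of shape (1, num_classes, 1024, 1024) is
    # returned; pass ori_shape and transforms to also get the argmax
    # prediction mapped back to the original image size.
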
+
+
+def aug_inference(model,
+                  im,
+                  ori_shape,
+                  transforms,
+                  scales=1.0,
+                  flip_horizontal=False,
+                  flip_vertical=False,
+                  is_slide=False,
+                  stride=None,
+                  crop_size=None):
+    """
+    Infer with augmentation.
+
+    Args:
+        model (paddle.nn.Layer): model to get logits of image.
+        im (Tensor): the input image.
+        ori_shape (list): Origin shape of image.
+        transforms (list): Transforms for image.
+        scales (float|tuple|list): Scales for resize. Default: 1.
+        flip_horizontal (bool): Whether to flip horizontally. Default: False.
+        flip_vertical (bool): Whether to flip vertically. Default: False.
+        is_slide (bool): Whether to infer by sliding window. Default: False.
+        crop_size (tuple|list): The size of sliding window, (w, h). It should be provided if is_slide is True.
+        stride (tuple|list): The size of stride, (w, h). It should be provided if is_slide is True.
+
+    Returns:
+        Tensor: Prediction of image with shape (1, 1, h, w) is returned.
+    """
+    if isinstance(scales, float):
+        scales = [scales]
+    elif not isinstance(scales, (tuple, list)):
+        raise TypeError(
+            '`scales` expects float/tuple/list type, but received {}'.format(
+                type(scales)))
+    final_logit = 0
+    h_input, w_input = im.shape[-2], im.shape[-1]
+    flip_comb = flip_combination(flip_horizontal, flip_vertical)
+    for scale in scales:
+        h = int(h_input * scale + 0.5)
+        w = int(w_input * scale + 0.5)
+        im = F.interpolate(im, (h, w), mode='bilinear')
+        for flip in flip_comb:
+            im_flip = tensor_flip(im, flip)
+            logit = inference(
+                model,
+                im_flip,
+                is_slide=is_slide,
+                crop_size=crop_size,
+                stride=stride)
+            logit = tensor_flip(logit, flip)
+            logit = F.interpolate(logit, (h_input, w_input), mode='bilinear')
+
+            logit = F.softmax(logit, axis=1)
+            final_logit = final_logit + logit
+
+    final_logit = reverse_transform(
+        final_logit, ori_shape, transforms, mode='bilinear')
+    pred = paddle.argmax(final_logit, axis=1, keepdim=True, dtype='int32')
+
+    return pred, final_logit
diff --git a/paddlers/models/ppseg/core/predict.py b/paddlers/models/ppseg/core/predict.py
new file mode 100644
index 0000000..8680c09
--- /dev/null
+++ b/paddlers/models/ppseg/core/predict.py
@@ -0,0 +1,150 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import math
+
+import cv2
+import numpy as np
+import paddle
+
+from paddlers.models.ppseg import utils
+from paddlers.models.ppseg.core import infer
+from paddlers.models.ppseg.utils import logger, progbar, visualize
+
+
+def mkdir(path):
+    sub_dir = os.path.dirname(path)
+    if not os.path.exists(sub_dir):
+        os.makedirs(sub_dir)
+
+
+def partition_list(arr, m):
+    """split the list 'arr' into m pieces"""
+    n = int(math.ceil(len(arr) / float(m)))
+    return [arr[i:i + n] for i in range(0, len(arr), n)]
+
+
+def predict(model,
+            model_path,
+            transforms,
+            image_list,
+            image_dir=None,
+            save_dir='output',
+            aug_pred=False,
+            scales=1.0,
+            flip_horizontal=True,
+            flip_vertical=False,
+            is_slide=False,
+            stride=None,
+            crop_size=None,
+            custom_color=None):
+    """
+    Predict and visualize the images in image_list.
+
+    Args:
+        model (nn.Layer): Used to predict for input image.
+        model_path (str): The path of pretrained model.
+        transforms (transform.Compose): Preprocess for input image.
+        image_list (list): A list of image paths to be predicted.
+        image_dir (str, optional): The root directory of the images predicted. Default: None.
+        save_dir (str, optional): The directory to save the visualized results. Default: 'output'.
+        aug_pred (bool, optional): Whether to use multi-scale and flip augmentation for prediction. Default: False.
+        scales (list|float, optional): Scales for augmentation. It is valid when `aug_pred` is True. Default: 1.0.
+        flip_horizontal (bool, optional): Whether to use horizontal flip augmentation. It is valid when `aug_pred` is True. Default: True.
+        flip_vertical (bool, optional): Whether to use vertical flip augmentation. It is valid when `aug_pred` is True. Default: False.
+        is_slide (bool, optional): Whether to predict by sliding window. Default: False.
+        stride (tuple|list, optional): The stride of sliding window, the first is width and the second is height.
+            It should be provided when `is_slide` is True.
+        crop_size (tuple|list, optional): The crop size of sliding window, the first is width and the second is height.
+            It should be provided when `is_slide` is True.
+        custom_color (list, optional): Save images with a custom color map. Default: None, use paddleseg's default color map.
+
+    """
+    utils.utils.load_entire_model(model, model_path)
+    model.eval()
+    nranks = paddle.distributed.get_world_size()
+    local_rank = paddle.distributed.get_rank()
+    if nranks > 1:
+        img_lists = partition_list(image_list, nranks)
+    else:
+        img_lists = [image_list]
+
+    added_saved_dir = os.path.join(save_dir, 'added_prediction')
+    pred_saved_dir = os.path.join(save_dir, 'pseudo_color_prediction')
+
+    logger.info("Start to predict...")
+    progbar_pred = progbar.Progbar(target=len(img_lists[0]), verbose=1)
+    color_map = visualize.get_color_map_list(256, custom_color=custom_color)
+    with paddle.no_grad():
+        for i, im_path in enumerate(img_lists[local_rank]):
+            im = cv2.imread(im_path)
+            ori_shape = im.shape[:2]
+            im, _ = transforms(im)
+            im = im[np.newaxis, ...]
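+            # Note: the Compose above emits a CHW float array; the added
+            # batch axis makes it NCHW, the layout the model expects.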
+            im = paddle.to_tensor(im)
+
+            if aug_pred:
+                pred, _ = infer.aug_inference(
+                    model,
+                    im,
+                    ori_shape=ori_shape,
+                    transforms=transforms.transforms,
+                    scales=scales,
+                    flip_horizontal=flip_horizontal,
+                    flip_vertical=flip_vertical,
+                    is_slide=is_slide,
+                    stride=stride,
+                    crop_size=crop_size)
+            else:
+                pred, _ = infer.inference(
+                    model,
+                    im,
+                    ori_shape=ori_shape,
+                    transforms=transforms.transforms,
+                    is_slide=is_slide,
+                    stride=stride,
+                    crop_size=crop_size)
+            pred = paddle.squeeze(pred)
+            pred = pred.numpy().astype('uint8')
+
+            # get the saved name
+            if image_dir is not None:
+                im_file = im_path.replace(image_dir, '')
+            else:
+                im_file = os.path.basename(im_path)
+            if im_file[0] == '/' or im_file[0] == '\\':
+                im_file = im_file[1:]
+
+            # save added image
+            added_image = utils.visualize.visualize(
+                im_path, pred, color_map, weight=0.6)
+            added_image_path = os.path.join(added_saved_dir, im_file)
+            mkdir(added_image_path)
+            cv2.imwrite(added_image_path, added_image)
+
+            # save pseudo color prediction
+            pred_mask = utils.visualize.get_pseudo_color_map(pred, color_map)
+            pred_saved_path = os.path.join(
+                pred_saved_dir, os.path.splitext(im_file)[0] + ".png")
+            mkdir(pred_saved_path)
+            pred_mask.save(pred_saved_path)
+
+            # pred_im = utils.visualize(im_path, pred, weight=0.0)
+            # pred_saved_path = os.path.join(pred_saved_dir, im_file)
+            # mkdir(pred_saved_path)
+            # cv2.imwrite(pred_saved_path, pred_im)
+
+            progbar_pred.update(i + 1)
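
A minimal call sketch for predict; the model, weights path, transforms object, and image paths below are placeholders:

    predict(
        model,
        model_path='output/best_model/model.pdparams',   # hypothetical weights
        transforms=transforms,                  # a ppseg transforms.Compose
        image_list=['data/a.jpg', 'data/b.jpg'],
        save_dir='output/result')
    # Writes added_prediction/ (blended overlays) and
    # pseudo_color_prediction/ (single-channel pseudo-color PNGs).
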
diff --git a/paddlers/models/ppseg/core/train.py b/paddlers/models/ppseg/core/train.py
new file mode 100644
index 0000000..6b02067
--- /dev/null
+++ b/paddlers/models/ppseg/core/train.py
@@ -0,0 +1,326 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import time
+from collections import deque
+import shutil
+
+import paddle
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.utils import (TimeAverager, calculate_eta, resume, logger,
+                                         worker_init_fn, train_profiler, op_flops_funs)
+from paddlers.models.ppseg.core.val import evaluate
+
+
+def check_logits_losses(logits_list, losses):
+    len_logits = len(logits_list)
+    len_losses = len(losses['types'])
+    if len_logits != len_losses:
+        raise RuntimeError(
+            'The length of logits_list should equal to the types of loss config: {} != {}.'
+            .format(len_logits, len_losses))
+
+
+def loss_computation(logits_list, labels, losses, edges=None):
+    check_logits_losses(logits_list, losses)
+    loss_list = []
+    for i in range(len(logits_list)):
+        logits = logits_list[i]
+        loss_i = losses['types'][i]
+        coef_i = losses['coef'][i]
+
+        if loss_i.__class__.__name__ in ('BCELoss',
+                                         'FocalLoss') and loss_i.edge_label:
+            # Use edges as labels, depending on the loss type.
+            loss_list.append(coef_i * loss_i(logits, edges))
+        elif loss_i.__class__.__name__ == 'MixedLoss':
+            mixed_loss_list = loss_i(logits, labels)
+            for mixed_loss in mixed_loss_list:
+                loss_list.append(coef_i * mixed_loss)
+        elif loss_i.__class__.__name__ in ("KLLoss", ):
+            loss_list.append(
+                coef_i * loss_i(logits_list[0], logits_list[1].detach()))
+        else:
+            loss_list.append(coef_i * loss_i(logits, labels))
+    return loss_list
+
+
+def train(model,
+          train_dataset,
+          val_dataset=None,
+          optimizer=None,
+          save_dir='output',
+          iters=10000,
+          batch_size=2,
+          resume_model=None,
+          save_interval=1000,
+          log_iters=10,
+          num_workers=0,
+          use_vdl=False,
+          losses=None,
+          keep_checkpoint_max=5,
+          test_config=None,
+          precision='fp32',
+          profiler_options=None,
+          to_static_training=False):
+    """
+    Launch training.
+
+    Args:
+        model(nn.Layer): A semantic segmentation model.
+        train_dataset (paddle.io.Dataset): Used to read and process training datasets.
+        val_dataset (paddle.io.Dataset, optional): Used to read and process validation datasets.
+        optimizer (paddle.optimizer.Optimizer): The optimizer.
+        save_dir (str, optional): The directory for saving the model snapshot. Default: 'output'.
+        iters (int, optional): How many iters to train the model. Default: 10000.
+        batch_size (int, optional): Mini batch size of one gpu or cpu. Default: 2.
+        resume_model (str, optional): The path of resume model.
+        save_interval (int, optional): How many iters to save a model snapshot once during training. Default: 1000.
+        log_iters (int, optional): Display logging information at every log_iters. Default: 10.
+        num_workers (int, optional): Num workers for data loader. Default: 0.
+        use_vdl (bool, optional): Whether to record the data to VisualDL during training. Default: False.
+        losses (dict, optional): A dict including 'types' and 'coef'. The length of coef should equal to 1 or len(losses['types']).
+            The 'types' item is a list of object of paddleseg.models.losses while the 'coef' item is a list of the relevant coefficient.
+        keep_checkpoint_max (int, optional): Maximum number of checkpoints to save. Default: 5.
+        test_config(dict, optional): Evaluation config.
+        precision (str, optional): Use AMP if precision='fp16'. If precision='fp32', the training is normal.
+        profiler_options (str, optional): The option of train profiler.
+        to_static_training (bool, optional): Whether to use @to_static for training.
+ """ + model.train() + nranks = paddle.distributed.ParallelEnv().nranks + local_rank = paddle.distributed.ParallelEnv().local_rank + + start_iter = 0 + if resume_model is not None: + start_iter = resume(model, optimizer, resume_model) + + if not os.path.isdir(save_dir): + if os.path.exists(save_dir): + os.remove(save_dir) + os.makedirs(save_dir) + + if nranks > 1: + paddle.distributed.fleet.init(is_collective=True) + optimizer = paddle.distributed.fleet.distributed_optimizer( + optimizer) # The return is Fleet object + ddp_model = paddle.distributed.fleet.distributed_model(model) + + batch_sampler = paddle.io.DistributedBatchSampler( + train_dataset, batch_size=batch_size, shuffle=True, drop_last=True) + + loader = paddle.io.DataLoader( + train_dataset, + batch_sampler=batch_sampler, + num_workers=num_workers, + return_list=True, + worker_init_fn=worker_init_fn, + ) + + # use amp + if precision == 'fp16': + logger.info('use amp to train') + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + + if use_vdl: + from visualdl import LogWriter + log_writer = LogWriter(save_dir) + + if to_static_training: + model = paddle.jit.to_static(model) + logger.info("Successfully to apply @to_static") + + avg_loss = 0.0 + avg_loss_list = [] + iters_per_epoch = len(batch_sampler) + best_mean_iou = -1.0 + best_model_iter = -1 + reader_cost_averager = TimeAverager() + batch_cost_averager = TimeAverager() + save_models = deque() + batch_start = time.time() + + iter = start_iter + while iter < iters: + for data in loader: + iter += 1 + if iter > iters: + version = paddle.__version__ + if version == '2.1.2': + continue + else: + break + reader_cost_averager.record(time.time() - batch_start) + images = data[0] + labels = data[1].astype('int64') + edges = None + if len(data) == 3: + edges = data[2].astype('int64') + if hasattr(model, 'data_format') and model.data_format == 'NHWC': + images = images.transpose((0, 2, 3, 1)) + + if precision == 'fp16': + with paddle.amp.auto_cast( + enable=True, + custom_white_list={ + "elementwise_add", "batch_norm", "sync_batch_norm" + }, + custom_black_list={'bilinear_interp_v2'}): + if nranks > 1: + logits_list = ddp_model(images) + else: + logits_list = model(images) + loss_list = loss_computation( + logits_list=logits_list, + labels=labels, + losses=losses, + edges=edges) + loss = sum(loss_list) + + scaled = scaler.scale(loss) # scale the loss + scaled.backward() # do backward + if isinstance(optimizer, paddle.distributed.fleet.Fleet): + scaler.minimize(optimizer.user_defined_optimizer, scaled) + else: + scaler.minimize(optimizer, scaled) # update parameters + else: + if nranks > 1: + logits_list = ddp_model(images) + else: + logits_list = model(images) + loss_list = loss_computation( + logits_list=logits_list, + labels=labels, + losses=losses, + edges=edges) + loss = sum(loss_list) + loss.backward() + # if the optimizer is ReduceOnPlateau, the loss is the one which has been pass into step. 
+                if isinstance(optimizer, paddle.optimizer.lr.ReduceOnPlateau):
+                    optimizer.step(loss)
+                else:
+                    optimizer.step()
+
+            lr = optimizer.get_lr()
+
+            # update lr
+            if isinstance(optimizer, paddle.distributed.fleet.Fleet):
+                lr_sche = optimizer.user_defined_optimizer._learning_rate
+            else:
+                lr_sche = optimizer._learning_rate
+            if isinstance(lr_sche, paddle.optimizer.lr.LRScheduler):
+                lr_sche.step()
+
+            train_profiler.add_profiler_step(profiler_options)
+
+            model.clear_gradients()
+            avg_loss += loss.numpy()[0]
+            if not avg_loss_list:
+                avg_loss_list = [l.numpy() for l in loss_list]
+            else:
+                for i in range(len(loss_list)):
+                    avg_loss_list[i] += loss_list[i].numpy()
+            batch_cost_averager.record(
+                time.time() - batch_start, num_samples=batch_size)
+
+            if (iter) % log_iters == 0 and local_rank == 0:
+                avg_loss /= log_iters
+                avg_loss_list = [l[0] / log_iters for l in avg_loss_list]
+                remain_iters = iters - iter
+                avg_train_batch_cost = batch_cost_averager.get_average()
+                avg_train_reader_cost = reader_cost_averager.get_average()
+                eta = calculate_eta(remain_iters, avg_train_batch_cost)
+                logger.info(
+                    "[TRAIN] epoch: {}, iter: {}/{}, loss: {:.4f}, lr: {:.6f}, batch_cost: {:.4f}, reader_cost: {:.5f}, ips: {:.4f} samples/sec | ETA {}"
+                    .format((iter - 1) // iters_per_epoch + 1, iter, iters,
+                            avg_loss, lr, avg_train_batch_cost,
+                            avg_train_reader_cost,
+                            batch_cost_averager.get_ips_average(), eta))
+                if use_vdl:
+                    log_writer.add_scalar('Train/loss', avg_loss, iter)
+                    # Record each separate loss when there is more than one loss.
+                    if len(avg_loss_list) > 1:
+                        avg_loss_dict = {}
+                        for i, value in enumerate(avg_loss_list):
+                            avg_loss_dict['loss_' + str(i)] = value
+                        for key, value in avg_loss_dict.items():
+                            log_tag = 'Train/' + key
+                            log_writer.add_scalar(log_tag, value, iter)
+
+                    log_writer.add_scalar('Train/lr', lr, iter)
+                    log_writer.add_scalar('Train/batch_cost',
+                                          avg_train_batch_cost, iter)
+                    log_writer.add_scalar('Train/reader_cost',
+                                          avg_train_reader_cost, iter)
+                avg_loss = 0.0
+                avg_loss_list = []
+                reader_cost_averager.reset()
+                batch_cost_averager.reset()
+
+            if (iter % save_interval == 0
+                    or iter == iters) and (val_dataset is not None):
+                num_workers = 1 if num_workers > 0 else 0
+
+                if test_config is None:
+                    test_config = {}
+
+                mean_iou, acc, _, _, _ = evaluate(
+                    model, val_dataset, num_workers=num_workers, **test_config)
+
+                model.train()
+
+            if (iter % save_interval == 0 or iter == iters) and local_rank == 0:
+                current_save_dir = os.path.join(save_dir,
+                                                "iter_{}".format(iter))
+                if not os.path.isdir(current_save_dir):
+                    os.makedirs(current_save_dir)
+                paddle.save(model.state_dict(),
+                            os.path.join(current_save_dir, 'model.pdparams'))
+                paddle.save(optimizer.state_dict(),
+                            os.path.join(current_save_dir, 'model.pdopt'))
+                save_models.append(current_save_dir)
+                if len(save_models) > keep_checkpoint_max > 0:
+                    model_to_remove = save_models.popleft()
+                    shutil.rmtree(model_to_remove)
+
+                if val_dataset is not None:
+                    if mean_iou > best_mean_iou:
+                        best_mean_iou = mean_iou
+                        best_model_iter = iter
+                        best_model_dir = os.path.join(save_dir, "best_model")
+                        paddle.save(
+                            model.state_dict(),
+                            os.path.join(best_model_dir, 'model.pdparams'))
+                    logger.info(
+                        '[EVAL] The model with the best validation mIoU ({:.4f}) was saved at iter {}.'
+                        .format(best_mean_iou, best_model_iter))
+
+                    if use_vdl:
+                        log_writer.add_scalar('Evaluate/mIoU', mean_iou, iter)
+                        log_writer.add_scalar('Evaluate/Acc', acc, iter)
+            batch_start = time.time()
+
+    # Calculate flops.
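+    # The count below runs once, after the training loop, reusing the last
+    # batch's shape; paddle.flops reports the model's FLOPs, counting
+    # SyncBatchNorm through the custom op function registered in custom_ops.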
+    if local_rank == 0:
+        _, c, h, w = images.shape
+        _ = paddle.flops(
+            model, [1, c, h, w],
+            custom_ops={paddle.nn.SyncBatchNorm: op_flops_funs.count_syncbn})
+
+    # Sleep for half a second to let dataloader release resources.
+    time.sleep(0.5)
+    if use_vdl:
+        log_writer.close()
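
A minimal launch sketch for train; every object below (model, datasets, optimizer, loss) is a placeholder built elsewhere:

    train(
        model,
        train_dataset,
        val_dataset=val_dataset,
        optimizer=optimizer,
        losses={'types': [loss_fn], 'coef': [1.0]},   # one loss per model output
        iters=10000,
        batch_size=2,
        save_interval=1000,
        save_dir='output')
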
diff --git a/paddlers/models/ppseg/core/val.py b/paddlers/models/ppseg/core/val.py
new file mode 100644
index 0000000..10ba276
--- /dev/null
+++ b/paddlers/models/ppseg/core/val.py
@@ -0,0 +1,199 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import numpy as np
+import time
+import paddle
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.utils import metrics, TimeAverager, calculate_eta, logger, progbar
+from paddlers.models.ppseg.core import infer
+
+np.set_printoptions(suppress=True)
+
+
+def evaluate(model,
+             eval_dataset,
+             aug_eval=False,
+             scales=1.0,
+             flip_horizontal=False,
+             flip_vertical=False,
+             is_slide=False,
+             stride=None,
+             crop_size=None,
+             num_workers=0,
+             print_detail=True,
+             auc_roc=False):
+    """
+    Launch evaluation.
+
+    Args:
+        model (nn.Layer): A semantic segmentation model.
+        eval_dataset (paddle.io.Dataset): Used to read and process validation datasets.
+        aug_eval (bool, optional): Whether to use multi-scale and flip augmentation for evaluation. Default: False.
+        scales (list|float, optional): Scales for augmentation. It is valid when `aug_eval` is True. Default: 1.0.
+        flip_horizontal (bool, optional): Whether to use horizontal flip augmentation. It is valid when `aug_eval` is True. Default: False.
+        flip_vertical (bool, optional): Whether to use vertical flip augmentation. It is valid when `aug_eval` is True. Default: False.
+        is_slide (bool, optional): Whether to evaluate by sliding window. Default: False.
+        stride (tuple|list, optional): The stride of sliding window, the first is width and the second is height.
+            It should be provided when `is_slide` is True.
+        crop_size (tuple|list, optional): The crop size of sliding window, the first is width and the second is height.
+            It should be provided when `is_slide` is True.
+        num_workers (int, optional): Num workers for data loader. Default: 0.
+        print_detail (bool, optional): Whether to print detailed information about the evaluation process. Default: True.
+        auc_roc (bool, optional): Whether to add the auc_roc metric. Default: False.
+
+    Returns:
+        tuple: The mIoU, accuracy, per-class IoU, per-class accuracy, and kappa of the validation datasets.
+    """
+    model.eval()
+    nranks = paddle.distributed.ParallelEnv().nranks
+    local_rank = paddle.distributed.ParallelEnv().local_rank
+    if nranks > 1:
+        # Initialize parallel environment if not done.
+        if not paddle.distributed.parallel.parallel_helper._is_parallel_ctx_initialized(
+        ):
+            paddle.distributed.init_parallel_env()
+    batch_sampler = paddle.io.DistributedBatchSampler(
+        eval_dataset, batch_size=1, shuffle=False, drop_last=False)
+    loader = paddle.io.DataLoader(
+        eval_dataset,
+        batch_sampler=batch_sampler,
+        num_workers=num_workers,
+        return_list=True, )
+
+    total_iters = len(loader)
+    intersect_area_all = paddle.zeros([1], dtype='int64')
+    pred_area_all = paddle.zeros([1], dtype='int64')
+    label_area_all = paddle.zeros([1], dtype='int64')
+    logits_all = None
+    label_all = None
+
+    if print_detail:
+        logger.info(
+            "Start evaluating (total_samples: {}, total_iters: {})...".format(
+                len(eval_dataset), total_iters))
+    # TODO(chenguowei): fix log print error with multi-gpus
+    progbar_val = progbar.Progbar(
+        target=total_iters, verbose=1 if nranks < 2 else 2)
+    reader_cost_averager = TimeAverager()
+    batch_cost_averager = TimeAverager()
+    batch_start = time.time()
+    with paddle.no_grad():
+        for iter, (im, label) in enumerate(loader):
+            reader_cost_averager.record(time.time() - batch_start)
+            label = label.astype('int64')
+
+            ori_shape = label.shape[-2:]
+            if aug_eval:
+                pred, logits = infer.aug_inference(
+                    model,
+                    im,
+                    ori_shape=ori_shape,
+                    transforms=eval_dataset.transforms.transforms,
+                    scales=scales,
+                    flip_horizontal=flip_horizontal,
+                    flip_vertical=flip_vertical,
+                    is_slide=is_slide,
+                    stride=stride,
+                    crop_size=crop_size)
+            else:
+                pred, logits = infer.inference(
+                    model,
+                    im,
+                    ori_shape=ori_shape,
+                    transforms=eval_dataset.transforms.transforms,
+                    is_slide=is_slide,
+                    stride=stride,
+                    crop_size=crop_size)
+
+            intersect_area, pred_area, label_area = metrics.calculate_area(
+                pred,
+                label,
+                eval_dataset.num_classes,
+                ignore_index=eval_dataset.ignore_index)
+
+            # Gather from all ranks
+            if nranks > 1:
+                intersect_area_list = []
+                pred_area_list = []
+                label_area_list = []
+                paddle.distributed.all_gather(intersect_area_list,
+                                              intersect_area)
+                paddle.distributed.all_gather(pred_area_list, pred_area)
+                paddle.distributed.all_gather(label_area_list, label_area)
+
+                # Some images have already been evaluated and should be excluded in the last iter
+                if (iter + 1) * nranks > len(eval_dataset):
+                    valid = len(eval_dataset) - iter * nranks
+                    intersect_area_list = intersect_area_list[:valid]
+                    pred_area_list = pred_area_list[:valid]
+                    label_area_list = label_area_list[:valid]
+
+                for i in range(len(intersect_area_list)):
+                    intersect_area_all = intersect_area_all + intersect_area_list[
+                        i]
+                    pred_area_all = pred_area_all + pred_area_list[i]
+                    label_area_all = label_area_all + label_area_list[i]
+            else:
+                intersect_area_all = intersect_area_all + intersect_area
+                pred_area_all = pred_area_all + pred_area
+                label_area_all = label_area_all + label_area
+
+            if auc_roc:
+                logits = F.softmax(logits, axis=1)
+                if logits_all is None:
+                    logits_all = logits.numpy()
+                    label_all = label.numpy()
+                else:
+                    logits_all = np.concatenate(
+                        [logits_all, logits.numpy()])  # (KN, C, H, W)
+                    label_all = np.concatenate([label_all, label.numpy()])
+
+            batch_cost_averager.record(
+                time.time() - batch_start, num_samples=len(label))
+            batch_cost = batch_cost_averager.get_average()
+            reader_cost = reader_cost_averager.get_average()
+
+            if local_rank == 0 and print_detail:
+                progbar_val.update(iter + 1, [('batch_cost', batch_cost),
+                                              ('reader cost', reader_cost)])
+            reader_cost_averager.reset()
+            batch_cost_averager.reset()
+            batch_start = time.time()
+
+    class_iou, miou = metrics.mean_iou(intersect_area_all, pred_area_all,
+                                       label_area_all)
+    class_acc, acc = metrics.accuracy(intersect_area_all, pred_area_all)
+    kappa = metrics.kappa(intersect_area_all, pred_area_all, label_area_all)
+    class_dice, mdice = metrics.dice(intersect_area_all, pred_area_all,
+                                     label_area_all)
+
+    if auc_roc:
+        auc_roc = metrics.auc_roc(
+            logits_all, label_all, num_classes=eval_dataset.num_classes)
+        auc_infor = ' Auc_roc: {:.4f}'.format(auc_roc)
+
+    if print_detail:
+        infor = "[EVAL] #Images: {} mIoU: {:.4f} Acc: {:.4f} Kappa: {:.4f} Dice: {:.4f}".format(
+            len(eval_dataset), miou, acc, kappa, mdice)
+        infor = infor + auc_infor if auc_roc else infor
+        logger.info(infor)
+        logger.info("[EVAL] Class IoU: \n" + str(np.round(class_iou, 4)))
+        logger.info("[EVAL] Class Acc: \n" + str(np.round(class_acc, 4)))
+    return miou, acc, class_iou, class_acc, kappa
diff --git a/paddlers/models/ppseg/cvlibs/__init__.py b/paddlers/models/ppseg/cvlibs/__init__.py
new file mode 100644
index 0000000..5fcb1d6
--- /dev/null
+++ b/paddlers/models/ppseg/cvlibs/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import manager
+from . import param_init
+from .config import Config
diff --git a/paddlers/models/ppseg/cvlibs/callbacks.py b/paddlers/models/ppseg/cvlibs/callbacks.py
new file mode 100644
index 0000000..075e1eb
--- /dev/null
+++ b/paddlers/models/ppseg/cvlibs/callbacks.py
@@ -0,0 +1,279 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import time
+
+import numpy as np
+import paddle
+from paddle.distributed.parallel import ParallelEnv
+from visualdl import LogWriter
+from paddlers.models.ppseg.utils.progbar import Progbar
+import paddlers.models.ppseg.utils.logger as logger
+
+
+class CallbackList(object):
+    """
+    Container abstracting a list of callbacks.
+
+    Args:
+        callbacks (list[Callback]): List of `Callback` instances.
+ """ + + def __init__(self, callbacks=None): + callbacks = callbacks or [] + self.callbacks = [c for c in callbacks] + + def append(self, callback): + self.callbacks.append(callback) + + def set_params(self, params): + for callback in self.callbacks: + callback.set_params(params) + + def set_model(self, model): + for callback in self.callbacks: + callback.set_model(model) + + def set_optimizer(self, optimizer): + for callback in self.callbacks: + callback.set_optimizer(optimizer) + + def on_iter_begin(self, iter, logs=None): + """Called right before processing a batch. + """ + logs = logs or {} + for callback in self.callbacks: + callback.on_iter_begin(iter, logs) + self._t_enter_iter = time.time() + + def on_iter_end(self, iter, logs=None): + """Called at the end of a batch. + """ + logs = logs or {} + for callback in self.callbacks: + callback.on_iter_end(iter, logs) + self._t_exit_iter = time.time() + + def on_train_begin(self, logs=None): + """Called at the beginning of training. + """ + logs = logs or {} + for callback in self.callbacks: + callback.on_train_begin(logs) + + def on_train_end(self, logs=None): + """Called at the end of training. + """ + logs = logs or {} + for callback in self.callbacks: + callback.on_train_end(logs) + + def __iter__(self): + return iter(self.callbacks) + + +class Callback(object): + """Abstract base class used to build new callbacks. + """ + + def __init__(self): + self.validation_data = None + + def set_params(self, params): + self.params = params + + def set_model(self, model): + self.model = model + + def set_optimizer(self, optimizer): + self.optimizer = optimizer + + def on_iter_begin(self, iter, logs=None): + pass + + def on_iter_end(self, iter, logs=None): + pass + + def on_train_begin(self, logs=None): + pass + + def on_train_end(self, logs=None): + pass + + +class BaseLogger(Callback): + def __init__(self, period=10): + super(BaseLogger, self).__init__() + self.period = period + + def _reset(self): + self.totals = {} + + def on_train_begin(self, logs=None): + self.totals = {} + + def on_iter_end(self, iter, logs=None): + logs = logs or {} + #(iter - 1) // iters_per_epoch + 1 + for k, v in logs.items(): + if k in self.totals.keys(): + self.totals[k] += v + else: + self.totals[k] = v + + if iter % self.period == 0 and ParallelEnv().local_rank == 0: + + for k in self.totals: + logs[k] = self.totals[k] / self.period + self._reset() + + +class TrainLogger(Callback): + def __init__(self, log_freq=10): + self.log_freq = log_freq + + def _calculate_eta(self, remaining_iters, speed): + if remaining_iters < 0: + remaining_iters = 0 + remaining_time = int(remaining_iters * speed) + result = "{:0>2}:{:0>2}:{:0>2}" + arr = [] + for i in range(2, -1, -1): + arr.append(int(remaining_time / 60**i)) + remaining_time %= 60**i + return result.format(*arr) + + def on_iter_end(self, iter, logs=None): + + if iter % self.log_freq == 0 and ParallelEnv().local_rank == 0: + total_iters = self.params["total_iters"] + iters_per_epoch = self.params["iters_per_epoch"] + remaining_iters = total_iters - iter + eta = self._calculate_eta(remaining_iters, logs["batch_cost"]) + current_epoch = (iter - 1) // self.params["iters_per_epoch"] + 1 + loss = logs["loss"] + lr = self.optimizer.get_lr() + batch_cost = logs["batch_cost"] + reader_cost = logs["reader_cost"] + + logger.info( + "[TRAIN] epoch={}, iter={}/{}, loss={:.4f}, lr={:.6f}, batch_cost={:.4f}, reader_cost={:.4f} | ETA {}" + .format(current_epoch, iter, total_iters, loss, lr, batch_cost, + reader_cost, eta)) + + +class 
ProgbarLogger(Callback): + def __init__(self): + super(ProgbarLogger, self).__init__() + + def on_train_begin(self, logs=None): + self.verbose = self.params["verbose"] + self.total_iters = self.params["total_iters"] + self.target = self.params["total_iters"] + self.progbar = Progbar(target=self.target, verbose=self.verbose) + self.seen = 0 + self.log_values = [] + + def on_iter_begin(self, iter, logs=None): + #self.seen = 0 + if self.seen < self.target: + self.log_values = [] + + def on_iter_end(self, iter, logs=None): + logs = logs or {} + self.seen += 1 + for k in self.params['metrics']: + if k in logs: + self.log_values.append((k, logs[k])) + + #if self.verbose and self.seen < self.target and ParallelEnv.local_rank == 0: + #print(self.log_values) + if self.seen < self.target: + self.progbar.update(self.seen, self.log_values) + + +class ModelCheckpoint(Callback): + def __init__(self, + save_dir, + monitor="miou", + save_best_only=False, + save_params_only=True, + mode="max", + period=1): + + super(ModelCheckpoint, self).__init__() + self.monitor = monitor + self.save_dir = save_dir + self.save_best_only = save_best_only + self.save_params_only = save_params_only + self.period = period + self.iters_since_last_save = 0 + + if mode == "min": + self.monitor_op = np.less + self.best = np.Inf + elif mode == "max": + self.monitor_op = np.greater + self.best = -np.Inf + else: + raise RuntimeError("`mode` is neither \"min\" nor \"max\"!") + + def on_train_begin(self, logs=None): + self.verbose = self.params["verbose"] + save_dir = self.save_dir + if not os.path.isdir(save_dir): + if os.path.exists(save_dir): + os.remove(save_dir) + os.makedirs(save_dir) + + def on_iter_end(self, iter, logs=None): + logs = logs or {} + self.iters_since_last_save += 1 + current_save_dir = os.path.join(self.save_dir, "iter_{}".format(iter)) + current_save_dir = os.path.abspath(current_save_dir) + #if self.iters_since_last_save % self.period and ParallelEnv().local_rank == 0: + #self.iters_since_last_save = 0 + if iter % self.period == 0 and ParallelEnv().local_rank == 0: + if self.verbose > 0: + print("iter {iter_num}: saving model to {path}".format( + iter_num=iter, path=current_save_dir)) + + paddle.save(self.model.state_dict(), + os.path.join(current_save_dir, 'model.pdparams')) + + if not self.save_params_only: + paddle.save(self.optimizer.state_dict(), + os.path.join(current_save_dir, 'model.pdopt')) + + +class VisualDL(Callback): + def __init__(self, log_dir="./log", freq=1): + super(VisualDL, self).__init__() + self.log_dir = log_dir + self.freq = freq + + def on_train_begin(self, logs=None): + self.writer = LogWriter(self.log_dir) + + def on_iter_end(self, iter, logs=None): + logs = logs or {} + if iter % self.freq == 0 and ParallelEnv().local_rank == 0: + for k, v in logs.items(): + self.writer.add_scalar("Train/{}".format(k), v, iter) + + self.writer.flush() + + def on_train_end(self, logs=None): + self.writer.close() diff --git a/paddlers/models/ppseg/cvlibs/config.py b/paddlers/models/ppseg/cvlibs/config.py new file mode 100644 index 0000000..64beb09 --- /dev/null +++ b/paddlers/models/ppseg/cvlibs/config.py @@ -0,0 +1,404 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import codecs
+import os
+from typing import Any, Dict, Generic
+
+import paddle
+import yaml
+
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.utils import logger
+
+
+class Config(object):
+    '''
+    Training configuration parsing. Only yaml/yml files are supported.
+
+    The following hyper-parameters are available in the config file:
+        batch_size: The number of samples per GPU.
+        iters: The total number of training steps.
+        train_dataset: A training data config including type/data_root/transforms/mode.
+            For data type, please refer to paddleseg.datasets.
+            For specific transforms, please refer to paddleseg.transforms.transforms.
+        val_dataset: A validation data config including type/data_root/transforms/mode.
+        optimizer: An optimizer config. Currently PaddleSeg only supports sgd with momentum in the config file.
+            In addition, weight_decay could be set as a regularization.
+        learning_rate: A learning rate config. If decay is configured, the learning_rate value is the starting
+            learning rate, and only poly decay is supported through the config file. In addition, decay power
+            and end_lr are tuned experimentally.
+        loss: A loss config. Multi-loss configs are available. The loss type order must be consistent with the
+            seg model outputs, and the coef term indicates the weight of the corresponding loss. The number of
+            coef entries must match the number of model outputs; a single loss type may be given to reuse it for
+            all outputs, otherwise the number of loss types must match the number of coef entries.
+        model: A model config including type/backbone and model-dependent arguments.
+            For model type, please refer to paddleseg.models.
+            For backbone, please refer to paddleseg.models.backbones.
+
+    Args:
+        path (str): The path of the config file, yaml format only.
+
+    Examples:
+
+        from paddlers.models.ppseg.cvlibs.config import Config
+
+        # Create a cfg object with a yaml file path.
+        cfg = Config(yaml_cfg_path)
+
+        # Arguments are parsed lazily, when the corresponding property is accessed.
+        train_dataset = cfg.train_dataset
+
+        # The model should be parsed after the dataset, since the model
+        # builder uses some properties of the dataset.
+        model = cfg.model
+        ...
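+
+        # A minimal sketch of runtime overrides; 'configs/my_cfg.yml' is a
+        # hypothetical path used only for illustration.
+        cfg = Config('configs/my_cfg.yml', learning_rate=0.01, batch_size=4)
+        cfg.update(iters=80000)
+
+    A config file may also inherit from another file through a top-level `_base_`
+    key (resolved relative to the inheriting file); a child file can opt out of
+    inheritance with `_inherited_: False`.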
+    '''
+
+    def __init__(self,
+                 path: str,
+                 learning_rate: float = None,
+                 batch_size: int = None,
+                 iters: int = None):
+        if not path:
+            raise ValueError('Please specify the configuration file path.')
+
+        if not os.path.exists(path):
+            raise FileNotFoundError('File {} does not exist'.format(path))
+
+        self._model = None
+        self._losses = None
+        if path.endswith('yml') or path.endswith('yaml'):
+            self.dic = self._parse_from_yaml(path)
+        else:
+            raise RuntimeError('Config file should be in yaml format!')
+
+        self.update(
+            learning_rate=learning_rate, batch_size=batch_size, iters=iters)
+
+    def _update_dic(self, dic, base_dic):
+        """
+        Update `dic` on top of `base_dic`, so that keys in `dic` override those in `base_dic`.
+        """
+        base_dic = base_dic.copy()
+        dic = dic.copy()
+
+        if dic.get('_inherited_', True) is False:
+            dic.pop('_inherited_')
+            return dic
+
+        for key, val in dic.items():
+            if isinstance(val, dict) and key in base_dic:
+                base_dic[key] = self._update_dic(val, base_dic[key])
+            else:
+                base_dic[key] = val
+        dic = base_dic
+        return dic
+
+    def _parse_from_yaml(self, path: str):
+        '''Parse a yaml file and build the config, resolving `_base_` inheritance recursively.'''
+        with codecs.open(path, 'r', 'utf-8') as file:
+            dic = yaml.load(file, Loader=yaml.FullLoader)
+
+        if '_base_' in dic:
+            cfg_dir = os.path.dirname(path)
+            base_path = dic.pop('_base_')
+            base_path = os.path.join(cfg_dir, base_path)
+            base_dic = self._parse_from_yaml(base_path)
+            dic = self._update_dic(dic, base_dic)
+        return dic
+
+    def update(self,
+               learning_rate: float = None,
+               batch_size: int = None,
+               iters: int = None):
+        '''Update the config with the given non-None values.'''
+        if learning_rate:
+            if 'lr_scheduler' in self.dic:
+                self.dic['lr_scheduler']['learning_rate'] = learning_rate
+            else:
+                self.dic['learning_rate']['value'] = learning_rate
+
+        if batch_size:
+            self.dic['batch_size'] = batch_size
+
+        if iters:
+            self.dic['iters'] = iters
+
+    @property
+    def batch_size(self) -> int:
+        return self.dic.get('batch_size', 1)
+
+    @property
+    def iters(self) -> int:
+        iters = self.dic.get('iters')
+        if not iters:
+            raise RuntimeError('No iters specified in the configuration file.')
+        return iters
+
+    @property
+    def lr_scheduler(self) -> paddle.optimizer.lr.LRScheduler:
+        if 'lr_scheduler' not in self.dic:
+            raise RuntimeError(
+                'No `lr_scheduler` specified in the configuration file.')
+        params = self.dic.get('lr_scheduler')
+
+        lr_type = params.pop('type')
+        if lr_type == 'PolynomialDecay':
+            params.setdefault('decay_steps', self.iters)
+            params.setdefault('end_lr', 0)
+            params.setdefault('power', 0.9)
+
+        return getattr(paddle.optimizer.lr, lr_type)(**params)
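+
+    # An illustrative mapping, assuming the PolynomialDecay defaults applied
+    # above (the values are hypothetical): a config section like
+    #
+    #     lr_scheduler:
+    #         type: PolynomialDecay
+    #         learning_rate: 0.01
+    #
+    # resolves to paddle.optimizer.lr.PolynomialDecay(
+    #     learning_rate=0.01, decay_steps=<iters>, end_lr=0, power=0.9).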
+    @property
+    def learning_rate(self) -> paddle.optimizer.lr.LRScheduler:
+        logger.warning(
+            '''`learning_rate` in the configuration file will be deprecated, please use `lr_scheduler` instead. E.g.:
+            lr_scheduler:
+                type: PolynomialDecay
+                learning_rate: 0.01''')
+
+        _learning_rate = self.dic.get('learning_rate', {})
+        if isinstance(_learning_rate, float):
+            return _learning_rate
+
+        _learning_rate = self.dic.get('learning_rate', {}).get('value')
+        if not _learning_rate:
+            raise RuntimeError(
+                'No learning rate specified in the configuration file.')
+
+        args = self.decay_args
+        decay_type = args.pop('type')
+
+        if decay_type == 'poly':
+            lr = _learning_rate
+            return paddle.optimizer.lr.PolynomialDecay(lr, **args)
+        elif decay_type == 'piecewise':
+            values = _learning_rate
+            return paddle.optimizer.lr.PiecewiseDecay(values=values, **args)
+        elif decay_type == 'stepdecay':
+            lr = _learning_rate
+            return paddle.optimizer.lr.StepDecay(lr, **args)
+        else:
+            raise RuntimeError(
+                'Only poly, piecewise and stepdecay decay types are supported.')
+
+    @property
+    def optimizer(self) -> paddle.optimizer.Optimizer:
+        if 'lr_scheduler' in self.dic:
+            lr = self.lr_scheduler
+        else:
+            lr = self.learning_rate
+        args = self.optimizer_args
+        optimizer_type = args.pop('type')
+
+        if optimizer_type == 'sgd':
+            return paddle.optimizer.Momentum(
+                lr, parameters=self.model.parameters(), **args)
+        elif optimizer_type == 'adam':
+            return paddle.optimizer.Adam(
+                lr, parameters=self.model.parameters(), **args)
+        elif optimizer_type in paddle.optimizer.__all__:
+            return getattr(paddle.optimizer, optimizer_type)(
+                lr, parameters=self.model.parameters(), **args)
+
+        raise RuntimeError('Unknown optimizer type {}.'.format(optimizer_type))
+
+    @property
+    def optimizer_args(self) -> dict:
+        args = self.dic.get('optimizer', {}).copy()
+        if args['type'] == 'sgd':
+            args.setdefault('momentum', 0.9)
+
+        return args
+
+    @property
+    def decay_args(self) -> dict:
+        args = self.dic.get('learning_rate', {}).get('decay', {
+            'type': 'poly',
+            'power': 0.9
+        }).copy()
+
+        if args['type'] == 'poly':
+            args.setdefault('decay_steps', self.iters)
+            args.setdefault('end_lr', 0)
+
+        return args
+
+    @property
+    def loss(self) -> dict:
+        if self._losses is None:
+            self._losses = self._prepare_loss('loss')
+        return self._losses
+
+    @property
+    def distill_loss(self) -> dict:
+        if not hasattr(self, '_distill_losses'):
+            self._distill_losses = self._prepare_loss('distill_loss')
+        return self._distill_losses
+
+    def _prepare_loss(self, loss_name):
+        """
+        Parse the loss parameters and load the loss layers.
+
+        Args:
+            loss_name (str): The root name of the loss in the yaml file.
+        Returns:
+            dict: A dict including the loss parameters and layers.
+        """
+        args = self.dic.get(loss_name, {}).copy()
+        if 'types' in args and 'coef' in args:
+            len_types = len(args['types'])
+            len_coef = len(args['coef'])
+            if len_types != len_coef:
+                if len_types == 1:
+                    args['types'] = args['types'] * len_coef
+                else:
+                    raise ValueError(
+                        'In the loss config, the length of types should equal the length of coef, or be 1, but they are {} and {}.'
+                        .format(len_types, len_coef))
+        else:
+            raise ValueError(
+                'Loss config should contain the keys "types" and "coef"')
+
+        losses = dict()
+        for key, val in args.items():
+            if key == 'types':
+                losses['types'] = []
+                for item in args['types']:
+                    if item['type'] != 'MixedLoss':
+                        if 'ignore_index' in item:
+                            assert item['ignore_index'] == self.train_dataset.ignore_index, 'If ignore_index of loss is set, '\
+                                'the ignore_index of loss and train_dataset must be the same. \nCurrently, loss ignore_index = {}, '\
+                                'train_dataset ignore_index = {}. 
\nIt is recommended not to set loss ignore_index, so it is consistent with '\ + 'train_dataset by default.'.format(item['ignore_index'], self.train_dataset.ignore_index) + item['ignore_index'] = \ + self.train_dataset.ignore_index + losses['types'].append(self._load_object(item)) + else: + losses[key] = val + if len(losses['coef']) != len(losses['types']): + raise RuntimeError( + 'The length of coef should equal to types in loss config: {} != {}.' + .format(len(losses['coef']), len(losses['types']))) + return losses + + @property + def model(self) -> paddle.nn.Layer: + model_cfg = self.dic.get('model').copy() + if not model_cfg: + raise RuntimeError('No model specified in the configuration file.') + if not 'num_classes' in model_cfg: + num_classes = None + if self.train_dataset_config: + if hasattr(self.train_dataset_class, 'NUM_CLASSES'): + num_classes = self.train_dataset_class.NUM_CLASSES + elif hasattr(self.train_dataset, 'num_classes'): + num_classes = self.train_dataset.num_classes + elif self.val_dataset_config: + if hasattr(self.val_dataset_class, 'NUM_CLASSES'): + num_classes = self.val_dataset_class.NUM_CLASSES + elif hasattr(self.val_dataset, 'num_classes'): + num_classes = self.val_dataset.num_classes + + if num_classes is not None: + model_cfg['num_classes'] = num_classes + + if not self._model: + self._model = self._load_object(model_cfg) + return self._model + + @property + def train_dataset_config(self) -> Dict: + return self.dic.get('train_dataset', {}).copy() + + @property + def val_dataset_config(self) -> Dict: + return self.dic.get('val_dataset', {}).copy() + + @property + def train_dataset_class(self) -> Generic: + dataset_type = self.train_dataset_config['type'] + return self._load_component(dataset_type) + + @property + def val_dataset_class(self) -> Generic: + dataset_type = self.val_dataset_config['type'] + return self._load_component(dataset_type) + + @property + def train_dataset(self) -> paddle.io.Dataset: + _train_dataset = self.train_dataset_config + if not _train_dataset: + return None + return self._load_object(_train_dataset) + + @property + def val_dataset(self) -> paddle.io.Dataset: + _val_dataset = self.val_dataset_config + if not _val_dataset: + return None + return self._load_object(_val_dataset) + + def _load_component(self, com_name: str) -> Any: + com_list = [ + manager.MODELS, manager.BACKBONES, manager.DATASETS, + manager.TRANSFORMS, manager.LOSSES + ] + + for com in com_list: + if com_name in com.components_dict: + return com[com_name] + else: + raise RuntimeError( + 'The specified component was not found {}.'.format(com_name)) + + def _load_object(self, cfg: dict) -> Any: + cfg = cfg.copy() + if 'type' not in cfg: + raise RuntimeError('No object information in {}.'.format(cfg)) + + component = self._load_component(cfg.pop('type')) + + params = {} + for key, val in cfg.items(): + if self._is_meta_type(val): + params[key] = self._load_object(val) + elif isinstance(val, list): + params[key] = [ + self._load_object(item) + if self._is_meta_type(item) else item for item in val + ] + else: + params[key] = val + + return component(**params) + + @property + def test_config(self) -> Dict: + return self.dic.get('test_config', {}) + + @property + def export_config(self) -> Dict: + return self.dic.get('export', {}) + + @property + def to_static_training(self) -> bool: + '''Whether to use @to_static for training''' + return self.dic.get('to_static_training', False) + + def _is_meta_type(self, item: Any) -> bool: + return isinstance(item, dict) and 'type' in 
item
+
+    def __str__(self) -> str:
+        return yaml.dump(self.dic)
diff --git a/paddlers/models/ppseg/cvlibs/manager.py b/paddlers/models/ppseg/cvlibs/manager.py
new file mode 100644
index 0000000..0a6ba61
--- /dev/null
+++ b/paddlers/models/ppseg/cvlibs/manager.py
@@ -0,0 +1,149 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from collections.abc import Sequence
+
+import warnings
+
+
+class ComponentManager:
+    """
+    A manager that registers components (classes or functions) by name so that
+    they can be looked up and built from configuration.
+
+    Args:
+        name (str): The name of the component manager.
+
+    Returns:
+        A callable object of ComponentManager.
+
+    Example 1:
+
+        from paddlers.models.ppseg.cvlibs.manager import ComponentManager
+
+        model_manager = ComponentManager()
+
+        class AlexNet: ...
+        class ResNet: ...
+
+        model_manager.add_component(AlexNet)
+        model_manager.add_component(ResNet)
+
+        # Alternatively, pass a sequence:
+        model_manager.add_component([AlexNet, ResNet])
+        print(model_manager.components_dict)
+        # {'AlexNet': <class 'AlexNet'>, 'ResNet': <class 'ResNet'>}
+
+    Example 2:
+
+        # An easier way: use it as a Python decorator above the class declaration.
+        from paddlers.models.ppseg.cvlibs.manager import ComponentManager
+
+        model_manager = ComponentManager()
+
+        @model_manager.add_component
+        class AlexNet: ...
+
+        @model_manager.add_component
+        class ResNet: ...
+
+        print(model_manager.components_dict)
+        # {'AlexNet': <class 'AlexNet'>, 'ResNet': <class 'ResNet'>}
+    """
+
+    def __init__(self, name=None):
+        self._components_dict = dict()
+        self._name = name
+
+    def __len__(self):
+        return len(self._components_dict)
+
+    def __repr__(self):
+        name_str = self._name if self._name else self.__class__.__name__
+        return "{}:{}".format(name_str, list(self._components_dict.keys()))
+
+    def __getitem__(self, item):
+        if item not in self._components_dict.keys():
+            raise KeyError("{} does not exist in the available {}".format(
+                item, self))
+        return self._components_dict[item]
+
+    @property
+    def components_dict(self):
+        return self._components_dict
+
+    @property
+    def name(self):
+        return self._name
+
+    def _add_single_component(self, component):
+        """
+        Add a single component into the corresponding manager.
+
+        Args:
+            component (function|class): A new component.
+
+        Raises:
+            TypeError: When `component` is neither class nor function.
+        """
+
+        # Currently only class or function types are supported
+        if not (inspect.isclass(component) or inspect.isfunction(component)):
+            raise TypeError(
+                "Expect class/function type, but received {}".format(
+                    type(component)))
+
+        # Take the internal name of the component as its key
+        component_name = component.__name__
+
+        # Re-registering an existing name overrides the previous component,
+        # with a warning
+        if component_name in self._components_dict.keys():
+            warnings.warn("{} already exists! It is now updated to {}.".format(
+                component_name, component))
+        self._components_dict[component_name] = component
+
+    def add_component(self, components):
+        """
+        Add component(s) into the corresponding manager.
+
+        Args:
+            components (function|class|list|tuple): Support four types of components.
+
+        Returns:
+            components (function|class|list|tuple): Same as the input components.
+        """
+
+        # Check whether the type is a sequence
+        if isinstance(components, Sequence):
+            for component in components:
+                self._add_single_component(component)
+        else:
+            component = components
+            self._add_single_component(component)
+
+        return components
+
+
+MODELS = ComponentManager("models")
+BACKBONES = ComponentManager("backbones")
+DATASETS = ComponentManager("datasets")
+TRANSFORMS = ComponentManager("transforms")
+LOSSES = ComponentManager("losses")
diff --git a/paddlers/models/ppseg/cvlibs/param_init.py b/paddlers/models/ppseg/cvlibs/param_init.py
new file mode 100644
index 0000000..2213a1a
--- /dev/null
+++ b/paddlers/models/ppseg/cvlibs/param_init.py
@@ -0,0 +1,120 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.nn as nn
+
+
+def constant_init(param, **kwargs):
+    """
+    Initialize the `param` with constants.
+
+    Args:
+        param (Tensor): Tensor that needs to be initialized.
+
+    Examples:
+
+        from paddlers.models.ppseg.cvlibs import param_init
+        import paddle.nn as nn
+
+        linear = nn.Linear(2, 4)
+        param_init.constant_init(linear.weight, value=2.0)
+        print(linear.weight.numpy())
+        # result is [[2. 2. 2. 2.], [2. 2. 2. 2.]]
+
+    """
+    initializer = nn.initializer.Constant(**kwargs)
+    initializer(param, param.block)
+
+
+def normal_init(param, **kwargs):
+    """
+    Initialize the `param` with a Normal distribution.
+
+    Args:
+        param (Tensor): Tensor that needs to be initialized.
+
+    Examples:
+
+        from paddlers.models.ppseg.cvlibs import param_init
+        import paddle.nn as nn
+
+        linear = nn.Linear(2, 4)
+        param_init.normal_init(linear.weight, loc=0.0, scale=1.0)
+
+    """
+    initializer = nn.initializer.Normal(**kwargs)
+    initializer(param, param.block)
+
+
+def kaiming_normal_init(param, **kwargs):
+    r"""
+    Initialize the input tensor with Kaiming Normal initialization.
+
+    This function implements the `param` initialization from the paper
+    `Delving Deep into Rectifiers: Surpassing Human-Level Performance on
+    ImageNet Classification <https://arxiv.org/abs/1502.01852>`_
+    by Kaiming He, Xiangyu Zhang, Shaoqing Ren and Jian Sun. This is a
+    robust initialization method that particularly considers the rectifier
+    nonlinearities. In case of Uniform distribution, the range is [-x, x], where
+
+    .. math::
+
+        x = \sqrt{\frac{6.0}{fan\_in}}
+
+    In case of Normal distribution, the mean is 0 and the standard deviation is
+
+    .. math::
+
+        \sqrt{\frac{2.0}{fan\_in}}
+
+    Args:
+        param (Tensor): Tensor that needs to be initialized.
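+        **kwargs: Forwarded to paddle.nn.initializer.KaimingNormal; which
+            keyword arguments are accepted (e.g. `fan_in`) depends on the
+            installed Paddle version, so check the Paddle API docs.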
+
+    Examples:
+
+        from paddlers.models.ppseg.cvlibs import param_init
+        import paddle.nn as nn
+
+        linear = nn.Linear(2, 4)
+        param_init.kaiming_normal_init(linear.weight)
+
+    """
+    initializer = nn.initializer.KaimingNormal(**kwargs)
+    initializer(param, param.block)
+
+
+def kaiming_uniform(param, **kwargs):
+    r"""
+    Implements the Kaiming Uniform initializer.
+
+    This function implements the weight initialization from the paper
+    `Delving Deep into Rectifiers: Surpassing Human-Level Performance on
+    ImageNet Classification <https://arxiv.org/abs/1502.01852>`_
+    by Kaiming He, Xiangyu Zhang, Shaoqing Ren and Jian Sun. This is a
+    robust initialization method that particularly considers the rectifier
+    nonlinearities.
+
+    In case of Uniform distribution, the range is [-x, x], where
+
+    .. math::
+
+        x = \sqrt{\frac{6.0}{fan\_in}}
+
+    Args:
+        param (Tensor): Tensor that needs to be initialized.
+
+    Examples:
+
+        from paddlers.models.ppseg.cvlibs import param_init
+        import paddle.nn as nn
+
+        linear = nn.Linear(2, 4)
+        param_init.kaiming_uniform(linear.weight)
+    """
+
+    initializer = nn.initializer.KaimingUniform(**kwargs)
+    initializer(param, param.block)
diff --git a/paddlers/models/ppseg/datasets/__init__.py b/paddlers/models/ppseg/datasets/__init__.py
new file mode 100644
index 0000000..ee79d0f
--- /dev/null
+++ b/paddlers/models/ppseg/datasets/__init__.py
@@ -0,0 +1,29 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .dataset import Dataset
+from .cityscapes import Cityscapes
+from .voc import PascalVOC
+from .ade import ADE20K
+from .optic_disc_seg import OpticDiscSeg
+from .pascal_context import PascalContext
+from .mini_deep_globe_road_extraction import MiniDeepGlobeRoadExtraction
+from .eg1800 import EG1800
+from .supervisely import SUPERVISELY
+from .cocostuff import CocoStuff
+from .stare import STARE
+from .drive import DRIVE
+from .hrf import HRF
+from .chase_db1 import CHASEDB1
+from .pp_humanseg14k import PPHumanSeg14K
diff --git a/paddlers/models/ppseg/datasets/ade.py b/paddlers/models/ppseg/datasets/ade.py
new file mode 100644
index 0000000..9a9682d
--- /dev/null
+++ b/paddlers/models/ppseg/datasets/ade.py
@@ -0,0 +1,111 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import numpy as np
+from PIL import Image
+
+from paddlers.models.ppseg.datasets import Dataset
+from paddlers.models.ppseg.utils.download import download_file_and_uncompress
+from paddlers.models.ppseg.utils import seg_env
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.transforms import Compose
+import paddlers.models.ppseg.transforms.functional as F
+
+URL = "http://data.csail.mit.edu/places/ADEchallenge/ADEChallengeData2016.zip"
+
+
+@manager.DATASETS.add_component
+class ADE20K(Dataset):
+    """
+    ADE20K dataset `http://sceneparsing.csail.mit.edu/`.
+
+    Args:
+        transforms (list): A list of image transformations.
+        dataset_root (str, optional): The ADE20K dataset directory. Default: None.
+        mode (str, optional): A subset of the entire dataset. It should be one of ('train', 'val'). Default: 'train'.
+        edge (bool, optional): Whether to compute edge while training. Default: False
+    """
+    NUM_CLASSES = 150
+
+    def __init__(self, transforms, dataset_root=None, mode='train', edge=False):
+        self.dataset_root = dataset_root
+        self.transforms = Compose(transforms)
+        mode = mode.lower()
+        self.mode = mode
+        self.file_list = list()
+        self.num_classes = self.NUM_CLASSES
+        self.ignore_index = 255
+        self.edge = edge
+
+        if mode not in ['train', 'val']:
+            raise ValueError(
+                "`mode` should be one of ('train', 'val') in ADE20K dataset, but got {}."
+                .format(mode))
+
+        if self.transforms is None:
+            raise ValueError("`transforms` is necessary, but it is None.")
+
+        if self.dataset_root is None:
+            self.dataset_root = download_file_and_uncompress(
+                url=URL,
+                savepath=seg_env.DATA_HOME,
+                extrapath=seg_env.DATA_HOME,
+                extraname='ADEChallengeData2016')
+        elif not os.path.exists(self.dataset_root):
+            self.dataset_root = os.path.normpath(self.dataset_root)
+            savepath, extraname = self.dataset_root.rsplit(
+                sep=os.path.sep, maxsplit=1)
+            self.dataset_root = download_file_and_uncompress(
+                url=URL,
+                savepath=savepath,
+                extrapath=savepath,
+                extraname=extraname)
+
+        if mode == 'train':
+            img_dir = os.path.join(self.dataset_root, 'images/training')
+            label_dir = os.path.join(self.dataset_root, 'annotations/training')
+        elif mode == 'val':
+            img_dir = os.path.join(self.dataset_root, 'images/validation')
+            label_dir = os.path.join(self.dataset_root,
+                                     'annotations/validation')
+        img_files = os.listdir(img_dir)
+        label_files = [i.replace('.jpg', '.png') for i in img_files]
+        for i in range(len(img_files)):
+            img_path = os.path.join(img_dir, img_files[i])
+            label_path = os.path.join(label_dir, label_files[i])
+            self.file_list.append([img_path, label_path])
+
+    def __getitem__(self, idx):
+        image_path, label_path = self.file_list[idx]
+        if self.mode == 'val':
+            im, _ = self.transforms(im=image_path)
+            label = np.asarray(Image.open(label_path))
+            # Class 0 is ignored. It becomes 255 after subtracting 1, because
+            # the dtype of the label is uint8.
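+            # e.g. np.uint8(0) - 1 == 255 under uint8 wraparound, so "void"
+            # pixels end up equal to self.ignore_index and are skipped.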
+ label = label - 1 + label = label[np.newaxis, :, :] + return im, label + else: + im, label = self.transforms(im=image_path, label=label_path) + label = label - 1 + # Recover the ignore pixels adding by transform + label[label == 254] = 255 + if self.edge: + edge_mask = F.mask_to_binary_edge( + label, radius=2, num_classes=self.num_classes) + return im, label, edge_mask + else: + return im, label diff --git a/paddlers/models/ppseg/datasets/chase_db1.py b/paddlers/models/ppseg/datasets/chase_db1.py new file mode 100644 index 0000000..9ddec59 --- /dev/null +++ b/paddlers/models/ppseg/datasets/chase_db1.py @@ -0,0 +1,98 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from paddlers.models.ppseg.utils.download import download_file_and_uncompress +from paddlers.models.ppseg.utils import seg_env +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.transforms import Compose +from paddlers.models.ppseg.datasets import Dataset + +URL = 'https://bj.bcebos.com/paddleseg/dataset/chase_db1/chase_db1.zip' + + +@manager.DATASETS.add_component +class CHASEDB1(Dataset): + """ + CHASE_DB1 dataset is a dataset for retinal vessel segmentation + which contains 28 color retina images with the size of 999×960 pixels. + It is collected from both left and right eyes of 14 school children. + Each image is annotated by two independent human experts, and we choose the labels from 1st expert. + (https://blogs.kingston.ac.uk/retinal/chasedb1/) + + Args: + transforms (list): Transforms for image. + dataset_root (str): The dataset directory. Default: None + edge (bool): whether extract edge infor in the output + mode (str, optional): Which part of dataset to use. it is one of ('train', 'val', 'test'). Default: 'train'. 
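+
+    Examples:
+
+        # An illustrative sketch; the dataset is downloaded automatically
+        # when `dataset_root` is None.
+        import paddlers.models.ppseg.transforms as T
+        from paddlers.models.ppseg.datasets import CHASEDB1
+
+        train_dataset = CHASEDB1(transforms=[T.Normalize()], mode='train')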
+ """ + NUM_CLASSES = 2 + + def __init__(self, + dataset_root=None, + transforms=None, + edge=False, + mode='train'): + self.dataset_root = dataset_root + self.transforms = Compose(transforms) + mode = mode.lower() + self.mode = mode + self.edge = edge + self.file_list = list() + self.num_classes = self.NUM_CLASSES + self.ignore_index = 255 # labels only have 1/0, thus ignore_index is not necessary + + if mode not in ['train', 'val', 'test']: + raise ValueError( + "`mode` should be 'train', 'val' or 'test', but got {}.".format( + mode)) + + if self.transforms is None: + raise ValueError("`transforms` is necessary, but it is None.") + + if self.dataset_root is None: + self.dataset_root = download_file_and_uncompress( + url=URL, + savepath=seg_env.DATA_HOME, + extrapath=seg_env.DATA_HOME) + elif not os.path.exists(self.dataset_root): + self.dataset_root = os.path.normpath(self.dataset_root) + savepath, extraname = self.dataset_root.rsplit( + sep=os.path.sep, maxsplit=1) + self.dataset_root = download_file_and_uncompress( + url=URL, + savepath=savepath, + extrapath=savepath, + extraname=extraname) + + if mode == 'train': + file_path = os.path.join(self.dataset_root, 'train_list.txt') + elif mode == 'val': + file_path = os.path.join(self.dataset_root, 'val_list.txt') + + with open(file_path, 'r') as f: + for line in f: + items = line.strip().split() + if len(items) != 2: + if mode == 'train' or mode == 'val': + raise Exception( + "File list format incorrect! It should be" + " image_name label_name\\n") + image_path = os.path.join(self.dataset_root, items[0]) + grt_path = None + else: + image_path = os.path.join(self.dataset_root, items[0]) + grt_path = os.path.join(self.dataset_root, items[1]) + self.file_list.append([image_path, grt_path]) diff --git a/paddlers/models/ppseg/datasets/cityscapes.py b/paddlers/models/ppseg/datasets/cityscapes.py new file mode 100644 index 0000000..1a5ba79 --- /dev/null +++ b/paddlers/models/ppseg/datasets/cityscapes.py @@ -0,0 +1,87 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import glob + +from paddlers.models.ppseg.datasets import Dataset +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.transforms import Compose + + +@manager.DATASETS.add_component +class Cityscapes(Dataset): + """ + Cityscapes dataset `https://www.cityscapes-dataset.com/`. + The folder structure is as follow: + + cityscapes + | + |--leftImg8bit + | |--train + | |--val + | |--test + | + |--gtFine + | |--train + | |--val + | |--test + + Make sure there are **labelTrainIds.png in gtFine directory. If not, please run the conver_cityscapes.py in tools. + + Args: + transforms (list): Transforms for image. + dataset_root (str): Cityscapes dataset directory. + mode (str, optional): Which part of dataset to use. it is one of ('train', 'val', 'test'). Default: 'train'. + edge (bool, optional): Whether to compute edge while training. 
Default: False + """ + NUM_CLASSES = 19 + + def __init__(self, transforms, dataset_root, mode='train', edge=False): + self.dataset_root = dataset_root + self.transforms = Compose(transforms) + self.file_list = list() + mode = mode.lower() + self.mode = mode + self.num_classes = self.NUM_CLASSES + self.ignore_index = 255 + self.edge = edge + + if mode not in ['train', 'val', 'test']: + raise ValueError( + "mode should be 'train', 'val' or 'test', but got {}.".format( + mode)) + + if self.transforms is None: + raise ValueError("`transforms` is necessary, but it is None.") + + img_dir = os.path.join(self.dataset_root, 'leftImg8bit') + label_dir = os.path.join(self.dataset_root, 'gtFine') + if self.dataset_root is None or not os.path.isdir( + self.dataset_root) or not os.path.isdir( + img_dir) or not os.path.isdir(label_dir): + raise ValueError( + "The dataset is not Found or the folder structure is nonconfoumance." + ) + + label_files = sorted( + glob.glob( + os.path.join(label_dir, mode, '*', + '*_gtFine_labelTrainIds.png'))) + img_files = sorted( + glob.glob(os.path.join(img_dir, mode, '*', '*_leftImg8bit.png'))) + + self.file_list = [[ + img_path, label_path + ] for img_path, label_path in zip(img_files, label_files)] diff --git a/paddlers/models/ppseg/datasets/cocostuff.py b/paddlers/models/ppseg/datasets/cocostuff.py new file mode 100644 index 0000000..d3ea771 --- /dev/null +++ b/paddlers/models/ppseg/datasets/cocostuff.py @@ -0,0 +1,82 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import glob + +from paddlers.models.ppseg.datasets import Dataset +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.transforms import Compose + + +@manager.DATASETS.add_component +class CocoStuff(Dataset): + """ + COCO-Stuff dataset `https://github.com/nightrome/cocostuff`. + The folder structure is as follow: + + cocostuff + | + |--images + | |--train2017 + | |--val2017 + | + |--annotations + | |--train2017 + | |--val2017 + + + Args: + transforms (list): Transforms for image. + dataset_root (str): Cityscapes dataset directory. + mode (str): Which part of dataset to use. it is one of ('train', 'val'). Default: 'train'. + edge (bool, optional): Whether to compute edge while training. 
Default: False + """ + NUM_CLASSES = 171 + + def __init__(self, transforms, dataset_root, mode='train', edge=False): + self.dataset_root = dataset_root + self.transforms = Compose(transforms) + self.file_list = list() + mode = mode.lower() + self.mode = mode + self.num_classes = self.NUM_CLASSES + self.ignore_index = 255 + self.edge = edge + + if mode not in ['train', 'val']: + raise ValueError( + "mode should be 'train', 'val', but got {}.".format(mode)) + + if self.transforms is None: + raise ValueError("`transforms` is necessary, but it is None.") + + img_dir = os.path.join(self.dataset_root, 'images') + label_dir = os.path.join(self.dataset_root, 'annotations') + if self.dataset_root is None or not os.path.isdir( + self.dataset_root) or not os.path.isdir( + img_dir) or not os.path.isdir(label_dir): + raise ValueError( + "The dataset is not Found or the folder structure is nonconfoumance." + ) + + label_files = sorted( + glob.glob(os.path.join(label_dir, mode + '2017', '*.png'))) + + img_files = sorted( + glob.glob(os.path.join(img_dir, mode + '2017', '*.jpg'))) + + self.file_list = [[ + img_path, label_path + ] for img_path, label_path in zip(img_files, label_files)] diff --git a/paddlers/models/ppseg/datasets/dataset.py b/paddlers/models/ppseg/datasets/dataset.py new file mode 100644 index 0000000..b33473a --- /dev/null +++ b/paddlers/models/ppseg/datasets/dataset.py @@ -0,0 +1,162 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import paddle +import numpy as np +from PIL import Image + +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.transforms import Compose +import paddlers.models.ppseg.transforms.functional as F + + +@manager.DATASETS.add_component +class Dataset(paddle.io.Dataset): + """ + Pass in a custom dataset that conforms to the format. + + Args: + transforms (list): Transforms for image. + dataset_root (str): The dataset directory. + num_classes (int): Number of classes. + mode (str, optional): which part of dataset to use. it is one of ('train', 'val', 'test'). Default: 'train'. + train_path (str, optional): The train dataset file. When mode is 'train', train_path is necessary. + The contents of train_path file are as follow: + image1.jpg ground_truth1.png + image2.jpg ground_truth2.png + val_path (str. optional): The evaluation dataset file. When mode is 'val', val_path is necessary. + The contents is the same as train_path + test_path (str, optional): The test dataset file. When mode is 'test', test_path is necessary. + The annotation file is not necessary in test_path file. + separator (str, optional): The separator of dataset list. Default: ' '. + edge (bool, optional): Whether to compute edge while training. 
Default: False + + Examples: + + import paddlers.models.ppseg.transforms as T + from paddlers.models.ppseg.datasets import Dataset + + transforms = [T.RandomPaddingCrop(crop_size=(512,512)), T.Normalize()] + dataset_root = 'dataset_root_path' + train_path = 'train_path' + num_classes = 2 + dataset = Dataset(transforms = transforms, + dataset_root = dataset_root, + num_classes = 2, + train_path = train_path, + mode = 'train') + + """ + + def __init__(self, + transforms, + dataset_root, + num_classes, + mode='train', + train_path=None, + val_path=None, + test_path=None, + separator=' ', + ignore_index=255, + edge=False): + self.dataset_root = dataset_root + self.transforms = Compose(transforms) + self.file_list = list() + self.mode = mode.lower() + self.num_classes = num_classes + self.ignore_index = ignore_index + self.edge = edge + + if self.mode not in ['train', 'val', 'test']: + raise ValueError( + "mode should be 'train', 'val' or 'test', but got {}.".format( + self.mode)) + + if self.transforms is None: + raise ValueError("`transforms` is necessary, but it is None.") + + if not os.path.exists(self.dataset_root): + raise FileNotFoundError('there is not `dataset_root`: {}.'.format( + self.dataset_root)) + + if self.mode == 'train': + if train_path is None: + raise ValueError( + 'When `mode` is "train", `train_path` is necessary, but it is None.' + ) + elif not os.path.exists(train_path): + raise FileNotFoundError( + '`train_path` is not found: {}'.format(train_path)) + else: + file_path = train_path + elif self.mode == 'val': + if val_path is None: + raise ValueError( + 'When `mode` is "val", `val_path` is necessary, but it is None.' + ) + elif not os.path.exists(val_path): + raise FileNotFoundError( + '`val_path` is not found: {}'.format(val_path)) + else: + file_path = val_path + else: + if test_path is None: + raise ValueError( + 'When `mode` is "test", `test_path` is necessary, but it is None.' + ) + elif not os.path.exists(test_path): + raise FileNotFoundError( + '`test_path` is not found: {}'.format(test_path)) + else: + file_path = test_path + + with open(file_path, 'r') as f: + for line in f: + items = line.strip().split(separator) + if len(items) != 2: + if self.mode == 'train' or self.mode == 'val': + raise ValueError( + "File list format incorrect! In training or evaluation task it should be" + " image_name{}label_name\\n".format(separator)) + image_path = os.path.join(self.dataset_root, items[0]) + label_path = None + else: + image_path = os.path.join(self.dataset_root, items[0]) + label_path = os.path.join(self.dataset_root, items[1]) + self.file_list.append([image_path, label_path]) + + def __getitem__(self, idx): + image_path, label_path = self.file_list[idx] + if self.mode == 'test': + im, _ = self.transforms(im=image_path) + im = im[np.newaxis, ...] + return im, image_path + elif self.mode == 'val': + im, _ = self.transforms(im=image_path) + label = np.asarray(Image.open(label_path)) + label = label[np.newaxis, :, :] + return im, label + else: + im, label = self.transforms(im=image_path, label=label_path) + if self.edge: + edge_mask = F.mask_to_binary_edge( + label, radius=2, num_classes=self.num_classes) + return im, label, edge_mask + else: + return im, label + + def __len__(self): + return len(self.file_list) diff --git a/paddlers/models/ppseg/datasets/drive.py b/paddlers/models/ppseg/datasets/drive.py new file mode 100644 index 0000000..f4180fc --- /dev/null +++ b/paddlers/models/ppseg/datasets/drive.py @@ -0,0 +1,96 @@ +# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from paddlers.models.ppseg.utils.download import download_file_and_uncompress +from paddlers.models.ppseg.utils import seg_env +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.transforms import Compose +from paddlers.models.ppseg.datasets import Dataset + +URL = 'https://bj.bcebos.com/paddleseg/dataset/drive/drive.zip' + + +@manager.DATASETS.add_component +class DRIVE(Dataset): + """ + The Digital Retinal Images for Vessel Extraction (DRIVE) dataset is a dataset for retinal vessel segmentation. + It consists of a total of JPEG 40 color fundus images which is of size (584, 565); including 7 abnormal pathology cases. + (http://www.isi.uu.nl/Research/Databases/DRIVE/) + + Args: + transforms (list): Transforms for image. + dataset_root (str): The dataset directory. Default: None + edge (bool): whether extract edge infor in the output + mode (str, optional): Which part of dataset to use. it is one of ('train', 'val', 'test'). Default: 'train'. + """ + NUM_CLASSES = 2 + + def __init__(self, + dataset_root=None, + transforms=None, + edge=False, + mode='train'): + self.dataset_root = dataset_root + self.transforms = Compose(transforms) + mode = mode.lower() + self.mode = mode + self.edge = edge + self.file_list = list() + self.num_classes = self.NUM_CLASSES + self.ignore_index = 255 # labels only have 1/0, thus ignore_index is not necessary + + if mode not in ['train', 'val', 'test']: + raise ValueError( + "`mode` should be 'train', 'val' or 'test', but got {}.".format( + mode)) + + if self.transforms is None: + raise ValueError("`transforms` is necessary, but it is None.") + + if self.dataset_root is None: + self.dataset_root = download_file_and_uncompress( + url=URL, + savepath=seg_env.DATA_HOME, + extrapath=seg_env.DATA_HOME) + elif not os.path.exists(self.dataset_root): + self.dataset_root = os.path.normpath(self.dataset_root) + savepath, extraname = self.dataset_root.rsplit( + sep=os.path.sep, maxsplit=1) + self.dataset_root = download_file_and_uncompress( + url=URL, + savepath=savepath, + extrapath=savepath, + extraname=extraname) + + if mode == 'train': + file_path = os.path.join(self.dataset_root, 'train_list.txt') + elif mode == 'val': + file_path = os.path.join(self.dataset_root, 'val_list.txt') + + with open(file_path, 'r') as f: + for line in f: + items = line.strip().split() + if len(items) != 2: + if mode == 'train' or mode == 'val': + raise Exception( + "File list format incorrect! 
It should be" + " image_name label_name\\n") + image_path = os.path.join(self.dataset_root, items[0]) + grt_path = None + else: + image_path = os.path.join(self.dataset_root, items[0]) + grt_path = os.path.join(self.dataset_root, items[1]) + self.file_list.append([image_path, grt_path]) diff --git a/paddlers/models/ppseg/datasets/eg1800.py b/paddlers/models/ppseg/datasets/eg1800.py new file mode 100644 index 0000000..fbeef04 --- /dev/null +++ b/paddlers/models/ppseg/datasets/eg1800.py @@ -0,0 +1,136 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import copy + +import cv2 +import numpy as np + +from paddlers.models.ppseg.datasets import Dataset +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.transforms import Compose +from paddlers.models.ppseg.utils.download import download_file_and_uncompress +from paddlers.models.ppseg.utils import seg_env +import paddlers.models.ppseg.transforms.functional as F + +URL = "https://paddleseg.bj.bcebos.com/dataset/EG1800.zip" + + +@manager.DATASETS.add_component +class EG1800(Dataset): + """ + EG1800 dataset `http://xiaoyongshen.me/webpage_portrait/index.html`. + + Args: + common_transforms (list): A list of common image transformations for two inputs of portrait net. + transforms1 (list): A list of image transformations for the first input of portrait net. + transforms2 (list): A list of image transformations for the second input of portrait net. + dataset_root (str, optional): The EG1800 dataset directory. Default: None. + mode (str, optional): A subset of the entire dataset. It should be one of ('train', 'val'). Default: 'train'. + edge (bool, optional): Whether to compute edge while training. 
Default: False + """ + NUM_CLASSES = 2 + + def __init__(self, + common_transforms, + transforms1, + transforms2, + dataset_root=None, + mode='train', + edge=False): + self.dataset_root = dataset_root + self.common_transforms = Compose(common_transforms) + self.transforms = self.common_transforms + if transforms1 is not None: + self.transforms1 = Compose(transforms1, to_rgb=False) + if transforms2 is not None: + self.transforms2 = Compose(transforms2, to_rgb=False) + mode = mode.lower() + self.ignore_index = 255 + self.mode = mode + self.num_classes = self.NUM_CLASSES + self.input_width = 224 + self.input_height = 224 + + if self.dataset_root is None: + self.dataset_root = download_file_and_uncompress( + url=URL, + savepath=seg_env.DATA_HOME, + extrapath=seg_env.DATA_HOME) + elif not os.path.exists(self.dataset_root): + self.dataset_root = os.path.normpath(self.dataset_root) + savepath, extraname = self.dataset_root.rsplit( + sep=os.path.sep, maxsplit=1) + self.dataset_root = download_file_and_uncompress( + url=URL, + savepath=savepath, + extrapath=savepath, + extraname=extraname) + + if mode == 'train': + path = os.path.join(dataset_root, 'eg1800_train.txt') + else: + path = os.path.join(dataset_root, 'eg1800_test.txt') + with open(path, 'r') as f: + files = f.readlines() + img_files = [ + os.path.join(dataset_root, 'Images', file).strip() for file in files + ] + label_files = [ + os.path.join(dataset_root, 'Labels', file).strip() for file in files + ] + + self.file_list = [[ + img_path, label_path + ] for img_path, label_path in zip(img_files, label_files)] + pass + + def __getitem__(self, item): + image_path, label_path = self.file_list[item] + im = cv2.imread(image_path) + label = cv2.imread(label_path, 0) + label[label > 1] = 0 + + if self.mode == "val": + common_im, label = self.common_transforms(im=im, label=label) + im = np.float32(common_im[::-1, :, :]) # RGB => BGR + im_aug = copy.deepcopy(im) + else: + common_im, label = self.common_transforms(im=im, label=label) + common_im = np.transpose(common_im, [1, 2, 0]) + # add augmentation + im, _ = self.transforms1(common_im) + im_aug, _ = self.transforms2(common_im) + + im = np.float32(im[::-1, :, :]) # RGB => BGR + im_aug = np.float32(im_aug[::-1, :, :]) # RGB => BGR + + label = cv2.resize( + np.uint8(label), (self.input_width, self.input_height), + interpolation=cv2.INTER_NEAREST) + + # add mask blur + label = np.uint8(cv2.blur(label, (5, 5))) + label[label >= 0.5] = 1 + label[label < 0.5] = 0 + + edge_mask = F.mask_to_binary_edge( + label, radius=4, num_classes=self.num_classes) + edge_mask = np.transpose(edge_mask, [1, 2, 0]).squeeze(axis=-1) + im = np.concatenate([im_aug, im]) + if self.mode == "train": + return im, label, edge_mask + else: + return im, label diff --git a/paddlers/models/ppseg/datasets/hrf.py b/paddlers/models/ppseg/datasets/hrf.py new file mode 100644 index 0000000..eadd8b2 --- /dev/null +++ b/paddlers/models/ppseg/datasets/hrf.py @@ -0,0 +1,95 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from paddlers.models.ppseg.utils.download import download_file_and_uncompress +from paddlers.models.ppseg.utils import seg_env +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.transforms import Compose +from paddlers.models.ppseg.datasets import Dataset + +URL = 'https://bj.bcebos.com/paddleseg/dataset/hrf/hrf.zip' + + +@manager.DATASETS.add_component +class HRF(Dataset): + """ + The HRF dataset is a dataset for retinal vessel segmentation which comprises 45 images and is organized as 15 subsets. Each subset contains one healthy fundus image, one image of patient with diabetic retinopathy and one glaucoma image. The image sizes are 3,304 x 2,336, with a training/testing image split of 21/24. + (https://doi.org/10.1155/2013/154860) + + Args: + transforms (list): Transforms for image. + dataset_root (str): The dataset directory. Default: None + edge (bool): whether extract edge infor in the output + mode (str, optional): Which part of dataset to use. it is one of ('train', 'val', 'test'). Default: 'train'. + """ + NUM_CLASSES = 2 + + def __init__(self, + dataset_root=None, + transforms=None, + edge=False, + mode='train'): + self.dataset_root = dataset_root + self.transforms = Compose(transforms) + mode = mode.lower() + self.mode = mode + self.edge = edge + self.file_list = list() + self.num_classes = self.NUM_CLASSES + self.ignore_index = 255 + + if mode not in ['train', 'val', 'test']: + raise ValueError( + "`mode` should be 'train', 'val' or 'test', but got {}.".format( + mode)) + + if self.transforms is None: + raise ValueError("`transforms` is necessary, but it is None.") + + if self.dataset_root is None: + self.dataset_root = download_file_and_uncompress( + url=URL, + savepath=seg_env.DATA_HOME, + extrapath=seg_env.DATA_HOME) + elif not os.path.exists(self.dataset_root): + self.dataset_root = os.path.normpath(self.dataset_root) + savepath, extraname = self.dataset_root.rsplit( + sep=os.path.sep, maxsplit=1) + self.dataset_root = download_file_and_uncompress( + url=URL, + savepath=savepath, + extrapath=savepath, + extraname=extraname) + + if mode == 'train': + file_path = os.path.join(self.dataset_root, 'train_list.txt') + elif mode == 'val': + file_path = os.path.join(self.dataset_root, 'val_list.txt') + + with open(file_path, 'r') as f: + for line in f: + items = line.strip().split() + if len(items) != 2: + if mode == 'train' or mode == 'val': + raise Exception( + "File list format incorrect! It should be" + " image_name label_name\\n") + image_path = os.path.join(self.dataset_root, items[0]) + grt_path = None + else: + image_path = os.path.join(self.dataset_root, items[0]) + grt_path = os.path.join(self.dataset_root, items[1]) + self.file_list.append([image_path, grt_path]) diff --git a/paddlers/models/ppseg/datasets/mini_deep_globe_road_extraction.py b/paddlers/models/ppseg/datasets/mini_deep_globe_road_extraction.py new file mode 100644 index 0000000..253967a --- /dev/null +++ b/paddlers/models/ppseg/datasets/mini_deep_globe_road_extraction.py @@ -0,0 +1,95 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from .dataset import Dataset +from paddlers.models.ppseg.utils.download import download_file_and_uncompress +from paddlers.models.ppseg.utils import seg_env +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.transforms import Compose + +URL = "https://paddleseg.bj.bcebos.com/dataset/MiniDeepGlobeRoadExtraction.zip" + + +@manager.DATASETS.add_component +class MiniDeepGlobeRoadExtraction(Dataset): + """ + MiniDeepGlobeRoadExtraction dataset is extraced from DeepGlobe CVPR2018 challenge (http://deepglobe.org/) + + There are 800 images in the training set and 200 images in the validation set. + + Args: + dataset_root (str, optional): The dataset directory. Default: None. + transforms (list, optional): Transforms for image. Default: None. + mode (str, optional): Which part of dataset to use. It is one of ('train', 'val'). Default: 'train'. + edge (bool, optional): Whether to compute edge while training. Default: False. + """ + NUM_CLASSES = 2 + + def __init__(self, + dataset_root=None, + transforms=None, + mode='train', + edge=False): + self.dataset_root = dataset_root + self.transforms = Compose(transforms) + mode = mode.lower() + self.mode = mode + self.file_list = list() + self.num_classes = self.NUM_CLASSES + self.ignore_index = 255 + self.edge = edge + + if mode not in ['train', 'val']: + raise ValueError( + "`mode` should be 'train' or 'val', but got {}.".format(mode)) + + if self.transforms is None: + raise ValueError("`transforms` is necessary, but it is None.") + + if self.dataset_root is None: + self.dataset_root = download_file_and_uncompress( + url=URL, + savepath=seg_env.DATA_HOME, + extrapath=seg_env.DATA_HOME) + elif not os.path.exists(self.dataset_root): + self.dataset_root = os.path.normpath(self.dataset_root) + savepath, extraname = self.dataset_root.rsplit( + sep=os.path.sep, maxsplit=1) + self.dataset_root = download_file_and_uncompress( + url=URL, + savepath=savepath, + extrapath=savepath, + extraname=extraname) + + if mode == 'train': + file_path = os.path.join(self.dataset_root, 'train.txt') + else: + file_path = os.path.join(self.dataset_root, 'val.txt') + + with open(file_path, 'r') as f: + for line in f: + items = line.strip().split('|') + if len(items) != 2: + if mode == 'train' or mode == 'val': + raise Exception( + "File list format incorrect! It should be" + " image_name|label_name\\n") + image_path = os.path.join(self.dataset_root, items[0]) + grt_path = None + else: + image_path = os.path.join(self.dataset_root, items[0]) + grt_path = os.path.join(self.dataset_root, items[1]) + self.file_list.append([image_path, grt_path]) diff --git a/paddlers/models/ppseg/datasets/optic_disc_seg.py b/paddlers/models/ppseg/datasets/optic_disc_seg.py new file mode 100644 index 0000000..805c80d --- /dev/null +++ b/paddlers/models/ppseg/datasets/optic_disc_seg.py @@ -0,0 +1,97 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
diff --git a/paddlers/models/ppseg/datasets/optic_disc_seg.py b/paddlers/models/ppseg/datasets/optic_disc_seg.py
new file mode 100644
index 0000000..805c80d
--- /dev/null
+++ b/paddlers/models/ppseg/datasets/optic_disc_seg.py
@@ -0,0 +1,97 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+from .dataset import Dataset
+from paddlers.models.ppseg.utils.download import download_file_and_uncompress
+from paddlers.models.ppseg.utils import seg_env
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.transforms import Compose
+
+URL = "https://paddleseg.bj.bcebos.com/dataset/optic_disc_seg.zip"
+
+
+@manager.DATASETS.add_component
+class OpticDiscSeg(Dataset):
+    """
+    The OpticDiscSeg dataset is extracted from iChallenge-AMD
+    (https://ai.baidu.com/broad/subordinate?dataset=amd).
+
+    Args:
+        transforms (list): Transforms for image.
+        dataset_root (str, optional): The dataset directory. Default: None.
+        mode (str, optional): Which part of dataset to use. It is one of ('train', 'val', 'test'). Default: 'train'.
+        edge (bool, optional): Whether to compute edge while training. Default: False.
+    """
+    NUM_CLASSES = 2
+
+    def __init__(self,
+                 dataset_root=None,
+                 transforms=None,
+                 mode='train',
+                 edge=False):
+        self.dataset_root = dataset_root
+        self.transforms = Compose(transforms)
+        mode = mode.lower()
+        self.mode = mode
+        self.file_list = list()
+        self.num_classes = self.NUM_CLASSES
+        self.ignore_index = 255
+        self.edge = edge
+
+        if mode not in ['train', 'val', 'test']:
+            raise ValueError(
+                "`mode` should be 'train', 'val' or 'test', but got {}.".format(
+                    mode))
+
+        if self.transforms is None:
+            raise ValueError("`transforms` is necessary, but it is None.")
+
+        if self.dataset_root is None:
+            self.dataset_root = download_file_and_uncompress(
+                url=URL,
+                savepath=seg_env.DATA_HOME,
+                extrapath=seg_env.DATA_HOME)
+        elif not os.path.exists(self.dataset_root):
+            self.dataset_root = os.path.normpath(self.dataset_root)
+            savepath, extraname = self.dataset_root.rsplit(
+                sep=os.path.sep, maxsplit=1)
+            self.dataset_root = download_file_and_uncompress(
+                url=URL,
+                savepath=savepath,
+                extrapath=savepath,
+                extraname=extraname)
+
+        if mode == 'train':
+            file_path = os.path.join(self.dataset_root, 'train_list.txt')
+        elif mode == 'val':
+            file_path = os.path.join(self.dataset_root, 'val_list.txt')
+        else:
+            file_path = os.path.join(self.dataset_root, 'test_list.txt')
+
+        with open(file_path, 'r') as f:
+            for line in f:
+                items = line.strip().split()
+                if len(items) != 2:
+                    if mode == 'train' or mode == 'val':
+                        raise Exception(
+                            "File list format incorrect! It should be"
+                            " image_name label_name\\n")
+                    image_path = os.path.join(self.dataset_root, items[0])
+                    grt_path = None
+                else:
+                    image_path = os.path.join(self.dataset_root, items[0])
+                    grt_path = os.path.join(self.dataset_root, items[1])
+                self.file_list.append([image_path, grt_path])
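+
+
+# Note on 'test' mode: a list line may contain only an image path, in which
+# case the loop above stores grt_path=None so that unlabeled images can still
+# be predicted; in 'train'/'val' mode such lines raise an exception instead.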
diff --git a/paddlers/models/ppseg/datasets/pascal_context.py b/paddlers/models/ppseg/datasets/pascal_context.py
new file mode 100644
index 0000000..a7e8ade
--- /dev/null
+++ b/paddlers/models/ppseg/datasets/pascal_context.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+from paddlers.models.ppseg.datasets import Dataset
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.transforms import Compose
+
+
+@manager.DATASETS.add_component
+class PascalContext(Dataset):
+    """
+    PascalVOC2010 dataset `http://host.robots.ox.ac.uk/pascal/VOC/`.
+    To use the Pascal Context dataset, please first run the convert_voc2010.py script in tools.
+
+    Args:
+        transforms (list): Transforms for image.
+        dataset_root (str, optional): The dataset directory. Default: None.
+        mode (str, optional): Which part of dataset to use. It is one of ('train', 'trainval', 'val'). Default: 'train'.
+        edge (bool, optional): Whether to compute edge while training. Default: False.
+    """
+    NUM_CLASSES = 60
+
+    def __init__(self, transforms=None, dataset_root=None, mode='train', edge=False):
+        self.dataset_root = dataset_root
+        self.transforms = Compose(transforms)
+        mode = mode.lower()
+        self.mode = mode
+        self.file_list = list()
+        self.num_classes = self.NUM_CLASSES
+        self.ignore_index = 255
+        self.edge = edge
+
+        if mode not in ['train', 'trainval', 'val']:
+            raise ValueError(
+                "`mode` should be one of ('train', 'trainval', 'val') in PascalContext dataset, but got {}."
+                .format(mode))
+
+        if self.transforms is None:
+            raise ValueError("`transforms` is necessary, but it is None.")
+        if self.dataset_root is None:
+            raise ValueError(
+                "The dataset is not found or the folder structure is nonconformant."
+            )
+
+        image_set_dir = os.path.join(self.dataset_root, 'ImageSets',
+                                     'Segmentation')
+
+        if mode == 'train':
+            file_path = os.path.join(image_set_dir, 'train_context.txt')
+        elif mode == 'val':
+            file_path = os.path.join(image_set_dir, 'val_context.txt')
+        elif mode == 'trainval':
+            file_path = os.path.join(image_set_dir, 'trainval_context.txt')
+        if not os.path.exists(file_path):
+            raise RuntimeError(
+                "PASCAL-Context annotations are not ready. "
+                "Please make sure convert_voc2010.py has been properly run.")
+
+        img_dir = os.path.join(self.dataset_root, 'JPEGImages')
+        label_dir = os.path.join(self.dataset_root, 'Context')
+
+        with open(file_path, 'r') as f:
+            for line in f:
+                line = line.strip()
+                image_path = os.path.join(img_dir, ''.join([line, '.jpg']))
+                label_path = os.path.join(label_dir, ''.join([line, '.png']))
+                self.file_list.append([image_path, label_path])
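+
+
+# Worked example of the path assembly above: a line '2008_000002' in
+# train_context.txt yields the pair
+# (<dataset_root>/JPEGImages/2008_000002.jpg,
+#  <dataset_root>/Context/2008_000002.png). The ID shown is illustrative.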
diff --git a/paddlers/models/ppseg/datasets/pp_humanseg14k.py b/paddlers/models/ppseg/datasets/pp_humanseg14k.py
new file mode 100644
index 0000000..ba124a3
--- /dev/null
+++ b/paddlers/models/ppseg/datasets/pp_humanseg14k.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+from .dataset import Dataset
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.transforms import Compose
+
+
+@manager.DATASETS.add_component
+class PPHumanSeg14K(Dataset):
+    """
+    This is the PP-HumanSeg14K dataset.
+
+    This dataset was introduced in the work:
+    Chu, Lutao, et al. "PP-HumanSeg: Connectivity-Aware Portrait Segmentation with a Large-Scale Teleconferencing Video Dataset." Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision. 2022.
+
+    The dataset is divided into a training set of 8,770 images, a validation set of 2,431 images, and a test set of 2,482 images.
+
+    Args:
+        dataset_root (str, optional): The dataset directory. Default: None.
+        transforms (list, optional): Transforms for image. Default: None.
+        mode (str, optional): Which part of dataset to use. It is one of ('train', 'val', 'test'). Default: 'train'.
+        edge (bool, optional): Whether to compute edge while training. Default: False.
+    """
+    NUM_CLASSES = 2
+
+    def __init__(self,
+                 dataset_root=None,
+                 transforms=None,
+                 mode='train',
+                 edge=False):
+        self.dataset_root = dataset_root
+        self.transforms = Compose(transforms)
+        mode = mode.lower()
+        self.mode = mode
+        self.file_list = list()
+        self.num_classes = self.NUM_CLASSES
+        self.ignore_index = 255
+        self.edge = edge
+
+        if mode not in ['train', 'val', 'test']:
+            raise ValueError(
+                "`mode` should be 'train', 'val' or 'test', but got {}.".format(
+                    mode))
+
+        if self.transforms is None:
+            raise ValueError("`transforms` is necessary, but it is None.")
+
+        if self.dataset_root is None:
+            # No download URL is defined for this dataset, so the root
+            # directory must be supplied by the user.
+            raise ValueError("`dataset_root` is necessary, but it is None.")
+
+        if mode == 'train':
+            file_path = os.path.join(self.dataset_root, 'train.txt')
+        elif mode == 'val':
+            file_path = os.path.join(self.dataset_root, 'val.txt')
+        else:
+            file_path = os.path.join(self.dataset_root, 'test.txt')
+
+        with open(file_path, 'r') as f:
+            for line in f:
+                items = line.strip().split(' ')
+                if len(items) != 2:
+                    if mode == 'train' or mode == 'val':
+                        raise Exception(
+                            "File list format incorrect! It should be"
+                            " image_name label_name\\n")
+                    image_path = os.path.join(self.dataset_root, items[0])
+                    grt_path = None
+                else:
+                    image_path = os.path.join(self.dataset_root, items[0])
+                    grt_path = os.path.join(self.dataset_root, items[1])
+                self.file_list.append([image_path, grt_path])
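+
+
+# Expected list format (as parsed above): one pair of paths per line,
+# relative to dataset_root and separated by a single space, e.g.
+#
+#   images/train/xxx.jpg annotations/train/xxx.png
+#
+# (Directory names are illustrative; the parser assumes only the single
+# space between the two relative paths.)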
diff --git a/paddlers/models/ppseg/datasets/stare.py b/paddlers/models/ppseg/datasets/stare.py
new file mode 100644
index 0000000..1acf64e
--- /dev/null
+++ b/paddlers/models/ppseg/datasets/stare.py
@@ -0,0 +1,95 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+from paddlers.models.ppseg.utils.download import download_file_and_uncompress
+from paddlers.models.ppseg.utils import seg_env
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.transforms import Compose
+from paddlers.models.ppseg.datasets import Dataset
+
+URL = 'https://bj.bcebos.com/paddleseg/dataset/stare/stare.zip'
+
+
+@manager.DATASETS.add_component
+class STARE(Dataset):
+    """
+    The STARE dataset is processed from the STARE (STructured Analysis of the Retina) project.
+    (https://cecas.clemson.edu/~ahoover/stare/)
+
+    Args:
+        transforms (list): Transforms for image.
+        dataset_root (str, optional): The dataset directory. Default: None.
+        edge (bool, optional): Whether to extract edge information in the output. Default: False.
+        mode (str, optional): Which part of dataset to use. It is one of ('train', 'val', 'test'). Default: 'train'.
+    """
+    NUM_CLASSES = 2
+
+    def __init__(self,
+                 dataset_root=None,
+                 transforms=None,
+                 edge=False,
+                 mode='train'):
+        self.dataset_root = dataset_root
+        self.transforms = Compose(transforms)
+        mode = mode.lower()
+        self.mode = mode
+        self.edge = edge
+        self.file_list = list()
+        self.num_classes = self.NUM_CLASSES
+        self.ignore_index = 255
+
+        if mode not in ['train', 'val', 'test']:
+            raise ValueError(
+                "`mode` should be 'train', 'val' or 'test', but got {}.".format(
+                    mode))
+
+        if self.transforms is None:
+            raise ValueError("`transforms` is necessary, but it is None.")
+
+        if self.dataset_root is None:
+            self.dataset_root = download_file_and_uncompress(
+                url=URL,
+                savepath=seg_env.DATA_HOME,
+                extrapath=seg_env.DATA_HOME)
+        elif not os.path.exists(self.dataset_root):
+            self.dataset_root = os.path.normpath(self.dataset_root)
+            # e.g. 'data/stare' -> savepath='data', extraname='stare'
+            savepath, extraname = self.dataset_root.rsplit(
+                sep=os.path.sep, maxsplit=1)
+            self.dataset_root = download_file_and_uncompress(
+                url=URL,
+                savepath=savepath,
+                extrapath=savepath,
+                extraname=extraname)
+
+        if mode == 'train':
+            file_path = os.path.join(self.dataset_root, 'train_list.txt')
+        elif mode == 'val':
+            file_path = os.path.join(self.dataset_root, 'val_list.txt')
+        else:
+            # 'test' mode previously left `file_path` undefined; it reads a
+            # test list named analogously to the train/val lists above.
+            file_path = os.path.join(self.dataset_root, 'test_list.txt')
+
+        with open(file_path, 'r') as f:
+            for line in f:
+                items = line.strip().split()
+                if len(items) != 2:
+                    if mode == 'train' or mode == 'val':
+                        raise Exception(
+                            "File list format incorrect! It should be"
+                            " image_name label_name\\n")
+                    image_path = os.path.join(self.dataset_root, items[0])
+                    grt_path = None
+                else:
+                    image_path = os.path.join(self.dataset_root, items[0])
+                    grt_path = os.path.join(self.dataset_root, items[1])
+                self.file_list.append([image_path, grt_path])
diff --git a/paddlers/models/ppseg/datasets/supervisely.py b/paddlers/models/ppseg/datasets/supervisely.py
new file mode 100644
index 0000000..5956de6
--- /dev/null
+++ b/paddlers/models/ppseg/datasets/supervisely.py
@@ -0,0 +1,135 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
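+
+# Implementation note for this file: each sample is built from one image via
+# the shared `common_transforms` pipeline; in 'train' mode two differently
+# augmented views (from `transforms1` and `transforms2`) are then produced
+# and concatenated along the channel axis, as expected by PortraitNet. The
+# label is binarized, blurred to soften mask borders, and paired with a
+# binary edge mask computed by mask_to_binary_edge.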
+ +import os +import copy + +import cv2 +import numpy as np + +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.transforms import Compose +from paddlers.models.ppseg.datasets import Dataset +from paddlers.models.ppseg.utils.download import download_file_and_uncompress +from paddlers.models.ppseg.utils import seg_env +import paddlers.models.ppseg.transforms.functional as F + +URL = "https://paddleseg.bj.bcebos.com/dataset/Supervisely_face.zip" + + +@manager.DATASETS.add_component +class SUPERVISELY(Dataset): + """ + Supervise.ly dataset `https://supervise.ly/`. + + Args: + common_transforms (list): A list of common image transformations for two inputs of portrait net. + transforms1 (list): A list of image transformations for the first input of portrait net. + transforms2 (list): A list of image transformations for the second input of portrait net. + dataset_root (str, optional): The Supervise.ly dataset directory. Default: None. + mode (str, optional): A subset of the entire dataset. It should be one of ('train', 'val'). Default: 'train'. + edge (bool, optional): Whether to compute edge while training. Default: False + """ + NUM_CLASSES = 2 + + def __init__(self, + common_transforms, + transforms1, + transforms2, + dataset_root=None, + mode='train', + edge=False): + self.dataset_root = dataset_root + self.common_transforms = Compose(common_transforms) + self.transforms = self.common_transforms + if transforms1 is not None: + self.transforms1 = Compose(transforms1, to_rgb=False) + if transforms2 is not None: + self.transforms2 = Compose(transforms2, to_rgb=False) + mode = mode.lower() + self.ignore_index = 255 + self.mode = mode + self.num_classes = self.NUM_CLASSES + self.input_width = 224 + self.input_height = 224 + + if self.dataset_root is None: + self.dataset_root = download_file_and_uncompress( + url=URL, + savepath=seg_env.DATA_HOME, + extrapath=seg_env.DATA_HOME) + elif not os.path.exists(self.dataset_root): + self.dataset_root = os.path.normpath(self.dataset_root) + savepath, extraname = self.dataset_root.rsplit( + sep=os.path.sep, maxsplit=1) + self.dataset_root = download_file_and_uncompress( + url=URL, + savepath=savepath, + extrapath=savepath, + extraname=extraname) + + if mode == 'train': + path = os.path.join(dataset_root, 'supervisely_face_train_easy.txt') + else: + path = os.path.join(dataset_root, 'supervisely_face_test_easy.txt') + with open(path, 'r') as f: + files = f.readlines() + files = ["/".join(file.split('/')[1:]) for file in files] + img_files = [os.path.join(dataset_root, file).strip() for file in files] + label_files = [ + os.path.join(dataset_root, file.replace('/img/', '/ann/')).strip() + for file in files + ] + + self.file_list = [[ + img_path, label_path + ] for img_path, label_path in zip(img_files, label_files)] + + def __getitem__(self, item): + image_path, label_path = self.file_list[item] + im = cv2.imread(image_path) + label = cv2.imread(label_path, 0) + label[label > 0] = 1 + + if self.mode == "val": + common_im, label = self.common_transforms(im=im, label=label) + im = np.float32(common_im[::-1, :, :]) # RGB => BGR + im_aug = copy.deepcopy(im) + else: + common_im, label = self.common_transforms(im=im, label=label) + common_im = np.transpose(common_im, [1, 2, 0]) + # add augmentation + im, _ = self.transforms1(common_im) + im_aug, _ = self.transforms2(common_im) + + im = np.float32(im[::-1, :, :]) # RGB => BGR + im_aug = np.float32(im_aug[::-1, :, :]) # RGB => BGR + + label = cv2.resize( + np.uint8(label), 
(self.input_width, self.input_height), + interpolation=cv2.INTER_NEAREST) + + # add mask blur + label = np.uint8(cv2.blur(label, (5, 5))) + label[label >= 0.5] = 1 + label[label < 0.5] = 0 + + edge_mask = F.mask_to_binary_edge( + label, radius=4, num_classes=self.num_classes) + edge_mask = np.transpose(edge_mask, [1, 2, 0]).squeeze(axis=-1) + im = np.concatenate([im_aug, im]) + if self.mode == "train": + return im, label, edge_mask + else: + return im, label diff --git a/paddlers/models/ppseg/datasets/voc.py b/paddlers/models/ppseg/datasets/voc.py new file mode 100644 index 0000000..ffaf5d3 --- /dev/null +++ b/paddlers/models/ppseg/datasets/voc.py @@ -0,0 +1,112 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from paddlers.models.ppseg.datasets import Dataset +from paddlers.models.ppseg.utils.download import download_file_and_uncompress +from paddlers.models.ppseg.utils import seg_env +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.transforms import Compose + +URL = "http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar" + + +@manager.DATASETS.add_component +class PascalVOC(Dataset): + """ + PascalVOC2012 dataset `http://host.robots.ox.ac.uk/pascal/VOC/`. + If you want to augment the dataset, please run the voc_augment.py in tools. + + Args: + transforms (list): Transforms for image. + dataset_root (str): The dataset directory. Default: None + mode (str, optional): Which part of dataset to use. it is one of ('train', 'trainval', 'trainaug', 'val'). + If you want to set mode to 'trainaug', please make sure the dataset have been augmented. Default: 'train'. + edge (bool, optional): Whether to compute edge while training. Default: False + """ + NUM_CLASSES = 21 + + def __init__(self, transforms, dataset_root=None, mode='train', edge=False): + self.dataset_root = dataset_root + self.transforms = Compose(transforms) + mode = mode.lower() + self.mode = mode + self.file_list = list() + self.num_classes = self.NUM_CLASSES + self.ignore_index = 255 + self.edge = edge + + if mode not in ['train', 'trainval', 'trainaug', 'val']: + raise ValueError( + "`mode` should be one of ('train', 'trainval', 'trainaug', 'val') in PascalVOC dataset, but got {}." 
+ .format(mode)) + + if self.transforms is None: + raise ValueError("`transforms` is necessary, but it is None.") + + if self.dataset_root is None: + self.dataset_root = download_file_and_uncompress( + url=URL, + savepath=seg_env.DATA_HOME, + extrapath=seg_env.DATA_HOME, + extraname='VOCdevkit') + elif not os.path.exists(self.dataset_root): + self.dataset_root = os.path.normpath(self.dataset_root) + savepath, extraname = self.dataset_root.rsplit( + sep=os.path.sep, maxsplit=1) + self.dataset_root = download_file_and_uncompress( + url=URL, + savepath=savepath, + extrapath=savepath, + extraname=extraname) + + image_set_dir = os.path.join(self.dataset_root, 'VOC2012', 'ImageSets', + 'Segmentation') + if mode == 'train': + file_path = os.path.join(image_set_dir, 'train.txt') + elif mode == 'val': + file_path = os.path.join(image_set_dir, 'val.txt') + elif mode == 'trainval': + file_path = os.path.join(image_set_dir, 'trainval.txt') + elif mode == 'trainaug': + file_path = os.path.join(image_set_dir, 'train.txt') + file_path_aug = os.path.join(image_set_dir, 'aug.txt') + + if not os.path.exists(file_path_aug): + raise RuntimeError( + "When `mode` is 'trainaug', Pascal Voc dataset should be augmented, " + "Please make sure voc_augment.py has been properly run when using this mode." + ) + + img_dir = os.path.join(self.dataset_root, 'VOC2012', 'JPEGImages') + label_dir = os.path.join(self.dataset_root, 'VOC2012', + 'SegmentationClass') + label_dir_aug = os.path.join(self.dataset_root, 'VOC2012', + 'SegmentationClassAug') + + with open(file_path, 'r') as f: + for line in f: + line = line.strip() + image_path = os.path.join(img_dir, ''.join([line, '.jpg'])) + label_path = os.path.join(label_dir, ''.join([line, '.png'])) + self.file_list.append([image_path, label_path]) + if mode == 'trainaug': + with open(file_path_aug, 'r') as f: + for line in f: + line = line.strip() + image_path = os.path.join(img_dir, ''.join([line, '.jpg'])) + label_path = os.path.join(label_dir_aug, + ''.join([line, '.png'])) + self.file_list.append([image_path, label_path]) diff --git a/paddlers/models/ppseg/models/__init__.py b/paddlers/models/ppseg/models/__init__.py new file mode 100644 index 0000000..62b3cc6 --- /dev/null +++ b/paddlers/models/ppseg/models/__init__.py @@ -0,0 +1,57 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
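+
+# Importing the modules below registers each model with manager.MODELS via
+# the @manager.MODELS.add_component decorator, so components can be looked
+# up by name. A minimal sketch (assuming, as in PaddleSeg, that the
+# ComponentManager exposes its registry as `components_dict`):
+#
+#   from paddlers.models.ppseg.cvlibs import manager
+#   import paddlers.models.ppseg.models  # triggers registration
+#
+#   unet_cls = manager.MODELS.components_dict['UNet']
+#   model = unet_cls(num_classes=2)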
+ +from .backbones import * +from .losses import * + +from .ann import * +from .bisenet import * +from .danet import * +from .deeplab import * +from .fast_scnn import * +from .fcn import * +from .gcnet import * +from .ocrnet import * +from .pspnet import * +from .gscnn import GSCNN +from .unet import UNet +from .hardnet import HarDNet +from .u2net import U2Net, U2Netp +from .attention_unet import AttentionUNet +from .unet_plusplus import UNetPlusPlus +from .unet_3plus import UNet3Plus +from .decoupled_segnet import DecoupledSegNet +from .emanet import * +from .isanet import * +from .dnlnet import * +from .setr import * +from .sfnet import * +from .pphumanseg_lite import * +from .mla_transformer import MLATransformer +from .portraitnet import PortraitNet +from .stdcseg import STDCSeg +from .segformer import SegFormer +from .pointrend import PointRend +from .ginet import GINet +from .segmenter import * +from .segnet import SegNet +from .encnet import ENCNet +from .hrnet_contrast import HRNetW48Contrast +from .espnet import ESPNetV2 +from .dmnet import DMNet +from .espnetv1 import ESPNetV1 +from .enet import ENet +from .bisenetv1 import BiseNetV1 +from .fastfcn import FastFCN +from .pfpnnet import PFPNNet \ No newline at end of file diff --git a/paddlers/models/ppseg/models/ann.py b/paddlers/models/ppseg/models/ann.py new file mode 100644 index 0000000..ad8961c --- /dev/null +++ b/paddlers/models/ppseg/models/ann.py @@ -0,0 +1,434 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.models import layers +from paddlers.models.ppseg.utils import utils + + +@manager.MODELS.add_component +class ANN(nn.Layer): + """ + The ANN implementation based on PaddlePaddle. + + The original article refers to + Zhen, Zhu, et al. "Asymmetric Non-local Neural Networks for Semantic Segmentation" + (https://arxiv.org/pdf/1908.07678.pdf). + + Args: + num_classes (int): The unique number of target classes. + backbone (Paddle.nn.Layer): Backbone network, currently support Resnet50/101. + backbone_indices (tuple, optional): Two values in the tuple indicate the indices of output of backbone. + key_value_channels (int, optional): The key and value channels of self-attention map in both AFNB and APNB modules. + Default: 256. + inter_channels (int, optional): Both input and output channels of APNB modules. Default: 512. + psp_size (tuple, optional): The out size of pooled feature maps. Default: (1, 3, 6, 8). + enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True. + align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even, + e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. + pretrained (str, optional): The path or url of pretrained model. Default: None. 
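+
+    Example (a minimal sketch, assuming the ResNet50_vd backbone exported
+        by paddlers.models.ppseg.models.backbones):
+
+            import paddle
+            from paddlers.models.ppseg.models.backbones import ResNet50_vd
+
+            model = ANN(num_classes=19, backbone=ResNet50_vd())
+            logits = model(paddle.rand([1, 3, 512, 512]))[0]
+            # logits has shape [1, 19, 512, 512]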
+ """ + + def __init__(self, + num_classes, + backbone, + backbone_indices=(2, 3), + key_value_channels=256, + inter_channels=512, + psp_size=(1, 3, 6, 8), + enable_auxiliary_loss=True, + align_corners=False, + pretrained=None): + super().__init__() + + self.backbone = backbone + backbone_channels = [ + backbone.feat_channels[i] for i in backbone_indices + ] + + self.head = ANNHead(num_classes, backbone_indices, backbone_channels, + key_value_channels, inter_channels, psp_size, + enable_auxiliary_loss) + self.align_corners = align_corners + self.pretrained = pretrained + self.init_weight() + + def forward(self, x): + feat_list = self.backbone(x) + logit_list = self.head(feat_list) + return [ + F.interpolate( + logit, + paddle.shape(x)[2:], + mode='bilinear', + align_corners=self.align_corners) for logit in logit_list + ] + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class ANNHead(nn.Layer): + """ + The ANNHead implementation. + + It mainly consists of AFNB and APNB modules. + + Args: + num_classes (int): The unique number of target classes. + backbone_indices (tuple): Two values in the tuple indicate the indices of output of backbone. + The first index will be taken as low-level features; the second one will be + taken as high-level features in AFNB module. Usually backbone consists of four + downsampling stage, such as ResNet, and return an output of each stage. If it is (2, 3), + it means taking feature map of the third stage and the fourth stage in backbone. + backbone_channels (tuple): The same length with "backbone_indices". It indicates the channels of corresponding index. + key_value_channels (int): The key and value channels of self-attention map in both AFNB and APNB modules. + inter_channels (int): Both input and output channels of APNB modules. + psp_size (tuple): The out size of pooled feature maps. + enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True. 
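+
+        For example, with a ResNet50_vd backbone and backbone_indices=(2, 3),
+        backbone_channels is (1024, 2048). Note also that AFNB and APNB
+        attend over pyramid-pooled keys/values: with psp_size=(1, 3, 6, 8)
+        every query position attends to only 1 + 9 + 36 + 64 = 110 pooled
+        positions regardless of input resolution, which is what makes these
+        non-local blocks "asymmetric" and cheap.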
+ """ + + def __init__(self, + num_classes, + backbone_indices, + backbone_channels, + key_value_channels, + inter_channels, + psp_size, + enable_auxiliary_loss=True): + super().__init__() + + low_in_channels = backbone_channels[0] + high_in_channels = backbone_channels[1] + + self.fusion = AFNB( + low_in_channels=low_in_channels, + high_in_channels=high_in_channels, + out_channels=high_in_channels, + key_channels=key_value_channels, + value_channels=key_value_channels, + dropout_prob=0.05, + repeat_sizes=([1]), + psp_size=psp_size) + + self.context = nn.Sequential( + layers.ConvBNReLU( + in_channels=high_in_channels, + out_channels=inter_channels, + kernel_size=3, + padding=1), + APNB( + in_channels=inter_channels, + out_channels=inter_channels, + key_channels=key_value_channels, + value_channels=key_value_channels, + dropout_prob=0.05, + repeat_sizes=([1]), + psp_size=psp_size)) + + self.cls = nn.Conv2D( + in_channels=inter_channels, out_channels=num_classes, kernel_size=1) + self.auxlayer = layers.AuxLayer( + in_channels=low_in_channels, + inter_channels=low_in_channels // 2, + out_channels=num_classes, + dropout_prob=0.05) + + self.backbone_indices = backbone_indices + self.enable_auxiliary_loss = enable_auxiliary_loss + + def forward(self, feat_list): + logit_list = [] + low_level_x = feat_list[self.backbone_indices[0]] + high_level_x = feat_list[self.backbone_indices[1]] + x = self.fusion(low_level_x, high_level_x) + x = self.context(x) + logit = self.cls(x) + logit_list.append(logit) + + if self.enable_auxiliary_loss: + auxiliary_logit = self.auxlayer(low_level_x) + logit_list.append(auxiliary_logit) + + return logit_list + + +class AFNB(nn.Layer): + """ + Asymmetric Fusion Non-local Block. + + Args: + low_in_channels (int): Low-level-feature channels. + high_in_channels (int): High-level-feature channels. + out_channels (int): Out channels of AFNB module. + key_channels (int): The key channels in self-attention block. + value_channels (int): The value channels in self-attention block. + dropout_prob (float): The dropout rate of output. + repeat_sizes (tuple, optional): The number of AFNB modules. Default: ([1]). + psp_size (tuple. optional): The out size of pooled feature maps. Default: (1, 3, 6, 8). + """ + + def __init__(self, + low_in_channels, + high_in_channels, + out_channels, + key_channels, + value_channels, + dropout_prob, + repeat_sizes=([1]), + psp_size=(1, 3, 6, 8)): + super().__init__() + + self.psp_size = psp_size + self.stages = nn.LayerList([ + SelfAttentionBlock_AFNB(low_in_channels, high_in_channels, + key_channels, value_channels, out_channels, + size) for size in repeat_sizes + ]) + self.conv_bn = layers.ConvBN( + in_channels=out_channels + high_in_channels, + out_channels=out_channels, + kernel_size=1) + self.dropout = nn.Dropout(p=dropout_prob) + + def forward(self, low_feats, high_feats): + priors = [stage(low_feats, high_feats) for stage in self.stages] + context = priors[0] + for i in range(1, len(priors)): + context += priors[i] + + output = self.conv_bn(paddle.concat([context, high_feats], axis=1)) + output = self.dropout(output) + + return output + + +class APNB(nn.Layer): + """ + Asymmetric Pyramid Non-local Block. + + Args: + in_channels (int): The input channels of APNB module. + out_channels (int): Out channels of APNB module. + key_channels (int): The key channels in self-attention block. + value_channels (int): The value channels in self-attention block. + dropout_prob (float): The dropout rate of output. 
+ repeat_sizes (tuple, optional): The number of AFNB modules. Default: ([1]). + psp_size (tuple, optional): The out size of pooled feature maps. Default: (1, 3, 6, 8). + """ + + def __init__(self, + in_channels, + out_channels, + key_channels, + value_channels, + dropout_prob, + repeat_sizes=([1]), + psp_size=(1, 3, 6, 8)): + super().__init__() + + self.psp_size = psp_size + self.stages = nn.LayerList([ + SelfAttentionBlock_APNB(in_channels, out_channels, key_channels, + value_channels, size) + for size in repeat_sizes + ]) + self.conv_bn = layers.ConvBNReLU( + in_channels=in_channels * 2, + out_channels=out_channels, + kernel_size=1) + self.dropout = nn.Dropout(p=dropout_prob) + + def forward(self, x): + priors = [stage(x) for stage in self.stages] + context = priors[0] + for i in range(1, len(priors)): + context += priors[i] + + output = self.conv_bn(paddle.concat([context, x], axis=1)) + output = self.dropout(output) + + return output + + +def _pp_module(x, psp_size): + n, c, h, w = x.shape + priors = [] + for size in psp_size: + feat = F.adaptive_avg_pool2d(x, size) + feat = paddle.reshape(feat, shape=(0, c, -1)) + priors.append(feat) + center = paddle.concat(priors, axis=-1) + return center + + +class SelfAttentionBlock_AFNB(nn.Layer): + """ + Self-Attention Block for AFNB module. + + Args: + low_in_channels (int): Low-level-feature channels. + high_in_channels (int): High-level-feature channels. + key_channels (int): The key channels in self-attention block. + value_channels (int): The value channels in self-attention block. + out_channels (int, optional): Out channels of AFNB module. Default: None. + scale (int, optional): Pooling size. Default: 1. + psp_size (tuple, optional): The out size of pooled feature maps. Default: (1, 3, 6, 8). + """ + + def __init__(self, + low_in_channels, + high_in_channels, + key_channels, + value_channels, + out_channels=None, + scale=1, + psp_size=(1, 3, 6, 8)): + super().__init__() + + self.scale = scale + self.in_channels = low_in_channels + self.out_channels = out_channels + self.key_channels = key_channels + self.value_channels = value_channels + if out_channels == None: + self.out_channels = high_in_channels + self.pool = nn.MaxPool2D(scale) + self.f_key = layers.ConvBNReLU( + in_channels=low_in_channels, + out_channels=key_channels, + kernel_size=1) + self.f_query = layers.ConvBNReLU( + in_channels=high_in_channels, + out_channels=key_channels, + kernel_size=1) + self.f_value = nn.Conv2D( + in_channels=low_in_channels, + out_channels=value_channels, + kernel_size=1) + + self.W = nn.Conv2D( + in_channels=value_channels, + out_channels=out_channels, + kernel_size=1) + + self.psp_size = psp_size + + def forward(self, low_feats, high_feats): + batch_size, _, h, w = high_feats.shape + + value = self.f_value(low_feats) + value = _pp_module(value, self.psp_size) + value = paddle.transpose(value, (0, 2, 1)) + + query = self.f_query(high_feats) + query = paddle.reshape(query, shape=(0, self.key_channels, -1)) + query = paddle.transpose(query, perm=(0, 2, 1)) + + key = self.f_key(low_feats) + key = _pp_module(key, self.psp_size) + + sim_map = paddle.matmul(query, key) + sim_map = (self.key_channels**-.5) * sim_map + sim_map = F.softmax(sim_map, axis=-1) + + context = paddle.matmul(sim_map, value) + context = paddle.transpose(context, perm=(0, 2, 1)) + hf_shape = paddle.shape(high_feats) + context = paddle.reshape( + context, shape=[0, self.value_channels, hf_shape[2], hf_shape[3]]) + + context = self.W(context) + + return context + + +class 
SelfAttentionBlock_APNB(nn.Layer): + """ + Self-Attention Block for APNB module. + + Args: + in_channels (int): The input channels of APNB module. + out_channels (int): The out channels of APNB module. + key_channels (int): The key channels in self-attention block. + value_channels (int): The value channels in self-attention block. + scale (int, optional): Pooling size. Default: 1. + psp_size (tuple, optional): The out size of pooled feature maps. Default: (1, 3, 6, 8). + """ + + def __init__(self, + in_channels, + out_channels, + key_channels, + value_channels, + scale=1, + psp_size=(1, 3, 6, 8)): + super().__init__() + + self.scale = scale + self.in_channels = in_channels + self.out_channels = out_channels + self.key_channels = key_channels + self.value_channels = value_channels + self.pool = nn.MaxPool2D(scale) + self.f_key = layers.ConvBNReLU( + in_channels=self.in_channels, + out_channels=self.key_channels, + kernel_size=1) + self.f_query = self.f_key + self.f_value = nn.Conv2D( + in_channels=self.in_channels, + out_channels=self.value_channels, + kernel_size=1) + self.W = nn.Conv2D( + in_channels=self.value_channels, + out_channels=self.out_channels, + kernel_size=1) + + self.psp_size = psp_size + + def forward(self, x): + batch_size, _, h, w = x.shape + if self.scale > 1: + x = self.pool(x) + + value = self.f_value(x) + value = _pp_module(value, self.psp_size) + value = paddle.transpose(value, perm=(0, 2, 1)) + + query = self.f_query(x) + query = paddle.reshape(query, shape=(0, self.key_channels, -1)) + query = paddle.transpose(query, perm=(0, 2, 1)) + + key = self.f_key(x) + key = _pp_module(key, self.psp_size) + + sim_map = paddle.matmul(query, key) + sim_map = (self.key_channels**-.5) * sim_map + sim_map = F.softmax(sim_map, axis=-1) + + context = paddle.matmul(sim_map, value) + context = paddle.transpose(context, perm=(0, 2, 1)) + + x_shape = paddle.shape(x) + context = paddle.reshape( + context, shape=[0, self.value_channels, x_shape[2], x_shape[3]]) + context = self.W(context) + + return context diff --git a/paddlers/models/ppseg/models/attention_unet.py b/paddlers/models/ppseg/models/attention_unet.py new file mode 100644 index 0000000..73abee3 --- /dev/null +++ b/paddlers/models/ppseg/models/attention_unet.py @@ -0,0 +1,178 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.models import layers +from paddlers.models.ppseg import utils +import numpy as np + + +@manager.MODELS.add_component +class AttentionUNet(nn.Layer): + """ + The Attention-UNet implementation based on PaddlePaddle. + As mentioned in the original paper, author proposes a novel attention gate (AG) + that automatically learns to focus on target structures of varying shapes and sizes. 
+ Models trained with AGs implicitly learn to suppress irrelevant regions in an input image while + highlighting salient features useful for a specific task. + + The original article refers to + Oktay, O, et, al. "Attention u-net: Learning where to look for the pancreas." + (https://arxiv.org/pdf/1804.03999.pdf). + + Args: + num_classes (int): The unique number of target classes. + pretrained (str, optional): The path or url of pretrained model. Default: None. + """ + + def __init__(self, num_classes, pretrained=None): + super().__init__() + n_channels = 3 + self.encoder = Encoder(n_channels, [64, 128, 256, 512]) + filters = np.array([64, 128, 256, 512, 1024]) + self.up5 = UpConv(ch_in=filters[4], ch_out=filters[3]) + self.att5 = AttentionBlock( + F_g=filters[3], F_l=filters[3], F_out=filters[2]) + self.up_conv5 = ConvBlock(ch_in=filters[4], ch_out=filters[3]) + + self.up4 = UpConv(ch_in=filters[3], ch_out=filters[2]) + self.att4 = AttentionBlock( + F_g=filters[2], F_l=filters[2], F_out=filters[1]) + self.up_conv4 = ConvBlock(ch_in=filters[3], ch_out=filters[2]) + + self.up3 = UpConv(ch_in=filters[2], ch_out=filters[1]) + self.att3 = AttentionBlock( + F_g=filters[1], F_l=filters[1], F_out=filters[0]) + self.up_conv3 = ConvBlock(ch_in=filters[2], ch_out=filters[1]) + + self.up2 = UpConv(ch_in=filters[1], ch_out=filters[0]) + self.att2 = AttentionBlock( + F_g=filters[0], F_l=filters[0], F_out=filters[0] // 2) + self.up_conv2 = ConvBlock(ch_in=filters[1], ch_out=filters[0]) + + self.conv_1x1 = nn.Conv2D( + filters[0], num_classes, kernel_size=1, stride=1, padding=0) + self.pretrained = pretrained + self.init_weight() + + def forward(self, x): + x5, (x1, x2, x3, x4) = self.encoder(x) + d5 = self.up5(x5) + x4 = self.att5(g=d5, x=x4) + d5 = paddle.concat([x4, d5], axis=1) + d5 = self.up_conv5(d5) + + d4 = self.up4(d5) + x3 = self.att4(g=d4, x=x3) + d4 = paddle.concat((x3, d4), axis=1) + d4 = self.up_conv4(d4) + + d3 = self.up3(d4) + x2 = self.att3(g=d3, x=x2) + d3 = paddle.concat((x2, d3), axis=1) + d3 = self.up_conv3(d3) + + d2 = self.up2(d3) + x1 = self.att2(g=d2, x=x1) + d2 = paddle.concat((x1, d2), axis=1) + d2 = self.up_conv2(d2) + + logit = self.conv_1x1(d2) + logit_list = [logit] + return logit_list + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class AttentionBlock(nn.Layer): + def __init__(self, F_g, F_l, F_out): + super().__init__() + self.W_g = nn.Sequential( + nn.Conv2D(F_g, F_out, kernel_size=1, stride=1, padding=0), + nn.BatchNorm2D(F_out)) + + self.W_x = nn.Sequential( + nn.Conv2D(F_l, F_out, kernel_size=1, stride=1, padding=0), + nn.BatchNorm2D(F_out)) + + self.psi = nn.Sequential( + nn.Conv2D(F_out, 1, kernel_size=1, stride=1, padding=0), + nn.BatchNorm2D(1), nn.Sigmoid()) + + self.relu = nn.ReLU() + + def forward(self, g, x): + g1 = self.W_g(g) + x1 = self.W_x(x) + psi = self.relu(g1 + x1) + psi = self.psi(psi) + res = x * psi + return res + + +class UpConv(nn.Layer): + def __init__(self, ch_in, ch_out): + super().__init__() + self.up = nn.Sequential( + nn.Upsample(scale_factor=2, mode="bilinear"), + nn.Conv2D(ch_in, ch_out, kernel_size=3, stride=1, padding=1), + nn.BatchNorm2D(ch_out), nn.ReLU()) + + def forward(self, x): + return self.up(x) + + +class Encoder(nn.Layer): + def __init__(self, input_channels, filters): + super().__init__() + self.double_conv = nn.Sequential( + layers.ConvBNReLU(input_channels, 64, 3), + layers.ConvBNReLU(64, 64, 3)) + down_channels = filters + self.down_sample_list = 
nn.LayerList([ + self.down_sampling(channel, channel * 2) + for channel in down_channels + ]) + + def down_sampling(self, in_channels, out_channels): + modules = [] + modules.append(nn.MaxPool2D(kernel_size=2, stride=2)) + modules.append(layers.ConvBNReLU(in_channels, out_channels, 3)) + modules.append(layers.ConvBNReLU(out_channels, out_channels, 3)) + return nn.Sequential(*modules) + + def forward(self, x): + short_cuts = [] + x = self.double_conv(x) + for down_sample in self.down_sample_list: + short_cuts.append(x) + x = down_sample(x) + return x, short_cuts + + +class ConvBlock(nn.Layer): + def __init__(self, ch_in, ch_out): + super(ConvBlock, self).__init__() + self.conv = nn.Sequential( + nn.Conv2D(ch_in, ch_out, kernel_size=3, stride=1, padding=1), + nn.BatchNorm2D(ch_out), nn.ReLU(), + nn.Conv2D(ch_out, ch_out, kernel_size=3, stride=1, padding=1), + nn.BatchNorm2D(ch_out), nn.ReLU()) + + def forward(self, x): + return self.conv(x) diff --git a/paddlers/models/ppseg/models/backbones/__init__.py b/paddlers/models/ppseg/models/backbones/__init__.py new file mode 100644 index 0000000..108f87d --- /dev/null +++ b/paddlers/models/ppseg/models/backbones/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .hrnet import * +from .resnet_vd import * +from .xception_deeplab import * +from .mobilenetv3 import * +from .vision_transformer import * +from .swin_transformer import * +from .mobilenetv2 import * +from .mix_transformer import * +from .stdcnet import * diff --git a/paddlers/models/ppseg/models/backbones/hrnet.py b/paddlers/models/ppseg/models/backbones/hrnet.py new file mode 100644 index 0000000..5a98ea7 --- /dev/null +++ b/paddlers/models/ppseg/models/backbones/hrnet.py @@ -0,0 +1,837 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlers.models.ppseg.cvlibs import manager, param_init +from paddlers.models.ppseg.models import layers +from paddlers.models.ppseg.utils import utils + +__all__ = [ + "HRNet_W18_Small_V1", "HRNet_W18_Small_V2", "HRNet_W18", "HRNet_W30", + "HRNet_W32", "HRNet_W40", "HRNet_W44", "HRNet_W48", "HRNet_W60", "HRNet_W64" +] + + +class HRNet(nn.Layer): + """ + The HRNet implementation based on PaddlePaddle. + + The original article refers to + Jingdong Wang, et, al. 
"HRNet:Deep High-Resolution Representation Learning for Visual Recognition" + (https://arxiv.org/pdf/1908.07919.pdf). + + Args: + pretrained (str, optional): The path of pretrained model. + stage1_num_modules (int, optional): Number of modules for stage1. Default 1. + stage1_num_blocks (list, optional): Number of blocks per module for stage1. Default (4). + stage1_num_channels (list, optional): Number of channels per branch for stage1. Default (64). + stage2_num_modules (int, optional): Number of modules for stage2. Default 1. + stage2_num_blocks (list, optional): Number of blocks per module for stage2. Default (4, 4). + stage2_num_channels (list, optional): Number of channels per branch for stage2. Default (18, 36). + stage3_num_modules (int, optional): Number of modules for stage3. Default 4. + stage3_num_blocks (list, optional): Number of blocks per module for stage3. Default (4, 4, 4). + stage3_num_channels (list, optional): Number of channels per branch for stage3. Default [18, 36, 72). + stage4_num_modules (int, optional): Number of modules for stage4. Default 3. + stage4_num_blocks (list, optional): Number of blocks per module for stage4. Default (4, 4, 4, 4). + stage4_num_channels (list, optional): Number of channels per branch for stage4. Default (18, 36, 72. 144). + has_se (bool, optional): Whether to use Squeeze-and-Excitation module. Default False. + align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even, + e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. + """ + + def __init__(self, + pretrained=None, + stage1_num_modules=1, + stage1_num_blocks=(4, ), + stage1_num_channels=(64, ), + stage2_num_modules=1, + stage2_num_blocks=(4, 4), + stage2_num_channels=(18, 36), + stage3_num_modules=4, + stage3_num_blocks=(4, 4, 4), + stage3_num_channels=(18, 36, 72), + stage4_num_modules=3, + stage4_num_blocks=(4, 4, 4, 4), + stage4_num_channels=(18, 36, 72, 144), + has_se=False, + align_corners=False, + padding_same=True): + super(HRNet, self).__init__() + self.pretrained = pretrained + self.stage1_num_modules = stage1_num_modules + self.stage1_num_blocks = stage1_num_blocks + self.stage1_num_channels = stage1_num_channels + self.stage2_num_modules = stage2_num_modules + self.stage2_num_blocks = stage2_num_blocks + self.stage2_num_channels = stage2_num_channels + self.stage3_num_modules = stage3_num_modules + self.stage3_num_blocks = stage3_num_blocks + self.stage3_num_channels = stage3_num_channels + self.stage4_num_modules = stage4_num_modules + self.stage4_num_blocks = stage4_num_blocks + self.stage4_num_channels = stage4_num_channels + self.has_se = has_se + self.align_corners = align_corners + self.feat_channels = [sum(stage4_num_channels)] + + self.conv_layer1_1 = layers.ConvBNReLU( + in_channels=3, + out_channels=64, + kernel_size=3, + stride=2, + padding=1 if not padding_same else 'same', + bias_attr=False) + + self.conv_layer1_2 = layers.ConvBNReLU( + in_channels=64, + out_channels=64, + kernel_size=3, + stride=2, + padding=1 if not padding_same else 'same', + bias_attr=False) + + self.la1 = Layer1( + num_channels=64, + num_blocks=self.stage1_num_blocks[0], + num_filters=self.stage1_num_channels[0], + has_se=has_se, + name="layer2", + padding_same=padding_same) + + self.tr1 = TransitionLayer( + in_channels=[self.stage1_num_channels[0] * 4], + out_channels=self.stage2_num_channels, + name="tr1", + padding_same=padding_same) + + self.st2 = Stage( + num_channels=self.stage2_num_channels, + 
num_modules=self.stage2_num_modules, + num_blocks=self.stage2_num_blocks, + num_filters=self.stage2_num_channels, + has_se=self.has_se, + name="st2", + align_corners=align_corners, + padding_same=padding_same) + + self.tr2 = TransitionLayer( + in_channels=self.stage2_num_channels, + out_channels=self.stage3_num_channels, + name="tr2", + padding_same=padding_same) + self.st3 = Stage( + num_channels=self.stage3_num_channels, + num_modules=self.stage3_num_modules, + num_blocks=self.stage3_num_blocks, + num_filters=self.stage3_num_channels, + has_se=self.has_se, + name="st3", + align_corners=align_corners, + padding_same=padding_same) + + self.tr3 = TransitionLayer( + in_channels=self.stage3_num_channels, + out_channels=self.stage4_num_channels, + name="tr3", + padding_same=padding_same) + self.st4 = Stage( + num_channels=self.stage4_num_channels, + num_modules=self.stage4_num_modules, + num_blocks=self.stage4_num_blocks, + num_filters=self.stage4_num_channels, + has_se=self.has_se, + name="st4", + align_corners=align_corners, + padding_same=padding_same) + + self.init_weight() + + def forward(self, x): + conv1 = self.conv_layer1_1(x) + conv2 = self.conv_layer1_2(conv1) + + la1 = self.la1(conv2) + + tr1 = self.tr1([la1]) + st2 = self.st2(tr1) + + tr2 = self.tr2(st2) + st3 = self.st3(tr2) + + tr3 = self.tr3(st3) + st4 = self.st4(tr3) + + size = paddle.shape(st4[0])[2:] + x1 = F.interpolate( + st4[1], size, mode='bilinear', align_corners=self.align_corners) + x2 = F.interpolate( + st4[2], size, mode='bilinear', align_corners=self.align_corners) + x3 = F.interpolate( + st4[3], size, mode='bilinear', align_corners=self.align_corners) + x = paddle.concat([st4[0], x1, x2, x3], axis=1) + + return [x] + + def init_weight(self): + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + param_init.normal_init(layer.weight, std=0.001) + elif isinstance(layer, (nn.BatchNorm, nn.SyncBatchNorm)): + param_init.constant_init(layer.weight, value=1.0) + param_init.constant_init(layer.bias, value=0.0) + if self.pretrained is not None: + utils.load_pretrained_model(self, self.pretrained) + + +class Layer1(nn.Layer): + def __init__(self, + num_channels, + num_filters, + num_blocks, + has_se=False, + name=None, + padding_same=True): + super(Layer1, self).__init__() + + self.bottleneck_block_list = [] + + for i in range(num_blocks): + bottleneck_block = self.add_sublayer( + "bb_{}_{}".format(name, i + 1), + BottleneckBlock( + num_channels=num_channels if i == 0 else num_filters * 4, + num_filters=num_filters, + has_se=has_se, + stride=1, + downsample=True if i == 0 else False, + name=name + '_' + str(i + 1), + padding_same=padding_same)) + self.bottleneck_block_list.append(bottleneck_block) + + def forward(self, x): + conv = x + for block_func in self.bottleneck_block_list: + conv = block_func(conv) + return conv + + +class TransitionLayer(nn.Layer): + def __init__(self, in_channels, out_channels, name=None, padding_same=True): + super(TransitionLayer, self).__init__() + + num_in = len(in_channels) + num_out = len(out_channels) + self.conv_bn_func_list = [] + for i in range(num_out): + residual = None + if i < num_in: + if in_channels[i] != out_channels[i]: + residual = self.add_sublayer( + "transition_{}_layer_{}".format(name, i + 1), + layers.ConvBNReLU( + in_channels=in_channels[i], + out_channels=out_channels[i], + kernel_size=3, + padding=1 if not padding_same else 'same', + bias_attr=False)) + else: + residual = self.add_sublayer( + "transition_{}_layer_{}".format(name, i + 1), + layers.ConvBNReLU( 
+ in_channels=in_channels[-1], + out_channels=out_channels[i], + kernel_size=3, + stride=2, + padding=1 if not padding_same else 'same', + bias_attr=False)) + self.conv_bn_func_list.append(residual) + + def forward(self, x): + outs = [] + for idx, conv_bn_func in enumerate(self.conv_bn_func_list): + if conv_bn_func is None: + outs.append(x[idx]) + else: + if idx < len(x): + outs.append(conv_bn_func(x[idx])) + else: + outs.append(conv_bn_func(x[-1])) + return outs + + +class Branches(nn.Layer): + def __init__(self, + num_blocks, + in_channels, + out_channels, + has_se=False, + name=None, + padding_same=True): + super(Branches, self).__init__() + + self.basic_block_list = [] + + for i in range(len(out_channels)): + self.basic_block_list.append([]) + for j in range(num_blocks[i]): + in_ch = in_channels[i] if j == 0 else out_channels[i] + basic_block_func = self.add_sublayer( + "bb_{}_branch_layer_{}_{}".format(name, i + 1, j + 1), + BasicBlock( + num_channels=in_ch, + num_filters=out_channels[i], + has_se=has_se, + name=name + '_branch_layer_' + str(i + 1) + '_' + + str(j + 1), + padding_same=padding_same)) + self.basic_block_list[i].append(basic_block_func) + + def forward(self, x): + outs = [] + for idx, input in enumerate(x): + conv = input + for basic_block_func in self.basic_block_list[idx]: + conv = basic_block_func(conv) + outs.append(conv) + return outs + + +class BottleneckBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + has_se, + stride=1, + downsample=False, + name=None, + padding_same=True): + super(BottleneckBlock, self).__init__() + + self.has_se = has_se + self.downsample = downsample + + self.conv1 = layers.ConvBNReLU( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=1, + bias_attr=False) + + self.conv2 = layers.ConvBNReLU( + in_channels=num_filters, + out_channels=num_filters, + kernel_size=3, + stride=stride, + padding=1 if not padding_same else 'same', + bias_attr=False) + + self.conv3 = layers.ConvBN( + in_channels=num_filters, + out_channels=num_filters * 4, + kernel_size=1, + bias_attr=False) + + if self.downsample: + self.conv_down = layers.ConvBN( + in_channels=num_channels, + out_channels=num_filters * 4, + kernel_size=1, + bias_attr=False) + + if self.has_se: + self.se = SELayer( + num_channels=num_filters * 4, + num_filters=num_filters * 4, + reduction_ratio=16, + name=name + '_fc') + + self.add = layers.Add() + self.relu = layers.Activation("relu") + + def forward(self, x): + residual = x + conv1 = self.conv1(x) + conv2 = self.conv2(conv1) + conv3 = self.conv3(conv2) + + if self.downsample: + residual = self.conv_down(x) + + if self.has_se: + conv3 = self.se(conv3) + + y = self.add(conv3, residual) + y = self.relu(y) + return y + + +class BasicBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + stride=1, + has_se=False, + downsample=False, + name=None, + padding_same=True): + super(BasicBlock, self).__init__() + + self.has_se = has_se + self.downsample = downsample + + self.conv1 = layers.ConvBNReLU( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=3, + stride=stride, + padding=1 if not padding_same else 'same', + bias_attr=False) + self.conv2 = layers.ConvBN( + in_channels=num_filters, + out_channels=num_filters, + kernel_size=3, + padding=1 if not padding_same else 'same', + bias_attr=False) + + if self.downsample: + self.conv_down = layers.ConvBNReLU( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=1, + bias_attr=False) + + if self.has_se: + self.se = 
SELayer( + num_channels=num_filters, + num_filters=num_filters, + reduction_ratio=16, + name=name + '_fc') + + self.add = layers.Add() + self.relu = layers.Activation("relu") + + def forward(self, x): + residual = x + conv1 = self.conv1(x) + conv2 = self.conv2(conv1) + + if self.downsample: + residual = self.conv_down(x) + + if self.has_se: + conv2 = self.se(conv2) + + y = self.add(conv2, residual) + y = self.relu(y) + return y + + +class SELayer(nn.Layer): + def __init__(self, num_channels, num_filters, reduction_ratio, name=None): + super(SELayer, self).__init__() + + self.pool2d_gap = nn.AdaptiveAvgPool2D(1) + + self._num_channels = num_channels + + med_ch = int(num_channels / reduction_ratio) + stdv = 1.0 / math.sqrt(num_channels * 1.0) + self.squeeze = nn.Linear( + num_channels, + med_ch, + weight_attr=paddle.ParamAttr( + initializer=nn.initializer.Uniform(-stdv, stdv))) + + stdv = 1.0 / math.sqrt(med_ch * 1.0) + self.excitation = nn.Linear( + med_ch, + num_filters, + weight_attr=paddle.ParamAttr( + initializer=nn.initializer.Uniform(-stdv, stdv))) + + def forward(self, x): + pool = self.pool2d_gap(x) + pool = paddle.reshape(pool, shape=[-1, self._num_channels]) + squeeze = self.squeeze(pool) + squeeze = F.relu(squeeze) + excitation = self.excitation(squeeze) + excitation = F.sigmoid(excitation) + excitation = paddle.reshape( + excitation, shape=[-1, self._num_channels, 1, 1]) + out = x * excitation + return out + + +class Stage(nn.Layer): + def __init__(self, + num_channels, + num_modules, + num_blocks, + num_filters, + has_se=False, + multi_scale_output=True, + name=None, + align_corners=False, + padding_same=True): + super(Stage, self).__init__() + + self._num_modules = num_modules + + self.stage_func_list = [] + for i in range(num_modules): + if i == num_modules - 1 and not multi_scale_output: + stage_func = self.add_sublayer( + "stage_{}_{}".format(name, i + 1), + HighResolutionModule( + num_channels=num_channels, + num_blocks=num_blocks, + num_filters=num_filters, + has_se=has_se, + multi_scale_output=False, + name=name + '_' + str(i + 1), + align_corners=align_corners, + padding_same=padding_same)) + else: + stage_func = self.add_sublayer( + "stage_{}_{}".format(name, i + 1), + HighResolutionModule( + num_channels=num_channels, + num_blocks=num_blocks, + num_filters=num_filters, + has_se=has_se, + name=name + '_' + str(i + 1), + align_corners=align_corners, + padding_same=padding_same)) + + self.stage_func_list.append(stage_func) + + def forward(self, x): + out = x + for idx in range(self._num_modules): + out = self.stage_func_list[idx](out) + return out + + +class HighResolutionModule(nn.Layer): + def __init__(self, + num_channels, + num_blocks, + num_filters, + has_se=False, + multi_scale_output=True, + name=None, + align_corners=False, + padding_same=True): + super(HighResolutionModule, self).__init__() + + self.branches_func = Branches( + num_blocks=num_blocks, + in_channels=num_channels, + out_channels=num_filters, + has_se=has_se, + name=name, + padding_same=padding_same) + + self.fuse_func = FuseLayers( + in_channels=num_filters, + out_channels=num_filters, + multi_scale_output=multi_scale_output, + name=name, + align_corners=align_corners, + padding_same=padding_same) + + def forward(self, x): + out = self.branches_func(x) + out = self.fuse_func(out) + return out + + +class FuseLayers(nn.Layer): + def __init__(self, + in_channels, + out_channels, + multi_scale_output=True, + name=None, + align_corners=False, + padding_same=True): + super(FuseLayers, self).__init__() + + 
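+        # Wiring summary: for each output branch i, a lower-resolution input
+        # branch j > i is channel-projected with a 1x1 conv-bn here and
+        # bilinearly upsampled in forward(); a higher-resolution branch
+        # j < i is downsampled with (i - j) stride-2 3x3 conv blocks, the
+        # last of which projects to out_channels[i].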
self._actual_ch = len(in_channels) if multi_scale_output else 1 + self._in_channels = in_channels + self.align_corners = align_corners + + self.residual_func_list = [] + for i in range(self._actual_ch): + for j in range(len(in_channels)): + if j > i: + residual_func = self.add_sublayer( + "residual_{}_layer_{}_{}".format(name, i + 1, j + 1), + layers.ConvBN( + in_channels=in_channels[j], + out_channels=out_channels[i], + kernel_size=1, + bias_attr=False)) + self.residual_func_list.append(residual_func) + elif j < i: + pre_num_filters = in_channels[j] + for k in range(i - j): + if k == i - j - 1: + residual_func = self.add_sublayer( + "residual_{}_layer_{}_{}_{}".format( + name, i + 1, j + 1, k + 1), + layers.ConvBN( + in_channels=pre_num_filters, + out_channels=out_channels[i], + kernel_size=3, + stride=2, + padding=1 if not padding_same else 'same', + bias_attr=False)) + pre_num_filters = out_channels[i] + else: + residual_func = self.add_sublayer( + "residual_{}_layer_{}_{}_{}".format( + name, i + 1, j + 1, k + 1), + layers.ConvBNReLU( + in_channels=pre_num_filters, + out_channels=out_channels[j], + kernel_size=3, + stride=2, + padding=1 if not padding_same else 'same', + bias_attr=False)) + pre_num_filters = out_channels[j] + self.residual_func_list.append(residual_func) + + def forward(self, x): + outs = [] + residual_func_idx = 0 + for i in range(self._actual_ch): + residual = x[i] + residual_shape = paddle.shape(residual)[-2:] + for j in range(len(self._in_channels)): + if j > i: + y = self.residual_func_list[residual_func_idx](x[j]) + residual_func_idx += 1 + + y = F.interpolate( + y, + residual_shape, + mode='bilinear', + align_corners=self.align_corners) + residual = residual + y + elif j < i: + y = x[j] + for k in range(i - j): + y = self.residual_func_list[residual_func_idx](y) + residual_func_idx += 1 + + residual = residual + y + + residual = F.relu(residual) + outs.append(residual) + + return outs + + +@manager.BACKBONES.add_component +def HRNet_W18_Small_V1(**kwargs): + model = HRNet( + stage1_num_modules=1, + stage1_num_blocks=[1], + stage1_num_channels=[32], + stage2_num_modules=1, + stage2_num_blocks=[2, 2], + stage2_num_channels=[16, 32], + stage3_num_modules=1, + stage3_num_blocks=[2, 2, 2], + stage3_num_channels=[16, 32, 64], + stage4_num_modules=1, + stage4_num_blocks=[2, 2, 2, 2], + stage4_num_channels=[16, 32, 64, 128], + **kwargs) + return model + + +@manager.BACKBONES.add_component +def HRNet_W18_Small_V2(**kwargs): + model = HRNet( + stage1_num_modules=1, + stage1_num_blocks=[2], + stage1_num_channels=[64], + stage2_num_modules=1, + stage2_num_blocks=[2, 2], + stage2_num_channels=[18, 36], + stage3_num_modules=3, + stage3_num_blocks=[2, 2, 2], + stage3_num_channels=[18, 36, 72], + stage4_num_modules=2, + stage4_num_blocks=[2, 2, 2, 2], + stage4_num_channels=[18, 36, 72, 144], + **kwargs) + return model + + +@manager.BACKBONES.add_component +def HRNet_W18(**kwargs): + model = HRNet( + stage1_num_modules=1, + stage1_num_blocks=[4], + stage1_num_channels=[64], + stage2_num_modules=1, + stage2_num_blocks=[4, 4], + stage2_num_channels=[18, 36], + stage3_num_modules=4, + stage3_num_blocks=[4, 4, 4], + stage3_num_channels=[18, 36, 72], + stage4_num_modules=3, + stage4_num_blocks=[4, 4, 4, 4], + stage4_num_channels=[18, 36, 72, 144], + **kwargs) + return model + + +@manager.BACKBONES.add_component +def HRNet_W30(**kwargs): + model = HRNet( + stage1_num_modules=1, + stage1_num_blocks=[4], + stage1_num_channels=[64], + stage2_num_modules=1, + stage2_num_blocks=[4, 4], + 
stage2_num_channels=[30, 60], + stage3_num_modules=4, + stage3_num_blocks=[4, 4, 4], + stage3_num_channels=[30, 60, 120], + stage4_num_modules=3, + stage4_num_blocks=[4, 4, 4, 4], + stage4_num_channels=[30, 60, 120, 240], + **kwargs) + return model + + +@manager.BACKBONES.add_component +def HRNet_W32(**kwargs): + model = HRNet( + stage1_num_modules=1, + stage1_num_blocks=[4], + stage1_num_channels=[64], + stage2_num_modules=1, + stage2_num_blocks=[4, 4], + stage2_num_channels=[32, 64], + stage3_num_modules=4, + stage3_num_blocks=[4, 4, 4], + stage3_num_channels=[32, 64, 128], + stage4_num_modules=3, + stage4_num_blocks=[4, 4, 4, 4], + stage4_num_channels=[32, 64, 128, 256], + **kwargs) + return model + + +@manager.BACKBONES.add_component +def HRNet_W40(**kwargs): + model = HRNet( + stage1_num_modules=1, + stage1_num_blocks=[4], + stage1_num_channels=[64], + stage2_num_modules=1, + stage2_num_blocks=[4, 4], + stage2_num_channels=[40, 80], + stage3_num_modules=4, + stage3_num_blocks=[4, 4, 4], + stage3_num_channels=[40, 80, 160], + stage4_num_modules=3, + stage4_num_blocks=[4, 4, 4, 4], + stage4_num_channels=[40, 80, 160, 320], + **kwargs) + return model + + +@manager.BACKBONES.add_component +def HRNet_W44(**kwargs): + model = HRNet( + stage1_num_modules=1, + stage1_num_blocks=[4], + stage1_num_channels=[64], + stage2_num_modules=1, + stage2_num_blocks=[4, 4], + stage2_num_channels=[44, 88], + stage3_num_modules=4, + stage3_num_blocks=[4, 4, 4], + stage3_num_channels=[44, 88, 176], + stage4_num_modules=3, + stage4_num_blocks=[4, 4, 4, 4], + stage4_num_channels=[44, 88, 176, 352], + **kwargs) + return model + + +@manager.BACKBONES.add_component +def HRNet_W48(**kwargs): + model = HRNet( + stage1_num_modules=1, + stage1_num_blocks=[4], + stage1_num_channels=[64], + stage2_num_modules=1, + stage2_num_blocks=[4, 4], + stage2_num_channels=[48, 96], + stage3_num_modules=4, + stage3_num_blocks=[4, 4, 4], + stage3_num_channels=[48, 96, 192], + stage4_num_modules=3, + stage4_num_blocks=[4, 4, 4, 4], + stage4_num_channels=[48, 96, 192, 384], + **kwargs) + return model + + +@manager.BACKBONES.add_component +def HRNet_W60(**kwargs): + model = HRNet( + stage1_num_modules=1, + stage1_num_blocks=[4], + stage1_num_channels=[64], + stage2_num_modules=1, + stage2_num_blocks=[4, 4], + stage2_num_channels=[60, 120], + stage3_num_modules=4, + stage3_num_blocks=[4, 4, 4], + stage3_num_channels=[60, 120, 240], + stage4_num_modules=3, + stage4_num_blocks=[4, 4, 4, 4], + stage4_num_channels=[60, 120, 240, 480], + **kwargs) + return model + + +@manager.BACKBONES.add_component +def HRNet_W64(**kwargs): + model = HRNet( + stage1_num_modules=1, + stage1_num_blocks=[4], + stage1_num_channels=[64], + stage2_num_modules=1, + stage2_num_blocks=[4, 4], + stage2_num_channels=[64, 128], + stage3_num_modules=4, + stage3_num_blocks=[4, 4, 4], + stage3_num_channels=[64, 128, 256], + stage4_num_modules=3, + stage4_num_blocks=[4, 4, 4, 4], + stage4_num_channels=[64, 128, 256, 512], + **kwargs) + return model diff --git a/paddlers/models/ppseg/models/backbones/mix_transformer.py b/paddlers/models/ppseg/models/backbones/mix_transformer.py new file mode 100644 index 0000000..4b2edd0 --- /dev/null +++ b/paddlers/models/ppseg/models/backbones/mix_transformer.py @@ -0,0 +1,588 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from functools import partial + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import paddle.nn.initializer as paddle_init + +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.utils import utils +from paddlers.models.ppseg.models.backbones.transformer_utils import * + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.dwconv = DWConv(hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + elif isinstance(m, nn.Conv2D): + fan_out = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + fan_out //= m._groups + paddle_init.Normal(0, math.sqrt(2.0 / fan_out))(m.weight) + if m.bias is not None: + zeros_(m.bias) + + def forward(self, x, H, W): + x = self.fc1(x) + x = self.dwconv(x, H, W) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Layer): + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0., + sr_ratio=1): + super().__init__() + assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}." 
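+ # Spatial-reduction attention (as in PVT/SegFormer): for sr_ratio > 1, + # keys and values are computed from a feature map downsampled by a + # strided sr_ratio x sr_ratio convolution, which cuts the attention + # cost by a factor of sr_ratio**2 for the N = H * W query tokens.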
+ + self.dim = dim + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.q = nn.Linear(dim, dim, bias_attr=qkv_bias) + self.kv = nn.Linear(dim, dim * 2, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + self.sr_ratio = sr_ratio + if sr_ratio > 1: + self.sr = nn.Conv2D(dim, dim, kernel_size=sr_ratio, stride=sr_ratio) + self.norm = nn.LayerNorm(dim) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + elif isinstance(m, nn.Conv2D): + fan_out = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + fan_out //= m._groups + paddle_init.Normal(0, math.sqrt(2.0 / fan_out))(m.weight) + if m.bias is not None: + zeros_(m.bias) + + def forward(self, x, H, W): + x_shape = paddle.shape(x) + B, N = x_shape[0], x_shape[1] + C = self.dim + + q = self.q(x).reshape([B, N, self.num_heads, + C // self.num_heads]).transpose([0, 2, 1, 3]) + + if self.sr_ratio > 1: + x_ = x.transpose([0, 2, 1]).reshape([B, C, H, W]) + x_ = self.sr(x_).reshape([B, C, -1]).transpose([0, 2, 1]) + x_ = self.norm(x_) + kv = self.kv(x_).reshape( + [B, -1, 2, self.num_heads, + C // self.num_heads]).transpose([2, 0, 3, 1, 4]) + else: + kv = self.kv(x).reshape( + [B, -1, 2, self.num_heads, + C // self.num_heads]).transpose([2, 0, 3, 1, 4]) + k, v = kv[0], kv[1] + + attn = (q @ k.transpose([0, 1, 3, 2])) * self.scale + attn = F.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose([0, 2, 1, 3]).reshape([B, N, C]) + x = self.proj(x) + x = self.proj_drop(x) + + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + sr_ratio=1): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + sr_ratio=sr_ratio) + # NOTE: drop_path implements stochastic depth, randomly skipping the residual branches per sample during training + self.drop_path = DropPath(drop_path) if drop_path > 0.
else Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + elif isinstance(m, nn.Conv2D): + fan_out = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + fan_out //= m._groups + paddle_init.Normal(0, math.sqrt(2.0 / fan_out))(m.weight) + if m.bias is not None: + zeros_(m.bias) + + def forward(self, x, H, W): + x = x + self.drop_path(self.attn(self.norm1(x), H, W)) + x = x + self.drop_path(self.mlp(self.norm2(x), H, W)) + + return x + + +class OverlapPatchEmbed(nn.Layer): + """ Image to Patch Embedding + """ + + def __init__(self, + img_size=224, + patch_size=7, + stride=4, + in_chans=3, + embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + + self.img_size = img_size + self.patch_size = patch_size + self.H, self.W = img_size[0] // patch_size[0], img_size[ + 1] // patch_size[1] + self.num_patches = self.H * self.W + self.proj = nn.Conv2D( + in_chans, + embed_dim, + kernel_size=patch_size, + stride=stride, + padding=(patch_size[0] // 2, patch_size[1] // 2)) + self.norm = nn.LayerNorm(embed_dim) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + elif isinstance(m, nn.Conv2D): + fan_out = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + fan_out //= m._groups + paddle_init.Normal(0, math.sqrt(2.0 / fan_out))(m.weight) + if m.bias is not None: + zeros_(m.bias) + + def forward(self, x): + x = self.proj(x) + x_shape = paddle.shape(x) + H, W = x_shape[2], x_shape[3] + x = x.flatten(2).transpose([0, 2, 1]) + x = self.norm(x) + + return x, H, W + + +class MixVisionTransformer(nn.Layer): + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + num_classes=1000, + embed_dims=[64, 128, 256, 512], + num_heads=[1, 2, 4, 8], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer=nn.LayerNorm, + depths=[3, 4, 6, 3], + sr_ratios=[8, 4, 2, 1], + pretrained=None): + super().__init__() + self.num_classes = num_classes + self.depths = depths + self.feat_channels = embed_dims[:] + + # patch_embed + self.patch_embed1 = OverlapPatchEmbed( + img_size=img_size, + patch_size=7, + stride=4, + in_chans=in_chans, + embed_dim=embed_dims[0]) + self.patch_embed2 = OverlapPatchEmbed( + img_size=img_size // 4, + patch_size=3, + stride=2, + in_chans=embed_dims[0], + embed_dim=embed_dims[1]) + self.patch_embed3 = OverlapPatchEmbed( + img_size=img_size // 8, + patch_size=3, + stride=2, + in_chans=embed_dims[1], + embed_dim=embed_dims[2]) + self.patch_embed4 = OverlapPatchEmbed( + img_size=img_size // 16, + patch_size=3, + stride=2, + in_chans=embed_dims[2], + embed_dim=embed_dims[3]) + + # transformer encoder + dpr = [ + x.numpy() for x in paddle.linspace(0, drop_path_rate, sum(depths)) + ] # stochastic depth decay rule + cur = 0 + self.block1 = nn.LayerList([ + Block( + dim=embed_dims[0], + num_heads=num_heads[0], + mlp_ratio=mlp_ratios[0], + qkv_bias=qkv_bias, 
+ qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[cur + i], + norm_layer=norm_layer, + sr_ratio=sr_ratios[0]) for i in range(depths[0]) + ]) + self.norm1 = norm_layer(embed_dims[0]) + + cur += depths[0] + self.block2 = nn.LayerList([ + Block( + dim=embed_dims[1], + num_heads=num_heads[1], + mlp_ratio=mlp_ratios[1], + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[cur + i], + norm_layer=norm_layer, + sr_ratio=sr_ratios[1]) for i in range(depths[1]) + ]) + self.norm2 = norm_layer(embed_dims[1]) + + cur += depths[1] + self.block3 = nn.LayerList([ + Block( + dim=embed_dims[2], + num_heads=num_heads[2], + mlp_ratio=mlp_ratios[2], + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[cur + i], + norm_layer=norm_layer, + sr_ratio=sr_ratios[2]) for i in range(depths[2]) + ]) + self.norm3 = norm_layer(embed_dims[2]) + + cur += depths[2] + self.block4 = nn.LayerList([ + Block( + dim=embed_dims[3], + num_heads=num_heads[3], + mlp_ratio=mlp_ratios[3], + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[cur + i], + norm_layer=norm_layer, + sr_ratio=sr_ratios[3]) for i in range(depths[3]) + ]) + self.norm4 = norm_layer(embed_dims[3]) + + self.pretrained = pretrained + self.init_weight() + + def init_weight(self): + if self.pretrained is not None: + utils.load_pretrained_model(self, self.pretrained) + else: + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + elif isinstance(m, nn.Conv2D): + fan_out = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + fan_out //= m._groups + paddle_init.Normal(0, math.sqrt(2.0 / fan_out))(m.weight) + if m.bias is not None: + zeros_(m.bias) + + def reset_drop_path(self, drop_path_rate): + dpr = [ + x.item() + for x in paddle.linspace(0, drop_path_rate, sum(self.depths)) + ] + cur = 0 + for i in range(self.depths[0]): + self.block1[i].drop_path.drop_prob = dpr[cur + i] + + cur += self.depths[0] + for i in range(self.depths[1]): + self.block2[i].drop_path.drop_prob = dpr[cur + i] + + cur += self.depths[1] + for i in range(self.depths[2]): + self.block3[i].drop_path.drop_prob = dpr[cur + i] + + cur += self.depths[2] + for i in range(self.depths[3]): + self.block4[i].drop_path.drop_prob = dpr[cur + i] + + def freeze_patch_emb(self): + self.patch_embed1.requires_grad = False + + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool=''): + self.num_classes = num_classes + self.head = nn.Linear( + self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x): + B = paddle.shape(x)[0] + outs = [] + + # stage 1 + x, H, W = self.patch_embed1(x) + for i, blk in enumerate(self.block1): + x = blk(x, H, W) + + x = self.norm1(x) + x = x.reshape([B, H, W, self.feat_channels[0]]).transpose([0, 3, 1, 2]) + outs.append(x) + + # stage 2 + x, H, W = self.patch_embed2(x) + for i, blk in enumerate(self.block2): + x = blk(x, H, W) + x = self.norm2(x) + x = x.reshape([B, H, W, self.feat_channels[1]]).transpose([0, 3, 1, 2]) + outs.append(x) + + # stage 3 + x, H, W = self.patch_embed3(x) + for i, blk in enumerate(self.block3): + x = blk(x, H, W) + x = self.norm3(x) + x = x.reshape([B, H, W, 
self.feat_channels[2]]).transpose([0, 3, 1, 2]) + outs.append(x) + + # stage 4 + x, H, W = self.patch_embed4(x) + for i, blk in enumerate(self.block4): + x = blk(x, H, W) + x = self.norm4(x) + x = x.reshape([B, H, W, self.feat_channels[3]]).transpose([0, 3, 1, 2]) + outs.append(x) + + return outs + + def forward(self, x): + x = self.forward_features(x) + # x = self.head(x) + + return x + + +class DWConv(nn.Layer): + def __init__(self, dim=768): + super(DWConv, self).__init__() + self.dim = dim + self.dwconv = nn.Conv2D(dim, dim, 3, 1, 1, bias_attr=True, groups=dim) + + def forward(self, x, H, W): + x_shape = paddle.shape(x) + B, N = x_shape[0], x_shape[1] + x = x.transpose([0, 2, 1]).reshape([B, self.dim, H, W]) + x = self.dwconv(x) + x = x.flatten(2).transpose([0, 2, 1]) + + return x + + +@manager.BACKBONES.add_component +def MixVisionTransformer_B0(**kwargs): + return MixVisionTransformer( + patch_size=4, + embed_dims=[32, 64, 160, 256], + num_heads=[1, 2, 5, 8], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, + norm_layer=partial(nn.LayerNorm, epsilon=1e-6), + depths=[2, 2, 2, 2], + sr_ratios=[8, 4, 2, 1], + drop_rate=0.0, + drop_path_rate=0.1, + **kwargs) + + +@manager.BACKBONES.add_component +def MixVisionTransformer_B1(**kwargs): + return MixVisionTransformer( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, + norm_layer=partial(nn.LayerNorm, epsilon=1e-6), + depths=[2, 2, 2, 2], + sr_ratios=[8, 4, 2, 1], + drop_rate=0.0, + drop_path_rate=0.1, + **kwargs) + + +@manager.BACKBONES.add_component +def MixVisionTransformer_B2(**kwargs): + return MixVisionTransformer( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, + norm_layer=partial(nn.LayerNorm, epsilon=1e-6), + depths=[3, 4, 6, 3], + sr_ratios=[8, 4, 2, 1], + drop_rate=0.0, + drop_path_rate=0.1, + **kwargs) + + +@manager.BACKBONES.add_component +def MixVisionTransformer_B3(**kwargs): + return MixVisionTransformer( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, + norm_layer=partial(nn.LayerNorm, epsilon=1e-6), + depths=[3, 4, 18, 3], + sr_ratios=[8, 4, 2, 1], + drop_rate=0.0, + drop_path_rate=0.1, + **kwargs) + + +@manager.BACKBONES.add_component +def MixVisionTransformer_B4(**kwargs): + return MixVisionTransformer( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, + norm_layer=partial(nn.LayerNorm, epsilon=1e-6), + depths=[3, 8, 27, 3], + sr_ratios=[8, 4, 2, 1], + drop_rate=0.0, + drop_path_rate=0.1, + **kwargs) + + +@manager.BACKBONES.add_component +def MixVisionTransformer_B5(**kwargs): + return MixVisionTransformer( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, + norm_layer=partial(nn.LayerNorm, epsilon=1e-6), + depths=[3, 6, 40, 3], + sr_ratios=[8, 4, 2, 1], + drop_rate=0.0, + drop_path_rate=0.1, + **kwargs) diff --git a/paddlers/models/ppseg/models/backbones/mobilenetv2.py b/paddlers/models/ppseg/models/backbones/mobilenetv2.py new file mode 100644 index 0000000..f5c3816 --- /dev/null +++ b/paddlers/models/ppseg/models/backbones/mobilenetv2.py @@ -0,0 +1,168 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.nn as nn + +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg import utils + + +@manager.BACKBONES.add_component +class MobileNetV2(nn.Layer): + """ + The MobileNetV2 implementation based on PaddlePaddle. + + The original article refers to + Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen + "MobileNetV2: Inverted Residuals and Linear Bottlenecks" + (https://arxiv.org/abs/1801.04381). + + Args: + channel_ratio (float, optional): The ratio of channel. Default: 1.0 + min_channel (int, optional): The minimum of channel. Default: 16 + pretrained (str, optional): The path or url of pretrained model. Default: None + """ + + def __init__(self, channel_ratio=1.0, min_channel=16, pretrained=None): + super(MobileNetV2, self).__init__() + self.channel_ratio = channel_ratio + self.min_channel = min_channel + self.pretrained = pretrained + + self.stage0 = conv_bn(3, self.depth(32), 3, 2) + + self.stage1 = InvertedResidual(self.depth(32), self.depth(16), 1, 1) + + self.stage2 = nn.Sequential( + InvertedResidual(self.depth(16), self.depth(24), 2, 6), + InvertedResidual(self.depth(24), self.depth(24), 1, 6), + ) + + self.stage3 = nn.Sequential( + InvertedResidual(self.depth(24), self.depth(32), 2, 6), + InvertedResidual(self.depth(32), self.depth(32), 1, 6), + InvertedResidual(self.depth(32), self.depth(32), 1, 6), + ) + + self.stage4 = nn.Sequential( + InvertedResidual(self.depth(32), self.depth(64), 2, 6), + InvertedResidual(self.depth(64), self.depth(64), 1, 6), + InvertedResidual(self.depth(64), self.depth(64), 1, 6), + InvertedResidual(self.depth(64), self.depth(64), 1, 6), + ) + + self.stage5 = nn.Sequential( + InvertedResidual(self.depth(64), self.depth(96), 1, 6), + InvertedResidual(self.depth(96), self.depth(96), 1, 6), + InvertedResidual(self.depth(96), self.depth(96), 1, 6), + ) + + self.stage6 = nn.Sequential( + InvertedResidual(self.depth(96), self.depth(160), 2, 6), + InvertedResidual(self.depth(160), self.depth(160), 1, 6), + InvertedResidual(self.depth(160), self.depth(160), 1, 6), + ) + + self.stage7 = InvertedResidual(self.depth(160), self.depth(320), 1, 6) + + self.init_weight() + + def depth(self, channels): + min_channel = min(channels, self.min_channel) + return max(min_channel, int(channels * self.channel_ratio)) + + def forward(self, x): + feat_list = [] + + feature_1_2 = self.stage0(x) + feature_1_2 = self.stage1(feature_1_2) + feature_1_4 = self.stage2(feature_1_2) + feature_1_8 = self.stage3(feature_1_4) + feature_1_16 = self.stage4(feature_1_8) + feature_1_16 = self.stage5(feature_1_16) + feature_1_32 = self.stage6(feature_1_16) + feature_1_32 = self.stage7(feature_1_32) + feat_list.append(feature_1_4) + feat_list.append(feature_1_8) + feat_list.append(feature_1_16) + feat_list.append(feature_1_32) + return feat_list + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +def conv_bn(inp, oup, kernel, stride): + return nn.Sequential( + nn.Conv2D( + in_channels=inp, + out_channels=oup, + kernel_size=kernel, + stride=stride, + padding=(kernel 
- 1) // 2, + bias_attr=False), + nn.BatchNorm2D(num_features=oup, epsilon=1e-05, momentum=0.1), + nn.ReLU()) + + +class InvertedResidual(nn.Layer): + def __init__(self, inp, oup, stride, expand_ratio, dilation=1): + super(InvertedResidual, self).__init__() + self.stride = stride + assert stride in [1, 2] + self.use_res_connect = self.stride == 1 and inp == oup + + self.conv = nn.Sequential( + nn.Conv2D( + inp, + inp * expand_ratio, + kernel_size=1, + stride=1, + padding=0, + dilation=1, + groups=1, + bias_attr=False), + nn.BatchNorm2D( + num_features=inp * expand_ratio, epsilon=1e-05, momentum=0.1), + nn.ReLU(), + nn.Conv2D( + inp * expand_ratio, + inp * expand_ratio, + kernel_size=3, + stride=stride, + padding=dilation, + dilation=dilation, + groups=inp * expand_ratio, + bias_attr=False), + nn.BatchNorm2D( + num_features=inp * expand_ratio, epsilon=1e-05, momentum=0.1), + nn.ReLU(), + nn.Conv2D( + inp * expand_ratio, + oup, + kernel_size=1, + stride=1, + padding=0, + dilation=1, + groups=1, + bias_attr=False), + nn.BatchNorm2D(num_features=oup, epsilon=1e-05, momentum=0.1), + ) + + def forward(self, x): + if self.use_res_connect: + return x + self.conv(x) + else: + return self.conv(x) diff --git a/paddlers/models/ppseg/models/backbones/mobilenetv3.py b/paddlers/models/ppseg/models/backbones/mobilenetv3.py new file mode 100644 index 0000000..e2c15ed --- /dev/null +++ b/paddlers/models/ppseg/models/backbones/mobilenetv3.py @@ -0,0 +1,364 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.utils import utils +from paddlers.models.ppseg.models import layers + +__all__ = [ + "MobileNetV3_small_x0_35", "MobileNetV3_small_x0_5", + "MobileNetV3_small_x0_75", "MobileNetV3_small_x1_0", + "MobileNetV3_small_x1_25", "MobileNetV3_large_x0_35", + "MobileNetV3_large_x0_5", "MobileNetV3_large_x0_75", + "MobileNetV3_large_x1_0", "MobileNetV3_large_x1_25" +] + + +def make_divisible(v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +class MobileNetV3(nn.Layer): + """ + The MobileNetV3 implementation based on PaddlePaddle. + + The original article refers to + Andrew Howard, et al. "Searching for MobileNetV3" + (https://arxiv.org/pdf/1905.02244.pdf). + + Args: + pretrained (str, optional): The path of pretrained model. + scale (float, optional): The scale of channels. Default: 1.0. + model_name (str, optional): Model name. It determines the type of MobileNetV3. The value is 'small' or 'large'. Default: 'small'. + output_stride (int, optional): The stride of output features compared to input images. The value should be one of (2, 4, 8, 16, 32). Default: None.
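+ + Note: when output_stride is set, strides that would reduce the + feature map below it are converted to dilation rates (see + modify_bottle_params), so deeper stages keep their spatial resolution.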
+ + """ + + def __init__(self, + pretrained=None, + scale=1.0, + model_name="small", + output_stride=None): + super(MobileNetV3, self).__init__() + + inplanes = 16 + if model_name == "large": + self.cfg = [ + # k, exp, c, se, nl, s, + [3, 16, 16, False, "relu", 1], + [3, 64, 24, False, "relu", 2], + [3, 72, 24, False, "relu", 1], # output 1 -> out_index=2 + [5, 72, 40, True, "relu", 2], + [5, 120, 40, True, "relu", 1], + [5, 120, 40, True, "relu", 1], # output 2 -> out_index=5 + [3, 240, 80, False, "hard_swish", 2], + [3, 200, 80, False, "hard_swish", 1], + [3, 184, 80, False, "hard_swish", 1], + [3, 184, 80, False, "hard_swish", 1], + [3, 480, 112, True, "hard_swish", 1], + [3, 672, 112, True, "hard_swish", + 1], # output 3 -> out_index=11 + [5, 672, 160, True, "hard_swish", 2], + [5, 960, 160, True, "hard_swish", 1], + [5, 960, 160, True, "hard_swish", + 1], # output 3 -> out_index=14 + ] + self.out_indices = [2, 5, 11, 14] + self.feat_channels = [ + make_divisible(i * scale) for i in [24, 40, 112, 160] + ] + + self.cls_ch_squeeze = 960 + self.cls_ch_expand = 1280 + elif model_name == "small": + self.cfg = [ + # k, exp, c, se, nl, s, + [3, 16, 16, True, "relu", 2], # output 1 -> out_index=0 + [3, 72, 24, False, "relu", 2], + [3, 88, 24, False, "relu", 1], # output 2 -> out_index=3 + [5, 96, 40, True, "hard_swish", 2], + [5, 240, 40, True, "hard_swish", 1], + [5, 240, 40, True, "hard_swish", 1], + [5, 120, 48, True, "hard_swish", 1], + [5, 144, 48, True, "hard_swish", 1], # output 3 -> out_index=7 + [5, 288, 96, True, "hard_swish", 2], + [5, 576, 96, True, "hard_swish", 1], + [5, 576, 96, True, "hard_swish", 1], # output 4 -> out_index=10 + ] + self.out_indices = [0, 3, 7, 10] + self.feat_channels = [ + make_divisible(i * scale) for i in [16, 24, 48, 96] + ] + + self.cls_ch_squeeze = 576 + self.cls_ch_expand = 1280 + else: + raise NotImplementedError( + "mode[{}_model] is not implemented!".format(model_name)) + + ################################################### + # modify stride and dilation based on output_stride + self.dilation_cfg = [1] * len(self.cfg) + self.modify_bottle_params(output_stride=output_stride) + ################################################### + + self.conv1 = ConvBNLayer( + in_c=3, + out_c=make_divisible(inplanes * scale), + filter_size=3, + stride=2, + padding=1, + num_groups=1, + if_act=True, + act="hard_swish") + + self.block_list = [] + + inplanes = make_divisible(inplanes * scale) + for i, (k, exp, c, se, nl, s) in enumerate(self.cfg): + ###################################### + # add dilation rate + dilation_rate = self.dilation_cfg[i] + ###################################### + self.block_list.append( + ResidualUnit( + in_c=inplanes, + mid_c=make_divisible(scale * exp), + out_c=make_divisible(scale * c), + filter_size=k, + stride=s, + dilation=dilation_rate, + use_se=se, + act=nl, + name="conv" + str(i + 2))) + self.add_sublayer( + sublayer=self.block_list[-1], name="conv" + str(i + 2)) + inplanes = make_divisible(scale * c) + + self.pretrained = pretrained + self.init_weight() + + def modify_bottle_params(self, output_stride=None): + + if output_stride is not None and output_stride % 2 != 0: + raise ValueError("output stride must to be even number") + if output_stride is not None: + stride = 2 + rate = 1 + for i, _cfg in enumerate(self.cfg): + stride = stride * _cfg[-1] + if stride > output_stride: + rate = rate * _cfg[-1] + self.cfg[i][-1] = 1 + + self.dilation_cfg[i] = rate + + def forward(self, inputs, label=None): + x = self.conv1(inputs) + # A feature 
list saves each downsampling feature. + feat_list = [] + for i, block in enumerate(self.block_list): + x = block(x) + if i in self.out_indices: + feat_list.append(x) + + return feat_list + + def init_weight(self): + if self.pretrained is not None: + utils.load_pretrained_model(self, self.pretrained) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_c, + out_c, + filter_size, + stride, + padding, + dilation=1, + num_groups=1, + if_act=True, + act=None): + super(ConvBNLayer, self).__init__() + self.if_act = if_act + self.act = act + + self.conv = nn.Conv2D( + in_channels=in_c, + out_channels=out_c, + kernel_size=filter_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=num_groups, + bias_attr=False) + self.bn = layers.SyncBatchNorm( + num_features=out_c, + weight_attr=paddle.ParamAttr( + regularizer=paddle.regularizer.L2Decay(0.0)), + bias_attr=paddle.ParamAttr( + regularizer=paddle.regularizer.L2Decay(0.0))) + self._act_op = layers.Activation(act='hardswish') + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.if_act: + x = self._act_op(x) + return x + + +class ResidualUnit(nn.Layer): + def __init__(self, + in_c, + mid_c, + out_c, + filter_size, + stride, + use_se, + dilation=1, + act=None, + name=''): + super(ResidualUnit, self).__init__() + self.if_shortcut = stride == 1 and in_c == out_c + self.if_se = use_se + + self.expand_conv = ConvBNLayer( + in_c=in_c, + out_c=mid_c, + filter_size=1, + stride=1, + padding=0, + if_act=True, + act=act) + + self.bottleneck_conv = ConvBNLayer( + in_c=mid_c, + out_c=mid_c, + filter_size=filter_size, + stride=stride, + padding='same', + dilation=dilation, + num_groups=mid_c, + if_act=True, + act=act) + if self.if_se: + self.mid_se = SEModule(mid_c, name=name + "_se") + self.linear_conv = ConvBNLayer( + in_c=mid_c, + out_c=out_c, + filter_size=1, + stride=1, + padding=0, + if_act=False, + act=None) + self.dilation = dilation + + def forward(self, inputs): + x = self.expand_conv(inputs) + x = self.bottleneck_conv(x) + if self.if_se: + x = self.mid_se(x) + x = self.linear_conv(x) + if self.if_shortcut: + x = inputs + x + return x + + +class SEModule(nn.Layer): + def __init__(self, channel, reduction=4, name=""): + super(SEModule, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2D(1) + self.conv1 = nn.Conv2D( + in_channels=channel, + out_channels=channel // reduction, + kernel_size=1, + stride=1, + padding=0) + self.conv2 = nn.Conv2D( + in_channels=channel // reduction, + out_channels=channel, + kernel_size=1, + stride=1, + padding=0) + + def forward(self, inputs): + outputs = self.avg_pool(inputs) + outputs = self.conv1(outputs) + outputs = F.relu(outputs) + outputs = self.conv2(outputs) + outputs = F.hardsigmoid(outputs) + return paddle.multiply(x=inputs, y=outputs) + + +def MobileNetV3_small_x0_35(**kwargs): + model = MobileNetV3(model_name="small", scale=0.35, **kwargs) + return model + + +def MobileNetV3_small_x0_5(**kwargs): + model = MobileNetV3(model_name="small", scale=0.5, **kwargs) + return model + + +def MobileNetV3_small_x0_75(**kwargs): + model = MobileNetV3(model_name="small", scale=0.75, **kwargs) + return model + + +@manager.BACKBONES.add_component +def MobileNetV3_small_x1_0(**kwargs): + model = MobileNetV3(model_name="small", scale=1.0, **kwargs) + return model + + +def MobileNetV3_small_x1_25(**kwargs): + model = MobileNetV3(model_name="small", scale=1.25, **kwargs) + return model + + +def MobileNetV3_large_x0_35(**kwargs): + model = MobileNetV3(model_name="large", scale=0.35, 
**kwargs) + return model + + +def MobileNetV3_large_x0_5(**kwargs): + model = MobileNetV3(model_name="large", scale=0.5, **kwargs) + return model + + +def MobileNetV3_large_x0_75(**kwargs): + model = MobileNetV3(model_name="large", scale=0.75, **kwargs) + return model + + +@manager.BACKBONES.add_component +def MobileNetV3_large_x1_0(**kwargs): + model = MobileNetV3(model_name="large", scale=1.0, **kwargs) + return model + + +def MobileNetV3_large_x1_25(**kwargs): + model = MobileNetV3(model_name="large", scale=1.25, **kwargs) + return model diff --git a/paddlers/models/ppseg/models/backbones/resnet_vd.py b/paddlers/models/ppseg/models/backbones/resnet_vd.py new file mode 100644 index 0000000..90c92f3 --- /dev/null +++ b/paddlers/models/ppseg/models/backbones/resnet_vd.py @@ -0,0 +1,398 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.models import layers +from paddlers.models.ppseg.utils import utils + +__all__ = [ + "ResNet18_vd", "ResNet34_vd", "ResNet50_vd", "ResNet101_vd", "ResNet152_vd" +] + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + dilation=1, + groups=1, + is_vd_mode=False, + act=None, + data_format='NCHW'): + super(ConvBNLayer, self).__init__() + if dilation != 1 and kernel_size != 3: + raise RuntimeError("When the dilation isn't 1," \ + "the kernel_size should be 3.") + + self.is_vd_mode = is_vd_mode + self._pool2d_avg = nn.AvgPool2D( + kernel_size=2, + stride=2, + padding=0, + ceil_mode=True, + data_format=data_format) + self._conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2 \ + if dilation == 1 else dilation, + dilation=dilation, + groups=groups, + bias_attr=False, + data_format=data_format) + + self._batch_norm = layers.SyncBatchNorm( + out_channels, data_format=data_format) + self._act_op = layers.Activation(act=act) + + def forward(self, inputs): + if self.is_vd_mode: + inputs = self._pool2d_avg(inputs) + y = self._conv(inputs) + y = self._batch_norm(y) + y = self._act_op(y) + + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + dilation=1, + data_format='NCHW'): + super(BottleneckBlock, self).__init__() + + self.data_format = data_format + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act='relu', + data_format=data_format) + + self.dilation = dilation + + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + dilation=dilation, + data_format=data_format) + self.conv2 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + 
act=None, + data_format=data_format) + + if not shortcut: + self.short = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels * 4, + kernel_size=1, + stride=1, + is_vd_mode=False if if_first or stride == 1 else True, + data_format=data_format) + + self.shortcut = shortcut + # NOTE: Use the wrap layer for quantization training + self.add = layers.Add() + self.relu = layers.Activation(act="relu") + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + + y = self.add(short, conv2) + y = self.relu(y) + return y + + +class BasicBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + dilation=1, + shortcut=True, + if_first=False, + data_format='NCHW'): + super(BasicBlock, self).__init__() + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + dilation=dilation, + act='relu', + data_format=data_format) + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + dilation=dilation, + act=None, + data_format=data_format) + + if not shortcut: + self.short = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + is_vd_mode=False if if_first or stride == 1 else True, + data_format=data_format) + + self.shortcut = shortcut + self.dilation = dilation + self.data_format = data_format + self.add = layers.Add() + self.relu = layers.Activation(act="relu") + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = self.add(short, conv1) + y = self.relu(y) + + return y + + +class ResNet_vd(nn.Layer): + """ + The ResNet_vd implementation based on PaddlePaddle. + + The original article refers to + Tong He, et al. "Bag of Tricks for Image Classification with Convolutional Neural Networks" + (https://arxiv.org/pdf/1812.01187.pdf). + + Args: + layers (int, optional): The layers of ResNet_vd. The supported layers are (18, 34, 50, 101, 152, 200). Default: 50. + output_stride (int, optional): The stride of output features compared to input images. It is 8 or 16. Default: 8. + multi_grid (tuple|list, optional): The multi-grid dilation rates of stage 4. Default: (1, 1, 1). + pretrained (str, optional): The path of pretrained model.
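+ + Note: with output_stride=8 the last two stages replace striding with + dilation rates 2 and 4; with output_stride=16 only the last stage + is dilated (rate 2).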
+ + """ + + def __init__(self, + layers=50, + output_stride=8, + multi_grid=(1, 1, 1), + pretrained=None, + data_format='NCHW'): + super(ResNet_vd, self).__init__() + + self.data_format = data_format + self.conv1_logit = None # for gscnn shape stream + self.layers = layers + supported_layers = [18, 34, 50, 101, 152, 200] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + + if layers == 18: + depth = [2, 2, 2, 2] + elif layers == 34 or layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + elif layers == 200: + depth = [3, 12, 48, 3] + num_channels = [64, 256, 512, 1024 + ] if layers >= 50 else [64, 64, 128, 256] + num_filters = [64, 128, 256, 512] + + # for channels of four returned stages + self.feat_channels = [c * 4 for c in num_filters + ] if layers >= 50 else num_filters + + dilation_dict = None + if output_stride == 8: + dilation_dict = {2: 2, 3: 4} + elif output_stride == 16: + dilation_dict = {3: 2} + + self.conv1_1 = ConvBNLayer( + in_channels=3, + out_channels=32, + kernel_size=3, + stride=2, + act='relu', + data_format=data_format) + self.conv1_2 = ConvBNLayer( + in_channels=32, + out_channels=32, + kernel_size=3, + stride=1, + act='relu', + data_format=data_format) + self.conv1_3 = ConvBNLayer( + in_channels=32, + out_channels=64, + kernel_size=3, + stride=1, + act='relu', + data_format=data_format) + self.pool2d_max = nn.MaxPool2D( + kernel_size=3, stride=2, padding=1, data_format=data_format) + + # self.block_list = [] + self.stage_list = [] + if layers >= 50: + for block in range(len(depth)): + shortcut = False + block_list = [] + for i in range(depth[block]): + if layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + + ############################################################################### + # Add dilation rate for some segmentation tasks, if dilation_dict is not None. 
+ dilation_rate = dilation_dict[ + block] if dilation_dict and block in dilation_dict else 1 + + # Here 'block' indexes the stage and 'i' indexes the block within the stage. + # In stage 4, expand the dilation_rate by the given multi_grid. + if block == 3: + dilation_rate = dilation_rate * multi_grid[i] + ############################################################################### + + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + in_channels=num_channels[block] + if i == 0 else num_filters[block] * 4, + out_channels=num_filters[block], + stride=2 if i == 0 and block != 0 + and dilation_rate == 1 else 1, + shortcut=shortcut, + if_first=block == i == 0, + dilation=dilation_rate, + data_format=data_format)) + + block_list.append(bottleneck_block) + shortcut = True + self.stage_list.append(block_list) + else: + for block in range(len(depth)): + shortcut = False + block_list = [] + for i in range(depth[block]): + dilation_rate = dilation_dict[block] \ + if dilation_dict and block in dilation_dict else 1 + if block == 3: + dilation_rate = dilation_rate * multi_grid[i] + + basic_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BasicBlock( + in_channels=num_channels[block] + if i == 0 else num_filters[block], + out_channels=num_filters[block], + stride=2 if i == 0 and block != 0 \ + and dilation_rate == 1 else 1, + dilation=dilation_rate, + shortcut=shortcut, + if_first=block == i == 0, + data_format=data_format)) + block_list.append(basic_block) + shortcut = True + self.stage_list.append(block_list) + + self.pretrained = pretrained + self.init_weight() + + def forward(self, inputs): + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + self.conv1_logit = y.clone() + y = self.pool2d_max(y) + + # A feature list saves the output feature map of each stage. + feat_list = [] + for stage in self.stage_list: + for block in stage: + y = block(y) + feat_list.append(y) + + return feat_list + + def init_weight(self): + utils.load_pretrained_model(self, self.pretrained) + + +@manager.BACKBONES.add_component +def ResNet18_vd(**args): + model = ResNet_vd(layers=18, **args) + return model + + +def ResNet34_vd(**args): + model = ResNet_vd(layers=34, **args) + return model + + +@manager.BACKBONES.add_component +def ResNet50_vd(**args): + model = ResNet_vd(layers=50, **args) + return model + + +@manager.BACKBONES.add_component +def ResNet101_vd(**args): + model = ResNet_vd(layers=101, **args) + return model + + +def ResNet152_vd(**args): + model = ResNet_vd(layers=152, **args) + return model + + +def ResNet200_vd(**args): + model = ResNet_vd(layers=200, **args) + return model diff --git a/paddlers/models/ppseg/models/backbones/stdcnet.py b/paddlers/models/ppseg/models/backbones/stdcnet.py new file mode 100644 index 0000000..5fe919f --- /dev/null +++ b/paddlers/models/ppseg/models/backbones/stdcnet.py @@ -0,0 +1,281 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
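+ +# An STDC (Short-Term Dense Concatenate) block runs a chain of convs whose +# channel widths are progressively halved, then fuses all intermediate +# outputs, by concatenation alone (CatBottleneck) or by concatenation plus +# a residual shortcut (AddBottleneck).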
+ +import math + +import paddle +import paddle.nn as nn + +from paddlers.models.ppseg.utils import utils +from paddlers.models.ppseg.cvlibs import manager, param_init +from paddlers.models.ppseg.models.layers.layer_libs import SyncBatchNorm + +__all__ = ["STDC1", "STDC2"] + + +class STDCNet(nn.Layer): + """ + The STDCNet implementation based on PaddlePaddle. + + The original article refers to Meituan + Fan, Mingyuan, et al. "Rethinking BiSeNet For Real-time Semantic Segmentation." + (https://arxiv.org/abs/2104.13188) + + Args: + base (int, optional): The base channel number. Default: 64. + layers (list, optional): The number of STDC blocks in stages 3, 4 and 5. Default: [4, 5, 3]. + block_num (int, optional): The number of ConvBNRelu units in each STDC block. Default: 4. + type (str, optional): The feature fusion method, "cat" or "add". Default: "cat". + num_classes (int, optional): The class number for image classification. Default: 1000. + dropout (float, optional): The dropout ratio, used if > 0. Default: 0.20. + use_conv_last (bool, optional): Whether to use the last ConvBNReLU layer. Default: False. + pretrained (str, optional): The path of pretrained model. + """ + + def __init__(self, + base=64, + layers=[4, 5, 3], + block_num=4, + type="cat", + num_classes=1000, + dropout=0.20, + use_conv_last=False, + pretrained=None): + super(STDCNet, self).__init__() + if type == "cat": + block = CatBottleneck + elif type == "add": + block = AddBottleneck + self.use_conv_last = use_conv_last + self.features = self._make_layers(base, layers, block_num, block) + self.conv_last = ConvBNRelu(base * 16, max(1024, base * 16), 1, 1) + + if (layers == [4, 5, 3]): #stdc1446 + self.x2 = nn.Sequential(self.features[:1]) + self.x4 = nn.Sequential(self.features[1:2]) + self.x8 = nn.Sequential(self.features[2:6]) + self.x16 = nn.Sequential(self.features[6:11]) + self.x32 = nn.Sequential(self.features[11:]) + elif (layers == [2, 2, 2]): #stdc813 + self.x2 = nn.Sequential(self.features[:1]) + self.x4 = nn.Sequential(self.features[1:2]) + self.x8 = nn.Sequential(self.features[2:4]) + self.x16 = nn.Sequential(self.features[4:6]) + self.x32 = nn.Sequential(self.features[6:]) + else: + raise NotImplementedError( + "model with layers:{} is not implemented!".format(layers)) + + self.pretrained = pretrained + self.init_weight() + + def forward(self, x): + """ + Forward function for feature extraction.
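+ Returns five feature maps at strides 2, 4, 8, 16 and 32 (feat2 through + feat32); feat32 is passed through conv_last first when use_conv_last + is True.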
+ """ + feat2 = self.x2(x) + feat4 = self.x4(feat2) + feat8 = self.x8(feat4) + feat16 = self.x16(feat8) + feat32 = self.x32(feat16) + if self.use_conv_last: + feat32 = self.conv_last(feat32) + return feat2, feat4, feat8, feat16, feat32 + + def _make_layers(self, base, layers, block_num, block): + features = [] + features += [ConvBNRelu(3, base // 2, 3, 2)] + features += [ConvBNRelu(base // 2, base, 3, 2)] + + for i, layer in enumerate(layers): + for j in range(layer): + if i == 0 and j == 0: + features.append(block(base, base * 4, block_num, 2)) + elif j == 0: + features.append( + block(base * int(math.pow(2, i + 1)), + base * int(math.pow(2, i + 2)), block_num, 2)) + else: + features.append( + block(base * int(math.pow(2, i + 2)), + base * int(math.pow(2, i + 2)), block_num, 1)) + + return nn.Sequential(*features) + + def init_weight(self): + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + param_init.normal_init(layer.weight, std=0.001) + elif isinstance(layer, (nn.BatchNorm, nn.SyncBatchNorm)): + param_init.constant_init(layer.weight, value=1.0) + param_init.constant_init(layer.bias, value=0.0) + if self.pretrained is not None: + utils.load_pretrained_model(self, self.pretrained) + + +class ConvBNRelu(nn.Layer): + def __init__(self, in_planes, out_planes, kernel=3, stride=1): + super(ConvBNRelu, self).__init__() + self.conv = nn.Conv2D( + in_planes, + out_planes, + kernel_size=kernel, + stride=stride, + padding=kernel // 2, + bias_attr=False) + self.bn = SyncBatchNorm(out_planes, data_format='NCHW') + self.relu = nn.ReLU() + + def forward(self, x): + out = self.relu(self.bn(self.conv(x))) + return out + + +class AddBottleneck(nn.Layer): + def __init__(self, in_planes, out_planes, block_num=3, stride=1): + super(AddBottleneck, self).__init__() + assert block_num > 1, "block number should be larger than 1." 
+ self.conv_list = nn.LayerList() + self.stride = stride + if stride == 2: + self.avd_layer = nn.Sequential( + nn.Conv2D( + out_planes // 2, + out_planes // 2, + kernel_size=3, + stride=2, + padding=1, + groups=out_planes // 2, + bias_attr=False), + nn.BatchNorm2D(out_planes // 2), + ) + self.skip = nn.Sequential( + nn.Conv2D( + in_planes, + in_planes, + kernel_size=3, + stride=2, + padding=1, + groups=in_planes, + bias_attr=False), + nn.BatchNorm2D(in_planes), + nn.Conv2D( + in_planes, out_planes, kernel_size=1, bias_attr=False), + nn.BatchNorm2D(out_planes), + ) + stride = 1 + + for idx in range(block_num): + if idx == 0: + self.conv_list.append( + ConvBNRelu(in_planes, out_planes // 2, kernel=1)) + elif idx == 1 and block_num == 2: + self.conv_list.append( + ConvBNRelu(out_planes // 2, out_planes // 2, stride=stride)) + elif idx == 1 and block_num > 2: + self.conv_list.append( + ConvBNRelu(out_planes // 2, out_planes // 4, stride=stride)) + elif idx < block_num - 1: + self.conv_list.append( + ConvBNRelu(out_planes // int(math.pow(2, idx)), + out_planes // int(math.pow(2, idx + 1)))) + else: + self.conv_list.append( + ConvBNRelu(out_planes // int(math.pow(2, idx)), + out_planes // int(math.pow(2, idx)))) + + def forward(self, x): + out_list = [] + out = x + for idx, conv in enumerate(self.conv_list): + if idx == 0 and self.stride == 2: + out = self.avd_layer(conv(out)) + else: + out = conv(out) + out_list.append(out) + if self.stride == 2: + x = self.skip(x) + return paddle.concat(out_list, axis=1) + x + + +class CatBottleneck(nn.Layer): + def __init__(self, in_planes, out_planes, block_num=3, stride=1): + super(CatBottleneck, self).__init__() + assert block_num > 1, "block number should be larger than 1." + self.conv_list = nn.LayerList() + self.stride = stride + if stride == 2: + self.avd_layer = nn.Sequential( + nn.Conv2D( + out_planes // 2, + out_planes // 2, + kernel_size=3, + stride=2, + padding=1, + groups=out_planes // 2, + bias_attr=False), + nn.BatchNorm2D(out_planes // 2), + ) + self.skip = nn.AvgPool2D(kernel_size=3, stride=2, padding=1) + stride = 1 + + for idx in range(block_num): + if idx == 0: + self.conv_list.append( + ConvBNRelu(in_planes, out_planes // 2, kernel=1)) + elif idx == 1 and block_num == 2: + self.conv_list.append( + ConvBNRelu(out_planes // 2, out_planes // 2, stride=stride)) + elif idx == 1 and block_num > 2: + self.conv_list.append( + ConvBNRelu(out_planes // 2, out_planes // 4, stride=stride)) + elif idx < block_num - 1: + self.conv_list.append( + ConvBNRelu(out_planes // int(math.pow(2, idx)), + out_planes // int(math.pow(2, idx + 1)))) + else: + self.conv_list.append( + ConvBNRelu(out_planes // int(math.pow(2, idx)), + out_planes // int(math.pow(2, idx)))) + + def forward(self, x): + out_list = [] + out1 = self.conv_list[0](x) + for idx, conv in enumerate(self.conv_list[1:]): + if idx == 0: + if self.stride == 2: + out = conv(self.avd_layer(out1)) + else: + out = conv(out1) + else: + out = conv(out) + out_list.append(out) + + if self.stride == 2: + out1 = self.skip(out1) + out_list.insert(0, out1) + out = paddle.concat(out_list, axis=1) + return out + + +@manager.BACKBONES.add_component +def STDC2(**kwargs): + model = STDCNet(base=64, layers=[4, 5, 3], **kwargs) + return model + + +@manager.BACKBONES.add_component +def STDC1(**kwargs): + model = STDCNet(base=64, layers=[2, 2, 2], **kwargs) + return model diff --git a/paddlers/models/ppseg/models/backbones/swin_transformer.py b/paddlers/models/ppseg/models/backbones/swin_transformer.py new file mode 
100644 index 0000000..00e8d33 --- /dev/null +++ b/paddlers/models/ppseg/models/backbones/swin_transformer.py @@ -0,0 +1,792 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np + +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.utils import utils +from paddlers.models.ppseg.models.backbones.transformer_utils import * + + +class Mlp(nn.Layer): + """ Multilayer perceptron.""" + + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +def window_partition(x, window_size): + """ + Args: + x: (B, H, W, C) + window_size (int): window size + + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + x = x.reshape( + [B, H // window_size, window_size, W // window_size, window_size, C]) + windows = x.transpose([0, 1, 3, 2, 4, + 5]).reshape([-1, window_size, window_size, C]) + return windows + + +def window_reverse(windows, window_size, H, W): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + window_size (int): Window size + H (int): Height of image + W (int): Width of image + + Returns: + x: (B, H, W, C) + """ + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.reshape( + [B, H // window_size, W // window_size, window_size, window_size, -1]) + x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([B, H, W, -1]) + return x + + +class WindowAttention(nn.Layer): + """ + Window based multi-head self attention (W-MSA) module with relative position bias. + It supports both of shifted and non-shifted window. + + Args: + dim (int): Number of input channels. + window_size (tuple[int]): The height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. 
Default: 0.0 + """ + + def __init__(self, + dim, + window_size, + num_heads, + qkv_bias=True, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + + super().__init__() + self.dim = dim + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + # define a parameter table of relative position bias + self.relative_position_bias_table = self.create_parameter( + shape=((2 * window_size[0] - 1) * (2 * window_size[1] - 1), + num_heads), + default_initializer=zeros_) + self.add_parameter("relative_position_bias_table", + self.relative_position_bias_table) + + # get pair-wise relative position index for each token inside the window + coords_h = paddle.arange(self.window_size[0]) + coords_w = paddle.arange(self.window_size[1]) + coords = paddle.stack(paddle.meshgrid([coords_h, + coords_w])) # 2, Wh, Ww + coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww + coords_flatten_1 = coords_flatten.unsqueeze(axis=2) + coords_flatten_2 = coords_flatten.unsqueeze(axis=1) + relative_coords = coords_flatten_1 - coords_flatten_2 + + relative_coords = relative_coords.transpose([1, 2, 0]) + + relative_coords[:, :, + 0] += self.window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + self.register_buffer("relative_position_index", relative_position_index) + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + trunc_normal_(self.relative_position_bias_table) + self.softmax = nn.Softmax(axis=-1) + + def forward(self, x, mask=None): + """ + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None + """ + B_, N, C = x.shape + qkv = self.qkv(x).reshape( + [B_, N, 3, self.num_heads, + C // self.num_heads]).transpose([2, 0, 3, 1, 4]) + q, k, v = qkv[0], qkv[1], qkv[2] + + q = q * self.scale + attn = paddle.mm(q, k.transpose([0, 1, 3, 2])) + + index = self.relative_position_index.reshape([-1]) + relative_position_bias = paddle.index_select( + self.relative_position_bias_table, index) + + relative_position_bias = relative_position_bias.reshape([ + self.window_size[0] * self.window_size[1], + self.window_size[0] * self.window_size[1], -1 + ]) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.transpose( + [2, 0, 1]) # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nW = mask.shape[0] + attn = attn.reshape([B_ // nW, nW, self.num_heads, N, N + ]) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.reshape([-1, self.num_heads, N, N]) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = paddle.mm(attn, v).transpose([0, 2, 1, 3]).reshape([B_, N, C]) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class SwinTransformerBlock(nn.Layer): + """ + Swin Transformer Block. + + Args: + dim (int): Number of input channels. + num_heads (int): Number of attention heads. + window_size (int): Window size. + shift_size (int): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. 
Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, + dim, + num_heads, + window_size=7, + shift_size=0, + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm): + super().__init__() + self.dim = dim + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" + + self.norm1 = norm_layer(dim) + self.attn = WindowAttention( + dim, + window_size=to_2tuple(self.window_size), + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + + self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + self.H = None + self.W = None + + def forward(self, x, mask_matrix): + """ + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. + mask_matrix: Attention mask for cyclic shift. + """ + B, L, C = x.shape + H, W = self.H, self.W + assert L == H * W, "input feature has wrong size" + + shortcut = x + x = self.norm1(x) + x = x.reshape([B, H, W, C]) + + # pad feature maps to multiples of window size + pad_l = pad_t = 0 + pad_r = (self.window_size - W % self.window_size) % self.window_size + pad_b = (self.window_size - H % self.window_size) % self.window_size + + x = x.transpose([0, 3, 1, 2]) + x = F.pad(x, [pad_l, pad_r, pad_t, pad_b]) + x = x.transpose([0, 2, 3, 1]) + _, Hp, Wp, _ = x.shape + + # cyclic shift + if self.shift_size > 0: + shifted_x = paddle.roll( + x, shifts=(-self.shift_size, -self.shift_size), axis=(1, 2)) + attn_mask = mask_matrix + else: + shifted_x = x + attn_mask = None + + # partition windows + x_windows = window_partition( + shifted_x, self.window_size) # nW*B, window_size, window_size, C + x_windows = x_windows.reshape( + [-1, self.window_size * self.window_size, + C]) # nW*B, window_size*window_size, C + + # W-MSA/SW-MSA + attn_windows = self.attn( + x_windows, mask=attn_mask) # nW*B, window_size*window_size, C + + # merge windows + attn_windows = attn_windows.reshape( + [-1, self.window_size, self.window_size, C]) + shifted_x = window_reverse(attn_windows, self.window_size, Hp, + Wp) # B H' W' C + + # reverse cyclic shift + if self.shift_size > 0: + x = paddle.roll( + shifted_x, + shifts=(self.shift_size, self.shift_size), + axis=(1, 2)) + else: + x = shifted_x + + if pad_r > 0 or pad_b > 0: + x = x[:, :H, :W, :] + + x = x.reshape([B, H * W, C]) + + # FFN + x = shortcut + self.drop_path(x) + x = x + self.drop_path(self.mlp(self.norm2(x))) + + return x + + +class PatchMerging(nn.Layer): + """ + Patch Merging Layer + + Args: + dim (int): Number of input channels. + norm_layer (nn.Layer, optional): Normalization layer. 
Default: nn.LayerNorm + """ + + def __init__(self, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias_attr=False) + self.norm = norm_layer(4 * dim) + + def forward(self, x, H, W): + """ + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. + """ + B, L, C = x.shape + assert L == H * W, "input feature has wrong size" + + x = x.reshape([B, H, W, C]) + + # padding + pad_input = (H % 2 == 1) or (W % 2 == 1) + if pad_input: + x = x.transpose([0, 3, 1, 2]) + x = F.pad(x, [0, W % 2, 0, H % 2]) + x = x.transpose([0, 2, 3, 1]) + + x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C + x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C + x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C + x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C + x = paddle.concat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C + x = x.reshape([B, -1, 4 * C]) # B H/2*W/2 4*C + + x = self.norm(x) + x = self.reduction(x) + + return x + + +class BasicLayer(nn.Layer): + """ + A basic Swin Transformer layer for one stage. + + Args: + dim (int): Number of feature channels. + depth (int): Depths of this stage. + num_heads (int): Number of attention head. + window_size (int): Local window size. Default: 7. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None + """ + + def __init__(self, + dim, + depth, + num_heads, + window_size=7, + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + norm_layer=nn.LayerNorm, + downsample=None): + super().__init__() + self.window_size = window_size + self.shift_size = window_size // 2 + self.depth = depth + + # build blocks + self.blocks = nn.LayerList([ + SwinTransformerBlock( + dim=dim, + num_heads=num_heads, + window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path[i] + if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer) for i in range(depth) + ]) + + # patch merging layer + if downsample is not None: + self.downsample = downsample(dim=dim, norm_layer=norm_layer) + else: + self.downsample = None + + def forward(self, x, H, W): + """ + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. 
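+
+        Example (an illustrative sketch; the resolution, channel count and
+            head count below are assumed, not prescribed by this layer):
+
+            layer = BasicLayer(dim=96, depth=2, num_heads=3, window_size=7)
+            x = paddle.rand([1, 56 * 56, 96])
+            x_out, H, W, x_down, Wh, Ww = layer(x, 56, 56)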
+ """ + # calculate attention mask for SW-MSA + Hp = int(np.ceil(H / self.window_size)) * self.window_size + Wp = int(np.ceil(W / self.window_size)) * self.window_size + img_mask = paddle.zeros((1, Hp, Wp, 1)) # 1 Hp Wp 1 + h_slices = (slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None)) + w_slices = (slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None)) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + mask_windows = window_partition( + img_mask, self.window_size) # nW, window_size, window_size, 1 + mask_windows = mask_windows.reshape( + [-1, self.window_size * self.window_size]) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + + huns = -100.0 * paddle.ones_like(attn_mask) + attn_mask = huns * (attn_mask != 0).astype("float32") + + for blk in self.blocks: + blk.H, blk.W = H, W + x = blk(x, attn_mask) + if self.downsample is not None: + x_down = self.downsample(x, H, W) + Wh, Ww = (H + 1) // 2, (W + 1) // 2 + return x, H, W, x_down, Wh, Ww + else: + return x, H, W, x, H, W + + +class PatchEmbed(nn.Layer): + """ + Image to Patch Embedding. + + Args: + patch_size (int): Patch token size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Layer, optional): Normalization layer. Default: None + """ + + def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None): + super().__init__() + patch_size = to_2tuple(patch_size) + self.patch_size = patch_size + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.proj = nn.Conv2D( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + """Forward function.""" + # padding + _, _, H, W = x.shape + if W % self.patch_size[1] != 0: + x = F.pad(x, [0, self.patch_size[1] - W % self.patch_size[1], 0, 0]) + if H % self.patch_size[0] != 0: + x = F.pad(x, [0, 0, 0, self.patch_size[0] - H % self.patch_size[0]]) + + x = self.proj(x) # B C Wh Ww + if self.norm is not None: + _, _, Wh, Ww = x.shape + x = x.flatten(2).transpose([0, 2, 1]) + x = self.norm(x) + x = x.transpose([0, 2, 1]).reshape([-1, self.embed_dim, Wh, Ww]) + + return x + + +@manager.BACKBONES.add_component +class SwinTransformer(nn.Layer): + """ + The SwinTransformer implementation based on PaddlePaddle. + + The original article refers to + Liu, Ze, et al. "Swin Transformer: Hierarchical Vision Transformer using Shifted Windows" + (https://arxiv.org/abs/2103.14030) + + Args: + pretrain_img_size (int): Input image size for training the pretrained model, used in absolute postion embedding. Default: 224. + patch_size (int | tuple(int)): Patch size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + depths (tuple[int]): Depths of each Swin Transformer stage. + num_heads (tuple[int]): Number of attention head of each stage. + window_size (int): Window size. Default: 7. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. + drop_rate (float): Dropout rate. 
+ attn_drop_rate (float): Attention dropout rate. Default: 0. + drop_path_rate (float): Stochastic depth rate. Default: 0.2. + norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm. + ape (bool): If True, add absolute position embedding to the patch embedding. Default: False. + patch_norm (bool): If True, add normalization after patch embedding. Default: True. + out_indices (Sequence[int]): Output from which stages. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). -1 means not freezing any parameters. Default: -1. + pretrained (str, optional): The path or url of pretrained model. Default: None. + """ + + def __init__(self, + pretrain_img_size=224, + patch_size=4, + in_chans=3, + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.2, + norm_layer=nn.LayerNorm, + ape=False, + patch_norm=True, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + pretrained=None): + super().__init__() + + self.pretrain_img_size = pretrain_img_size + self.num_layers = len(depths) + self.embed_dim = embed_dim + self.ape = ape + self.patch_norm = patch_norm + self.out_indices = out_indices + self.frozen_stages = frozen_stages + + # split image into non-overlapping patches + self.patch_embed = PatchEmbed( + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None) + + # absolute position embedding + if self.ape: + pretrain_img_size = to_2tuple(pretrain_img_size) + patch_size = to_2tuple(patch_size) + patches_resolution = [ + pretrain_img_size[0] // patch_size[0], + pretrain_img_size[1] // patch_size[1] + ] + + self.absolute_pos_embed = self.create_parameter( + shape=(1, embed_dim, patches_resolution[0], + patches_resolution[1]), + default_initializer=zeros_) + self.add_parameter("absolute_pos_embed", self.absolute_pos_embed) + trunc_normal_(self.absolute_pos_embed) + + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth + dpr = np.linspace(0, drop_path_rate, sum(depths)).tolist() + + # build layers + self.layers = nn.LayerList() + for i_layer in range(self.num_layers): + layer = BasicLayer( + dim=int(embed_dim * 2**i_layer), + depth=depths[i_layer], + num_heads=num_heads[i_layer], + window_size=window_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], + norm_layer=norm_layer, + downsample=PatchMerging if + (i_layer < self.num_layers - 1) else None) + self.layers.append(layer) + + feat_channels = [int(embed_dim * 2**i) for i in range(self.num_layers)] + self.feat_channels = feat_channels + + # add a norm layer for each output + for i_layer in out_indices: + layer = norm_layer(feat_channels[i_layer]) + layer_name = f'norm{i_layer}' + self.add_sublayer(layer_name, layer) + + self._freeze_stages() + + self.pretrained = pretrained + self.init_weights(self.pretrained) + + def _freeze_stages(self): + if self.frozen_stages >= 0: + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + param.requires_grad = False + + if self.frozen_stages >= 1 and self.ape: + self.absolute_pos_embed.requires_grad = False + + if self.frozen_stages >= 2: + self.pos_drop.eval() + for i in range(0, self.frozen_stages - 1): + layer = self.layers[i] + layer.eval() + for param in layer.parameters(): + param.requires_grad = False + + def init_weights(self, 
                     pretrained=None):
+        """Initialize the weights in backbone.
+
+        Args:
+            pretrained (str, optional): Path to pre-trained weights.
+                Defaults to None.
+        """
+        if pretrained is not None:
+            utils.load_pretrained_model(self, self.pretrained)
+        else:
+            for sublayer in self.sublayers():
+                if isinstance(sublayer, nn.Linear):
+                    trunc_normal_(sublayer.weight)
+                    if isinstance(sublayer,
+                                  nn.Linear) and sublayer.bias is not None:
+                        zeros_(sublayer.bias)
+                elif isinstance(sublayer, nn.LayerNorm):
+                    zeros_(sublayer.bias)
+                    ones_(sublayer.weight)
+
+    def forward(self, x):
+        """Forward function."""
+        x = self.patch_embed(x)
+
+        _, _, Wh, Ww = x.shape
+        if self.ape:
+            # interpolate the position embedding to the corresponding size
+            absolute_pos_embed = F.interpolate(
+                self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic')
+            x = (x + absolute_pos_embed).flatten(2).transpose(
+                [0, 2, 1])  # B Wh*Ww C
+        else:
+            x = x.flatten(2).transpose([0, 2, 1])
+        x = self.pos_drop(x)
+
+        outs = []
+        for i in range(self.num_layers):
+            layer = self.layers[i]
+            x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
+
+            if i in self.out_indices:
+                norm_layer = getattr(self, f'norm{i}')
+                x_out = norm_layer(x_out)
+
+                out = x_out.reshape(
+                    [-1, H, W, self.feat_channels[i]]).transpose([0, 3, 1, 2])
+                outs.append(out)
+
+        return tuple(outs)
+
+    def train(self):
+        """Convert the model into training mode while keeping layers frozen."""
+        super(SwinTransformer, self).train()
+        self._freeze_stages()
+
+
+@manager.BACKBONES.add_component
+def SwinTransformer_tiny_patch4_window7_224(**kwargs):
+    model = SwinTransformer(
+        pretrain_img_size=224,
+        embed_dim=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=7,
+        **kwargs)
+
+    return model
+
+
+@manager.BACKBONES.add_component
+def SwinTransformer_small_patch4_window7_224(**kwargs):
+    model = SwinTransformer(
+        pretrain_img_size=224,
+        embed_dim=96,
+        depths=[2, 2, 18, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=7,
+        **kwargs)
+
+    return model
+
+
+@manager.BACKBONES.add_component
+def SwinTransformer_base_patch4_window7_224(**kwargs):
+    model = SwinTransformer(
+        pretrain_img_size=224,
+        embed_dim=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=7,
+        **kwargs)
+
+    return model
+
+
+@manager.BACKBONES.add_component
+def SwinTransformer_base_patch4_window12_384(**kwargs):
+    model = SwinTransformer(
+        pretrain_img_size=384,
+        embed_dim=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=12,
+        **kwargs)
+
+    return model
+
+
+@manager.BACKBONES.add_component
+def SwinTransformer_large_patch4_window7_224(**kwargs):
+    model = SwinTransformer(
+        pretrain_img_size=224,
+        embed_dim=192,
+        depths=[2, 2, 18, 2],
+        num_heads=[6, 12, 24, 48],
+        window_size=7,
+        **kwargs)
+
+    return model
+
+
+@manager.BACKBONES.add_component
+def SwinTransformer_large_patch4_window12_384(**kwargs):
+    model = SwinTransformer(
+        pretrain_img_size=384,
+        embed_dim=192,
+        depths=[2, 2, 18, 2],
+        num_heads=[6, 12, 24, 48],
+        window_size=12,
+        **kwargs)
+
+    return model
diff --git a/paddlers/models/ppseg/models/backbones/transformer_utils.py b/paddlers/models/ppseg/models/backbones/transformer_utils.py
new file mode 100644
index 0000000..db3e536
--- /dev/null
+++ b/paddlers/models/ppseg/models/backbones/transformer_utils.py
@@ -0,0 +1,83 @@
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.initializer as paddle_init + +__all__ = [ + 'to_2tuple', 'DropPath', 'Identity', 'trunc_normal_', 'zeros_', 'ones_', + 'init_weights' +] + + +def to_2tuple(x): + return tuple([x] * 2) + + +def drop_path(x, drop_prob=0., training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... + """ + if drop_prob == 0. or not training: + return x + keep_prob = paddle.to_tensor(1 - drop_prob) + shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + return output + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class Identity(nn.Layer): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, input): + return input + + +trunc_normal_ = paddle_init.TruncatedNormal(std=.02) +zeros_ = paddle_init.Constant(value=0.) +ones_ = paddle_init.Constant(value=1.) + + +def init_weights(layer): + """ + Init the weights of transformer. + Args: + layer(nn.Layer): The layer to init weights. + Returns: + None + """ + if isinstance(layer, nn.Linear): + trunc_normal_(layer.weight) + if layer.bias is not None: + zeros_(layer.bias) + elif isinstance(layer, nn.LayerNorm): + zeros_(layer.bias) + ones_(layer.weight) diff --git a/paddlers/models/ppseg/models/backbones/vision_transformer.py b/paddlers/models/ppseg/models/backbones/vision_transformer.py new file mode 100644 index 0000000..778cbb3 --- /dev/null +++ b/paddlers/models/ppseg/models/backbones/vision_transformer.py @@ -0,0 +1,410 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
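A minimal sketch of how the `drop_path` helper defined in transformer_utils.py above behaves (illustrative only; the tensor shape and drop probability are assumed):

    import paddle
    from paddlers.models.ppseg.models.backbones.transformer_utils import drop_path

    x = paddle.ones([4, 2, 8])
    y = drop_path(x, drop_prob=0.5, training=True)
    # Each of the 4 samples is either zeroed entirely or rescaled by
    # 1 / keep_prob = 2.0, so y equals x in expectation.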
+ +import os +import math + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np + +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.utils import utils, logger +from paddlers.models.ppseg.models.backbones.transformer_utils import to_2tuple, DropPath, Identity + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Layer): + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + x_shape = paddle.shape(x) + N, C = x_shape[1], x_shape[2] + qkv = self.qkv(x).reshape((-1, N, 3, self.num_heads, + C // self.num_heads)).transpose((2, 0, 3, 1, + 4)) + q, k, v = qkv[0], qkv[1], qkv[2] + + attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale + attn = nn.functional.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C)) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer='nn.LayerNorm', + epsilon=1e-5): + super().__init__() + self.norm1 = eval(norm_layer)(dim, epsilon=epsilon) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() + self.norm2 = eval(norm_layer)(dim, epsilon=epsilon) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + def forward(self, x): + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class PatchEmbed(nn.Layer): + """ Image to Patch Embedding + """ + + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + self.img_size = to_2tuple(img_size) + self.patch_size = to_2tuple(patch_size) + + self.proj = nn.Conv2D( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + + @property + def num_patches_in_h(self): + return self.img_size[1] // self.patch_size[1] + + @property + def num_patches_in_w(self): + return self.img_size[0] // self.patch_size[0] + + def forward(self, x): + x = self.proj(x) + return x + + +@manager.BACKBONES.add_component +class VisionTransformer(nn.Layer): + """ Vision Transformer with support for patch input + """ + + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer='nn.LayerNorm', + epsilon=1e-5, + final_norm=False, + pretrained=None, + **args): + super().__init__() + self.img_size = img_size + self.embed_dim = embed_dim + + self.patch_embed = PatchEmbed( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim) + self.pos_w = self.patch_embed.num_patches_in_w + self.pos_h = self.patch_embed.num_patches_in_h + + self.pos_embed = self.create_parameter( + shape=(1, self.pos_w * self.pos_h + 1, embed_dim), + default_initializer=paddle.nn.initializer.TruncatedNormal(std=.02)) + self.cls_token = self.create_parameter( + shape=(1, 1, embed_dim), + default_initializer=paddle.nn.initializer.Constant(value=0.)) + self.pos_drop = nn.Dropout(p=drop_rate) + + dpr = np.linspace(0, drop_path_rate, depth) + + self.blocks = nn.LayerList([ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + epsilon=epsilon) for i in range(depth) + ]) + + self.final_norm = final_norm + if self.final_norm: + self.norm = eval(norm_layer)(embed_dim, epsilon=epsilon) + self.pretrained = pretrained + self.init_weight() + + def init_weight(self): + utils.load_pretrained_model(self, self.pretrained) + + # load and resize pos_embed + model_path = self.pretrained + if not os.path.exists(model_path): + model_path = utils.download_pretrained_model(model_path) + + load_state_dict = paddle.load(model_path) + model_state_dict = self.state_dict() + pos_embed_name = "pos_embed" + if pos_embed_name in load_state_dict.keys(): + load_pos_embed = paddle.to_tensor( + load_state_dict[pos_embed_name], dtype="float32") + if self.pos_embed.shape != load_pos_embed.shape: + pos_size = int(math.sqrt(load_pos_embed.shape[1] - 1)) + model_state_dict[pos_embed_name] = self.resize_pos_embed( + load_pos_embed, (pos_size, pos_size), + (self.pos_h, self.pos_w)) + self.set_dict(model_state_dict) + logger.info( + "Load pos_embed and resize it from {} to {} .".format( + load_pos_embed.shape, self.pos_embed.shape)) + + def resize_pos_embed(self, pos_embed, old_hw, new_hw): + """ + Resize pos_embed weight. 
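+        The class-token part (the first token) is kept unchanged; only the
+        patch-grid part is reshaped to 2-D and resized with bicubic
+        interpolation.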
+ Args: + pos_embed (Tensor): the pos_embed weight + old_hw (list[int]): the height and width of old pos_embed + new_hw (list[int]): the height and width of new pos_embed + Returns: + Tensor: the resized pos_embed weight + """ + cls_pos_embed = pos_embed[:, :1, :] + pos_embed = pos_embed[:, 1:, :] + + pos_embed = pos_embed.transpose([0, 2, 1]) + pos_embed = pos_embed.reshape([1, -1, old_hw[0], old_hw[1]]) + pos_embed = F.interpolate( + pos_embed, new_hw, mode='bicubic', align_corners=False) + pos_embed = pos_embed.flatten(2).transpose([0, 2, 1]) + pos_embed = paddle.concat([cls_pos_embed, pos_embed], axis=1) + + return pos_embed + + def forward(self, x): + x = self.patch_embed(x) + x_shape = paddle.shape(x) # b * c * h * w + + cls_tokens = self.cls_token.expand((x_shape[0], -1, -1)) + x = x.flatten(2).transpose([0, 2, 1]) # b * hw * c + x = paddle.concat([cls_tokens, x], axis=1) + + if paddle.shape(x)[1] == self.pos_embed.shape[1]: + x = x + self.pos_embed + else: + x = x + self.resize_pos_embed(self.pos_embed, + (self.pos_h, self.pos_w), x_shape[2:]) + x = self.pos_drop(x) + + res = [] + for idx, blk in enumerate(self.blocks): + x = blk(x) + if self.final_norm and idx == len(self.blocks) - 1: + x = self.norm(x) + res.append(x[:, 1:, :]) + + return res, x_shape + + +@manager.BACKBONES.add_component +def ViT_small_patch16_224(**kwargs): + model = VisionTransformer( + patch_size=16, + embed_dim=768, + depth=8, + num_heads=8, + mlp_ratio=3, + qk_scale=768**-0.5, + **kwargs) + return model + + +@manager.BACKBONES.add_component +def ViT_base_patch16_224(**kwargs): + model = VisionTransformer( + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + return model + + +@manager.BACKBONES.add_component +def ViT_base_patch16_384(**kwargs): + model = VisionTransformer( + img_size=384, + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + return model + + +@manager.BACKBONES.add_component +def ViT_base_patch32_384(**kwargs): + model = VisionTransformer( + img_size=384, + patch_size=32, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + return model + + +@manager.BACKBONES.add_component +def ViT_large_patch16_224(**kwargs): + model = VisionTransformer( + patch_size=16, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + return model + + +@manager.BACKBONES.add_component +def ViT_large_patch16_384(**kwargs): + model = VisionTransformer( + img_size=384, + patch_size=16, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + return model + + +@manager.BACKBONES.add_component +def ViT_large_patch32_384(**kwargs): + model = VisionTransformer( + img_size=384, + patch_size=32, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + return model + + +@manager.BACKBONES.add_component +def ViT_huge_patch16_224(**kwargs): + model = VisionTransformer( + patch_size=16, + embed_dim=1280, + depth=32, + num_heads=16, + mlp_ratio=4, + **kwargs) + return model + + +@manager.BACKBONES.add_component +def ViT_huge_patch32_384(**kwargs): + model = VisionTransformer( + img_size=384, + patch_size=32, + embed_dim=1280, + depth=32, + num_heads=16, + mlp_ratio=4, + **kwargs) + return model diff --git a/paddlers/models/ppseg/models/backbones/xception_deeplab.py 
b/paddlers/models/ppseg/models/backbones/xception_deeplab.py
new file mode 100644
index 0000000..83152ef
--- /dev/null
+++ b/paddlers/models/ppseg/models/backbones/xception_deeplab.py
@@ -0,0 +1,415 @@
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.utils import utils
+from paddlers.models.ppseg.models import layers
+
+__all__ = ["Xception41_deeplab", "Xception65_deeplab", "Xception71_deeplab"]
+
+
+def check_data(data, number):
+    if type(data) == int:
+        return [data] * number
+    assert len(data) == number
+    return data
+
+
+def check_stride(s, os):
+    if s <= os:
+        return True
+    else:
+        return False
+
+
+def check_points(count, points):
+    if points is None:
+        return False
+    else:
+        if isinstance(points, list):
+            return (True if count in points else False)
+        else:
+            return (True if count == points else False)
+
+
+def gen_bottleneck_params(backbone='xception_65'):
+    if backbone == 'xception_65':
+        bottleneck_params = {
+            "entry_flow": (3, [2, 2, 2], [128, 256, 728]),
+            "middle_flow": (16, 1, 728),
+            "exit_flow": (2, [2, 1], [[728, 1024, 1024], [1536, 1536, 2048]])
+        }
+    elif backbone == 'xception_41':
+        bottleneck_params = {
+            "entry_flow": (3, [2, 2, 2], [128, 256, 728]),
+            "middle_flow": (8, 1, 728),
+            "exit_flow": (2, [2, 1], [[728, 1024, 1024], [1536, 1536, 2048]])
+        }
+    elif backbone == 'xception_71':
+        bottleneck_params = {
+            "entry_flow": (5, [2, 1, 2, 1, 2], [128, 256, 256, 728, 728]),
+            "middle_flow": (16, 1, 728),
+            "exit_flow": (2, [2, 1], [[728, 1024, 1024], [1536, 1536, 2048]])
+        }
+    else:
+        raise ValueError(
+            "Xception backbone only supports xception_41/xception_65/xception_71"
+        )
+    return bottleneck_params
+
+
+class ConvBNLayer(nn.Layer):
+    def __init__(self,
+                 input_channels,
+                 output_channels,
+                 filter_size,
+                 stride=1,
+                 padding=0,
+                 act=None,
+                 name=None):
+        super(ConvBNLayer, self).__init__()
+
+        self._conv = nn.Conv2D(
+            in_channels=input_channels,
+            out_channels=output_channels,
+            kernel_size=filter_size,
+            stride=stride,
+            padding=padding,
+            bias_attr=False)
+        self._bn = layers.SyncBatchNorm(
+            num_features=output_channels, epsilon=1e-3, momentum=0.99)
+
+        self._act_op = layers.Activation(act=act)
+
+    def forward(self, inputs):
+        return self._act_op(self._bn(self._conv(inputs)))
+
+
+class Seperate_Conv(nn.Layer):
+    def __init__(self,
+                 input_channels,
+                 output_channels,
+                 stride,
+                 filter,
+                 dilation=1,
+                 act=None,
+                 name=None):
+        super(Seperate_Conv, self).__init__()
+
+        self._conv1 = nn.Conv2D(
+            in_channels=input_channels,
+            out_channels=input_channels,
+            kernel_size=filter,
+            stride=stride,
+            groups=input_channels,
+            padding=(filter) // 2 * dilation,
+            dilation=dilation,
+            bias_attr=False)
+        self._bn1 = layers.SyncBatchNorm(
+            input_channels, epsilon=1e-3, momentum=0.99)
+
+        self._act_op1 = layers.Activation(act=act)
+
+        self._conv2 = nn.Conv2D(
+            input_channels,
+            output_channels,
+            1,
+            stride=1,
+            groups=1,
+            padding=0,
+            bias_attr=False)
+        self._bn2 = layers.SyncBatchNorm(
+            output_channels, epsilon=1e-3, momentum=0.99)
+
+        self._act_op2 = layers.Activation(act=act)
+
+    def forward(self, inputs):
+        x = self._conv1(inputs)
+        x = self._bn1(x)
+        x = self._act_op1(x)
+        x = self._conv2(x)
+        x = self._bn2(x)
+        x = self._act_op2(x)
+        return x
+
+
+class Xception_Block(nn.Layer):
+    def __init__(self,
+                 input_channels,
+                 output_channels,
+                 strides=1,
+                 filter_size=3,
+                 dilation=1,
+                 skip_conv=True,
+                 has_skip=True,
+                 activation_fn_in_separable_conv=False,
+                 name=None):
+        super(Xception_Block, self).__init__()
+
+        repeat_number = 3
+        output_channels = check_data(output_channels, repeat_number)
+        filter_size = check_data(filter_size, repeat_number)
+        strides = check_data(strides, repeat_number)
+
+        self.has_skip = has_skip
+        self.skip_conv = skip_conv
+        self.activation_fn_in_separable_conv = activation_fn_in_separable_conv
+        if not activation_fn_in_separable_conv:
+            self._conv1 = Seperate_Conv(
+                input_channels,
+                output_channels[0],
+                stride=strides[0],
+                filter=filter_size[0],
+                dilation=dilation,
+                name=name + "/separable_conv1")
+            self._conv2 = Seperate_Conv(
+                output_channels[0],
+                output_channels[1],
+                stride=strides[1],
+                filter=filter_size[1],
+                dilation=dilation,
+                name=name + "/separable_conv2")
+            self._conv3 = Seperate_Conv(
+                output_channels[1],
+                output_channels[2],
+                stride=strides[2],
+                filter=filter_size[2],
+                dilation=dilation,
+                name=name + "/separable_conv3")
+        else:
+            self._conv1 = Seperate_Conv(
+                input_channels,
+                output_channels[0],
+                stride=strides[0],
+                filter=filter_size[0],
+                act="relu",
+                dilation=dilation,
+                name=name + "/separable_conv1")
+            self._conv2 = Seperate_Conv(
+                output_channels[0],
+                output_channels[1],
+                stride=strides[1],
+                filter=filter_size[1],
+                act="relu",
+                dilation=dilation,
+                name=name + "/separable_conv2")
+            self._conv3 = Seperate_Conv(
+                output_channels[1],
+                output_channels[2],
+                stride=strides[2],
+                filter=filter_size[2],
+                act="relu",
+                dilation=dilation,
+                name=name + "/separable_conv3")
+
+        if has_skip and skip_conv:
+            self._short = ConvBNLayer(
+                input_channels,
+                output_channels[-1],
+                1,
+                stride=strides[-1],
+                padding=0,
+                name=name + "/shortcut")
+
+    def forward(self, inputs):
+        if not self.activation_fn_in_separable_conv:
+            x = F.relu(inputs)
+            x = self._conv1(x)
+            x = F.relu(x)
+            x = self._conv2(x)
+            x = F.relu(x)
+            x = self._conv3(x)
+        else:
+            x = self._conv1(inputs)
+            x = self._conv2(x)
+            x = self._conv3(x)
+        if self.has_skip is False:
+            return x
+        if self.skip_conv:
+            skip = self._short(inputs)
+        else:
+            skip = inputs
+        return x + skip
+
+
+class XceptionDeeplab(nn.Layer):
+    """
+    The Xception backbone of DeepLabv3+ implementation based on PaddlePaddle.
+
+    The original article refers to
+    Liang-Chieh Chen, et al. "Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation"
+    (https://arxiv.org/abs/1802.02611)
+
+    Args:
+        backbone (str): Which type of Xception_DeepLab to select. It should be one of ('xception_41', 'xception_65', 'xception_71').
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+        output_stride (int, optional): The stride of output features compared to input images. It should be 8 or 16. Default: 16.
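+
+        Example (an illustrative sketch; the input size is assumed):
+
+            backbone = Xception65_deeplab(output_stride=16)
+            low_level_feat, high_level_feat = backbone(paddle.rand([1, 3, 512, 512]))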
+ + """ + + def __init__(self, backbone, pretrained=None, output_stride=16): + + super(XceptionDeeplab, self).__init__() + + bottleneck_params = gen_bottleneck_params(backbone) + self.backbone = backbone + self.feat_channels = [128, 2048] + + self._conv1 = ConvBNLayer( + 3, + 32, + 3, + stride=2, + padding=1, + act="relu", + name=self.backbone + "/entry_flow/conv1") + self._conv2 = ConvBNLayer( + 32, + 64, + 3, + stride=1, + padding=1, + act="relu", + name=self.backbone + "/entry_flow/conv2") + """ + bottleneck_params = { + "entry_flow": (3, [2, 2, 2], [128, 256, 728]), + "middle_flow": (16, 1, 728), + "exit_flow": (2, [2, 1], [[728, 1024, 1024], [1536, 1536, 2048]]) + } + + if output_stride == 16: + entry_block3_stride = 2 + middle_block_dilation = 1 + exit_block_dilations = (1, 2) + elif output_stride == 8: + entry_block3_stride = 1 + middle_block_dilation = 2 + exit_block_dilations = (2, 4) + + """ + self.block_num = bottleneck_params["entry_flow"][0] + self.strides = bottleneck_params["entry_flow"][1] + self.chns = bottleneck_params["entry_flow"][2] + self.strides = check_data(self.strides, self.block_num) + self.chns = check_data(self.chns, self.block_num) + + self.entry_flow = [] + self.middle_flow = [] + + self.stride = 2 + self.output_stride = output_stride + s = self.stride + + for i in range(self.block_num): + stride = self.strides[i] if check_stride(s * self.strides[i], + self.output_stride) else 1 + xception_block = self.add_sublayer( + self.backbone + "/entry_flow/block" + str(i + 1), + Xception_Block( + input_channels=64 if i == 0 else self.chns[i - 1], + output_channels=self.chns[i], + strides=[1, 1, self.stride], + name=self.backbone + "/entry_flow/block" + str(i + 1))) + self.entry_flow.append(xception_block) + s = s * stride + self.stride = s + + self.block_num = bottleneck_params["middle_flow"][0] + self.strides = bottleneck_params["middle_flow"][1] + self.chns = bottleneck_params["middle_flow"][2] + self.strides = check_data(self.strides, self.block_num) + self.chns = check_data(self.chns, self.block_num) + s = self.stride + + for i in range(self.block_num): + stride = self.strides[i] if check_stride(s * self.strides[i], + self.output_stride) else 1 + xception_block = self.add_sublayer( + self.backbone + "/middle_flow/block" + str(i + 1), + Xception_Block( + input_channels=728, + output_channels=728, + strides=[1, 1, self.strides[i]], + skip_conv=False, + name=self.backbone + "/middle_flow/block" + str(i + 1))) + self.middle_flow.append(xception_block) + s = s * stride + self.stride = s + + self.block_num = bottleneck_params["exit_flow"][0] + self.strides = bottleneck_params["exit_flow"][1] + self.chns = bottleneck_params["exit_flow"][2] + self.strides = check_data(self.strides, self.block_num) + self.chns = check_data(self.chns, self.block_num) + s = self.stride + stride = self.strides[0] if check_stride(s * self.strides[0], + self.output_stride) else 1 + self._exit_flow_1 = Xception_Block( + 728, + self.chns[0], [1, 1, stride], + name=self.backbone + "/exit_flow/block1") + s = s * stride + stride = self.strides[1] if check_stride(s * self.strides[1], + self.output_stride) else 1 + self._exit_flow_2 = Xception_Block( + self.chns[0][-1], + self.chns[1], [1, 1, stride], + dilation=2, + has_skip=False, + activation_fn_in_separable_conv=True, + name=self.backbone + "/exit_flow/block2") + + self.pretrained = pretrained + self.init_weight() + + def forward(self, inputs): + x = self._conv1(inputs) + x = self._conv2(x) + feat_list = [] + for i, ef in 
enumerate(self.entry_flow): + x = ef(x) + if i == 0: + feat_list.append(x) + for mf in self.middle_flow: + x = mf(x) + x = self._exit_flow_1(x) + x = self._exit_flow_2(x) + feat_list.append(x) + return feat_list + + def init_weight(self): + if self.pretrained is not None: + utils.load_pretrained_model(self, self.pretrained) + + +@manager.BACKBONES.add_component +def Xception41_deeplab(**args): + model = XceptionDeeplab('xception_41', **args) + return model + + +@manager.BACKBONES.add_component +def Xception65_deeplab(**args): + model = XceptionDeeplab("xception_65", **args) + return model + + +@manager.BACKBONES.add_component +def Xception71_deeplab(**args): + model = XceptionDeeplab("xception_71", **args) + return model diff --git a/paddlers/models/ppseg/models/bisenet.py b/paddlers/models/ppseg/models/bisenet.py new file mode 100644 index 0000000..32efc36 --- /dev/null +++ b/paddlers/models/ppseg/models/bisenet.py @@ -0,0 +1,307 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlers.models.ppseg import utils +from paddlers.models.ppseg.cvlibs import manager, param_init +from paddlers.models.ppseg.models import layers + + +@manager.MODELS.add_component +class BiSeNetV2(nn.Layer): + """ + The BiSeNet V2 implementation based on PaddlePaddle. + + The original article refers to + Yu, Changqian, et al. "BiSeNet V2: Bilateral Network with Guided Aggregation for Real-time Semantic Segmentation" + (https://arxiv.org/abs/2004.02147) + + Args: + num_classes (int): The unique number of target classes. + lambd (float, optional): A factor for controlling the size of semantic branch channels. Default: 0.25. + pretrained (str, optional): The path or url of pretrained model. Default: None. 
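+        align_corners (bool, optional): An argument of F.interpolate. It should be set to False
+            when the output size of feature is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
+
+        Example (an illustrative sketch; the input size and class count are assumed):
+
+            model = BiSeNetV2(num_classes=2)
+            model.eval()
+            logit, = model(paddle.rand([1, 3, 512, 512]))  # same spatial size as the input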
+ """ + + def __init__(self, + num_classes, + lambd=0.25, + align_corners=False, + pretrained=None): + super().__init__() + + C1, C2, C3 = 64, 64, 128 + db_channels = (C1, C2, C3) + C1, C3, C4, C5 = int(C1 * lambd), int(C3 * lambd), 64, 128 + sb_channels = (C1, C3, C4, C5) + mid_channels = 128 + + self.db = DetailBranch(db_channels) + self.sb = SemanticBranch(sb_channels) + + self.bga = BGA(mid_channels, align_corners) + self.aux_head1 = SegHead(C1, C1, num_classes) + self.aux_head2 = SegHead(C3, C3, num_classes) + self.aux_head3 = SegHead(C4, C4, num_classes) + self.aux_head4 = SegHead(C5, C5, num_classes) + self.head = SegHead(mid_channels, mid_channels, num_classes) + + self.align_corners = align_corners + self.pretrained = pretrained + self.init_weight() + + def forward(self, x): + dfm = self.db(x) + feat1, feat2, feat3, feat4, sfm = self.sb(x) + logit = self.head(self.bga(dfm, sfm)) + + if not self.training: + logit_list = [logit] + else: + logit1 = self.aux_head1(feat1) + logit2 = self.aux_head2(feat2) + logit3 = self.aux_head3(feat3) + logit4 = self.aux_head4(feat4) + logit_list = [logit, logit1, logit2, logit3, logit4] + + logit_list = [ + F.interpolate( + logit, + paddle.shape(x)[2:], + mode='bilinear', + align_corners=self.align_corners) for logit in logit_list + ] + + return logit_list + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + else: + for sublayer in self.sublayers(): + if isinstance(sublayer, nn.Conv2D): + param_init.kaiming_normal_init(sublayer.weight) + elif isinstance(sublayer, (nn.BatchNorm, nn.SyncBatchNorm)): + param_init.constant_init(sublayer.weight, value=1.0) + param_init.constant_init(sublayer.bias, value=0.0) + + +class StemBlock(nn.Layer): + def __init__(self, in_dim, out_dim): + super(StemBlock, self).__init__() + + self.conv = layers.ConvBNReLU(in_dim, out_dim, 3, stride=2) + + self.left = nn.Sequential( + layers.ConvBNReLU(out_dim, out_dim // 2, 1), + layers.ConvBNReLU(out_dim // 2, out_dim, 3, stride=2)) + + self.right = nn.MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.fuse = layers.ConvBNReLU(out_dim * 2, out_dim, 3) + + def forward(self, x): + x = self.conv(x) + left = self.left(x) + right = self.right(x) + concat = paddle.concat([left, right], axis=1) + return self.fuse(concat) + + +class ContextEmbeddingBlock(nn.Layer): + def __init__(self, in_dim, out_dim): + super(ContextEmbeddingBlock, self).__init__() + + self.gap = nn.AdaptiveAvgPool2D(1) + self.bn = layers.SyncBatchNorm(in_dim) + + self.conv_1x1 = layers.ConvBNReLU(in_dim, out_dim, 1) + self.add = layers.Add() + self.conv_3x3 = nn.Conv2D(out_dim, out_dim, 3, 1, 1) + + def forward(self, x): + gap = self.gap(x) + bn = self.bn(gap) + conv1 = self.add(self.conv_1x1(bn), x) + return self.conv_3x3(conv1) + + +class GatherAndExpansionLayer1(nn.Layer): + """Gather And Expansion Layer with stride 1""" + + def __init__(self, in_dim, out_dim, expand): + super().__init__() + + expand_dim = expand * in_dim + + self.conv = nn.Sequential( + layers.ConvBNReLU(in_dim, in_dim, 3), + layers.DepthwiseConvBN(in_dim, expand_dim, 3), + layers.ConvBN(expand_dim, out_dim, 1)) + self.relu = layers.Activation("relu") + + def forward(self, x): + return self.relu(self.conv(x) + x) + + +class GatherAndExpansionLayer2(nn.Layer): + """Gather And Expansion Layer with stride 2""" + + def __init__(self, in_dim, out_dim, expand): + super().__init__() + + expand_dim = expand * in_dim + + self.branch_1 = nn.Sequential( + layers.ConvBNReLU(in_dim, in_dim, 3), + 
layers.DepthwiseConvBN(in_dim, expand_dim, 3, stride=2), + layers.DepthwiseConvBN(expand_dim, expand_dim, 3), + layers.ConvBN(expand_dim, out_dim, 1)) + + self.branch_2 = nn.Sequential( + layers.DepthwiseConvBN(in_dim, in_dim, 3, stride=2), + layers.ConvBN(in_dim, out_dim, 1)) + + self.relu = layers.Activation("relu") + + def forward(self, x): + return self.relu(self.branch_1(x) + self.branch_2(x)) + + +class DetailBranch(nn.Layer): + """The detail branch of BiSeNet, which has wide channels but shallow layers.""" + + def __init__(self, in_channels): + super().__init__() + + C1, C2, C3 = in_channels + + self.convs = nn.Sequential( + # stage 1 + layers.ConvBNReLU(3, C1, 3, stride=2), + layers.ConvBNReLU(C1, C1, 3), + # stage 2 + layers.ConvBNReLU(C1, C2, 3, stride=2), + layers.ConvBNReLU(C2, C2, 3), + layers.ConvBNReLU(C2, C2, 3), + # stage 3 + layers.ConvBNReLU(C2, C3, 3, stride=2), + layers.ConvBNReLU(C3, C3, 3), + layers.ConvBNReLU(C3, C3, 3), + ) + + def forward(self, x): + return self.convs(x) + + +class SemanticBranch(nn.Layer): + """The semantic branch of BiSeNet, which has narrow channels but deep layers.""" + + def __init__(self, in_channels): + super().__init__() + C1, C3, C4, C5 = in_channels + + self.stem = StemBlock(3, C1) + + self.stage3 = nn.Sequential( + GatherAndExpansionLayer2(C1, C3, 6), + GatherAndExpansionLayer1(C3, C3, 6)) + + self.stage4 = nn.Sequential( + GatherAndExpansionLayer2(C3, C4, 6), + GatherAndExpansionLayer1(C4, C4, 6)) + + self.stage5_4 = nn.Sequential( + GatherAndExpansionLayer2(C4, C5, 6), + GatherAndExpansionLayer1(C5, C5, 6), + GatherAndExpansionLayer1(C5, C5, 6), + GatherAndExpansionLayer1(C5, C5, 6)) + + self.ce = ContextEmbeddingBlock(C5, C5) + + def forward(self, x): + stage2 = self.stem(x) + stage3 = self.stage3(stage2) + stage4 = self.stage4(stage3) + stage5_4 = self.stage5_4(stage4) + fm = self.ce(stage5_4) + return stage2, stage3, stage4, stage5_4, fm + + +class BGA(nn.Layer): + """The Bilateral Guided Aggregation Layer, used to fuse the semantic features and spatial features.""" + + def __init__(self, out_dim, align_corners): + super().__init__() + + self.align_corners = align_corners + + self.db_branch_keep = nn.Sequential( + layers.DepthwiseConvBN(out_dim, out_dim, 3), + nn.Conv2D(out_dim, out_dim, 1)) + + self.db_branch_down = nn.Sequential( + layers.ConvBN(out_dim, out_dim, 3, stride=2), + nn.AvgPool2D(kernel_size=3, stride=2, padding=1)) + + self.sb_branch_keep = nn.Sequential( + layers.DepthwiseConvBN(out_dim, out_dim, 3), + nn.Conv2D(out_dim, out_dim, 1), layers.Activation(act='sigmoid')) + + self.sb_branch_up = layers.ConvBN(out_dim, out_dim, 3) + + self.conv = layers.ConvBN(out_dim, out_dim, 3) + + def forward(self, dfm, sfm): + db_feat_keep = self.db_branch_keep(dfm) + db_feat_down = self.db_branch_down(dfm) + sb_feat_keep = self.sb_branch_keep(sfm) + + sb_feat_up = self.sb_branch_up(sfm) + sb_feat_up = F.interpolate( + sb_feat_up, + paddle.shape(db_feat_keep)[2:], + mode='bilinear', + align_corners=self.align_corners) + + sb_feat_up = F.sigmoid(sb_feat_up) + db_feat = db_feat_keep * sb_feat_up + + sb_feat = db_feat_down * sb_feat_keep + sb_feat = F.interpolate( + sb_feat, + paddle.shape(db_feat)[2:], + mode='bilinear', + align_corners=self.align_corners) + + return self.conv(db_feat + sb_feat) + + +class SegHead(nn.Layer): + def __init__(self, in_dim, mid_dim, num_classes): + super().__init__() + + self.conv_3x3 = nn.Sequential( + layers.ConvBNReLU(in_dim, mid_dim, 3), nn.Dropout(0.1)) + + self.conv_1x1 = nn.Conv2D(mid_dim, 
                                  num_classes, 1, 1)
+
+    def forward(self, x):
+        conv1 = self.conv_3x3(x)
+        conv2 = self.conv_1x1(conv1)
+        return conv2
diff --git a/paddlers/models/ppseg/models/bisenetv1.py b/paddlers/models/ppseg/models/bisenetv1.py
new file mode 100644
index 0000000..c40bb33
--- /dev/null
+++ b/paddlers/models/ppseg/models/bisenetv1.py
@@ -0,0 +1,259 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.models import layers
+from paddlers.models.ppseg.utils import utils
+
+
+@manager.MODELS.add_component
+class BiseNetV1(nn.Layer):
+    """
+    The BiSeNetV1 implementation based on PaddlePaddle.
+
+    The original article refers to
+    Yu, Changqian, et al. "BiSeNet: Bilateral Segmentation Network for Real-time Semantic Segmentation"
+    (https://paperswithcode.com/paper/bisenet-bilateral-segmentation-network-for)
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone (paddle.nn.Layer): Backbone network, currently supports Resnet18_vd/Resnet34_vd/Resnet50_vd/Resnet101_vd.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
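+        conv_channel (int, optional): The number of channels used in the context path and fusion modules. Default: 128.
+
+        Example (an illustrative sketch; `backbone` stands for a ResNet_vd
+            variant built elsewhere, and the input size is assumed):
+
+            model = BiseNetV1(num_classes=2, backbone=backbone)
+            logits = model(paddle.rand([1, 3, 512, 512]))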
+ """ + def __init__(self, + num_classes, + backbone, + conv_channel=128, + pretrained=None): + super().__init__() + self.backbone = backbone + self.spatial_path = SpatialPath(3, 128) + self.global_context = nn.Sequential( + nn.AdaptiveAvgPool2D(1), + layers.ConvBNReLU(512, conv_channel, 1, bias_attr=False), + ) + + self.arms = nn.LayerList([ + AttentionRefinement(512, conv_channel), + AttentionRefinement(256, conv_channel), + ]) + self.refines = nn.LayerList([ + layers.ConvBNReLU(conv_channel, + conv_channel, + 3, + stride=1, + padding=1, + bias_attr=False), + layers.ConvBNReLU(conv_channel, + conv_channel, + 3, + stride=1, + padding=1, + bias_attr=False), + ]) + + self.heads = nn.LayerList([ + BiSeNetHead(conv_channel, num_classes, 8, True), + BiSeNetHead(conv_channel, num_classes, 8, True), + BiSeNetHead(conv_channel * 2, num_classes, 8, False), + ]) + + self.ffm = FeatureFusion(conv_channel * 2, conv_channel * 2, 1) + + self.pretrained = pretrained + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + def forward(self, x): + spatial_out = self.spatial_path(x) + context_blocks = self.backbone(x) + context_blocks.reverse() + + global_context = self.global_context(context_blocks[0]) + global_context = F.interpolate(global_context, + size=paddle.shape(context_blocks[0])[2:], + mode='bilinear', + align_corners=True) + last_fm = global_context + pred_out = [] + + for i, (fm, arm, refine) in enumerate( + zip(context_blocks[:2], self.arms, self.refines)): + fm = arm(fm) + fm += last_fm + last_fm = F.interpolate(fm, + size=paddle.shape(context_blocks[i + + 1])[2:], + mode='bilinear', + align_corners=True) + last_fm = refine(last_fm) + pred_out.append(last_fm) + context_out = last_fm + + concate_fm = self.ffm(spatial_out, context_out) + pred_out.append(concate_fm) + + output = [] + if self.training: + for i, head in enumerate(self.heads): + out = head(pred_out[i]) + output.append(out) + else: + out = self.heads[-1](pred_out[-1]) + output.append(out) + return output + + +class SpatialPath(nn.Layer): + """ + SpatialPath module of BiseNetV1 model + + Args: + in_channels (int): The number of input channels in spatial path module. + out_channels (int): The number of output channels in spatial path module. + """ + def __init__(self, in_channels, out_channels, inner_channel=64): + super().__init__() + self.conv_7x7 = layers.ConvBNReLU(in_channels, + inner_channel, + 7, + stride=2, + padding=3, + bias_attr=False) + self.conv_3x3_1 = layers.ConvBNReLU(inner_channel, + inner_channel, + 3, + stride=2, + padding=1, + bias_attr=False) + self.conv_3x3_2 = layers.ConvBNReLU(inner_channel, + inner_channel, + 3, + stride=2, + padding=1, + bias_attr=False) + self.conv_1x1 = layers.ConvBNReLU(inner_channel, + out_channels, + 1, + bias_attr=False) + + def forward(self, x): + x = self.conv_7x7(x) + x = self.conv_3x3_1(x) + x = self.conv_3x3_2(x) + x = self.conv_1x1(x) + return x + + +class BiSeNetHead(nn.Layer): + """ + BiSeNet head of BiseNetV1 model + + Args: + in_channels (int): The number of input channels in spatial path module. + out_channels (int): The number of output channels in spatial path module. + scale (int, float): The scale factor of interpolation. 
+ """ + def __init__(self, in_channels, out_channels, scale, is_aux=False): + super().__init__() + inner_channel = 128 if is_aux else 64 + self.conv_3x3 = layers.ConvBNReLU(in_channels, + inner_channel, + 3, + stride=1, + padding=1, + bias_attr=False) + self.conv_1x1 = nn.Conv2D(inner_channel, out_channels, 1) + self.scale = scale + + def forward(self, x): + x = self.conv_3x3(x) + x = self.conv_1x1(x) + if self.scale > 1: + x = F.interpolate(x, + scale_factor=self.scale, + mode='bilinear', + align_corners=True) + return x + + +class AttentionRefinement(nn.Layer): + """ + AttentionRefinement module of BiseNetV1 model + + Args: + in_channels (int): The number of input channels in spatial path module. + out_channels (int): The number of output channels in spatial path module. + """ + def __init__(self, in_channels, out_channels): + super().__init__() + self.conv_3x3 = layers.ConvBNReLU(in_channels, + out_channels, + 3, + stride=1, + padding=1, + bias_attr=False) + self.channel_attention = nn.Sequential( + nn.AdaptiveAvgPool2D(1), + layers.ConvBNReLU(out_channels, out_channels, 1, bias_attr=False), + nn.Sigmoid(), + ) + + def forward(self, x): + x = self.conv_3x3(x) + se = self.channel_attention(x) + x = x * se + return x + + +class FeatureFusion(nn.Layer): + """ + AttentionRefinement module of BiseNetV1 model + + Args: + in_channels (int): The number of input channels in spatial path module. + out_channels (int): The number of output channels in spatial path module. + reduction (int): A factor shrinks convolutional channels. Default: 1. + """ + def __init__(self, in_channels, out_channels, reduction=1): + super().__init__() + self.conv_1x1 = layers.ConvBNReLU(in_channels, + out_channels, + 1, + bias_attr=False) + self.channel_attention = nn.Sequential( + nn.AdaptiveAvgPool2D(1), + layers.ConvBNReLU(out_channels, + out_channels // reduction, + 1, + bias_attr=False), + layers.ConvBNReLU(out_channels // reduction, + out_channels, + 1, + bias_attr=False), + nn.Sigmoid(), + ) + + def forward(self, x1, x2): + fm = paddle.concat([x1, x2], axis=1) + fm = self.conv_1x1(fm) + fm_se = self.channel_attention(fm) + output = fm + fm * fm_se + return output diff --git a/paddlers/models/ppseg/models/danet.py b/paddlers/models/ppseg/models/danet.py new file mode 100644 index 0000000..9db4e01 --- /dev/null +++ b/paddlers/models/ppseg/models/danet.py @@ -0,0 +1,218 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.models import layers +from paddlers.models.ppseg.utils import utils + + +@manager.MODELS.add_component +class DANet(nn.Layer): + """ + The DANet implementation based on PaddlePaddle. + + The original article refers to + Fu, jun, et al. 
"Dual Attention Network for Scene Segmentation" + (https://arxiv.org/pdf/1809.02983.pdf) + + Args: + num_classes (int): The unique number of target classes. + backbone (Paddle.nn.Layer): A backbone network. + backbone_indices (tuple): The values in the tuple indicate the indices of + output of backbone. + align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature + is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. + pretrained (str, optional): The path or url of pretrained model. Default: None. + """ + + def __init__(self, + num_classes, + backbone, + backbone_indices, + align_corners=False, + pretrained=None): + super().__init__() + + self.backbone = backbone + self.backbone_indices = backbone_indices + in_channels = [self.backbone.feat_channels[i] for i in backbone_indices] + + self.head = DAHead(num_classes=num_classes, in_channels=in_channels) + + self.align_corners = align_corners + self.pretrained = pretrained + self.init_weight() + + def forward(self, x): + feats = self.backbone(x) + feats = [feats[i] for i in self.backbone_indices] + logit_list = self.head(feats) + if not self.training: + logit_list = [logit_list[0]] + + logit_list = [ + F.interpolate( + logit, + paddle.shape(x)[2:], + mode='bilinear', + align_corners=self.align_corners, + align_mode=1) for logit in logit_list + ] + return logit_list + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class DAHead(nn.Layer): + """ + The Dual attention head. + + Args: + num_classes (int): The unique number of target classes. + in_channels (tuple): The number of input channels. + """ + + def __init__(self, num_classes, in_channels): + super().__init__() + in_channels = in_channels[-1] + inter_channels = in_channels // 4 + + self.channel_conv = layers.ConvBNReLU(in_channels, inter_channels, 3) + self.position_conv = layers.ConvBNReLU(in_channels, inter_channels, 3) + self.pam = PAM(inter_channels) + self.cam = CAM(inter_channels) + self.conv1 = layers.ConvBNReLU(inter_channels, inter_channels, 3) + self.conv2 = layers.ConvBNReLU(inter_channels, inter_channels, 3) + + self.aux_head = nn.Sequential( + nn.Dropout2D(0.1), nn.Conv2D(in_channels, num_classes, 1)) + + self.aux_head_pam = nn.Sequential( + nn.Dropout2D(0.1), nn.Conv2D(inter_channels, num_classes, 1)) + + self.aux_head_cam = nn.Sequential( + nn.Dropout2D(0.1), nn.Conv2D(inter_channels, num_classes, 1)) + + self.cls_head = nn.Sequential( + nn.Dropout2D(0.1), nn.Conv2D(inter_channels, num_classes, 1)) + + def forward(self, feat_list): + feats = feat_list[-1] + channel_feats = self.channel_conv(feats) + channel_feats = self.cam(channel_feats) + channel_feats = self.conv1(channel_feats) + + position_feats = self.position_conv(feats) + position_feats = self.pam(position_feats) + position_feats = self.conv2(position_feats) + + feats_sum = position_feats + channel_feats + logit = self.cls_head(feats_sum) + + if not self.training: + return [logit] + + cam_logit = self.aux_head_cam(channel_feats) + pam_logit = self.aux_head_cam(position_feats) + aux_logit = self.aux_head(feats) + return [logit, cam_logit, pam_logit, aux_logit] + + +class PAM(nn.Layer): + """Position attention module.""" + + def __init__(self, in_channels): + super().__init__() + mid_channels = in_channels // 8 + self.mid_channels = mid_channels + self.in_channels = in_channels + + self.query_conv = nn.Conv2D(in_channels, mid_channels, 1, 1) + self.key_conv = nn.Conv2D(in_channels, 
mid_channels, 1, 1) + self.value_conv = nn.Conv2D(in_channels, in_channels, 1, 1) + + self.gamma = self.create_parameter( + shape=[1], + dtype='float32', + default_initializer=nn.initializer.Constant(0)) + + def forward(self, x): + x_shape = paddle.shape(x) + + # query: n, h * w, c1 + query = self.query_conv(x) + query = paddle.reshape(query, (0, self.mid_channels, -1)) + query = paddle.transpose(query, (0, 2, 1)) + + # key: n, c1, h * w + key = self.key_conv(x) + key = paddle.reshape(key, (0, self.mid_channels, -1)) + + # sim: n, h * w, h * w + sim = paddle.bmm(query, key) + sim = F.softmax(sim, axis=-1) + + value = self.value_conv(x) + value = paddle.reshape(value, (0, self.in_channels, -1)) + sim = paddle.transpose(sim, (0, 2, 1)) + + # feat: from (n, c2, h * w) -> (n, c2, h, w) + feat = paddle.bmm(value, sim) + feat = paddle.reshape(feat, + (0, self.in_channels, x_shape[2], x_shape[3])) + + out = self.gamma * feat + x + return out + + +class CAM(nn.Layer): + """Channel attention module.""" + + def __init__(self, channels): + super().__init__() + + self.channels = channels + self.gamma = self.create_parameter( + shape=[1], + dtype='float32', + default_initializer=nn.initializer.Constant(0)) + + def forward(self, x): + x_shape = paddle.shape(x) + # query: n, c, h * w + query = paddle.reshape(x, (0, self.channels, -1)) + # key: n, h * w, c + key = paddle.reshape(x, (0, self.channels, -1)) + key = paddle.transpose(key, (0, 2, 1)) + + # sim: n, c, c + sim = paddle.bmm(query, key) + # The danet author claims that this can avoid gradient divergence + sim = paddle.max( + sim, axis=-1, keepdim=True).tile([1, 1, self.channels]) - sim + sim = F.softmax(sim, axis=-1) + + # feat: from (n, c, h * w) to (n, c, h, w) + value = paddle.reshape(x, (0, self.channels, -1)) + feat = paddle.bmm(sim, value) + feat = paddle.reshape(feat, (0, self.channels, x_shape[2], x_shape[3])) + + out = self.gamma * feat + x + return out diff --git a/paddlers/models/ppseg/models/decoupled_segnet.py b/paddlers/models/ppseg/models/decoupled_segnet.py new file mode 100644 index 0000000..08dae29 --- /dev/null +++ b/paddlers/models/ppseg/models/decoupled_segnet.py @@ -0,0 +1,228 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import cv2 +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.models import layers +from paddlers.models.ppseg.models.backbones import resnet_vd +from paddlers.models.ppseg.models import deeplab +from paddlers.models.ppseg.utils import utils + + +@manager.MODELS.add_component +class DecoupledSegNet(nn.Layer): + """ + The DecoupledSegNet implementation based on PaddlePaddle. + + The original article refers to + Xiangtai Li, et, al. "Improving Semantic Segmentation via Decoupled Body and Edge Supervision" + (https://arxiv.org/pdf/2007.10035.pdf) + + Args: + num_classes (int): The unique number of target classes. 
+        backbone (paddle.nn.Layer): Backbone network, currently support Resnet50_vd/Resnet101_vd.
+        backbone_indices (tuple, optional): Two values in the tuple indicate the indices of output of backbone.
+            Default: (0, 3).
+        aspp_ratios (tuple, optional): The dilation rates used in the ASPP module.
+            If output_stride=16, aspp_ratios should be set as (1, 6, 12, 18).
+            If output_stride=8, aspp_ratios is (1, 12, 24, 36).
+            Default: (1, 6, 12, 18).
+        aspp_out_channels (int, optional): The output channels of ASPP module. Default: 256.
+        align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
+            e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 backbone,
+                 backbone_indices=(0, 3),
+                 aspp_ratios=(1, 6, 12, 18),
+                 aspp_out_channels=256,
+                 align_corners=False,
+                 pretrained=None):
+        super().__init__()
+        self.backbone = backbone
+        backbone_channels = self.backbone.feat_channels
+        self.head = DecoupledSegNetHead(num_classes, backbone_indices,
+                                        backbone_channels, aspp_ratios,
+                                        aspp_out_channels, align_corners)
+        self.align_corners = align_corners
+        self.pretrained = pretrained
+        self.init_weight()
+
+    def forward(self, x):
+        feat_list = self.backbone(x)
+        logit_list = self.head(feat_list)
+
+        seg_logit, body_logit, edge_logit = [
+            F.interpolate(
+                logit,
+                paddle.shape(x)[2:],
+                mode='bilinear',
+                align_corners=self.align_corners) for logit in logit_list
+        ]
+
+        if self.training:
+            return [seg_logit, body_logit, edge_logit, (seg_logit, edge_logit)]
+        return [seg_logit]
+
+    def init_weight(self):
+        if self.pretrained is not None:
+            utils.load_entire_model(self, self.pretrained)
+
+
+class DecoupledSegNetHead(nn.Layer):
+    """
+    The DecoupledSegNetHead implementation based on PaddlePaddle.
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone_indices (tuple): Two values in the tuple indicate the indices of output of backbone.
+            The first index will be taken as a low-level feature in the edge preservation component;
+            the second one will be taken as input of ASPP component.
+        backbone_channels (tuple): The channels of output of backbone.
+        aspp_ratios (tuple): The dilation rates used in the ASPP module.
+        aspp_out_channels (int): The output channels of ASPP module.
+        align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
+            is even, e.g. 1024x512, otherwise it is True, e.g. 769x769.
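+
+    The head returns a list of three logits: the fused segmentation logit, the
+    body logit and the edge logit, which are supervised jointly during training.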
+ """ + + def __init__(self, num_classes, backbone_indices, backbone_channels, + aspp_ratios, aspp_out_channels, align_corners): + super().__init__() + self.backbone_indices = backbone_indices + self.align_corners = align_corners + self.aspp = layers.ASPPModule( + aspp_ratios=aspp_ratios, + in_channels=backbone_channels[backbone_indices[1]], + out_channels=aspp_out_channels, + align_corners=align_corners, + image_pooling=True) + + self.bot_fine = nn.Conv2D( + backbone_channels[backbone_indices[0]], 48, 1, bias_attr=False) + # decoupled + self.squeeze_body_edge = SqueezeBodyEdge( + 256, align_corners=self.align_corners) + self.edge_fusion = nn.Conv2D(256 + 48, 256, 1, bias_attr=False) + self.sigmoid_edge = nn.Sigmoid() + self.edge_out = nn.Sequential( + layers.ConvBNReLU( + in_channels=256, + out_channels=48, + kernel_size=3, + bias_attr=False), nn.Conv2D(48, 1, 1, bias_attr=False)) + self.dsn_seg_body = nn.Sequential( + layers.ConvBNReLU( + in_channels=256, + out_channels=256, + kernel_size=3, + bias_attr=False), nn.Conv2D( + 256, num_classes, 1, bias_attr=False)) + + self.final_seg = nn.Sequential( + layers.ConvBNReLU( + in_channels=512, + out_channels=256, + kernel_size=3, + bias_attr=False), + layers.ConvBNReLU( + in_channels=256, + out_channels=256, + kernel_size=3, + bias_attr=False), + nn.Conv2D(256, num_classes, kernel_size=1, bias_attr=False)) + + def forward(self, feat_list): + fine_fea = feat_list[self.backbone_indices[0]] + fine_size = paddle.shape(fine_fea) + x = feat_list[self.backbone_indices[1]] + aspp = self.aspp(x) + + # decoupled + seg_body, seg_edge = self.squeeze_body_edge(aspp) + # Edge presevation and edge out + fine_fea = self.bot_fine(fine_fea) + seg_edge = F.interpolate( + seg_edge, + fine_size[2:], + mode='bilinear', + align_corners=self.align_corners) + seg_edge = self.edge_fusion(paddle.concat([seg_edge, fine_fea], axis=1)) + seg_edge_out = self.edge_out(seg_edge) + seg_edge_out = self.sigmoid_edge(seg_edge_out) # seg_edge output + seg_body_out = self.dsn_seg_body(seg_body) # body out + + # seg_final out + seg_out = seg_edge + F.interpolate( + seg_body, + fine_size[2:], + mode='bilinear', + align_corners=self.align_corners) + aspp = F.interpolate( + aspp, + fine_size[2:], + mode='bilinear', + align_corners=self.align_corners) + seg_out = paddle.concat([aspp, seg_out], axis=1) + seg_final_out = self.final_seg(seg_out) + + return [seg_final_out, seg_body_out, seg_edge_out] + + +class SqueezeBodyEdge(nn.Layer): + def __init__(self, inplane, align_corners=False): + super().__init__() + self.align_corners = align_corners + self.down = nn.Sequential( + layers.ConvBNReLU( + inplane, inplane, kernel_size=3, groups=inplane, stride=2), + layers.ConvBNReLU( + inplane, inplane, kernel_size=3, groups=inplane, stride=2)) + self.flow_make = nn.Conv2D( + inplane * 2, 2, kernel_size=3, padding='same', bias_attr=False) + + def forward(self, x): + size = paddle.shape(x)[2:] + seg_down = self.down(x) + seg_down = F.interpolate( + seg_down, + size=size, + mode='bilinear', + align_corners=self.align_corners) + flow = self.flow_make(paddle.concat([x, seg_down], axis=1)) + seg_flow_warp = self.flow_warp(x, flow, size) + seg_edge = x - seg_flow_warp + return seg_flow_warp, seg_edge + + def flow_warp(self, input, flow, size): + input_shape = paddle.shape(input) + norm = size[::-1].reshape([1, 1, 1, -1]) + norm.stop_gradient = True + h_grid = paddle.linspace(-1.0, 1.0, size[0]).reshape([-1, 1]) + h_grid = h_grid.tile([size[1]]) + w_grid = paddle.linspace(-1.0, 1.0, size[1]).reshape([-1, 
+        w_grid = w_grid.tile([size[0]]).transpose([1, 0])
+        grid = paddle.concat([w_grid.unsqueeze(2), h_grid.unsqueeze(2)], axis=2)
+        grid = grid.unsqueeze(0).tile([input_shape[0], 1, 1, 1])
+        grid = grid + paddle.transpose(flow, (0, 2, 3, 1)) / norm
+
+        output = F.grid_sample(input, grid)
+        return output
diff --git a/paddlers/models/ppseg/models/deeplab.py b/paddlers/models/ppseg/models/deeplab.py
new file mode 100644
index 0000000..3e50572
--- /dev/null
+++ b/paddlers/models/ppseg/models/deeplab.py
@@ -0,0 +1,308 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.models import layers
+from paddlers.models.ppseg.utils import utils
+
+__all__ = ['DeepLabV3P', 'DeepLabV3']
+
+
+@manager.MODELS.add_component
+class DeepLabV3P(nn.Layer):
+    """
+    The DeepLabV3Plus implementation based on PaddlePaddle.
+
+    The original article refers to
+    Liang-Chieh Chen, et, al. "Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation"
+    (https://arxiv.org/abs/1802.02611)
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone (paddle.nn.Layer): Backbone network, currently support Resnet50_vd/Resnet101_vd/Xception65.
+        backbone_indices (tuple, optional): Two values in the tuple indicate the indices of output of backbone.
+            Default: (0, 3).
+        aspp_ratios (tuple, optional): The dilation rates used in the ASPP module.
+            If output_stride=16, aspp_ratios should be set as (1, 6, 12, 18).
+            If output_stride=8, aspp_ratios is (1, 12, 24, 36).
+            Default: (1, 6, 12, 18).
+        aspp_out_channels (int, optional): The output channels of ASPP module. Default: 256.
+        align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
+            e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+        data_format(str, optional): Data format that specifies the layout of input. It can be "NCHW" or "NHWC". Default: "NCHW".
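+
+    A minimal usage sketch (assuming the ResNet50_vd backbone exported by
+    paddlers.models.ppseg.models.backbones, built with output_stride=8 and the
+    matching aspp_ratios):
+
+        backbone = ResNet50_vd(output_stride=8)
+        model = DeepLabV3P(num_classes=2, backbone=backbone,
+                           aspp_ratios=(1, 12, 24, 36))
+        logits = model(paddle.rand([1, 3, 512, 512]))[0]  # [1, 2, 512, 512]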
+ """ + + def __init__(self, + num_classes, + backbone, + backbone_indices=(0, 3), + aspp_ratios=(1, 6, 12, 18), + aspp_out_channels=256, + align_corners=False, + pretrained=None, + data_format="NCHW"): + super().__init__() + + self.backbone = backbone + backbone_channels = [ + backbone.feat_channels[i] for i in backbone_indices + ] + + self.head = DeepLabV3PHead( + num_classes, + backbone_indices, + backbone_channels, + aspp_ratios, + aspp_out_channels, + align_corners, + data_format=data_format) + + self.align_corners = align_corners + self.pretrained = pretrained + self.data_format = data_format + self.init_weight() + + def forward(self, x): + feat_list = self.backbone(x) + logit_list = self.head(feat_list) + if self.data_format == 'NCHW': + ori_shape = paddle.shape(x)[2:] + else: + ori_shape = paddle.shape(x)[1:3] + return [ + F.interpolate( + logit, + ori_shape, + mode='bilinear', + align_corners=self.align_corners, + data_format=self.data_format) for logit in logit_list + ] + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class DeepLabV3PHead(nn.Layer): + """ + The DeepLabV3PHead implementation based on PaddlePaddle. + + Args: + num_classes (int): The unique number of target classes. + backbone_indices (tuple): Two values in the tuple indicate the indices of output of backbone. + the first index will be taken as a low-level feature in Decoder component; + the second one will be taken as input of ASPP component. + Usually backbone consists of four downsampling stage, and return an output of + each stage. If we set it as (0, 3), it means taking feature map of the first + stage in backbone as low-level feature used in Decoder, and feature map of the fourth + stage as input of ASPP. + backbone_channels (tuple): The same length with "backbone_indices". It indicates the channels of corresponding index. + aspp_ratios (tuple): The dilation rates using in ASSP module. + aspp_out_channels (int): The output channels of ASPP module. + align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature + is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. + data_format(str, optional): Data format that specifies the layout of input. It can be "NCHW" or "NHWC". Default: "NCHW". + """ + + def __init__(self, + num_classes, + backbone_indices, + backbone_channels, + aspp_ratios, + aspp_out_channels, + align_corners, + data_format='NCHW'): + super().__init__() + + self.aspp = layers.ASPPModule( + aspp_ratios, + backbone_channels[1], + aspp_out_channels, + align_corners, + use_sep_conv=True, + image_pooling=True, + data_format=data_format) + self.decoder = Decoder( + num_classes, + backbone_channels[0], + align_corners, + data_format=data_format) + self.backbone_indices = backbone_indices + + def forward(self, feat_list): + logit_list = [] + low_level_feat = feat_list[self.backbone_indices[0]] + x = feat_list[self.backbone_indices[1]] + x = self.aspp(x) + logit = self.decoder(x, low_level_feat) + logit_list.append(logit) + + return logit_list + + +@manager.MODELS.add_component +class DeepLabV3(nn.Layer): + """ + The DeepLabV3 implementation based on PaddlePaddle. + + The original article refers to + Liang-Chieh Chen, et, al. "Rethinking Atrous Convolution for Semantic Image Segmentation" + (https://arxiv.org/pdf/1706.05587.pdf). + + Args: + Please Refer to DeepLabV3P above. 
+ """ + + def __init__(self, + num_classes, + backbone, + backbone_indices=(3, ), + aspp_ratios=(1, 6, 12, 18), + aspp_out_channels=256, + align_corners=False, + pretrained=None): + super().__init__() + + self.backbone = backbone + backbone_channels = [ + backbone.feat_channels[i] for i in backbone_indices + ] + + self.head = DeepLabV3Head(num_classes, backbone_indices, + backbone_channels, aspp_ratios, + aspp_out_channels, align_corners) + self.align_corners = align_corners + self.pretrained = pretrained + self.init_weight() + + def forward(self, x): + feat_list = self.backbone(x) + logit_list = self.head(feat_list) + return [ + F.interpolate( + logit, + paddle.shape(x)[2:], + mode='bilinear', + align_corners=self.align_corners) for logit in logit_list + ] + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class DeepLabV3Head(nn.Layer): + """ + The DeepLabV3Head implementation based on PaddlePaddle. + + Args: + Please Refer to DeepLabV3PHead above. + """ + + def __init__(self, num_classes, backbone_indices, backbone_channels, + aspp_ratios, aspp_out_channels, align_corners): + super().__init__() + + self.aspp = layers.ASPPModule( + aspp_ratios, + backbone_channels[0], + aspp_out_channels, + align_corners, + use_sep_conv=False, + image_pooling=True) + + self.cls = nn.Conv2D( + in_channels=aspp_out_channels, + out_channels=num_classes, + kernel_size=1) + + self.backbone_indices = backbone_indices + + def forward(self, feat_list): + logit_list = [] + x = feat_list[self.backbone_indices[0]] + x = self.aspp(x) + logit = self.cls(x) + logit_list.append(logit) + + return logit_list + + +class Decoder(nn.Layer): + """ + Decoder module of DeepLabV3P model + + Args: + num_classes (int): The number of classes. + in_channels (int): The number of input channels in decoder module. + """ + + def __init__(self, + num_classes, + in_channels, + align_corners, + data_format='NCHW'): + super(Decoder, self).__init__() + + self.data_format = data_format + self.conv_bn_relu1 = layers.ConvBNReLU( + in_channels=in_channels, + out_channels=48, + kernel_size=1, + data_format=data_format) + + self.conv_bn_relu2 = layers.SeparableConvBNReLU( + in_channels=304, + out_channels=256, + kernel_size=3, + padding=1, + data_format=data_format) + self.conv_bn_relu3 = layers.SeparableConvBNReLU( + in_channels=256, + out_channels=256, + kernel_size=3, + padding=1, + data_format=data_format) + self.conv = nn.Conv2D( + in_channels=256, + out_channels=num_classes, + kernel_size=1, + data_format=data_format) + + self.align_corners = align_corners + + def forward(self, x, low_level_feat): + low_level_feat = self.conv_bn_relu1(low_level_feat) + if self.data_format == 'NCHW': + low_level_shape = paddle.shape(low_level_feat)[-2:] + axis = 1 + else: + low_level_shape = paddle.shape(low_level_feat)[1:3] + axis = -1 + x = F.interpolate( + x, + low_level_shape, + mode='bilinear', + align_corners=self.align_corners, + data_format=self.data_format) + x = paddle.concat([x, low_level_feat], axis=axis) + x = self.conv_bn_relu2(x) + x = self.conv_bn_relu3(x) + x = self.conv(x) + return x diff --git a/paddlers/models/ppseg/models/dmnet.py b/paddlers/models/ppseg/models/dmnet.py new file mode 100644 index 0000000..1c4d753 --- /dev/null +++ b/paddlers/models/ppseg/models/dmnet.py @@ -0,0 +1,149 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.models import layers +from paddlers.models.ppseg.utils import utils + + +@manager.MODELS.add_component +class DMNet(nn.Layer): + """ + The DMNet implementation based on PaddlePaddle. + + The original article refers to + Junjun He, Zhongying Deng, Yu Qiao. "Dynamic Multi-scale Filters for Semantic Segmentation" + + Args: + num_classes (int): The unique number of target classes. + backbone (paddle.nn.Layer): Backbone network, currently support Resnet50_vd/Resnet101_vd. + mid_channels (int): The middle channels of convolution layer. Default: 512. + filter_sizes (list, tuple): The filter size of generated convolution kernel used in Dynamic Convolutional Module. Default: [1, 3, 5, 7]. + fusion (bool): Add one conv to fuse DCM output feature. Default: False. + pretrained (str, optional): The path or url of pretrained model. Default: None. + """ + def __init__(self, + num_classes, + backbone, + mid_channels=512, + filter_sizes=[1, 3, 5, 7], + fusion=False, + pretrained=None): + super().__init__() + self.backbone = backbone + self.dcm_modules = nn.LayerList() + for filter_size in filter_sizes: + self.dcm_modules.append( + DCM(filter_size, fusion, self.backbone.feat_channels[-1], + mid_channels), ) + self.bottleneck = layers.ConvBNReLU( + self.backbone.feat_channels[-1] + len(filter_sizes) * mid_channels, + mid_channels, + 3, + padding=1, + ) + self.cls = nn.Conv2D(mid_channels, num_classes, 1) + + self.fcn_head = nn.Sequential( + layers.ConvBNReLU(self.backbone.feat_channels[2], + mid_channels, + 3, + padding=1), + nn.Conv2D(mid_channels, num_classes, 1), + ) + + self.pretrained = pretrained + self.init_weight() + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + def forward(self, x): + feats = self.backbone(x) + x = feats[-1] + dcm_outs = [x] + for dcm_module in self.dcm_modules: + dcm_outs.append(dcm_module(x)) + dcm_outs = paddle.concat(dcm_outs, axis=1) + x = self.bottleneck(dcm_outs) + x = self.cls(x) + x = F.interpolate(x, + scale_factor=8, + mode='bilinear', + align_corners=True) + output = [x] + if self.training: + fcn_out = self.fcn_head(feats[2]) + fcn_out = F.interpolate(fcn_out, + scale_factor=8, + mode='bilinear', + align_corners=True) + output.append(fcn_out) + return output + return output + + +class DCM(nn.Layer): + """ + Dynamic Convolutional Module used in DMNet. + + Args: + filter_size (int): The filter size of generated convolution kernel used in Dynamic Convolutional Module. + fusion (bool): Add one conv to fuse DCM output feature. + in_channels (int): Input channels. + channels (int): Channels after modules, before conv_seg. 
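+
+    The generated filters are applied with a single grouped convolution: the
+    input is reshaped to [1, b * c, h, w] and convolved with groups=b * c, so
+    each channel of each sample is filtered by its own dynamically generated
+    filter_size x filter_size kernel.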
+ """ + def __init__(self, filter_size, fusion, in_channels, channels): + super().__init__() + self.filter_size = filter_size + self.fusion = fusion + self.channels = channels + + pad = (self.filter_size - 1) // 2 + if (self.filter_size - 1) % 2 == 0: + self.pad = (pad, pad, pad, pad) + else: + self.pad = (pad + 1, pad, pad + 1, pad) + + self.avg_pool = nn.AdaptiveAvgPool2D(filter_size) + self.filter_gen_conv = nn.Conv2D(in_channels, channels, 1) + self.input_redu_conv = layers.ConvBNReLU(in_channels, channels, 1) + + self.norm = layers.SyncBatchNorm(channels) + self.act = nn.ReLU() + + if self.fusion: + self.fusion_conv = layers.ConvBNReLU(channels, channels, 1) + + def forward(self, x): + generated_filter = self.filter_gen_conv(self.avg_pool(x)) + x = self.input_redu_conv(x) + b, c, h, w = x.shape + x = x.reshape([1, b * c, h, w]) + generated_filter = generated_filter.reshape( + [b * c, 1, self.filter_size, self.filter_size]) + + x = F.pad(x, self.pad, mode='constant', value=0) + output = F.conv2d(x, weight=generated_filter, groups=b * c) + output = output.reshape([b, self.channels, h, w]) + output = self.norm(output) + output = self.act(output) + if self.fusion: + output = self.fusion_conv(output) + return output diff --git a/paddlers/models/ppseg/models/dnlnet.py b/paddlers/models/ppseg/models/dnlnet.py new file mode 100644 index 0000000..79737eb --- /dev/null +++ b/paddlers/models/ppseg/models/dnlnet.py @@ -0,0 +1,226 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlers.models.ppseg.models import layers +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.utils import utils + + +@manager.MODELS.add_component +class DNLNet(nn.Layer): + """Disentangled Non-Local Neural Networks. + + The original article refers to + Minghao Yin, et al. "Disentangled Non-Local Neural Networks" + (https://arxiv.org/abs/2006.06668) + Args: + num_classes (int): The unique number of target classes. + backbone (Paddle.nn.Layer): A backbone network. + backbone_indices (tuple): The values in the tuple indicate the indices of output of backbone. + reduction (int): Reduction factor of projection transform. Default: 2. + use_scale (bool): Whether to scale pairwise_weight by + sqrt(1/inter_channels). Default: False. + mode (str): The nonlocal mode. Options are 'embedded_gaussian', + 'dot_product'. Default: 'embedded_gaussian'. + temperature (float): Temperature to adjust attention. Default: 0.05. + concat_input (bool): Whether concat the input and output of convs before classification layer. Default: True + enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True. + align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature + is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. 
+ pretrained (str, optional): The path or url of pretrained model. Default: None. + """ + + def __init__(self, + num_classes, + backbone, + backbone_indices=(2, 3), + reduction=2, + use_scale=True, + mode='embedded_gaussian', + temperature=0.05, + concat_input=True, + enable_auxiliary_loss=True, + align_corners=False, + pretrained=None): + super().__init__() + self.backbone = backbone + self.backbone_indices = backbone_indices + in_channels = [self.backbone.feat_channels[i] for i in backbone_indices] + self.head = DNLHead(num_classes, in_channels, reduction, use_scale, + mode, temperature, concat_input, + enable_auxiliary_loss) + self.align_corners = align_corners + self.pretrained = pretrained + self.init_weight() + + def forward(self, x): + feats = self.backbone(x) + feats = [feats[i] for i in self.backbone_indices] + logit_list = self.head(feats) + logit_list = [ + F.interpolate( + logit, + paddle.shape(x)[2:], + mode='bilinear', + align_corners=self.align_corners, + align_mode=1) for logit in logit_list + ] + return logit_list + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class DNLHead(nn.Layer): + """ + The DNLNet head. + + Args: + num_classes (int): The unique number of target classes. + in_channels (tuple): The number of input channels. + reduction (int): Reduction factor of projection transform. Default: 2. + use_scale (bool): Whether to scale pairwise_weight by + sqrt(1/inter_channels). Default: False. + mode (str): The nonlocal mode. Options are 'embedded_gaussian', + 'dot_product'. Default: 'embedded_gaussian.'. + temperature (float): Temperature to adjust attention. Default: 0.05 + concat_input (bool): Whether concat the input and output of convs before classification layer. Default: True + enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True. 
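+
+    The head applies a disentangled non-local block between two 3x3 conv layers
+    on the last backbone feature map (C4); the auxiliary head consumes the
+    earlier feature map (C3).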
+ """ + + def __init__(self, + num_classes, + in_channels, + reduction, + use_scale, + mode, + temperature, + concat_input=True, + enable_auxiliary_loss=True, + **kwargs): + super(DNLHead, self).__init__() + self.in_channels = in_channels[-1] + self.concat_input = concat_input + self.enable_auxiliary_loss = enable_auxiliary_loss + inter_channels = self.in_channels // 4 + + self.dnl_block = DisentangledNonLocal2D( + in_channels=inter_channels, + reduction=reduction, + use_scale=use_scale, + temperature=temperature, + mode=mode) + self.conv0 = layers.ConvBNReLU( + in_channels=self.in_channels, + out_channels=inter_channels, + kernel_size=3, + bias_attr=False) + self.conv1 = layers.ConvBNReLU( + in_channels=inter_channels, + out_channels=inter_channels, + kernel_size=3, + bias_attr=False) + self.cls = nn.Sequential( + nn.Dropout2D(p=0.1), nn.Conv2D(inter_channels, num_classes, 1)) + self.aux = nn.Sequential( + layers.ConvBNReLU( + in_channels=1024, + out_channels=256, + kernel_size=3, + bias_attr=False), nn.Dropout2D(p=0.1), + nn.Conv2D(256, num_classes, 1)) + if self.concat_input: + self.conv_cat = layers.ConvBNReLU( + self.in_channels + inter_channels, + inter_channels, + kernel_size=3, + bias_attr=False) + + def forward(self, feat_list): + C3, C4 = feat_list + output = self.conv0(C4) + output = self.dnl_block(output) + output = self.conv1(output) + if self.concat_input: + output = self.conv_cat(paddle.concat([C4, output], axis=1)) + output = self.cls(output) + if self.enable_auxiliary_loss: + auxout = self.aux(C3) + return [output, auxout] + else: + return [output] + + +class DisentangledNonLocal2D(layers.NonLocal2D): + """Disentangled Non-Local Blocks. + + Args: + temperature (float): Temperature to adjust attention. + """ + + def __init__(self, temperature, *arg, **kwargs): + super().__init__(*arg, **kwargs) + self.temperature = temperature + self.conv_mask = nn.Conv2D(self.in_channels, 1, kernel_size=1) + + def embedded_gaussian(self, theta_x, phi_x): + pairwise_weight = paddle.matmul(theta_x, phi_x) + if self.use_scale: + pairwise_weight /= theta_x.shape[-1]**0.5 + pairwise_weight /= self.temperature + pairwise_weight = F.softmax(pairwise_weight, -1) + return pairwise_weight + + def forward(self, x): + x_shape = paddle.shape(x) + g_x = self.g(x).reshape([0, self.inter_channels, + -1]).transpose([0, 2, 1]) + + if self.mode == "gaussian": + theta_x = paddle.transpose( + x.reshape([0, self.in_channels, -1]), [0, 2, 1]) + if self.sub_sample: + phi_x = paddle.transpose(self.phi(x), [0, self.in_channels, -1]) + else: + phi_x = paddle.transpose(x, [0, self.in_channels, -1]) + + elif self.mode == "concatenation": + theta_x = paddle.reshape( + self.theta(x), [0, self.inter_channels, -1, 1]) + phi_x = paddle.reshape(self.phi(x), [0, self.inter_channels, 1, -1]) + + else: + theta_x = self.theta(x).reshape([0, self.inter_channels, + -1]).transpose([0, 2, 1]) + phi_x = paddle.reshape(self.phi(x), [0, self.inter_channels, -1]) + + theta_x -= paddle.mean(theta_x, axis=-2, keepdim=True) + phi_x -= paddle.mean(phi_x, axis=-1, keepdim=True) + + pairwise_func = getattr(self, self.mode) + pairwise_weight = pairwise_func(theta_x, phi_x) + + y = paddle.matmul(pairwise_weight, g_x).transpose([0, 2, 1]).reshape( + [0, self.inter_channels, x_shape[2], x_shape[3]]) + unary_mask = F.softmax( + paddle.reshape(self.conv_mask(x), [0, 1, -1]), -1) + unary_x = paddle.matmul(unary_mask, g_x).transpose([0, 2, 1]).reshape( + [0, self.inter_channels, 1, 1]) + output = x + self.conv_out(y + unary_x) + return output diff 
--git a/paddlers/models/ppseg/models/emanet.py b/paddlers/models/ppseg/models/emanet.py new file mode 100644 index 0000000..82a6b09 --- /dev/null +++ b/paddlers/models/ppseg/models/emanet.py @@ -0,0 +1,215 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlers.models.ppseg.models import layers +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.utils import utils + + +@manager.MODELS.add_component +class EMANet(nn.Layer): + """ + Expectation Maximization Attention Networks for Semantic Segmentation based on PaddlePaddle. + + The original article refers to + Xia Li, et al. "Expectation-Maximization Attention Networks for Semantic Segmentation" + (https://arxiv.org/abs/1907.13426) + + Args: + num_classes (int): The unique number of target classes. + backbone (Paddle.nn.Layer): A backbone network. + backbone_indices (tuple): The values in the tuple indicate the indices of output of backbone. + ema_channels (int): EMA module channels. + gc_channels (int): The input channels to Global Context Block. + num_bases (int): Number of bases. + stage_num (int): The iteration number for EM. + momentum (float): The parameter for updating bases. + concat_input (bool): Whether concat the input and output of convs before classification layer. Default: True + enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True. + align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature + is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. + pretrained (str, optional): The path or url of pretrained model. Default: None. + """ + + def __init__(self, + num_classes, + backbone, + backbone_indices=(2, 3), + ema_channels=512, + gc_channels=256, + num_bases=64, + stage_num=3, + momentum=0.1, + concat_input=True, + enable_auxiliary_loss=True, + align_corners=False, + pretrained=None): + super().__init__() + + self.backbone = backbone + self.backbone_indices = backbone_indices + in_channels = [self.backbone.feat_channels[i] for i in backbone_indices] + self.head = EMAHead(num_classes, in_channels, ema_channels, gc_channels, + num_bases, stage_num, momentum, concat_input, + enable_auxiliary_loss) + self.align_corners = align_corners + self.pretrained = pretrained + self.init_weight() + + def forward(self, x): + feats = self.backbone(x) + feats = [feats[i] for i in self.backbone_indices] + logit_list = self.head(feats) + logit_list = [ + F.interpolate( + logit, + paddle.shape(x)[2:], + mode='bilinear', + align_corners=self.align_corners) for logit in logit_list + ] + + return logit_list + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class EMAHead(nn.Layer): + """ + The EMANet head. + + Args: + num_classes (int): The unique number of target classes. 
+ in_channels (tuple): The number of input channels. + ema_channels (int): EMA module channels. + gc_channels (int): The input channels to Global Context Block. + num_bases (int): Number of bases. + stage_num (int): The iteration number for EM. + momentum (float): The parameter for updating bases. + concat_input (bool): Whether concat the input and output of convs before classification layer. Default: True + enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True. + """ + + def __init__(self, + num_classes, + in_channels, + ema_channels, + gc_channels, + num_bases, + stage_num, + momentum, + concat_input=True, + enable_auxiliary_loss=True): + super(EMAHead, self).__init__() + + self.in_channels = in_channels[-1] + self.concat_input = concat_input + self.enable_auxiliary_loss = enable_auxiliary_loss + + self.emau = EMAU(ema_channels, num_bases, stage_num, momentum=momentum) + self.ema_in_conv = layers.ConvBNReLU( + in_channels=self.in_channels, + out_channels=ema_channels, + kernel_size=3) + self.ema_mid_conv = nn.Conv2D(ema_channels, ema_channels, kernel_size=1) + self.ema_out_conv = layers.ConvBNReLU( + in_channels=ema_channels, out_channels=ema_channels, kernel_size=1) + self.bottleneck = layers.ConvBNReLU( + in_channels=ema_channels, out_channels=gc_channels, kernel_size=3) + self.cls = nn.Sequential( + nn.Dropout2D(p=0.1), nn.Conv2D(gc_channels, num_classes, 1)) + self.aux = nn.Sequential( + layers.ConvBNReLU( + in_channels=1024, out_channels=256, kernel_size=3), + nn.Dropout2D(p=0.1), nn.Conv2D(256, num_classes, 1)) + if self.concat_input: + self.conv_cat = layers.ConvBNReLU( + self.in_channels + gc_channels, gc_channels, kernel_size=3) + + def forward(self, feat_list): + C3, C4 = feat_list + feats = self.ema_in_conv(C4) + identity = feats + feats = self.ema_mid_conv(feats) + recon = self.emau(feats) + recon = F.relu(recon) + recon = self.ema_out_conv(recon) + output = F.relu(identity + recon) + output = self.bottleneck(output) + if self.concat_input: + output = self.conv_cat(paddle.concat([C4, output], axis=1)) + output = self.cls(output) + if self.enable_auxiliary_loss: + auxout = self.aux(C3) + return [output, auxout] + else: + return [output] + + +class EMAU(nn.Layer): + '''The Expectation-Maximization Attention Unit (EMAU). + + Arguments: + c (int): The input and output channel number. + k (int): The number of the bases. + stage_num (int): The iteration number for EM. + momentum (float): The parameter for updating bases. 
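+
+    Each of the stage_num iterations runs an E-step (soft assignment of pixels
+    to the bases via softmax) and an M-step (re-estimating the bases as the
+    normalized, assignment-weighted mean of the features); the iterations run
+    under paddle.no_grad(), and during training the bases are additionally
+    updated with a momentum-based moving average.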
+ ''' + + def __init__(self, c, k, stage_num=3, momentum=0.1): + super(EMAU, self).__init__() + assert stage_num >= 1 + self.stage_num = stage_num + self.momentum = momentum + self.c = c + + tmp_mu = self.create_parameter( + shape=[1, c, k], + default_initializer=paddle.nn.initializer.KaimingNormal(k)) + mu = F.normalize(paddle.to_tensor(tmp_mu), axis=1, p=2) + self.register_buffer('mu', mu) + + def forward(self, x): + x_shape = paddle.shape(x) + x = x.flatten(2) + mu = paddle.tile(self.mu, [x_shape[0], 1, 1]) + + with paddle.no_grad(): + for i in range(self.stage_num): + x_t = paddle.transpose(x, [0, 2, 1]) + z = paddle.bmm(x_t, mu) + z = F.softmax(z, axis=2) + z_ = F.normalize(z, axis=1, p=1) + mu = paddle.bmm(x, z_) + mu = F.normalize(mu, axis=1, p=2) + + z_t = paddle.transpose(z, [0, 2, 1]) + x = paddle.matmul(mu, z_t) + x = paddle.reshape(x, [0, self.c, x_shape[2], x_shape[3]]) + + if self.training: + mu = paddle.mean(mu, 0, keepdim=True) + mu = F.normalize(mu, axis=1, p=2) + mu = self.mu * (1 - self.momentum) + mu * self.momentum + if paddle.distributed.get_world_size() > 1: + mu = paddle.distributed.all_reduce(mu) + mu /= paddle.distributed.get_world_size() + self.mu = mu + + return x diff --git a/paddlers/models/ppseg/models/encnet.py b/paddlers/models/ppseg/models/encnet.py new file mode 100644 index 0000000..19cfa03 --- /dev/null +++ b/paddlers/models/ppseg/models/encnet.py @@ -0,0 +1,224 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.models import layers +from paddlers.models.ppseg.utils import utils + + +@manager.MODELS.add_component +class ENCNet(nn.Layer): + """ + The ENCNet implementation based on PaddlePaddle. + + The original article refers to + Hang Zhang, Kristin Dana, et, al. "Context Encoding for Semantic Segmentation". + + Args: + num_classes (int): The unique number of target classes. + backbone (Paddle.nn.Layer): A backbone network. + backbone_indices (tuple): The values in the tuple indicate the indices of + output of backbone. + num_codes (int): The number of encoded words. Default: 32. + mid_channels (int): The channels of middle layers. Default: 512. + use_se_loss (int): Whether use semantic encoding loss. Default: True. + add_lateral (int): Whether use lateral convolution layers. Default: False. + pretrained (str, optional): The path or url of pretrained model. Default: None. 
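+
+    When use_se_loss is True, a per-image class-presence logit is additionally
+    predicted from the encoded statistics with a linear layer and appended to
+    the output list during training.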
+ """ + def __init__(self, + num_classes, + backbone, + backbone_indices=[1, 2, 3], + num_codes=32, + mid_channels=512, + use_se_loss=True, + add_lateral=False, + pretrained=None): + super().__init__() + self.add_lateral = add_lateral + self.num_codes = num_codes + self.backbone = backbone + self.backbone_indices = backbone_indices + in_channels = [ + self.backbone.feat_channels[index] for index in backbone_indices + ] + + self.bottleneck = layers.ConvBNReLU( + in_channels[-1], + mid_channels, + 3, + padding=1, + ) + if self.add_lateral: + self.lateral_convs = nn.LayerList() + for in_ch in in_channels[:-1]: + self.lateral_convs.append( + layers.ConvBNReLU( + in_ch, + mid_channels, + 1, + )) + self.fusion = layers.ConvBNReLU( + len(in_channels) * mid_channels, + mid_channels, + 3, + padding=1, + ) + + self.enc_module = EncModule(mid_channels, num_codes) + self.head = nn.Conv2D(mid_channels, num_classes, 1) + + self.fcn_head = layers.AuxLayer(self.backbone.feat_channels[2], + mid_channels, num_classes) + + self.use_se_loss = use_se_loss + if use_se_loss: + self.se_layer = nn.Linear(mid_channels, num_classes) + + self.pretrained = pretrained + self.init_weight() + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + def forward(self, inputs): + N, C, H, W = paddle.shape(inputs) + feats = self.backbone(inputs) + fcn_feat = feats[2] + + feats = [feats[i] for i in self.backbone_indices] + feat = self.bottleneck(feats[-1]) + + if self.add_lateral: + laterals = [] + for j, lateral_conv in enumerate(self.lateral_convs): + laterals.append( + F.interpolate(lateral_conv(feats[j]), + size=paddle.shape(feat)[2:], + mode='bilinear', + align_corners=False)) + feat = self.fusion(paddle.concat([feat, *laterals], 1)) + encode_feat, feat = self.enc_module(feat) + out = self.head(feat) + out = F.interpolate(out, + size=[H, W], + mode='bilinear', + align_corners=False) + output = [out] + if self.training: + fcn_out = self.fcn_head(fcn_feat) + fcn_out = F.interpolate(fcn_out, + size=[H, W], + mode='bilinear', + align_corners=False) + output.append(fcn_out) + if self.use_se_loss: + se_out = self.se_layer(encode_feat) + output.append(se_out) + return output + return output + + +class Encoding(nn.Layer): + def __init__(self, channels, num_codes): + super().__init__() + self.channels, self.num_codes = channels, num_codes + + std = 1 / ((channels * num_codes)**0.5) + self.codewords = self.create_parameter( + shape=(num_codes, channels), + default_initializer=nn.initializer.Uniform(-std, std), + ) + self.scale = self.create_parameter( + shape=(num_codes, ), + default_initializer=nn.initializer.Uniform(-1, 0), + ) + self.channels = channels + + def scaled_l2(self, x, codewords, scale): + num_codes, channels = paddle.shape(codewords) + reshaped_scale = scale.reshape([1, 1, num_codes]) + expanded_x = paddle.tile(x.unsqueeze(2), [1, 1, num_codes, 1]) + reshaped_codewords = codewords.reshape([1, 1, num_codes, channels]) + + scaled_l2_norm = paddle.multiply( + reshaped_scale, + (expanded_x - reshaped_codewords).pow(2).sum(axis=3)) + return scaled_l2_norm + + def aggregate(self, assignment_weights, x, codewords): + num_codes, channels = paddle.shape(codewords) + reshaped_codewords = codewords.reshape([1, 1, num_codes, channels]) + expanded_x = paddle.tile(x.unsqueeze(2), [1, 1, num_codes, 1]) + + encoded_feat = paddle.multiply( + assignment_weights.unsqueeze(3), + (expanded_x - reshaped_codewords)).sum(axis=1) + encoded_feat = paddle.reshape(encoded_feat, + [-1, 
self.num_codes, self.channels]) + return encoded_feat + + def forward(self, x): + x_dims = x.ndim + assert x_dims == 4, "The dimension of input tensor must equal 4, but got {}.".format( + x_dims) + assert paddle.shape( + x + )[1] == self.channels, "Encoding channels error, excepted {} but got {}.".format( + self.channels, + paddle.shape(x)[1]) + batch_size = paddle.shape(x)[0] + x = x.reshape([batch_size, self.channels, -1]).transpose([0, 2, 1]) + assignment_weights = F.softmax(self.scaled_l2(x, self.codewords, + self.scale), + axis=2) + encoded_feat = self.aggregate(assignment_weights, x, self.codewords) + return encoded_feat + + +class EncModule(nn.Layer): + def __init__(self, in_channels, num_codes): + super().__init__() + self.encoding_project = layers.ConvBNReLU( + in_channels, + in_channels, + 1, + ) + self.encoding = nn.Sequential( + Encoding(channels=in_channels, num_codes=num_codes), + nn.BatchNorm1D(num_codes), + nn.ReLU(), + ) + self.fc = nn.Sequential( + nn.Linear(in_channels, in_channels), + nn.Sigmoid(), + ) + self.in_channels = in_channels + + def forward(self, x): + encoding_projection = self.encoding_project(x) + encoding_feat = self.encoding(encoding_projection) + + encoding_feat = encoding_feat.mean(axis=1) + batch_size, _, _, _ = paddle.shape(x) + + gamma = self.fc(encoding_feat) + y = gamma.reshape([batch_size, self.in_channels, 1, 1]) + output = F.relu(x + x * y) + return encoding_feat, output diff --git a/paddlers/models/ppseg/models/enet.py b/paddlers/models/ppseg/models/enet.py new file mode 100644 index 0000000..0fd3b7a --- /dev/null +++ b/paddlers/models/ppseg/models/enet.py @@ -0,0 +1,622 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlers.models.ppseg import utils +from paddlers.models.ppseg.models import layers +from paddlers.models.ppseg.cvlibs import manager, param_init + +__all__ = ['ENet'] + + +@manager.MODELS.add_component +class ENet(nn.Layer): + """ + The ENet implementation based on PaddlePaddle. + + The original article refers to + Adam Paszke, Abhishek Chaurasia, Sangpil Kim, Eugenio Culurciello, et al."ENet: A Deep Neural Network Architecture for Real-Time Semantic Segmentation" + (https://arxiv.org/abs/1606.02147). + + Args: + num_classes (int): The unique number of target classes. + pretrained (str, optional): The path or url of pretrained model. Default: None. + encoder_relu (bool, optional): When ``True`` ReLU is used as the activation + function; otherwise, PReLU is used. Default: False. + decoder_relu (bool, optional): When ``True`` ReLU is used as the activation + function; otherwise, PReLU is used. Default: True. 
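+
+    A minimal usage sketch (input sizes are assumed to be divisible by 8, since
+    the encoder downsamples three times before the decoder restores the input
+    resolution):
+
+        model = ENet(num_classes=19)
+        logit = model(paddle.rand([1, 3, 512, 1024]))[0]  # [1, 19, 512, 1024]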
+ """ + def __init__(self, + num_classes, + pretrained=None, + encoder_relu=False, + decoder_relu=True): + super(ENet, self).__init__() + + self.numclasses = num_classes + self.initial_block = InitialBlock(3, 16, relu=encoder_relu) + + self.downsample1_0 = DownsamplingBottleneck(16, + 64, + return_indices=True, + dropout_prob=0.01, + relu=encoder_relu) + self.regular1_1 = RegularBottleneck(64, + padding=1, + dropout_prob=0.01, + relu=encoder_relu) + self.regular1_2 = RegularBottleneck(64, + padding=1, + dropout_prob=0.01, + relu=encoder_relu) + self.regular1_3 = RegularBottleneck(64, + padding=1, + dropout_prob=0.01, + relu=encoder_relu) + self.regular1_4 = RegularBottleneck(64, + padding=1, + dropout_prob=0.01, + relu=encoder_relu) + + self.downsample2_0 = DownsamplingBottleneck(64, + 128, + return_indices=True, + dropout_prob=0.1, + relu=encoder_relu) + self.regular2_1 = RegularBottleneck(128, + padding=1, + dropout_prob=0.1, + relu=encoder_relu) + self.dilated2_2 = RegularBottleneck(128, + dilation=2, + padding=2, + dropout_prob=0.1, + relu=encoder_relu) + self.asymmetric2_3 = RegularBottleneck(128, + kernel_size=5, + padding=2, + asymmetric=True, + dropout_prob=0.1, + relu=encoder_relu) + self.dilated2_4 = RegularBottleneck(128, + dilation=4, + padding=4, + dropout_prob=0.1, + relu=encoder_relu) + self.regular2_5 = RegularBottleneck(128, + padding=1, + dropout_prob=0.1, + relu=encoder_relu) + self.dilated2_6 = RegularBottleneck(128, + dilation=8, + padding=8, + dropout_prob=0.1, + relu=encoder_relu) + self.asymmetric2_7 = RegularBottleneck(128, + kernel_size=5, + asymmetric=True, + padding=2, + dropout_prob=0.1, + relu=encoder_relu) + self.dilated2_8 = RegularBottleneck(128, + dilation=16, + padding=16, + dropout_prob=0.1, + relu=encoder_relu) + + self.regular3_0 = RegularBottleneck(128, + padding=1, + dropout_prob=0.1, + relu=encoder_relu) + self.dilated3_1 = RegularBottleneck(128, + dilation=2, + padding=2, + dropout_prob=0.1, + relu=encoder_relu) + self.asymmetric3_2 = RegularBottleneck(128, + kernel_size=5, + padding=2, + asymmetric=True, + dropout_prob=0.1, + relu=encoder_relu) + self.dilated3_3 = RegularBottleneck(128, + dilation=4, + padding=4, + dropout_prob=0.1, + relu=encoder_relu) + self.regular3_4 = RegularBottleneck(128, + padding=1, + dropout_prob=0.1, + relu=encoder_relu) + self.dilated3_5 = RegularBottleneck(128, + dilation=8, + padding=8, + dropout_prob=0.1, + relu=encoder_relu) + self.asymmetric3_6 = RegularBottleneck(128, + kernel_size=5, + asymmetric=True, + padding=2, + dropout_prob=0.1, + relu=encoder_relu) + self.dilated3_7 = RegularBottleneck(128, + dilation=16, + padding=16, + dropout_prob=0.1, + relu=encoder_relu) + + self.upsample4_0 = UpsamplingBottleneck(128, + 64, + dropout_prob=0.1, + relu=decoder_relu) + self.regular4_1 = RegularBottleneck(64, + padding=1, + dropout_prob=0.1, + relu=decoder_relu) + self.regular4_2 = RegularBottleneck(64, + padding=1, + dropout_prob=0.1, + relu=decoder_relu) + + self.upsample5_0 = UpsamplingBottleneck(64, + 16, + dropout_prob=0.1, + relu=decoder_relu) + self.regular5_1 = RegularBottleneck(16, + padding=1, + dropout_prob=0.1, + relu=decoder_relu) + self.transposed_conv = nn.Conv2DTranspose(16, + num_classes, + kernel_size=3, + stride=2, + padding=1, + bias_attr=False) + + self.pretrained = pretrained + self.init_weight() + + def forward(self, x): + + input_size = x.shape + x = self.initial_block(x) + + stage1_input_size = x.shape + x, max_indices1_0 = self.downsample1_0(x) + x = self.regular1_1(x) + x = self.regular1_2(x) 
+ x = self.regular1_3(x) + x = self.regular1_4(x) + + stage2_input_size = x.shape + x, max_indices2_0 = self.downsample2_0(x) + x = self.regular2_1(x) + x = self.dilated2_2(x) + x = self.asymmetric2_3(x) + x = self.dilated2_4(x) + x = self.regular2_5(x) + x = self.dilated2_6(x) + x = self.asymmetric2_7(x) + x = self.dilated2_8(x) + + x = self.regular3_0(x) + x = self.dilated3_1(x) + x = self.asymmetric3_2(x) + x = self.dilated3_3(x) + x = self.regular3_4(x) + x = self.dilated3_5(x) + x = self.asymmetric3_6(x) + x = self.dilated3_7(x) + + x = self.upsample4_0(x, max_indices2_0, output_size=stage2_input_size) + x = self.regular4_1(x) + x = self.regular4_2(x) + + x = self.upsample5_0(x, max_indices1_0, output_size=stage1_input_size) + x = self.regular5_1(x) + x = self.transposed_conv(x, output_size=input_size[2:]) + return [x] + + def init_weight(self): + if self.pretrained is not None: + utils.load_pretrained_model(self, self.pretrained) + + +class InitialBlock(nn.Layer): + """ + The initial block is composed of two branches: + 1. a main branch which performs a regular convolution with stride 2; + 2. an extension branch which performs max-pooling. + Doing both operations in parallel and concatenating their results + allows for efficient downsampling and expansion. The main branch + outputs 13 feature maps while the extension branch outputs 3, for a + total of 16 feature maps after concatenation. + + Args: + in_channels (int): the number of input channels. + out_channels (int): the number output channels. + kernel_size (int, optional): the kernel size of the filters used in + the convolution layer. Default: 3. + padding (int, optional): zero-padding added to both sides of the + input. Default: 0. + bias (bool, optional): Adds a learnable bias to the output if + ``True``. Default: False. + relu (bool, optional): When ``True`` ReLU is used as the activation + function; otherwise, PReLU is used. Default: True. + """ + def __init__(self, in_channels, out_channels, bias=False, relu=True): + super(InitialBlock, self).__init__() + + if relu: + activation = nn.ReLU + else: + activation = nn.PReLU + + self.main_branch = nn.Conv2D(in_channels, + out_channels - 3, + kernel_size=3, + stride=2, + padding=1, + bias_attr=bias) + + self.ext_branch = nn.MaxPool2D(3, stride=2, padding=1) + + self.batch_norm = layers.SyncBatchNorm(out_channels) + + self.out_activation = activation() + + def forward(self, x): + main = self.main_branch(x) + ext = self.ext_branch(x) + + out = paddle.concat((main, ext), 1) + + out = self.batch_norm(out) + + return self.out_activation(out) + + +class RegularBottleneck(nn.Layer): + """ + Regular bottlenecks are the main building block of ENet. + Main branch: + 1. Shortcut connection. + Extension branch: + 1. 1x1 convolution which decreases the number of channels by + ``internal_ratio``, also called a projection; + 2. regular, dilated or asymmetric convolution; + 3. 1x1 convolution which increases the number of channels back to + ``channels``, also called an expansion; + 4. dropout as a regularizer. + + Args: + channels (int): the number of input and output channels. + internal_ratio (int, optional): a scale factor applied to + ``channels`` used to compute the number of + channels after the projection. eg. given ``channels`` equal to 128 and + internal_ratio equal to 2 the number of channels after the projection + is 64. Default: 4. + kernel_size (int, optional): the kernel size of the filters used in + the convolution layer described above in item 2 of the extension + branch. 
Default: 3. + padding (int, optional): zero-padding added to both sides of the + input. Default: 0. + dilation (int, optional): spacing between kernel elements for the + convolution described in item 2 of the extension branch. Default: 1. + asymmetric (bool, optional): flags if the convolution described in + item 2 of the extension branch is asymmetric or not. Default: False. + dropout_prob (float, optional): probability of an element to be + zeroed. Default: 0 (no dropout). + bias (bool, optional): Adds a learnable bias to the output if + ``True``. Default: False. + relu (bool, optional): When ``True`` ReLU is used as the activation + function; otherwise, PReLU is used. Default: True. + """ + def __init__(self, + channels, + internal_ratio=4, + kernel_size=3, + padding=0, + dilation=1, + asymmetric=False, + dropout_prob=0, + bias=False, + relu=True): + super(RegularBottleneck, self).__init__() + + if internal_ratio <= 1 or internal_ratio > channels: + raise RuntimeError( + "Value out of range. Expected value in the " + "interval [1, {0}], got internal_scale={1}.".format( + channels, internal_ratio)) + + internal_channels = channels // internal_ratio + + if relu: + activation = nn.ReLU + else: + activation = nn.PReLU + + self.ext_conv1 = nn.Sequential( + nn.Conv2D(channels, + internal_channels, + kernel_size=1, + stride=1, + bias_attr=bias), layers.SyncBatchNorm(internal_channels), + activation()) + + if asymmetric: + self.ext_conv2 = nn.Sequential( + nn.Conv2D(internal_channels, + internal_channels, + kernel_size=(kernel_size, 1), + stride=1, + padding=(padding, 0), + dilation=dilation, + bias_attr=bias), + layers.SyncBatchNorm(internal_channels), activation(), + nn.Conv2D(internal_channels, + internal_channels, + kernel_size=(1, kernel_size), + stride=1, + padding=(0, padding), + dilation=dilation, + bias_attr=bias), + layers.SyncBatchNorm(internal_channels), activation()) + else: + self.ext_conv2 = nn.Sequential( + nn.Conv2D(internal_channels, + internal_channels, + kernel_size=kernel_size, + stride=1, + padding=padding, + dilation=dilation, + bias_attr=bias), + layers.SyncBatchNorm(internal_channels), activation()) + + self.ext_conv3 = nn.Sequential( + nn.Conv2D(internal_channels, + channels, + kernel_size=1, + stride=1, + bias_attr=bias), layers.SyncBatchNorm(channels), + activation()) + + self.ext_regul = nn.Dropout2D(p=dropout_prob) + + self.out_activation = activation() + + def forward(self, x): + main = x + + ext = self.ext_conv1(x) + ext = self.ext_conv2(ext) + ext = self.ext_conv3(ext) + ext = self.ext_regul(ext) + + out = main + ext + + return self.out_activation(out) + + +class DownsamplingBottleneck(nn.Layer): + """ + Downsampling bottlenecks further downsample the feature map size. + Main branch: + 1. max pooling with stride 2; indices are saved to be used for + unpooling later. + Extension branch: + 1. 2x2 convolution with stride 2 that decreases the number of channels + by ``internal_ratio``, also called a projection; + 2. regular convolution (by default, 3x3); + 3. 1x1 convolution which increases the number of channels to + ``out_channels``, also called an expansion; + 4. dropout as a regularizer. + + Args: + in_channels (int): the number of input channels. + out_channels (int): the number of output channels. + internal_ratio (int, optional): a scale factor applied to ``channels`` + used to compute the number of channels after the projection. eg. given + ``channels`` equal to 128 and internal_ratio equal to 2 the number of + channels after the projection is 64. 
+            Default: 4.
+        return_indices (bool, optional): if ``True``, will return the max
+            indices along with the outputs. Useful when unpooling later.
+        dropout_prob (float, optional): probability of an element to be
+            zeroed. Default: 0 (no dropout).
+        bias (bool, optional): Adds a learnable bias to the output if
+            ``True``. Default: False.
+        relu (bool, optional): When ``True`` ReLU is used as the activation
+            function; otherwise, PReLU is used. Default: True.
+    """
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 internal_ratio=4,
+                 return_indices=False,
+                 dropout_prob=0,
+                 bias=False,
+                 relu=True):
+        super(DownsamplingBottleneck, self).__init__()
+
+        self.return_indices = return_indices
+
+        if internal_ratio <= 1 or internal_ratio > in_channels:
+            raise RuntimeError(
+                "Value out of range. Expected value in the "
+                "interval [1, {0}], got internal_scale={1}. ".format(
+                    in_channels, internal_ratio))
+
+        internal_channels = in_channels // internal_ratio
+
+        if relu:
+            activation = nn.ReLU
+        else:
+            activation = nn.PReLU
+
+        self.main_max1 = nn.MaxPool2D(2, stride=2, return_mask=return_indices)
+
+        self.ext_conv1 = nn.Sequential(
+            nn.Conv2D(in_channels,
+                      internal_channels,
+                      kernel_size=2,
+                      stride=2,
+                      bias_attr=bias), layers.SyncBatchNorm(internal_channels),
+            activation())
+
+        self.ext_conv2 = nn.Sequential(
+            nn.Conv2D(internal_channels,
+                      internal_channels,
+                      kernel_size=3,
+                      stride=1,
+                      padding=1,
+                      bias_attr=bias), layers.SyncBatchNorm(internal_channels),
+            activation())
+
+        self.ext_conv3 = nn.Sequential(
+            nn.Conv2D(internal_channels,
+                      out_channels,
+                      kernel_size=1,
+                      stride=1,
+                      bias_attr=bias), layers.SyncBatchNorm(out_channels),
+            activation())
+
+        self.ext_regul = nn.Dropout2D(p=dropout_prob)
+
+        self.out_activation = activation()
+
+    def forward(self, x):
+        if self.return_indices:
+            main, max_indices = self.main_max1(x)
+        else:
+            main = self.main_max1(x)
+            # keep the return signature consistent when indices are not used
+            max_indices = None
+
+        ext = self.ext_conv1(x)
+        ext = self.ext_conv2(ext)
+        ext = self.ext_conv3(ext)
+        ext = self.ext_regul(ext)
+
+        # zero-pad the pooled main branch so its channel count matches the
+        # extension branch before the element-wise addition
+        n, ch_ext, h, w = ext.shape
+        ch_main = main.shape[1]
+        padding = paddle.zeros((n, ch_ext - ch_main, h, w))
+
+        main = paddle.concat((main, padding), 1)
+
+        out = main + ext
+
+        return self.out_activation(out), max_indices
+
+
+class UpsamplingBottleneck(nn.Layer):
+    """
+    The upsampling bottlenecks upsample the feature map resolution using max
+    pooling indices stored from the corresponding downsampling bottleneck.
+    Main branch:
+    1. 1x1 convolution with stride 1 that decreases the number of channels by
+    ``internal_ratio``, also called a projection;
+    2. max unpool layer using the max pool indices from the corresponding
+    downsampling max pool layer.
+    Extension branch:
+    1. 1x1 convolution with stride 1 that decreases the number of channels by
+    ``internal_ratio``, also called a projection;
+    2. transposed convolution (by default, 3x3);
+    3. 1x1 convolution which increases the number of channels to
+    ``out_channels``, also called an expansion;
+    4. dropout as a regularizer.
+
+    Args:
+        in_channels (int): the number of input channels.
+        out_channels (int): the number of output channels.
+        internal_ratio (int, optional): a scale factor applied to ``in_channels``
+            used to compute the number of channels after the projection. e.g. given
+            ``in_channels`` equal to 128 and ``internal_ratio`` equal to 2 the number
+            of channels after the projection is 64. Default: 4.
+        dropout_prob (float, optional): probability of an element to be zeroed.
+            Default: 0 (no dropout).
+ bias (bool, optional): Adds a learnable bias to the output if ``True``. + Default: False. + relu (bool, optional): When ``True`` ReLU is used as the activation + function; otherwise, PReLU is used. Default: True. + """ + def __init__(self, + in_channels, + out_channels, + internal_ratio=4, + dropout_prob=0, + bias=False, + relu=True): + super(UpsamplingBottleneck, self).__init__() + + if internal_ratio <= 1 or internal_ratio > in_channels: + raise RuntimeError( + "Value out of range. Expected value in the " + "interval [1, {0}], got internal_scale={1}. ".format( + in_channels, internal_ratio)) + + internal_channels = in_channels // internal_ratio + + if relu: + activation = nn.ReLU + else: + activation = nn.PReLU + + self.main_conv1 = nn.Sequential( + nn.Conv2D(in_channels, out_channels, kernel_size=1, bias_attr=bias), + layers.SyncBatchNorm(out_channels)) + + self.ext_conv1 = nn.Sequential( + nn.Conv2D(in_channels, + internal_channels, + kernel_size=1, + bias_attr=bias), layers.SyncBatchNorm(internal_channels), + activation()) + + self.ext_tconv1 = nn.Conv2DTranspose(internal_channels, + internal_channels, + kernel_size=2, + stride=2, + bias_attr=bias) + self.ext_tconv1_bnorm = layers.SyncBatchNorm(internal_channels) + self.ext_tconv1_activation = activation() + + self.ext_conv2 = nn.Sequential( + nn.Conv2D(internal_channels, + out_channels, + kernel_size=1, + bias_attr=bias), layers.SyncBatchNorm(out_channels)) + + self.ext_regul = nn.Dropout2D(p=dropout_prob) + + self.out_activation = activation() + + def forward(self, x, max_indices, output_size): + main = self.main_conv1(x) + main = F.max_unpool2d(main, + max_indices, + kernel_size=2, + output_size=output_size) + + ext = self.ext_conv1(x) + ext = self.ext_tconv1(ext, output_size=output_size[2:]) + ext = self.ext_tconv1_bnorm(ext) + ext = self.ext_tconv1_activation(ext) + ext = self.ext_conv2(ext) + ext = self.ext_regul(ext) + + out = main + ext + + return self.out_activation(out) diff --git a/paddlers/models/ppseg/models/espnet.py b/paddlers/models/ppseg/models/espnet.py new file mode 100644 index 0000000..21cf4f9 --- /dev/null +++ b/paddlers/models/ppseg/models/espnet.py @@ -0,0 +1,477 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlers.models.ppseg import utils +from paddlers.models.ppseg.cvlibs import manager, param_init +from paddlers.models.ppseg.models import layers + + +@manager.MODELS.add_component +class ESPNetV2(nn.Layer): + """ + The ESPNetV2 implementation based on PaddlePaddle. + + The original article refers to + Sachin Mehta, Mohammad Rastegari, Linda Shapiro, and Hannaneh Hajishirzi. "ESPNetv2: A Light-weight, Power Efficient, and General Purpose Convolutional Neural Network" + (https://arxiv.org/abs/1811.11431). + + Args: + num_classes (int): The unique number of target classes. + in_channels (int, optional): Number of input channels. Default: 3. 
+ scale (float, optional): The scale of channels, only support scale <= 1.5 and scale == 2. Default: 1.0. + drop_prob (floa, optional): The probability of dropout. Default: 0.1. + pretrained (str, optional): The path or url of pretrained model. Default: None. + """ + def __init__(self, + num_classes, + in_channels=3, + scale=1.0, + drop_prob=0.1, + pretrained=None): + super().__init__() + self.backbone = EESPNetBackbone(in_channels, drop_prob, scale) + self.in_channels = self.backbone.out_channels + self.proj_l4_c = layers.ConvBNPReLU(self.in_channels[3], + self.in_channels[2], + 1, + stride=1, + bias_attr=False) + psp_size = 2 * self.in_channels[2] + self.eesp_psp = nn.Sequential( + EESP(psp_size, + psp_size // 2, + stride=1, + branches=4, + kernel_size_maximum=7), + PSPModule(psp_size // 2, psp_size // 2), + ) + + self.project_l3 = nn.Sequential( + nn.Dropout2D(p=drop_prob), + nn.Conv2D(psp_size // 2, num_classes, 1, 1, bias_attr=False), + ) + self.act_l3 = BNPReLU(num_classes) + self.project_l2 = layers.ConvBNPReLU(self.in_channels[1] + num_classes, + num_classes, + 1, + stride=1, + bias_attr=False) + self.project_l1 = nn.Sequential( + nn.Dropout2D(p=drop_prob), + nn.Conv2D(self.in_channels[0] + num_classes, + num_classes, + 1, + 1, + bias_attr=False), + ) + + self.pretrained = pretrained + + self.init_weight() + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + def hierarchical_upsample(self, x, factor=3): + for i in range(factor): + x = F.interpolate(x, + scale_factor=2, + mode='bilinear', + align_corners=True) + return x + + def forward(self, x): + out_l1, out_l2, out_l3, out_l4 = self.backbone(x) + + out_l4_proj = self.proj_l4_c(out_l4) + l4_to_l3 = F.interpolate(out_l4_proj, + scale_factor=2, + mode='bilinear', + align_corners=True) + merged_l3 = self.eesp_psp(paddle.concat([out_l3, l4_to_l3], axis=1)) + proj_merge_l3 = self.project_l3(merged_l3) + proj_merge_l3 = self.act_l3(proj_merge_l3) + + l3_to_l2 = F.interpolate(proj_merge_l3, + scale_factor=2, + mode='bilinear', + align_corners=True) + merged_l2 = self.project_l2(paddle.concat([out_l2, l3_to_l2], axis=1)) + + l2_to_l1 = F.interpolate(merged_l2, + scale_factor=2, + mode='bilinear', + align_corners=True) + merged_l1 = self.project_l1(paddle.concat([out_l1, l2_to_l1], axis=1)) + + if self.training: + return [ + F.interpolate(merged_l1, + scale_factor=2, + mode='bilinear', + align_corners=True), + self.hierarchical_upsample(proj_merge_l3), + ] + else: + return [ + F.interpolate(merged_l1, + scale_factor=2, + mode='bilinear', + align_corners=True) + ] + + +class BNPReLU(nn.Layer): + def __init__(self, out_channels, **kwargs): + super().__init__() + if 'data_format' in kwargs: + data_format = kwargs['data_format'] + else: + data_format = 'NCHW' + self._batch_norm = layers.SyncBatchNorm(out_channels, + data_format=data_format) + self._prelu = layers.Activation("prelu") + + def forward(self, x): + x = self._batch_norm(x) + x = self._prelu(x) + return x + + +class EESP(nn.Layer): + """ + EESP block, principle: reduce -> split -> transform -> merge + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + stride (int, optional): Factor by which we should skip (useful for down-sampling). If 2, then down-samples the feature map by 2. Default: 1. + branches (int, optional): Number of branches. Default: 4. + kernel_size_maximum (int, optional): A maximum value of receptive field allowed for EESP block. Default: 7. 
+ down_method (str, optional): Down sample or not, only support 'avg' and 'esp'(equivalent to stride is 2 or not). Default: 'esp'. + """ + def __init__(self, + in_channels, + out_channels, + stride=1, + branches=4, + kernel_size_maximum=7, + down_method='esp'): + super(EESP, self).__init__() + if out_channels % branches != 0: + raise RuntimeError( + "The out_channes for EESP should be factorized by branches, but out_channels={} cann't be factorized by branches={}" + .format(out_channels, branches)) + assert down_method in [ + 'avg', 'esp' + ], "The down_method for EESP only support 'avg' or 'esp', but got down_method={}".format( + down_method) + self.in_channels = in_channels + self.stride = stride + + in_branch_channels = int(out_channels / branches) + self.group_conv_in = layers.ConvBNPReLU(in_channels, + in_branch_channels, + 1, + stride=1, + groups=branches, + bias_attr=False) + + map_ksize_dilation = { + 3: 1, + 5: 2, + 7: 3, + 9: 4, + 11: 5, + 13: 6, + 15: 7, + 17: 8 + } + self.kernel_sizes = [] + for i in range(branches): + kernel_size = 3 + 2 * i + kernel_size = kernel_size if kernel_size <= kernel_size_maximum else 3 + self.kernel_sizes.append(kernel_size) + self.kernel_sizes.sort() + + self.spp_modules = nn.LayerList() + for i in range(branches): + dilation = map_ksize_dilation[self.kernel_sizes[i]] + self.spp_modules.append( + nn.Conv2D(in_branch_channels, + in_branch_channels, + kernel_size=3, + padding='same', + stride=stride, + dilation=dilation, + groups=in_branch_channels, + bias_attr=False)) + self.group_conv_out = layers.ConvBN(out_channels, + out_channels, + kernel_size=1, + stride=1, + groups=branches, + bias_attr=False) + self.bn_act = BNPReLU(out_channels) + self._act = nn.PReLU() + self.down_method = True if down_method == 'avg' else False + + @paddle.jit.not_to_static + def convert_group_x(self, group_merge, x): + if x.shape == group_merge.shape: + group_merge += x + + return group_merge + + def forward(self, x): + group_out = self.group_conv_in(x) + output = [self.spp_modules[0](group_out)] + + for k in range(1, len(self.spp_modules)): + output_k = self.spp_modules[k](group_out) + output_k = output_k + output[k - 1] + output.append(output_k) + + group_merge = self.group_conv_out( + self.bn_act(paddle.concat(output, axis=1))) + + if self.stride == 2 and self.down_method: + return group_merge + + group_merge = self.convert_group_x(group_merge, x) + out = self._act(group_merge) + return out + + +class PSPModule(nn.Layer): + def __init__(self, in_channels, out_channels, sizes=4): + super().__init__() + self.stages = nn.LayerList([ + nn.Conv2D(in_channels, + in_channels, + kernel_size=3, + stride=1, + groups=in_channels, + padding='same', + bias_attr=False) for _ in range(sizes) + ]) + self.project = layers.ConvBNPReLU(in_channels * (sizes + 1), + out_channels, + 1, + stride=1, + bias_attr=False) + + def forward(self, feats): + h, w = paddle.shape(feats)[2:4] + out = [feats] + for stage in self.stages: + feats = F.avg_pool2d(feats, kernel_size=3, stride=2, padding='same') + upsampled = F.interpolate(stage(feats), + size=[h, w], + mode='bilinear', + align_corners=True) + out.append(upsampled) + return self.project(paddle.concat(out, axis=1)) + + +class DownSampler(nn.Layer): + """ + Down sampler. + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + branches (int, optional): Number of branches. Default: 9. + kernel_size_maximum (int, optional): A maximum value of kernel_size for EESP block. Default: 9. 
+        shortcut (bool, optional): Use shortcut or not. Default: True.
+    """
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 branches=4,
+                 kernel_size_maximum=9,
+                 shortcut=True):
+        super().__init__()
+        if out_channels < in_channels:
+            raise RuntimeError(
+                "The out_channels for DownSampler should be bigger than in_channels, but got in_channels={}, out_channels={}"
+                .format(in_channels, out_channels))
+        self.eesp = EESP(in_channels,
+                         out_channels - in_channels,
+                         stride=2,
+                         branches=branches,
+                         kernel_size_maximum=kernel_size_maximum,
+                         down_method='avg')
+        self.avg = nn.AvgPool2D(kernel_size=3, padding=1, stride=2)
+        if shortcut:
+            self.shortcut_layer = nn.Sequential(
+                layers.ConvBNPReLU(3, 3, 3, stride=1, bias_attr=False),
+                layers.ConvBN(3, out_channels, 1, stride=1, bias_attr=False),
+            )
+        self._act = nn.PReLU()
+
+    def forward(self, x, inputs=None):
+        avg_out = self.avg(x)
+        eesp_out = self.eesp(x)
+        output = paddle.concat([avg_out, eesp_out], axis=1)
+
+        if inputs is not None:
+            w1 = paddle.shape(avg_out)[2]
+            w2 = paddle.shape(inputs)[2]
+
+            # pool the raw input until its spatial size matches the output
+            while w2 != w1:
+                inputs = F.avg_pool2d(inputs,
+                                      kernel_size=3,
+                                      padding=1,
+                                      stride=2)
+                w2 = paddle.shape(inputs)[2]
+            output = output + self.shortcut_layer(inputs)
+        return self._act(output)
+
+
+class EESPNetBackbone(nn.Layer):
+    """
+    The EESPNetBackbone implementation based on PaddlePaddle.
+
+    The original article refers to
+    Sachin Mehta, Mohammad Rastegari, Linda Shapiro, and Hannaneh Hajishirzi. "ESPNetv2: A Light-weight, Power Efficient, and General Purpose Convolutional Neural Network"
+    (https://arxiv.org/abs/1811.11431).
+
+    Args:
+        in_channels (int, optional): Number of input channels. Default: 3.
+        drop_prob (float, optional): The probability of dropout. Default: 0.1.
+        scale (float, optional): The scale of channels; only scale <= 1.5 or scale == 2 is supported. Default: 1.0.
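+
+    Examples:
+        A minimal smoke test (editor's sketch; channel counts assume the
+        default ``scale=1.0``):
+
+            import paddle
+
+            backbone = EESPNetBackbone(in_channels=3, drop_prob=0.1, scale=1.0)
+            f1, f2, f3, f4 = backbone(paddle.rand([1, 3, 256, 256]))
+            # f1..f4 have strides 2, 4, 8 and 16; their channel counts follow
+            # backbone.out_channels, i.e. [32, 64, 128, 256] at this scale.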
+ """ + def __init__(self, in_channels=3, drop_prob=0.1, scale=1.0): + super().__init__() + reps = [0, 3, 7, 3] + + num_level = 4 # 1/2, 1/4, 1/8, 1/16 + kernel_size_limitations = [13, 11, 9, 7] # kernel size limitation + branch_list = [4] * len( + kernel_size_limitations) # branches at different levels + + base_channels = 32 # first conv output channels + channels_config = [base_channels] * num_level + + for i in range(num_level): + if i == 0: + channels = int(base_channels * scale) + channels = math.ceil(channels / branch_list[0]) * branch_list[0] + channels_config[ + i] = base_channels if channels > base_channels else channels + else: + channels_config[i] = channels * pow(2, i) + + self.level1 = layers.ConvBNPReLU(in_channels, + channels_config[0], + 3, + stride=2, + bias_attr=False) + + self.level2 = DownSampler( + channels_config[0], + channels_config[1], + branches=branch_list[0], + kernel_size_maximum=kernel_size_limitations[0], + shortcut=True) + + self.level3_0 = DownSampler( + channels_config[1], + channels_config[2], + branches=branch_list[1], + kernel_size_maximum=kernel_size_limitations[1], + shortcut=True) + self.level3 = nn.LayerList() + for i in range(reps[1]): + self.level3.append( + EESP(channels_config[2], + channels_config[2], + stride=1, + branches=branch_list[2], + kernel_size_maximum=kernel_size_limitations[2])) + + self.level4_0 = DownSampler( + channels_config[2], + channels_config[3], + branches=branch_list[2], + kernel_size_maximum=kernel_size_limitations[2], + shortcut=True) + self.level4 = nn.LayerList() + for i in range(reps[2]): + self.level4.append( + EESP(channels_config[3], + channels_config[3], + stride=1, + branches=branch_list[3], + kernel_size_maximum=kernel_size_limitations[3])) + + self.out_channels = channels_config + + self.init_params() + + def init_params(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + param_init.kaiming_normal_init(m.weight) + if m.bias is not None: + param_init.constant_init(m.bias, value=0.0) + elif isinstance(m, nn.BatchNorm2D): + param_init.constant_init(m.weight, value=1.0) + param_init.constant_init(m.bias, value=0.0) + elif isinstance(m, nn.Linear): + param_init.normal_init(m.weight, std=0.001) + if m.bias is not None: + param_init.constant_init(m.bias, value=0.0) + + def forward(self, x): + out_l1 = self.level1(x) + out_l2 = self.level2(out_l1, x) + out_l3 = self.level3_0(out_l2, x) + for i, layer in enumerate(self.level3): + out_l3 = layer(out_l3) + out_l4 = self.level4_0(out_l3, x) + for i, layer in enumerate(self.level4): + out_l4 = layer(out_l4) + return out_l1, out_l2, out_l3, out_l4 + + +if __name__ == '__main__': + import paddle + import numpy as np + + paddle.enable_static() + + startup_prog = paddle.static.default_startup_program() + + exe = paddle.static.Executor(paddle.CPUPlace()) + exe.run(startup_prog) + path_prefix = "./output/model" + + [inference_program, feed_target_names, fetch_targets] = ( + paddle.static.load_inference_model(path_prefix, exe)) + print('inference_program:', inference_program) + + tensor_img = np.array(np.random.random((1, 3, 1024, 2048)), dtype=np.float32) + results = exe.run(inference_program, + feed={feed_target_names[0]: tensor_img}, + fetch_list=fetch_targets) diff --git a/paddlers/models/ppseg/models/espnetv1.py b/paddlers/models/ppseg/models/espnetv1.py new file mode 100644 index 0000000..0a6c992 --- /dev/null +++ b/paddlers/models/ppseg/models/espnetv1.py @@ -0,0 +1,308 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlers.models.ppseg.models import layers +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.utils import utils + + +@manager.MODELS.add_component +class ESPNetV1(nn.Layer): + """ + The ESPNetV1 implementation based on PaddlePaddle. + + The original article refers to + Sachin Mehta1, Mohammad Rastegari, Anat Caspi, Linda Shapiro, and Hannaneh Hajishirzi. "ESPNet: Efficient Spatial Pyramid of Dilated Convolutions for Semantic Segmentation" + (https://arxiv.org/abs/1803.06815). + + Args: + num_classes (int): The unique number of target classes. + in_channels (int, optional): Number of input channels. Default: 3. + level2_depth (int, optional): Depth of DilatedResidualBlock. Default: 2. + level3_depth (int, optional): Depth of DilatedResidualBlock. Default: 3. + pretrained (str, optional): The path or url of pretrained model. Default: None. + """ + def __init__(self, + num_classes, + in_channels=3, + level2_depth=2, + level3_depth=3, + pretrained=None): + super().__init__() + self.encoder = ESPNetEncoder(num_classes, in_channels, level2_depth, + level3_depth) + + self.level3_up = nn.Conv2DTranspose(num_classes, + num_classes, + 2, + stride=2, + padding=0, + output_padding=0, + bias_attr=False) + self.br3 = layers.SyncBatchNorm(num_classes) + self.level2_proj = nn.Conv2D(in_channels + 128, + num_classes, + 1, + bias_attr=False) + self.combine_l2_l3 = nn.Sequential( + BNPReLU(2 * num_classes), + DilatedResidualBlock(2 * num_classes, num_classes, residual=False), + ) + self.level2_up = nn.Sequential( + nn.Conv2DTranspose(num_classes, + num_classes, + 2, + stride=2, + padding=0, + output_padding=0, + bias_attr=False), + BNPReLU(num_classes), + ) + self.out_proj = layers.ConvBNPReLU(16 + in_channels + num_classes, + num_classes, + 3, + padding='same', + stride=1) + self.out_up = nn.Conv2DTranspose(num_classes, + num_classes, + 2, + stride=2, + padding=0, + output_padding=0, + bias_attr=False) + self.pretrained = pretrained + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + def forward(self, x): + p1, p2, p3 = self.encoder(x) + up_p3 = self.level3_up(p3) + + combine = self.combine_l2_l3(paddle.concat([up_p3, p2], axis=1)) + up_p2 = self.level2_up(combine) + + combine = self.out_proj(paddle.concat([up_p2, p1], axis=1)) + out = self.out_up(combine) + return [out] + + +class BNPReLU(nn.Layer): + def __init__(self, channels): + super().__init__() + self.bn = layers.SyncBatchNorm(channels) + self.act = nn.PReLU(channels) + + def forward(self, x): + x = self.bn(x) + x = self.act(x) + return x + + +class DownSampler(nn.Layer): + """ + Down sampler. + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. 
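+
+    Note (editor's worked example): each of the five parallel branches gets
+    ``out_channels // 5`` channels, with the remainder going to the first
+    branch. ``DownSampler(19, 64)`` therefore concatenates one 16-channel
+    and four 12-channel outputs back into 64 channels.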
+ """ + def __init__(self, in_channels, out_channels): + super().__init__() + branch_channels = out_channels // 5 + remain_channels = out_channels - branch_channels * 4 + self.conv1 = nn.Conv2D(in_channels, + branch_channels, + 3, + stride=2, + padding=1, + bias_attr=False) + self.d_conv1 = nn.Conv2D(branch_channels, + remain_channels, + 3, + padding=1, + bias_attr=False) + self.d_conv2 = nn.Conv2D(branch_channels, + branch_channels, + 3, + padding=2, + dilation=2, + bias_attr=False) + self.d_conv4 = nn.Conv2D(branch_channels, + branch_channels, + 3, + padding=4, + dilation=4, + bias_attr=False) + self.d_conv8 = nn.Conv2D(branch_channels, + branch_channels, + 3, + padding=8, + dilation=8, + bias_attr=False) + self.d_conv16 = nn.Conv2D(branch_channels, + branch_channels, + 3, + padding=16, + dilation=16, + bias_attr=False) + self.bn = layers.SyncBatchNorm(out_channels) + self.act = nn.PReLU(out_channels) + + def forward(self, x): + x = self.conv1(x) + d1 = self.d_conv1(x) + d2 = self.d_conv2(x) + d4 = self.d_conv4(x) + d8 = self.d_conv8(x) + d16 = self.d_conv16(x) + + feat1 = d2 + feat2 = feat1 + d4 + feat3 = feat2 + d8 + feat4 = feat3 + d16 + + feat = paddle.concat([d1, feat1, feat2, feat3, feat4], axis=1) + out = self.bn(feat) + out = self.act(out) + return out + + +class DilatedResidualBlock(nn.Layer): + ''' + ESP block, principle: reduce -> split -> transform -> merge + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + residual (bool, optional): Add a residual connection through identity operation. Default: True. + ''' + def __init__(self, in_channels, out_channels, residual=True): + super().__init__() + branch_channels = out_channels // 5 + remain_channels = out_channels - branch_channels * 4 + self.conv1 = nn.Conv2D(in_channels, branch_channels, 1, bias_attr=False) + self.d_conv1 = nn.Conv2D(branch_channels, + remain_channels, + 3, + padding=1, + bias_attr=False) + self.d_conv2 = nn.Conv2D(branch_channels, + branch_channels, + 3, + padding=2, + dilation=2, + bias_attr=False) + self.d_conv4 = nn.Conv2D(branch_channels, + branch_channels, + 3, + padding=4, + dilation=4, + bias_attr=False) + self.d_conv8 = nn.Conv2D(branch_channels, + branch_channels, + 3, + padding=8, + dilation=8, + bias_attr=False) + self.d_conv16 = nn.Conv2D(branch_channels, + branch_channels, + 3, + padding=16, + dilation=16, + bias_attr=False) + + self.bn = BNPReLU(out_channels) + self.residual = residual + + def forward(self, x): + x_proj = self.conv1(x) + d1 = self.d_conv1(x_proj) + d2 = self.d_conv2(x_proj) + d4 = self.d_conv4(x_proj) + d8 = self.d_conv8(x_proj) + d16 = self.d_conv16(x_proj) + + feat1 = d2 + feat2 = feat1 + d4 + feat3 = feat2 + d8 + feat4 = feat3 + d16 + + feat = paddle.concat([d1, feat1, feat2, feat3, feat4], axis=1) + + if self.residual: + feat = feat + x + out = self.bn(feat) + return out + + +class ESPNetEncoder(nn.Layer): + ''' + The ESPNet-C implementation based on PaddlePaddle. + Args: + num_classes (int): The unique number of target classes. + in_channels (int, optional): Number of input channels. Default: 3. + level2_depth (int, optional): Depth of DilatedResidualBlock. Default: 5. + level3_depth (int, optional): Depth of DilatedResidualBlock. Default: 3. 
+ ''' + def __init__(self, + num_classes, + in_channels=3, + level2_depth=5, + level3_depth=3): + super().__init__() + self.level1 = layers.ConvBNPReLU(in_channels, + 16, + 3, + padding='same', + stride=2) + self.br1 = BNPReLU(in_channels + 16) + self.proj1 = layers.ConvBNPReLU(in_channels + 16, num_classes, 1) + + self.level2_0 = DownSampler(in_channels + 16, 64) + self.level2 = nn.Sequential( + *[DilatedResidualBlock(64, 64) for i in range(level2_depth)]) + self.br2 = BNPReLU(in_channels + 128) + self.proj2 = layers.ConvBNPReLU(in_channels + 128, num_classes, 1) + + self.level3_0 = DownSampler(in_channels + 128, 128) + self.level3 = nn.Sequential( + *[DilatedResidualBlock(128, 128) for i in range(level3_depth)]) + self.br3 = BNPReLU(256) + self.proj3 = layers.ConvBNPReLU(256, num_classes, 1) + + def forward(self, x): + f1 = self.level1(x) + down2 = F.adaptive_avg_pool2d(x, output_size=f1.shape[2:]) + feat1 = paddle.concat([f1, down2], axis=1) + feat1 = self.br1(feat1) + p1 = self.proj1(feat1) + + f2_res = self.level2_0(feat1) + f2 = self.level2(f2_res) + down4 = F.adaptive_avg_pool2d(x, output_size=f2.shape[2:]) + feat2 = paddle.concat([f2, f2_res, down4], axis=1) + feat2 = self.br2(feat2) + p2 = self.proj2(feat2) + + f3_res = self.level3_0(feat2) + f3 = self.level3(f3_res) + feat3 = paddle.concat([f3, f3_res], axis=1) + feat3 = self.br3(feat3) + p3 = self.proj3(feat3) + + return p1, p2, p3 diff --git a/paddlers/models/ppseg/models/fast_scnn.py b/paddlers/models/ppseg/models/fast_scnn.py new file mode 100644 index 0000000..e553a8f --- /dev/null +++ b/paddlers/models/ppseg/models/fast_scnn.py @@ -0,0 +1,316 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.nn as nn +import paddle.nn.functional as F + +import paddle +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.models import layers +from paddlers.models.ppseg.utils import utils + +__all__ = ['FastSCNN'] + + +@manager.MODELS.add_component +class FastSCNN(nn.Layer): + """ + The FastSCNN implementation based on PaddlePaddle. + As mentioned in the original paper, FastSCNN is a real-time segmentation algorithm (123.5fps) + even for high resolution images (1024x2048). + The original article refers to + Poudel, Rudra PK, et al. "Fast-scnn: Fast semantic segmentation network" + (https://arxiv.org/pdf/1902.04502.pdf). + Args: + num_classes (int): The unique number of target classes. + enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. + If true, auxiliary loss will be added after LearningToDownsample module. Default: False. + align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature + is even, e.g. 1024x512, otherwise it is True, e.g. 769x769.. Default: False. + pretrained (str, optional): The path or url of pretrained model. Default: None. 
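+
+    Examples:
+        A minimal forward pass (editor's sketch; assumes FastSCNN is exported
+        by paddlers.models.ppseg.models as registered here):
+
+            import paddle
+            from paddlers.models.ppseg.models import FastSCNN
+
+            model = FastSCNN(num_classes=2)
+            logits = model(paddle.rand([1, 3, 512, 1024]))
+            # with auxiliary loss enabled (the default), `logits` holds the
+            # main and the auxiliary logit, both [1, 2, 512, 1024]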
+ """ + + def __init__(self, + num_classes, + enable_auxiliary_loss=True, + align_corners=False, + pretrained=None): + + super().__init__() + + self.learning_to_downsample = LearningToDownsample(32, 48, 64) + self.global_feature_extractor = GlobalFeatureExtractor( + in_channels=64, + block_channels=[64, 96, 128], + out_channels=128, + expansion=6, + num_blocks=[3, 3, 3], + align_corners=True) + self.feature_fusion = FeatureFusionModule(64, 128, 128, align_corners) + self.classifier = Classifier(128, num_classes) + + if enable_auxiliary_loss: + self.auxlayer = layers.AuxLayer(64, 32, num_classes) + + self.enable_auxiliary_loss = enable_auxiliary_loss + self.align_corners = align_corners + self.pretrained = pretrained + self.init_weight() + + def forward(self, x): + logit_list = [] + input_size = paddle.shape(x)[2:] + higher_res_features = self.learning_to_downsample(x) + x = self.global_feature_extractor(higher_res_features) + x = self.feature_fusion(higher_res_features, x) + logit = self.classifier(x) + logit = F.interpolate( + logit, + input_size, + mode='bilinear', + align_corners=self.align_corners) + logit_list.append(logit) + + if self.enable_auxiliary_loss: + auxiliary_logit = self.auxlayer(higher_res_features) + auxiliary_logit = F.interpolate( + auxiliary_logit, + input_size, + mode='bilinear', + align_corners=self.align_corners) + logit_list.append(auxiliary_logit) + + return logit_list + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class LearningToDownsample(nn.Layer): + """ + Learning to downsample module. + This module consists of three downsampling blocks (one conv and two separable conv) + Args: + dw_channels1 (int, optional): The input channels of the first sep conv. Default: 32. + dw_channels2 (int, optional): The input channels of the second sep conv. Default: 48. + out_channels (int, optional): The output channels of LearningToDownsample module. Default: 64. + """ + + def __init__(self, dw_channels1=32, dw_channels2=48, out_channels=64): + super(LearningToDownsample, self).__init__() + + self.conv_bn_relu = layers.ConvBNReLU( + in_channels=3, out_channels=dw_channels1, kernel_size=3, stride=2) + self.dsconv_bn_relu1 = layers.SeparableConvBNReLU( + in_channels=dw_channels1, + out_channels=dw_channels2, + kernel_size=3, + stride=2, + padding=1) + self.dsconv_bn_relu2 = layers.SeparableConvBNReLU( + in_channels=dw_channels2, + out_channels=out_channels, + kernel_size=3, + stride=2, + padding=1) + + def forward(self, x): + x = self.conv_bn_relu(x) + x = self.dsconv_bn_relu1(x) + x = self.dsconv_bn_relu2(x) + return x + + +class GlobalFeatureExtractor(nn.Layer): + """ + Global feature extractor module. + This module consists of three InvertedBottleneck blocks (like inverted residual introduced by MobileNetV2) and + a PPModule (introduced by PSPNet). + Args: + in_channels (int): The number of input channels to the module. + block_channels (tuple): A tuple represents output channels of each bottleneck block. + out_channels (int): The number of output channels of the module. Default: + expansion (int): The expansion factor in bottleneck. + num_blocks (tuple): It indicates the repeat time of each bottleneck. + align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature + is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. 
+ """ + + def __init__(self, in_channels, block_channels, out_channels, expansion, + num_blocks, align_corners): + super(GlobalFeatureExtractor, self).__init__() + + self.bottleneck1 = self._make_layer(InvertedBottleneck, in_channels, + block_channels[0], num_blocks[0], + expansion, 2) + self.bottleneck2 = self._make_layer( + InvertedBottleneck, block_channels[0], block_channels[1], + num_blocks[1], expansion, 2) + self.bottleneck3 = self._make_layer( + InvertedBottleneck, block_channels[1], block_channels[2], + num_blocks[2], expansion, 1) + + self.ppm = layers.PPModule( + block_channels[2], + out_channels, + bin_sizes=(1, 2, 3, 6), + dim_reduction=True, + align_corners=align_corners) + + def _make_layer(self, + block, + in_channels, + out_channels, + blocks, + expansion=6, + stride=1): + layers = [] + layers.append(block(in_channels, out_channels, expansion, stride)) + for _ in range(1, blocks): + layers.append(block(out_channels, out_channels, expansion, 1)) + return nn.Sequential(*layers) + + def forward(self, x): + x = self.bottleneck1(x) + x = self.bottleneck2(x) + x = self.bottleneck3(x) + x = self.ppm(x) + return x + + +class InvertedBottleneck(nn.Layer): + """ + Single Inverted bottleneck implementation. + Args: + in_channels (int): The number of input channels to bottleneck block. + out_channels (int): The number of output channels of bottleneck block. + expansion (int, optional). The expansion factor in bottleneck. Default: 6. + stride (int, optional). The stride used in depth-wise conv. Defalt: 2. + """ + + def __init__(self, in_channels, out_channels, expansion=6, stride=2): + super().__init__() + + self.use_shortcut = stride == 1 and in_channels == out_channels + + expand_channels = in_channels * expansion + self.block = nn.Sequential( + # pw + layers.ConvBNReLU( + in_channels=in_channels, + out_channels=expand_channels, + kernel_size=1, + bias_attr=False), + # dw + layers.ConvBNReLU( + in_channels=expand_channels, + out_channels=expand_channels, + kernel_size=3, + stride=stride, + padding=1, + groups=expand_channels, + bias_attr=False), + # pw-linear + layers.ConvBN( + in_channels=expand_channels, + out_channels=out_channels, + kernel_size=1, + bias_attr=False)) + + def forward(self, x): + out = self.block(x) + if self.use_shortcut: + out = x + out + return out + + +class FeatureFusionModule(nn.Layer): + """ + Feature Fusion Module Implementation. + This module fuses high-resolution feature and low-resolution feature. + Args: + high_in_channels (int): The channels of high-resolution feature (output of LearningToDownsample). + low_in_channels (int): The channels of low-resolution feature (output of GlobalFeatureExtractor). + out_channels (int): The output channels of this module. + align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature + is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. 
+ """ + + def __init__(self, high_in_channels, low_in_channels, out_channels, + align_corners): + super().__init__() + + # Only depth-wise conv + self.dwconv = layers.ConvBNReLU( + in_channels=low_in_channels, + out_channels=out_channels, + kernel_size=3, + padding=1, + groups=128, + bias_attr=False) + + self.conv_low_res = layers.ConvBN(out_channels, out_channels, 1) + self.conv_high_res = layers.ConvBN(high_in_channels, out_channels, 1) + self.align_corners = align_corners + + def forward(self, high_res_input, low_res_input): + low_res_input = F.interpolate( + low_res_input, + paddle.shape(high_res_input)[2:], + mode='bilinear', + align_corners=self.align_corners) + low_res_input = self.dwconv(low_res_input) + low_res_input = self.conv_low_res(low_res_input) + high_res_input = self.conv_high_res(high_res_input) + x = high_res_input + low_res_input + + return F.relu(x) + + +class Classifier(nn.Layer): + """ + The Classifier module implementation. + This module consists of two depth-wise conv and one conv. + Args: + input_channels (int): The input channels to this module. + num_classes (int): The unique number of target classes. + """ + + def __init__(self, input_channels, num_classes): + super().__init__() + + self.dsconv1 = layers.SeparableConvBNReLU( + in_channels=input_channels, + out_channels=input_channels, + kernel_size=3, + padding=1) + + self.dsconv2 = layers.SeparableConvBNReLU( + in_channels=input_channels, + out_channels=input_channels, + kernel_size=3, + padding=1) + + self.conv = nn.Conv2D( + in_channels=input_channels, out_channels=num_classes, kernel_size=1) + + self.dropout = nn.Dropout(p=0.1) # dropout_prob + + def forward(self, x): + x = self.dsconv1(x) + x = self.dsconv2(x) + x = self.dropout(x) + x = self.conv(x) + return x diff --git a/paddlers/models/ppseg/models/fastfcn.py b/paddlers/models/ppseg/models/fastfcn.py new file mode 100644 index 0000000..0615622 --- /dev/null +++ b/paddlers/models/ppseg/models/fastfcn.py @@ -0,0 +1,240 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.models import layers +from paddlers.models.ppseg.utils import utils + + +@manager.MODELS.add_component +class FastFCN(nn.Layer): + """ + The FastFCN implementation based on PaddlePaddle. + + The original article refers to + Huikai Wu, Junge Zhang, Kaiqi Huang. "FastFCN: Rethinking Dilated Convolution in the Backbone for Semantic Segmentation". + + Args: + num_classes (int): The unique number of target classes. + backbone (Paddle.nn.Layer): A backbone network. + backbone_indices (tuple): The values in the tuple indicate the indices of + output of backbone. + num_codes (int): The number of encoded words. Default: 32. + mid_channels (int): The channels of middle layers. Default: 512. + use_jpu (bool): Whether use jpu module. Default: True. 
+ aux_loss (bool): Whether use auxiliary head loss. Default: True. + use_se_loss (int): Whether use semantic encoding loss. Default: True. + add_lateral (int): Whether use lateral convolution layers. Default: False. + pretrained (str, optional): The path or url of pretrained model. Default: None. + """ + def __init__(self, + num_classes, + backbone, + num_codes=32, + mid_channels=512, + use_jpu=True, + aux_loss=True, + use_se_loss=True, + add_lateral=False, + pretrained=None): + super().__init__() + self.add_lateral = add_lateral + self.num_codes = num_codes + self.backbone = backbone + self.use_jpu = use_jpu + in_channels = self.backbone.feat_channels + + if use_jpu: + self.jpu_layer = layers.JPU(in_channels, mid_channels) + in_channels[-1] = mid_channels * 4 + self.bottleneck = layers.ConvBNReLU( + in_channels[-1], + mid_channels, + 1, + padding=0, + bias_attr=False, + ) + else: + self.bottleneck = layers.ConvBNReLU( + in_channels[-1], + mid_channels, + 3, + padding=1, + bias_attr=False, + ) + if self.add_lateral: + self.lateral_convs = nn.LayerList([ + layers.ConvBNReLU(in_channels[0], + mid_channels, + 1, + bias_attr=False), + layers.ConvBNReLU(in_channels[1], + mid_channels, + 1, + bias_attr=False), + ]) + + self.fusion = layers.ConvBNReLU( + 3 * mid_channels, + mid_channels, + 3, + padding=1, + bias_attr=False, + ) + + self.enc_module = EncModule(mid_channels, num_codes) + self.cls_seg = nn.Conv2D(mid_channels, num_classes, 1) + + self.aux_loss = aux_loss + if self.aux_loss: + self.fcn_head = layers.AuxLayer(in_channels[-2], mid_channels, + num_classes) + + self.use_se_loss = use_se_loss + if use_se_loss: + self.se_layer = nn.Linear(mid_channels, num_classes) + + self.pretrained = pretrained + self.init_weight() + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + def forward(self, inputs): + imsize = paddle.shape(inputs)[2:] + feats = self.backbone(inputs) + if self.use_jpu: + feats = self.jpu_layer(*feats) + + fcn_feat = feats[2] + + feat = self.bottleneck(feats[-1]) + if self.add_lateral: + laterals = [] + for i, lateral_conv in enumerate(self.lateral_convs): + laterals.append( + F.interpolate(lateral_conv(feats[i]), + size=paddle.shape(feat)[2:], + mode='bilinear', + align_corners=False)) + feat = self.fusion(paddle.concat([feat, *laterals], 1)) + encode_feat, feat = self.enc_module(feat) + out = self.cls_seg(feat) + out = F.interpolate(out, + size=imsize, + mode='bilinear', + align_corners=False) + output = [out] + + if self.training: + fcn_out = self.fcn_head(fcn_feat) + fcn_out = F.interpolate(fcn_out, + size=imsize, + mode='bilinear', + align_corners=False) + output.append(fcn_out) + if self.use_se_loss: + se_out = self.se_layer(encode_feat) + output.append(se_out) + return output + return output + + +class Encoding(nn.Layer): + def __init__(self, channels, num_codes): + super().__init__() + self.channels, self.num_codes = channels, num_codes + + std = 1 / ((channels * num_codes)**0.5) + self.codewords = self.create_parameter( + shape=(num_codes, channels), + default_initializer=nn.initializer.Uniform(-std, std), + ) + self.scale = self.create_parameter( + shape=(num_codes, ), + default_initializer=nn.initializer.Uniform(-1, 0), + ) + + def scaled_l2(self, x, codewords, scale): + num_codes, channels = paddle.shape(codewords) + reshaped_scale = scale.reshape([1, 1, num_codes]) + expanded_x = paddle.tile(x.unsqueeze(2), [1, 1, num_codes, 1]) + reshaped_codewords = codewords.reshape([1, 1, num_codes, channels]) + + 
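+        # shapes: x comes in as [N, H*W, C]; expanded_x tiles it to
+        # [N, H*W, K, C] and reshaped_codewords broadcasts as [1, 1, K, C],
+        # so summing the scaled squared distances over the channel axis
+        # yields [N, H*W, K] soft-assignment logits, one per codeword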
scaled_l2_norm = reshaped_scale * ( + expanded_x - reshaped_codewords).pow(2).sum(axis=3) + return scaled_l2_norm + + def aggregate(self, assignment_weights, x, codewords): + num_codes, channels = paddle.shape(codewords) + reshaped_codewords = codewords.reshape([1, 1, num_codes, channels]) + expanded_x = paddle.tile( + x.unsqueeze(2), + [1, 1, num_codes, 1], + ) + encoded_feat = (assignment_weights.unsqueeze(3) * + (expanded_x - reshaped_codewords)).sum(axis=1) + return encoded_feat + + def forward(self, x): + x_dims = x.ndim + assert x_dims == 4, "The dimension of input tensor must equal 4, but got {}.".format( + x_dims) + assert paddle.shape( + x + )[1] == self.channels, "Encoding channels error, excepted {} but got {}.".format( + self.channels, + paddle.shape(x)[1]) + batch_size = paddle.shape(x)[0] + x = x.reshape([batch_size, self.channels, -1]).transpose([0, 2, 1]) + assignment_weights = F.softmax(self.scaled_l2(x, self.codewords, + self.scale), + axis=2) + + encoded_feat = self.aggregate(assignment_weights, x, self.codewords) + encoded_feat = encoded_feat.reshape([batch_size, self.num_codes, -1]) + return encoded_feat + + +class EncModule(nn.Layer): + def __init__(self, in_channels, num_codes): + super().__init__() + self.encoding_project = layers.ConvBNReLU( + in_channels, + in_channels, + 1, + ) + self.encoding = nn.Sequential( + Encoding(channels=in_channels, num_codes=num_codes), + nn.BatchNorm1D(num_codes), + nn.ReLU(), + ) + self.fc = nn.Sequential( + nn.Linear(in_channels, in_channels), + nn.Sigmoid(), + ) + + def forward(self, x): + encoding_projection = self.encoding_project(x) + encoding_feat = self.encoding(encoding_projection).mean(axis=1) + batch_size, channels, _, _ = paddle.shape(x) + gamma = self.fc(encoding_feat) + y = gamma.reshape([batch_size, channels, 1, 1]) + output = F.relu(x + x * y) + return encoding_feat, output diff --git a/paddlers/models/ppseg/models/fcn.py b/paddlers/models/ppseg/models/fcn.py new file mode 100644 index 0000000..19554ca --- /dev/null +++ b/paddlers/models/ppseg/models/fcn.py @@ -0,0 +1,145 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.nn as nn +import paddle.nn.functional as F + +import paddle +from paddlers.models.ppseg import utils +from paddlers.models.ppseg.cvlibs import manager, param_init +from paddlers.models.ppseg.models import layers + + +@manager.MODELS.add_component +class FCN(nn.Layer): + """ + A simple implementation for FCN based on PaddlePaddle. + + The original article refers to + Evan Shelhamer, et, al. "Fully Convolutional Networks for Semantic Segmentation" + (https://arxiv.org/abs/1411.4038). + + Args: + num_classes (int): The unique number of target classes. + backbone (paddle.nn.Layer): Backbone networks. + backbone_indices (tuple, optional): The values in the tuple indicate the indices of output of backbone. + Default: (-1, ). + channels (int, optional): The channels between conv layer and the last layer of FCNHead. 
+            If None, it will be the number of channels of input features. Default: None.
+        align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
+            is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+        bias (bool, optional): Whether to use bias in the conv layers of the head. Default: True.
+        data_format (str, optional): Layout of the input; only 'NCHW' is supported. Default: 'NCHW'.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 backbone,
+                 backbone_indices=(-1, ),
+                 channels=None,
+                 align_corners=False,
+                 pretrained=None,
+                 bias=True,
+                 data_format="NCHW"):
+        super(FCN, self).__init__()
+
+        if data_format != 'NCHW':
+            raise ValueError("FCN only supports the NCHW data format.")
+        self.backbone = backbone
+        backbone_channels = [
+            backbone.feat_channels[i] for i in backbone_indices
+        ]
+
+        self.head = FCNHead(
+            num_classes,
+            backbone_indices,
+            backbone_channels,
+            channels,
+            bias=bias)
+
+        self.align_corners = align_corners
+        self.pretrained = pretrained
+        self.data_format = data_format
+        self.init_weight()
+
+    def forward(self, x):
+        feat_list = self.backbone(x)
+        logit_list = self.head(feat_list)
+        return [
+            F.interpolate(
+                logit,
+                paddle.shape(x)[2:],
+                mode='bilinear',
+                align_corners=self.align_corners) for logit in logit_list
+        ]
+
+    def init_weight(self):
+        if self.pretrained is not None:
+            utils.load_entire_model(self, self.pretrained)
+
+
+class FCNHead(nn.Layer):
+    """
+    A simple implementation for FCNHead based on PaddlePaddle.
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone_indices (tuple, optional): The values in the tuple indicate the indices of output of backbone.
+            Default: (-1, ).
+        backbone_channels (tuple, optional): The channel counts of the backbone outputs selected by
+            ``backbone_indices``. Default: (270, ).
+        channels (int, optional): The channels between conv layer and the last layer of FCNHead.
+            If None, it will be the number of channels of input features. Default: None.
+        bias (bool, optional): Whether to use bias in the conv layers. Default: True.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 backbone_indices=(-1, ),
+                 backbone_channels=(270, ),
+                 channels=None,
+                 bias=True):
+        super(FCNHead, self).__init__()
+
+        self.num_classes = num_classes
+        self.backbone_indices = backbone_indices
+        if channels is None:
+            channels = backbone_channels[0]
+
+        self.conv_1 = layers.ConvBNReLU(
+            in_channels=backbone_channels[0],
+            out_channels=channels,
+            kernel_size=1,
+            stride=1,
+            bias_attr=bias)
+        self.cls = nn.Conv2D(
+            in_channels=channels,
+            out_channels=self.num_classes,
+            kernel_size=1,
+            stride=1,
+            bias_attr=bias)
+        self.init_weight()
+
+    def forward(self, feat_list):
+        logit_list = []
+        x = feat_list[self.backbone_indices[0]]
+        x = self.conv_1(x)
+        logit = self.cls(x)
+        logit_list.append(logit)
+        return logit_list
+
+    def init_weight(self):
+        for layer in self.sublayers():
+            if isinstance(layer, nn.Conv2D):
+                param_init.normal_init(layer.weight, std=0.001)
+            elif isinstance(layer, (nn.BatchNorm, nn.SyncBatchNorm)):
+                param_init.constant_init(layer.weight, value=1.0)
+                param_init.constant_init(layer.bias, value=0.0)
diff --git a/paddlers/models/ppseg/models/gcnet.py b/paddlers/models/ppseg/models/gcnet.py
new file mode 100644
index 0000000..69eaebc
--- /dev/null
+++ b/paddlers/models/ppseg/models/gcnet.py
@@ -0,0 +1,222 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.models import layers +from paddlers.models.ppseg.utils import utils + + +@manager.MODELS.add_component +class GCNet(nn.Layer): + """ + The GCNet implementation based on PaddlePaddle. + + The original article refers to + Cao, Yue, et al. "GCnet: Non-local networks meet squeeze-excitation networks and beyond" + (https://arxiv.org/pdf/1904.11492.pdf). + + Args: + num_classes (int): The unique number of target classes. + backbone (Paddle.nn.Layer): Backbone network, currently support Resnet50/101. + backbone_indices (tuple, optional): Two values in the tuple indicate the indices of output of backbone. + gc_channels (int, optional): The input channels to Global Context Block. Default: 512. + ratio (float, optional): It indicates the ratio of attention channels and gc_channels. Default: 0.25. + enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True. + align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even, + e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. + pretrained (str, optional): The path or url of pretrained model. Default: None. + """ + + def __init__(self, + num_classes, + backbone, + backbone_indices=(2, 3), + gc_channels=512, + ratio=0.25, + enable_auxiliary_loss=True, + align_corners=False, + pretrained=None): + super().__init__() + + self.backbone = backbone + backbone_channels = [ + backbone.feat_channels[i] for i in backbone_indices + ] + + self.head = GCNetHead(num_classes, backbone_indices, backbone_channels, + gc_channels, ratio, enable_auxiliary_loss) + self.align_corners = align_corners + self.pretrained = pretrained + self.init_weight() + + def forward(self, x): + feat_list = self.backbone(x) + logit_list = self.head(feat_list) + return [ + F.interpolate( + logit, + paddle.shape(x)[2:], + mode='bilinear', + align_corners=self.align_corners) for logit in logit_list + ] + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class GCNetHead(nn.Layer): + """ + The GCNetHead implementation. + + Args: + num_classes (int): The unique number of target classes. + backbone_indices (tuple): Two values in the tuple indicate the indices of output of backbone. + The first index will be taken as a deep-supervision feature in auxiliary layer; + the second one will be taken as input of GlobalContextBlock. + backbone_channels (tuple): The same length with "backbone_indices". It indicates the channels of corresponding index. + gc_channels (int): The input channels to Global Context Block. + ratio (float): It indicates the ratio of attention channels and gc_channels. + enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True. 
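+
+    Examples:
+
+        # A minimal construction sketch; the channel numbers below are
+        # illustrative (they match a ResNet-style backbone whose stage-2/3
+        # features have 1024/2048 channels), not required values.
+        head = GCNetHead(
+            num_classes=19,
+            backbone_indices=(2, 3),
+            backbone_channels=(1024, 2048),
+            gc_channels=512,
+            ratio=0.25)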
+ """ + + def __init__(self, + num_classes, + backbone_indices, + backbone_channels, + gc_channels, + ratio, + enable_auxiliary_loss=True): + + super().__init__() + + in_channels = backbone_channels[1] + self.conv_bn_relu1 = layers.ConvBNReLU( + in_channels=in_channels, + out_channels=gc_channels, + kernel_size=3, + padding=1) + + self.gc_block = GlobalContextBlock( + gc_channels=gc_channels, in_channels=gc_channels, ratio=ratio) + + self.conv_bn_relu2 = layers.ConvBNReLU( + in_channels=gc_channels, + out_channels=gc_channels, + kernel_size=3, + padding=1) + + self.conv_bn_relu3 = layers.ConvBNReLU( + in_channels=in_channels + gc_channels, + out_channels=gc_channels, + kernel_size=3, + padding=1) + + self.dropout = nn.Dropout(p=0.1) + + self.conv = nn.Conv2D( + in_channels=gc_channels, out_channels=num_classes, kernel_size=1) + + if enable_auxiliary_loss: + self.auxlayer = layers.AuxLayer( + in_channels=backbone_channels[0], + inter_channels=backbone_channels[0] // 4, + out_channels=num_classes) + + self.backbone_indices = backbone_indices + self.enable_auxiliary_loss = enable_auxiliary_loss + + def forward(self, feat_list): + logit_list = [] + x = feat_list[self.backbone_indices[1]] + + output = self.conv_bn_relu1(x) + output = self.gc_block(output) + output = self.conv_bn_relu2(output) + + output = paddle.concat([x, output], axis=1) + output = self.conv_bn_relu3(output) + + output = self.dropout(output) + logit = self.conv(output) + logit_list.append(logit) + + if self.enable_auxiliary_loss: + low_level_feat = feat_list[self.backbone_indices[0]] + auxiliary_logit = self.auxlayer(low_level_feat) + logit_list.append(auxiliary_logit) + + return logit_list + + +class GlobalContextBlock(nn.Layer): + """ + Global Context Block implementation. + + Args: + in_channels (int): The input channels of Global Context Block. + ratio (float): The channels of attention map. + """ + + def __init__(self, gc_channels, in_channels, ratio): + super().__init__() + self.gc_channels = gc_channels + + self.conv_mask = nn.Conv2D( + in_channels=in_channels, out_channels=1, kernel_size=1) + + self.softmax = nn.Softmax(axis=2) + + inter_channels = int(in_channels * ratio) + self.channel_add_conv = nn.Sequential( + nn.Conv2D( + in_channels=in_channels, + out_channels=inter_channels, + kernel_size=1), + nn.LayerNorm(normalized_shape=[inter_channels, 1, 1]), nn.ReLU(), + nn.Conv2D( + in_channels=inter_channels, + out_channels=in_channels, + kernel_size=1)) + + def global_context_block(self, x): + x_shape = paddle.shape(x) + + # [N, C, H * W] + input_x = paddle.reshape(x, shape=[0, self.gc_channels, -1]) + # [N, 1, C, H * W] + input_x = paddle.unsqueeze(input_x, axis=1) + # [N, 1, H, W] + context_mask = self.conv_mask(x) + # [N, 1, H * W] + context_mask = paddle.reshape(context_mask, shape=[0, 1, -1]) + context_mask = self.softmax(context_mask) + # [N, 1, H * W, 1] + context_mask = paddle.unsqueeze(context_mask, axis=-1) + # [N, 1, C, 1] + context = paddle.matmul(input_x, context_mask) + # [N, C, 1, 1] + context = paddle.reshape(context, shape=[0, self.gc_channels, 1, 1]) + + return context + + def forward(self, x): + context = self.global_context_block(x) + channel_add_term = self.channel_add_conv(context) + out = x + channel_add_term + return out diff --git a/paddlers/models/ppseg/models/ginet.py b/paddlers/models/ppseg/models/ginet.py new file mode 100644 index 0000000..20f7cce --- /dev/null +++ b/paddlers/models/ppseg/models/ginet.py @@ -0,0 +1,291 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+from paddle.nn import functional as F
+
+from paddlers.models.ppseg.utils import utils
+from paddlers.models.ppseg.models import layers
+from paddlers.models.ppseg.cvlibs import manager
+
+
+@manager.MODELS.add_component
+class GINet(nn.Layer):
+    """
+    The GINet implementation based on PaddlePaddle.
+    The original article refers to
+    Wu, Tianyi, Yu Lu, Yu Zhu, Chuang Zhang, Ming Wu, Zhanyu Ma, and Guodong Guo. "GINet: Graph interaction network for scene parsing." In European Conference on Computer Vision, pp. 34-51. Springer, Cham, 2020.
+    (https://arxiv.org/pdf/2009.06160).
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone (paddle.nn.Layer): Backbone network.
+        backbone_indices (tuple, optional): Values in the tuple indicate the indices of output of backbone.
+            Default: [0, 1, 2, 3].
+        enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss.
+            If true, an auxiliary head is attached to the C3 feature. Default: True.
+        align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
+            is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: True.
+        jpu (bool, optional): Whether to use the JPU unit in base_forward. Default: True.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
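+
+    Examples:
+
+        # A minimal usage sketch. It assumes a ResNet50_vd backbone from this
+        # package, whose stage features (256/512/1024/2048 channels) match the
+        # widths expected by the JPU unit and the GIHead.
+        from paddlers.models.ppseg.models.backbones import ResNet50_vd
+
+        model = GINet(num_classes=59, backbone=ResNet50_vd())
+        # logits = model(paddle.rand([1, 3, 520, 520]))  # list of logit maps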
+ """ + + def __init__(self, + num_classes, + backbone, + backbone_indices=[0, 1, 2, 3], + enable_auxiliary_loss=True, + align_corners=True, + jpu=True, + pretrained=None): + super().__init__() + self.nclass = num_classes + self.aux = enable_auxiliary_loss + self.jpu = jpu + + self.backbone = backbone + self.backbone_indices = backbone_indices + self.align_corners = align_corners + + self.jpu = layers.JPU([512, 1024, 2048], width=512) if jpu else None + self.head = GIHead(in_channels=2048, nclass=num_classes) + + if self.aux: + self.auxlayer = layers.AuxLayer( + 1024, 1024 // 4, num_classes, bias_attr=False) + + self.pretrained = pretrained + self.init_weight() + + def base_forward(self, x): + feat_list = self.backbone(x) + + c1, c2, c3, c4 = [feat_list[i] for i in self.backbone_indices] + + if self.jpu: + return self.jpu(c1, c2, c3, c4) + else: + return c1, c2, c3, c4 + + def forward(self, x): + _, _, h, w = paddle.shape(x) + _, _, c3, c4 = self.base_forward(x) + + logit_list = [] + x, _ = self.head(c4) + logit_list.append(x) + + if self.aux: + auxout = self.auxlayer(c3) + + logit_list.append(auxout) + + return [ + F.interpolate( + logit, (h, w), + mode='bilinear', + align_corners=self.align_corners) for logit in logit_list + ] + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class GIHead(nn.Layer): + """The Graph Interaction Network head.""" + + def __init__(self, in_channels, nclass): + super().__init__() + self.nclass = nclass + inter_channels = in_channels // 4 + self.inp = paddle.zeros(shape=(nclass, 300), dtype='float32') + self.inp = paddle.create_parameter( + shape=self.inp.shape, + dtype=str(self.inp.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(self.inp)) + self.inp.stop_gradient = True + + self.fc1 = nn.Sequential( + nn.Linear(300, 128), nn.BatchNorm1D(128), nn.ReLU()) + self.fc2 = nn.Sequential( + nn.Linear(128, 256), nn.BatchNorm1D(256), nn.ReLU()) + self.conv5 = layers.ConvBNReLU( + in_channels, + inter_channels, + 3, + padding=1, + bias_attr=False, + stride=1) + + self.gloru = GlobalReasonUnit( + in_channels=inter_channels, + num_state=256, + num_node=84, + nclass=nclass) + self.conv6 = nn.Sequential( + nn.Dropout(0.1), nn.Conv2D(inter_channels, nclass, 1)) + + def forward(self, x): + + B, C, H, W = paddle.shape(x) + inp = self.inp + + inp = self.fc1(inp) + inp = self.fc2(inp).unsqueeze(axis=0).transpose((0, 2, 1))\ + .expand((B, 256, self.nclass)) + + out = self.conv5(x) + + out, se_out = self.gloru(out, inp) + out = self.conv6(out) + return out, se_out + + +class GlobalReasonUnit(nn.Layer): + """ + The original paper refers to: + Chen, Yunpeng, et al. 
"Graph-Based Global Reasoning Networks" (https://arxiv.org/abs/1811.12814) + """ + + def __init__(self, in_channels, num_state=256, num_node=84, nclass=59): + super().__init__() + self.num_state = num_state + self.conv_theta = nn.Conv2D( + in_channels, num_node, kernel_size=1, stride=1, padding=0) + self.conv_phi = nn.Conv2D( + in_channels, num_state, kernel_size=1, stride=1, padding=0) + self.graph = GraphLayer(num_state, num_node, nclass) + self.extend_dim = nn.Conv2D( + num_state, in_channels, kernel_size=1, bias_attr=False) + + self.bn = layers.SyncBatchNorm(in_channels) + + def forward(self, x, inp): + B = self.conv_theta(x) + sizeB = paddle.shape(B) + B = paddle.flatten(B, 2, 3) + + sizex = paddle.shape(x) + x_reduce = self.conv_phi(x) + + x_reduce = paddle.flatten(x_reduce, 2, 3).transpose((0, 2, 1)) + + V = paddle.bmm(B, x_reduce).transpose((0, 2, 1)) + V = paddle.divide(V, (sizex[2] * sizex[3]).astype('float32')) + + class_node, new_V = self.graph(inp, V) + D = B.transpose((0, 2, 1)) + Y = paddle.bmm(D, new_V.transpose((0, 2, 1))) + Y = Y.transpose((0, 2, 1)).reshape((sizex[0], self.num_state, \ + sizex[2], -1)) + Y = self.extend_dim(Y) + Y = self.bn(Y) + out = Y + x + + return out, class_node + + +class GraphLayer(nn.Layer): + def __init__(self, num_state, num_node, num_class): + super().__init__() + self.vis_gcn = GCN(num_state, num_node) + self.word_gcn = GCN(num_state, num_class) + self.transfer = GraphTransfer(num_state) + self.gamma_vis = paddle.zeros([num_node]) + self.gamma_word = paddle.zeros([num_class]) + self.gamma_vis = paddle.create_parameter( + shape=paddle.shape(self.gamma_vis), + dtype=str(self.gamma_vis.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(self.gamma_vis)) + self.gamma_word = paddle.create_parameter( + shape=paddle.shape(self.gamma_word), + dtype=str(self.gamma_word.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(self.gamma_word)) + + def forward(self, inp, vis_node): + inp = self.word_gcn(inp) + new_V = self.vis_gcn(vis_node) + class_node, vis_node = self.transfer(inp, new_V) + + class_node = self.gamma_word * inp + class_node + new_V = self.gamma_vis * vis_node + new_V + return class_node, new_V + + +class GCN(nn.Layer): + def __init__(self, num_state=128, num_node=64, bias=False): + super().__init__() + self.conv1 = nn.Conv1D( + num_node, + num_node, + kernel_size=1, + padding=0, + stride=1, + groups=1, + ) + self.relu = nn.ReLU() + self.conv2 = nn.Conv1D( + num_state, + num_state, + kernel_size=1, + padding=0, + stride=1, + groups=1, + bias_attr=bias) + + def forward(self, x): + h = self.conv1(x.transpose((0, 2, 1))).transpose((0, 2, 1)) + h = h + x + h = self.relu(h) + h = self.conv2(h) + return h + + +class GraphTransfer(nn.Layer): + """Transfer vis graph to class node, transfer class node to vis feature""" + + def __init__(self, in_dim): + super().__init__() + self.channle_in = in_dim + self.query_conv = nn.Conv1D( + in_channels=in_dim, out_channels=in_dim // 2, kernel_size=1) + self.key_conv = nn.Conv1D( + in_channels=in_dim, out_channels=in_dim // 2, kernel_size=1) + self.value_conv_vis = nn.Conv1D( + in_channels=in_dim, out_channels=in_dim, kernel_size=1) + self.value_conv_word = nn.Conv1D( + in_channels=in_dim, out_channels=in_dim, kernel_size=1) + self.softmax_vis = nn.Softmax(axis=-1) + self.softmax_word = nn.Softmax(axis=-2) + + def forward(self, word, vis_node): + m_batchsize, C, Nc = paddle.shape(word) + m_batchsize, C, Nn = paddle.shape(vis_node) + + proj_query = 
self.query_conv(word).reshape((m_batchsize, -1, Nc))\
+            .transpose((0, 2, 1))
+        proj_key = self.key_conv(vis_node).reshape((m_batchsize, -1, Nn))
+
+        energy = paddle.bmm(proj_query, proj_key)
+        attention_vis = self.softmax_vis(energy).transpose((0, 2, 1))
+        attention_word = self.softmax_word(energy)
+
+        proj_value_vis = self.value_conv_vis(vis_node).reshape((m_batchsize, -1,
+                                                                Nn))
+        proj_value_word = self.value_conv_word(word).reshape((m_batchsize, -1,
+                                                              Nc))
+
+        class_out = paddle.bmm(proj_value_vis, attention_vis)
+        node_out = paddle.bmm(proj_value_word, attention_word)
+        return class_out, node_out
diff --git a/paddlers/models/ppseg/models/gscnn.py b/paddlers/models/ppseg/models/gscnn.py
new file mode 100644
index 0000000..cecb5c4
--- /dev/null
+++ b/paddlers/models/ppseg/models/gscnn.py
@@ -0,0 +1,353 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import cv2
+import numpy as np
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.models import layers
+from paddlers.models.ppseg.models.backbones import resnet_vd
+from paddlers.models.ppseg.models import deeplab
+from paddlers.models.ppseg.utils import utils
+
+
+@manager.MODELS.add_component
+class GSCNN(nn.Layer):
+    """
+    The GSCNN implementation based on PaddlePaddle.
+    The original article refers to
+    Towaki Takikawa, et al. "Gated-SCNN: Gated Shape CNNs for Semantic Segmentation"
+    (https://arxiv.org/pdf/1907.05740.pdf)
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone (paddle.nn.Layer): Backbone network, currently supports Resnet50_vd/Resnet101_vd.
+        backbone_indices (tuple, optional): Four values in the tuple indicate the indices of output of backbone.
+            Default: (0, 1, 2, 3).
+        aspp_ratios (tuple, optional): The dilation rates used in the ASPP module.
+            If output_stride=16, aspp_ratios should be set as (1, 6, 12, 18).
+            If output_stride=8, aspp_ratios should be set as (1, 12, 24, 36).
+            Default: (1, 6, 12, 18).
+        aspp_out_channels (int, optional): The output channels of ASPP module. Default: 256.
+        align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
+            e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
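+
+    Examples:
+
+        # A minimal usage sketch. It assumes a ResNet50_vd backbone from this
+        # package; GSCNN additionally reads backbone.conv1_logit in forward,
+        # which the resnet_vd backbones provide.
+        from paddlers.models.ppseg.models.backbones import ResNet50_vd
+
+        model = GSCNN(num_classes=19, backbone=ResNet50_vd(output_stride=8))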
+ """ + + def __init__(self, + num_classes, + backbone, + backbone_indices=(0, 1, 2, 3), + aspp_ratios=(1, 6, 12, 18), + aspp_out_channels=256, + align_corners=False, + pretrained=None): + super().__init__() + self.backbone = backbone + backbone_channels = self.backbone.feat_channels + self.head = GSCNNHead(num_classes, backbone_indices, backbone_channels, + aspp_ratios, aspp_out_channels, align_corners) + self.align_corners = align_corners + self.pretrained = pretrained + self.init_weight() + + def forward(self, x): + feat_list = self.backbone(x) + logit_list = self.head(x, feat_list, self.backbone.conv1_logit) + seg_logit, edge_logit = [ + F.interpolate( + logit, + x.shape[2:], + mode='bilinear', + align_corners=self.align_corners) for logit in logit_list + ] + return [seg_logit, (seg_logit, edge_logit), edge_logit, seg_logit] + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class GSCNNHead(nn.Layer): + """ + The GSCNNHead implementation based on PaddlePaddle. + Args: + num_classes (int): The unique number of target classes. + backbone_indices (tuple): Two values in the tuple indicate the indices of output of backbone. + the first index will be taken as a low-level feature in Decoder component; + the last one will be taken as input of ASPP component; the second to fourth + will be taken as input for GCL component. + Usually backbone consists of four downsampling stage, and return an output of + each stage. If we set it as (0, 1, 2, 3), it means taking feature map of the first + stage in backbone as low-level feature used in Decoder, feature map of the fourth + stage as input of ASPP, and the feature map of the second to fourth stage as input of GCL. + backbone_channels (tuple): The channels of output of backbone. + aspp_ratios (tuple): The dilation rates using in ASSP module. + aspp_out_channels (int): The output channels of ASPP module. + align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature + is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. 
+ """ + + def __init__(self, num_classes, backbone_indices, backbone_channels, + aspp_ratios, aspp_out_channels, align_corners): + super().__init__() + self.backbone_indices = backbone_indices + self.align_corners = align_corners + + self.dsn1 = nn.Conv2D( + backbone_channels[backbone_indices[1]], 1, kernel_size=1) + self.dsn2 = nn.Conv2D( + backbone_channels[backbone_indices[2]], 1, kernel_size=1) + self.dsn3 = nn.Conv2D( + backbone_channels[backbone_indices[3]], 1, kernel_size=1) + + self.res1 = resnet_vd.BasicBlock(64, 64, stride=1) + self.d1 = nn.Conv2D(64, 32, kernel_size=1) + self.gate1 = GatedSpatailConv2d(32, 32) + self.res2 = resnet_vd.BasicBlock(32, 32, stride=1) + self.d2 = nn.Conv2D(32, 16, kernel_size=1) + self.gate2 = GatedSpatailConv2d(16, 16) + self.res3 = resnet_vd.BasicBlock(16, 16, stride=1) + self.d3 = nn.Conv2D(16, 8, kernel_size=1) + self.gate3 = GatedSpatailConv2d(8, 8) + self.fuse = nn.Conv2D(8, 1, kernel_size=1, bias_attr=False) + + self.cw = nn.Conv2D(2, 1, kernel_size=1, bias_attr=False) + + self.aspp = ASPPModule( + aspp_ratios=aspp_ratios, + in_channels=backbone_channels[-1], + out_channels=aspp_out_channels, + align_corners=self.align_corners, + image_pooling=True) + + self.decoder = deeplab.Decoder( + num_classes=num_classes, + in_channels=backbone_channels[0], + align_corners=self.align_corners) + + def forward(self, x, feat_list, s_input): + input_shape = paddle.shape(x) + m1f = F.interpolate( + s_input, + input_shape[2:], + mode='bilinear', + align_corners=self.align_corners) + + l1, l2, l3 = [ + feat_list[self.backbone_indices[i]] + for i in range(1, len(self.backbone_indices)) + ] + s1 = F.interpolate( + self.dsn1(l1), + input_shape[2:], + mode='bilinear', + align_corners=self.align_corners) + s2 = F.interpolate( + self.dsn2(l2), + input_shape[2:], + mode='bilinear', + align_corners=self.align_corners) + s3 = F.interpolate( + self.dsn3(l3), + input_shape[2:], + mode='bilinear', + align_corners=self.align_corners) + + # Get image gradient + im_arr = x.numpy().transpose((0, 2, 3, 1)) + im_arr = ((im_arr * 0.5 + 0.5) * 255).astype(np.uint8) + canny = np.zeros((input_shape[0], 1, input_shape[2], input_shape[3])) + for i in range(input_shape[0]): + canny[i] = cv2.Canny(im_arr[i], 10, 100) + canny = canny / 255 + canny = paddle.to_tensor(canny).astype('float32') + canny.stop_gradient = True + + cs = self.res1(m1f) + cs = F.interpolate( + cs, + input_shape[2:], + mode='bilinear', + align_corners=self.align_corners) + cs = self.d1(cs) + cs = self.gate1(cs, s1) + + cs = self.res2(cs) + cs = F.interpolate( + cs, + input_shape[2:], + mode='bilinear', + align_corners=self.align_corners) + cs = self.d2(cs) + cs = self.gate2(cs, s2) + + cs = self.res3(cs) + cs = F.interpolate( + cs, + input_shape[2:], + mode='bilinear', + align_corners=self.align_corners) + cs = self.d3(cs) + cs = self.gate3(cs, s3) + + cs = self.fuse(cs) + cs = F.interpolate( + cs, + input_shape[2:], + mode='bilinear', + align_corners=self.align_corners) + edge_out = F.sigmoid(cs) # Ouput of shape stream + + cat = paddle.concat([edge_out, canny], axis=1) + acts = self.cw(cat) + acts = F.sigmoid(acts) # Input of fusion module + + x = self.aspp(l3, acts) + + low_level_feat = feat_list[self.backbone_indices[0]] + logit = self.decoder(x, low_level_feat) + logit_list = [logit, edge_out] + return logit_list + + +class GatedSpatailConv2d(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size=1, + stride=1, + padding=0, + dilation=1, + groups=1, + bias_attr=False): + super().__init__() 
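+        # The gate below maps concat(input features, gating signal) to a
+        # single-channel attention map in (0, 1) via a sigmoid; forward()
+        # then scales the input by (alphas + 1), so the shape stream
+        # re-weights features rather than hard-masking them before the
+        # final convolution.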
+ self._gate_conv = nn.Sequential( + layers.SyncBatchNorm(in_channels + 1), + nn.Conv2D(in_channels + 1, in_channels + 1, kernel_size=1), + nn.ReLU(), nn.Conv2D(in_channels + 1, 1, kernel_size=1), + layers.SyncBatchNorm(1), nn.Sigmoid()) + self.conv = nn.Conv2D( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias_attr=bias_attr) + + def forward(self, input_features, gating_features): + cat = paddle.concat([input_features, gating_features], axis=1) + alphas = self._gate_conv(cat) + x = input_features * (alphas + 1) + x = self.conv(x) + return x + + +class ASPPModule(nn.Layer): + """ + Atrous Spatial Pyramid Pooling. + Args: + aspp_ratios (tuple): The dilation rate using in ASSP module. + in_channels (int): The number of input channels. + out_channels (int): The number of output channels. + align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature + is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. + use_sep_conv (bool, optional): If using separable conv in ASPP module. Default: False. + image_pooling (bool, optional): If augmented with image-level features. Default: False + """ + + def __init__(self, + aspp_ratios, + in_channels, + out_channels, + align_corners, + use_sep_conv=False, + image_pooling=False): + super().__init__() + + self.align_corners = align_corners + self.aspp_blocks = nn.LayerList() + + for ratio in aspp_ratios: + if use_sep_conv and ratio > 1: + conv_func = layers.SeparableConvBNReLU + else: + conv_func = layers.ConvBNReLU + + block = conv_func( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1 if ratio == 1 else 3, + dilation=ratio, + padding=0 if ratio == 1 else ratio) + self.aspp_blocks.append(block) + + out_size = len(self.aspp_blocks) + + if image_pooling: + self.global_avg_pool = nn.Sequential( + nn.AdaptiveAvgPool2D(output_size=(1, 1)), + layers.ConvBNReLU( + in_channels, out_channels, kernel_size=1, bias_attr=False)) + out_size += 1 + self.image_pooling = image_pooling + + self.edge_conv = layers.ConvBNReLU( + 1, out_channels, kernel_size=1, bias_attr=False) + out_size += 1 + + self.conv_bn_relu = layers.ConvBNReLU( + in_channels=out_channels * out_size, + out_channels=out_channels, + kernel_size=1) + + self.dropout = nn.Dropout(p=0.1) # drop rate + + def forward(self, x, edge): + outputs = [] + x_shape = paddle.shape(x) + for block in self.aspp_blocks: + y = block(x) + y = F.interpolate( + y, + x_shape[2:], + mode='bilinear', + align_corners=self.align_corners) + outputs.append(y) + + if self.image_pooling: + img_avg = self.global_avg_pool(x) + img_avg = F.interpolate( + img_avg, + x_shape[2:], + mode='bilinear', + align_corners=self.align_corners) + outputs.append(img_avg) + + edge_features = F.interpolate( + edge, + size=x_shape[2:], + mode='bilinear', + align_corners=self.align_corners) + edge_features = self.edge_conv(edge_features) + outputs.append(edge_features) + + x = paddle.concat(outputs, axis=1) + x = self.conv_bn_relu(x) + x = self.dropout(x) + return x diff --git a/paddlers/models/ppseg/models/hardnet.py b/paddlers/models/ppseg/models/hardnet.py new file mode 100644 index 0000000..a192ecb --- /dev/null +++ b/paddlers/models/ppseg/models/hardnet.py @@ -0,0 +1,308 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.models import layers +from paddlers.models.ppseg.utils import utils + + +@manager.MODELS.add_component +class HarDNet(nn.Layer): + """ + [Real Time] The FC-HardDNet 70 implementation based on PaddlePaddle. + The original article refers to + Chao, Ping, et al. "HarDNet: A Low Memory Traffic Network" + (https://arxiv.org/pdf/1909.00948.pdf) + + Args: + num_classes (int): The unique number of target classes. + stem_channels (tuple|list, optional): The number of channels before the encoder. Default: (16, 24, 32, 48). + ch_list (tuple|list, optional): The number of channels at each block in the encoder. Default: (64, 96, 160, 224, 320). + grmul (float, optional): The channel multiplying factor in HarDBlock, which is m in the paper. Default: 1.7. + gr (tuple|list, optional): The growth rate in each HarDBlock, which is k in the paper. Default: (10, 16, 18, 24, 32). + n_layers (tuple|list, optional): The number of layers in each HarDBlock. Default: (4, 4, 8, 8, 8). + align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature + is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. + pretrained (str, optional): The path or url of pretrained model. Default: None. 
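+
+    Examples:
+
+        # A minimal usage sketch: FC-HarDNet-70 needs no separate backbone,
+        # so the default configuration only requires the class count.
+        model = HarDNet(num_classes=19)
+        # logits = model(paddle.rand([1, 3, 1024, 512]))[0]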
+ """ + + def __init__(self, + num_classes, + stem_channels=(16, 24, 32, 48), + ch_list=(64, 96, 160, 224, 320), + grmul=1.7, + gr=(10, 16, 18, 24, 32), + n_layers=(4, 4, 8, 8, 8), + align_corners=False, + pretrained=None): + + super().__init__() + self.align_corners = align_corners + self.pretrained = pretrained + encoder_blks_num = len(n_layers) + decoder_blks_num = encoder_blks_num - 1 + encoder_in_channels = stem_channels[3] + + self.stem = nn.Sequential( + layers.ConvBNReLU( + 3, stem_channels[0], kernel_size=3, bias_attr=False), + layers.ConvBNReLU( + stem_channels[0], + stem_channels[1], + kernel_size=3, + bias_attr=False), + layers.ConvBNReLU( + stem_channels[1], + stem_channels[2], + kernel_size=3, + stride=2, + bias_attr=False), + layers.ConvBNReLU( + stem_channels[2], + stem_channels[3], + kernel_size=3, + bias_attr=False)) + + self.encoder = Encoder(encoder_blks_num, encoder_in_channels, ch_list, + gr, grmul, n_layers) + + skip_connection_channels = self.encoder.get_skip_channels() + decoder_in_channels = self.encoder.get_out_channels() + + self.decoder = Decoder(decoder_blks_num, decoder_in_channels, + skip_connection_channels, gr, grmul, n_layers, + align_corners) + + self.cls_head = nn.Conv2D( + in_channels=self.decoder.get_out_channels(), + out_channels=num_classes, + kernel_size=1) + + self.init_weight() + + def forward(self, x): + input_shape = paddle.shape(x)[2:] + x = self.stem(x) + x, skip_connections = self.encoder(x) + x = self.decoder(x, skip_connections) + logit = self.cls_head(x) + logit = F.interpolate( + logit, + size=input_shape, + mode="bilinear", + align_corners=self.align_corners) + return [logit] + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class Encoder(nn.Layer): + """The Encoder implementation of FC-HardDNet 70. + + Args: + n_blocks (int): The number of blocks in the Encoder module. + in_channels (int): The number of input channels. + ch_list (tuple|list): The number of channels at each block in the encoder. + grmul (float): The channel multiplying factor in HarDBlock, which is m in the paper. + gr (tuple|list): The growth rate in each HarDBlock, which is k in the paper. + n_layers (tuple|list): The number of layers in each HarDBlock. + """ + + def __init__(self, n_blocks, in_channels, ch_list, gr, grmul, n_layers): + super().__init__() + self.skip_connection_channels = [] + self.shortcut_layers = [] + self.blks = nn.LayerList() + ch = in_channels + for i in range(n_blocks): + blk = HarDBlock(ch, gr[i], grmul, n_layers[i]) + ch = blk.get_out_ch() + self.skip_connection_channels.append(ch) + self.blks.append(blk) + if i < n_blocks - 1: + self.shortcut_layers.append(len(self.blks) - 1) + self.blks.append( + layers.ConvBNReLU( + ch, ch_list[i], kernel_size=1, bias_attr=False)) + + ch = ch_list[i] + if i < n_blocks - 1: + self.blks.append(nn.AvgPool2D(kernel_size=2, stride=2)) + self.out_channels = ch + + def forward(self, x): + skip_connections = [] + for i in range(len(self.blks)): + x = self.blks[i](x) + if i in self.shortcut_layers: + skip_connections.append(x) + return x, skip_connections + + def get_skip_channels(self): + return self.skip_connection_channels + + def get_out_channels(self): + return self.out_channels + + +class Decoder(nn.Layer): + """The Decoder implementation of FC-HardDNet 70. + + Args: + n_blocks (int): The number of blocks in the Encoder module. + in_channels (int): The number of input channels. 
+ skip_connection_channels (tuple|list): The channels of shortcut layers in encoder. + grmul (float): The channel multiplying factor in HarDBlock, which is m in the paper. + gr (tuple|list): The growth rate in each HarDBlock, which is k in the paper. + n_layers (tuple|list): The number of layers in each HarDBlock. + """ + + def __init__(self, + n_blocks, + in_channels, + skip_connection_channels, + gr, + grmul, + n_layers, + align_corners=False): + super().__init__() + prev_block_channels = in_channels + self.n_blocks = n_blocks + self.dense_blocks_up = nn.LayerList() + self.conv1x1_up = nn.LayerList() + + for i in range(n_blocks - 1, -1, -1): + cur_channels_count = prev_block_channels + skip_connection_channels[ + i] + conv1x1 = layers.ConvBNReLU( + cur_channels_count, + cur_channels_count // 2, + kernel_size=1, + bias_attr=False) + blk = HarDBlock( + base_channels=cur_channels_count // 2, + growth_rate=gr[i], + grmul=grmul, + n_layers=n_layers[i]) + + self.conv1x1_up.append(conv1x1) + self.dense_blocks_up.append(blk) + + prev_block_channels = blk.get_out_ch() + + self.out_channels = prev_block_channels + self.align_corners = align_corners + + def forward(self, x, skip_connections): + for i in range(self.n_blocks): + skip = skip_connections.pop() + x = F.interpolate( + x, + size=paddle.shape(skip)[2:], + mode="bilinear", + align_corners=self.align_corners) + x = paddle.concat([x, skip], axis=1) + x = self.conv1x1_up[i](x) + x = self.dense_blocks_up[i](x) + return x + + def get_out_channels(self): + return self.out_channels + + +class HarDBlock(nn.Layer): + """The HarDBlock implementation + + Args: + base_channels (int): The base channels. + growth_rate (tuple|list): The growth rate. + grmul (float): The channel multiplying factor. + n_layers (tuple|list): The number of layers. + keepBase (bool, optional): A bool value indicates whether concatenating the first layer. Default: False. 
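+
+    Examples:
+
+        # A minimal sketch. The link pattern and output width are derived
+        # internally via get_link(); the numbers below are illustrative.
+        blk = HarDBlock(base_channels=64, growth_rate=16, grmul=1.7, n_layers=4)
+        print(blk.get_out_ch())  # channels produced by forward()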
+ """ + + def __init__(self, + base_channels, + growth_rate, + grmul, + n_layers, + keepBase=False): + super().__init__() + self.keepBase = keepBase + self.links = [] + layers_ = [] + self.out_channels = 0 + for i in range(n_layers): + outch, inch, link = get_link(i + 1, base_channels, growth_rate, + grmul) + + self.links.append(link) + layers_.append( + layers.ConvBNReLU(inch, outch, kernel_size=3, bias_attr=False)) + if (i % 2 == 0) or (i == n_layers - 1): + self.out_channels += outch + self.layers = nn.LayerList(layers_) + + def forward(self, x): + layers_ = [x] + for layer in range(len(self.layers)): + link = self.links[layer] + tin = [] + for i in link: + tin.append(layers_[i]) + if len(tin) > 1: + x = paddle.concat(tin, axis=1) + else: + x = tin[0] + out = self.layers[layer](x) + layers_.append(out) + + t = len(layers_) + out_ = [] + for i in range(t): + if (i == 0 and self.keepBase) or \ + (i == t - 1) or (i % 2 == 1): + out_.append(layers_[i]) + out = paddle.concat(out_, 1) + + return out + + def get_out_ch(self): + return self.out_channels + + +def get_link(layer, base_ch, growth_rate, grmul): + if layer == 0: + return base_ch, 0, [] + out_channels = growth_rate + link = [] + for i in range(10): + dv = 2**i + if layer % dv == 0: + k = layer - dv + link.insert(0, k) + if i > 0: + out_channels *= grmul + out_channels = int(int(out_channels + 1) / 2) * 2 + in_channels = 0 + for i in link: + ch, _, _ = get_link(i, base_ch, growth_rate, grmul) + in_channels += ch + return out_channels, in_channels, link diff --git a/paddlers/models/ppseg/models/hrnet_contrast.py b/paddlers/models/ppseg/models/hrnet_contrast.py new file mode 100644 index 0000000..9aced77 --- /dev/null +++ b/paddlers/models/ppseg/models/hrnet_contrast.py @@ -0,0 +1,127 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.models import layers +from paddlers.models.ppseg.utils import utils + + +@manager.MODELS.add_component +class HRNetW48Contrast(nn.Layer): + """ + The HRNetW48Contrast implementation based on PaddlePaddle. + + The original article refers to + Wenguan Wang, Tianfei Zhou, et al. "Exploring Cross-Image Pixel Contrast for Semantic Segmentation" + (https://arxiv.org/abs/2101.11939). + + Args: + in_channels (int): The output dimensions of backbone. + num_classes (int): The unique number of target classes. + backbone (Paddle.nn.Layer): Backbone network, currently support HRNet_W48. + drop_prob (float): The probability of dropout. + proj_dim (int): The projection dimensions. + align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even, + e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. + pretrained (str, optional): The path or url of pretrained model. Default: None. 
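+
+    Examples:
+
+        # A minimal usage sketch. It assumes the HRNet_W48 backbone from this
+        # package; in_channels=720 is the concatenated width of the four
+        # HRNet-W48 branches (48 + 96 + 192 + 384).
+        from paddlers.models.ppseg.models.backbones import HRNet_W48
+
+        model = HRNetW48Contrast(
+            in_channels=720,
+            num_classes=19,
+            backbone=HRNet_W48(),
+            drop_prob=0.1,
+            proj_dim=256)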
+ """ + + def __init__(self, + in_channels, + num_classes, + backbone, + drop_prob, + proj_dim, + align_corners=False, + pretrained=None): + super().__init__() + self.in_channels = in_channels + self.backbone = backbone + self.num_classes = num_classes + self.proj_dim = proj_dim + self.align_corners = align_corners + + self.cls_head = nn.Sequential( + layers.ConvBNReLU( + in_channels, in_channels, kernel_size=3, stride=1, padding=1), + nn.Dropout2D(drop_prob), + nn.Conv2D( + in_channels, + num_classes, + kernel_size=1, + stride=1, + bias_attr=False), + ) + self.proj_head = ProjectionHead( + dim_in=in_channels, proj_dim=self.proj_dim) + + self.pretrained = pretrained + self.init_weight() + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + def forward(self, x): + feats = self.backbone(x)[0] + out = self.cls_head(feats) + logit_list = [] + if self.training: + emb = self.proj_head(feats) + logit_list.append( + F.interpolate( + out, + paddle.shape(x)[2:], + mode='bilinear', + align_corners=self.align_corners)) + logit_list.append({'seg': out, 'embed': emb}) + else: + logit_list.append( + F.interpolate( + out, + paddle.shape(x)[2:], + mode='bilinear', + align_corners=self.align_corners)) + return logit_list + + +class ProjectionHead(nn.Layer): + """ + The projection head used by contrast learning. + Args: + dim_in (int): The dimensions of input features. + proj_dim (int, optional): The output dimensions of projection head. Default: 256. + proj (str, optional): The type of projection head, only support 'linear' and 'convmlp'. Default: 'convmlp'. + """ + + def __init__(self, dim_in, proj_dim=256, proj='convmlp'): + super(ProjectionHead, self).__init__() + if proj == 'linear': + self.proj = nn.Conv2D(dim_in, proj_dim, kernel_size=1) + elif proj == 'convmlp': + self.proj = nn.Sequential( + layers.ConvBNReLU(dim_in, dim_in, kernel_size=1), + nn.Conv2D(dim_in, proj_dim, kernel_size=1), + ) + else: + raise ValueError( + "The type of project head only support 'linear' and 'convmlp', but got {}." + .format(proj)) + + def forward(self, x): + return F.normalize(self.proj(x), p=2, axis=1) diff --git a/paddlers/models/ppseg/models/isanet.py b/paddlers/models/ppseg/models/isanet.py new file mode 100644 index 0000000..b2f399a --- /dev/null +++ b/paddlers/models/ppseg/models/isanet.py @@ -0,0 +1,197 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlers.models.ppseg.models import layers +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.utils import utils + + +@manager.MODELS.add_component +class ISANet(nn.Layer): + """Interlaced Sparse Self-Attention for Semantic Segmentation. + + The original article refers to Lang Huang, et al. "Interlaced Sparse Self-Attention for Semantic Segmentation" + (https://arxiv.org/abs/1907.12273). 
+ + Args: + num_classes (int): The unique number of target classes. + backbone (Paddle.nn.Layer): A backbone network. + backbone_indices (tuple): The values in the tuple indicate the indices of output of backbone. + isa_channels (int): The channels of ISA Module. + down_factor (tuple): Divide the height and width dimension to (Ph, PW) groups. + enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True. + align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature + is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. + pretrained (str, optional): The path or url of pretrained model. Default: None. + + """ + + def __init__(self, + num_classes, + backbone, + backbone_indices=(2, 3), + isa_channels=256, + down_factor=(8, 8), + enable_auxiliary_loss=True, + align_corners=False, + pretrained=None): + super().__init__() + + self.backbone = backbone + self.backbone_indices = backbone_indices + in_channels = [self.backbone.feat_channels[i] for i in backbone_indices] + self.head = ISAHead(num_classes, in_channels, isa_channels, down_factor, + enable_auxiliary_loss) + self.align_corners = align_corners + self.pretrained = pretrained + self.init_weight() + + def forward(self, x): + feats = self.backbone(x) + feats = [feats[i] for i in self.backbone_indices] + logit_list = self.head(feats) + logit_list = [ + F.interpolate( + logit, + paddle.shape(x)[2:], + mode='bilinear', + align_corners=self.align_corners, + align_mode=1) for logit in logit_list + ] + + return logit_list + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class ISAHead(nn.Layer): + """ + The ISAHead. + + Args: + num_classes (int): The unique number of target classes. + in_channels (tuple): The number of input channels. + isa_channels (int): The channels of ISA Module. + down_factor (tuple): Divide the height and width dimension to (Ph, PW) groups. + enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True. 
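+
+    Examples:
+
+        # A minimal construction sketch; the channel numbers are illustrative
+        # and match a ResNet-style backbone (C3 = 1024, C4 = 2048 channels,
+        # the C3 width the auxiliary branch expects).
+        head = ISAHead(
+            num_classes=19,
+            in_channels=(1024, 2048),
+            isa_channels=256,
+            down_factor=(8, 8),
+            enable_auxiliary_loss=True)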
+ """ + + def __init__(self, num_classes, in_channels, isa_channels, down_factor, + enable_auxiliary_loss): + super(ISAHead, self).__init__() + self.in_channels = in_channels[-1] + inter_channels = self.in_channels // 4 + self.inter_channels = inter_channels + self.down_factor = down_factor + self.enable_auxiliary_loss = enable_auxiliary_loss + self.in_conv = layers.ConvBNReLU( + self.in_channels, inter_channels, 3, bias_attr=False) + self.global_relation = SelfAttentionBlock(inter_channels, isa_channels) + self.local_relation = SelfAttentionBlock(inter_channels, isa_channels) + self.out_conv = layers.ConvBNReLU( + inter_channels * 2, inter_channels, 1, bias_attr=False) + self.cls = nn.Sequential( + nn.Dropout2D(p=0.1), nn.Conv2D(inter_channels, num_classes, 1)) + self.aux = nn.Sequential( + layers.ConvBNReLU( + in_channels=1024, + out_channels=256, + kernel_size=3, + bias_attr=False), nn.Dropout2D(p=0.1), + nn.Conv2D(256, num_classes, 1)) + + def forward(self, feat_list): + C3, C4 = feat_list + x = self.in_conv(C4) + x_shape = paddle.shape(x) + P_h, P_w = self.down_factor + Q_h, Q_w = paddle.ceil(x_shape[2] / P_h).astype('int32'), paddle.ceil( + x_shape[3] / P_w).astype('int32') + pad_h, pad_w = (Q_h * P_h - x_shape[2]).astype('int32'), ( + Q_w * P_w - x_shape[3]).astype('int32') + if pad_h > 0 or pad_w > 0: + padding = paddle.concat([ + pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2 + ], + axis=0) + feat = F.pad(x, padding) + else: + feat = x + + feat = feat.reshape([0, x_shape[1], Q_h, P_h, Q_w, P_w]) + feat = feat.transpose([0, 3, 5, 1, 2, + 4]).reshape([-1, self.inter_channels, Q_h, Q_w]) + feat = self.global_relation(feat) + + feat = feat.reshape([x_shape[0], P_h, P_w, x_shape[1], Q_h, Q_w]) + feat = feat.transpose([0, 4, 5, 3, 1, + 2]).reshape([-1, self.inter_channels, P_h, P_w]) + feat = self.local_relation(feat) + + feat = feat.reshape([x_shape[0], Q_h, Q_w, x_shape[1], P_h, P_w]) + feat = feat.transpose([0, 3, 1, 4, 2, 5]).reshape( + [0, self.inter_channels, P_h * Q_h, P_w * Q_w]) + if pad_h > 0 or pad_w > 0: + feat = paddle.slice( + feat, + axes=[2, 3], + starts=[pad_h // 2, pad_w // 2], + ends=[pad_h // 2 + x_shape[2], pad_w // 2 + x_shape[3]]) + + feat = self.out_conv(paddle.concat([feat, x], axis=1)) + output = self.cls(feat) + + if self.enable_auxiliary_loss: + auxout = self.aux(C3) + return [output, auxout] + else: + return [output] + + +class SelfAttentionBlock(layers.AttentionBlock): + """General self-attention block/non-local block. + + Args: + in_channels (int): Input channels of key/query feature. + channels (int): Output channels of key/query transform. 
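+
+    Examples:
+
+        # A minimal sketch: the block attends a feature map to itself and
+        # returns a tensor of the same shape. Sizes are illustrative.
+        blk = SelfAttentionBlock(in_channels=512, channels=256)
+        # out = blk(paddle.rand([1, 512, 32, 32]))  # same shape as input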
+ """ + + def __init__(self, in_channels, channels): + super(SelfAttentionBlock, self).__init__( + key_in_channels=in_channels, + query_in_channels=in_channels, + channels=channels, + out_channels=in_channels, + share_key_query=False, + query_downsample=None, + key_downsample=None, + key_query_num_convs=2, + key_query_norm=True, + value_out_num_convs=1, + value_out_norm=False, + matmul_norm=True, + with_out=False) + + self.output_project = self.build_project( + in_channels, in_channels, num_convs=1, use_conv_module=True) + + def forward(self, x): + context = super(SelfAttentionBlock, self).forward(x, x) + return self.output_project(context) diff --git a/paddlers/models/ppseg/models/layers/__init__.py b/paddlers/models/ppseg/models/layers/__init__.py new file mode 100644 index 0000000..f66ed44 --- /dev/null +++ b/paddlers/models/ppseg/models/layers/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .layer_libs import ConvBNReLU, ConvBN, SeparableConvBNReLU, DepthwiseConvBN, AuxLayer, SyncBatchNorm, JPU, ConvBNPReLU +from .activation import Activation +from .pyramid_pool import ASPPModule, PPModule +from .attention import AttentionBlock +from .nonlocal2d import NonLocal2D +from .wrap_functions import * diff --git a/paddlers/models/ppseg/models/layers/activation.py b/paddlers/models/ppseg/models/layers/activation.py new file mode 100644 index 0000000..8bb9447 --- /dev/null +++ b/paddlers/models/ppseg/models/layers/activation.py @@ -0,0 +1,73 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.nn as nn + + +class Activation(nn.Layer): + """ + The wrapper of activations. + + Args: + act (str, optional): The activation name in lowercase. It must be one of ['elu', 'gelu', + 'hardshrink', 'tanh', 'hardtanh', 'prelu', 'relu', 'relu6', 'selu', 'leakyrelu', 'sigmoid', + 'softmax', 'softplus', 'softshrink', 'softsign', 'tanhshrink', 'logsigmoid', 'logsoftmax', + 'hsigmoid']. Default: None, means identical transformation. + + Returns: + A callable object of Activation. + + Raises: + KeyError: When parameter `act` is not in the optional range. 
+
+    Examples:
+
+        from paddlers.models.ppseg.models.common.activation import Activation
+
+        relu = Activation("relu")
+        print(relu)
+        # <class 'paddle.nn.layer.activation.ReLU'>
+
+        sigmoid = Activation("sigmoid")
+        print(sigmoid)
+        # <class 'paddle.nn.layer.activation.Sigmoid'>
+
+        not_exit_one = Activation("not_exit_one")
+        # KeyError: "not_exit_one does not exist in the current dict_keys(['elu', 'gelu', 'hardshrink',
+        # 'tanh', 'hardtanh', 'prelu', 'relu', 'relu6', 'selu', 'leakyrelu', 'sigmoid', 'softmax',
+        # 'softplus', 'softshrink', 'softsign', 'tanhshrink', 'logsigmoid', 'logsoftmax', 'hsigmoid'])"
+    """
+
+    def __init__(self, act=None):
+        super(Activation, self).__init__()
+
+        self._act = act
+        upper_act_names = nn.layer.activation.__dict__.keys()
+        lower_act_names = [act.lower() for act in upper_act_names]
+        act_dict = dict(zip(lower_act_names, upper_act_names))
+
+        if act is not None:
+            if act in act_dict.keys():
+                act_name = act_dict[act]
+                self.act_func = eval(
+                    "nn.layer.activation.{}()".format(act_name))
+            else:
+                raise KeyError("{} does not exist in the current {}".format(
+                    act, act_dict.keys()))
+
+    def forward(self, x):
+        if self._act is not None:
+            return self.act_func(x)
+        else:
+            return x
diff --git a/paddlers/models/ppseg/models/layers/attention.py b/paddlers/models/ppseg/models/layers/attention.py
new file mode 100644
index 0000000..c6e4a9a
--- /dev/null
+++ b/paddlers/models/ppseg/models/layers/attention.py
@@ -0,0 +1,146 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.models import layers
+
+
+class AttentionBlock(nn.Layer):
+    """General self-attention block/non-local block.
+
+    The original article refers to https://arxiv.org/abs/1706.03762.
+    Args:
+        key_in_channels (int): Input channels of key feature.
+        query_in_channels (int): Input channels of query feature.
+        channels (int): Output channels of key/query transform.
+        out_channels (int): Output channels.
+        share_key_query (bool): Whether share projection weight between key
+            and query projection.
+        query_downsample (nn.Module): Query downsample module.
+        key_downsample (nn.Module): Key downsample module.
+        key_query_num_convs (int): Number of convs for key/query projection.
+        value_out_num_convs (int): Number of convs for value projection.
+        key_query_norm (bool): Whether to use BN for key/query projection.
+        value_out_norm (bool): Whether to use BN for value projection.
+        matmul_norm (bool): Whether to normalize the attention map by sqrt of
+            channels.
+        with_out (bool): Whether to use out projection.
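+
+    Examples:
+
+        # A minimal self-attention configuration (query == key); all sizes
+        # are illustrative.
+        attn = AttentionBlock(
+            key_in_channels=256,
+            query_in_channels=256,
+            channels=64,
+            out_channels=256,
+            share_key_query=True,
+            query_downsample=None,
+            key_downsample=None,
+            key_query_num_convs=2,
+            value_out_num_convs=1,
+            key_query_norm=True,
+            value_out_norm=False,
+            matmul_norm=True,
+            with_out=False)
+        # context = attn(query_feats, key_feats)  # shape follows query_feats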
+ """ + + def __init__(self, key_in_channels, query_in_channels, channels, + out_channels, share_key_query, query_downsample, + key_downsample, key_query_num_convs, value_out_num_convs, + key_query_norm, value_out_norm, matmul_norm, with_out): + super(AttentionBlock, self).__init__() + if share_key_query: + assert key_in_channels == query_in_channels + self.with_out = with_out + self.key_in_channels = key_in_channels + self.query_in_channels = query_in_channels + self.out_channels = out_channels + self.channels = channels + self.share_key_query = share_key_query + self.key_project = self.build_project( + key_in_channels, + channels, + num_convs=key_query_num_convs, + use_conv_module=key_query_norm) + if share_key_query: + self.query_project = self.key_project + else: + self.query_project = self.build_project( + query_in_channels, + channels, + num_convs=key_query_num_convs, + use_conv_module=key_query_norm) + + self.value_project = self.build_project( + key_in_channels, + channels if self.with_out else out_channels, + num_convs=value_out_num_convs, + use_conv_module=value_out_norm) + + if self.with_out: + self.out_project = self.build_project( + channels, + out_channels, + num_convs=value_out_num_convs, + use_conv_module=value_out_norm) + else: + self.out_project = None + + self.query_downsample = query_downsample + self.key_downsample = key_downsample + self.matmul_norm = matmul_norm + + def build_project(self, in_channels, channels, num_convs, use_conv_module): + if use_conv_module: + convs = [ + layers.ConvBNReLU( + in_channels=in_channels, + out_channels=channels, + kernel_size=1, + bias_attr=False) + ] + for _ in range(num_convs - 1): + convs.append( + layers.ConvBNReLU( + in_channels=channels, + out_channels=channels, + kernel_size=1, + bias_attr=False)) + else: + convs = [nn.Conv2D(in_channels, channels, 1)] + for _ in range(num_convs - 1): + convs.append(nn.Conv2D(channels, channels, 1)) + + if len(convs) > 1: + convs = nn.Sequential(*convs) + else: + convs = convs[0] + return convs + + def forward(self, query_feats, key_feats): + query_shape = paddle.shape(query_feats) + query = self.query_project(query_feats) + if self.query_downsample is not None: + query = self.query_downsample(query) + query = query.flatten(2).transpose([0, 2, 1]) + + key = self.key_project(key_feats) + value = self.value_project(key_feats) + + if self.key_downsample is not None: + key = self.key_downsample(key) + value = self.key_downsample(value) + + key = key.flatten(2) + value = value.flatten(2).transpose([0, 2, 1]) + sim_map = paddle.matmul(query, key) + if self.matmul_norm: + sim_map = (self.channels**-0.5) * sim_map + sim_map = F.softmax(sim_map, axis=-1) + + context = paddle.matmul(sim_map, value) + context = paddle.transpose(context, [0, 2, 1]) + + context = paddle.reshape( + context, [0, self.out_channels, query_shape[2], query_shape[3]]) + + if self.out_project is not None: + context = self.out_project(context) + return context diff --git a/paddlers/models/ppseg/models/layers/layer_libs.py b/paddlers/models/ppseg/models/layers/layer_libs.py new file mode 100644 index 0000000..35079b7 --- /dev/null +++ b/paddlers/models/ppseg/models/layers/layer_libs.py @@ -0,0 +1,302 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddlers.models.ppseg.models import layers + + +def SyncBatchNorm(*args, **kwargs): + """In cpu environment nn.SyncBatchNorm does not have kernel so use nn.BatchNorm2D instead""" + if paddle.get_device() == 'cpu' or os.environ.get('PADDLESEG_EXPORT_STAGE'): + return nn.BatchNorm2D(*args, **kwargs) + elif paddle.distributed.ParallelEnv().nranks == 1: + return nn.BatchNorm2D(*args, **kwargs) + else: + return nn.SyncBatchNorm(*args, **kwargs) + + +class ConvBNReLU(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + padding='same', + **kwargs): + super().__init__() + + self._conv = nn.Conv2D( + in_channels, out_channels, kernel_size, padding=padding, **kwargs) + + if 'data_format' in kwargs: + data_format = kwargs['data_format'] + else: + data_format = 'NCHW' + self._batch_norm = SyncBatchNorm(out_channels, data_format=data_format) + self._relu = layers.Activation("relu") + + def forward(self, x): + x = self._conv(x) + x = self._batch_norm(x) + x = self._relu(x) + return x + + +class ConvBN(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + padding='same', + **kwargs): + super().__init__() + self._conv = nn.Conv2D( + in_channels, out_channels, kernel_size, padding=padding, **kwargs) + if 'data_format' in kwargs: + data_format = kwargs['data_format'] + else: + data_format = 'NCHW' + self._batch_norm = SyncBatchNorm(out_channels, data_format=data_format) + + def forward(self, x): + x = self._conv(x) + x = self._batch_norm(x) + return x + + +class ConvReLUPool(nn.Layer): + def __init__(self, in_channels, out_channels): + super().__init__() + self.conv = nn.Conv2D( + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=1, + dilation=1) + self._relu = layers.Activation("relu") + self._max_pool = nn.MaxPool2D(kernel_size=2, stride=2) + + def forward(self, x): + x = self.conv(x) + x = self._relu(x) + x = self._max_pool(x) + return x + + +class SeparableConvBNReLU(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + padding='same', + pointwise_bias=None, + **kwargs): + super().__init__() + self.depthwise_conv = ConvBN( + in_channels, + out_channels=in_channels, + kernel_size=kernel_size, + padding=padding, + groups=in_channels, + **kwargs) + if 'data_format' in kwargs: + data_format = kwargs['data_format'] + else: + data_format = 'NCHW' + self.piontwise_conv = ConvBNReLU( + in_channels, + out_channels, + kernel_size=1, + groups=1, + data_format=data_format, + bias_attr=pointwise_bias) + + def forward(self, x): + x = self.depthwise_conv(x) + x = self.piontwise_conv(x) + return x + + +class DepthwiseConvBN(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + padding='same', + **kwargs): + super().__init__() + self.depthwise_conv = ConvBN( + in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + padding=padding, + groups=in_channels, + **kwargs) + + def forward(self, x): + x = self.depthwise_conv(x) + return x + + +class AuxLayer(nn.Layer): + """ + The auxiliary 
layer implementation for auxiliary loss. + + Args: + in_channels (int): The number of input channels. + inter_channels (int): The intermediate channels. + out_channels (int): The number of output channels, and usually it is num_classes. + dropout_prob (float, optional): The drop rate. Default: 0.1. + """ + + def __init__(self, + in_channels, + inter_channels, + out_channels, + dropout_prob=0.1, + **kwargs): + super().__init__() + + self.conv_bn_relu = ConvBNReLU( + in_channels=in_channels, + out_channels=inter_channels, + kernel_size=3, + padding=1, + **kwargs) + + self.dropout = nn.Dropout(p=dropout_prob) + + self.conv = nn.Conv2D( + in_channels=inter_channels, + out_channels=out_channels, + kernel_size=1) + + def forward(self, x): + x = self.conv_bn_relu(x) + x = self.dropout(x) + x = self.conv(x) + return x + + +class JPU(nn.Layer): + """ + Joint Pyramid Upsampling of FCN. + The original paper refers to + Wu, Huikai, et al. "Fastfcn: Rethinking dilated convolution in the backbone for semantic segmentation." arXiv preprint arXiv:1903.11816 (2019). + """ + + def __init__(self, in_channels, width=512): + super().__init__() + + self.conv5 = ConvBNReLU( + in_channels[-1], width, 3, padding=1, bias_attr=False) + self.conv4 = ConvBNReLU( + in_channels[-2], width, 3, padding=1, bias_attr=False) + self.conv3 = ConvBNReLU( + in_channels[-3], width, 3, padding=1, bias_attr=False) + + self.dilation1 = SeparableConvBNReLU( + 3 * width, + width, + 3, + padding=1, + pointwise_bias=False, + dilation=1, + bias_attr=False, + stride=1, + ) + self.dilation2 = SeparableConvBNReLU( + 3 * width, + width, + 3, + padding=2, + pointwise_bias=False, + dilation=2, + bias_attr=False, + stride=1) + self.dilation3 = SeparableConvBNReLU( + 3 * width, + width, + 3, + padding=4, + pointwise_bias=False, + dilation=4, + bias_attr=False, + stride=1) + self.dilation4 = SeparableConvBNReLU( + 3 * width, + width, + 3, + padding=8, + pointwise_bias=False, + dilation=8, + bias_attr=False, + stride=1) + + def forward(self, *inputs): + feats = [ + self.conv5(inputs[-1]), + self.conv4(inputs[-2]), + self.conv3(inputs[-3]) + ] + size = paddle.shape(feats[-1])[2:] + feats[-2] = F.interpolate( + feats[-2], size, mode='bilinear', align_corners=True) + feats[-3] = F.interpolate( + feats[-3], size, mode='bilinear', align_corners=True) + + feat = paddle.concat(feats, axis=1) + feat = paddle.concat([ + self.dilation1(feat), + self.dilation2(feat), + self.dilation3(feat), + self.dilation4(feat) + ], + axis=1) + + return inputs[0], inputs[1], inputs[2], feat + + +class ConvBNPReLU(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + padding='same', + **kwargs): + super().__init__() + + self._conv = nn.Conv2D(in_channels, + out_channels, + kernel_size, + padding=padding, + **kwargs) + + if 'data_format' in kwargs: + data_format = kwargs['data_format'] + else: + data_format = 'NCHW' + self._batch_norm = SyncBatchNorm(out_channels, data_format=data_format) + self._prelu = layers.Activation("prelu") + + def forward(self, x): + x = self._conv(x) + x = self._batch_norm(x) + x = self._prelu(x) + return x + \ No newline at end of file diff --git a/paddlers/models/ppseg/models/layers/nonlocal2d.py b/paddlers/models/ppseg/models/layers/nonlocal2d.py new file mode 100644 index 0000000..9d6386d --- /dev/null +++ b/paddlers/models/ppseg/models/layers/nonlocal2d.py @@ -0,0 +1,154 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlers.models.ppseg.models import layers + + +class NonLocal2D(nn.Layer): + """Basic Non-local module. + This model is the implementation of "Non-local Neural Networks" + (https://arxiv.org/abs/1711.07971) + + Args: + in_channels (int): Channels of the input feature map. + reduction (int): Channel reduction ratio. Default: 2. + use_scale (bool): Whether to scale pairwise_weight by `1/sqrt(inter_channels)` when the mode is `embedded_gaussian`. Default: True. + sub_sample (bool): Whether to utilize max pooling after pairwise function. Default: False. + mode (str): Options are `gaussian`, `concatenation`, `embedded_gaussian` and `dot_product`. Default: embedded_gaussian. + """ + + def __init__(self, + in_channels, + reduction=2, + use_scale=True, + sub_sample=False, + mode='embedded_gaussian'): + super(NonLocal2D, self).__init__() + self.in_channels = in_channels + self.reduction = reduction + self.use_scale = use_scale + self.sub_sample = sub_sample + self.mode = mode + if mode not in [ + 'gaussian', 'embedded_gaussian', 'dot_product', 'concatenation' + ]: + raise ValueError( + "Mode should be in 'gaussian', 'concatenation','embedded_gaussian' or 'dot_product'." 
+ ) + + self.inter_channels = max(in_channels // reduction, 1) + + self.g = nn.Conv2D( + in_channels=self.in_channels, + out_channels=self.inter_channels, + kernel_size=1) + self.conv_out = layers.ConvBNReLU( + in_channels=self.inter_channels, + out_channels=self.in_channels, + kernel_size=1, + bias_attr=False) + + if self.mode != "gaussian": + self.theta = nn.Conv2D( + in_channels=self.in_channels, + out_channels=self.inter_channels, + kernel_size=1) + self.phi = nn.Conv2D( + in_channels=self.in_channels, + out_channels=self.inter_channels, + kernel_size=1) + + if self.mode == "concatenation": + self.concat_project = layers.ConvBNReLU( + in_channels=self.inter_channels * 2, + out_channels=1, + kernel_size=1, + bias_attr=False) + + if self.sub_sample: + max_pool_layer = nn.MaxPool2D(kernel_size=(2, 2)) + self.g = nn.Sequential(self.g, max_pool_layer) + if self.mode != 'gaussian': + self.phi = nn.Sequential(self.phi, max_pool_layer) + else: + self.phi = max_pool_layer + + def gaussian(self, theta_x, phi_x): + pairwise_weight = paddle.matmul(theta_x, phi_x) + pairwise_weight = F.softmax(pairwise_weight, axis=-1) + return pairwise_weight + + def embedded_gaussian(self, theta_x, phi_x): + pairwise_weight = paddle.matmul(theta_x, phi_x) + if self.use_scale: + pairwise_weight /= theta_x.shape[-1]**0.5 + pairwise_weight = F.softmax(pairwise_weight, -1) + return pairwise_weight + + def dot_product(self, theta_x, phi_x): + pairwise_weight = paddle.matmul(theta_x, phi_x) + pairwise_weight /= pairwise_weight.shape[-1] + return pairwise_weight + + def concatenation(self, theta_x, phi_x): + h = theta_x.shape[2] + w = phi_x.shape[3] + theta_x = paddle.tile(theta_x, [1, 1, 1, w]) + phi_x = paddle.tile(phi_x, [1, 1, h, 1]) + + concat_feature = paddle.concat([theta_x, phi_x], axis=1) + pairwise_weight = self.concat_project(concat_feature) + n, _, h, w = pairwise_weight.shape + pairwise_weight = paddle.reshape(pairwise_weight, [n, h, w]) + pairwise_weight /= pairwise_weight.shape[-1] + return pairwise_weight + + def forward(self, x): + n, c, h, w = x.shape + g_x = paddle.reshape(self.g(x), [n, self.inter_channels, -1]) + g_x = paddle.transpose(g_x, [0, 2, 1]) + + if self.mode == 'gaussian': + theta_x = paddle.reshape(x, [n, self.inter_channels, -1]) + theta_x = paddle.transpose(theta_x, [0, 2, 1]) + if self.sub_sample: + phi_x = paddle.reshape( + self.phi(x), [n, self.inter_channels, -1]) + else: + phi_x = paddle.reshape(x, [n, self.in_channels, -1]) + + elif self.mode == 'concatenation': + theta_x = paddle.reshape( + self.theta(x), [n, self.inter_channels, -1, 1]) + phi_x = paddle.reshape(self.phi(x), [n, self.inter_channels, 1, -1]) + + else: + theta_x = paddle.reshape( + self.theta(x), [n, self.inter_channels, -1]) + theta_x = paddle.transpose(theta_x, [0, 2, 1]) + phi_x = paddle.reshape(self.phi(x), [n, self.inter_channels, -1]) + + pairwise_func = getattr(self, self.mode) + pairwise_weight = pairwise_func(theta_x, phi_x) + y = paddle.matmul(pairwise_weight, g_x) + y = paddle.transpose(y, [0, 2, 1]) + y = paddle.reshape(y, [n, self.inter_channels, h, w]) + + output = x + self.conv_out(y) + + return output diff --git a/paddlers/models/ppseg/models/layers/pyramid_pool.py b/paddlers/models/ppseg/models/layers/pyramid_pool.py new file mode 100644 index 0000000..3694437 --- /dev/null +++ b/paddlers/models/ppseg/models/layers/pyramid_pool.py @@ -0,0 +1,192 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn.functional as F
+from paddle import nn
+
+from paddlers.models.ppseg.models import layers
+
+
+class ASPPModule(nn.Layer):
+    """
+    Atrous Spatial Pyramid Pooling.
+
+    Args:
+        aspp_ratios (tuple): The dilation rates used in the ASPP module.
+        in_channels (int): The number of input channels.
+        out_channels (int): The number of output channels.
+        align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
+            is even, e.g. 1024x512, otherwise it is True, e.g. 769x769.
+        use_sep_conv (bool, optional): Whether to use separable convolutions in the ASPP module. Default: False.
+        image_pooling (bool, optional): Whether to augment with image-level features. Default: False.
+    """
+
+    def __init__(self,
+                 aspp_ratios,
+                 in_channels,
+                 out_channels,
+                 align_corners,
+                 use_sep_conv=False,
+                 image_pooling=False,
+                 data_format='NCHW'):
+        super().__init__()
+
+        self.align_corners = align_corners
+        self.data_format = data_format
+        self.aspp_blocks = nn.LayerList()
+
+        for ratio in aspp_ratios:
+            if use_sep_conv and ratio > 1:
+                conv_func = layers.SeparableConvBNReLU
+            else:
+                conv_func = layers.ConvBNReLU
+
+            block = conv_func(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=1 if ratio == 1 else 3,
+                dilation=ratio,
+                padding=0 if ratio == 1 else ratio,
+                data_format=data_format)
+            self.aspp_blocks.append(block)
+
+        out_size = len(self.aspp_blocks)
+
+        if image_pooling:
+            self.global_avg_pool = nn.Sequential(
+                nn.AdaptiveAvgPool2D(
+                    output_size=(1, 1), data_format=data_format),
+                layers.ConvBNReLU(
+                    in_channels,
+                    out_channels,
+                    kernel_size=1,
+                    bias_attr=False,
+                    data_format=data_format))
+            out_size += 1
+        self.image_pooling = image_pooling
+
+        self.conv_bn_relu = layers.ConvBNReLU(
+            in_channels=out_channels * out_size,
+            out_channels=out_channels,
+            kernel_size=1,
+            data_format=data_format)
+
+        self.dropout = nn.Dropout(p=0.1)  # drop rate
+
+    def forward(self, x):
+        outputs = []
+        if self.data_format == 'NCHW':
+            interpolate_shape = paddle.shape(x)[2:]
+            axis = 1
+        else:
+            interpolate_shape = paddle.shape(x)[1:3]
+            axis = -1
+        for block in self.aspp_blocks:
+            y = block(x)
+            outputs.append(y)
+
+        if self.image_pooling:
+            img_avg = self.global_avg_pool(x)
+            img_avg = F.interpolate(
+                img_avg,
+                interpolate_shape,
+                mode='bilinear',
+                align_corners=self.align_corners,
+                data_format=self.data_format)
+            outputs.append(img_avg)
+
+        x = paddle.concat(outputs, axis=axis)
+        x = self.conv_bn_relu(x)
+        x = self.dropout(x)
+
+        return x
+
+
+class PPModule(nn.Layer):
+    """
+    Pyramid pooling module originally used in PSPNet.
+
+    Args:
+        in_channels (int): The number of input channels to the pyramid pooling module.
+        out_channels (int): The number of output channels after the pyramid pooling module.
+        bin_sizes (tuple, optional): The output sizes of the pooled feature maps. Default: (1, 2, 3, 6).
+        dim_reduction (bool, optional): Whether to reduce the channel dimension after pooling. Default: True.
+        align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
+            is even, e.g. 1024x512, otherwise it is True, e.g. 769x769.
+    """
+
+    def __init__(self, in_channels, out_channels, bin_sizes, dim_reduction,
+                 align_corners):
+        super().__init__()
+
+        self.bin_sizes = bin_sizes
+
+        inter_channels = in_channels
+        if dim_reduction:
+            inter_channels = in_channels // len(bin_sizes)
+
+        # We use the dimension reduction after pooling mentioned in the original implementation.
+        self.stages = nn.LayerList([
+            self._make_stage(in_channels, inter_channels, size)
+            for size in bin_sizes
+        ])
+
+        self.conv_bn_relu2 = layers.ConvBNReLU(
+            in_channels=in_channels + inter_channels * len(bin_sizes),
+            out_channels=out_channels,
+            kernel_size=3,
+            padding=1)
+
+        self.align_corners = align_corners
+
+    def _make_stage(self, in_channels, out_channels, size):
+        """
+        Create one pooling layer.
+
+        In our implementation, we adopt the same dimension reduction as the original paper,
+        which may differ slightly from other implementations.
+
+        After pooling, the channels are reduced to 1/len(bin_sizes) immediately, while some
+        other implementations keep the number of channels unchanged.
+
+        Args:
+            in_channels (int): The number of input channels to the pyramid pooling module.
+            out_channels (int): The number of output channels of this stage.
+            size (int): The output size of the pooled layer.
+
+        Returns:
+            (nn.Sequential): An adaptive average pooling layer followed by a 1x1 ConvBNReLU.
+        """
+
+        prior = nn.AdaptiveAvgPool2D(output_size=(size, size))
+        conv = layers.ConvBNReLU(
+            in_channels=in_channels, out_channels=out_channels, kernel_size=1)
+
+        return nn.Sequential(prior, conv)
+
+    def forward(self, input):
+        cat_layers = []
+        for stage in self.stages:
+            x = stage(input)
+            x = F.interpolate(
+                x,
+                paddle.shape(input)[2:],
+                mode='bilinear',
+                align_corners=self.align_corners)
+            cat_layers.append(x)
+        cat_layers = [input] + cat_layers[::-1]
+        cat = paddle.concat(cat_layers, axis=1)
+        out = self.conv_bn_relu2(cat)
+
+        return out
diff --git a/paddlers/models/ppseg/models/layers/wrap_functions.py b/paddlers/models/ppseg/models/layers/wrap_functions.py
new file mode 100644
index 0000000..c86dd24
--- /dev/null
+++ b/paddlers/models/ppseg/models/layers/wrap_functions.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+"""
+Wrap the functional API into layers, so that normal training and quantization-aware training can use the same network.
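+For example, ``Add()(x, y)`` computes the same result as ``paddle.add(x, y)``, but since it is
+an ``nn.Layer``, the operation appears in the network structure and can be processed by
+quantization passes in the same way as any other layer.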
+""" + + +class Add(nn.Layer): + def __init__(self): + super().__init__() + + def forward(self, x, y, name=None): + return paddle.add(x, y, name) + + +class Subtract(nn.Layer): + def __init__(self): + super().__init__() + + def forward(self, x, y, name=None): + return paddle.subtract(x, y, name) + + +class Multiply(nn.Layer): + def __init__(self): + super().__init__() + + def forward(self, x, y, name=None): + return paddle.multiply(x, y, name) + + +class Divide(nn.Layer): + def __init__(self): + super().__init__() + + def forward(self, x, y, name=None): + return paddle.divide(x, y, name) + + +class Reshape(nn.Layer): + def __init__(self): + super().__init__() + + def forward(self, x, shape, name=None): + return paddle.reshape(x, shape, name) + + +class Transpose(nn.Layer): + def __init__(self): + super().__init__() + + def forward(self, x, perm, name=None): + return paddle.transpose(x, perm, name) + + +class Concat(nn.Layer): + def __init__(self): + super().__init__() + + def forward(self, x, axis=0, name=None): + return paddle.concat(x, axis, name) + + +class Flatten(nn.Layer): + def __init__(self): + super().__init__() + + def forward(self, x, start_axis=0, stop_axis=-1, name=None): + return paddle.flatten(x, start_axis, stop_axis, name) diff --git a/paddlers/models/ppseg/models/losses/__init__.py b/paddlers/models/ppseg/models/losses/__init__.py new file mode 100644 index 0000000..d59bbb0 --- /dev/null +++ b/paddlers/models/ppseg/models/losses/__init__.py @@ -0,0 +1,36 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
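+# Each loss class imported below is decorated with @manager.LOSSES.add_component,
+# which registers it into the LOSSES component manager so it can be instantiated
+# by class name through the manager.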
+
+from .mixed_loss import MixedLoss
+from .cross_entropy_loss import CrossEntropyLoss
+from .cross_entropy_loss import DistillCrossEntropyLoss
+from .binary_cross_entropy_loss import BCELoss
+from .lovasz_loss import LovaszSoftmaxLoss, LovaszHingeLoss
+from .gscnn_dual_task_loss import DualTaskLoss
+from .edge_attention_loss import EdgeAttentionLoss
+from .bootstrapped_cross_entropy import BootstrappedCrossEntropyLoss
+from .dice_loss import DiceLoss
+from .ohem_cross_entropy_loss import OhemCrossEntropyLoss
+from .decoupledsegnet_relax_boundary_loss import RelaxBoundaryLoss
+from .ohem_edge_attention_loss import OhemEdgeAttentionLoss
+from .l1_loss import L1Loss
+from .mean_square_error_loss import MSELoss
+from .focal_loss import FocalLoss
+from .kl_loss import KLLoss
+from .rmi_loss import RMILoss
+from .detail_aggregate_loss import DetailAggregateLoss
+from .point_cross_entropy_loss import PointCrossEntropyLoss
+from .pixel_contrast_cross_entropy_loss import PixelContrastCrossEntropyLoss
+from .semantic_encode_cross_entropy_loss import SECrossEntropyLoss
+from .semantic_connectivity_loss import SemanticConnectivityLoss
diff --git a/paddlers/models/ppseg/models/losses/binary_cross_entropy_loss.py b/paddlers/models/ppseg/models/losses/binary_cross_entropy_loss.py
new file mode 100644
index 0000000..5a02ba6
--- /dev/null
+++ b/paddlers/models/ppseg/models/losses/binary_cross_entropy_loss.py
@@ -0,0 +1,174 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+
+
+@manager.LOSSES.add_component
+class BCELoss(nn.Layer):
+    r"""
+    This operator combines the sigmoid layer and the :ref:`api_nn_loss_BCELoss` layer.
+    It can also be seen as the combination of a ``sigmoid_cross_entropy_with_logits``
+    layer and some reduce operations.
+    This measures the element-wise probability error in classification tasks
+    in which each class is independent.
+    It can be thought of as predicting labels for a data point that are
+    not mutually exclusive. For example, a news article can be about
+    politics, technology or sports at the same time, or none of these.
+    First, this operator calculates the loss as follows:
+    .. math::
+           Out = -Labels * \log(\sigma(Logit)) - (1 - Labels) * \log(1 - \sigma(Logit))
+    We know that :math:`\sigma(Logit) = \frac{1}{1 + e^{-Logit}}`. By substituting this we get:
+    .. math::
+           Out = Logit - Logit * Labels + \log(1 + e^{-Logit})
+    For stability, and to prevent overflow of :math:`e^{-Logit}` when Logit < 0,
+    we reformulate the loss as follows:
+    .. math::
+           Out = \max(Logit, 0) - Logit * Labels + \log(1 + e^{-|Logit|})
+    Then, if ``weight`` or ``pos_weight`` is not None, this operator multiplies the
+    loss `Out` by the weight tensor. The ``weight`` tensor applies a different
+    weight to each item in the batch. The ``pos_weight`` applies a different
+    weight to the positive labels of each class.
+    Finally, this operator applies a reduce operation to the loss.
+    If :attr:`reduction` is set to ``'none'``, the operator returns the original loss `Out`.
+    If :attr:`reduction` is set to ``'mean'``, the reduced mean loss is :math:`Out = MEAN(Out)`.
+    If :attr:`reduction` is set to ``'sum'``, the reduced sum loss is :math:`Out = SUM(Out)`.
+    Note that the target labels ``label`` should be numbers between 0 and 1.
+    Args:
+        weight (Tensor | str, optional): A manual rescaling weight given to the loss of each
+            batch element. If given, it has to be a 1D Tensor whose size is `[N, ]` and
+            whose data type is float32 or float64. If of type str, it should equal 'dynamic',
+            in which case the weight is computed dynamically at every step.
+            Default is ``None``.
+        pos_weight (float|str, optional): A weight of positive examples. If of type str,
+            it should equal 'dynamic', in which case the weight is computed dynamically at
+            every step. Default is ``None``.
+        ignore_index (int64, optional): Specifies a target value that is ignored
+            and does not contribute to the input gradient. Default ``255``.
+        edge_label (bool, optional): Whether to use edge labels. Default: False.
+    Shapes:
+        logit (Tensor): The input prediction tensor. 2-D tensor with shape: [N, *],
+            N is batch_size, `*` means number of additional dimensions. The ``logit``
+            is usually the output of a Linear layer. Available dtypes are float32, float64.
+        label (Tensor): The target labels tensor. 2-D tensor with the same shape as
+            ``logit``. The target labels, whose values should be numbers between 0 and 1.
+            Available dtypes are float32, float64.
+    Returns:
+        A callable object of BCELoss.
+    Examples:
+        .. code-block:: python
+            import paddle
+            paddle.disable_static()
+            logit = paddle.to_tensor([5.0, 1.0, 3.0], dtype="float32")
+            label = paddle.to_tensor([1.0, 0.0, 1.0], dtype="float32")
+            bce_logit_loss = paddle.nn.BCEWithLogitsLoss()
+            output = bce_logit_loss(logit, label)
+            print(output.numpy())  # [0.45618808]
+    """
+
+    def __init__(self,
+                 weight=None,
+                 pos_weight=None,
+                 ignore_index=255,
+                 edge_label=False):
+        super().__init__()
+        self.weight = weight
+        self.pos_weight = pos_weight
+        self.ignore_index = ignore_index
+        self.edge_label = edge_label
+        self.EPS = 1e-10
+
+        if self.weight is not None:
+            if isinstance(self.weight, str):
+                if self.weight != 'dynamic':
+                    raise ValueError(
+                        "if type of `weight` is str, it should equal to 'dynamic', but it is {}"
+                        .format(self.weight))
+            elif not isinstance(self.weight, paddle.Tensor):
+                raise TypeError(
+                    'The type of `weight` is wrong, it should be Tensor or str, but it is {}'
+                    .format(type(self.weight)))
+
+        if self.pos_weight is not None:
+            if isinstance(self.pos_weight, str):
+                if self.pos_weight != 'dynamic':
+                    raise ValueError(
+                        "if type of `pos_weight` is str, it should equal to 'dynamic', but it is {}"
+                        .format(self.pos_weight))
+            elif isinstance(self.pos_weight, float):
+                self.pos_weight = paddle.to_tensor(
+                    self.pos_weight, dtype='float32')
+            else:
+                raise TypeError(
+                    'The type of `pos_weight` is wrong, it should be float or str, but it is {}'
+                    .format(type(self.pos_weight)))
+
+    def forward(self, logit, label):
+        """
+        Forward computation.
+
+        Args:
+            logit (Tensor): Logit tensor, the data type is float32, float64. Shape is
+                (N, C), where C is number of classes, and if shape is more than 2D, this
+                is (N, C, D1, D2,..., Dk), k >= 1.
+            label (Tensor): Label tensor, the data type is int64.
Shape is (N, C), where each + value is 0 or 1, and if shape is more than 2D, this is + (N, C, D1, D2,..., Dk), k >= 1. + """ + if len(label.shape) != len(logit.shape): + label = paddle.unsqueeze(label, 1) + mask = (label != self.ignore_index) + mask = paddle.cast(mask, 'float32') + # label.shape should equal to the logit.shape + if label.shape[1] != logit.shape[1]: + label = label.squeeze(1) + label = F.one_hot(label, logit.shape[1]) + label = label.transpose((0, 3, 1, 2)) + if isinstance(self.weight, str): + pos_index = (label == 1) + neg_index = (label == 0) + pos_num = paddle.sum(pos_index.astype('float32')) + neg_num = paddle.sum(neg_index.astype('float32')) + sum_num = pos_num + neg_num + weight_pos = 2 * neg_num / (sum_num + self.EPS) + weight_neg = 2 * pos_num / (sum_num + self.EPS) + weight = weight_pos * label + weight_neg * (1 - label) + else: + weight = self.weight + if isinstance(self.pos_weight, str): + pos_index = (label == 1) + neg_index = (label == 0) + pos_num = paddle.sum(pos_index.astype('float32')) + neg_num = paddle.sum(neg_index.astype('float32')) + sum_num = pos_num + neg_num + pos_weight = 2 * neg_num / (sum_num + self.EPS) + else: + pos_weight = self.pos_weight + label = label.astype('float32') + loss = paddle.nn.functional.binary_cross_entropy_with_logits( + logit, + label, + weight=weight, + reduction='none', + pos_weight=pos_weight) + loss = loss * mask + loss = paddle.mean(loss) / (paddle.mean(mask) + self.EPS) + label.stop_gradient = True + mask.stop_gradient = True + + return loss diff --git a/paddlers/models/ppseg/models/losses/bootstrapped_cross_entropy.py b/paddlers/models/ppseg/models/losses/bootstrapped_cross_entropy.py new file mode 100644 index 0000000..b1807fa --- /dev/null +++ b/paddlers/models/ppseg/models/losses/bootstrapped_cross_entropy.py @@ -0,0 +1,73 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle import nn +import paddle.nn.functional as F + +from paddlers.models.ppseg.cvlibs import manager + + +@manager.LOSSES.add_component +class BootstrappedCrossEntropyLoss(nn.Layer): + """ + Implements the cross entropy loss function. + + Args: + min_K (int): the minimum number of pixels to be counted in loss computation. + loss_th (float): the loss threshold. Only loss that is larger than the threshold + would be calculated. + weight (tuple|list, optional): The weight for different classes. Default: None. + ignore_index (int, optional): Specifies a target value that is ignored + and does not contribute to the input gradient. Default: 255. 
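+
+    Examples:
+        A minimal usage sketch; the shapes and hyper-parameter values below are
+        illustrative assumptions, not values taken from this repository.
+
+        .. code-block:: python
+
+            import paddle
+            from paddlers.models.ppseg.models.losses import BootstrappedCrossEntropyLoss
+
+            logit = paddle.rand([2, 19, 64, 64])        # N, C, H, W
+            label = paddle.randint(0, 19, [2, 64, 64])  # N, H, W, int64
+            # Per sample, only the hardest pixels contribute: the top min_K
+            # losses, or all losses above loss_th when more than min_K exceed it.
+            loss_fn = BootstrappedCrossEntropyLoss(min_K=1024, loss_th=0.3)
+            loss = loss_fn(logit, label)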
+ """ + + def __init__(self, min_K, loss_th, weight=None, ignore_index=255): + super().__init__() + self.ignore_index = ignore_index + self.K = min_K + self.threshold = loss_th + if weight is not None: + weight = paddle.to_tensor(weight, dtype='float32') + self.weight = weight + + def forward(self, logit, label): + + n, c, h, w = logit.shape + total_loss = 0.0 + if len(label.shape) != len(logit.shape): + label = paddle.unsqueeze(label, 1) + + for i in range(n): + x = paddle.unsqueeze(logit[i], 0) + y = paddle.unsqueeze(label[i], 0) + x = paddle.transpose(x, (0, 2, 3, 1)) + y = paddle.transpose(y, (0, 2, 3, 1)) + x = paddle.reshape(x, shape=(-1, c)) + y = paddle.reshape(y, shape=(-1, )) + loss = F.cross_entropy( + x, + y, + weight=self.weight, + ignore_index=self.ignore_index, + reduction="none") + sorted_loss = paddle.sort(loss, descending=True) + if sorted_loss[self.K] > self.threshold: + new_indices = paddle.nonzero(sorted_loss > self.threshold) + loss = paddle.gather(sorted_loss, new_indices) + else: + loss = sorted_loss[:self.K] + + total_loss += paddle.mean(loss) + return total_loss / float(n) diff --git a/paddlers/models/ppseg/models/losses/cross_entropy_loss.py b/paddlers/models/ppseg/models/losses/cross_entropy_loss.py new file mode 100644 index 0000000..74af92a --- /dev/null +++ b/paddlers/models/ppseg/models/losses/cross_entropy_loss.py @@ -0,0 +1,218 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle import nn +import paddle.nn.functional as F + +from paddlers.models.ppseg.cvlibs import manager + + +@manager.LOSSES.add_component +class CrossEntropyLoss(nn.Layer): + """ + Implements the cross entropy loss function. + + Args: + weight (tuple|list|ndarray|Tensor, optional): A manual rescaling weight + given to each class. Its length must be equal to the number of classes. + Default ``None``. + ignore_index (int64, optional): Specifies a target value that is ignored + and does not contribute to the input gradient. Default ``255``. + top_k_percent_pixels (float, optional): the value lies in [0.0, 1.0]. + When its value < 1.0, only compute the loss for the top k percent pixels + (e.g., the top 20% pixels). This is useful for hard pixel mining. Default ``1.0``. + data_format (str, optional): The tensor format to use, 'NCHW' or 'NHWC'. Default ``'NCHW'``. + """ + + def __init__(self, + weight=None, + ignore_index=255, + top_k_percent_pixels=1.0, + data_format='NCHW'): + super(CrossEntropyLoss, self).__init__() + self.ignore_index = ignore_index + self.top_k_percent_pixels = top_k_percent_pixels + self.EPS = 1e-8 + self.data_format = data_format + if weight is not None: + self.weight = paddle.to_tensor(weight, dtype='float32') + else: + self.weight = None + + def forward(self, logit, label, semantic_weights=None): + """ + Forward computation. + + Args: + logit (Tensor): Logit tensor, the data type is float32, float64. 
Shape is + (N, C), where C is number of classes, and if shape is more than 2D, this + is (N, C, D1, D2,..., Dk), k >= 1. + label (Tensor): Label tensor, the data type is int64. Shape is (N), where each + value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is + (N, D1, D2,..., Dk), k >= 1. + semantic_weights (Tensor, optional): Weights about loss for each pixels, + shape is the same as label. Default: None. + Returns: + (Tensor): The average loss. + """ + channel_axis = 1 if self.data_format == 'NCHW' else -1 + if self.weight is not None and logit.shape[channel_axis] != len( + self.weight): + raise ValueError( + 'The number of weights = {} must be the same as the number of classes = {}.' + .format(len(self.weight), logit.shape[channel_axis])) + + if channel_axis == 1: + logit = paddle.transpose(logit, [0, 2, 3, 1]) + label = label.astype('int64') + + # In F.cross_entropy, the ignore_index is invalid, which needs to be fixed. + # When there is 255 in the label and paddle version <= 2.1.3, the cross_entropy OP will report an error, which is fixed in paddle develop version. + loss = F.cross_entropy( + logit, + label, + ignore_index=self.ignore_index, + reduction='none', + weight=self.weight) + + return self._post_process_loss(logit, label, semantic_weights, loss) + + def _post_process_loss(self, logit, label, semantic_weights, loss): + """ + Consider mask and top_k to calculate the final loss. + + Args: + logit (Tensor): Logit tensor, the data type is float32, float64. Shape is + (N, C), where C is number of classes, and if shape is more than 2D, this + is (N, C, D1, D2,..., Dk), k >= 1. + label (Tensor): Label tensor, the data type is int64. Shape is (N), where each + value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is + (N, D1, D2,..., Dk), k >= 1. + semantic_weights (Tensor, optional): Weights about loss for each pixels, + shape is the same as label. + loss (Tensor): Loss tensor which is the output of cross_entropy. If soft_label + is False in cross_entropy, the shape of loss should be the same as the label. + If soft_label is True in cross_entropy, the shape of loss should be + (N, D1, D2,..., Dk, 1). + Returns: + (Tensor): The average loss. + """ + mask = label != self.ignore_index + mask = paddle.cast(mask, 'float32') + label.stop_gradient = True + mask.stop_gradient = True + + if loss.ndim > mask.ndim: + loss = paddle.squeeze(loss, axis=-1) + loss = loss * mask + if semantic_weights is not None: + loss = loss * semantic_weights + + if self.weight is not None: + _one_hot = F.one_hot(label, logit.shape[-1]) + coef = paddle.sum(_one_hot * self.weight, axis=-1) + else: + coef = paddle.ones_like(label) + + if self.top_k_percent_pixels == 1.0: + avg_loss = paddle.mean(loss) / (paddle.mean(mask * coef) + self.EPS) + else: + loss = loss.reshape((-1, )) + top_k_pixels = int(self.top_k_percent_pixels * loss.numel()) + loss, indices = paddle.topk(loss, top_k_pixels) + coef = coef.reshape((-1, )) + coef = paddle.gather(coef, indices) + coef.stop_gradient = True + coef = coef.astype('float32') + avg_loss = loss.mean() / (paddle.mean(coef) + self.EPS) + + return avg_loss + + +@manager.LOSSES.add_component +class DistillCrossEntropyLoss(CrossEntropyLoss): + """ + The implementation of distill cross entropy loss. + + Args: + weight (tuple|list|ndarray|Tensor, optional): A manual rescaling weight + given to each class. Its length must be equal to the number of classes. + Default ``None``. 
+ ignore_index (int64, optional): Specifies a target value that is ignored + and does not contribute to the input gradient. Default ``255``. + top_k_percent_pixels (float, optional): the value lies in [0.0, 1.0]. + When its value < 1.0, only compute the loss for the top k percent pixels + (e.g., the top 20% pixels). This is useful for hard pixel mining. + Default ``1.0``. + data_format (str, optional): The tensor format to use, 'NCHW' or 'NHWC'. + Default ``'NCHW'``. + """ + + def __init__(self, + weight=None, + ignore_index=255, + top_k_percent_pixels=1.0, + data_format='NCHW'): + super().__init__(weight, ignore_index, top_k_percent_pixels, + data_format) + + def forward(self, + student_logit, + teacher_logit, + label, + semantic_weights=None): + """ + Forward computation. + + Args: + student_logit (Tensor): Logit tensor, the data type is float32, float64. Shape is + (N, C), where C is number of classes, and if shape is more than 2D, this + is (N, C, D1, D2,..., Dk), k >= 1. + teacher_logit (Tensor): Logit tensor, the data type is float32, float64. The shape + is the same as the student_logit. + label (Tensor): Label tensor, the data type is int64. Shape is (N), where each + value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is + (N, D1, D2,..., Dk), k >= 1. + semantic_weights (Tensor, optional): Weights about loss for each pixels, + shape is the same as label. Default: None. + """ + + if student_logit.shape != teacher_logit.shape: + raise ValueError( + 'The shape of student_logit = {} must be the same as the shape of teacher_logit = {}.' + .format(student_logit.shape, teacher_logit.shape)) + + channel_axis = 1 if self.data_format == 'NCHW' else -1 + if self.weight is not None and student_logit.shape[channel_axis] != len( + self.weight): + raise ValueError( + 'The number of weights = {} must be the same as the number of classes = {}.' + .format(len(self.weight), student_logit.shape[channel_axis])) + + if channel_axis == 1: + student_logit = paddle.transpose(student_logit, [0, 2, 3, 1]) + teacher_logit = paddle.transpose(teacher_logit, [0, 2, 3, 1]) + + teacher_logit = F.softmax(teacher_logit) + + loss = F.cross_entropy( + student_logit, + teacher_logit, + weight=self.weight, + reduction='none', + soft_label=True) + + return self._post_process_loss(student_logit, label, semantic_weights, + loss) diff --git a/paddlers/models/ppseg/models/losses/decoupledsegnet_relax_boundary_loss.py b/paddlers/models/ppseg/models/losses/decoupledsegnet_relax_boundary_loss.py new file mode 100644 index 0000000..af78cf2 --- /dev/null +++ b/paddlers/models/ppseg/models/losses/decoupledsegnet_relax_boundary_loss.py @@ -0,0 +1,129 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
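+
+# Note (derived from the code below): near class boundaries, relax_onehot() turns
+# the one-hot target into a multi-hot target over all classes found within `border`
+# pixels, so predicting any of the adjacent classes there is not penalized.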
+
+import numpy as np
+import paddle
+from paddle import nn
+import paddle.nn.functional as F
+from scipy.ndimage.interpolation import shift
+
+from paddlers.models.ppseg.cvlibs import manager
+
+
+@manager.LOSSES.add_component
+class RelaxBoundaryLoss(nn.Layer):
+    """
+    Implements the relax boundary loss function of DecoupledSegNet.
+
+    Args:
+        border (int, optional): The width, in pixels, of the boundary region to relax. Default: 1.
+        calculate_weights (bool, optional): Whether to calculate per-class weights. Default: False.
+        upper_bound (float, optional): The upper bound of the weights when calculating per-class weights. Default: 1.0.
+        ignore_index (int64): Specifies a target value that is ignored
+            and does not contribute to the input gradient. Default: 255.
+    """
+
+    def __init__(self,
+                 border=1,
+                 calculate_weights=False,
+                 upper_bound=1.0,
+                 ignore_index=255):
+        super(RelaxBoundaryLoss, self).__init__()
+        self.border = border
+        self.calculate_weights = calculate_weights
+        self.upper_bound = upper_bound
+        self.ignore_index = ignore_index
+        self.EPS = 1e-5
+
+    def relax_onehot(self, label, num_classes):
+        # Pad the label, and map ignore_index to the extra class `num_classes`.
+        if len(label.shape) == 3:
+            label = label.unsqueeze(1)
+        h, w = label.shape[-2], label.shape[-1]
+        label = F.pad(label, [self.border] * 4, value=num_classes)
+        label = label.squeeze(1)
+        ignore_mask = (label == self.ignore_index).astype('int64')
+        label = label * (1 - ignore_mask) + num_classes * ignore_mask
+
+        # Accumulate the one-hot labels found within a (2 * border + 1) window,
+        # so every class that appears near a pixel is marked in its target.
+        onehot = 0
+        for i in range(-self.border, self.border + 1):
+            for j in range(-self.border, self.border + 1):
+                h_start, h_end = self.border + i, h + self.border + i
+                w_start, w_end = self.border + j, w + self.border + j
+                label_ = label[:, h_start:h_end, w_start:w_end]
+                onehot_ = F.one_hot(label_, num_classes + 1)
+                onehot += onehot_
+        onehot = (onehot > 0).astype('int64')
+        onehot = paddle.transpose(onehot, (0, 3, 1, 2))
+
+        return onehot
+
+    def _calculate_weights(self, label):
+        # Prefixed with an underscore so the method is not shadowed by the
+        # boolean `self.calculate_weights` attribute set in __init__.
+        hist = paddle.sum(label, axis=(1, 2)) * 1.0 / label.sum()
+        hist = ((hist != 0) * self.upper_bound * (1 - hist)) + 1
+        return hist
+
+    def custom_nll(self,
+                   logit,
+                   label,
+                   class_weights=None,
+                   border_weights=None,
+                   ignore_mask=None):
+        soft = F.softmax(logit, axis=1)
+        # Calculate the valid soft where label is 1.
+        soft_label = ((soft * label[:, :-1, :, :]).sum(
+            1, keepdim=True)) * (label[:, :-1, :, :].astype('float32'))
+        soft = soft * (1 - label[:, :-1, :, :]) + soft_label
+        logsoft = paddle.log(soft)
+        if class_weights is not None:
+            # Weight the log-probabilities per class, dropping the weight of
+            # the appended ignore class.
+            logsoft = class_weights[:-1].unsqueeze((0, 2, 3)) * logsoft
+        logsoft = label[:, :-1, :, :] * logsoft
+        logsoft = logsoft.sum(1)
+        # The border loss is divided equally among the relaxed classes.
+        logsoft = -1 / border_weights * logsoft * (1. - ignore_mask)
+        n, _, h, w = label.shape
+        logsoft = logsoft.sum() / (n * h * w - ignore_mask.sum() + 1)
+        return logsoft
+
+    def forward(self, logit, label):
+        """
+        Forward computation.
+
+        Args:
+            logit (Tensor): Logit tensor, the data type is float32, float64. Shape is
+                (N, C), where C is number of classes, and if shape is more than 2D, this
+                is (N, C, D1, D2,..., Dk), k >= 1.
+            label (Tensor): Label tensor, the data type is int64. Shape is (N), where each
+                value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is
+                (N, D1, D2,..., Dk), k >= 1.
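+
+        Returns:
+            (Tensor): The relaxed boundary loss, summed over the samples in the batch.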
+        """
+        n, c, h, w = logit.shape
+        label.stop_gradient = True
+        label = self.relax_onehot(label, c)
+        weights = label[:, :-1, :, :].sum(1).astype('float32')
+        ignore_mask = (weights == 0).astype('float32')
+        # Border pixels get a weight greater than 1; all other valid pixels get 1.
+        border_weights = weights + ignore_mask
+
+        loss = 0
+        class_weights = None
+        for i in range(n):
+            if self.calculate_weights:
+                class_weights = self._calculate_weights(label[i])
+            loss = loss + self.custom_nll(
+                logit[i].unsqueeze(0),
+                label[i].unsqueeze(0),
+                class_weights=class_weights,
+                border_weights=border_weights,
+                ignore_mask=ignore_mask[i])
+        return loss
diff --git a/paddlers/models/ppseg/models/losses/detail_aggregate_loss.py b/paddlers/models/ppseg/models/losses/detail_aggregate_loss.py
new file mode 100644
index 0000000..61345fe
--- /dev/null
+++ b/paddlers/models/ppseg/models/losses/detail_aggregate_loss.py
@@ -0,0 +1,116 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+
+
+@manager.LOSSES.add_component
+class DetailAggregateLoss(nn.Layer):
+    """
+    DetailAggregateLoss's implementation based on PaddlePaddle.
+
+    The original article refers to
+    Fan, Mingyuan, et al. "Rethinking BiSeNet For Real-time Semantic Segmentation."
+    (https://arxiv.org/abs/2104.13188)
+
+    Args:
+        ignore_index (int64, optional): Specifies a target value that is ignored
+            and does not contribute to the input gradient. Default ``255``.
+
+    """
+
+    def __init__(self, ignore_index=255):
+        super(DetailAggregateLoss, self).__init__()
+        self.ignore_index = ignore_index
+        self.laplacian_kernel = paddle.to_tensor(
+            [-1, -1, -1, -1, 8, -1, -1, -1, -1],
+            dtype='float32').reshape((1, 1, 3, 3))
+        self.fuse_kernel = paddle.create_parameter(
+            [1, 3, 1, 1], dtype='float32')
+
+    def forward(self, logits, label):
+        """
+        Args:
+            logits (Tensor): Logit tensor, the data type is float32, float64. Shape is
+                (N, C), where C is number of classes, and if shape is more than 2D, this
+                is (N, C, D1, D2,..., Dk), k >= 1.
+            label (Tensor): Label tensor, the data type is int64. Shape is (N), where each
+                value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is
+                (N, D1, D2,..., Dk), k >= 1.
+        Returns:
+            (Tensor): The detail aggregate loss.
+        """
+        boundary_targets = F.conv2d(
+            paddle.unsqueeze(label, axis=1).astype('float32'),
+            self.laplacian_kernel,
+            padding=1)
+        boundary_targets = paddle.clip(boundary_targets, min=0)
+        boundary_targets = boundary_targets > 0.1
+        boundary_targets = boundary_targets.astype('float32')
+
+        boundary_targets_x2 = F.conv2d(
+            paddle.unsqueeze(label, axis=1).astype('float32'),
+            self.laplacian_kernel,
+            stride=2,
+            padding=1)
+        boundary_targets_x2 = paddle.clip(boundary_targets_x2, min=0)
+        boundary_targets_x4 = F.conv2d(
+            paddle.unsqueeze(label, axis=1).astype('float32'),
+            self.laplacian_kernel,
+            stride=4,
+            padding=1)
+        boundary_targets_x4 = paddle.clip(boundary_targets_x4, min=0)
+
+        boundary_targets_x8 = F.conv2d(
+            paddle.unsqueeze(label, axis=1).astype('float32'),
+            self.laplacian_kernel,
+            stride=8,
+            padding=1)
+        boundary_targets_x8 = paddle.clip(boundary_targets_x8, min=0)
+
+        boundary_targets_x8_up = F.interpolate(
+            boundary_targets_x8, boundary_targets.shape[2:], mode='nearest')
+        boundary_targets_x4_up = F.interpolate(
+            boundary_targets_x4, boundary_targets.shape[2:], mode='nearest')
+        boundary_targets_x2_up = F.interpolate(
+            boundary_targets_x2, boundary_targets.shape[2:], mode='nearest')
+
+        boundary_targets_x2_up = boundary_targets_x2_up > 0.1
+        boundary_targets_x2_up = boundary_targets_x2_up.astype('float32')
+
+        boundary_targets_x4_up = boundary_targets_x4_up > 0.1
+        boundary_targets_x4_up = boundary_targets_x4_up.astype('float32')
+
+        boundary_targets_x8_up = boundary_targets_x8_up > 0.1
+        boundary_targets_x8_up = boundary_targets_x8_up.astype('float32')
+
+        boundary_targets_pyramids = paddle.stack(
+            (boundary_targets, boundary_targets_x2_up, boundary_targets_x4_up),
+            axis=1)
+
+        boundary_targets_pyramids = paddle.squeeze(
+            boundary_targets_pyramids, axis=2)
+        boundary_targets_pyramid = F.conv2d(boundary_targets_pyramids,
+                                            self.fuse_kernel)
+
+        boundary_targets_pyramid = boundary_targets_pyramid > 0.1
+        boundary_targets_pyramid = boundary_targets_pyramid.astype('float32')
+
+        if logits.shape[-1] != boundary_targets.shape[-1]:
+            logits = F.interpolate(
+                logits,
+                boundary_targets.shape[2:],
+                mode='bilinear',
+                align_corners=True)
+
+        bce_loss = F.binary_cross_entropy_with_logits(logits,
+                                                      boundary_targets_pyramid)
+        dice_loss = self.fixed_dice_loss_func(
+            F.sigmoid(logits), boundary_targets_pyramid)
+        detail_loss = bce_loss + dice_loss
+
+        label.stop_gradient = True
+        return detail_loss
+
+    def fixed_dice_loss_func(self, input, target):
+        """
+        A simplified dice loss for DetailAggregateLoss.
+        """
+        smooth = 1.
+        n = input.shape[0]
+        iflat = paddle.reshape(input, [n, -1])
+        tflat = paddle.reshape(target, [n, -1])
+        intersection = paddle.sum((iflat * tflat), axis=1)
+        loss = 1 - ((2. * intersection + smooth) /
+                    (paddle.sum(iflat, axis=1) + paddle.sum(tflat, axis=1) + smooth))
+        return paddle.mean(loss)
\ No newline at end of file
diff --git a/paddlers/models/ppseg/models/losses/dice_loss.py b/paddlers/models/ppseg/models/losses/dice_loss.py
new file mode 100644
index 0000000..cca8058
--- /dev/null
+++ b/paddlers/models/ppseg/models/losses/dice_loss.py
@@ -0,0 +1,56 @@
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddle import nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+
+
+@manager.LOSSES.add_component
+class DiceLoss(nn.Layer):
+    """
+    Implements the dice loss function.
+
+    Args:
+        ignore_index (int64): Specifies a target value that is ignored
+            and does not contribute to the input gradient. Default ``255``.
+        smooth (float32): Laplace smoothing term used to smooth the dice loss
+            and accelerate convergence. Following:
+            https://github.com/pytorch/pytorch/issues/1249#issuecomment-337999895
+    """
+
+    def __init__(self, ignore_index=255, smooth=0.):
+        super(DiceLoss, self).__init__()
+        self.ignore_index = ignore_index
+        self.eps = 1e-5
+        self.smooth = smooth
+
+    def forward(self, logits, labels):
+        labels = paddle.cast(labels, dtype='int32')
+        labels_one_hot = F.one_hot(labels, num_classes=logits.shape[1])
+        labels_one_hot = paddle.transpose(labels_one_hot, [0, 3, 1, 2])
+        labels_one_hot = paddle.cast(labels_one_hot, dtype='float32')
+
+        logits = F.softmax(logits, axis=1)
+
+        mask = (paddle.unsqueeze(labels, 1) != self.ignore_index)
+        logits = logits * mask
+        labels_one_hot = labels_one_hot * mask
+
+        dims = (0, ) + tuple(range(2, labels.ndimension() + 1))
+
+        intersection = paddle.sum(logits * labels_one_hot, dims)
+        cardinality = paddle.sum(logits + labels_one_hot, dims)
+        dice_loss = ((2. * intersection + self.smooth) /
+                     (cardinality + self.eps + self.smooth)).mean()
+        return 1 - dice_loss
diff --git a/paddlers/models/ppseg/models/losses/edge_attention_loss.py b/paddlers/models/ppseg/models/losses/edge_attention_loss.py
new file mode 100644
index 0000000..44012bc
--- /dev/null
+++ b/paddlers/models/ppseg/models/losses/edge_attention_loss.py
@@ -0,0 +1,78 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddle import nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.models import losses
+
+
+@manager.LOSSES.add_component
+class EdgeAttentionLoss(nn.Layer):
+    """
+    Implements the cross entropy loss function, computed only over the edge region.
+
+    Args:
+        edge_threshold (float): Pixels whose edge logit is greater than edge_threshold are treated as edges.
+        ignore_index (int64): Specifies a target value that is ignored
+            and does not contribute to the input gradient. Default ``255``.
+    """
+
+    def __init__(self, edge_threshold=0.8, ignore_index=255):
+        super().__init__()
+        self.edge_threshold = edge_threshold
+        self.ignore_index = ignore_index
+        self.EPS = 1e-10
+        self.mean_mask = 1
+
+    def forward(self, logits, label):
+        """
+        Forward computation.
+
+        Args:
+            logits (tuple|list): (seg_logit, edge_logit) Tensors, the data type is float32, float64. Shape is
+                (N, C), where C is number of classes, and if shape is more than 2D, this
+                is (N, C, D1, D2,..., Dk), k >= 1. For edge_logit, C = 1.
+ label (Tensor): Label tensor, the data type is int64. Shape is (N, C), where each + value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is + (N, C, D1, D2,..., Dk), k >= 1. + """ + seg_logit, edge_logit = logits[0], logits[1] + if len(label.shape) != len(seg_logit.shape): + label = paddle.unsqueeze(label, 1) + if edge_logit.shape != label.shape: + raise ValueError( + 'The shape of edge_logit should equal to the label, but they are {} != {}' + .format(edge_logit.shape, label.shape)) + + filler = paddle.ones_like(label) * self.ignore_index + label = paddle.where(edge_logit > self.edge_threshold, label, filler) + + seg_logit = paddle.transpose(seg_logit, [0, 2, 3, 1]) + label = paddle.transpose(label, [0, 2, 3, 1]) + loss = F.softmax_with_cross_entropy( + seg_logit, label, ignore_index=self.ignore_index, axis=-1) + + mask = label != self.ignore_index + mask = paddle.cast(mask, 'float32') + loss = loss * mask + avg_loss = paddle.mean(loss) / (paddle.mean(mask) + self.EPS) + if paddle.mean(mask) < self.mean_mask: + self.mean_mask = paddle.mean(mask) + + label.stop_gradient = True + mask.stop_gradient = True + return avg_loss diff --git a/paddlers/models/ppseg/models/losses/focal_loss.py b/paddlers/models/ppseg/models/losses/focal_loss.py new file mode 100755 index 0000000..c578345 --- /dev/null +++ b/paddlers/models/ppseg/models/losses/focal_loss.py @@ -0,0 +1,60 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlers.models.ppseg.cvlibs import manager + + +@manager.LOSSES.add_component +class FocalLoss(nn.Layer): + """ + Focal Loss. + + Code referenced from: + https://github.com/clcarwin/focal_loss_pytorch/blob/master/focalloss.py + + Args: + gamma (float): the coefficient of Focal Loss. + ignore_index (int64): Specifies a target value that is ignored + and does not contribute to the input gradient. Default ``255``. 
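+
+    Examples:
+        A minimal usage sketch; the shapes below are illustrative assumptions,
+        not values taken from this repository.
+
+        .. code-block:: python
+
+            import paddle
+            from paddlers.models.ppseg.models.losses import FocalLoss
+
+            logit = paddle.rand([2, 19, 64, 64])        # N, C, H, W
+            label = paddle.randint(0, 19, [2, 64, 64])  # N, H, W, int64
+            # gamma > 0 down-weights well-classified pixels so training
+            # focuses on the hard ones.
+            loss = FocalLoss(gamma=2.0)(logit, label)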
+ """ + + def __init__(self, gamma=2.0, ignore_index=255, edge_label=False): + super(FocalLoss, self).__init__() + self.gamma = gamma + self.ignore_index = ignore_index + self.edge_label = edge_label + + def forward(self, logit, label): + logit = paddle.reshape( + logit, [logit.shape[0], logit.shape[1], -1]) # N,C,H,W => N,C,H*W + logit = paddle.transpose(logit, [0, 2, 1]) # N,C,H*W => N,H*W,C + logit = paddle.reshape(logit, + [-1, logit.shape[2]]) # N,H*W,C => N*H*W,C + label = paddle.reshape(label, [-1, 1]) + range_ = paddle.arange(0, label.shape[0]) + range_ = paddle.unsqueeze(range_, axis=-1) + label = paddle.cast(label, dtype='int64') + label = paddle.concat([range_, label], axis=-1) + logpt = F.log_softmax(logit) + logpt = paddle.gather_nd(logpt, label) + + pt = paddle.exp(logpt.detach()) + loss = -1 * (1 - pt)**self.gamma * logpt + loss = paddle.mean(loss) + return loss diff --git a/paddlers/models/ppseg/models/losses/gscnn_dual_task_loss.py b/paddlers/models/ppseg/models/losses/gscnn_dual_task_loss.py new file mode 100644 index 0000000..777e57a --- /dev/null +++ b/paddlers/models/ppseg/models/losses/gscnn_dual_task_loss.py @@ -0,0 +1,141 @@ +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlers.models.ppseg.cvlibs import manager + + +@manager.LOSSES.add_component +class DualTaskLoss(nn.Layer): + """ + The dual task loss implement of GSCNN + + Args: + ignore_index (int64): Specifies a target value that is ignored + and does not contribute to the input gradient. Default ``255``. + tau (float): the tau of gumbel softmax sample. + """ + + def __init__(self, ignore_index=255, tau=0.5): + super().__init__() + self.ignore_index = ignore_index + self.tau = tau + + def _gumbel_softmax_sample(self, logit, tau=1, eps=1e-10): + """ + Draw a sample from the Gumbel-Softmax distribution + + based on + https://github.com/ericjang/gumbel-softmax/blob/3c8584924603869e90ca74ac20a6a03d99a91ef9/Categorical%20VAE.ipynb + (MIT license) + """ + gumbel_noise = paddle.rand(logit.shape) + gumbel_noise = -paddle.log(eps - paddle.log(gumbel_noise + eps)) + logit = logit + gumbel_noise + return F.softmax(logit / tau, axis=1) + + def compute_grad_mag(self, x): + eps = 1e-6 + n, c, h, w = x.shape + if h <= 1 or w <= 1: + raise ValueError( + 'The width and height of tensor to compute grad must be greater than 1, but the shape is {}.' 
+ .format(x.shape)) + + x = self.conv_tri(x, r=4) + kernel = [[-1, 0, 1]] + kernel = paddle.to_tensor(kernel).astype('float32') + kernel = 0.5 * kernel + + kernel_x = paddle.concat([kernel.unsqueeze((0, 1))] * c, axis=0) + grad_x = F.conv2d(x, kernel_x, padding='same', groups=c) + kernel_y = paddle.concat([kernel.t().unsqueeze((0, 1))] * c, axis=0) + grad_y = F.conv2d(x, kernel_y, padding='same', groups=c) + mag = paddle.sqrt(grad_x * grad_x + grad_y * grad_y + eps) + + return mag / mag.max() + + def conv_tri(self, input, r): + """ + Convolves an image by a 2D triangle filter (the 1D triangle filter f is + [1:r r+1 r:-1:1]/(r+1)^2, the 2D version is simply conv2(f,f')) + """ + if r <= 1: + raise ValueError( + '`r` should be greater than 1, but it is {}.'.format(r)) + + kernel = [ + list(range(1, r + 1)) + [r + 1] + list(reversed(range(1, r + 1))) + ] + kernel = paddle.to_tensor(kernel).astype('float32') + kernel = kernel / (r + 1)**2 + input_ = F.pad(input, [1, 1, 0, 0], mode='replicate') + input_ = F.pad(input_, [r, r, 0, 0], mode='reflect') + input_ = [input_[:, :, :, :r], input, input_[:, :, :, -r:]] + input_ = paddle.concat(input_, axis=3) + tem = input_.clone() + + input_ = F.pad(input_, [0, 0, 1, 1], mode='replicate') + input_ = F.pad(input_, [0, 0, r, r], mode='reflect') + input_ = [input_[:, :, :r, :], tem, input_[:, :, -r:, :]] + input_ = paddle.concat(input_, axis=2) + + c = input.shape[1] + kernel_x = paddle.concat([kernel.unsqueeze((0, 1))] * c, axis=0) + output = F.conv2d(input_, kernel_x, padding=0, groups=c) + kernel_y = paddle.concat([kernel.t().unsqueeze((0, 1))] * c, axis=0) + output = F.conv2d(output, kernel_y, padding=0, groups=c) + return output + + def forward(self, logit, labels): + # import pdb; pdb.set_trace() + n, c, h, w = logit.shape + th = 1e-8 + eps = 1e-10 + if len(labels.shape) == 3: + labels = labels.unsqueeze(1) + mask = (labels != self.ignore_index) + mask.stop_gradient = True + logit = logit * mask + + labels = labels * mask + if len(labels.shape) == 4: + labels = labels.squeeze(1) + labels.stop_gradient = True + labels = F.one_hot(labels, logit.shape[1]).transpose((0, 3, 1, 2)) + labels.stop_gradient = True + + g = self._gumbel_softmax_sample(logit, tau=self.tau) + g = self.compute_grad_mag(g) + g_hat = self.compute_grad_mag(labels) + loss = F.l1_loss(g, g_hat, reduction='none') + loss = loss * mask + + g_mask = (g > th).astype('float32') + g_mask.stop_gradient = True + g_mask_sum = paddle.sum(g_mask) + loss_g = paddle.sum(loss * g_mask) + if g_mask_sum > eps: + loss_g = loss_g / g_mask_sum + + g_hat_mask = (g_hat > th).astype('float32') + g_hat_mask.stop_gradient = True + g_hat_mask_sum = paddle.sum(g_hat_mask) + loss_g_hat = paddle.sum(loss * g_hat_mask) + if g_hat_mask_sum > eps: + loss_g_hat = loss_g_hat / g_hat_mask_sum + + total_loss = 0.5 * loss_g + 0.5 * loss_g_hat + + return total_loss diff --git a/paddlers/models/ppseg/models/losses/kl_loss.py b/paddlers/models/ppseg/models/losses/kl_loss.py new file mode 100755 index 0000000..23a4a6e --- /dev/null +++ b/paddlers/models/ppseg/models/losses/kl_loss.py @@ -0,0 +1,80 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+
+
+@manager.LOSSES.add_component
+class KLLoss(nn.Layer):
+    """
+    The implementation of the Kullback-Leibler divergence loss.
+    Refer to https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence.
+
+    Args:
+        ignore_index (int64): Specifies a target value that is ignored
+            and does not contribute to the input gradient. Default ``255``.
+        temperature (float): The temperature used to soften the logits;
+            the loss is rescaled by temperature**2. Default: 1.
+    """
+
+    def __init__(self, ignore_index=255, temperature=1):
+        super().__init__()
+        self.ignore_index = ignore_index
+        self.temperature = temperature
+
+        self.kl_loss = nn.KLDivLoss(reduction="none")
+        self.EPS = 1e-8
+
+    def forward(self, logit_1, logit_2, label=None):
+        """
+        Calculate the KL loss. If the label is not None, it considers the
+        ignore_index in label and calculates the masked loss.
+
+        Args:
+            logit_1 (Tensor): Logit tensor, the data type is float32 or float64.
+                The shape is (N, C), where C is number of classes, and if shape is
+                more than 2D, this is (N, C, D1, D2,..., Dk), k >= 1.
+            logit_2 (Tensor): Logit tensor, the data type is float32 or float64.
+                The shape of logit_2 and logit_1 are the same.
+            label (Tensor, optional): Label tensor, the data type is int64.
+                The shape is (N), where each value is 0 <= label[i] <= C-1, and
+                if shape is more than 2D, this is (N, D1, D2,..., Dk), k >= 1.
+        Returns:
+            (Tensor): The average loss.
+        """
+        if logit_1.shape != logit_2.shape:
+            raise ValueError(
+                'The shape of logit_1 = {} must be the same as the shape of logit_2 = {}.'
+                .format(logit_1.shape, logit_2.shape))
+
+        logit_1 = F.log_softmax(logit_1 / self.temperature, axis=1)
+        logit_2 = F.softmax(logit_2 / self.temperature, axis=1)
+        loss = self.kl_loss(logit_1, logit_2)
+        loss = loss * self.temperature * self.temperature
+
+        if label is None:
+            avg_loss = paddle.mean(loss)
+        else:
+            mask = label != self.ignore_index
+            mask = paddle.cast(mask, 'float32')
+            mask = paddle.unsqueeze(mask, axis=1)
+            label.stop_gradient = True
+            mask.stop_gradient = True
+
+            loss = loss * mask
+            avg_loss = paddle.mean(loss) / (paddle.mean(mask) + self.EPS)
+        return avg_loss
diff --git a/paddlers/models/ppseg/models/losses/l1_loss.py b/paddlers/models/ppseg/models/losses/l1_loss.py
new file mode 100644
index 0000000..03f84cf
--- /dev/null
+++ b/paddlers/models/ppseg/models/losses/l1_loss.py
@@ -0,0 +1,76 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddle import nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+
+
+@manager.LOSSES.add_component
+class L1Loss(nn.L1Loss):
+    r"""
+    This interface is used to construct a callable object of the ``L1Loss`` class.
+    The L1Loss layer calculates the L1 Loss of ``input`` and ``label`` as follows.
+    If `reduction` set to ``'none'``, the loss is:
+    .. math::
+        Out = \lvert input - label \rvert
+    If `reduction` set to ``'mean'``, the loss is:
+    .. math::
+        Out = MEAN(\lvert input - label \rvert)
+    If `reduction` set to ``'sum'``, the loss is:
+    .. math::
+        Out = SUM(\lvert input - label \rvert)
+
+    Args:
+        reduction (str, optional): Indicate the reduction to apply to the loss,
+            the candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
+            If `reduction` is ``'none'``, the unreduced loss is returned;
+            If `reduction` is ``'mean'``, the reduced mean loss is returned.
+            If `reduction` is ``'sum'``, the reduced sum loss is returned.
+            Default is ``'mean'``.
+        ignore_index (int, optional): Specifies a target value that is ignored and does not contribute to the input gradient. Default: 255.
+    Shape:
+        input (Tensor): The input tensor. The shape is [N, *], where N is batch size and `*` means any number of additional dimensions. Its data type should be float32, float64, int32, int64.
+        label (Tensor): label. The shape is [N, *], same shape as ``input``. Its data type should be float32, float64, int32, int64.
+        output (Tensor): The L1 Loss of ``input`` and ``label``.
+            If `reduction` is ``'none'``, the shape of output loss is [N, *], the same as ``input``.
+            If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1].
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+            input_data = np.array([[1.5, 0.8], [0.2, 1.3]]).astype("float32")
+            label_data = np.array([[1.7, 1], [0.4, 0.5]]).astype("float32")
+            input = paddle.to_tensor(input_data)
+            label = paddle.to_tensor(label_data)
+            l1_loss = paddle.nn.L1Loss()
+            output = l1_loss(input, label)
+            print(output.numpy())
+            # [0.35]
+            l1_loss = paddle.nn.L1Loss(reduction='sum')
+            output = l1_loss(input, label)
+            print(output.numpy())
+            # [1.4]
+            l1_loss = paddle.nn.L1Loss(reduction='none')
+            output = l1_loss(input, label)
+            print(output)
+            # [[0.20000005 0.19999999]
+            # [0.2        0.79999995]]
+    """
+
+    def __init__(self, reduction='mean', ignore_index=255):
+        super().__init__(reduction=reduction)
diff --git a/paddlers/models/ppseg/models/losses/lovasz_loss.py b/paddlers/models/ppseg/models/losses/lovasz_loss.py
new file mode 100644
index 0000000..5f04d1c
--- /dev/null
+++ b/paddlers/models/ppseg/models/losses/lovasz_loss.py
@@ -0,0 +1,222 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
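+
+# A minimal usage sketch for the losses defined below (illustrative only, not
+# part of the original implementation; the shapes are assumed):
+#
+#     import paddle
+#     logits = paddle.randn([2, 19, 64, 64])        # N, C, H, W
+#     labels = paddle.randint(0, 19, [2, 64, 64])   # N, H, W
+#     loss = LovaszSoftmaxLoss()(logits, labels)
+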
+"""Lovasz-Softmax and Jaccard hinge loss in PaddlePaddle""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import nn +import paddle.nn.functional as F + +from paddlers.models.ppseg.cvlibs import manager + + +@manager.LOSSES.add_component +class LovaszSoftmaxLoss(nn.Layer): + """ + Multi-class Lovasz-Softmax loss. + + Args: + ignore_index (int64): Specifies a target value that is ignored and does not contribute to the input gradient. Default ``255``. + classes (str|list): 'all' for all, 'present' for classes present in labels, or a list of classes to average. + """ + + def __init__(self, ignore_index=255, classes='present'): + super(LovaszSoftmaxLoss, self).__init__() + self.ignore_index = ignore_index + self.classes = classes + + def forward(self, logits, labels): + r""" + Forward computation. + + Args: + logits (Tensor): Shape is [N, C, H, W], logits at each prediction (between -\infty and +\infty). + labels (Tensor): Shape is [N, 1, H, W] or [N, H, W], ground truth labels (between 0 and C - 1). + """ + probas = F.softmax(logits, axis=1) + vprobas, vlabels = flatten_probas(probas, labels, self.ignore_index) + loss = lovasz_softmax_flat(vprobas, vlabels, classes=self.classes) + return loss + + +@manager.LOSSES.add_component +class LovaszHingeLoss(nn.Layer): + """ + Binary Lovasz hinge loss. + + Args: + ignore_index (int64): Specifies a target value that is ignored and does not contribute to the input gradient. Default ``255``. + """ + + def __init__(self, ignore_index=255): + super(LovaszHingeLoss, self).__init__() + self.ignore_index = ignore_index + + def forward(self, logits, labels): + r""" + Forward computation. + + Args: + logits (Tensor): Shape is [N, 1, H, W] or [N, 2, H, W], logits at each pixel (between -\infty and +\infty). + labels (Tensor): Shape is [N, 1, H, W] or [N, H, W], binary ground truth masks (0 or 1). + """ + if logits.shape[1] == 2: + logits = binary_channel_to_unary(logits) + loss = lovasz_hinge_flat( + *flatten_binary_scores(logits, labels, self.ignore_index)) + return loss + + +def lovasz_grad(gt_sorted): + """ + Computes gradient of the Lovasz extension w.r.t sorted errors. + See Alg. 1 in paper. + """ + gts = paddle.sum(gt_sorted) + p = len(gt_sorted) + + intersection = gts - paddle.cumsum(gt_sorted, axis=0) + union = gts + paddle.cumsum(1 - gt_sorted, axis=0) + jaccard = 1.0 - intersection.cast('float32') / union.cast('float32') + + if p > 1: # cover 1-pixel case + jaccard[1:p] = jaccard[1:p] - jaccard[0:-1] + return jaccard + + +def binary_channel_to_unary(logits, eps=1e-9): + """ + Converts binary channel logits to unary channel logits for lovasz hinge loss. + """ + probas = F.softmax(logits, axis=1) + probas = probas[:, 1, :, :] + logits = paddle.log(probas + eps / (1 - probas + eps)) + logits = logits.unsqueeze(1) + return logits + + +def lovasz_hinge_flat(logits, labels): + r""" + Binary Lovasz hinge loss. + + Args: + logits (Tensor): Shape is [P], logits at each prediction (between -\infty and +\infty). + labels (Tensor): Shape is [P], binary ground truth labels (0 or 1). + """ + if len(labels) == 0: + # only void pixels, the gradients should be 0 + return logits.sum() * 0. + signs = 2. * labels - 1. + signs.stop_gradient = True + errors = 1. 
- logits * signs + errors_sorted, perm = paddle.fluid.core.ops.argsort(errors, 'axis', 0, + 'descending', True) + errors_sorted.stop_gradient = False + gt_sorted = paddle.gather(labels, perm) + grad = lovasz_grad(gt_sorted) + grad.stop_gradient = True + loss = paddle.sum(F.relu(errors_sorted) * grad) + return loss + + +def flatten_binary_scores(scores, labels, ignore=None): + """ + Flattens predictions in the batch (binary case). + Remove labels according to 'ignore'. + """ + scores = paddle.reshape(scores, [-1]) + labels = paddle.reshape(labels, [-1]) + labels.stop_gradient = True + if ignore is None: + return scores, labels + valid = labels != ignore + valid_mask = paddle.reshape(valid, (-1, 1)) + indexs = paddle.nonzero(valid_mask) + indexs.stop_gradient = True + vscores = paddle.gather(scores, indexs[:, 0]) + vlabels = paddle.gather(labels, indexs[:, 0]) + return vscores, vlabels + + +def lovasz_softmax_flat(probas, labels, classes='present'): + """ + Multi-class Lovasz-Softmax loss. + + Args: + probas (Tensor): Shape is [P, C], class probabilities at each prediction (between 0 and 1). + labels (Tensor): Shape is [P], ground truth labels (between 0 and C - 1). + classes (str|list): 'all' for all, 'present' for classes present in labels, or a list of classes to average. + """ + if probas.numel() == 0: + # only void pixels, the gradients should be 0 + return probas * 0. + C = probas.shape[1] + losses = [] + classes_to_sum = list(range(C)) if classes in ['all', 'present' + ] else classes + for c in classes_to_sum: + fg = paddle.cast(labels == c, probas.dtype) # foreground for class c + if classes == 'present' and fg.sum() == 0: + continue + fg.stop_gradient = True + if C == 1: + if len(classes_to_sum) > 1: + raise ValueError('Sigmoid output possible only with 1 class') + class_pred = probas[:, 0] + else: + class_pred = probas[:, c] + errors = paddle.abs(fg - class_pred) + errors_sorted, perm = paddle.fluid.core.ops.argsort( + errors, 'axis', 0, 'descending', True) + errors_sorted.stop_gradient = False + + fg_sorted = paddle.gather(fg, perm) + fg_sorted.stop_gradient = True + + grad = lovasz_grad(fg_sorted) + grad.stop_gradient = True + loss = paddle.sum(errors_sorted * grad) + losses.append(loss) + + if len(classes_to_sum) == 1: + return losses[0] + + losses_tensor = paddle.stack(losses) + mean_loss = paddle.mean(losses_tensor) + return mean_loss + + +def flatten_probas(probas, labels, ignore=None): + """ + Flattens predictions in the batch. + """ + if len(probas.shape) == 3: + probas = paddle.unsqueeze(probas, axis=1) + C = probas.shape[1] + probas = paddle.transpose(probas, [0, 2, 3, 1]) + probas = paddle.reshape(probas, [-1, C]) + labels = paddle.reshape(labels, [-1]) + if ignore is None: + return probas, labels + valid = labels != ignore + valid_mask = paddle.reshape(valid, [-1, 1]) + indexs = paddle.nonzero(valid_mask) + indexs.stop_gradient = True + vprobas = paddle.gather(probas, indexs[:, 0]) + vlabels = paddle.gather(labels, indexs[:, 0]) + return vprobas, vlabels diff --git a/paddlers/models/ppseg/models/losses/mean_square_error_loss.py b/paddlers/models/ppseg/models/losses/mean_square_error_loss.py new file mode 100644 index 0000000..3365268 --- /dev/null +++ b/paddlers/models/ppseg/models/losses/mean_square_error_loss.py @@ -0,0 +1,65 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddle import nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+
+
+@manager.LOSSES.add_component
+class MSELoss(nn.MSELoss):
+    r"""
+    **Mean Square Error Loss**
+    Computes the mean square error (squared L2 norm) of given input and label.
+    If :attr:`reduction` is set to ``'none'``, loss is calculated as:
+    .. math::
+        Out = (input - label)^2
+    If :attr:`reduction` is set to ``'mean'``, loss is calculated as:
+    .. math::
+        Out = \operatorname{mean}((input - label)^2)
+    If :attr:`reduction` is set to ``'sum'``, loss is calculated as:
+    .. math::
+        Out = \operatorname{sum}((input - label)^2)
+    where `input` and `label` are `float32` tensors of the same shape.
+
+    Args:
+        reduction (string, optional): The reduction method for the output,
+            could be 'none' | 'mean' | 'sum'.
+            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned.
+            If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned.
+            If :attr:`reduction` is ``'none'``, the unreduced loss is returned.
+            Default is ``'mean'``.
+        ignore_index (int, optional): Specifies a target value that is ignored and does not contribute to the input gradient. Default: 255.
+    Shape:
+        input (Tensor): Input tensor, the data type is float32 or float64
+        label (Tensor): Label tensor, the data type is float32 or float64
+        output (Tensor): output tensor storing the MSE loss of input and label, the data type is same as input.
+    Examples:
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+            input_data = np.array([1.5]).astype("float32")
+            label_data = np.array([1.7]).astype("float32")
+            mse_loss = paddle.nn.loss.MSELoss()
+            input = paddle.to_tensor(input_data)
+            label = paddle.to_tensor(label_data)
+            output = mse_loss(input, label)
+            print(output)
+            # [0.04000002]
+    """
+
+    def __init__(self, reduction='mean', ignore_index=255):
+        super().__init__(reduction=reduction)
diff --git a/paddlers/models/ppseg/models/losses/mixed_loss.py b/paddlers/models/ppseg/models/losses/mixed_loss.py
new file mode 100644
index 0000000..c850fa0
--- /dev/null
+++ b/paddlers/models/ppseg/models/losses/mixed_loss.py
@@ -0,0 +1,57 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+import paddle
+from paddle import nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+
+
+@manager.LOSSES.add_component
+class MixedLoss(nn.Layer):
+    """
+    Weighted computation of multiple losses.
+    The advantage is that mixed-loss training can be achieved without changing the network code.
+
+    Args:
+        losses (list[nn.Layer]): A list consisting of multiple loss classes.
+        coef (list[float|int]): Weighting coefficients of the multiple losses.
+
+    Returns:
+        A callable object of MixedLoss.
+    """
+
+    def __init__(self, losses, coef):
+        super(MixedLoss, self).__init__()
+        if not isinstance(losses, list):
+            raise TypeError('`losses` must be a list!')
+        if not isinstance(coef, list):
+            raise TypeError('`coef` must be a list!')
+        len_losses = len(losses)
+        len_coef = len(coef)
+        if len_losses != len_coef:
+            raise ValueError(
+                'The length of `losses` should be equal to that of `coef`, but they are {} and {}.'
+                .format(len_losses, len_coef))
+
+        self.losses = losses
+        self.coef = coef
+
+    def forward(self, logits, labels):
+        loss_list = []
+        for i, loss in enumerate(self.losses):
+            output = loss(logits, labels)
+            loss_list.append(output * self.coef[i])
+        return loss_list
diff --git a/paddlers/models/ppseg/models/losses/ohem_cross_entropy_loss.py b/paddlers/models/ppseg/models/losses/ohem_cross_entropy_loss.py
new file mode 100644
index 0000000..ab424d4
--- /dev/null
+++ b/paddlers/models/ppseg/models/losses/ohem_cross_entropy_loss.py
@@ -0,0 +1,99 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddle import nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+
+
+@manager.LOSSES.add_component
+class OhemCrossEntropyLoss(nn.Layer):
+    """
+    Implements the OHEM (online hard example mining) cross entropy loss function.
+
+    Args:
+        thresh (float, optional): The threshold of OHEM. Default: 0.7.
+        min_kept (int, optional): The min number to keep in loss computation. Default: 10000.
+        ignore_index (int64, optional): Specifies a target value that is ignored
+            and does not contribute to the input gradient. Default ``255``.
+    """
+
+    def __init__(self, thresh=0.7, min_kept=10000, ignore_index=255):
+        super(OhemCrossEntropyLoss, self).__init__()
+        self.thresh = thresh
+        self.min_kept = min_kept
+        self.ignore_index = ignore_index
+        self.EPS = 1e-5
+
+    def forward(self, logit, label):
+        """
+        Forward computation.
+
+        Args:
+            logit (Tensor): Logit tensor, the data type is float32, float64. Shape is
+                (N, C), where C is number of classes, and if shape is more than 2D, this
+                is (N, C, D1, D2,..., Dk), k >= 1.
+            label (Tensor): Label tensor, the data type is int64. Shape is (N), where each
+                value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is
+                (N, D1, D2,..., Dk), k >= 1.
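+
+        Examples (a minimal sketch; the shapes below are assumed for
+        illustration and are not part of the original patch):
+            .. code-block:: python
+
+                import paddle
+                logit = paddle.randn([2, 19, 64, 64])        # N, C, H, W
+                label = paddle.randint(0, 19, [2, 64, 64])   # N, H, W
+                loss = OhemCrossEntropyLoss(thresh=0.7, min_kept=10000)(logit, label)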
+ """ + if len(label.shape) != len(logit.shape): + label = paddle.unsqueeze(label, 1) + + # get the label after ohem + n, c, h, w = logit.shape + label = label.reshape((-1, )) + valid_mask = (label != self.ignore_index).astype('int64') + num_valid = valid_mask.sum() + label = label * valid_mask + + prob = F.softmax(logit, axis=1) + prob = prob.transpose((1, 0, 2, 3)).reshape((c, -1)) + + if self.min_kept < num_valid and num_valid > 0: + # let the value which ignored greater than 1 + prob = prob + (1 - valid_mask) + + # get the prob of relevant label + label_onehot = F.one_hot(label, c) + label_onehot = label_onehot.transpose((1, 0)) + prob = prob * label_onehot + prob = paddle.sum(prob, axis=0) + + threshold = self.thresh + if self.min_kept > 0: + index = prob.argsort() + threshold_index = index[min(len(index), self.min_kept) - 1] + threshold_index = int(threshold_index.numpy()[0]) + if prob[threshold_index] > self.thresh: + threshold = prob[threshold_index] + kept_mask = (prob < threshold).astype('int64') + label = label * kept_mask + valid_mask = valid_mask * kept_mask + + # make the invalid region as ignore + label = label + (1 - valid_mask) * self.ignore_index + + label = label.reshape((n, 1, h, w)) + valid_mask = valid_mask.reshape((n, 1, h, w)).astype('float32') + loss = F.softmax_with_cross_entropy( + logit, label, ignore_index=self.ignore_index, axis=1) + loss = loss * valid_mask + avg_loss = paddle.mean(loss) / (paddle.mean(valid_mask) + self.EPS) + + label.stop_gradient = True + valid_mask.stop_gradient = True + return avg_loss diff --git a/paddlers/models/ppseg/models/losses/ohem_edge_attention_loss.py b/paddlers/models/ppseg/models/losses/ohem_edge_attention_loss.py new file mode 100644 index 0000000..56db270 --- /dev/null +++ b/paddlers/models/ppseg/models/losses/ohem_edge_attention_loss.py @@ -0,0 +1,114 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle import nn +import paddle.nn.functional as F + +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.models import losses + + +@manager.LOSSES.add_component +class OhemEdgeAttentionLoss(nn.Layer): + """ + Implements the cross entropy loss function. It only compute the edge part. + + Args: + edge_threshold (float, optional): The pixels greater edge_threshold as edges. Default: 0.8. + thresh (float, optional): The threshold of ohem. Default: 0.7. + min_kept (int, optional): The min number to keep in loss computation. Default: 5000. + ignore_index (int64, optional): Specifies a target value that is ignored + and does not contribute to the input gradient. Default ``255``. + """ + + def __init__(self, + edge_threshold=0.8, + thresh=0.7, + min_kept=5000, + ignore_index=255): + super().__init__() + self.edge_threshold = edge_threshold + self.thresh = thresh + self.min_kept = min_kept + self.ignore_index = ignore_index + self.EPS = 1e-10 + + def forward(self, logits, label): + """ + Forward computation. 
+
+        Args:
+            logits (tuple|list): (seg_logit, edge_logit) Tensors, the data type is float32, float64. Shape is
+                (N, C), where C is number of classes, and if shape is more than 2D, this
+                is (N, C, D1, D2,..., Dk), k >= 1. C is 1 for edge_logit.
+            label (Tensor): Label tensor, the data type is int64. Shape is (N, C), where each
+                value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is
+                (N, C, D1, D2,..., Dk), k >= 1.
+        """
+        seg_logit, edge_logit = logits[0], logits[1]
+        if len(label.shape) != len(seg_logit.shape):
+            label = paddle.unsqueeze(label, 1)
+        if edge_logit.shape != label.shape:
+            raise ValueError(
+                'The shape of edge_logit should be equal to that of label, but they are {} != {}.'
+                .format(edge_logit.shape, label.shape))
+
+        # Filter out non-edge pixels
+        filler = paddle.ones_like(label) * self.ignore_index
+        label = paddle.where(edge_logit > self.edge_threshold, label, filler)
+
+        # ohem
+        n, c, h, w = seg_logit.shape
+        label = label.reshape((-1, ))
+        valid_mask = (label != self.ignore_index).astype('int64')
+        num_valid = valid_mask.sum()
+        label = label * valid_mask
+
+        prob = F.softmax(seg_logit, axis=1)
+        prob = prob.transpose((1, 0, 2, 3)).reshape((c, -1))
+
+        if self.min_kept < num_valid and num_valid > 0:
+            # raise the prob of ignored pixels above 1 so they are never kept
+            prob = prob + (1 - valid_mask)
+
+            # get the prob of the ground-truth label
+            label_onehot = F.one_hot(label, c)
+            label_onehot = label_onehot.transpose((1, 0))
+            prob = prob * label_onehot
+            prob = paddle.sum(prob, axis=0)
+
+            threshold = self.thresh
+            if self.min_kept > 0:
+                index = prob.argsort()
+                threshold_index = index[min(len(index), self.min_kept) - 1]
+                threshold_index = int(threshold_index.numpy()[0])
+                if prob[threshold_index] > self.thresh:
+                    threshold = prob[threshold_index]
+                kept_mask = (prob < threshold).astype('int64')
+                label = label * kept_mask
+                valid_mask = valid_mask * kept_mask
+        # mark the invalid region as ignored
+        label = label + (1 - valid_mask) * self.ignore_index
+        label = label.reshape((n, 1, h, w))
+        valid_mask = valid_mask.reshape((n, 1, h, w)).astype('float32')
+
+        loss = F.softmax_with_cross_entropy(
+            seg_logit, label, ignore_index=self.ignore_index, axis=1)
+        loss = loss * valid_mask
+        avg_loss = paddle.mean(loss) / (paddle.mean(valid_mask) + self.EPS)
+
+        label.stop_gradient = True
+        valid_mask.stop_gradient = True
+        return avg_loss
diff --git a/paddlers/models/ppseg/models/losses/pixel_contrast_cross_entropy_loss.py b/paddlers/models/ppseg/models/losses/pixel_contrast_cross_entropy_loss.py
new file mode 100644
index 0000000..fbb1299
--- /dev/null
+++ b/paddlers/models/ppseg/models/losses/pixel_contrast_cross_entropy_loss.py
@@ -0,0 +1,199 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
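+
+# A minimal usage sketch for PixelContrastCrossEntropyLoss (illustrative only;
+# the 'seg'/'embed' dict layout mirrors the forward() contract below, while
+# the shapes are assumed):
+#
+#     preds = {'seg': seg_logits, 'embed': embeddings}  # [N, C, H, W], [N, D, h, w]
+#     loss = PixelContrastCrossEntropyLoss()(preds, labels)  # labels: [N, H, W]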
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+
+
+@manager.LOSSES.add_component
+class PixelContrastCrossEntropyLoss(nn.Layer):
+    """
+    The PixelContrastCrossEntropyLoss implementation based on PaddlePaddle.
+
+    The original article refers to
+    Wenguan Wang, Tianfei Zhou, et al. "Exploring Cross-Image Pixel Contrast for Semantic Segmentation"
+    (https://arxiv.org/abs/2101.11939).
+
+    Args:
+        temperature (float, optional): Controlling the numerical similarity of features. Default: 0.1.
+        base_temperature (float, optional): Controlling the numerical range of contrast loss. Default: 0.07.
+        ignore_index (int, optional): Specifies a target value that is ignored
+            and does not contribute to the input gradient. Default 255.
+        max_samples (int, optional): The maximum number of sampled anchors. Default: 1024.
+        max_views (int): The maximum number of sampled pixels per class. Default: 100.
+    """
+
+    def __init__(self,
+                 temperature=0.1,
+                 base_temperature=0.07,
+                 ignore_index=255,
+                 max_samples=1024,
+                 max_views=100):
+        super().__init__()
+        self.temperature = temperature
+        self.base_temperature = base_temperature
+        self.ignore_index = ignore_index
+        self.max_samples = max_samples
+        self.max_views = max_views
+
+    def _hard_anchor_sampling(self, X, y_hat, y):
+        """
+        Args:
+            X (Tensor): reshaped feats, shape = [N, H * W, feat_channels]
+            y_hat (Tensor): reshaped label, shape = [N, H * W]
+            y (Tensor): reshaped predict, shape = [N, H * W]
+        """
+        batch_size, feat_dim = paddle.shape(X)[0], paddle.shape(X)[-1]
+        classes = []
+        total_classes = 0
+        for i in range(batch_size):
+            current_y = y_hat[i]
+            current_classes = paddle.unique(current_y)
+            current_classes = [
+                x for x in current_classes if x != self.ignore_index
+            ]
+            current_classes = [
+                x for x in current_classes
+                if (current_y == x).nonzero().shape[0] > self.max_views
+            ]
+
+            classes.append(current_classes)
+            total_classes += len(current_classes)
+
+        n_view = self.max_samples // total_classes
+        n_view = min(n_view, self.max_views)
+
+        X_ = []
+        y_ = paddle.zeros([total_classes], dtype='float32')
+
+        X_ptr = 0
+        for i in range(batch_size):
+            this_y_hat = y_hat[i]
+            current_y = y[i]
+            current_classes = classes[i]
+
+            for cls_id in current_classes:
+                hard_indices = paddle.logical_and(
+                    (this_y_hat == cls_id), (current_y != cls_id)).nonzero()
+                easy_indices = paddle.logical_and(
+                    (this_y_hat == cls_id), (current_y == cls_id)).nonzero()
+
+                num_hard = hard_indices.shape[0]
+                num_easy = easy_indices.shape[0]
+
+                if num_hard >= n_view / 2 and num_easy >= n_view / 2:
+                    num_hard_keep = n_view // 2
+                    num_easy_keep = n_view - num_hard_keep
+                elif num_hard >= n_view / 2:
+                    num_easy_keep = num_easy
+                    num_hard_keep = n_view - num_easy_keep
+                else:
+                    num_hard_keep = num_hard
+                    num_easy_keep = n_view - num_hard_keep
+
+                indices = None
+                if num_hard > 0:
+                    perm = paddle.randperm(num_hard)
+                    hard_indices = hard_indices[perm[:num_hard_keep]].reshape(
+                        (-1, hard_indices.shape[-1]))
+                    indices = hard_indices
+                if num_easy > 0:
+                    perm = paddle.randperm(num_easy)
+                    easy_indices = easy_indices[perm[:num_easy_keep]].reshape(
+                        (-1, easy_indices.shape[-1]))
+                    if indices is None:
+                        indices = easy_indices
+                    else:
+                        indices = paddle.concat((indices, easy_indices), axis=0)
+                if indices is None:
+                    raise UserWarning('hard sampling index error')
+
+                X_.append(paddle.index_select(X[i, :, :], indices.squeeze(1)))
+                y_[X_ptr] = float(cls_id)
+                X_ptr += 1
+        X_ = paddle.stack(X_, axis=0)
+        return X_, y_
+
+    def _contrastive(self, feats_,
labels_): + """ + Args: + feats_ (Tensor): sampled pixel, shape = [total_classes, n_view, feat_dim], total_classes = batch_size * single image classes + labels_ (Tensor): label, shape = [total_classes] + """ + anchor_num, n_view = feats_.shape[0], feats_.shape[1] + + labels_ = labels_.reshape((-1, 1)) + mask = paddle.equal(labels_, paddle.transpose(labels_, + [1, 0])).astype('float32') + + contrast_count = n_view + contrast_feature = paddle.concat(paddle.unbind(feats_, axis=1), axis=0) + + anchor_feature = contrast_feature + anchor_count = contrast_count + + anchor_dot_contrast = paddle.matmul( + anchor_feature, paddle.transpose(contrast_feature, + [1, 0])) / self.temperature + logits_max = paddle.max(anchor_dot_contrast, axis=1, keepdim=True) + logits = anchor_dot_contrast - logits_max + + mask = paddle.tile(mask, [anchor_count, contrast_count]) + neg_mask = 1 - mask + + logits_mask = 1 - paddle.eye(mask.shape[0]).astype('float32') + mask = mask * logits_mask + + neg_logits = paddle.exp(logits) * neg_mask + neg_logits = neg_logits.sum(1, keepdim=True) + + exp_logits = paddle.exp(logits) + + log_prob = logits - paddle.log(exp_logits + neg_logits) + + mean_log_prob_pos = (mask * log_prob).sum(1) / mask.sum(1) + + loss = -(self.temperature / self.base_temperature) * mean_log_prob_pos + loss = loss.mean() + + return loss + + def contrast_criterion(self, feats, labels=None, predict=None): + labels = labels.unsqueeze(1) + labels = F.interpolate(labels, feats.shape[2:], mode='nearest') + labels = labels.squeeze(1) + + batch_size = feats.shape[0] + labels = labels.reshape((batch_size, -1)) + predict = predict.reshape((batch_size, -1)) + feats = paddle.transpose(feats, [0, 2, 3, 1]) + feats = feats.reshape((feats.shape[0], -1, feats.shape[-1])) + + feats_, labels_ = self._hard_anchor_sampling(feats, labels, predict) + + loss = self._contrastive(feats_, labels_) + return loss + + def forward(self, preds, label): + assert "seg" in preds, "The input of PixelContrastCrossEntropyLoss should include 'seg' output, but not found." + assert "embed" in preds, "The input of PixelContrastCrossEntropyLoss should include 'embed' output, but not found." + + seg = preds['seg'] + embedding = preds['embed'] + + predict = paddle.argmax(seg, axis=1) + loss = self.contrast_criterion(embedding, label, predict) + return loss diff --git a/paddlers/models/ppseg/models/losses/point_cross_entropy_loss.py b/paddlers/models/ppseg/models/losses/point_cross_entropy_loss.py new file mode 100644 index 0000000..78e7bbf --- /dev/null +++ b/paddlers/models/ppseg/models/losses/point_cross_entropy_loss.py @@ -0,0 +1,160 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle import nn +import paddle.nn.functional as F + +from paddlers.models.ppseg.cvlibs import manager + +@manager.LOSSES.add_component +class PointCrossEntropyLoss(nn.Layer): + """ + Implements the point cross entropy loss function. 
+
+    The original article refers to
+    Kirillov A, Wu Y, He K, et al. "PointRend: Image Segmentation As Rendering."
+    (https://arxiv.org/abs/1912.08193).
+
+    Args:
+        weight (tuple|list|ndarray|Tensor, optional): A manual rescaling weight
+            given to each class. Its length must be equal to the number of classes.
+            Default ``None``.
+        ignore_index (int64, optional): Specifies a target value that is ignored
+            and does not contribute to the input gradient. Default ``255``.
+        top_k_percent_pixels (float, optional): The value lies in [0.0, 1.0]. When its value < 1.0, only compute the loss for
+            the top k percent pixels (e.g., the top 20% pixels). This is useful for hard pixel mining. Default ``1.0``.
+        data_format (str, optional): The tensor format to use, 'NCHW' or 'NHWC'. Default ``'NCHW'``.
+    """
+
+    def __init__(self,
+                 weight=None,
+                 ignore_index=255,
+                 top_k_percent_pixels=1.0,
+                 data_format='NCHW',
+                 align_corners=False):
+        super(PointCrossEntropyLoss, self).__init__()
+        if weight is not None:
+            weight = paddle.to_tensor(weight, dtype='float32')
+        self.weight = weight
+        self.ignore_index = ignore_index
+        self.top_k_percent_pixels = top_k_percent_pixels
+        self.EPS = 1e-8
+        self.data_format = data_format
+        self.align_corners = align_corners
+
+    def forward(self, logits, label, semantic_weights=None):
+        """
+        Forward computation.
+
+        Args:
+            logits (tuple): A tuple of (logit, points). logit's shape is [N, C, point_num]
+                and points' shape is [N, point_num, 2], where C is the number of classes.
+            label (Tensor): Label tensor, the data type is int64. Shape is (N), where each
+                value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is
+                (N, D1, D2,..., Dk), k >= 1.
+            semantic_weights (Tensor, optional): Weights about loss for each pixels, shape is the same as label. Default: None.
+        """
+        # for loss
+        logit, points = logits  # [N, C, point_num], [N, point_num, 2]
+        label = label.unsqueeze(1)  # [N, 1, H, W]
+        label = point_sample(
+            label.astype('float32'),
+            points,
+            mode='nearest',
+            align_corners=self.align_corners)  # [N, 1, point_num]
+        label = paddle.squeeze(label, axis=1).astype('int64')  # [N, point_num]
+
+        channel_axis = 1 if self.data_format == 'NCHW' else -1
+        if self.weight is not None and logit.shape[channel_axis] != len(
+                self.weight):
+            raise ValueError(
+                'The number of weights = {} must be the same as the number of classes = {}.'
+                .format(len(self.weight), logit.shape[1]))
+
+        logit = paddle.transpose(logit, [0, 2, 1])
+        no_ignore_label = label
+        loss = F.cross_entropy(
+            logit,
+            no_ignore_label,
+            ignore_index=self.ignore_index,
+            reduction='none')
+
+        mask = label != self.ignore_index
+        mask = paddle.cast(mask, 'float32')
+
+        loss = loss * mask
+        if semantic_weights is not None:
+            loss = loss * semantic_weights
+
+        if self.weight is not None:
+            _one_hot = F.one_hot(label, logit.shape[-1])
+            _one_hot_weight = _one_hot * self.weight
+            loss = loss * _one_hot_weight.argmax(-1)
+            coef = paddle.sum(_one_hot_weight, axis=-1)
+        else:
+            coef = paddle.ones_like(label)
+
+        label.stop_gradient = True
+        mask.stop_gradient = True
+        if self.top_k_percent_pixels == 1.0:
+            avg_loss = paddle.mean(loss) / (paddle.mean(mask * coef) + self.EPS)
+            return avg_loss
+
+        loss = loss.reshape((-1, ))
+        top_k_pixels = int(self.top_k_percent_pixels * loss.numel())
+        loss, indices = paddle.topk(loss, top_k_pixels)
+        coef = coef.reshape((-1, ))
+        coef = paddle.gather(coef, indices)
+        coef.stop_gradient = True
+
+        return loss.mean() / (paddle.mean(coef) + self.EPS)
+
+
+def point_sample(input, points, align_corners=False, **kwargs):
+    """A wrapper around :func:`grid_sample` to support 3D point_coords tensors.
+    Unlike `paddle.nn.functional.grid_sample` it assumes point_coords to
+    lie inside the ``[0, 1] x [0, 1]`` square.
+
+    Args:
+        input (Tensor): Feature map, shape (N, C, H, W).
+        points (Tensor): Image based absolute point coordinates (normalized),
+            range [0, 1] x [0, 1], shape (N, P, 2) or (N, Hgrid, Wgrid, 2).
+        align_corners (bool): Whether to use align_corners. Default: False.
+    Returns:
+        Tensor: Features of `point` on `input`, shape (N, C, P) or
+            (N, C, Hgrid, Wgrid).
+    """

+    def denormalize(grid):
+        """Denormalize the input grid from range [0, 1] to [-1, 1].
+        Args:
+            grid (Tensor): The grid to be denormalized, range [0, 1].
+        Returns:
+            Tensor: Denormalized grid, range [-1, 1].
+        """
+
+        return grid * 2.0 - 1.0
+
+    add_dim = False
+    if points.dim() == 3:
+        add_dim = True
+        points = paddle.unsqueeze(points, axis=2)  # [N, point_num, 1, 2]
+    output = F.grid_sample(
+        input, denormalize(points), align_corners=align_corners, **kwargs)
+    if add_dim:
+        output = paddle.squeeze(output, axis=3)
+    return output
+
+
diff --git a/paddlers/models/ppseg/models/losses/rmi_loss.py b/paddlers/models/ppseg/models/losses/rmi_loss.py
new file mode 100644
index 0000000..72b8f3b
--- /dev/null
+++ b/paddlers/models/ppseg/models/losses/rmi_loss.py
@@ -0,0 +1,256 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
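+
+# A minimal usage sketch (illustrative only, not part of the original
+# implementation; the shapes are assumed):
+#
+#     import paddle
+#     logits = paddle.randn([2, 19, 64, 64])        # N, C, H, W
+#     labels = paddle.randint(0, 19, [2, 64, 64])   # N, H, W
+#     loss = RMILoss(num_classes=19)(logits, labels)
+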
+"""rmi loss in PaddlePaddle""" +import numpy +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlers.models.ppseg.cvlibs import manager + +_euler_num = 2.718281828 +_pi = 3.14159265 +_ln_2_pi = 1.837877 +_CLIP_MIN = 1e-6 +_CLIP_MAX = 1.0 +_POS_ALPHA = 5e-4 +_IS_SUM = 1 + + +@manager.LOSSES.add_component +class RMILoss(nn.Layer): + """ + Implements the Region Mutual Information(RMI) Loss(https://arxiv.org/abs/1910.12037) for Semantic Segmentation. + Unlike vanilla rmi loss which contains Cross Entropy Loss, we disband them and only + left the RMI-related parts. + The motivation is to allow for a more flexible combination of losses during training. + For example, by employing mixed loss to merge RMI Loss with Boostrap Cross Entropy Loss, + we can achieve the online mining of hard examples together with attention to region information. + Args: + weight (tuple|list|ndarray|Tensor, optional): A manual rescaling weight + given to each class. Its length must be equal to the number of classes. + Default ``None``. + ignore_index (int64, optional): Specifies a target value that is ignored + and does not contribute to the input gradient. Default ``255``. + """ + + def __init__(self, + num_classes=19, + rmi_radius=3, + rmi_pool_way=0, + rmi_pool_size=3, + rmi_pool_stride=3, + loss_weight_lambda=0.5, + ignore_index=255): + super(RMILoss, self).__init__() + + self.num_classes = num_classes + assert rmi_radius in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + self.rmi_radius = rmi_radius + assert rmi_pool_way in [0, 1, 2, 3] + self.rmi_pool_way = rmi_pool_way + assert rmi_pool_size == rmi_pool_stride + self.rmi_pool_size = rmi_pool_size + self.rmi_pool_stride = rmi_pool_stride + self.weight_lambda = loss_weight_lambda + self.half_d = self.rmi_radius * self.rmi_radius + self.d = 2 * self.half_d + self.kernel_padding = self.rmi_pool_size // 2 + self.ignore_index = ignore_index + + def forward(self, logits_4D, labels_4D, do_rmi=True): + """ + Forward computation. + Args: + logits (Tensor): Shape is [N, C, H, W], logits at each prediction (between -\infty and +\infty). + labels (Tensor): Shape is [N, H, W], ground truth labels (between 0 and C - 1). + """ + logits_4D = paddle.cast(logits_4D, dtype='float32') + labels_4D = paddle.cast(labels_4D, dtype='float32') + + loss = self.forward_sigmoid(logits_4D, labels_4D, do_rmi=do_rmi) + return loss + + def forward_sigmoid(self, logits_4D, labels_4D, do_rmi=False): + """ + Using the sigmiod operation both. 
+        Args:
+            logits_4D : [N, C, H, W], dtype=float32
+            labels_4D : [N, H, W], dtype=long
+            do_rmi : bool
+        """
+        label_mask_3D = labels_4D != self.ignore_index
+        valid_onehot_labels_4D = paddle.cast(
+            F.one_hot(
+                paddle.cast(labels_4D, dtype='int64') * paddle.cast(
+                    label_mask_3D, dtype='int64'),
+                num_classes=self.num_classes),
+            dtype='float32')
+
+        valid_onehot_labels_4D = valid_onehot_labels_4D * paddle.unsqueeze(
+            label_mask_3D, axis=3)
+        valid_onehot_labels_4D.stop_gradient = True
+        probs_4D = F.sigmoid(logits_4D) * paddle.unsqueeze(
+            label_mask_3D, axis=1) + _CLIP_MIN
+
+        valid_onehot_labels_4D = paddle.transpose(valid_onehot_labels_4D,
+                                                  [0, 3, 1, 2])
+        valid_onehot_labels_4D.stop_gradient = True
+        rmi_loss = self.rmi_lower_bound(valid_onehot_labels_4D, probs_4D)
+
+        return rmi_loss
+
+    def inverse(self, x):
+        return paddle.inverse(x)
+
+    def rmi_lower_bound(self, labels_4D, probs_4D):
+        """
+        Calculate the lower bound of the region mutual information.
+
+        Args:
+            labels_4D : [N, C, H, W], dtype=float32
+            probs_4D : [N, C, H, W], dtype=float32
+        """
+        assert labels_4D.shape == probs_4D.shape, \
+            'The shapes of labels_4D and probs_4D should be the same, but they are {} and {}.'.format(
+                labels_4D.shape, probs_4D.shape)
+
+        p, s = self.rmi_pool_size, self.rmi_pool_stride
+        if self.rmi_pool_stride > 1:
+            if self.rmi_pool_way == 0:
+                labels_4D = F.max_pool2d(
+                    labels_4D,
+                    kernel_size=p,
+                    stride=s,
+                    padding=self.kernel_padding)
+                probs_4D = F.max_pool2d(
+                    probs_4D,
+                    kernel_size=p,
+                    stride=s,
+                    padding=self.kernel_padding)
+            elif self.rmi_pool_way == 1:
+                labels_4D = F.avg_pool2d(
+                    labels_4D,
+                    kernel_size=p,
+                    stride=s,
+                    padding=self.kernel_padding)
+                probs_4D = F.avg_pool2d(
+                    probs_4D,
+                    kernel_size=p,
+                    stride=s,
+                    padding=self.kernel_padding)
+            elif self.rmi_pool_way == 2:
+                shape = labels_4D.shape
+                new_h, new_w = shape[2] // s, shape[3] // s
+                labels_4D = F.interpolate(
+                    labels_4D, size=(new_h, new_w), mode='nearest')
+                probs_4D = F.interpolate(
+                    probs_4D,
+                    size=(new_h, new_w),
+                    mode='bilinear',
+                    align_corners=True)
+            else:
+                raise NotImplementedError("Pool way of RMI is not defined!")
+
+        label_shape = labels_4D.shape
+        n, c = label_shape[0], label_shape[1]
+
+        la_vectors, pr_vectors = self.map_get_pairs(
+            labels_4D, probs_4D, radius=self.rmi_radius, is_combine=0)
+
+        la_vectors = paddle.reshape(la_vectors, [n, c, self.half_d, -1])
+        la_vectors = paddle.cast(la_vectors, dtype='float64')
+        la_vectors.stop_gradient = True
+
+        pr_vectors = paddle.reshape(pr_vectors, [n, c, self.half_d, -1])
+        pr_vectors = paddle.cast(pr_vectors, dtype='float64')
+
+        diag_matrix = paddle.unsqueeze(
+            paddle.unsqueeze(paddle.eye(self.half_d), axis=0), axis=0)
+        la_vectors = la_vectors - paddle.mean(la_vectors, axis=3, keepdim=True)
+
+        la_cov = paddle.matmul(la_vectors,
+                               paddle.transpose(la_vectors, [0, 1, 3, 2]))
+        pr_vectors = pr_vectors - paddle.mean(pr_vectors, axis=3, keepdim=True)
+        pr_cov = paddle.matmul(pr_vectors,
+                               paddle.transpose(pr_vectors, [0, 1, 3, 2]))
+
+        pr_cov_inv = self.inverse(
+            pr_cov + paddle.cast(diag_matrix, dtype='float64') * _POS_ALPHA)
+
+        la_pr_cov = paddle.matmul(la_vectors,
+                                  paddle.transpose(pr_vectors, [0, 1, 3, 2]))
+
+        appro_var = la_cov - paddle.matmul(
+            paddle.matmul(la_pr_cov, pr_cov_inv),
+            paddle.transpose(la_pr_cov, [0, 1, 3, 2]))
+
+        rmi_now = 0.5 * self.log_det_by_cholesky(
+            appro_var + paddle.cast(diag_matrix, dtype='float64') * _POS_ALPHA)
+
+        rmi_per_class = paddle.cast(
+            paddle.mean(
+                paddle.reshape(rmi_now, [-1,
self.num_classes]), axis=0),
+            dtype='float32')
+        rmi_per_class = paddle.divide(rmi_per_class,
+                                      paddle.to_tensor(float(self.half_d)))
+
+        rmi_loss = paddle.sum(rmi_per_class) if _IS_SUM else paddle.mean(
+            rmi_per_class)
+
+        return rmi_loss
+
+    def log_det_by_cholesky(self, matrix):
+        """
+        Args:
+            matrix: matrix must be a positive definite matrix.
+                shape [N, C, D, D].
+        """
+
+        chol = paddle.cholesky(matrix)
+        diag = paddle.diagonal(chol, offset=0, axis1=-2, axis2=-1)
+        chol = paddle.log(diag + 1e-8)
+
+        return 2.0 * paddle.sum(chol, axis=-1)
+
+    def map_get_pairs(self, labels_4D, probs_4D, radius=3, is_combine=True):
+        """
+        Args:
+            labels_4D : labels, shape [N, C, H, W]
+            probs_4D : probabilities, shape [N, C, H, W]
+            radius : the square radius
+        Return:
+            tensor with shape [N, C, radius * radius, H - (radius - 1), W - (radius - 1)]
+        """
+
+        label_shape = labels_4D.shape
+        h, w = label_shape[2], label_shape[3]
+        new_h, new_w = h - (radius - 1), w - (radius - 1)
+        la_ns = []
+        pr_ns = []
+        for y in range(0, radius, 1):
+            for x in range(0, radius, 1):
+                la_now = labels_4D[:, :, y:y + new_h, x:x + new_w]
+                pr_now = probs_4D[:, :, y:y + new_h, x:x + new_w]
+                la_ns.append(la_now)
+                pr_ns.append(pr_now)
+
+        if is_combine:
+            pair_ns = la_ns + pr_ns
+            p_vectors = paddle.stack(pair_ns, axis=2)
+            return p_vectors
+        else:
+            la_vectors = paddle.stack(la_ns, axis=2)
+            pr_vectors = paddle.stack(pr_ns, axis=2)
+            return la_vectors, pr_vectors
diff --git a/paddlers/models/ppseg/models/losses/semantic_connectivity_loss.py b/paddlers/models/ppseg/models/losses/semantic_connectivity_loss.py
new file mode 100644
index 0000000..15050e3
--- /dev/null
+++ b/paddlers/models/ppseg/models/losses/semantic_connectivity_loss.py
@@ -0,0 +1,175 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import cv2
+import numpy as np
+import paddle
+from paddle import nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+
+
+@manager.LOSSES.add_component
+class SemanticConnectivityLoss(nn.Layer):
+    '''
+    SCL (Semantic Connectivity-aware Learning) framework, which introduces a SC Loss (Semantic Connectivity-aware Loss)
+    to improve the quality of segmentation results from the perspective of connectivity. Multi-class segmentation is supported.
+
+    The original article refers to
+    Lutao Chu, Yi Liu, Zewu Wu, Shiyu Tang, Guowei Chen, Yuying Hao, Juncai Peng, Zhiliang Yu, Zeyu Chen, Baohua Lai, Haoyi Xiong.
+    "PP-HumanSeg: Connectivity-Aware Portrait Segmentation with a Large-Scale Teleconferencing Video Dataset"
+    In WACV 2022 workshop
+    https://arxiv.org/abs/2112.07146
+
+    Running process:
+    Step 1. Connected Components Calculation
+    Step 2.
Connected Components Matching and SC Loss Calculation
+    '''
+
+    def __init__(self, ignore_index=255, max_pred_num_conn=10, use_argmax=True):
+        '''
+        Args:
+            ignore_index (int): Specifies a pixel value in the annotated image to be ignored, which does
+                not contribute to the input gradient. When there are pixels that cannot (or are difficult to)
+                be marked in the annotated image, they can be marked with a specific gray value. When
+                calculating the loss, the corresponding pixels of the original image will not be used as
+                independent variables of the loss function. *Default: ``255``*
+            max_pred_num_conn (int): Maximum number of predicted connected components. At the beginning of training,
+                there will be a large number of connected components, and the calculation is very time-consuming.
+                Therefore, it is necessary to limit the maximum number of predicted connected components;
+                the rest do not participate in the calculation.
+            use_argmax (bool): Whether to apply argmax to the logits.
+        '''
+        super().__init__()
+        self.ignore_index = ignore_index
+        self.max_pred_num_conn = max_pred_num_conn
+        self.use_argmax = use_argmax
+
+    def forward(self, logits, labels):
+        '''
+        Args:
+            logits (Tensor): [N, C, H, W]
+            labels (Tensor): [N, H, W]
+        '''
+        preds = paddle.argmax(logits, axis=1) if self.use_argmax else logits
+        preds_np = preds.astype('uint8').numpy()
+        labels_np = labels.astype('uint8').numpy()
+        preds = paddle.to_tensor(preds, 'float32', stop_gradient=False)
+        multi_class_sc_loss = paddle.zeros([preds.shape[0]])
+        zero = paddle.to_tensor([0.])  # for accelerating
+
+        # Traverse each image
+        for i in range(preds.shape[0]):
+            sc_loss = 0
+            class_num = 0
+
+            pred_i = preds[i]
+            preds_np_i = preds_np[i]
+            labels_np_i = labels_np[i]
+
+            # Traverse each class
+            for class_ in np.unique(labels_np_i):
+                if class_ == self.ignore_index:
+                    continue
+                class_num += 1
+
+                # Connected Components Calculation
+                preds_np_class = preds_np_i == class_
+                labels_np_class = labels_np_i == class_
+                pred_num_conn, pred_conn = cv2.connectedComponents(
+                    preds_np_class.astype(np.uint8))  # pred_conn.shape = [H,W]
+                label_num_conn, label_conn = cv2.connectedComponents(
+                    labels_np_class.astype(np.uint8))
+
+                if pred_num_conn > 2 * label_num_conn:
+                    pred_num_conn = min(pred_num_conn, self.max_pred_num_conn)
+                real_pred_num = pred_num_conn - 1
+                real_label_num = label_num_conn - 1
+
+                # Connected Components Matching and SC Loss Calculation
+                if real_label_num > 0 and real_pred_num > 0:
+                    img_connectivity = compute_class_connectivity(
+                        pred_conn, label_conn, pred_num_conn, label_num_conn,
+                        pred_i, real_label_num, real_pred_num, zero)
+                    sc_loss += 1 - img_connectivity
+                elif real_label_num == 0 and real_pred_num == 0:
+                    # if there is no connected component, SC Loss = 0, so pass
+                    pass
+                else:
+                    preds_class = pred_i == int(class_)
+                    not_preds_class = paddle.bitwise_not(preds_class)
+                    labels_class = paddle.to_tensor(labels_np_class)
+                    missed_detect = labels_class * not_preds_class
+                    missed_detect_area = paddle.sum(missed_detect).astype(
+                        'float32')
+                    sc_loss += missed_detect_area / missed_detect.numel() + 1
+
+            multi_class_sc_loss[
+                i] = sc_loss / class_num if class_num != 0 else 0
+        multi_class_sc_loss = paddle.mean(multi_class_sc_loss)
+        return multi_class_sc_loss
+
+
+def compute_class_connectivity(pred_conn, label_conn, pred_num_conn,
+                               label_num_conn, pred, real_label_num,
+                               real_pred_num, zero):
+
+    pred_conn = paddle.to_tensor(pred_conn)
+    label_conn = paddle.to_tensor(label_conn)
+    pred_conn = F.one_hot(pred_conn,
pred_num_conn) + label_conn = F.one_hot(label_conn, label_num_conn) + + ious = paddle.zeros((real_label_num, real_pred_num)) + pair_conn_sum = paddle.to_tensor([0.], stop_gradient=False) + + for i in range(1, label_num_conn): + label_i = label_conn[:, :, i] + + pair_conn = paddle.to_tensor([0.], stop_gradient=False) + pair_conn_num = 0 + + for j in range(1, pred_num_conn): + pred_j_mask = pred_conn[:, :, j] + pred_j = pred_j_mask * pred + + iou = compute_iou(pred_j, label_i, zero) + ious[i - 1, j - 1] = iou + if iou != 0: + pair_conn += iou + pair_conn_num += 1 + + if pair_conn_num != 0: + pair_conn_sum += pair_conn / pair_conn_num + lone_pred_num = 0 + + pred_sum = paddle.sum(ious, axis=0) + for m in range(0, real_pred_num): + if pred_sum[m] == 0: + lone_pred_num += 1 + img_connectivity = pair_conn_sum / (real_label_num + lone_pred_num) + return img_connectivity + + +def compute_iou(pred_i, label_i, zero): + intersect_area_i = paddle.sum(pred_i * label_i) + if paddle.equal(intersect_area_i, zero): + return 0 + + pred_area_i = paddle.sum(pred_i) + label_area_i = paddle.sum(label_i) + union_area_i = pred_area_i + label_area_i - intersect_area_i + if paddle.equal(union_area_i, zero): + return 1 + else: + return intersect_area_i / union_area_i diff --git a/paddlers/models/ppseg/models/losses/semantic_encode_cross_entropy_loss.py b/paddlers/models/ppseg/models/losses/semantic_encode_cross_entropy_loss.py new file mode 100644 index 0000000..2a1802d --- /dev/null +++ b/paddlers/models/ppseg/models/losses/semantic_encode_cross_entropy_loss.py @@ -0,0 +1,47 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlers.models.ppseg.cvlibs import manager + + +@manager.LOSSES.add_component +class SECrossEntropyLoss(nn.Layer): + """ + The Semantic Encoding Loss implementation based on PaddlePaddle. + + """ + def __init__(self, *args, **kwargs): + super(SECrossEntropyLoss, self).__init__() + + def forward(self, logit, label): + if logit.ndim == 4: + logit = logit.squeeze(2).squeeze(3) + assert logit.ndim == 2, "The shape of logit should be [N, C, 1, 1] or [N, C], but the logit dim is {}.".format( + logit.ndim) + + batch_size, num_classes = paddle.shape(logit) + se_label = paddle.zeros([batch_size, num_classes]) + for i in range(batch_size): + hist = paddle.histogram(label[i], + bins=num_classes, + min=0, + max=num_classes - 1) + hist = hist.astype('float32') / hist.sum().astype('float32') + se_label[i] = (hist > 0).astype('float32') + loss = F.binary_cross_entropy_with_logits(logit, se_label) + return loss diff --git a/paddlers/models/ppseg/models/mla_transformer.py b/paddlers/models/ppseg/models/mla_transformer.py new file mode 100644 index 0000000..bb4c978 --- /dev/null +++ b/paddlers/models/ppseg/models/mla_transformer.py @@ -0,0 +1,241 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.models import layers
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.utils import utils
+
+
+class MLAHeads(nn.Layer):
+    def __init__(self, mlahead_channels=128):
+        super(MLAHeads, self).__init__()
+        self.head2 = nn.Sequential(
+            layers.ConvBNReLU(
+                mlahead_channels * 2,
+                mlahead_channels,
+                3,
+                padding=1,
+                bias_attr=False),
+            layers.ConvBNReLU(
+                mlahead_channels,
+                mlahead_channels,
+                3,
+                padding=1,
+                bias_attr=False))
+        self.head3 = nn.Sequential(
+            layers.ConvBNReLU(
+                mlahead_channels * 2,
+                mlahead_channels,
+                3,
+                padding=1,
+                bias_attr=False),
+            layers.ConvBNReLU(
+                mlahead_channels,
+                mlahead_channels,
+                3,
+                padding=1,
+                bias_attr=False))
+        self.head4 = nn.Sequential(
+            layers.ConvBNReLU(
+                mlahead_channels * 2,
+                mlahead_channels,
+                3,
+                padding=1,
+                bias_attr=False),
+            layers.ConvBNReLU(
+                mlahead_channels,
+                mlahead_channels,
+                3,
+                padding=1,
+                bias_attr=False))
+        self.head5 = nn.Sequential(
+            layers.ConvBNReLU(
+                mlahead_channels * 2,
+                mlahead_channels,
+                3,
+                padding=1,
+                bias_attr=False),
+            layers.ConvBNReLU(
+                mlahead_channels,
+                mlahead_channels,
+                3,
+                padding=1,
+                bias_attr=False))
+
+    def forward(self, mla_p2, mla_p3, mla_p4, mla_p5):
+        # upsample each branch by 4x; use (H, W) instead of (W, W) so that
+        # non-square feature maps are handled correctly
+        head2 = F.interpolate(
+            self.head2(mla_p2),
+            size=(4 * mla_p2.shape[2], 4 * mla_p2.shape[3]),
+            mode='bilinear',
+            align_corners=True)
+        head3 = F.interpolate(
+            self.head3(mla_p3),
+            size=(4 * mla_p3.shape[2], 4 * mla_p3.shape[3]),
+            mode='bilinear',
+            align_corners=True)
+        head4 = F.interpolate(
+            self.head4(mla_p4),
+            size=(4 * mla_p4.shape[2], 4 * mla_p4.shape[3]),
+            mode='bilinear',
+            align_corners=True)
+        head5 = F.interpolate(
+            self.head5(mla_p5),
+            size=(4 * mla_p5.shape[2], 4 * mla_p5.shape[3]),
+            mode='bilinear',
+            align_corners=True)
+
+        return paddle.concat([head2, head3, head4, head5], axis=1)
+
+
+@manager.MODELS.add_component
+class MLATransformer(nn.Layer):
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 backbone,
+                 mlahead_channels=128,
+                 aux_channels=256,
+                 norm_layer=nn.BatchNorm2D,
+                 pretrained=None,
+                 **kwargs):
+        super(MLATransformer, self).__init__()
+
+        self.BatchNorm = norm_layer
+        self.mlahead_channels = mlahead_channels
+        self.num_classes = num_classes
+        self.in_channels = in_channels
+        self.backbone = backbone
+
+        self.mlahead = MLAHeads(mlahead_channels=self.mlahead_channels)
+        self.cls = nn.Conv2D(
+            4 * self.mlahead_channels, self.num_classes, 3, padding=1)
+
+        self.conv0 = layers.ConvBNReLU(
+            self.in_channels[0],
+            self.in_channels[0] * 2,
+            3,
+            padding=1,
+            bias_attr=False)
+        self.conv1 = layers.ConvBNReLU(
+            self.in_channels[1],
+            self.in_channels[1],
+            3,
+            padding=1,
+            bias_attr=False)
+        self.conv21 = layers.ConvBNReLU(
+            self.in_channels[2],
+            self.in_channels[2],
+            3,
+            padding=1,
+            bias_attr=False)
+        self.conv22 = layers.ConvBNReLU(
+            self.in_channels[2],
+            self.in_channels[2] // 2,
+            3,
+            padding=1,
+
bias_attr=False) + self.conv31 = layers.ConvBNReLU( + self.in_channels[3], + self.in_channels[3], + 3, + padding=1, + bias_attr=False) + self.conv32 = layers.ConvBNReLU( + self.in_channels[3], + self.in_channels[3] // 2, + 3, + padding=1, + bias_attr=False) + self.conv33 = layers.ConvBNReLU( + self.in_channels[3] // 2, + self.in_channels[3] // 4, + 3, + padding=1, + bias_attr=False) + + self.aux_head = nn.Sequential( + layers.ConvBN( + in_channels=self.in_channels[2], + out_channels=aux_channels, + kernel_size=3, + padding=1, + bias_attr=False), + nn.Conv2D( + in_channels=aux_channels, + out_channels=self.num_classes, + kernel_size=1, + )) + + self.pretrained = pretrained + self.init_weight() + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + def forward(self, x): + inputs = self.backbone(x) + + inputs0 = self.conv0(inputs[0]) + inputs1 = F.interpolate( + self.conv1(inputs[1]), + size=inputs[0].shape[2:], + mode='bilinear', + align_corners=True) + inputs2 = F.interpolate( + self.conv21(inputs[2]), + scale_factor=2, + mode='bilinear', + align_corners=True) + inputs2 = F.interpolate( + self.conv22(inputs2), + size=inputs[0].shape[2:], + mode='bilinear', + align_corners=True) + inputs3 = F.interpolate( + self.conv31(inputs[3]), + scale_factor=2, + mode='bilinear', + align_corners=True) + inputs3 = F.interpolate( + self.conv32(inputs3), + scale_factor=2, + mode='bilinear', + align_corners=True) + inputs3 = F.interpolate( + self.conv33(inputs3), + size=inputs[0].shape[2:], + mode='bilinear', + align_corners=True) + inputs2 = inputs2 + inputs3 + inputs1 = inputs1 + inputs2 + inputs0 = inputs0 + inputs1 + + feats = self.mlahead(inputs0, inputs1, inputs2, inputs3) + logit = self.cls(feats) + logit_list = [logit] + + if self.training: + logit_list.append(self.aux_head(inputs[2])) + + logit_list = [ + F.interpolate( + logit, paddle.shape(x)[2:], mode='bilinear', align_corners=True) + for logit in logit_list + ] + return logit_list diff --git a/paddlers/models/ppseg/models/ocrnet.py b/paddlers/models/ppseg/models/ocrnet.py new file mode 100644 index 0000000..1225642 --- /dev/null +++ b/paddlers/models/ppseg/models/ocrnet.py @@ -0,0 +1,246 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlers.models.ppseg import utils +from paddlers.models.ppseg.cvlibs import manager, param_init +from paddlers.models.ppseg.models import layers + + +@manager.MODELS.add_component +class OCRNet(nn.Layer): + """ + The OCRNet implementation based on PaddlePaddle. + The original article refers to + Yuan, Yuhui, et al. "Object-Contextual Representations for Semantic Segmentation" + (https://arxiv.org/pdf/1909.11065.pdf) + + Args: + num_classes (int): The unique number of target classes. + backbone (Paddle.nn.Layer): Backbone network. + backbone_indices (tuple): A tuple indicates the indices of output of backbone. 
+ It can be either one or two values, if two values, the first index will be taken as + a deep-supervision feature in auxiliary layer; the second one will be taken as + input of pixel representation. If one value, it is taken by both above. + ocr_mid_channels (int, optional): The number of middle channels in OCRHead. Default: 512. + ocr_key_channels (int, optional): The number of key channels in ObjectAttentionBlock. Default: 256. + align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature + is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. + pretrained (str, optional): The path or url of pretrained model. Default: None. + """ + + def __init__(self, + num_classes, + backbone, + backbone_indices, + ocr_mid_channels=512, + ocr_key_channels=256, + align_corners=False, + pretrained=None): + super().__init__() + + self.backbone = backbone + self.backbone_indices = backbone_indices + in_channels = [self.backbone.feat_channels[i] for i in backbone_indices] + + self.head = OCRHead( + num_classes=num_classes, + in_channels=in_channels, + ocr_mid_channels=ocr_mid_channels, + ocr_key_channels=ocr_key_channels) + + self.align_corners = align_corners + self.pretrained = pretrained + self.init_weight() + + def forward(self, x): + feats = self.backbone(x) + feats = [feats[i] for i in self.backbone_indices] + logit_list = self.head(feats) + if not self.training: + logit_list = [logit_list[0]] + + logit_list = [ + F.interpolate( + logit, + paddle.shape(x)[2:], + mode='bilinear', + align_corners=self.align_corners) for logit in logit_list + ] + return logit_list + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class OCRHead(nn.Layer): + """ + The Object contextual representation head. + + Args: + num_classes(int): The unique number of target classes. + in_channels(tuple): The number of input channels. + ocr_mid_channels(int, optional): The number of middle channels in OCRHead. Default: 512. + ocr_key_channels(int, optional): The number of key channels in ObjectAttentionBlock. Default: 256. 
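The OCRHead defined next hinges on SpatialGatherBlock: softmax-normalized soft region scores act as attention weights that pool pixel features into one representation vector per class region. A shape walkthrough under assumed sizes (a sketch, not part of this patch):

    import paddle
    import paddle.nn.functional as F

    n, c, k, h, w = 2, 512, 19, 8, 8                     # assumed sizes
    pixels = paddle.randn([n, c, h, w])                  # pixel representation
    regions = paddle.randn([n, k, h, w])                 # soft region (class) scores

    p = pixels.reshape([n, c, -1]).transpose([0, 2, 1])  # (n, h*w, c)
    r = F.softmax(regions.reshape([n, k, -1]), axis=2)   # (n, k, h*w) weights
    obj = paddle.bmm(r, p)                               # (n, k, c): one vector per region
    obj = obj.transpose([0, 2, 1]).unsqueeze(-1)         # (n, c, k, 1), as returned below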
+ """ + + def __init__(self, + num_classes, + in_channels, + ocr_mid_channels=512, + ocr_key_channels=256): + super().__init__() + + self.num_classes = num_classes + self.spatial_gather = SpatialGatherBlock(ocr_mid_channels, num_classes) + self.spatial_ocr = SpatialOCRModule(ocr_mid_channels, ocr_key_channels, + ocr_mid_channels) + + self.indices = [-2, -1] if len(in_channels) > 1 else [-1, -1] + + self.conv3x3_ocr = layers.ConvBNReLU( + in_channels[self.indices[1]], ocr_mid_channels, 3, padding=1) + self.cls_head = nn.Conv2D(ocr_mid_channels, self.num_classes, 1) + self.aux_head = nn.Sequential( + layers.ConvBNReLU(in_channels[self.indices[0]], + in_channels[self.indices[0]], 1), + nn.Conv2D(in_channels[self.indices[0]], self.num_classes, 1)) + + self.init_weight() + + def forward(self, feat_list): + feat_shallow, feat_deep = feat_list[self.indices[0]], feat_list[ + self.indices[1]] + + soft_regions = self.aux_head(feat_shallow) + pixels = self.conv3x3_ocr(feat_deep) + + object_regions = self.spatial_gather(pixels, soft_regions) + ocr = self.spatial_ocr(pixels, object_regions) + + logit = self.cls_head(ocr) + return [logit, soft_regions] + + def init_weight(self): + """Initialize the parameters of model parts.""" + for sublayer in self.sublayers(): + if isinstance(sublayer, nn.Conv2D): + param_init.normal_init(sublayer.weight, std=0.001) + elif isinstance(sublayer, (nn.BatchNorm, nn.SyncBatchNorm)): + param_init.constant_init(sublayer.weight, value=1.0) + param_init.constant_init(sublayer.bias, value=0.0) + + +class SpatialGatherBlock(nn.Layer): + """Aggregation layer to compute the pixel-region representation.""" + + def __init__(self, pixels_channels, regions_channels): + super().__init__() + self.pixels_channels = pixels_channels + self.regions_channels = regions_channels + + def forward(self, pixels, regions): + # pixels: from (n, c, h, w) to (n, h*w, c) + pixels = paddle.reshape(pixels, (0, self.pixels_channels, -1)) + pixels = paddle.transpose(pixels, (0, 2, 1)) + + # regions: from (n, k, h, w) to (n, k, h*w) + regions = paddle.reshape(regions, (0, self.regions_channels, -1)) + regions = F.softmax(regions, axis=2) + + # feats: from (n, k, c) to (n, c, k, 1) + feats = paddle.bmm(regions, pixels) + feats = paddle.transpose(feats, (0, 2, 1)) + feats = paddle.unsqueeze(feats, axis=-1) + + return feats + + +class SpatialOCRModule(nn.Layer): + """Aggregate the global object representation to update the representation for each pixel.""" + + def __init__(self, + in_channels, + key_channels, + out_channels, + dropout_rate=0.1): + super().__init__() + + self.attention_block = ObjectAttentionBlock(in_channels, key_channels) + self.conv1x1 = nn.Sequential( + layers.ConvBNReLU(2 * in_channels, out_channels, 1), + nn.Dropout2D(dropout_rate)) + + def forward(self, pixels, regions): + context = self.attention_block(pixels, regions) + feats = paddle.concat([context, pixels], axis=1) + feats = self.conv1x1(feats) + + return feats + + +class ObjectAttentionBlock(nn.Layer): + """A self-attention module.""" + + def __init__(self, in_channels, key_channels): + super().__init__() + + self.in_channels = in_channels + self.key_channels = key_channels + + self.f_pixel = nn.Sequential( + layers.ConvBNReLU(in_channels, key_channels, 1), + layers.ConvBNReLU(key_channels, key_channels, 1)) + + self.f_object = nn.Sequential( + layers.ConvBNReLU(in_channels, key_channels, 1), + layers.ConvBNReLU(key_channels, key_channels, 1)) + + self.f_down = layers.ConvBNReLU(in_channels, key_channels, 1) + + self.f_up = 
layers.ConvBNReLU(key_channels, in_channels, 1) + + def forward(self, x, proxy): + x_shape = paddle.shape(x) + # query : from (n, c1, h1, w1) to (n, h1*w1, key_channels) + query = self.f_pixel(x) + query = paddle.reshape(query, (0, self.key_channels, -1)) + query = paddle.transpose(query, (0, 2, 1)) + + # key : from (n, c2, h2, w2) to (n, key_channels, h2*w2) + key = self.f_object(proxy) + key = paddle.reshape(key, (0, self.key_channels, -1)) + + # value : from (n, c2, h2, w2) to (n, h2*w2, key_channels) + value = self.f_down(proxy) + value = paddle.reshape(value, (0, self.key_channels, -1)) + value = paddle.transpose(value, (0, 2, 1)) + + # sim_map (n, h1*w1, h2*w2) + sim_map = paddle.bmm(query, key) + sim_map = (self.key_channels**-.5) * sim_map + sim_map = F.softmax(sim_map, axis=-1) + + # context from (n, h1*w1, key_channels) to (n , out_channels, h1, w1) + context = paddle.bmm(sim_map, value) + context = paddle.transpose(context, (0, 2, 1)) + context = paddle.reshape(context, + (0, self.key_channels, x_shape[2], x_shape[3])) + context = self.f_up(context) + + return context diff --git a/paddlers/models/ppseg/models/pfpnnet.py b/paddlers/models/ppseg/models/pfpnnet.py new file mode 100644 index 0000000..91da78f --- /dev/null +++ b/paddlers/models/ppseg/models/pfpnnet.py @@ -0,0 +1,201 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlers.models.ppseg.models import layers +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.utils import utils + + +@manager.MODELS.add_component +class PFPNNet(nn.Layer): + """ + The Panoptic Feature Pyramid Networks implementation based on PaddlePaddle. + + The original article refers to + Alexander Kirillov, Ross Girshick, Kaiming He, Piotr Dollár, et al. "Panoptic Feature Pyramid Networks" + (https://arxiv.org/abs/1901.02446) + + Args: + num_classes (int): The unique number of target classes. + backbone (Paddle.nn.Layer): Backbone network, currently support Resnet50/101. + backbone_indices (tuple): Four values in the tuple indicate the indices of output of backbone. + enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: False. + align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even, + e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. + pretrained (str, optional): The path or url of pretrained model. Default: None. 
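The ObjectAttentionBlock above is standard scaled dot-product attention with the k region vectors as keys and values. Its shape flow, sketched with assumed sizes:

    import paddle
    import paddle.nn.functional as F

    n, key_c, hw, k = 2, 256, 4096, 19            # assumed: h1*w1=4096 pixels, k regions
    query = paddle.randn([n, hw, key_c])          # f_pixel(x), flattened and transposed
    key = paddle.randn([n, key_c, k])             # f_object(proxy)
    value = paddle.randn([n, k, key_c])           # f_down(proxy), transposed

    sim = paddle.bmm(query, key) * key_c ** -0.5  # (n, hw, k) scaled similarities
    sim = F.softmax(sim, axis=-1)
    context = paddle.bmm(sim, value)              # (n, hw, key_c), mapped back by f_up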
+ """ + def __init__(self, + num_classes, + backbone, + backbone_indices, + channels, + enable_auxiliary_loss=False, + align_corners=False, + dropout_ratio=0.1, + fpn_inplanes=[256, 512, 1024, 2048], + pretrained=None): + super(PFPNNet, self).__init__() + self.backbone = backbone + self.backbone_indices = backbone_indices + self.in_channels = [ + self.backbone.feat_channels[i] for i in backbone_indices + ] + self.align_corners = align_corners + self.pretrained = pretrained + self.enable_auxiliary_loss = enable_auxiliary_loss + + self.head = PFPNHead(num_class=num_classes, + fpn_inplanes=fpn_inplanes, + dropout_ratio=dropout_ratio, + channels=channels, + fpn_dim=channels, + enable_auxiliary_loss=self.enable_auxiliary_loss) + self.init_weight() + + def forward(self, x): + feats = self.backbone(x) + feats = [feats[i] for i in self.backbone_indices] + logit_list = self.head(feats) + return [ + F.interpolate(logit, + paddle.shape(x)[2:], + mode='bilinear', + align_corners=self.align_corners) + for logit in logit_list + ] + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class PFPNHead(nn.Layer): + """ + The PFPNHead implementation. + + Args: + inplane (int): Input channels of PPM module. + num_class (int): The unique number of target classes. + fpn_inplanes (list): The feature channels from backbone. + fpn_dim (int, optional): The input channels of FPN module. Default: 512. + enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: False. + """ + def __init__(self, + num_class, + fpn_inplanes, + channels, + dropout_ratio=0.1, + fpn_dim=256, + enable_auxiliary_loss=False, + align_corners=False): + super(PFPNHead, self).__init__() + self.enable_auxiliary_loss = enable_auxiliary_loss + self.align_corners = align_corners + self.lateral_convs = nn.LayerList() + self.fpn_out = nn.LayerList() + + for fpn_inplane in fpn_inplanes: + self.lateral_convs.append( + nn.Sequential(nn.Conv2D(fpn_inplane, fpn_dim, 1), + layers.SyncBatchNorm(fpn_dim), nn.ReLU())) + self.fpn_out.append( + nn.Sequential( + layers.ConvBNReLU(fpn_dim, fpn_dim, 3, bias_attr=False))) + + self.scale_heads = nn.LayerList() + for index in range(len(fpn_inplanes)): + head_length = max( + 1, int(np.log2(fpn_inplanes[index]) - np.log2(fpn_inplanes[0]))) + scale_head = nn.LayerList() + for head_index in range(head_length): + scale_head.append( + layers.ConvBNReLU( + fpn_dim, + channels, + 3, + padding=1, + )) + if fpn_inplanes[index] != fpn_inplanes[0]: + scale_head.append( + nn.Upsample(scale_factor=2, + mode='bilinear', + align_corners=align_corners)) + self.scale_heads.append(nn.Sequential(*scale_head)) + + if dropout_ratio: + self.dropout = nn.Dropout2D(dropout_ratio) + if self.enable_auxiliary_loss: + self.dsn = nn.Sequential( + layers.ConvBNReLU(fpn_inplanes[2], + fpn_inplanes[2], + 3, + padding=1), nn.Dropout2D(dropout_ratio), + nn.Conv2D(fpn_inplanes[2], num_class, kernel_size=1)) + else: + self.dropout = None + if self.enable_auxiliary_loss: + self.dsn = nn.Sequential( + layers.ConvBNReLU(fpn_inplanes[2], + fpn_inplanes[2], + 3, + padding=1), + nn.Conv2D(fpn_inplanes[2], num_class, kernel_size=1)) + + self.conv_last = nn.Sequential( + layers.ConvBNReLU(len(fpn_inplanes) * fpn_dim, + fpn_dim, + 3, + bias_attr=False), + nn.Conv2D(fpn_dim, num_class, kernel_size=1)) + self.conv_seg = nn.Conv2D(channels, num_class, kernel_size=1) + + def cls_seg(self, feat): + if self.dropout is not None: + feat = self.dropout(feat) + output = 
self.conv_seg(feat) + return output + + def forward(self, conv_out): + last_out = self.lateral_convs[-1](conv_out[-1]) + f = last_out + fpn_feature_list = [last_out] + for i in reversed(range(len(conv_out) - 1)): + conv_x = conv_out[i] + conv_x = self.lateral_convs[i](conv_x) + prev_shape = paddle.shape(conv_x)[2:] + f = conv_x + F.interpolate( + f, prev_shape, mode='bilinear', align_corners=True) + fpn_feature_list.append(self.fpn_out[i](f)) + + output_size = paddle.shape(fpn_feature_list[-1])[2:] + + x = self.scale_heads[0](fpn_feature_list[-1]) + for index in range(len(self.scale_heads) - 2, 0, -1): + x = x + F.interpolate(self.scale_heads[index]( + fpn_feature_list[index]), + size=output_size, + mode='bilinear', + align_corners=self.align_corners) + x = self.cls_seg(x) + if self.enable_auxiliary_loss: + dsn = self.dsn(conv_out[2]) + return [x, dsn] + else: + return [x] diff --git a/paddlers/models/ppseg/models/pointrend.py b/paddlers/models/ppseg/models/pointrend.py new file mode 100644 index 0000000..5f9a880 --- /dev/null +++ b/paddlers/models/ppseg/models/pointrend.py @@ -0,0 +1,832 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.models import layers +from paddlers.models.ppseg.utils import utils + + +@manager.MODELS.add_component +class PointRend(nn.Layer): + """ + The SemanticFPN-PointRend implementation based on PaddlePaddle. + + The original article refers to + Kirillov A, Wu Y, He K, et al. "PointRend: Image Segmentation As Rendering." + (https://arxiv.org/abs/1912.08193). + + Args: + num_classes (int): The unique number of target classes. + backbone (Paddle.nn.Layer): Backbone network, currently support Resnet50/101. + backbone_indices (tuple, optional): Four values in the tuple indicate the indices of output of backbone. + fpn_inplanes (list, optional): Input channels list(the feature channels from backbone) for lateral_conv constraction in FPN. Default: [256, 512, 1024, 2048]. + fpn_outplanes (int, optional): The output channels in FPN. Default: 256. + point_num_fcs (int, optional): Number of fc layers in the head in PointHead. Default: 3. + point_in_channels (list, optional): input channels of fc block in PointHead. Default: [256]. + point_out_channels (int, optional): Fc block's output channels in PointHead. Default: 256. + point_in_index (list, optional): The indexs of input features to use in PointHead. Default: [0]. + point_num_points (int, optional): The number of point in training mode in PointHead. Default: 2048. + point_oversample_ratio (int, optional): The sample ratio of points when in training mode in PointHead. + sampled_point = num_points * oversample_ratio. Default: 3. + point_importance_sample_ratio (float, optional): The importance sample ratio for compute num_uncertain_points in PointHead. Default: 0.75. 
+        point_scale_factor(int, optional): The scale factor of F.interpolate in the logit refinement stage at inference in PointHead. Default: 2.
+        point_subdivision_steps(int, optional): The number of refinement steps at inference in PointHead. Default: 2.
+        point_subdivision_num_points(int, optional): The number of points used to refine the seg logits at inference in PointHead. Default: 8196.
+        point_dropout_ratio(float, optional): If dropout_ratio > 0, apply Dropout with probability dropout_ratio before the output in PointHead. Default: 0.1.
+        point_coarse_pred_each_layer(bool, optional): Whether to concatenate the coarse feature with
+            the output of each fc layer in PointHead. Default: True.
+        point_conv_cfg(str): The config of Conv in PointHead. Default: 'Conv1D'.
+        point_input_transform(str): The feature transform method for inputs in PointHead;
+            see function '_transform_inputs'. Default: 'multiple_select'.
+        PFN_feature_strides(list): The strides of the input feature maps in FPNHead; all strides are supposed to be powers of 2, with the first
+            one of largest resolution. Default: [4, 8, 16, 32].
+        PFN_in_channels(list): The channels of the input features in FPNHead. Default: [256, 256, 256, 256].
+        PFN_channels(int, optional): The output channels of scale_head's Conv before the Upsample block in FPNHead. Default: 128.
+        PFN_in_index(list): The indices of input features to use; its length should match that of in_channels in FPNHead. Default: [0, 1, 2, 3].
+        PFN_dropout_ratio(float, optional): If dropout_ratio > 0, apply Dropout with probability dropout_ratio before the output in FPNHead. Default: 0.1.
+        PFN_conv_cfg(str): The config of Conv. Default: 'Conv2D'.
+        PFN_input_transform(str): The feature transform method for inputs in FPNHead; see function '_transform_inputs'. Default: 'multiple_select'.
+        align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
+            e.g. 1024x512, otherwise True, e.g. 769x769. Default: False.
+        pretrained (str, optional): The path or url of the pretrained model. Default: None.
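With the defaults above, the training-time point budget of get_points_train works out as follows (a worked example of the formulas further below):

    num_points = 2048
    oversample_ratio = 3
    importance_sample_ratio = 0.75

    num_sampled = num_points * oversample_ratio                # 6144 candidate points
    num_uncertain = int(importance_sample_ratio * num_points)  # 1536 most uncertain kept
    num_random = num_points - num_uncertain                    # 512 uniformly random
    assert num_uncertain + num_random == num_points            # 2048 points supervised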
+ """ + + def __init__( + self, + num_classes, + backbone, + backbone_indices, + fpn_inplanes=[256, 512, 1024, 2048], + fpn_outplanes=256, + point_in_channels=[256], + point_out_channels=256, + point_in_index=[0], + point_num_fcs=3, + point_num_points=2048, + point_oversample_ratio=3, + point_importance_sample_ratio=0.75, + point_scale_factor=2, + point_subdivision_steps=2, + point_subdivision_num_points=8196, + point_dropout_ratio=0, + point_coarse_pred_each_layer=True, + point_input_transform='multiple_select', # resize_concat + point_conv_cfg='Conv1D', + PFN_feature_strides=[4, 8, 16, 32], + PFN_in_channels=[256, 256, 256, 256], + PFN_channels=128, + PFN_in_index=[0, 1, 2, 3], + PFN_dropout_ratio=0, + PFN_conv_cfg='Conv2D', + PFN_input_transform='multiple_select', + align_corners=False, + pretrained=None): + super(PointRend, self).__init__() + self.backbone = backbone + self.backbone_indices = backbone_indices + self.in_channels = [ + self.backbone.feat_channels[i] for i in backbone_indices + ] + + self.neck = FPNNeck( + fpn_inplanes=fpn_inplanes, fpn_outplanes=fpn_outplanes) + self.pointhead = PointHead( + in_channels=point_in_channels, + out_channels=point_out_channels, + num_classes=num_classes, + in_index=point_in_index, + num_fcs=point_num_fcs, + num_points=point_num_points, + oversample_ratio=point_oversample_ratio, + importance_sample_ratio=point_importance_sample_ratio, + scale_factor=point_scale_factor, + subdivision_steps=point_subdivision_steps, + subdivision_num_points=point_subdivision_num_points, + dropout_ratio=point_dropout_ratio, + align_corners=align_corners, + coarse_pred_each_layer=point_coarse_pred_each_layer, + input_transform=point_input_transform, # resize_concat + conv_cfg=point_conv_cfg) + self.fpnhead = FPNHead( + feature_strides=PFN_feature_strides, + in_channels=PFN_in_channels, + channels=PFN_channels, + num_class=num_classes, + in_index=PFN_in_index, + dropout_ratio=PFN_dropout_ratio, + conv_cfg=PFN_conv_cfg, + input_transform=PFN_input_transform, + align_corners=align_corners) + + self.align_corners = align_corners + self.pretrained = pretrained + self.init_weight() + + def forward(self, x): + feats = self.backbone(x) + feats = [feats[i] for i in self.backbone_indices] + fpn_feats = self.neck(feats) # [n,256,64,128]*3 & [n,256,128,256] + pfn_logits = self.fpnhead( + fpn_feats + ) # segmainoutput decode_head[0] 512*1024->[n, 19, 64, 128] + point_logits = self.pointhead( + fpn_feats, pfn_logits) # segpointoutput decode_head[1] + + if self.training: + logit_list = [ + F.interpolate( + logit, + paddle.shape(x)[2:], + mode='bilinear', + align_corners=self.align_corners) for logit in pfn_logits + ] + logit_list.append(point_logits) + else: + logit_list = [ + F.interpolate( + logit, + paddle.shape(x)[2:], + mode='bilinear', + align_corners=self.align_corners) for logit in point_logits + ] + return logit_list + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class PointHead(nn.Layer): + """ + The PointHead implementation based on PaddlePaddle. + + PointHead use shared multi-layer perceptron (equivalent to + nn.Conv1D) to predict the logit of input points. The fine-grained feature + and coarse feature will be concatenate together for predication. + + The original article refers to: + Kirillov A , Wu Y , He K , et al "PointRend: Image Segmentation As Rendering." + (https://arxiv.org/abs/1912.08193) + + Args: + num_classes (int): Number of classes for logits. Default: 19. 
+        num_fcs (int, optional): Number of fc layers in the head. Default: 3.
+        in_channels (list): Input channels of the fc block. Default: [256].
+        out_channels (int, optional): The fc block's output channels. Default: 256.
+        in_index (list): The indices of input features to use. Default: [0].
+        num_points (int, optional): The number of points sampled in training mode. Default: 2048.
+        oversample_ratio (int, optional): The oversample ratio of points in training mode;
+            sampled_point = num_points * oversample_ratio. Default: 3.
+        importance_sample_ratio (float, optional): The importance sample ratio used to compute num_uncertain_points. Default: 0.75.
+        scale_factor (int, optional): The scale factor of F.interpolate in the logit refinement stage at inference. Default: 2.
+        subdivision_steps (int, optional): The number of refinement steps at inference. Default: 2.
+        subdivision_num_points (int, optional): The number of points used to refine the seg logits at inference. Default: 8196.
+        dropout_ratio (float, optional): If dropout_ratio > 0, apply Dropout with probability dropout_ratio before the output. Default: 0.1.
+        coarse_pred_each_layer (bool, optional): Whether to concatenate the coarse feature with
+            the output of each fc layer. Default: True.
+        conv_cfg (str): The config of Conv. Default: 'Conv1D'.
+        input_transform (str): The feature transform method for inputs;
+            see function '_transform_inputs'. Default: 'multiple_select'.
+        align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
+            e.g. 1024x512, otherwise True, e.g. 769x769. Default: False.
+    """
+
+    def __init__(
+            self,
+            num_classes=19,
+            num_fcs=3,
+            in_channels=[256],
+            out_channels=256,
+            in_index=[0],
+            num_points=2048,
+            oversample_ratio=3,
+            importance_sample_ratio=0.75,
+            scale_factor=2,
+            subdivision_steps=2,
+            subdivision_num_points=8196,
+            dropout_ratio=0.1,
+            coarse_pred_each_layer=True,
+            conv_cfg='Conv1D',
+            input_transform='multiple_select',  # resize_concat
+            align_corners=False):
+        super(PointHead, self).__init__()
+
+        self.in_channels = in_channels
+        self.channels = out_channels
+        self.in_index = in_index
+        self.num_classes = num_classes
+        self.num_fcs = num_fcs
+        self.num_points = num_points
+        self.oversample_ratio = oversample_ratio
+        self.importance_sample_ratio = importance_sample_ratio
+        self.scale_factor = scale_factor
+        self.subdivision_steps = subdivision_steps
+        self.subdivision_num_points = paddle.to_tensor(
+            subdivision_num_points, dtype="int32")
+        self.dropout_ratio = dropout_ratio
+        self.coarse_pred_each_layer = coarse_pred_each_layer
+        self.align_corners = align_corners
+        self.input_transform = input_transform
+
+        fc_in_channels = sum(self.in_channels) + self.num_classes
+        fc_channels = self.channels
+        self.fcs = nn.LayerList()
+        for k in range(num_fcs):
+            fc = ConvModule(
+                fc_in_channels,
+                fc_channels,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+                conv_cfg=conv_cfg,
+            )
+            self.fcs.append(fc)
+            fc_in_channels = fc_channels
+            fc_in_channels += self.num_classes if self.coarse_pred_each_layer else 0
+        self.fc_seg = nn.Conv1D(
+            fc_in_channels,
+            self.num_classes,
+            kernel_size=1,
+            stride=1,
+            padding=0)
+
+        if self.dropout_ratio > 0:
+            self.dropout = nn.Dropout(self.dropout_ratio)
+        else:
+            self.dropout = None
+
+    def cls_seg(self, feat):
+        """Classify each pixel with fc."""
+        if self.dropout is not None:
+            feat = self.dropout(feat)
+        output = self.fc_seg(feat)
+        return output
+
+    def _get_fine_grained_point_feats(self, x, points):
+        """
+        Sample from fine-grained features.
+
+        Args:
+            x (list[Tensor]): Feature pyramid from the neck or backbone.
+            points (Tensor): Point coordinates, shape (batch_size,
+                num_points, 2).
+        Returns:
+            fine_grained_feats (Tensor): Sampled fine-grained features,
+                shape (batch_size, sum(channels of x), num_points).
+        """
+
+        fine_grained_feats_list = [
+            point_sample(_, points, align_corners=self.align_corners) for _ in x
+        ]
+        if len(fine_grained_feats_list) > 1:
+            fine_grained_feats = paddle.concat(fine_grained_feats_list, axis=1)
+        else:
+            fine_grained_feats = fine_grained_feats_list[0]
+        return fine_grained_feats
+
+    def _get_coarse_point_feats(self, prev_output, points):
+        """
+        Sample from the coarse prediction.
+
+        Args:
+            prev_output (Tensor): Prediction of the previous decode head.
+            points (Tensor): Point coordinates, shape (batch_size,
+                num_points, 2).
+        Returns:
+            coarse_feats (Tensor): Sampled coarse features, shape (batch_size,
+                num_classes, num_points).
+        """
+
+        coarse_feats = point_sample(
+            prev_output, points, align_corners=self.align_corners)
+        return coarse_feats
+
+    def _transform_inputs(self, inputs):
+        """
+        Transform inputs for the decoder.
+
+        Args:
+            inputs (list[Tensor]): List of multi-level image features.
+        Returns:
+            Tensor: The transformed inputs.
+        """
+
+        if self.input_transform == 'resize_concat':
+            inputs = [inputs[i] for i in self.in_index]
+            upsampled_inputs = [
+                F.interpolate(
+                    x,
+                    size=paddle.shape(inputs[0])[2:],
+                    mode='bilinear',
+                    align_corners=self.align_corners) for x in inputs
+            ]
+            inputs = paddle.concat(upsampled_inputs, axis=1)
+        elif self.input_transform == 'multiple_select':
+            inputs = [inputs[i] for i in self.in_index]
+        else:
+            inputs = inputs[self.in_index[0]]
+        return inputs
+
+    def get_points_train(self, seg_logits, uncertainty_func):
+        """
+        Sample points for training.
+        Sample points in [0, 1] x [0, 1] coordinate space based on their
+        uncertainty. The uncertainties are calculated for each point using
+        the 'uncertainty_func' function that takes a point's logit prediction
+        as input.
+
+        Args:
+            seg_logits (Tensor): Semantic segmentation logits, shape (
+                batch_size, num_classes, height, width).
+            uncertainty_func (func): The uncertainty calculation function.
+        Returns:
+            point_coords (Tensor): A tensor of shape (batch_size, num_points,
+                2) that contains the coordinates of ``num_points`` sampled
+                points.
+        """
+
+        num_points = self.num_points
+        oversample_ratio = self.oversample_ratio
+        importance_sample_ratio = self.importance_sample_ratio
+        assert oversample_ratio >= 1
+        assert 0 <= importance_sample_ratio <= 1
+        batch_size = paddle.shape(seg_logits)[0]
+        num_sampled = int(num_points * oversample_ratio)
+        point_coords = paddle.rand([batch_size, num_sampled, 2])
+        point_logits = point_sample(seg_logits, point_coords)
+        # It is crucial to calculate uncertainty based on the sampled
+        # prediction value for the points. Calculating uncertainties of the
+        # coarse predictions first and sampling them for points leads to
+        # incorrect results. To illustrate this: assume uncertainty_func(
+        # logits) = -abs(logits); a sampled point between two coarse
+        # predictions with -1 and 1 logits has 0 logits, and therefore 0
+        # uncertainty value. However, if we calculate uncertainties for the
+        # coarse predictions first, both will have -1 uncertainty,
+        # and the sampled point will get -1 uncertainty.
+ point_uncertainties = uncertainty_func(point_logits) + num_uncertain_points = int(importance_sample_ratio * num_points) + num_random_points = num_points - num_uncertain_points + idx = paddle.topk( + point_uncertainties[:, 0, :], k=num_uncertain_points, axis=1)[1] + shift = num_sampled * paddle.arange(batch_size, dtype='int64') + idx += shift.unsqueeze([-1]) + idx = idx.reshape([-1]) + point_coords = paddle.index_select( + point_coords.reshape([-1, 2]), idx, axis=0) + point_coords = point_coords.reshape( + [batch_size, num_uncertain_points, 2]) + if num_random_points > 0: + rand_point_coords = paddle.rand([batch_size, num_random_points, 2]) + point_coords = paddle.concat((point_coords, rand_point_coords), + axis=1) + return point_coords + + def get_points_test(self, seg_logits, uncertainty_func): # finish + """ + Sample points for testing. + Find ``num_points`` most uncertain points from ``uncertainty_map``. + + Args: + seg_logits (Tensor): A tensor of shape (batch_size, num_classes, + height, width) for class-specific or class-agnostic prediction. + uncertainty_func (func): uncertainty calculation function. + cfg (dict): Testing config of point head. + Returns: + point_indices (Tensor): A tensor of shape (batch_size, num_points) + that contains indices from [0, height x width) of the most + uncertain points. + point_coords (Tensor): A tensor of shape (batch_size, num_points, + 2) that contains [0, 1] x [0, 1] normalized coordinates of the + most uncertain points from the ``height x width`` grid . + """ + + num_points = self.subdivision_num_points + uncertainty_map = uncertainty_func(seg_logits) + batch_size = paddle.shape(uncertainty_map)[0] + height = paddle.shape(uncertainty_map)[2] + width = paddle.shape(uncertainty_map)[3] + h_step = 1.0 / height + w_step = 1.0 / width + + uncertainty_map = uncertainty_map.reshape([batch_size, height * width]) + num_points = paddle.min(paddle.concat([height * width, num_points])) + point_indices = paddle.topk(uncertainty_map, num_points, axis=1)[1] + point_coords = paddle.zeros([batch_size, num_points, 2], + dtype='float32') + point_coords[:, :, 0] = w_step / 2.0 + ( + point_indices % width).astype('float32') * w_step + point_coords[:, :, 1] = h_step / 2.0 + ( + point_indices // width).astype('float32') * h_step + return point_indices, point_coords + + def scatter_paddle(self, refined_seg_logits, point_indices, point_logits): + """ + paddle version scatter : equal to pytorch version scatter(-1,point_indices,point_logits). + + Args: + refined_seg_logits(Tensor): shape=[batch_size, channels, height * width] + point_indices(Tensor): shape=[batch_size, channels, height * width] + point_logits(Tensor): shape[batch_size, channels, height * width] + Returns: + scattered refined_seg_logits(Tensor). 
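+
+            Note: paddle.scatter only writes along axis 0 of its input, so the
+            implementation flattens (N, C) into rows, shifts each row's point
+            indices by row_index * height * width, and scatters into the fully
+            flattened logits before reshaping back.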
+ """ + + original_shape = paddle.shape(refined_seg_logits) # [batch_size, channels, height * width] + new_refined_seg_logits = refined_seg_logits.flatten(0, 1) # [N*C,H*W] + offsets = (paddle.arange(paddle.shape(new_refined_seg_logits)[0]) * + paddle.shape(new_refined_seg_logits)[1]).unsqueeze(-1) # [N*C,1] + point_indices = point_indices.flatten(0, 1) # [N*C,H*W] + new_point_indices = (point_indices + offsets).flatten() + point_logits = point_logits.flatten() # [N*C*H*W] + refined_seg_logits = paddle.scatter( + refined_seg_logits.flatten(), + new_point_indices, + point_logits, + overwrite=True) + return refined_seg_logits.reshape(shape=original_shape) + + def forward_train(self, x, prev_output): + with paddle.no_grad(): + points = self.get_points_train(prev_output, calculate_uncertainty) + + fine_grained_point_feats = self._get_fine_grained_point_feats( + x, points) # [2, 256, 2048] + coarse_point_feats = self._get_coarse_point_feats( + prev_output, points) # [2, 19, 2048] + # forward for train + fusion_point_feats = paddle.concat( + [fine_grained_point_feats, coarse_point_feats], axis=1) + for fc in self.fcs: + fusion_point_feats = fc(fusion_point_feats) + if self.coarse_pred_each_layer: + fusion_point_feats = paddle.concat( + (fusion_point_feats, coarse_point_feats), axis=1) + point_logits = self.cls_seg(fusion_point_feats) + return [point_logits, points] # for points loss + + def forward(self, inputs, prev_output): + """ + Forward function. + + Args: + inputs (list[Tensor]): List of multi-level img features. + prev_output (Tensor): The output of previous decode head. + Returns: + [point_logits,points]: For points loss when in training. + [refined_seg_logits]: Output refined seg logits when in inference. + """ + + prev_output = prev_output[0] + x = self._transform_inputs(inputs) + if self.training: + return self.forward_train(x, prev_output) + else: + refined_seg_logits = prev_output.clone() + for _ in range(self.subdivision_steps): + refined_seg_logits = F.interpolate( + refined_seg_logits, + scale_factor=self.scale_factor, + mode='bilinear', + align_corners=self.align_corners) + + save_shape = paddle.shape(refined_seg_logits) + point_indices, points = self.get_points_test( + refined_seg_logits, calculate_uncertainty) + fine_grained_point_feats = self._get_fine_grained_point_feats( + x, points) + coarse_point_feats = self._get_coarse_point_feats( + prev_output, points) + # forward for inference + fusion_point_feats = paddle.concat( + [fine_grained_point_feats, coarse_point_feats], axis=1) + for fc in self.fcs: + fusion_point_feats = fc(fusion_point_feats) + if self.coarse_pred_each_layer: + fusion_point_feats = paddle.concat( + (fusion_point_feats, coarse_point_feats), axis=1) + point_logits = self.cls_seg(fusion_point_feats) + point_indices = paddle.unsqueeze(point_indices, axis=1) + point_indices = paddle.expand(point_indices, [-1, save_shape[1], -1]) + + refined_seg_logits = paddle.flatten(refined_seg_logits, 2) + refined_seg_logits = self.scatter_paddle( + refined_seg_logits, point_indices, + point_logits) # 2->height * width dim + refined_seg_logits = refined_seg_logits.reshape(save_shape) + return [refined_seg_logits] + + +class FPNHead(nn.Layer): + """ + This head is the implementation of Semantic FPN in paddle. + + The original article refers to: + Kirillov, A. , et al. "Panoptic Feature Pyramid Networks." + (https://arxiv.org/abs/1901.02446) + + Args: + num_classes(int): The unique number of target classes. Default: 19. 
+ feature_strides(list): The strides for input feature maps and all strides suppose to be power of 2. The first + one is of largest resolution. Default: [4, 8, 16, 32]. + in_channels(list): The input feature's channels list. Default: [256, 256, 256, 256]. + channels(int, optional): The output channels of scale_head's Conv before Upsample block. Default: 128. + in_index(list): The indexs of input features to use. it's shape should keep with in_channels. Default: [0, 1, 2, 3]. + dropout_ratio(float, optional): If the dropout_ratio >0, to use Dropout before output and the p of dropout is dropout_ratio. Default: 0.1. + conv_cfg(str): The config of Conv. Default: 'Conv2D'. + input_transform(str): The features transform method of inputs. it can be found in function '_transform_inputs'. Defalut: 'multiple_select'. + align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even, + e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. + """ + + def __init__( + self, + num_class=19, + feature_strides=[4, 8, 16, 32], + in_channels=[256, 256, 256, 256], + channels=128, + in_index=[0, 1, 2, 3], + dropout_ratio=0.1, + conv_cfg='Conv2D', + input_transform='multiple_select', + align_corners=False, + ): + super(FPNHead, self).__init__() + assert len(feature_strides) == len(in_channels) + assert min(feature_strides) == feature_strides[0] + self.feature_strides = feature_strides + self.in_channels = in_channels + self.channels = channels + self.in_index = in_index + self.num_class = num_class + self.conv_cfg = conv_cfg + self.dropout_ratio = dropout_ratio + self.input_transform = input_transform + self.align_corners = align_corners + self.scale_heads = nn.LayerList() + + for i in range(len(feature_strides)): + head_length = max( + 1, + int(np.log2(feature_strides[i]) - np.log2(feature_strides[0]))) + scale_head = [] + for k in range(head_length): + scale_head.append( + ConvModule( + self.in_channels[i] if k == 0 else self.channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg)) + if feature_strides[i] != feature_strides[0]: + scale_head.append( + Upsample( + scale_factor=2, + mode='bilinear', + align_corners=self.align_corners)) + self.scale_heads.append(nn.Sequential(*scale_head)) + + self.conv_seg = nn.Conv2D(self.channels, self.num_class, kernel_size=1) + + if self.dropout_ratio is not None: + self.dropout = nn.Dropout2D(self.dropout_ratio) + else: + self.dropout = None + + def cls_seg(self, feat): + if self.dropout is not None: + feat = self.dropout(feat) + output = self.conv_seg(feat) + return output + + def _transform_inputs(self, inputs): + """ + Transform inputs for decoder. + + Args: + inputs (list[Tensor]): List of multi-level img features. 
+ Returns: + Tensor: The transformed inputs + """ + + if self.input_transform == 'resize_concat': + inputs = [inputs[i] for i in self.in_index] + upsampled_inputs = [ + F.interpolate( + x, + size=paddle.shape(inputs[0])[2:], + mode='bilinear', + align_corners=self.align_corners) for x in inputs + ] + inputs = paddle.concat(upsampled_inputs, axis=1) + elif self.input_transform == 'multiple_select': + inputs = [inputs[i] for i in self.in_index] + else: + inputs = inputs[self.in_index[0]] + + return inputs + + def forward(self, inputs): + x = self._transform_inputs(inputs) + output = self.scale_heads[0](x[0]) + for i in range(1, len(self.feature_strides)): + output = output + F.interpolate( + self.scale_heads[i](x[i]), + size=paddle.shape(output)[2:], + mode='bilinear', + align_corners=self.align_corners) + output = self.cls_seg(output) + return [output] + + +class FPNNeck(nn.Layer): + """ + The FPN Neck implementation in paddle. + + Args: + fpn_inplanes (list, optional): Input channels list(the feature channels from backbone) for lateral_conv constraction. Default: [256, 512, 1024, 2048]. + fpn_outplanes (int, optional): The output channels. Default: 256. + """ + + def __init__( + self, + fpn_inplanes=[256, 512, 1024, 2048], + fpn_outplanes=256, + ): + super(FPNNeck, self).__init__() + self.lateral_convs = [] + self.fpn_out = [] + + # FPN head + for fpn_inplane in fpn_inplanes: + self.lateral_convs.append( + nn.Sequential( + nn.Conv2D(fpn_inplane, fpn_outplanes, 1), + layers.SyncBatchNorm(fpn_outplanes), nn.ReLU())) + self.fpn_out.append( + nn.Sequential( + layers.ConvBNReLU( + fpn_outplanes, fpn_outplanes, 3, bias_attr=False))) + + self.lateral_convs = nn.LayerList(self.lateral_convs) + self.fpn_out = nn.LayerList(self.fpn_out) + + def forward(self, conv_out): + last_out = self.lateral_convs[-1](conv_out[-1]) + f = last_out + fpn_feature_list = [last_out] + for i in reversed(range(len(conv_out) - 1)): + conv_x = conv_out[i] + conv_x = self.lateral_convs[i](conv_x) + prev_shape = paddle.shape(conv_x)[2:] + f = conv_x + F.interpolate( + f, prev_shape, mode='bilinear', align_corners=True) + fpn_feature_list.append(self.fpn_out[i](f)) + return fpn_feature_list + + +class ConvModule(nn.Layer): + """ + ConvModule includes Conv1/Conv2D. + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + padding=0, + stride=1, + conv_cfg='Conv1D', + norm_cfg='None', + **kwargs): + super().__init__() + if (conv_cfg == 'Conv1D'): + self._conv = nn.Conv1D( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + **kwargs) + if (conv_cfg == 'Conv2D'): + self._conv = nn.Conv2D( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + **kwargs) + if 'data_format' in kwargs: + data_format = kwargs['data_format'] + else: + data_format = 'NCHW' + if (norm_cfg != 'None'): + self._batch_norm = layers.SyncBatchNorm( + out_channels, data_format=data_format) + else: + self._batch_norm = None + + def forward(self, x): + x = self._conv(x) + if (self._batch_norm != None): + x = self._batch_norm(x) + x = F.relu(x) + return x + + +class Upsample(nn.Layer): + """ + Upsample Module. 
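+
+    A thin wrapper around F.interpolate that stores size or scale_factor
+    together with mode and align_corners, mirroring nn.Upsample.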
+ """ + + def __init__(self, + size=None, + scale_factor=None, + mode='nearest', + align_corners=None): + super(Upsample, self).__init__() + self.size = size + if isinstance(scale_factor, tuple): + self.scale_factor = tuple(float(factor) for factor in scale_factor) + else: + self.scale_factor = float(scale_factor) if scale_factor else None + self.mode = mode + self.align_corners = align_corners + + def forward(self, x): + if not self.size: + return F.interpolate(x, None, self.scale_factor, self.mode, self.align_corners) + else: + return F.interpolate(x, self.size, None, self.mode, self.align_corners) + + +def point_sample(input, points, align_corners=False, **kwargs): + """ + A wrapper around :func:`grid_sample` to support 3D point_coords tensors + Unlike :func:`torch.nn.functional.grid_sample` it assumes point_coords to + lie inside ``[0, 1] x [0, 1]`` square. + + Args: + input (Tensor): Feature map, shape (N, C, H, W). + points (Tensor): Image based absolute point coordinates (normalized), + range [0, 1] x [0, 1], shape (N, P, 2) or (N, Hgrid, Wgrid, 2). + align_corners (bool): Whether align_corners. Default: False + Returns: + Tensor: Features of `point` on `input`, shape (N, C, P) or + (N, C, Hgrid, Wgrid). + """ + + def denormalize(grid): + """Denormalize input grid from range [0, 1] to [-1, 1] + Args: + grid (Tensor): The grid to be denormalize, range [0, 1]. + Returns: + Tensor: Denormalized grid, range [-1, 1]. + """ + return grid * 2.0 - 1.0 + + add_dim = False + if points.dim() == 3: + add_dim = True + points = paddle.unsqueeze(points, axis=2) + output = F.grid_sample( + input, denormalize(points), align_corners=align_corners, **kwargs) + if add_dim: + output = paddle.squeeze(output, axis=3) + return output + + +def calculate_uncertainty(seg_logits): + """ + Estimate uncertainty based on seg logits. + For each location of the prediction ``seg_logits`` we estimate + uncertainty as the difference between top first and top second + predicted logits. + + Args: + seg_logits (Tensor): Semantic segmentation logits, + shape (batch_size, num_classes, height, width). + Returns: + scores (Tensor): T uncertainty scores with the most uncertain + locations having the highest uncertainty score, shape ( + batch_size, 1, height, width) + """ + + top2_scores = paddle.topk(seg_logits, k=2, axis=1)[0] + return paddle.unsqueeze(top2_scores[:, 1] - top2_scores[:, 0], axis=1) diff --git a/paddlers/models/ppseg/models/portraitnet.py b/paddlers/models/ppseg/models/portraitnet.py new file mode 100644 index 0000000..96a90b9 --- /dev/null +++ b/paddlers/models/ppseg/models/portraitnet.py @@ -0,0 +1,226 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.nn as nn + +from paddlers.models.ppseg import utils +from paddlers.models.ppseg.cvlibs import manager + + +@manager.MODELS.add_component +class PortraitNet(nn.Layer): + """ + The PortraitNet implementation based on PaddlePaddle. 
+
+    The original article refers to
+    Song-Hai Zhang, Xin Dong, Jia Li, Ruilong Li, Yong-Liang Yang
+    "PortraitNet: Real-time Portrait Segmentation Network for Mobile Device"
+    (https://www.yongliangyang.net/docs/mobilePotrait_c&g19.pdf).
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone (Paddle.nn.Layer): Backbone network, currently supports MobileNetV2.
+        min_channel (int, optional): The minimum channel number used by the head. Default: 16.
+        channel_ratio (float, optional): The width multiplier for head channels. Default: 1.0.
+        add_edge (bool, optional): Whether to additionally predict an edge map. Default: False.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 backbone,
+                 min_channel=16,
+                 channel_ratio=1.0,
+                 add_edge=False,
+                 pretrained=None):
+        super(PortraitNet, self).__init__()
+        self.backbone = backbone
+        self.head = PortraitNetHead(num_classes, min_channel, channel_ratio,
+                                    add_edge)
+        self.pretrained = pretrained
+        self.init_weight()
+
+    def forward(self, x):
+        # The first 3 channels are the augmented image, the last 3 the original.
+        img = x[:, :3, :, :]
+        img_ori = x[:, 3:, :, :]
+
+        feat_list = self.backbone(img)
+        logits_list = self.head(feat_list)
+
+        feat_list = self.backbone(img_ori)
+        logits_ori_list = self.head(feat_list)
+
+        return [
+            logits_list[0], logits_ori_list[0], logits_list[1],
+            logits_ori_list[1]
+        ]
+
+    def init_weight(self):
+        if self.pretrained is not None:
+            utils.load_entire_model(self, self.pretrained)
+
+
+class PortraitNetHead(nn.Layer):
+    def __init__(self,
+                 num_classes,
+                 min_channel=16,
+                 channel_ratio=1.0,
+                 add_edge=False):
+        super().__init__()
+        self.min_channel = min_channel
+        self.channel_ratio = channel_ratio
+        self.add_edge = add_edge
+        self.deconv1 = nn.Conv2DTranspose(
+            self.depth(96),
+            self.depth(96),
+            groups=1,
+            kernel_size=4,
+            stride=2,
+            padding=1,
+            bias_attr=False)
+        self.deconv2 = nn.Conv2DTranspose(
+            self.depth(32),
+            self.depth(32),
+            groups=1,
+            kernel_size=4,
+            stride=2,
+            padding=1,
+            bias_attr=False)
+        self.deconv3 = nn.Conv2DTranspose(
+            self.depth(24),
+            self.depth(24),
+            groups=1,
+            kernel_size=4,
+            stride=2,
+            padding=1,
+            bias_attr=False)
+        self.deconv4 = nn.Conv2DTranspose(
+            self.depth(16),
+            self.depth(16),
+            groups=1,
+            kernel_size=4,
+            stride=2,
+            padding=1,
+            bias_attr=False)
+        self.deconv5 = nn.Conv2DTranspose(
+            self.depth(8),
+            self.depth(8),
+            groups=1,
+            kernel_size=4,
+            stride=2,
+            padding=1,
+            bias_attr=False)
+
+        self.transit1 = ResidualBlock(self.depth(320), self.depth(96))
+        self.transit2 = ResidualBlock(self.depth(96), self.depth(32))
+        self.transit3 = ResidualBlock(self.depth(32), self.depth(24))
+        self.transit4 = ResidualBlock(self.depth(24), self.depth(16))
+        self.transit5 = ResidualBlock(self.depth(16), self.depth(8))
+
+        self.pred = nn.Conv2D(
+            self.depth(8), num_classes, 3, 1, 1, bias_attr=False)
+        if self.add_edge:
+            self.edge = nn.Conv2D(
+                self.depth(8), num_classes, 3, 1, 1, bias_attr=False)
+
+    def depth(self, channels):
+        min_channel = min(channels, self.min_channel)
+        return max(min_channel, int(channels * self.channel_ratio))
+
+    def forward(self, feat_list):
+        feature_1_4, feature_1_8, feature_1_16, feature_1_32 = feat_list
+        up_1_16 = self.deconv1(self.transit1(feature_1_32))
+        up_1_8 = self.deconv2(self.transit2(feature_1_16 + up_1_16))
+        up_1_4 = self.deconv3(self.transit3(feature_1_8 + up_1_8))
+        up_1_2 = self.deconv4(self.transit4(feature_1_4 + up_1_4))
+        up_1_1 = self.deconv5(self.transit5(up_1_2))
+
+        pred = self.pred(up_1_1)
+        if self.add_edge:
+            edge = self.edge(up_1_1)
+            return pred, edge
+        else:
+            return pred
+
+
+class ConvDw(nn.Layer):
+    def __init__(self, inp, oup, kernel, stride):
+        super(ConvDw,
self).__init__() + self.conv = nn.Sequential( + nn.Conv2D( + inp, + inp, + kernel, + stride, (kernel - 1) // 2, + groups=inp, + bias_attr=False), + nn.BatchNorm2D(num_features=inp, epsilon=1e-05, momentum=0.1), + nn.ReLU(), + nn.Conv2D(inp, oup, 1, 1, 0, bias_attr=False), + nn.BatchNorm2D(num_features=oup, epsilon=1e-05, momentum=0.1), + nn.ReLU(), + ) + + def forward(self, x): + return self.conv(x) + + +class ResidualBlock(nn.Layer): + def __init__(self, inp, oup, stride=1): + super(ResidualBlock, self).__init__() + + self.block = nn.Sequential( + ConvDw(inp, oup, 3, stride=stride), + nn.Conv2D( + in_channels=oup, + out_channels=oup, + kernel_size=3, + stride=1, + padding=1, + groups=oup, + bias_attr=False), + nn.BatchNorm2D(num_features=oup, epsilon=1e-05, momentum=0.1), + nn.ReLU(), + nn.Conv2D( + in_channels=oup, + out_channels=oup, + kernel_size=1, + stride=1, + padding=0, + bias_attr=False), + nn.BatchNorm2D(num_features=oup, epsilon=1e-05, momentum=0.1), + ) + if inp == oup: + self.residual = None + else: + self.residual = nn.Sequential( + nn.Conv2D( + in_channels=inp, + out_channels=oup, + kernel_size=1, + stride=1, + padding=0, + bias_attr=False), + nn.BatchNorm2D(num_features=oup, epsilon=1e-05, momentum=0.1), + ) + self.relu = nn.ReLU() + + def forward(self, x): + residual = x + + out = self.block(x) + if self.residual is not None: + residual = self.residual(x) + + out += residual + out = self.relu(out) + return out diff --git a/paddlers/models/ppseg/models/pphumanseg_lite.py b/paddlers/models/ppseg/models/pphumanseg_lite.py new file mode 100644 index 0000000..e0951a1 --- /dev/null +++ b/paddlers/models/ppseg/models/pphumanseg_lite.py @@ -0,0 +1,226 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlers.models.ppseg.cvlibs import manager, param_init +from paddlers.models.ppseg.models import layers +from paddlers.models.ppseg.utils import utils + +__all__ = ['PPHumanSegLite'] + + +@manager.MODELS.add_component +class PPHumanSegLite(nn.Layer): + "A self-developed ultra lightweight model from paddlers.models.ppseg, is suitable for real-time scene segmentation on web or mobile terminals." 
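A usage note for the PortraitNet above: forward slices its input into two 3-channel views and runs both through shared weights, so callers are assumed to stack an augmented and an original image along the channel axis; note also that indexing logits_list[1] presupposes add_edge=True. A sketch:

    import paddle

    img_aug = paddle.randn([1, 3, 224, 224])         # augmented view (assumed)
    img_ori = paddle.randn([1, 3, 224, 224])         # original view
    x = paddle.concat([img_aug, img_ori], axis=1)    # (1, 6, 224, 224) model input
    # forward(x) -> [pred_aug, pred_ori, edge_aug, edge_ori] with add_edge=True;
    # the paired outputs are typically tied together by a consistency-style loss.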
+ + def __init__(self, num_classes, pretrained=None, align_corners=False): + super().__init__() + self.pretrained = pretrained + self.num_classes = num_classes + self.align_corners = align_corners + + self.conv_bn0 = _ConvBNReLU(3, 36, 3, 2, 1) + self.conv_bn1 = _ConvBNReLU(36, 18, 1, 1, 0) + + self.block1 = nn.Sequential( + InvertedResidual(36, stride=2, out_channels=72), + InvertedResidual(72, stride=1), InvertedResidual(72, stride=1), + InvertedResidual(72, stride=1)) + + self.block2 = nn.Sequential( + InvertedResidual(72, stride=2), InvertedResidual(144, stride=1), + InvertedResidual(144, stride=1), InvertedResidual(144, stride=1), + InvertedResidual(144, stride=1), InvertedResidual(144, stride=1), + InvertedResidual(144, stride=1), InvertedResidual(144, stride=1)) + + self.depthwise_separable0 = _SeparableConvBNReLU(144, 64, 3, stride=1) + self.depthwise_separable1 = _SeparableConvBNReLU(82, 64, 3, stride=1) + self.depthwise_separable2 = _SeparableConvBNReLU( + 64, self.num_classes, 3, stride=1) + + self.init_weight() + + def forward(self, x): + # Encoder + input_shape = paddle.shape(x)[2:] + + x = self.conv_bn0(x) # 1/2 + shortcut = self.conv_bn1(x) # shortcut + x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1) # 1/4 + x = self.block1(x) # 1/8 + x = self.block2(x) # 1/16 + + # Decoder + x = self.depthwise_separable0(x) + shortcut_shape = paddle.shape(shortcut)[2:] + x = F.interpolate( + x, + shortcut_shape, + mode='bilinear', + align_corners=self.align_corners) + x = paddle.concat(x=[shortcut, x], axis=1) + x = self.depthwise_separable1(x) + + logit = self.depthwise_separable2(x) + logit = F.interpolate( + logit, + input_shape, + mode='bilinear', + align_corners=self.align_corners) + + return [logit] + + def init_weight(self): + for layer in self.sublayers(): + if isinstance(layer, nn.Conv2D): + param_init.normal_init(layer.weight, std=0.001) + elif isinstance(layer, (nn.BatchNorm, nn.SyncBatchNorm)): + param_init.constant_init(layer.weight, value=1.0) + param_init.constant_init(layer.bias, value=0.0) + if self.pretrained is not None: + utils.load_pretrained_model(self, self.pretrained) + + +class _ConvBNReLU(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + groups=1, + **kwargs): + super().__init__() + weight_attr = paddle.ParamAttr( + learning_rate=1, initializer=nn.initializer.KaimingUniform()) + self._conv = nn.Conv2D( + in_channels, + out_channels, + kernel_size, + padding=padding, + stride=stride, + groups=groups, + weight_attr=weight_attr, + bias_attr=False, + **kwargs) + + self._batch_norm = layers.SyncBatchNorm(out_channels) + + def forward(self, x): + x = self._conv(x) + x = self._batch_norm(x) + x = F.relu(x) + return x + + +class _ConvBN(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + groups=1, + **kwargs): + super().__init__() + weight_attr = paddle.ParamAttr( + learning_rate=1, initializer=nn.initializer.KaimingUniform()) + self._conv = nn.Conv2D( + in_channels, + out_channels, + kernel_size, + padding=padding, + stride=stride, + groups=groups, + weight_attr=weight_attr, + bias_attr=False, + **kwargs) + + self._batch_norm = layers.SyncBatchNorm(out_channels) + + def forward(self, x): + x = self._conv(x) + x = self._batch_norm(x) + return x + + +class _SeparableConvBNReLU(nn.Layer): + def __init__(self, in_channels, out_channels, kernel_size, **kwargs): + super().__init__() + self.depthwise_conv = _ConvBN( + in_channels, + out_channels=in_channels, + 
kernel_size=kernel_size, + padding=int(kernel_size / 2), + groups=in_channels, + **kwargs) + self.pointwise_conv = _ConvBNReLU( + in_channels, + out_channels, + kernel_size=1, + groups=1, + stride=1, + padding=0) + + def forward(self, x): + x = self.depthwise_conv(x) + x = self.pointwise_conv(x) + return x + + +class InvertedResidual(nn.Layer): + def __init__(self, input_channels, stride, out_channels=None): + super().__init__() + if stride == 1: + branch_channel = int(input_channels / 2) + else: + branch_channel = input_channels + + if out_channels is None: + self.in_channels = int(branch_channel) + else: + self.in_channels = int(out_channels / 2) + + self._depthwise_separable_0 = _SeparableConvBNReLU( + input_channels, self.in_channels, 3, stride=stride) + self._conv = _ConvBNReLU( + branch_channel, self.in_channels, 1, stride=1, padding=0) + self._depthwise_separable_1 = _SeparableConvBNReLU( + self.in_channels, self.in_channels, 3, stride=stride) + + self.stride = stride + + def forward(self, input): + + if self.stride == 1: + shortcut, branch = paddle.split(x=input, num_or_sections=2, axis=1) + else: + branch = input + shortcut = self._depthwise_separable_0(input) + + branch_1x1 = self._conv(branch) + branch_dw1x1 = self._depthwise_separable_1(branch_1x1) + output = paddle.concat(x=[shortcut, branch_dw1x1], axis=1) + + # channel shuffle + out_shape = paddle.shape(output) + h, w = out_shape[2], out_shape[3] + output = paddle.reshape(x=output, shape=[0, 2, self.in_channels, h, w]) + output = paddle.transpose(x=output, perm=[0, 2, 1, 3, 4]) + output = paddle.reshape(x=output, shape=[0, 2 * self.in_channels, h, w]) + return output + diff --git a/paddlers/models/ppseg/models/pspnet.py b/paddlers/models/ppseg/models/pspnet.py new file mode 100644 index 0000000..5a6f6b7 --- /dev/null +++ b/paddlers/models/ppseg/models/pspnet.py @@ -0,0 +1,147 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.nn as nn +import paddle.nn.functional as F + +import paddle +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.models import layers +from paddlers.models.ppseg.utils import utils + + +@manager.MODELS.add_component +class PSPNet(nn.Layer): + """ + The PSPNet implementation based on PaddlePaddle. + + The original article refers to + Zhao, Hengshuang, et al. "Pyramid scene parsing network" + (https://openaccess.thecvf.com/content_cvpr_2017/papers/Zhao_Pyramid_Scene_Parsing_CVPR_2017_paper.pdf). + + Args: + num_classes (int): The unique number of target classes. + backbone (Paddle.nn.Layer): Backbone network, currently supports ResNet50/101. + backbone_indices (tuple, optional): Two values in the tuple indicate the indices of output of backbone. + pp_out_channels (int, optional): The output channels after Pyramid Pooling Module. Default: 1024. + bin_sizes (tuple, optional): The out size of pooled feature maps. Default: (1,2,3,6).
+ enable_auxiliary_loss (bool, optional): Whether to add the auxiliary loss. Default: True. + align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even, + e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. + pretrained (str, optional): The path or url of pretrained model. Default: None. + """ + + def __init__(self, + num_classes, + backbone, + backbone_indices=(2, 3), + pp_out_channels=1024, + bin_sizes=(1, 2, 3, 6), + enable_auxiliary_loss=True, + align_corners=False, + pretrained=None): + super().__init__() + + self.backbone = backbone + backbone_channels = [ + backbone.feat_channels[i] for i in backbone_indices + ] + + self.head = PSPNetHead(num_classes, backbone_indices, backbone_channels, + pp_out_channels, bin_sizes, + enable_auxiliary_loss, align_corners) + self.align_corners = align_corners + self.pretrained = pretrained + self.init_weight() + + def forward(self, x): + feat_list = self.backbone(x) + logit_list = self.head(feat_list) + return [ + F.interpolate( + logit, + paddle.shape(x)[2:], + mode='bilinear', + align_corners=self.align_corners) for logit in logit_list + ] + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class PSPNetHead(nn.Layer): + """ + The PSPNetHead implementation. + + Args: + num_classes (int): The unique number of target classes. + backbone_indices (tuple): Two values in the tuple indicate the indices of output of backbone. + The first index will be taken as a deep-supervision feature in auxiliary layer; + the second one will be taken as input of Pyramid Pooling Module (PPModule). + Usually the backbone consists of four downsampling stages, each returning an output. + If we set it as (2, 3) in ResNet, that means taking feature map of the third + stage (res4b22) in backbone, and feature map of the fourth stage (res5c) as input of PPModule. + backbone_channels (tuple): The same length as backbone_indices. It indicates the channels of corresponding index. + pp_out_channels (int): The output channels after Pyramid Pooling Module. + bin_sizes (tuple): The out size of pooled feature maps. + enable_auxiliary_loss (bool, optional): Whether to add the auxiliary loss. Default: True. + align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature + is even, e.g. 1024x512, otherwise it is True, e.g. 769x769.
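+
+    Examples:
+        An illustrative index-to-channel mapping (an assumption for
+        illustration: a ResNet50_vd backbone whose feat_channels are
+        [256, 512, 1024, 2048]):
+
+            backbone_indices = (2, 3)  # -> backbone_channels = (1024, 2048)
+            # feat_list[2] feeds the auxiliary head; feat_list[3] feeds PPModule.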
+ """ + + def __init__(self, num_classes, backbone_indices, backbone_channels, + pp_out_channels, bin_sizes, enable_auxiliary_loss, + align_corners): + + super().__init__() + + self.backbone_indices = backbone_indices + + self.psp_module = layers.PPModule( + in_channels=backbone_channels[1], + out_channels=pp_out_channels, + bin_sizes=bin_sizes, + dim_reduction=True, + align_corners=align_corners) + + self.dropout = nn.Dropout(p=0.1) # dropout_prob + + self.conv = nn.Conv2D( + in_channels=pp_out_channels, + out_channels=num_classes, + kernel_size=1) + + if enable_auxiliary_loss: + self.auxlayer = layers.AuxLayer( + in_channels=backbone_channels[0], + inter_channels=backbone_channels[0] // 4, + out_channels=num_classes) + + self.enable_auxiliary_loss = enable_auxiliary_loss + + def forward(self, feat_list): + logit_list = [] + x = feat_list[self.backbone_indices[1]] + x = self.psp_module(x) + x = self.dropout(x) + logit = self.conv(x) + logit_list.append(logit) + + if self.enable_auxiliary_loss: + auxiliary_feat = feat_list[self.backbone_indices[0]] + auxiliary_logit = self.auxlayer(auxiliary_feat) + logit_list.append(auxiliary_logit) + + return logit_list diff --git a/paddlers/models/ppseg/models/segformer.py b/paddlers/models/ppseg/models/segformer.py new file mode 100644 index 0000000..7a7a1db --- /dev/null +++ b/paddlers/models/ppseg/models/segformer.py @@ -0,0 +1,177 @@ +# The SegFormer code was heavily based on https://github.com/NVlabs/SegFormer +# Users should be careful about adopting these functions in any commercial matters. +# https://github.com/NVlabs/SegFormer#license + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np + +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.models import layers +from paddlers.models.ppseg.utils import utils + + +class MLP(nn.Layer): + """ + Linear Embedding + """ + + def __init__(self, input_dim=2048, embed_dim=768): + super().__init__() + self.proj = nn.Linear(input_dim, embed_dim) + + def forward(self, x): + x = x.flatten(2).transpose([0, 2, 1]) + x = self.proj(x) + return x + + +@manager.MODELS.add_component +class SegFormer(nn.Layer): + """ + The SegFormer implementation based on PaddlePaddle. + + The original article refers to + Xie, Enze, et al. "SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers." arXiv preprint arXiv:2105.15203 (2021). + + Args: + num_classes (int): The unique number of target classes. + backbone (Paddle.nn.Layer): A backbone network. + embedding_dim (int): The MLP decoder channel dimension. + align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature. + is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. + pretrained (str, optional): The path or url of pretrained model. Default: None. 
+ """ + + def __init__(self, + num_classes, + backbone, + embedding_dim, + align_corners=False, + pretrained=None): + super(SegFormer, self).__init__() + + self.pretrained = pretrained + self.align_corners = align_corners + self.backbone = backbone + self.num_classes = num_classes + c1_in_channels, c2_in_channels, c3_in_channels, c4_in_channels = self.backbone.feat_channels + + self.linear_c4 = MLP(input_dim=c4_in_channels, embed_dim=embedding_dim) + self.linear_c3 = MLP(input_dim=c3_in_channels, embed_dim=embedding_dim) + self.linear_c2 = MLP(input_dim=c2_in_channels, embed_dim=embedding_dim) + self.linear_c1 = MLP(input_dim=c1_in_channels, embed_dim=embedding_dim) + + self.dropout = nn.Dropout2D(0.1) + self.linear_fuse = layers.ConvBNReLU( + in_channels=embedding_dim * 4, + out_channels=embedding_dim, + kernel_size=1, + bias_attr=False) + + self.linear_pred = nn.Conv2D( + embedding_dim, self.num_classes, kernel_size=1) + + self.init_weight() + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + def forward(self, x): + feats = self.backbone(x) + c1, c2, c3, c4 = feats + + ############## MLP decoder on C1-C4 ########### + c1_shape = paddle.shape(c1) + c2_shape = paddle.shape(c2) + c3_shape = paddle.shape(c3) + c4_shape = paddle.shape(c4) + + _c4 = self.linear_c4(c4).transpose([0, 2, 1]).reshape( + [0, 0, c4_shape[2], c4_shape[3]]) + _c4 = F.interpolate( + _c4, + size=c1_shape[2:], + mode='bilinear', + align_corners=self.align_corners) + + _c3 = self.linear_c3(c3).transpose([0, 2, 1]).reshape( + [0, 0, c3_shape[2], c3_shape[3]]) + _c3 = F.interpolate( + _c3, + size=c1_shape[2:], + mode='bilinear', + align_corners=self.align_corners) + + _c2 = self.linear_c2(c2).transpose([0, 2, 1]).reshape( + [0, 0, c2_shape[2], c2_shape[3]]) + _c2 = F.interpolate( + _c2, + size=c1_shape[2:], + mode='bilinear', + align_corners=self.align_corners) + + _c1 = self.linear_c1(c1).transpose([0, 2, 1]).reshape( + [0, 0, c1_shape[2], c1_shape[3]]) + + _c = self.linear_fuse(paddle.concat([_c4, _c3, _c2, _c1], axis=1)) + + logit = self.dropout(_c) + logit = self.linear_pred(logit) + return [ + F.interpolate( + logit, + size=paddle.shape(x)[2:], + mode='bilinear', + align_corners=self.align_corners) + ] + + +@manager.MODELS.add_component +def SegFormer_B0(**kwargs): + return SegFormer( + backbone=manager.BACKBONES['MixVisionTransformer_B0'](), + embedding_dim=256, + **kwargs) + + +@manager.MODELS.add_component +def SegFormer_B1(**kwargs): + return SegFormer( + backbone=manager.BACKBONES['MixVisionTransformer_B1'](), + embedding_dim=256, + **kwargs) + + +@manager.MODELS.add_component +def SegFormer_B2(**kwargs): + return SegFormer( + backbone=manager.BACKBONES['MixVisionTransformer_B2'](), + embedding_dim=768, + **kwargs) + + +@manager.MODELS.add_component +def SegFormer_B3(**kwargs): + return SegFormer( + backbone=manager.BACKBONES['MixVisionTransformer_B3'](), + embedding_dim=768, + **kwargs) + + +@manager.MODELS.add_component +def SegFormer_B4(**kwargs): + return SegFormer( + backbone=manager.BACKBONES['MixVisionTransformer_B4'](), + embedding_dim=768, + **kwargs) + + +@manager.MODELS.add_component +def SegFormer_B5(**kwargs): + return SegFormer( + backbone=manager.BACKBONES['MixVisionTransformer_B5'](), + embedding_dim=768, + **kwargs) diff --git a/paddlers/models/ppseg/models/segmenter.py b/paddlers/models/ppseg/models/segmenter.py new file mode 100644 index 0000000..37def25 --- /dev/null +++ b/paddlers/models/ppseg/models/segmenter.py @@ -0,0 +1,256 @@ 
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np + +from paddlers.models.ppseg.utils import utils +from paddlers.models.ppseg.cvlibs import manager, param_init +from paddlers.models.ppseg.models.backbones import vision_transformer, transformer_utils + +__all__ = ['LinearSegmenter', 'MaskSegmenter'] + + +@manager.MODELS.add_component +class LinearSegmenter(nn.Layer): + ''' + The implementation of Segmenter with a linear head, based on PaddlePaddle. + + The original article refers to Strudel, Robin, et al. "Segmenter: Transformer + for Semantic Segmentation." arXiv preprint arXiv:2105.05633 (2021). + + Args: + num_classes (int): The unique number of target classes. + backbone (nn.Layer): The backbone transformer network. + pretrained (str, optional): The path or url of pretrained model. Default: None. + ''' + + def __init__(self, num_classes, backbone, pretrained=None): + super().__init__() + self.backbone = backbone + self.head = SegmenterLinearHead(num_classes, backbone.embed_dim) + self.pretrained = pretrained + self.init_weight() + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + def forward(self, x): + x_shape = paddle.shape(x) + + feats, shape = self.backbone(x) + logits = self.head(feats[-1], shape[2:]) + + logit_list = [ + F.interpolate(logit, x_shape[2:], mode='bilinear') + for logit in logits + ] + + return logit_list + + +@manager.MODELS.add_component +class MaskSegmenter(nn.Layer): + ''' + The implementation of Segmenter with a mask head, based on PaddlePaddle. + + The original article refers to Strudel, Robin, et al. "Segmenter: Transformer + for Semantic Segmentation." arXiv preprint arXiv:2105.05633 (2021). + + Args: + num_classes (int): The unique number of target classes. + backbone (nn.Layer): The backbone transformer network. + h_embed_dim (int): The embedding dim in mask head. + h_depth (int): The num of layers in mask head. + h_num_heads (int): The num of heads of MSA in mask head. + h_mlp_ratio (int, optional): Ratio of MLP dim in mask head. Default: 4. + h_drop_rate (float, optional): Drop rate of MLP in mask head. Default: 0.0. + h_drop_path_rate (float, optional): Drop path rate in mask head. Default: 0.0. + h_attn_drop_rate (float, optional): Attention dropout rate in mask head. Default: 0.0. + h_qkv_bias (bool, optional): Whether to add bias in the qkv linear layers of the mask head. Default: False. + pretrained (str, optional): The path or url of pretrained model. Default: None.
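+
+    Examples:
+        An illustrative construction (a sketch; vit_backbone is assumed to be a
+        ViT built from paddlers.models.ppseg.models.backbones that exposes an
+        embed_dim attribute):
+
+            model = MaskSegmenter(
+                num_classes=19,
+                backbone=vit_backbone,
+                h_embed_dim=192,
+                h_depth=2,
+                h_num_heads=3)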
+ ''' + + def __init__(self, + num_classes, + backbone, + h_embed_dim, + h_depth, + h_num_heads, + h_mlp_ratio=4, + h_drop_rate=0.0, + h_drop_path_rate=0.0, + h_attn_drop_rate=0.0, + h_qkv_bias=False, + pretrained=None): + super().__init__() + self.backbone = backbone + self.head = SegmenterMaskHead( + num_classes, backbone.embed_dim, h_embed_dim, h_depth, h_num_heads, + h_mlp_ratio, h_drop_rate, h_drop_path_rate, h_attn_drop_rate, + h_qkv_bias) + self.pretrained = pretrained + self.init_weight() + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + def forward(self, x): + x_shape = paddle.shape(x) + + feats, shape = self.backbone(x) + logits = self.head(feats[-1], shape[2:]) + + logit_list = [ + F.interpolate(logit, x_shape[2:], mode='bilinear') + for logit in logits + ] + + return logit_list + + +class SegmenterLinearHead(nn.Layer): + ''' + The linear head of Segmenter. + Args: + num_classes (int): The unique number of target classes. + in_dim (int): The embed dim of input. + ''' + + def __init__(self, num_classes, in_dim): + super().__init__() + self.head = nn.Linear(in_dim, num_classes) + self.apply(transformer_utils.init_weights) + + def forward(self, x, patch_embed_size): + """ Forward function. + Args: + x (Tensor): Input tensor of decoder. + patch_embed_size (Tensor): The height and width of the patch embed tensor. + Returns: + list[Tensor]: Segmentation results. + """ + masks = self.head(x) + + #[b, (h w), c] -> [b, c, h, w] + h, w = patch_embed_size[0], patch_embed_size[1] + masks = masks.reshape((0, h, w, paddle.shape(masks)[-1])) + masks = masks.transpose((0, 3, 1, 2)) + + return [masks] + + +class SegmenterMaskHead(nn.Layer): + ''' + The mask head of Segmenter. + Args: + num_classes (int): The unique number of target classes. + in_dim (int): The embed dim of input. + embed_dim (int): Embedding dim of mask transformer. + depth (int): The num of layers in Transformer. + num_heads (int): The num of heads in MSA. + mlp_ratio (int, optional): Ratio of MLP dim. Default: 4. + drop_rate (float, optional): Drop rate of MLP in MSA. Default: 0.0. + drop_path_rate (float, optional): Drop path rate in MSA. Default: 0.0. + attn_drop_rate (float, optional): Attention dropout rate in MSA. Default: 0.0. + qkv_bias (bool, optional): Whether to add bias in the qkv linear layers. Default: False.
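+
+    Examples:
+        Shape sketch (following the forward pass below): for decoder features x
+        of shape [N, h * w, in_dim] and a patch grid (h, w), the head appends
+        num_classes learnable class tokens, scores each patch against each class
+        embedding, and returns a single tensor of shape [N, num_classes, h, w].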
+ ''' + + def __init__(self, + num_classes, + in_dim, + embed_dim, + depth, + num_heads, + mlp_ratio=4, + drop_rate=0.0, + drop_path_rate=0.0, + attn_drop_rate=0.0, + qkv_bias=False): + super().__init__() + self.num_classes = num_classes + + self.proj_input = nn.Linear(in_dim, embed_dim) + + self.cls_token = self.create_parameter( + shape=(1, num_classes, embed_dim), + default_initializer=paddle.nn.initializer.TruncatedNormal(std=0.02)) + + dpr = [x for x in np.linspace(0, drop_path_rate, depth)] + self.blocks = nn.LayerList([ + vision_transformer.Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + drop=drop_rate, + drop_path=dpr[i], + attn_drop=attn_drop_rate, + qkv_bias=qkv_bias) for i in range(depth) + ]) + + initializer = paddle.nn.initializer.TruncatedNormal(std=0.02) + self.proj_patch = nn.Linear( + embed_dim, + embed_dim, + weight_attr=paddle.ParamAttr(initializer=initializer), + bias_attr=False) + self.proj_class = nn.Linear( + embed_dim, + embed_dim, + weight_attr=paddle.ParamAttr(initializer=initializer), + bias_attr=False) + + self.decoder_norm = nn.LayerNorm(embed_dim) + self.mask_norm = nn.LayerNorm(num_classes) + + self.apply(transformer_utils.init_weights) + + def forward(self, x, patch_embed_size): + """ Forward function. + Args: + x (Tensor): Input tensor of decoder. + patch_embed_size (Tensor): The height and width of the patch embed tensor. + Returns: + list[Tensor]: Segmentation results. + """ + x = self.proj_input(x) + + cls_token = self.cls_token.expand((paddle.shape(x)[0], -1, -1)) + x = paddle.concat([x, cls_token], axis=1) + + for block in self.blocks: + x = block(x) + x = self.decoder_norm(x) + + patches, masks = x[:, :-self.num_classes], x[:, -self.num_classes:] + patches = self.proj_patch(patches) + masks = self.proj_class(masks) + patches = patches / paddle.norm(patches, axis=-1, keepdim=True) + masks = masks / paddle.norm(masks, axis=-1, keepdim=True) + + masks = patches @ masks.transpose((0, 2, 1)) + masks = masks.reshape((0, 0, + self.num_classes)) # For export inference model + masks = self.mask_norm(masks) + + #[b, (h w), c] -> [b, c, h, w] + h, w = patch_embed_size[0], patch_embed_size[1] + masks = masks.reshape((0, h, w, paddle.shape(masks)[-1])) + masks = masks.transpose((0, 3, 1, 2)) + + return [masks] diff --git a/paddlers/models/ppseg/models/segnet.py b/paddlers/models/ppseg/models/segnet.py new file mode 100644 index 0000000..4bc49d6 --- /dev/null +++ b/paddlers/models/ppseg/models/segnet.py @@ -0,0 +1,142 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlers.models.ppseg.cvlibs import manager, param_init +from paddlers.models.ppseg.models import layers +from paddlers.models.ppseg.utils import utils + + +@manager.MODELS.add_component +class SegNet(nn.Layer): + """ + The SegNet implementation based on PaddlePaddle. + The original article refers to + Badrinarayanan, Vijay, et al. 
"SegNet: A Deep Convolutional Encoder-Decoder Architecture for Image Segmentation" + (https://arxiv.org/pdf/1511.00561.pdf). + Args: + num_classes (int): The unique number of target classes. + """ + + def __init__(self, num_classes, pretrained=None): + super().__init__() + + # Encoder Module + + self.enco1 = nn.Sequential( + layers.ConvBNReLU(3, 64, 3, padding=1), + layers.ConvBNReLU(64, 64, 3, padding=1)) + + self.enco2 = nn.Sequential( + layers.ConvBNReLU(64, 128, 3, padding=1), + layers.ConvBNReLU(128, 128, 3, padding=1)) + + self.enco3 = nn.Sequential( + layers.ConvBNReLU(128, 256, 3, padding=1), + layers.ConvBNReLU(256, 256, 3, padding=1), + layers.ConvBNReLU(256, 256, 3, padding=1)) + + self.enco4 = nn.Sequential( + layers.ConvBNReLU(256, 512, 3, padding=1), + layers.ConvBNReLU(512, 512, 3, padding=1), + layers.ConvBNReLU(512, 512, 3, padding=1)) + + self.enco5 = nn.Sequential( + layers.ConvBNReLU(512, 512, 3, padding=1), + layers.ConvBNReLU(512, 512, 3, padding=1), + layers.ConvBNReLU(512, 512, 3, padding=1)) + + # Decoder Module + + self.deco1 = nn.Sequential( + layers.ConvBNReLU(512, 512, 3, padding=1), + layers.ConvBNReLU(512, 512, 3, padding=1), + layers.ConvBNReLU(512, 512, 3, padding=1)) + + self.deco2 = nn.Sequential( + layers.ConvBNReLU(512, 512, 3, padding=1), + layers.ConvBNReLU(512, 512, 3, padding=1), + layers.ConvBNReLU(512, 256, 3, padding=1)) + + self.deco3 = nn.Sequential( + layers.ConvBNReLU(256, 256, 3, padding=1), + layers.ConvBNReLU(256, 256, 3, padding=1), + layers.ConvBNReLU(256, 128, 3, padding=1)) + + self.deco4 = nn.Sequential( + layers.ConvBNReLU(128, 128, 3, padding=1), + layers.ConvBNReLU(128, 128, 3, padding=1), + layers.ConvBNReLU(128, 64, 3, padding=1)) + + self.deco5 = nn.Sequential( + layers.ConvBNReLU(64, 64, 3, padding=1), + nn.Conv2D(64, num_classes, kernel_size=3, padding=1), + ) + + self.pretrained = pretrained + + self.init_weight() + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + def forward(self, x): + logit_list = [] + + x = self.enco1(x) + x, ind1 = F.max_pool2d(x, kernel_size=2, stride=2, return_mask=True) + size1 = x.shape + + x = self.enco2(x) + x, ind2 = F.max_pool2d(x, kernel_size=2, stride=2, return_mask=True) + size2 = x.shape + + x = self.enco3(x) + x, ind3 = F.max_pool2d(x, kernel_size=2, stride=2, return_mask=True) + size3 = x.shape + + x = self.enco4(x) + x, ind4 = F.max_pool2d(x, kernel_size=2, stride=2, return_mask=True) + size4 = x.shape + + x = self.enco5(x) + x, ind5 = F.max_pool2d(x, kernel_size=2, stride=2, return_mask=True) + size5 = x.shape + + x = F.max_unpool2d( + x, indices=ind5, kernel_size=2, stride=2, output_size=size4) + x = self.deco1(x) + + x = F.max_unpool2d( + x, indices=ind4, kernel_size=2, stride=2, output_size=size3) + x = self.deco2(x) + + x = F.max_unpool2d( + x, indices=ind3, kernel_size=2, stride=2, output_size=size2) + x = self.deco3(x) + + x = F.max_unpool2d( + x, indices=ind2, kernel_size=2, stride=2, output_size=size1) + x = self.deco4(x) + + x = F.max_unpool2d(x, indices=ind1, kernel_size=2, stride=2) + x = self.deco5(x) + + logit_list.append(x) + + return logit_list diff --git a/paddlers/models/ppseg/models/setr.py b/paddlers/models/ppseg/models/setr.py new file mode 100644 index 0000000..6e818d6 --- /dev/null +++ b/paddlers/models/ppseg/models/setr.py @@ -0,0 +1,440 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.nn as nn +import paddle.nn.functional as F + +import paddle +from paddlers.models.ppseg.cvlibs import manager, param_init +from paddlers.models.ppseg.models import layers +from paddlers.models.ppseg.utils import utils + + +@manager.MODELS.add_component +class SegmentationTransformer(nn.Layer): + ''' + The SETR implementation based on PaddlePaddle. + + The original article refers to + Zheng, Sixiao, et al. "Rethinking Semantic Segmentation from a Sequence-to-Sequence Perspective with Transformers" + (https://arxiv.org/abs/2012.15840) + + Args: + num_classes (int): The unique number of target classes. + backbone (Paddle.nn.Layer): Backbone network. + backbone_indices (tuple): A tuple indicates the indices of output of backbone. + It can be either one or two values, if two values, the first index will be taken as + a deep-supervision feature in auxiliary layer; the second one will be taken as + input of pixel representation. If one value, it is taken by both above. + head (str, optional): SETR head type(naive, pup or mla). Default: naive. + align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature + is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. + pretrained (str, optional): The path or url of pretrained model. Default: None. + ''' + + def __init__(self, + num_classes, + backbone, + backbone_indices=(9, 14, 19, 23), + head='naive', + align_corners=False, + pretrained=None, + **head_config): + + super().__init__() + self.backbone = backbone + self.num_classes = num_classes + + if head.lower() == 'naive': + self.head = NaiveHead( + num_classes=self.num_classes, + backbone_indices=backbone_indices, + in_channels=self.backbone.embed_dim, + **head_config) + elif head.lower() == 'pup': + self.head = PUPHead( + num_classes=self.num_classes, + backbone_indices=backbone_indices, + align_corners=align_corners, + in_channels=self.backbone.embed_dim, + **head_config) + elif head.lower() == 'mla': + self.head = MLAHead( + num_classes=self.num_classes, + backbone_indices=backbone_indices, + in_channels=self.backbone.embed_dim, + **head_config) + else: + raise RuntimeError( + 'Unsupported segmentation head type {}. Only naive/pup/mla is valid.' + .format(head)) + + self.align_corners = align_corners + self.pretrained = pretrained + self.init_weight() + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + def forward(self, x): + x_shape = paddle.shape(x) + feats, _shape = self.backbone(x) + logits = self.head(feats, _shape) + return [ + F.interpolate( + _logit, + x_shape[2:], + mode='bilinear', + align_corners=self.align_corners) for _logit in logits + ] + + +class NaiveHead(nn.Layer): + ''' + The SETR Naive Head implementation. + + Args: + num_classes (int): The unique number of target classes. + backbone_indices (tuple): A tuple indicates the indices of output of backbone. 
+ It can be either one or two values, if two values, the first index will be taken as + a deep-supervision feature in auxiliary layer; the second one will be taken as + input of pixel representation. If one value, it is taken by both above. + in_channels (int): The number of input channels. + lr_multiple (int, optional): The learning rate multiple of head parameters. Default: 10. + ''' + + def __init__(self, + num_classes, + backbone_indices, + in_channels, + lr_multiple=10): + super().__init__() + + self.cls_head_norm = nn.LayerNorm( + normalized_shape=in_channels, epsilon=1e-6) + self.cls_head = nn.Sequential( + layers.ConvBNReLU( + in_channels=in_channels, out_channels=256, kernel_size=1), + nn.Conv2D(in_channels=256, out_channels=num_classes, kernel_size=1)) + + aux_head_nums = len(backbone_indices) - 1 + self.aux_head_norms = nn.LayerList( + [nn.LayerNorm(normalized_shape=in_channels, epsilon=1e-6) + ] * aux_head_nums) + self.aux_heads = nn.LayerList([ + nn.Sequential( + layers.ConvBNReLU( + in_channels=in_channels, out_channels=256, kernel_size=1), + nn.Conv2D( + in_channels=256, out_channels=num_classes, kernel_size=1)) + ] * aux_head_nums) + + self.in_channels = in_channels + self.lr_multiple = lr_multiple + self.backbone_indices = backbone_indices + self.init_weight() + + def init_weight(self): + for _param in self.parameters(): + _param.optimize_attr['learning_rate'] = self.lr_multiple + + for layer in self.sublayers(): + if isinstance(layer, nn.LayerNorm): + param_init.constant_init(layer.weight, value=1.0) + param_init.constant_init(layer.bias, value=0.0) + + def forward(self, x, _shape): + logits = [] + feat = x[self.backbone_indices[-1]] + feat = self.cls_head_norm(feat).transpose([0, 2, 1]).reshape( + [0, self.in_channels, _shape[2], _shape[3]]) + + logits.append(self.cls_head(feat)) + + if self.training: + for idx, _head in enumerate(self.aux_heads): + feat = x[self.backbone_indices[idx]] + feat = self.aux_head_norms[idx](feat).transpose( + [0, 2, + 1]).reshape([0, self.in_channels, _shape[2], _shape[3]]) + logits.append(_head(feat)) + + return logits + + +class PUPHead(nn.Layer): + ''' + The SETR Progressive UPsampling Head implementation. + + Args: + num_classes (int): The unique number of target classes. + backbone_indices (tuple): A tuple indicates the indices of output of backbone. + It can be either one or two values, if two values, the first index will be taken as + a deep-supervision feature in auxiliary layer; the second one will be taken as + input of pixel representation. If one value, it is taken by both above. + in_channels (int): The number of input channels. + align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature + is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. + lr_multiple (int, optional): The learning rate multiple of head parameters. Default: 10.
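+
+    Examples:
+        Upsampling sketch (from the layers defined below): the main head applies
+        three ConvBNReLU + 2x bilinear Upsample stages (8x in total) before the
+        1x1 classifier, while each auxiliary head upsamples 4x; the remaining
+        gap to the input resolution is closed by the final interpolation in
+        SegmentationTransformer.forward.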
+ ''' + + def __init__(self, + num_classes, + backbone_indices, + in_channels, + align_corners=False, + lr_multiple=10): + super().__init__() + + inter_channels = 256 + + self.cls_head_norm = nn.LayerNorm( + normalized_shape=in_channels, epsilon=1e-6) + self.cls_head = nn.Sequential( + layers.ConvBNReLU( + in_channels=in_channels, + out_channels=inter_channels, + kernel_size=3, + padding=1), nn.Upsample(scale_factor=2, mode='bilinear'), + layers.ConvBNReLU( + in_channels=inter_channels, + out_channels=inter_channels, + kernel_size=3, + padding=1), nn.Upsample(scale_factor=2, mode='bilinear'), + layers.ConvBNReLU( + in_channels=inter_channels, + out_channels=inter_channels, + kernel_size=3, + padding=1), nn.Upsample(scale_factor=2, mode='bilinear'), + layers.ConvBNReLU( + in_channels=inter_channels, + out_channels=inter_channels, + kernel_size=3, + padding=1), + nn.Conv2D( + in_channels=inter_channels, + out_channels=num_classes, + kernel_size=1)) + + aux_head_nums = len(backbone_indices) + self.aux_head_norms = nn.LayerList( + [nn.LayerNorm(normalized_shape=in_channels, epsilon=1e-6) + ] * aux_head_nums) + self.aux_heads = nn.LayerList([ + nn.Sequential( + layers.ConvBNReLU( + in_channels=in_channels, + out_channels=inter_channels, + kernel_size=3, + padding=1), nn.Upsample(scale_factor=4, mode='bilinear'), + nn.Conv2D( + in_channels=inter_channels, + out_channels=num_classes, + kernel_size=1)) + ] * aux_head_nums) + + self.in_channels = in_channels + self.lr_multiple = lr_multiple + self.backbone_indices = backbone_indices + self.init_weight() + + def init_weight(self): + for _param in self.parameters(): + _param.optimize_attr['learning_rate'] = self.lr_multiple + + for layer in self.sublayers(): + if isinstance(layer, nn.LayerNorm): + param_init.constant_init(layer.weight, value=1.0) + param_init.constant_init(layer.bias, value=0.0) + + def forward(self, x, _shape): + logits = [] + feat = x[self.backbone_indices[-1]] + feat = self.cls_head_norm(feat).transpose([0, 2, 1]).reshape( + [0, self.in_channels, _shape[2], _shape[3]]) + + logits.append(self.cls_head(feat)) + + if self.training: + for idx, _head in enumerate(self.aux_heads): + feat = x[self.backbone_indices[idx]] + feat = self.aux_head_norms[idx](feat).transpose( + [0, 2, + 1]).reshape([0, self.in_channels, _shape[2], _shape[3]]) + logits.append(_head(feat)) + + return logits + + +class ConvMLA(nn.Layer): + def __init__(self, in_channels, mla_channels): + super().__init__() + + self.mla_p2_1x1 = layers.ConvBNReLU( + in_channels=in_channels, out_channels=mla_channels, kernel_size=1) + + self.mla_p3_1x1 = layers.ConvBNReLU( + in_channels=in_channels, out_channels=mla_channels, kernel_size=1) + + self.mla_p4_1x1 = layers.ConvBNReLU( + in_channels=in_channels, out_channels=mla_channels, kernel_size=1) + + self.mla_p5_1x1 = layers.ConvBNReLU( + in_channels=in_channels, out_channels=mla_channels, kernel_size=1) + + self.mla_p2 = layers.ConvBNReLU( + in_channels=mla_channels, + out_channels=mla_channels, + kernel_size=3, + padding=1) + + self.mla_p3 = layers.ConvBNReLU( + in_channels=mla_channels, + out_channels=mla_channels, + kernel_size=3, + padding=1) + + self.mla_p4 = layers.ConvBNReLU( + in_channels=mla_channels, + out_channels=mla_channels, + kernel_size=3, + padding=1) + + self.mla_p5 = layers.ConvBNReLU( + in_channels=mla_channels, + out_channels=mla_channels, + kernel_size=3, + padding=1) + + def forward(self, x): + res2, res3, res4, res5 = x + + mla_p5_1x1 = self.mla_p5_1x1(res5) + mla_p4_1x1 = self.mla_p4_1x1(res4) + 
mla_p3_1x1 = self.mla_p3_1x1(res3) + mla_p2_1x1 = self.mla_p2_1x1(res2) + + mla_p4_plus = mla_p5_1x1 + mla_p4_1x1 + mla_p3_plus = mla_p4_plus + mla_p3_1x1 + mla_p2_plus = mla_p3_plus + mla_p2_1x1 + + mla_p5 = self.mla_p5(mla_p5_1x1) + mla_p4 = self.mla_p4(mla_p4_plus) + mla_p3 = self.mla_p3(mla_p3_plus) + mla_p2 = self.mla_p2(mla_p2_plus) + + return [mla_p2, mla_p3, mla_p4, mla_p5] + + +class MLAHead(nn.Layer): + ''' + The SETR Multi-Level feature Aggregation Head implementation. + + Args: + num_classes (int): The unique number of target classes. + backbone_indices (tuple): A tuple indicates the indices of output of backbone. + It can be either one or two values, if two values, the first index will be taken as + a deep-supervision feature in auxiliary layer; the second one will be taken as + input of pixel representation. If one value, it is taken by both above. + in_channels (int): The number of input channels. + mla_channels (int, optional): The number of middle channels of ConvMLA Layer. Default: 256. + mlahead_channels (int, optional): The number of middle channels of mla head. Default: 128. + lr_multiple (int, optional): The learning rate multiple of head parameters. Default: 10. + ''' + + def __init__(self, + num_classes, + backbone_indices, + in_channels, + mla_channels=256, + mlahead_channels=128, + lr_multiple=10): + super().__init__() + + if len(backbone_indices) != 4: + raise RuntimeError( + 'MLAHead requires backbone_indices of length 4, but got {}.'. + format(len(backbone_indices))) + + self.mla_feat_nums = len(backbone_indices) + self.norms = nn.LayerList( + [nn.LayerNorm(normalized_shape=in_channels, epsilon=1e-6) + ] * self.mla_feat_nums) + + self.mla = ConvMLA(in_channels, mla_channels) + + self.aux_heads = nn.LayerList([ + nn.Conv2D( + in_channels=mla_channels, + out_channels=num_classes, + kernel_size=1) + ] * self.mla_feat_nums) + + self.feat_convs = nn.LayerList([ + nn.Sequential( + layers.ConvBNReLU( + in_channels=mla_channels, + out_channels=mlahead_channels, + kernel_size=3, + padding=1), + layers.ConvBNReLU( + in_channels=mlahead_channels, + out_channels=mlahead_channels, + kernel_size=3, + padding=1), + nn.Upsample( + scale_factor=4, mode='bilinear', align_corners=True)) + ] * self.mla_feat_nums) + + self.backbone_indices = backbone_indices + self.in_channels = in_channels + self.lr_multiple = lr_multiple + + self.cls_head = nn.Conv2D( + in_channels=4 * mlahead_channels, + out_channels=num_classes, + kernel_size=3, + padding=1) + + # init_weight reads self.lr_multiple; call it here for consistency + # with NaiveHead and PUPHead above. + self.init_weight() + + def init_weight(self): + for name, _param in self.named_parameters(): + if name.startswith('norms.') or name.startswith('mla.'): + continue + + _param.optimize_attr['learning_rate'] = self.lr_multiple + + for layer in self.sublayers(): + if isinstance(layer, nn.LayerNorm): + param_init.constant_init(layer.weight, value=1.0) + param_init.constant_init(layer.bias, value=0.0) + + def forward(self, x, _shape): + logits = [] + + feats = [x[_idx] for _idx in self.backbone_indices] + + for i in range(self.mla_feat_nums): + feats[i] = self.norms[i](feats[i]).transpose([0, 2, 1]).reshape( + [0, self.in_channels, _shape[2], _shape[3]]) + + feats = self.mla(feats) + if self.training: + for i in range(self.mla_feat_nums): + logits.append(self.aux_heads[i](feats[i])) + + for i in range(self.mla_feat_nums): + feats[i] = self.feat_convs[i](feats[i]) + + feat_mix = paddle.concat(feats, axis=1) + logits.insert(0, self.cls_head(feat_mix)) + + return logits diff --git a/paddlers/models/ppseg/models/sfnet.py b/paddlers/models/ppseg/models/sfnet.py new file mode 100644 index 0000000..eefdb97 --- /dev/null +++ b/paddlers/models/ppseg/models/sfnet.py @@ -0,0 +1,233 @@ +#
Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlers.models.ppseg.models import layers +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.utils import utils + + +@manager.MODELS.add_component +class SFNet(nn.Layer): + """ + The SFNet implementation based on PaddlePaddle. + + The original article refers to + Li, Xiangtai, et al. "Semantic Flow for Fast and Accurate Scene Parsing" + (https://arxiv.org/pdf/2002.10120.pdf). + + Args: + num_classes (int): The unique number of target classes. + backbone (Paddle.nn.Layer): Backbone network, currently supports ResNet50/101. + backbone_indices (tuple): Four values in the tuple indicate the indices of output of backbone. + enable_auxiliary_loss (bool, optional): Whether to add the auxiliary loss. Default: False. + align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even, + e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. + pretrained (str, optional): The path or url of pretrained model. Default: None. + """ + + def __init__(self, + num_classes, + backbone, + backbone_indices, + enable_auxiliary_loss=False, + align_corners=False, + pretrained=None): + super(SFNet, self).__init__() + self.backbone = backbone + self.backbone_indices = backbone_indices + self.in_channels = [ + self.backbone.feat_channels[i] for i in backbone_indices + ] + self.align_corners = align_corners + self.pretrained = pretrained + self.enable_auxiliary_loss = enable_auxiliary_loss + if self.backbone.layers == 18: + fpn_dim = 128 + inplane_head = 512 + fpn_inplanes = [64, 128, 256, 512] + else: + fpn_dim = 256 + inplane_head = 2048 + fpn_inplanes = [256, 512, 1024, 2048] + + self.head = SFNetHead( + inplane=inplane_head, + num_class=num_classes, + fpn_inplanes=fpn_inplanes, + fpn_dim=fpn_dim, + enable_auxiliary_loss=self.enable_auxiliary_loss) + self.init_weight() + + def forward(self, x): + feats = self.backbone(x) + feats = [feats[i] for i in self.backbone_indices] + logit_list = self.head(feats) + logit_list = [ + F.interpolate( + logit, + paddle.shape(x)[2:], + mode='bilinear', + align_corners=self.align_corners) for logit in logit_list + ] + return logit_list + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class SFNetHead(nn.Layer): + """ + The SFNetHead implementation. + + Args: + inplane (int): Input channels of PPM module. + num_class (int): The unique number of target classes. + fpn_inplanes (list): The feature channels from backbone. + fpn_dim (int, optional): The input channels of FAM module. Default: 256. + enable_auxiliary_loss (bool, optional): Whether to add the auxiliary loss. Default: False.
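+
+    Examples:
+        Channel-flow sketch (derived from the code below): PPModule reduces
+        inplane -> fpn_dim; each lateral 1x1 conv maps fpn_inplanes[i] -> fpn_dim;
+        AlignedModule warps the coarser feature onto each finer one before the
+        element-wise sum; finally conv_last fuses len(fpn_inplanes) * fpn_dim
+        channels into the num_class prediction.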
+ """ + + def __init__(self, + inplane, + num_class, + fpn_inplanes, + fpn_dim=256, + enable_auxiliary_loss=False): + super(SFNetHead, self).__init__() + self.ppm = layers.PPModule( + in_channels=inplane, + out_channels=fpn_dim, + bin_sizes=(1, 2, 3, 6), + dim_reduction=True, + align_corners=True) + self.enable_auxiliary_loss = enable_auxiliary_loss + self.fpn_in = [] + + for fpn_inplane in fpn_inplanes[:-1]: + self.fpn_in.append( + nn.Sequential( + nn.Conv2D(fpn_inplane, fpn_dim, 1), + layers.SyncBatchNorm(fpn_dim), nn.ReLU())) + + self.fpn_in = nn.LayerList(self.fpn_in) + self.fpn_out = [] + self.fpn_out_align = [] + self.dsn = [] + for i in range(len(fpn_inplanes) - 1): + self.fpn_out.append( + nn.Sequential( + layers.ConvBNReLU(fpn_dim, fpn_dim, 3, bias_attr=False))) + self.fpn_out_align.append( + AlignedModule(inplane=fpn_dim, outplane=fpn_dim // 2)) + if self.enable_auxiliary_loss: + self.dsn.append( + nn.Sequential(layers.AuxLayer(fpn_dim, fpn_dim, num_class))) + + self.fpn_out = nn.LayerList(self.fpn_out) + self.fpn_out_align = nn.LayerList(self.fpn_out_align) + + if self.enable_auxiliary_loss: + self.dsn = nn.LayerList(self.dsn) + + self.conv_last = nn.Sequential( + layers.ConvBNReLU( + len(fpn_inplanes) * fpn_dim, fpn_dim, 3, bias_attr=False), + nn.Conv2D(fpn_dim, num_class, kernel_size=1)) + + def forward(self, conv_out): + psp_out = self.ppm(conv_out[-1]) + f = psp_out + fpn_feature_list = [psp_out] + out = [] + for i in reversed(range(len(conv_out) - 1)): + conv_x = conv_out[i] + conv_x = self.fpn_in[i](conv_x) + f = self.fpn_out_align[i]([conv_x, f]) + f = conv_x + f + fpn_feature_list.append(self.fpn_out[i](f)) + if self.enable_auxiliary_loss: + out.append(self.dsn[i](f)) + + fpn_feature_list.reverse() + output_size = paddle.shape(fpn_feature_list[0])[2:] + fusion_list = [fpn_feature_list[0]] + + for i in range(1, len(fpn_feature_list)): + fusion_list.append( + F.interpolate( + fpn_feature_list[i], + output_size, + mode='bilinear', + align_corners=True)) + fusion_out = paddle.concat(fusion_list, 1) + x = self.conv_last(fusion_out) + if self.enable_auxiliary_loss: + out.append(x) + return out + else: + return [x] + + +class AlignedModule(nn.Layer): + """ + The FAM module implementation. + + Args: + inplane (int): Input channles of FAM module. + outplane (int): Output channels of FAN module. + kernel_size (int, optional): Kernel size of semantic flow convolution layer. Default: 3. 
+ """ + + def __init__(self, inplane, outplane, kernel_size=3): + super(AlignedModule, self).__init__() + self.down_h = nn.Conv2D(inplane, outplane, 1, bias_attr=False) + self.down_l = nn.Conv2D(inplane, outplane, 1, bias_attr=False) + self.flow_make = nn.Conv2D( + outplane * 2, + 2, + kernel_size=kernel_size, + padding=1, + bias_attr=False) + + def flow_warp(self, input, flow, size): + input_shape = paddle.shape(input) + norm = size[::-1].reshape([1, 1, 1, -1]) + norm.stop_gradient = True + h_grid = paddle.linspace(-1.0, 1.0, size[0]).reshape([-1, 1]) + h_grid = h_grid.tile([size[1]]) + w_grid = paddle.linspace(-1.0, 1.0, size[1]).reshape([-1, 1]) + w_grid = w_grid.tile([size[0]]).transpose([1, 0]) + grid = paddle.concat([w_grid.unsqueeze(2), h_grid.unsqueeze(2)], axis=2) + grid.unsqueeze(0).tile([input_shape[0], 1, 1, 1]) + grid = grid + paddle.transpose(flow, (0, 2, 3, 1)) / norm + + output = F.grid_sample(input, grid) + return output + + def forward(self, x): + low_feature, h_feature = x + h_feature_orign = h_feature + size = paddle.shape(low_feature)[2:] + low_feature = self.down_l(low_feature) + h_feature = self.down_h(h_feature) + h_feature = F.interpolate( + h_feature, size=size, mode='bilinear', align_corners=True) + flow = self.flow_make(paddle.concat([h_feature, low_feature], 1)) + h_feature = self.flow_warp(h_feature_orign, flow, size=size) + return h_feature diff --git a/paddlers/models/ppseg/models/stdcseg.py b/paddlers/models/ppseg/models/stdcseg.py new file mode 100644 index 0000000..f018125 --- /dev/null +++ b/paddlers/models/ppseg/models/stdcseg.py @@ -0,0 +1,216 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlers.models.ppseg import utils +from paddlers.models.ppseg.models import layers +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.utils import utils + + +@manager.MODELS.add_component +class STDCSeg(nn.Layer): + """ + The STDCSeg implementation based on PaddlePaddle. + + The original article refers to Meituan + Fan, Mingyuan, et al. "Rethinking BiSeNet For Real-time Semantic Segmentation." + (https://arxiv.org/abs/2104.13188) + + Args: + num_classes(int,optional): The unique number of target classes. + backbone(nn.Layer): Backbone network, STDCNet1446/STDCNet813. STDCNet1446->STDC2,STDCNet813->STDC813. + use_boundary_8(bool,non-optional): Whether to use detail loss. it should be True accroding to paper for best metric. Default: True. + Actually,if you want to use _boundary_2/_boundary_4/_boundary_16,you should append loss function number of DetailAggregateLoss.It should work properly. + use_conv_last(bool,optional): Determine ContextPath 's inplanes variable according to whether to use bockbone's last conv. Default: False. + pretrained (str, optional): The path or url of pretrained model. Default: None. 
+ """ + + def __init__(self, + num_classes, + backbone, + use_boundary_2=False, + use_boundary_4=False, + use_boundary_8=True, + use_boundary_16=False, + use_conv_last=False, + pretrained=None): + super(STDCSeg, self).__init__() + + self.use_boundary_2 = use_boundary_2 + self.use_boundary_4 = use_boundary_4 + self.use_boundary_8 = use_boundary_8 + self.use_boundary_16 = use_boundary_16 + self.cp = ContextPath(backbone, use_conv_last=use_conv_last) + self.ffm = FeatureFusionModule(384, 256) + self.conv_out = SegHead(256, 256, num_classes) + self.conv_out8 = SegHead(128, 64, num_classes) + self.conv_out16 = SegHead(128, 64, num_classes) + self.conv_out_sp16 = SegHead(512, 64, 1) + self.conv_out_sp8 = SegHead(256, 64, 1) + self.conv_out_sp4 = SegHead(64, 64, 1) + self.conv_out_sp2 = SegHead(32, 64, 1) + self.pretrained = pretrained + self.init_weight() + + def forward(self, x): + x_hw = paddle.shape(x)[2:] + feat_res2, feat_res4, feat_res8, _, feat_cp8, feat_cp16 = self.cp(x) + + logit_list = [] + if self.training: + feat_fuse = self.ffm(feat_res8, feat_cp8) + feat_out = self.conv_out(feat_fuse) + feat_out8 = self.conv_out8(feat_cp8) + feat_out16 = self.conv_out16(feat_cp16) + + logit_list = [feat_out, feat_out8, feat_out16] + logit_list = [ + F.interpolate(x, x_hw, mode='bilinear', align_corners=True) + for x in logit_list + ] + + if self.use_boundary_2: + feat_out_sp2 = self.conv_out_sp2(feat_res2) + logit_list.append(feat_out_sp2) + if self.use_boundary_4: + feat_out_sp4 = self.conv_out_sp4(feat_res4) + logit_list.append(feat_out_sp4) + if self.use_boundary_8: + feat_out_sp8 = self.conv_out_sp8(feat_res8) + logit_list.append(feat_out_sp8) + else: + feat_fuse = self.ffm(feat_res8, feat_cp8) + feat_out = self.conv_out(feat_fuse) + feat_out = F.interpolate( + feat_out, x_hw, mode='bilinear', align_corners=True) + logit_list = [feat_out] + + return logit_list + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class SegHead(nn.Layer): + def __init__(self, in_chan, mid_chan, n_classes): + super(SegHead, self).__init__() + self.conv = layers.ConvBNReLU( + in_chan, mid_chan, kernel_size=3, stride=1, padding=1) + self.conv_out = nn.Conv2D( + mid_chan, n_classes, kernel_size=1, bias_attr=None) + + def forward(self, x): + x = self.conv(x) + x = self.conv_out(x) + return x + + +class AttentionRefinementModule(nn.Layer): + def __init__(self, in_chan, out_chan): + super(AttentionRefinementModule, self).__init__() + self.conv = layers.ConvBNReLU( + in_chan, out_chan, kernel_size=3, stride=1, padding=1) + self.conv_atten = nn.Conv2D( + out_chan, out_chan, kernel_size=1, bias_attr=None) + self.bn_atten = nn.BatchNorm2D(out_chan) + self.sigmoid_atten = nn.Sigmoid() + + def forward(self, x): + feat = self.conv(x) + atten = F.adaptive_avg_pool2d(feat, 1) + atten = self.conv_atten(atten) + atten = self.bn_atten(atten) + atten = self.sigmoid_atten(atten) + out = paddle.multiply(feat, atten) + return out + + +class ContextPath(nn.Layer): + def __init__(self, backbone, use_conv_last=False): + super(ContextPath, self).__init__() + self.backbone = backbone + self.arm16 = AttentionRefinementModule(512, 128) + inplanes = 1024 + if use_conv_last: + inplanes = 1024 + self.arm32 = AttentionRefinementModule(inplanes, 128) + self.conv_head32 = layers.ConvBNReLU( + 128, 128, kernel_size=3, stride=1, padding=1) + self.conv_head16 = layers.ConvBNReLU( + 128, 128, kernel_size=3, stride=1, padding=1) + self.conv_avg = layers.ConvBNReLU( + inplanes, 128, 
kernel_size=1, stride=1, padding=0) + + def forward(self, x): + feat2, feat4, feat8, feat16, feat32 = self.backbone(x) + + feat8_hw = paddle.shape(feat8)[2:] + feat16_hw = paddle.shape(feat16)[2:] + feat32_hw = paddle.shape(feat32)[2:] + + avg = F.adaptive_avg_pool2d(feat32, 1) + avg = self.conv_avg(avg) + avg_up = F.interpolate(avg, feat32_hw, mode='nearest') + + feat32_arm = self.arm32(feat32) + feat32_sum = feat32_arm + avg_up + feat32_up = F.interpolate(feat32_sum, feat16_hw, mode='nearest') + feat32_up = self.conv_head32(feat32_up) + + feat16_arm = self.arm16(feat16) + feat16_sum = feat16_arm + feat32_up + feat16_up = F.interpolate(feat16_sum, feat8_hw, mode='nearest') + feat16_up = self.conv_head16(feat16_up) + + return feat2, feat4, feat8, feat16, feat16_up, feat32_up # x8, x16 + + +class FeatureFusionModule(nn.Layer): + def __init__(self, in_chan, out_chan): + super(FeatureFusionModule, self).__init__() + self.convblk = layers.ConvBNReLU( + in_chan, out_chan, kernel_size=1, stride=1, padding=0) + self.conv1 = nn.Conv2D( + out_chan, + out_chan // 4, + kernel_size=1, + stride=1, + padding=0, + bias_attr=None) + self.conv2 = nn.Conv2D( + out_chan // 4, + out_chan, + kernel_size=1, + stride=1, + padding=0, + bias_attr=None) + self.relu = nn.ReLU() + self.sigmoid = nn.Sigmoid() + + def forward(self, fsp, fcp): + fcat = paddle.concat([fsp, fcp], axis=1) + feat = self.convblk(fcat) + atten = F.adaptive_avg_pool2d(feat, 1) + atten = self.conv1(atten) + atten = self.relu(atten) + atten = self.conv2(atten) + atten = self.sigmoid(atten) + feat_atten = paddle.multiply(feat, atten) + feat_out = feat_atten + feat + return feat_out diff --git a/paddlers/models/ppseg/models/u2net.py b/paddlers/models/ppseg/models/u2net.py new file mode 100644 index 0000000..2f24d06 --- /dev/null +++ b/paddlers/models/ppseg/models/u2net.py @@ -0,0 +1,574 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.models import layers +from paddlers.models.ppseg.utils import utils + +__all__ = ['U2Net', 'U2Netp'] + + +@manager.MODELS.add_component +class U2Net(nn.Layer): + """ + The U^2-Net implementation based on PaddlePaddle. + + The original article refers to + Xuebin Qin, et, al. "U^2-Net: Going Deeper with Nested U-Structure for Salient Object Detection" + (https://arxiv.org/abs/2005.09007). + + Args: + num_classes (int): The unique number of target classes. + in_ch (int, optional): Input channels. Default: 3. + pretrained (str, optional): The path or url of pretrained model for fine tuning. Default: None. 
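+
+    Examples:
+        A minimal usage sketch (illustrative):
+
+            import paddle
+
+            model = U2Net(num_classes=2)
+            d0, d1, d2, d3, d4, d5, d6 = model(paddle.rand([1, 3, 320, 320]))
+            # d0 fuses the six side outputs; d1-d6 are deep-supervision logits.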
+ + """ + + def __init__(self, num_classes, in_ch=3, pretrained=None): + super(U2Net, self).__init__() + + self.stage1 = RSU7(in_ch, 32, 64) + self.pool12 = nn.MaxPool2D(2, stride=2, ceil_mode=True) + + self.stage2 = RSU6(64, 32, 128) + self.pool23 = nn.MaxPool2D(2, stride=2, ceil_mode=True) + + self.stage3 = RSU5(128, 64, 256) + self.pool34 = nn.MaxPool2D(2, stride=2, ceil_mode=True) + + self.stage4 = RSU4(256, 128, 512) + self.pool45 = nn.MaxPool2D(2, stride=2, ceil_mode=True) + + self.stage5 = RSU4F(512, 256, 512) + self.pool56 = nn.MaxPool2D(2, stride=2, ceil_mode=True) + + self.stage6 = RSU4F(512, 256, 512) + + # decoder + self.stage5d = RSU4F(1024, 256, 512) + self.stage4d = RSU4(1024, 128, 256) + self.stage3d = RSU5(512, 64, 128) + self.stage2d = RSU6(256, 32, 64) + self.stage1d = RSU7(128, 16, 64) + + self.side1 = nn.Conv2D(64, num_classes, 3, padding=1) + self.side2 = nn.Conv2D(64, num_classes, 3, padding=1) + self.side3 = nn.Conv2D(128, num_classes, 3, padding=1) + self.side4 = nn.Conv2D(256, num_classes, 3, padding=1) + self.side5 = nn.Conv2D(512, num_classes, 3, padding=1) + self.side6 = nn.Conv2D(512, num_classes, 3, padding=1) + + self.outconv = nn.Conv2D(6 * num_classes, num_classes, 1) + + self.pretrained = pretrained + self.init_weight() + + def forward(self, x): + + hx = x + + #stage 1 + hx1 = self.stage1(hx) + hx = self.pool12(hx1) + + #stage 2 + hx2 = self.stage2(hx) + hx = self.pool23(hx2) + + #stage 3 + hx3 = self.stage3(hx) + hx = self.pool34(hx3) + + #stage 4 + hx4 = self.stage4(hx) + hx = self.pool45(hx4) + + #stage 5 + hx5 = self.stage5(hx) + hx = self.pool56(hx5) + + #stage 6 + hx6 = self.stage6(hx) + hx6up = _upsample_like(hx6, hx5) + + #-------------------- decoder -------------------- + hx5d = self.stage5d(paddle.concat((hx6up, hx5), 1)) + hx5dup = _upsample_like(hx5d, hx4) + + hx4d = self.stage4d(paddle.concat((hx5dup, hx4), 1)) + hx4dup = _upsample_like(hx4d, hx3) + + hx3d = self.stage3d(paddle.concat((hx4dup, hx3), 1)) + hx3dup = _upsample_like(hx3d, hx2) + + hx2d = self.stage2d(paddle.concat((hx3dup, hx2), 1)) + hx2dup = _upsample_like(hx2d, hx1) + + hx1d = self.stage1d(paddle.concat((hx2dup, hx1), 1)) + + #side output + d1 = self.side1(hx1d) + + d2 = self.side2(hx2d) + d2 = _upsample_like(d2, d1) + + d3 = self.side3(hx3d) + d3 = _upsample_like(d3, d1) + + d4 = self.side4(hx4d) + d4 = _upsample_like(d4, d1) + + d5 = self.side5(hx5d) + d5 = _upsample_like(d5, d1) + + d6 = self.side6(hx6) + d6 = _upsample_like(d6, d1) + + d0 = self.outconv(paddle.concat((d1, d2, d3, d4, d5, d6), 1)) + + return [d0, d1, d2, d3, d4, d5, d6] + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +### U^2-Net small ### +@manager.MODELS.add_component +class U2Netp(nn.Layer): + """Please Refer to U2Net above.""" + + def __init__(self, num_classes, in_ch=3, pretrained=None): + super(U2Netp, self).__init__() + + self.stage1 = RSU7(in_ch, 16, 64) + self.pool12 = nn.MaxPool2D(2, stride=2, ceil_mode=True) + + self.stage2 = RSU6(64, 16, 64) + self.pool23 = nn.MaxPool2D(2, stride=2, ceil_mode=True) + + self.stage3 = RSU5(64, 16, 64) + self.pool34 = nn.MaxPool2D(2, stride=2, ceil_mode=True) + + self.stage4 = RSU4(64, 16, 64) + self.pool45 = nn.MaxPool2D(2, stride=2, ceil_mode=True) + + self.stage5 = RSU4F(64, 16, 64) + self.pool56 = nn.MaxPool2D(2, stride=2, ceil_mode=True) + + self.stage6 = RSU4F(64, 16, 64) + + # decoder + self.stage5d = RSU4F(128, 16, 64) + self.stage4d = RSU4(128, 16, 64) + self.stage3d = RSU5(128, 16, 64) + 
self.stage2d = RSU6(128, 16, 64) + self.stage1d = RSU7(128, 16, 64) + + self.side1 = nn.Conv2D(64, num_classes, 3, padding=1) + self.side2 = nn.Conv2D(64, num_classes, 3, padding=1) + self.side3 = nn.Conv2D(64, num_classes, 3, padding=1) + self.side4 = nn.Conv2D(64, num_classes, 3, padding=1) + self.side5 = nn.Conv2D(64, num_classes, 3, padding=1) + self.side6 = nn.Conv2D(64, num_classes, 3, padding=1) + + self.outconv = nn.Conv2D(6 * num_classes, num_classes, 1) + + self.pretrained = pretrained + self.init_weight() + + def forward(self, x): + + hx = x + + #stage 1 + hx1 = self.stage1(hx) + hx = self.pool12(hx1) + + #stage 2 + hx2 = self.stage2(hx) + hx = self.pool23(hx2) + + #stage 3 + hx3 = self.stage3(hx) + hx = self.pool34(hx3) + + #stage 4 + hx4 = self.stage4(hx) + hx = self.pool45(hx4) + + #stage 5 + hx5 = self.stage5(hx) + hx = self.pool56(hx5) + + #stage 6 + hx6 = self.stage6(hx) + hx6up = _upsample_like(hx6, hx5) + + #decoder + hx5d = self.stage5d(paddle.concat((hx6up, hx5), 1)) + hx5dup = _upsample_like(hx5d, hx4) + + hx4d = self.stage4d(paddle.concat((hx5dup, hx4), 1)) + hx4dup = _upsample_like(hx4d, hx3) + + hx3d = self.stage3d(paddle.concat((hx4dup, hx3), 1)) + hx3dup = _upsample_like(hx3d, hx2) + + hx2d = self.stage2d(paddle.concat((hx3dup, hx2), 1)) + hx2dup = _upsample_like(hx2d, hx1) + + hx1d = self.stage1d(paddle.concat((hx2dup, hx1), 1)) + + #side output + d1 = self.side1(hx1d) + + d2 = self.side2(hx2d) + d2 = _upsample_like(d2, d1) + + d3 = self.side3(hx3d) + d3 = _upsample_like(d3, d1) + + d4 = self.side4(hx4d) + d4 = _upsample_like(d4, d1) + + d5 = self.side5(hx5d) + d5 = _upsample_like(d5, d1) + + d6 = self.side6(hx6) + d6 = _upsample_like(d6, d1) + + d0 = self.outconv(paddle.concat((d1, d2, d3, d4, d5, d6), 1)) + + return [d0, d1, d2, d3, d4, d5, d6] + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class REBNCONV(nn.Layer): + def __init__(self, in_ch=3, out_ch=3, dirate=1): + super(REBNCONV, self).__init__() + + self.conv_s1 = nn.Conv2D( + in_ch, out_ch, 3, padding=1 * dirate, dilation=1 * dirate) + self.bn_s1 = nn.BatchNorm2D(out_ch) + self.relu_s1 = nn.ReLU() + + def forward(self, x): + + hx = x + xout = self.relu_s1(self.bn_s1(self.conv_s1(hx))) + + return xout + + +## upsample tensor 'src' to have the same spatial size with tensor 'tar' +def _upsample_like(src, tar): + + src = F.upsample(src, size=paddle.shape(tar)[2:], mode='bilinear') + + return src + + +### RSU-7 ### +class RSU7(nn.Layer): #UNet07DRES(nn.Layer): + def __init__(self, in_ch=3, mid_ch=12, out_ch=3): + super(RSU7, self).__init__() + + self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1) + + self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1) + self.pool1 = nn.MaxPool2D(2, stride=2, ceil_mode=True) + + self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool2 = nn.MaxPool2D(2, stride=2, ceil_mode=True) + + self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool3 = nn.MaxPool2D(2, stride=2, ceil_mode=True) + + self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool4 = nn.MaxPool2D(2, stride=2, ceil_mode=True) + + self.rebnconv5 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool5 = nn.MaxPool2D(2, stride=2, ceil_mode=True) + + self.rebnconv6 = REBNCONV(mid_ch, mid_ch, dirate=1) + + self.rebnconv7 = REBNCONV(mid_ch, mid_ch, dirate=2) + + self.rebnconv6d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv5d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv4d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + 
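+        # Each decoder REBNCONV takes 2 * mid_ch input channels because it
+        # concatenates the upsampled deeper feature with the encoder feature
+        # of the same scale.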
self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1) + + def forward(self, x): + + hx = x + hxin = self.rebnconvin(hx) + + hx1 = self.rebnconv1(hxin) + hx = self.pool1(hx1) + + hx2 = self.rebnconv2(hx) + hx = self.pool2(hx2) + + hx3 = self.rebnconv3(hx) + hx = self.pool3(hx3) + + hx4 = self.rebnconv4(hx) + hx = self.pool4(hx4) + + hx5 = self.rebnconv5(hx) + hx = self.pool5(hx5) + + hx6 = self.rebnconv6(hx) + + hx7 = self.rebnconv7(hx6) + + hx6d = self.rebnconv6d(paddle.concat((hx7, hx6), 1)) + hx6dup = _upsample_like(hx6d, hx5) + + hx5d = self.rebnconv5d(paddle.concat((hx6dup, hx5), 1)) + hx5dup = _upsample_like(hx5d, hx4) + + hx4d = self.rebnconv4d(paddle.concat((hx5dup, hx4), 1)) + hx4dup = _upsample_like(hx4d, hx3) + + hx3d = self.rebnconv3d(paddle.concat((hx4dup, hx3), 1)) + hx3dup = _upsample_like(hx3d, hx2) + + hx2d = self.rebnconv2d(paddle.concat((hx3dup, hx2), 1)) + hx2dup = _upsample_like(hx2d, hx1) + + hx1d = self.rebnconv1d(paddle.concat((hx2dup, hx1), 1)) + + return hx1d + hxin + + +### RSU-6 ### +class RSU6(nn.Layer): #UNet06DRES(nn.Layer): + def __init__(self, in_ch=3, mid_ch=12, out_ch=3): + super(RSU6, self).__init__() + + self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1) + + self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1) + self.pool1 = nn.MaxPool2D(2, stride=2, ceil_mode=True) + + self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool2 = nn.MaxPool2D(2, stride=2, ceil_mode=True) + + self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool3 = nn.MaxPool2D(2, stride=2, ceil_mode=True) + + self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool4 = nn.MaxPool2D(2, stride=2, ceil_mode=True) + + self.rebnconv5 = REBNCONV(mid_ch, mid_ch, dirate=1) + + self.rebnconv6 = REBNCONV(mid_ch, mid_ch, dirate=2) + + self.rebnconv5d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv4d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1) + + def forward(self, x): + + hx = x + + hxin = self.rebnconvin(hx) + + hx1 = self.rebnconv1(hxin) + hx = self.pool1(hx1) + + hx2 = self.rebnconv2(hx) + hx = self.pool2(hx2) + + hx3 = self.rebnconv3(hx) + hx = self.pool3(hx3) + + hx4 = self.rebnconv4(hx) + hx = self.pool4(hx4) + + hx5 = self.rebnconv5(hx) + + hx6 = self.rebnconv6(hx5) + + hx5d = self.rebnconv5d(paddle.concat((hx6, hx5), 1)) + hx5dup = _upsample_like(hx5d, hx4) + + hx4d = self.rebnconv4d(paddle.concat((hx5dup, hx4), 1)) + hx4dup = _upsample_like(hx4d, hx3) + + hx3d = self.rebnconv3d(paddle.concat((hx4dup, hx3), 1)) + hx3dup = _upsample_like(hx3d, hx2) + + hx2d = self.rebnconv2d(paddle.concat((hx3dup, hx2), 1)) + hx2dup = _upsample_like(hx2d, hx1) + + hx1d = self.rebnconv1d(paddle.concat((hx2dup, hx1), 1)) + + return hx1d + hxin + + +### RSU-5 ### +class RSU5(nn.Layer): #UNet05DRES(nn.Layer): + def __init__(self, in_ch=3, mid_ch=12, out_ch=3): + super(RSU5, self).__init__() + + self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1) + + self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1) + self.pool1 = nn.MaxPool2D(2, stride=2, ceil_mode=True) + + self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool2 = nn.MaxPool2D(2, stride=2, ceil_mode=True) + + self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool3 = nn.MaxPool2D(2, stride=2, ceil_mode=True) + + 
self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=1) + + self.rebnconv5 = REBNCONV(mid_ch, mid_ch, dirate=2) + + self.rebnconv4d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1) + + def forward(self, x): + + hx = x + + hxin = self.rebnconvin(hx) + + hx1 = self.rebnconv1(hxin) + hx = self.pool1(hx1) + + hx2 = self.rebnconv2(hx) + hx = self.pool2(hx2) + + hx3 = self.rebnconv3(hx) + hx = self.pool3(hx3) + + hx4 = self.rebnconv4(hx) + + hx5 = self.rebnconv5(hx4) + + hx4d = self.rebnconv4d(paddle.concat((hx5, hx4), 1)) + hx4dup = _upsample_like(hx4d, hx3) + + hx3d = self.rebnconv3d(paddle.concat((hx4dup, hx3), 1)) + hx3dup = _upsample_like(hx3d, hx2) + + hx2d = self.rebnconv2d(paddle.concat((hx3dup, hx2), 1)) + hx2dup = _upsample_like(hx2d, hx1) + + hx1d = self.rebnconv1d(paddle.concat((hx2dup, hx1), 1)) + + return hx1d + hxin + + +### RSU-4 ### +class RSU4(nn.Layer): #UNet04DRES(nn.Layer): + def __init__(self, in_ch=3, mid_ch=12, out_ch=3): + super(RSU4, self).__init__() + + self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1) + + self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1) + self.pool1 = nn.MaxPool2D(2, stride=2, ceil_mode=True) + + self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool2 = nn.MaxPool2D(2, stride=2, ceil_mode=True) + + self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1) + + self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=2) + + self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1) + + def forward(self, x): + + hx = x + + hxin = self.rebnconvin(hx) + + hx1 = self.rebnconv1(hxin) + hx = self.pool1(hx1) + + hx2 = self.rebnconv2(hx) + hx = self.pool2(hx2) + + hx3 = self.rebnconv3(hx) + + hx4 = self.rebnconv4(hx3) + + hx3d = self.rebnconv3d(paddle.concat((hx4, hx3), 1)) + hx3dup = _upsample_like(hx3d, hx2) + + hx2d = self.rebnconv2d(paddle.concat((hx3dup, hx2), 1)) + hx2dup = _upsample_like(hx2d, hx1) + + hx1d = self.rebnconv1d(paddle.concat((hx2dup, hx1), 1)) + + return hx1d + hxin + + +### RSU-4F ### +class RSU4F(nn.Layer): #UNet04FRES(nn.Layer): + def __init__(self, in_ch=3, mid_ch=12, out_ch=3): + super(RSU4F, self).__init__() + + self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1) + + self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1) + self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=2) + self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=4) + + self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=8) + + self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=4) + self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=2) + self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1) + + def forward(self, x): + + hx = x + + hxin = self.rebnconvin(hx) + + hx1 = self.rebnconv1(hxin) + hx2 = self.rebnconv2(hx1) + hx3 = self.rebnconv3(hx2) + + hx4 = self.rebnconv4(hx3) + + hx3d = self.rebnconv3d(paddle.concat((hx4, hx3), 1)) + hx2d = self.rebnconv2d(paddle.concat((hx3d, hx2), 1)) + hx1d = self.rebnconv1d(paddle.concat((hx2d, hx1), 1)) + + return hx1d + hxin diff --git a/paddlers/models/ppseg/models/unet.py b/paddlers/models/ppseg/models/unet.py new file mode 100644 index 0000000..5c3fdef --- /dev/null +++ b/paddlers/models/ppseg/models/unet.py @@ -0,0 +1,155 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlers.models.ppseg import utils +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.models import layers + + +@manager.MODELS.add_component +class UNet(nn.Layer): + """ + The UNet implementation based on PaddlePaddle. + + The original article refers to + Olaf Ronneberger, et, al. "U-Net: Convolutional Networks for Biomedical Image Segmentation" + (https://arxiv.org/abs/1505.04597). + + Args: + num_classes (int): The unique number of target classes. + align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature + is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. + use_deconv (bool, optional): A bool value indicates whether using deconvolution in upsampling. + If False, use resize_bilinear. Default: False. + pretrained (str, optional): The path or url of pretrained model for fine tuning. Default: None. + """ + + def __init__(self, + num_classes, + align_corners=False, + use_deconv=False, + pretrained=None): + super().__init__() + + self.encode = Encoder() + self.decode = Decoder(align_corners, use_deconv=use_deconv) + self.cls = self.conv = nn.Conv2D( + in_channels=64, + out_channels=num_classes, + kernel_size=3, + stride=1, + padding=1) + + self.pretrained = pretrained + self.init_weight() + + def forward(self, x): + logit_list = [] + x, short_cuts = self.encode(x) + x = self.decode(x, short_cuts) + logit = self.cls(x) + logit_list.append(logit) + return logit_list + + def init_weight(self): + if self.pretrained is not None: + utils.load_entire_model(self, self.pretrained) + + +class Encoder(nn.Layer): + def __init__(self): + super().__init__() + + self.double_conv = nn.Sequential( + layers.ConvBNReLU(3, 64, 3), layers.ConvBNReLU(64, 64, 3)) + down_channels = [[64, 128], [128, 256], [256, 512], [512, 512]] + self.down_sample_list = nn.LayerList([ + self.down_sampling(channel[0], channel[1]) + for channel in down_channels + ]) + + def down_sampling(self, in_channels, out_channels): + modules = [] + modules.append(nn.MaxPool2D(kernel_size=2, stride=2)) + modules.append(layers.ConvBNReLU(in_channels, out_channels, 3)) + modules.append(layers.ConvBNReLU(out_channels, out_channels, 3)) + return nn.Sequential(*modules) + + def forward(self, x): + short_cuts = [] + x = self.double_conv(x) + for down_sample in self.down_sample_list: + short_cuts.append(x) + x = down_sample(x) + return x, short_cuts + + +class Decoder(nn.Layer): + def __init__(self, align_corners, use_deconv=False): + super().__init__() + + up_channels = [[512, 256], [256, 128], [128, 64], [64, 64]] + self.up_sample_list = nn.LayerList([ + UpSampling(channel[0], channel[1], align_corners, use_deconv) + for channel in up_channels + ]) + + def forward(self, x, short_cuts): + for i in range(len(short_cuts)): + x = self.up_sample_list[i](x, short_cuts[-(i + 1)]) + return x + + +class UpSampling(nn.Layer): + 
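+    """
+    Upsampling block of the UNet decoder: enlarge `x` to the spatial size of
+    `short_cut`, concatenate the two along the channel axis, and fuse them
+    with two ConvBNReLU layers. With `use_deconv`, `x` is first reduced to
+    `out_channels // 2` channels by a stride-2 transposed convolution;
+    otherwise it is resized bilinearly.
+    """
+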
def __init__(self, + in_channels, + out_channels, + align_corners, + use_deconv=False): + super().__init__() + + self.align_corners = align_corners + + self.use_deconv = use_deconv + if self.use_deconv: + self.deconv = nn.Conv2DTranspose( + in_channels, + out_channels // 2, + kernel_size=2, + stride=2, + padding=0) + in_channels = in_channels + out_channels // 2 + else: + in_channels *= 2 + + self.double_conv = nn.Sequential( + layers.ConvBNReLU(in_channels, out_channels, 3), + layers.ConvBNReLU(out_channels, out_channels, 3)) + + def forward(self, x, short_cut): + if self.use_deconv: + x = self.deconv(x) + else: + x = F.interpolate( + x, + paddle.shape(short_cut)[2:], + mode='bilinear', + align_corners=self.align_corners) + x = paddle.concat([x, short_cut], axis=1) + x = self.double_conv(x) + return x diff --git a/paddlers/models/ppseg/models/unet_3plus.py b/paddlers/models/ppseg/models/unet_3plus.py new file mode 100644 index 0000000..d56ff7e --- /dev/null +++ b/paddlers/models/ppseg/models/unet_3plus.py @@ -0,0 +1,307 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.models.layers.layer_libs import SyncBatchNorm +from paddlers.models.ppseg.cvlibs.param_init import kaiming_normal_init + + +@manager.MODELS.add_component +class UNet3Plus(nn.Layer): + """ + The UNet3+ implementation based on PaddlePaddle. + + The original article refers to + Huang H , Lin L , Tong R , et al. "UNet 3+: A Full-Scale Connected UNet for Medical Image Segmentation" + (https://arxiv.org/abs/2004.08790). + + Args: + in_channels (int, optional): The channel number of input image. Default: 3. + num_classes (int, optional): The unique number of target classes. Default: 2. + is_batchnorm (bool, optional): Use batchnorm after conv or not. Default: True. + is_deepsup (bool, optional): Use deep supervision or not. Default: False. + is_CGM (bool, optional): Use classification-guided module or not. + If True, is_deepsup must be True. Default: False. 
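+
+    Example:
+        A minimal usage sketch (illustrative only):
+
+            model = UNet3Plus(in_channels=3, num_classes=2)
+            out = model(paddle.rand([1, 3, 320, 320]))
+            # With the default is_deepsup=False, `out` is a one-element list
+            # holding full-resolution logits of shape [1, 2, 320, 320].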
+ """ + + def __init__(self, + in_channels=3, + num_classes=2, + is_batchnorm=True, + is_deepsup=False, + is_CGM=False): + super(UNet3Plus, self).__init__() + # parameters + self.is_deepsup = True if is_CGM else is_deepsup + self.is_CGM = is_CGM + # internal definition + self.filters = [64, 128, 256, 512, 1024] + self.cat_channels = self.filters[0] + self.cat_blocks = 5 + self.up_channels = self.cat_channels * self.cat_blocks + # layers + self.encoder = Encoder(in_channels, self.filters, is_batchnorm) + self.decoder = Decoder(self.filters, self.cat_channels, + self.up_channels) + if self.is_deepsup: + self.deepsup = DeepSup(self.up_channels, self.filters, num_classes) + if self.is_CGM: + self.cls = nn.Sequential( + nn.Dropout(p=0.5), nn.Conv2D(self.filters[4], 2, 1), + nn.AdaptiveMaxPool2D(1), nn.Sigmoid()) + else: + self.outconv1 = nn.Conv2D( + self.up_channels, num_classes, 3, padding=1) + # initialise weights + for sublayer in self.sublayers(): + if isinstance(sublayer, nn.Conv2D): + kaiming_normal_init(sublayer.weight) + elif isinstance(sublayer, (nn.BatchNorm, nn.SyncBatchNorm)): + kaiming_normal_init(sublayer.weight) + + def dotProduct(self, seg, cls): + B, N, H, W = seg.shape + seg = seg.reshape((B, N, H * W)) + clssp = paddle.ones([1, N]) + ecls = (cls * clssp).reshape([B, N, 1]) + final = seg * ecls + final = final.reshape((B, N, H, W)) + return final + + def forward(self, inputs): + hs = self.encoder(inputs) + hds = self.decoder(hs) + if self.is_deepsup: + out = self.deepsup(hds) + if self.is_CGM: + # classification-guided module + cls_branch = self.cls(hds[-1]).squeeze(3).squeeze( + 2) # (B,N,1,1)->(B,N) + cls_branch_max = cls_branch.argmax(axis=1) + cls_branch_max = cls_branch_max.reshape((-1, 1)).astype('float') + out = [self.dotProduct(d, cls_branch_max) for d in out] + else: + out = [self.outconv1(hds[0])] # d1->320*320*num_classes + return out + + +class Encoder(nn.Layer): + def __init__(self, in_channels, filters, is_batchnorm): + super(Encoder, self).__init__() + self.conv1 = UnetConv2D(in_channels, filters[0], is_batchnorm) + self.poolconv2 = MaxPoolConv2D(filters[0], filters[1], is_batchnorm) + self.poolconv3 = MaxPoolConv2D(filters[1], filters[2], is_batchnorm) + self.poolconv4 = MaxPoolConv2D(filters[2], filters[3], is_batchnorm) + self.poolconv5 = MaxPoolConv2D(filters[3], filters[4], is_batchnorm) + + def forward(self, inputs): + h1 = self.conv1(inputs) # h1->320*320*64 + h2 = self.poolconv2(h1) # h2->160*160*128 + h3 = self.poolconv3(h2) # h3->80*80*256 + h4 = self.poolconv4(h3) # h4->40*40*512 + hd5 = self.poolconv5(h4) # h5->20*20*1024 + return [h1, h2, h3, h4, hd5] + + +class Decoder(nn.Layer): + def __init__(self, filters, cat_channels, up_channels): + super(Decoder, self).__init__() + '''stage 4d''' + # h1->320*320, hd4->40*40, Pooling 8 times + self.h1_PT_hd4 = nn.MaxPool2D(8, 8, ceil_mode=True) + self.h1_PT_hd4_cbr = ConvBnReLU2D(filters[0], cat_channels) + # h2->160*160, hd4->40*40, Pooling 4 times + self.h2_PT_hd4 = nn.MaxPool2D(4, 4, ceil_mode=True) + self.h2_PT_hd4_cbr = ConvBnReLU2D(filters[1], cat_channels) + # h3->80*80, hd4->40*40, Pooling 2 times + self.h3_PT_hd4 = nn.MaxPool2D(2, 2, ceil_mode=True) + self.h3_PT_hd4_cbr = ConvBnReLU2D(filters[2], cat_channels) + # h4->40*40, hd4->40*40, Concatenation + self.h4_Cat_hd4_cbr = ConvBnReLU2D(filters[3], cat_channels) + # hd5->20*20, hd4->40*40, Upsample 2 times + self.hd5_UT_hd4 = nn.Upsample(scale_factor=2, mode='bilinear') # 14*14 + self.hd5_UT_hd4_cbr = ConvBnReLU2D(filters[4], cat_channels) + # 
fusion(h1_PT_hd4, h2_PT_hd4, h3_PT_hd4, h4_Cat_hd4, hd5_UT_hd4) + self.cbr4d_1 = ConvBnReLU2D(up_channels, up_channels) # 16 + '''stage 3d''' + # h1->320*320, hd3->80*80, Pooling 4 times + self.h1_PT_hd3 = nn.MaxPool2D(4, 4, ceil_mode=True) + self.h1_PT_hd3_cbr = ConvBnReLU2D(filters[0], cat_channels) + # h2->160*160, hd3->80*80, Pooling 2 times + self.h2_PT_hd3 = nn.MaxPool2D(2, 2, ceil_mode=True) + self.h2_PT_hd3_cbr = ConvBnReLU2D(filters[1], cat_channels) + # h3->80*80, hd3->80*80, Concatenation + self.h3_Cat_hd3_cbr = ConvBnReLU2D(filters[2], cat_channels) + # hd4->40*40, hd4->80*80, Upsample 2 times + self.hd4_UT_hd3 = nn.Upsample(scale_factor=2, mode='bilinear') # 14*14 + self.hd4_UT_hd3_cbr = ConvBnReLU2D(up_channels, cat_channels) + # hd5->20*20, hd4->80*80, Upsample 4 times + self.hd5_UT_hd3 = nn.Upsample(scale_factor=4, mode='bilinear') # 14*14 + self.hd5_UT_hd3_cbr = ConvBnReLU2D(filters[4], cat_channels) + # fusion(h1_PT_hd3, h2_PT_hd3, h3_Cat_hd3, hd4_UT_hd3, hd5_UT_hd3) + self.cbr3d_1 = ConvBnReLU2D(up_channels, up_channels) # 16 + '''stage 2d ''' + # h1->320*320, hd2->160*160, Pooling 2 times + self.h1_PT_hd2 = nn.MaxPool2D(2, 2, ceil_mode=True) + self.h1_PT_hd2_cbr = ConvBnReLU2D(filters[0], cat_channels) + # h2->160*160, hd2->160*160, Concatenation + self.h2_Cat_hd2_cbr = ConvBnReLU2D(filters[1], cat_channels) + # hd3->80*80, hd2->160*160, Upsample 2 times + self.hd3_UT_hd2 = nn.Upsample(scale_factor=2, mode='bilinear') # 14*14 + self.hd3_UT_hd2_cbr = ConvBnReLU2D(up_channels, cat_channels) + # hd4->40*40, hd2->160*160, Upsample 4 times + self.hd4_UT_hd2 = nn.Upsample(scale_factor=4, mode='bilinear') # 14*14 + self.hd4_UT_hd2_cbr = ConvBnReLU2D(up_channels, cat_channels) + # hd5->20*20, hd2->160*160, Upsample 8 times + self.hd5_UT_hd2 = nn.Upsample(scale_factor=8, mode='bilinear') # 14*14 + self.hd5_UT_hd2_cbr = ConvBnReLU2D(filters[4], cat_channels) + # fusion(h1_PT_hd2, h2_Cat_hd2, hd3_UT_hd2, hd4_UT_hd2, hd5_UT_hd2) + self.cbr2d_1 = ConvBnReLU2D(up_channels, up_channels) # 16 + '''stage 1d''' + # h1->320*320, hd1->320*320, Concatenation + self.h1_Cat_hd1_cbr = ConvBnReLU2D(filters[0], cat_channels) + # hd2->160*160, hd1->320*320, Upsample 2 times + self.hd2_UT_hd1 = nn.Upsample(scale_factor=2, mode='bilinear') # 14*14 + self.hd2_UT_hd1_cbr = ConvBnReLU2D(up_channels, cat_channels) + # hd3->80*80, hd1->320*320, Upsample 4 times + self.hd3_UT_hd1 = nn.Upsample(scale_factor=4, mode='bilinear') # 14*14 + self.hd3_UT_hd1_cbr = ConvBnReLU2D(up_channels, cat_channels) + # hd4->40*40, hd1->320*320, Upsample 8 times + self.hd4_UT_hd1 = nn.Upsample(scale_factor=8, mode='bilinear') # 14*14 + self.hd4_UT_hd1_cbr = ConvBnReLU2D(up_channels, cat_channels) + # hd5->20*20, hd1->320*320, Upsample 16 times + self.hd5_UT_hd1 = nn.Upsample(scale_factor=16, mode='bilinear') # 14*14 + self.hd5_UT_hd1_cbr = ConvBnReLU2D(filters[4], cat_channels) + # fusion(h1_Cat_hd1, hd2_UT_hd1, hd3_UT_hd1, hd4_UT_hd1, hd5_UT_hd1) + self.cbr1d_1 = ConvBnReLU2D(up_channels, up_channels) # 16 + + def forward(self, inputs): + h1, h2, h3, h4, hd5 = inputs + h1_PT_hd4 = self.h1_PT_hd4_cbr(self.h1_PT_hd4(h1)) + h2_PT_hd4 = self.h2_PT_hd4_cbr(self.h2_PT_hd4(h2)) + h3_PT_hd4 = self.h3_PT_hd4_cbr(self.h3_PT_hd4(h3)) + h4_Cat_hd4 = self.h4_Cat_hd4_cbr(h4) + hd5_UT_hd4 = self.hd5_UT_hd4_cbr(self.hd5_UT_hd4(hd5)) + # hd4->40*40*up_channels + hd4 = self.cbr4d_1( + paddle.concat( + [h1_PT_hd4, h2_PT_hd4, h3_PT_hd4, h4_Cat_hd4, hd5_UT_hd4], 1)) + h1_PT_hd3 = self.h1_PT_hd3_cbr(self.h1_PT_hd3(h1)) + h2_PT_hd3 = 
self.h2_PT_hd3_cbr(self.h2_PT_hd3(h2)) + h3_Cat_hd3 = self.h3_Cat_hd3_cbr(h3) + hd4_UT_hd3 = self.hd4_UT_hd3_cbr(self.hd4_UT_hd3(hd4)) + hd5_UT_hd3 = self.hd5_UT_hd3_cbr(self.hd5_UT_hd3(hd5)) + # hd3->80*80*up_channels + hd3 = self.cbr3d_1( + paddle.concat( + [h1_PT_hd3, h2_PT_hd3, h3_Cat_hd3, hd4_UT_hd3, hd5_UT_hd3], 1)) + h1_PT_hd2 = self.h1_PT_hd2_cbr(self.h1_PT_hd2(h1)) + h2_Cat_hd2 = self.h2_Cat_hd2_cbr(h2) + hd3_UT_hd2 = self.hd3_UT_hd2_cbr(self.hd3_UT_hd2(hd3)) + hd4_UT_hd2 = self.hd4_UT_hd2_cbr(self.hd4_UT_hd2(hd4)) + hd5_UT_hd2 = self.hd5_UT_hd2_cbr(self.hd5_UT_hd2(hd5)) + # hd2->160*160*up_channels + hd2 = self.cbr2d_1( + paddle.concat( + [h1_PT_hd2, h2_Cat_hd2, hd3_UT_hd2, hd4_UT_hd2, hd5_UT_hd2], 1)) + h1_Cat_hd1 = self.h1_Cat_hd1_cbr(h1) + hd2_UT_hd1 = self.hd2_UT_hd1_cbr(self.hd2_UT_hd1(hd2)) + hd3_UT_hd1 = self.hd3_UT_hd1_cbr(self.hd3_UT_hd1(hd3)) + hd4_UT_hd1 = self.hd4_UT_hd1_cbr(self.hd4_UT_hd1(hd4)) + hd5_UT_hd1 = self.hd5_UT_hd1_cbr(self.hd5_UT_hd1(hd5)) + # hd1->320*320*up_channels + hd1 = self.cbr1d_1( + paddle.concat( + [h1_Cat_hd1, hd2_UT_hd1, hd3_UT_hd1, hd4_UT_hd1, hd5_UT_hd1], + 1)) + return [hd1, hd2, hd3, hd4, hd5] + + +class DeepSup(nn.Layer): + def __init__(self, up_channels, filters, num_classes): + super(DeepSup, self).__init__() + self.convup5 = ConvUp2D(filters[4], num_classes, 16) + self.convup4 = ConvUp2D(up_channels, num_classes, 8) + self.convup3 = ConvUp2D(up_channels, num_classes, 4) + self.convup2 = ConvUp2D(up_channels, num_classes, 2) + self.outconv1 = nn.Conv2D(up_channels, num_classes, 3, padding=1) + + def forward(self, inputs): + hd1, hd2, hd3, hd4, hd5 = inputs + d5 = self.convup5(hd5) # 16->256 + d4 = self.convup4(hd4) # 32->256 + d3 = self.convup3(hd3) # 64->256 + d2 = self.convup2(hd2) # 128->256 + d1 = self.outconv1(hd1) # 256 + return [d1, d2, d3, d4, d5] + + +class ConvBnReLU2D(nn.Sequential): + def __init__(self, in_channels, out_channels): + super(ConvBnReLU2D, self).__init__( + nn.Conv2D(in_channels, out_channels, 3, padding=1), + nn.BatchNorm(out_channels), nn.ReLU()) + + +class ConvUp2D(nn.Sequential): + def __init__(self, in_channels, out_channels, scale_factor): + super(ConvUp2D, self).__init__( + nn.Conv2D(in_channels, out_channels, 3, padding=1), + nn.Upsample(scale_factor=scale_factor, mode='bilinear')) + + +class MaxPoolConv2D(nn.Sequential): + def __init__(self, in_channels, out_channels, is_batchnorm): + super(MaxPoolConv2D, self).__init__( + nn.MaxPool2D(kernel_size=2), + UnetConv2D(in_channels, out_channels, is_batchnorm)) + + +class UnetConv2D(nn.Layer): + def __init__(self, + in_channels, + out_channels, + is_batchnorm, + num_conv=2, + kernel_size=3, + stride=1, + padding=1): + super(UnetConv2D, self).__init__() + self.num_conv = num_conv + for i in range(num_conv): + conv = (nn.Sequential(nn.Conv2D(in_channels, out_channels, kernel_size, stride, padding), + nn.BatchNorm(out_channels), + nn.ReLU()) \ + if is_batchnorm else \ + nn.Sequential(nn.Conv2D(in_channels, out_channels, kernel_size, stride, padding), + nn.ReLU())) + setattr(self, 'conv%d' % (i + 1), conv) + in_channels = out_channels + # initialise the blocks + for children in self.children(): + children.weight_attr = paddle.framework.ParamAttr( + initializer=paddle.nn.initializer.KaimingNormal) + children.bias_attr = paddle.framework.ParamAttr( + initializer=paddle.nn.initializer.KaimingNormal) + + def forward(self, inputs): + x = inputs + for i in range(self.num_conv): + conv = getattr(self, 'conv%d' % (i + 1)) + x = conv(x) + return x diff --git 
a/paddlers/models/ppseg/models/unet_plusplus.py b/paddlers/models/ppseg/models/unet_plusplus.py new file mode 100644 index 0000000..10d1189 --- /dev/null +++ b/paddlers/models/ppseg/models/unet_plusplus.py @@ -0,0 +1,236 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn + +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.utils import load_entire_model +from paddlers.models.ppseg.cvlibs.param_init import kaiming_normal_init +from paddlers.models.ppseg.models.layers.layer_libs import SyncBatchNorm + + +@manager.MODELS.add_component +class UNetPlusPlus(nn.Layer): + """ + The UNet++ implementation based on PaddlePaddle. + + The original article refers to + Zongwei Zhou, et, al. "UNet++: A Nested U-Net Architecture for Medical Image Segmentation" + (https://arxiv.org/abs/1807.10165). + + Args: + in_channels (int): The channel number of input image. + num_classes (int): The unique number of target classes. + use_deconv (bool, optional): A bool value indicates whether using deconvolution in upsampling. + If False, use resize_bilinear. Default: False. + align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature + is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. + pretrained (str, optional): The path or url of pretrained model for fine tuning. Default: None. + is_ds (bool): use deep supervision or not. 
Default: True + """ + + def __init__(self, + in_channels, + num_classes, + use_deconv=False, + align_corners=False, + pretrained=None, + is_ds=True): + super(UNetPlusPlus, self).__init__() + self.pretrained = pretrained + self.is_ds = is_ds + channels = [32, 64, 128, 256, 512] + self.pool = nn.MaxPool2D(kernel_size=2, stride=2) + + self.conv0_0 = DoubleConv(in_channels, channels[0]) + self.conv1_0 = DoubleConv(channels[0], channels[1]) + self.conv2_0 = DoubleConv(channels[1], channels[2]) + self.conv3_0 = DoubleConv(channels[2], channels[3]) + self.conv4_0 = DoubleConv(channels[3], channels[4]) + + self.up_cat0_1 = UpSampling( + channels[1], + channels[0], + n_cat=2, + use_deconv=use_deconv, + align_corners=align_corners) + self.up_cat1_1 = UpSampling( + channels[2], + channels[1], + n_cat=2, + use_deconv=use_deconv, + align_corners=align_corners) + self.up_cat2_1 = UpSampling( + channels[3], + channels[2], + n_cat=2, + use_deconv=use_deconv, + align_corners=align_corners) + self.up_cat3_1 = UpSampling( + channels[4], + channels[3], + n_cat=2, + use_deconv=use_deconv, + align_corners=align_corners) + + self.up_cat0_2 = UpSampling( + channels[1], + channels[0], + n_cat=3, + use_deconv=use_deconv, + align_corners=align_corners) + self.up_cat1_2 = UpSampling( + channels[2], + channels[1], + n_cat=3, + use_deconv=use_deconv, + align_corners=align_corners) + self.up_cat2_2 = UpSampling( + channels[3], + channels[2], + n_cat=3, + use_deconv=use_deconv, + align_corners=align_corners) + + self.up_cat0_3 = UpSampling( + channels[1], + channels[0], + n_cat=4, + use_deconv=use_deconv, + align_corners=align_corners) + self.up_cat1_3 = UpSampling( + channels[2], + channels[1], + n_cat=4, + use_deconv=use_deconv, + align_corners=align_corners) + + self.up_cat0_4 = UpSampling( + channels[1], + channels[0], + n_cat=5, + use_deconv=use_deconv, + align_corners=align_corners) + + self.out_1 = nn.Conv2D(channels[0], num_classes, 1, 1, 0) + self.out_2 = nn.Conv2D(channels[0], num_classes, 1, 1, 0) + self.out_3 = nn.Conv2D(channels[0], num_classes, 1, 1, 0) + self.out_4 = nn.Conv2D(channels[0], num_classes, 1, 1, 0) + + self.init_weight() + + def init_weight(self): + if self.pretrained is not None: + load_entire_model(self, self.pretrained) + else: + for sublayer in self.sublayers(): + if isinstance(sublayer, nn.Conv2D): + kaiming_normal_init(sublayer.weight) + elif isinstance(sublayer, (nn.BatchNorm, nn.SyncBatchNorm)): + kaiming_normal_init(sublayer.weight) + + def forward(self, inputs): + # 0 down + X0_0 = self.conv0_0(inputs) # n,32,h,w + pool_0 = self.pool(X0_0) # n,32,h/2,w/2 + X1_0 = self.conv1_0(pool_0) # n,64,h/2,w/2 + pool_1 = self.pool(X1_0) # n,64,h/4,w/4 + X2_0 = self.conv2_0(pool_1) # n,128,h/4,w/4 + pool_2 = self.pool(X2_0) # n,128,h/8,n/8 + X3_0 = self.conv3_0(pool_2) # n,256,h/8,w/8 + pool_3 = self.pool(X3_0) # n,256,h/16,w/16 + X4_0 = self.conv4_0(pool_3) # n,512,h/16,w/16 + + # 1 up+concat + X0_1 = self.up_cat0_1(X1_0, X0_0) # n,32,h,w + X1_1 = self.up_cat1_1(X2_0, X1_0) # n,64,h/2,w/2 + X2_1 = self.up_cat2_1(X3_0, X2_0) # n,128,h/4,w/4 + X3_1 = self.up_cat3_1(X4_0, X3_0) # n,256,h/8,w/8 + + # 2 up+concat + X0_2 = self.up_cat0_2(X1_1, X0_0, X0_1) # n,32,h,w + X1_2 = self.up_cat1_2(X2_1, X1_0, X1_1) # n,64,h/2,w/2 + X2_2 = self.up_cat2_2(X3_1, X2_0, X2_1) # n,128,h/4,w/4 + + # 3 up+concat + X0_3 = self.up_cat0_3(X1_2, X0_0, X0_1, X0_2) # n,32,h,w + X1_3 = self.up_cat1_3(X2_2, X1_0, X1_1, X1_2) # n,64,h/2,w/2 + + # 4 up+concat + X0_4 = self.up_cat0_4(X1_3, X0_0, X0_1, X0_2, X0_3) # n,32,h,w + + 
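+        # Each X0_j is a full-resolution, 32-channel feature from one nested
+        # skip pathway; the 1x1 heads below map them to per-pathway logits,
+        # which are averaged when deep supervision (is_ds) is enabled.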
# out conv1*1 + out_1 = self.out_1(X0_1) # n,num_classes,h,w + out_2 = self.out_2(X0_2) # n,num_classes,h,w + out_3 = self.out_3(X0_3) # n,num_classes,h,w + out_4 = self.out_4(X0_4) # n,num_classes,h,w + + output = (out_1 + out_2 + out_3 + out_4) / 4 + + if self.is_ds: + return [output] + else: + return [out_4] + + +class DoubleConv(nn.Layer): + def __init__(self, + in_channels, + out_channels, + filter_size=3, + stride=1, + padding=1): + super(DoubleConv, self).__init__() + self.conv = nn.Sequential( + nn.Conv2D(in_channels, out_channels, filter_size, stride, padding), + SyncBatchNorm(out_channels), nn.ReLU(), + nn.Conv2D(out_channels, out_channels, filter_size, stride, padding), + SyncBatchNorm(out_channels), nn.ReLU()) + + def forward(self, inputs): + conv = self.conv(inputs) + + return conv + + +class UpSampling(nn.Layer): + def __init__(self, + in_channels, + out_channels, + n_cat, + use_deconv=False, + align_corners=False): + super(UpSampling, self).__init__() + if use_deconv: + self.up = nn.Conv2DTranspose( + in_channels, out_channels, kernel_size=2, stride=2, padding=0) + else: + self.up = nn.Sequential( + nn.Upsample( + scale_factor=2, + mode='bilinear', + align_corners=align_corners), + nn.Conv2D(in_channels, out_channels, 1, 1, 0)) + + self.conv = DoubleConv(n_cat * out_channels, out_channels) + + def forward(self, high_feature, *low_features): + features = [self.up(high_feature)] + for feature in low_features: + features.append(feature) + cat_features = paddle.concat(features, axis=1) + out = self.conv(cat_features) + + return out diff --git a/paddlers/models/ppseg/transforms/__init__.py b/paddlers/models/ppseg/transforms/__init__.py new file mode 100644 index 0000000..8f1d5ae --- /dev/null +++ b/paddlers/models/ppseg/transforms/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .transforms import * +from . import functional diff --git a/paddlers/models/ppseg/transforms/functional.py b/paddlers/models/ppseg/transforms/functional.py new file mode 100644 index 0000000..bcd4824 --- /dev/null +++ b/paddlers/models/ppseg/transforms/functional.py @@ -0,0 +1,177 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
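+
+# The helpers below operate on numpy arrays: images in HWC layout and label
+# masks in HW layout. As a quick orientation, normalize() computes
+# (x / 255 - mean) / std; e.g. with mean = std = 0.5, a uint8 pixel of 255
+# maps to (1.0 - 0.5) / 0.5 = 1.0 and a pixel of 0 maps to -1.0.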
+
+import cv2
+import numpy as np
+from PIL import Image, ImageEnhance
+from scipy.ndimage.morphology import distance_transform_edt
+
+
+def normalize(im, mean, std):
+    im = im.astype(np.float32, copy=False) / 255.0
+    im -= mean
+    im /= std
+    return im
+
+
+def resize(im, target_size=608, interp=cv2.INTER_LINEAR):
+    if isinstance(target_size, list) or isinstance(target_size, tuple):
+        w = target_size[0]
+        h = target_size[1]
+    else:
+        w = target_size
+        h = target_size
+    im = cv2.resize(im, (w, h), interpolation=interp)
+    return im
+
+
+def resize_long(im, long_size=224, interpolation=cv2.INTER_LINEAR):
+    value = max(im.shape[0], im.shape[1])
+    scale = float(long_size) / float(value)
+    resized_width = int(round(im.shape[1] * scale))
+    resized_height = int(round(im.shape[0] * scale))
+
+    im = cv2.resize(
+        im, (resized_width, resized_height), interpolation=interpolation)
+    return im
+
+
+def resize_short(im, short_size=224, interpolation=cv2.INTER_LINEAR):
+    value = min(im.shape[0], im.shape[1])
+    scale = float(short_size) / float(value)
+    resized_width = int(round(im.shape[1] * scale))
+    resized_height = int(round(im.shape[0] * scale))
+
+    im = cv2.resize(
+        im, (resized_width, resized_height), interpolation=interpolation)
+    return im
+
+
+def horizontal_flip(im):
+    if len(im.shape) == 3:
+        im = im[:, ::-1, :]
+    elif len(im.shape) == 2:
+        im = im[:, ::-1]
+    return im
+
+
+def vertical_flip(im):
+    if len(im.shape) == 3:
+        im = im[::-1, :, :]
+    elif len(im.shape) == 2:
+        im = im[::-1, :]
+    return im
+
+
+def brightness(im, brightness_lower, brightness_upper):
+    brightness_delta = np.random.uniform(brightness_lower, brightness_upper)
+    im = ImageEnhance.Brightness(im).enhance(brightness_delta)
+    return im
+
+
+def contrast(im, contrast_lower, contrast_upper):
+    contrast_delta = np.random.uniform(contrast_lower, contrast_upper)
+    im = ImageEnhance.Contrast(im).enhance(contrast_delta)
+    return im
+
+
+def saturation(im, saturation_lower, saturation_upper):
+    saturation_delta = np.random.uniform(saturation_lower, saturation_upper)
+    im = ImageEnhance.Color(im).enhance(saturation_delta)
+    return im
+
+
+def hue(im, hue_lower, hue_upper):
+    hue_delta = np.random.uniform(hue_lower, hue_upper)
+    im = np.array(im.convert('HSV'))
+    im[:, :, 0] = im[:, :, 0] + hue_delta
+    im = Image.fromarray(im, mode='HSV').convert('RGB')
+    return im
+
+
+def sharpness(im, sharpness_lower, sharpness_upper):
+    sharpness_delta = np.random.uniform(sharpness_lower, sharpness_upper)
+    im = ImageEnhance.Sharpness(im).enhance(sharpness_delta)
+    return im
+
+
+def rotate(im, rotate_lower, rotate_upper):
+    rotate_delta = np.random.uniform(rotate_lower, rotate_upper)
+    im = im.rotate(int(rotate_delta))
+    return im
+
+
+def mask_to_onehot(mask, num_classes):
+    """
+    Convert a mask (H, W) to a one-hot mask (K, H, W).
+
+    Args:
+        mask (np.ndarray): Label mask with shape (H, W).
+        num_classes (int): Number of classes.
+
+    Returns:
+        np.ndarray: One-hot mask with shape (K, H, W).
+    """
+    _mask = [mask == i for i in range(num_classes)]
+    _mask = np.array(_mask).astype(np.uint8)
+    return _mask
+
+
+def onehot_to_binary_edge(mask, radius):
+    """
+    Convert a one-hot mask (K, H, W) to an edge mask.
+
+    Args:
+        mask (np.ndarray): One-hot mask with shape (K, H, W).
+        radius (int|float): Radius of edge.
+
+    Returns:
+        np.ndarray: Edge mask with shape (1, H, W).
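+
+    Example:
+        An illustrative sketch; pixels within `radius` of a class boundary
+        become 1:
+
+            onehot = mask_to_onehot(np.array([[0, 0, 1], [0, 1, 1]]), 2)
+            edge = onehot_to_binary_edge(onehot, radius=1)  # shape (1, 2, 3)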
+ """ + if radius < 1: + raise ValueError('`radius` should be greater than or equal to 1') + num_classes = mask.shape[0] + + edge = np.zeros(mask.shape[1:]) + # pad borders + mask = np.pad( + mask, ((0, 0), (1, 1), (1, 1)), mode='constant', constant_values=0) + for i in range(num_classes): + dist = distance_transform_edt( + mask[i, :]) + distance_transform_edt(1.0 - mask[i, :]) + dist = dist[1:-1, 1:-1] + dist[dist > radius] = 0 + edge += dist + + edge = np.expand_dims(edge, axis=0) + edge = (edge > 0).astype(np.uint8) + return edge + + +def mask_to_binary_edge(mask, radius, num_classes): + """ + Convert a segmentic segmentation mask (H, W) to a binary edge mask(H, W). + + Args: + mask (np.ndarray): Label mask with shape (H, W) + radius (int|float): Radius of edge. + num_classes (int): Number of classes. + + Returns: + np.ndarray: Edge mask with shape(H, W). + """ + mask = mask.squeeze() + onehot = mask_to_onehot(mask, num_classes) + edge = onehot_to_binary_edge(onehot, radius) + return edge diff --git a/paddlers/models/ppseg/transforms/transforms.py b/paddlers/models/ppseg/transforms/transforms.py new file mode 100644 index 0000000..aa1c6f3 --- /dev/null +++ b/paddlers/models/ppseg/transforms/transforms.py @@ -0,0 +1,1271 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random +import math + +import cv2 +import numpy as np +from PIL import Image + +from paddlers.models.ppseg.cvlibs import manager +from paddlers.models.ppseg.transforms import functional + + +@manager.TRANSFORMS.add_component +class Compose: + """ + Do transformation on input data with corresponding pre-processing and augmentation operations. + The shape of input data to all operations is [height, width, channels]. + + Args: + transforms (list): A list contains data pre-processing or augmentation. Empty list means only reading images, no transformation. + to_rgb (bool, optional): If converting image to RGB color space. Default: True. + + Raises: + TypeError: When 'transforms' is not a list. + ValueError: when the length of 'transforms' is less than 1. + """ + + def __init__(self, transforms, to_rgb=True): + if not isinstance(transforms, list): + raise TypeError('The transforms must be a list!') + self.transforms = transforms + self.to_rgb = to_rgb + + def __call__(self, im, label=None): + """ + Args: + im (str|np.ndarray): It is either image path or image object. + label (str|np.ndarray): It is either label path or label ndarray. + + Returns: + (tuple). A tuple including image, image info, and label after transformation. 
+ """ + if isinstance(im, str): + im = cv2.imread(im).astype('float32') + if isinstance(label, str): + label = np.asarray(Image.open(label)) + if im is None: + raise ValueError('Can\'t read The image file {}!'.format(im)) + if self.to_rgb: + im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) + + for op in self.transforms: + outputs = op(im, label) + im = outputs[0] + if len(outputs) == 2: + label = outputs[1] + im = np.transpose(im, (2, 0, 1)) + return (im, label) + + +@manager.TRANSFORMS.add_component +class RandomHorizontalFlip: + """ + Flip an image horizontally with a certain probability. + + Args: + prob (float, optional): A probability of horizontally flipping. Default: 0.5. + """ + + def __init__(self, prob=0.5): + self.prob = prob + + def __call__(self, im, label=None): + if random.random() < self.prob: + im = functional.horizontal_flip(im) + if label is not None: + label = functional.horizontal_flip(label) + if label is None: + return (im,) + else: + return (im, label) + + +@manager.TRANSFORMS.add_component +class RandomVerticalFlip: + """ + Flip an image vertically with a certain probability. + + Args: + prob (float, optional): A probability of vertical flipping. Default: 0.1. + """ + + def __init__(self, prob=0.1): + self.prob = prob + + def __call__(self, im, label=None): + if random.random() < self.prob: + im = functional.vertical_flip(im) + if label is not None: + label = functional.vertical_flip(label) + if label is None: + return (im,) + else: + return (im, label) + + +@manager.TRANSFORMS.add_component +class Resize: + """ + Resize an image. + + Args: + target_size (list|tuple, optional): The target size of image. Default: (512, 512). + interp (str, optional): The interpolation mode of resize is consistent with opencv. + ['NEAREST', 'LINEAR', 'CUBIC', 'AREA', 'LANCZOS4', 'RANDOM']. Note that when it is + 'RANDOM', a random interpolation mode would be specified. Default: "LINEAR". + + Raises: + TypeError: When 'target_size' type is neither list nor tuple. + ValueError: When "interp" is out of pre-defined methods ('NEAREST', 'LINEAR', 'CUBIC', + 'AREA', 'LANCZOS4', 'RANDOM'). + """ + + # The interpolation mode + interp_dict = { + 'NEAREST': cv2.INTER_NEAREST, + 'LINEAR': cv2.INTER_LINEAR, + 'CUBIC': cv2.INTER_CUBIC, + 'AREA': cv2.INTER_AREA, + 'LANCZOS4': cv2.INTER_LANCZOS4 + } + + def __init__(self, target_size=(512, 512), interp='LINEAR'): + self.interp = interp + if not (interp == "RANDOM" or interp in self.interp_dict): + raise ValueError("`interp` should be one of {}".format( + self.interp_dict.keys())) + if isinstance(target_size, list) or isinstance(target_size, tuple): + if len(target_size) != 2: + raise ValueError( + '`target_size` should include 2 elements, but it is {}'. + format(target_size)) + else: + raise TypeError( + "Type of `target_size` is invalid. It should be list or tuple, but it is {}" + .format(type(target_size))) + + self.target_size = target_size + + def __call__(self, im, label=None): + """ + Args: + im (np.ndarray): The Image data. + label (np.ndarray, optional): The label data. Default: None. + + Returns: + (tuple). When label is None, it returns (im, ), otherwise it returns (im, label), + + Raises: + TypeError: When the 'img' type is not numpy. + ValueError: When the length of "im" shape is not 3. 
+ """ + + if not isinstance(im, np.ndarray): + raise TypeError("Resize: image type is not numpy.") + if len(im.shape) != 3: + raise ValueError('Resize: image is not 3-dimensional.') + if self.interp == "RANDOM": + interp = random.choice(list(self.interp_dict.keys())) + else: + interp = self.interp + im = functional.resize(im, self.target_size, self.interp_dict[interp]) + if label is not None: + label = functional.resize(label, self.target_size, + cv2.INTER_NEAREST) + + if label is None: + return (im,) + else: + return (im, label) + + +@manager.TRANSFORMS.add_component +class ResizeByLong: + """ + Resize the long side of an image to given size, and then scale the other side proportionally. + + Args: + long_size (int): The target size of long side. + """ + + def __init__(self, long_size): + self.long_size = long_size + + def __call__(self, im, label=None): + """ + Args: + im (np.ndarray): The Image data. + label (np.ndarray, optional): The label data. Default: None. + + Returns: + (tuple). When label is None, it returns (im, ), otherwise it returns (im, label). + """ + + im = functional.resize_long(im, self.long_size) + if label is not None: + label = functional.resize_long(label, self.long_size, + cv2.INTER_NEAREST) + + if label is None: + return (im,) + else: + return (im, label) + + +@manager.TRANSFORMS.add_component +class ResizeByShort: + """ + Resize the short side of an image to given size, and then scale the other side proportionally. + + Args: + short_size (int): The target size of short side. + """ + + def __init__(self, short_size): + self.short_size = short_size + + def __call__(self, im, label=None): + """ + Args: + im (np.ndarray): The Image data. + label (np.ndarray, optional): The label data. Default: None. + + Returns: + (tuple). When label is None, it returns (im, ), otherwise it returns (im, label). + """ + + im = functional.resize_short(im, self.short_size) + if label is not None: + label = functional.resize_short(label, self.short_size, + cv2.INTER_NEAREST) + + if label is None: + return (im,) + else: + return (im, label) + + +@manager.TRANSFORMS.add_component +class LimitLong: + """ + Limit the long edge of image. + + If the long edge is larger than max_long, resize the long edge + to max_long, while scale the short edge proportionally. + + If the long edge is smaller than min_long, resize the long edge + to min_long, while scale the short edge proportionally. + + Args: + max_long (int, optional): If the long edge of image is larger than max_long, + it will be resize to max_long. Default: None. + min_long (int, optional): If the long edge of image is smaller than min_long, + it will be resize to min_long. Default: None. + """ + + def __init__(self, max_long=None, min_long=None): + if max_long is not None: + if not isinstance(max_long, int): + raise TypeError( + "Type of `max_long` is invalid. It should be int, but it is {}" + .format(type(max_long))) + if min_long is not None: + if not isinstance(min_long, int): + raise TypeError( + "Type of `min_long` is invalid. It should be int, but it is {}" + .format(type(min_long))) + if (max_long is not None) and (min_long is not None): + if min_long > max_long: + raise ValueError( + '`max_long should not smaller than min_long, but they are {} and {}' + .format(max_long, min_long)) + self.max_long = max_long + self.min_long = min_long + + def __call__(self, im, label=None): + """ + Args: + im (np.ndarray): The Image data. + label (np.ndarray, optional): The label data. Default: None. + + Returns: + (tuple). 
+        """
+        h, w = im.shape[0], im.shape[1]
+        long_edge = max(h, w)
+        target = long_edge
+        if (self.max_long is not None) and (long_edge > self.max_long):
+            target = self.max_long
+        elif (self.min_long is not None) and (long_edge < self.min_long):
+            target = self.min_long
+
+        if target != long_edge:
+            im = functional.resize_long(im, target)
+            if label is not None:
+                label = functional.resize_long(label, target,
+                                               cv2.INTER_NEAREST)
+
+        if label is None:
+            return (im,)
+        else:
+            return (im, label)
+
+
+@manager.TRANSFORMS.add_component
+class ResizeRangeScaling:
+    """
+    Resize the long side of an image into a range, and then scale the other side proportionally.
+
+    Args:
+        min_value (int, optional): The minimum value of the long side after resize. Default: 400.
+        max_value (int, optional): The maximum value of the long side after resize. Default: 600.
+    """
+
+    def __init__(self, min_value=400, max_value=600):
+        if min_value > max_value:
+            raise ValueError('min_value must not be greater than max_value, '
+                             'but they are {} and {}.'.format(
+                                 min_value, max_value))
+        self.min_value = min_value
+        self.max_value = max_value
+
+    def __call__(self, im, label=None):
+        """
+        Args:
+            im (np.ndarray): The image data.
+            label (np.ndarray, optional): The label data. Default: None.
+
+        Returns:
+            (tuple). When label is None, it returns (im, ), otherwise it returns (im, label).
+        """
+
+        if self.min_value == self.max_value:
+            random_size = self.max_value
+        else:
+            random_size = int(
+                np.random.uniform(self.min_value, self.max_value) + 0.5)
+        im = functional.resize_long(im, random_size, cv2.INTER_LINEAR)
+        if label is not None:
+            label = functional.resize_long(label, random_size,
+                                           cv2.INTER_NEAREST)
+
+        if label is None:
+            return (im,)
+        else:
+            return (im, label)
+
+
+@manager.TRANSFORMS.add_component
+class ResizeStepScaling:
+    """
+    Scale an image proportionally within a range.
+
+    Args:
+        min_scale_factor (float, optional): The minimum scale. Default: 0.75.
+        max_scale_factor (float, optional): The maximum scale. Default: 1.25.
+        scale_step_size (float, optional): The scale interval. Default: 0.25.
+
+    Raises:
+        ValueError: When min_scale_factor is greater than max_scale_factor.
+    """
+
+    def __init__(self,
+                 min_scale_factor=0.75,
+                 max_scale_factor=1.25,
+                 scale_step_size=0.25):
+        if min_scale_factor > max_scale_factor:
+            raise ValueError(
+                'min_scale_factor must not be greater than max_scale_factor, '
+                'but they are {} and {}.'.format(min_scale_factor,
+                                                 max_scale_factor))
+        self.min_scale_factor = min_scale_factor
+        self.max_scale_factor = max_scale_factor
+        self.scale_step_size = scale_step_size
+
+    def __call__(self, im, label=None):
+        """
+        Args:
+            im (np.ndarray): The image data.
+            label (np.ndarray, optional): The label data. Default: None.
+
+        Returns:
+            (tuple). When label is None, it returns (im, ), otherwise it returns (im, label).
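+
+        Example:
+            An illustrative sketch of the step sampling: with the default
+            min_scale_factor=0.75, max_scale_factor=1.25 and
+            scale_step_size=0.25, num_steps = int((1.25 - 0.75) / 0.25 + 1) = 3,
+            so the candidate factors are [0.75, 1.0, 1.25]; one of them is
+            drawn at random and applied to both sides of the image.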
+ """ + + if self.min_scale_factor == self.max_scale_factor: + scale_factor = self.min_scale_factor + + elif self.scale_step_size == 0: + scale_factor = np.random.uniform(self.min_scale_factor, + self.max_scale_factor) + + else: + num_steps = int((self.max_scale_factor - self.min_scale_factor) / + self.scale_step_size + 1) + scale_factors = np.linspace(self.min_scale_factor, + self.max_scale_factor, + num_steps).tolist() + np.random.shuffle(scale_factors) + scale_factor = scale_factors[0] + w = int(round(scale_factor * im.shape[1])) + h = int(round(scale_factor * im.shape[0])) + + im = functional.resize(im, (w, h), cv2.INTER_LINEAR) + if label is not None: + label = functional.resize(label, (w, h), cv2.INTER_NEAREST) + + if label is None: + return (im,) + else: + return (im, label) + + +@manager.TRANSFORMS.add_component +class Normalize: + """ + Normalize an image. + + Args: + mean (list, optional): The mean value of a data set. Default: [0.5, 0.5, 0.5]. + std (list, optional): The standard deviation of a data set. Default: [0.5, 0.5, 0.5]. + + Raises: + ValueError: When mean/std is not list or any value in std is 0. + """ + + def __init__(self, mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)): + self.mean = mean + self.std = std + if not (isinstance(self.mean, (list, tuple)) + and isinstance(self.std, (list, tuple))): + raise ValueError( + "{}: input type is invalid. It should be list or tuple".format( + self)) + from functools import reduce + if reduce(lambda x, y: x * y, self.std) == 0: + raise ValueError('{}: std is invalid!'.format(self)) + + def __call__(self, im, label=None): + """ + Args: + im (np.ndarray): The Image data. + label (np.ndarray, optional): The label data. Default: None. + + Returns: + (tuple). When label is None, it returns (im, ), otherwise it returns (im, label). + """ + + mean = np.array(self.mean)[np.newaxis, np.newaxis, :] + std = np.array(self.std)[np.newaxis, np.newaxis, :] + im = functional.normalize(im, mean, std) + + if label is None: + return (im,) + else: + return (im, label) + + +@manager.TRANSFORMS.add_component +class Padding: + """ + Add bottom-right padding to a raw image or annotation image. + + Args: + target_size (list|tuple): The target size after padding. + im_padding_value (list, optional): The padding value of raw image. + Default: [127.5, 127.5, 127.5]. + label_padding_value (int, optional): The padding value of annotation image. Default: 255. + + Raises: + TypeError: When target_size is neither list nor tuple. + ValueError: When the length of target_size is not 2. + """ + + def __init__(self, + target_size, + im_padding_value=(127.5, 127.5, 127.5), + label_padding_value=255): + if isinstance(target_size, list) or isinstance(target_size, tuple): + if len(target_size) != 2: + raise ValueError( + '`target_size` should include 2 elements, but it is {}'. + format(target_size)) + else: + raise TypeError( + "Type of target_size is invalid. It should be list or tuple, now is {}" + .format(type(target_size))) + self.target_size = target_size + self.im_padding_value = im_padding_value + self.label_padding_value = label_padding_value + + def __call__(self, im, label=None): + """ + Args: + im (np.ndarray): The Image data. + label (np.ndarray, optional): The label data. Default: None. + + Returns: + (tuple). When label is None, it returns (im, ), otherwise it returns (im, label). 
+ """ + + im_height, im_width = im.shape[0], im.shape[1] + if isinstance(self.target_size, int): + target_height = self.target_size + target_width = self.target_size + else: + target_height = self.target_size[1] + target_width = self.target_size[0] + pad_height = target_height - im_height + pad_width = target_width - im_width + if pad_height < 0 or pad_width < 0: + raise ValueError( + 'The size of image should be less than `target_size`, but the size of image ({}, {}) is larger than `target_size` ({}, {})' + .format(im_width, im_height, target_width, target_height)) + else: + im = cv2.copyMakeBorder( + im, + 0, + pad_height, + 0, + pad_width, + cv2.BORDER_CONSTANT, + value=self.im_padding_value) + if label is not None: + label = cv2.copyMakeBorder( + label, + 0, + pad_height, + 0, + pad_width, + cv2.BORDER_CONSTANT, + value=self.label_padding_value) + if label is None: + return (im,) + else: + return (im, label) + + +@manager.TRANSFORMS.add_component +class PaddingByAspectRatio: + """ + + Args: + aspect_ratio (int|float, optional): The aspect ratio = width / height. Default: 1. + """ + + def __init__(self, + aspect_ratio=1, + im_padding_value=(127.5, 127.5, 127.5), + label_padding_value=255): + self.aspect_ratio = aspect_ratio + self.im_padding_value = im_padding_value + self.label_padding_value = label_padding_value + + def __call__(self, im, label=None): + """ + Args: + im (np.ndarray): The Image data. + label (np.ndarray, optional): The label data. Default: None. + + Returns: + (tuple). When label is None, it returns (im, ), otherwise it returns (im, label). + """ + + img_height = im.shape[0] + img_width = im.shape[1] + ratio = img_width / img_height + if ratio == self.aspect_ratio: + if label is None: + return (im,) + else: + return (im, label) + elif ratio > self.aspect_ratio: + img_height = int(img_width / self.aspect_ratio) + else: + img_width = int(img_height * self.aspect_ratio) + padding = Padding((img_width, img_height), + im_padding_value=self.im_padding_value, + label_padding_value=self.label_padding_value) + return padding(im, label) + + +@manager.TRANSFORMS.add_component +class RandomPaddingCrop: + """ + Crop a sub-image from a raw image and annotation image randomly. If the target cropping size + is larger than original image, then the bottom-right padding will be added. + + Args: + crop_size (tuple, optional): The target cropping size. Default: (512, 512). + im_padding_value (list, optional): The padding value of raw image. + Default: [127.5, 127.5, 127.5]. + label_padding_value (int, optional): The padding value of annotation image. Default: 255. + + Raises: + TypeError: When crop_size is neither list nor tuple. + ValueError: When the length of crop_size is not 2. + """ + + def __init__(self, + crop_size=(512, 512), + im_padding_value=(127.5, 127.5, 127.5), + label_padding_value=255): + if isinstance(crop_size, list) or isinstance(crop_size, tuple): + if len(crop_size) != 2: + raise ValueError( + 'Type of `crop_size` is list or tuple. It should include 2 elements, but it is {}' + .format(crop_size)) + else: + raise TypeError( + "The type of `crop_size` is invalid. It should be list or tuple, but it is {}" + .format(type(crop_size))) + self.crop_size = crop_size + self.im_padding_value = im_padding_value + self.label_padding_value = label_padding_value + + def __call__(self, im, label=None): + """ + Args: + im (np.ndarray): The Image data. + label (np.ndarray, optional): The label data. Default: None. + + Returns: + (tuple). 
When label is None, it returns (im, ), otherwise it returns (im, label).
+        """
+
+        if isinstance(self.crop_size, int):
+            crop_width = self.crop_size
+            crop_height = self.crop_size
+        else:
+            crop_width = self.crop_size[0]
+            crop_height = self.crop_size[1]
+
+        img_height = im.shape[0]
+        img_width = im.shape[1]
+
+        if img_height == crop_height and img_width == crop_width:
+            if label is None:
+                return (im,)
+            else:
+                return (im, label)
+        else:
+            pad_height = max(crop_height - img_height, 0)
+            pad_width = max(crop_width - img_width, 0)
+            if (pad_height > 0 or pad_width > 0):
+                im = cv2.copyMakeBorder(
+                    im,
+                    0,
+                    pad_height,
+                    0,
+                    pad_width,
+                    cv2.BORDER_CONSTANT,
+                    value=self.im_padding_value)
+                if label is not None:
+                    label = cv2.copyMakeBorder(
+                        label,
+                        0,
+                        pad_height,
+                        0,
+                        pad_width,
+                        cv2.BORDER_CONSTANT,
+                        value=self.label_padding_value)
+                img_height = im.shape[0]
+                img_width = im.shape[1]
+
+            if crop_height > 0 and crop_width > 0:
+                h_off = np.random.randint(img_height - crop_height + 1)
+                w_off = np.random.randint(img_width - crop_width + 1)
+
+                im = im[h_off:(crop_height + h_off), w_off:(
+                    w_off + crop_width), :]
+                if label is not None:
+                    label = label[h_off:(crop_height + h_off), w_off:(
+                        w_off + crop_width)]
+            if label is None:
+                return (im,)
+            else:
+                return (im, label)
+
+
+@manager.TRANSFORMS.add_component
+class RandomCenterCrop:
+    """
+    Crop the given input data at the center with a random margin.
+
+    Args:
+        retain_ratio (tuple|list, optional): Must contain 2 elements. The first
+            value is the minimum retained ratio of the width and the second is
+            that of the height, i.e. the minimum size of the cropped image is
+            [width * retain_ratio[0], height * retain_ratio[1]]. Default: (0.5, 0.5).
+
+    Raises:
+        TypeError: When retain_ratio is neither list nor tuple.
+        ValueError: When any value of retain_ratio is not in [0, 1].
+    """
+
+    def __init__(self, retain_ratio=(0.5, 0.5)):
+        if isinstance(retain_ratio, list) or isinstance(retain_ratio, tuple):
+            if len(retain_ratio) != 2:
+                raise ValueError(
+                    'When type of `retain_ratio` is list or tuple, it should include 2 elements, but it is {}'
+                    .format(retain_ratio))
+            if retain_ratio[0] > 1 or retain_ratio[1] > 1 or retain_ratio[
+                    0] < 0 or retain_ratio[1] < 0:
+                raise ValueError(
+                    'Value of `retain_ratio` should be in [0, 1], but it is {}'.
+                    format(retain_ratio))
+        else:
+            raise TypeError(
+                "The type of `retain_ratio` is invalid. It should be list or tuple, but it is {}"
+                .format(type(retain_ratio)))
+        self.retain_ratio = retain_ratio
+
+    def __call__(self, im, label=None):
+        """
+        Args:
+            im (np.ndarray): The Image data.
+            label (np.ndarray, optional): The label data. Default: None.
+
+        Returns:
+            (tuple). When label is None, it returns (im, ), otherwise it returns (im, label).
+        """
+        retain_width = self.retain_ratio[0]
+        retain_height = self.retain_ratio[1]
+
+        img_height = im.shape[0]
+        img_width = im.shape[1]
+
+        if retain_width == 1. and retain_height == 1.:
+            if label is None:
+                return (im,)
+            else:
+                return (im, label)
+        else:
+            randw = np.random.randint(img_width * (1 - retain_width))
+            randh = np.random.randint(img_height * (1 - retain_height))
+            offsetw = 0 if randw == 0 else np.random.randint(randw)
+            offseth = 0 if randh == 0 else np.random.randint(randh)
+            p0, p1, p2, p3 = offseth, img_height + offseth - randh, offsetw, img_width + offsetw - randw
+            im = im[p0:p1, p2:p3, :]
+            if label is not None:
+                # The label is a 2-D array, so it is cropped without a channel axis.
+                label = label[p0:p1, p2:p3]
+
+        if label is None:
+            return (im,)
+        else:
+            return (im, label)
+
+
+@manager.TRANSFORMS.add_component
+class ScalePadding:
+    """
+    Add center padding to a raw image or annotation image, then scale the
+    image to target size.
+
+    Args:
+        target_size (list|tuple, optional): The target size of image. Default: (512, 512).
+        im_padding_value (list, optional): The padding value of raw image.
+            Default: [127.5, 127.5, 127.5].
+        label_padding_value (int, optional): The padding value of annotation image. Default: 255.
+
+    Raises:
+        TypeError: When target_size is neither list nor tuple.
+        ValueError: When the length of target_size is not 2.
+    """
+
+    def __init__(self,
+                 target_size=(512, 512),
+                 im_padding_value=(127.5, 127.5, 127.5),
+                 label_padding_value=255):
+        if isinstance(target_size, list) or isinstance(target_size, tuple):
+            if len(target_size) != 2:
+                raise ValueError(
+                    '`target_size` should include 2 elements, but it is {}'.
+                    format(target_size))
+        else:
+            raise TypeError(
+                "Type of `target_size` is invalid. It should be list or tuple, but it is {}"
+                .format(type(target_size)))
+
+        self.target_size = target_size
+        self.im_padding_value = im_padding_value
+        self.label_padding_value = label_padding_value
+
+    def __call__(self, im, label=None):
+        """
+        Args:
+            im (np.ndarray): The Image data.
+            label (np.ndarray, optional): The label data. Default: None.
+
+        Returns:
+            (tuple). When label is None, it returns (im, ), otherwise it returns (im, label).
+        """
+        height = im.shape[0]
+        width = im.shape[1]
+
+        new_im = np.zeros(
+            (max(height, width), max(height, width), 3)) + self.im_padding_value
+        if label is not None:
+            new_label = np.zeros((max(height, width), max(
+                height, width))) + self.label_padding_value
+
+        if height > width:
+            padding = int((height - width) / 2)
+            new_im[:, padding:padding + width, :] = im
+            if label is not None:
+                new_label[:, padding:padding + width] = label
+        else:
+            padding = int((width - height) / 2)
+            new_im[padding:padding + height, :, :] = im
+            if label is not None:
+                new_label[padding:padding + height, :] = label
+
+        im = np.uint8(new_im)
+        im = functional.resize(im, self.target_size, interp=cv2.INTER_CUBIC)
+        if label is not None:
+            label = np.uint8(new_label)
+            label = functional.resize(
+                label, self.target_size, interp=cv2.INTER_CUBIC)
+        if label is None:
+            return (im,)
+        else:
+            return (im, label)
+
+
+@manager.TRANSFORMS.add_component
+class RandomNoise:
+    """
+    Superimpose Gaussian noise on an image with a certain probability.
+
+    Args:
+        prob (float, optional): The probability of adding noise to an image. Default: 0.5.
+        max_sigma (float, optional): The maximum value of the standard deviation of
+            the noise distribution. Default: 10.0.
+    """
+
+    def __init__(self, prob=0.5, max_sigma=10.0):
+        self.prob = prob
+        self.max_sigma = max_sigma
+
+    def __call__(self, im, label=None):
+        """
+        Args:
+            im (np.ndarray): The Image data.
+            label (np.ndarray, optional): The label data. Default: None.
+
+        Returns:
+            (tuple).
When label is None, it returns (im, ), otherwise it returns (im, label). + """ + if random.random() < self.prob: + mu = 0 + sigma = random.random() * self.max_sigma + im = np.array(im, dtype=np.float32) + im += np.random.normal(mu, sigma, im.shape) + im[im > 255] = 255 + im[im < 0] = 0 + + if label is None: + return (im,) + else: + return (im, label) + + +@manager.TRANSFORMS.add_component +class RandomBlur: + """ + Blurring an image by a Gaussian function with a certain probability. + + Args: + prob (float, optional): A probability of blurring an image. Default: 0.1. + blur_type(str, optional): A type of blurring an image, + gaussian stands for cv2.GaussianBlur, + median stands for cv2.medianBlur, + blur stands for cv2.blur, + random represents randomly selected from above. + Default: gaussian. + """ + + def __init__(self, prob=0.1, blur_type="gaussian"): + self.prob = prob + self.blur_type = blur_type + + def __call__(self, im, label=None): + """ + Args: + im (np.ndarray): The Image data. + label (np.ndarray, optional): The label data. Default: None. + + Returns: + (tuple). When label is None, it returns (im, ), otherwise it returns (im, label). + """ + + if self.prob <= 0: + n = 0 + elif self.prob >= 1: + n = 1 + else: + n = int(1.0 / self.prob) + if n > 0: + if np.random.randint(0, n) == 0: + radius = np.random.randint(3, 10) + if radius % 2 != 1: + radius = radius + 1 + if radius > 9: + radius = 9 + im = np.array(im, dtype='uint8') + if self.blur_type == "gaussian": + im = cv2.GaussianBlur(im, (radius, radius), 0, 0) + elif self.blur_type == "median": + im = cv2.medianBlur(im, radius) + elif self.blur_type == "blur": + im = cv2.blur(im, (radius, radius)) + elif self.blur_type == "random": + select = random.random() + if select < 0.3: + im = cv2.GaussianBlur(im, (radius, radius), 0) + elif select < 0.6: + im = cv2.medianBlur(im, radius) + else: + im = cv2.blur(im, (radius, radius)) + else: + im = cv2.GaussianBlur(im, (radius, radius), 0, 0) + im = np.array(im, dtype='float32') + if label is None: + return (im,) + else: + return (im, label) + + +@manager.TRANSFORMS.add_component +class RandomRotation: + """ + Rotate an image randomly with padding. + + Args: + max_rotation (float, optional): The maximum rotation degree. Default: 15. + im_padding_value (list, optional): The padding value of raw image. + Default: [127.5, 127.5, 127.5]. + label_padding_value (int, optional): The padding value of annotation image. Default: 255. + """ + + def __init__(self, + max_rotation=15, + im_padding_value=(127.5, 127.5, 127.5), + label_padding_value=255): + self.max_rotation = max_rotation + self.im_padding_value = im_padding_value + self.label_padding_value = label_padding_value + + def __call__(self, im, label=None): + """ + Args: + im (np.ndarray): The Image data. + label (np.ndarray, optional): The label data. Default: None. + + Returns: + (tuple). When label is None, it returns (im, ), otherwise it returns (im, label). 
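+
+        Example (a sketch; the arrays stand in for a real image and its 2-D
+        annotation mask):
+
+        .. code-block:: python
+
+            import numpy as np
+
+            rotate = RandomRotation(max_rotation=30)
+            im = np.random.randint(0, 256, (256, 256, 3)).astype('uint8')
+            label = np.zeros((256, 256), dtype='uint8')
+            im, label = rotate(im, label)
+            # The canvas is enlarged to hold the rotated content, so the
+            # output can be larger than 256 x 256.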
+        """
+
+        if self.max_rotation > 0:
+            (h, w) = im.shape[:2]
+            do_rotation = np.random.uniform(-self.max_rotation,
+                                            self.max_rotation)
+            pc = (w // 2, h // 2)
+            r = cv2.getRotationMatrix2D(pc, do_rotation, 1.0)
+            cos = np.abs(r[0, 0])
+            sin = np.abs(r[0, 1])
+
+            nw = int((h * sin) + (w * cos))
+            nh = int((h * cos) + (w * sin))
+
+            (cx, cy) = pc
+            r[0, 2] += (nw / 2) - cx
+            r[1, 2] += (nh / 2) - cy
+            dsize = (nw, nh)
+            im = cv2.warpAffine(
+                im,
+                r,
+                dsize=dsize,
+                flags=cv2.INTER_LINEAR,
+                borderMode=cv2.BORDER_CONSTANT,
+                borderValue=self.im_padding_value)
+            if label is not None:
+                label = cv2.warpAffine(
+                    label,
+                    r,
+                    dsize=dsize,
+                    flags=cv2.INTER_NEAREST,
+                    borderMode=cv2.BORDER_CONSTANT,
+                    borderValue=self.label_padding_value)
+
+        if label is None:
+            return (im,)
+        else:
+            return (im, label)
+
+
+@manager.TRANSFORMS.add_component
+class RandomScaleAspect:
+    """
+    Crop a sub-image from an original image with a range of area ratios and
+    aspect ratios, and then scale the sub-image back to the size of the
+    original image.
+
+    Args:
+        min_scale (float, optional): The minimum area ratio of cropped image to the original image. Default: 0.5.
+        aspect_ratio (float, optional): The minimum aspect ratio. Default: 0.33.
+    """
+
+    def __init__(self, min_scale=0.5, aspect_ratio=0.33):
+        self.min_scale = min_scale
+        self.aspect_ratio = aspect_ratio
+
+    def __call__(self, im, label=None):
+        """
+        Args:
+            im (np.ndarray): The Image data.
+            label (np.ndarray, optional): The label data. Default: None.
+
+        Returns:
+            (tuple). When label is None, it returns (im, ), otherwise it returns (im, label).
+        """
+
+        if self.min_scale != 0 and self.aspect_ratio != 0:
+            img_height = im.shape[0]
+            img_width = im.shape[1]
+            for i in range(0, 10):
+                area = img_height * img_width
+                target_area = area * np.random.uniform(self.min_scale, 1.0)
+                aspectRatio = np.random.uniform(self.aspect_ratio,
+                                                1.0 / self.aspect_ratio)
+
+                dw = int(np.sqrt(target_area * 1.0 * aspectRatio))
+                dh = int(np.sqrt(target_area * 1.0 / aspectRatio))
+                if (np.random.randint(10) < 5):
+                    tmp = dw
+                    dw = dh
+                    dh = tmp
+
+                if (dh < img_height and dw < img_width):
+                    h1 = np.random.randint(0, img_height - dh)
+                    w1 = np.random.randint(0, img_width - dw)
+
+                    im = im[h1:(h1 + dh), w1:(w1 + dw), :]
+                    im = cv2.resize(
+                        im, (img_width, img_height),
+                        interpolation=cv2.INTER_LINEAR)
+                    if label is not None:
+                        label = label[h1:(h1 + dh), w1:(w1 + dw)]
+                        label = cv2.resize(
+                            label, (img_width, img_height),
+                            interpolation=cv2.INTER_NEAREST)
+                    break
+        if label is None:
+            return (im,)
+        else:
+            return (im, label)
+
+
+@manager.TRANSFORMS.add_component
+class RandomDistort:
+    """
+    Distort an image with random configurations.
+
+    Args:
+        brightness_range (float, optional): A range of brightness. Default: 0.5.
+        brightness_prob (float, optional): A probability of adjusting brightness. Default: 0.5.
+        contrast_range (float, optional): A range of contrast. Default: 0.5.
+        contrast_prob (float, optional): A probability of adjusting contrast. Default: 0.5.
+        saturation_range (float, optional): A range of saturation. Default: 0.5.
+        saturation_prob (float, optional): A probability of adjusting saturation. Default: 0.5.
+        hue_range (int, optional): A range of hue. Default: 18.
+        hue_prob (float, optional): A probability of adjusting hue. Default: 0.5.
+        sharpness_range (float, optional): A range of sharpness. Default: 0.5.
+        sharpness_prob (float, optional): A probability of adjusting sharpness. Default: 0.
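+
+    Example (a sketch; each sub-operation fires independently with its own
+    probability, so two calls on the same image generally differ):
+
+    .. code-block:: python
+
+        import numpy as np
+
+        distort = RandomDistort(brightness_range=0.3, hue_prob=0.0)
+        im = np.random.randint(0, 256, (128, 128, 3)).astype('uint8')
+        im, = distort(im)  # a float32 HWC array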
+ """ + + def __init__(self, + brightness_range=0.5, + brightness_prob=0.5, + contrast_range=0.5, + contrast_prob=0.5, + saturation_range=0.5, + saturation_prob=0.5, + hue_range=18, + hue_prob=0.5, + sharpness_range=0.5, + sharpness_prob=0): + self.brightness_range = brightness_range + self.brightness_prob = brightness_prob + self.contrast_range = contrast_range + self.contrast_prob = contrast_prob + self.saturation_range = saturation_range + self.saturation_prob = saturation_prob + self.hue_range = hue_range + self.hue_prob = hue_prob + self.sharpness_range = sharpness_range + self.sharpness_prob = sharpness_prob + + def __call__(self, im, label=None): + """ + Args: + im (np.ndarray): The Image data. + label (np.ndarray, optional): The label data. Default: None. + + Returns: + (tuple). When label is None, it returns (im, ), otherwise it returns (im, label). + """ + + brightness_lower = 1 - self.brightness_range + brightness_upper = 1 + self.brightness_range + contrast_lower = 1 - self.contrast_range + contrast_upper = 1 + self.contrast_range + saturation_lower = 1 - self.saturation_range + saturation_upper = 1 + self.saturation_range + hue_lower = -self.hue_range + hue_upper = self.hue_range + sharpness_lower = 1 - self.sharpness_range + sharpness_upper = 1 + self.sharpness_range + ops = [ + functional.brightness, functional.contrast, functional.saturation, + functional.hue, functional.sharpness + ] + random.shuffle(ops) + params_dict = { + 'brightness': { + 'brightness_lower': brightness_lower, + 'brightness_upper': brightness_upper + }, + 'contrast': { + 'contrast_lower': contrast_lower, + 'contrast_upper': contrast_upper + }, + 'saturation': { + 'saturation_lower': saturation_lower, + 'saturation_upper': saturation_upper + }, + 'hue': { + 'hue_lower': hue_lower, + 'hue_upper': hue_upper + }, + 'sharpness': { + 'sharpness_lower': sharpness_lower, + 'sharpness_upper': sharpness_upper, + } + } + prob_dict = { + 'brightness': self.brightness_prob, + 'contrast': self.contrast_prob, + 'saturation': self.saturation_prob, + 'hue': self.hue_prob, + 'sharpness': self.sharpness_prob + } + im = im.astype('uint8') + im = Image.fromarray(im) + for id in range(len(ops)): + params = params_dict[ops[id].__name__] + prob = prob_dict[ops[id].__name__] + params['im'] = im + if np.random.uniform(0, 1) < prob: + im = ops[id](**params) + im = np.asarray(im).astype('float32') + if label is None: + return (im,) + else: + return (im, label) + + +@manager.TRANSFORMS.add_component +class RandomAffine: + """ + Affine transform an image with random configurations. + + Args: + size (tuple, optional): The target size after affine transformation. Default: (224, 224). + translation_offset (float, optional): The maximum translation offset. Default: 0. + max_rotation (float, optional): The maximum rotation degree. Default: 15. + min_scale_factor (float, optional): The minimum scale. Default: 0.75. + max_scale_factor (float, optional): The maximum scale. Default: 1.25. + im_padding_value (float, optional): The padding value of raw image. Default: (128, 128, 128). + label_padding_value (int, optional): The padding value of annotation image. Default: (255, 255, 255). 
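+
+    Example (a sketch; the output is always warped onto a canvas of `size`):
+
+    .. code-block:: python
+
+        import numpy as np
+
+        affine = RandomAffine(size=(224, 224), max_rotation=10)
+        im = np.random.randint(0, 256, (300, 400, 3)).astype('uint8')
+        im, = affine(im)
+        assert im.shape[:2] == (224, 224)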
+ """ + + def __init__(self, + size=(224, 224), + translation_offset=0, + max_rotation=15, + min_scale_factor=0.75, + max_scale_factor=1.25, + im_padding_value=(128, 128, 128), + label_padding_value=(255, 255, 255)): + self.size = size + self.translation_offset = translation_offset + self.max_rotation = max_rotation + self.min_scale_factor = min_scale_factor + self.max_scale_factor = max_scale_factor + self.im_padding_value = im_padding_value + self.label_padding_value = label_padding_value + + def __call__(self, im, label=None): + """ + Args: + im (np.ndarray): The Image data. + label (np.ndarray, optional): The label data. Default: None. + + Returns: + (tuple). When label is None, it returns (im, ), otherwise it returns (im, label). + """ + + w, h = self.size + bbox = [0, 0, im.shape[1] - 1, im.shape[0] - 1] + x_offset = (random.random() - 0.5) * 2 * self.translation_offset + y_offset = (random.random() - 0.5) * 2 * self.translation_offset + dx = (w - (bbox[2] + bbox[0])) / 2.0 + dy = (h - (bbox[3] + bbox[1])) / 2.0 + + matrix_trans = np.array([[1.0, 0, dx], [0, 1.0, dy], [0, 0, 1.0]]) + + angle = random.random() * 2 * self.max_rotation - self.max_rotation + scale = random.random() * (self.max_scale_factor - self.min_scale_factor + ) + self.min_scale_factor + scale *= np.mean( + [float(w) / (bbox[2] - bbox[0]), + float(h) / (bbox[3] - bbox[1])]) + alpha = scale * math.cos(angle / 180.0 * math.pi) + beta = scale * math.sin(angle / 180.0 * math.pi) + + centerx = w / 2.0 + x_offset + centery = h / 2.0 + y_offset + matrix = np.array( + [[alpha, beta, (1 - alpha) * centerx - beta * centery], + [-beta, alpha, beta * centerx + (1 - alpha) * centery], + [0, 0, 1.0]]) + + matrix = matrix.dot(matrix_trans)[0:2, :] + im = cv2.warpAffine( + np.uint8(im), + matrix, + tuple(self.size), + flags=cv2.INTER_LINEAR, + borderMode=cv2.BORDER_CONSTANT, + borderValue=self.im_padding_value) + if label is not None: + label = cv2.warpAffine( + np.uint8(label), + matrix, + tuple(self.size), + flags=cv2.INTER_NEAREST, + borderMode=cv2.BORDER_CONSTANT) + if label is None: + return (im,) + else: + return (im, label) \ No newline at end of file diff --git a/paddlers/models/ppseg/utils/__init__.py b/paddlers/models/ppseg/utils/__init__.py new file mode 100644 index 0000000..63c7894 --- /dev/null +++ b/paddlers/models/ppseg/utils/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import logger +from . import download +from . import metrics +from .env import seg_env, get_sys_env +from .utils import * +from .timer import TimeAverager, calculate_eta +from . import visualize +from .config_check import config_check +from .ema import EMA diff --git a/paddlers/models/ppseg/utils/config_check.py b/paddlers/models/ppseg/utils/config_check.py new file mode 100644 index 0000000..47a7049 --- /dev/null +++ b/paddlers/models/ppseg/utils/config_check.py @@ -0,0 +1,59 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+
+def config_check(cfg, train_dataset=None, val_dataset=None):
+    """
+    To check config.
+
+    Args:
+        cfg (paddleseg.cvlibs.Config): An object of paddleseg.cvlibs.Config.
+        train_dataset (paddle.io.Dataset): Used to read and process training datasets.
+        val_dataset (paddle.io.Dataset, optional): Used to read and process validation datasets.
+    """
+
+    num_classes_check(cfg, train_dataset, val_dataset)
+
+
+def num_classes_check(cfg, train_dataset, val_dataset):
+    """
+    Check that the num_classes in model, train_dataset and val_dataset is consistent.
+    """
+    num_classes_set = set()
+    if train_dataset and hasattr(train_dataset, 'num_classes'):
+        num_classes_set.add(train_dataset.num_classes)
+    if val_dataset and hasattr(val_dataset, 'num_classes'):
+        num_classes_set.add(val_dataset.num_classes)
+    if cfg.dic.get('model', None) and cfg.dic['model'].get('num_classes', None):
+        num_classes_set.add(cfg.dic['model'].get('num_classes'))
+    if (not cfg.train_dataset) and (not cfg.val_dataset):
+        raise ValueError(
+            'One of `train_dataset` or `val_dataset` should be given, but there are none.'
+        )
+    if len(num_classes_set) == 0:
+        raise ValueError(
+            '`num_classes` is not found. Please set it in model, train_dataset or val_dataset'
+        )
+    elif len(num_classes_set) > 1:
+        raise ValueError(
+            '`num_classes` is not consistent: {}. Please set it consistently in model, train_dataset and val_dataset'
+            .format(num_classes_set))
+    else:
+        num_classes = num_classes_set.pop()
+        if train_dataset:
+            train_dataset.num_classes = num_classes
+        if val_dataset:
+            val_dataset.num_classes = num_classes
diff --git a/paddlers/models/ppseg/utils/download.py b/paddlers/models/ppseg/utils/download.py
new file mode 100644
index 0000000..7b4a1c3
--- /dev/null
+++ b/paddlers/models/ppseg/utils/download.py
@@ -0,0 +1,163 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +import functools +import os +import shutil +import sys +import tarfile +import time +import zipfile + +import requests + +lasttime = time.time() +FLUSH_INTERVAL = 0.1 + + +def progress(str, end=False): + global lasttime + if end: + str += "\n" + lasttime = 0 + if time.time() - lasttime >= FLUSH_INTERVAL: + sys.stdout.write("\r%s" % str) + lasttime = time.time() + sys.stdout.flush() + + +def _download_file(url, savepath, print_progress): + if print_progress: + print("Connecting to {}".format(url)) + r = requests.get(url, stream=True, timeout=15) + total_length = r.headers.get('content-length') + + if total_length is None: + with open(savepath, 'wb') as f: + shutil.copyfileobj(r.raw, f) + else: + with open(savepath, 'wb') as f: + dl = 0 + total_length = int(total_length) + starttime = time.time() + if print_progress: + print("Downloading %s" % os.path.basename(savepath)) + for data in r.iter_content(chunk_size=4096): + dl += len(data) + f.write(data) + if print_progress: + done = int(50 * dl / total_length) + progress("[%-50s] %.2f%%" % + ('=' * done, float(100 * dl) / total_length)) + if print_progress: + progress("[%-50s] %.2f%%" % ('=' * 50, 100), end=True) + + +def _uncompress_file_zip(filepath, extrapath): + files = zipfile.ZipFile(filepath, 'r') + filelist = files.namelist() + rootpath = filelist[0] + total_num = len(filelist) + for index, file in enumerate(filelist): + files.extract(file, extrapath) + yield total_num, index, rootpath + files.close() + yield total_num, index, rootpath + + +def _uncompress_file_tar(filepath, extrapath, mode="r:gz"): + files = tarfile.open(filepath, mode) + filelist = files.getnames() + total_num = len(filelist) + rootpath = filelist[0] + for index, file in enumerate(filelist): + files.extract(file, extrapath) + yield total_num, index, rootpath + files.close() + yield total_num, index, rootpath + + +def _uncompress_file(filepath, extrapath, delete_file, print_progress): + if print_progress: + print("Uncompress %s" % os.path.basename(filepath)) + + if filepath.endswith("zip"): + handler = _uncompress_file_zip + elif filepath.endswith("tgz"): + handler = functools.partial(_uncompress_file_tar, mode="r:*") + else: + handler = functools.partial(_uncompress_file_tar, mode="r") + + for total_num, index, rootpath in handler(filepath, extrapath): + if print_progress: + done = int(50 * float(index) / total_num) + progress( + "[%-50s] %.2f%%" % ('=' * done, float(100 * index) / total_num)) + if print_progress: + progress("[%-50s] %.2f%%" % ('=' * 50, 100), end=True) + + if delete_file: + os.remove(filepath) + + return rootpath + + +def download_file_and_uncompress(url, + savepath=None, + extrapath=None, + extraname=None, + print_progress=True, + cover=False, + delete_file=True): + if savepath is None: + savepath = "." + + if extrapath is None: + extrapath = "." 
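+
+    # From here on, `savepath` is the full path of the downloaded archive,
+    # `savename` is the archive name with its final extension stripped, and
+    # `extraname` is the directory where the extracted content should end up.
+    # When `cover` is True, any existing copies are removed first; otherwise
+    # the download and extraction are skipped if their targets already exist.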
+ + savename = url.split("/")[-1] + if not os.path.exists(savepath): + os.makedirs(savepath) + + savepath = os.path.join(savepath, savename) + savename = ".".join(savename.split(".")[:-1]) + savename = os.path.join(extrapath, savename) + extraname = savename if extraname is None else os.path.join( + extrapath, extraname) + + if cover: + if os.path.exists(savepath): + shutil.rmtree(savepath) + if os.path.exists(savename): + shutil.rmtree(savename) + if os.path.exists(extraname): + shutil.rmtree(extraname) + + if not os.path.exists(extraname): + if not os.path.exists(savename): + if not os.path.exists(savepath): + _download_file(url, savepath, print_progress) + + if (not tarfile.is_tarfile(savepath)) and ( + not zipfile.is_zipfile(savepath)): + if not os.path.exists(extraname): + os.makedirs(extraname) + shutil.move(savepath, extraname) + return extraname + + savename = _uncompress_file(savepath, extrapath, delete_file, + print_progress) + savename = os.path.join(extrapath, savename) + shutil.move(savename, extraname) + return extraname diff --git a/paddlers/models/ppseg/utils/ema.py b/paddlers/models/ppseg/utils/ema.py new file mode 100644 index 0000000..046474f --- /dev/null +++ b/paddlers/models/ppseg/utils/ema.py @@ -0,0 +1,102 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle + +class EMA(object): + """ + The implementation of Exponential Moving Average for the trainable parameters. + + Args: + model (nn.Layer): The model for applying EMA. + decay (float, optional): Decay is used to calculate ema_variable by + `ema_variable = decay * ema_variable + (1 - decay) * new_variable`. + Default: 0.99. + + Returns: + None + + Examples: + .. code-block:: python + + # 1. Define model and dataset + + # 2. Create EMA + ema = EMA(model, decay=0.99) + + # 3. Train stage + for data in dataloader(): + ... + optimizer.step() + ema.step() + + # 4. Evaluate stage + ema.apply() # Use the EMA data to replace the origin data + + for data in dataloader(): + ... + + ema.restore() # Restore the origin data to the model + + """ + def __init__(self, model, decay=0.99): + super().__init__() + + assert isinstance(model, paddle.nn.Layer), \ + "The model should be the instance of paddle.nn.Layer." + assert decay >= 0 and decay <= 1.0, \ + "The decay = {} should in [0.0, 1.0]".format(decay) + + self._model = model + self._decay = decay + self._ema_data = {} + self._backup_data = {} + + for name, param in self._model.named_parameters(): + if not param.stop_gradient: + self._ema_data[name] = param.numpy() + + def step(self): + """ + Calculate the EMA data for all trainable parameters. 
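+
+        The update rule is `ema = decay * ema + (1 - decay) * param`, computed
+        in numpy, and it is intended to be called once after each optimizer step.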
+ """ + for name, param in self._model.named_parameters(): + if not param.stop_gradient: + assert name in self._ema_data, \ + "The param ({}) isn't in the model".format(name) + self._ema_data[name] = self._decay * self._ema_data[name] \ + + (1.0 - self._decay) * param.numpy() + + def apply(self): + """ + Save the origin data and use the EMA data to replace the origin data. + """ + for name, param in self._model.named_parameters(): + if not param.stop_gradient: + assert name in self._ema_data, \ + "The param ({}) isn't in the model".format(name) + self._backup_data[name] = param.numpy() + param.set_value(self._ema_data[name]) + + def restore(self): + """ + Restore the origin data to the model. + """ + for name, param in self._model.named_parameters(): + if not param.stop_gradient: + assert name in self._backup_data, \ + "The param ({}) isn't in the model".format(name) + param.set_value(self._backup_data[name]) + self._backup_data = {} diff --git a/paddlers/models/ppseg/utils/logger.py b/paddlers/models/ppseg/utils/logger.py new file mode 100644 index 0000000..e7ef757 --- /dev/null +++ b/paddlers/models/ppseg/utils/logger.py @@ -0,0 +1,49 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import time + +import paddle + +levels = {0: 'ERROR', 1: 'WARNING', 2: 'INFO', 3: 'DEBUG'} +log_level = 2 + + +def log(level=2, message=""): + if paddle.distributed.ParallelEnv().local_rank == 0: + current_time = time.time() + time_array = time.localtime(current_time) + current_time = time.strftime("%Y-%m-%d %H:%M:%S", time_array) + if log_level >= level: + print( + "{} [{}]\t{}".format(current_time, levels[level], + message).encode("utf-8").decode("latin1")) + sys.stdout.flush() + + +def debug(message=""): + log(level=3, message=message) + + +def info(message=""): + log(level=2, message=message) + + +def warning(message=""): + log(level=1, message=message) + + +def error(message=""): + log(level=0, message=message) diff --git a/paddlers/models/ppseg/utils/metrics.py b/paddlers/models/ppseg/utils/metrics.py new file mode 100644 index 0000000..cbc4aec --- /dev/null +++ b/paddlers/models/ppseg/utils/metrics.py @@ -0,0 +1,210 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import numpy as np
+import paddle
+import paddle.nn.functional as F
+import sklearn.metrics as skmetrics
+
+
+def calculate_area(pred, label, num_classes, ignore_index=255):
+    """
+    Calculate intersection, prediction and label area.
+
+    Args:
+        pred (Tensor): The prediction by model.
+        label (Tensor): The ground truth of image.
+        num_classes (int): The unique number of target classes.
+        ignore_index (int): Specifies a target value that is ignored. Default: 255.
+
+    Returns:
+        Tensor: The intersection area of prediction and the ground truth on all classes.
+        Tensor: The prediction area on all classes.
+        Tensor: The ground truth area on all classes.
+    """
+    if len(pred.shape) == 4:
+        pred = paddle.squeeze(pred, axis=1)
+    if len(label.shape) == 4:
+        label = paddle.squeeze(label, axis=1)
+    if not pred.shape == label.shape:
+        raise ValueError('Shape of `pred` and `label` should be equal, '
+                         'but there are {} and {}.'.format(
+                             pred.shape, label.shape))
+    pred_area = []
+    label_area = []
+    intersect_area = []
+    mask = label != ignore_index
+
+    for i in range(num_classes):
+        pred_i = paddle.logical_and(pred == i, mask)
+        label_i = label == i
+        intersect_i = paddle.logical_and(pred_i, label_i)
+        pred_area.append(paddle.sum(paddle.cast(pred_i, "int32")))
+        label_area.append(paddle.sum(paddle.cast(label_i, "int32")))
+        intersect_area.append(paddle.sum(paddle.cast(intersect_i, "int32")))
+
+    pred_area = paddle.concat(pred_area)
+    label_area = paddle.concat(label_area)
+    intersect_area = paddle.concat(intersect_area)
+
+    return intersect_area, pred_area, label_area
+
+
+def auc_roc(logits, label, num_classes, ignore_index=None):
+    """
+    Calculate the area under the ROC curve.
+
+    Args:
+        logits (Tensor): The prediction by model on testset, of shape (N, C, H, W).
+        label (Tensor): The ground truth of image, of shape (N, 1, H, W).
+        num_classes (int): The unique number of target classes.
+        ignore_index (int, optional): Specifies a target value that is ignored. Default: None.
+
+    Returns:
+        auc_roc (float): The area under the ROC curve.
+    """
+    if ignore_index or len(np.unique(label)) > num_classes:
+        raise RuntimeError('labels with ignore_index is not supported yet.')
+
+    if len(label.shape) != 4:
+        raise ValueError(
+            'The shape of label is not 4-dimensional as (N, C, H, W), it is {}'.
+            format(label.shape))
+
+    if len(logits.shape) != 4:
+        raise ValueError(
+            'The shape of logits is not 4-dimensional as (N, C, H, W), it is {}'.
+            format(logits.shape))
+
+    N, C, H, W = logits.shape
+    logits = np.transpose(logits, (1, 0, 2, 3))
+    logits = logits.reshape([C, N * H * W]).transpose([1, 0])
+
+    label = np.transpose(label, (1, 0, 2, 3))
+    label = label.reshape([1, N * H * W]).squeeze()
+
+    if not logits.shape[0] == label.shape[0]:
+        raise ValueError('length of `logits` and `label` should be equal, '
+                         'but they are {} and {}.'.format(
+                             logits.shape[0], label.shape[0]))
+
+    if num_classes == 2:
+        auc = skmetrics.roc_auc_score(label, logits[:, 1])
+    else:
+        auc = skmetrics.roc_auc_score(label, logits, multi_class='ovr')
+
+    return auc
+
+
+def mean_iou(intersect_area, pred_area, label_area):
+    """
+    Calculate IoU.
+
+    Args:
+        intersect_area (Tensor): The intersection area of prediction and ground truth on all classes.
+        pred_area (Tensor): The prediction area on all classes.
+        label_area (Tensor): The ground truth area on all classes.
+
+    Returns:
+        np.ndarray: IoU on all classes.
+        float: mean IoU of all classes.
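+
+    Example (a sketch; in practice the three inputs come from `calculate_area`
+    above):
+
+    .. code-block:: python
+
+        import paddle
+
+        # Toy 2-class case.
+        intersect = paddle.to_tensor([80, 10])
+        pred = paddle.to_tensor([100, 20])
+        label = paddle.to_tensor([90, 30])
+        class_iou, miou = mean_iou(intersect, pred, label)
+        # union = pred + label - intersect = [110, 40]
+        # class_iou is approximately [0.727, 0.25]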
+ """ + intersect_area = intersect_area.numpy() + pred_area = pred_area.numpy() + label_area = label_area.numpy() + union = pred_area + label_area - intersect_area + class_iou = [] + for i in range(len(intersect_area)): + if union[i] == 0: + iou = 0 + else: + iou = intersect_area[i] / union[i] + class_iou.append(iou) + miou = np.mean(class_iou) + return np.array(class_iou), miou + + +def dice(intersect_area, pred_area, label_area): + """ + Calculate DICE. + + Args: + intersect_area (Tensor): The intersection area of prediction and ground truth on all classes. + pred_area (Tensor): The prediction area on all classes. + label_area (Tensor): The ground truth area on all classes. + + Returns: + np.ndarray: DICE on all classes. + float: mean DICE of all classes. + """ + intersect_area = intersect_area.numpy() + pred_area = pred_area.numpy() + label_area = label_area.numpy() + union = pred_area + label_area + class_dice = [] + for i in range(len(intersect_area)): + if union[i] == 0: + dice = 0 + else: + dice = (2 * intersect_area[i]) / union[i] + class_dice.append(dice) + mdice = np.mean(class_dice) + return np.array(class_dice), mdice + + +def accuracy(intersect_area, pred_area): + """ + Calculate accuracy + + Args: + intersect_area (Tensor): The intersection area of prediction and ground truth on all classes.. + pred_area (Tensor): The prediction area on all classes. + + Returns: + np.ndarray: accuracy on all classes. + float: mean accuracy. + """ + intersect_area = intersect_area.numpy() + pred_area = pred_area.numpy() + class_acc = [] + for i in range(len(intersect_area)): + if pred_area[i] == 0: + acc = 0 + else: + acc = intersect_area[i] / pred_area[i] + class_acc.append(acc) + macc = np.sum(intersect_area) / np.sum(pred_area) + return np.array(class_acc), macc + + +def kappa(intersect_area, pred_area, label_area): + """ + Calculate kappa coefficient + + Args: + intersect_area (Tensor): The intersection area of prediction and ground truth on all classes.. + pred_area (Tensor): The prediction area on all classes. + label_area (Tensor): The ground truth area on all classes. + + Returns: + float: kappa coefficient. + """ + intersect_area = intersect_area.numpy() + pred_area = pred_area.numpy() + label_area = label_area.numpy() + total_area = np.sum(label_area) + po = np.sum(intersect_area) / total_area + pe = np.sum(pred_area * label_area) / (total_area * total_area) + kappa = (po - pe) / (1 - pe) + return kappa diff --git a/paddlers/models/ppseg/utils/op_flops_funs.py b/paddlers/models/ppseg/utils/op_flops_funs.py new file mode 100644 index 0000000..28353d8 --- /dev/null +++ b/paddlers/models/ppseg/utils/op_flops_funs.py @@ -0,0 +1,22 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Implement the counting flops functions for some ops. 
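+
+These functions are meant to be registered as per-op hooks in a FLOPs counter
+(for example, through the `custom_ops` argument of `paddle.flops`).
+`count_syncbn` below charges 2 FLOPs per input element for SyncBatchNorm,
+roughly one multiply and one add per value.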
+""" + + +def count_syncbn(m, x, y): + x = x[0] + nelements = x.numel() + m.total_ops += int(2 * nelements) diff --git a/paddlers/models/ppseg/utils/progbar.py b/paddlers/models/ppseg/utils/progbar.py new file mode 100644 index 0000000..563cc5e --- /dev/null +++ b/paddlers/models/ppseg/utils/progbar.py @@ -0,0 +1,209 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import time + +import numpy as np + + +class Progbar(object): + """ + Displays a progress bar. + It refers to https://github.com/keras-team/keras/blob/keras-2/keras/utils/generic_utils.py + + Args: + target (int): Total number of steps expected, None if unknown. + width (int): Progress bar width on screen. + verbose (int): Verbosity mode, 0 (silent), 1 (verbose), 2 (semi-verbose) + stateful_metrics (list|tuple): Iterable of string names of metrics that should *not* be + averaged over time. Metrics in this list will be displayed as-is. All + others will be averaged by the progbar before display. + interval (float): Minimum visual progress update interval (in seconds). + unit_name (str): Display name for step counts (usually "step" or "sample"). + """ + + def __init__(self, + target, + width=30, + verbose=1, + interval=0.05, + stateful_metrics=None, + unit_name='step'): + self.target = target + self.width = width + self.verbose = verbose + self.interval = interval + self.unit_name = unit_name + if stateful_metrics: + self.stateful_metrics = set(stateful_metrics) + else: + self.stateful_metrics = set() + + self._dynamic_display = ((hasattr(sys.stderr, 'isatty') + and sys.stderr.isatty()) + or 'ipykernel' in sys.modules + or 'posix' in sys.modules + or 'PYCHARM_HOSTED' in os.environ) + self._total_width = 0 + self._seen_so_far = 0 + # We use a dict + list to avoid garbage collection + # issues found in OrderedDict + self._values = {} + self._values_order = [] + self._start = time.time() + self._last_update = 0 + + def update(self, current, values=None, finalize=None): + """ + Updates the progress bar. + + Args: + current (int): Index of current step. + values (list): List of tuples: `(name, value_for_last_step)`. If `name` is in + `stateful_metrics`, `value_for_last_step` will be displayed as-is. + Else, an average of the metric over time will be displayed. + finalize (bool): Whether this is the last update for the progress bar. If + `None`, defaults to `current >= self.target`. + """ + + if finalize is None: + if self.target is None: + finalize = False + else: + finalize = current >= self.target + + values = values or [] + for k, v in values: + if k not in self._values_order: + self._values_order.append(k) + if k not in self.stateful_metrics: + # In the case that progress bar doesn't have a target value in the first + # epoch, both on_batch_end and on_epoch_end will be called, which will + # cause 'current' and 'self._seen_so_far' to have the same value. Force + # the minimal value to 1 here, otherwise stateful_metric will be 0s. 
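+                # `_values[k]` holds [weighted_sum, total_weight], so the
+                # value shown later is an average weighted by how many steps
+                # each reported value covered.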
+ value_base = max(current - self._seen_so_far, 1) + if k not in self._values: + self._values[k] = [v * value_base, value_base] + else: + self._values[k][0] += v * value_base + self._values[k][1] += value_base + else: + # Stateful metrics output a numeric value. This representation + # means "take an average from a single value" but keeps the + # numeric formatting. + self._values[k] = [v, 1] + self._seen_so_far = current + + now = time.time() + info = ' - %.0fs' % (now - self._start) + if self.verbose == 1: + if now - self._last_update < self.interval and not finalize: + return + + prev_total_width = self._total_width + if self._dynamic_display: + sys.stderr.write('\b' * prev_total_width) + sys.stderr.write('\r') + else: + sys.stderr.write('\n') + + if self.target is not None: + numdigits = int(np.log10(self.target)) + 1 + bar = ('%' + str(numdigits) + 'd/%d [') % (current, self.target) + prog = float(current) / self.target + prog_width = int(self.width * prog) + if prog_width > 0: + bar += ('=' * (prog_width - 1)) + if current < self.target: + bar += '>' + else: + bar += '=' + bar += ('.' * (self.width - prog_width)) + bar += ']' + else: + bar = '%7d/Unknown' % current + + self._total_width = len(bar) + sys.stderr.write(bar) + + if current: + time_per_unit = (now - self._start) / current + else: + time_per_unit = 0 + + if self.target is None or finalize: + if time_per_unit >= 1 or time_per_unit == 0: + info += ' %.0fs/%s' % (time_per_unit, self.unit_name) + elif time_per_unit >= 1e-3: + info += ' %.0fms/%s' % (time_per_unit * 1e3, self.unit_name) + else: + info += ' %.0fus/%s' % (time_per_unit * 1e6, self.unit_name) + else: + eta = time_per_unit * (self.target - current) + if eta > 3600: + eta_format = '%d:%02d:%02d' % (eta // 3600, + (eta % 3600) // 60, eta % 60) + elif eta > 60: + eta_format = '%d:%02d' % (eta // 60, eta % 60) + else: + eta_format = '%ds' % eta + + info = ' - ETA: %s' % eta_format + + for k in self._values_order: + info += ' - %s:' % k + if isinstance(self._values[k], list): + avg = np.mean( + self._values[k][0] / max(1, self._values[k][1])) + if abs(avg) > 1e-3: + info += ' %.4f' % avg + else: + info += ' %.4e' % avg + else: + info += ' %s' % self._values[k] + + self._total_width += len(info) + if prev_total_width > self._total_width: + info += (' ' * (prev_total_width - self._total_width)) + + if finalize: + info += '\n' + + sys.stderr.write(info) + sys.stderr.flush() + + elif self.verbose == 2: + if finalize: + numdigits = int(np.log10(self.target)) + 1 + count = ('%' + str(numdigits) + 'd/%d') % (current, self.target) + info = count + info + for k in self._values_order: + info += ' - %s:' % k + avg = np.mean( + self._values[k][0] / max(1, self._values[k][1])) + if avg > 1e-3: + info += ' %.4f' % avg + else: + info += ' %.4e' % avg + info += '\n' + + sys.stderr.write(info) + sys.stderr.flush() + + self._last_update = now + + def add(self, n, values=None): + self.update(self._seen_so_far + n, values) diff --git a/paddlers/models/ppseg/utils/timer.py b/paddlers/models/ppseg/utils/timer.py new file mode 100644 index 0000000..d7d7467 --- /dev/null +++ b/paddlers/models/ppseg/utils/timer.py @@ -0,0 +1,53 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time + + +class TimeAverager(object): + def __init__(self): + self.reset() + + def reset(self): + self._cnt = 0 + self._total_time = 0 + self._total_samples = 0 + + def record(self, usetime, num_samples=None): + self._cnt += 1 + self._total_time += usetime + if num_samples: + self._total_samples += num_samples + + def get_average(self): + if self._cnt == 0: + return 0 + return self._total_time / float(self._cnt) + + def get_ips_average(self): + if not self._total_samples or self._cnt == 0: + return 0 + return float(self._total_samples) / self._total_time + + +def calculate_eta(remaining_step, speed): + if remaining_step < 0: + remaining_step = 0 + remaining_time = int(remaining_step * speed) + result = "{:0>2}:{:0>2}:{:0>2}" + arr = [] + for i in range(2, -1, -1): + arr.append(int(remaining_time / 60**i)) + remaining_time %= 60**i + return result.format(*arr) diff --git a/paddlers/models/ppseg/utils/train_profiler.py b/paddlers/models/ppseg/utils/train_profiler.py new file mode 100644 index 0000000..4b4d53b --- /dev/null +++ b/paddlers/models/ppseg/utils/train_profiler.py @@ -0,0 +1,112 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import paddle + +# A global variable to record the number of calling times for profiler +# functions. It is used to specify the tracing range of training steps. +_profiler_step_id = 0 + +# A global variable to avoid parsing from string every time. +_profiler_options = None + + +class ProfilerOptions(object): + ''' + Use a string to initialize a ProfilerOptions. + The string should be in the format: "key1=value1;key2=value;key3=value3". + For example: + "profile_path=model.profile" + "batch_range=[50, 60]; profile_path=model.profile" + "batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile" + ProfilerOptions supports following key-value pair: + batch_range - a integer list, e.g. [100, 110]. + state - a string, the optional values are 'CPU', 'GPU' or 'All'. + sorted_key - a string, the optional values are 'calls', 'total', + 'max', 'min' or 'ave. + tracer_option - a string, the optional values are 'Default', 'OpDetail', + 'AllOpDetail'. + profile_path - a string, the path to save the serialized profile data, + which can be used to generate a timeline. + exit_on_finished - a boolean. 
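+
+    A typical use (illustrative; `add_profiler_step` is defined later in this
+    module and is expected to be called once per training iteration):
+
+        add_profiler_step("batch_range=[50, 60]; profile_path=model.profile")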
+ ''' + + def __init__(self, options_str): + assert isinstance(options_str, str) + + self._options = { + 'batch_range': [10, 20], + 'state': 'All', + 'sorted_key': 'total', + 'tracer_option': 'Default', + 'profile_path': '/tmp/profile', + 'exit_on_finished': True + } + + if options_str != "": + self._parse_from_string(options_str) + + def _parse_from_string(self, options_str): + for kv in options_str.replace(' ', '').split(';'): + key, value = kv.split('=') + if key == 'batch_range': + value_list = value.replace('[', '').replace(']', '').split(',') + value_list = list(map(int, value_list)) + if len(value_list) >= 2 and value_list[0] >= 0 and value_list[ + 1] > value_list[0]: + self._options[key] = value_list + elif key == 'exit_on_finished': + self._options[key] = value.lower() in ("yes", "true", "t", "1") + elif key in [ + 'state', 'sorted_key', 'tracer_option', 'profile_path' + ]: + self._options[key] = value + + def __getitem__(self, name): + if self._options.get(name, None) is None: + raise ValueError( + "ProfilerOptions does not have an option named %s." % name) + return self._options[name] + + +def add_profiler_step(options_str=None): + ''' + Enable the operator-level timing using PaddlePaddle's profiler. + The profiler uses a independent variable to count the profiler steps. + One call of this function is treated as a profiler step. + + Args: + profiler_options - a string to initialize the ProfilerOptions. + Default is None, and the profiler is disabled. + ''' + if options_str is None: + return + + global _profiler_step_id + global _profiler_options + + if _profiler_options is None: + _profiler_options = ProfilerOptions(options_str) + + if _profiler_step_id == _profiler_options['batch_range'][0]: + paddle.utils.profiler.start_profiler(_profiler_options['state'], + _profiler_options['tracer_option']) + elif _profiler_step_id == _profiler_options['batch_range'][1]: + paddle.utils.profiler.stop_profiler(_profiler_options['sorted_key'], + _profiler_options['profile_path']) + if _profiler_options['exit_on_finished']: + sys.exit(0) + + _profiler_step_id += 1 diff --git a/paddlers/models/ppseg/utils/utils.py b/paddlers/models/ppseg/utils/utils.py new file mode 100644 index 0000000..0c98c46 --- /dev/null +++ b/paddlers/models/ppseg/utils/utils.py @@ -0,0 +1,175 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import contextlib +import filelock +import os +import tempfile +import numpy as np +import random +from urllib.parse import urlparse, unquote + +import paddle + +from paddlers.models.ppseg.utils import logger, seg_env +from paddlers.models.ppseg.utils.download import download_file_and_uncompress + + +@contextlib.contextmanager +def generate_tempdir(directory: str = None, **kwargs): + '''Generate a temporary directory''' + directory = seg_env.TMP_HOME if not directory else directory + with tempfile.TemporaryDirectory(dir=directory, **kwargs) as _dir: + yield _dir + + +def load_entire_model(model, pretrained): + if pretrained is not None: + load_pretrained_model(model, pretrained) + else: + logger.warning('Not all pretrained params of {} are loaded, ' \ + 'training from scratch or a pretrained backbone.'.format(model.__class__.__name__)) + + +def download_pretrained_model(pretrained_model): + """ + Download pretrained model from url. + Args: + pretrained_model (str): the url of pretrained weight + Returns: + str: the path of pretrained weight + """ + assert urlparse(pretrained_model).netloc, "The url is not valid." + + pretrained_model = unquote(pretrained_model) + savename = pretrained_model.split('/')[-1] + if not savename.endswith(('tgz', 'tar.gz', 'tar', 'zip')): + savename = pretrained_model.split('/')[-2] + else: + savename = savename.split('.')[0] + + with generate_tempdir() as _dir: + with filelock.FileLock(os.path.join(seg_env.TMP_HOME, savename)): + pretrained_model = download_file_and_uncompress( + pretrained_model, + savepath=_dir, + extrapath=seg_env.PRETRAINED_MODEL_HOME, + extraname=savename) + pretrained_model = os.path.join(pretrained_model, 'model.pdparams') + return pretrained_model + + +def load_pretrained_model(model, pretrained_model): + if pretrained_model is not None: + logger.info('Loading pretrained model from {}'.format(pretrained_model)) + + if urlparse(pretrained_model).netloc: + pretrained_model = download_pretrained_model(pretrained_model) + + if os.path.exists(pretrained_model): + para_state_dict = paddle.load(pretrained_model) + + model_state_dict = model.state_dict() + keys = model_state_dict.keys() + num_params_loaded = 0 + for k in keys: + if k not in para_state_dict: + logger.warning("{} is not in pretrained model".format(k)) + elif list(para_state_dict[k].shape) != list( + model_state_dict[k].shape): + logger.warning( + "[SKIP] Shape of pretrained params {} doesn't match.(Pretrained: {}, Actual: {})" + .format(k, para_state_dict[k].shape, + model_state_dict[k].shape)) + else: + model_state_dict[k] = para_state_dict[k] + num_params_loaded += 1 + model.set_dict(model_state_dict) + logger.info("There are {}/{} variables loaded into {}.".format( + num_params_loaded, len(model_state_dict), + model.__class__.__name__)) + + else: + raise ValueError( + 'The pretrained model directory is not Found: {}'.format( + pretrained_model)) + else: + logger.info( + 'No pretrained model to load, {} will be trained from scratch.'. 
+ format(model.__class__.__name__)) + + +def resume(model, optimizer, resume_model): + if resume_model is not None: + logger.info('Resume model from {}'.format(resume_model)) + if os.path.exists(resume_model): + resume_model = os.path.normpath(resume_model) + ckpt_path = os.path.join(resume_model, 'model.pdparams') + para_state_dict = paddle.load(ckpt_path) + ckpt_path = os.path.join(resume_model, 'model.pdopt') + opti_state_dict = paddle.load(ckpt_path) + model.set_state_dict(para_state_dict) + optimizer.set_state_dict(opti_state_dict) + + iter = resume_model.split('_')[-1] + iter = int(iter) + return iter + else: + raise ValueError( + 'Directory of the model needed to resume is not Found: {}'. + format(resume_model)) + else: + logger.info('No model needed to resume.') + + +def worker_init_fn(worker_id): + np.random.seed(random.randint(0, 100000)) + + +def get_image_list(image_path): + """Get image list""" + valid_suffix = [ + '.JPEG', '.jpeg', '.JPG', '.jpg', '.BMP', '.bmp', '.PNG', '.png' + ] + image_list = [] + image_dir = None + if os.path.isfile(image_path): + if os.path.splitext(image_path)[-1] in valid_suffix: + image_list.append(image_path) + else: + image_dir = os.path.dirname(image_path) + with open(image_path, 'r') as f: + for line in f: + line = line.strip() + if len(line.split()) > 1: + line = line.split()[0] + image_list.append(os.path.join(image_dir, line)) + elif os.path.isdir(image_path): + image_dir = image_path + for root, dirs, files in os.walk(image_path): + for f in files: + if '.ipynb_checkpoints' in root: + continue + if os.path.splitext(f)[-1] in valid_suffix: + image_list.append(os.path.join(root, f)) + else: + raise FileNotFoundError( + '`--image_path` is not found. it should be a path of image, or a file list containing image paths, or a directory including images.' + ) + + if len(image_list) == 0: + raise RuntimeError( + 'There are not image file in `--image_path`={}'.format(image_path)) + + return image_list, image_dir diff --git a/paddlers/models/ppseg/utils/visualize.py b/paddlers/models/ppseg/utils/visualize.py new file mode 100644 index 0000000..bc2947c --- /dev/null +++ b/paddlers/models/ppseg/utils/visualize.py @@ -0,0 +1,105 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import cv2 +import numpy as np +from PIL import Image as PILImage + + +def visualize(image, result, color_map, save_dir=None, weight=0.6): + """ + Convert predict result to color image, and save added image. + + Args: + image (str): The path of origin image. + result (np.ndarray): The predict result of image. + color_map (list): The color used to save the prediction results. + save_dir (str): The directory for saving visual image. Default: None. + weight (float): The image weight of visual image, and the result weight is (1 - weight). Default: 0.6 + + Returns: + vis_result (np.ndarray): If `save_dir` is None, return the visualized result. 
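+
+    Example (a sketch; 'test.jpg' is a placeholder path and the zeros array
+    stands in for a real uint8 prediction mask):
+
+    .. code-block:: python
+
+        import numpy as np
+
+        color_map = get_color_map_list(256)  # defined later in this module
+        pred = np.zeros((480, 640), dtype='uint8')
+        vis = visualize('test.jpg', pred, color_map, weight=0.6)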
+ """ + + color_map = [color_map[i:i + 3] for i in range(0, len(color_map), 3)] + color_map = np.array(color_map).astype("uint8") + # Use OpenCV LUT for color mapping + c1 = cv2.LUT(result, color_map[:, 0]) + c2 = cv2.LUT(result, color_map[:, 1]) + c3 = cv2.LUT(result, color_map[:, 2]) + pseudo_img = np.dstack((c3, c2, c1)) + + im = cv2.imread(image) + vis_result = cv2.addWeighted(im, weight, pseudo_img, 1 - weight, 0) + + if save_dir is not None: + if not os.path.exists(save_dir): + os.makedirs(save_dir) + image_name = os.path.split(image)[-1] + out_path = os.path.join(save_dir, image_name) + cv2.imwrite(out_path, vis_result) + else: + return vis_result + + +def get_pseudo_color_map(pred, color_map=None): + """ + Get the pseudo color image. + + Args: + pred (numpy.ndarray): the origin predicted image. + color_map (list, optional): the palette color map. Default: None, + use paddleseg's default color map. + + Returns: + (numpy.ndarray): the pseduo image. + """ + pred_mask = PILImage.fromarray(pred.astype(np.uint8), mode='P') + if color_map is None: + color_map = get_color_map_list(256) + pred_mask.putpalette(color_map) + return pred_mask + + +def get_color_map_list(num_classes, custom_color=None): + """ + Returns the color map for visualizing the segmentation mask, + which can support arbitrary number of classes. + + Args: + num_classes (int): Number of classes. + custom_color (list, optional): Save images with a custom color map. Default: None, use paddleseg's default color map. + + Returns: + (list). The color map. + """ + + num_classes += 1 + color_map = num_classes * [0, 0, 0] + for i in range(0, num_classes): + j = 0 + lab = i + while lab: + color_map[i * 3] |= (((lab >> 0) & 1) << (7 - j)) + color_map[i * 3 + 1] |= (((lab >> 1) & 1) << (7 - j)) + color_map[i * 3 + 2] |= (((lab >> 2) & 1) << (7 - j)) + j += 1 + lab >>= 3 + color_map = color_map[3:] + + if custom_color: + color_map[:len(custom_color)] = custom_color + return color_map diff --git a/paddlers/requirements.txt b/paddlers/requirements.txt deleted file mode 100644 index e69de29..0000000 diff --git a/paddlers/tasks/__init__.py b/paddlers/tasks/__init__.py index 67d2c85..e66ba0d 100644 --- a/paddlers/tasks/__init__.py +++ b/paddlers/tasks/__init__.py @@ -1,2 +1,4 @@ from . import det +from .segmenter import * from .load_model import load_model + diff --git a/paddlers/tasks/base.py b/paddlers/tasks/base.py index 2e667b7..3c57ee1 100644 --- a/paddlers/tasks/base.py +++ b/paddlers/tasks/base.py @@ -300,7 +300,7 @@ class BaseModel: vdl_logdir = osp.join(save_dir, 'vdl_log') log_writer = LogWriter(vdl_logdir) - # task_id: refer to paddlex + # task_id: refer to paddlers task_id = getattr(paddlers, "task_id", "") thresh = .0001 diff --git a/paddlers/tasks/segmenter.py b/paddlers/tasks/segmenter.py new file mode 100644 index 0000000..77507c7 --- /dev/null +++ b/paddlers/tasks/segmenter.py @@ -0,0 +1,768 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/paddlers/requirements.txt b/paddlers/requirements.txt
deleted file mode 100644
index e69de29..0000000
diff --git a/paddlers/tasks/__init__.py b/paddlers/tasks/__init__.py
index 67d2c85..e66ba0d 100644
--- a/paddlers/tasks/__init__.py
+++ b/paddlers/tasks/__init__.py
@@ -1,2 +1,4 @@
 from . import det
+from .segmenter import *
 from .load_model import load_model
+
diff --git a/paddlers/tasks/base.py b/paddlers/tasks/base.py
index 2e667b7..3c57ee1 100644
--- a/paddlers/tasks/base.py
+++ b/paddlers/tasks/base.py
@@ -300,7 +300,7 @@ class BaseModel:
         vdl_logdir = osp.join(save_dir, 'vdl_log')
         log_writer = LogWriter(vdl_logdir)
 
-        # task_id: refer to paddlex
+        # task_id: refer to paddlers
         task_id = getattr(paddlers, "task_id", "")
         thresh = .0001
diff --git a/paddlers/tasks/segmenter.py b/paddlers/tasks/segmenter.py
new file mode 100644
index 0000000..77507c7
--- /dev/null
+++ b/paddlers/tasks/segmenter.py
@@ -0,0 +1,768 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import os.path as osp
+import numpy as np
+import cv2
+from collections import OrderedDict
+import paddle
+import paddle.nn.functional as F
+from paddle.static import InputSpec
+import paddlers.models.ppseg as paddleseg
+import paddlers
+from paddlers.transforms import arrange_transforms
+from paddlers.utils import get_single_card_bs, DisablePrint
+import paddlers.utils.logging as logging
+from .base import BaseModel
+from .utils import seg_metrics as metrics
+from paddlers.utils.checkpoint import seg_pretrain_weights_dict
+from paddlers.transforms import Decode, Resize
+
+__all__ = ["UNet", "DeepLabV3P", "FastSCNN", "HRNet", "BiSeNetV2"]
+
+
+class BaseSegmenter(BaseModel):
+    def __init__(self,
+                 model_name,
+                 num_classes=2,
+                 use_mixed_loss=False,
+                 **params):
+        self.init_params = locals()
+        if 'with_net' in self.init_params:
+            del self.init_params['with_net']
+        super(BaseSegmenter, self).__init__('segmenter')
+        if not hasattr(paddleseg.models, model_name):
+            raise Exception("ERROR: There's no model named {}.".format(
+                model_name))
+        self.model_name = model_name
+        self.num_classes = num_classes
+        self.use_mixed_loss = use_mixed_loss
+        self.losses = None
+        self.labels = None
+        if params.get('with_net', True):
+            params.pop('with_net', None)
+            self.net = self.build_net(**params)
+        self.find_unused_parameters = True
+
+    def build_net(self, **params):
+        # TODO: when using paddle.utils.unique_name.guard,
+        # DeepLabv3p and HRNet will raise an error.
+        net = paddleseg.models.__dict__[self.model_name](
+            num_classes=self.num_classes, **params)
+        return net
+
+    def _fix_transforms_shape(self, image_shape):
+        if hasattr(self, 'test_transforms'):
+            if self.test_transforms is not None:
+                has_resize_op = False
+                resize_op_idx = -1
+                normalize_op_idx = len(self.test_transforms.transforms)
+                for idx, op in enumerate(self.test_transforms.transforms):
+                    name = op.__class__.__name__
+                    if name == 'Normalize':
+                        normalize_op_idx = idx
+                    if 'Resize' in name:
+                        has_resize_op = True
+                        resize_op_idx = idx
+
+                if not has_resize_op:
+                    self.test_transforms.transforms.insert(
+                        normalize_op_idx, Resize(target_size=image_shape))
+                else:
+                    self.test_transforms.transforms[resize_op_idx] = Resize(
+                        target_size=image_shape)
+
+    def _get_test_inputs(self, image_shape):
+        if image_shape is not None:
+            if len(image_shape) == 2:
+                image_shape = [1, 3] + image_shape
+            self._fix_transforms_shape(image_shape[-2:])
+        else:
+            image_shape = [None, 3, -1, -1]
+        self.fixed_input_shape = image_shape
+        input_spec = [
+            InputSpec(
+                shape=image_shape, name='image', dtype='float32')
+        ]
+        return input_spec
+
+    def run(self, net, inputs, mode):
+        net_out = net(inputs[0])
+        logit = net_out[0]
+        outputs = OrderedDict()
+        if mode == 'test':
+            origin_shape = inputs[1]
+            if self.status == 'Infer':
+                label_map_list, score_map_list = self._postprocess(
+                    net_out, origin_shape, transforms=inputs[2])
+            else:
+                logit_list = self._postprocess(
+                    logit, origin_shape, transforms=inputs[2])
+                label_map_list = []
+                score_map_list = []
+                for logit in logit_list:
+                    logit = paddle.transpose(logit, perm=[0, 2, 3, 1])  # NHWC
+                    label_map_list.append(
+                        paddle.argmax(
+                            logit, axis=-1, keepdim=False, dtype='int32')
+                        .squeeze().numpy())
+                    score_map_list.append(
+                        F.softmax(
+                            logit, axis=-1).squeeze().numpy().astype(
+                                'float32'))
+            outputs['label_map'] = label_map_list
+            outputs['score_map'] = score_map_list
+
+        if mode == 'eval':
+            if self.status == 'Infer':
+                pred = paddle.unsqueeze(net_out[0], axis=1)  # NCHW
+            else:
+                pred = paddle.argmax(
+                    logit, axis=1, keepdim=True, dtype='int32')
+            label = inputs[1]
+            origin_shape = [label.shape[-2:]]
+            pred = self._postprocess(
+                pred, origin_shape, transforms=inputs[2])[0]  # NCHW
+            intersect_area, pred_area, label_area = paddleseg.utils.metrics.calculate_area(
+                pred, label, self.num_classes)
+            outputs['intersect_area'] = intersect_area
+            outputs['pred_area'] = pred_area
+            outputs['label_area'] = label_area
+            outputs['conf_mat'] = metrics.confusion_matrix(pred, label,
+                                                           self.num_classes)
+        if mode == 'train':
+            loss_list = metrics.loss_computation(
+                logits_list=net_out, labels=inputs[1], losses=self.losses)
+            loss = sum(loss_list)
+            outputs['loss'] = loss
+        return outputs
+
+    def default_loss(self):
+        if isinstance(self.use_mixed_loss, bool):
+            if self.use_mixed_loss:
+                losses = [
+                    paddleseg.models.CrossEntropyLoss(),
+                    paddleseg.models.LovaszSoftmaxLoss()
+                ]
+                coef = [.8, .2]
+                loss_type = [
+                    paddleseg.models.MixedLoss(
+                        losses=losses, coef=coef),
+                ]
+            else:
+                loss_type = [paddleseg.models.CrossEntropyLoss()]
+        else:
+            losses, coef = list(zip(*self.use_mixed_loss))
+            if not set(losses).issubset(
+                ['CrossEntropyLoss', 'DiceLoss', 'LovaszSoftmaxLoss']):
+                raise ValueError(
+                    "Only 'CrossEntropyLoss', 'DiceLoss', 'LovaszSoftmaxLoss' are supported."
+                )
+            losses = [getattr(paddleseg.models, loss)() for loss in losses]
+            loss_type = [
+                paddleseg.models.MixedLoss(
+                    losses=losses, coef=list(coef))
+            ]
+        if self.model_name == 'FastSCNN':
+            loss_type *= 2
+            loss_coef = [1.0, 0.4]
+        elif self.model_name == 'BiSeNetV2':
+            loss_type *= 5
+            loss_coef = [1.0] * 5
+        else:
+            loss_coef = [1.0]
+        losses = {'types': loss_type, 'coef': loss_coef}
+        return losses
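+
+    # Note on `use_mixed_loss` (illustrative values, not defaults):
+    # - False     -> plain CrossEntropyLoss
+    # - True      -> MixedLoss(CrossEntropyLoss, LovaszSoftmaxLoss), coef [.8, .2]
+    # - list form -> explicit (name, coef) pairs, e.g.:
+    #       model = UNet(num_classes=2,
+    #                    use_mixed_loss=[('CrossEntropyLoss', .8),
+    #                                    ('DiceLoss', .2)])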
+
+    def default_optimizer(self,
+                          parameters,
+                          learning_rate,
+                          num_epochs,
+                          num_steps_each_epoch,
+                          lr_decay_power=0.9):
+        decay_step = num_epochs * num_steps_each_epoch
+        lr_scheduler = paddle.optimizer.lr.PolynomialDecay(
+            learning_rate, decay_step, end_lr=0, power=lr_decay_power)
+        optimizer = paddle.optimizer.Momentum(
+            learning_rate=lr_scheduler,
+            parameters=parameters,
+            momentum=0.9,
+            weight_decay=4e-5)
+        return optimizer
+
+    def train(self,
+              num_epochs,
+              train_dataset,
+              train_batch_size=2,
+              eval_dataset=None,
+              optimizer=None,
+              save_interval_epochs=1,
+              log_interval_steps=2,
+              save_dir='output',
+              pretrain_weights='CITYSCAPES',
+              learning_rate=0.01,
+              lr_decay_power=0.9,
+              early_stop=False,
+              early_stop_patience=5,
+              use_vdl=True,
+              resume_checkpoint=None):
+        """
+        Train the model.
+        Args:
+            num_epochs(int): The number of epochs.
+            train_dataset(paddlers.dataset): Training dataset.
+            train_batch_size(int, optional): Total batch size among all cards used in training. Defaults to 2.
+            eval_dataset(paddlers.dataset, optional):
+                Evaluation dataset. If None, the model will not be evaluated during the training process. Defaults to None.
+            optimizer(paddle.optimizer.Optimizer or None, optional):
+                Optimizer used in training. If None, a default optimizer is used. Defaults to None.
+            save_interval_epochs(int, optional): Epoch interval for saving the model. Defaults to 1.
+            log_interval_steps(int, optional): Step interval for printing training information. Defaults to 2.
+            save_dir(str, optional): Directory to save the model. Defaults to 'output'.
+            pretrain_weights(str or None, optional):
+                None or name/path of pretrained weights. If None, no pretrained weights will be loaded. Defaults to 'CITYSCAPES'.
+            learning_rate(float, optional): Learning rate for training. Defaults to 0.01.
+            lr_decay_power(float, optional): Learning rate decay power. Defaults to .9.
+            early_stop(bool, optional): Whether to adopt the early stop strategy. Defaults to False.
+            early_stop_patience(int, optional): Early stop patience. Defaults to 5.
+            use_vdl(bool, optional): Whether to use VisualDL to monitor the training process. Defaults to True.
+            resume_checkpoint(str or None, optional): The path of the checkpoint to resume training from.
+                If None, no training checkpoint will be resumed. At most one of `resume_checkpoint` and
+                `pretrain_weights` can be set simultaneously. Defaults to None.
+
+        """
+        if self.status == 'Infer':
+            logging.error(
+                "Exported inference model does not support training.",
+                exit=True)
+        if pretrain_weights is not None and resume_checkpoint is not None:
+            logging.error(
+                "pretrain_weights and resume_checkpoint cannot be set simultaneously.",
+                exit=True)
+        self.labels = train_dataset.labels
+        if self.losses is None:
+            self.losses = self.default_loss()
+
+        if optimizer is None:
+            num_steps_each_epoch = train_dataset.num_samples // train_batch_size
+            self.optimizer = self.default_optimizer(
+                self.net.parameters(), learning_rate, num_epochs,
+                num_steps_each_epoch, lr_decay_power)
+        else:
+            self.optimizer = optimizer
+
+        if pretrain_weights is not None and not osp.exists(pretrain_weights):
+            if pretrain_weights not in seg_pretrain_weights_dict[
+                    self.model_name]:
+                logging.warning(
+                    "Path of pretrain_weights('{}') does not exist!".format(
+                        pretrain_weights))
+                logging.warning("pretrain_weights is forcibly set to '{}'. "
+                                "If you don't want to use pretrained weights, "
+                                "set pretrain_weights to None.".format(
+                                    seg_pretrain_weights_dict[self.model_name][
+                                        0]))
+                pretrain_weights = seg_pretrain_weights_dict[self.model_name][
+                    0]
+        elif pretrain_weights is not None and osp.exists(pretrain_weights):
+            if osp.splitext(pretrain_weights)[-1] != '.pdparams':
+                logging.error(
+                    "Invalid pretrain weights. Please specify a '.pdparams' file.",
+                    exit=True)
+        pretrained_dir = osp.join(save_dir, 'pretrain')
+        is_backbone_weights = pretrain_weights == 'IMAGENET'
+        self.net_initialize(
+            pretrain_weights=pretrain_weights,
+            save_dir=pretrained_dir,
+            resume_checkpoint=resume_checkpoint,
+            is_backbone_weights=is_backbone_weights)
+
+        self.train_loop(
+            num_epochs=num_epochs,
+            train_dataset=train_dataset,
+            train_batch_size=train_batch_size,
+            eval_dataset=eval_dataset,
+            save_interval_epochs=save_interval_epochs,
+            log_interval_steps=log_interval_steps,
+            save_dir=save_dir,
+            early_stop=early_stop,
+            early_stop_patience=early_stop_patience,
+            use_vdl=use_vdl)
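+
+    # Custom-optimizer sketch (illustrative; by default Momentum with polynomial
+    # decay is built by `default_optimizer` above, and `train_ds` is a placeholder):
+    #
+    #     import paddle
+    #     optim = paddle.optimizer.AdamW(
+    #         learning_rate=1e-4, parameters=model.net.parameters())
+    #     model.train(num_epochs=10, train_dataset=train_ds, optimizer=optim)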
+
+    def quant_aware_train(self,
+                          num_epochs,
+                          train_dataset,
+                          train_batch_size=2,
+                          eval_dataset=None,
+                          optimizer=None,
+                          save_interval_epochs=1,
+                          log_interval_steps=2,
+                          save_dir='output',
+                          learning_rate=0.0001,
+                          lr_decay_power=0.9,
+                          early_stop=False,
+                          early_stop_patience=5,
+                          use_vdl=True,
+                          resume_checkpoint=None,
+                          quant_config=None):
+        """
+        Quantization-aware training.
+        Args:
+            num_epochs(int): The number of epochs.
+            train_dataset(paddlers.dataset): Training dataset.
+            train_batch_size(int, optional): Total batch size among all cards used in training. Defaults to 2.
+            eval_dataset(paddlers.dataset, optional):
+                Evaluation dataset. If None, the model will not be evaluated during the training process. Defaults to None.
+            optimizer(paddle.optimizer.Optimizer or None, optional):
+                Optimizer used in training. If None, a default optimizer is used. Defaults to None.
+            save_interval_epochs(int, optional): Epoch interval for saving the model. Defaults to 1.
+            log_interval_steps(int, optional): Step interval for printing training information. Defaults to 2.
+            save_dir(str, optional): Directory to save the model. Defaults to 'output'.
+            learning_rate(float, optional): Learning rate for training. Defaults to 0.0001.
+            lr_decay_power(float, optional): Learning rate decay power. Defaults to .9.
+            early_stop(bool, optional): Whether to adopt the early stop strategy. Defaults to False.
+            early_stop_patience(int, optional): Early stop patience. Defaults to 5.
+            use_vdl(bool, optional): Whether to use VisualDL to monitor the training process. Defaults to True.
+            quant_config(dict or None, optional): Quantization configuration. If None, a default rule-of-thumb
+                configuration will be used. Defaults to None.
+            resume_checkpoint(str or None, optional): The path of the checkpoint to resume quantization-aware training
+                from. If None, no training checkpoint will be resumed. Defaults to None.
+
+        """
+        self._prepare_qat(quant_config)
+        self.train(
+            num_epochs=num_epochs,
+            train_dataset=train_dataset,
+            train_batch_size=train_batch_size,
+            eval_dataset=eval_dataset,
+            optimizer=optimizer,
+            save_interval_epochs=save_interval_epochs,
+            log_interval_steps=log_interval_steps,
+            save_dir=save_dir,
+            pretrain_weights=None,
+            learning_rate=learning_rate,
+            lr_decay_power=lr_decay_power,
+            early_stop=early_stop,
+            early_stop_patience=early_stop_patience,
+            use_vdl=use_vdl,
+            resume_checkpoint=resume_checkpoint)
+
+    def evaluate(self, eval_dataset, batch_size=1, return_details=False):
+        """
+        Evaluate the model.
+        Args:
+            eval_dataset(paddlers.dataset): Evaluation dataset.
+            batch_size(int, optional): Total batch size among all cards used for evaluation. Defaults to 1.
+            return_details(bool, optional): Whether to return evaluation details. Defaults to False.
+
+        Returns:
+            collections.OrderedDict with key-value pairs:
+                {"miou": `mean intersection over union`,
+                 "category_iou": `category-wise mean intersection over union`,
+                 "oacc": `overall accuracy`,
+                 "category_acc": `category-wise accuracy`,
+                 "kappa": `kappa coefficient`,
+                 "category_F1-score": `F1 score`}.
+
+        """
+        arrange_transforms(
+            model_type=self.model_type,
+            transforms=eval_dataset.transforms,
+            mode='eval')
+
+        self.net.eval()
+        nranks = paddle.distributed.get_world_size()
+        local_rank = paddle.distributed.get_rank()
+        if nranks > 1:
+            # Initialize the parallel environment if it has not been done.
+            if not paddle.distributed.parallel.parallel_helper._is_parallel_ctx_initialized(
+            ):
+                paddle.distributed.init_parallel_env()
+
+        batch_size_each_card = get_single_card_bs(batch_size)
+        if batch_size_each_card > 1:
+            batch_size_each_card = 1
+            batch_size = batch_size_each_card * paddlers.env_info['num']
+            logging.warning(
+                "Segmenter only supports batch_size=1 for each gpu/cpu card " \
+                "during evaluation, so batch_size " \
+                "is forcibly set to {}.".format(batch_size))
+        self.eval_data_loader = self.build_data_loader(
+            eval_dataset, batch_size=batch_size, mode='eval')
+
+        intersect_area_all = 0
+        pred_area_all = 0
+        label_area_all = 0
+        conf_mat_all = []
+        logging.info(
+            "Start to evaluate(total_samples={}, total_steps={})...".format(
+                eval_dataset.num_samples,
+                math.ceil(eval_dataset.num_samples * 1.0 / batch_size)))
+        with paddle.no_grad():
+            for step, data in enumerate(self.eval_data_loader):
+                data.append(eval_dataset.transforms.transforms)
+                outputs = self.run(self.net, data, 'eval')
+                pred_area = outputs['pred_area']
+                label_area = outputs['label_area']
+                intersect_area = outputs['intersect_area']
+                conf_mat = outputs['conf_mat']
+
+                # Gather from all ranks
+                if nranks > 1:
+                    intersect_area_list = []
+                    pred_area_list = []
+                    label_area_list = []
+                    conf_mat_list = []
+                    paddle.distributed.all_gather(intersect_area_list,
+                                                  intersect_area)
+                    paddle.distributed.all_gather(pred_area_list, pred_area)
+                    paddle.distributed.all_gather(label_area_list, label_area)
+                    paddle.distributed.all_gather(conf_mat_list, conf_mat)
+
+                    # Some samples are duplicated on the last iteration
+                    # and should be excluded.
+                    if (step + 1) * nranks > len(eval_dataset):
+                        valid = len(eval_dataset) - step * nranks
+                        intersect_area_list = intersect_area_list[:valid]
+                        pred_area_list = pred_area_list[:valid]
+                        label_area_list = label_area_list[:valid]
+                        conf_mat_list = conf_mat_list[:valid]
+
+                    intersect_area_all += sum(intersect_area_list)
+                    pred_area_all += sum(pred_area_list)
+                    label_area_all += sum(label_area_list)
+                    conf_mat_all.extend(conf_mat_list)
+
+                else:
+                    intersect_area_all = intersect_area_all + intersect_area
+                    pred_area_all = pred_area_all + pred_area
+                    label_area_all = label_area_all + label_area
+                    conf_mat_all.append(conf_mat)
+        class_iou, miou = paddleseg.utils.metrics.mean_iou(
+            intersect_area_all, pred_area_all, label_area_all)
+        # TODO: confirm whether to report oacc or macc.
+        class_acc, oacc = paddleseg.utils.metrics.accuracy(intersect_area_all,
+                                                           pred_area_all)
+        kappa = paddleseg.utils.metrics.kappa(intersect_area_all,
+                                              pred_area_all, label_area_all)
+        category_f1score = metrics.f1_score(intersect_area_all, pred_area_all,
+                                            label_area_all)
+        eval_metrics = OrderedDict(
+            zip([
+                'miou', 'category_iou', 'oacc', 'category_acc', 'kappa',
+                'category_F1-score'
+            ], [miou, class_iou, oacc, class_acc, kappa, category_f1score]))
+
+        if return_details:
+            conf_mat = sum(conf_mat_all)
+            eval_details = {'confusion_matrix': conf_mat.tolist()}
+            return eval_metrics, eval_details
+        return eval_metrics
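+
+    # Usage sketch (illustrative): evaluate on a validation set and read out
+    # the headline metrics from the returned OrderedDict.
+    #
+    #     metrics_dict = model.evaluate(eval_dataset, batch_size=1)
+    #     print(metrics_dict['miou'], metrics_dict['kappa'])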
+
+    def predict(self, img_file, transforms=None):
+        """
+        Do inference.
+        Args:
+            img_file(List[np.ndarray or str], str or np.ndarray):
+                Image path or decoded image data in BGR format, which can also be a list,
+                meaning that all images are to be predicted as a mini-batch.
+            transforms(paddlers.transforms.Compose or None, optional):
+                Transforms for inputs. If None, the transforms for the evaluation process will be used. Defaults to None.
+
+        Returns:
+            If img_file is a string or np.ndarray, the result is a dict with key-value pairs:
+            {"label_map": `label map`, "score_map": `score map`}.
+            If img_file is a list, the result is a list composed of dicts with the corresponding fields:
+            label_map(np.ndarray): the predicted label map (HW)
+            score_map(np.ndarray): the prediction score map (HWC)
+
+        """
+        if transforms is None and not hasattr(self, 'test_transforms'):
+            raise Exception(
+                "transforms need to be defined, but none are available.")
+        if transforms is None:
+            transforms = self.test_transforms
+        if isinstance(img_file, (str, np.ndarray)):
+            images = [img_file]
+        else:
+            images = img_file
+        batch_im, batch_origin_shape = self._preprocess(images, transforms,
+                                                        self.model_type)
+        self.net.eval()
+        data = (batch_im, batch_origin_shape, transforms.transforms)
+        outputs = self.run(self.net, data, 'test')
+        label_map_list = outputs['label_map']
+        score_map_list = outputs['score_map']
+        if isinstance(img_file, list):
+            prediction = [{
+                'label_map': l,
+                'score_map': s
+            } for l, s in zip(label_map_list, score_map_list)]
+        else:
+            prediction = {
+                'label_map': label_map_list[0],
+                'score_map': score_map_list[0]
+            }
+        return prediction
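+
+    # Usage sketch (illustrative; 'demo.jpg' is a placeholder path):
+    #
+    #     result = model.predict('demo.jpg')
+    #     label_map = result['label_map']    # (H, W) int32 class ids
+    #     score_map = result['score_map']    # (H, W, C) float32 probabilities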
+
+    def _preprocess(self, images, transforms, to_tensor=True):
+        arrange_transforms(
+            model_type=self.model_type, transforms=transforms, mode='test')
+        batch_im = list()
+        batch_ori_shape = list()
+        for im in images:
+            sample = {'image': im}
+            if isinstance(sample['image'], str):
+                sample = Decode(to_rgb=False)(sample)
+            ori_shape = sample['image'].shape[:2]
+            im = transforms(sample)[0]
+            batch_im.append(im)
+            batch_ori_shape.append(ori_shape)
+        if to_tensor:
+            batch_im = paddle.to_tensor(batch_im)
+        else:
+            batch_im = np.asarray(batch_im)
+
+        return batch_im, batch_ori_shape
+
+    @staticmethod
+    def get_transforms_shape_info(batch_ori_shape, transforms):
+        batch_restore_list = list()
+        for ori_shape in batch_ori_shape:
+            restore_list = list()
+            h, w = ori_shape[0], ori_shape[1]
+            for op in transforms:
+                if op.__class__.__name__ == 'Resize':
+                    restore_list.append(('resize', (h, w)))
+                    h, w = op.target_size
+                elif op.__class__.__name__ == 'ResizeByShort':
+                    restore_list.append(('resize', (h, w)))
+                    im_short_size = min(h, w)
+                    im_long_size = max(h, w)
+                    scale = float(op.short_size) / float(im_short_size)
+                    if 0 < op.max_size < np.round(scale * im_long_size):
+                        scale = float(op.max_size) / float(im_long_size)
+                    h = int(round(h * scale))
+                    w = int(round(w * scale))
+                elif op.__class__.__name__ == 'ResizeByLong':
+                    restore_list.append(('resize', (h, w)))
+                    im_long_size = max(h, w)
+                    scale = float(op.long_size) / float(im_long_size)
+                    h = int(round(h * scale))
+                    w = int(round(w * scale))
+                elif op.__class__.__name__ == 'Padding':
+                    if op.target_size:
+                        target_h, target_w = op.target_size
+                    else:
+                        target_h = int(
+                            (np.ceil(h / op.size_divisor) * op.size_divisor))
+                        target_w = int(
+                            (np.ceil(w / op.size_divisor) * op.size_divisor))
+
+                    if op.pad_mode == -1:
+                        offsets = op.offsets
+                    elif op.pad_mode == 0:
+                        offsets = [0, 0]
+                    elif op.pad_mode == 1:
+                        offsets = [(target_h - h) // 2, (target_w - w) // 2]
+                    else:
+                        offsets = [target_h - h, target_w - w]
+                    restore_list.append(('padding', (h, w), offsets))
+                    h, w = target_h, target_w
+
+            batch_restore_list.append(restore_list)
+        return batch_restore_list
+
+    def _postprocess(self, batch_pred, batch_origin_shape, transforms):
+        batch_restore_list = BaseSegmenter.get_transforms_shape_info(
+            batch_origin_shape, transforms)
+        if isinstance(batch_pred, (tuple, list)) and self.status == 'Infer':
+            return self._infer_postprocess(
+                batch_label_map=batch_pred[0],
+                batch_score_map=batch_pred[1],
+                batch_restore_list=batch_restore_list)
+        results = []
+        if batch_pred.dtype == paddle.float32:
+            mode = 'bilinear'
+        else:
+            mode = 'nearest'
+        for pred, restore_list in zip(batch_pred, batch_restore_list):
+            pred = paddle.unsqueeze(pred, axis=0)
+            for item in restore_list[::-1]:
+                h, w = item[1][0], item[1][1]
+                if item[0] == 'resize':
+                    pred = F.interpolate(
+                        pred, (h, w), mode=mode, data_format='NCHW')
+                elif item[0] == 'padding':
+                    x, y = item[2]
+                    pred = pred[:, :, y:y + h, x:x + w]
+                else:
+                    pass
+            results.append(pred)
+        return results
+
+    def _infer_postprocess(self, batch_label_map, batch_score_map,
+                           batch_restore_list):
+        label_maps = []
+        score_maps = []
+        for label_map, score_map, restore_list in zip(
+                batch_label_map, batch_score_map, batch_restore_list):
+            if not isinstance(label_map, np.ndarray):
+                label_map = paddle.unsqueeze(label_map, axis=[0, 3])
+                score_map = paddle.unsqueeze(score_map, axis=0)
+            for item in restore_list[::-1]:
+                h, w = item[1][0], item[1][1]
+                if item[0] == 'resize':
+                    if isinstance(label_map, np.ndarray):
+                        label_map = cv2.resize(
+                            label_map, (w, h),
+                            interpolation=cv2.INTER_NEAREST)
+                        score_map = cv2.resize(
+                            score_map, (w, h), interpolation=cv2.INTER_LINEAR)
+                    else:
+                        label_map = F.interpolate(
+                            label_map, (h, w),
+                            mode='nearest',
+                            data_format='NHWC')
+                        score_map = F.interpolate(
+                            score_map, (h, w),
+                            mode='bilinear',
+                            data_format='NHWC')
+                elif item[0] == 'padding':
+                    x, y = item[2]
+                    if isinstance(label_map, np.ndarray):
+                        label_map = label_map[..., y:y + h, x:x + w]
+                        score_map = score_map[..., y:y + h, x:x + w]
+                    else:
+                        label_map = label_map[:, :, y:y + h, x:x + w]
+                        score_map = score_map[:, :, y:y + h, x:x + w]
+                else:
+                    pass
+            label_map = label_map.squeeze()
+            score_map = score_map.squeeze()
+            if not isinstance(label_map, np.ndarray):
+                label_map = label_map.numpy()
+                score_map = score_map.numpy()
+            label_maps.append(label_map)
+            score_maps.append(score_map)
+        return label_maps, score_maps
+
+
+class UNet(BaseSegmenter):
+    def __init__(self,
+                 num_classes=2,
+                 use_mixed_loss=False,
+                 use_deconv=False,
+                 align_corners=False,
+                 **params):
+        params.update({
+            'use_deconv': use_deconv,
+            'align_corners': align_corners
+        })
+        super(UNet, self).__init__(
+            model_name='UNet',
+            num_classes=num_classes,
+            use_mixed_loss=use_mixed_loss,
+            **params)
+
+
+class DeepLabV3P(BaseSegmenter):
+    def __init__(self,
+                 num_classes=2,
+                 backbone='ResNet50_vd',
+                 use_mixed_loss=False,
+                 output_stride=8,
+                 backbone_indices=(0, 3),
+                 aspp_ratios=(1, 12, 24, 36),
+                 aspp_out_channels=256,
+                 align_corners=False,
+                 **params):
+        self.backbone_name = backbone
+        if backbone not in ['ResNet50_vd', 'ResNet101_vd']:
+            raise ValueError(
+                "backbone: {} is not supported. Please choose one of "
+                "('ResNet50_vd', 'ResNet101_vd').".format(backbone))
+        if params.get('with_net', True):
+            with DisablePrint():
+                backbone = getattr(paddleseg.models, backbone)(
+                    output_stride=output_stride)
+        else:
+            backbone = None
+        params.update({
+            'backbone': backbone,
+            'backbone_indices': backbone_indices,
+            'aspp_ratios': aspp_ratios,
+            'aspp_out_channels': aspp_out_channels,
+            'align_corners': align_corners
+        })
+        super(DeepLabV3P, self).__init__(
+            model_name='DeepLabV3P',
+            num_classes=num_classes,
+            use_mixed_loss=use_mixed_loss,
+            **params)
+
+
+class FastSCNN(BaseSegmenter):
+    def __init__(self,
+                 num_classes=2,
+                 use_mixed_loss=False,
+                 align_corners=False,
+                 **params):
+        params.update({'align_corners': align_corners})
+        super(FastSCNN, self).__init__(
+            model_name='FastSCNN',
+            num_classes=num_classes,
+            use_mixed_loss=use_mixed_loss,
+            **params)
+
+
+class HRNet(BaseSegmenter):
+    def __init__(self,
+                 num_classes=2,
+                 width=48,
+                 use_mixed_loss=False,
+                 align_corners=False,
+                 **params):
+        if width not in (18, 48):
+            raise ValueError(
+                "width={} is not supported, please choose from [18, 48].".
+                format(width))
+        self.backbone_name = 'HRNet_W{}'.format(width)
+        if params.get('with_net', True):
+            with DisablePrint():
+                backbone = getattr(paddleseg.models, self.backbone_name)(
+                    align_corners=align_corners)
+        else:
+            backbone = None
+
+        params.update({'backbone': backbone, 'align_corners': align_corners})
+        super(HRNet, self).__init__(
+            model_name='FCN',
+            num_classes=num_classes,
+            use_mixed_loss=use_mixed_loss,
+            **params)
+        self.model_name = 'HRNet'
+
+
+class BiSeNetV2(BaseSegmenter):
+    def __init__(self,
+                 num_classes=2,
+                 use_mixed_loss=False,
+                 align_corners=False,
+                 **params):
+        params.update({'align_corners': align_corners})
+        super(BiSeNetV2, self).__init__(
+            model_name='BiSeNetV2',
+            num_classes=num_classes,
+            use_mixed_loss=use_mixed_loss,
+            **params)
diff --git a/paddlers/transforms/operators.py b/paddlers/transforms/operators.py
index f7410f2..415fd2c 100644
--- a/paddlers/transforms/operators.py
+++ b/paddlers/transforms/operators.py
@@ -867,7 +867,7 @@ class RandomExpand(Transform):
         label_padding_value(int, optional): Filling value for the mask. Defaults to 255.
 
     See Also:
-        paddlex.transforms.Padding
+        paddlers.transforms.Padding
     """
 
     def __init__(self,
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..804ab1b
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,16 @@
+tqdm
+scipy
+colorama
+cython
+pycocotools
+visualdl >= 2.1.1
+paddleslim == 2.2.1
+shapely
+paddlepaddle-gpu >= 2.2.0
+opencv-python
+scikit-learn==0.20.3
+lap
+motmetrics
+matplotlib
+chardet
+openpyxl
diff --git a/paddlers/tutorials/train/ppyolo.py b/tutorials/train/ppyolo.py
similarity index 100%
rename from paddlers/tutorials/train/ppyolo.py
rename to tutorials/train/ppyolo.py
diff --git a/tutorials/train/semantic_segmentation/deeplabv3p_resnet50_vd.py b/tutorials/train/semantic_segmentation/deeplabv3p_resnet50_vd.py
new file mode 100644
index 0000000..4c46255
--- /dev/null
+++ b/tutorials/train/semantic_segmentation/deeplabv3p_resnet50_vd.py
@@ -0,0 +1,58 @@
+import sys
+
+sys.path.append("/mnt/chulutao/PaddleRS")
+
+import paddlers as pdrs
+from paddlers import transforms as T
+
+# Download and decompress the optic disc segmentation dataset
+optic_dataset = 'https://bj.bcebos.com/paddlex/datasets/optic_disc_seg.tar.gz'
+pdrs.utils.download_and_decompress(optic_dataset, path='./')
+
+# Define the transforms for training and validation
+# API reference: https://github.com/PaddlePaddle/paddlers/blob/develop/docs/apis/transforms/transforms.md
+train_transforms = T.Compose([
+    T.Resize(target_size=512),
+    T.RandomHorizontalFlip(),
+    T.Normalize(
+        mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
+])
+
+eval_transforms = T.Compose([
+    T.Resize(target_size=512),
+    T.Normalize(
+        mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
+])
+
+# Define the datasets used for training and validation
+# API reference: https://github.com/PaddlePaddle/paddlers/blob/develop/docs/apis/datasets.md
+train_dataset = pdrs.datasets.SegDataset(
+    data_dir='optic_disc_seg',
+    file_list='optic_disc_seg/train_list.txt',
+    label_list='optic_disc_seg/labels.txt',
+    transforms=train_transforms,
+    num_workers=0,
+    shuffle=True)
+
+eval_dataset = pdrs.datasets.SegDataset(
+    data_dir='optic_disc_seg',
+    file_list='optic_disc_seg/val_list.txt',
+    label_list='optic_disc_seg/labels.txt',
+    transforms=eval_transforms,
+    num_workers=0,
+    shuffle=False)
+
+# Initialize the model and start training
+# Training metrics can be inspected with VisualDL, see https://github.com/PaddlePaddle/paddlers/blob/develop/docs/visualdl.md
+num_classes = len(train_dataset.labels)
+model = pdrs.tasks.DeepLabV3P(num_classes=num_classes, backbone='ResNet50_vd')
+
+# API reference: https://github.com/PaddlePaddle/paddlers/blob/develop/docs/apis/models/semantic_segmentation.md
+# Parameter descriptions and tuning notes: https://github.com/PaddlePaddle/paddlers/blob/develop/docs/parameters.md
+model.train(
+    num_epochs=10,
+    train_dataset=train_dataset,
+    train_batch_size=4,
+    eval_dataset=eval_dataset,
+    learning_rate=0.01,
+    save_dir='output/deeplabv3p_r50vd')
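+
+# Optional follow-up (illustrative sketch, not part of the tutorial): after
+# training, run inference on an image and render a pseudo-color mask. The
+# image path below is a hypothetical placeholder.
+#
+#     from paddlers.models.ppseg.utils.visualize import get_pseudo_color_map
+#     result = model.predict('optic_disc_seg/some_image.jpg')
+#     get_pseudo_color_map(result['label_map']).save('pred.png')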