add segmentation task. optimize directory structure

3 years ago · 691e5c438d
parent 16c85bf3c2
commit 691e5c438d
134 changed files with 26493 additions and 2 deletions
--- a/docs/README.md
+++ b/docs/README.md
@@ -0,0 +1 @@
+PaddleSeg commit fec42fd869b6f796c74cd510671595e3512bc8e9
--- a/paddlers/datasets/__init__.py
+++ b/paddlers/datasets/__init__.py
@@ -1 +1,2 @@
 from .voc import VOCDetection
+from .seg_dataset import SegDataset
--- a/paddlers/datasets/seg_dataset.py
+++ b/paddlers/datasets/seg_dataset.py
@@ -0,0 +1,91 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os.path as osp
+import copy
+
+from paddle.io import Dataset
+from paddlers.utils import logging, get_num_workers, get_encoding, path_normalization, is_pic
+
+
+class SegDataset(Dataset):
+    """Semantic segmentation dataset that pairs every image with its label mask.
+
+    The sample list is read once at construction time; each entry stores the
+    full paths of an image and its mask, and the transforms are applied lazily
+    in `__getitem__`.
+
+    Args:
+        data_dir (str): Root directory of the dataset.
+        file_list (str): Path of a text file describing the samples. Every
+            non-blank line holds an image path and a label path, both relative
+            to `data_dir`, separated by whitespace.
+        label_list (str): Path of a text file with one class name per line.
+            Defaults to None.
+        transforms (paddlers.transforms): Preprocessing/augmentation operators
+            applied to every sample.
+        num_workers (int|str): Number of threads or processes used while
+            preprocessing samples. Defaults to 'auto'.
+        shuffle (bool): Whether the samples should be shuffled. The flag is only
+            stored on the instance here. Defaults to False.
+
+    Raises:
+        Exception: If a non-blank line of `file_list` does not consist of
+            exactly two whitespace-separated paths.
+        IOError: If a listed image or label file does not exist.
+    """
+
+    def __init__(self,
+                 data_dir,
+                 file_list,
+                 label_list=None,
+                 transforms=None,
+                 num_workers='auto',
+                 shuffle=False):
+        super(SegDataset, self).__init__()
+        # Deep copy so the dataset owns its own transforms object.
+        self.transforms = copy.deepcopy(transforms)
+        # TODO batch padding
+        self.batch_transforms = None
+        self.num_workers = get_num_workers(num_workers)
+        self.shuffle = shuffle
+        self.file_list = list()
+        self.labels = list()
+
+        # TODO: when label_list is not None, point the user to the dataset
+        # analysis step to generate label_list.
+        # Do not analyse the label file here.
+        if label_list is not None:
+            with open(label_list, encoding=get_encoding(label_list)) as f:
+                for line in f:
+                    item = line.strip()
+                    self.labels.append(item)
+        with open(file_list, encoding=get_encoding(file_list)) as f:
+            for line in f:
+                items = line.strip().split()
+                if not items:
+                    # Blank line (for instance a trailing newline): no sample.
+                    continue
+                if len(items) > 2:
+                    raise Exception(
+                        "A space is defined as the delimiter to separate the image and label path, " \
+                        "so the space cannot be in the image or label path, but the line[{}] of " \
+                        " file_list[{}] has a space in the image or label path.".format(line, file_list))
+                if len(items) < 2:
+                    # Without this check items[1] below fails with a bare IndexError.
+                    raise Exception(
+                        "Each line of file_list must contain an image path and a label path "
+                        "separated by a space, but the line[{}] of file_list[{}] has only one path."
+                        .format(line.strip(), file_list))
+                items[0] = path_normalization(items[0])
+                items[1] = path_normalization(items[1])
+                # Entries for which is_pic() is false on either path are skipped.
+                if not is_pic(items[0]) or not is_pic(items[1]):
+                    continue
+                full_path_im = osp.join(data_dir, items[0])
+                full_path_label = osp.join(data_dir, items[1])
+                if not osp.exists(full_path_im):
+                    raise IOError('Image file {} does not exist!'.format(
+                        full_path_im))
+                if not osp.exists(full_path_label):
+                    raise IOError('Label file {} does not exist!'.format(
+                        full_path_label))
+                self.file_list.append({
+                    'image': full_path_im,
+                    'mask': full_path_label
+                })
+        self.num_samples = len(self.file_list)
+        logging.info("{} samples in file {}".format(
+            len(self.file_list), file_list))
+
+    def __getitem__(self, idx):
+        """Return the transformed sample at `idx`.
+
+        The stored {'image', 'mask'} record is deep-copied before being handed
+        to the transforms, so the cached entry is not passed to them directly.
+        """
+        sample = copy.deepcopy(self.file_list[idx])
+        outputs = self.transforms(sample)
+        return outputs
+
+    def __len__(self):
+        """Return the number of samples loaded from `file_list`."""
+        return len(self.file_list)
--- a/paddlers/models/ppseg/__init__.py
+++ b/paddlers/models/ppseg/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import models, datasets, transforms
+
+# Package version string (set to 'develop' rather than a release number).
+__version__ = 'develop'
--- a/paddlers/models/ppseg/core/__init__.py
+++ b/paddlers/models/ppseg/core/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .train import train
+from .val import evaluate
+from .predict import predict
+from . import infer
+
+# Names exported by a star-import of this package. The `infer` submodule is
+# imported above but is not listed here.
+__all__ = ['train', 'evaluate', 'predict']
--- a/paddlers/models/ppseg/core/infer.py
+++ b/paddlers/models/ppseg/core/infer.py
@@ -0,0 +1,309 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections.abc
+from itertools import combinations
+
+import numpy as np
+import cv2
+import paddle
+import paddle.nn.functional as F
+
+
+def get_reverse_list(ori_shape, transforms):
+    """
+    Record the shape changes made by a transform pipeline so they can be undone.
+
+    The transforms are walked in order while the running (h, w) of the image is
+    tracked. Each time an op that resizes or pads is met, the shape *before*
+    that op is recorded. Ops are recognised by class name only; ops with any
+    other class name are ignored.
+
+    Args:
+        ori_shape (list): Origin shape of image; only the first two entries
+            (h, w) are read.
+        transforms (list): List of transform.
+
+    Returns:
+        list: List of tuple, there are two format:
+            ('resize', (h, w)) The image shape before resize,
+            ('padding', (h, w)) The image shape before padding.
+    """
+
+    def _orient(h, w, long_edge, short_edge):
+        # Put the long edge back on the axis that was longer before the op
+        # (width wins ties) and return the new (h, w).
+        if h > w:
+            return long_edge, short_edge
+        return short_edge, long_edge
+
+    reverse_list = []
+    h, w = ori_shape[0], ori_shape[1]
+    for op in transforms:
+        op_name = op.__class__.__name__
+        if op_name == 'Resize':
+            reverse_list.append(('resize', (h, w)))
+            # NOTE(review): target_size is read as (h, w) here but as (w, h) in
+            # the Padding branch below; confirm against the transform classes.
+            h, w = op.target_size[0], op.target_size[1]
+        elif op_name == 'ResizeByLong':
+            reverse_list.append(('resize', (h, w)))
+            long_edge = max(h, w)
+            short_edge = min(h, w)
+            short_edge = int(round(short_edge * op.long_size / long_edge))
+            long_edge = op.long_size
+            h, w = _orient(h, w, long_edge, short_edge)
+        elif op_name == 'ResizeByShort':
+            reverse_list.append(('resize', (h, w)))
+            long_edge = max(h, w)
+            short_edge = min(h, w)
+            long_edge = int(round(long_edge * op.short_size / short_edge))
+            short_edge = op.short_size
+            h, w = _orient(h, w, long_edge, short_edge)
+        elif op_name == 'Padding':
+            reverse_list.append(('padding', (h, w)))
+            w, h = op.target_size[0], op.target_size[1]
+        elif op_name == 'PaddingByAspectRatio':
+            reverse_list.append(('padding', (h, w)))
+            ratio = w / h
+            if ratio == op.aspect_ratio:
+                pass
+            elif ratio > op.aspect_ratio:
+                h = int(w / op.aspect_ratio)
+            else:
+                w = int(h * op.aspect_ratio)
+        elif op_name == 'LimitLong':
+            long_edge = max(h, w)
+            short_edge = min(h, w)
+            target_long = None
+            if (op.max_long is not None) and (long_edge > op.max_long):
+                target_long = op.max_long
+            elif (op.min_long is not None) and (long_edge < op.min_long):
+                target_long = op.min_long
+            if target_long is not None:
+                reverse_list.append(('resize', (h, w)))
+                # Scale the short edge with the ORIGINAL long edge as divisor;
+                # long_edge must only be overwritten afterwards, otherwise the
+                # ratio collapses to 1 and the short edge is never rescaled.
+                short_edge = int(round(short_edge * target_long / long_edge))
+                long_edge = target_long
+            h, w = _orient(h, w, long_edge, short_edge)
+    return reverse_list
+
+
+def reverse_transform(pred, ori_shape, transforms, mode='nearest'):
+    """
+    Undo the recorded resize/padding steps so `pred` matches the origin shape.
+
+    The steps reported by `get_reverse_list` are replayed from last to first:
+    a 'resize' step interpolates `pred` to the recorded (h, w), a 'padding'
+    step crops the last two axes to the recorded (h, w).
+
+    Args:
+        pred (Tensor): Prediction to restore; indexed with four subscripts when
+            cropping.
+        ori_shape (list): Origin shape of image.
+        transforms (list): List of transform.
+        mode (str): Interpolation mode passed to F.interpolate. Default: 'nearest'.
+
+    Returns:
+        Tensor: The restored prediction.
+
+    Raises:
+        Exception: If a recorded step is neither 'resize' nor 'padding'.
+    """
+    steps = get_reverse_list(ori_shape, transforms)
+    int_dtypes = [paddle.int8, paddle.int16, paddle.int32, paddle.int64]
+    src_dtype = pred.dtype
+    for kind, shape in reversed(steps):
+        if kind == 'resize':
+            height, width = shape[0], shape[1]
+            # On CPU an integer tensor is cast to float32 for the
+            # interpolation and cast back to its own dtype afterwards.
+            cast_needed = paddle.get_device() == 'cpu' and src_dtype in int_dtypes
+            if cast_needed:
+                pred = paddle.cast(pred, 'float32')
+            pred = F.interpolate(pred, (height, width), mode=mode)
+            if cast_needed:
+                pred = paddle.cast(pred, src_dtype)
+        elif kind == 'padding':
+            height, width = shape[0], shape[1]
+            pred = pred[:, :, 0:height, 0:width]
+        else:
+            raise Exception("Unexpected info '{}' in im_info".format(kind))
+    return pred
+
+
+def flip_combination(flip_horizontal=False, flip_vertical=False):
+    """
+    Enumerate the flip settings to evaluate.
+
+    The un-flipped setting always comes first, followed by horizontal-only,
+    vertical-only and both, each only when the matching switches are on.
+
+    Args:
+        flip_horizontal (bool): Whether to flip horizontally. Default: False.
+        flip_vertical (bool): Whether to flip vertically. Default: False.
+
+    Returns:
+        list: List of (flip_horizontally, flip_vertically) tuples.
+    """
+    combos = [(False, False)]
+    if flip_horizontal:
+        combos.append((True, False))
+    if not flip_vertical:
+        return combos
+    combos.append((False, True))
+    if flip_horizontal:
+        combos.append((True, True))
+    return combos
+
+
+def tensor_flip(x, flip):
+    """
+    Flip a 4-D tensor along its last two axes.
+
+    Args:
+        x (Tensor): Input indexed with four subscripts.
+        flip (tuple): flip[0] reverses the last axis, flip[1] reverses the
+            second-to-last axis.
+
+    Returns:
+        Tensor: The flipped view of `x` (or `x` itself when no flip is set).
+    """
+    flipped = x
+    if flip[0]:
+        flipped = flipped[:, :, :, ::-1]
+    if flip[1]:
+        flipped = flipped[:, :, ::-1, :]
+    return flipped
+
+
+def slide_inference(model, im, crop_size, stride):
+    """
+    Infer by sliding window.
+
+    The image is covered by windows of `crop_size` placed every `stride`
+    pixels. Windows that would run past the border are shifted back inside the
+    image. The logits of overlapping windows are summed in NumPy and divided
+    by the number of windows covering each pixel.
+
+    Args:
+        model (paddle.nn.Layer): model to get logits of image.
+        im (Tensor): the input image.
+        crop_size (tuple|list). The size of sliding window, (w, h).
+        stride (tuple|list). The size of stride, (w, h).
+
+    Return:
+        Tensor: The logit of input image.
+
+    Raises:
+        TypeError: If the model output is not a sequence.
+        RuntimeError: If some pixel is not covered by any window.
+    """
+    h_im, w_im = im.shape[-2:]
+    w_crop, h_crop = crop_size
+    w_stride, h_stride = stride
+    # calculate the crop nums
+    # The builtin int is used because the `np.int` alias was removed in NumPy 1.24.
+    rows = int(np.ceil(1.0 * (h_im - h_crop) / h_stride)) + 1
+    cols = int(np.ceil(1.0 * (w_im - w_crop) / w_stride)) + 1
+    # prevent negative sliding rounds when imgs after scaling << crop_size
+    rows = 1 if h_im <= h_crop else rows
+    cols = 1 if w_im <= w_crop else cols
+    # TODO 'Tensor' object does not support item assignment. If support, use tensor to calculation.
+    final_logit = None
+    count = np.zeros([1, 1, h_im, w_im])
+    for r in range(rows):
+        for c in range(cols):
+            h1 = r * h_stride
+            w1 = c * w_stride
+            h2 = min(h1 + h_crop, h_im)
+            w2 = min(w1 + w_crop, w_im)
+            # Shift the window back so it keeps the full crop size at the border.
+            h1 = max(h2 - h_crop, 0)
+            w1 = max(w2 - w_crop, 0)
+            im_crop = im[:, :, h1:h2, w1:w2]
+            logits = model(im_crop)
+            if not isinstance(logits, collections.abc.Sequence):
+                raise TypeError(
+                    "The type of logits must be one of collections.abc.Sequence, e.g. list, tuple. But received {}"
+                    .format(type(logits)))
+            logit = logits[0].numpy()
+            if final_logit is None:
+                # Allocated lazily: the channel count is only known after the
+                # first forward pass.
+                final_logit = np.zeros([1, logit.shape[1], h_im, w_im])
+            final_logit[:, :, h1:h2, w1:w2] += logit[:, :, :h2 - h1, :w2 - w1]
+            count[:, :, h1:h2, w1:w2] += 1
+    if np.sum(count == 0) != 0:
+        raise RuntimeError(
+            'There are pixel not predicted. It is possible that stride is greater than crop_size'
+        )
+    final_logit = final_logit / count
+    final_logit = paddle.to_tensor(final_logit)
+    return final_logit
+
+
+def inference(model,
+              im,
+              ori_shape=None,
+              transforms=None,
+              is_slide=False,
+              stride=None,
+              crop_size=None):
+    """
+    Run the model on one image, whole-image or by sliding window.
+
+    When the model exposes data_format == 'NHWC', the input is transposed to
+    channels-last before the forward pass and the logit is transposed back
+    afterwards.
+
+    Args:
+        model (paddle.nn.Layer): model to get logits of image.
+        im (Tensor): the input image.
+        ori_shape (list): Origin shape of image.
+        transforms (list): Transforms for image.
+        is_slide (bool): Whether to infer by sliding window. Default: False.
+        crop_size (tuple|list). The size of sliding window, (w, h). It should be provided if is_slide is True.
+        stride (tuple|list). The size of stride, (w, h). It should be provided if is_slide is True.
+
+    Returns:
+        tuple|Tensor: If ori_shape is not None, a tuple (pred, logit) where
+            logit has been restored to the origin shape and pred is its int32
+            argmax over axis 1 (keepdim=True).
+            If ori_shape is None, only the logit is returned.
+
+    Raises:
+        TypeError: If the model output is not a sequence (whole-image mode).
+    """
+    channels_last = hasattr(model, 'data_format') and model.data_format == 'NHWC'
+    if channels_last:
+        im = im.transpose((0, 2, 3, 1))
+
+    if is_slide:
+        logit = slide_inference(model, im, crop_size=crop_size, stride=stride)
+    else:
+        outputs = model(im)
+        if not isinstance(outputs, collections.abc.Sequence):
+            raise TypeError(
+                "The type of logits must be one of collections.abc.Sequence, e.g. list, tuple. But received {}"
+                .format(type(outputs)))
+        # Only the first output of the model is used as the logit.
+        logit = outputs[0]
+
+    if channels_last:
+        logit = logit.transpose((0, 3, 1, 2))
+
+    if ori_shape is None:
+        return logit
+
+    logit = reverse_transform(logit, ori_shape, transforms, mode='bilinear')
+    pred = paddle.argmax(logit, axis=1, keepdim=True, dtype='int32')
+    return pred, logit
+
+
+def aug_inference(model,
+                  im,
+                  ori_shape,
+                  transforms,
+                  scales=1.0,
+                  flip_horizontal=False,
+                  flip_vertical=False,
+                  is_slide=False,
+                  stride=None,
+                  crop_size=None):
+    """
+    Infer with augmentation.
+
+    For every scale and every flip setting the image is resized and flipped,
+    run through `inference`, flipped back, resized to the input size and
+    turned into softmax probabilities. The probabilities are summed over all
+    combinations and restored to the origin shape.
+
+    Args:
+        model (paddle.nn.Layer): model to get logits of image.
+        im (Tensor): the input image.
+        ori_shape (list): Origin shape of image.
+        transforms (list): Transforms for image.
+        scales (int|float|tuple|list):  Scales for resize. Default: 1.
+        flip_horizontal (bool): Whether to flip horizontally. Default: False.
+        flip_vertical (bool): Whether to flip vertically. Default: False.
+        is_slide (bool): Whether to infer by sliding window. Default: False.
+        crop_size (tuple|list). The size of sliding window, (w, h). It should be provided if is_slide is True.
+        stride (tuple|list). The size of stride, (w, h). It should be provided if is_slide is True.
+
+    Returns:
+        tuple: (pred, final_logit). final_logit is the summed (not averaged)
+            softmax output restored to the origin shape; pred is its int32
+            argmax over axis 1 (keepdim=True).
+
+    Raises:
+        TypeError: If `scales` is not a number, tuple or list.
+    """
+    # A single scale may be given as a plain number, int included.
+    if isinstance(scales, (int, float)):
+        scales = [scales]
+    elif not isinstance(scales, (tuple, list)):
+        raise TypeError(
+            '`scales` expects float/tuple/list type, but received {}'.format(
+                type(scales)))
+    final_logit = 0
+    h_input, w_input = im.shape[-2], im.shape[-1]
+    flip_comb = flip_combination(flip_horizontal, flip_vertical)
+    for scale in scales:
+        h = int(h_input * scale + 0.5)
+        w = int(w_input * scale + 0.5)
+        # Always resample from the untouched input. Rebinding `im` here would
+        # make each later scale start from an already-resampled image.
+        im_scaled = F.interpolate(im, (h, w), mode='bilinear')
+        for flip in flip_comb:
+            im_flip = tensor_flip(im_scaled, flip)
+            logit = inference(
+                model,
+                im_flip,
+                is_slide=is_slide,
+                crop_size=crop_size,
+                stride=stride)
+            # Undo the flip, then bring the logit back to the input size so
+            # that all scales can be accumulated.
+            logit = tensor_flip(logit, flip)
+            logit = F.interpolate(logit, (h_input, w_input), mode='bilinear')
+
+            logit = F.softmax(logit, axis=1)
+            final_logit = final_logit + logit
+
+    final_logit = reverse_transform(
+        final_logit, ori_shape, transforms, mode='bilinear')
+    pred = paddle.argmax(final_logit, axis=1, keepdim=True, dtype='int32')
+
+    return pred, final_logit
--- a/paddlers/models/ppseg/core/predict.py
+++ b/paddlers/models/ppseg/core/predict.py
@@ -0,0 +1,150 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import math
+
+import cv2
+import numpy as np
+import paddle
+
+from paddlers.models.ppseg import utils
+from paddlers.models.ppseg.core import infer
+from paddlers.models.ppseg.utils import logger, progbar, visualize
+
+
+def mkdir(path):
+    """Create the parent directory of the file `path` if it is missing.
+
+    Args:
+        path (str): Path of a file that is about to be written.
+    """
+    sub_dir = os.path.dirname(path)
+    # A bare file name has no directory part, and os.makedirs('') would raise.
+    if sub_dir:
+        # exist_ok avoids the race between an existence check and the creation
+        # when several processes write into the same directory.
+        os.makedirs(sub_dir, exist_ok=True)
+
+
+def partition_list(arr, m):
+    """Split the list 'arr' into exactly m consecutive pieces.
+
+    Pieces hold ceil(len(arr) / m) items each (the last non-empty one may be
+    shorter). When that leaves fewer than m pieces, empty pieces are appended
+    so that every index in range(m) is valid.
+
+    Args:
+        arr (list): Items to distribute.
+        m (int): Number of pieces to return.
+
+    Returns:
+        list: m slices of `arr`, in order.
+    """
+    n = int(math.ceil(len(arr) / float(m)))
+    # n is 0 for an empty input; range() rejects a zero step, so skip slicing.
+    pieces = [arr[i:i + n] for i in range(0, len(arr), n)] if n > 0 else []
+    # Pad with empty slices: e.g. 5 items over 4 pieces gives only 3 chunks of 2.
+    pieces.extend(arr[:0] for _ in range(m - len(pieces)))
+    return pieces
+
+
+def predict(model,
+            model_path,
+            transforms,
+            image_list,
+            image_dir=None,
+            save_dir='output',
+            aug_pred=False,
+            scales=1.0,
+            flip_horizontal=True,
+            flip_vertical=False,
+            is_slide=False,
+            stride=None,
+            crop_size=None,
+            custom_color=None):
+    """
+    predict and visualize the image_list.
+
+    For every image handled by the current rank two files are written under
+    `save_dir`: a blended image in 'added_prediction' and a pseudo-color mask
+    (always .png) in 'pseudo_color_prediction'.
+
+    Args:
+        model (nn.Layer): Used to predict for input image.
+        model_path (str): The path of pretrained model.
+        transforms (transform.Compose): Preprocess for input image.
+        image_list (list): A list of image path to be predicted.
+        image_dir (str, optional): The root directory of the images predicted. Default: None.
+        save_dir (str, optional): The directory to save the visualized results. Default: 'output'.
+        aug_pred (bool, optional): Whether to use multi-scales and flip augment for prediction. Default: False.
+        scales (list|float, optional): Scales for augment. It is valid when `aug_pred` is True. Default: 1.0.
+        flip_horizontal (bool, optional): Whether to use flip horizontally augment. It is valid when `aug_pred` is True. Default: True.
+        flip_vertical (bool, optional): Whether to use flip vertically augment. It is valid when `aug_pred` is True. Default: False.
+        is_slide (bool, optional): Whether to predict by sliding window. Default: False.
+        stride (tuple|list, optional): The stride of sliding window, the first is width and the second is height.
+            It should be provided when `is_slide` is True.
+        crop_size (tuple|list, optional):  The crop size of sliding window, the first is width and the second is height.
+            It should be provided when `is_slide` is True.
+        custom_color (list, optional): Save images with a custom color map. Default: None, use paddleseg's default color map.
+
+    Raises:
+        IOError: If an image of `image_list` cannot be read.
+    """
+    utils.utils.load_entire_model(model, model_path)
+    model.eval()
+    nranks = paddle.distributed.get_world_size()
+    local_rank = paddle.distributed.get_rank()
+    if nranks > 1:
+        img_lists = partition_list(image_list, nranks)
+    else:
+        img_lists = [image_list]
+    # Guard against fewer partitions than ranks: such a rank has nothing to do.
+    local_images = img_lists[local_rank] if local_rank < len(img_lists) else []
+
+    added_saved_dir = os.path.join(save_dir, 'added_prediction')
+    pred_saved_dir = os.path.join(save_dir, 'pseudo_color_prediction')
+
+    logger.info("Start to predict...")
+    # The bar tracks this rank's own share, which may differ from rank 0's.
+    progbar_pred = progbar.Progbar(target=len(local_images), verbose=1)
+    color_map = visualize.get_color_map_list(256, custom_color=custom_color)
+    with paddle.no_grad():
+        for i, im_path in enumerate(local_images):
+            im = cv2.imread(im_path)
+            # cv2.imread returns None instead of raising for unreadable files.
+            if im is None:
+                raise IOError(
+                    'Image file {} cannot be read!'.format(im_path))
+            ori_shape = im.shape[:2]
+            im, _ = transforms(im)
+            # Add a leading batch axis before converting to a tensor.
+            im = im[np.newaxis, ...]
+            im = paddle.to_tensor(im)
+
+            if aug_pred:
+                pred, _ = infer.aug_inference(
+                    model,
+                    im,
+                    ori_shape=ori_shape,
+                    transforms=transforms.transforms,
+                    scales=scales,
+                    flip_horizontal=flip_horizontal,
+                    flip_vertical=flip_vertical,
+                    is_slide=is_slide,
+                    stride=stride,
+                    crop_size=crop_size)
+            else:
+                pred, _ = infer.inference(
+                    model,
+                    im,
+                    ori_shape=ori_shape,
+                    transforms=transforms.transforms,
+                    is_slide=is_slide,
+                    stride=stride,
+                    crop_size=crop_size)
+            pred = paddle.squeeze(pred)
+            pred = pred.numpy().astype('uint8')
+
+            # get the saved name
+            if image_dir is not None:
+                im_file = im_path.replace(image_dir, '')
+            else:
+                im_file = os.path.basename(im_path)
+            # Drop one leading separator so os.path.join keeps the save dir;
+            # startswith is also safe when im_file is empty.
+            if im_file.startswith(('/', '\\')):
+                im_file = im_file[1:]
+
+            # save added image
+            added_image = utils.visualize.visualize(
+                im_path, pred, color_map, weight=0.6)
+            added_image_path = os.path.join(added_saved_dir, im_file)
+            mkdir(added_image_path)
+            cv2.imwrite(added_image_path, added_image)
+
+            # save pseudo color prediction
+            pred_mask = utils.visualize.get_pseudo_color_map(pred, color_map)
+            pred_saved_path = os.path.join(
+                pred_saved_dir,
+                os.path.splitext(im_file)[0] + ".png")
+            mkdir(pred_saved_path)
+            pred_mask.save(pred_saved_path)
+
+            progbar_pred.update(i + 1)
--- a/paddlers/models/ppseg/core/train.py
+++ b/paddlers/models/ppseg/core/train.py
@@ -0,0 +1,326 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import time
+from collections import deque
+import shutil
+
+import paddle
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.utils import (TimeAverager, calculate_eta, resume, logger,
+                             worker_init_fn, train_profiler, op_flops_funs)
+from paddlers.models.ppseg.core.val import evaluate
+
+
+def check_logits_losses(logits_list, losses):
+    """Verify that there is exactly one configured loss per model output.
+
+    Args:
+        logits_list (list): Outputs of the model.
+        losses (dict): Loss config; only the length of losses['types'] is read.
+
+    Raises:
+        RuntimeError: If the two lengths differ.
+    """
+    num_logits, num_losses = len(logits_list), len(losses['types'])
+    if num_logits == num_losses:
+        return
+    raise RuntimeError(
+        'The length of logits_list should equal to the types of loss config: {} != {}.'
+        .format(num_logits, num_losses))
+
+
+def loss_computation(logits_list, labels, losses, edges=None):
+    """Compute the weighted loss terms for every model output.
+
+    The i-th output is paired with losses['types'][i] and scaled by
+    losses['coef'][i]. The loss is chosen by class name:
+      - 'BCELoss' / 'FocalLoss' with a truthy `edge_label` are fed `edges`
+        instead of `labels`;
+      - 'MixedLoss' returns several terms, each appended separately;
+      - 'KLLoss' compares the first output with the detached second output;
+      - any other loss is called with (logits, labels).
+
+    Args:
+        logits_list (list): Outputs of the model.
+        labels (Tensor): Ground-truth labels.
+        losses (dict): Dict with the keys 'types' and 'coef'.
+        edges (Tensor, optional): Edge labels. Default: None.
+
+    Returns:
+        list: The weighted loss terms, in output order.
+    """
+    check_logits_losses(logits_list, losses)
+    weighted_terms = []
+    for idx, logits in enumerate(logits_list):
+        criterion = losses['types'][idx]
+        weight = losses['coef'][idx]
+        kind = criterion.__class__.__name__
+
+        if kind in ('BCELoss', 'FocalLoss') and criterion.edge_label:
+            # If use edges as labels According to loss type.
+            weighted_terms.append(weight * criterion(logits, edges))
+        elif kind == 'MixedLoss':
+            weighted_terms.extend(
+                weight * term for term in criterion(logits, labels))
+        elif kind == 'KLLoss':
+            weighted_terms.append(
+                weight * criterion(logits_list[0], logits_list[1].detach()))
+        else:
+            weighted_terms.append(weight * criterion(logits, labels))
+    return weighted_terms
+
+
+def train(model,
+          train_dataset,
+          val_dataset=None,
+          optimizer=None,
+          save_dir='output',
+          iters=10000,
+          batch_size=2,
+          resume_model=None,
+          save_interval=1000,
+          log_iters=10,
+          num_workers=0,
+          use_vdl=False,
+          losses=None,
+          keep_checkpoint_max=5,
+          test_config=None,
+          precision='fp32',
+          profiler_options=None,
+          to_static_training=False):
+    """
+    Launch training.
+
+    Args:
+        model（nn.Layer): A sementic segmentation model.
+        train_dataset (paddle.io.Dataset): Used to read and process training datasets.
+        val_dataset (paddle.io.Dataset, optional): Used to read and process validation datasets.
+        optimizer (paddle.optimizer.Optimizer): The optimizer.
+        save_dir (str, optional): The directory for saving the model snapshot. Default: 'output'.
+        iters (int, optional): How may iters to train the model. Defualt: 10000.
+        batch_size (int, optional): Mini batch size of one gpu or cpu. Default: 2.
+        resume_model (str, optional): The path of resume model.
+        save_interval (int, optional): How many iters to save a model snapshot once during training. Default: 1000.
+        log_iters (int, optional): Display logging information at every log_iters. Default: 10.
+        num_workers (int, optional): Num workers for data loader. Default: 0.
+        use_vdl (bool, optional): Whether to record the data to VisualDL during training. Default: False.
+        losses (dict, optional): A dict including 'types' and 'coef'. The length of coef should equal to 1 or len(losses['types']).
+            The 'types' item is a list of object of paddleseg.models.losses while the 'coef' item is a list of the relevant coefficient.
+        keep_checkpoint_max (int, optional): Maximum number of checkpoints to save. Default: 5.
+        test_config(dict, optional): Evaluation config.
+        precision (str, optional): Use AMP if precision='fp16'. If precision='fp32', the training is normal.
+        profiler_options (str, optional): The option of train profiler.
+        to_static_training (bool, optional): Whether to use @to_static for training.
+    """
+    model.train()
+    nranks = paddle.distributed.ParallelEnv().nranks
+    local_rank = paddle.distributed.ParallelEnv().local_rank
+
+    start_iter = 0
+    if resume_model is not None:
+        start_iter = resume(model, optimizer, resume_model)
+
+    if not os.path.isdir(save_dir):
+        if os.path.exists(save_dir):
+            os.remove(save_dir)
+        os.makedirs(save_dir)
+
+    if nranks > 1:
+        paddle.distributed.fleet.init(is_collective=True)
+        optimizer = paddle.distributed.fleet.distributed_optimizer(
+            optimizer)  # The return is Fleet object
+        ddp_model = paddle.distributed.fleet.distributed_model(model)
+
+    batch_sampler = paddle.io.DistributedBatchSampler(
+        train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
+
+    loader = paddle.io.DataLoader(
+        train_dataset,
+        batch_sampler=batch_sampler,
+        num_workers=num_workers,
+        return_list=True,
+        worker_init_fn=worker_init_fn,
+    )
+
+    # use amp
+    if precision == 'fp16':
+        logger.info('use amp to train')
+        scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
+
+    if use_vdl:
+        from visualdl import LogWriter
+        log_writer = LogWriter(save_dir)
+
+    if to_static_training:
+        model = paddle.jit.to_static(model)
+        logger.info("Successfully to apply @to_static")
+
+    avg_loss = 0.0
+    avg_loss_list = []
+    iters_per_epoch = len(batch_sampler)
+    best_mean_iou = -1.0
+    best_model_iter = -1
+    reader_cost_averager = TimeAverager()
+    batch_cost_averager = TimeAverager()
+    save_models = deque()
+    batch_start = time.time()
+
+    iter = start_iter
+    while iter < iters:
+        for data in loader:
+            iter += 1
+            if iter > iters:
+                version = paddle.__version__
+                if version == '2.1.2':
+                    continue
+                else:
+                    break
+            reader_cost_averager.record(time.time() - batch_start)
+            images = data[0]
+            labels = data[1].astype('int64')
+            edges = None
+            if len(data) == 3:
+                edges = data[2].astype('int64')
+            if hasattr(model, 'data_format') and model.data_format == 'NHWC':
+                images = images.transpose((0, 2, 3, 1))
+
+            if precision == 'fp16':
+                with paddle.amp.auto_cast(
+                        enable=True,
+                        custom_white_list={
+                            "elementwise_add", "batch_norm", "sync_batch_norm"
+                        },
+                        custom_black_list={'bilinear_interp_v2'}):
+                    if nranks > 1:
+                        logits_list = ddp_model(images)
+                    else:
+                        logits_list = model(images)
+                    loss_list = loss_computation(
+                        logits_list=logits_list,
+                        labels=labels,
+                        losses=losses,
+                        edges=edges)
+                    loss = sum(loss_list)
+
+                scaled = scaler.scale(loss)  # scale the loss
+                scaled.backward()  # do backward
+                if isinstance(optimizer, paddle.distributed.fleet.Fleet):
+                    scaler.minimize(optimizer.user_defined_optimizer, scaled)
+                else:
+                    scaler.minimize(optimizer, scaled)  # update parameters
+            else:
+                if nranks > 1:
+                    logits_list = ddp_model(images)
+                else:
+                    logits_list = model(images)
+                loss_list = loss_computation(
+                    logits_list=logits_list,
+                    labels=labels,
+                    losses=losses,
+                    edges=edges)
+                loss = sum(loss_list)
+                loss.backward()
+                # if the optimizer is ReduceOnPlateau, the loss is the one which has been pass into step.
+                if isinstance(optimizer, paddle.optimizer.lr.ReduceOnPlateau):
+                    optimizer.step(loss)
+                else:
+                    optimizer.step()
+
+            lr = optimizer.get_lr()
+
+            # update lr
+            if isinstance(optimizer, paddle.distributed.fleet.Fleet):
+                lr_sche = optimizer.user_defined_optimizer._learning_rate
+            else:
+                lr_sche = optimizer._learning_rate
+            if isinstance(lr_sche, paddle.optimizer.lr.LRScheduler):
+                lr_sche.step()
+
+            train_profiler.add_profiler_step(profiler_options)
+
+            model.clear_gradients()
+            avg_loss += loss.numpy()[0]
+            if not avg_loss_list:
+                avg_loss_list = [l.numpy() for l in loss_list]
+            else:
+                for i in range(len(loss_list)):
+                    avg_loss_list[i] += loss_list[i].numpy()
+            batch_cost_averager.record(
+                time.time() - batch_start, num_samples=batch_size)
+
+            if (iter) % log_iters == 0 and local_rank == 0:
+                avg_loss /= log_iters
+                avg_loss_list = [l[0] / log_iters for l in avg_loss_list]
+                remain_iters = iters - iter
+                avg_train_batch_cost = batch_cost_averager.get_average()
+                avg_train_reader_cost = reader_cost_averager.get_average()
+                eta = calculate_eta(remain_iters, avg_train_batch_cost)
+                logger.info(
+                    "[TRAIN] epoch: {}, iter: {}/{}, loss: {:.4f}, lr: {:.6f}, batch_cost: {:.4f}, reader_cost: {:.5f}, ips: {:.4f} samples/sec | ETA {}"
+                    .format((iter - 1) // iters_per_epoch + 1, iter, iters,
+                            avg_loss, lr, avg_train_batch_cost,
+                            avg_train_reader_cost,
+                            batch_cost_averager.get_ips_average(), eta))
+                if use_vdl:
+                    log_writer.add_scalar('Train/loss', avg_loss, iter)
+                    # Record all losses if there are more than 2 losses.
+                    if len(avg_loss_list) > 1:
+                        avg_loss_dict = {}
+                        for i, value in enumerate(avg_loss_list):
+                            avg_loss_dict['loss_' + str(i)] = value
+                        for key, value in avg_loss_dict.items():
+                            log_tag = 'Train/' + key
+                            log_writer.add_scalar(log_tag, value, iter)
+
+                    log_writer.add_scalar('Train/lr', lr, iter)
+                    log_writer.add_scalar('Train/batch_cost',
+                                          avg_train_batch_cost, iter)
+                    log_writer.add_scalar('Train/reader_cost',
+                                          avg_train_reader_cost, iter)
+                avg_loss = 0.0
+                avg_loss_list = []
+                reader_cost_averager.reset()
+                batch_cost_averager.reset()
+
+            if (iter % save_interval == 0
+                    or iter == iters) and (val_dataset is not None):
+                num_workers = 1 if num_workers > 0 else 0
+
+                if test_config is None:
+                    test_config = {}
+
+                mean_iou, acc, _, _, _ = evaluate(
+                    model, val_dataset, num_workers=num_workers, **test_config)
+
+                model.train()
+
+            if (iter % save_interval == 0 or iter == iters) and local_rank == 0:
+                current_save_dir = os.path.join(save_dir,
+                                                "iter_{}".format(iter))
+                if not os.path.isdir(current_save_dir):
+                    os.makedirs(current_save_dir)
+                paddle.save(model.state_dict(),
+                            os.path.join(current_save_dir, 'model.pdparams'))
+                paddle.save(optimizer.state_dict(),
+                            os.path.join(current_save_dir, 'model.pdopt'))
+                save_models.append(current_save_dir)
+                if len(save_models) > keep_checkpoint_max > 0:
+                    model_to_remove = save_models.popleft()
+                    shutil.rmtree(model_to_remove)
+
+                if val_dataset is not None:
+                    if mean_iou > best_mean_iou:
+                        best_mean_iou = mean_iou
+                        best_model_iter = iter
+                        best_model_dir = os.path.join(save_dir, "best_model")
+                        paddle.save(
+                            model.state_dict(),
+                            os.path.join(best_model_dir, 'model.pdparams'))
+                    logger.info(
+                        '[EVAL] The model with the best validation mIoU ({:.4f}) was saved at iter {}.'
+                        .format(best_mean_iou, best_model_iter))
+
+                    if use_vdl:
+                        log_writer.add_scalar('Evaluate/mIoU', mean_iou, iter)
+                        log_writer.add_scalar('Evaluate/Acc', acc, iter)
+            batch_start = time.time()
+
+    # Calculate flops.
+    if local_rank == 0:
+        _, c, h, w = images.shape
+        _ = paddle.flops(
+            model, [1, c, h, w],
+            custom_ops={paddle.nn.SyncBatchNorm: op_flops_funs.count_syncbn})
+
+    # Sleep for half a second to let dataloader release resources.
+    time.sleep(0.5)
+    if use_vdl:
+        log_writer.close()
--- a/paddlers/models/ppseg/core/val.py
+++ b/paddlers/models/ppseg/core/val.py
@ -0,0 +1,199 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import numpy as np
+import time
+import paddle
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.utils import metrics, TimeAverager, calculate_eta, logger, progbar
+from paddlers.models.ppseg.core import infer
+
+np.set_printoptions(suppress=True)
+
+
+def evaluate(model,
+             eval_dataset,
+             aug_eval=False,
+             scales=1.0,
+             flip_horizontal=False,
+             flip_vertical=False,
+             is_slide=False,
+             stride=None,
+             crop_size=None,
+             num_workers=0,
+             print_detail=True,
+             auc_roc=False):
+    """
+    Launch evaluation.
+
+    The model is switched to eval mode, every sample of `eval_dataset` is run
+    through `infer.inference` (or `infer.aug_inference` when `aug_eval` is set)
+    with a batch size of 1, and the per-sample areas are accumulated to compute
+    the final metrics.
+
+    Args:
+        model (nn.Layer): A semantic segmentation model.
+        eval_dataset (paddle.io.Dataset): Used to read and process validation datasets.
+            This function reads its `transforms.transforms`, `num_classes` and
+            `ignore_index` attributes.
+        aug_eval (bool, optional): Whether to use multi-scales and flip augment for evaluation. Default: False.
+        scales (list|float, optional): Scales for augment. It is valid when `aug_eval` is True. Default: 1.0.
+        flip_horizontal (bool, optional): Whether to use flip horizontally augment. It is valid when `aug_eval` is True. Default: False.
+        flip_vertical (bool, optional): Whether to use flip vertically augment. It is valid when `aug_eval` is True. Default: False.
+        is_slide (bool, optional): Whether to evaluate by sliding window. Default: False.
+        stride (tuple|list, optional): The stride of sliding window, the first is width and the second is height.
+            It should be provided when `is_slide` is True.
+        crop_size (tuple|list, optional):  The crop size of sliding window, the first is width and the second is height.
+            It should be provided when `is_slide` is True.
+        num_workers (int, optional): Num workers for data loader. Default: 0.
+        print_detail (bool, optional): Whether to print detailed information about the evaluation process. Default: True.
+        auc_roc (bool, optional): Whether to additionally compute the AUC-ROC metric
+            (it is only logged, not returned). Default: False.
+
+    Returns:
+        tuple: `(miou, acc, class_iou, class_acc, kappa)` where
+            miou is the mIoU of the validation dataset,
+            acc is the accuracy of the validation dataset,
+            class_iou is the per-class IoU returned by `metrics.mean_iou`,
+            class_acc is the per-class accuracy returned by `metrics.accuracy`,
+            kappa is the value returned by `metrics.kappa`.
+    """
+    model.eval()
+    nranks = paddle.distributed.ParallelEnv().nranks
+    local_rank = paddle.distributed.ParallelEnv().local_rank
+    if nranks > 1:
+        # Initialize parallel environment if not done.
+        if not paddle.distributed.parallel.parallel_helper._is_parallel_ctx_initialized(
+        ):
+            paddle.distributed.init_parallel_env()
+    batch_sampler = paddle.io.DistributedBatchSampler(
+        eval_dataset, batch_size=1, shuffle=False, drop_last=False)
+    loader = paddle.io.DataLoader(
+        eval_dataset,
+        batch_sampler=batch_sampler,
+        num_workers=num_workers,
+        return_list=True,
+    )
+
+    total_iters = len(loader)
+    intersect_area_all = paddle.zeros([1], dtype='int64')
+    pred_area_all = paddle.zeros([1], dtype='int64')
+    label_area_all = paddle.zeros([1], dtype='int64')
+    logits_all = None
+    label_all = None
+
+    if print_detail:
+        logger.info(
+            "Start evaluating (total_samples: {}, total_iters: {})...".format(
+                len(eval_dataset), total_iters))
+    #TODO(chenguowei): fix log print error with multi-gpus
+    progbar_val = progbar.Progbar(
+        target=total_iters, verbose=1 if nranks < 2 else 2)
+    reader_cost_averager = TimeAverager()
+    batch_cost_averager = TimeAverager()
+    batch_start = time.time()
+    with paddle.no_grad():
+        for step, (im, label) in enumerate(loader):
+            reader_cost_averager.record(time.time() - batch_start)
+            label = label.astype('int64')
+
+            # The last two label dimensions are passed to inference as the
+            # original shape of the sample.
+            ori_shape = label.shape[-2:]
+            if aug_eval:
+                pred, logits = infer.aug_inference(
+                    model,
+                    im,
+                    ori_shape=ori_shape,
+                    transforms=eval_dataset.transforms.transforms,
+                    scales=scales,
+                    flip_horizontal=flip_horizontal,
+                    flip_vertical=flip_vertical,
+                    is_slide=is_slide,
+                    stride=stride,
+                    crop_size=crop_size)
+            else:
+                pred, logits = infer.inference(
+                    model,
+                    im,
+                    ori_shape=ori_shape,
+                    transforms=eval_dataset.transforms.transforms,
+                    is_slide=is_slide,
+                    stride=stride,
+                    crop_size=crop_size)
+
+            intersect_area, pred_area, label_area = metrics.calculate_area(
+                pred,
+                label,
+                eval_dataset.num_classes,
+                ignore_index=eval_dataset.ignore_index)
+
+            # Gather from all ranks
+            if nranks > 1:
+                intersect_area_list = []
+                pred_area_list = []
+                label_area_list = []
+                paddle.distributed.all_gather(intersect_area_list,
+                                              intersect_area)
+                paddle.distributed.all_gather(pred_area_list, pred_area)
+                paddle.distributed.all_gather(label_area_list, label_area)
+
+                # In the last iteration some ranks may hold samples that have
+                # already been evaluated; keep only the not-yet-counted ones.
+                if (step + 1) * nranks > len(eval_dataset):
+                    valid = len(eval_dataset) - step * nranks
+                    intersect_area_list = intersect_area_list[:valid]
+                    pred_area_list = pred_area_list[:valid]
+                    label_area_list = label_area_list[:valid]
+
+                for rank_intersect, rank_pred, rank_label in zip(
+                        intersect_area_list, pred_area_list, label_area_list):
+                    intersect_area_all = intersect_area_all + rank_intersect
+                    pred_area_all = pred_area_all + rank_pred
+                    label_area_all = label_area_all + rank_label
+            else:
+                intersect_area_all = intersect_area_all + intersect_area
+                pred_area_all = pred_area_all + pred_area
+                label_area_all = label_area_all + label_area
+
+                # NOTE(review): logits/labels are only collected in this
+                # single-process branch, so with nranks > 1 and auc_roc=True
+                # `logits_all`/`label_all` are still None when passed to
+                # `metrics.auc_roc` below -- confirm whether that is intended.
+                if auc_roc:
+                    logits = F.softmax(logits, axis=1)
+                    if logits_all is None:
+                        logits_all = logits.numpy()
+                        label_all = label.numpy()
+                    else:
+                        logits_all = np.concatenate(
+                            [logits_all, logits.numpy()])  # (KN, C, H, W)
+                        label_all = np.concatenate([label_all, label.numpy()])
+
+            batch_cost_averager.record(
+                time.time() - batch_start, num_samples=len(label))
+            batch_cost = batch_cost_averager.get_average()
+            reader_cost = reader_cost_averager.get_average()
+
+            if local_rank == 0 and print_detail:
+                progbar_val.update(step + 1, [('batch_cost', batch_cost),
+                                              ('reader cost', reader_cost)])
+            reader_cost_averager.reset()
+            batch_cost_averager.reset()
+            batch_start = time.time()
+
+    class_iou, miou = metrics.mean_iou(intersect_area_all, pred_area_all,
+                                       label_area_all)
+    class_acc, acc = metrics.accuracy(intersect_area_all, pred_area_all)
+    kappa = metrics.kappa(intersect_area_all, pred_area_all, label_area_all)
+    class_dice, mdice = metrics.dice(intersect_area_all, pred_area_all,
+                                     label_area_all)
+
+    # Keep the boolean flag and the computed metric in separate names so that
+    # a metric value of 0.0 is still reported in the summary line.
+    auc_infor = ''
+    if auc_roc:
+        auc_roc_value = metrics.auc_roc(
+            logits_all, label_all, num_classes=eval_dataset.num_classes)
+        auc_infor = ' Auc_roc: {:.4f}'.format(auc_roc_value)
+
+    if print_detail:
+        infor = "[EVAL] #Images: {} mIoU: {:.4f} Acc: {:.4f} Kappa: {:.4f} Dice: {:.4f}".format(
+            len(eval_dataset), miou, acc, kappa, mdice)
+        infor = infor + auc_infor
+        logger.info(infor)
+        logger.info("[EVAL] Class IoU: \n" + str(np.round(class_iou, 4)))
+        logger.info("[EVAL] Class Acc: \n" + str(np.round(class_acc, 4)))
+    return miou, acc, class_iou, class_acc, kappa
--- a/paddlers/models/ppseg/cvlibs/init.py
+++ b/paddlers/models/ppseg/cvlibs/init.py
@ -0,0 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import manager
+from . import param_init
+from .config import Config
--- a/paddlers/models/ppseg/cvlibs/callbacks.py
+++ b/paddlers/models/ppseg/cvlibs/callbacks.py
@ -0,0 +1,279 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import time
+
+import numpy as np
+import paddle
+from paddle.distributed.parallel import ParallelEnv
+from visualdl import LogWriter
+from paddlers.models.ppseg.utils.progbar import Progbar
+import paddlers.models.ppseg.utils.logger as logger
+
+
+class CallbackList(object):
+    """
+    Ordered collection of callbacks that forwards every hook to each member.
+
+    Args:
+        callbacks (list[Callback]): List of `Callback` instances. A falsy
+            value (e.g. None) yields an empty collection.
+    """
+
+    def __init__(self, callbacks=None):
+        # Always work on a private copy so the caller's list is not shared.
+        self.callbacks = list(callbacks or [])
+
+    def _broadcast(self, hook, *args):
+        """Invoke the method named `hook` on every callback, in order."""
+        for member in self.callbacks:
+            getattr(member, hook)(*args)
+
+    def append(self, callback):
+        """Add `callback` to the end of the collection."""
+        self.callbacks.append(callback)
+
+    def set_params(self, params):
+        """Forward `params` to every callback."""
+        self._broadcast("set_params", params)
+
+    def set_model(self, model):
+        """Forward `model` to every callback."""
+        self._broadcast("set_model", model)
+
+    def set_optimizer(self, optimizer):
+        """Forward `optimizer` to every callback."""
+        self._broadcast("set_optimizer", optimizer)
+
+    def on_iter_begin(self, iter, logs=None):
+        """Called right before processing a batch.
+
+        The wall-clock time after all callbacks ran is stored in
+        `_t_enter_iter`.
+        """
+        self._broadcast("on_iter_begin", iter, logs or {})
+        self._t_enter_iter = time.time()
+
+    def on_iter_end(self, iter, logs=None):
+        """Called at the end of a batch.
+
+        The wall-clock time after all callbacks ran is stored in
+        `_t_exit_iter`.
+        """
+        self._broadcast("on_iter_end", iter, logs or {})
+        self._t_exit_iter = time.time()
+
+    def on_train_begin(self, logs=None):
+        """Called at the beginning of training.
+        """
+        self._broadcast("on_train_begin", logs or {})
+
+    def on_train_end(self, logs=None):
+        """Called at the end of training.
+        """
+        self._broadcast("on_train_end", logs or {})
+
+    def __iter__(self):
+        return iter(self.callbacks)
+
+
+class Callback(object):
+    """Base class for training callbacks.
+
+    Subclasses override the `on_*` hooks they care about; every hook is a
+    no-op here. The `set_*` methods store the given object on the instance.
+    """
+
+    def __init__(self):
+        self.validation_data = None
+
+    def set_params(self, params):
+        """Store `params` as `self.params`."""
+        self.params = params
+
+    def set_model(self, model):
+        """Store `model` as `self.model`."""
+        self.model = model
+
+    def set_optimizer(self, optimizer):
+        """Store `optimizer` as `self.optimizer`."""
+        self.optimizer = optimizer
+
+    def on_iter_begin(self, iter, logs=None):
+        """Hook for the start of an iteration; does nothing by default."""
+        return None
+
+    def on_iter_end(self, iter, logs=None):
+        """Hook for the end of an iteration; does nothing by default."""
+        return None
+
+    def on_train_begin(self, logs=None):
+        """Hook for the start of training; does nothing by default."""
+        return None
+
+    def on_train_end(self, logs=None):
+        """Hook for the end of training; does nothing by default."""
+        return None
+
+
+class BaseLogger(Callback):
+    """Callback that sums logged values and averages them every `period` iterations.
+
+    Args:
+        period (int): Number of iterations between two averaging steps.
+            Default: 10.
+    """
+
+    def __init__(self, period=10):
+        super(BaseLogger, self).__init__()
+        self.period = period
+
+    def _reset(self):
+        """Drop all running sums."""
+        self.totals = {}
+
+    def on_train_begin(self, logs=None):
+        """Start training with empty running sums."""
+        self.totals = {}
+
+    def on_iter_end(self, iter, logs=None):
+        """Accumulate `logs` and, on a period boundary, write averages back.
+
+        On rank 0, when `iter` is a multiple of `period`, every entry of
+        `logs` is overwritten in place with the running sum divided by
+        `period`, and the sums are cleared.
+        """
+        logs = logs or {}
+        running = self.totals
+        for name, value in logs.items():
+            if name not in running:
+                running[name] = value
+            else:
+                running[name] += value
+
+        at_boundary = iter % self.period == 0
+        if at_boundary and ParallelEnv().local_rank == 0:
+            for name, total in self.totals.items():
+                logs[name] = total / self.period
+            self._reset()
+
+
+class TrainLogger(Callback):
+    """Callback that logs a training summary line every `log_freq` iterations.
+
+    Args:
+        log_freq (int): Number of iterations between two log lines.
+            Default: 10.
+    """
+
+    def __init__(self, log_freq=10):
+        # Initialise the base class like the other callbacks do, so that
+        # `validation_data` is defined on this instance too.
+        super(TrainLogger, self).__init__()
+        self.log_freq = log_freq
+
+    def _calculate_eta(self, remaining_iters, speed):
+        """Format the estimated remaining time as "HH:MM:SS".
+
+        Args:
+            remaining_iters (int): Iterations left; negative values are
+                treated as 0.
+            speed (float): Cost of one iteration (the caller passes
+                `logs["batch_cost"]`).
+
+        Returns:
+            str: The remaining time, each field zero-padded to two digits.
+        """
+        if remaining_iters < 0:
+            remaining_iters = 0
+        remaining_time = int(remaining_iters * speed)
+        result = "{:0>2}:{:0>2}:{:0>2}"
+        arr = []
+        # Peel off hours (60**2), minutes (60**1) and seconds (60**0) in turn.
+        for i in range(2, -1, -1):
+            arr.append(int(remaining_time / 60**i))
+            remaining_time %= 60**i
+        return result.format(*arr)
+
+    def on_iter_end(self, iter, logs=None):
+        """Log epoch, iteration, loss, lr, costs and ETA on rank 0.
+
+        Reads `total_iters` and `iters_per_epoch` from `self.params`, the
+        learning rate from `self.optimizer.get_lr()`, and `loss`,
+        `batch_cost` and `reader_cost` from `logs`.
+        """
+
+        if iter % self.log_freq == 0 and ParallelEnv().local_rank == 0:
+            total_iters = self.params["total_iters"]
+            iters_per_epoch = self.params["iters_per_epoch"]
+            remaining_iters = total_iters - iter
+            eta = self._calculate_eta(remaining_iters, logs["batch_cost"])
+            current_epoch = (iter - 1) // iters_per_epoch + 1
+            loss = logs["loss"]
+            lr = self.optimizer.get_lr()
+            batch_cost = logs["batch_cost"]
+            reader_cost = logs["reader_cost"]
+
+            logger.info(
+                "[TRAIN] epoch={}, iter={}/{}, loss={:.4f}, lr={:.6f}, batch_cost={:.4f}, reader_cost={:.4f} | ETA {}"
+                .format(current_epoch, iter, total_iters, loss, lr, batch_cost,
+                        reader_cost, eta))
+
+
+class ProgbarLogger(Callback):
+    """Callback that feeds per-iteration metrics into a `Progbar`."""
+
+    def __init__(self):
+        super(ProgbarLogger, self).__init__()
+
+    def on_train_begin(self, logs=None):
+        """Create the progress bar from `params["verbose"]` and `params["total_iters"]`."""
+        settings = self.params
+        self.verbose = settings["verbose"]
+        total = settings["total_iters"]
+        self.total_iters = total
+        self.target = total
+        self.progbar = Progbar(target=total, verbose=self.verbose)
+        self.seen = 0
+        self.log_values = []
+
+    def on_iter_begin(self, iter, logs=None):
+        """Clear the collected values while the target has not been reached."""
+        if not self.seen < self.target:
+            return
+        self.log_values = []
+
+    def on_iter_end(self, iter, logs=None):
+        """Collect the metrics named in `params['metrics']` and update the bar.
+
+        The bar is only updated while fewer than `target` iterations have
+        been seen.
+        """
+        logs = logs or {}
+        self.seen += 1
+        self.log_values.extend(
+            (name, logs[name]) for name in self.params['metrics']
+            if name in logs)
+
+        if self.seen < self.target:
+            self.progbar.update(self.seen, self.log_values)
+
+
+class ModelCheckpoint(Callback):
+    """Callback that saves the model state every `period` iterations.
+
+    Args:
+        save_dir (str): Root directory; each checkpoint goes to
+            `<save_dir>/iter_<iter>/`.
+        monitor (str): Name of the monitored quantity. Stored only; not used
+            by the code in this class. Default: "miou".
+        save_best_only (bool): Stored only; not used by the code in this
+            class. Default: False.
+        save_params_only (bool): When False, the optimizer state is saved
+            next to the model parameters. Default: True.
+        mode (str): "min" or "max"; selects the comparison stored in
+            `monitor_op` and the initial value of `best`. Default: "max".
+        period (int): Number of iterations between two checkpoints.
+            Default: 1.
+
+    Raises:
+        RuntimeError: If `mode` is neither "min" nor "max".
+    """
+
+    def __init__(self,
+                 save_dir,
+                 monitor="miou",
+                 save_best_only=False,
+                 save_params_only=True,
+                 mode="max",
+                 period=1):
+
+        super(ModelCheckpoint, self).__init__()
+        self.monitor = monitor
+        self.save_dir = save_dir
+        self.save_best_only = save_best_only
+        self.save_params_only = save_params_only
+        self.period = period
+        self.iters_since_last_save = 0
+
+        # `np.inf` is used instead of the `np.Inf` alias, which was removed
+        # in NumPy 2.0.
+        if mode == "min":
+            self.monitor_op = np.less
+            self.best = np.inf
+        elif mode == "max":
+            self.monitor_op = np.greater
+            self.best = -np.inf
+        else:
+            raise RuntimeError("`mode` is neither \"min\" nor \"max\"!")
+
+    def on_train_begin(self, logs=None):
+        """Read `params["verbose"]` and make sure `save_dir` is a directory.
+
+        If a non-directory entry already exists at `save_dir`, it is removed
+        before the directory is created.
+        """
+        self.verbose = self.params["verbose"]
+        save_dir = self.save_dir
+        if not os.path.isdir(save_dir):
+            if os.path.exists(save_dir):
+                os.remove(save_dir)
+            os.makedirs(save_dir)
+
+    def on_iter_end(self, iter, logs=None):
+        """Save a checkpoint on rank 0 when `iter` is a multiple of `period`.
+
+        The model state goes to `model.pdparams`; the optimizer state goes to
+        `model.pdopt` unless `save_params_only` is set.
+        """
+        logs = logs or {}
+        self.iters_since_last_save += 1
+        current_save_dir = os.path.join(self.save_dir, "iter_{}".format(iter))
+        current_save_dir = os.path.abspath(current_save_dir)
+        if iter % self.period == 0 and ParallelEnv().local_rank == 0:
+            if self.verbose > 0:
+                print("iter {iter_num}: saving model to {path}".format(
+                    iter_num=iter, path=current_save_dir))
+
+            paddle.save(self.model.state_dict(),
+                        os.path.join(current_save_dir, 'model.pdparams'))
+
+            if not self.save_params_only:
+                paddle.save(self.optimizer.state_dict(),
+                            os.path.join(current_save_dir, 'model.pdopt'))
+
+
+class VisualDL(Callback):
+    """Callback that writes logged scalars to a VisualDL `LogWriter`.
+
+    Args:
+        log_dir (str): Directory handed to `LogWriter`. Default: "./log".
+        freq (int): Number of iterations between two writes. Default: 1.
+    """
+
+    def __init__(self, log_dir="./log", freq=1):
+        super(VisualDL, self).__init__()
+        self.log_dir = log_dir
+        self.freq = freq
+
+    def on_train_begin(self, logs=None):
+        """Open the log writer."""
+        self.writer = LogWriter(self.log_dir)
+
+    def on_iter_end(self, iter, logs=None):
+        """Write every entry of `logs` under "Train/<key>" and flush.
+
+        Scalars are only written on rank 0 when `iter` is a multiple of
+        `freq`; the writer is flushed on every call.
+        """
+        logs = logs or {}
+        due = iter % self.freq == 0
+        if due and ParallelEnv().local_rank == 0:
+            for name, value in logs.items():
+                tag = "Train/{}".format(name)
+                self.writer.add_scalar(tag, value, iter)
+
+        self.writer.flush()
+
+    def on_train_end(self, logs=None):
+        """Close the log writer."""
+        self.writer.close()
--- a/paddlers/models/ppseg/cvlibs/config.py
+++ b/paddlers/models/ppseg/cvlibs/config.py
@ -0,0 +1,404 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import codecs
+import os
+from typing import Any, Dict, Generic
+
+import paddle
+import yaml
+
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.utils import logger
+
+
+class Config(object):
+    '''
+    Training configuration parsing. Only yaml/yml files are supported.
+
+    The following hyper-parameters are available in the config file:
+        batch_size: The number of samples per gpu.
+        iters: The total training steps.
+        train_dataset: A training data config including type/data_root/transforms/mode.
+            For data type, please refer to paddleseg.datasets.
+            For specific transforms, please refer to paddleseg.transforms.transforms.
+        val_dataset: A validation data config including type/data_root/transforms/mode.
+        optimizer: An optimizer config. The types `sgd` (momentum) and `adam` are handled explicitly; any other
+            type listed in `paddle.optimizer.__all__` is also accepted. In addition, weight_decay could be set as a regularization.
+        learning_rate: A learning rate config. If decay is configured, the learning_rate value is the starting learning rate,
+             where poly, piecewise and stepdecay decay are supported using the config file. In addition, decay power and end_lr are tuned experimentally.
+        loss: A loss config. Multi-loss config is available. The loss type order is consistent with the seg model outputs,
+            where the coef term indicates the weight of corresponding loss. Note that the number of coef must be the same as the number of
+            model outputs, and there could be only one loss type if using the same loss type among the outputs, otherwise the number of
+            loss type must be consistent with coef.
+        model: A model config including type/backbone and model-dependent arguments.
+            For model type, please refer to paddleseg.models.
+            For backbone, please refer to paddleseg.models.backbones.
+
+    Args:
+        path (str) : The path of config file, supports yaml format only.
+
+    Examples:
+
+        from paddlers.models.ppseg.cvlibs.config import Config
+
+        # Create a cfg object with yaml file path.
+        cfg = Config(yaml_cfg_path)
+
+        # Parsing the argument when its property is used.
+        train_dataset = cfg.train_dataset
+
+        # the argument of model should be parsed after dataset,
+        # since the model builder uses some properties in dataset.
+        model = cfg.model
+        ...
+    '''
+
+    def __init__(self,
+                 path: str,
+                 learning_rate: float = None,
+                 batch_size: int = None,
+                 iters: int = None):
+        """Load the yaml config at `path` and apply the optional overrides.
+
+        Args:
+            path (str): Path of the config file; must end with `yml` or `yaml`.
+            learning_rate (float, optional): Override forwarded to `update`.
+            batch_size (int, optional): Override forwarded to `update`.
+            iters (int, optional): Override forwarded to `update`.
+
+        Raises:
+            ValueError: If `path` is empty.
+            FileNotFoundError: If `path` does not exist.
+            RuntimeError: If `path` does not have a yaml suffix.
+        """
+        if not path:
+            raise ValueError('Please specify the configuration file path.')
+
+        if not os.path.exists(path):
+            raise FileNotFoundError('File {} does not exist'.format(path))
+
+        # Lazily built objects, filled in by the `model` and `loss` properties.
+        self._model = None
+        self._losses = None
+        if not path.endswith(('yml', 'yaml')):
+            raise RuntimeError('Config file should in yaml format!')
+        self.dic = self._parse_from_yaml(path)
+
+        self.update(
+            learning_rate=learning_rate, batch_size=batch_size, iters=iters)
+
+    def _update_dic(self, dic, base_dic):
+        """
+        Merge `dic` on top of `base_dic` and return the merged copy.
+
+        Neither input is modified at the top level (both are shallow-copied).
+        If `dic` carries `_inherited_: False`, the base is ignored and `dic`
+        (without that key) is returned. Otherwise nested dicts are merged
+        recursively and every other value in `dic` overrides the base value.
+
+        Args:
+            dic (dict): The overriding config.
+            base_dic (dict): The config being extended.
+
+        Returns:
+            dict: The merged config.
+        """
+        base_dic = base_dic.copy()
+        dic = dic.copy()
+
+        if dic.get('_inherited_', True) == False:
+            dic.pop('_inherited_')
+            return dic
+
+        for key, val in dic.items():
+            # Only recurse when both sides are dicts; a base value that is
+            # None, a scalar or a list cannot be merged into and is replaced.
+            if isinstance(val, dict) and isinstance(base_dic.get(key), dict):
+                base_dic[key] = self._update_dic(val, base_dic[key])
+            else:
+                base_dic[key] = val
+        return base_dic
+
+    def _parse_from_yaml(self, path: str):
+        '''Parse the yaml file at `path`, resolving its `_base_` chain.
+
+        When the parsed mapping has a `_base_` key, that value is taken as a
+        path relative to the directory of `path`, parsed the same way, and the
+        current mapping is merged over it with `_update_dic`.
+        '''
+        with codecs.open(path, 'r', 'utf-8') as stream:
+            parsed = yaml.load(stream, Loader=yaml.FullLoader)
+
+        if '_base_' not in parsed:
+            return parsed
+
+        base_path = os.path.join(os.path.dirname(path), parsed.pop('_base_'))
+        return self._update_dic(parsed, self._parse_from_yaml(base_path))
+
+    def update(self,
+               learning_rate: float = None,
+               batch_size: int = None,
+               iters: int = None):
+        '''Overwrite config entries with the given truthy values.
+
+        `learning_rate` goes to `lr_scheduler.learning_rate` when an
+        `lr_scheduler` section exists, otherwise to `learning_rate.value`.
+        Falsy arguments (None, 0) leave the config untouched.
+        '''
+        if learning_rate:
+            if 'lr_scheduler' in self.dic:
+                section, field = self.dic['lr_scheduler'], 'learning_rate'
+            else:
+                section, field = self.dic['learning_rate'], 'value'
+            section[field] = learning_rate
+
+        for name, value in (('batch_size', batch_size), ('iters', iters)):
+            if value:
+                self.dic[name] = value
+
+    @property
+    def batch_size(self) -> int:
+        """The configured `batch_size`, or 1 when the key is absent."""
+        config = self.dic
+        return config.get('batch_size', 1)
+
+    @property
+    def iters(self) -> int:
+        """The configured `iters`.
+
+        Raises:
+            RuntimeError: If `iters` is missing or falsy.
+        """
+        total_iters = self.dic.get('iters')
+        if total_iters:
+            return total_iters
+        raise RuntimeError('No iters specified in the configuration file.')
+
+    @property
+    def lr_scheduler(self) -> paddle.optimizer.lr.LRScheduler:
+        """Build the scheduler described by the `lr_scheduler` section.
+
+        The section's `type` names a class in `paddle.optimizer.lr`; the
+        remaining keys are passed to it as keyword arguments. For
+        `PolynomialDecay`, `decay_steps` (defaults to `self.iters`), `end_lr`
+        (0) and `power` (0.9) are filled in when absent.
+
+        Raises:
+            RuntimeError: If the config has no `lr_scheduler` section.
+        """
+        if 'lr_scheduler' not in self.dic:
+            raise RuntimeError(
+                'No `lr_scheduler` specified in the configuration file.')
+        # Work on a copy: popping `type` from the stored section would make
+        # every later access of this property fail with KeyError.
+        params = self.dic.get('lr_scheduler').copy()
+
+        lr_type = params.pop('type')
+        if lr_type == 'PolynomialDecay':
+            params.setdefault('decay_steps', self.iters)
+            params.setdefault('end_lr', 0)
+            params.setdefault('power', 0.9)
+
+        return getattr(paddle.optimizer.lr, lr_type)(**params)
+
+    @property
+    def learning_rate(self) -> paddle.optimizer.lr.LRScheduler:
+        """Build the learning rate from the legacy `learning_rate` section.
+
+        A deprecation warning is logged on every access. If the section is a
+        plain float it is returned unchanged; otherwise `value` is combined
+        with `decay_args` into a poly, piecewise or stepdecay scheduler.
+
+        Raises:
+            RuntimeError: If no value is configured or the decay type is unknown.
+        """
+        logger.warning(
+            '''`learning_rate` in configuration file will be deprecated, please use `lr_scheduler` instead. E.g
+            lr_scheduler:
+                type: PolynomialDecay
+                learning_rate: 0.01''')
+
+        lr_config = self.dic.get('learning_rate', {})
+        if isinstance(lr_config, float):
+            return lr_config
+
+        _learning_rate = lr_config.get('value')
+        if not _learning_rate:
+            raise RuntimeError(
+                'No learning rate specified in the configuration file.')
+
+        args = self.decay_args
+        decay_type = args.pop('type')
+
+        if decay_type == 'poly':
+            return paddle.optimizer.lr.PolynomialDecay(_learning_rate, **args)
+        if decay_type == 'piecewise':
+            return paddle.optimizer.lr.PiecewiseDecay(
+                values=_learning_rate, **args)
+        if decay_type == 'stepdecay':
+            return paddle.optimizer.lr.StepDecay(_learning_rate, **args)
+        # The message lists every decay type handled above.
+        raise RuntimeError(
+            'Only poly, piecewise and stepdecay decay are supported, but got {}.'
+            .format(decay_type))
+
+    @property
+    def optimizer(self) -> paddle.optimizer.Optimizer:
+        """Build the optimizer for `self.model`.
+
+        The learning rate comes from `lr_scheduler` when that section exists,
+        otherwise from the legacy `learning_rate`. `sgd` maps to `Momentum`,
+        `adam` to `Adam`; any other type must be listed in
+        `paddle.optimizer.__all__`.
+
+        Raises:
+            RuntimeError: If the optimizer type is unknown.
+        """
+        lr = (self.lr_scheduler
+              if 'lr_scheduler' in self.dic else self.learning_rate)
+        args = self.optimizer_args
+        optimizer_type = args.pop('type')
+
+        if optimizer_type == 'sgd':
+            optimizer_cls = paddle.optimizer.Momentum
+        elif optimizer_type == 'adam':
+            optimizer_cls = paddle.optimizer.Adam
+        elif optimizer_type in paddle.optimizer.__all__:
+            optimizer_cls = getattr(paddle.optimizer, optimizer_type)
+        else:
+            raise RuntimeError(
+                'Unknown optimizer type {}.'.format(optimizer_type))
+
+        return optimizer_cls(lr, parameters=self.model.parameters(), **args)
+
+    @property
+    def optimizer_args(self) -> dict:
+        """A copy of the `optimizer` section; `sgd` gets `momentum` 0.9 by default."""
+        optimizer_cfg = self.dic.get('optimizer', {})
+        args = optimizer_cfg.copy()
+        uses_sgd = args['type'] == 'sgd'
+        if uses_sgd:
+            args.setdefault('momentum', 0.9)
+        return args
+
+    @property
+    def decay_args(self) -> dict:
+        """A copy of `learning_rate.decay` (poly with power 0.9 when absent).
+
+        For the poly type, `decay_steps` defaults to `self.iters` and
+        `end_lr` to 0.
+        """
+        fallback = {'type': 'poly', 'power': 0.9}
+        lr_section = self.dic.get('learning_rate', {})
+        args = lr_section.get('decay', fallback).copy()
+
+        if args['type'] == 'poly':
+            args.setdefault('decay_steps', self.iters)
+            args.setdefault('end_lr', 0)
+
+        return args
+
+    @property
+    def loss(self) -> dict:
+        """The parsed `loss` section, built on first access and then cached."""
+        cached = self._losses
+        if cached is None:
+            cached = self._losses = self._prepare_loss('loss')
+        return cached
+
+    @property
+    def distill_loss(self) -> dict:
+        """The parsed `distill_loss` section, built on first access and then cached."""
+        if hasattr(self, '_distill_losses'):
+            return self._distill_losses
+        prepared = self._prepare_loss('distill_loss')
+        self._distill_losses = prepared
+        return self._distill_losses
+
+    def _prepare_loss(self, loss_name):
+        """
+        Parse the loss parameters and load the loss layers.
+
+        A single entry in `types` is repeated to match the length of `coef`.
+        Every loss whose type is not `MixedLoss` receives the `ignore_index`
+        of the train dataset; an explicitly configured `ignore_index` must
+        equal it.
+
+        Args:
+            loss_name (str): The root name of loss in the yaml file.
+        Returns:
+            dict: A dict including the loss parameters and layers.
+        Raises:
+            ValueError: If `types`/`coef` are missing or their lengths disagree.
+        """
+        args = self.dic.get(loss_name, {}).copy()
+        if 'types' not in args or 'coef' not in args:
+            raise ValueError(
+                'Loss config should contain keys of "types" and "coef"')
+
+        len_types = len(args['types'])
+        len_coef = len(args['coef'])
+        if len_types != len_coef:
+            if len_types != 1:
+                raise ValueError(
+                    'The length of types should equal to coef or equal to 1 in loss config, but they are {} and {}.'
+                    .format(len_types, len_coef))
+            args['types'] = args['types'] * len_coef
+
+        # `self.train_dataset` builds a new dataset object on each access, so
+        # its ignore_index is looked up at most once, and only when needed.
+        dataset_ignore_index = None
+        ignore_index_known = False
+
+        losses = dict()
+        for key, val in args.items():
+            if key != 'types':
+                losses[key] = val
+                continue
+
+            losses['types'] = []
+            for item in val:
+                if item['type'] != 'MixedLoss':
+                    if not ignore_index_known:
+                        dataset_ignore_index = self.train_dataset.ignore_index
+                        ignore_index_known = True
+                    if 'ignore_index' in item:
+                        assert item['ignore_index'] == dataset_ignore_index, 'If ignore_index of loss is set, '\
+                        'the ignore_index of loss and train_dataset must be the same. \nCurrently, loss ignore_index = {}, '\
+                        'train_dataset ignore_index = {}. \nIt is recommended not to set loss ignore_index, so it is consistent with '\
+                        'train_dataset by default.'.format(item['ignore_index'], dataset_ignore_index)
+                    # Copy before injecting so the dict stored in `self.dic`
+                    # (possibly shared by several repeated entries) stays as written.
+                    item = dict(item)
+                    item['ignore_index'] = dataset_ignore_index
+                losses['types'].append(self._load_object(item))
+
+        if len(losses['coef']) != len(losses['types']):
+            raise RuntimeError(
+                'The length of coef should equal to types in loss config: {} != {}.'
+                .format(len(losses['coef']), len(losses['types'])))
+        return losses
+
+    @property
+    def model(self) -> paddle.nn.Layer:
+        """The model described by the `model` section, built once and cached.
+
+        When `num_classes` is not configured it is inferred from the train
+        dataset (class attribute `NUM_CLASSES`, else instance attribute
+        `num_classes`), or from the val dataset when no train dataset is
+        configured.
+
+        Raises:
+            RuntimeError: If the config has no (or an empty) `model` section.
+        """
+        # A cached model makes all of the inference below unnecessary.
+        if self._model:
+            return self._model
+
+        # `or {}` lets a missing/null section reach the RuntimeError below
+        # instead of failing on `None.copy()`.
+        model_cfg = (self.dic.get('model') or {}).copy()
+        if not model_cfg:
+            raise RuntimeError('No model specified in the configuration file.')
+
+        if 'num_classes' not in model_cfg:
+            num_classes = None
+            if self.train_dataset_config:
+                if hasattr(self.train_dataset_class, 'NUM_CLASSES'):
+                    num_classes = self.train_dataset_class.NUM_CLASSES
+                elif hasattr(self.train_dataset, 'num_classes'):
+                    num_classes = self.train_dataset.num_classes
+            elif self.val_dataset_config:
+                if hasattr(self.val_dataset_class, 'NUM_CLASSES'):
+                    num_classes = self.val_dataset_class.NUM_CLASSES
+                elif hasattr(self.val_dataset, 'num_classes'):
+                    num_classes = self.val_dataset.num_classes
+
+            if num_classes is not None:
+                model_cfg['num_classes'] = num_classes
+
+        self._model = self._load_object(model_cfg)
+        return self._model
+
+    @property
+    def train_dataset_config(self) -> Dict:
+        """A shallow copy of the `train_dataset` section ({} when absent)."""
+        section = self.dic.get('train_dataset', {})
+        return section.copy()
+
+    @property
+    def val_dataset_config(self) -> Dict:
+        """A shallow copy of the `val_dataset` section ({} when absent)."""
+        section = self.dic.get('val_dataset', {})
+        return section.copy()
+
+    @property
+    def train_dataset_class(self) -> Generic:
+        """The registered component named by `train_dataset.type` (not instantiated)."""
+        return self._load_component(self.train_dataset_config['type'])
+
+    @property
+    def val_dataset_class(self) -> Generic:
+        """The registered component named by `val_dataset.type` (not instantiated)."""
+        return self._load_component(self.val_dataset_config['type'])
+
+    @property
+    def train_dataset(self) -> paddle.io.Dataset:
+        """A newly built train dataset, or None when no `train_dataset` is configured.
+
+        Note that every access constructs a new object.
+        """
+        dataset_cfg = self.train_dataset_config
+        return self._load_object(dataset_cfg) if dataset_cfg else None
+
+    @property
+    def val_dataset(self) -> paddle.io.Dataset:
+        """A newly built val dataset, or None when no `val_dataset` is configured.
+
+        Note that every access constructs a new object.
+        """
+        dataset_cfg = self.val_dataset_config
+        return self._load_object(dataset_cfg) if dataset_cfg else None
+
+    def _load_component(self, com_name: str) -> Any:
+        """Look `com_name` up in the model, backbone, dataset, transform and loss registries.
+
+        The registries are searched in that order and the first match wins.
+
+        Raises:
+            RuntimeError: If no registry contains `com_name`.
+        """
+        registries = (manager.MODELS, manager.BACKBONES, manager.DATASETS,
+                      manager.TRANSFORMS, manager.LOSSES)
+
+        for registry in registries:
+            if com_name in registry.components_dict:
+                return registry[com_name]
+
+        raise RuntimeError(
+            'The specified component was not found {}.'.format(com_name))
+
+    def _load_object(self, cfg: dict) -> Any:
+        """Instantiate the component described by `cfg`.
+
+        `cfg['type']` selects the component; the remaining entries become
+        keyword arguments. Values that are themselves dicts with a `type` key,
+        directly or as list elements, are instantiated recursively first.
+
+        Raises:
+            RuntimeError: If `cfg` has no `type` key.
+        """
+        cfg = cfg.copy()
+        if 'type' not in cfg:
+            raise RuntimeError('No object information in {}.'.format(cfg))
+
+        component = self._load_component(cfg.pop('type'))
+
+        def build(value):
+            # Nested component descriptions are built, everything else passes through.
+            return self._load_object(value) if self._is_meta_type(
+                value) else value
+
+        params = {}
+        for key, val in cfg.items():
+            if isinstance(val, list):
+                params[key] = [build(item) for item in val]
+            else:
+                params[key] = build(val)
+
+        return component(**params)
+
+    @property
+    def test_config(self) -> Dict:
+        """The `test_config` section itself (not a copy), or {} when absent."""
+        config = self.dic
+        return config.get('test_config', {})
+
+    @property
+    def export_config(self) -> Dict:
+        """The `export` section itself (not a copy), or {} when absent."""
+        config = self.dic
+        return config.get('export', {})
+
+    @property
+    def to_static_training(self) -> bool:
+        '''The `to_static_training` config flag (whether to use @to_static for training); False when absent.'''
+        config = self.dic
+        return config.get('to_static_training', False)
+
+    def _is_meta_type(self, item: Any) -> bool:
+        """Tell whether `item` is a dict carrying a `type` key."""
+        if not isinstance(item, dict):
+            return False
+        return 'type' in item
+
+    def __str__(self) -> str:
+        """Render the current config dict as yaml text."""
+        rendered = yaml.dump(self.dic)
+        return rendered
--- a/paddlers/models/ppseg/cvlibs/manager.py
+++ b/paddlers/models/ppseg/cvlibs/manager.py
@ -0,0 +1,149 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from collections.abc import Sequence
+
+import warnings
+
+
+class ComponentManager:
+    """
+    Implement a manager class to add the new component properly.
+    The component can be added as either class or function type, and is
+    stored under its `__name__`.
+
+    Args:
+        name (str): The name of component.
+
+    Returns:
+        A callable object of ComponentManager.
+
+    Examples 1:
+
+        from paddlers.models.ppseg.cvlibs.manager import ComponentManager
+
+        model_manager = ComponentManager()
+
+        class AlexNet: ...
+        class ResNet: ...
+
+        model_manager.add_component(AlexNet)
+        model_manager.add_component(ResNet)
+
+        # Or pass a sequence alternatively:
+        model_manager.add_component([AlexNet, ResNet])
+        print(model_manager.components_dict)
+        # {'AlexNet': <class '__main__.AlexNet'>, 'ResNet': <class '__main__.ResNet'>}
+
+    Examples 2:
+
+        # Or an easier way, using it as a Python decorator, while just add it above the class declaration.
+        from paddlers.models.ppseg.cvlibs.manager import ComponentManager
+
+        model_manager = ComponentManager()
+
+        @model_manager.add_component
+        class AlexNet: ...
+
+        @model_manager.add_component
+        class ResNet: ...
+
+        print(model_manager.components_dict)
+        # {'AlexNet': <class '__main__.AlexNet'>, 'ResNet': <class '__main__.ResNet'>}
+    """
+
+    def __init__(self, name=None):
+        self._components_dict = dict()
+        self._name = name
+
+    def __len__(self):
+        return len(self._components_dict)
+
+    def __repr__(self):
+        # Falls back to the class name when the manager was created unnamed.
+        name_str = self._name if self._name else self.__class__.__name__
+        return "{}:{}".format(name_str, list(self._components_dict.keys()))
+
+    def __getitem__(self, item):
+        """Return the component registered as `item`.
+
+        Raises:
+            KeyError: If nothing is registered under `item`.
+        """
+        if item not in self._components_dict:
+            raise KeyError("{} does not exist in available {}".format(
+                item, self))
+        return self._components_dict[item]
+
+    @property
+    def components_dict(self):
+        """The underlying name -> component dict (not a copy)."""
+        return self._components_dict
+
+    @property
+    def name(self):
+        """The name given at construction, possibly None."""
+        return self._name
+
+    def _add_single_component(self, component):
+        """
+        Add a single component into the corresponding manager.
+
+        A component whose name is already registered replaces the previous
+        entry, and a warning is issued.
+
+        Args:
+            component (function|class): A new component.
+
+        Raises:
+            TypeError: When `component` is neither class nor function.
+        """
+
+        # Currently only support class or function type
+        if not (inspect.isclass(component) or inspect.isfunction(component)):
+            raise TypeError(
+                "Expect class/function type, but received {}".format(
+                    type(component)))
+
+        # Take the internal name of the component as its key
+        component_name = component.__name__
+
+        # Re-registration is allowed but reported
+        if component_name in self._components_dict:
+            warnings.warn(
+                "{} exists already! It is now updated to {} !!!".format(
+                    component_name, component))
+
+        self._components_dict[component_name] = component
+
+    def add_component(self, components):
+        """
+        Add component(s) into the corresponding manager.
+
+        Returning the input unchanged is what makes this method usable as a
+        decorator.
+
+        Args:
+            components (function|class|list|tuple): Support four types of components.
+
+        Returns:
+            components (function|class|list|tuple): Same with input components.
+        """
+
+        # Check whether the type is a sequence
+        if isinstance(components, Sequence):
+            for component in components:
+                self._add_single_component(component)
+        else:
+            self._add_single_component(components)
+
+        return components
+
+
+# Module-level registries, one per component family. Components are added
+# to them through `add_component`, typically used as a decorator.
+MODELS = ComponentManager(name="models")
+BACKBONES = ComponentManager(name="backbones")
+DATASETS = ComponentManager(name="datasets")
+TRANSFORMS = ComponentManager(name="transforms")
+LOSSES = ComponentManager(name="losses")
--- a/paddlers/models/ppseg/cvlibs/param_init.py
+++ b/paddlers/models/ppseg/cvlibs/param_init.py
@ -0,0 +1,120 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.nn as nn
+
+
+def constant_init(param, **kwargs):
+    """
+    Fill `param` using `nn.initializer.Constant`.
+
+    Args:
+        param (Tensor): Tensor that needs to be initialized.
+        **kwargs: Forwarded unchanged to `nn.initializer.Constant`.
+
+    Examples:
+
+        from paddlers.models.ppseg.cvlibs import param_init
+        import paddle.nn as nn
+
+        linear = nn.Linear(2, 4)
+        param_init.constant_init(linear.weight, value=2.0)
+        print(linear.weight.numpy())
+        # result is [[2. 2. 2. 2.], [2. 2. 2. 2.]]
+
+    """
+    apply_constant = nn.initializer.Constant(**kwargs)
+    apply_constant(param, param.block)
+
+
+def normal_init(param, **kwargs):
+    """
+    Fill `param` using `nn.initializer.Normal`.
+
+    Args:
+        param (Tensor): Tensor that needs to be initialized.
+        **kwargs: Forwarded unchanged to `nn.initializer.Normal`.
+
+    Examples:
+
+        from paddlers.models.ppseg.cvlibs import param_init
+        import paddle.nn as nn
+
+        linear = nn.Linear(2, 4)
+        param_init.normal_init(linear.weight, loc=0.0, scale=1.0)
+
+    """
+    apply_normal = nn.initializer.Normal(**kwargs)
+    apply_normal(param, param.block)
+
+
+def kaiming_normal_init(param, **kwargs):
+    r"""
+    Fill `param` using `nn.initializer.KaimingNormal`.
+
+    Kaiming initialization comes from the paper
+    `Delving Deep into Rectifiers: Surpassing Human-Level Performance on
+    ImageNet Classification <https://arxiv.org/abs/1502.01852>`
+    by Kaiming He, Xiangyu Zhang, Shaoqing Ren and Jian Sun, a robust
+    initialization method that particularly considers the rectifier
+    nonlinearities. In its Normal variant the mean is 0 and the standard
+    deviation is
+    .. math::
+        \sqrt{\frac{2.0}{fan\_in}}
+
+    Args:
+        param (Tensor): Tensor that needs to be initialized.
+        **kwargs: Forwarded unchanged to `nn.initializer.KaimingNormal`.
+
+    Examples:
+
+        from paddlers.models.ppseg.cvlibs import param_init
+        import paddle.nn as nn
+
+        linear = nn.Linear(2, 4)
+        param_init.kaiming_normal_init(linear.weight)
+
+    """
+    apply_kaiming_normal = nn.initializer.KaimingNormal(**kwargs)
+    apply_kaiming_normal(param, param.block)
+
+
+def kaiming_uniform(param, **kwargs):
+    r"""Fill `param` using `nn.initializer.KaimingUniform`.
+
+    Kaiming initialization comes from the paper
+    `Delving Deep into Rectifiers: Surpassing Human-Level Performance on
+    ImageNet Classification <https://arxiv.org/abs/1502.01852>`_
+    by Kaiming He, Xiangyu Zhang, Shaoqing Ren and Jian Sun, a robust
+    initialization method that particularly considers the rectifier
+    nonlinearities.
+
+    In its Uniform variant the range is [-x, x], where
+    .. math::
+        x = \sqrt{\frac{6.0}{fan\_in}}
+
+    Args:
+        param (Tensor): Tensor that needs to be initialized.
+        **kwargs: Forwarded unchanged to `nn.initializer.KaimingUniform`.
+
+    Examples:
+
+        from paddlers.models.ppseg.cvlibs import param_init
+        import paddle.nn as nn
+
+        linear = nn.Linear(2, 4)
+        param_init.kaiming_uniform(linear.weight)
+    """
+    apply_kaiming_uniform = nn.initializer.KaimingUniform(**kwargs)
+    apply_kaiming_uniform(param, param.block)
--- a/paddlers/models/ppseg/datasets/init.py
+++ b/paddlers/models/ppseg/datasets/init.py
@ -0,0 +1,29 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .dataset import Dataset
+from .cityscapes import Cityscapes
+from .voc import PascalVOC
+from .ade import ADE20K
+from .optic_disc_seg import OpticDiscSeg
+from .pascal_context import PascalContext
+from .mini_deep_globe_road_extraction import MiniDeepGlobeRoadExtraction
+from .eg1800 import EG1800
+from .supervisely import SUPERVISELY
+from .cocostuff import CocoStuff
+from .stare import STARE
+from .drive import DRIVE
+from .hrf import HRF
+from .chase_db1 import CHASEDB1
+from .pp_humanseg14k import PPHumanSeg14K
--- a/paddlers/models/ppseg/datasets/ade.py
+++ b/paddlers/models/ppseg/datasets/ade.py
@ -0,0 +1,111 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import numpy as np
+from PIL import Image
+
+from paddlers.models.ppseg.datasets import Dataset
+from paddlers.models.ppseg.utils.download import download_file_and_uncompress
+from paddlers.models.ppseg.utils import seg_env
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.transforms import Compose
+import paddlers.models.ppseg.transforms.functional as F
+
+URL = "http://data.csail.mit.edu/places/ADEchallenge/ADEChallengeData2016.zip"
+
+
+@manager.DATASETS.add_component
+class ADE20K(Dataset):
+    """
+    ADE20K dataset `http://sceneparsing.csail.mit.edu/`.
+
+    When `dataset_root` is None or does not exist, the archive at `URL` is
+    fetched through `download_file_and_uncompress`.
+
+    Args:
+        transforms (list): A list of image transformations.
+        dataset_root (str, optional): The ADE20K dataset directory. Default: None.
+        mode (str, optional): A subset of the entire dataset. It should be one of ('train', 'val'). Default: 'train'.
+        edge (bool, optional): Whether to compute edge while training. Default: False
+    """
+    NUM_CLASSES = 150
+
+    def __init__(self, transforms, dataset_root=None, mode='train', edge=False):
+        self.dataset_root = dataset_root
+        self.transforms = Compose(transforms)
+        self.mode = mode = mode.lower()
+        self.file_list = list()
+        self.num_classes = self.NUM_CLASSES
+        self.ignore_index = 255
+        self.edge = edge
+
+        if mode not in ['train', 'val']:
+            raise ValueError(
+                "`mode` should be one of ('train', 'val') in ADE20K dataset, but got {}."
+                .format(mode))
+
+        if self.transforms is None:
+            raise ValueError("`transforms` is necessary, but it is None.")
+
+        if self.dataset_root is None:
+            # No root given: use the default data home.
+            self.dataset_root = download_file_and_uncompress(
+                url=URL,
+                savepath=seg_env.DATA_HOME,
+                extrapath=seg_env.DATA_HOME,
+                extraname='ADEChallengeData2016')
+        elif not os.path.exists(self.dataset_root):
+            # Root given but missing: split it into parent directory and
+            # folder name and pass those to the download helper.
+            self.dataset_root = os.path.normpath(self.dataset_root)
+            parent_dir, folder_name = self.dataset_root.rsplit(
+                sep=os.path.sep, maxsplit=1)
+            self.dataset_root = download_file_and_uncompress(
+                url=URL,
+                savepath=parent_dir,
+                extrapath=parent_dir,
+                extraname=folder_name)
+
+        sub_dirs = {
+            'train': ('images/training', 'annotations/training'),
+            'val': ('images/validation', 'annotations/validation'),
+        }
+        img_sub_dir, label_sub_dir = sub_dirs[mode]
+        img_dir = os.path.join(self.dataset_root, img_sub_dir)
+        label_dir = os.path.join(self.dataset_root, label_sub_dir)
+
+        # Each label file is named after its image, with `.jpg` replaced by `.png`.
+        for img_name in os.listdir(img_dir):
+            label_name = img_name.replace('.jpg', '.png')
+            self.file_list.append([
+                os.path.join(img_dir, img_name),
+                os.path.join(label_dir, label_name)
+            ])
+
+    def __getitem__(self, idx):
+        image_path, label_path = self.file_list[idx]
+
+        if self.mode != 'val':
+            im, label = self.transforms(im=image_path, label=label_path)
+            label = label - 1
+            # Recover the ignore pixels adding by transform
+            label[label == 254] = 255
+            if not self.edge:
+                return im, label
+            edge_mask = F.mask_to_binary_edge(
+                label, radius=2, num_classes=self.num_classes)
+            return im, label, edge_mask
+
+        # Validation: only the image is transformed, the label is read as is.
+        im, _ = self.transforms(im=image_path)
+        # The class 0 is ignored. And it will equal to 255 after
+        # subtracted 1, because the dtype of label is uint8.
+        label = np.asarray(Image.open(label_path)) - 1
+        return im, label[np.newaxis, :, :]
--- a/paddlers/models/ppseg/datasets/chase_db1.py
+++ b/paddlers/models/ppseg/datasets/chase_db1.py
@ -0,0 +1,98 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+from paddlers.models.ppseg.utils.download import download_file_and_uncompress
+from paddlers.models.ppseg.utils import seg_env
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.transforms import Compose
+from paddlers.models.ppseg.datasets import Dataset
+
+# Archive passed to `download_file_and_uncompress` by CHASEDB1 when
+# `dataset_root` is None or does not exist on disk.
+URL = 'https://bj.bcebos.com/paddleseg/dataset/chase_db1/chase_db1.zip'
+
+
+@manager.DATASETS.add_component
+class CHASEDB1(Dataset):
+    """
+    CHASE_DB1 dataset is a dataset for retinal vessel segmentation
+    which contains 28 color retina images with the size of 999×960 pixels.
+    It is collected from both left and right eyes of 14 school children.
+    Each image is annotated by two independent human experts, and we choose the labels from 1st expert.
+    (https://blogs.kingston.ac.uk/retinal/chasedb1/)
+
+    Args:
+        dataset_root (str, optional): The dataset directory. When it is None or does not
+            exist, the dataset is fetched from `URL`. Default: None.
+        transforms (list): Transforms for image. Must not be None.
+        edge (bool, optional): Whether to extract edge information in the output. Default: False.
+        mode (str, optional): Which part of dataset to use. It is one of ('train', 'val', 'test'). Default: 'train'.
+
+    Raises:
+        ValueError: If `transforms` is None, if `mode` is not a supported value, or if a
+            line of the 'train'/'val' file list does not hold exactly two items.
+    """
+    NUM_CLASSES = 2
+
+    def __init__(self,
+                 dataset_root=None,
+                 transforms=None,
+                 edge=False,
+                 mode='train'):
+        # Check the raw argument: once it is wrapped in `Compose` the
+        # resulting object is never None, so testing `self.transforms`
+        # could not detect a missing value.
+        if transforms is None:
+            raise ValueError("`transforms` is necessary, but it is None.")
+
+        self.dataset_root = dataset_root
+        self.transforms = Compose(transforms)
+        mode = mode.lower()
+        self.mode = mode
+        self.edge = edge
+        self.file_list = list()
+        self.num_classes = self.NUM_CLASSES
+        self.ignore_index = 255  # labels only have 1/0, thus ignore_index is not necessary
+
+        if mode not in ['train', 'val', 'test']:
+            raise ValueError(
+                "`mode` should be 'train', 'val' or 'test', but got {}.".format(
+                    mode))
+
+        if self.dataset_root is None:
+            # No location given: download into the default data home.
+            self.dataset_root = download_file_and_uncompress(
+                url=URL,
+                savepath=seg_env.DATA_HOME,
+                extrapath=seg_env.DATA_HOME)
+        elif not os.path.exists(self.dataset_root):
+            # The requested location is missing: download next to it, using
+            # the last path component as the extracted directory name.
+            self.dataset_root = os.path.normpath(self.dataset_root)
+            savepath, extraname = self.dataset_root.rsplit(
+                sep=os.path.sep, maxsplit=1)
+            self.dataset_root = download_file_and_uncompress(
+                url=URL,
+                savepath=savepath,
+                extrapath=savepath,
+                extraname=extraname)
+
+        if mode == 'train':
+            file_path = os.path.join(self.dataset_root, 'train_list.txt')
+        elif mode == 'val':
+            file_path = os.path.join(self.dataset_root, 'val_list.txt')
+        else:
+            # 'test' is an accepted mode, but previously left `file_path`
+            # unbound and crashed with UnboundLocalError below.
+            # NOTE(review): 'test_list.txt' mirrors the train/val naming;
+            # confirm the archive ships one. If it does not, `open` now
+            # reports the missing file explicitly.
+            file_path = os.path.join(self.dataset_root, 'test_list.txt')
+
+        with open(file_path, 'r') as f:
+            for line in f:
+                items = line.strip().split()
+                if len(items) != 2:
+                    # Labels are mandatory for training and evaluation;
+                    # only 'test' may list an image on its own.
+                    if mode == 'train' or mode == 'val':
+                        raise ValueError(
+                            "File list format incorrect! It should be"
+                            " image_name label_name\\n")
+                    image_path = os.path.join(self.dataset_root, items[0])
+                    grt_path = None
+                else:
+                    image_path = os.path.join(self.dataset_root, items[0])
+                    grt_path = os.path.join(self.dataset_root, items[1])
+                self.file_list.append([image_path, grt_path])
--- a/paddlers/models/ppseg/datasets/cityscapes.py
+++ b/paddlers/models/ppseg/datasets/cityscapes.py
@ -0,0 +1,87 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import glob
+
+from paddlers.models.ppseg.datasets import Dataset
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.transforms import Compose
+
+
+@manager.DATASETS.add_component
+class Cityscapes(Dataset):
+    """
+    Cityscapes dataset `https://www.cityscapes-dataset.com/`.
+    The folder structure is as follow:
+
+        cityscapes
+        |
+        |--leftImg8bit
+        |  |--train
+        |  |--val
+        |  |--test
+        |
+        |--gtFine
+        |  |--train
+        |  |--val
+        |  |--test
+
+    Make sure there are **labelTrainIds.png in gtFine directory. If not, please run the convert_cityscapes.py in tools.
+
+    Args:
+        transforms (list): Transforms for image. Must not be None.
+        dataset_root (str): Cityscapes dataset directory.
+        mode (str, optional): Which part of dataset to use. it is one of ('train', 'val', 'test'). Default: 'train'.
+        edge (bool, optional): Whether to compute edge while training. Default: False
+
+    Raises:
+        ValueError: If `transforms` is None, if `mode` is not a supported value, or if
+            `dataset_root` is None or lacks the `leftImg8bit`/`gtFine` directories.
+    """
+    NUM_CLASSES = 19
+
+    def __init__(self, transforms, dataset_root, mode='train', edge=False):
+        # Check the raw argument: once it is wrapped in `Compose` the
+        # resulting object is never None, so testing `self.transforms`
+        # could not detect a missing value.
+        if transforms is None:
+            raise ValueError("`transforms` is necessary, but it is None.")
+
+        self.dataset_root = dataset_root
+        self.transforms = Compose(transforms)
+        self.file_list = list()
+        mode = mode.lower()
+        self.mode = mode
+        self.num_classes = self.NUM_CLASSES
+        self.ignore_index = 255
+        self.edge = edge
+
+        if mode not in ['train', 'val', 'test']:
+            raise ValueError(
+                "mode should be 'train', 'val' or 'test', but got {}.".format(
+                    mode))
+
+        # Validate the root before joining onto it: `os.path.join(None, ...)`
+        # raises TypeError, which used to pre-empt the ValueError below.
+        if self.dataset_root is None or not os.path.isdir(self.dataset_root):
+            raise ValueError(
+                "The dataset is not Found or the folder structure is nonconfoumance."
+            )
+
+        img_dir = os.path.join(self.dataset_root, 'leftImg8bit')
+        label_dir = os.path.join(self.dataset_root, 'gtFine')
+        if not os.path.isdir(img_dir) or not os.path.isdir(label_dir):
+            raise ValueError(
+                "The dataset is not Found or the folder structure is nonconfoumance."
+            )
+
+        label_files = sorted(
+            glob.glob(
+                os.path.join(label_dir, mode, '*',
+                             '*_gtFine_labelTrainIds.png')))
+        img_files = sorted(
+            glob.glob(os.path.join(img_dir, mode, '*', '*_leftImg8bit.png')))
+
+        # Images and labels are paired purely by sorted position; `zip`
+        # stops at the shorter list if the two counts differ.
+        self.file_list = [[
+            img_path, label_path
+        ] for img_path, label_path in zip(img_files, label_files)]
--- a/paddlers/models/ppseg/datasets/cocostuff.py
+++ b/paddlers/models/ppseg/datasets/cocostuff.py
@ -0,0 +1,82 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import glob
+
+from paddlers.models.ppseg.datasets import Dataset
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.transforms import Compose
+
+
+@manager.DATASETS.add_component
+class CocoStuff(Dataset):
+    """
+    COCO-Stuff dataset `https://github.com/nightrome/cocostuff`.
+    The folder structure is as follow:
+
+        cocostuff
+        |
+        |--images
+        |  |--train2017
+        |  |--val2017
+        |
+        |--annotations
+        |  |--train2017
+        |  |--val2017
+
+
+    Args:
+        transforms (list): Transforms for image. Must not be None.
+        dataset_root (str): COCO-Stuff dataset directory.
+        mode (str): Which part of dataset to use. it is one of ('train', 'val'). Default: 'train'.
+        edge (bool, optional): Whether to compute edge while training. Default: False
+
+    Raises:
+        ValueError: If `transforms` is None, if `mode` is not a supported value, or if
+            `dataset_root` is None or lacks the `images`/`annotations` directories.
+    """
+    NUM_CLASSES = 171
+
+    def __init__(self, transforms, dataset_root, mode='train', edge=False):
+        # Check the raw argument: once it is wrapped in `Compose` the
+        # resulting object is never None, so testing `self.transforms`
+        # could not detect a missing value.
+        if transforms is None:
+            raise ValueError("`transforms` is necessary, but it is None.")
+
+        self.dataset_root = dataset_root
+        self.transforms = Compose(transforms)
+        self.file_list = list()
+        mode = mode.lower()
+        self.mode = mode
+        self.num_classes = self.NUM_CLASSES
+        self.ignore_index = 255
+        self.edge = edge
+
+        if mode not in ['train', 'val']:
+            raise ValueError(
+                "mode should be 'train', 'val', but got {}.".format(mode))
+
+        # Validate the root before joining onto it: `os.path.join(None, ...)`
+        # raises TypeError, which used to pre-empt the ValueError below.
+        if self.dataset_root is None or not os.path.isdir(self.dataset_root):
+            raise ValueError(
+                "The dataset is not Found or the folder structure is nonconfoumance."
+            )
+
+        img_dir = os.path.join(self.dataset_root, 'images')
+        label_dir = os.path.join(self.dataset_root, 'annotations')
+        if not os.path.isdir(img_dir) or not os.path.isdir(label_dir):
+            raise ValueError(
+                "The dataset is not Found or the folder structure is nonconfoumance."
+            )
+
+        # The split directories are named after the mode, e.g. 'train2017'.
+        split_dir = mode + '2017'
+        label_files = sorted(
+            glob.glob(os.path.join(label_dir, split_dir, '*.png')))
+        img_files = sorted(
+            glob.glob(os.path.join(img_dir, split_dir, '*.jpg')))
+
+        # Images and labels are paired purely by sorted position; `zip`
+        # stops at the shorter list if the two counts differ.
+        self.file_list = [[
+            img_path, label_path
+        ] for img_path, label_path in zip(img_files, label_files)]
--- a/paddlers/models/ppseg/datasets/dataset.py
+++ b/paddlers/models/ppseg/datasets/dataset.py
@ -0,0 +1,162 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import paddle
+import numpy as np
+from PIL import Image
+
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.transforms import Compose
+import paddlers.models.ppseg.transforms.functional as F
+
+
+@manager.DATASETS.add_component
+class Dataset(paddle.io.Dataset):
+    """
+    Pass in a custom dataset that conforms to the format.
+
+    Args:
+        transforms (list): Transforms for image. Must not be None.
+        dataset_root (str): The dataset directory.
+        num_classes (int): Number of classes.
+        mode (str, optional): which part of dataset to use. it is one of ('train', 'val', 'test'). Default: 'train'.
+        train_path (str, optional): The train dataset file. When mode is 'train', train_path is necessary.
+            The contents of train_path file are as follow:
+            image1.jpg ground_truth1.png
+            image2.jpg ground_truth2.png
+        val_path (str, optional): The evaluation dataset file. When mode is 'val', val_path is necessary.
+            The contents is the same as train_path
+        test_path (str, optional): The test dataset file. When mode is 'test', test_path is necessary.
+            The annotation file is not necessary in test_path file.
+        separator (str, optional): The separator of dataset list. Default: ' '.
+        ignore_index (int, optional): Value stored as `self.ignore_index`. Default: 255.
+        edge (bool, optional): Whether to compute edge while training. Default: False
+
+    Raises:
+        ValueError: If `transforms` is None, if `mode` is not a supported value, if the
+            list file required by `mode` is None, or if a 'train'/'val' list line does
+            not hold exactly two items.
+        FileNotFoundError: If `dataset_root` or the list file required by `mode` does not exist.
+
+    Examples:
+
+        import paddlers.models.ppseg.transforms as T
+        from paddlers.models.ppseg.datasets import Dataset
+
+        transforms = [T.RandomPaddingCrop(crop_size=(512,512)), T.Normalize()]
+        dataset_root = 'dataset_root_path'
+        train_path = 'train_path'
+        num_classes = 2
+        dataset = Dataset(transforms = transforms,
+                          dataset_root = dataset_root,
+                          num_classes = 2,
+                          train_path = train_path,
+                          mode = 'train')
+
+    """
+
+    def __init__(self,
+                 transforms,
+                 dataset_root,
+                 num_classes,
+                 mode='train',
+                 train_path=None,
+                 val_path=None,
+                 test_path=None,
+                 separator=' ',
+                 ignore_index=255,
+                 edge=False):
+        # Check the raw argument: once it is wrapped in `Compose` the
+        # resulting object is never None, so testing `self.transforms`
+        # could not detect a missing value.
+        if transforms is None:
+            raise ValueError("`transforms` is necessary, but it is None.")
+
+        self.dataset_root = dataset_root
+        self.transforms = Compose(transforms)
+        self.file_list = list()
+        self.mode = mode.lower()
+        self.num_classes = num_classes
+        self.ignore_index = ignore_index
+        self.edge = edge
+
+        if self.mode not in ['train', 'val', 'test']:
+            raise ValueError(
+                "mode should be 'train', 'val' or 'test', but got {}.".format(
+                    self.mode))
+
+        if not os.path.exists(self.dataset_root):
+            raise FileNotFoundError('there is not `dataset_root`: {}.'.format(
+                self.dataset_root))
+
+        # Each mode reads its own list file; only the one matching the
+        # active mode is required and validated.
+        list_files = {
+            'train': train_path,
+            'val': val_path,
+            'test': test_path,
+        }
+        file_path = list_files[self.mode]
+        if file_path is None:
+            raise ValueError(
+                'When `mode` is "{0}", `{0}_path` is necessary, but it is None.'
+                .format(self.mode))
+        if not os.path.exists(file_path):
+            raise FileNotFoundError('`{}_path` is not found: {}'.format(
+                self.mode, file_path))
+
+        with open(file_path, 'r') as f:
+            for line in f:
+                items = line.strip().split(separator)
+                if len(items) != 2:
+                    # Labels are mandatory for training and evaluation;
+                    # only 'test' may list an image on its own.
+                    if self.mode == 'train' or self.mode == 'val':
+                        raise ValueError(
+                            "File list format incorrect! In training or evaluation task it should be"
+                            " image_name{}label_name\\n".format(separator))
+                    image_path = os.path.join(self.dataset_root, items[0])
+                    label_path = None
+                else:
+                    image_path = os.path.join(self.dataset_root, items[0])
+                    label_path = os.path.join(self.dataset_root, items[1])
+                self.file_list.append([image_path, label_path])
+
+    def __getitem__(self, idx):
+        """
+        Load the sample stored at position `idx` of `self.file_list`.
+
+        Returns:
+            tuple: `(im, image_path)` in 'test' mode, `(im, label)` in 'val' mode, and in
+                'train' mode `(im, label, edge_mask)` if `self.edge` is truthy, else `(im, label)`.
+        """
+        image_path, label_path = self.file_list[idx]
+        if self.mode == 'test':
+            im, _ = self.transforms(im=image_path)
+            # Prepend a new leading axis to the transformed image.
+            im = im[np.newaxis, ...]
+            return im, image_path
+        elif self.mode == 'val':
+            # Only the image goes through the transforms; the label is read
+            # straight from disk and given a new leading axis.
+            im, _ = self.transforms(im=image_path)
+            label = np.asarray(Image.open(label_path))
+            label = label[np.newaxis, :, :]
+            return im, label
+        else:
+            im, label = self.transforms(im=image_path, label=label_path)
+            if self.edge:
+                edge_mask = F.mask_to_binary_edge(
+                    label, radius=2, num_classes=self.num_classes)
+                return im, label, edge_mask
+            else:
+                return im, label
+
+    def __len__(self):
+        """Return the number of samples listed in `self.file_list`."""
+        return len(self.file_list)
--- a/paddlers/models/ppseg/datasets/drive.py
+++ b/paddlers/models/ppseg/datasets/drive.py
@ -0,0 +1,96 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+from paddlers.models.ppseg.utils.download import download_file_and_uncompress
+from paddlers.models.ppseg.utils import seg_env
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.transforms import Compose
+from paddlers.models.ppseg.datasets import Dataset
+
+# Archive passed to `download_file_and_uncompress` by DRIVE when
+# `dataset_root` is None or does not exist on disk.
+URL = 'https://bj.bcebos.com/paddleseg/dataset/drive/drive.zip'
+
+
+@manager.DATASETS.add_component
+class DRIVE(Dataset):
+    """
+    The Digital Retinal Images for Vessel Extraction (DRIVE) dataset is a dataset for retinal vessel segmentation.
+    It consists of a total of 40 JPEG color fundus images which are of size (584, 565); including 7 abnormal pathology cases.
+    (http://www.isi.uu.nl/Research/Databases/DRIVE/)
+
+    Args:
+        dataset_root (str, optional): The dataset directory. When it is None or does not
+            exist, the dataset is fetched from `URL`. Default: None.
+        transforms (list): Transforms for image. Must not be None.
+        edge (bool, optional): Whether to extract edge information in the output. Default: False.
+        mode (str, optional): Which part of dataset to use. It is one of ('train', 'val', 'test'). Default: 'train'.
+
+    Raises:
+        ValueError: If `transforms` is None, if `mode` is not a supported value, or if a
+            line of the 'train'/'val' file list does not hold exactly two items.
+    """
+    NUM_CLASSES = 2
+
+    def __init__(self,
+                 dataset_root=None,
+                 transforms=None,
+                 edge=False,
+                 mode='train'):
+        # Check the raw argument: once it is wrapped in `Compose` the
+        # resulting object is never None, so testing `self.transforms`
+        # could not detect a missing value.
+        if transforms is None:
+            raise ValueError("`transforms` is necessary, but it is None.")
+
+        self.dataset_root = dataset_root
+        self.transforms = Compose(transforms)
+        mode = mode.lower()
+        self.mode = mode
+        self.edge = edge
+        self.file_list = list()
+        self.num_classes = self.NUM_CLASSES
+        self.ignore_index = 255  # labels only have 1/0, thus ignore_index is not necessary
+
+        if mode not in ['train', 'val', 'test']:
+            raise ValueError(
+                "`mode` should be 'train', 'val' or 'test', but got {}.".format(
+                    mode))
+
+        if self.dataset_root is None:
+            # No location given: download into the default data home.
+            self.dataset_root = download_file_and_uncompress(
+                url=URL,
+                savepath=seg_env.DATA_HOME,
+                extrapath=seg_env.DATA_HOME)
+        elif not os.path.exists(self.dataset_root):
+            # The requested location is missing: download next to it, using
+            # the last path component as the extracted directory name.
+            self.dataset_root = os.path.normpath(self.dataset_root)
+            savepath, extraname = self.dataset_root.rsplit(
+                sep=os.path.sep, maxsplit=1)
+            self.dataset_root = download_file_and_uncompress(
+                url=URL,
+                savepath=savepath,
+                extrapath=savepath,
+                extraname=extraname)
+
+        if mode == 'train':
+            file_path = os.path.join(self.dataset_root, 'train_list.txt')
+        elif mode == 'val':
+            file_path = os.path.join(self.dataset_root, 'val_list.txt')
+        else:
+            # 'test' is an accepted mode, but previously left `file_path`
+            # unbound and crashed with UnboundLocalError below.
+            # NOTE(review): 'test_list.txt' mirrors the train/val naming;
+            # confirm the archive ships one. If it does not, `open` now
+            # reports the missing file explicitly.
+            file_path = os.path.join(self.dataset_root, 'test_list.txt')
+
+        with open(file_path, 'r') as f:
+            for line in f:
+                items = line.strip().split()
+                if len(items) != 2:
+                    # Labels are mandatory for training and evaluation;
+                    # only 'test' may list an image on its own.
+                    if mode == 'train' or mode == 'val':
+                        raise ValueError(
+                            "File list format incorrect! It should be"
+                            " image_name label_name\\n")
+                    image_path = os.path.join(self.dataset_root, items[0])
+                    grt_path = None
+                else:
+                    image_path = os.path.join(self.dataset_root, items[0])
+                    grt_path = os.path.join(self.dataset_root, items[1])
+                self.file_list.append([image_path, grt_path])
--- a/paddlers/models/ppseg/datasets/eg1800.py
+++ b/paddlers/models/ppseg/datasets/eg1800.py
@ -0,0 +1,136 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import copy
+
+import cv2
+import numpy as np
+
+from paddlers.models.ppseg.datasets import Dataset
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.transforms import Compose
+from paddlers.models.ppseg.utils.download import download_file_and_uncompress
+from paddlers.models.ppseg.utils import seg_env
+import paddlers.models.ppseg.transforms.functional as F
+
+# Archive passed to `download_file_and_uncompress` by EG1800 when
+# `dataset_root` is None or does not exist on disk.
+URL = "https://paddleseg.bj.bcebos.com/dataset/EG1800.zip"
+
+
+@manager.DATASETS.add_component
+class EG1800(Dataset):
+    """
+    EG1800 dataset `http://xiaoyongshen.me/webpage_portrait/index.html`.
+
+    Args:
+        common_transforms (list): A list of common image transformations for two inputs of portrait net.
+        transforms1 (list): A list of image transformations for the first input of portrait net.
+        transforms2 (list): A list of image transformations for the second input of portrait net.
+        dataset_root (str, optional): The EG1800 dataset directory. When it is None or does not
+            exist, the dataset is fetched from `URL`. Default: None.
+        mode (str, optional): A subset of the entire dataset. It should be one of ('train', 'val'). Default: 'train'.
+        edge (bool, optional): Whether to compute edge while training. Default: False
+    """
+    NUM_CLASSES = 2
+
+    def __init__(self,
+                 common_transforms,
+                 transforms1,
+                 transforms2,
+                 dataset_root=None,
+                 mode='train',
+                 edge=False):
+        self.dataset_root = dataset_root
+        self.common_transforms = Compose(common_transforms)
+        self.transforms = self.common_transforms
+        # `transforms1`/`transforms2` are only attached when supplied; the
+        # non-'val' path of `__getitem__` needs both of them.
+        if transforms1 is not None:
+            self.transforms1 = Compose(transforms1, to_rgb=False)
+        if transforms2 is not None:
+            self.transforms2 = Compose(transforms2, to_rgb=False)
+        mode = mode.lower()
+        self.ignore_index = 255
+        self.mode = mode
+        # Keep the flag instead of silently dropping the argument. Note that
+        # `__getitem__` below decides on the edge mask from `self.mode`, not
+        # from this attribute.
+        self.edge = edge
+        self.num_classes = self.NUM_CLASSES
+        self.input_width = 224
+        self.input_height = 224
+
+        if self.dataset_root is None:
+            # No location given: download into the default data home.
+            self.dataset_root = download_file_and_uncompress(
+                url=URL,
+                savepath=seg_env.DATA_HOME,
+                extrapath=seg_env.DATA_HOME)
+        elif not os.path.exists(self.dataset_root):
+            # The requested location is missing: download next to it, using
+            # the last path component as the extracted directory name.
+            self.dataset_root = os.path.normpath(self.dataset_root)
+            savepath, extraname = self.dataset_root.rsplit(
+                sep=os.path.sep, maxsplit=1)
+            self.dataset_root = download_file_and_uncompress(
+                url=URL,
+                savepath=savepath,
+                extrapath=savepath,
+                extraname=extraname)
+
+        # Build every path from `self.dataset_root`, which holds the resolved
+        # (possibly downloaded) location. The raw `dataset_root` argument may
+        # be None, which made `os.path.join` raise TypeError here.
+        if mode == 'train':
+            path = os.path.join(self.dataset_root, 'eg1800_train.txt')
+        else:
+            path = os.path.join(self.dataset_root, 'eg1800_test.txt')
+        with open(path, 'r') as f:
+            files = f.readlines()
+        # Each list line names a file present under both Images/ and Labels/.
+        img_files = [
+            os.path.join(self.dataset_root, 'Images', name).strip()
+            for name in files
+        ]
+        label_files = [
+            os.path.join(self.dataset_root, 'Labels', name).strip()
+            for name in files
+        ]
+
+        self.file_list = [[
+            img_path, label_path
+        ] for img_path, label_path in zip(img_files, label_files)]
+
+    def __getitem__(self, item):
+        """
+        Load the sample stored at position `item` of `self.file_list`.
+
+        Returns:
+            tuple: `(im, label, edge_mask)` when `self.mode` is "train", otherwise
+                `(im, label)`. `im` is the concatenation of the second and first
+                network inputs, in that order.
+        """
+        image_path, label_path = self.file_list[item]
+        im = cv2.imread(image_path)
+        label = cv2.imread(label_path, 0)
+        # Any label value above 1 is reset to 0.
+        label[label > 1] = 0
+
+        if self.mode == "val":
+            common_im, label = self.common_transforms(im=im, label=label)
+            # Reverse the first axis (RGB => BGR per the original author;
+            # presumably a channel-first layout - TODO confirm).
+            im = np.float32(common_im[::-1, :, :])  # RGB => BGR
+            # Without augmentation both network inputs are the same image.
+            im_aug = copy.deepcopy(im)
+        else:
+            common_im, label = self.common_transforms(im=im, label=label)
+            # Move the first axis to the end before the per-input transforms.
+            common_im = np.transpose(common_im, [1, 2, 0])
+            # add augmentation
+            im, _ = self.transforms1(common_im)
+            im_aug, _ = self.transforms2(common_im)
+
+            im = np.float32(im[::-1, :, :])  # RGB => BGR
+            im_aug = np.float32(im_aug[::-1, :, :])  # RGB => BGR
+
+        label = cv2.resize(
+            np.uint8(label), (self.input_width, self.input_height),
+            interpolation=cv2.INTER_NEAREST)
+
+        # add mask blur, then binarize the blurred mask back to {0, 1}
+        label = np.uint8(cv2.blur(label, (5, 5)))
+        label[label >= 0.5] = 1
+        label[label < 0.5] = 0
+
+        edge_mask = F.mask_to_binary_edge(
+            label, radius=4, num_classes=self.num_classes)
+        # Move the first axis to the end and drop it (it must have size 1).
+        edge_mask = np.transpose(edge_mask, [1, 2, 0]).squeeze(axis=-1)
+        im = np.concatenate([im_aug, im])
+        if self.mode == "train":
+            return im, label, edge_mask
+        else:
+            return im, label
--- a/paddlers/models/ppseg/datasets/hrf.py
+++ b/paddlers/models/ppseg/datasets/hrf.py
@ -0,0 +1,95 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+from paddlers.models.ppseg.utils.download import download_file_and_uncompress
+from paddlers.models.ppseg.utils import seg_env
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.transforms import Compose
+from paddlers.models.ppseg.datasets import Dataset
+
+# Archive passed to `download_file_and_uncompress` by HRF when
+# `dataset_root` is None or does not exist on disk.
+URL = 'https://bj.bcebos.com/paddleseg/dataset/hrf/hrf.zip'
+
+
+@manager.DATASETS.add_component
+class HRF(Dataset):
+    """
+    The HRF dataset is a dataset for retinal vessel segmentation which comprises 45 images and is organized as 15 subsets. Each subset contains one healthy fundus image, one image of patient with diabetic retinopathy and one glaucoma image. The image sizes are 3,304 x 2,336, with a training/testing image split of 21/24.
+    (https://doi.org/10.1155/2013/154860)
+
+    Args:
+        dataset_root (str, optional): The dataset directory. When it is None or does not
+            exist, the dataset is fetched from `URL`. Default: None.
+        transforms (list): Transforms for image. Must not be None.
+        edge (bool, optional): Whether to extract edge information in the output. Default: False.
+        mode (str, optional): Which part of dataset to use. It is one of ('train', 'val', 'test'). Default: 'train'.
+
+    Raises:
+        ValueError: If `transforms` is None, if `mode` is not a supported value, or if a
+            line of the 'train'/'val' file list does not hold exactly two items.
+    """
+    NUM_CLASSES = 2
+
+    def __init__(self,
+                 dataset_root=None,
+                 transforms=None,
+                 edge=False,
+                 mode='train'):
+        # Check the raw argument: once it is wrapped in `Compose` the
+        # resulting object is never None, so testing `self.transforms`
+        # could not detect a missing value.
+        if transforms is None:
+            raise ValueError("`transforms` is necessary, but it is None.")
+
+        self.dataset_root = dataset_root
+        self.transforms = Compose(transforms)
+        mode = mode.lower()
+        self.mode = mode
+        self.edge = edge
+        self.file_list = list()
+        self.num_classes = self.NUM_CLASSES
+        self.ignore_index = 255
+
+        if mode not in ['train', 'val', 'test']:
+            raise ValueError(
+                "`mode` should be 'train', 'val' or 'test', but got {}.".format(
+                    mode))
+
+        if self.dataset_root is None:
+            # No location given: download into the default data home.
+            self.dataset_root = download_file_and_uncompress(
+                url=URL,
+                savepath=seg_env.DATA_HOME,
+                extrapath=seg_env.DATA_HOME)
+        elif not os.path.exists(self.dataset_root):
+            # The requested location is missing: download next to it, using
+            # the last path component as the extracted directory name.
+            self.dataset_root = os.path.normpath(self.dataset_root)
+            savepath, extraname = self.dataset_root.rsplit(
+                sep=os.path.sep, maxsplit=1)
+            self.dataset_root = download_file_and_uncompress(
+                url=URL,
+                savepath=savepath,
+                extrapath=savepath,
+                extraname=extraname)
+
+        if mode == 'train':
+            file_path = os.path.join(self.dataset_root, 'train_list.txt')
+        elif mode == 'val':
+            file_path = os.path.join(self.dataset_root, 'val_list.txt')
+        else:
+            # 'test' is an accepted mode, but previously left `file_path`
+            # unbound and crashed with UnboundLocalError below.
+            # NOTE(review): 'test_list.txt' mirrors the train/val naming;
+            # confirm the archive ships one. If it does not, `open` now
+            # reports the missing file explicitly.
+            file_path = os.path.join(self.dataset_root, 'test_list.txt')
+
+        with open(file_path, 'r') as f:
+            for line in f:
+                items = line.strip().split()
+                if len(items) != 2:
+                    # Labels are mandatory for training and evaluation;
+                    # only 'test' may list an image on its own.
+                    if mode == 'train' or mode == 'val':
+                        raise ValueError(
+                            "File list format incorrect! It should be"
+                            " image_name label_name\\n")
+                    image_path = os.path.join(self.dataset_root, items[0])
+                    grt_path = None
+                else:
+                    image_path = os.path.join(self.dataset_root, items[0])
+                    grt_path = os.path.join(self.dataset_root, items[1])
+                self.file_list.append([image_path, grt_path])
--- a/paddlers/models/ppseg/datasets/mini_deep_globe_road_extraction.py
+++ b/paddlers/models/ppseg/datasets/mini_deep_globe_road_extraction.py
@ -0,0 +1,95 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+from .dataset import Dataset
+from paddlers.models.ppseg.utils.download import download_file_and_uncompress
+from paddlers.models.ppseg.utils import seg_env
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.transforms import Compose
+
+URL = "https://paddleseg.bj.bcebos.com/dataset/MiniDeepGlobeRoadExtraction.zip"
+
+
+@manager.DATASETS.add_component
+class MiniDeepGlobeRoadExtraction(Dataset):
+    """
+    MiniDeepGlobeRoadExtraction dataset is extracted from the DeepGlobe CVPR2018 challenge (http://deepglobe.org/).
+
+    There are 800 images in the training set and 200 images in the validation set.
+
+    Args:
+        dataset_root (str, optional): The dataset directory. When it is None, or the
+            path does not exist, `download_file_and_uncompress` is called with `URL`
+            and its return value is used as the dataset directory. Default: None.
+        transforms (list, optional): Transforms for image. Default: None.
+        mode (str, optional): Which part of dataset to use. It is one of ('train', 'val'). Default: 'train'.
+        edge (bool, optional): Whether to compute edge while training. Default: False.
+
+    Raises:
+        ValueError: If `mode` is neither 'train' nor 'val', or if the composed
+            transforms object is None.
+        Exception: If a line of the list file does not split into exactly two
+            '|'-separated fields.
+    """
+    NUM_CLASSES = 2
+
+    def __init__(self,
+                 dataset_root=None,
+                 transforms=None,
+                 mode='train',
+                 edge=False):
+        self.dataset_root = dataset_root
+        self.transforms = Compose(transforms)
+        self.mode = mode = mode.lower()
+        self.edge = edge
+        self.ignore_index = 255
+        self.num_classes = self.NUM_CLASSES
+        self.file_list = []
+
+        if mode not in ('train', 'val'):
+            raise ValueError(
+                "`mode` should be 'train' or 'val', but got {}.".format(mode))
+
+        if self.transforms is None:
+            raise ValueError("`transforms` is necessary, but it is None.")
+
+        if self.dataset_root is None:
+            # No directory supplied: fetch into the default data home.
+            self.dataset_root = download_file_and_uncompress(
+                url=URL,
+                savepath=seg_env.DATA_HOME,
+                extrapath=seg_env.DATA_HOME)
+        elif not os.path.exists(self.dataset_root):
+            # Directory supplied but missing: download into its parent and pass
+            # the last path component as `extraname`.
+            self.dataset_root = os.path.normpath(self.dataset_root)
+            parent_dir, folder_name = self.dataset_root.rsplit(
+                sep=os.path.sep, maxsplit=1)
+            self.dataset_root = download_file_and_uncompress(
+                url=URL,
+                savepath=parent_dir,
+                extrapath=parent_dir,
+                extraname=folder_name)
+
+        list_name = 'train.txt' if mode == 'train' else 'val.txt'
+        file_path = os.path.join(self.dataset_root, list_name)
+
+        with open(file_path, 'r') as f:
+            for line in f:
+                items = line.strip().split('|')
+                if len(items) == 2:
+                    image_path = os.path.join(self.dataset_root, items[0])
+                    grt_path = os.path.join(self.dataset_root, items[1])
+                elif mode in ('train', 'val'):
+                    # Labelled splits require both an image and a label entry.
+                    raise Exception(
+                        "File list format incorrect! It should be"
+                        " image_name|label_name\\n")
+                else:
+                    image_path = os.path.join(self.dataset_root, items[0])
+                    grt_path = None
+                self.file_list.append([image_path, grt_path])
--- a/paddlers/models/ppseg/datasets/optic_disc_seg.py
+++ b/paddlers/models/ppseg/datasets/optic_disc_seg.py
@ -0,0 +1,97 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+from .dataset import Dataset
+from paddlers.models.ppseg.utils.download import download_file_and_uncompress
+from paddlers.models.ppseg.utils import seg_env
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.transforms import Compose
+
+URL = "https://paddleseg.bj.bcebos.com/dataset/optic_disc_seg.zip"
+
+
+@manager.DATASETS.add_component
+class OpticDiscSeg(Dataset):
+    """
+    OpticDiscSeg dataset is extracted from iChallenge-AMD
+    (https://ai.baidu.com/broad/subordinate?dataset=amd).
+
+    Args:
+        dataset_root (str): The dataset directory. When it is None, or the path does
+            not exist, `download_file_and_uncompress` is called with `URL` and its
+            return value is used as the dataset directory. Default: None
+        transforms (list): Transforms for image.
+        mode (str, optional): Which part of dataset to use. it is one of ('train', 'val', 'test'). Default: 'train'.
+        edge (bool, optional): Whether to compute edge while training. Default: False
+
+    Raises:
+        ValueError: If `mode` is not one of the supported values, or if the
+            composed transforms object is None.
+        Exception: In 'train' or 'val' mode, if a line of the list file does not
+            split into exactly two whitespace-separated fields.
+    """
+    NUM_CLASSES = 2
+
+    def __init__(self,
+                 dataset_root=None,
+                 transforms=None,
+                 mode='train',
+                 edge=False):
+        self.dataset_root = dataset_root
+        self.transforms = Compose(transforms)
+        self.mode = mode = mode.lower()
+        self.edge = edge
+        self.ignore_index = 255
+        self.num_classes = self.NUM_CLASSES
+        self.file_list = []
+
+        if mode not in ('train', 'val', 'test'):
+            raise ValueError(
+                "`mode` should be 'train', 'val' or 'test', but got {}.".format(
+                    mode))
+
+        if self.transforms is None:
+            raise ValueError("`transforms` is necessary, but it is None.")
+
+        if self.dataset_root is None:
+            # No directory supplied: fetch into the default data home.
+            self.dataset_root = download_file_and_uncompress(
+                url=URL,
+                savepath=seg_env.DATA_HOME,
+                extrapath=seg_env.DATA_HOME)
+        elif not os.path.exists(self.dataset_root):
+            # Directory supplied but missing: download into its parent and pass
+            # the last path component as `extraname`.
+            self.dataset_root = os.path.normpath(self.dataset_root)
+            parent_dir, folder_name = self.dataset_root.rsplit(
+                sep=os.path.sep, maxsplit=1)
+            self.dataset_root = download_file_and_uncompress(
+                url=URL,
+                savepath=parent_dir,
+                extrapath=parent_dir,
+                extraname=folder_name)
+
+        # `mode` was validated above, so the lookup cannot miss.
+        list_files = {
+            'train': 'train_list.txt',
+            'val': 'val_list.txt',
+            'test': 'test_list.txt',
+        }
+        file_path = os.path.join(self.dataset_root, list_files[mode])
+
+        needs_label = mode in ('train', 'val')
+        with open(file_path, 'r') as f:
+            for line in f:
+                items = line.strip().split()
+                if len(items) == 2:
+                    image_path = os.path.join(self.dataset_root, items[0])
+                    grt_path = os.path.join(self.dataset_root, items[1])
+                elif needs_label:
+                    raise Exception(
+                        "File list format incorrect! It should be"
+                        " image_name label_name\\n")
+                else:
+                    # 'test' entries carry only the image; no ground truth.
+                    image_path = os.path.join(self.dataset_root, items[0])
+                    grt_path = None
+                self.file_list.append([image_path, grt_path])
--- a/paddlers/models/ppseg/datasets/pascal_context.py
+++ b/paddlers/models/ppseg/datasets/pascal_context.py
@ -0,0 +1,82 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+from PIL import Image
+from paddlers.models.ppseg.datasets import Dataset
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.transforms import Compose
+
+
+@manager.DATASETS.add_component
+class PascalContext(Dataset):
+    """
+    PascalVOC2010 dataset `http://host.robots.ox.ac.uk/pascal/VOC/`.
+    If you want to use pascal context dataset, please run the convert_voc2010.py in tools firstly.
+
+    Args:
+        transforms (list): Transforms for image.
+        dataset_root (str): The dataset directory. It is required; there is no
+            automatic download for this dataset. Default: None
+        mode (str): Which part of dataset to use. it is one of ('train', 'trainval', 'val').
+            Default: 'train'.
+        edge (bool, optional): Whether to compute edge while training. Default: False
+
+    Raises:
+        ValueError: If `mode` is not supported, if the composed transforms object
+            is None, or if `dataset_root` is None.
+        RuntimeError: If the list file for the selected mode does not exist under
+            `<dataset_root>/ImageSets/Segmentation`.
+    """
+    NUM_CLASSES = 60
+
+    def __init__(self, transforms=None, dataset_root=None, mode='train', edge=False):
+        self.dataset_root = dataset_root
+        self.transforms = Compose(transforms)
+        self.mode = mode = mode.lower()
+        self.edge = edge
+        self.ignore_index = 255
+        self.num_classes = self.NUM_CLASSES
+        self.file_list = []
+
+        # Each accepted mode maps to one list file written by the conversion tool.
+        list_files = {
+            'train': 'train_context.txt',
+            'trainval': 'trainval_context.txt',
+            'val': 'val_context.txt',
+        }
+
+        if mode not in list_files:
+            raise ValueError(
+                "`mode` should be one of ('train', 'trainval', 'val') in PascalContext dataset, but got {}."
+                .format(mode))
+
+        if self.transforms is None:
+            raise ValueError("`transforms` is necessary, but it is None.")
+        if self.dataset_root is None:
+            raise ValueError(
+                "The dataset is not Found or the folder structure is nonconfoumance."
+            )
+
+        image_set_dir = os.path.join(self.dataset_root, 'ImageSets',
+                                     'Segmentation')
+        file_path = os.path.join(image_set_dir, list_files[mode])
+        if not os.path.exists(file_path):
+            raise RuntimeError(
+                "PASCAL-Context annotations are not ready, "
+                "Please make sure voc_context.py has been properly run.")
+
+        img_dir = os.path.join(self.dataset_root, 'JPEGImages')
+        label_dir = os.path.join(self.dataset_root, 'Context')
+
+        with open(file_path, 'r') as f:
+            for line in f:
+                # Each line is a sample stem shared by the image and its label.
+                stem = line.strip()
+                self.file_list.append([
+                    os.path.join(img_dir, stem + '.jpg'),
+                    os.path.join(label_dir, stem + '.png'),
+                ])
--- a/paddlers/models/ppseg/datasets/pp_humanseg14k.py
+++ b/paddlers/models/ppseg/datasets/pp_humanseg14k.py
@ -0,0 +1,82 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+from .dataset import Dataset
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.transforms import Compose
+
+
+@manager.DATASETS.add_component
+class PPHumanSeg14K(Dataset):
+    """
+    This is the PP-HumanSeg14K Dataset.
+
+    This dataset was introduced in the work:
+    Chu, Lutao, et al. "PP-HumanSeg: Connectivity-Aware Portrait Segmentation with a Large-Scale Teleconferencing Video Dataset." Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision. 2022.
+
+    This dataset is divided into training set, validation set and test set. The training set includes 8770 pictures, the validation set includes 2431 pictures, and the test set includes 2482 pictures.
+
+    Args:
+        dataset_root (str): The dataset directory. It must be provided: this
+            class has no download fallback. Default: None.
+        transforms (list, optional): Transforms for image. Default: None.
+        mode (str, optional): Which part of dataset to use. It is one of ('train', 'val', 'test'). Default: 'train'.
+        edge (bool, optional): Whether to compute edge while training. Default: False.
+
+    Raises:
+        ValueError: If `mode` is not supported, if the composed transforms object
+            is None, or if `dataset_root` is None.
+        Exception: In 'train' or 'val' mode, if a line of the list file does not
+            split into exactly two space-separated fields.
+    """
+    NUM_CLASSES = 2
+
+    def __init__(self,
+                 dataset_root=None,
+                 transforms=None,
+                 mode='train',
+                 edge=False):
+        self.dataset_root = dataset_root
+        self.transforms = Compose(transforms)
+        mode = mode.lower()
+        self.mode = mode
+        self.file_list = list()
+        self.num_classes = self.NUM_CLASSES
+        self.ignore_index = 255
+        self.edge = edge
+
+        if mode not in ['train', 'val', 'test']:
+            raise ValueError(
+                "`mode` should be 'train', 'val' or 'test', but got {}.".format(
+                    mode))
+
+        if self.transforms is None:
+            raise ValueError("`transforms` is necessary, but it is None.")
+
+        # Without this guard the default `dataset_root=None` reaches
+        # os.path.join and fails with an unhelpful TypeError.
+        if self.dataset_root is None:
+            raise ValueError("`dataset_root` is necessary, but it is None.")
+
+        if mode == 'train':
+            file_path = os.path.join(self.dataset_root, 'train.txt')
+        elif mode == 'val':
+            file_path = os.path.join(self.dataset_root, 'val.txt')
+        else:
+            file_path = os.path.join(self.dataset_root, 'test.txt')
+
+        with open(file_path, 'r') as f:
+            for line in f:
+                items = line.strip().split(' ')
+                if len(items) != 2:
+                    # Labelled splits need both an image and a label entry;
+                    # 'test' entries may carry the image only.
+                    if mode == 'train' or mode == 'val':
+                        raise Exception(
+                            "File list format incorrect! It should be"
+                            " image_name label_name\\n")
+                    image_path = os.path.join(self.dataset_root, items[0])
+                    grt_path = None
+                else:
+                    image_path = os.path.join(self.dataset_root, items[0])
+                    grt_path = os.path.join(self.dataset_root, items[1])
+                self.file_list.append([image_path, grt_path])
--- a/paddlers/models/ppseg/datasets/stare.py
+++ b/paddlers/models/ppseg/datasets/stare.py
@ -0,0 +1,95 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+from paddlers.models.ppseg.utils.download import download_file_and_uncompress
+from paddlers.models.ppseg.utils import seg_env
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.transforms import Compose
+from paddlers.models.ppseg.datasets import Dataset
+
+URL = 'https://bj.bcebos.com/paddleseg/dataset/stare/stare.zip'
+
+
+@manager.DATASETS.add_component
+class STARE(Dataset):
+    """
+    STARE dataset is processed from the STARE(STructured Analysis of the Retina) project.
+    (https://cecas.clemson.edu/~ahoover/stare/)
+
+    Args:
+        dataset_root (str): The dataset directory. When it is None, or the path does
+            not exist, `download_file_and_uncompress` is called with `URL` and its
+            return value is used as the dataset directory. Default: None
+        transforms (list): Transforms for image.
+        edge (bool): Whether to extract edge information in the output. Default: False
+        mode (str, optional): Which part of dataset to use. it is one of ('train', 'val', 'test'). Default: 'train'.
+
+    Raises:
+        ValueError: If `mode` is not supported, or if the composed transforms
+            object is None.
+        Exception: In 'train' or 'val' mode, if a line of the list file does not
+            split into exactly two whitespace-separated fields.
+    """
+    NUM_CLASSES = 2
+
+    def __init__(self,
+                 dataset_root=None,
+                 transforms=None,
+                 edge=False,
+                 mode='train'):
+        self.dataset_root = dataset_root
+        self.transforms = Compose(transforms)
+        mode = mode.lower()
+        self.mode = mode
+        self.edge = edge
+        self.file_list = list()
+        self.num_classes = self.NUM_CLASSES
+        self.ignore_index = 255
+
+        if mode not in ['train', 'val', 'test']:
+            raise ValueError(
+                "`mode` should be 'train', 'val' or 'test', but got {}.".format(
+                    mode))
+
+        if self.transforms is None:
+            raise ValueError("`transforms` is necessary, but it is None.")
+
+        if self.dataset_root is None:
+            self.dataset_root = download_file_and_uncompress(
+                url=URL,
+                savepath=seg_env.DATA_HOME,
+                extrapath=seg_env.DATA_HOME)
+        elif not os.path.exists(self.dataset_root):
+            self.dataset_root = os.path.normpath(self.dataset_root)
+            savepath, extraname = self.dataset_root.rsplit(
+                sep=os.path.sep, maxsplit=1)  # data  STARE
+            self.dataset_root = download_file_and_uncompress(
+                url=URL,
+                savepath=savepath,
+                extrapath=savepath,
+                extraname=extraname)
+
+        if mode == 'train':
+            file_path = os.path.join(self.dataset_root, 'train_list.txt')
+        elif mode == 'val':
+            file_path = os.path.join(self.dataset_root, 'val_list.txt')
+        else:
+            # 'test' passes validation above, so it must resolve to a list file;
+            # previously `file_path` stayed unbound and open() raised
+            # UnboundLocalError.
+            # NOTE(review): the file name follows the `<mode>_list.txt` pattern
+            # used by OpticDiscSeg -- confirm the STARE archive ships it.
+            file_path = os.path.join(self.dataset_root, 'test_list.txt')
+
+        with open(file_path, 'r') as f:
+            for line in f:
+                items = line.strip().split()
+                if len(items) != 2:
+                    # Labelled splits need both an image and a label entry;
+                    # 'test' entries may carry the image only.
+                    if mode == 'train' or mode == 'val':
+                        raise Exception(
+                            "File list format incorrect! It should be"
+                            " image_name label_name\\n")
+                    image_path = os.path.join(self.dataset_root, items[0])
+                    grt_path = None
+                else:
+                    image_path = os.path.join(self.dataset_root, items[0])
+                    grt_path = os.path.join(self.dataset_root, items[1])
+                self.file_list.append([image_path, grt_path])
--- a/paddlers/models/ppseg/datasets/supervisely.py
+++ b/paddlers/models/ppseg/datasets/supervisely.py
@ -0,0 +1,135 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import copy
+
+import cv2
+import numpy as np
+
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.transforms import Compose
+from paddlers.models.ppseg.datasets import Dataset
+from paddlers.models.ppseg.utils.download import download_file_and_uncompress
+from paddlers.models.ppseg.utils import seg_env
+import paddlers.models.ppseg.transforms.functional as F
+
+URL = "https://paddleseg.bj.bcebos.com/dataset/Supervisely_face.zip"
+
+
+@manager.DATASETS.add_component
+class SUPERVISELY(Dataset):
+    """
+    Supervise.ly dataset `https://supervise.ly/`.
+
+    Args:
+        common_transforms (list): A list of common image transformations for two inputs of portrait net.
+        transforms1 (list): A list of image transformations for the first input of portrait net.
+        transforms2 (list): A list of image transformations for the second input of portrait net.
+        dataset_root (str, optional): The Supervise.ly dataset directory. When it is
+            None, or the path does not exist, `download_file_and_uncompress` is
+            called with `URL` and its return value is used instead. Default: None.
+        mode (str, optional): A subset of the entire dataset. It should be one of ('train', 'val'). Default: 'train'.
+        edge (bool, optional): Whether to compute edge while training. It is stored
+            on the instance; `__getitem__` returns an edge mask whenever
+            `mode` is 'train'. Default: False
+    """
+    NUM_CLASSES = 2
+
+    def __init__(self,
+                 common_transforms,
+                 transforms1,
+                 transforms2,
+                 dataset_root=None,
+                 mode='train',
+                 edge=False):
+        self.dataset_root = dataset_root
+        self.common_transforms = Compose(common_transforms)
+        self.transforms = self.common_transforms
+        if transforms1 is not None:
+            self.transforms1 = Compose(transforms1, to_rgb=False)
+        if transforms2 is not None:
+            self.transforms2 = Compose(transforms2, to_rgb=False)
+        mode = mode.lower()
+        self.ignore_index = 255
+        self.mode = mode
+        self.num_classes = self.NUM_CLASSES
+        self.input_width = 224
+        self.input_height = 224
+        # Kept on the instance like the sibling datasets do; previously the
+        # argument was silently discarded.
+        self.edge = edge
+
+        if self.dataset_root is None:
+            self.dataset_root = download_file_and_uncompress(
+                url=URL,
+                savepath=seg_env.DATA_HOME,
+                extrapath=seg_env.DATA_HOME)
+        elif not os.path.exists(self.dataset_root):
+            self.dataset_root = os.path.normpath(self.dataset_root)
+            savepath, extraname = self.dataset_root.rsplit(
+                sep=os.path.sep, maxsplit=1)
+            self.dataset_root = download_file_and_uncompress(
+                url=URL,
+                savepath=savepath,
+                extrapath=savepath,
+                extraname=extraname)
+
+        # Build every path from `self.dataset_root`, which the branch above may
+        # have replaced; the raw `dataset_root` argument can be None or stale.
+        root = self.dataset_root
+        if mode == 'train':
+            path = os.path.join(root, 'supervisely_face_train_easy.txt')
+        else:
+            path = os.path.join(root, 'supervisely_face_test_easy.txt')
+        with open(path, 'r') as f:
+            files = f.readlines()
+        # Drop the first '/'-separated component of every listed path.
+        files = ["/".join(file.split('/')[1:]) for file in files]
+        img_files = [os.path.join(root, file).strip() for file in files]
+        # Label paths mirror image paths with '/img/' swapped for '/ann/'.
+        label_files = [
+            os.path.join(root, file.replace('/img/', '/ann/')).strip()
+            for file in files
+        ]
+
+        self.file_list = [[
+            img_path, label_path
+        ] for img_path, label_path in zip(img_files, label_files)]
+
+    def __getitem__(self, item):
+        """
+        Load one sample and build the stacked two-view input.
+
+        Args:
+            item (int): Index into `self.file_list`.
+
+        Returns:
+            tuple: `(im, label, edge_mask)` when `self.mode` is "train",
+                otherwise `(im, label)`. `im` is the concatenation of the
+                second view followed by the first view.
+        """
+        image_path, label_path = self.file_list[item]
+        im = cv2.imread(image_path)
+        label = cv2.imread(label_path, 0)
+        # Collapse every non-zero label value to 1.
+        label[label > 0] = 1
+
+        if self.mode == "val":
+            common_im, label = self.common_transforms(im=im, label=label)
+            im = np.float32(common_im[::-1, :, :])  # RGB => BGR
+            # Validation uses an identical copy as the second view.
+            im_aug = copy.deepcopy(im)
+        else:
+            common_im, label = self.common_transforms(im=im, label=label)
+            common_im = np.transpose(common_im, [1, 2, 0])
+            # add augmentation
+            im, _ = self.transforms1(common_im)
+            im_aug, _ = self.transforms2(common_im)
+
+            im = np.float32(im[::-1, :, :])  # RGB => BGR
+            im_aug = np.float32(im_aug[::-1, :, :])  # RGB => BGR
+
+        label = cv2.resize(
+            np.uint8(label), (self.input_width, self.input_height),
+            interpolation=cv2.INTER_NEAREST)
+
+        # add mask blur
+        label = np.uint8(cv2.blur(label, (5, 5)))
+        label[label >= 0.5] = 1
+        label[label < 0.5] = 0
+
+        edge_mask = F.mask_to_binary_edge(
+            label, radius=4, num_classes=self.num_classes)
+        edge_mask = np.transpose(edge_mask, [1, 2, 0]).squeeze(axis=-1)
+        im = np.concatenate([im_aug, im])
+        if self.mode == "train":
+            return im, label, edge_mask
+        else:
+            return im, label
--- a/paddlers/models/ppseg/datasets/voc.py
+++ b/paddlers/models/ppseg/datasets/voc.py
@ -0,0 +1,112 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+from paddlers.models.ppseg.datasets import Dataset
+from paddlers.models.ppseg.utils.download import download_file_and_uncompress
+from paddlers.models.ppseg.utils import seg_env
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.transforms import Compose
+
+URL = "http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar"
+
+
+@manager.DATASETS.add_component
+class PascalVOC(Dataset):
+    """
+    PascalVOC2012 dataset `http://host.robots.ox.ac.uk/pascal/VOC/`.
+    If you want to augment the dataset, please run the voc_augment.py in tools.
+
+    Args:
+        transforms (list): Transforms for image.
+        dataset_root (str): The dataset directory. When it is None, or the path does
+            not exist, `download_file_and_uncompress` is called with `URL` and its
+            return value is used as the dataset directory. Default: None
+        mode (str, optional): Which part of dataset to use. it is one of ('train', 'trainval', 'trainaug', 'val').
+            If you want to set mode to 'trainaug', please make sure the dataset have been augmented. Default: 'train'.
+        edge (bool, optional): Whether to compute edge while training. Default: False
+
+    Raises:
+        ValueError: If `mode` is not supported, or if the composed transforms
+            object is None.
+        RuntimeError: If `mode` is 'trainaug' and `aug.txt` is missing from the
+            image-set directory.
+    """
+    NUM_CLASSES = 21
+
+    def __init__(self, transforms, dataset_root=None, mode='train', edge=False):
+        self.dataset_root = dataset_root
+        self.transforms = Compose(transforms)
+        self.mode = mode = mode.lower()
+        self.edge = edge
+        self.ignore_index = 255
+        self.num_classes = self.NUM_CLASSES
+        self.file_list = []
+
+        if mode not in ('train', 'trainval', 'trainaug', 'val'):
+            raise ValueError(
+                "`mode` should be one of ('train', 'trainval', 'trainaug', 'val') in PascalVOC dataset, but got {}."
+                .format(mode))
+
+        if self.transforms is None:
+            raise ValueError("`transforms` is necessary, but it is None.")
+
+        if self.dataset_root is None:
+            # No directory supplied: fetch into the default data home.
+            self.dataset_root = download_file_and_uncompress(
+                url=URL,
+                savepath=seg_env.DATA_HOME,
+                extrapath=seg_env.DATA_HOME,
+                extraname='VOCdevkit')
+        elif not os.path.exists(self.dataset_root):
+            # Directory supplied but missing: download into its parent and pass
+            # the last path component as `extraname`.
+            self.dataset_root = os.path.normpath(self.dataset_root)
+            parent_dir, folder_name = self.dataset_root.rsplit(
+                sep=os.path.sep, maxsplit=1)
+            self.dataset_root = download_file_and_uncompress(
+                url=URL,
+                savepath=parent_dir,
+                extrapath=parent_dir,
+                extraname=folder_name)
+
+        voc_dir = os.path.join(self.dataset_root, 'VOC2012')
+        image_set_dir = os.path.join(voc_dir, 'ImageSets', 'Segmentation')
+
+        # 'trainaug' reads the regular train list plus the augmented list.
+        list_name = 'train.txt' if mode == 'trainaug' else mode + '.txt'
+        file_path = os.path.join(image_set_dir, list_name)
+        if mode == 'trainaug':
+            file_path_aug = os.path.join(image_set_dir, 'aug.txt')
+            if not os.path.exists(file_path_aug):
+                raise RuntimeError(
+                    "When `mode` is 'trainaug', Pascal Voc dataset should be augmented, "
+                    "Please make sure voc_augment.py has been properly run when using this mode."
+                )
+
+        img_dir = os.path.join(voc_dir, 'JPEGImages')
+        label_dir = os.path.join(voc_dir, 'SegmentationClass')
+        label_dir_aug = os.path.join(voc_dir, 'SegmentationClassAug')
+
+        # (list file, label directory) pairs, read in order.
+        sources = [(file_path, label_dir)]
+        if mode == 'trainaug':
+            sources.append((file_path_aug, label_dir_aug))
+
+        for list_path, mask_dir in sources:
+            with open(list_path, 'r') as f:
+                for line in f:
+                    # Each line is a sample stem shared by image and label.
+                    stem = line.strip()
+                    self.file_list.append([
+                        os.path.join(img_dir, stem + '.jpg'),
+                        os.path.join(mask_dir, stem + '.png'),
+                    ])
--- a/paddlers/models/ppseg/models/init.py
+++ b/paddlers/models/ppseg/models/init.py
@ -0,0 +1,57 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .backbones import *
+from .losses import *
+
+from .ann import *
+from .bisenet import *
+from .danet import *
+from .deeplab import *
+from .fast_scnn import *
+from .fcn import *
+from .gcnet import *
+from .ocrnet import *
+from .pspnet import *
+from .gscnn import GSCNN
+from .unet import UNet
+from .hardnet import HarDNet
+from .u2net import U2Net, U2Netp
+from .attention_unet import AttentionUNet
+from .unet_plusplus import UNetPlusPlus
+from .unet_3plus import UNet3Plus
+from .decoupled_segnet import DecoupledSegNet
+from .emanet import *
+from .isanet import *
+from .dnlnet import *
+from .setr import *
+from .sfnet import *
+from .pphumanseg_lite import *
+from .mla_transformer import MLATransformer
+from .portraitnet import PortraitNet
+from .stdcseg import STDCSeg
+from .segformer import SegFormer
+from .pointrend import PointRend
+from .ginet import GINet
+from .segmenter import *
+from .segnet import SegNet
+from .encnet import ENCNet
+from .hrnet_contrast import HRNetW48Contrast
+from .espnet import ESPNetV2
+from .dmnet import DMNet
+from .espnetv1 import ESPNetV1
+from .enet import ENet
+from .bisenetv1 import BiseNetV1
+from .fastfcn import FastFCN
+from .pfpnnet import PFPNNet
--- a/paddlers/models/ppseg/models/ann.py
+++ b/paddlers/models/ppseg/models/ann.py
@ -0,0 +1,434 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.models import layers
+from paddlers.models.ppseg.utils import utils
+
+
+@manager.MODELS.add_component
+class ANN(nn.Layer):
+    """
+    The ANN implementation based on PaddlePaddle.
+
+    The original article refers to
+    Zhen, Zhu, et al. "Asymmetric Non-local Neural Networks for Semantic Segmentation"
+    (https://arxiv.org/pdf/1908.07678.pdf).
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone (Paddle.nn.Layer): Backbone network, currently support Resnet50/101.
+            It must expose a ``feat_channels`` sequence that can be indexed by ``backbone_indices``.
+        backbone_indices (tuple, optional): Two values in the tuple indicate the indices of output of backbone.
+        key_value_channels (int, optional): The key and value channels of self-attention map in both AFNB and APNB modules.
+            Default: 256.
+        inter_channels (int, optional): Both input and output channels of APNB modules. Default: 512.
+        psp_size (tuple, optional): The out size of pooled feature maps. Default: (1, 3, 6, 8).
+        enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True.
+        align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
+            e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 backbone,
+                 backbone_indices=(2, 3),
+                 key_value_channels=256,
+                 inter_channels=512,
+                 psp_size=(1, 3, 6, 8),
+                 enable_auxiliary_loss=True,
+                 align_corners=False,
+                 pretrained=None):
+        super().__init__()
+
+        self.backbone = backbone
+        # Channel counts of the backbone outputs selected by `backbone_indices`.
+        backbone_channels = [
+            backbone.feat_channels[i] for i in backbone_indices
+        ]
+
+        self.head = ANNHead(num_classes, backbone_indices, backbone_channels,
+                            key_value_channels, inter_channels, psp_size,
+                            enable_auxiliary_loss)
+        self.align_corners = align_corners
+        self.pretrained = pretrained
+        # Loads `pretrained` (if given) after all sublayers have been created.
+        self.init_weight()
+
+    def forward(self, x):
+        """
+        Run the backbone and the head, then resize every logit to the input size.
+
+        Args:
+            x (Tensor): The input batch; its dimensions from index 2 onwards are used
+                as the target size of the interpolation.
+
+        Returns:
+            list: One bilinearly resized tensor per logit produced by the head.
+        """
+        feat_list = self.backbone(x)
+        logit_list = self.head(feat_list)
+        return [
+            F.interpolate(
+                logit,
+                paddle.shape(x)[2:],
+                mode='bilinear',
+                align_corners=self.align_corners) for logit in logit_list
+        ]
+
+    def init_weight(self):
+        """Load the whole model from ``self.pretrained`` when it is not None."""
+        if self.pretrained is not None:
+            utils.load_entire_model(self, self.pretrained)
+
+
+class ANNHead(nn.Layer):
+    """
+    The ANNHead implementation.
+
+    It mainly consists of AFNB and APNB modules.
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone_indices (tuple): Two values in the tuple indicate the indices of output of backbone.
+            The first index will be taken as low-level features; the second one will be
+            taken as high-level features in AFNB module. Usually backbone consists of four
+            downsampling stage, such as ResNet, and return an output of each stage. If it is (2, 3),
+            it means taking feature map of the third stage and the fourth stage in backbone.
+        backbone_channels (tuple): The same length with "backbone_indices". It indicates the channels of corresponding index.
+        key_value_channels (int): The key and value channels of self-attention map in both AFNB and APNB modules.
+        inter_channels (int): Both input and output channels of APNB modules.
+        psp_size (tuple): The out size of pooled feature maps.
+        enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 backbone_indices,
+                 backbone_channels,
+                 key_value_channels,
+                 inter_channels,
+                 psp_size,
+                 enable_auxiliary_loss=True):
+        super().__init__()
+
+        # backbone_channels[0] belongs to the low-level feature map,
+        # backbone_channels[1] to the high-level one.
+        low_in_channels = backbone_channels[0]
+        high_in_channels = backbone_channels[1]
+
+        # Fuses low-level and high-level features; the output keeps `high_in_channels` channels.
+        self.fusion = AFNB(
+            low_in_channels=low_in_channels,
+            high_in_channels=high_in_channels,
+            out_channels=high_in_channels,
+            key_channels=key_value_channels,
+            value_channels=key_value_channels,
+            dropout_prob=0.05,
+            repeat_sizes=([1]),
+            psp_size=psp_size)
+
+        # 3x3 conv projecting to `inter_channels`, followed by the APNB block.
+        self.context = nn.Sequential(
+            layers.ConvBNReLU(
+                in_channels=high_in_channels,
+                out_channels=inter_channels,
+                kernel_size=3,
+                padding=1),
+            APNB(
+                in_channels=inter_channels,
+                out_channels=inter_channels,
+                key_channels=key_value_channels,
+                value_channels=key_value_channels,
+                dropout_prob=0.05,
+                repeat_sizes=([1]),
+                psp_size=psp_size))
+
+        # 1x1 classifier producing the main logit.
+        self.cls = nn.Conv2D(
+            in_channels=inter_channels, out_channels=num_classes, kernel_size=1)
+        # Auxiliary classifier applied to the low-level features.
+        # It is always created, but only used when `enable_auxiliary_loss` is True.
+        self.auxlayer = layers.AuxLayer(
+            in_channels=low_in_channels,
+            inter_channels=low_in_channels // 2,
+            out_channels=num_classes,
+            dropout_prob=0.05)
+
+        self.backbone_indices = backbone_indices
+        self.enable_auxiliary_loss = enable_auxiliary_loss
+
+    def forward(self, feat_list):
+        """
+        Compute the segmentation logits from the backbone feature list.
+
+        Args:
+            feat_list (list): Backbone outputs, indexed with ``self.backbone_indices``.
+
+        Returns:
+            list: The main logit, followed by the auxiliary logit when
+                ``enable_auxiliary_loss`` is True.
+        """
+        logit_list = []
+        low_level_x = feat_list[self.backbone_indices[0]]
+        high_level_x = feat_list[self.backbone_indices[1]]
+        x = self.fusion(low_level_x, high_level_x)
+        x = self.context(x)
+        logit = self.cls(x)
+        logit_list.append(logit)
+
+        if self.enable_auxiliary_loss:
+            auxiliary_logit = self.auxlayer(low_level_x)
+            logit_list.append(auxiliary_logit)
+
+        return logit_list
+
+
+class AFNB(nn.Layer):
+    """
+    Asymmetric Fusion Non-local Block.
+
+    Args:
+        low_in_channels (int): Low-level-feature channels.
+        high_in_channels (int): High-level-feature channels.
+        out_channels (int): Out channels of AFNB module.
+        key_channels (int): The key channels in self-attention block.
+        value_channels (int): The value channels in self-attention block.
+        dropout_prob (float): The dropout rate of output.
+        repeat_sizes (tuple, optional): One self-attention stage is created per entry, and the entry
+            is passed to that stage as its ``scale``. Default: ([1]).
+        psp_size (tuple, optional): The out size of pooled feature maps. Default: (1, 3, 6, 8).
+    """
+
+    def __init__(self,
+                 low_in_channels,
+                 high_in_channels,
+                 out_channels,
+                 key_channels,
+                 value_channels,
+                 dropout_prob,
+                 repeat_sizes=([1]),
+                 psp_size=(1, 3, 6, 8)):
+        super().__init__()
+
+        self.psp_size = psp_size
+        # `psp_size` must be forwarded explicitly; otherwise every stage silently
+        # falls back to its own default pooling sizes.
+        self.stages = nn.LayerList([
+            SelfAttentionBlock_AFNB(
+                low_in_channels,
+                high_in_channels,
+                key_channels,
+                value_channels,
+                out_channels,
+                size,
+                psp_size=psp_size) for size in repeat_sizes
+        ])
+        # The stage output is concatenated with the high-level features before this 1x1 conv.
+        self.conv_bn = layers.ConvBN(
+            in_channels=out_channels + high_in_channels,
+            out_channels=out_channels,
+            kernel_size=1)
+        self.dropout = nn.Dropout(p=dropout_prob)
+
+    def forward(self, low_feats, high_feats):
+        """
+        Fuse low-level and high-level features.
+
+        Args:
+            low_feats (Tensor): Low-level features, used for keys and values.
+            high_feats (Tensor): High-level features, used for queries.
+
+        Returns:
+            Tensor: The sum of all stage outputs, concatenated with ``high_feats`` on axis 1,
+                then passed through the 1x1 ConvBN and dropout.
+        """
+        priors = [stage(low_feats, high_feats) for stage in self.stages]
+        context = priors[0]
+        for i in range(1, len(priors)):
+            context += priors[i]
+
+        output = self.conv_bn(paddle.concat([context, high_feats], axis=1))
+        output = self.dropout(output)
+
+        return output
+
+
+class APNB(nn.Layer):
+    """
+    Asymmetric Pyramid Non-local Block.
+
+    Args:
+        in_channels (int): The input channels of APNB module.
+        out_channels (int): Out channels of APNB module.
+        key_channels (int): The key channels in self-attention block.
+        value_channels (int): The value channels in self-attention block.
+        dropout_prob (float): The dropout rate of output.
+        repeat_sizes (tuple, optional): One self-attention stage is created per entry, and the entry
+            is passed to that stage as its ``scale``. Default: ([1]).
+        psp_size (tuple, optional): The out size of pooled feature maps. Default: (1, 3, 6, 8).
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 key_channels,
+                 value_channels,
+                 dropout_prob,
+                 repeat_sizes=([1]),
+                 psp_size=(1, 3, 6, 8)):
+        super().__init__()
+
+        self.psp_size = psp_size
+        # `psp_size` must be forwarded explicitly; otherwise every stage silently
+        # falls back to its own default pooling sizes.
+        self.stages = nn.LayerList([
+            SelfAttentionBlock_APNB(
+                in_channels,
+                out_channels,
+                key_channels,
+                value_channels,
+                size,
+                psp_size=psp_size) for size in repeat_sizes
+        ])
+        # The stage output is concatenated with the block input before this 1x1 conv.
+        self.conv_bn = layers.ConvBNReLU(
+            in_channels=in_channels * 2,
+            out_channels=out_channels,
+            kernel_size=1)
+        self.dropout = nn.Dropout(p=dropout_prob)
+
+    def forward(self, x):
+        """
+        Apply the self-attention stages to ``x`` and merge the result with ``x``.
+
+        Args:
+            x (Tensor): The input feature map.
+
+        Returns:
+            Tensor: The sum of all stage outputs, concatenated with ``x`` on axis 1,
+                then passed through the 1x1 ConvBNReLU and dropout.
+        """
+        priors = [stage(x) for stage in self.stages]
+        context = priors[0]
+        for i in range(1, len(priors)):
+            context += priors[i]
+
+        output = self.conv_bn(paddle.concat([context, x], axis=1))
+        output = self.dropout(output)
+
+        return output
+
+
+def _pp_module(x, psp_size):
+    """
+    Pyramid-pool ``x`` and flatten the pooled maps into one tensor.
+
+    For every entry of ``psp_size`` the input is adaptively average-pooled to that
+    output size, its spatial dimensions are flattened, and all results are
+    concatenated along the last axis.
+
+    Args:
+        x (Tensor): A 4-D feature map; the second dimension is taken as the channel count.
+        psp_size (tuple): The output sizes passed to ``F.adaptive_avg_pool2d``.
+
+    Returns:
+        Tensor: The concatenated, flattened pooled features.
+    """
+    # Unpacking four values keeps the requirement that `x` is 4-D.
+    _, channels, _, _ = x.shape
+    flattened = [
+        paddle.reshape(
+            F.adaptive_avg_pool2d(x, size), shape=(0, channels, -1))
+        for size in psp_size
+    ]
+    return paddle.concat(flattened, axis=-1)
+
+
+class SelfAttentionBlock_AFNB(nn.Layer):
+    """
+    Self-Attention Block for AFNB module.
+
+    Queries are computed from the high-level features. Keys and values are computed
+    from the low-level features and reduced with ``_pp_module``.
+
+    Args:
+        low_in_channels (int): Low-level-feature channels.
+        high_in_channels (int): High-level-feature channels.
+        key_channels (int): The key channels in self-attention block.
+        value_channels (int): The value channels in self-attention block.
+        out_channels (int, optional): Out channels of AFNB module. When None,
+            ``high_in_channels`` is used. Default: None.
+        scale (int, optional): Pooling size. It is stored and used to build ``self.pool``,
+            but ``forward`` does not apply that pooling. Default: 1.
+        psp_size (tuple, optional): The out size of pooled feature maps. Default: (1, 3, 6, 8).
+    """
+
+    def __init__(self,
+                 low_in_channels,
+                 high_in_channels,
+                 key_channels,
+                 value_channels,
+                 out_channels=None,
+                 scale=1,
+                 psp_size=(1, 3, 6, 8)):
+        super().__init__()
+
+        self.scale = scale
+        self.in_channels = low_in_channels
+        # Fall back to the high-level channel count when no output width is given.
+        self.out_channels = (high_in_channels
+                             if out_channels is None else out_channels)
+        self.key_channels = key_channels
+        self.value_channels = value_channels
+        # Kept for attribute compatibility; not used in `forward`.
+        self.pool = nn.MaxPool2D(scale)
+        self.f_key = layers.ConvBNReLU(
+            in_channels=low_in_channels,
+            out_channels=key_channels,
+            kernel_size=1)
+        self.f_query = layers.ConvBNReLU(
+            in_channels=high_in_channels,
+            out_channels=key_channels,
+            kernel_size=1)
+        self.f_value = nn.Conv2D(
+            in_channels=low_in_channels,
+            out_channels=value_channels,
+            kernel_size=1)
+
+        # Use the resolved width so that `out_channels=None` yields a valid layer.
+        self.W = nn.Conv2D(
+            in_channels=value_channels,
+            out_channels=self.out_channels,
+            kernel_size=1)
+
+        self.psp_size = psp_size
+
+    def forward(self, low_feats, high_feats):
+        """
+        Attend from high-level positions to pyramid-pooled low-level features.
+
+        Args:
+            low_feats (Tensor): Low-level features, the source of keys and values.
+            high_feats (Tensor): High-level features, the source of queries.
+
+        Returns:
+            Tensor: The attended context, reshaped to the spatial size of ``high_feats``
+                and projected by ``self.W``.
+        """
+        value = self.f_value(low_feats)
+        value = _pp_module(value, self.psp_size)
+        value = paddle.transpose(value, (0, 2, 1))
+
+        query = self.f_query(high_feats)
+        query = paddle.reshape(query, shape=(0, self.key_channels, -1))
+        query = paddle.transpose(query, perm=(0, 2, 1))
+
+        key = self.f_key(low_feats)
+        key = _pp_module(key, self.psp_size)
+
+        # Scaled similarity between each query position and each pooled key.
+        sim_map = paddle.matmul(query, key)
+        sim_map = (self.key_channels**-.5) * sim_map
+        sim_map = F.softmax(sim_map, axis=-1)
+
+        context = paddle.matmul(sim_map, value)
+        context = paddle.transpose(context, perm=(0, 2, 1))
+        # The runtime shape is read with paddle.shape so that the reshape target
+        # follows the actual size of `high_feats`.
+        hf_shape = paddle.shape(high_feats)
+        context = paddle.reshape(
+            context, shape=[0, self.value_channels, hf_shape[2], hf_shape[3]])
+
+        context = self.W(context)
+
+        return context
+
+
+class SelfAttentionBlock_APNB(nn.Layer):
+    """
+    Self-Attention Block for APNB module.
+
+    Queries, keys and values are all computed from the same input. Keys and values
+    are reduced with ``_pp_module``.
+
+    Args:
+        in_channels (int): The input channels of APNB module.
+        out_channels (int): The out channels of APNB module.
+        key_channels (int): The key channels in self-attention block.
+        value_channels (int): The value channels in self-attention block.
+        scale (int, optional): Pooling size. The input is max-pooled with this size
+            only when it is greater than 1. Default: 1.
+        psp_size (tuple, optional): The out size of pooled feature maps. Default: (1, 3, 6, 8).
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 key_channels,
+                 value_channels,
+                 scale=1,
+                 psp_size=(1, 3, 6, 8)):
+        super().__init__()
+
+        self.scale = scale
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.key_channels = key_channels
+        self.value_channels = value_channels
+        self.pool = nn.MaxPool2D(scale)
+        self.f_key = layers.ConvBNReLU(
+            in_channels=self.in_channels,
+            out_channels=self.key_channels,
+            kernel_size=1)
+        # Query and key share one projection layer (the very same object).
+        self.f_query = self.f_key
+        self.f_value = nn.Conv2D(
+            in_channels=self.in_channels,
+            out_channels=self.value_channels,
+            kernel_size=1)
+        self.W = nn.Conv2D(
+            in_channels=self.value_channels,
+            out_channels=self.out_channels,
+            kernel_size=1)
+
+        self.psp_size = psp_size
+
+    def forward(self, x):
+        """
+        Attend from every position of ``x`` to its pyramid-pooled features.
+
+        Args:
+            x (Tensor): The input feature map; it must be 4-D.
+
+        Returns:
+            Tensor: The attended context, reshaped to the spatial size of ``x``
+                (after the optional max-pooling) and projected by ``self.W``.
+        """
+        # NOTE(review): batch_size, h and w are not used below; the unpacking only
+        # enforces that `x` has four dimensions.
+        batch_size, _, h, w = x.shape
+        if self.scale > 1:
+            x = self.pool(x)
+
+        value = self.f_value(x)
+        value = _pp_module(value, self.psp_size)
+        value = paddle.transpose(value, perm=(0, 2, 1))
+
+        query = self.f_query(x)
+        query = paddle.reshape(query, shape=(0, self.key_channels, -1))
+        query = paddle.transpose(query, perm=(0, 2, 1))
+
+        key = self.f_key(x)
+        key = _pp_module(key, self.psp_size)
+
+        # Scaled similarity between each query position and each pooled key.
+        sim_map = paddle.matmul(query, key)
+        sim_map = (self.key_channels**-.5) * sim_map
+        sim_map = F.softmax(sim_map, axis=-1)
+
+        context = paddle.matmul(sim_map, value)
+        context = paddle.transpose(context, perm=(0, 2, 1))
+
+        # Shape of `x` after the optional pooling above.
+        x_shape = paddle.shape(x)
+        context = paddle.reshape(
+            context, shape=[0, self.value_channels, x_shape[2], x_shape[3]])
+        context = self.W(context)
+
+        return context
--- a/paddlers/models/ppseg/models/attention_unet.py
+++ b/paddlers/models/ppseg/models/attention_unet.py
@ -0,0 +1,178 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.models import layers
+from paddlers.models.ppseg import utils
+import numpy as np
+
+
+@manager.MODELS.add_component
+class AttentionUNet(nn.Layer):
+    """
+    The Attention-UNet implementation based on PaddlePaddle.
+    As mentioned in the original paper, author proposes a novel attention gate (AG)
+    that automatically learns to focus on target structures of varying shapes and sizes.
+    Models trained with AGs implicitly learn to suppress irrelevant regions in an input image while
+    highlighting salient features useful for a specific task.
+
+    The original article refers to
+    Oktay, O, et, al. "Attention u-net: Learning where to look for the pancreas."
+    (https://arxiv.org/pdf/1804.03999.pdf).
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+    """
+
+    def __init__(self, num_classes, pretrained=None):
+        super().__init__()
+        # The number of input channels is fixed to 3.
+        n_channels = 3
+        self.encoder = Encoder(n_channels, [64, 128, 256, 512])
+        # Channel widths from the shallowest encoder level to the bottleneck.
+        filters = np.array([64, 128, 256, 512, 1024])
+        # Each decoder level consists of an up-convolution, an attention gate for
+        # the skip connection, and a conv block applied to the concatenation.
+        self.up5 = UpConv(ch_in=filters[4], ch_out=filters[3])
+        self.att5 = AttentionBlock(
+            F_g=filters[3], F_l=filters[3], F_out=filters[2])
+        self.up_conv5 = ConvBlock(ch_in=filters[4], ch_out=filters[3])
+
+        self.up4 = UpConv(ch_in=filters[3], ch_out=filters[2])
+        self.att4 = AttentionBlock(
+            F_g=filters[2], F_l=filters[2], F_out=filters[1])
+        self.up_conv4 = ConvBlock(ch_in=filters[3], ch_out=filters[2])
+
+        self.up3 = UpConv(ch_in=filters[2], ch_out=filters[1])
+        self.att3 = AttentionBlock(
+            F_g=filters[1], F_l=filters[1], F_out=filters[0])
+        self.up_conv3 = ConvBlock(ch_in=filters[2], ch_out=filters[1])
+
+        self.up2 = UpConv(ch_in=filters[1], ch_out=filters[0])
+        self.att2 = AttentionBlock(
+            F_g=filters[0], F_l=filters[0], F_out=filters[0] // 2)
+        self.up_conv2 = ConvBlock(ch_in=filters[1], ch_out=filters[0])
+
+        # 1x1 classifier producing `num_classes` channels.
+        self.conv_1x1 = nn.Conv2D(
+            filters[0], num_classes, kernel_size=1, stride=1, padding=0)
+        self.pretrained = pretrained
+        self.init_weight()
+
+    def forward(self, x):
+        """
+        Encode ``x``, decode it with attention-gated skip connections and classify.
+
+        Args:
+            x (Tensor): The input batch.
+
+        Returns:
+            list: A single-element list holding the output of the 1x1 classifier.
+        """
+        # x5 is the deepest encoder output; x1..x4 are the skip connections.
+        x5, (x1, x2, x3, x4) = self.encoder(x)
+        d5 = self.up5(x5)
+        x4 = self.att5(g=d5, x=x4)
+        d5 = paddle.concat([x4, d5], axis=1)
+        d5 = self.up_conv5(d5)
+
+        d4 = self.up4(d5)
+        x3 = self.att4(g=d4, x=x3)
+        d4 = paddle.concat((x3, d4), axis=1)
+        d4 = self.up_conv4(d4)
+
+        d3 = self.up3(d4)
+        x2 = self.att3(g=d3, x=x2)
+        d3 = paddle.concat((x2, d3), axis=1)
+        d3 = self.up_conv3(d3)
+
+        d2 = self.up2(d3)
+        x1 = self.att2(g=d2, x=x1)
+        d2 = paddle.concat((x1, d2), axis=1)
+        d2 = self.up_conv2(d2)
+
+        logit = self.conv_1x1(d2)
+        logit_list = [logit]
+        return logit_list
+
+    def init_weight(self):
+        """Load the whole model from ``self.pretrained`` when it is not None."""
+        if self.pretrained is not None:
+            utils.load_entire_model(self, self.pretrained)
+
+
+class AttentionBlock(nn.Layer):
+    """
+    Attention gate that re-weights a skip connection with a gating signal.
+
+    Args:
+        F_g (int): The channels of the gating signal ``g``.
+        F_l (int): The channels of the skip-connection features ``x``.
+        F_out (int): The channels both inputs are projected to before they are summed.
+    """
+
+    def __init__(self, F_g, F_l, F_out):
+        super().__init__()
+        # 1x1 projection of the gating signal.
+        gate_proj = [
+            nn.Conv2D(F_g, F_out, kernel_size=1, stride=1, padding=0),
+            nn.BatchNorm2D(F_out),
+        ]
+        self.W_g = nn.Sequential(*gate_proj)
+
+        # 1x1 projection of the skip-connection features.
+        skip_proj = [
+            nn.Conv2D(F_l, F_out, kernel_size=1, stride=1, padding=0),
+            nn.BatchNorm2D(F_out),
+        ]
+        self.W_x = nn.Sequential(*skip_proj)
+
+        # Collapses the summed projections into one sigmoid-activated channel.
+        coeff_layers = [
+            nn.Conv2D(F_out, 1, kernel_size=1, stride=1, padding=0),
+            nn.BatchNorm2D(1),
+            nn.Sigmoid(),
+        ]
+        self.psi = nn.Sequential(*coeff_layers)
+
+        self.relu = nn.ReLU()
+
+    def forward(self, g, x):
+        """Return ``x`` multiplied by the attention coefficients computed from ``g`` and ``x``."""
+        gate = self.W_g(g)
+        skip = self.W_x(x)
+        coeff = self.psi(self.relu(gate + skip))
+        return x * coeff
+
+
+class UpConv(nn.Layer):
+    """
+    2x bilinear upsampling followed by a 3x3 convolution, batch norm and ReLU.
+
+    Args:
+        ch_in (int): The input channels.
+        ch_out (int): The output channels.
+    """
+
+    def __init__(self, ch_in, ch_out):
+        super().__init__()
+        stages = [
+            nn.Upsample(scale_factor=2, mode="bilinear"),
+            nn.Conv2D(ch_in, ch_out, kernel_size=3, stride=1, padding=1),
+            nn.BatchNorm2D(ch_out),
+            nn.ReLU(),
+        ]
+        self.up = nn.Sequential(*stages)
+
+    def forward(self, x):
+        """Apply the upsampling stack to ``x``."""
+        upsampled = self.up(x)
+        return upsampled
+
+
+class Encoder(nn.Layer):
+    """
+    Encoder made of a two-conv stem followed by one down-sampling stage per filter.
+
+    Args:
+        input_channels (int): The channels of the input.
+        filters (list): The input channels of each down-sampling stage; every stage
+            doubles its channel count. The stem outputs ``filters[0]`` channels so that
+            it matches the first stage (64 when ``filters`` is empty).
+    """
+
+    def __init__(self, input_channels, filters):
+        super().__init__()
+        # The stem must produce what the first down-sampling stage consumes.
+        # With the filters used in this file that value is 64.
+        stem_channels = filters[0] if filters else 64
+        self.double_conv = nn.Sequential(
+            layers.ConvBNReLU(input_channels, stem_channels, 3),
+            layers.ConvBNReLU(stem_channels, stem_channels, 3))
+        down_channels = filters
+        self.down_sample_list = nn.LayerList([
+            self.down_sampling(channel, channel * 2)
+            for channel in down_channels
+        ])
+
+    def down_sampling(self, in_channels, out_channels):
+        """Build one stage: 2x2 max-pooling with stride 2, then two 3x3 ConvBNReLU layers."""
+        modules = []
+        modules.append(nn.MaxPool2D(kernel_size=2, stride=2))
+        modules.append(layers.ConvBNReLU(in_channels, out_channels, 3))
+        modules.append(layers.ConvBNReLU(out_channels, out_channels, 3))
+        return nn.Sequential(*modules)
+
+    def forward(self, x):
+        """
+        Encode ``x``.
+
+        Returns:
+            tuple: The output of the last stage, and the list of the inputs of every
+                down-sampling stage in order (the skip connections).
+        """
+        short_cuts = []
+        x = self.double_conv(x)
+        for down_sample in self.down_sample_list:
+            # Record the features before they are down-sampled.
+            short_cuts.append(x)
+            x = down_sample(x)
+        return x, short_cuts
+
+
+class ConvBlock(nn.Layer):
+    """
+    Two consecutive 3x3 convolutions, each followed by batch norm and ReLU.
+
+    Args:
+        ch_in (int): The input channels of the first convolution.
+        ch_out (int): The output channels of both convolutions.
+    """
+
+    def __init__(self, ch_in, ch_out):
+        super().__init__()
+        stack = []
+        # The first conv maps ch_in -> ch_out, the second ch_out -> ch_out.
+        for in_ch in (ch_in, ch_out):
+            stack.append(
+                nn.Conv2D(in_ch, ch_out, kernel_size=3, stride=1, padding=1))
+            stack.append(nn.BatchNorm2D(ch_out))
+            stack.append(nn.ReLU())
+        self.conv = nn.Sequential(*stack)
+
+    def forward(self, x):
+        """Apply the convolution stack to ``x``."""
+        out = self.conv(x)
+        return out
--- a/paddlers/models/ppseg/models/backbones/init.py
+++ b/paddlers/models/ppseg/models/backbones/init.py
@ -0,0 +1,23 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .hrnet import *
+from .resnet_vd import *
+from .xception_deeplab import *
+from .mobilenetv3 import *
+from .vision_transformer import *
+from .swin_transformer import *
+from .mobilenetv2 import *
+from .mix_transformer import *
+from .stdcnet import *
--- a/paddlers/models/ppseg/models/backbones/hrnet.py
+++ b/paddlers/models/ppseg/models/backbones/hrnet.py
@ -0,0 +1,837 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager, param_init
+from paddlers.models.ppseg.models import layers
+from paddlers.models.ppseg.utils import utils
+
+__all__ = [
+    "HRNet_W18_Small_V1", "HRNet_W18_Small_V2", "HRNet_W18", "HRNet_W30",
+    "HRNet_W32", "HRNet_W40", "HRNet_W44", "HRNet_W48", "HRNet_W60", "HRNet_W64"
+]
+
+
+class HRNet(nn.Layer):
+    """
+    The HRNet implementation based on PaddlePaddle.
+
+    The original article refers to
+    Jingdong Wang, et, al. "HRNet: Deep High-Resolution Representation Learning for Visual Recognition"
+    (https://arxiv.org/pdf/1908.07919.pdf).
+
+    Args:
+        pretrained (str, optional): The path of pretrained model.
+        stage1_num_modules (int, optional): Number of modules for stage1. Default 1.
+        stage1_num_blocks (list, optional): Number of blocks per module for stage1. Default (4).
+        stage1_num_channels (list, optional): Number of channels per branch for stage1. Default (64).
+        stage2_num_modules (int, optional): Number of modules for stage2. Default 1.
+        stage2_num_blocks (list, optional): Number of blocks per module for stage2. Default (4, 4).
+        stage2_num_channels (list, optional): Number of channels per branch for stage2. Default (18, 36).
+        stage3_num_modules (int, optional): Number of modules for stage3. Default 4.
+        stage3_num_blocks (list, optional): Number of blocks per module for stage3. Default (4, 4, 4).
+        stage3_num_channels (list, optional): Number of channels per branch for stage3. Default (18, 36, 72).
+        stage4_num_modules (int, optional): Number of modules for stage4. Default 3.
+        stage4_num_blocks (list, optional): Number of blocks per module for stage4. Default (4, 4, 4, 4).
+        stage4_num_channels (list, optional): Number of channels per branch for stage4. Default (18, 36, 72, 144).
+        has_se (bool, optional): Whether to use Squeeze-and-Excitation module. Default False.
+        align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
+            e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
+        padding_same (bool, optional): When True, the two stem convolutions use 'same' padding, otherwise
+            a padding of 1. The flag is also forwarded to the layer, transition and stage sub-modules.
+            Default: True.
+    """
+
+    def __init__(self,
+                 pretrained=None,
+                 stage1_num_modules=1,
+                 stage1_num_blocks=(4, ),
+                 stage1_num_channels=(64, ),
+                 stage2_num_modules=1,
+                 stage2_num_blocks=(4, 4),
+                 stage2_num_channels=(18, 36),
+                 stage3_num_modules=4,
+                 stage3_num_blocks=(4, 4, 4),
+                 stage3_num_channels=(18, 36, 72),
+                 stage4_num_modules=3,
+                 stage4_num_blocks=(4, 4, 4, 4),
+                 stage4_num_channels=(18, 36, 72, 144),
+                 has_se=False,
+                 align_corners=False,
+                 padding_same=True):
+        super(HRNet, self).__init__()
+        self.pretrained = pretrained
+        self.stage1_num_modules = stage1_num_modules
+        self.stage1_num_blocks = stage1_num_blocks
+        self.stage1_num_channels = stage1_num_channels
+        self.stage2_num_modules = stage2_num_modules
+        self.stage2_num_blocks = stage2_num_blocks
+        self.stage2_num_channels = stage2_num_channels
+        self.stage3_num_modules = stage3_num_modules
+        self.stage3_num_blocks = stage3_num_blocks
+        self.stage3_num_channels = stage3_num_channels
+        self.stage4_num_modules = stage4_num_modules
+        self.stage4_num_blocks = stage4_num_blocks
+        self.stage4_num_channels = stage4_num_channels
+        self.has_se = has_se
+        self.align_corners = align_corners
+        # `forward` returns one tensor that concatenates all stage-4 branches,
+        # so a single channel count is advertised: the sum over the branches.
+        self.feat_channels = [sum(stage4_num_channels)]
+
+        # Stem: two stride-2 3x3 convolutions.
+        self.conv_layer1_1 = layers.ConvBNReLU(
+            in_channels=3,
+            out_channels=64,
+            kernel_size=3,
+            stride=2,
+            padding=1 if not padding_same else 'same',
+            bias_attr=False)
+
+        self.conv_layer1_2 = layers.ConvBNReLU(
+            in_channels=64,
+            out_channels=64,
+            kernel_size=3,
+            stride=2,
+            padding=1 if not padding_same else 'same',
+            bias_attr=False)
+
+        self.la1 = Layer1(
+            num_channels=64,
+            num_blocks=self.stage1_num_blocks[0],
+            num_filters=self.stage1_num_channels[0],
+            has_se=has_se,
+            name="layer2",
+            padding_same=padding_same)
+
+        # The first transition receives `stage1_num_channels[0] * 4` channels from `la1`.
+        self.tr1 = TransitionLayer(
+            in_channels=[self.stage1_num_channels[0] * 4],
+            out_channels=self.stage2_num_channels,
+            name="tr1",
+            padding_same=padding_same)
+
+        self.st2 = Stage(
+            num_channels=self.stage2_num_channels,
+            num_modules=self.stage2_num_modules,
+            num_blocks=self.stage2_num_blocks,
+            num_filters=self.stage2_num_channels,
+            has_se=self.has_se,
+            name="st2",
+            align_corners=align_corners,
+            padding_same=padding_same)
+
+        self.tr2 = TransitionLayer(
+            in_channels=self.stage2_num_channels,
+            out_channels=self.stage3_num_channels,
+            name="tr2",
+            padding_same=padding_same)
+        self.st3 = Stage(
+            num_channels=self.stage3_num_channels,
+            num_modules=self.stage3_num_modules,
+            num_blocks=self.stage3_num_blocks,
+            num_filters=self.stage3_num_channels,
+            has_se=self.has_se,
+            name="st3",
+            align_corners=align_corners,
+            padding_same=padding_same)
+
+        self.tr3 = TransitionLayer(
+            in_channels=self.stage3_num_channels,
+            out_channels=self.stage4_num_channels,
+            name="tr3",
+            padding_same=padding_same)
+        self.st4 = Stage(
+            num_channels=self.stage4_num_channels,
+            num_modules=self.stage4_num_modules,
+            num_blocks=self.stage4_num_blocks,
+            num_filters=self.stage4_num_channels,
+            has_se=self.has_se,
+            name="st4",
+            align_corners=align_corners,
+            padding_same=padding_same)
+
+        self.init_weight()
+
+    def forward(self, x):
+        """
+        Run the network and merge the four stage-4 branches.
+
+        Args:
+            x (Tensor): The input batch.
+
+        Returns:
+            list: A single-element list. Its tensor is the channel-wise concatenation of the
+                first stage-4 branch with the other three branches, each bilinearly resized
+                to the spatial size of the first branch.
+        """
+        conv1 = self.conv_layer1_1(x)
+        conv2 = self.conv_layer1_2(conv1)
+
+        la1 = self.la1(conv2)
+
+        # The transition layer expects a list of branches.
+        tr1 = self.tr1([la1])
+        st2 = self.st2(tr1)
+
+        tr2 = self.tr2(st2)
+        st3 = self.st3(tr2)
+
+        tr3 = self.tr3(st3)
+        st4 = self.st4(tr3)
+
+        # Spatial size of the first branch, used as the common target size.
+        size = paddle.shape(st4[0])[2:]
+        x1 = F.interpolate(
+            st4[1], size, mode='bilinear', align_corners=self.align_corners)
+        x2 = F.interpolate(
+            st4[2], size, mode='bilinear', align_corners=self.align_corners)
+        x3 = F.interpolate(
+            st4[3], size, mode='bilinear', align_corners=self.align_corners)
+        x = paddle.concat([st4[0], x1, x2, x3], axis=1)
+
+        return [x]
+
+    def init_weight(self):
+        """
+        Initialize the sublayers, then load the pretrained weights if a path was given.
+
+        Conv2D weights get a normal initialization with std 0.001. Layers that are
+        instances of nn.BatchNorm or nn.SyncBatchNorm get weight 1.0 and bias 0.0.
+        """
+        for layer in self.sublayers():
+            if isinstance(layer, nn.Conv2D):
+                param_init.normal_init(layer.weight, std=0.001)
+            elif isinstance(layer, (nn.BatchNorm, nn.SyncBatchNorm)):
+                param_init.constant_init(layer.weight, value=1.0)
+                param_init.constant_init(layer.bias, value=0.0)
+        if self.pretrained is not None:
+            utils.load_pretrained_model(self, self.pretrained)
+
+
+class Layer1(nn.Layer):
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 num_blocks,
+                 has_se=False,
+                 name=None,
+                 padding_same=True):
+        super(Layer1, self).__init__()
+
+        self.bottleneck_block_list = []
+
+        for i in range(num_blocks):
+            bottleneck_block = self.add_sublayer(
+                "bb_{}_{}".format(name, i + 1),
+                BottleneckBlock(
+                    num_channels=num_channels if i == 0 else num_filters * 4,
+                    num_filters=num_filters,
+                    has_se=has_se,
+                    stride=1,
+                    downsample=True if i == 0 else False,
+                    name=name + '_' + str(i + 1),
+                    padding_same=padding_same))
+            self.bottleneck_block_list.append(bottleneck_block)
+
+    def forward(self, x):
+        conv = x
+        for block_func in self.bottleneck_block_list:
+            conv = block_func(conv)
+        return conv
+
+
+class TransitionLayer(nn.Layer):
+    def __init__(self, in_channels, out_channels, name=None, padding_same=True):
+        super(TransitionLayer, self).__init__()
+
+        num_in = len(in_channels)
+        num_out = len(out_channels)
+        self.conv_bn_func_list = []
+        for i in range(num_out):
+            residual = None
+            if i < num_in:
+                if in_channels[i] != out_channels[i]:
+                    residual = self.add_sublayer(
+                        "transition_{}_layer_{}".format(name, i + 1),
+                        layers.ConvBNReLU(
+                            in_channels=in_channels[i],
+                            out_channels=out_channels[i],
+                            kernel_size=3,
+                            padding=1 if not padding_same else 'same',
+                            bias_attr=False))
+            else:
+                residual = self.add_sublayer(
+                    "transition_{}_layer_{}".format(name, i + 1),
+                    layers.ConvBNReLU(
+                        in_channels=in_channels[-1],
+                        out_channels=out_channels[i],
+                        kernel_size=3,
+                        stride=2,
+                        padding=1 if not padding_same else 'same',
+                        bias_attr=False))
+            self.conv_bn_func_list.append(residual)
+
+    def forward(self, x):
+        outs = []
+        for idx, conv_bn_func in enumerate(self.conv_bn_func_list):
+            if conv_bn_func is None:
+                outs.append(x[idx])
+            else:
+                if idx < len(x):
+                    outs.append(conv_bn_func(x[idx]))
+                else:
+                    outs.append(conv_bn_func(x[-1]))
+        return outs
+
+
+class Branches(nn.Layer):
+    def __init__(self,
+                 num_blocks,
+                 in_channels,
+                 out_channels,
+                 has_se=False,
+                 name=None,
+                 padding_same=True):
+        super(Branches, self).__init__()
+
+        self.basic_block_list = []
+
+        for i in range(len(out_channels)):
+            self.basic_block_list.append([])
+            for j in range(num_blocks[i]):
+                in_ch = in_channels[i] if j == 0 else out_channels[i]
+                basic_block_func = self.add_sublayer(
+                    "bb_{}_branch_layer_{}_{}".format(name, i + 1, j + 1),
+                    BasicBlock(
+                        num_channels=in_ch,
+                        num_filters=out_channels[i],
+                        has_se=has_se,
+                        name=name + '_branch_layer_' + str(i + 1) + '_' +
+                        str(j + 1),
+                        padding_same=padding_same))
+                self.basic_block_list[i].append(basic_block_func)
+
+    def forward(self, x):
+        outs = []
+        for idx, input in enumerate(x):
+            conv = input
+            for basic_block_func in self.basic_block_list[idx]:
+                conv = basic_block_func(conv)
+            outs.append(conv)
+        return outs
+
+
+class BottleneckBlock(nn.Layer):
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 has_se,
+                 stride=1,
+                 downsample=False,
+                 name=None,
+                 padding_same=True):
+        super(BottleneckBlock, self).__init__()
+
+        self.has_se = has_se
+        self.downsample = downsample
+
+        self.conv1 = layers.ConvBNReLU(
+            in_channels=num_channels,
+            out_channels=num_filters,
+            kernel_size=1,
+            bias_attr=False)
+
+        self.conv2 = layers.ConvBNReLU(
+            in_channels=num_filters,
+            out_channels=num_filters,
+            kernel_size=3,
+            stride=stride,
+            padding=1 if not padding_same else 'same',
+            bias_attr=False)
+
+        self.conv3 = layers.ConvBN(
+            in_channels=num_filters,
+            out_channels=num_filters * 4,
+            kernel_size=1,
+            bias_attr=False)
+
+        if self.downsample:
+            self.conv_down = layers.ConvBN(
+                in_channels=num_channels,
+                out_channels=num_filters * 4,
+                kernel_size=1,
+                bias_attr=False)
+
+        if self.has_se:
+            self.se = SELayer(
+                num_channels=num_filters * 4,
+                num_filters=num_filters * 4,
+                reduction_ratio=16,
+                name=name + '_fc')
+
+        self.add = layers.Add()
+        self.relu = layers.Activation("relu")
+
+    def forward(self, x):
+        residual = x
+        conv1 = self.conv1(x)
+        conv2 = self.conv2(conv1)
+        conv3 = self.conv3(conv2)
+
+        if self.downsample:
+            residual = self.conv_down(x)
+
+        if self.has_se:
+            conv3 = self.se(conv3)
+
+        y = self.add(conv3, residual)
+        y = self.relu(y)
+        return y
+
+
+class BasicBlock(nn.Layer):
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 stride=1,
+                 has_se=False,
+                 downsample=False,
+                 name=None,
+                 padding_same=True):
+        super(BasicBlock, self).__init__()
+
+        self.has_se = has_se
+        self.downsample = downsample
+
+        self.conv1 = layers.ConvBNReLU(
+            in_channels=num_channels,
+            out_channels=num_filters,
+            kernel_size=3,
+            stride=stride,
+            padding=1 if not padding_same else 'same',
+            bias_attr=False)
+        self.conv2 = layers.ConvBN(
+            in_channels=num_filters,
+            out_channels=num_filters,
+            kernel_size=3,
+            padding=1 if not padding_same else 'same',
+            bias_attr=False)
+
+        if self.downsample:
+            self.conv_down = layers.ConvBNReLU(
+                in_channels=num_channels,
+                out_channels=num_filters,
+                kernel_size=1,
+                bias_attr=False)
+
+        if self.has_se:
+            self.se = SELayer(
+                num_channels=num_filters,
+                num_filters=num_filters,
+                reduction_ratio=16,
+                name=name + '_fc')
+
+        self.add = layers.Add()
+        self.relu = layers.Activation("relu")
+
+    def forward(self, x):
+        residual = x
+        conv1 = self.conv1(x)
+        conv2 = self.conv2(conv1)
+
+        if self.downsample:
+            residual = self.conv_down(x)
+
+        if self.has_se:
+            conv2 = self.se(conv2)
+
+        y = self.add(conv2, residual)
+        y = self.relu(y)
+        return y
+
+
+class SELayer(nn.Layer):
+    def __init__(self, num_channels, num_filters, reduction_ratio, name=None):
+        super(SELayer, self).__init__()
+
+        self.pool2d_gap = nn.AdaptiveAvgPool2D(1)
+
+        self._num_channels = num_channels
+
+        med_ch = int(num_channels / reduction_ratio)
+        stdv = 1.0 / math.sqrt(num_channels * 1.0)
+        self.squeeze = nn.Linear(
+            num_channels,
+            med_ch,
+            weight_attr=paddle.ParamAttr(
+                initializer=nn.initializer.Uniform(-stdv, stdv)))
+
+        stdv = 1.0 / math.sqrt(med_ch * 1.0)
+        self.excitation = nn.Linear(
+            med_ch,
+            num_filters,
+            weight_attr=paddle.ParamAttr(
+                initializer=nn.initializer.Uniform(-stdv, stdv)))
+
+    def forward(self, x):
+        pool = self.pool2d_gap(x)
+        pool = paddle.reshape(pool, shape=[-1, self._num_channels])
+        squeeze = self.squeeze(pool)
+        squeeze = F.relu(squeeze)
+        excitation = self.excitation(squeeze)
+        excitation = F.sigmoid(excitation)
+        excitation = paddle.reshape(
+            excitation, shape=[-1, self._num_channels, 1, 1])
+        out = x * excitation
+        return out
+
+
+class Stage(nn.Layer):
+    def __init__(self,
+                 num_channels,
+                 num_modules,
+                 num_blocks,
+                 num_filters,
+                 has_se=False,
+                 multi_scale_output=True,
+                 name=None,
+                 align_corners=False,
+                 padding_same=True):
+        super(Stage, self).__init__()
+
+        self._num_modules = num_modules
+
+        self.stage_func_list = []
+        for i in range(num_modules):
+            if i == num_modules - 1 and not multi_scale_output:
+                stage_func = self.add_sublayer(
+                    "stage_{}_{}".format(name, i + 1),
+                    HighResolutionModule(
+                        num_channels=num_channels,
+                        num_blocks=num_blocks,
+                        num_filters=num_filters,
+                        has_se=has_se,
+                        multi_scale_output=False,
+                        name=name + '_' + str(i + 1),
+                        align_corners=align_corners,
+                        padding_same=padding_same))
+            else:
+                stage_func = self.add_sublayer(
+                    "stage_{}_{}".format(name, i + 1),
+                    HighResolutionModule(
+                        num_channels=num_channels,
+                        num_blocks=num_blocks,
+                        num_filters=num_filters,
+                        has_se=has_se,
+                        name=name + '_' + str(i + 1),
+                        align_corners=align_corners,
+                        padding_same=padding_same))
+
+            self.stage_func_list.append(stage_func)
+
+    def forward(self, x):
+        out = x
+        for idx in range(self._num_modules):
+            out = self.stage_func_list[idx](out)
+        return out
+
+
+class HighResolutionModule(nn.Layer):
+    def __init__(self,
+                 num_channels,
+                 num_blocks,
+                 num_filters,
+                 has_se=False,
+                 multi_scale_output=True,
+                 name=None,
+                 align_corners=False,
+                 padding_same=True):
+        super(HighResolutionModule, self).__init__()
+
+        self.branches_func = Branches(
+            num_blocks=num_blocks,
+            in_channels=num_channels,
+            out_channels=num_filters,
+            has_se=has_se,
+            name=name,
+            padding_same=padding_same)
+
+        self.fuse_func = FuseLayers(
+            in_channels=num_filters,
+            out_channels=num_filters,
+            multi_scale_output=multi_scale_output,
+            name=name,
+            align_corners=align_corners,
+            padding_same=padding_same)
+
+    def forward(self, x):
+        out = self.branches_func(x)
+        out = self.fuse_func(out)
+        return out
+
+
+class FuseLayers(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 multi_scale_output=True,
+                 name=None,
+                 align_corners=False,
+                 padding_same=True):
+        super(FuseLayers, self).__init__()
+
+        self._actual_ch = len(in_channels) if multi_scale_output else 1
+        self._in_channels = in_channels
+        self.align_corners = align_corners
+
+        self.residual_func_list = []
+        for i in range(self._actual_ch):
+            for j in range(len(in_channels)):
+                if j > i:
+                    residual_func = self.add_sublayer(
+                        "residual_{}_layer_{}_{}".format(name, i + 1, j + 1),
+                        layers.ConvBN(
+                            in_channels=in_channels[j],
+                            out_channels=out_channels[i],
+                            kernel_size=1,
+                            bias_attr=False))
+                    self.residual_func_list.append(residual_func)
+                elif j < i:
+                    pre_num_filters = in_channels[j]
+                    for k in range(i - j):
+                        if k == i - j - 1:
+                            residual_func = self.add_sublayer(
+                                "residual_{}_layer_{}_{}_{}".format(
+                                    name, i + 1, j + 1, k + 1),
+                                layers.ConvBN(
+                                    in_channels=pre_num_filters,
+                                    out_channels=out_channels[i],
+                                    kernel_size=3,
+                                    stride=2,
+                                    padding=1 if not padding_same else 'same',
+                                    bias_attr=False))
+                            pre_num_filters = out_channels[i]
+                        else:
+                            residual_func = self.add_sublayer(
+                                "residual_{}_layer_{}_{}_{}".format(
+                                    name, i + 1, j + 1, k + 1),
+                                layers.ConvBNReLU(
+                                    in_channels=pre_num_filters,
+                                    out_channels=out_channels[j],
+                                    kernel_size=3,
+                                    stride=2,
+                                    padding=1 if not padding_same else 'same',
+                                    bias_attr=False))
+                            pre_num_filters = out_channels[j]
+                        self.residual_func_list.append(residual_func)
+
+    def forward(self, x):
+        outs = []
+        residual_func_idx = 0
+        for i in range(self._actual_ch):
+            residual = x[i]
+            residual_shape = paddle.shape(residual)[-2:]
+            for j in range(len(self._in_channels)):
+                if j > i:
+                    y = self.residual_func_list[residual_func_idx](x[j])
+                    residual_func_idx += 1
+
+                    y = F.interpolate(
+                        y,
+                        residual_shape,
+                        mode='bilinear',
+                        align_corners=self.align_corners)
+                    residual = residual + y
+                elif j < i:
+                    y = x[j]
+                    for k in range(i - j):
+                        y = self.residual_func_list[residual_func_idx](y)
+                        residual_func_idx += 1
+
+                    residual = residual + y
+
+            residual = F.relu(residual)
+            outs.append(residual)
+
+        return outs
+
+
+@manager.BACKBONES.add_component
+def HRNet_W18_Small_V1(**kwargs):
+    model = HRNet(
+        stage1_num_modules=1,
+        stage1_num_blocks=[1],
+        stage1_num_channels=[32],
+        stage2_num_modules=1,
+        stage2_num_blocks=[2, 2],
+        stage2_num_channels=[16, 32],
+        stage3_num_modules=1,
+        stage3_num_blocks=[2, 2, 2],
+        stage3_num_channels=[16, 32, 64],
+        stage4_num_modules=1,
+        stage4_num_blocks=[2, 2, 2, 2],
+        stage4_num_channels=[16, 32, 64, 128],
+        **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def HRNet_W18_Small_V2(**kwargs):
+    model = HRNet(
+        stage1_num_modules=1,
+        stage1_num_blocks=[2],
+        stage1_num_channels=[64],
+        stage2_num_modules=1,
+        stage2_num_blocks=[2, 2],
+        stage2_num_channels=[18, 36],
+        stage3_num_modules=3,
+        stage3_num_blocks=[2, 2, 2],
+        stage3_num_channels=[18, 36, 72],
+        stage4_num_modules=2,
+        stage4_num_blocks=[2, 2, 2, 2],
+        stage4_num_channels=[18, 36, 72, 144],
+        **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def HRNet_W18(**kwargs):
+    model = HRNet(
+        stage1_num_modules=1,
+        stage1_num_blocks=[4],
+        stage1_num_channels=[64],
+        stage2_num_modules=1,
+        stage2_num_blocks=[4, 4],
+        stage2_num_channels=[18, 36],
+        stage3_num_modules=4,
+        stage3_num_blocks=[4, 4, 4],
+        stage3_num_channels=[18, 36, 72],
+        stage4_num_modules=3,
+        stage4_num_blocks=[4, 4, 4, 4],
+        stage4_num_channels=[18, 36, 72, 144],
+        **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def HRNet_W30(**kwargs):
+    model = HRNet(
+        stage1_num_modules=1,
+        stage1_num_blocks=[4],
+        stage1_num_channels=[64],
+        stage2_num_modules=1,
+        stage2_num_blocks=[4, 4],
+        stage2_num_channels=[30, 60],
+        stage3_num_modules=4,
+        stage3_num_blocks=[4, 4, 4],
+        stage3_num_channels=[30, 60, 120],
+        stage4_num_modules=3,
+        stage4_num_blocks=[4, 4, 4, 4],
+        stage4_num_channels=[30, 60, 120, 240],
+        **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def HRNet_W32(**kwargs):
+    model = HRNet(
+        stage1_num_modules=1,
+        stage1_num_blocks=[4],
+        stage1_num_channels=[64],
+        stage2_num_modules=1,
+        stage2_num_blocks=[4, 4],
+        stage2_num_channels=[32, 64],
+        stage3_num_modules=4,
+        stage3_num_blocks=[4, 4, 4],
+        stage3_num_channels=[32, 64, 128],
+        stage4_num_modules=3,
+        stage4_num_blocks=[4, 4, 4, 4],
+        stage4_num_channels=[32, 64, 128, 256],
+        **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def HRNet_W40(**kwargs):
+    model = HRNet(
+        stage1_num_modules=1,
+        stage1_num_blocks=[4],
+        stage1_num_channels=[64],
+        stage2_num_modules=1,
+        stage2_num_blocks=[4, 4],
+        stage2_num_channels=[40, 80],
+        stage3_num_modules=4,
+        stage3_num_blocks=[4, 4, 4],
+        stage3_num_channels=[40, 80, 160],
+        stage4_num_modules=3,
+        stage4_num_blocks=[4, 4, 4, 4],
+        stage4_num_channels=[40, 80, 160, 320],
+        **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def HRNet_W44(**kwargs):
+    model = HRNet(
+        stage1_num_modules=1,
+        stage1_num_blocks=[4],
+        stage1_num_channels=[64],
+        stage2_num_modules=1,
+        stage2_num_blocks=[4, 4],
+        stage2_num_channels=[44, 88],
+        stage3_num_modules=4,
+        stage3_num_blocks=[4, 4, 4],
+        stage3_num_channels=[44, 88, 176],
+        stage4_num_modules=3,
+        stage4_num_blocks=[4, 4, 4, 4],
+        stage4_num_channels=[44, 88, 176, 352],
+        **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def HRNet_W48(**kwargs):
+    model = HRNet(
+        stage1_num_modules=1,
+        stage1_num_blocks=[4],
+        stage1_num_channels=[64],
+        stage2_num_modules=1,
+        stage2_num_blocks=[4, 4],
+        stage2_num_channels=[48, 96],
+        stage3_num_modules=4,
+        stage3_num_blocks=[4, 4, 4],
+        stage3_num_channels=[48, 96, 192],
+        stage4_num_modules=3,
+        stage4_num_blocks=[4, 4, 4, 4],
+        stage4_num_channels=[48, 96, 192, 384],
+        **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def HRNet_W60(**kwargs):
+    model = HRNet(
+        stage1_num_modules=1,
+        stage1_num_blocks=[4],
+        stage1_num_channels=[64],
+        stage2_num_modules=1,
+        stage2_num_blocks=[4, 4],
+        stage2_num_channels=[60, 120],
+        stage3_num_modules=4,
+        stage3_num_blocks=[4, 4, 4],
+        stage3_num_channels=[60, 120, 240],
+        stage4_num_modules=3,
+        stage4_num_blocks=[4, 4, 4, 4],
+        stage4_num_channels=[60, 120, 240, 480],
+        **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def HRNet_W64(**kwargs):
+    model = HRNet(
+        stage1_num_modules=1,
+        stage1_num_blocks=[4],
+        stage1_num_channels=[64],
+        stage2_num_modules=1,
+        stage2_num_blocks=[4, 4],
+        stage2_num_channels=[64, 128],
+        stage3_num_modules=4,
+        stage3_num_blocks=[4, 4, 4],
+        stage3_num_channels=[64, 128, 256],
+        stage4_num_modules=3,
+        stage4_num_blocks=[4, 4, 4, 4],
+        stage4_num_channels=[64, 128, 256, 512],
+        **kwargs)
+    return model
--- a/paddlers/models/ppseg/models/backbones/mix_transformer.py
+++ b/paddlers/models/ppseg/models/backbones/mix_transformer.py
@ -0,0 +1,588 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from functools import partial
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+import paddle.nn.initializer as paddle_init
+
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.utils import utils
+from paddlers.models.ppseg.models.backbones.transformer_utils import *
+
+
+class Mlp(nn.Layer):
+    def __init__(self,
+                 in_features,
+                 hidden_features=None,
+                 out_features=None,
+                 act_layer=nn.GELU,
+                 drop=0.):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.dwconv = DWConv(hidden_features)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+
+        self.apply(self._init_weights)
+
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                zeros_(m.bias)
+        elif isinstance(m, nn.LayerNorm):
+            zeros_(m.bias)
+            ones_(m.weight)
+        elif isinstance(m, nn.Conv2D):
+            fan_out = m._kernel_size[0] * m._kernel_size[1] * m._out_channels
+            fan_out //= m._groups
+            paddle_init.Normal(0, math.sqrt(2.0 / fan_out))(m.weight)
+            if m.bias is not None:
+                zeros_(m.bias)
+
+    def forward(self, x, H, W):
+        x = self.fc1(x)
+        x = self.dwconv(x, H, W)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+
+
+class Attention(nn.Layer):
+    def __init__(self,
+                 dim,
+                 num_heads=8,
+                 qkv_bias=False,
+                 qk_scale=None,
+                 attn_drop=0.,
+                 proj_drop=0.,
+                 sr_ratio=1):
+        super().__init__()
+        assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}."
+
+        self.dim = dim
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = qk_scale or head_dim**-0.5
+        self.dim = dim
+
+        self.q = nn.Linear(dim, dim, bias_attr=qkv_bias)
+        self.kv = nn.Linear(dim, dim * 2, bias_attr=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+        self.sr_ratio = sr_ratio
+        if sr_ratio > 1:
+            self.sr = nn.Conv2D(dim, dim, kernel_size=sr_ratio, stride=sr_ratio)
+            self.norm = nn.LayerNorm(dim)
+
+        self.apply(self._init_weights)
+
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                zeros_(m.bias)
+        elif isinstance(m, nn.LayerNorm):
+            zeros_(m.bias)
+            ones_(m.weight)
+        elif isinstance(m, nn.Conv2D):
+            fan_out = m._kernel_size[0] * m._kernel_size[1] * m._out_channels
+            fan_out //= m._groups
+            paddle_init.Normal(0, math.sqrt(2.0 / fan_out))(m.weight)
+            if m.bias is not None:
+                zeros_(m.bias)
+
+    def forward(self, x, H, W):
+        x_shape = paddle.shape(x)
+        B, N = x_shape[0], x_shape[1]
+        C = self.dim
+
+        q = self.q(x).reshape([B, N, self.num_heads,
+                               C // self.num_heads]).transpose([0, 2, 1, 3])
+
+        if self.sr_ratio > 1:
+            x_ = x.transpose([0, 2, 1]).reshape([B, C, H, W])
+            x_ = self.sr(x_).reshape([B, C, -1]).transpose([0, 2, 1])
+            x_ = self.norm(x_)
+            kv = self.kv(x_).reshape(
+                [B, -1, 2, self.num_heads,
+                 C // self.num_heads]).transpose([2, 0, 3, 1, 4])
+        else:
+            kv = self.kv(x).reshape(
+                [B, -1, 2, self.num_heads,
+                 C // self.num_heads]).transpose([2, 0, 3, 1, 4])
+        k, v = kv[0], kv[1]
+
+        attn = (q @ k.transpose([0, 1, 3, 2])) * self.scale
+        attn = F.softmax(attn, axis=-1)
+        attn = self.attn_drop(attn)
+
+        x = (attn @ v).transpose([0, 2, 1, 3]).reshape([B, N, C])
+        x = self.proj(x)
+        x = self.proj_drop(x)
+
+        return x
+
+
+class Block(nn.Layer):
+    def __init__(self,
+                 dim,
+                 num_heads,
+                 mlp_ratio=4.,
+                 qkv_bias=False,
+                 qk_scale=None,
+                 drop=0.,
+                 attn_drop=0.,
+                 drop_path=0.,
+                 act_layer=nn.GELU,
+                 norm_layer=nn.LayerNorm,
+                 sr_ratio=1):
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = Attention(
+            dim,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            attn_drop=attn_drop,
+            proj_drop=drop,
+            sr_ratio=sr_ratio)
+        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(
+            in_features=dim,
+            hidden_features=mlp_hidden_dim,
+            act_layer=act_layer,
+            drop=drop)
+
+        self.apply(self._init_weights)
+
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                zeros_(m.bias)
+        elif isinstance(m, nn.LayerNorm):
+            zeros_(m.bias)
+            ones_(m.weight)
+        elif isinstance(m, nn.Conv2D):
+            fan_out = m._kernel_size[0] * m._kernel_size[1] * m._out_channels
+            fan_out //= m._groups
+            paddle_init.Normal(0, math.sqrt(2.0 / fan_out))(m.weight)
+            if m.bias is not None:
+                zeros_(m.bias)
+
+    def forward(self, x, H, W):
+        x = x + self.drop_path(self.attn(self.norm1(x), H, W))
+        x = x + self.drop_path(self.mlp(self.norm2(x), H, W))
+
+        return x
+
+
+class OverlapPatchEmbed(nn.Layer):
+    """ Image to Patch Embedding
+    """
+
+    def __init__(self,
+                 img_size=224,
+                 patch_size=7,
+                 stride=4,
+                 in_chans=3,
+                 embed_dim=768):
+        super().__init__()
+        img_size = to_2tuple(img_size)
+        patch_size = to_2tuple(patch_size)
+
+        self.img_size = img_size
+        self.patch_size = patch_size
+        self.H, self.W = img_size[0] // patch_size[0], img_size[
+            1] // patch_size[1]
+        self.num_patches = self.H * self.W
+        self.proj = nn.Conv2D(
+            in_chans,
+            embed_dim,
+            kernel_size=patch_size,
+            stride=stride,
+            padding=(patch_size[0] // 2, patch_size[1] // 2))
+        self.norm = nn.LayerNorm(embed_dim)
+
+        self.apply(self._init_weights)
+
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                zeros_(m.bias)
+        elif isinstance(m, nn.LayerNorm):
+            zeros_(m.bias)
+            ones_(m.weight)
+        elif isinstance(m, nn.Conv2D):
+            fan_out = m._kernel_size[0] * m._kernel_size[1] * m._out_channels
+            fan_out //= m._groups
+            paddle_init.Normal(0, math.sqrt(2.0 / fan_out))(m.weight)
+            if m.bias is not None:
+                zeros_(m.bias)
+
+    def forward(self, x):
+        x = self.proj(x)
+        x_shape = paddle.shape(x)
+        H, W = x_shape[2], x_shape[3]
+        x = x.flatten(2).transpose([0, 2, 1])
+        x = self.norm(x)
+
+        return x, H, W
+
+
+class MixVisionTransformer(nn.Layer):
+    """Four-stage Mix Transformer backbone.
+
+    Every stage runs an ``OverlapPatchEmbed``, a list of ``Block`` layers and
+    a normalization layer, and contributes one feature map to the list
+    returned by ``forward``.
+
+    Args:
+        img_size (int): Nominal input size; forwarded to the four patch
+            embeddings as ``img_size``, ``img_size // 4``, ``img_size // 8``
+            and ``img_size // 16``. Default: 224.
+        patch_size (int): Kept for interface compatibility; this constructor
+            does not read it (the stage patch sizes are fixed to 7/3/3/3).
+        in_chans (int): Number of channels of the input. Default: 3.
+        num_classes (int): Stored on the instance; no classifier is built
+            until ``reset_classifier`` is called. Default: 1000.
+        embed_dims (list[int]): Embedding width of each stage.
+        num_heads (list[int]): ``num_heads`` given to the blocks of each stage.
+        mlp_ratios (list[int]): ``mlp_ratio`` given to the blocks of each stage.
+        qkv_bias (bool): Forwarded to every ``Block``.
+        qk_scale (float, optional): Forwarded to every ``Block``.
+        drop_rate (float): Forwarded to every ``Block`` as ``drop``.
+        attn_drop_rate (float): Forwarded to every ``Block`` as ``attn_drop``.
+        drop_path_rate (float): Upper end of the linearly increasing
+            per-block drop-path schedule.
+        norm_layer (callable): Called with a channel count to build the
+            per-stage normalization layers; also forwarded to every ``Block``.
+        depths (list[int]): Number of blocks in each stage.
+        sr_ratios (list[int]): ``sr_ratio`` given to the blocks of each stage.
+        pretrained (str, optional): When given, weights are loaded through
+            ``utils.load_pretrained_model``; otherwise ``_init_weights`` is
+            applied to every sublayer.
+    """
+
+    def __init__(self,
+                 img_size=224,
+                 patch_size=16,
+                 in_chans=3,
+                 num_classes=1000,
+                 embed_dims=[64, 128, 256, 512],
+                 num_heads=[1, 2, 4, 8],
+                 mlp_ratios=[4, 4, 4, 4],
+                 qkv_bias=False,
+                 qk_scale=None,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.,
+                 norm_layer=nn.LayerNorm,
+                 depths=[3, 4, 6, 3],
+                 sr_ratios=[8, 4, 2, 1],
+                 pretrained=None):
+        super().__init__()
+        self.num_classes = num_classes
+        # Copy so the instance never aliases the (shared) default lists.
+        self.depths = list(depths)
+        self.feat_channels = embed_dims[:]
+
+        # patch_embed
+        self.patch_embed1 = OverlapPatchEmbed(
+            img_size=img_size,
+            patch_size=7,
+            stride=4,
+            in_chans=in_chans,
+            embed_dim=embed_dims[0])
+        self.patch_embed2 = OverlapPatchEmbed(
+            img_size=img_size // 4,
+            patch_size=3,
+            stride=2,
+            in_chans=embed_dims[0],
+            embed_dim=embed_dims[1])
+        self.patch_embed3 = OverlapPatchEmbed(
+            img_size=img_size // 8,
+            patch_size=3,
+            stride=2,
+            in_chans=embed_dims[1],
+            embed_dim=embed_dims[2])
+        self.patch_embed4 = OverlapPatchEmbed(
+            img_size=img_size // 16,
+            patch_size=3,
+            stride=2,
+            in_chans=embed_dims[2],
+            embed_dim=embed_dims[3])
+
+        # Stochastic depth decay rule: one plain Python float per block,
+        # the same representation reset_drop_path() writes back later.
+        dpr = [
+            x.item() for x in paddle.linspace(0, drop_path_rate, sum(depths))
+        ]
+
+        def build_blocks(stage, start):
+            # All blocks of one stage; `start` is the index of the stage's
+            # first block in the global drop-path schedule.
+            return nn.LayerList([
+                Block(
+                    dim=embed_dims[stage],
+                    num_heads=num_heads[stage],
+                    mlp_ratio=mlp_ratios[stage],
+                    qkv_bias=qkv_bias,
+                    qk_scale=qk_scale,
+                    drop=drop_rate,
+                    attn_drop=attn_drop_rate,
+                    drop_path=dpr[start + i],
+                    norm_layer=norm_layer,
+                    sr_ratio=sr_ratios[stage]) for i in range(depths[stage])
+            ])
+
+        # transformer encoder; layers are created in the same order as
+        # before (block1, norm1, block2, norm2, ...) so parameter naming and
+        # state-dict ordering are unchanged.
+        cur = 0
+        self.block1 = build_blocks(0, cur)
+        self.norm1 = norm_layer(embed_dims[0])
+
+        cur += depths[0]
+        self.block2 = build_blocks(1, cur)
+        self.norm2 = norm_layer(embed_dims[1])
+
+        cur += depths[1]
+        self.block3 = build_blocks(2, cur)
+        self.norm3 = norm_layer(embed_dims[2])
+
+        cur += depths[2]
+        self.block4 = build_blocks(3, cur)
+        self.norm4 = norm_layer(embed_dims[3])
+
+        self.pretrained = pretrained
+        self.init_weight()
+
+    def init_weight(self):
+        """Load pretrained weights if configured, else initialise all sublayers."""
+        if self.pretrained is not None:
+            utils.load_pretrained_model(self, self.pretrained)
+        else:
+            self.apply(self._init_weights)
+
+    def _init_weights(self, m):
+        """Per-layer initialiser applied through ``self.apply``."""
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight)
+            if m.bias is not None:
+                zeros_(m.bias)
+        elif isinstance(m, nn.LayerNorm):
+            zeros_(m.bias)
+            ones_(m.weight)
+        elif isinstance(m, nn.Conv2D):
+            # Normal init with std derived from the per-group fan-out.
+            fan_out = m._kernel_size[0] * m._kernel_size[1] * m._out_channels
+            fan_out //= m._groups
+            paddle_init.Normal(0, math.sqrt(2.0 / fan_out))(m.weight)
+            if m.bias is not None:
+                zeros_(m.bias)
+
+    def reset_drop_path(self, drop_path_rate):
+        """Recompute the linear drop-path schedule and write it to every block.
+
+        Args:
+            drop_path_rate (float): New upper end of the schedule.
+        """
+        dpr = [
+            x.item()
+            for x in paddle.linspace(0, drop_path_rate, sum(self.depths))
+        ]
+        stages = (self.block1, self.block2, self.block3, self.block4)
+        offset = 0
+        for blocks, depth in zip(stages, self.depths):
+            for i in range(depth):
+                blocks[i].drop_path.drop_prob = dpr[offset + i]
+            offset += depth
+
+    def freeze_patch_emb(self):
+        """Stop gradient flow into the parameters of the first patch embedding."""
+        # Assigning `requires_grad` to a Layer only creates an unused
+        # attribute; parameters are frozen through `stop_gradient`.
+        for param in self.patch_embed1.parameters():
+            param.stop_gradient = True
+
+    def get_classifier(self):
+        """Return the head set by ``reset_classifier``, or None if there is none."""
+        return getattr(self, 'head', None)
+
+    def reset_classifier(self, num_classes, global_pool=''):
+        """Replace the classification head.
+
+        Args:
+            num_classes (int): Output size of the new linear head; a value
+                ``<= 0`` installs ``nn.Identity`` instead.
+            global_pool (str): Accepted for interface compatibility; unused.
+        """
+        self.num_classes = num_classes
+        # The width of the last stage is the input size of the head; the
+        # previously referenced `self.embed_dim` was never defined.
+        self.head = nn.Linear(
+            self.feat_channels[-1],
+            num_classes) if num_classes > 0 else nn.Identity()
+
+    def _forward_stage(self, x, B, patch_embed, blocks, norm, channels):
+        """Run one stage and return its output reshaped to ``[B, channels, H, W]``."""
+        x, H, W = patch_embed(x)
+        for blk in blocks:
+            x = blk(x, H, W)
+        x = norm(x)
+        return x.reshape([B, H, W, channels]).transpose([0, 3, 1, 2])
+
+    def forward_features(self, x):
+        """Return the list of the four stage outputs for input ``x``."""
+        B = paddle.shape(x)[0]
+        outs = []
+
+        # stage 1
+        x = self._forward_stage(x, B, self.patch_embed1, self.block1,
+                                self.norm1, self.feat_channels[0])
+        outs.append(x)
+
+        # stage 2
+        x = self._forward_stage(x, B, self.patch_embed2, self.block2,
+                                self.norm2, self.feat_channels[1])
+        outs.append(x)
+
+        # stage 3
+        x = self._forward_stage(x, B, self.patch_embed3, self.block3,
+                                self.norm3, self.feat_channels[2])
+        outs.append(x)
+
+        # stage 4
+        x = self._forward_stage(x, B, self.patch_embed4, self.block4,
+                                self.norm4, self.feat_channels[3])
+        outs.append(x)
+
+        return outs
+
+    def forward(self, x):
+        """Return the multi-scale feature list; no classifier head is applied."""
+        return self.forward_features(x)
+
+
+class DWConv(nn.Layer):
+    """Depthwise 3x3 convolution applied to a token sequence.
+
+    Args:
+        dim (int): Channel count; the convolution uses ``groups=dim`` so
+            every channel is filtered independently. Default: 768.
+    """
+
+    def __init__(self, dim=768):
+        super(DWConv, self).__init__()
+        self.dim = dim
+        self.dwconv = nn.Conv2D(dim, dim, 3, 1, 1, bias_attr=True, groups=dim)
+
+    def forward(self, x, H, W):
+        """Reshape tokens to a feature map, convolve, and flatten back.
+
+        Args:
+            x (Tensor): Token tensor; the reshape below requires it to hold
+                ``B * self.dim * H * W`` elements (presumably ``[B, H*W, dim]``).
+            H: Height used for the intermediate feature map.
+            W: Width used for the intermediate feature map.
+
+        Returns:
+            Tensor: Convolved features, flattened from dimension 2 on with
+            the last two axes swapped.
+        """
+        B = paddle.shape(x)[0]
+        x = x.transpose([0, 2, 1]).reshape([B, self.dim, H, W])
+        x = self.dwconv(x)
+
+        return x.flatten(2).transpose([0, 2, 1])
+
+
+@manager.BACKBONES.add_component
+def MixVisionTransformer_B0(**kwargs):
+    """Build the B0 variant: widths [32, 64, 160, 256], depths [2, 2, 2, 2].
+
+    Extra keyword arguments are forwarded to ``MixVisionTransformer``.
+    """
+    layer_norm = partial(nn.LayerNorm, epsilon=1e-6)
+    return MixVisionTransformer(
+        embed_dims=[32, 64, 160, 256],
+        depths=[2, 2, 2, 2],
+        num_heads=[1, 2, 5, 8],
+        sr_ratios=[8, 4, 2, 1],
+        mlp_ratios=[4, 4, 4, 4],
+        patch_size=4,
+        qkv_bias=True,
+        norm_layer=layer_norm,
+        drop_rate=0.0,
+        drop_path_rate=0.1,
+        **kwargs)
+
+
+@manager.BACKBONES.add_component
+def MixVisionTransformer_B1(**kwargs):
+    """Build the B1 variant: widths [64, 128, 320, 512], depths [2, 2, 2, 2].
+
+    Extra keyword arguments are forwarded to ``MixVisionTransformer``.
+    """
+    layer_norm = partial(nn.LayerNorm, epsilon=1e-6)
+    return MixVisionTransformer(
+        embed_dims=[64, 128, 320, 512],
+        depths=[2, 2, 2, 2],
+        num_heads=[1, 2, 5, 8],
+        sr_ratios=[8, 4, 2, 1],
+        mlp_ratios=[4, 4, 4, 4],
+        patch_size=4,
+        qkv_bias=True,
+        norm_layer=layer_norm,
+        drop_rate=0.0,
+        drop_path_rate=0.1,
+        **kwargs)
+
+
+@manager.BACKBONES.add_component
+def MixVisionTransformer_B2(**kwargs):
+    """Build the B2 variant: widths [64, 128, 320, 512], depths [3, 4, 6, 3].
+
+    Extra keyword arguments are forwarded to ``MixVisionTransformer``.
+    """
+    layer_norm = partial(nn.LayerNorm, epsilon=1e-6)
+    return MixVisionTransformer(
+        embed_dims=[64, 128, 320, 512],
+        depths=[3, 4, 6, 3],
+        num_heads=[1, 2, 5, 8],
+        sr_ratios=[8, 4, 2, 1],
+        mlp_ratios=[4, 4, 4, 4],
+        patch_size=4,
+        qkv_bias=True,
+        norm_layer=layer_norm,
+        drop_rate=0.0,
+        drop_path_rate=0.1,
+        **kwargs)
+
+
+@manager.BACKBONES.add_component
+def MixVisionTransformer_B3(**kwargs):
+    """Build the B3 variant: widths [64, 128, 320, 512], depths [3, 4, 18, 3].
+
+    Extra keyword arguments are forwarded to ``MixVisionTransformer``.
+    """
+    layer_norm = partial(nn.LayerNorm, epsilon=1e-6)
+    return MixVisionTransformer(
+        embed_dims=[64, 128, 320, 512],
+        depths=[3, 4, 18, 3],
+        num_heads=[1, 2, 5, 8],
+        sr_ratios=[8, 4, 2, 1],
+        mlp_ratios=[4, 4, 4, 4],
+        patch_size=4,
+        qkv_bias=True,
+        norm_layer=layer_norm,
+        drop_rate=0.0,
+        drop_path_rate=0.1,
+        **kwargs)
+
+
+@manager.BACKBONES.add_component
+def MixVisionTransformer_B4(**kwargs):
+    """Build the B4 variant: widths [64, 128, 320, 512], depths [3, 8, 27, 3].
+
+    Extra keyword arguments are forwarded to ``MixVisionTransformer``.
+    """
+    layer_norm = partial(nn.LayerNorm, epsilon=1e-6)
+    return MixVisionTransformer(
+        embed_dims=[64, 128, 320, 512],
+        depths=[3, 8, 27, 3],
+        num_heads=[1, 2, 5, 8],
+        sr_ratios=[8, 4, 2, 1],
+        mlp_ratios=[4, 4, 4, 4],
+        patch_size=4,
+        qkv_bias=True,
+        norm_layer=layer_norm,
+        drop_rate=0.0,
+        drop_path_rate=0.1,
+        **kwargs)
+
+
+@manager.BACKBONES.add_component
+def MixVisionTransformer_B5(**kwargs):
+    """Build the B5 variant: widths [64, 128, 320, 512], depths [3, 6, 40, 3].
+
+    Extra keyword arguments are forwarded to ``MixVisionTransformer``.
+    """
+    layer_norm = partial(nn.LayerNorm, epsilon=1e-6)
+    return MixVisionTransformer(
+        embed_dims=[64, 128, 320, 512],
+        depths=[3, 6, 40, 3],
+        num_heads=[1, 2, 5, 8],
+        sr_ratios=[8, 4, 2, 1],
+        mlp_ratios=[4, 4, 4, 4],
+        patch_size=4,
+        qkv_bias=True,
+        norm_layer=layer_norm,
+        drop_rate=0.0,
+        drop_path_rate=0.1,
+        **kwargs)
--- a/paddlers/models/ppseg/models/backbones/mobilenetv2.py
+++ b/paddlers/models/ppseg/models/backbones/mobilenetv2.py
@ -0,0 +1,168 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.nn as nn
+
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg import utils
+
+
+@manager.BACKBONES.add_component
+class MobileNetV2(nn.Layer):
+    """
+    The MobileNetV2 implementation based on PaddlePaddle.
+
+    The original article refers to
+    Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen
+    "MobileNetV2: Inverted Residuals and Linear Bottlenecks"
+    (https://arxiv.org/abs/1801.04381).
+
+    Args:
+        channel_ratio (float, optional): The ratio of channel. Default: 1.0
+        min_channel (int, optional): The minimum of channel. Default: 16
+        pretrained (str, optional): The path or url of pretrained model. Default: None
+
+    Attributes:
+        feat_channels (list[int]): Channel counts of the four feature maps
+            returned by ``forward`` (outputs of stage2, stage3, stage5 and
+            stage7), in that order.
+    """
+
+    def __init__(self, channel_ratio=1.0, min_channel=16, pretrained=None):
+        super(MobileNetV2, self).__init__()
+        self.channel_ratio = channel_ratio
+        self.min_channel = min_channel
+        self.pretrained = pretrained
+
+        # Expose the output widths like the other backbones do, so heads
+        # that size themselves from `backbone.feat_channels` can use this one.
+        self.feat_channels = [self.depth(c) for c in (24, 32, 96, 320)]
+
+        self.stage0 = conv_bn(3, self.depth(32), 3, 2)
+
+        self.stage1 = InvertedResidual(self.depth(32), self.depth(16), 1, 1)
+
+        self.stage2 = nn.Sequential(
+            InvertedResidual(self.depth(16), self.depth(24), 2, 6),
+            InvertedResidual(self.depth(24), self.depth(24), 1, 6),
+        )
+
+        self.stage3 = nn.Sequential(
+            InvertedResidual(self.depth(24), self.depth(32), 2, 6),
+            InvertedResidual(self.depth(32), self.depth(32), 1, 6),
+            InvertedResidual(self.depth(32), self.depth(32), 1, 6),
+        )
+
+        self.stage4 = nn.Sequential(
+            InvertedResidual(self.depth(32), self.depth(64), 2, 6),
+            InvertedResidual(self.depth(64), self.depth(64), 1, 6),
+            InvertedResidual(self.depth(64), self.depth(64), 1, 6),
+            InvertedResidual(self.depth(64), self.depth(64), 1, 6),
+        )
+
+        self.stage5 = nn.Sequential(
+            InvertedResidual(self.depth(64), self.depth(96), 1, 6),
+            InvertedResidual(self.depth(96), self.depth(96), 1, 6),
+            InvertedResidual(self.depth(96), self.depth(96), 1, 6),
+        )
+
+        self.stage6 = nn.Sequential(
+            InvertedResidual(self.depth(96), self.depth(160), 2, 6),
+            InvertedResidual(self.depth(160), self.depth(160), 1, 6),
+            InvertedResidual(self.depth(160), self.depth(160), 1, 6),
+        )
+
+        self.stage7 = InvertedResidual(self.depth(160), self.depth(320), 1, 6)
+
+        self.init_weight()
+
+    def depth(self, channels):
+        """Scale ``channels`` by ``channel_ratio`` with a lower bound.
+
+        The bound is ``min(channels, self.min_channel)``, so a layer is never
+        shrunk below ``min_channel`` unless it was already narrower.
+        """
+        min_channel = min(channels, self.min_channel)
+        return max(min_channel, int(channels * self.channel_ratio))
+
+    def forward(self, x):
+        """Return the outputs of stage2, stage3, stage5 and stage7 as a list."""
+        feature_1_2 = self.stage0(x)
+        feature_1_2 = self.stage1(feature_1_2)
+        feature_1_4 = self.stage2(feature_1_2)
+        feature_1_8 = self.stage3(feature_1_4)
+        feature_1_16 = self.stage4(feature_1_8)
+        feature_1_16 = self.stage5(feature_1_16)
+        feature_1_32 = self.stage6(feature_1_16)
+        feature_1_32 = self.stage7(feature_1_32)
+        return [feature_1_4, feature_1_8, feature_1_16, feature_1_32]
+
+    def init_weight(self):
+        """Load the whole model from ``self.pretrained`` when it is set."""
+        if self.pretrained is not None:
+            utils.load_entire_model(self, self.pretrained)
+
+
+def conv_bn(inp, oup, kernel, stride):
+    """Return a bias-free Conv2D -> BatchNorm2D -> ReLU stack.
+
+    Args:
+        inp (int): Input channels of the convolution.
+        oup (int): Output channels of the convolution and batch norm.
+        kernel (int): Kernel size; the padding is ``(kernel - 1) // 2``.
+        stride (int): Stride of the convolution.
+    """
+    pad = (kernel - 1) // 2
+    stack = [
+        nn.Conv2D(
+            in_channels=inp,
+            out_channels=oup,
+            kernel_size=kernel,
+            stride=stride,
+            padding=pad,
+            bias_attr=False),
+        nn.BatchNorm2D(num_features=oup, epsilon=1e-05, momentum=0.1),
+        nn.ReLU(),
+    ]
+    return nn.Sequential(*stack)
+
+
+class InvertedResidual(nn.Layer):
+    """1x1 expansion, 3x3 depthwise and 1x1 projection convolution block.
+
+    Args:
+        inp (int): Input channels.
+        oup (int): Output channels.
+        stride (int): Stride of the depthwise convolution; must be 1 or 2.
+        expand_ratio (int): The hidden width is ``inp * expand_ratio``.
+        dilation (int, optional): Dilation (and padding) of the depthwise
+            convolution. Default: 1.
+    """
+
+    def __init__(self, inp, oup, stride, expand_ratio, dilation=1):
+        super(InvertedResidual, self).__init__()
+        assert stride in [1, 2]
+        self.stride = stride
+        # The skip connection needs matching spatial size and channel count.
+        self.use_res_connect = self.stride == 1 and inp == oup
+
+        hidden = inp * expand_ratio
+        stack = [
+            # pointwise expansion
+            nn.Conv2D(
+                inp,
+                hidden,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+                dilation=1,
+                groups=1,
+                bias_attr=False),
+            nn.BatchNorm2D(num_features=hidden, epsilon=1e-05, momentum=0.1),
+            nn.ReLU(),
+            # depthwise 3x3
+            nn.Conv2D(
+                hidden,
+                hidden,
+                kernel_size=3,
+                stride=stride,
+                padding=dilation,
+                dilation=dilation,
+                groups=hidden,
+                bias_attr=False),
+            nn.BatchNorm2D(num_features=hidden, epsilon=1e-05, momentum=0.1),
+            nn.ReLU(),
+            # pointwise projection, no activation afterwards
+            nn.Conv2D(
+                hidden,
+                oup,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+                dilation=1,
+                groups=1,
+                bias_attr=False),
+            nn.BatchNorm2D(num_features=oup, epsilon=1e-05, momentum=0.1),
+        ]
+        self.conv = nn.Sequential(*stack)
+
+    def forward(self, x):
+        """Apply the block, adding the input back when the shapes allow it."""
+        if not self.use_res_connect:
+            return self.conv(x)
+        return x + self.conv(x)
--- a/paddlers/models/ppseg/models/backbones/mobilenetv3.py
+++ b/paddlers/models/ppseg/models/backbones/mobilenetv3.py
@ -0,0 +1,364 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.utils import utils
+from paddlers.models.ppseg.models import layers
+
+# Public factory functions exported by this module. Every name below is
+# defined in this file; only the x1_0 variants are additionally registered
+# with manager.BACKBONES.
+__all__ = [
+    "MobileNetV3_small_x0_35", "MobileNetV3_small_x0_5",
+    "MobileNetV3_small_x0_75", "MobileNetV3_small_x1_0",
+    "MobileNetV3_small_x1_25", "MobileNetV3_large_x0_35",
+    "MobileNetV3_large_x0_5", "MobileNetV3_large_x0_75",
+    "MobileNetV3_large_x1_0", "MobileNetV3_large_x1_25"
+]
+
+
+def make_divisible(v, divisor=8, min_value=None):
+    """Round ``v`` to a nearby multiple of ``divisor``.
+
+    Args:
+        v (float): Value to round.
+        divisor (int, optional): The rounding granularity. Default: 8.
+        min_value (int, optional): Lower bound of the result before the final
+            correction; defaults to ``divisor``.
+
+    Returns:
+        int: ``max(min_value, v rounded to the nearest multiple)``, bumped by
+        one ``divisor`` when that would fall below 90% of ``v``.
+    """
+    lower_bound = divisor if min_value is None else min_value
+    rounded = int(v + divisor / 2) // divisor * divisor
+    result = max(lower_bound, rounded)
+    if result < 0.9 * v:
+        result += divisor
+    return result
+
+
+class MobileNetV3(nn.Layer):
+    """
+    The MobileNetV3 implementation based on PaddlePaddle.
+
+    The original article refers to
+    Andrew Howard, et al. "Searching for MobileNetV3"
+    (https://arxiv.org/pdf/1905.02244.pdf).
+
+    Args:
+        pretrained (str, optional): The path of pretrained model. Default: None.
+        scale (float, optional): The scale of channels. Default: 1.0.
+        model_name (str, optional): Model name. It determines the type of MobileNetV3. The value is 'small' or 'large'. Default: 'small'.
+        output_stride (int, optional): The stride of output features compared to input images. The value should be one of (2, 4, 8, 16, 32). Default: None.
+
+    Attributes:
+        out_indices (list[int]): Indices into ``block_list`` whose outputs
+            are returned by ``forward``.
+        feat_channels (list[int]): Channel counts of those outputs.
+
+    Raises:
+        NotImplementedError: If ``model_name`` is neither 'small' nor 'large'.
+        ValueError: If ``output_stride`` is an odd number.
+    """
+
+    def __init__(self,
+                 pretrained=None,
+                 scale=1.0,
+                 model_name="small",
+                 output_stride=None):
+        super(MobileNetV3, self).__init__()
+
+        inplanes = 16
+        if model_name == "large":
+            self.cfg = [
+                # k, exp, c,  se,     nl,  s,
+                [3, 16, 16, False, "relu", 1],
+                [3, 64, 24, False, "relu", 2],
+                [3, 72, 24, False, "relu", 1],  # output 1 -> out_index=2
+                [5, 72, 40, True, "relu", 2],
+                [5, 120, 40, True, "relu", 1],
+                [5, 120, 40, True, "relu", 1],  # output 2 -> out_index=5
+                [3, 240, 80, False, "hard_swish", 2],
+                [3, 200, 80, False, "hard_swish", 1],
+                [3, 184, 80, False, "hard_swish", 1],
+                [3, 184, 80, False, "hard_swish", 1],
+                [3, 480, 112, True, "hard_swish", 1],
+                [3, 672, 112, True, "hard_swish",
+                 1],  # output 3 -> out_index=11
+                [5, 672, 160, True, "hard_swish", 2],
+                [5, 960, 160, True, "hard_swish", 1],
+                [5, 960, 160, True, "hard_swish",
+                 1],  # output 4 -> out_index=14
+            ]
+            self.out_indices = [2, 5, 11, 14]
+            self.feat_channels = [
+                make_divisible(i * scale) for i in [24, 40, 112, 160]
+            ]
+
+            self.cls_ch_squeeze = 960
+            self.cls_ch_expand = 1280
+        elif model_name == "small":
+            self.cfg = [
+                # k, exp, c,  se,     nl,  s,
+                [3, 16, 16, True, "relu", 2],  # output 1 -> out_index=0
+                [3, 72, 24, False, "relu", 2],
+                [3, 88, 24, False, "relu", 1],  # output 2 -> out_index=2
+                [5, 96, 40, True, "hard_swish", 2],
+                [5, 240, 40, True, "hard_swish", 1],
+                [5, 240, 40, True, "hard_swish", 1],
+                [5, 120, 48, True, "hard_swish", 1],
+                [5, 144, 48, True, "hard_swish", 1],  # output 3 -> out_index=7
+                [5, 288, 96, True, "hard_swish", 2],
+                [5, 576, 96, True, "hard_swish", 1],
+                [5, 576, 96, True, "hard_swish", 1],  # output 4 -> out_index=10
+            ]
+            # Index 2 is the last 24-channel block. Index 3 is already the
+            # stride-2, 40-channel block and would contradict feat_channels.
+            self.out_indices = [0, 2, 7, 10]
+            self.feat_channels = [
+                make_divisible(i * scale) for i in [16, 24, 48, 96]
+            ]
+
+            self.cls_ch_squeeze = 576
+            self.cls_ch_expand = 1280
+        else:
+            raise NotImplementedError(
+                "mode[{}_model] is not implemented!".format(model_name))
+
+        ###################################################
+        # modify stride and dilation based on output_stride
+        self.dilation_cfg = [1] * len(self.cfg)
+        self.modify_bottle_params(output_stride=output_stride)
+        ###################################################
+
+        self.conv1 = ConvBNLayer(
+            in_c=3,
+            out_c=make_divisible(inplanes * scale),
+            filter_size=3,
+            stride=2,
+            padding=1,
+            num_groups=1,
+            if_act=True,
+            act="hard_swish")
+
+        # Plain list for ordered iteration in forward(); every unit is also
+        # registered through add_sublayer so its parameters are tracked.
+        self.block_list = []
+
+        inplanes = make_divisible(inplanes * scale)
+        for i, (k, exp, c, se, nl, s) in enumerate(self.cfg):
+            ######################################
+            # add dilation rate
+            dilation_rate = self.dilation_cfg[i]
+            ######################################
+            self.block_list.append(
+                ResidualUnit(
+                    in_c=inplanes,
+                    mid_c=make_divisible(scale * exp),
+                    out_c=make_divisible(scale * c),
+                    filter_size=k,
+                    stride=s,
+                    dilation=dilation_rate,
+                    use_se=se,
+                    act=nl,
+                    name="conv" + str(i + 2)))
+            self.add_sublayer(
+                sublayer=self.block_list[-1], name="conv" + str(i + 2))
+            inplanes = make_divisible(scale * c)
+
+        self.pretrained = pretrained
+        self.init_weight()
+
+    def modify_bottle_params(self, output_stride=None):
+        """Trade strides for dilation so the total stride stays within ``output_stride``.
+
+        Walks ``self.cfg`` in order, starting from the stride of 2 applied by
+        ``conv1``. Once the accumulated stride would exceed
+        ``output_stride``, the block's stride is reset to 1 in ``self.cfg``
+        and the dilation rate is multiplied by the original stride instead.
+        ``self.dilation_cfg`` receives the rate for every block. Nothing is
+        changed when ``output_stride`` is None.
+
+        Raises:
+            ValueError: If ``output_stride`` is an odd number.
+        """
+        if output_stride is None:
+            return
+        if output_stride % 2 != 0:
+            raise ValueError("output stride must to be even number")
+
+        stride = 2
+        rate = 1
+        for i, _cfg in enumerate(self.cfg):
+            stride = stride * _cfg[-1]
+            if stride > output_stride:
+                rate = rate * _cfg[-1]
+                self.cfg[i][-1] = 1
+
+            self.dilation_cfg[i] = rate
+
+    def forward(self, inputs, label=None):
+        """Return the outputs of the blocks listed in ``self.out_indices``.
+
+        Args:
+            inputs (Tensor): Input fed to ``conv1``.
+            label: Unused; kept for interface compatibility.
+        """
+        x = self.conv1(inputs)
+        # A feature list saves each downsampling feature.
+        feat_list = []
+        for i, block in enumerate(self.block_list):
+            x = block(x)
+            if i in self.out_indices:
+                feat_list.append(x)
+
+        return feat_list
+
+    def init_weight(self):
+        """Load pretrained weights when ``self.pretrained`` is set."""
+        if self.pretrained is not None:
+            utils.load_pretrained_model(self, self.pretrained)
+
+
+class ConvBNLayer(nn.Layer):
+    """Bias-free Conv2D followed by SyncBatchNorm and an optional activation.
+
+    Args:
+        in_c (int): Input channels.
+        out_c (int): Output channels.
+        filter_size (int): Convolution kernel size.
+        stride (int): Convolution stride.
+        padding (int|str): Convolution padding (callers in this file pass
+            integers and ``'same'``).
+        dilation (int, optional): Convolution dilation. Default: 1.
+        num_groups (int, optional): Convolution groups. Default: 1.
+        if_act (bool, optional): Whether ``forward`` applies the activation.
+            Default: True.
+        act (str, optional): Activation name, e.g. ``"relu"`` or
+            ``"hard_swish"``. ``None`` keeps the historical hardswish.
+            Default: None.
+    """
+
+    # The cfg tables in this file spell the activation "hard_swish", while
+    # layers.Activation is addressed as 'hardswish'.
+    _ACT_ALIASES = {"hard_swish": "hardswish"}
+
+    def __init__(self,
+                 in_c,
+                 out_c,
+                 filter_size,
+                 stride,
+                 padding,
+                 dilation=1,
+                 num_groups=1,
+                 if_act=True,
+                 act=None):
+        super(ConvBNLayer, self).__init__()
+        self.if_act = if_act
+        self.act = act
+
+        self.conv = nn.Conv2D(
+            in_channels=in_c,
+            out_channels=out_c,
+            kernel_size=filter_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=num_groups,
+            bias_attr=False)
+        # Batch-norm scale and shift are excluded from weight decay.
+        self.bn = layers.SyncBatchNorm(
+            num_features=out_c,
+            weight_attr=paddle.ParamAttr(
+                regularizer=paddle.regularizer.L2Decay(0.0)),
+            bias_attr=paddle.ParamAttr(
+                regularizer=paddle.regularizer.L2Decay(0.0)))
+        # Honour the requested activation instead of always using hardswish;
+        # otherwise the "relu" rows of the network cfg never get a ReLU.
+        # NOTE(review): assumes layers.Activation accepts lower-case paddle
+        # activation names such as 'relu' — confirm. Checkpoints trained with
+        # the former always-hardswish behaviour may need re-validation.
+        if act is None:
+            act_name = 'hardswish'
+        else:
+            act_name = self._ACT_ALIASES.get(act, act)
+        self._act_op = layers.Activation(act=act_name)
+
+    def forward(self, x):
+        """Apply convolution, batch norm and, if enabled, the activation."""
+        x = self.conv(x)
+        x = self.bn(x)
+        if self.if_act:
+            x = self._act_op(x)
+        return x
+
+
+class ResidualUnit(nn.Layer):
+    """Expand / depthwise / project unit with optional SE and shortcut.
+
+    Args:
+        in_c (int): Input channels.
+        mid_c (int): Channels of the expanded representation.
+        out_c (int): Output channels.
+        filter_size (int): Kernel size of the depthwise convolution.
+        stride (int): Stride of the depthwise convolution.
+        use_se (bool): Whether an ``SEModule`` follows the depthwise conv.
+        dilation (int, optional): Dilation of the depthwise convolution.
+            Default: 1.
+        act (str, optional): Activation name handed to the first two
+            ``ConvBNLayer`` instances. Default: None.
+        name (str, optional): Prefix of the name handed to ``SEModule``.
+    """
+
+    def __init__(self,
+                 in_c,
+                 mid_c,
+                 out_c,
+                 filter_size,
+                 stride,
+                 use_se,
+                 dilation=1,
+                 act=None,
+                 name=''):
+        super(ResidualUnit, self).__init__()
+        self.dilation = dilation
+        self.if_se = use_se
+        # The shortcut is only valid when input and output shapes agree.
+        self.if_shortcut = stride == 1 and in_c == out_c
+
+        # 1x1 expansion
+        self.expand_conv = ConvBNLayer(
+            in_c=in_c,
+            out_c=mid_c,
+            filter_size=1,
+            stride=1,
+            padding=0,
+            if_act=True,
+            act=act)
+
+        # depthwise convolution (one group per channel)
+        self.bottleneck_conv = ConvBNLayer(
+            in_c=mid_c,
+            out_c=mid_c,
+            filter_size=filter_size,
+            stride=stride,
+            padding='same',
+            dilation=dilation,
+            num_groups=mid_c,
+            if_act=True,
+            act=act)
+
+        if use_se:
+            self.mid_se = SEModule(mid_c, name=name + "_se")
+
+        # 1x1 projection without activation
+        self.linear_conv = ConvBNLayer(
+            in_c=mid_c,
+            out_c=out_c,
+            filter_size=1,
+            stride=1,
+            padding=0,
+            if_act=False,
+            act=None)
+
+    def forward(self, inputs):
+        """Run the unit and add ``inputs`` back when the shortcut is enabled."""
+        feat = self.bottleneck_conv(self.expand_conv(inputs))
+        if self.if_se:
+            feat = self.mid_se(feat)
+        feat = self.linear_conv(feat)
+        if not self.if_shortcut:
+            return feat
+        return inputs + feat
+
+
+class SEModule(nn.Layer):
+    """Channel gating: global average pool, two 1x1 convs, hard-sigmoid scale.
+
+    Args:
+        channel (int): Channels of the input and of the produced gate.
+        reduction (int, optional): The hidden width is
+            ``channel // reduction``. Default: 4.
+        name (str, optional): Accepted for interface compatibility; unused.
+    """
+
+    def __init__(self, channel, reduction=4, name=""):
+        super(SEModule, self).__init__()
+        squeezed = channel // reduction
+        self.avg_pool = nn.AdaptiveAvgPool2D(1)
+        self.conv1 = nn.Conv2D(
+            in_channels=channel,
+            out_channels=squeezed,
+            kernel_size=1,
+            stride=1,
+            padding=0)
+        self.conv2 = nn.Conv2D(
+            in_channels=squeezed,
+            out_channels=channel,
+            kernel_size=1,
+            stride=1,
+            padding=0)
+
+    def forward(self, inputs):
+        """Return ``inputs`` multiplied by the computed per-channel gate."""
+        gate = self.avg_pool(inputs)
+        gate = F.relu(self.conv1(gate))
+        gate = F.hardsigmoid(self.conv2(gate))
+        return paddle.multiply(x=inputs, y=gate)
+
+
+def MobileNetV3_small_x0_35(**kwargs):
+    """Build the 'small' MobileNetV3 with channel scale 0.35.
+
+    Extra keyword arguments are forwarded to ``MobileNetV3``.
+    """
+    return MobileNetV3(model_name="small", scale=0.35, **kwargs)
+
+
+def MobileNetV3_small_x0_5(**kwargs):
+    """Build the 'small' MobileNetV3 with channel scale 0.5.
+
+    Extra keyword arguments are forwarded to ``MobileNetV3``.
+    """
+    return MobileNetV3(model_name="small", scale=0.5, **kwargs)
+
+
+def MobileNetV3_small_x0_75(**kwargs):
+    """Build the 'small' MobileNetV3 with channel scale 0.75.
+
+    Extra keyword arguments are forwarded to ``MobileNetV3``.
+    """
+    return MobileNetV3(model_name="small", scale=0.75, **kwargs)
+
+
+@manager.BACKBONES.add_component
+def MobileNetV3_small_x1_0(**kwargs):
+    """Build the 'small' MobileNetV3 with channel scale 1.0.
+
+    Extra keyword arguments are forwarded to ``MobileNetV3``.
+    """
+    return MobileNetV3(model_name="small", scale=1.0, **kwargs)
+
+
+def MobileNetV3_small_x1_25(**kwargs):
+    """Build the 'small' MobileNetV3 with channel scale 1.25.
+
+    Extra keyword arguments are forwarded to ``MobileNetV3``.
+    """
+    return MobileNetV3(model_name="small", scale=1.25, **kwargs)
+
+
+def MobileNetV3_large_x0_35(**kwargs):
+    """Build the 'large' MobileNetV3 with channel scale 0.35.
+
+    Extra keyword arguments are forwarded to ``MobileNetV3``.
+    """
+    return MobileNetV3(model_name="large", scale=0.35, **kwargs)
+
+
+def MobileNetV3_large_x0_5(**kwargs):
+    """Build the 'large' MobileNetV3 with channel scale 0.5.
+
+    Extra keyword arguments are forwarded to ``MobileNetV3``.
+    """
+    return MobileNetV3(model_name="large", scale=0.5, **kwargs)
+
+
+def MobileNetV3_large_x0_75(**kwargs):
+    """Build the 'large' MobileNetV3 with channel scale 0.75.
+
+    Extra keyword arguments are forwarded to ``MobileNetV3``.
+    """
+    return MobileNetV3(model_name="large", scale=0.75, **kwargs)
+
+
+@manager.BACKBONES.add_component
+def MobileNetV3_large_x1_0(**kwargs):
+    """Build the 'large' MobileNetV3 with channel scale 1.0.
+
+    Extra keyword arguments are forwarded to ``MobileNetV3``.
+    """
+    return MobileNetV3(model_name="large", scale=1.0, **kwargs)
+
+
+def MobileNetV3_large_x1_25(**kwargs):
+    """Build the 'large' MobileNetV3 with channel scale 1.25.
+
+    Extra keyword arguments are forwarded to ``MobileNetV3``.
+    """
+    return MobileNetV3(model_name="large", scale=1.25, **kwargs)
--- a/paddlers/models/ppseg/models/backbones/resnet_vd.py
+++ b/paddlers/models/ppseg/models/backbones/resnet_vd.py
@ -0,0 +1,398 @@
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.models import layers
+from paddlers.models.ppseg.utils import utils
+
+__all__ = [
+    "ResNet18_vd", "ResNet34_vd", "ResNet50_vd", "ResNet101_vd", "ResNet152_vd"
+]
+
+
+class ConvBNLayer(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 dilation=1,
+                 groups=1,
+                 is_vd_mode=False,
+                 act=None,
+                 data_format='NCHW'):
+        super(ConvBNLayer, self).__init__()
+        if dilation != 1 and kernel_size != 3:
+            raise RuntimeError("When the dilation isn't 1," \
+                "the kernel_size should be 3.")
+
+        self.is_vd_mode = is_vd_mode
+        self._pool2d_avg = nn.AvgPool2D(
+            kernel_size=2,
+            stride=2,
+            padding=0,
+            ceil_mode=True,
+            data_format=data_format)
+        self._conv = nn.Conv2D(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=(kernel_size - 1) // 2 \
+                if dilation == 1 else dilation,
+            dilation=dilation,
+            groups=groups,
+            bias_attr=False,
+            data_format=data_format)
+
+        self._batch_norm = layers.SyncBatchNorm(
+            out_channels, data_format=data_format)
+        self._act_op = layers.Activation(act=act)
+
+    def forward(self, inputs):
+        if self.is_vd_mode:
+            inputs = self._pool2d_avg(inputs)
+        y = self._conv(inputs)
+        y = self._batch_norm(y)
+        y = self._act_op(y)
+
+        return y
+
+
+class BottleneckBlock(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 stride,
+                 shortcut=True,
+                 if_first=False,
+                 dilation=1,
+                 data_format='NCHW'):
+        super(BottleneckBlock, self).__init__()
+
+        self.data_format = data_format
+        self.conv0 = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=1,
+            act='relu',
+            data_format=data_format)
+
+        self.dilation = dilation
+
+        self.conv1 = ConvBNLayer(
+            in_channels=out_channels,
+            out_channels=out_channels,
+            kernel_size=3,
+            stride=stride,
+            act='relu',
+            dilation=dilation,
+            data_format=data_format)
+        self.conv2 = ConvBNLayer(
+            in_channels=out_channels,
+            out_channels=out_channels * 4,
+            kernel_size=1,
+            act=None,
+            data_format=data_format)
+
+        if not shortcut:
+            self.short = ConvBNLayer(
+                in_channels=in_channels,
+                out_channels=out_channels * 4,
+                kernel_size=1,
+                stride=1,
+                is_vd_mode=False if if_first or stride == 1 else True,
+                data_format=data_format)
+
+        self.shortcut = shortcut
+        # NOTE: Use the wrap layer for quantization training
+        self.add = layers.Add()
+        self.relu = layers.Activation(act="relu")
+
+    def forward(self, inputs):
+        y = self.conv0(inputs)
+        conv1 = self.conv1(y)
+        conv2 = self.conv2(conv1)
+
+        if self.shortcut:
+            short = inputs
+        else:
+            short = self.short(inputs)
+
+        y = self.add(short, conv2)
+        y = self.relu(y)
+        return y
+
+
+class BasicBlock(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 stride,
+                 dilation=1,
+                 shortcut=True,
+                 if_first=False,
+                 data_format='NCHW'):
+        super(BasicBlock, self).__init__()
+        self.conv0 = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=3,
+            stride=stride,
+            dilation=dilation,
+            act='relu',
+            data_format=data_format)
+        self.conv1 = ConvBNLayer(
+            in_channels=out_channels,
+            out_channels=out_channels,
+            kernel_size=3,
+            dilation=dilation,
+            act=None,
+            data_format=data_format)
+
+        if not shortcut:
+            self.short = ConvBNLayer(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=1,
+                stride=1,
+                is_vd_mode=False if if_first or stride == 1 else True,
+                data_format=data_format)
+
+        self.shortcut = shortcut
+        self.dilation = dilation
+        self.data_format = data_format
+        self.add = layers.Add()
+        self.relu = layers.Activation(act="relu")
+
+    def forward(self, inputs):
+        y = self.conv0(inputs)
+        conv1 = self.conv1(y)
+
+        if self.shortcut:
+            short = inputs
+        else:
+            short = self.short(inputs)
+        y = self.add(short, conv1)
+        y = self.relu(y)
+
+        return y
+
+
+class ResNet_vd(nn.Layer):
+    """
+    The ResNet_vd implementation based on PaddlePaddle.
+
+    The original article refers to Jingdong
+    Tong He, et, al. "Bag of Tricks for Image Classification with Convolutional Neural Networks"
+    (https://arxiv.org/pdf/1812.01187.pdf).
+
+    Args:
+        layers (int, optional): The layers of ResNet_vd. The supported layers are (18, 34, 50, 101, 152, 200). Default: 50.
+        output_stride (int, optional): The stride of output features compared to input images. It is 8 or 16. Default: 8.
+        multi_grid (tuple|list, optional): The grid of stage4. Defult: (1, 1, 1).
+        pretrained (str, optional): The path of pretrained model.
+
+    """
+
+    def __init__(self,
+                 layers=50,
+                 output_stride=8,
+                 multi_grid=(1, 1, 1),
+                 pretrained=None,
+                 data_format='NCHW'):
+        super(ResNet_vd, self).__init__()
+
+        self.data_format = data_format
+        self.conv1_logit = None  # for gscnn shape stream
+        self.layers = layers
+        supported_layers = [18, 34, 50, 101, 152, 200]
+        assert layers in supported_layers, \
+            "supported layers are {} but input layer is {}".format(
+                supported_layers, layers)
+
+        if layers == 18:
+            depth = [2, 2, 2, 2]
+        elif layers == 34 or layers == 50:
+            depth = [3, 4, 6, 3]
+        elif layers == 101:
+            depth = [3, 4, 23, 3]
+        elif layers == 152:
+            depth = [3, 8, 36, 3]
+        elif layers == 200:
+            depth = [3, 12, 48, 3]
+        num_channels = [64, 256, 512, 1024
+                        ] if layers >= 50 else [64, 64, 128, 256]
+        num_filters = [64, 128, 256, 512]
+
+        # for channels of four returned stages
+        self.feat_channels = [c * 4 for c in num_filters
+                              ] if layers >= 50 else num_filters
+
+        dilation_dict = None
+        if output_stride == 8:
+            dilation_dict = {2: 2, 3: 4}
+        elif output_stride == 16:
+            dilation_dict = {3: 2}
+
+        self.conv1_1 = ConvBNLayer(
+            in_channels=3,
+            out_channels=32,
+            kernel_size=3,
+            stride=2,
+            act='relu',
+            data_format=data_format)
+        self.conv1_2 = ConvBNLayer(
+            in_channels=32,
+            out_channels=32,
+            kernel_size=3,
+            stride=1,
+            act='relu',
+            data_format=data_format)
+        self.conv1_3 = ConvBNLayer(
+            in_channels=32,
+            out_channels=64,
+            kernel_size=3,
+            stride=1,
+            act='relu',
+            data_format=data_format)
+        self.pool2d_max = nn.MaxPool2D(
+            kernel_size=3, stride=2, padding=1, data_format=data_format)
+
+        # self.block_list = []
+        self.stage_list = []
+        if layers >= 50:
+            for block in range(len(depth)):
+                shortcut = False
+                block_list = []
+                for i in range(depth[block]):
+                    if layers in [101, 152] and block == 2:
+                        if i == 0:
+                            conv_name = "res" + str(block + 2) + "a"
+                        else:
+                            conv_name = "res" + str(block + 2) + "b" + str(i)
+                    else:
+                        conv_name = "res" + str(block + 2) + chr(97 + i)
+
+                    ###############################################################################
+                    # Add dilation rate for some segmentation tasks, if dilation_dict is not None.
+                    dilation_rate = dilation_dict[
+                        block] if dilation_dict and block in dilation_dict else 1
+
+                    # Actually block here is 'stage', and i is 'block' in 'stage'
+                    # At the stage 4, expand the the dilation_rate if given multi_grid
+                    if block == 3:
+                        dilation_rate = dilation_rate * multi_grid[i]
+                    ###############################################################################
+
+                    bottleneck_block = self.add_sublayer(
+                        'bb_%d_%d' % (block, i),
+                        BottleneckBlock(
+                            in_channels=num_channels[block]
+                            if i == 0 else num_filters[block] * 4,
+                            out_channels=num_filters[block],
+                            stride=2 if i == 0 and block != 0
+                            and dilation_rate == 1 else 1,
+                            shortcut=shortcut,
+                            if_first=block == i == 0,
+                            dilation=dilation_rate,
+                            data_format=data_format))
+
+                    block_list.append(bottleneck_block)
+                    shortcut = True
+                self.stage_list.append(block_list)
+        else:
+            for block in range(len(depth)):
+                shortcut = False
+                block_list = []
+                for i in range(depth[block]):
+                    dilation_rate = dilation_dict[block] \
+                        if dilation_dict and block in dilation_dict else 1
+                    if block == 3:
+                        dilation_rate = dilation_rate * multi_grid[i]
+
+                    basic_block = self.add_sublayer(
+                        'bb_%d_%d' % (block, i),
+                        BasicBlock(
+                            in_channels=num_channels[block]
+                            if i == 0 else num_filters[block],
+                            out_channels=num_filters[block],
+                            stride=2 if i == 0 and block != 0 \
+                                and dilation_rate == 1 else 1,
+                            dilation=dilation_rate,
+                            shortcut=shortcut,
+                            if_first=block == i == 0,
+                            data_format=data_format))
+                    block_list.append(basic_block)
+                    shortcut = True
+                self.stage_list.append(block_list)
+
+        self.pretrained = pretrained
+        self.init_weight()
+
+    def forward(self, inputs):
+        y = self.conv1_1(inputs)
+        y = self.conv1_2(y)
+        y = self.conv1_3(y)
+        self.conv1_logit = y.clone()
+        y = self.pool2d_max(y)
+
+        # A feature list saves the output feature map of each stage.
+        feat_list = []
+        for stage in self.stage_list:
+            for block in stage:
+                y = block(y)
+            feat_list.append(y)
+
+        return feat_list
+
+    def init_weight(self):
+        utils.load_pretrained_model(self, self.pretrained)
+
+
+@manager.BACKBONES.add_component
+def ResNet18_vd(**args):
+    model = ResNet_vd(layers=18, **args)
+    return model
+
+
+def ResNet34_vd(**args):
+    model = ResNet_vd(layers=34, **args)
+    return model
+
+
+@manager.BACKBONES.add_component
+def ResNet50_vd(**args):
+    model = ResNet_vd(layers=50, **args)
+    return model
+
+
+@manager.BACKBONES.add_component
+def ResNet101_vd(**args):
+    model = ResNet_vd(layers=101, **args)
+    return model
+
+
+def ResNet152_vd(**args):
+    model = ResNet_vd(layers=152, **args)
+    return model
+
+
+def ResNet200_vd(**args):
+    model = ResNet_vd(layers=200, **args)
+    return model
--- a/paddlers/models/ppseg/models/backbones/stdcnet.py
+++ b/paddlers/models/ppseg/models/backbones/stdcnet.py
@ -0,0 +1,281 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+
+import paddle
+import paddle.nn as nn
+
+from paddlers.models.ppseg.utils import utils
+from paddlers.models.ppseg.cvlibs import manager, param_init
+from paddlers.models.ppseg.models.layers.layer_libs import SyncBatchNorm
+
+# Names exported by a star import of this module: the two backbone factories.
+__all__ = ["STDC1", "STDC2"]
+
+
+class STDCNet(nn.Layer):
+    """
+    The STDCNet implementation based on PaddlePaddle.
+
+    The original article refers to Meituan
+    Fan, Mingyuan, et al. "Rethinking BiSeNet For Real-time Semantic Segmentation."
+    (https://arxiv.org/abs/2104.13188)
+
+    Args:
+        base(int, optional): base channels. Default: 64.
+        layers(list, optional): layers numbers list. It determines STDC block numbers of STDCNet's stage3\4\5. Defualt: [4, 5, 3].
+        block_num(int,optional): block_num of features block. Default: 4.
+        type(str,optional): feature fusion method "cat"/"add". Default: "cat".
+        num_classes(int, optional): class number for image classification. Default: 1000.
+        dropout(float,optional): dropout ratio. if >0,use dropout ratio.  Default: 0.20.
+        use_conv_last(bool,optional): whether to use the last ConvBNReLU layer . Default: False.
+        pretrained(str, optional): the path of pretrained model.
+    """
+
+    def __init__(self,
+                 base=64,
+                 layers=[4, 5, 3],
+                 block_num=4,
+                 type="cat",
+                 num_classes=1000,
+                 dropout=0.20,
+                 use_conv_last=False,
+                 pretrained=None):
+        super(STDCNet, self).__init__()
+        if type == "cat":
+            block = CatBottleneck
+        elif type == "add":
+            block = AddBottleneck
+        self.use_conv_last = use_conv_last
+        self.features = self._make_layers(base, layers, block_num, block)
+        self.conv_last = ConvBNRelu(base * 16, max(1024, base * 16), 1, 1)
+
+        if (layers == [4, 5, 3]):  #stdc1446
+            self.x2 = nn.Sequential(self.features[:1])
+            self.x4 = nn.Sequential(self.features[1:2])
+            self.x8 = nn.Sequential(self.features[2:6])
+            self.x16 = nn.Sequential(self.features[6:11])
+            self.x32 = nn.Sequential(self.features[11:])
+        elif (layers == [2, 2, 2]):  #stdc813
+            self.x2 = nn.Sequential(self.features[:1])
+            self.x4 = nn.Sequential(self.features[1:2])
+            self.x8 = nn.Sequential(self.features[2:4])
+            self.x16 = nn.Sequential(self.features[4:6])
+            self.x32 = nn.Sequential(self.features[6:])
+        else:
+            raise NotImplementedError(
+                "model with layers:{} is not implemented!".format(layers))
+
+        self.pretrained = pretrained
+        self.init_weight()
+
+    def forward(self, x):
+        """
+        forward function for feature extract.
+        """
+        feat2 = self.x2(x)
+        feat4 = self.x4(feat2)
+        feat8 = self.x8(feat4)
+        feat16 = self.x16(feat8)
+        feat32 = self.x32(feat16)
+        if self.use_conv_last:
+            feat32 = self.conv_last(feat32)
+        return feat2, feat4, feat8, feat16, feat32
+
+    def _make_layers(self, base, layers, block_num, block):
+        features = []
+        features += [ConvBNRelu(3, base // 2, 3, 2)]
+        features += [ConvBNRelu(base // 2, base, 3, 2)]
+
+        for i, layer in enumerate(layers):
+            for j in range(layer):
+                if i == 0 and j == 0:
+                    features.append(block(base, base * 4, block_num, 2))
+                elif j == 0:
+                    features.append(
+                        block(base * int(math.pow(2, i + 1)),
+                              base * int(math.pow(2, i + 2)), block_num, 2))
+                else:
+                    features.append(
+                        block(base * int(math.pow(2, i + 2)),
+                              base * int(math.pow(2, i + 2)), block_num, 1))
+
+        return nn.Sequential(*features)
+
+    def init_weight(self):
+        for layer in self.sublayers():
+            if isinstance(layer, nn.Conv2D):
+                param_init.normal_init(layer.weight, std=0.001)
+            elif isinstance(layer, (nn.BatchNorm, nn.SyncBatchNorm)):
+                param_init.constant_init(layer.weight, value=1.0)
+                param_init.constant_init(layer.bias, value=0.0)
+        if self.pretrained is not None:
+            utils.load_pretrained_model(self, self.pretrained)
+
+
+class ConvBNRelu(nn.Layer):
+    def __init__(self, in_planes, out_planes, kernel=3, stride=1):
+        super(ConvBNRelu, self).__init__()
+        self.conv = nn.Conv2D(
+            in_planes,
+            out_planes,
+            kernel_size=kernel,
+            stride=stride,
+            padding=kernel // 2,
+            bias_attr=False)
+        self.bn = SyncBatchNorm(out_planes, data_format='NCHW')
+        self.relu = nn.ReLU()
+
+    def forward(self, x):
+        out = self.relu(self.bn(self.conv(x)))
+        return out
+
+
+class AddBottleneck(nn.Layer):
+    def __init__(self, in_planes, out_planes, block_num=3, stride=1):
+        super(AddBottleneck, self).__init__()
+        assert block_num > 1, "block number should be larger than 1."
+        self.conv_list = nn.LayerList()
+        self.stride = stride
+        if stride == 2:
+            self.avd_layer = nn.Sequential(
+                nn.Conv2D(
+                    out_planes // 2,
+                    out_planes // 2,
+                    kernel_size=3,
+                    stride=2,
+                    padding=1,
+                    groups=out_planes // 2,
+                    bias_attr=False),
+                nn.BatchNorm2D(out_planes // 2),
+            )
+            self.skip = nn.Sequential(
+                nn.Conv2D(
+                    in_planes,
+                    in_planes,
+                    kernel_size=3,
+                    stride=2,
+                    padding=1,
+                    groups=in_planes,
+                    bias_attr=False),
+                nn.BatchNorm2D(in_planes),
+                nn.Conv2D(
+                    in_planes, out_planes, kernel_size=1, bias_attr=False),
+                nn.BatchNorm2D(out_planes),
+            )
+            stride = 1
+
+        for idx in range(block_num):
+            if idx == 0:
+                self.conv_list.append(
+                    ConvBNRelu(in_planes, out_planes // 2, kernel=1))
+            elif idx == 1 and block_num == 2:
+                self.conv_list.append(
+                    ConvBNRelu(out_planes // 2, out_planes // 2, stride=stride))
+            elif idx == 1 and block_num > 2:
+                self.conv_list.append(
+                    ConvBNRelu(out_planes // 2, out_planes // 4, stride=stride))
+            elif idx < block_num - 1:
+                self.conv_list.append(
+                    ConvBNRelu(out_planes // int(math.pow(2, idx)),
+                               out_planes // int(math.pow(2, idx + 1))))
+            else:
+                self.conv_list.append(
+                    ConvBNRelu(out_planes // int(math.pow(2, idx)),
+                               out_planes // int(math.pow(2, idx))))
+
+    def forward(self, x):
+        out_list = []
+        out = x
+        for idx, conv in enumerate(self.conv_list):
+            if idx == 0 and self.stride == 2:
+                out = self.avd_layer(conv(out))
+            else:
+                out = conv(out)
+            out_list.append(out)
+        if self.stride == 2:
+            x = self.skip(x)
+        return paddle.concat(out_list, axis=1) + x
+
+
+class CatBottleneck(nn.Layer):
+    def __init__(self, in_planes, out_planes, block_num=3, stride=1):
+        super(CatBottleneck, self).__init__()
+        assert block_num > 1, "block number should be larger than 1."
+        self.conv_list = nn.LayerList()
+        self.stride = stride
+        if stride == 2:
+            self.avd_layer = nn.Sequential(
+                nn.Conv2D(
+                    out_planes // 2,
+                    out_planes // 2,
+                    kernel_size=3,
+                    stride=2,
+                    padding=1,
+                    groups=out_planes // 2,
+                    bias_attr=False),
+                nn.BatchNorm2D(out_planes // 2),
+            )
+            self.skip = nn.AvgPool2D(kernel_size=3, stride=2, padding=1)
+            stride = 1
+
+        for idx in range(block_num):
+            if idx == 0:
+                self.conv_list.append(
+                    ConvBNRelu(in_planes, out_planes // 2, kernel=1))
+            elif idx == 1 and block_num == 2:
+                self.conv_list.append(
+                    ConvBNRelu(out_planes // 2, out_planes // 2, stride=stride))
+            elif idx == 1 and block_num > 2:
+                self.conv_list.append(
+                    ConvBNRelu(out_planes // 2, out_planes // 4, stride=stride))
+            elif idx < block_num - 1:
+                self.conv_list.append(
+                    ConvBNRelu(out_planes // int(math.pow(2, idx)),
+                               out_planes // int(math.pow(2, idx + 1))))
+            else:
+                self.conv_list.append(
+                    ConvBNRelu(out_planes // int(math.pow(2, idx)),
+                               out_planes // int(math.pow(2, idx))))
+
+    def forward(self, x):
+        out_list = []
+        out1 = self.conv_list[0](x)
+        for idx, conv in enumerate(self.conv_list[1:]):
+            if idx == 0:
+                if self.stride == 2:
+                    out = conv(self.avd_layer(out1))
+                else:
+                    out = conv(out1)
+            else:
+                out = conv(out)
+            out_list.append(out)
+
+        if self.stride == 2:
+            out1 = self.skip(out1)
+        out_list.insert(0, out1)
+        out = paddle.concat(out_list, axis=1)
+        return out
+
+
+@manager.BACKBONES.add_component
+def STDC2(**kwargs):
+    model = STDCNet(base=64, layers=[4, 5, 3], **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def STDC1(**kwargs):
+    model = STDCNet(base=64, layers=[2, 2, 2], **kwargs)
+    return model
--- a/paddlers/models/ppseg/models/backbones/swin_transformer.py
+++ b/paddlers/models/ppseg/models/backbones/swin_transformer.py
@ -0,0 +1,792 @@
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+import numpy as np
+
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.utils import utils
+from paddlers.models.ppseg.models.backbones.transformer_utils import *
+
+
+class Mlp(nn.Layer):
+    """ Multilayer perceptron."""
+
+    def __init__(self,
+                 in_features,
+                 hidden_features=None,
+                 out_features=None,
+                 act_layer=nn.GELU,
+                 drop=0.):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+
+
+def window_partition(x, window_size):
+    """
+    Args:
+        x: (B, H, W, C)
+        window_size (int): window size
+
+    Returns:
+        windows: (num_windows*B, window_size, window_size, C)
+    """
+    B, H, W, C = x.shape
+    x = x.reshape(
+        [B, H // window_size, window_size, W // window_size, window_size, C])
+    windows = x.transpose([0, 1, 3, 2, 4,
+                           5]).reshape([-1, window_size, window_size, C])
+    return windows
+
+
+def window_reverse(windows, window_size, H, W):
+    """
+    Args:
+        windows: (num_windows*B, window_size, window_size, C)
+        window_size (int): Window size
+        H (int): Height of image
+        W (int): Width of image
+
+    Returns:
+        x: (B, H, W, C)
+    """
+    B = int(windows.shape[0] / (H * W / window_size / window_size))
+    x = windows.reshape(
+        [B, H // window_size, W // window_size, window_size, window_size, -1])
+    x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([B, H, W, -1])
+    return x
+
+
+class WindowAttention(nn.Layer):
+    """
+    Window based multi-head self attention (W-MSA) module with relative position bias.
+    It supports both of shifted and non-shifted window.
+
+    Args:
+        dim (int): Number of input channels.
+        window_size (tuple[int]): The height and width of the window.
+        num_heads (int): Number of attention heads.
+        qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
+        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
+        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
+    """
+
+    def __init__(self,
+                 dim,
+                 window_size,
+                 num_heads,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 attn_drop=0.,
+                 proj_drop=0.):
+
+        super().__init__()
+        self.dim = dim
+        self.window_size = window_size  # Wh, Ww
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        # A falsy qk_scale (None or 0) falls back to head_dim ** -0.5.
+        self.scale = qk_scale or head_dim**-0.5
+
+        # define a parameter table of relative position bias
+        # One row per possible (dh, dw) offset, one column per head.
+        # NOTE(review): `zeros_` (and `trunc_normal_` below) are not defined
+        # in this file; presumably they come from the star import of
+        # transformer_utils -- confirm.
+        self.relative_position_bias_table = self.create_parameter(
+            shape=((2 * window_size[0] - 1) * (2 * window_size[1] - 1),
+                   num_heads),
+            default_initializer=zeros_)
+        # Register the table under an explicit parameter name.
+        self.add_parameter("relative_position_bias_table",
+                           self.relative_position_bias_table)
+
+        # get pair-wise relative position index for each token inside the window
+        coords_h = paddle.arange(self.window_size[0])
+        coords_w = paddle.arange(self.window_size[1])
+        coords = paddle.stack(paddle.meshgrid([coords_h,
+                                               coords_w]))  # 2, Wh, Ww
+        coords_flatten = paddle.flatten(coords, 1)  # 2, Wh*Ww
+        # Broadcasted difference of every token pair: 2, Wh*Ww, Wh*Ww
+        coords_flatten_1 = coords_flatten.unsqueeze(axis=2)
+        coords_flatten_2 = coords_flatten.unsqueeze(axis=1)
+        relative_coords = coords_flatten_1 - coords_flatten_2
+
+        # Move the (dh, dw) axis last: Wh*Ww, Wh*Ww, 2
+        relative_coords = relative_coords.transpose([1, 2, 0])
+
+        relative_coords[:, :,
+                        0] += self.window_size[0] - 1  # shift to start from 0
+        relative_coords[:, :, 1] += self.window_size[1] - 1
+        # Scale the row offset by the number of possible column offsets so
+        # that summing the two components gives a unique flat table index.
+        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
+        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
+        self.register_buffer("relative_position_index", relative_position_index)
+
+        # Single linear layer producing q, k and v together.
+        self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+        trunc_normal_(self.relative_position_bias_table)
+        self.softmax = nn.Softmax(axis=-1)
+
+    def forward(self, x, mask=None):
+        """
+        Args:
+            x: input features with shape of (num_windows*B, N, C)
+            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
+
+        Returns:
+            Tensor reshaped to (num_windows*B, N, C) after the output
+            projection and dropout.
+        """
+        B_, N, C = x.shape
+        # (B_, N, 3*C) -> (3, B_, num_heads, N, C // num_heads)
+        qkv = self.qkv(x).reshape(
+            [B_, N, 3, self.num_heads,
+             C // self.num_heads]).transpose([2, 0, 3, 1, 4])
+        q, k, v = qkv[0], qkv[1], qkv[2]
+
+        q = q * self.scale
+        # Scaled dot-product scores between tokens: (B_, num_heads, N, N)
+        attn = paddle.mm(q, k.transpose([0, 1, 3, 2]))
+
+        # Look up the bias of every token pair from the table.
+        index = self.relative_position_index.reshape([-1])
+        relative_position_bias = paddle.index_select(
+            self.relative_position_bias_table, index)
+
+        relative_position_bias = relative_position_bias.reshape([
+            self.window_size[0] * self.window_size[1],
+            self.window_size[0] * self.window_size[1], -1
+        ])  # Wh*Ww,Wh*Ww,nH
+        relative_position_bias = relative_position_bias.transpose(
+            [2, 0, 1])  # nH, Wh*Ww, Wh*Ww
+        # The same bias is added to every window in the batch.
+        attn = attn + relative_position_bias.unsqueeze(0)
+
+        if mask is not None:
+            nW = mask.shape[0]
+            # Split the batch axis into (B_ // nW, nW) so that the mask,
+            # expanded to (1, nW, 1, N, N), is added per window and shared
+            # across heads; then restore the (B_, num_heads, N, N) layout.
+            attn = attn.reshape([B_ // nW, nW, self.num_heads, N, N
+                                 ]) + mask.unsqueeze(1).unsqueeze(0)
+            attn = attn.reshape([-1, self.num_heads, N, N])
+            attn = self.softmax(attn)
+        else:
+            attn = self.softmax(attn)
+
+        attn = self.attn_drop(attn)
+
+        # Weighted sum of values, then merge the heads back into C channels.
+        x = paddle.mm(attn, v).transpose([0, 2, 1, 3]).reshape([B_, N, C])
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class SwinTransformerBlock(nn.Layer):
+    """
+    Swin Transformer Block.
+
+    Applies window attention (cyclically shifted when ``shift_size > 0``)
+    followed by an MLP, each added back to its input through ``drop_path``.
+
+    Args:
+        dim (int): Number of input channels.
+        num_heads (int): Number of attention heads.
+        window_size (int): Window size.
+        shift_size (int): Shift size for SW-MSA.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+        drop (float, optional): Dropout rate. Default: 0.0
+        attn_drop (float, optional): Attention dropout rate. Default: 0.0
+        drop_path (float, optional): Stochastic depth rate. Default: 0.0
+        act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU
+        norm_layer (nn.Layer, optional): Normalization layer.  Default: nn.LayerNorm
+
+    Note:
+        ``forward`` reads the spatial resolution from ``self.H`` and ``self.W``.
+        Both are initialised to None and must be assigned before calling it.
+    """
+
+    def __init__(self,
+                 dim,
+                 num_heads,
+                 window_size=7,
+                 shift_size=0,
+                 mlp_ratio=4.,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 drop=0.,
+                 attn_drop=0.,
+                 drop_path=0.,
+                 act_layer=nn.GELU,
+                 norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+        self.window_size = window_size
+        self.shift_size = shift_size
+        self.mlp_ratio = mlp_ratio
+        assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"
+
+        self.norm1 = norm_layer(dim)
+        self.attn = WindowAttention(
+            dim,
+            window_size=to_2tuple(self.window_size),
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            attn_drop=attn_drop,
+            proj_drop=drop)
+
+        # a zero rate needs no stochastic depth, so a pass-through layer is used
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(
+            in_features=dim,
+            hidden_features=mlp_hidden_dim,
+            act_layer=act_layer,
+            drop=drop)
+
+        # spatial resolution of the incoming tokens; assigned externally
+        # before every forward call
+        self.H = None
+        self.W = None
+
+    def forward(self, x, mask_matrix):
+        """
+        Args:
+            x: Input feature, tensor size (B, H*W, C), where H and W are
+                taken from ``self.H`` and ``self.W``.
+            mask_matrix: Attention mask for cyclic shift. Only used when
+                ``shift_size > 0``.
+
+        Returns:
+            Tensor of size (B, H*W, C).
+        """
+        B, L, C = x.shape
+        H, W = self.H, self.W
+        assert L == H * W, "input feature has wrong size"
+
+        shortcut = x
+        x = self.norm1(x)
+        x = x.reshape([B, H, W, C])
+
+        # pad feature maps to multiples of window size
+        pad_l = pad_t = 0
+        pad_r = (self.window_size - W % self.window_size) % self.window_size
+        pad_b = (self.window_size - H % self.window_size) % self.window_size
+
+        # move channels to axis 1 for F.pad, then back to channels-last
+        x = x.transpose([0, 3, 1, 2])
+        x = F.pad(x, [pad_l, pad_r, pad_t, pad_b])
+        x = x.transpose([0, 2, 3, 1])
+        _, Hp, Wp, _ = x.shape
+
+        # cyclic shift
+        if self.shift_size > 0:
+            shifted_x = paddle.roll(
+                x, shifts=(-self.shift_size, -self.shift_size), axis=(1, 2))
+            attn_mask = mask_matrix
+        else:
+            shifted_x = x
+            attn_mask = None
+
+        # partition windows
+        x_windows = window_partition(
+            shifted_x, self.window_size)  # nW*B, window_size, window_size, C
+        x_windows = x_windows.reshape(
+            [-1, self.window_size * self.window_size,
+             C])  # nW*B, window_size*window_size, C
+
+        # W-MSA/SW-MSA
+        attn_windows = self.attn(
+            x_windows, mask=attn_mask)  # nW*B, window_size*window_size, C
+
+        # merge windows
+        attn_windows = attn_windows.reshape(
+            [-1, self.window_size, self.window_size, C])
+        shifted_x = window_reverse(attn_windows, self.window_size, Hp,
+                                   Wp)  # B H' W' C
+
+        # reverse cyclic shift
+        if self.shift_size > 0:
+            x = paddle.roll(
+                shifted_x,
+                shifts=(self.shift_size, self.shift_size),
+                axis=(1, 2))
+        else:
+            x = shifted_x
+
+        # crop away the rows/columns that were added as padding
+        if pad_r > 0 or pad_b > 0:
+            x = x[:, :H, :W, :]
+
+        x = x.reshape([B, H * W, C])
+
+        # residual around attention, then residual around the FFN
+        x = shortcut + self.drop_path(x)
+        x = x + self.drop_path(self.mlp(self.norm2(x)))
+
+        return x
+
+
+class PatchMerging(nn.Layer):
+    """
+    Patch Merging Layer.
+
+    Concatenates every 2x2 neighbourhood of tokens along the channel axis
+    (C -> 4*C), normalises it and projects it linearly to 2*C channels.
+
+    Args:
+        dim (int): Number of input channels.
+        norm_layer (nn.Layer, optional): Normalization layer.  Default: nn.LayerNorm
+    """
+
+    def __init__(self, dim, norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.dim = dim
+        self.reduction = nn.Linear(4 * dim, 2 * dim, bias_attr=False)
+        self.norm = norm_layer(4 * dim)
+
+    def forward(self, x, H, W):
+        """
+        Args:
+            x: Input feature, tensor size (B, H*W, C).
+            H, W: Spatial resolution of the input feature.
+
+        Returns:
+            Tensor whose last axis has 2*C channels and whose token axis
+            covers the 2x-downsampled grid.
+        """
+        B, L, C = x.shape
+        assert L == H * W, "input feature has wrong size"
+
+        grid = x.reshape([B, H, W, C])
+
+        # an odd side gets one extra row/column so the grid splits into 2x2 cells
+        extra_rows, extra_cols = H % 2, W % 2
+        if extra_rows == 1 or extra_cols == 1:
+            grid = grid.transpose([0, 3, 1, 2])
+            grid = F.pad(grid, [0, extra_cols, 0, extra_rows])
+            grid = grid.transpose([0, 2, 3, 1])
+
+        # order: (row 0, col 0), (row 1, col 0), (row 0, col 1), (row 1, col 1)
+        corners = [
+            grid[:, row::2, col::2, :]  # B H/2 W/2 C
+            for col in (0, 1) for row in (0, 1)
+        ]
+        merged = paddle.concat(corners, -1)  # B H/2 W/2 4*C
+        merged = merged.reshape([B, -1, 4 * C])  # B H/2*W/2 4*C
+
+        return self.reduction(self.norm(merged))
+
+
+class BasicLayer(nn.Layer):
+    """
+    A basic Swin Transformer layer for one stage.
+
+    Stacks ``depth`` SwinTransformerBlocks that alternate between no shift
+    (even index) and a shift of ``window_size // 2`` (odd index), followed by
+    an optional downsample layer.
+
+    Args:
+        dim (int): Number of feature channels.
+        depth (int): Depths of this stage.
+        num_heads (int): Number of attention head.
+        window_size (int): Local window size. Default: 7.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+        drop (float, optional): Dropout rate. Default: 0.0
+        attn_drop (float, optional): Attention dropout rate. Default: 0.0
+        drop_path (float | list[float] | tuple[float], optional): Stochastic depth rate. A list or
+            tuple supplies one rate per block; a scalar is shared by all blocks. Default: 0.0
+        norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm
+        downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None
+    """
+
+    def __init__(self,
+                 dim,
+                 depth,
+                 num_heads,
+                 window_size=7,
+                 mlp_ratio=4.,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 drop=0.,
+                 attn_drop=0.,
+                 drop_path=0.,
+                 norm_layer=nn.LayerNorm,
+                 downsample=None):
+        super().__init__()
+        self.window_size = window_size
+        self.shift_size = window_size // 2
+        self.depth = depth
+
+        # Tuples are indexed per block just like lists; previously a tuple was
+        # handed whole to every block and failed on its ``drop_path > 0.`` test.
+        per_block_rate = isinstance(drop_path, (list, tuple))
+
+        # build blocks
+        self.blocks = nn.LayerList([
+            SwinTransformerBlock(
+                dim=dim,
+                num_heads=num_heads,
+                window_size=window_size,
+                shift_size=0 if (i % 2 == 0) else window_size // 2,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop,
+                attn_drop=attn_drop,
+                drop_path=drop_path[i] if per_block_rate else drop_path,
+                norm_layer=norm_layer) for i in range(depth)
+        ])
+
+        # patch merging layer
+        if downsample is not None:
+            self.downsample = downsample(dim=dim, norm_layer=norm_layer)
+        else:
+            self.downsample = None
+
+    def forward(self, x, H, W):
+        """
+        Args:
+            x: Input feature, tensor size (B, H*W, C).
+            H, W: Spatial resolution of the input feature.
+
+        Returns:
+            tuple: ``(x, H, W, x_down, Wh, Ww)`` - the stage output with its
+            resolution, followed by the downsampled output with its
+            resolution. Without a downsample layer the second triple repeats
+            the first.
+        """
+        # calculate attention mask for SW-MSA
+        Hp = int(np.ceil(H / self.window_size)) * self.window_size
+        Wp = int(np.ceil(W / self.window_size)) * self.window_size
+        img_mask = paddle.zeros((1, Hp, Wp, 1))  # 1 Hp Wp 1
+        h_slices = (slice(0, -self.window_size),
+                    slice(-self.window_size, -self.shift_size),
+                    slice(-self.shift_size, None))
+        w_slices = (slice(0, -self.window_size),
+                    slice(-self.window_size, -self.shift_size),
+                    slice(-self.shift_size, None))
+        # label each of the 3x3 regions with its own id
+        cnt = 0
+        for h in h_slices:
+            for w in w_slices:
+                img_mask[:, h, w, :] = cnt
+                cnt += 1
+
+        mask_windows = window_partition(
+            img_mask, self.window_size)  # nW, window_size, window_size, 1
+        mask_windows = mask_windows.reshape(
+            [-1, self.window_size * self.window_size])
+        # non-zero wherever two positions of a window carry different region ids
+        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
+
+        # -100 for pairs from different regions, 0 otherwise
+        huns = -100.0 * paddle.ones_like(attn_mask)
+        attn_mask = huns * (attn_mask != 0).astype("float32")
+
+        for blk in self.blocks:
+            # blocks read the resolution from attributes, not from arguments
+            blk.H, blk.W = H, W
+            x = blk(x, attn_mask)
+        if self.downsample is not None:
+            x_down = self.downsample(x, H, W)
+            Wh, Ww = (H + 1) // 2, (W + 1) // 2
+            return x, H, W, x_down, Wh, Ww
+        else:
+            return x, H, W, x, H, W
+
+
+class PatchEmbed(nn.Layer):
+    """
+    Image to Patch Embedding.
+
+    Projects non-overlapping patches with a strided convolution and, when a
+    norm layer is given, normalises the result over the channel axis.
+
+    Args:
+        patch_size (int): Patch token size. Default: 4.
+        in_chans (int): Number of input image channels. Default: 3.
+        embed_dim (int): Number of linear projection output channels. Default: 96.
+        norm_layer (nn.Layer, optional): Normalization layer. Default: None
+    """
+
+    def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
+        super().__init__()
+        self.patch_size = to_2tuple(patch_size)
+        self.in_chans = in_chans
+        self.embed_dim = embed_dim
+
+        self.proj = nn.Conv2D(
+            in_chans,
+            embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size)
+        self.norm = None if norm_layer is None else norm_layer(embed_dim)
+
+    def forward(self, x):
+        """Forward function."""
+        _, _, H, W = x.shape
+        patch_h, patch_w = self.patch_size[0], self.patch_size[1]
+
+        # pad width first, then height, up to a multiple of the patch size
+        if W % patch_w != 0:
+            x = F.pad(x, [0, patch_w - W % patch_w, 0, 0])
+        if H % patch_h != 0:
+            x = F.pad(x, [0, 0, 0, patch_h - H % patch_h])
+
+        x = self.proj(x)  # B C Wh Ww
+        if self.norm is None:
+            return x
+
+        # flatten to tokens for the norm, then restore the feature-map layout
+        _, _, Wh, Ww = x.shape
+        tokens = x.flatten(2).transpose([0, 2, 1])
+        tokens = self.norm(tokens)
+        return tokens.transpose([0, 2, 1]).reshape(
+            [-1, self.embed_dim, Wh, Ww])
+
+
+@manager.BACKBONES.add_component
+class SwinTransformer(nn.Layer):
+    """
+    The SwinTransformer implementation based on PaddlePaddle.
+
+    The original article refers to
+    Liu, Ze, et al. "Swin Transformer: Hierarchical Vision Transformer using Shifted Windows"
+    (https://arxiv.org/abs/2103.14030)
+
+    Args:
+        pretrain_img_size (int): Input image size for training the pretrained model, used in absolute position embedding. Default: 224.
+        patch_size (int | tuple(int)): Patch size. Default: 4.
+        in_chans (int): Number of input image channels. Default: 3.
+        embed_dim (int): Number of linear projection output channels. Default: 96.
+        depths (tuple[int]): Depths of each Swin Transformer stage.
+        num_heads (tuple[int]): Number of attention head of each stage.
+        window_size (int): Window size. Default: 7.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
+        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
+        drop_rate (float): Dropout rate.
+        attn_drop_rate (float): Attention dropout rate. Default: 0.
+        drop_path_rate (float): Stochastic depth rate. Default: 0.2.
+        norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm.
+        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
+        patch_norm (bool): If True, add normalization after patch embedding. Default: True.
+        out_indices (Sequence[int]): Output from which stages.
+        frozen_stages (int): Stages to be frozen (stop grad and set eval mode). -1 means not freezing any parameters. Default: -1.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+    """
+
+    def __init__(self,
+                 pretrain_img_size=224,
+                 patch_size=4,
+                 in_chans=3,
+                 embed_dim=96,
+                 depths=[2, 2, 6, 2],
+                 num_heads=[3, 6, 12, 24],
+                 window_size=7,
+                 mlp_ratio=4.,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.2,
+                 norm_layer=nn.LayerNorm,
+                 ape=False,
+                 patch_norm=True,
+                 out_indices=(0, 1, 2, 3),
+                 frozen_stages=-1,
+                 pretrained=None):
+        super().__init__()
+
+        self.pretrain_img_size = pretrain_img_size
+        self.num_layers = len(depths)
+        self.embed_dim = embed_dim
+        self.ape = ape
+        self.patch_norm = patch_norm
+        self.out_indices = out_indices
+        self.frozen_stages = frozen_stages
+
+        # split image into non-overlapping patches
+        self.patch_embed = PatchEmbed(
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+            norm_layer=norm_layer if self.patch_norm else None)
+
+        # absolute position embedding
+        if self.ape:
+            pretrain_img_size = to_2tuple(pretrain_img_size)
+            patch_size = to_2tuple(patch_size)
+            patches_resolution = [
+                pretrain_img_size[0] // patch_size[0],
+                pretrain_img_size[1] // patch_size[1]
+            ]
+
+            self.absolute_pos_embed = self.create_parameter(
+                shape=(1, embed_dim, patches_resolution[0],
+                       patches_resolution[1]),
+                default_initializer=zeros_)
+            self.add_parameter("absolute_pos_embed", self.absolute_pos_embed)
+            trunc_normal_(self.absolute_pos_embed)
+
+        self.pos_drop = nn.Dropout(p=drop_rate)
+
+        # stochastic depth: rates grow linearly from 0 to drop_path_rate
+        # across all blocks of all stages
+        dpr = np.linspace(0, drop_path_rate, sum(depths)).tolist()
+
+        # build layers
+        self.layers = nn.LayerList()
+        for i_layer in range(self.num_layers):
+            layer = BasicLayer(
+                dim=int(embed_dim * 2**i_layer),
+                depth=depths[i_layer],
+                num_heads=num_heads[i_layer],
+                window_size=window_size,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop_rate,
+                attn_drop=attn_drop_rate,
+                drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
+                norm_layer=norm_layer,
+                downsample=PatchMerging if
+                (i_layer < self.num_layers - 1) else None)
+            self.layers.append(layer)
+
+        feat_channels = [int(embed_dim * 2**i) for i in range(self.num_layers)]
+        self.feat_channels = feat_channels
+
+        # add a norm layer for each output
+        for i_layer in out_indices:
+            layer = norm_layer(feat_channels[i_layer])
+            layer_name = f'norm{i_layer}'
+            self.add_sublayer(layer_name, layer)
+
+        self._freeze_stages()
+
+        self.pretrained = pretrained
+        self.init_weights(self.pretrained)
+
+    def _freeze_stages(self):
+        """Put the first ``frozen_stages`` parts in eval mode and stop their gradients."""
+        # Paddle parameters are excluded from backprop through
+        # ``stop_gradient``; assigning ``requires_grad`` does not freeze them.
+        if self.frozen_stages >= 0:
+            self.patch_embed.eval()
+            for param in self.patch_embed.parameters():
+                param.stop_gradient = True
+
+        if self.frozen_stages >= 1 and self.ape:
+            self.absolute_pos_embed.stop_gradient = True
+
+        if self.frozen_stages >= 2:
+            self.pos_drop.eval()
+            for i in range(0, self.frozen_stages - 1):
+                layer = self.layers[i]
+                layer.eval()
+                for param in layer.parameters():
+                    param.stop_gradient = True
+
+    def init_weights(self, pretrained=None):
+        """Initialize the weights in backbone.
+
+        Args:
+            pretrained (str, optional): Path to pre-trained weights.
+                Defaults to None, in which case Linear weights get a truncated
+                normal init (biases zero) and LayerNorm gets weight 1, bias 0.
+        """
+        if pretrained is not None:
+            # honour the argument instead of silently reading self.pretrained
+            utils.load_pretrained_model(self, pretrained)
+            return
+
+        for sublayer in self.sublayers():
+            if isinstance(sublayer, nn.Linear):
+                trunc_normal_(sublayer.weight)
+                if sublayer.bias is not None:
+                    zeros_(sublayer.bias)
+            elif isinstance(sublayer, nn.LayerNorm):
+                zeros_(sublayer.bias)
+                ones_(sublayer.weight)
+
+    def forward(self, x):
+        """Forward function.
+
+        Returns:
+            tuple: one feature map per index in ``out_indices``, each laid out
+            channels-first with ``feat_channels[i]`` channels.
+        """
+        x = self.patch_embed(x)
+
+        _, _, Wh, Ww = x.shape
+        if self.ape:
+            # interpolate the position embedding to the corresponding size
+            absolute_pos_embed = F.interpolate(
+                self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic')
+            # Paddle's transpose takes a full permutation list
+            x = (x + absolute_pos_embed).flatten(2).transpose(
+                [0, 2, 1])  # B Wh*Ww C
+        else:
+            x = x.flatten(2).transpose([0, 2, 1])
+        x = self.pos_drop(x)
+
+        outs = []
+        for i in range(self.num_layers):
+            layer = self.layers[i]
+            # x_out is this stage's output; x feeds the next stage
+            x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
+
+            if i in self.out_indices:
+                norm_layer = getattr(self, f'norm{i}')
+                x_out = norm_layer(x_out)
+
+                out = x_out.reshape(
+                    [-1, H, W, self.feat_channels[i]]).transpose([0, 3, 1, 2])
+                outs.append(out)
+
+        return tuple(outs)
+
+    def train(self):
+        """Convert the model into training mode while keeping frozen layers frozen."""
+        super(SwinTransformer, self).train()
+        self._freeze_stages()
+
+
+@manager.BACKBONES.add_component
+def SwinTransformer_tiny_patch4_window7_224(**kwargs):
+    """Build a SwinTransformer with embed_dim 96, depths [2, 2, 6, 2] and window 7.
+
+    Extra keyword arguments are forwarded to ``SwinTransformer``.
+    """
+    preset = dict(
+        pretrain_img_size=224,
+        embed_dim=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=7)
+    return SwinTransformer(**preset, **kwargs)
+
+
+@manager.BACKBONES.add_component
+def SwinTransformer_small_patch4_window7_224(**kwargs):
+    """Build a SwinTransformer with embed_dim 96, depths [2, 2, 18, 2] and window 7.
+
+    Extra keyword arguments are forwarded to ``SwinTransformer``.
+    """
+    preset = dict(
+        pretrain_img_size=224,
+        embed_dim=96,
+        depths=[2, 2, 18, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=7)
+    return SwinTransformer(**preset, **kwargs)
+
+
+@manager.BACKBONES.add_component
+def SwinTransformer_base_patch4_window7_224(**kwargs):
+    """Build a SwinTransformer with embed_dim 128, depths [2, 2, 18, 2] and window 7.
+
+    Extra keyword arguments are forwarded to ``SwinTransformer``.
+    """
+    preset = dict(
+        pretrain_img_size=224,
+        embed_dim=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=7)
+    return SwinTransformer(**preset, **kwargs)
+
+
+@manager.BACKBONES.add_component
+def SwinTransformer_base_patch4_window12_384(**kwargs):
+    """Build a SwinTransformer with embed_dim 128, depths [2, 2, 18, 2] and window 12.
+
+    Extra keyword arguments are forwarded to ``SwinTransformer``.
+    """
+    preset = dict(
+        pretrain_img_size=384,
+        embed_dim=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=12)
+    return SwinTransformer(**preset, **kwargs)
+
+
+@manager.BACKBONES.add_component
+def SwinTransformer_large_patch4_window7_224(**kwargs):
+    """Build a SwinTransformer with embed_dim 192, depths [2, 2, 18, 2] and window 7.
+
+    Extra keyword arguments are forwarded to ``SwinTransformer``.
+    """
+    preset = dict(
+        pretrain_img_size=224,
+        embed_dim=192,
+        depths=[2, 2, 18, 2],
+        num_heads=[6, 12, 24, 48],
+        window_size=7)
+    return SwinTransformer(**preset, **kwargs)
+
+
+@manager.BACKBONES.add_component
+def SwinTransformer_large_patch4_window12_384(**kwargs):
+    """Build a SwinTransformer with embed_dim 192, depths [2, 2, 18, 2] and window 12.
+
+    Extra keyword arguments are forwarded to ``SwinTransformer``.
+    """
+    preset = dict(
+        pretrain_img_size=384,
+        embed_dim=192,
+        depths=[2, 2, 18, 2],
+        num_heads=[6, 12, 24, 48],
+        window_size=12)
+    return SwinTransformer(**preset, **kwargs)
--- a/paddlers/models/ppseg/models/backbones/transformer_utils.py
+++ b/paddlers/models/ppseg/models/backbones/transformer_utils.py
@ -0,0 +1,83 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.initializer as paddle_init
+
+__all__ = [
+    'to_2tuple', 'DropPath', 'Identity', 'trunc_normal_', 'zeros_', 'ones_',
+    'init_weights'
+]
+
+
+def to_2tuple(x):
+    """Return a 2-tuple holding ``x`` twice."""
+    return (x, x)
+
+
+def drop_path(x, drop_prob=0., training=False):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ...
+
+    Args:
+        x (Tensor): Input whose first axis indexes the samples.
+        drop_prob (float | None): Probability of zeroing a sample. ``None``
+            and ``0.`` both disable dropping.
+        training (bool): Dropping is only applied when True.
+
+    Returns:
+        ``x`` itself when dropping is disabled; otherwise ``x`` divided by
+        the keep probability with whole samples zeroed at random.
+    """
+    # None is DropPath's default rate; treat it like 0 rather than failing
+    # on ``1 - None`` below.
+    if drop_prob is None or drop_prob == 0. or not training:
+        return x
+    # keep the scalar in x's dtype so a float64 rate (e.g. a NumPy scalar)
+    # cannot mix dtypes with the random tensor
+    keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype)
+    # one random draw per sample, broadcast over every remaining axis
+    shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1)
+    random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype)
+    random_tensor = paddle.floor(random_tensor)  # binarize
+    output = x.divide(keep_prob) * random_tensor
+    return output
+
+
+class DropPath(nn.Layer):
+    """Layer wrapper around ``drop_path`` (Stochastic Depth per sample, for
+    use in the main path of residual blocks).
+    """
+
+    def __init__(self, drop_prob=None):
+        super().__init__()
+        self.drop_prob = drop_prob
+
+    def forward(self, x):
+        # dropping is only active while the layer is in training mode
+        rate, is_training = self.drop_prob, self.training
+        return drop_path(x, rate, is_training)
+
+
+class Identity(nn.Layer):
+    """Pass-through layer that returns its input unchanged."""
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, input):
+        result = input
+        return result
+
+
+# Shared initializer instances: truncated normal with std 0.02, constant 0
+# and constant 1. They are called directly on a parameter to initialise it.
+trunc_normal_ = paddle_init.TruncatedNormal(std=.02)
+zeros_ = paddle_init.Constant(value=0.)
+ones_ = paddle_init.Constant(value=1.)
+
+
+def init_weights(layer):
+    """
+    Init the weights of transformer.
+
+    Linear layers get a truncated-normal weight and, when present, a zero
+    bias. LayerNorm layers get bias 0 and weight 1. Other layers are left
+    untouched.
+
+    Args:
+        layer(nn.Layer): The layer to init weights.
+    Returns:
+        None
+    """
+    if isinstance(layer, nn.Linear):
+        trunc_normal_(layer.weight)
+        has_bias = layer.bias is not None
+        if has_bias:
+            zeros_(layer.bias)
+        return
+
+    if isinstance(layer, nn.LayerNorm):
+        zeros_(layer.bias)
+        ones_(layer.weight)
--- a/paddlers/models/ppseg/models/backbones/vision_transformer.py
+++ b/paddlers/models/ppseg/models/backbones/vision_transformer.py
@ -0,0 +1,410 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import math
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+import numpy as np
+
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.utils import utils, logger
+from paddlers.models.ppseg.models.backbones.transformer_utils import to_2tuple, DropPath, Identity
+
+
+class Mlp(nn.Layer):
+    """Two-layer perceptron: Linear -> activation -> dropout -> Linear -> dropout.
+
+    ``hidden_features`` and ``out_features`` fall back to ``in_features``
+    when they are None (or otherwise falsy).
+    """
+
+    def __init__(self,
+                 in_features,
+                 hidden_features=None,
+                 out_features=None,
+                 act_layer=nn.GELU,
+                 drop=0.):
+        super().__init__()
+        if not out_features:
+            out_features = in_features
+        if not hidden_features:
+            hidden_features = in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, x):
+        hidden = self.drop(self.act(self.fc1(x)))
+        return self.drop(self.fc2(hidden))
+
+
+class Attention(nn.Layer):
+    """Multi-head self-attention over a 3-D input whose axis 1 indexes tokens
+    and axis 2 indexes channels.
+    """
+
+    def __init__(self,
+                 dim,
+                 num_heads=8,
+                 qkv_bias=False,
+                 qk_scale=None,
+                 attn_drop=0.,
+                 proj_drop=0.):
+        super().__init__()
+        self.num_heads = num_heads
+        # default scale is head_dim ** -0.5 unless qk_scale overrides it
+        self.scale = qk_scale or (dim // num_heads)**-0.5
+
+        self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    def forward(self, x):
+        dyn_shape = paddle.shape(x)
+        N = dyn_shape[1]
+        C = dyn_shape[2]
+
+        qkv = self.qkv(x)
+        qkv = qkv.reshape((-1, N, 3, self.num_heads, C // self.num_heads))
+        qkv = qkv.transpose((2, 0, 3, 1, 4))
+        q, k, v = qkv[0], qkv[1], qkv[2]
+
+        scores = q.matmul(k.transpose((0, 1, 3, 2))) * self.scale
+        weights = nn.functional.softmax(scores, axis=-1)
+        weights = self.attn_drop(weights)
+
+        out = weights.matmul(v)
+        out = out.transpose((0, 2, 1, 3)).reshape((-1, N, C))
+        return self.proj_drop(self.proj(out))
+
+
+class Block(nn.Layer):
+    """Transformer encoder block: pre-norm attention and pre-norm MLP, each
+    added back to its input through ``drop_path``.
+
+    ``norm_layer`` is a string (default ``'nn.LayerNorm'``) that is resolved
+    with ``eval`` and then called as ``(dim, epsilon=epsilon)``.
+    """
+
+    def __init__(self,
+                 dim,
+                 num_heads,
+                 mlp_ratio=4.,
+                 qkv_bias=False,
+                 qk_scale=None,
+                 drop=0.,
+                 attn_drop=0.,
+                 drop_path=0.,
+                 act_layer=nn.GELU,
+                 norm_layer='nn.LayerNorm',
+                 epsilon=1e-5):
+        super().__init__()
+        # NOTE(security): eval() executes the norm_layer string as Python
+        # code; only pass trusted, developer-controlled values here.
+        self.norm1 = eval(norm_layer)(dim, epsilon=epsilon)
+        self.attn = Attention(
+            dim,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            attn_drop=attn_drop,
+            proj_drop=drop)
+        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
+        # NOTE(security): same eval() caveat as for norm1 above.
+        self.norm2 = eval(norm_layer)(dim, epsilon=epsilon)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(
+            in_features=dim,
+            hidden_features=mlp_hidden_dim,
+            act_layer=act_layer,
+            drop=drop)
+
+    def forward(self, x):
+        # residual around attention, then residual around the MLP
+        x = x + self.drop_path(self.attn(self.norm1(x)))
+        x = x + self.drop_path(self.mlp(self.norm2(x)))
+        return x
+
+
+class PatchEmbed(nn.Layer):
+    """ Image to Patch Embedding
+
+    Projects the image with a convolution whose kernel size and stride both
+    equal ``patch_size``. ``img_size`` is only used to derive the patch-grid
+    dimensions exposed by the two properties below.
+    """
+
+    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
+        super().__init__()
+        self.img_size = to_2tuple(img_size)
+        self.patch_size = to_2tuple(patch_size)
+
+        self.proj = nn.Conv2D(
+            in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
+
+    # NOTE(review): the "h" property reads index 1 and the "w" property reads
+    # index 0. to_2tuple stores the same value at both indices, so the two
+    # agree here; confirm the intended order before supporting non-square sizes.
+    @property
+    def num_patches_in_h(self):
+        return self.img_size[1] // self.patch_size[1]
+
+    @property
+    def num_patches_in_w(self):
+        return self.img_size[0] // self.patch_size[0]
+
+    def forward(self, x):
+        # returns the convolution output as a feature map; flattening into
+        # tokens is left to the caller
+        x = self.proj(x)
+        return x
+
+
+@manager.BACKBONES.add_component
+class VisionTransformer(nn.Layer):
+    """ Vision Transformer with support for patch input
+    """
+
+    def __init__(self,
+                 img_size=224,
+                 patch_size=16,
+                 in_chans=3,
+                 embed_dim=768,
+                 depth=12,
+                 num_heads=12,
+                 mlp_ratio=4,
+                 qkv_bias=False,
+                 qk_scale=None,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.,
+                 norm_layer='nn.LayerNorm',
+                 epsilon=1e-5,
+                 final_norm=False,
+                 pretrained=None,
+                 **args):
+        """Build the patch embedding, position/class tokens and ``depth`` blocks.
+
+        Args:
+            img_size (int): Image size used to size the position embedding.
+            patch_size (int): Kernel size and stride of the patch projection.
+            in_chans (int): Number of input image channels.
+            embed_dim (int): Token embedding width.
+            depth (int): Number of transformer blocks.
+            num_heads (int): Attention heads per block.
+            mlp_ratio (float): Ratio of MLP hidden width to ``embed_dim``.
+            qkv_bias (bool): Whether the qkv projection has a bias.
+            qk_scale (float | None): Overrides the default attention scale.
+            drop_rate (float): Dropout rate.
+            attn_drop_rate (float): Attention dropout rate.
+            drop_path_rate (float): Largest stochastic depth rate; per-block
+                rates are spaced linearly from 0 up to this value.
+            norm_layer (str): Expression naming the norm class; resolved
+                with ``eval``.
+            epsilon (float): Epsilon passed to the norm layers.
+            final_norm (bool): If True, create ``self.norm``.
+            pretrained (str, optional): Path or url of pretrained weights.
+            **args: Accepted but not used in this method.
+        """
+        super().__init__()
+        self.img_size = img_size
+        self.embed_dim = embed_dim
+
+        self.patch_embed = PatchEmbed(
+            img_size=img_size,
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim)
+        self.pos_w = self.patch_embed.num_patches_in_w
+        self.pos_h = self.patch_embed.num_patches_in_h
+
+        # one position per patch plus one extra slot for the class token
+        self.pos_embed = self.create_parameter(
+            shape=(1, self.pos_w * self.pos_h + 1, embed_dim),
+            default_initializer=paddle.nn.initializer.TruncatedNormal(std=.02))
+        self.cls_token = self.create_parameter(
+            shape=(1, 1, embed_dim),
+            default_initializer=paddle.nn.initializer.Constant(value=0.))
+        self.pos_drop = nn.Dropout(p=drop_rate)
+
+        # per-block stochastic depth rates, linearly spaced
+        dpr = np.linspace(0, drop_path_rate, depth)
+
+        self.blocks = nn.LayerList([
+            Block(
+                dim=embed_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop_rate,
+                attn_drop=attn_drop_rate,
+                drop_path=dpr[i],
+                norm_layer=norm_layer,
+                epsilon=epsilon) for i in range(depth)
+        ])
+
+        self.final_norm = final_norm
+        if self.final_norm:
+            # NOTE(security): eval() executes the norm_layer string as code;
+            # only pass trusted values.
+            self.norm = eval(norm_layer)(embed_dim, epsilon=epsilon)
+        self.pretrained = pretrained
+        self.init_weight()
+
+    def init_weight(self):
+        """Load pretrained weights and resize the checkpoint's ``pos_embed`` if needed.
+
+        When ``self.pretrained`` is None, only ``utils.load_pretrained_model``
+        is called and the position-embedding step is skipped.
+        """
+        utils.load_pretrained_model(self, self.pretrained)
+
+        # load and resize pos_embed
+        model_path = self.pretrained
+        if model_path is None:
+            # No checkpoint requested (the constructor default), so there is
+            # nothing to resize; os.path.exists(None) would raise TypeError.
+            return
+        if not os.path.exists(model_path):
+            model_path = utils.download_pretrained_model(model_path)
+
+        load_state_dict = paddle.load(model_path)
+        model_state_dict = self.state_dict()
+        pos_embed_name = "pos_embed"
+        if pos_embed_name in load_state_dict:
+            load_pos_embed = paddle.to_tensor(
+                load_state_dict[pos_embed_name], dtype="float32")
+            if self.pos_embed.shape != load_pos_embed.shape:
+                # the checkpoint grid is taken to be square; the leading
+                # slot belongs to the class token
+                pos_size = int(math.sqrt(load_pos_embed.shape[1] - 1))
+                model_state_dict[pos_embed_name] = self.resize_pos_embed(
+                    load_pos_embed, (pos_size, pos_size),
+                    (self.pos_h, self.pos_w))
+                self.set_dict(model_state_dict)
+                logger.info(
+                    "Load pos_embed and resize it from {} to {} .".format(
+                        load_pos_embed.shape, self.pos_embed.shape))
+
+    def resize_pos_embed(self, pos_embed, old_hw, new_hw):
+        """
+        Resize pos_embed weight.
+        Args:
+            pos_embed (Tensor): the pos_embed weight
+            old_hw (list[int]): the height and width of old pos_embed
+            new_hw (list[int]): the height and width of new pos_embed
+        Returns:
+            Tensor: the resized pos_embed weight
+        """
+        cls_pos_embed = pos_embed[:, :1, :]
+        pos_embed = pos_embed[:, 1:, :]
+
+        pos_embed = pos_embed.transpose([0, 2, 1])
+        pos_embed = pos_embed.reshape([1, -1, old_hw[0], old_hw[1]])
+        pos_embed = F.interpolate(
+            pos_embed, new_hw, mode='bicubic', align_corners=False)
+        pos_embed = pos_embed.flatten(2).transpose([0, 2, 1])
+        pos_embed = paddle.concat([cls_pos_embed, pos_embed], axis=1)
+
+        return pos_embed
+
+    def forward(self, x):
+        x = self.patch_embed(x)
+        x_shape = paddle.shape(x)  # b * c * h * w
+
+        cls_tokens = self.cls_token.expand((x_shape[0], -1, -1))
+        x = x.flatten(2).transpose([0, 2, 1])  # b * hw * c
+        x = paddle.concat([cls_tokens, x], axis=1)
+
+        if paddle.shape(x)[1] == self.pos_embed.shape[1]:
+            x = x + self.pos_embed
+        else:
+            x = x + self.resize_pos_embed(self.pos_embed,
+                                          (self.pos_h, self.pos_w), x_shape[2:])
+        x = self.pos_drop(x)
+
+        res = []
+        for idx, blk in enumerate(self.blocks):
+            x = blk(x)
+            if self.final_norm and idx == len(self.blocks) - 1:
+                x = self.norm(x)
+            res.append(x[:, 1:, :])
+
+        return res, x_shape
+
+
+@manager.BACKBONES.add_component
+def ViT_small_patch16_224(**kwargs):
+    model = VisionTransformer(
+        patch_size=16,
+        embed_dim=768,
+        depth=8,
+        num_heads=8,
+        mlp_ratio=3,
+        qk_scale=768**-0.5,
+        **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def ViT_base_patch16_224(**kwargs):
+    model = VisionTransformer(
+        patch_size=16,
+        embed_dim=768,
+        depth=12,
+        num_heads=12,
+        mlp_ratio=4,
+        qkv_bias=True,
+        epsilon=1e-6,
+        **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def ViT_base_patch16_384(**kwargs):
+    model = VisionTransformer(
+        img_size=384,
+        patch_size=16,
+        embed_dim=768,
+        depth=12,
+        num_heads=12,
+        mlp_ratio=4,
+        qkv_bias=True,
+        epsilon=1e-6,
+        **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def ViT_base_patch32_384(**kwargs):
+    model = VisionTransformer(
+        img_size=384,
+        patch_size=32,
+        embed_dim=768,
+        depth=12,
+        num_heads=12,
+        mlp_ratio=4,
+        qkv_bias=True,
+        epsilon=1e-6,
+        **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def ViT_large_patch16_224(**kwargs):
+    model = VisionTransformer(
+        patch_size=16,
+        embed_dim=1024,
+        depth=24,
+        num_heads=16,
+        mlp_ratio=4,
+        qkv_bias=True,
+        epsilon=1e-6,
+        **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def ViT_large_patch16_384(**kwargs):
+    model = VisionTransformer(
+        img_size=384,
+        patch_size=16,
+        embed_dim=1024,
+        depth=24,
+        num_heads=16,
+        mlp_ratio=4,
+        qkv_bias=True,
+        epsilon=1e-6,
+        **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def ViT_large_patch32_384(**kwargs):
+    model = VisionTransformer(
+        img_size=384,
+        patch_size=32,
+        embed_dim=1024,
+        depth=24,
+        num_heads=16,
+        mlp_ratio=4,
+        qkv_bias=True,
+        epsilon=1e-6,
+        **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def ViT_huge_patch16_224(**kwargs):
+    model = VisionTransformer(
+        patch_size=16,
+        embed_dim=1280,
+        depth=32,
+        num_heads=16,
+        mlp_ratio=4,
+        **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def ViT_huge_patch32_384(**kwargs):
+    model = VisionTransformer(
+        img_size=384,
+        patch_size=32,
+        embed_dim=1280,
+        depth=32,
+        num_heads=16,
+        mlp_ratio=4,
+        **kwargs)
+    return model
--- a/paddlers/models/ppseg/models/backbones/xception_deeplab.py
+++ b/paddlers/models/ppseg/models/backbones/xception_deeplab.py
@ -0,0 +1,415 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.utils import utils
+from paddlers.models.ppseg.models import layers
+
+__all__ = ["Xception41_deeplab", "Xception65_deeplab", "Xception71_deeplab"]
+
+
+def check_data(data, number):
+    if type(data) == int:
+        return [data] * number
+    assert len(data) == number
+    return data
+
+
+def check_stride(s, os):
+    if s <= os:
+        return True
+    else:
+        return False
+
+
+def check_points(count, points):
+    if points is None:
+        return False
+    else:
+        if isinstance(points, list):
+            return (True if count in points else False)
+        else:
+            return (True if count == points else False)
+
+
+def gen_bottleneck_params(backbone='xception_65'):
+    if backbone == 'xception_65':
+        bottleneck_params = {
+            "entry_flow": (3, [2, 2, 2], [128, 256, 728]),
+            "middle_flow": (16, 1, 728),
+            "exit_flow": (2, [2, 1], [[728, 1024, 1024], [1536, 1536, 2048]])
+        }
+    elif backbone == 'xception_41':
+        bottleneck_params = {
+            "entry_flow": (3, [2, 2, 2], [128, 256, 728]),
+            "middle_flow": (8, 1, 728),
+            "exit_flow": (2, [2, 1], [[728, 1024, 1024], [1536, 1536, 2048]])
+        }
+    elif backbone == 'xception_71':
+        bottleneck_params = {
+            "entry_flow": (5, [2, 1, 2, 1, 2], [128, 256, 256, 728, 728]),
+            "middle_flow": (16, 1, 728),
+            "exit_flow": (2, [2, 1], [[728, 1024, 1024], [1536, 1536, 2048]])
+        }
+    else:
+        raise ValueError(
+            "Xception backbont only support xception_41/xception_65/xception_71"
+        )
+    return bottleneck_params
+
+
+class ConvBNLayer(nn.Layer):
+    def __init__(self,
+                 input_channels,
+                 output_channels,
+                 filter_size,
+                 stride=1,
+                 padding=0,
+                 act=None,
+                 name=None):
+        super(ConvBNLayer, self).__init__()
+
+        self._conv = nn.Conv2D(
+            in_channels=input_channels,
+            out_channels=output_channels,
+            kernel_size=filter_size,
+            stride=stride,
+            padding=padding,
+            bias_attr=False)
+        self._bn = layers.SyncBatchNorm(
+            num_features=output_channels, epsilon=1e-3, momentum=0.99)
+
+        self._act_op = layers.Activation(act=act)
+
+    def forward(self, inputs):
+        return self._act_op(self._bn(self._conv(inputs)))
+
+
+class Seperate_Conv(nn.Layer):
+    def __init__(self,
+                 input_channels,
+                 output_channels,
+                 stride,
+                 filter,
+                 dilation=1,
+                 act=None,
+                 name=None):
+        super(Seperate_Conv, self).__init__()
+
+        self._conv1 = nn.Conv2D(
+            in_channels=input_channels,
+            out_channels=input_channels,
+            kernel_size=filter,
+            stride=stride,
+            groups=input_channels,
+            padding=(filter) // 2 * dilation,
+            dilation=dilation,
+            bias_attr=False)
+        self._bn1 = layers.SyncBatchNorm(
+            input_channels, epsilon=1e-3, momentum=0.99)
+
+        self._act_op1 = layers.Activation(act=act)
+
+        self._conv2 = nn.Conv2D(
+            input_channels,
+            output_channels,
+            1,
+            stride=1,
+            groups=1,
+            padding=0,
+            bias_attr=False)
+        self._bn2 = layers.SyncBatchNorm(
+            output_channels, epsilon=1e-3, momentum=0.99)
+
+        self._act_op2 = layers.Activation(act=act)
+
+    def forward(self, inputs):
+        x = self._conv1(inputs)
+        x = self._bn1(x)
+        x = self._act_op1(x)
+        x = self._conv2(x)
+        x = self._bn2(x)
+        x = self._act_op2(x)
+        return x
+
+
+class Xception_Block(nn.Layer):
+    def __init__(self,
+                 input_channels,
+                 output_channels,
+                 strides=1,
+                 filter_size=3,
+                 dilation=1,
+                 skip_conv=True,
+                 has_skip=True,
+                 activation_fn_in_separable_conv=False,
+                 name=None):
+        super(Xception_Block, self).__init__()
+
+        repeat_number = 3
+        output_channels = check_data(output_channels, repeat_number)
+        filter_size = check_data(filter_size, repeat_number)
+        strides = check_data(strides, repeat_number)
+
+        self.has_skip = has_skip
+        self.skip_conv = skip_conv
+        self.activation_fn_in_separable_conv = activation_fn_in_separable_conv
+        if not activation_fn_in_separable_conv:
+            self._conv1 = Seperate_Conv(
+                input_channels,
+                output_channels[0],
+                stride=strides[0],
+                filter=filter_size[0],
+                dilation=dilation,
+                name=name + "/separable_conv1")
+            self._conv2 = Seperate_Conv(
+                output_channels[0],
+                output_channels[1],
+                stride=strides[1],
+                filter=filter_size[1],
+                dilation=dilation,
+                name=name + "/separable_conv2")
+            self._conv3 = Seperate_Conv(
+                output_channels[1],
+                output_channels[2],
+                stride=strides[2],
+                filter=filter_size[2],
+                dilation=dilation,
+                name=name + "/separable_conv3")
+        else:
+            self._conv1 = Seperate_Conv(
+                input_channels,
+                output_channels[0],
+                stride=strides[0],
+                filter=filter_size[0],
+                act="relu",
+                dilation=dilation,
+                name=name + "/separable_conv1")
+            self._conv2 = Seperate_Conv(
+                output_channels[0],
+                output_channels[1],
+                stride=strides[1],
+                filter=filter_size[1],
+                act="relu",
+                dilation=dilation,
+                name=name + "/separable_conv2")
+            self._conv3 = Seperate_Conv(
+                output_channels[1],
+                output_channels[2],
+                stride=strides[2],
+                filter=filter_size[2],
+                act="relu",
+                dilation=dilation,
+                name=name + "/separable_conv3")
+
+        if has_skip and skip_conv:
+            self._short = ConvBNLayer(
+                input_channels,
+                output_channels[-1],
+                1,
+                stride=strides[-1],
+                padding=0,
+                name=name + "/shortcut")
+
+    def forward(self, inputs):
+        if not self.activation_fn_in_separable_conv:
+            x = F.relu(inputs)
+            x = self._conv1(x)
+            x = F.relu(x)
+            x = self._conv2(x)
+            x = F.relu(x)
+            x = self._conv3(x)
+        else:
+            x = self._conv1(inputs)
+            x = self._conv2(x)
+            x = self._conv3(x)
+        if self.has_skip is False:
+            return x
+        if self.skip_conv:
+            skip = self._short(inputs)
+        else:
+            skip = inputs
+        return x + skip
+
+
+class XceptionDeeplab(nn.Layer):
+    """
+    The Xception backobne of DeepLabv3+ implementation based on PaddlePaddle.
+
+    The original article refers to
+     Liang-Chieh Chen, et, al. "Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation"
+     (https://arxiv.org/abs/1802.02611)
+
+     Args:
+         backbone (str): Which type of Xception_DeepLab to select. It should be one of ('xception_41', 'xception_65', 'xception_71').
+         pretrained (str, optional): The path of pretrained model.
+         output_stride (int, optional): The stride of output features compared to input images. It is 8 or 16. Default: 16.
+
+    """
+
+    def __init__(self, backbone, pretrained=None, output_stride=16):
+
+        super(XceptionDeeplab, self).__init__()
+
+        bottleneck_params = gen_bottleneck_params(backbone)
+        self.backbone = backbone
+        self.feat_channels = [128, 2048]
+
+        self._conv1 = ConvBNLayer(
+            3,
+            32,
+            3,
+            stride=2,
+            padding=1,
+            act="relu",
+            name=self.backbone + "/entry_flow/conv1")
+        self._conv2 = ConvBNLayer(
+            32,
+            64,
+            3,
+            stride=1,
+            padding=1,
+            act="relu",
+            name=self.backbone + "/entry_flow/conv2")
+        """
+            bottleneck_params = {
+            "entry_flow": (3, [2, 2, 2], [128, 256, 728]),
+            "middle_flow": (16, 1, 728),
+            "exit_flow": (2, [2, 1], [[728, 1024, 1024], [1536, 1536, 2048]])
+        }
+
+        if output_stride == 16:
+            entry_block3_stride = 2
+            middle_block_dilation = 1
+            exit_block_dilations = (1, 2)
+        elif output_stride == 8:
+            entry_block3_stride = 1
+            middle_block_dilation = 2
+            exit_block_dilations = (2, 4)
+
+        """
+        self.block_num = bottleneck_params["entry_flow"][0]
+        self.strides = bottleneck_params["entry_flow"][1]
+        self.chns = bottleneck_params["entry_flow"][2]
+        self.strides = check_data(self.strides, self.block_num)
+        self.chns = check_data(self.chns, self.block_num)
+
+        self.entry_flow = []
+        self.middle_flow = []
+
+        self.stride = 2
+        self.output_stride = output_stride
+        s = self.stride
+
+        for i in range(self.block_num):
+            stride = self.strides[i] if check_stride(s * self.strides[i],
+                                                     self.output_stride) else 1
+            xception_block = self.add_sublayer(
+                self.backbone + "/entry_flow/block" + str(i + 1),
+                Xception_Block(
+                    input_channels=64 if i == 0 else self.chns[i - 1],
+                    output_channels=self.chns[i],
+                    strides=[1, 1, self.stride],
+                    name=self.backbone + "/entry_flow/block" + str(i + 1)))
+            self.entry_flow.append(xception_block)
+            s = s * stride
+        self.stride = s
+
+        self.block_num = bottleneck_params["middle_flow"][0]
+        self.strides = bottleneck_params["middle_flow"][1]
+        self.chns = bottleneck_params["middle_flow"][2]
+        self.strides = check_data(self.strides, self.block_num)
+        self.chns = check_data(self.chns, self.block_num)
+        s = self.stride
+
+        for i in range(self.block_num):
+            stride = self.strides[i] if check_stride(s * self.strides[i],
+                                                     self.output_stride) else 1
+            xception_block = self.add_sublayer(
+                self.backbone + "/middle_flow/block" + str(i + 1),
+                Xception_Block(
+                    input_channels=728,
+                    output_channels=728,
+                    strides=[1, 1, self.strides[i]],
+                    skip_conv=False,
+                    name=self.backbone + "/middle_flow/block" + str(i + 1)))
+            self.middle_flow.append(xception_block)
+            s = s * stride
+        self.stride = s
+
+        self.block_num = bottleneck_params["exit_flow"][0]
+        self.strides = bottleneck_params["exit_flow"][1]
+        self.chns = bottleneck_params["exit_flow"][2]
+        self.strides = check_data(self.strides, self.block_num)
+        self.chns = check_data(self.chns, self.block_num)
+        s = self.stride
+        stride = self.strides[0] if check_stride(s * self.strides[0],
+                                                 self.output_stride) else 1
+        self._exit_flow_1 = Xception_Block(
+            728,
+            self.chns[0], [1, 1, stride],
+            name=self.backbone + "/exit_flow/block1")
+        s = s * stride
+        stride = self.strides[1] if check_stride(s * self.strides[1],
+                                                 self.output_stride) else 1
+        self._exit_flow_2 = Xception_Block(
+            self.chns[0][-1],
+            self.chns[1], [1, 1, stride],
+            dilation=2,
+            has_skip=False,
+            activation_fn_in_separable_conv=True,
+            name=self.backbone + "/exit_flow/block2")
+
+        self.pretrained = pretrained
+        self.init_weight()
+
+    def forward(self, inputs):
+        x = self._conv1(inputs)
+        x = self._conv2(x)
+        feat_list = []
+        for i, ef in enumerate(self.entry_flow):
+            x = ef(x)
+            if i == 0:
+                feat_list.append(x)
+        for mf in self.middle_flow:
+            x = mf(x)
+        x = self._exit_flow_1(x)
+        x = self._exit_flow_2(x)
+        feat_list.append(x)
+        return feat_list
+
+    def init_weight(self):
+        if self.pretrained is not None:
+            utils.load_pretrained_model(self, self.pretrained)
+
+
+@manager.BACKBONES.add_component
+def Xception41_deeplab(**args):
+    model = XceptionDeeplab('xception_41', **args)
+    return model
+
+
+@manager.BACKBONES.add_component
+def Xception65_deeplab(**args):
+    model = XceptionDeeplab("xception_65", **args)
+    return model
+
+
+@manager.BACKBONES.add_component
+def Xception71_deeplab(**args):
+    model = XceptionDeeplab("xception_71", **args)
+    return model
--- a/paddlers/models/ppseg/models/bisenet.py
+++ b/paddlers/models/ppseg/models/bisenet.py
@ -0,0 +1,307 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg import utils
+from paddlers.models.ppseg.cvlibs import manager, param_init
+from paddlers.models.ppseg.models import layers
+
+
+@manager.MODELS.add_component
+class BiSeNetV2(nn.Layer):
+    """
+    The BiSeNet V2 implementation based on PaddlePaddle.
+
+    The original article refers to
+    Yu, Changqian, et al. "BiSeNet V2: Bilateral Network with Guided Aggregation for Real-time Semantic Segmentation"
+    (https://arxiv.org/abs/2004.02147)
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        lambd (float, optional): A factor for controlling the size of semantic branch channels. Default: 0.25.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 lambd=0.25,
+                 align_corners=False,
+                 pretrained=None):
+        super().__init__()
+
+        C1, C2, C3 = 64, 64, 128
+        db_channels = (C1, C2, C3)
+        C1, C3, C4, C5 = int(C1 * lambd), int(C3 * lambd), 64, 128
+        sb_channels = (C1, C3, C4, C5)
+        mid_channels = 128
+
+        self.db = DetailBranch(db_channels)
+        self.sb = SemanticBranch(sb_channels)
+
+        self.bga = BGA(mid_channels, align_corners)
+        self.aux_head1 = SegHead(C1, C1, num_classes)
+        self.aux_head2 = SegHead(C3, C3, num_classes)
+        self.aux_head3 = SegHead(C4, C4, num_classes)
+        self.aux_head4 = SegHead(C5, C5, num_classes)
+        self.head = SegHead(mid_channels, mid_channels, num_classes)
+
+        self.align_corners = align_corners
+        self.pretrained = pretrained
+        self.init_weight()
+
+    def forward(self, x):
+        dfm = self.db(x)
+        feat1, feat2, feat3, feat4, sfm = self.sb(x)
+        logit = self.head(self.bga(dfm, sfm))
+
+        if not self.training:
+            logit_list = [logit]
+        else:
+            logit1 = self.aux_head1(feat1)
+            logit2 = self.aux_head2(feat2)
+            logit3 = self.aux_head3(feat3)
+            logit4 = self.aux_head4(feat4)
+            logit_list = [logit, logit1, logit2, logit3, logit4]
+
+        logit_list = [
+            F.interpolate(
+                logit,
+                paddle.shape(x)[2:],
+                mode='bilinear',
+                align_corners=self.align_corners) for logit in logit_list
+        ]
+
+        return logit_list
+
+    def init_weight(self):
+        if self.pretrained is not None:
+            utils.load_entire_model(self, self.pretrained)
+        else:
+            for sublayer in self.sublayers():
+                if isinstance(sublayer, nn.Conv2D):
+                    param_init.kaiming_normal_init(sublayer.weight)
+                elif isinstance(sublayer, (nn.BatchNorm, nn.SyncBatchNorm)):
+                    param_init.constant_init(sublayer.weight, value=1.0)
+                    param_init.constant_init(sublayer.bias, value=0.0)
+
+
+class StemBlock(nn.Layer):
+    def __init__(self, in_dim, out_dim):
+        super(StemBlock, self).__init__()
+
+        self.conv = layers.ConvBNReLU(in_dim, out_dim, 3, stride=2)
+
+        self.left = nn.Sequential(
+            layers.ConvBNReLU(out_dim, out_dim // 2, 1),
+            layers.ConvBNReLU(out_dim // 2, out_dim, 3, stride=2))
+
+        self.right = nn.MaxPool2D(kernel_size=3, stride=2, padding=1)
+
+        self.fuse = layers.ConvBNReLU(out_dim * 2, out_dim, 3)
+
+    def forward(self, x):
+        x = self.conv(x)
+        left = self.left(x)
+        right = self.right(x)
+        concat = paddle.concat([left, right], axis=1)
+        return self.fuse(concat)
+
+
+class ContextEmbeddingBlock(nn.Layer):
+    def __init__(self, in_dim, out_dim):
+        super(ContextEmbeddingBlock, self).__init__()
+
+        self.gap = nn.AdaptiveAvgPool2D(1)
+        self.bn = layers.SyncBatchNorm(in_dim)
+
+        self.conv_1x1 = layers.ConvBNReLU(in_dim, out_dim, 1)
+        self.add = layers.Add()
+        self.conv_3x3 = nn.Conv2D(out_dim, out_dim, 3, 1, 1)
+
+    def forward(self, x):
+        gap = self.gap(x)
+        bn = self.bn(gap)
+        conv1 = self.add(self.conv_1x1(bn), x)
+        return self.conv_3x3(conv1)
+
+
+class GatherAndExpansionLayer1(nn.Layer):
+    """Gather And Expansion Layer with stride 1"""
+
+    def __init__(self, in_dim, out_dim, expand):
+        super().__init__()
+
+        expand_dim = expand * in_dim
+
+        self.conv = nn.Sequential(
+            layers.ConvBNReLU(in_dim, in_dim, 3),
+            layers.DepthwiseConvBN(in_dim, expand_dim, 3),
+            layers.ConvBN(expand_dim, out_dim, 1))
+        self.relu = layers.Activation("relu")
+
+    def forward(self, x):
+        return self.relu(self.conv(x) + x)
+
+
+class GatherAndExpansionLayer2(nn.Layer):
+    """Gather And Expansion Layer with stride 2"""
+
+    def __init__(self, in_dim, out_dim, expand):
+        super().__init__()
+
+        expand_dim = expand * in_dim
+
+        self.branch_1 = nn.Sequential(
+            layers.ConvBNReLU(in_dim, in_dim, 3),
+            layers.DepthwiseConvBN(in_dim, expand_dim, 3, stride=2),
+            layers.DepthwiseConvBN(expand_dim, expand_dim, 3),
+            layers.ConvBN(expand_dim, out_dim, 1))
+
+        self.branch_2 = nn.Sequential(
+            layers.DepthwiseConvBN(in_dim, in_dim, 3, stride=2),
+            layers.ConvBN(in_dim, out_dim, 1))
+
+        self.relu = layers.Activation("relu")
+
+    def forward(self, x):
+        return self.relu(self.branch_1(x) + self.branch_2(x))
+
+
+class DetailBranch(nn.Layer):
+    """The detail branch of BiSeNet, which has wide channels but shallow layers."""
+
+    def __init__(self, in_channels):
+        super().__init__()
+
+        C1, C2, C3 = in_channels
+
+        self.convs = nn.Sequential(
+            # stage 1
+            layers.ConvBNReLU(3, C1, 3, stride=2),
+            layers.ConvBNReLU(C1, C1, 3),
+            # stage 2
+            layers.ConvBNReLU(C1, C2, 3, stride=2),
+            layers.ConvBNReLU(C2, C2, 3),
+            layers.ConvBNReLU(C2, C2, 3),
+            # stage 3
+            layers.ConvBNReLU(C2, C3, 3, stride=2),
+            layers.ConvBNReLU(C3, C3, 3),
+            layers.ConvBNReLU(C3, C3, 3),
+        )
+
+    def forward(self, x):
+        return self.convs(x)
+
+
+class SemanticBranch(nn.Layer):
+    """The semantic branch of BiSeNet, which has narrow channels but deep layers."""
+
+    def __init__(self, in_channels):
+        super().__init__()
+        C1, C3, C4, C5 = in_channels
+
+        self.stem = StemBlock(3, C1)
+
+        self.stage3 = nn.Sequential(
+            GatherAndExpansionLayer2(C1, C3, 6),
+            GatherAndExpansionLayer1(C3, C3, 6))
+
+        self.stage4 = nn.Sequential(
+            GatherAndExpansionLayer2(C3, C4, 6),
+            GatherAndExpansionLayer1(C4, C4, 6))
+
+        self.stage5_4 = nn.Sequential(
+            GatherAndExpansionLayer2(C4, C5, 6),
+            GatherAndExpansionLayer1(C5, C5, 6),
+            GatherAndExpansionLayer1(C5, C5, 6),
+            GatherAndExpansionLayer1(C5, C5, 6))
+
+        self.ce = ContextEmbeddingBlock(C5, C5)
+
+    def forward(self, x):
+        stage2 = self.stem(x)
+        stage3 = self.stage3(stage2)
+        stage4 = self.stage4(stage3)
+        stage5_4 = self.stage5_4(stage4)
+        fm = self.ce(stage5_4)
+        return stage2, stage3, stage4, stage5_4, fm
+
+
+class BGA(nn.Layer):
+    """The Bilateral Guided Aggregation Layer, used to fuse the semantic features and spatial features."""
+
+    def __init__(self, out_dim, align_corners):
+        super().__init__()
+
+        self.align_corners = align_corners
+
+        self.db_branch_keep = nn.Sequential(
+            layers.DepthwiseConvBN(out_dim, out_dim, 3),
+            nn.Conv2D(out_dim, out_dim, 1))
+
+        self.db_branch_down = nn.Sequential(
+            layers.ConvBN(out_dim, out_dim, 3, stride=2),
+            nn.AvgPool2D(kernel_size=3, stride=2, padding=1))
+
+        self.sb_branch_keep = nn.Sequential(
+            layers.DepthwiseConvBN(out_dim, out_dim, 3),
+            nn.Conv2D(out_dim, out_dim, 1), layers.Activation(act='sigmoid'))
+
+        self.sb_branch_up = layers.ConvBN(out_dim, out_dim, 3)
+
+        self.conv = layers.ConvBN(out_dim, out_dim, 3)
+
+    def forward(self, dfm, sfm):
+        db_feat_keep = self.db_branch_keep(dfm)
+        db_feat_down = self.db_branch_down(dfm)
+        sb_feat_keep = self.sb_branch_keep(sfm)
+
+        sb_feat_up = self.sb_branch_up(sfm)
+        sb_feat_up = F.interpolate(
+            sb_feat_up,
+            paddle.shape(db_feat_keep)[2:],
+            mode='bilinear',
+            align_corners=self.align_corners)
+
+        sb_feat_up = F.sigmoid(sb_feat_up)
+        db_feat = db_feat_keep * sb_feat_up
+
+        sb_feat = db_feat_down * sb_feat_keep
+        sb_feat = F.interpolate(
+            sb_feat,
+            paddle.shape(db_feat)[2:],
+            mode='bilinear',
+            align_corners=self.align_corners)
+
+        return self.conv(db_feat + sb_feat)
+
+
+class SegHead(nn.Layer):
+    def __init__(self, in_dim, mid_dim, num_classes):
+        super().__init__()
+
+        self.conv_3x3 = nn.Sequential(
+            layers.ConvBNReLU(in_dim, mid_dim, 3), nn.Dropout(0.1))
+
+        self.conv_1x1 = nn.Conv2D(mid_dim, num_classes, 1, 1)
+
+    def forward(self, x):
+        conv1 = self.conv_3x3(x)
+        conv2 = self.conv_1x1(conv1)
+        return conv2
--- a/paddlers/models/ppseg/models/bisenetv1.py
+++ b/paddlers/models/ppseg/models/bisenetv1.py
@ -0,0 +1,259 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.models import layers
+from paddlers.models.ppseg.utils import utils
+
+
+@manager.MODELS.add_component
+class BiseNetV1(nn.Layer):
+    """
+    The BiSeNetV1 implementation based on PaddlePaddle.
+
+    The original article refers to
+    Yu, Changqian, et al. "BiSeNet: Bilateral Segmentation Network for Real-time Semantic Segmentation"
+    (https://paperswithcode.com/paper/bisenet-bilateral-segmentation-network-for)
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone (paddle.nn.Layer): Backbone network, currently support Resnet18_vd/Resnet34_vd/Resnet50_vd/Resnet101_vd.
+        conv_channel (int, optional): Channel width shared by the spatial path, the context path
+            and the feature fusion module. Default: 128.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+    """
+    def __init__(self,
+                 num_classes,
+                 backbone,
+                 conv_channel=128,
+                 pretrained=None):
+        super().__init__()
+        self.backbone = backbone
+        # The spatial path must emit `conv_channel` channels: its output is
+        # concatenated with the `conv_channel`-wide context output and fed to
+        # a fusion module that expects `conv_channel * 2` input channels.
+        self.spatial_path = SpatialPath(3, conv_channel)
+        # NOTE(review): 512 / 256 below assume the two deepest backbone stages
+        # output 512 and 256 channels -- confirm against the backbone in use.
+        self.global_context = nn.Sequential(
+            nn.AdaptiveAvgPool2D(1),
+            layers.ConvBNReLU(512, conv_channel, 1, bias_attr=False),
+        )
+
+        self.arms = nn.LayerList([
+            AttentionRefinement(512, conv_channel),
+            AttentionRefinement(256, conv_channel),
+        ])
+        self.refines = nn.LayerList([
+            layers.ConvBNReLU(conv_channel,
+                              conv_channel,
+                              3,
+                              stride=1,
+                              padding=1,
+                              bias_attr=False),
+            layers.ConvBNReLU(conv_channel,
+                              conv_channel,
+                              3,
+                              stride=1,
+                              padding=1,
+                              bias_attr=False),
+        ])
+
+        # Two auxiliary heads (context path stages) and one main head (fused
+        # features); only the main head is used outside training.
+        self.heads = nn.LayerList([
+            BiSeNetHead(conv_channel, num_classes, 8, True),
+            BiSeNetHead(conv_channel, num_classes, 8, True),
+            BiSeNetHead(conv_channel * 2, num_classes, 8, False),
+        ])
+
+        self.ffm = FeatureFusion(conv_channel * 2, conv_channel * 2, 1)
+
+        self.pretrained = pretrained
+        # Load the pretrained weights once every sub-layer exists, like the
+        # other models in this package do; without this call `pretrained`
+        # would be silently ignored.
+        self.init_weight()
+
+    def init_weight(self):
+        """Load the whole model from `self.pretrained` when it is set."""
+        if self.pretrained is not None:
+            utils.load_entire_model(self, self.pretrained)
+
+    def forward(self, x):
+        """Run the spatial and context paths and return a list of logits.
+
+        Args:
+            x (Tensor): Input image batch.
+
+        Returns:
+            list[Tensor]: In training mode, the outputs of all three heads
+                (two auxiliary, then the main one); otherwise a single-element
+                list holding the main head output.
+        """
+        spatial_out = self.spatial_path(x)
+        # Deepest feature first. Build a reversed copy instead of calling
+        # `.reverse()` so the list returned by the backbone is not mutated.
+        context_blocks = list(self.backbone(x))[::-1]
+
+        global_context = self.global_context(context_blocks[0])
+        global_context = F.interpolate(global_context,
+                                       size=paddle.shape(context_blocks[0])[2:],
+                                       mode='bilinear',
+                                       align_corners=True)
+        last_fm = global_context
+        pred_out = []
+
+        for i, (fm, arm, refine) in enumerate(
+                zip(context_blocks[:2], self.arms, self.refines)):
+            fm = arm(fm)
+            fm = fm + last_fm
+            # Upsample to the resolution of the next (shallower) stage.
+            last_fm = F.interpolate(fm,
+                                    size=paddle.shape(context_blocks[i +
+                                                                     1])[2:],
+                                    mode='bilinear',
+                                    align_corners=True)
+            last_fm = refine(last_fm)
+            pred_out.append(last_fm)
+        context_out = last_fm
+
+        concate_fm = self.ffm(spatial_out, context_out)
+        pred_out.append(concate_fm)
+
+        if self.training:
+            return [head(feat) for head, feat in zip(self.heads, pred_out)]
+        return [self.heads[-1](pred_out[-1])]
+
+
+class SpatialPath(nn.Layer):
+    """
+    SpatialPath module of BiseNetV1 model: three stride-2 ConvBNReLU stages
+    (7x7, 3x3, 3x3) followed by a 1x1 ConvBNReLU projection.
+
+    Args:
+        in_channels (int): The number of input channels in spatial path module.
+        out_channels (int): The number of output channels in spatial path module.
+        inner_channel (int, optional): The number of channels produced by the three
+            stride-2 stages. Default: 64.
+    """
+    def __init__(self, in_channels, out_channels, inner_channel=64):
+        super().__init__()
+
+        def strided_stage(src_channels, kernel, pad):
+            # Every downsampling stage shares stride 2 and the inner width.
+            return layers.ConvBNReLU(src_channels,
+                                     inner_channel,
+                                     kernel,
+                                     stride=2,
+                                     padding=pad,
+                                     bias_attr=False)
+
+        self.conv_7x7 = strided_stage(in_channels, 7, 3)
+        self.conv_3x3_1 = strided_stage(inner_channel, 3, 1)
+        self.conv_3x3_2 = strided_stage(inner_channel, 3, 1)
+        self.conv_1x1 = layers.ConvBNReLU(inner_channel,
+                                          out_channels,
+                                          1,
+                                          bias_attr=False)
+
+    def forward(self, x):
+        """Apply the four stages in order and return the result."""
+        stages = (self.conv_7x7, self.conv_3x3_1, self.conv_3x3_2,
+                  self.conv_1x1)
+        for stage in stages:
+            x = stage(x)
+        return x
+
+
+class BiSeNetHead(nn.Layer):
+    """
+    BiSeNet head of BiseNetV1 model: a 3x3 ConvBNReLU, a 1x1 classifier and an
+    optional bilinear upsampling.
+
+    Args:
+        in_channels (int): The number of input channels of the head.
+        out_channels (int): The number of output channels of the head.
+        scale (int, float): The scale factor of interpolation. No interpolation is
+            applied unless it is greater than 1.
+        is_aux (bool, optional): Selects the width of the intermediate 3x3 conv:
+            128 when True, 64 otherwise. Default: False.
+    """
+    def __init__(self, in_channels, out_channels, scale, is_aux=False):
+        super().__init__()
+        if is_aux:
+            hidden_channels = 128
+        else:
+            hidden_channels = 64
+        self.conv_3x3 = layers.ConvBNReLU(in_channels,
+                                          hidden_channels,
+                                          3,
+                                          stride=1,
+                                          padding=1,
+                                          bias_attr=False)
+        self.conv_1x1 = nn.Conv2D(hidden_channels, out_channels, 1)
+        self.scale = scale
+
+    def forward(self, x):
+        """Return the logits for `x`, upsampled by `self.scale` when it exceeds 1."""
+        logits = self.conv_1x1(self.conv_3x3(x))
+        if self.scale > 1:
+            return F.interpolate(logits,
+                                 scale_factor=self.scale,
+                                 mode='bilinear',
+                                 align_corners=True)
+        return logits
+
+
+class AttentionRefinement(nn.Layer):
+    """
+    AttentionRefinement module of BiseNetV1 model: a 3x3 ConvBNReLU whose output
+    is rescaled per channel by a sigmoid weight computed from its global average.
+
+    Args:
+        in_channels (int): The number of input channels of the module.
+        out_channels (int): The number of output channels of the module.
+    """
+    def __init__(self, in_channels, out_channels):
+        super().__init__()
+        self.conv_3x3 = layers.ConvBNReLU(in_channels,
+                                          out_channels,
+                                          3,
+                                          stride=1,
+                                          padding=1,
+                                          bias_attr=False)
+
+        # Global pooling -> 1x1 ConvBNReLU -> sigmoid.
+        squeeze = nn.AdaptiveAvgPool2D(1)
+        excite = layers.ConvBNReLU(out_channels,
+                                   out_channels,
+                                   1,
+                                   bias_attr=False)
+        self.channel_attention = nn.Sequential(squeeze, excite, nn.Sigmoid())
+
+    def forward(self, x):
+        """Return the 3x3 conv features scaled by their channel attention."""
+        feat = self.conv_3x3(x)
+        return feat * self.channel_attention(feat)
+
+
+class FeatureFusion(nn.Layer):
+    """
+    FeatureFusion module of BiseNetV1 model: concatenates two feature maps along
+    the channel axis, projects them with a 1x1 ConvBNReLU and adds a
+    channel-attention-weighted copy of the result to itself.
+
+    Args:
+        in_channels (int): The number of channels of the concatenated input.
+        out_channels (int): The number of output channels of the module.
+        reduction (int): A factor shrinks convolutional channels. Default: 1.
+    """
+    def __init__(self, in_channels, out_channels, reduction=1):
+        super().__init__()
+        self.conv_1x1 = layers.ConvBNReLU(in_channels,
+                                          out_channels,
+                                          1,
+                                          bias_attr=False)
+
+        # Global pooling -> bottleneck 1x1 conv -> expanding 1x1 conv -> sigmoid.
+        squeezed_channels = out_channels // reduction
+        pool = nn.AdaptiveAvgPool2D(1)
+        shrink = layers.ConvBNReLU(out_channels,
+                                   squeezed_channels,
+                                   1,
+                                   bias_attr=False)
+        expand = layers.ConvBNReLU(squeezed_channels,
+                                   out_channels,
+                                   1,
+                                   bias_attr=False)
+        self.channel_attention = nn.Sequential(pool, shrink, expand,
+                                               nn.Sigmoid())
+
+    def forward(self, x1, x2):
+        """Fuse `x1` and `x2` (same spatial size) into one feature map."""
+        fused = self.conv_1x1(paddle.concat([x1, x2], axis=1))
+        attention = self.channel_attention(fused)
+        return fused + fused * attention
--- a/paddlers/models/ppseg/models/danet.py
+++ b/paddlers/models/ppseg/models/danet.py
@ -0,0 +1,218 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.models import layers
+from paddlers.models.ppseg.utils import utils
+
+
+@manager.MODELS.add_component
+class DANet(nn.Layer):
+    """
+    The DANet implementation based on PaddlePaddle.
+
+    The original article refers to
+    Fu, jun, et al. "Dual Attention Network for Scene Segmentation"
+    (https://arxiv.org/pdf/1809.02983.pdf)
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone (Paddle.nn.Layer): A backbone network.
+        backbone_indices (tuple): The values in the tuple indicate the indices of
+            output of backbone.
+        align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
+            is even, e.g. 1024x512, otherwise it is True, e.g. 769x769.  Default: False.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 backbone,
+                 backbone_indices,
+                 align_corners=False,
+                 pretrained=None):
+        super().__init__()
+
+        self.backbone = backbone
+        self.backbone_indices = backbone_indices
+
+        # Channel counts of the backbone stages selected by `backbone_indices`.
+        selected_channels = []
+        for index in backbone_indices:
+            selected_channels.append(self.backbone.feat_channels[index])
+        self.head = DAHead(num_classes=num_classes, in_channels=selected_channels)
+
+        self.align_corners = align_corners
+        self.pretrained = pretrained
+        self.init_weight()
+
+    def forward(self, x):
+        """Return the head logits resized to the spatial size of `x`.
+
+        Outside training only the first logit produced by the head is kept.
+        """
+        backbone_feats = self.backbone(x)
+        selected = [backbone_feats[index] for index in self.backbone_indices]
+        logits = self.head(selected)
+        if not self.training:
+            logits = [logits[0]]
+
+        resized = []
+        for logit in logits:
+            resized.append(
+                F.interpolate(
+                    logit,
+                    paddle.shape(x)[2:],
+                    mode='bilinear',
+                    align_corners=self.align_corners,
+                    align_mode=1))
+        return resized
+
+    def init_weight(self):
+        """Load the whole model from `self.pretrained` when it is set."""
+        if self.pretrained is None:
+            return
+        utils.load_entire_model(self, self.pretrained)
+
+
+class DAHead(nn.Layer):
+    """
+    The Dual attention head.
+
+    Runs a position-attention branch (PAM) and a channel-attention branch (CAM)
+    on the last input feature map and classifies their sum. In training mode it
+    also emits three auxiliary logits: one per attention branch and one computed
+    directly from the input feature map.
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        in_channels (tuple): The number of input channels; only the last entry is used.
+    """
+
+    def __init__(self, num_classes, in_channels):
+        super().__init__()
+        in_channels = in_channels[-1]
+        inter_channels = in_channels // 4
+
+        self.channel_conv = layers.ConvBNReLU(in_channels, inter_channels, 3)
+        self.position_conv = layers.ConvBNReLU(in_channels, inter_channels, 3)
+        self.pam = PAM(inter_channels)
+        self.cam = CAM(inter_channels)
+        self.conv1 = layers.ConvBNReLU(inter_channels, inter_channels, 3)
+        self.conv2 = layers.ConvBNReLU(inter_channels, inter_channels, 3)
+
+        self.aux_head = nn.Sequential(
+            nn.Dropout2D(0.1), nn.Conv2D(in_channels, num_classes, 1))
+
+        self.aux_head_pam = nn.Sequential(
+            nn.Dropout2D(0.1), nn.Conv2D(inter_channels, num_classes, 1))
+
+        self.aux_head_cam = nn.Sequential(
+            nn.Dropout2D(0.1), nn.Conv2D(inter_channels, num_classes, 1))
+
+        self.cls_head = nn.Sequential(
+            nn.Dropout2D(0.1), nn.Conv2D(inter_channels, num_classes, 1))
+
+    def forward(self, feat_list):
+        """Compute the logits from the last feature map in `feat_list`.
+
+        Returns:
+            list[Tensor]: `[logit]` outside training; in training mode
+                `[logit, cam_logit, pam_logit, aux_logit]`.
+        """
+        feats = feat_list[-1]
+        channel_feats = self.channel_conv(feats)
+        channel_feats = self.cam(channel_feats)
+        channel_feats = self.conv1(channel_feats)
+
+        position_feats = self.position_conv(feats)
+        position_feats = self.pam(position_feats)
+        position_feats = self.conv2(position_feats)
+
+        feats_sum = position_feats + channel_feats
+        logit = self.cls_head(feats_sum)
+
+        if not self.training:
+            return [logit]
+
+        cam_logit = self.aux_head_cam(channel_feats)
+        # Each attention branch has its own auxiliary classifier; the position
+        # branch must go through `aux_head_pam`, not the CAM head.
+        pam_logit = self.aux_head_pam(position_feats)
+        aux_logit = self.aux_head(feats)
+        return [logit, cam_logit, pam_logit, aux_logit]
+
+
+class PAM(nn.Layer):
+    """Position attention module.
+
+    Computes a softmax similarity between all spatial positions from 1x1-conv
+    query/key projections, uses it to re-weight a 1x1-conv value projection and
+    adds the result, scaled by the learnable `gamma` (initialised to 0), to the
+    input.
+
+    Args:
+        in_channels (int): Number of channels of the input feature map. The
+            query/key projections use `in_channels // 8` channels.
+    """
+
+    def __init__(self, in_channels):
+        super().__init__()
+        self.mid_channels = in_channels // 8
+        self.in_channels = in_channels
+
+        self.query_conv = nn.Conv2D(in_channels, self.mid_channels, 1, 1)
+        self.key_conv = nn.Conv2D(in_channels, self.mid_channels, 1, 1)
+        self.value_conv = nn.Conv2D(in_channels, in_channels, 1, 1)
+
+        self.gamma = self.create_parameter(
+            shape=[1],
+            dtype='float32',
+            default_initializer=nn.initializer.Constant(0))
+
+    def forward(self, x):
+        """Return `gamma * attention(x) + x`, with the same shape as `x`."""
+        x_shape = paddle.shape(x)
+        qk_shape = (0, self.mid_channels, -1)
+        v_shape = (0, self.in_channels, -1)
+
+        # query: (n, h * w, c1)
+        query = paddle.transpose(
+            paddle.reshape(self.query_conv(x), qk_shape), (0, 2, 1))
+
+        # key: (n, c1, h * w)
+        key = paddle.reshape(self.key_conv(x), qk_shape)
+
+        # attention: (n, h * w, h * w), normalised over the last axis
+        attention = F.softmax(paddle.bmm(query, key), axis=-1)
+
+        # value: (n, c2, h * w)
+        value = paddle.reshape(self.value_conv(x), v_shape)
+        attention = paddle.transpose(attention, (0, 2, 1))
+
+        # weighted: (n, c2, h * w), restored to (n, c2, h, w)
+        weighted = paddle.bmm(value, attention)
+        weighted = paddle.reshape(
+            weighted, (0, self.in_channels, x_shape[2], x_shape[3]))
+
+        return self.gamma * weighted + x
+
+
+class CAM(nn.Layer):
+    """Channel attention module.
+
+    Computes a channel-by-channel similarity directly from the input (no
+    projections), uses it to re-weight the channels and adds the result, scaled
+    by the learnable `gamma` (initialised to 0), to the input.
+
+    Args:
+        channels (int): Number of channels of the input feature map.
+    """
+
+    def __init__(self, channels):
+        super().__init__()
+
+        self.channels = channels
+        self.gamma = self.create_parameter(
+            shape=[1],
+            dtype='float32',
+            default_initializer=nn.initializer.Constant(0))
+
+    def forward(self, x):
+        """Return `gamma * attention(x) + x`, with the same shape as `x`."""
+        x_shape = paddle.shape(x)
+        flat_shape = (0, self.channels, -1)
+
+        # query: (n, c, h * w)
+        query = paddle.reshape(x, flat_shape)
+        # key: (n, h * w, c)
+        key = paddle.transpose(paddle.reshape(x, flat_shape), (0, 2, 1))
+
+        # energy: (n, c, c)
+        energy = paddle.bmm(query, key)
+        # The danet author claims that this can avoid gradient divergence
+        row_max = paddle.max(energy, axis=-1, keepdim=True)
+        energy = row_max.tile([1, 1, self.channels]) - energy
+        attention = F.softmax(energy, axis=-1)
+
+        # weighted: (n, c, h * w), restored to (n, c, h, w)
+        value = paddle.reshape(x, flat_shape)
+        weighted = paddle.bmm(attention, value)
+        weighted = paddle.reshape(
+            weighted, (0, self.channels, x_shape[2], x_shape[3]))
+
+        return self.gamma * weighted + x
--- a/paddlers/models/ppseg/models/decoupled_segnet.py
+++ b/paddlers/models/ppseg/models/decoupled_segnet.py
@ -0,0 +1,228 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import cv2
+import numpy as np
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.models import layers
+from paddlers.models.ppseg.models.backbones import resnet_vd
+from paddlers.models.ppseg.models import deeplab
+from paddlers.models.ppseg.utils import utils
+
+
+@manager.MODELS.add_component
+class DecoupledSegNet(nn.Layer):
+    """
+    The DecoupledSegNet implementation based on PaddlePaddle.
+
+    The original article refers to
+    Xiangtai Li, et, al. "Improving Semantic Segmentation via Decoupled Body and Edge Supervision"
+    (https://arxiv.org/pdf/2007.10035.pdf)
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone (paddle.nn.Layer): Backbone network, currently support Resnet50_vd/Resnet101_vd.
+        backbone_indices (tuple, optional): Two values in the tuple indicate the indices of output of backbone.
+           Default: (0, 3).
+        aspp_ratios (tuple, optional): The dilation rate using in ASPP module.
+            If output_stride=16, aspp_ratios should be set as (1, 6, 12, 18).
+            If output_stride=8, aspp_ratios is (1, 12, 24, 36).
+            Default: (1, 6, 12, 18).
+        aspp_out_channels (int, optional): The output channels of ASPP module. Default: 256.
+        align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
+            e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 backbone,
+                 backbone_indices=(0, 3),
+                 aspp_ratios=(1, 6, 12, 18),
+                 aspp_out_channels=256,
+                 align_corners=False,
+                 pretrained=None):
+        super().__init__()
+        self.backbone = backbone
+        self.head = DecoupledSegNetHead(
+            num_classes, backbone_indices, self.backbone.feat_channels,
+            aspp_ratios, aspp_out_channels, align_corners)
+        self.align_corners = align_corners
+        self.pretrained = pretrained
+        self.init_weight()
+
+    def forward(self, x):
+        """Return the logits resized to the spatial size of `x`.
+
+        Returns:
+            list: `[seg_logit]` outside training; in training mode
+                `[seg_logit, body_logit, edge_logit, (seg_logit, edge_logit)]`.
+        """
+        head_logits = self.head(self.backbone(x))
+
+        resized = []
+        for logit in head_logits:
+            resized.append(
+                F.interpolate(
+                    logit,
+                    paddle.shape(x)[2:],
+                    mode='bilinear',
+                    align_corners=self.align_corners))
+        # The head yields exactly three maps: final segmentation, body, edge.
+        seg_logit, body_logit, edge_logit = resized
+
+        if not self.training:
+            return [seg_logit]
+        return [seg_logit, body_logit, edge_logit, (seg_logit, edge_logit)]
+
+    def init_weight(self):
+        """Load the whole model from `self.pretrained` when it is set."""
+        if self.pretrained is None:
+            return
+        utils.load_entire_model(self, self.pretrained)
+
+
+class DecoupledSegNetHead(nn.Layer):
+    """
+    The DecoupledSegNetHead implementation based on PaddlePaddle.
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone_indices (tuple): Two values in the tuple indicate the indices of output of backbone.
+            the first index will be taken as a low-level feature in Edge preservation component;
+            the second one will be taken as input of ASPP component.
+        backbone_channels (tuple): The channels of output of backbone.
+        aspp_ratios (tuple): The dilation rates using in ASPP module.
+        aspp_out_channels (int): The output channels of ASPP module. The body/edge
+            decoupling, edge fusion and final segmentation layers are sized from it.
+        align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
+            is even, e.g. 1024x512, otherwise it is True, e.g. 769x769.
+    """
+
+    def __init__(self, num_classes, backbone_indices, backbone_channels,
+                 aspp_ratios, aspp_out_channels, align_corners):
+        super().__init__()
+        self.backbone_indices = backbone_indices
+        self.align_corners = align_corners
+        self.aspp = layers.ASPPModule(
+            aspp_ratios=aspp_ratios,
+            in_channels=backbone_channels[backbone_indices[1]],
+            out_channels=aspp_out_channels,
+            align_corners=align_corners,
+            image_pooling=True)
+
+        # Width of the projected low-level feature and of the edge branch.
+        fine_channels = 48
+        self.bot_fine = nn.Conv2D(
+            backbone_channels[backbone_indices[0]],
+            fine_channels,
+            1,
+            bias_attr=False)
+        # decoupled
+        # Every layer below consumes the ASPP output (or a tensor of the same
+        # width), so it is sized from `aspp_out_channels` instead of a
+        # hard-coded 256; the default value yields the same shapes as before.
+        self.squeeze_body_edge = SqueezeBodyEdge(
+            aspp_out_channels, align_corners=self.align_corners)
+        self.edge_fusion = nn.Conv2D(
+            aspp_out_channels + fine_channels,
+            aspp_out_channels,
+            1,
+            bias_attr=False)
+        self.sigmoid_edge = nn.Sigmoid()
+        self.edge_out = nn.Sequential(
+            layers.ConvBNReLU(
+                in_channels=aspp_out_channels,
+                out_channels=fine_channels,
+                kernel_size=3,
+                bias_attr=False),
+            nn.Conv2D(fine_channels, 1, 1, bias_attr=False))
+        self.dsn_seg_body = nn.Sequential(
+            layers.ConvBNReLU(
+                in_channels=aspp_out_channels,
+                out_channels=256,
+                kernel_size=3,
+                bias_attr=False), nn.Conv2D(
+                    256, num_classes, 1, bias_attr=False))
+
+        # Input is the concatenation of the ASPP output and the body+edge sum.
+        self.final_seg = nn.Sequential(
+            layers.ConvBNReLU(
+                in_channels=aspp_out_channels * 2,
+                out_channels=256,
+                kernel_size=3,
+                bias_attr=False),
+            layers.ConvBNReLU(
+                in_channels=256,
+                out_channels=256,
+                kernel_size=3,
+                bias_attr=False),
+            nn.Conv2D(256, num_classes, kernel_size=1, bias_attr=False))
+
+    def forward(self, feat_list):
+        """Compute the final, body and edge predictions.
+
+        Args:
+            feat_list (list[Tensor]): Backbone outputs, indexed by
+                `self.backbone_indices`.
+
+        Returns:
+            list[Tensor]: `[seg_final_out, seg_body_out, seg_edge_out]`; the edge
+                output has one channel and has been passed through a sigmoid.
+        """
+        fine_fea = feat_list[self.backbone_indices[0]]
+        fine_size = paddle.shape(fine_fea)
+        x = feat_list[self.backbone_indices[1]]
+        aspp = self.aspp(x)
+
+        # decoupled
+        seg_body, seg_edge = self.squeeze_body_edge(aspp)
+        # Edge preservation and edge out
+        fine_fea = self.bot_fine(fine_fea)
+        seg_edge = F.interpolate(
+            seg_edge,
+            fine_size[2:],
+            mode='bilinear',
+            align_corners=self.align_corners)
+        seg_edge = self.edge_fusion(paddle.concat([seg_edge, fine_fea], axis=1))
+        seg_edge_out = self.edge_out(seg_edge)
+        seg_edge_out = self.sigmoid_edge(seg_edge_out)  # seg_edge output
+        seg_body_out = self.dsn_seg_body(seg_body)  # body out
+
+        # seg_final out
+        seg_out = seg_edge + F.interpolate(
+            seg_body,
+            fine_size[2:],
+            mode='bilinear',
+            align_corners=self.align_corners)
+        aspp = F.interpolate(
+            aspp,
+            fine_size[2:],
+            mode='bilinear',
+            align_corners=self.align_corners)
+        seg_out = paddle.concat([aspp, seg_out], axis=1)
+        seg_final_out = self.final_seg(seg_out)
+
+        return [seg_final_out, seg_body_out, seg_edge_out]
+
+
+class SqueezeBodyEdge(nn.Layer):
+    """Split a feature map into a flow-warped "body" part and a residual "edge" part.
+
+    Args:
+        inplane (int): Number of channels of the input feature map.
+        align_corners (bool, optional): An argument of F.interpolate used when
+            the downsampled features are resized back. Default: False.
+    """
+
+    def __init__(self, inplane, align_corners=False):
+        super().__init__()
+        self.align_corners = align_corners
+        # Two stride-2 convs with groups=inplane.
+        self.down = nn.Sequential(
+            layers.ConvBNReLU(
+                inplane, inplane, kernel_size=3, groups=inplane, stride=2),
+            layers.ConvBNReLU(
+                inplane, inplane, kernel_size=3, groups=inplane, stride=2))
+        # Predicts a 2-channel flow field from [input, resized downsampled input].
+        self.flow_make = nn.Conv2D(
+            inplane * 2, 2, kernel_size=3, padding='same', bias_attr=False)
+
+    def forward(self, x):
+        """Return `(body, edge)` where `edge = x - body`, both shaped like `x`."""
+        size = paddle.shape(x)[2:]
+        seg_down = self.down(x)
+        seg_down = F.interpolate(
+            seg_down,
+            size=size,
+            mode='bilinear',
+            align_corners=self.align_corners)
+        flow = self.flow_make(paddle.concat([x, seg_down], axis=1))
+        seg_flow_warp = self.flow_warp(x, flow, size)
+        seg_edge = x - seg_flow_warp
+        return seg_flow_warp, seg_edge
+
+    def flow_warp(self, input, flow, size):
+        """Resample `input` at a regular grid displaced by `flow`.
+
+        Args:
+            input (Tensor): Feature map of shape (n, c, h, w).
+            flow (Tensor): Flow field of shape (n, 2, h, w).
+            size (Tensor): The two spatial sizes `[h, w]` of `input`.
+
+        Returns:
+            Tensor: The output of F.grid_sample for the displaced grid.
+        """
+        # The flow is divided by [w, h] so that it lines up with the grid's
+        # last axis, which is concatenated in (w_grid, h_grid) order below.
+        norm = size[::-1].reshape([1, 1, 1, -1])
+        norm.stop_gradient = True
+        # Base grid in [-1, 1]: h_grid varies along rows, w_grid along columns;
+        # both end up shaped (h, w).
+        h_grid = paddle.linspace(-1.0, 1.0, size[0]).reshape([-1, 1])
+        h_grid = h_grid.tile([size[1]])
+        w_grid = paddle.linspace(-1.0, 1.0, size[1]).reshape([-1, 1])
+        w_grid = w_grid.tile([size[0]]).transpose([1, 0])
+        grid = paddle.concat([w_grid.unsqueeze(2), h_grid.unsqueeze(2)], axis=2)
+        # The (h, w, 2) grid broadcasts against the (n, h, w, 2) flow, so no
+        # explicit per-batch tiling is needed. The previous tile call discarded
+        # its result and had no effect.
+        grid = grid + paddle.transpose(flow, (0, 2, 3, 1)) / norm
+
+        output = F.grid_sample(input, grid)
+        return output
--- a/paddlers/models/ppseg/models/deeplab.py
+++ b/paddlers/models/ppseg/models/deeplab.py
@ -0,0 +1,308 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.models import layers
+from paddlers.models.ppseg.utils import utils
+
+__all__ = ['DeepLabV3P', 'DeepLabV3']
+
+
+@manager.MODELS.add_component
+class DeepLabV3P(nn.Layer):
+    """
+    DeepLabV3+ segmentation network built on PaddlePaddle.
+
+    Reference:
+        Liang-Chieh Chen, et al. "Encoder-Decoder with Atrous Separable Convolution
+        for Semantic Image Segmentation" (https://arxiv.org/abs/1802.02611)
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone (paddle.nn.Layer): Backbone network; Resnet50_vd/Resnet101_vd/Xception65
+            are the currently supported choices.
+        backbone_indices (tuple, optional): Two indices into the backbone outputs
+            (low-level feature, ASPP input). Default: (0, 3).
+        aspp_ratios (tuple, optional): Dilation rates of the ASPP module.
+            Use (1, 6, 12, 18) for output_stride=16 and (1, 12, 24, 36) for
+            output_stride=8. Default: (1, 6, 12, 18).
+        aspp_out_channels (int, optional): Output channels of the ASPP module. Default: 256.
+        align_corners (bool, optional): An argument of F.interpolate. Set it to False when
+            the feature size is even (e.g. 1024x512) and True otherwise (e.g. 769x769).
+            Default: False.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+        data_format (str, optional): Layout of the input, "NCHW" or "NHWC". Default: "NCHW".
+    """
+
+    def __init__(self,
+                 num_classes,
+                 backbone,
+                 backbone_indices=(0, 3),
+                 aspp_ratios=(1, 6, 12, 18),
+                 aspp_out_channels=256,
+                 align_corners=False,
+                 pretrained=None,
+                 data_format="NCHW"):
+        super().__init__()
+
+        self.backbone = backbone
+        self.align_corners = align_corners
+        self.pretrained = pretrained
+        self.data_format = data_format
+
+        # Channel counts of the backbone stages the head will consume.
+        selected_channels = []
+        for idx in backbone_indices:
+            selected_channels.append(backbone.feat_channels[idx])
+
+        self.head = DeepLabV3PHead(
+            num_classes,
+            backbone_indices,
+            selected_channels,
+            aspp_ratios,
+            aspp_out_channels,
+            align_corners,
+            data_format=data_format)
+
+        self.init_weight()
+
+    def forward(self, x):
+        """Run backbone and head, then resize every logit to the input's spatial size."""
+        logits = self.head(self.backbone(x))
+
+        # The spatial axes depend on the layout.
+        input_shape = paddle.shape(x)
+        if self.data_format == 'NCHW':
+            target_size = input_shape[2:]
+        else:
+            target_size = input_shape[1:3]
+
+        resized = []
+        for logit in logits:
+            resized.append(
+                F.interpolate(
+                    logit,
+                    target_size,
+                    mode='bilinear',
+                    align_corners=self.align_corners,
+                    data_format=self.data_format))
+        return resized
+
+    def init_weight(self):
+        """Load the whole model from `self.pretrained` when one was given."""
+        if self.pretrained is None:
+            return
+        utils.load_entire_model(self, self.pretrained)
+
+
+class DeepLabV3PHead(nn.Layer):
+    """
+    Head of DeepLabV3+: an ASPP module followed by the decoder.
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone_indices (tuple): Two indices into the backbone outputs. The first
+            selects the low-level feature fed to the decoder, the second selects the
+            input of the ASPP module. A backbone usually has four downsampling stages
+            and returns the output of each; (0, 3) therefore means the first stage is
+            the low-level feature and the fourth stage goes to ASPP.
+        backbone_channels (tuple): Channel counts matching "backbone_indices", same length.
+        aspp_ratios (tuple): Dilation rates of the ASPP module.
+        aspp_out_channels (int): Output channels of the ASPP module.
+        align_corners (bool): An argument of F.interpolate. Set it to False when the
+            output feature size is even (e.g. 1024x512) and True otherwise (e.g. 769x769).
+        data_format (str, optional): Layout of the input, "NCHW" or "NHWC". Default: "NCHW".
+    """
+
+    def __init__(self,
+                 num_classes,
+                 backbone_indices,
+                 backbone_channels,
+                 aspp_ratios,
+                 aspp_out_channels,
+                 align_corners,
+                 data_format='NCHW'):
+        super().__init__()
+
+        self.backbone_indices = backbone_indices
+        low_level_channels = backbone_channels[0]
+        aspp_in_channels = backbone_channels[1]
+
+        self.aspp = layers.ASPPModule(
+            aspp_ratios,
+            aspp_in_channels,
+            aspp_out_channels,
+            align_corners,
+            use_sep_conv=True,
+            image_pooling=True,
+            data_format=data_format)
+        self.decoder = Decoder(
+            num_classes,
+            low_level_channels,
+            align_corners,
+            data_format=data_format)
+
+    def forward(self, feat_list):
+        """Return a one-element list holding the decoder logit."""
+        low_idx, high_idx = self.backbone_indices[0], self.backbone_indices[1]
+        low_level_feat = feat_list[low_idx]
+        context = self.aspp(feat_list[high_idx])
+        return [self.decoder(context, low_level_feat)]
+
+
+@manager.MODELS.add_component
+class DeepLabV3(nn.Layer):
+    """
+    DeepLabV3 segmentation network built on PaddlePaddle.
+
+    Reference:
+        Liang-Chieh Chen, et al. "Rethinking Atrous Convolution for Semantic Image
+        Segmentation" (https://arxiv.org/pdf/1706.05587.pdf).
+
+    Args:
+        See DeepLabV3P; this model takes the same arguments except data_format,
+        and backbone_indices holds a single index by default.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 backbone,
+                 backbone_indices=(3, ),
+                 aspp_ratios=(1, 6, 12, 18),
+                 aspp_out_channels=256,
+                 align_corners=False,
+                 pretrained=None):
+        super().__init__()
+
+        self.backbone = backbone
+        self.align_corners = align_corners
+        self.pretrained = pretrained
+
+        # Channel counts of the backbone stages the head will consume.
+        head_in_channels = []
+        for idx in backbone_indices:
+            head_in_channels.append(backbone.feat_channels[idx])
+
+        self.head = DeepLabV3Head(
+            num_classes,
+            backbone_indices,
+            head_in_channels,
+            aspp_ratios,
+            aspp_out_channels,
+            align_corners)
+        self.init_weight()
+
+    def forward(self, x):
+        """Run backbone and head, then resize every logit to the input's H x W."""
+        logits = self.head(self.backbone(x))
+        resized = []
+        for logit in logits:
+            resized.append(
+                F.interpolate(
+                    logit,
+                    paddle.shape(x)[2:],
+                    mode='bilinear',
+                    align_corners=self.align_corners))
+        return resized
+
+    def init_weight(self):
+        """Load the whole model from `self.pretrained` when one was given."""
+        if self.pretrained is None:
+            return
+        utils.load_entire_model(self, self.pretrained)
+
+
+class DeepLabV3Head(nn.Layer):
+    """
+    Head of DeepLabV3: an ASPP module followed by a 1x1 classification conv.
+
+    Args:
+        See DeepLabV3PHead; only the first entry of backbone_indices and
+        backbone_channels is used here.
+    """
+
+    def __init__(self, num_classes, backbone_indices, backbone_channels,
+                 aspp_ratios, aspp_out_channels, align_corners):
+        super().__init__()
+
+        self.backbone_indices = backbone_indices
+        aspp_in_channels = backbone_channels[0]
+
+        self.aspp = layers.ASPPModule(
+            aspp_ratios,
+            aspp_in_channels,
+            aspp_out_channels,
+            align_corners,
+            use_sep_conv=False,
+            image_pooling=True)
+
+        self.cls = nn.Conv2D(
+            in_channels=aspp_out_channels,
+            out_channels=num_classes,
+            kernel_size=1)
+
+    def forward(self, feat_list):
+        """Return a one-element list holding the classification logit."""
+        selected = feat_list[self.backbone_indices[0]]
+        return [self.cls(self.aspp(selected))]
+
+
+class Decoder(nn.Layer):
+    """
+    Decoder of the DeepLabV3P model.
+
+    Args:
+        num_classes (int): The number of classes.
+        in_channels (int): Channels of the low-level feature entering the decoder.
+        align_corners (bool): An argument of F.interpolate.
+        data_format (str, optional): Layout of the input, "NCHW" or "NHWC". Default: "NCHW".
+    """
+
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 align_corners,
+                 data_format='NCHW'):
+        super(Decoder, self).__init__()
+
+        self.data_format = data_format
+        self.align_corners = align_corners
+
+        # Projects the low-level feature to 48 channels.
+        self.conv_bn_relu1 = layers.ConvBNReLU(
+            in_channels=in_channels,
+            out_channels=48,
+            kernel_size=1,
+            data_format=data_format)
+
+        # Consumes the concatenated tensor; 304 input channels are fixed here.
+        self.conv_bn_relu2 = layers.SeparableConvBNReLU(
+            in_channels=304,
+            out_channels=256,
+            kernel_size=3,
+            padding=1,
+            data_format=data_format)
+        self.conv_bn_relu3 = layers.SeparableConvBNReLU(
+            in_channels=256,
+            out_channels=256,
+            kernel_size=3,
+            padding=1,
+            data_format=data_format)
+        self.conv = nn.Conv2D(
+            in_channels=256,
+            out_channels=num_classes,
+            kernel_size=1,
+            data_format=data_format)
+
+    def forward(self, x, low_level_feat):
+        """Resize `x` to the low-level feature's size, concatenate and classify."""
+        low_level_feat = self.conv_bn_relu1(low_level_feat)
+
+        # Spatial axes and channel axis depend on the layout.
+        feat_shape = paddle.shape(low_level_feat)
+        if self.data_format == 'NCHW':
+            target_size, channel_axis = feat_shape[-2:], 1
+        else:
+            target_size, channel_axis = feat_shape[1:3], -1
+
+        upsampled = F.interpolate(
+            x,
+            target_size,
+            mode='bilinear',
+            align_corners=self.align_corners,
+            data_format=self.data_format)
+        merged = paddle.concat([upsampled, low_level_feat], axis=channel_axis)
+        merged = self.conv_bn_relu3(self.conv_bn_relu2(merged))
+        return self.conv(merged)
--- a/paddlers/models/ppseg/models/dmnet.py
+++ b/paddlers/models/ppseg/models/dmnet.py
@ -0,0 +1,149 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.models import layers
+from paddlers.models.ppseg.utils import utils
+
+
+@manager.MODELS.add_component
+class DMNet(nn.Layer):
+    """
+    DMNet segmentation network built on PaddlePaddle.
+
+    Reference:
+        Junjun He, Zhongying Deng, Yu Qiao. "Dynamic Multi-scale Filters for Semantic Segmentation"
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone (paddle.nn.Layer): Backbone network, currently support Resnet50_vd/Resnet101_vd.
+        mid_channels (int): The middle channels of convolution layer. Default: 512.
+        filter_sizes (list, tuple): Kernel sizes generated by the Dynamic Convolutional
+            Modules, one module per size. Default: [1, 3, 5, 7].
+        fusion (bool): Add one conv to fuse DCM output feature. Default: False.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 backbone,
+                 mid_channels=512,
+                 filter_sizes=[1, 3, 5, 7],
+                 fusion=False,
+                 pretrained=None):
+        super().__init__()
+        self.backbone = backbone
+        last_channels = self.backbone.feat_channels[-1]
+
+        self.dcm_modules = nn.LayerList()
+        for filter_size in filter_sizes:
+            dcm = DCM(filter_size, fusion, last_channels, mid_channels)
+            self.dcm_modules.append(dcm)
+
+        # The bottleneck sees the last backbone feature plus every DCM output.
+        fused_channels = last_channels + len(filter_sizes) * mid_channels
+        self.bottleneck = layers.ConvBNReLU(
+            fused_channels, mid_channels, 3, padding=1)
+        self.cls = nn.Conv2D(mid_channels, num_classes, 1)
+
+        self.fcn_head = nn.Sequential(
+            layers.ConvBNReLU(
+                self.backbone.feat_channels[2], mid_channels, 3, padding=1),
+            nn.Conv2D(mid_channels, num_classes, 1))
+
+        self.pretrained = pretrained
+        self.init_weight()
+
+    def init_weight(self):
+        """Load the whole model from `self.pretrained` when one was given."""
+        if self.pretrained is None:
+            return
+        utils.load_entire_model(self, self.pretrained)
+
+    def forward(self, x):
+        """Return [logit], plus the fcn_head logit when in training mode."""
+        feats = self.backbone(x)
+        last_feat = feats[-1]
+
+        branches = [last_feat]
+        branches.extend(dcm(last_feat) for dcm in self.dcm_modules)
+        fused = paddle.concat(branches, axis=1)
+
+        logit = self.cls(self.bottleneck(fused))
+        logit = F.interpolate(
+            logit, scale_factor=8, mode='bilinear', align_corners=True)
+        if not self.training:
+            return [logit]
+
+        aux_logit = self.fcn_head(feats[2])
+        aux_logit = F.interpolate(
+            aux_logit, scale_factor=8, mode='bilinear', align_corners=True)
+        return [logit, aux_logit]
+
+
+class DCM(nn.Layer):
+    """
+    Dynamic Convolutional Module used in DMNet.
+
+    Args:
+        filter_size (int): Size of the convolution kernel generated from the input.
+        fusion (bool): Add one conv to fuse DCM output feature.
+        in_channels (int): Input channels.
+        channels (int): Channels after modules, before conv_seg.
+    """
+
+    def __init__(self, filter_size, fusion, in_channels, channels):
+        super().__init__()
+        self.filter_size = filter_size
+        self.fusion = fusion
+        self.channels = channels
+
+        # Odd kernels get the same padding on all four sides; even kernels get
+        # one extra unit on the first side of each axis.
+        half, leftover = divmod(self.filter_size - 1, 2)
+        if leftover == 0:
+            self.pad = (half, half, half, half)
+        else:
+            self.pad = (half + 1, half, half + 1, half)
+
+        self.avg_pool = nn.AdaptiveAvgPool2D(filter_size)
+        self.filter_gen_conv = nn.Conv2D(in_channels, channels, 1)
+        self.input_redu_conv = layers.ConvBNReLU(in_channels, channels, 1)
+
+        self.norm = layers.SyncBatchNorm(channels)
+        self.act = nn.ReLU()
+
+        if self.fusion:
+            self.fusion_conv = layers.ConvBNReLU(channels, channels, 1)
+
+    def forward(self, x):
+        """Convolve the reduced input with kernels generated from its pooled copy."""
+        kernels = self.filter_gen_conv(self.avg_pool(x))
+        reduced = self.input_redu_conv(x)
+        b, c, h, w = reduced.shape
+
+        # Fold the batch into the channel axis so one grouped conv applies a
+        # separate kernel to every (sample, channel) pair.
+        num_groups = b * c
+        reduced = reduced.reshape([1, num_groups, h, w])
+        kernels = kernels.reshape(
+            [num_groups, 1, self.filter_size, self.filter_size])
+
+        padded = F.pad(reduced, self.pad, mode='constant', value=0)
+        result = F.conv2d(padded, weight=kernels, groups=num_groups)
+        result = result.reshape([b, self.channels, h, w])
+        result = self.act(self.norm(result))
+        if self.fusion:
+            result = self.fusion_conv(result)
+        return result
--- a/paddlers/models/ppseg/models/dnlnet.py
+++ b/paddlers/models/ppseg/models/dnlnet.py
@ -0,0 +1,226 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.models import layers
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.utils import utils
+
+
+@manager.MODELS.add_component
+class DNLNet(nn.Layer):
+    """Disentangled Non-Local Neural Networks.
+
+    Reference:
+        Minghao Yin, et al. "Disentangled Non-Local Neural Networks"
+        (https://arxiv.org/abs/2006.06668)
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone (Paddle.nn.Layer): A backbone network.
+        backbone_indices (tuple): Indices of the backbone outputs passed to the head.
+            Default: (2, 3).
+        reduction (int): Reduction factor of projection transform. Default: 2.
+        use_scale (bool): Whether to scale pairwise_weight by
+            sqrt(1/inter_channels). Default: True.
+        mode (str): The nonlocal mode. Options are 'embedded_gaussian',
+            'dot_product'. Default: 'embedded_gaussian'.
+        temperature (float): Temperature to adjust attention. Default: 0.05.
+        concat_input (bool): Whether concat the input and output of convs before
+            classification layer. Default: True.
+        enable_auxiliary_loss (bool, optional): Whether to add the auxiliary output.
+            Default: True.
+        align_corners (bool): An argument of F.interpolate. Set it to False when the
+            output feature size is even (e.g. 1024x512) and True otherwise (e.g. 769x769).
+            Default: False.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 backbone,
+                 backbone_indices=(2, 3),
+                 reduction=2,
+                 use_scale=True,
+                 mode='embedded_gaussian',
+                 temperature=0.05,
+                 concat_input=True,
+                 enable_auxiliary_loss=True,
+                 align_corners=False,
+                 pretrained=None):
+        super().__init__()
+        self.backbone = backbone
+        self.backbone_indices = backbone_indices
+        self.align_corners = align_corners
+        self.pretrained = pretrained
+
+        head_in_channels = []
+        for idx in backbone_indices:
+            head_in_channels.append(self.backbone.feat_channels[idx])
+        self.head = DNLHead(num_classes, head_in_channels, reduction,
+                            use_scale, mode, temperature, concat_input,
+                            enable_auxiliary_loss)
+        self.init_weight()
+
+    def forward(self, x):
+        """Run backbone and head, then resize every logit to the input's H x W."""
+        backbone_outs = self.backbone(x)
+        selected = [backbone_outs[idx] for idx in self.backbone_indices]
+
+        resized = []
+        for logit in self.head(selected):
+            resized.append(
+                F.interpolate(
+                    logit,
+                    paddle.shape(x)[2:],
+                    mode='bilinear',
+                    align_corners=self.align_corners,
+                    align_mode=1))
+        return resized
+
+    def init_weight(self):
+        """Load the whole model from `self.pretrained` when one was given."""
+        if self.pretrained is None:
+            return
+        utils.load_entire_model(self, self.pretrained)
+
+
+class DNLHead(nn.Layer):
+    """
+    The DNLNet head.
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        in_channels (tuple): Channels of the two input feature maps, in the order
+            they are passed to forward: in_channels[0] feeds the auxiliary branch
+            and in_channels[-1] feeds the main branch.
+        reduction (int): Reduction factor of projection transform.
+        use_scale (bool): Whether to scale pairwise_weight by
+            sqrt(1/inter_channels).
+        mode (str): The nonlocal mode. Options are 'embedded_gaussian',
+            'dot_product'.
+        temperature (float): Temperature to adjust attention.
+        concat_input (bool): Whether concat the input and output of convs before
+            classification layer. Default: True.
+        enable_auxiliary_loss (bool, optional): Whether forward also returns the
+            auxiliary output. Default: True.
+        **kwargs: Accepted and ignored.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 reduction,
+                 use_scale,
+                 mode,
+                 temperature,
+                 concat_input=True,
+                 enable_auxiliary_loss=True,
+                 **kwargs):
+        super(DNLHead, self).__init__()
+        self.in_channels = in_channels[-1]
+        self.concat_input = concat_input
+        self.enable_auxiliary_loss = enable_auxiliary_loss
+        inter_channels = self.in_channels // 4
+
+        self.dnl_block = DisentangledNonLocal2D(
+            in_channels=inter_channels,
+            reduction=reduction,
+            use_scale=use_scale,
+            temperature=temperature,
+            mode=mode)
+        self.conv0 = layers.ConvBNReLU(
+            in_channels=self.in_channels,
+            out_channels=inter_channels,
+            kernel_size=3,
+            bias_attr=False)
+        self.conv1 = layers.ConvBNReLU(
+            in_channels=inter_channels,
+            out_channels=inter_channels,
+            kernel_size=3,
+            bias_attr=False)
+        self.cls = nn.Sequential(
+            nn.Dropout2D(p=0.1), nn.Conv2D(inter_channels, num_classes, 1))
+        # The auxiliary branch consumes the first input feature (C3 in forward),
+        # so its input width follows in_channels[0] rather than a fixed 1024.
+        # For the default ResNet stages (2, 3) this is still 1024.
+        self.aux = nn.Sequential(
+            layers.ConvBNReLU(
+                in_channels=in_channels[0],
+                out_channels=256,
+                kernel_size=3,
+                bias_attr=False), nn.Dropout2D(p=0.1),
+            nn.Conv2D(256, num_classes, 1))
+        if self.concat_input:
+            self.conv_cat = layers.ConvBNReLU(
+                self.in_channels + inter_channels,
+                inter_channels,
+                kernel_size=3,
+                bias_attr=False)
+
+    def forward(self, feat_list):
+        """
+        Args:
+            feat_list (list): Exactly two feature maps, (C3, C4).
+
+        Returns:
+            list: [output, auxout] when the auxiliary loss is enabled, else [output].
+        """
+        C3, C4 = feat_list
+        output = self.conv0(C4)
+        output = self.dnl_block(output)
+        output = self.conv1(output)
+        if self.concat_input:
+            output = self.conv_cat(paddle.concat([C4, output], axis=1))
+        output = self.cls(output)
+        if self.enable_auxiliary_loss:
+            auxout = self.aux(C3)
+            return [output, auxout]
+        else:
+            return [output]
+
+
+class DisentangledNonLocal2D(layers.NonLocal2D):
+    """Disentangled Non-Local Blocks.
+
+    Adds a temperature-scaled pairwise attention and a separate unary attention
+    (a 1x1 conv mask followed by softmax) to layers.NonLocal2D.
+
+    Args:
+        temperature (float): Temperature to adjust attention.
+        *arg, **kwargs: Forwarded unchanged to layers.NonLocal2D.
+    """
+
+    def __init__(self, temperature, *arg, **kwargs):
+        super().__init__(*arg, **kwargs)
+        self.temperature = temperature
+        self.conv_mask = nn.Conv2D(self.in_channels, 1, kernel_size=1)
+
+    def embedded_gaussian(self, theta_x, phi_x):
+        """Return softmax(theta_x @ phi_x / scale / temperature) over the last axis."""
+        pairwise_weight = paddle.matmul(theta_x, phi_x)
+        if self.use_scale:
+            pairwise_weight /= theta_x.shape[-1]**0.5
+        pairwise_weight /= self.temperature
+        pairwise_weight = F.softmax(pairwise_weight, -1)
+        return pairwise_weight
+
+    def forward(self, x):
+        """
+        Args:
+            x (Tensor): Input feature map; axes 2 and 3 are used as height and width.
+
+        Returns:
+            Tensor: x + conv_out(pairwise term + unary term).
+        """
+        x_shape = paddle.shape(x)
+        g_x = self.g(x).reshape([0, self.inter_channels,
+                                 -1]).transpose([0, 2, 1])
+
+        if self.mode == "gaussian":
+            theta_x = paddle.transpose(
+                x.reshape([0, self.in_channels, -1]), [0, 2, 1])
+            # phi_x must be flattened to [N, C, H*W]. This is a reshape, not a
+            # transpose: [0, C, -1] is not a valid axis permutation.
+            if self.sub_sample:
+                phi_x = paddle.reshape(self.phi(x), [0, self.in_channels, -1])
+            else:
+                phi_x = paddle.reshape(x, [0, self.in_channels, -1])
+
+        elif self.mode == "concatenation":
+            theta_x = paddle.reshape(
+                self.theta(x), [0, self.inter_channels, -1, 1])
+            phi_x = paddle.reshape(self.phi(x), [0, self.inter_channels, 1, -1])
+
+        else:
+            theta_x = self.theta(x).reshape([0, self.inter_channels,
+                                             -1]).transpose([0, 2, 1])
+            phi_x = paddle.reshape(self.phi(x), [0, self.inter_channels, -1])
+
+        # Subtract the mean from both embeddings before the pairwise function.
+        theta_x -= paddle.mean(theta_x, axis=-2, keepdim=True)
+        phi_x -= paddle.mean(phi_x, axis=-1, keepdim=True)
+
+        # The pairwise function is looked up by mode name on this object.
+        pairwise_func = getattr(self, self.mode)
+        pairwise_weight = pairwise_func(theta_x, phi_x)
+
+        y = paddle.matmul(pairwise_weight, g_x).transpose([0, 2, 1]).reshape(
+            [0, self.inter_channels, x_shape[2], x_shape[3]])
+        # Unary term: one softmax-normalised mask over all positions, applied to g_x.
+        unary_mask = F.softmax(
+            paddle.reshape(self.conv_mask(x), [0, 1, -1]), -1)
+        unary_x = paddle.matmul(unary_mask, g_x).transpose([0, 2, 1]).reshape(
+            [0, self.inter_channels, 1, 1])
+        output = x + self.conv_out(y + unary_x)
+        return output
--- a/paddlers/models/ppseg/models/emanet.py
+++ b/paddlers/models/ppseg/models/emanet.py
@ -0,0 +1,215 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.models import layers
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.utils import utils
+
+
+@manager.MODELS.add_component
+class EMANet(nn.Layer):
+    """
+    Expectation Maximization Attention Networks for Semantic Segmentation based on PaddlePaddle.
+
+    Reference:
+        Xia Li, et al. "Expectation-Maximization Attention Networks for Semantic Segmentation"
+        (https://arxiv.org/abs/1907.13426)
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone (Paddle.nn.Layer): A backbone network.
+        backbone_indices (tuple): Indices of the backbone outputs passed to the head.
+            Default: (2, 3).
+        ema_channels (int): EMA module channels. Default: 512.
+        gc_channels (int): The input channels to Global Context Block. Default: 256.
+        num_bases (int): Number of bases. Default: 64.
+        stage_num (int): The iteration number for EM. Default: 3.
+        momentum (float): The parameter for updating bases. Default: 0.1.
+        concat_input (bool): Whether concat the input and output of convs before
+            classification layer. Default: True.
+        enable_auxiliary_loss (bool, optional): Whether to add the auxiliary output.
+            Default: True.
+        align_corners (bool): An argument of F.interpolate. Set it to False when the
+            output feature size is even (e.g. 1024x512) and True otherwise (e.g. 769x769).
+            Default: False.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 backbone,
+                 backbone_indices=(2, 3),
+                 ema_channels=512,
+                 gc_channels=256,
+                 num_bases=64,
+                 stage_num=3,
+                 momentum=0.1,
+                 concat_input=True,
+                 enable_auxiliary_loss=True,
+                 align_corners=False,
+                 pretrained=None):
+        super().__init__()
+
+        self.backbone = backbone
+        self.backbone_indices = backbone_indices
+        self.align_corners = align_corners
+        self.pretrained = pretrained
+
+        head_in_channels = []
+        for idx in backbone_indices:
+            head_in_channels.append(self.backbone.feat_channels[idx])
+        self.head = EMAHead(num_classes, head_in_channels, ema_channels,
+                            gc_channels, num_bases, stage_num, momentum,
+                            concat_input, enable_auxiliary_loss)
+        self.init_weight()
+
+    def forward(self, x):
+        """Run backbone and head, then resize every logit to the input's H x W."""
+        backbone_outs = self.backbone(x)
+        selected = [backbone_outs[idx] for idx in self.backbone_indices]
+
+        resized = []
+        for logit in self.head(selected):
+            resized.append(
+                F.interpolate(
+                    logit,
+                    paddle.shape(x)[2:],
+                    mode='bilinear',
+                    align_corners=self.align_corners))
+        return resized
+
+    def init_weight(self):
+        """Load the whole model from `self.pretrained` when one was given."""
+        if self.pretrained is None:
+            return
+        utils.load_entire_model(self, self.pretrained)
+
+
+class EMAHead(nn.Layer):
+    """
+    The EMANet head.
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        in_channels (tuple): Channels of the two input feature maps, in the order
+            they are passed to forward: in_channels[0] feeds the auxiliary branch
+            and in_channels[-1] feeds the main branch.
+        ema_channels (int): EMA module channels.
+        gc_channels (int): The input channels to Global Context Block.
+        num_bases (int): Number of bases.
+        stage_num (int): The iteration number for EM.
+        momentum (float): The parameter for updating bases.
+        concat_input (bool): Whether concat the input and output of convs before
+            classification layer. Default: True.
+        enable_auxiliary_loss (bool, optional): Whether forward also returns the
+            auxiliary output. Default: True.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 ema_channels,
+                 gc_channels,
+                 num_bases,
+                 stage_num,
+                 momentum,
+                 concat_input=True,
+                 enable_auxiliary_loss=True):
+        super(EMAHead, self).__init__()
+
+        self.in_channels = in_channels[-1]
+        self.concat_input = concat_input
+        self.enable_auxiliary_loss = enable_auxiliary_loss
+
+        self.emau = EMAU(ema_channels, num_bases, stage_num, momentum=momentum)
+        self.ema_in_conv = layers.ConvBNReLU(
+            in_channels=self.in_channels,
+            out_channels=ema_channels,
+            kernel_size=3)
+        self.ema_mid_conv = nn.Conv2D(ema_channels, ema_channels, kernel_size=1)
+        self.ema_out_conv = layers.ConvBNReLU(
+            in_channels=ema_channels, out_channels=ema_channels, kernel_size=1)
+        self.bottleneck = layers.ConvBNReLU(
+            in_channels=ema_channels, out_channels=gc_channels, kernel_size=3)
+        self.cls = nn.Sequential(
+            nn.Dropout2D(p=0.1), nn.Conv2D(gc_channels, num_classes, 1))
+        # The auxiliary branch consumes the first input feature (C3 in forward),
+        # so its input width follows in_channels[0] rather than a fixed 1024.
+        # For the default ResNet stages (2, 3) this is still 1024.
+        self.aux = nn.Sequential(
+            layers.ConvBNReLU(
+                in_channels=in_channels[0], out_channels=256, kernel_size=3),
+            nn.Dropout2D(p=0.1), nn.Conv2D(256, num_classes, 1))
+        if self.concat_input:
+            self.conv_cat = layers.ConvBNReLU(
+                self.in_channels + gc_channels, gc_channels, kernel_size=3)
+
+    def forward(self, feat_list):
+        """
+        Args:
+            feat_list (list): Exactly two feature maps, (C3, C4).
+
+        Returns:
+            list: [output, auxout] when the auxiliary loss is enabled, else [output].
+        """
+        C3, C4 = feat_list
+        feats = self.ema_in_conv(C4)
+        # Kept for the residual connection around the EMA unit.
+        identity = feats
+        feats = self.ema_mid_conv(feats)
+        recon = self.emau(feats)
+        recon = F.relu(recon)
+        recon = self.ema_out_conv(recon)
+        output = F.relu(identity + recon)
+        output = self.bottleneck(output)
+        if self.concat_input:
+            output = self.conv_cat(paddle.concat([C4, output], axis=1))
+        output = self.cls(output)
+        if self.enable_auxiliary_loss:
+            auxout = self.aux(C3)
+            return [output, auxout]
+        else:
+            return [output]
+
+
+class EMAU(nn.Layer):
+    """The Expectation-Maximization Attention Unit (EMAU).
+
+    Args:
+        c (int): The input and output channel number.
+        k (int): The number of the bases.
+        stage_num (int): The iteration number for EM. Must be >= 1. Default: 3.
+        momentum (float): The parameter for updating bases. Default: 0.1.
+    """
+
+    def __init__(self, c, k, stage_num=3, momentum=0.1):
+        super(EMAU, self).__init__()
+        assert stage_num >= 1
+        self.stage_num = stage_num
+        self.momentum = momentum
+        self.c = c
+
+        # The bases are L2-normalised along the channel axis and stored as a
+        # buffer named 'mu' with shape [1, c, k].
+        tmp_mu = self.create_parameter(
+            shape=[1, c, k],
+            default_initializer=paddle.nn.initializer.KaimingNormal(k))
+        mu = F.normalize(paddle.to_tensor(tmp_mu), axis=1, p=2)
+        self.register_buffer('mu', mu)
+
+    def forward(self, x):
+        """
+        Args:
+            x (Tensor): Input feature map; axes 2 and 3 are used as height and width.
+
+        Returns:
+            Tensor: The feature map rebuilt from the bases, reshaped to
+                [N, c, H, W].
+        """
+        x_shape = paddle.shape(x)
+        x = x.flatten(2)
+        mu = paddle.tile(self.mu, [x_shape[0], 1, 1])
+
+        # The EM iterations run without gradient tracking.
+        with paddle.no_grad():
+            for i in range(self.stage_num):
+                x_t = paddle.transpose(x, [0, 2, 1])
+                z = paddle.bmm(x_t, mu)
+                z = F.softmax(z, axis=2)
+                z_ = F.normalize(z, axis=1, p=1)
+                mu = paddle.bmm(x, z_)
+                mu = F.normalize(mu, axis=1, p=2)
+
+        z_t = paddle.transpose(z, [0, 2, 1])
+        x = paddle.matmul(mu, z_t)
+        x = paddle.reshape(x, [0, self.c, x_shape[2], x_shape[3]])
+
+        if self.training:
+            # Moving-average update of the stored bases from the batch mean.
+            mu = paddle.mean(mu, 0, keepdim=True)
+            mu = F.normalize(mu, axis=1, p=2)
+            mu = self.mu * (1 - self.momentum) + mu * self.momentum
+            if paddle.distributed.get_world_size() > 1:
+                # all_reduce sums into `mu` in place. Its return value is not the
+                # reduced tensor in every Paddle version (it can be None or a
+                # task object), so it must not be assigned back to `mu`.
+                paddle.distributed.all_reduce(mu)
+                mu /= paddle.distributed.get_world_size()
+            self.mu = mu
+
+        return x
--- a/paddlers/models/ppseg/models/encnet.py
+++ b/paddlers/models/ppseg/models/encnet.py
@ -0,0 +1,224 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.models import layers
+from paddlers.models.ppseg.utils import utils
+
+
+@manager.MODELS.add_component
+class ENCNet(nn.Layer):
+    """
+    The ENCNet implementation based on PaddlePaddle.
+
+    The original article refers to
+    Hang Zhang, Kristin Dana, et, al. "Context Encoding for Semantic Segmentation".
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone (Paddle.nn.Layer): A backbone network.
+        backbone_indices (tuple): The values in the tuple indicate the indices of
+            output of backbone.
+        num_codes (int): The number of encoded words. Default: 32.
+        mid_channels (int): The channels of middle layers. Default: 512.
+        use_se_loss (int): Whether use semantic encoding loss. Default: True.
+        add_lateral (int): Whether use lateral convolution layers. Default: False.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+    """
+    def __init__(self,
+                 num_classes,
+                 backbone,
+                 backbone_indices=[1, 2, 3],
+                 num_codes=32,
+                 mid_channels=512,
+                 use_se_loss=True,
+                 add_lateral=False,
+                 pretrained=None):
+        super().__init__()
+        self.add_lateral = add_lateral
+        self.num_codes = num_codes
+        self.backbone = backbone
+        self.backbone_indices = backbone_indices
+        in_channels = [
+            self.backbone.feat_channels[index] for index in backbone_indices
+        ]
+
+        self.bottleneck = layers.ConvBNReLU(
+            in_channels[-1],
+            mid_channels,
+            3,
+            padding=1,
+        )
+        if self.add_lateral:
+            self.lateral_convs = nn.LayerList()
+            for in_ch in in_channels[:-1]:
+                self.lateral_convs.append(
+                    layers.ConvBNReLU(
+                        in_ch,
+                        mid_channels,
+                        1,
+                    ))
+            self.fusion = layers.ConvBNReLU(
+                len(in_channels) * mid_channels,
+                mid_channels,
+                3,
+                padding=1,
+            )
+
+        self.enc_module = EncModule(mid_channels, num_codes)
+        self.head = nn.Conv2D(mid_channels, num_classes, 1)
+
+        self.fcn_head = layers.AuxLayer(self.backbone.feat_channels[2],
+                                        mid_channels, num_classes)
+
+        self.use_se_loss = use_se_loss
+        if use_se_loss:
+            self.se_layer = nn.Linear(mid_channels, num_classes)
+
+        self.pretrained = pretrained
+        self.init_weight()
+
+    def init_weight(self):
+        if self.pretrained is not None:
+            utils.load_entire_model(self, self.pretrained)
+
+    def forward(self, inputs):
+        N, C, H, W = paddle.shape(inputs)
+        feats = self.backbone(inputs)
+        fcn_feat = feats[2]
+
+        feats = [feats[i] for i in self.backbone_indices]
+        feat = self.bottleneck(feats[-1])
+
+        if self.add_lateral:
+            laterals = []
+            for j, lateral_conv in enumerate(self.lateral_convs):
+                laterals.append(
+                    F.interpolate(lateral_conv(feats[j]),
+                                  size=paddle.shape(feat)[2:],
+                                  mode='bilinear',
+                                  align_corners=False))
+            feat = self.fusion(paddle.concat([feat, *laterals], 1))
+        encode_feat, feat = self.enc_module(feat)
+        out = self.head(feat)
+        out = F.interpolate(out,
+                            size=[H, W],
+                            mode='bilinear',
+                            align_corners=False)
+        output = [out]
+        if self.training:
+            fcn_out = self.fcn_head(fcn_feat)
+            fcn_out = F.interpolate(fcn_out,
+                                    size=[H, W],
+                                    mode='bilinear',
+                                    align_corners=False)
+            output.append(fcn_out)
+            if self.use_se_loss:
+                se_out = self.se_layer(encode_feat)
+                output.append(se_out)
+            return output
+        return output
+
+
+class Encoding(nn.Layer):
+    def __init__(self, channels, num_codes):
+        super().__init__()
+        self.channels, self.num_codes = channels, num_codes
+
+        std = 1 / ((channels * num_codes)**0.5)
+        self.codewords = self.create_parameter(
+            shape=(num_codes, channels),
+            default_initializer=nn.initializer.Uniform(-std, std),
+        )
+        self.scale = self.create_parameter(
+            shape=(num_codes, ),
+            default_initializer=nn.initializer.Uniform(-1, 0),
+        )
+        self.channels = channels
+
+    def scaled_l2(self, x, codewords, scale):
+        num_codes, channels = paddle.shape(codewords)
+        reshaped_scale = scale.reshape([1, 1, num_codes])
+        expanded_x = paddle.tile(x.unsqueeze(2), [1, 1, num_codes, 1])
+        reshaped_codewords = codewords.reshape([1, 1, num_codes, channels])
+
+        scaled_l2_norm = paddle.multiply(
+            reshaped_scale,
+            (expanded_x - reshaped_codewords).pow(2).sum(axis=3))
+        return scaled_l2_norm
+
+    def aggregate(self, assignment_weights, x, codewords):
+        num_codes, channels = paddle.shape(codewords)
+        reshaped_codewords = codewords.reshape([1, 1, num_codes, channels])
+        expanded_x = paddle.tile(x.unsqueeze(2), [1, 1, num_codes, 1])
+
+        encoded_feat = paddle.multiply(
+            assignment_weights.unsqueeze(3),
+            (expanded_x - reshaped_codewords)).sum(axis=1)
+        encoded_feat = paddle.reshape(encoded_feat,
+                                      [-1, self.num_codes, self.channels])
+        return encoded_feat
+
+    def forward(self, x):
+        x_dims = x.ndim
+        assert x_dims == 4, "The dimension of input tensor must equal 4, but got {}.".format(
+            x_dims)
+        assert paddle.shape(
+            x
+        )[1] == self.channels, "Encoding channels error, excepted {} but got {}.".format(
+            self.channels,
+            paddle.shape(x)[1])
+        batch_size = paddle.shape(x)[0]
+        x = x.reshape([batch_size, self.channels, -1]).transpose([0, 2, 1])
+        assignment_weights = F.softmax(self.scaled_l2(x, self.codewords,
+                                                      self.scale),
+                                       axis=2)
+        encoded_feat = self.aggregate(assignment_weights, x, self.codewords)
+        return encoded_feat
+
+
+class EncModule(nn.Layer):
+    def __init__(self, in_channels, num_codes):
+        super().__init__()
+        self.encoding_project = layers.ConvBNReLU(
+            in_channels,
+            in_channels,
+            1,
+        )
+        self.encoding = nn.Sequential(
+            Encoding(channels=in_channels, num_codes=num_codes),
+            nn.BatchNorm1D(num_codes),
+            nn.ReLU(),
+        )
+        self.fc = nn.Sequential(
+            nn.Linear(in_channels, in_channels),
+            nn.Sigmoid(),
+        )
+        self.in_channels = in_channels
+
+    def forward(self, x):
+        encoding_projection = self.encoding_project(x)
+        encoding_feat = self.encoding(encoding_projection)
+
+        encoding_feat = encoding_feat.mean(axis=1)
+        batch_size, _, _, _ = paddle.shape(x)
+
+        gamma = self.fc(encoding_feat)
+        y = gamma.reshape([batch_size, self.in_channels, 1, 1])
+        output = F.relu(x + x * y)
+        return encoding_feat, output
--- a/paddlers/models/ppseg/models/enet.py
+++ b/paddlers/models/ppseg/models/enet.py
@ -0,0 +1,622 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg import utils
+from paddlers.models.ppseg.models import layers
+from paddlers.models.ppseg.cvlibs import manager, param_init
+
+__all__ = ['ENet']
+
+
+@manager.MODELS.add_component
+class ENet(nn.Layer):
+    """
+    The ENet implementation based on PaddlePaddle.
+
+    The original article refers to
+        Adam Paszke, Abhishek Chaurasia, Sangpil Kim, Eugenio Culurciello, et al."ENet: A Deep Neural Network Architecture for Real-Time Semantic Segmentation"
+        (https://arxiv.org/abs/1606.02147).
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+        encoder_relu (bool, optional): When ``True`` ReLU is used as the activation
+            function; otherwise, PReLU is used. Default: False.
+        decoder_relu (bool, optional): When ``True`` ReLU is used as the activation
+            function; otherwise, PReLU is used. Default: True.
+    """
+    def __init__(self,
+                 num_classes,
+                 pretrained=None,
+                 encoder_relu=False,
+                 decoder_relu=True):
+        super(ENet, self).__init__()
+
+        self.numclasses = num_classes
+        # Initial block: fixed at 3 input channels, producing 16 channels.
+        self.initial_block = InitialBlock(3, 16, relu=encoder_relu)
+
+        # Stage 1 (encoder): downsample 16 -> 64 channels, then four regular
+        # bottlenecks. Pooling indices are kept for the decoder.
+        self.downsample1_0 = DownsamplingBottleneck(16,
+                                                    64,
+                                                    return_indices=True,
+                                                    dropout_prob=0.01,
+                                                    relu=encoder_relu)
+        self.regular1_1 = RegularBottleneck(64,
+                                            padding=1,
+                                            dropout_prob=0.01,
+                                            relu=encoder_relu)
+        self.regular1_2 = RegularBottleneck(64,
+                                            padding=1,
+                                            dropout_prob=0.01,
+                                            relu=encoder_relu)
+        self.regular1_3 = RegularBottleneck(64,
+                                            padding=1,
+                                            dropout_prob=0.01,
+                                            relu=encoder_relu)
+        self.regular1_4 = RegularBottleneck(64,
+                                            padding=1,
+                                            dropout_prob=0.01,
+                                            relu=encoder_relu)
+
+        # Stage 2 (encoder): downsample 64 -> 128 channels, then a mix of
+        # regular, dilated (2, 4, 8, 16) and asymmetric 5x1/1x5 bottlenecks.
+        self.downsample2_0 = DownsamplingBottleneck(64,
+                                                    128,
+                                                    return_indices=True,
+                                                    dropout_prob=0.1,
+                                                    relu=encoder_relu)
+        self.regular2_1 = RegularBottleneck(128,
+                                            padding=1,
+                                            dropout_prob=0.1,
+                                            relu=encoder_relu)
+        self.dilated2_2 = RegularBottleneck(128,
+                                            dilation=2,
+                                            padding=2,
+                                            dropout_prob=0.1,
+                                            relu=encoder_relu)
+        self.asymmetric2_3 = RegularBottleneck(128,
+                                               kernel_size=5,
+                                               padding=2,
+                                               asymmetric=True,
+                                               dropout_prob=0.1,
+                                               relu=encoder_relu)
+        self.dilated2_4 = RegularBottleneck(128,
+                                            dilation=4,
+                                            padding=4,
+                                            dropout_prob=0.1,
+                                            relu=encoder_relu)
+        self.regular2_5 = RegularBottleneck(128,
+                                            padding=1,
+                                            dropout_prob=0.1,
+                                            relu=encoder_relu)
+        self.dilated2_6 = RegularBottleneck(128,
+                                            dilation=8,
+                                            padding=8,
+                                            dropout_prob=0.1,
+                                            relu=encoder_relu)
+        self.asymmetric2_7 = RegularBottleneck(128,
+                                               kernel_size=5,
+                                               asymmetric=True,
+                                               padding=2,
+                                               dropout_prob=0.1,
+                                               relu=encoder_relu)
+        self.dilated2_8 = RegularBottleneck(128,
+                                            dilation=16,
+                                            padding=16,
+                                            dropout_prob=0.1,
+                                            relu=encoder_relu)
+
+        # Stage 3 (encoder): the same bottleneck sequence as stage 2, without
+        # a downsampling step.
+        self.regular3_0 = RegularBottleneck(128,
+                                            padding=1,
+                                            dropout_prob=0.1,
+                                            relu=encoder_relu)
+        self.dilated3_1 = RegularBottleneck(128,
+                                            dilation=2,
+                                            padding=2,
+                                            dropout_prob=0.1,
+                                            relu=encoder_relu)
+        self.asymmetric3_2 = RegularBottleneck(128,
+                                               kernel_size=5,
+                                               padding=2,
+                                               asymmetric=True,
+                                               dropout_prob=0.1,
+                                               relu=encoder_relu)
+        self.dilated3_3 = RegularBottleneck(128,
+                                            dilation=4,
+                                            padding=4,
+                                            dropout_prob=0.1,
+                                            relu=encoder_relu)
+        self.regular3_4 = RegularBottleneck(128,
+                                            padding=1,
+                                            dropout_prob=0.1,
+                                            relu=encoder_relu)
+        self.dilated3_5 = RegularBottleneck(128,
+                                            dilation=8,
+                                            padding=8,
+                                            dropout_prob=0.1,
+                                            relu=encoder_relu)
+        self.asymmetric3_6 = RegularBottleneck(128,
+                                               kernel_size=5,
+                                               asymmetric=True,
+                                               padding=2,
+                                               dropout_prob=0.1,
+                                               relu=encoder_relu)
+        self.dilated3_7 = RegularBottleneck(128,
+                                            dilation=16,
+                                            padding=16,
+                                            dropout_prob=0.1,
+                                            relu=encoder_relu)
+
+        # Stage 4 (decoder): upsample 128 -> 64 channels, then two regular
+        # bottlenecks.
+        self.upsample4_0 = UpsamplingBottleneck(128,
+                                                64,
+                                                dropout_prob=0.1,
+                                                relu=decoder_relu)
+        self.regular4_1 = RegularBottleneck(64,
+                                            padding=1,
+                                            dropout_prob=0.1,
+                                            relu=decoder_relu)
+        self.regular4_2 = RegularBottleneck(64,
+                                            padding=1,
+                                            dropout_prob=0.1,
+                                            relu=decoder_relu)
+
+        # Stage 5 (decoder): upsample 64 -> 16 channels, then one regular
+        # bottleneck.
+        self.upsample5_0 = UpsamplingBottleneck(64,
+                                                16,
+                                                dropout_prob=0.1,
+                                                relu=decoder_relu)
+        self.regular5_1 = RegularBottleneck(16,
+                                            padding=1,
+                                            dropout_prob=0.1,
+                                            relu=decoder_relu)
+        # Final stride-2 transposed convolution producing num_classes channels.
+        self.transposed_conv = nn.Conv2DTranspose(16,
+                                                  num_classes,
+                                                  kernel_size=3,
+                                                  stride=2,
+                                                  padding=1,
+                                                  bias_attr=False)
+
+        self.pretrained = pretrained
+        self.init_weight()
+
+    def forward(self, x):
+        """Run the encoder and decoder and return the result as ``[x]``.
+
+        The shape before each downsampling step is recorded so the matching
+        decoder step can be given it as ``output_size``.
+        """
+
+        # Shape of the raw input; its spatial part sizes the final output.
+        input_size = x.shape
+        x = self.initial_block(x)
+
+        # Shape before the stage-1 downsampling, reused by upsample5_0.
+        stage1_input_size = x.shape
+        x, max_indices1_0 = self.downsample1_0(x)
+        x = self.regular1_1(x)
+        x = self.regular1_2(x)
+        x = self.regular1_3(x)
+        x = self.regular1_4(x)
+
+        # Shape before the stage-2 downsampling, reused by upsample4_0.
+        stage2_input_size = x.shape
+        x, max_indices2_0 = self.downsample2_0(x)
+        x = self.regular2_1(x)
+        x = self.dilated2_2(x)
+        x = self.asymmetric2_3(x)
+        x = self.dilated2_4(x)
+        x = self.regular2_5(x)
+        x = self.dilated2_6(x)
+        x = self.asymmetric2_7(x)
+        x = self.dilated2_8(x)
+
+        x = self.regular3_0(x)
+        x = self.dilated3_1(x)
+        x = self.asymmetric3_2(x)
+        x = self.dilated3_3(x)
+        x = self.regular3_4(x)
+        x = self.dilated3_5(x)
+        x = self.asymmetric3_6(x)
+        x = self.dilated3_7(x)
+
+        # Decoder: unpool with the indices saved by the matching encoder stage.
+        x = self.upsample4_0(x, max_indices2_0, output_size=stage2_input_size)
+        x = self.regular4_1(x)
+        x = self.regular4_2(x)
+
+        x = self.upsample5_0(x, max_indices1_0, output_size=stage1_input_size)
+        x = self.regular5_1(x)
+        x = self.transposed_conv(x, output_size=input_size[2:])
+        return [x]
+
+    def init_weight(self):
+        """Load pretrained weights from ``self.pretrained`` when it is set."""
+        if self.pretrained is not None:
+            utils.load_pretrained_model(self, self.pretrained)
+
+
+class InitialBlock(nn.Layer):
+    """
+    The initial block is composed of two branches:
+    1. a main branch which performs a regular convolution with stride 2;
+    2. an extension branch which performs max-pooling.
+    Doing both operations in parallel and concatenating their results
+    allows for efficient downsampling and expansion. The main branch
+    outputs 13 feature maps while the extension branch outputs 3, for a
+    total of 16 feature maps after concatenation.
+
+    Args:
+        in_channels (int): the number of input channels.
+        out_channels (int): the number output channels.
+        kernel_size (int, optional): the kernel size of the filters used in
+            the convolution layer. Default: 3.
+        padding (int, optional): zero-padding added to both sides of the
+            input. Default: 0.
+        bias (bool, optional): Adds a learnable bias to the output if
+            ``True``. Default: False.
+        relu (bool, optional): When ``True`` ReLU is used as the activation
+            function; otherwise, PReLU is used. Default: True.
+    """
+    def __init__(self, in_channels, out_channels, bias=False, relu=True):
+        super(InitialBlock, self).__init__()
+
+        if relu:
+            activation = nn.ReLU
+        else:
+            activation = nn.PReLU
+
+        self.main_branch = nn.Conv2D(in_channels,
+                                     out_channels - 3,
+                                     kernel_size=3,
+                                     stride=2,
+                                     padding=1,
+                                     bias_attr=bias)
+
+        self.ext_branch = nn.MaxPool2D(3, stride=2, padding=1)
+
+        self.batch_norm = layers.SyncBatchNorm(out_channels)
+
+        self.out_activation = activation()
+
+    def forward(self, x):
+        main = self.main_branch(x)
+        ext = self.ext_branch(x)
+
+        out = paddle.concat((main, ext), 1)
+
+        out = self.batch_norm(out)
+
+        return self.out_activation(out)
+
+
+class RegularBottleneck(nn.Layer):
+    """
+    Regular bottlenecks are the main building block of ENet.
+    Main branch:
+    1. Shortcut connection.
+    Extension branch:
+    1. 1x1 convolution which decreases the number of channels by
+        ``internal_ratio``, also called a projection;
+    2. regular, dilated or asymmetric convolution;
+    3. 1x1 convolution which increases the number of channels back to
+        ``channels``, also called an expansion;
+    4. dropout as a regularizer.
+
+    Args:
+        channels (int): the number of input and output channels.
+        internal_ratio (int, optional): a scale factor applied to
+            ``channels`` used to compute the number of
+            channels after the projection. eg. given ``channels`` equal to 128 and
+            internal_ratio equal to 2 the number of channels after the projection
+            is 64. Default: 4.
+        kernel_size (int, optional): the kernel size of the filters used in
+            the convolution layer described above in item 2 of the extension
+            branch. Default: 3.
+        padding (int, optional): zero-padding added to both sides of the
+            input. Default: 0.
+        dilation (int, optional): spacing between kernel elements for the
+            convolution described in item 2 of the extension branch. Default: 1.
+            asymmetric (bool, optional): flags if the convolution described in
+            item 2 of the extension branch is asymmetric or not. Default: False.
+        dropout_prob (float, optional): probability of an element to be
+            zeroed. Default: 0 (no dropout).
+        bias (bool, optional): Adds a learnable bias to the output if
+            ``True``. Default: False.
+        relu (bool, optional): When ``True`` ReLU is used as the activation
+            function; otherwise, PReLU is used. Default: True.
+    """
+    def __init__(self,
+                 channels,
+                 internal_ratio=4,
+                 kernel_size=3,
+                 padding=0,
+                 dilation=1,
+                 asymmetric=False,
+                 dropout_prob=0,
+                 bias=False,
+                 relu=True):
+        super(RegularBottleneck, self).__init__()
+
+        if internal_ratio <= 1 or internal_ratio > channels:
+            raise RuntimeError(
+                "Value out of range. Expected value in the "
+                "interval [1, {0}], got internal_scale={1}.".format(
+                    channels, internal_ratio))
+
+        internal_channels = channels // internal_ratio
+
+        if relu:
+            activation = nn.ReLU
+        else:
+            activation = nn.PReLU
+
+        self.ext_conv1 = nn.Sequential(
+            nn.Conv2D(channels,
+                      internal_channels,
+                      kernel_size=1,
+                      stride=1,
+                      bias_attr=bias), layers.SyncBatchNorm(internal_channels),
+            activation())
+
+        if asymmetric:
+            self.ext_conv2 = nn.Sequential(
+                nn.Conv2D(internal_channels,
+                          internal_channels,
+                          kernel_size=(kernel_size, 1),
+                          stride=1,
+                          padding=(padding, 0),
+                          dilation=dilation,
+                          bias_attr=bias),
+                layers.SyncBatchNorm(internal_channels), activation(),
+                nn.Conv2D(internal_channels,
+                          internal_channels,
+                          kernel_size=(1, kernel_size),
+                          stride=1,
+                          padding=(0, padding),
+                          dilation=dilation,
+                          bias_attr=bias),
+                layers.SyncBatchNorm(internal_channels), activation())
+        else:
+            self.ext_conv2 = nn.Sequential(
+                nn.Conv2D(internal_channels,
+                          internal_channels,
+                          kernel_size=kernel_size,
+                          stride=1,
+                          padding=padding,
+                          dilation=dilation,
+                          bias_attr=bias),
+                layers.SyncBatchNorm(internal_channels), activation())
+
+        self.ext_conv3 = nn.Sequential(
+            nn.Conv2D(internal_channels,
+                      channels,
+                      kernel_size=1,
+                      stride=1,
+                      bias_attr=bias), layers.SyncBatchNorm(channels),
+            activation())
+
+        self.ext_regul = nn.Dropout2D(p=dropout_prob)
+
+        self.out_activation = activation()
+
+    def forward(self, x):
+        main = x
+
+        ext = self.ext_conv1(x)
+        ext = self.ext_conv2(ext)
+        ext = self.ext_conv3(ext)
+        ext = self.ext_regul(ext)
+
+        out = main + ext
+
+        return self.out_activation(out)
+
+
+class DownsamplingBottleneck(nn.Layer):
+    """
+    Downsampling bottlenecks further downsample the feature map size.
+    Main branch:
+    1. max pooling with stride 2; indices are saved to be used for
+        unpooling later.
+    Extension branch:
+    1. 2x2 convolution with stride 2 that decreases the number of channels
+        by ``internal_ratio``, also called a projection;
+    2. regular convolution (by default, 3x3);
+    3. 1x1 convolution which increases the number of channels to
+        ``out_channels``, also called an expansion;
+    4. dropout as a regularizer.
+
+    Args:
+        in_channels (int): the number of input channels.
+        out_channels (int): the number of output channels.
+        internal_ratio (int, optional): a scale factor applied to ``channels``
+            used to compute the number of channels after the projection. eg. given
+            ``channels`` equal to 128 and internal_ratio equal to 2 the number of
+            channels after the projection is 64. Default: 4.
+        return_indices (bool, optional):  if ``True``, will return the max
+            indices along with the outputs. Useful when unpooling later.
+        dropout_prob (float, optional): probability of an element to be
+            zeroed. Default: 0 (no dropout).
+        bias (bool, optional): Adds a learnable bias to the output if
+            ``True``. Default: False.
+        relu (bool, optional): When ``True`` ReLU is used as the activation
+            function; otherwise, PReLU is used. Default: True.
+    """
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 internal_ratio=4,
+                 return_indices=False,
+                 dropout_prob=0,
+                 bias=False,
+                 relu=True):
+        super(DownsamplingBottleneck, self).__init__()
+
+        self.return_indices = return_indices
+
+        if internal_ratio <= 1 or internal_ratio > in_channels:
+            raise RuntimeError(
+                "Value out of range. Expected value in the "
+                "interval [1, {0}], got internal_scale={1}. ".format(
+                    in_channels, internal_ratio))
+
+        internal_channels = in_channels // internal_ratio
+
+        if relu:
+            activation = nn.ReLU
+        else:
+            activation = nn.PReLU
+
+        self.main_max1 = nn.MaxPool2D(2, stride=2, return_mask=return_indices)
+
+        self.ext_conv1 = nn.Sequential(
+            nn.Conv2D(in_channels,
+                      internal_channels,
+                      kernel_size=2,
+                      stride=2,
+                      bias_attr=bias), layers.SyncBatchNorm(internal_channels),
+            activation())
+
+        self.ext_conv2 = nn.Sequential(
+            nn.Conv2D(internal_channels,
+                      internal_channels,
+                      kernel_size=3,
+                      stride=1,
+                      padding=1,
+                      bias_attr=bias), layers.SyncBatchNorm(internal_channels),
+            activation())
+
+        self.ext_conv3 = nn.Sequential(
+            nn.Conv2D(internal_channels,
+                      out_channels,
+                      kernel_size=1,
+                      stride=1,
+                      bias_attr=bias), layers.SyncBatchNorm(out_channels),
+            activation())
+
+        self.ext_regul = nn.Dropout2D(p=dropout_prob)
+
+        self.out_activation = activation()
+
+    def forward(self, x):
+        if self.return_indices:
+            main, max_indices = self.main_max1(x)
+        else:
+            main = self.main_max1(x)
+
+        ext = self.ext_conv1(x)
+        ext = self.ext_conv2(ext)
+        ext = self.ext_conv3(ext)
+        ext = self.ext_regul(ext)
+
+        n, ch_ext, h, w = ext.shape
+        ch_main = main.shape[1]
+        padding = paddle.zeros((n, ch_ext - ch_main, h, w))
+
+        main = paddle.concat((main, padding), 1)
+
+        out = main + ext
+
+        return self.out_activation(out), max_indices
+
+
+class UpsamplingBottleneck(nn.Layer):
+    """
+    The upsampling bottlenecks upsample the feature map resolution using max
+        pooling indices stored from the corresponding downsampling bottleneck.
+    Main branch:
+    1. 1x1 convolution with stride 1 that decreases the number of channels by
+        ``internal_ratio``, also called a projection;
+    2. max unpool layer using the max pool indices from the corresponding
+        downsampling max pool layer.
+    Extension branch:
+    1. 1x1 convolution with stride 1 that decreases the number of channels by
+        ``internal_ratio``, also called a projection;
+    2. transposed convolution (by default, 3x3);
+    3. 1x1 convolution which increases the number of channels to
+        ``out_channels``, also called an expansion;
+    4. dropout as a regularizer.
+
+    Args:
+        in_channels (int): the number of input channels.
+        out_channels (int): the number of output channels.
+        internal_ratio (int, optional): a scale factor applied to ``in_channels``
+            used to compute the number of channels after the projection. eg. given
+            ``in_channels`` equal to 128 and ``internal_ratio`` equal to 2 the number
+            of channels after the projection is 64. Default: 4.
+        dropout_prob (float, optional): probability of an element to be zeroed.
+            Default: 0 (no dropout).
+        bias (bool, optional): Adds a learnable bias to the output if ``True``.
+            Default: False.
+        relu (bool, optional): When ``True`` ReLU is used as the activation
+            function; otherwise, PReLU is used. Default: True.
+    """
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 internal_ratio=4,
+                 dropout_prob=0,
+                 bias=False,
+                 relu=True):
+        super(UpsamplingBottleneck, self).__init__()
+
+        if internal_ratio <= 1 or internal_ratio > in_channels:
+            raise RuntimeError(
+                "Value out of range. Expected value in the "
+                "interval [1, {0}], got internal_scale={1}. ".format(
+                    in_channels, internal_ratio))
+
+        internal_channels = in_channels // internal_ratio
+
+        if relu:
+            activation = nn.ReLU
+        else:
+            activation = nn.PReLU
+
+        self.main_conv1 = nn.Sequential(
+            nn.Conv2D(in_channels, out_channels, kernel_size=1, bias_attr=bias),
+            layers.SyncBatchNorm(out_channels))
+
+        self.ext_conv1 = nn.Sequential(
+            nn.Conv2D(in_channels,
+                      internal_channels,
+                      kernel_size=1,
+                      bias_attr=bias), layers.SyncBatchNorm(internal_channels),
+            activation())
+
+        self.ext_tconv1 = nn.Conv2DTranspose(internal_channels,
+                                             internal_channels,
+                                             kernel_size=2,
+                                             stride=2,
+                                             bias_attr=bias)
+        self.ext_tconv1_bnorm = layers.SyncBatchNorm(internal_channels)
+        self.ext_tconv1_activation = activation()
+
+        self.ext_conv2 = nn.Sequential(
+            nn.Conv2D(internal_channels,
+                      out_channels,
+                      kernel_size=1,
+                      bias_attr=bias), layers.SyncBatchNorm(out_channels))
+
+        self.ext_regul = nn.Dropout2D(p=dropout_prob)
+
+        self.out_activation = activation()
+
+    def forward(self, x, max_indices, output_size):
+        main = self.main_conv1(x)
+        main = F.max_unpool2d(main,
+                              max_indices,
+                              kernel_size=2,
+                              output_size=output_size)
+
+        ext = self.ext_conv1(x)
+        ext = self.ext_tconv1(ext, output_size=output_size[2:])
+        ext = self.ext_tconv1_bnorm(ext)
+        ext = self.ext_tconv1_activation(ext)
+        ext = self.ext_conv2(ext)
+        ext = self.ext_regul(ext)
+
+        out = main + ext
+
+        return self.out_activation(out)
--- a/paddlers/models/ppseg/models/espnet.py
+++ b/paddlers/models/ppseg/models/espnet.py
@ -0,0 +1,477 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg import utils
+from paddlers.models.ppseg.cvlibs import manager, param_init
+from paddlers.models.ppseg.models import layers
+
+
+@manager.MODELS.add_component
+class ESPNetV2(nn.Layer):
+    """
+    The ESPNetV2 implementation based on PaddlePaddle.
+
+    The original article refers to
+    Sachin Mehta, Mohammad Rastegari, Linda Shapiro, and Hannaneh Hajishirzi. "ESPNetv2: A Light-weight, Power Efficient, and General Purpose Convolutional Neural Network"
+    (https://arxiv.org/abs/1811.11431).
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        in_channels (int, optional): Number of input channels. Default: 3.
+        scale (float, optional): The scale of channels, only support scale <= 1.5 and scale == 2. Default: 1.0.
+        drop_prob (floa, optional): The probability of dropout. Default: 0.1.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+    """
+    def __init__(self,
+                 num_classes,
+                 in_channels=3,
+                 scale=1.0,
+                 drop_prob=0.1,
+                 pretrained=None):
+        super().__init__()
+        self.backbone = EESPNetBackbone(in_channels, drop_prob, scale)
+        self.in_channels = self.backbone.out_channels
+        self.proj_l4_c = layers.ConvBNPReLU(self.in_channels[3],
+                                            self.in_channels[2],
+                                            1,
+                                            stride=1,
+                                            bias_attr=False)
+        psp_size = 2 * self.in_channels[2]
+        self.eesp_psp = nn.Sequential(
+            EESP(psp_size,
+                 psp_size // 2,
+                 stride=1,
+                 branches=4,
+                 kernel_size_maximum=7),
+            PSPModule(psp_size // 2, psp_size // 2),
+        )
+
+        self.project_l3 = nn.Sequential(
+            nn.Dropout2D(p=drop_prob),
+            nn.Conv2D(psp_size // 2, num_classes, 1, 1, bias_attr=False),
+        )
+        self.act_l3 = BNPReLU(num_classes)
+        self.project_l2 = layers.ConvBNPReLU(self.in_channels[1] + num_classes,
+                                             num_classes,
+                                             1,
+                                             stride=1,
+                                             bias_attr=False)
+        self.project_l1 = nn.Sequential(
+            nn.Dropout2D(p=drop_prob),
+            nn.Conv2D(self.in_channels[0] + num_classes,
+                      num_classes,
+                      1,
+                      1,
+                      bias_attr=False),
+        )
+
+        self.pretrained = pretrained
+
+        self.init_weight()
+
+    def init_weight(self):
+        if self.pretrained is not None:
+            utils.load_entire_model(self, self.pretrained)
+
+    def hierarchical_upsample(self, x, factor=3):
+        for i in range(factor):
+            x = F.interpolate(x,
+                              scale_factor=2,
+                              mode='bilinear',
+                              align_corners=True)
+        return x
+
+    def forward(self, x):
+        out_l1, out_l2, out_l3, out_l4 = self.backbone(x)
+
+        out_l4_proj = self.proj_l4_c(out_l4)
+        l4_to_l3 = F.interpolate(out_l4_proj,
+                                 scale_factor=2,
+                                 mode='bilinear',
+                                 align_corners=True)
+        merged_l3 = self.eesp_psp(paddle.concat([out_l3, l4_to_l3], axis=1))
+        proj_merge_l3 = self.project_l3(merged_l3)
+        proj_merge_l3 = self.act_l3(proj_merge_l3)
+
+        l3_to_l2 = F.interpolate(proj_merge_l3,
+                                 scale_factor=2,
+                                 mode='bilinear',
+                                 align_corners=True)
+        merged_l2 = self.project_l2(paddle.concat([out_l2, l3_to_l2], axis=1))
+
+        l2_to_l1 = F.interpolate(merged_l2,
+                                 scale_factor=2,
+                                 mode='bilinear',
+                                 align_corners=True)
+        merged_l1 = self.project_l1(paddle.concat([out_l1, l2_to_l1], axis=1))
+
+        if self.training:
+            return [
+                F.interpolate(merged_l1,
+                              scale_factor=2,
+                              mode='bilinear',
+                              align_corners=True),
+                self.hierarchical_upsample(proj_merge_l3),
+            ]
+        else:
+            return [
+                F.interpolate(merged_l1,
+                              scale_factor=2,
+                              mode='bilinear',
+                              align_corners=True)
+            ]
+
+
+class BNPReLU(nn.Layer):
+    def __init__(self, out_channels, **kwargs):
+        super().__init__()
+        if 'data_format' in kwargs:
+            data_format = kwargs['data_format']
+        else:
+            data_format = 'NCHW'
+        self._batch_norm = layers.SyncBatchNorm(out_channels,
+                                                data_format=data_format)
+        self._prelu = layers.Activation("prelu")
+
+    def forward(self, x):
+        x = self._batch_norm(x)
+        x = self._prelu(x)
+        return x
+
+
+class EESP(nn.Layer):
+    """
+    EESP block, principle: reduce -> split -> transform -> merge
+
+    Args:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        stride (int, optional): Factor by which we should skip (useful for down-sampling). If 2, then down-samples the feature map by 2. Default: 1.
+        branches (int, optional): Number of branches. Default: 4.
+        kernel_size_maximum (int, optional): A maximum value of receptive field allowed for EESP block. Default: 7.
+        down_method (str, optional): Down sample or not, only support 'avg' and 'esp'(equivalent to stride is 2 or not). Default: 'esp'.
+    """
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 stride=1,
+                 branches=4,
+                 kernel_size_maximum=7,
+                 down_method='esp'):
+        super(EESP, self).__init__()
+        if out_channels % branches != 0:
+            raise RuntimeError(
+                "The out_channes for EESP should be factorized by branches, but out_channels={} cann't be factorized by branches={}"
+                .format(out_channels, branches))
+        assert down_method in [
+            'avg', 'esp'
+        ], "The down_method for EESP only support 'avg' or 'esp', but got down_method={}".format(
+            down_method)
+        self.in_channels = in_channels
+        self.stride = stride
+
+        in_branch_channels = int(out_channels / branches)
+        self.group_conv_in = layers.ConvBNPReLU(in_channels,
+                                                in_branch_channels,
+                                                1,
+                                                stride=1,
+                                                groups=branches,
+                                                bias_attr=False)
+
+        map_ksize_dilation = {
+            3: 1,
+            5: 2,
+            7: 3,
+            9: 4,
+            11: 5,
+            13: 6,
+            15: 7,
+            17: 8
+        }
+        self.kernel_sizes = []
+        for i in range(branches):
+            kernel_size = 3 + 2 * i
+            kernel_size = kernel_size if kernel_size <= kernel_size_maximum else 3
+            self.kernel_sizes.append(kernel_size)
+        self.kernel_sizes.sort()
+
+        self.spp_modules = nn.LayerList()
+        for i in range(branches):
+            dilation = map_ksize_dilation[self.kernel_sizes[i]]
+            self.spp_modules.append(
+                nn.Conv2D(in_branch_channels,
+                          in_branch_channels,
+                          kernel_size=3,
+                          padding='same',
+                          stride=stride,
+                          dilation=dilation,
+                          groups=in_branch_channels,
+                          bias_attr=False))
+        self.group_conv_out = layers.ConvBN(out_channels,
+                                            out_channels,
+                                            kernel_size=1,
+                                            stride=1,
+                                            groups=branches,
+                                            bias_attr=False)
+        self.bn_act = BNPReLU(out_channels)
+        self._act = nn.PReLU()
+        self.down_method = True if down_method == 'avg' else False
+
+    @paddle.jit.not_to_static
+    def convert_group_x(self, group_merge, x):
+        if x.shape == group_merge.shape:
+            group_merge += x
+
+        return group_merge
+
+    def forward(self, x):
+        group_out = self.group_conv_in(x)
+        output = [self.spp_modules[0](group_out)]
+
+        for k in range(1, len(self.spp_modules)):
+            output_k = self.spp_modules[k](group_out)
+            output_k = output_k + output[k - 1]
+            output.append(output_k)
+
+        group_merge = self.group_conv_out(
+            self.bn_act(paddle.concat(output, axis=1)))
+
+        if self.stride == 2 and self.down_method:
+            return group_merge
+
+        group_merge = self.convert_group_x(group_merge, x)
+        out = self._act(group_merge)
+        return out
+
+
+class PSPModule(nn.Layer):
+    def __init__(self, in_channels, out_channels, sizes=4):
+        super().__init__()
+        self.stages = nn.LayerList([
+            nn.Conv2D(in_channels,
+                      in_channels,
+                      kernel_size=3,
+                      stride=1,
+                      groups=in_channels,
+                      padding='same',
+                      bias_attr=False) for _ in range(sizes)
+        ])
+        self.project = layers.ConvBNPReLU(in_channels * (sizes + 1),
+                                          out_channels,
+                                          1,
+                                          stride=1,
+                                          bias_attr=False)
+
+    def forward(self, feats):
+        h, w = paddle.shape(feats)[2:4]
+        out = [feats]
+        for stage in self.stages:
+            feats = F.avg_pool2d(feats, kernel_size=3, stride=2, padding='same')
+            upsampled = F.interpolate(stage(feats),
+                                      size=[h, w],
+                                      mode='bilinear',
+                                      align_corners=True)
+            out.append(upsampled)
+        return self.project(paddle.concat(out, axis=1))
+
+
+class DownSampler(nn.Layer):
+    """
+    Down sampler.
+    Args:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        branches (int, optional): Number of branches. Default: 9.
+        kernel_size_maximum (int, optional): A maximum value of kernel_size for EESP block. Default: 9.
+        shortcut (bool, optional): Use shortcut or not. Default: True.
+    """
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 branches=4,
+                 kernel_size_maximum=9,
+                 shortcut=True):
+        super().__init__()
+        if out_channels < in_channels:
+            raise RuntimeError(
+                "The out_channes for DownSampler should be bigger than in_channels, but got in_channles={}, out_channels={}"
+                .format(in_channels, out_channels))
+        self.eesp = EESP(in_channels,
+                         out_channels - in_channels,
+                         stride=2,
+                         branches=branches,
+                         kernel_size_maximum=kernel_size_maximum,
+                         down_method='avg')
+        self.avg = nn.AvgPool2D(kernel_size=3, padding=1, stride=2)
+        if shortcut:
+            self.shortcut_layer = nn.Sequential(
+                layers.ConvBNPReLU(3, 3, 3, stride=1, bias_attr=False),
+                layers.ConvBN(3, out_channels, 1, stride=1, bias_attr=False),
+            )
+        self._act = nn.PReLU()
+
+    def forward(self, x, inputs=None):
+        avg_out = self.avg(x)
+        eesp_out = self.eesp(x)
+        output = paddle.concat([avg_out, eesp_out], axis=1)
+
+        if inputs is not None:
+            w1 = paddle.shape(avg_out)[2]
+            w2 = paddle.shape(inputs)[2]
+            
+            while w2 != w1:
+                inputs = F.avg_pool2d(inputs,
+                                      kernel_size=3,
+                                      padding=1,
+                                      stride=2)
+                w2 = paddle.shape(inputs)[2]
+            # import pdb
+            # pdb.set_trace()
+            output = output + self.shortcut_layer(inputs)
+        return self._act(output)
+
+
+class EESPNetBackbone(nn.Layer):
+    """
+    The EESPNetBackbone implementation based on PaddlePaddle.
+
+    The original article refers to
+    Sachin Mehta, Mohammad Rastegari, Linda Shapiro, and Hannaneh Hajishirzi. "ESPNetv2: A Light-weight, Power Efficient, and General Purpose Convolutional Neural Network"
+    (https://arxiv.org/abs/1811.11431).
+
+    Args:
+        in_channels (int, optional): Number of input channels. Default: 3.
+        drop_prob (float, optional): The probability of dropout. Default: 3.
+        scale (float, optional): The scale of channels, only support scale <= 1.5 and scale == 2. Default: 1.0.
+    """
+    def __init__(self, in_channels=3, drop_prob=0.1, scale=1.0):
+        super().__init__()
+        reps = [0, 3, 7, 3]
+
+        num_level = 4  # 1/2, 1/4, 1/8, 1/16
+        kernel_size_limitations = [13, 11, 9, 7]  # kernel size limitation
+        branch_list = [4] * len(
+            kernel_size_limitations)  # branches at different levels
+
+        base_channels = 32  # first conv output channels
+        channels_config = [base_channels] * num_level
+
+        for i in range(num_level):
+            if i == 0:
+                channels = int(base_channels * scale)
+                channels = math.ceil(channels / branch_list[0]) * branch_list[0]
+                channels_config[
+                    i] = base_channels if channels > base_channels else channels
+            else:
+                channels_config[i] = channels * pow(2, i)
+
+        self.level1 = layers.ConvBNPReLU(in_channels,
+                                         channels_config[0],
+                                         3,
+                                         stride=2,
+                                         bias_attr=False)
+
+        self.level2 = DownSampler(
+            channels_config[0],
+            channels_config[1],
+            branches=branch_list[0],
+            kernel_size_maximum=kernel_size_limitations[0],
+            shortcut=True)
+
+        self.level3_0 = DownSampler(
+            channels_config[1],
+            channels_config[2],
+            branches=branch_list[1],
+            kernel_size_maximum=kernel_size_limitations[1],
+            shortcut=True)
+        self.level3 = nn.LayerList()
+        for i in range(reps[1]):
+            self.level3.append(
+                EESP(channels_config[2],
+                     channels_config[2],
+                     stride=1,
+                     branches=branch_list[2],
+                     kernel_size_maximum=kernel_size_limitations[2]))
+
+        self.level4_0 = DownSampler(
+            channels_config[2],
+            channels_config[3],
+            branches=branch_list[2],
+            kernel_size_maximum=kernel_size_limitations[2],
+            shortcut=True)
+        self.level4 = nn.LayerList()
+        for i in range(reps[2]):
+            self.level4.append(
+                EESP(channels_config[3],
+                     channels_config[3],
+                     stride=1,
+                     branches=branch_list[3],
+                     kernel_size_maximum=kernel_size_limitations[3]))
+
+        self.out_channels = channels_config
+
+        self.init_params()
+
+    def init_params(self):
+        for m in self.sublayers():
+            if isinstance(m, nn.Conv2D):
+                param_init.kaiming_normal_init(m.weight)
+                if m.bias is not None:
+                    param_init.constant_init(m.bias, value=0.0)
+            elif isinstance(m, nn.BatchNorm2D):
+                param_init.constant_init(m.weight, value=1.0)
+                param_init.constant_init(m.bias, value=0.0)
+            elif isinstance(m, nn.Linear):
+                param_init.normal_init(m.weight, std=0.001)
+                if m.bias is not None:
+                    param_init.constant_init(m.bias, value=0.0)
+
+    def forward(self, x):
+        out_l1 = self.level1(x)
+        out_l2 = self.level2(out_l1, x)
+        out_l3 = self.level3_0(out_l2, x)
+        for i, layer in enumerate(self.level3):
+            out_l3 = layer(out_l3)
+        out_l4 = self.level4_0(out_l3, x)
+        for i, layer in enumerate(self.level4):
+            out_l4 = layer(out_l4)
+        return out_l1, out_l2, out_l3, out_l4
+
+
+if __name__ == '__main__':
+    import paddle
+    import numpy as np
+
+    paddle.enable_static()
+
+    startup_prog = paddle.static.default_startup_program()
+
+    exe = paddle.static.Executor(paddle.CPUPlace())
+    exe.run(startup_prog)
+    path_prefix = "./output/model"
+
+    [inference_program, feed_target_names, fetch_targets] = (
+        paddle.static.load_inference_model(path_prefix, exe))
+    print('inference_program:', inference_program)
+
+    tensor_img = np.array(np.random.random((1, 3, 1024, 2048)), dtype=np.float32)
+    results = exe.run(inference_program,
+                feed={feed_target_names[0]: tensor_img},
+                fetch_list=fetch_targets)
--- a/paddlers/models/ppseg/models/espnetv1.py
+++ b/paddlers/models/ppseg/models/espnetv1.py
@ -0,0 +1,308 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.models import layers
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.utils import utils
+
+
+@manager.MODELS.add_component
+class ESPNetV1(nn.Layer):
+    """
+    The ESPNetV1 implementation based on PaddlePaddle.
+
+    The original article refers to
+      Sachin Mehta1, Mohammad Rastegari, Anat Caspi, Linda Shapiro, and Hannaneh Hajishirzi. "ESPNet: Efficient Spatial Pyramid of Dilated Convolutions for Semantic Segmentation"
+      (https://arxiv.org/abs/1803.06815).
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        in_channels (int, optional): Number of input channels. Default: 3.
+        level2_depth (int, optional): Depth of DilatedResidualBlock. Default: 2.
+        level3_depth (int, optional): Depth of DilatedResidualBlock. Default: 3.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+    """
+    def __init__(self,
+                 num_classes,
+                 in_channels=3,
+                 level2_depth=2,
+                 level3_depth=3,
+                 pretrained=None):
+        super().__init__()
+        self.encoder = ESPNetEncoder(num_classes, in_channels, level2_depth,
+                                     level3_depth)
+
+        self.level3_up = nn.Conv2DTranspose(num_classes,
+                                            num_classes,
+                                            2,
+                                            stride=2,
+                                            padding=0,
+                                            output_padding=0,
+                                            bias_attr=False)
+        self.br3 = layers.SyncBatchNorm(num_classes)
+        self.level2_proj = nn.Conv2D(in_channels + 128,
+                                     num_classes,
+                                     1,
+                                     bias_attr=False)
+        self.combine_l2_l3 = nn.Sequential(
+            BNPReLU(2 * num_classes),
+            DilatedResidualBlock(2 * num_classes, num_classes, residual=False),
+        )
+        self.level2_up = nn.Sequential(
+            nn.Conv2DTranspose(num_classes,
+                               num_classes,
+                               2,
+                               stride=2,
+                               padding=0,
+                               output_padding=0,
+                               bias_attr=False),
+            BNPReLU(num_classes),
+        )
+        self.out_proj = layers.ConvBNPReLU(16 + in_channels + num_classes,
+                                           num_classes,
+                                           3,
+                                           padding='same',
+                                           stride=1)
+        self.out_up = nn.Conv2DTranspose(num_classes,
+                                         num_classes,
+                                         2,
+                                         stride=2,
+                                         padding=0,
+                                         output_padding=0,
+                                         bias_attr=False)
+        self.pretrained = pretrained
+
+    def init_weight(self):
+        if self.pretrained is not None:
+            utils.load_entire_model(self, self.pretrained)
+
+    def forward(self, x):
+        p1, p2, p3 = self.encoder(x)
+        up_p3 = self.level3_up(p3)
+
+        combine = self.combine_l2_l3(paddle.concat([up_p3, p2], axis=1))
+        up_p2 = self.level2_up(combine)
+
+        combine = self.out_proj(paddle.concat([up_p2, p1], axis=1))
+        out = self.out_up(combine)
+        return [out]
+
+
+class BNPReLU(nn.Layer):
+    def __init__(self, channels):
+        super().__init__()
+        self.bn = layers.SyncBatchNorm(channels)
+        self.act = nn.PReLU(channels)
+
+    def forward(self, x):
+        x = self.bn(x)
+        x = self.act(x)
+        return x
+
+
+class DownSampler(nn.Layer):
+    """
+    Down sampler.
+    Args:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+    """
+    def __init__(self, in_channels, out_channels):
+        super().__init__()
+        branch_channels = out_channels // 5
+        remain_channels = out_channels - branch_channels * 4
+        self.conv1 = nn.Conv2D(in_channels,
+                               branch_channels,
+                               3,
+                               stride=2,
+                               padding=1,
+                               bias_attr=False)
+        self.d_conv1 = nn.Conv2D(branch_channels,
+                                 remain_channels,
+                                 3,
+                                 padding=1,
+                                 bias_attr=False)
+        self.d_conv2 = nn.Conv2D(branch_channels,
+                                 branch_channels,
+                                 3,
+                                 padding=2,
+                                 dilation=2,
+                                 bias_attr=False)
+        self.d_conv4 = nn.Conv2D(branch_channels,
+                                 branch_channels,
+                                 3,
+                                 padding=4,
+                                 dilation=4,
+                                 bias_attr=False)
+        self.d_conv8 = nn.Conv2D(branch_channels,
+                                 branch_channels,
+                                 3,
+                                 padding=8,
+                                 dilation=8,
+                                 bias_attr=False)
+        self.d_conv16 = nn.Conv2D(branch_channels,
+                                  branch_channels,
+                                  3,
+                                  padding=16,
+                                  dilation=16,
+                                  bias_attr=False)
+        self.bn = layers.SyncBatchNorm(out_channels)
+        self.act = nn.PReLU(out_channels)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        d1 = self.d_conv1(x)
+        d2 = self.d_conv2(x)
+        d4 = self.d_conv4(x)
+        d8 = self.d_conv8(x)
+        d16 = self.d_conv16(x)
+
+        feat1 = d2
+        feat2 = feat1 + d4
+        feat3 = feat2 + d8
+        feat4 = feat3 + d16
+
+        feat = paddle.concat([d1, feat1, feat2, feat3, feat4], axis=1)
+        out = self.bn(feat)
+        out = self.act(out)
+        return out
+
+
+class DilatedResidualBlock(nn.Layer):
+    '''
+    ESP block, principle: reduce -> split -> transform -> merge
+    Args:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        residual (bool, optional): Add a residual connection through identity operation. Default: True.
+    '''
+    def __init__(self, in_channels, out_channels, residual=True):
+        super().__init__()
+        branch_channels = out_channels // 5
+        remain_channels = out_channels - branch_channels * 4
+        self.conv1 = nn.Conv2D(in_channels, branch_channels, 1, bias_attr=False)
+        self.d_conv1 = nn.Conv2D(branch_channels,
+                                 remain_channels,
+                                 3,
+                                 padding=1,
+                                 bias_attr=False)
+        self.d_conv2 = nn.Conv2D(branch_channels,
+                                 branch_channels,
+                                 3,
+                                 padding=2,
+                                 dilation=2,
+                                 bias_attr=False)
+        self.d_conv4 = nn.Conv2D(branch_channels,
+                                 branch_channels,
+                                 3,
+                                 padding=4,
+                                 dilation=4,
+                                 bias_attr=False)
+        self.d_conv8 = nn.Conv2D(branch_channels,
+                                 branch_channels,
+                                 3,
+                                 padding=8,
+                                 dilation=8,
+                                 bias_attr=False)
+        self.d_conv16 = nn.Conv2D(branch_channels,
+                                  branch_channels,
+                                  3,
+                                  padding=16,
+                                  dilation=16,
+                                  bias_attr=False)
+
+        self.bn = BNPReLU(out_channels)
+        self.residual = residual
+
+    def forward(self, x):
+        x_proj = self.conv1(x)
+        d1 = self.d_conv1(x_proj)
+        d2 = self.d_conv2(x_proj)
+        d4 = self.d_conv4(x_proj)
+        d8 = self.d_conv8(x_proj)
+        d16 = self.d_conv16(x_proj)
+
+        feat1 = d2
+        feat2 = feat1 + d4
+        feat3 = feat2 + d8
+        feat4 = feat3 + d16
+
+        feat = paddle.concat([d1, feat1, feat2, feat3, feat4], axis=1)
+
+        if self.residual:
+            feat = feat + x
+        out = self.bn(feat)
+        return out
+
+
+class ESPNetEncoder(nn.Layer):
+    '''
+    The ESPNet-C implementation based on PaddlePaddle.
+    Args:
+        num_classes (int): The unique number of target classes.
+        in_channels (int, optional): Number of input channels. Default: 3.
+        level2_depth (int, optional): Depth of DilatedResidualBlock. Default: 5.
+        level3_depth (int, optional): Depth of DilatedResidualBlock. Default: 3.
+    '''
+    def __init__(self,
+                 num_classes,
+                 in_channels=3,
+                 level2_depth=5,
+                 level3_depth=3):
+        super().__init__()
+        self.level1 = layers.ConvBNPReLU(in_channels,
+                                         16,
+                                         3,
+                                         padding='same',
+                                         stride=2)
+        self.br1 = BNPReLU(in_channels + 16)
+        self.proj1 = layers.ConvBNPReLU(in_channels + 16, num_classes, 1)
+
+        self.level2_0 = DownSampler(in_channels + 16, 64)
+        self.level2 = nn.Sequential(
+            *[DilatedResidualBlock(64, 64) for i in range(level2_depth)])
+        self.br2 = BNPReLU(in_channels + 128)
+        self.proj2 = layers.ConvBNPReLU(in_channels + 128, num_classes, 1)
+
+        self.level3_0 = DownSampler(in_channels + 128, 128)
+        self.level3 = nn.Sequential(
+            *[DilatedResidualBlock(128, 128) for i in range(level3_depth)])
+        self.br3 = BNPReLU(256)
+        self.proj3 = layers.ConvBNPReLU(256, num_classes, 1)
+
+    def forward(self, x):
+        f1 = self.level1(x)
+        down2 = F.adaptive_avg_pool2d(x, output_size=f1.shape[2:])
+        feat1 = paddle.concat([f1, down2], axis=1)
+        feat1 = self.br1(feat1)
+        p1 = self.proj1(feat1)
+
+        f2_res = self.level2_0(feat1)
+        f2 = self.level2(f2_res)
+        down4 = F.adaptive_avg_pool2d(x, output_size=f2.shape[2:])
+        feat2 = paddle.concat([f2, f2_res, down4], axis=1)
+        feat2 = self.br2(feat2)
+        p2 = self.proj2(feat2)
+
+        f3_res = self.level3_0(feat2)
+        f3 = self.level3(f3_res)
+        feat3 = paddle.concat([f3, f3_res], axis=1)
+        feat3 = self.br3(feat3)
+        p3 = self.proj3(feat3)
+
+        return p1, p2, p3
--- a/paddlers/models/ppseg/models/fast_scnn.py
+++ b/paddlers/models/ppseg/models/fast_scnn.py
@ -0,0 +1,316 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+import paddle
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.models import layers
+from paddlers.models.ppseg.utils import utils
+
+__all__ = ['FastSCNN']
+
+
+@manager.MODELS.add_component
+class FastSCNN(nn.Layer):
+    """
+    The FastSCNN implementation based on PaddlePaddle.
+    As mentioned in the original paper, FastSCNN is a real-time segmentation algorithm (123.5fps)
+    even for high resolution images (1024x2048).
+    The original article refers to
+    Poudel, Rudra PK, et al. "Fast-scnn: Fast semantic segmentation network"
+    (https://arxiv.org/pdf/1902.04502.pdf).
+    Args:
+        num_classes (int): The unique number of target classes.
+        enable_auxiliary_loss (bool, optional): Whether to add an auxiliary head on the output of the
+            LearningToDownsample module. Default: True.
+        align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the output
+            size of feature is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 enable_auxiliary_loss=True,
+                 align_corners=False,
+                 pretrained=None):
+        super().__init__()
+
+        self.enable_auxiliary_loss = enable_auxiliary_loss
+        self.align_corners = align_corners
+        self.pretrained = pretrained
+
+        self.learning_to_downsample = LearningToDownsample(32, 48, 64)
+        # NOTE(review): align_corners is fixed to True for the global feature
+        # extractor and does not follow the constructor argument - confirm
+        # this is intended.
+        self.global_feature_extractor = GlobalFeatureExtractor(
+            in_channels=64,
+            block_channels=[64, 96, 128],
+            out_channels=128,
+            expansion=6,
+            num_blocks=[3, 3, 3],
+            align_corners=True)
+        self.feature_fusion = FeatureFusionModule(64, 128, 128, align_corners)
+        self.classifier = Classifier(128, num_classes)
+
+        # The auxiliary head only exists when it has been requested.
+        if enable_auxiliary_loss:
+            self.auxlayer = layers.AuxLayer(64, 32, num_classes)
+
+        self.init_weight()
+
+    def forward(self, x):
+        """Return ``[logit]`` or ``[logit, auxiliary_logit]``, each resized to the input's H x W."""
+        input_size = paddle.shape(x)[2:]
+
+        higher_res_features = self.learning_to_downsample(x)
+        global_features = self.global_feature_extractor(higher_res_features)
+        fused = self.feature_fusion(higher_res_features, global_features)
+
+        logit = F.interpolate(
+            self.classifier(fused),
+            input_size,
+            mode='bilinear',
+            align_corners=self.align_corners)
+        if not self.enable_auxiliary_loss:
+            return [logit]
+
+        auxiliary_logit = F.interpolate(
+            self.auxlayer(higher_res_features),
+            input_size,
+            mode='bilinear',
+            align_corners=self.align_corners)
+        return [logit, auxiliary_logit]
+
+    def init_weight(self):
+        """Load the pretrained weights when a path or url was given."""
+        if self.pretrained is None:
+            return
+        utils.load_entire_model(self, self.pretrained)
+
+
+class LearningToDownsample(nn.Layer):
+    """
+    Learning to downsample module.
+    This module consists of three downsampling blocks (one conv and two separable conv),
+    each applied with stride 2.
+    Args:
+        dw_channels1 (int, optional): The input channels of the first sep conv. Default: 32.
+        dw_channels2 (int, optional): The input channels of the second sep conv. Default: 48.
+        out_channels (int, optional): The output channels of LearningToDownsample module. Default: 64.
+        in_channels (int, optional): The channels of the input image fed to the first conv. Default: 3.
+    """
+
+    def __init__(self,
+                 dw_channels1=32,
+                 dw_channels2=48,
+                 out_channels=64,
+                 in_channels=3):
+        super(LearningToDownsample, self).__init__()
+
+        # in_channels used to be fixed to 3; it is now a trailing keyword so
+        # that existing positional callers keep working unchanged.
+        self.conv_bn_relu = layers.ConvBNReLU(
+            in_channels=in_channels,
+            out_channels=dw_channels1,
+            kernel_size=3,
+            stride=2)
+        self.dsconv_bn_relu1 = layers.SeparableConvBNReLU(
+            in_channels=dw_channels1,
+            out_channels=dw_channels2,
+            kernel_size=3,
+            stride=2,
+            padding=1)
+        self.dsconv_bn_relu2 = layers.SeparableConvBNReLU(
+            in_channels=dw_channels2,
+            out_channels=out_channels,
+            kernel_size=3,
+            stride=2,
+            padding=1)
+
+    def forward(self, x):
+        """Apply the conv and the two separable convs in sequence."""
+        x = self.conv_bn_relu(x)
+        x = self.dsconv_bn_relu1(x)
+        x = self.dsconv_bn_relu2(x)
+        return x
+
+
+class GlobalFeatureExtractor(nn.Layer):
+    """
+    Global feature extractor module.
+    This module consists of three stages of InvertedBottleneck blocks (like inverted residual introduced by
+    MobileNetV2) followed by a PPModule (introduced by PSPNet).
+    Args:
+        in_channels (int): The number of input channels to the module.
+        block_channels (tuple): The output channels of each of the three bottleneck stages.
+        out_channels (int): The number of output channels of the module.
+        expansion (int): The expansion factor in bottleneck.
+        num_blocks (tuple): The number of blocks in each of the three bottleneck stages.
+        align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
+            is even, e.g. 1024x512, otherwise it is True, e.g. 769x769.
+    """
+
+    def __init__(self, in_channels, block_channels, out_channels, expansion,
+                 num_blocks, align_corners):
+        super(GlobalFeatureExtractor, self).__init__()
+
+        stage1_out, stage2_out, stage3_out = (block_channels[0],
+                                              block_channels[1],
+                                              block_channels[2])
+
+        # The first two stages use stride 2, the last one stride 1.
+        self.bottleneck1 = self._make_layer(
+            InvertedBottleneck, in_channels, stage1_out, num_blocks[0],
+            expansion, 2)
+        self.bottleneck2 = self._make_layer(
+            InvertedBottleneck, stage1_out, stage2_out, num_blocks[1],
+            expansion, 2)
+        self.bottleneck3 = self._make_layer(
+            InvertedBottleneck, stage2_out, stage3_out, num_blocks[2],
+            expansion, 1)
+
+        self.ppm = layers.PPModule(
+            stage3_out,
+            out_channels,
+            bin_sizes=(1, 2, 3, 6),
+            dim_reduction=True,
+            align_corners=align_corners)
+
+    def _make_layer(self,
+                    block,
+                    in_channels,
+                    out_channels,
+                    blocks,
+                    expansion=6,
+                    stride=1):
+        """Stack ``blocks`` instances of ``block``; only the first one changes channels and uses ``stride``."""
+        # Named "stage" so that the imported `layers` module is not shadowed.
+        stage = [block(in_channels, out_channels, expansion, stride)]
+        stage.extend(
+            block(out_channels, out_channels, expansion, 1)
+            for _ in range(1, blocks))
+        return nn.Sequential(*stage)
+
+    def forward(self, x):
+        """Run the three bottleneck stages, then the pyramid pooling module."""
+        for stage in (self.bottleneck1, self.bottleneck2, self.bottleneck3):
+            x = stage(x)
+        return self.ppm(x)
+
+
+class InvertedBottleneck(nn.Layer):
+    """
+    Single Inverted bottleneck implementation.
+    Args:
+        in_channels (int): The number of input channels to bottleneck block.
+        out_channels (int): The number of output channels of bottleneck block.
+        expansion (int, optional): The expansion factor in bottleneck. Default: 6.
+        stride (int, optional): The stride used in depth-wise conv. Default: 2.
+    """
+
+    def __init__(self, in_channels, out_channels, expansion=6, stride=2):
+        super().__init__()
+
+        # The residual shortcut is only valid when the block keeps both the
+        # spatial size (stride 1) and the channel count.
+        self.use_shortcut = stride == 1 and in_channels == out_channels
+
+        expand_channels = in_channels * expansion
+
+        # Point-wise expansion.
+        pointwise_expand = layers.ConvBNReLU(
+            in_channels=in_channels,
+            out_channels=expand_channels,
+            kernel_size=1,
+            bias_attr=False)
+        # Depth-wise conv: one group per channel.
+        depthwise = layers.ConvBNReLU(
+            in_channels=expand_channels,
+            out_channels=expand_channels,
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            groups=expand_channels,
+            bias_attr=False)
+        # Point-wise linear projection (no activation).
+        pointwise_linear = layers.ConvBN(
+            in_channels=expand_channels,
+            out_channels=out_channels,
+            kernel_size=1,
+            bias_attr=False)
+
+        self.block = nn.Sequential(pointwise_expand, depthwise,
+                                   pointwise_linear)
+
+    def forward(self, x):
+        """Apply the block, adding the input back when the shortcut is enabled."""
+        out = self.block(x)
+        return x + out if self.use_shortcut else out
+
+
+class FeatureFusionModule(nn.Layer):
+    """
+    Feature Fusion Module Implementation.
+    This module fuses high-resolution feature and low-resolution feature: the low-resolution feature is
+    resized to the high-resolution feature's spatial size, both are projected to out_channels, summed,
+    and passed through ReLU.
+    Args:
+        high_in_channels (int): The channels of high-resolution feature (output of LearningToDownsample).
+        low_in_channels (int): The channels of low-resolution feature (output of GlobalFeatureExtractor).
+        out_channels (int): The output channels of this module.
+        align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
+            is even, e.g. 1024x512, otherwise it is True, e.g. 769x769.
+    """
+
+    def __init__(self, high_in_channels, low_in_channels, out_channels,
+                 align_corners):
+        super().__init__()
+        self.align_corners = align_corners
+
+        # Only depth-wise conv.
+        # NOTE(review): groups is fixed to 128, which is depth-wise only when
+        # low_in_channels == 128 - confirm before using other channel counts.
+        self.dwconv = layers.ConvBNReLU(
+            in_channels=low_in_channels,
+            out_channels=out_channels,
+            kernel_size=3,
+            padding=1,
+            groups=128,
+            bias_attr=False)
+
+        self.conv_low_res = layers.ConvBN(out_channels, out_channels, 1)
+        self.conv_high_res = layers.ConvBN(high_in_channels, out_channels, 1)
+
+    def forward(self, high_res_input, low_res_input):
+        """Fuse the two features and return the ReLU of their sum."""
+        target_size = paddle.shape(high_res_input)[2:]
+        low_branch = F.interpolate(
+            low_res_input,
+            target_size,
+            mode='bilinear',
+            align_corners=self.align_corners)
+        low_branch = self.conv_low_res(self.dwconv(low_branch))
+        high_branch = self.conv_high_res(high_res_input)
+
+        return F.relu(high_branch + low_branch)
+
+
+class Classifier(nn.Layer):
+    """
+    The Classifier module implementation.
+    This module consists of two separable convs, a dropout and one 1x1 conv that produces the class logits.
+    Args:
+        input_channels (int): The input channels to this module.
+        num_classes (int): The unique number of target classes.
+    """
+
+    def __init__(self, input_channels, num_classes):
+        super().__init__()
+
+        # Both separable convs keep the channel count unchanged.
+        self.dsconv1 = layers.SeparableConvBNReLU(
+            in_channels=input_channels,
+            out_channels=input_channels,
+            kernel_size=3,
+            padding=1)
+        self.dsconv2 = layers.SeparableConvBNReLU(
+            in_channels=input_channels,
+            out_channels=input_channels,
+            kernel_size=3,
+            padding=1)
+
+        self.conv = nn.Conv2D(
+            in_channels=input_channels,
+            out_channels=num_classes,
+            kernel_size=1)
+
+        self.dropout = nn.Dropout(p=0.1)  # dropout_prob
+
+    def forward(self, x):
+        """Apply dsconv1, dsconv2, dropout and the final conv, in that order."""
+        for stage in (self.dsconv1, self.dsconv2, self.dropout, self.conv):
+            x = stage(x)
+        return x
--- a/paddlers/models/ppseg/models/fastfcn.py
+++ b/paddlers/models/ppseg/models/fastfcn.py
@ -0,0 +1,240 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.models import layers
+from paddlers.models.ppseg.utils import utils
+
+
+@manager.MODELS.add_component
+class FastFCN(nn.Layer):
+    """
+    The FastFCN implementation based on PaddlePaddle.
+
+    The original article refers to
+    Huikai Wu, Junge Zhang, Kaiqi Huang. "FastFCN: Rethinking Dilated Convolution in the Backbone for Semantic Segmentation".
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone (Paddle.nn.Layer): A backbone network. Its `feat_channels` attribute is read to size the heads.
+        num_codes (int): The number of encoded words. Default: 32.
+        mid_channels (int): The channels of middle layers. Default: 512.
+        use_jpu (bool): Whether use jpu module. Default: True.
+        aux_loss (bool): Whether use auxiliary head loss. Default: True.
+        use_se_loss (bool): Whether use semantic encoding loss. Default: True.
+        add_lateral (bool): Whether use lateral convolution layers. Default: False.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+    """
+    def __init__(self,
+                 num_classes,
+                 backbone,
+                 num_codes=32,
+                 mid_channels=512,
+                 use_jpu=True,
+                 aux_loss=True,
+                 use_se_loss=True,
+                 add_lateral=False,
+                 pretrained=None):
+        super().__init__()
+        self.add_lateral = add_lateral
+        self.num_codes = num_codes
+        self.backbone = backbone
+        self.use_jpu = use_jpu
+        # Work on a copy: the last entry is overwritten below when JPU is
+        # enabled, and the backbone's own feat_channels must stay untouched.
+        in_channels = list(self.backbone.feat_channels)
+
+        if use_jpu:
+            self.jpu_layer = layers.JPU(in_channels, mid_channels)
+            # With JPU the bottleneck receives mid_channels * 4 channels.
+            in_channels[-1] = mid_channels * 4
+            self.bottleneck = layers.ConvBNReLU(
+                in_channels[-1],
+                mid_channels,
+                1,
+                padding=0,
+                bias_attr=False,
+            )
+        else:
+            self.bottleneck = layers.ConvBNReLU(
+                in_channels[-1],
+                mid_channels,
+                3,
+                padding=1,
+                bias_attr=False,
+            )
+        if self.add_lateral:
+            self.lateral_convs = nn.LayerList([
+                layers.ConvBNReLU(in_channels[0],
+                                  mid_channels,
+                                  1,
+                                  bias_attr=False),
+                layers.ConvBNReLU(in_channels[1],
+                                  mid_channels,
+                                  1,
+                                  bias_attr=False),
+            ])
+
+            # Fuses the bottleneck output with the two lateral features.
+            self.fusion = layers.ConvBNReLU(
+                3 * mid_channels,
+                mid_channels,
+                3,
+                padding=1,
+                bias_attr=False,
+            )
+
+        self.enc_module = EncModule(mid_channels, num_codes)
+        self.cls_seg = nn.Conv2D(mid_channels, num_classes, 1)
+
+        self.aux_loss = aux_loss
+        if self.aux_loss:
+            self.fcn_head = layers.AuxLayer(in_channels[-2], mid_channels,
+                                            num_classes)
+
+        self.use_se_loss = use_se_loss
+        if use_se_loss:
+            self.se_layer = nn.Linear(mid_channels, num_classes)
+
+        self.pretrained = pretrained
+        self.init_weight()
+
+    def init_weight(self):
+        """Load the pretrained weights when a path or url was given."""
+        if self.pretrained is not None:
+            utils.load_entire_model(self, self.pretrained)
+
+    def forward(self, inputs):
+        """Compute the segmentation logits.
+
+        Returns:
+            list: ``[out]`` in evaluation mode. In training mode the auxiliary
+                head output (when ``aux_loss`` is enabled) and the semantic
+                encoding output (when ``use_se_loss`` is enabled) are
+                appended, in that order.
+        """
+        imsize = paddle.shape(inputs)[2:]
+        feats = self.backbone(inputs)
+        if self.use_jpu:
+            feats = self.jpu_layer(*feats)
+
+        # Feature consumed by the auxiliary head during training.
+        fcn_feat = feats[2]
+
+        feat = self.bottleneck(feats[-1])
+        if self.add_lateral:
+            laterals = []
+            for i, lateral_conv in enumerate(self.lateral_convs):
+                laterals.append(
+                    F.interpolate(lateral_conv(feats[i]),
+                                  size=paddle.shape(feat)[2:],
+                                  mode='bilinear',
+                                  align_corners=False))
+            feat = self.fusion(paddle.concat([feat, *laterals], 1))
+        encode_feat, feat = self.enc_module(feat)
+        out = self.cls_seg(feat)
+        out = F.interpolate(out,
+                            size=imsize,
+                            mode='bilinear',
+                            align_corners=False)
+        output = [out]
+
+        if self.training:
+            # fcn_head only exists when aux_loss was requested; calling it
+            # unconditionally raised AttributeError for aux_loss=False.
+            if self.aux_loss:
+                fcn_out = self.fcn_head(fcn_feat)
+                fcn_out = F.interpolate(fcn_out,
+                                        size=imsize,
+                                        mode='bilinear',
+                                        align_corners=False)
+                output.append(fcn_out)
+            if self.use_se_loss:
+                se_out = self.se_layer(encode_feat)
+                output.append(se_out)
+        return output
+
+
+class Encoding(nn.Layer):
+    """
+    Encoding layer: softly assigns every spatial position of a feature map to a set of learnable codewords
+    and aggregates the residuals per codeword.
+
+    Args:
+        channels (int): The channels of the input feature; also the length of each codeword.
+        num_codes (int): The number of codewords.
+    """
+    def __init__(self, channels, num_codes):
+        super().__init__()
+        self.channels, self.num_codes = channels, num_codes
+
+        # Codewords are drawn uniformly from [-std, std].
+        std = 1 / ((channels * num_codes)**0.5)
+        self.codewords = self.create_parameter(
+            shape=(num_codes, channels),
+            default_initializer=nn.initializer.Uniform(-std, std),
+        )
+        # One scale factor per codeword, drawn uniformly from [-1, 0].
+        self.scale = self.create_parameter(
+            shape=(num_codes, ),
+            default_initializer=nn.initializer.Uniform(-1, 0),
+        )
+
+    def scaled_l2(self, x, codewords, scale):
+        """Return the per-codeword scaled squared distance of every position.
+
+        As called from forward(), x is [N, H * W, C]; the result is
+        [N, H * W, num_codes].
+        """
+        num_codes, channels = paddle.shape(codewords)
+        reshaped_scale = scale.reshape([1, 1, num_codes])
+        # [N, H * W, num_codes, C]: x repeated once per codeword
+        expanded_x = paddle.tile(x.unsqueeze(2), [1, 1, num_codes, 1])
+        reshaped_codewords = codewords.reshape([1, 1, num_codes, channels])
+
+        # Squared difference summed over the channel axis, times the scale.
+        scaled_l2_norm = reshaped_scale * (
+            expanded_x - reshaped_codewords).pow(2).sum(axis=3)
+        return scaled_l2_norm
+
+    def aggregate(self, assignment_weights, x, codewords):
+        """Return the residuals to each codeword, weighted and summed over positions.
+
+        As called from forward(), assignment_weights is [N, H * W, num_codes]
+        and x is [N, H * W, C]; the result is [N, num_codes, C].
+        """
+        num_codes, channels = paddle.shape(codewords)
+        reshaped_codewords = codewords.reshape([1, 1, num_codes, channels])
+        # [N, H * W, num_codes, C]: x repeated once per codeword
+        expanded_x = paddle.tile(
+            x.unsqueeze(2),
+            [1, 1, num_codes, 1],
+        )
+        # Weighted residuals, summed over the position axis (axis 1).
+        encoded_feat = (assignment_weights.unsqueeze(3) *
+                        (expanded_x - reshaped_codewords)).sum(axis=1)
+        return encoded_feat
+
+    def forward(self, x):
+        """Encode a 4-D feature whose second dimension equals ``self.channels``.
+
+        Returns:
+            Tensor: the encoded feature reshaped to [N, num_codes, -1].
+        """
+        x_dims = x.ndim
+        assert x_dims == 4, "The dimension of input tensor must equal 4, but got {}.".format(
+            x_dims)
+        assert paddle.shape(
+            x
+        )[1] == self.channels, "Encoding channels error, excepted {} but got {}.".format(
+            self.channels,
+            paddle.shape(x)[1])
+        batch_size = paddle.shape(x)[0]
+        # Flatten the spatial dims and move channels last: [N, H * W, C]
+        x = x.reshape([batch_size, self.channels, -1]).transpose([0, 2, 1])
+        # Softmax over the codeword axis gives the soft assignment weights.
+        assignment_weights = F.softmax(self.scaled_l2(x, self.codewords,
+                                                      self.scale),
+                                       axis=2)
+
+        encoded_feat = self.aggregate(assignment_weights, x, self.codewords)
+        encoded_feat = encoded_feat.reshape([batch_size, self.num_codes, -1])
+        return encoded_feat
+
+
+class EncModule(nn.Layer):
+    """
+    Encoding module: derives a per-channel gate from the encoded feature and uses it to re-weight the input.
+
+    Args:
+        in_channels (int): The channels of the input feature.
+        num_codes (int): The number of codewords used by the Encoding layer.
+    """
+    def __init__(self, in_channels, num_codes):
+        super().__init__()
+        self.encoding_project = layers.ConvBNReLU(in_channels, in_channels, 1)
+
+        # Encoding followed by batch norm over the codeword axis and ReLU.
+        encoding_stages = (
+            Encoding(channels=in_channels, num_codes=num_codes),
+            nn.BatchNorm1D(num_codes),
+            nn.ReLU(),
+        )
+        self.encoding = nn.Sequential(*encoding_stages)
+
+        # Produces the per-channel gate in (0, 1).
+        gate_stages = (
+            nn.Linear(in_channels, in_channels),
+            nn.Sigmoid(),
+        )
+        self.fc = nn.Sequential(*gate_stages)
+
+    def forward(self, x):
+        """Return ``(encoding_feat, output)``.
+
+        ``encoding_feat`` is the encoded feature averaged over the codeword
+        axis; ``output`` is ``relu(x + x * gate)`` with the gate broadcast
+        over the spatial dims.
+        """
+        projected = self.encoding_project(x)
+        encoded = self.encoding(projected)
+        encoding_feat = encoded.mean(axis=1)
+
+        batch_size, channels, _, _ = paddle.shape(x)
+        gate = self.fc(encoding_feat).reshape([batch_size, channels, 1, 1])
+
+        return encoding_feat, F.relu(x + x * gate)
--- a/paddlers/models/ppseg/models/fcn.py
+++ b/paddlers/models/ppseg/models/fcn.py
@ -0,0 +1,145 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+import paddle
+from paddlers.models.ppseg import utils
+from paddlers.models.ppseg.cvlibs import manager, param_init
+from paddlers.models.ppseg.models import layers
+
+
+@manager.MODELS.add_component
+class FCN(nn.Layer):
+    """
+    A simple implementation for FCN based on PaddlePaddle.
+
+    The original article refers to
+    Evan Shelhamer, et, al. "Fully Convolutional Networks for Semantic Segmentation"
+    (https://arxiv.org/abs/1411.4038).
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone (paddle.nn.Layer): Backbone networks.
+        backbone_indices (tuple, optional): The values in the tuple indicate the indices of output of backbone.
+            Default: (-1, ).
+        channels (int, optional): The channels between conv layer and the last layer of FCNHead.
+            If None, it will be the number of channels of input features. Default: None.
+        align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
+            is even, e.g. 1024x512, otherwise it is True, e.g. 769x769.  Default: False.
+        pretrained (str, optional): The path or url of pretrained model. Default: None
+        bias (bool, optional): Passed as `bias_attr` to the conv layers of FCNHead. Default: True.
+        data_format (str, optional): The data layout; only "NCHW" is supported. Default: "NCHW".
+
+    Raises:
+        ValueError: If `data_format` is not "NCHW".
+    """
+
+    def __init__(self,
+                 num_classes,
+                 backbone,
+                 backbone_indices=(-1, ),
+                 channels=None,
+                 align_corners=False,
+                 pretrained=None,
+                 bias=True,
+                 data_format="NCHW"):
+        super(FCN, self).__init__()
+
+        if data_format != 'NCHW':
+            # Raising a bare string is itself a TypeError in Python 3; raise
+            # a real exception so that the message reaches the caller.
+            raise ValueError('fcn only support NCHW data format')
+        self.backbone = backbone
+        backbone_channels = [
+            backbone.feat_channels[i] for i in backbone_indices
+        ]
+
+        self.head = FCNHead(
+            num_classes,
+            backbone_indices,
+            backbone_channels,
+            channels,
+            bias=bias)
+
+        self.align_corners = align_corners
+        self.pretrained = pretrained
+        self.data_format = data_format
+        self.init_weight()
+
+    def forward(self, x):
+        """Return the head's logits, each resized to the input's H x W."""
+        feat_list = self.backbone(x)
+        logit_list = self.head(feat_list)
+        return [
+            F.interpolate(
+                logit,
+                paddle.shape(x)[2:],
+                mode='bilinear',
+                align_corners=self.align_corners) for logit in logit_list
+        ]
+
+    def init_weight(self):
+        """Load the pretrained weights when a path or url was given."""
+        if self.pretrained is not None:
+            utils.load_entire_model(self, self.pretrained)
+
+
+class FCNHead(nn.Layer):
+    """
+    A simple implementation for FCNHead based on PaddlePaddle
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone_indices (tuple, optional): The values in the tuple indicate the indices of output of backbone.
+            Only the first index is used. Default: (-1, ).
+        backbone_channels (tuple, optional): The channels of the selected backbone outputs.
+            Only the first entry is used. Default: (270, ).
+        channels (int, optional): The channels between conv layer and the last layer of FCNHead.
+            If None, it will be the number of channels of input features. Default: None.
+        bias (bool, optional): Passed as `bias_attr` to both conv layers. Default: True.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 backbone_indices=(-1, ),
+                 backbone_channels=(270, ),
+                 channels=None,
+                 bias=True):
+        super(FCNHead, self).__init__()
+
+        self.num_classes = num_classes
+        self.backbone_indices = backbone_indices
+
+        feature_channels = backbone_channels[0]
+        # Fall back to the input width when no intermediate width is given.
+        channels = feature_channels if channels is None else channels
+
+        self.conv_1 = layers.ConvBNReLU(
+            in_channels=feature_channels,
+            out_channels=channels,
+            kernel_size=1,
+            stride=1,
+            bias_attr=bias)
+        self.cls = nn.Conv2D(
+            in_channels=channels,
+            out_channels=self.num_classes,
+            kernel_size=1,
+            stride=1,
+            bias_attr=bias)
+        self.init_weight()
+
+    def forward(self, feat_list):
+        """Return a one-element list holding the logits of the selected backbone feature."""
+        selected = feat_list[self.backbone_indices[0]]
+        logit = self.cls(self.conv_1(selected))
+        return [logit]
+
+    def init_weight(self):
+        """Initialize conv weights from a normal distribution and batch-norm weight/bias to 1/0."""
+        norm_types = (nn.BatchNorm, nn.SyncBatchNorm)
+        for sublayer in self.sublayers():
+            if isinstance(sublayer, nn.Conv2D):
+                param_init.normal_init(sublayer.weight, std=0.001)
+                continue
+            if isinstance(sublayer, norm_types):
+                param_init.constant_init(sublayer.weight, value=1.0)
+                param_init.constant_init(sublayer.bias, value=0.0)
--- a/paddlers/models/ppseg/models/gcnet.py
+++ b/paddlers/models/ppseg/models/gcnet.py
@ -0,0 +1,222 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.models import layers
+from paddlers.models.ppseg.utils import utils
+
+
+@manager.MODELS.add_component
+class GCNet(nn.Layer):
+    """
+    The GCNet implementation based on PaddlePaddle.
+
+    The original article refers to
+    Cao, Yue, et al. "GCnet: Non-local networks meet squeeze-excitation networks and beyond"
+    (https://arxiv.org/pdf/1904.11492.pdf).
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone (Paddle.nn.Layer): Backbone network, currently support Resnet50/101.
+        backbone_indices (tuple, optional): Two values in the tuple indicate the indices of output of backbone.
+            Default: (2, 3).
+        gc_channels (int, optional): The input channels to Global Context Block. Default: 512.
+        ratio (float, optional): It indicates the ratio of attention channels and gc_channels. Default: 0.25.
+        enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True.
+        align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
+            e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 backbone,
+                 backbone_indices=(2, 3),
+                 gc_channels=512,
+                 ratio=0.25,
+                 enable_auxiliary_loss=True,
+                 align_corners=False,
+                 pretrained=None):
+        super().__init__()
+
+        self.align_corners = align_corners
+        self.pretrained = pretrained
+        self.backbone = backbone
+
+        # Channel counts of the backbone outputs selected by backbone_indices.
+        backbone_channels = []
+        for index in backbone_indices:
+            backbone_channels.append(backbone.feat_channels[index])
+
+        self.head = GCNetHead(num_classes, backbone_indices, backbone_channels,
+                              gc_channels, ratio, enable_auxiliary_loss)
+        self.init_weight()
+
+    def forward(self, x):
+        """Return the head's logits, each resized to the input's H x W."""
+        resized_logits = []
+        for logit in self.head(self.backbone(x)):
+            resized_logits.append(
+                F.interpolate(
+                    logit,
+                    paddle.shape(x)[2:],
+                    mode='bilinear',
+                    align_corners=self.align_corners))
+        return resized_logits
+
+    def init_weight(self):
+        """Load the pretrained weights when a path or url was given."""
+        if self.pretrained is None:
+            return
+        utils.load_entire_model(self, self.pretrained)
+
+
+class GCNetHead(nn.Layer):
+    """
+    The GCNetHead implementation.
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone_indices (tuple): Two values in the tuple indicate the indices of output of backbone.
+            The first index will be taken as a deep-supervision feature in auxiliary layer;
+            the second one will be taken as input of GlobalContextBlock.
+        backbone_channels (tuple): The same length with "backbone_indices". It indicates the channels of corresponding index.
+        gc_channels (int): The input channels to Global Context Block.
+        ratio (float): It indicates the ratio of attention channels and gc_channels.
+        enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 backbone_indices,
+                 backbone_channels,
+                 gc_channels,
+                 ratio,
+                 enable_auxiliary_loss=True):
+        super().__init__()
+
+        self.backbone_indices = backbone_indices
+        self.enable_auxiliary_loss = enable_auxiliary_loss
+
+        aux_channels, in_channels = backbone_channels[0], backbone_channels[1]
+
+        self.conv_bn_relu1 = layers.ConvBNReLU(
+            in_channels=in_channels,
+            out_channels=gc_channels,
+            kernel_size=3,
+            padding=1)
+
+        self.gc_block = GlobalContextBlock(
+            gc_channels=gc_channels, in_channels=gc_channels, ratio=ratio)
+
+        self.conv_bn_relu2 = layers.ConvBNReLU(
+            in_channels=gc_channels,
+            out_channels=gc_channels,
+            kernel_size=3,
+            padding=1)
+
+        # Consumes the backbone feature concatenated with the context branch.
+        self.conv_bn_relu3 = layers.ConvBNReLU(
+            in_channels=in_channels + gc_channels,
+            out_channels=gc_channels,
+            kernel_size=3,
+            padding=1)
+
+        self.dropout = nn.Dropout(p=0.1)
+
+        self.conv = nn.Conv2D(
+            in_channels=gc_channels, out_channels=num_classes, kernel_size=1)
+
+        # The auxiliary head only exists when it has been requested.
+        if enable_auxiliary_loss:
+            self.auxlayer = layers.AuxLayer(
+                in_channels=aux_channels,
+                inter_channels=aux_channels // 4,
+                out_channels=num_classes)
+
+    def forward(self, feat_list):
+        """Return ``[logit]`` or ``[logit, auxiliary_logit]`` computed from the selected backbone features."""
+        aux_index, main_index = (self.backbone_indices[0],
+                                 self.backbone_indices[1])
+        x = feat_list[main_index]
+
+        # Context branch: conv -> global context block -> conv.
+        context = self.conv_bn_relu2(self.gc_block(self.conv_bn_relu1(x)))
+
+        # Join the branch with the raw backbone feature, then classify.
+        fused = self.conv_bn_relu3(paddle.concat([x, context], axis=1))
+        logit = self.conv(self.dropout(fused))
+
+        if not self.enable_auxiliary_loss:
+            return [logit]
+
+        auxiliary_logit = self.auxlayer(feat_list[aux_index])
+        return [logit, auxiliary_logit]
+
+
+class GlobalContextBlock(nn.Layer):
+    """
+    Global Context Block implementation.
+
+    Args:
+        gc_channels (int): The channel count used to reshape the input feature and the context vector;
+            the input feature must have this many channels.
+        in_channels (int): The input channels of Global Context Block.
+        ratio (float): The ratio of the intermediate channels of the channel-add transform to in_channels.
+    """
+
+    def __init__(self, gc_channels, in_channels, ratio):
+        super().__init__()
+        self.gc_channels = gc_channels
+
+        # Produces one attention logit per spatial position.
+        self.conv_mask = nn.Conv2D(
+            in_channels=in_channels, out_channels=1, kernel_size=1)
+
+        # Normalizes the logits over the flattened spatial axis.
+        self.softmax = nn.Softmax(axis=2)
+
+        inter_channels = int(in_channels * ratio)
+        # Bottleneck transform applied to the pooled context vector.
+        self.channel_add_conv = nn.Sequential(
+            nn.Conv2D(
+                in_channels=in_channels,
+                out_channels=inter_channels,
+                kernel_size=1),
+            nn.LayerNorm(normalized_shape=[inter_channels, 1, 1]), nn.ReLU(),
+            nn.Conv2D(
+                in_channels=inter_channels,
+                out_channels=in_channels,
+                kernel_size=1))
+
+    def global_context_block(self, x):
+        """Pool ``x`` over its spatial positions with softmax attention weights.
+
+        Returns:
+            Tensor: the context vector, shaped [N, C, 1, 1].
+        """
+        # [N, C, H * W]
+        input_x = paddle.reshape(x, shape=[0, self.gc_channels, -1])
+        # [N, 1, C, H * W]
+        input_x = paddle.unsqueeze(input_x, axis=1)
+        # [N, 1, H, W]
+        context_mask = self.conv_mask(x)
+        # [N, 1, H * W]
+        context_mask = paddle.reshape(context_mask, shape=[0, 1, -1])
+        context_mask = self.softmax(context_mask)
+        # [N, 1, H * W, 1]
+        context_mask = paddle.unsqueeze(context_mask, axis=-1)
+        # [N, 1, C, 1]
+        context = paddle.matmul(input_x, context_mask)
+        # [N, C, 1, 1]
+        context = paddle.reshape(context, shape=[0, self.gc_channels, 1, 1])
+
+        return context
+
+    def forward(self, x):
+        """Add the transformed global context to every position of ``x``."""
+        context = self.global_context_block(x)
+        channel_add_term = self.channel_add_conv(context)
+        out = x + channel_add_term
+        return out
--- a/paddlers/models/ppseg/models/ginet.py
+++ b/paddlers/models/ppseg/models/ginet.py
@ -0,0 +1,291 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+from paddle.nn import functional as F
+
+from paddlers.models.ppseg.utils import utils
+from paddlers.models.ppseg.models import layers
+from paddlers.models.ppseg.cvlibs import manager
+
+
+@manager.MODELS.add_component
+class GINet(nn.Layer):
+    """
+    The GINet implementation based on PaddlePaddle.
+    The original article refers to
+    Wu, Tianyi, Yu Lu, Yu Zhu, Chuang Zhang, Ming Wu, Zhanyu Ma, and Guodong Guo. "GINet: Graph interaction network for scene parsing." In European Conference on Computer Vision, pp. 34-51. Springer, Cham, 2020.
+    (https://arxiv.org/pdf/2009.06160).
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone (Paddle.nn.Layer): Backbone network.
+        backbone_indices (tuple, optional): Values in the tuple indicate the indices of output of backbone.
+        enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss.
+            If true, auxiliary loss will be added after LearningToDownsample module. Default: False.
+        align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
+            is even, e.g. 1024x512, otherwise it is True, e.g. 769x769.. Default: False.
+        jpu (bool, optional)): whether to use jpu unit in the base forward. Default:True.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 backbone,
+                 backbone_indices=[0, 1, 2, 3],
+                 enable_auxiliary_loss=True,
+                 align_corners=True,
+                 jpu=True,
+                 pretrained=None):
+        super().__init__()
+        self.nclass = num_classes
+        self.aux = enable_auxiliary_loss
+        self.jpu = jpu
+
+        self.backbone = backbone
+        self.backbone_indices = backbone_indices
+        self.align_corners = align_corners
+
+        self.jpu = layers.JPU([512, 1024, 2048], width=512) if jpu else None
+        self.head = GIHead(in_channels=2048, nclass=num_classes)
+
+        if self.aux:
+            self.auxlayer = layers.AuxLayer(
+                1024, 1024 // 4, num_classes, bias_attr=False)
+
+        self.pretrained = pretrained
+        self.init_weight()
+
+    def base_forward(self, x):
+        feat_list = self.backbone(x)
+
+        c1, c2, c3, c4 = [feat_list[i] for i in self.backbone_indices]
+
+        if self.jpu:
+            return self.jpu(c1, c2, c3, c4)
+        else:
+            return c1, c2, c3, c4
+
+    def forward(self, x):
+        _, _, h, w = paddle.shape(x)
+        _, _, c3, c4 = self.base_forward(x)
+
+        logit_list = []
+        x, _ = self.head(c4)
+        logit_list.append(x)
+
+        if self.aux:
+            auxout = self.auxlayer(c3)
+
+            logit_list.append(auxout)
+
+        return [
+            F.interpolate(
+                logit, (h, w),
+                mode='bilinear',
+                align_corners=self.align_corners) for logit in logit_list
+        ]
+
+    def init_weight(self):
+        if self.pretrained is not None:
+            utils.load_entire_model(self, self.pretrained)
+
+
+class GIHead(nn.Layer):
+    """The Graph Interaction Network head."""
+
+    def __init__(self, in_channels, nclass):
+        super().__init__()
+        self.nclass = nclass
+        inter_channels = in_channels // 4
+        self.inp = paddle.zeros(shape=(nclass, 300), dtype='float32')
+        self.inp = paddle.create_parameter(
+            shape=self.inp.shape,
+            dtype=str(self.inp.numpy().dtype),
+            default_initializer=paddle.nn.initializer.Assign(self.inp))
+        self.inp.stop_gradient = True
+
+        self.fc1 = nn.Sequential(
+            nn.Linear(300, 128), nn.BatchNorm1D(128), nn.ReLU())
+        self.fc2 = nn.Sequential(
+            nn.Linear(128, 256), nn.BatchNorm1D(256), nn.ReLU())
+        self.conv5 = layers.ConvBNReLU(
+            in_channels,
+            inter_channels,
+            3,
+            padding=1,
+            bias_attr=False,
+            stride=1)
+
+        self.gloru = GlobalReasonUnit(
+            in_channels=inter_channels,
+            num_state=256,
+            num_node=84,
+            nclass=nclass)
+        self.conv6 = nn.Sequential(
+            nn.Dropout(0.1), nn.Conv2D(inter_channels, nclass, 1))
+
+    def forward(self, x):
+
+        B, C, H, W = paddle.shape(x)
+        inp = self.inp
+
+        inp = self.fc1(inp)
+        inp = self.fc2(inp).unsqueeze(axis=0).transpose((0, 2, 1))\
+                           .expand((B, 256, self.nclass))
+
+        out = self.conv5(x)
+
+        out, se_out = self.gloru(out, inp)
+        out = self.conv6(out)
+        return out, se_out
+
+
+class GlobalReasonUnit(nn.Layer):
+    """
+        The original paper refers to:
+            Chen, Yunpeng, et al. "Graph-Based Global Reasoning Networks" (https://arxiv.org/abs/1811.12814)
+    """
+
+    def __init__(self, in_channels, num_state=256, num_node=84, nclass=59):
+        super().__init__()
+        self.num_state = num_state
+        self.conv_theta = nn.Conv2D(
+            in_channels, num_node, kernel_size=1, stride=1, padding=0)
+        self.conv_phi = nn.Conv2D(
+            in_channels, num_state, kernel_size=1, stride=1, padding=0)
+        self.graph = GraphLayer(num_state, num_node, nclass)
+        self.extend_dim = nn.Conv2D(
+            num_state, in_channels, kernel_size=1, bias_attr=False)
+
+        self.bn = layers.SyncBatchNorm(in_channels)
+
+    def forward(self, x, inp):
+        B = self.conv_theta(x)
+        sizeB = paddle.shape(B)
+        B = paddle.flatten(B, 2, 3)
+
+        sizex = paddle.shape(x)
+        x_reduce = self.conv_phi(x)
+
+        x_reduce = paddle.flatten(x_reduce, 2, 3).transpose((0, 2, 1))
+
+        V = paddle.bmm(B, x_reduce).transpose((0, 2, 1))
+        V = paddle.divide(V, (sizex[2] * sizex[3]).astype('float32'))
+
+        class_node, new_V = self.graph(inp, V)
+        D = B.transpose((0, 2, 1))
+        Y = paddle.bmm(D, new_V.transpose((0, 2, 1)))
+        Y = Y.transpose((0, 2, 1)).reshape((sizex[0], self.num_state, \
+                                            sizex[2], -1))
+        Y = self.extend_dim(Y)
+        Y = self.bn(Y)
+        out = Y + x
+
+        return out, class_node
+
+
+class GraphLayer(nn.Layer):
+    def __init__(self, num_state, num_node, num_class):
+        super().__init__()
+        self.vis_gcn = GCN(num_state, num_node)
+        self.word_gcn = GCN(num_state, num_class)
+        self.transfer = GraphTransfer(num_state)
+        self.gamma_vis = paddle.zeros([num_node])
+        self.gamma_word = paddle.zeros([num_class])
+        self.gamma_vis = paddle.create_parameter(
+            shape=paddle.shape(self.gamma_vis),
+            dtype=str(self.gamma_vis.numpy().dtype),
+            default_initializer=paddle.nn.initializer.Assign(self.gamma_vis))
+        self.gamma_word = paddle.create_parameter(
+            shape=paddle.shape(self.gamma_word),
+            dtype=str(self.gamma_word.numpy().dtype),
+            default_initializer=paddle.nn.initializer.Assign(self.gamma_word))
+
+    def forward(self, inp, vis_node):
+        inp = self.word_gcn(inp)
+        new_V = self.vis_gcn(vis_node)
+        class_node, vis_node = self.transfer(inp, new_V)
+
+        class_node = self.gamma_word * inp + class_node
+        new_V = self.gamma_vis * vis_node + new_V
+        return class_node, new_V
+
+
+class GCN(nn.Layer):
+    def __init__(self, num_state=128, num_node=64, bias=False):
+        super().__init__()
+        self.conv1 = nn.Conv1D(
+            num_node,
+            num_node,
+            kernel_size=1,
+            padding=0,
+            stride=1,
+            groups=1,
+        )
+        self.relu = nn.ReLU()
+        self.conv2 = nn.Conv1D(
+            num_state,
+            num_state,
+            kernel_size=1,
+            padding=0,
+            stride=1,
+            groups=1,
+            bias_attr=bias)
+
+    def forward(self, x):
+        h = self.conv1(x.transpose((0, 2, 1))).transpose((0, 2, 1))
+        h = h + x
+        h = self.relu(h)
+        h = self.conv2(h)
+        return h
+
+
+class GraphTransfer(nn.Layer):
+    """Transfer vis graph to class node, transfer class node to vis feature"""
+
+    def __init__(self, in_dim):
+        super().__init__()
+        self.channle_in = in_dim
+        self.query_conv = nn.Conv1D(
+            in_channels=in_dim, out_channels=in_dim // 2, kernel_size=1)
+        self.key_conv = nn.Conv1D(
+            in_channels=in_dim, out_channels=in_dim // 2, kernel_size=1)
+        self.value_conv_vis = nn.Conv1D(
+            in_channels=in_dim, out_channels=in_dim, kernel_size=1)
+        self.value_conv_word = nn.Conv1D(
+            in_channels=in_dim, out_channels=in_dim, kernel_size=1)
+        self.softmax_vis = nn.Softmax(axis=-1)
+        self.softmax_word = nn.Softmax(axis=-2)
+
+    def forward(self, word, vis_node):
+        m_batchsize, C, Nc = paddle.shape(word)
+        m_batchsize, C, Nn = paddle.shape(vis_node)
+
+        proj_query = self.query_conv(word).reshape((m_batchsize, -1, Nc))\
+                                          .transpose((0, 2, 1))
+        proj_key = self.key_conv(vis_node).reshape((m_batchsize, -1, Nn))
+
+        energy = paddle.bmm(proj_query, proj_key)
+        attention_vis = self.softmax_vis(energy).transpose((0, 2, 1))
+        attention_word = self.softmax_word(energy)
+
+        proj_value_vis = self.value_conv_vis(vis_node).reshape((m_batchsize, -1,
+                                                                Nn))
+        proj_value_word = self.value_conv_word(word).reshape((m_batchsize, -1,
+                                                              Nc))
+
+        class_out = paddle.bmm(proj_value_vis, attention_vis)
+        node_out = paddle.bmm(proj_value_word, attention_word)
+        return class_out, node_out
--- a/paddlers/models/ppseg/models/gscnn.py
+++ b/paddlers/models/ppseg/models/gscnn.py
@ -0,0 +1,353 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import cv2
+import numpy as np
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.models import layers
+from paddlers.models.ppseg.models.backbones import resnet_vd
+from paddlers.models.ppseg.models import deeplab
+from paddlers.models.ppseg.utils import utils
+
+
+@manager.MODELS.add_component
+class GSCNN(nn.Layer):
+    """
+    The GSCNN implementation based on PaddlePaddle.
+    The original article refers to
+    Towaki Takikawa, et, al. "Gated-SCNN: Gated Shape CNNs for Semantic Segmentation"
+    (https://arxiv.org/pdf/1907.05740.pdf)
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone (paddle.nn.Layer): Backbone network, currently support Resnet50_vd/Resnet101_vd.
+        backbone_indices (tuple, optional): Two values in the tuple indicate the indices of output of backbone.
+           Default: (0, 1, 2, 3).
+        aspp_ratios (tuple, optional): The dilation rate using in ASSP module.
+            If output_stride=16, aspp_ratios should be set as (1, 6, 12, 18).
+            If output_stride=8, aspp_ratios is (1, 12, 24, 36).
+            Default: (1, 6, 12, 18).
+        aspp_out_channels (int, optional): The output channels of ASPP module. Default: 256.
+        align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
+            e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 backbone,
+                 backbone_indices=(0, 1, 2, 3),
+                 aspp_ratios=(1, 6, 12, 18),
+                 aspp_out_channels=256,
+                 align_corners=False,
+                 pretrained=None):
+        super().__init__()
+        self.backbone = backbone
+        backbone_channels = self.backbone.feat_channels
+        self.head = GSCNNHead(num_classes, backbone_indices, backbone_channels,
+                              aspp_ratios, aspp_out_channels, align_corners)
+        self.align_corners = align_corners
+        self.pretrained = pretrained
+        self.init_weight()
+
+    def forward(self, x):
+        feat_list = self.backbone(x)
+        logit_list = self.head(x, feat_list, self.backbone.conv1_logit)
+        seg_logit, edge_logit = [
+            F.interpolate(
+                logit,
+                x.shape[2:],
+                mode='bilinear',
+                align_corners=self.align_corners) for logit in logit_list
+        ]
+        return [seg_logit, (seg_logit, edge_logit), edge_logit, seg_logit]
+
+    def init_weight(self):
+        if self.pretrained is not None:
+            utils.load_entire_model(self, self.pretrained)
+
+
+class GSCNNHead(nn.Layer):
+    """
+    The GSCNNHead implementation based on PaddlePaddle.
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone_indices (tuple): Two values in the tuple indicate the indices of output of backbone.
+            the first index will be taken as a low-level feature in Decoder component;
+            the last one will be taken as input of ASPP component; the second to fourth
+            will be taken as input for GCL component.
+            Usually backbone consists of four downsampling stage, and return an output of
+            each stage. If we set it as (0, 1, 2, 3), it means taking feature map of the first
+            stage in backbone as low-level feature used in Decoder, feature map of the fourth
+            stage as input of ASPP, and the feature map of the second to fourth stage as input of GCL.
+        backbone_channels (tuple): The channels of output of backbone.
+        aspp_ratios (tuple): The dilation rates using in ASSP module.
+        aspp_out_channels (int): The output channels of ASPP module.
+        align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
+            is even, e.g. 1024x512, otherwise it is True, e.g. 769x769.
+    """
+
+    def __init__(self, num_classes, backbone_indices, backbone_channels,
+                 aspp_ratios, aspp_out_channels, align_corners):
+        super().__init__()
+        self.backbone_indices = backbone_indices
+        self.align_corners = align_corners
+
+        self.dsn1 = nn.Conv2D(
+            backbone_channels[backbone_indices[1]], 1, kernel_size=1)
+        self.dsn2 = nn.Conv2D(
+            backbone_channels[backbone_indices[2]], 1, kernel_size=1)
+        self.dsn3 = nn.Conv2D(
+            backbone_channels[backbone_indices[3]], 1, kernel_size=1)
+
+        self.res1 = resnet_vd.BasicBlock(64, 64, stride=1)
+        self.d1 = nn.Conv2D(64, 32, kernel_size=1)
+        self.gate1 = GatedSpatailConv2d(32, 32)
+        self.res2 = resnet_vd.BasicBlock(32, 32, stride=1)
+        self.d2 = nn.Conv2D(32, 16, kernel_size=1)
+        self.gate2 = GatedSpatailConv2d(16, 16)
+        self.res3 = resnet_vd.BasicBlock(16, 16, stride=1)
+        self.d3 = nn.Conv2D(16, 8, kernel_size=1)
+        self.gate3 = GatedSpatailConv2d(8, 8)
+        self.fuse = nn.Conv2D(8, 1, kernel_size=1, bias_attr=False)
+
+        self.cw = nn.Conv2D(2, 1, kernel_size=1, bias_attr=False)
+
+        self.aspp = ASPPModule(
+            aspp_ratios=aspp_ratios,
+            in_channels=backbone_channels[-1],
+            out_channels=aspp_out_channels,
+            align_corners=self.align_corners,
+            image_pooling=True)
+
+        self.decoder = deeplab.Decoder(
+            num_classes=num_classes,
+            in_channels=backbone_channels[0],
+            align_corners=self.align_corners)
+
+    def forward(self, x, feat_list, s_input):
+        input_shape = paddle.shape(x)
+        m1f = F.interpolate(
+            s_input,
+            input_shape[2:],
+            mode='bilinear',
+            align_corners=self.align_corners)
+
+        l1, l2, l3 = [
+            feat_list[self.backbone_indices[i]]
+            for i in range(1, len(self.backbone_indices))
+        ]
+        s1 = F.interpolate(
+            self.dsn1(l1),
+            input_shape[2:],
+            mode='bilinear',
+            align_corners=self.align_corners)
+        s2 = F.interpolate(
+            self.dsn2(l2),
+            input_shape[2:],
+            mode='bilinear',
+            align_corners=self.align_corners)
+        s3 = F.interpolate(
+            self.dsn3(l3),
+            input_shape[2:],
+            mode='bilinear',
+            align_corners=self.align_corners)
+
+        # Get image gradient
+        im_arr = x.numpy().transpose((0, 2, 3, 1))
+        im_arr = ((im_arr * 0.5 + 0.5) * 255).astype(np.uint8)
+        canny = np.zeros((input_shape[0], 1, input_shape[2], input_shape[3]))
+        for i in range(input_shape[0]):
+            canny[i] = cv2.Canny(im_arr[i], 10, 100)
+        canny = canny / 255
+        canny = paddle.to_tensor(canny).astype('float32')
+        canny.stop_gradient = True
+
+        cs = self.res1(m1f)
+        cs = F.interpolate(
+            cs,
+            input_shape[2:],
+            mode='bilinear',
+            align_corners=self.align_corners)
+        cs = self.d1(cs)
+        cs = self.gate1(cs, s1)
+
+        cs = self.res2(cs)
+        cs = F.interpolate(
+            cs,
+            input_shape[2:],
+            mode='bilinear',
+            align_corners=self.align_corners)
+        cs = self.d2(cs)
+        cs = self.gate2(cs, s2)
+
+        cs = self.res3(cs)
+        cs = F.interpolate(
+            cs,
+            input_shape[2:],
+            mode='bilinear',
+            align_corners=self.align_corners)
+        cs = self.d3(cs)
+        cs = self.gate3(cs, s3)
+
+        cs = self.fuse(cs)
+        cs = F.interpolate(
+            cs,
+            input_shape[2:],
+            mode='bilinear',
+            align_corners=self.align_corners)
+        edge_out = F.sigmoid(cs)  # Ouput of shape stream
+
+        cat = paddle.concat([edge_out, canny], axis=1)
+        acts = self.cw(cat)
+        acts = F.sigmoid(acts)  # Input of fusion module
+
+        x = self.aspp(l3, acts)
+
+        low_level_feat = feat_list[self.backbone_indices[0]]
+        logit = self.decoder(x, low_level_feat)
+        logit_list = [logit, edge_out]
+        return logit_list
+
+
+class GatedSpatailConv2d(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size=1,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 groups=1,
+                 bias_attr=False):
+        super().__init__()
+        self._gate_conv = nn.Sequential(
+            layers.SyncBatchNorm(in_channels + 1),
+            nn.Conv2D(in_channels + 1, in_channels + 1, kernel_size=1),
+            nn.ReLU(), nn.Conv2D(in_channels + 1, 1, kernel_size=1),
+            layers.SyncBatchNorm(1), nn.Sigmoid())
+        self.conv = nn.Conv2D(
+            in_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias_attr=bias_attr)
+
+    def forward(self, input_features, gating_features):
+        cat = paddle.concat([input_features, gating_features], axis=1)
+        alphas = self._gate_conv(cat)
+        x = input_features * (alphas + 1)
+        x = self.conv(x)
+        return x
+
+
+class ASPPModule(nn.Layer):
+    """
+    Atrous Spatial Pyramid Pooling.
+    Args:
+        aspp_ratios (tuple): The dilation rate using in ASSP module.
+        in_channels (int): The number of input channels.
+        out_channels (int): The number of output channels.
+        align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
+            is even, e.g. 1024x512, otherwise it is True, e.g. 769x769.
+        use_sep_conv (bool, optional): If using separable conv in ASPP module. Default: False.
+        image_pooling (bool, optional): If augmented with image-level features. Default: False
+    """
+
+    def __init__(self,
+                 aspp_ratios,
+                 in_channels,
+                 out_channels,
+                 align_corners,
+                 use_sep_conv=False,
+                 image_pooling=False):
+        super().__init__()
+
+        self.align_corners = align_corners
+        self.aspp_blocks = nn.LayerList()
+
+        for ratio in aspp_ratios:
+            if use_sep_conv and ratio > 1:
+                conv_func = layers.SeparableConvBNReLU
+            else:
+                conv_func = layers.ConvBNReLU
+
+            block = conv_func(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=1 if ratio == 1 else 3,
+                dilation=ratio,
+                padding=0 if ratio == 1 else ratio)
+            self.aspp_blocks.append(block)
+
+        out_size = len(self.aspp_blocks)
+
+        if image_pooling:
+            self.global_avg_pool = nn.Sequential(
+                nn.AdaptiveAvgPool2D(output_size=(1, 1)),
+                layers.ConvBNReLU(
+                    in_channels, out_channels, kernel_size=1, bias_attr=False))
+            out_size += 1
+        self.image_pooling = image_pooling
+
+        self.edge_conv = layers.ConvBNReLU(
+            1, out_channels, kernel_size=1, bias_attr=False)
+        out_size += 1
+
+        self.conv_bn_relu = layers.ConvBNReLU(
+            in_channels=out_channels * out_size,
+            out_channels=out_channels,
+            kernel_size=1)
+
+        self.dropout = nn.Dropout(p=0.1)  # drop rate
+
+    def forward(self, x, edge):
+        outputs = []
+        x_shape = paddle.shape(x)
+        for block in self.aspp_blocks:
+            y = block(x)
+            y = F.interpolate(
+                y,
+                x_shape[2:],
+                mode='bilinear',
+                align_corners=self.align_corners)
+            outputs.append(y)
+
+        if self.image_pooling:
+            img_avg = self.global_avg_pool(x)
+            img_avg = F.interpolate(
+                img_avg,
+                x_shape[2:],
+                mode='bilinear',
+                align_corners=self.align_corners)
+            outputs.append(img_avg)
+
+        edge_features = F.interpolate(
+            edge,
+            size=x_shape[2:],
+            mode='bilinear',
+            align_corners=self.align_corners)
+        edge_features = self.edge_conv(edge_features)
+        outputs.append(edge_features)
+
+        x = paddle.concat(outputs, axis=1)
+        x = self.conv_bn_relu(x)
+        x = self.dropout(x)
+        return x
--- a/paddlers/models/ppseg/models/hardnet.py
+++ b/paddlers/models/ppseg/models/hardnet.py
@ -0,0 +1,308 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.models import layers
+from paddlers.models.ppseg.utils import utils
+
+
+@manager.MODELS.add_component
+class HarDNet(nn.Layer):
+    """
+    [Real Time] The FC-HardDNet 70 implementation based on PaddlePaddle.
+    The original article refers to
+        Chao, Ping, et al. "HarDNet: A Low Memory Traffic Network"
+        (https://arxiv.org/pdf/1909.00948.pdf)
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        stem_channels (tuple|list, optional): The number of channels before the encoder. Default: (16, 24, 32, 48).
+        ch_list (tuple|list, optional): The number of channels at each block in the encoder. Default: (64, 96, 160, 224, 320).
+        grmul (float, optional): The channel multiplying factor in HarDBlock, which is m in the paper. Default: 1.7.
+        gr (tuple|list, optional): The growth rate in each HarDBlock, which is k in the paper. Default: (10, 16, 18, 24, 32).
+        n_layers (tuple|list, optional): The number of layers in each HarDBlock. Default: (4, 4, 8, 8, 8).
+        align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
+            is even, e.g. 1024x512, otherwise it is True, e.g. 769x769.  Default: False.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 stem_channels=(16, 24, 32, 48),
+                 ch_list=(64, 96, 160, 224, 320),
+                 grmul=1.7,
+                 gr=(10, 16, 18, 24, 32),
+                 n_layers=(4, 4, 8, 8, 8),
+                 align_corners=False,
+                 pretrained=None):
+
+        super().__init__()
+        self.align_corners = align_corners
+        self.pretrained = pretrained
+        encoder_blks_num = len(n_layers)
+        decoder_blks_num = encoder_blks_num - 1
+        encoder_in_channels = stem_channels[3]
+
+        self.stem = nn.Sequential(
+            layers.ConvBNReLU(
+                3, stem_channels[0], kernel_size=3, bias_attr=False),
+            layers.ConvBNReLU(
+                stem_channels[0],
+                stem_channels[1],
+                kernel_size=3,
+                bias_attr=False),
+            layers.ConvBNReLU(
+                stem_channels[1],
+                stem_channels[2],
+                kernel_size=3,
+                stride=2,
+                bias_attr=False),
+            layers.ConvBNReLU(
+                stem_channels[2],
+                stem_channels[3],
+                kernel_size=3,
+                bias_attr=False))
+
+        self.encoder = Encoder(encoder_blks_num, encoder_in_channels, ch_list,
+                               gr, grmul, n_layers)
+
+        skip_connection_channels = self.encoder.get_skip_channels()
+        decoder_in_channels = self.encoder.get_out_channels()
+
+        self.decoder = Decoder(decoder_blks_num, decoder_in_channels,
+                               skip_connection_channels, gr, grmul, n_layers,
+                               align_corners)
+
+        self.cls_head = nn.Conv2D(
+            in_channels=self.decoder.get_out_channels(),
+            out_channels=num_classes,
+            kernel_size=1)
+
+        self.init_weight()
+
+    def forward(self, x):
+        input_shape = paddle.shape(x)[2:]
+        x = self.stem(x)
+        x, skip_connections = self.encoder(x)
+        x = self.decoder(x, skip_connections)
+        logit = self.cls_head(x)
+        logit = F.interpolate(
+            logit,
+            size=input_shape,
+            mode="bilinear",
+            align_corners=self.align_corners)
+        return [logit]
+
+    def init_weight(self):
+        if self.pretrained is not None:
+            utils.load_entire_model(self, self.pretrained)
+
+
+class Encoder(nn.Layer):
+    """The Encoder implementation of FC-HardDNet 70.
+
+    Args:
+        n_blocks (int): The number of blocks in the Encoder module.
+        in_channels (int): The number of input channels.
+        ch_list (tuple|list): The number of channels at each block in the encoder.
+        grmul (float): The channel multiplying factor in HarDBlock, which is m in the paper.
+        gr (tuple|list): The growth rate in each HarDBlock, which is k in the paper.
+        n_layers (tuple|list): The number of layers in each HarDBlock.
+    """
+
+    def __init__(self, n_blocks, in_channels, ch_list, gr, grmul, n_layers):
+        super().__init__()
+        self.skip_connection_channels = []
+        self.shortcut_layers = []
+        self.blks = nn.LayerList()
+        ch = in_channels
+        for i in range(n_blocks):
+            blk = HarDBlock(ch, gr[i], grmul, n_layers[i])
+            ch = blk.get_out_ch()
+            self.skip_connection_channels.append(ch)
+            self.blks.append(blk)
+            if i < n_blocks - 1:
+                self.shortcut_layers.append(len(self.blks) - 1)
+            self.blks.append(
+                layers.ConvBNReLU(
+                    ch, ch_list[i], kernel_size=1, bias_attr=False))
+
+            ch = ch_list[i]
+            if i < n_blocks - 1:
+                self.blks.append(nn.AvgPool2D(kernel_size=2, stride=2))
+        self.out_channels = ch
+
+    def forward(self, x):
+        skip_connections = []
+        for i in range(len(self.blks)):
+            x = self.blks[i](x)
+            if i in self.shortcut_layers:
+                skip_connections.append(x)
+        return x, skip_connections
+
+    def get_skip_channels(self):
+        return self.skip_connection_channels
+
+    def get_out_channels(self):
+        return self.out_channels
+
+
+class Decoder(nn.Layer):
+    """The Decoder implementation of FC-HardDNet 70.
+
+    Args:
+        n_blocks (int): The number of blocks in the Encoder module.
+        in_channels (int): The number of input channels.
+        skip_connection_channels (tuple|list): The channels of shortcut layers in encoder.
+        grmul (float): The channel multiplying factor in HarDBlock, which is m in the paper.
+        gr (tuple|list): The growth rate in each HarDBlock, which is k in the paper.
+        n_layers (tuple|list): The number of layers in each HarDBlock.
+    """
+
+    def __init__(self,
+                 n_blocks,
+                 in_channels,
+                 skip_connection_channels,
+                 gr,
+                 grmul,
+                 n_layers,
+                 align_corners=False):
+        super().__init__()
+        prev_block_channels = in_channels
+        self.n_blocks = n_blocks
+        self.dense_blocks_up = nn.LayerList()
+        self.conv1x1_up = nn.LayerList()
+
+        for i in range(n_blocks - 1, -1, -1):
+            cur_channels_count = prev_block_channels + skip_connection_channels[
+                i]
+            conv1x1 = layers.ConvBNReLU(
+                cur_channels_count,
+                cur_channels_count // 2,
+                kernel_size=1,
+                bias_attr=False)
+            blk = HarDBlock(
+                base_channels=cur_channels_count // 2,
+                growth_rate=gr[i],
+                grmul=grmul,
+                n_layers=n_layers[i])
+
+            self.conv1x1_up.append(conv1x1)
+            self.dense_blocks_up.append(blk)
+
+            prev_block_channels = blk.get_out_ch()
+
+        self.out_channels = prev_block_channels
+        self.align_corners = align_corners
+
+    def forward(self, x, skip_connections):
+        for i in range(self.n_blocks):
+            skip = skip_connections.pop()
+            x = F.interpolate(
+                x,
+                size=paddle.shape(skip)[2:],
+                mode="bilinear",
+                align_corners=self.align_corners)
+            x = paddle.concat([x, skip], axis=1)
+            x = self.conv1x1_up[i](x)
+            x = self.dense_blocks_up[i](x)
+        return x
+
+    def get_out_channels(self):
+        return self.out_channels
+
+
+class HarDBlock(nn.Layer):
+    """The HarDBlock implementation
+
+    Args:
+        base_channels (int): The base channels.
+        growth_rate (tuple|list): The growth rate.
+        grmul (float): The channel multiplying factor.
+        n_layers (tuple|list): The number of layers.
+        keepBase (bool, optional): A bool value indicates whether concatenating the first layer. Default: False.
+    """
+
+    def __init__(self,
+                 base_channels,
+                 growth_rate,
+                 grmul,
+                 n_layers,
+                 keepBase=False):
+        super().__init__()
+        self.keepBase = keepBase
+        self.links = []
+        layers_ = []
+        self.out_channels = 0
+        for i in range(n_layers):
+            outch, inch, link = get_link(i + 1, base_channels, growth_rate,
+                                         grmul)
+
+            self.links.append(link)
+            layers_.append(
+                layers.ConvBNReLU(inch, outch, kernel_size=3, bias_attr=False))
+            if (i % 2 == 0) or (i == n_layers - 1):
+                self.out_channels += outch
+        self.layers = nn.LayerList(layers_)
+
+    def forward(self, x):
+        layers_ = [x]
+        for layer in range(len(self.layers)):
+            link = self.links[layer]
+            tin = []
+            for i in link:
+                tin.append(layers_[i])
+            if len(tin) > 1:
+                x = paddle.concat(tin, axis=1)
+            else:
+                x = tin[0]
+            out = self.layers[layer](x)
+            layers_.append(out)
+
+        t = len(layers_)
+        out_ = []
+        for i in range(t):
+            if (i == 0 and self.keepBase) or \
+                (i == t - 1) or (i % 2 == 1):
+                out_.append(layers_[i])
+        out = paddle.concat(out_, 1)
+
+        return out
+
+    def get_out_ch(self):
+        return self.out_channels
+
+
+def get_link(layer, base_ch, growth_rate, grmul):
+    if layer == 0:
+        return base_ch, 0, []
+    out_channels = growth_rate
+    link = []
+    for i in range(10):
+        dv = 2**i
+        if layer % dv == 0:
+            k = layer - dv
+            link.insert(0, k)
+            if i > 0:
+                out_channels *= grmul
+    out_channels = int(int(out_channels + 1) / 2) * 2
+    in_channels = 0
+    for i in link:
+        ch, _, _ = get_link(i, base_ch, growth_rate, grmul)
+        in_channels += ch
+    return out_channels, in_channels, link
--- a/paddlers/models/ppseg/models/hrnet_contrast.py
+++ b/paddlers/models/ppseg/models/hrnet_contrast.py
@ -0,0 +1,127 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.models import layers
+from paddlers.models.ppseg.utils import utils
+
+
+@manager.MODELS.add_component
+class HRNetW48Contrast(nn.Layer):
+    """
+    The HRNetW48Contrast implementation based on PaddlePaddle.
+
+    The original article refers to
+    Wenguan Wang, Tianfei Zhou, et al. "Exploring Cross-Image Pixel Contrast for Semantic Segmentation"
+    (https://arxiv.org/abs/2101.11939).
+
+    Args:
+        in_channels (int): The number of channels of the backbone feature fed to the heads.
+        num_classes (int): The unique number of target classes.
+        backbone (Paddle.nn.Layer): Backbone network, currently support HRNet_W48.
+        drop_prob (float): The probability of dropout.
+        proj_dim (int): The projection dimensions.
+        align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
+            e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 num_classes,
+                 backbone,
+                 drop_prob,
+                 proj_dim,
+                 align_corners=False,
+                 pretrained=None):
+        super().__init__()
+        self.in_channels = in_channels
+        self.backbone = backbone
+        self.num_classes = num_classes
+        self.proj_dim = proj_dim
+        self.align_corners = align_corners
+
+        # Classification head: 3x3 conv-bn-relu, dropout, then a 1x1 classifier.
+        refine = layers.ConvBNReLU(
+            in_channels, in_channels, kernel_size=3, stride=1, padding=1)
+        dropout = nn.Dropout2D(drop_prob)
+        classifier = nn.Conv2D(
+            in_channels,
+            num_classes,
+            kernel_size=1,
+            stride=1,
+            bias_attr=False)
+        self.cls_head = nn.Sequential(refine, dropout, classifier)
+
+        # Embedding head, only evaluated while training (see `forward`).
+        self.proj_head = ProjectionHead(
+            dim_in=in_channels, proj_dim=self.proj_dim)
+
+        self.pretrained = pretrained
+        self.init_weight()
+
+    def init_weight(self):
+        """Load the whole model from `self.pretrained` when it is set."""
+        if self.pretrained is None:
+            return
+        utils.load_entire_model(self, self.pretrained)
+
+    def forward(self, x):
+        """Compute segmentation logits, plus embeddings while training.
+
+        Returns:
+            list: `[logit]` in evaluation mode, where `logit` is resized to the
+                spatial size of `x`. In training mode a second element is
+                appended: a dict with the un-resized logits under 'seg' and
+                the projection-head output under 'embed'.
+        """
+        feats = self.backbone(x)[0]
+        out = self.cls_head(feats)
+        emb = self.proj_head(feats) if self.training else None
+
+        resized = F.interpolate(
+            out,
+            paddle.shape(x)[2:],
+            mode='bilinear',
+            align_corners=self.align_corners)
+
+        if self.training:
+            return [resized, {'seg': out, 'embed': emb}]
+        return [resized]
+
+
+class ProjectionHead(nn.Layer):
+    """
+    The projection head used by contrast learning.
+
+    The projected features are L2-normalized along axis 1.
+
+    Args:
+        dim_in (int): The dimensions of input features.
+        proj_dim (int, optional): The output dimensions of projection head. Default: 256.
+        proj (str, optional): The type of projection head, only support 'linear' and 'convmlp'. Default: 'convmlp'.
+
+    Raises:
+        ValueError: When `proj` is neither 'linear' nor 'convmlp'.
+    """
+
+    def __init__(self, dim_in, proj_dim=256, proj='convmlp'):
+        super(ProjectionHead, self).__init__()
+        if proj not in ('linear', 'convmlp'):
+            raise ValueError(
+                "The type of project head only support 'linear' and 'convmlp', but got {}."
+                .format(proj))
+
+        if proj == 'linear':
+            # A single 1x1 convolution.
+            self.proj = nn.Conv2D(dim_in, proj_dim, kernel_size=1)
+        else:
+            # 1x1 conv-bn-relu that keeps the width, then a 1x1 projection.
+            hidden = layers.ConvBNReLU(dim_in, dim_in, kernel_size=1)
+            project = nn.Conv2D(dim_in, proj_dim, kernel_size=1)
+            self.proj = nn.Sequential(hidden, project)
+
+    def forward(self, x):
+        projected = self.proj(x)
+        return F.normalize(projected, p=2, axis=1)
--- a/paddlers/models/ppseg/models/isanet.py
+++ b/paddlers/models/ppseg/models/isanet.py
@ -0,0 +1,197 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.models import layers
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.utils import utils
+
+
+@manager.MODELS.add_component
+class ISANet(nn.Layer):
+    """Interlaced Sparse Self-Attention for Semantic Segmentation.
+
+    The original article refers to Lang Huang, et al. "Interlaced Sparse Self-Attention for Semantic Segmentation"
+    (https://arxiv.org/abs/1907.12273).
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone (Paddle.nn.Layer): A backbone network.
+        backbone_indices (tuple): The values in the tuple indicate the indices of output of backbone.
+        isa_channels (int): The channels of ISA Module.
+        down_factor (tuple): Divide the height and width dimension to (Ph, PW) groups.
+        enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True.
+        align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
+            is even, e.g. 1024x512, otherwise it is True, e.g. 769x769.  Default: False.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+
+    """
+
+    def __init__(self,
+                 num_classes,
+                 backbone,
+                 backbone_indices=(2, 3),
+                 isa_channels=256,
+                 down_factor=(8, 8),
+                 enable_auxiliary_loss=True,
+                 align_corners=False,
+                 pretrained=None):
+        super().__init__()
+
+        self.backbone = backbone
+        self.backbone_indices = backbone_indices
+        # Channel counts of the backbone outputs that the head consumes.
+        feat_channels = self.backbone.feat_channels
+        head_in_channels = [feat_channels[idx] for idx in backbone_indices]
+        self.head = ISAHead(num_classes, head_in_channels, isa_channels,
+                            down_factor, enable_auxiliary_loss)
+        self.align_corners = align_corners
+        self.pretrained = pretrained
+        self.init_weight()
+
+    def forward(self, x):
+        """Return the head logits, each resized to the spatial size of `x`."""
+        backbone_outs = self.backbone(x)
+        selected = [backbone_outs[idx] for idx in self.backbone_indices]
+
+        resized = []
+        for logit in self.head(selected):
+            resized.append(
+                F.interpolate(
+                    logit,
+                    paddle.shape(x)[2:],
+                    mode='bilinear',
+                    align_corners=self.align_corners,
+                    align_mode=1))
+        return resized
+
+    def init_weight(self):
+        """Load the whole model from `self.pretrained` when it is set."""
+        if self.pretrained is None:
+            return
+        utils.load_entire_model(self, self.pretrained)
+
+
+class ISAHead(nn.Layer):
+    """
+    The ISAHead.
+
+    The last input feature is reduced to a quarter of its channels, passed
+    through two self-attention blocks applied on interlaced partitions of the
+    feature map, fused with the reduced feature and classified. An auxiliary
+    classifier runs on the first input feature.
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        in_channels (tuple): The number of input channels of the two input
+            features; `in_channels[0]` feeds the auxiliary head and
+            `in_channels[-1]` feeds the attention branch.
+        isa_channels (int): The channels of ISA Module.
+        down_factor (tuple): Divide the height and width dimension to (Ph, PW) groups.
+        enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True.
+    """
+
+    def __init__(self, num_classes, in_channels, isa_channels, down_factor,
+                 enable_auxiliary_loss):
+        super(ISAHead, self).__init__()
+        self.in_channels = in_channels[-1]
+        inter_channels = self.in_channels // 4
+        self.inter_channels = inter_channels
+        self.down_factor = down_factor
+        self.enable_auxiliary_loss = enable_auxiliary_loss
+        self.in_conv = layers.ConvBNReLU(
+            self.in_channels, inter_channels, 3, bias_attr=False)
+        self.global_relation = SelfAttentionBlock(inter_channels, isa_channels)
+        self.local_relation = SelfAttentionBlock(inter_channels, isa_channels)
+        self.out_conv = layers.ConvBNReLU(
+            inter_channels * 2, inter_channels, 1, bias_attr=False)
+        self.cls = nn.Sequential(
+            nn.Dropout2D(p=0.1), nn.Conv2D(inter_channels, num_classes, 1))
+        # The auxiliary head consumes C3 (the first input feature), so its
+        # input width must follow `in_channels[0]` instead of a fixed 1024.
+        # The layer is always built, even when the auxiliary loss is disabled.
+        self.aux = nn.Sequential(
+            layers.ConvBNReLU(
+                in_channels=in_channels[0],
+                out_channels=256,
+                kernel_size=3,
+                bias_attr=False), nn.Dropout2D(p=0.1),
+            nn.Conv2D(256, num_classes, 1))
+
+    def forward(self, feat_list):
+        """Compute the main logits and, optionally, the auxiliary logits.
+
+        Args:
+            feat_list (list): Exactly two features `(C3, C4)`.
+
+        Returns:
+            list: `[output, auxout]` when `enable_auxiliary_loss` is truthy,
+                otherwise `[output]`.
+        """
+        C3, C4 = feat_list
+        x = self.in_conv(C4)
+        x_shape = paddle.shape(x)
+        P_h, P_w = self.down_factor
+        # Q_* is the number of positions per group along each axis, rounded up
+        # so that P_* groups cover the whole (padded) feature map.
+        Q_h, Q_w = paddle.ceil(x_shape[2] / P_h).astype('int32'), paddle.ceil(
+            x_shape[3] / P_w).astype('int32')
+        pad_h, pad_w = (Q_h * P_h - x_shape[2]).astype('int32'), (
+            Q_w * P_w - x_shape[3]).astype('int32')
+        if pad_h > 0 or pad_w > 0:
+            # Split the padding as evenly as possible between both sides.
+            padding = paddle.concat([
+                pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2
+            ],
+                                    axis=0)
+            feat = F.pad(x, padding)
+        else:
+            feat = x
+
+        # [N, C, Q_h, P_h, Q_w, P_w] -> [N * P_h * P_w, C, Q_h, Q_w]
+        feat = feat.reshape([0, x_shape[1], Q_h, P_h, Q_w, P_w])
+        feat = feat.transpose([0, 3, 5, 1, 2,
+                               4]).reshape([-1, self.inter_channels, Q_h, Q_w])
+        feat = self.global_relation(feat)
+
+        # [N, P_h, P_w, C, Q_h, Q_w] -> [N * Q_h * Q_w, C, P_h, P_w]
+        feat = feat.reshape([x_shape[0], P_h, P_w, x_shape[1], Q_h, Q_w])
+        feat = feat.transpose([0, 4, 5, 3, 1,
+                               2]).reshape([-1, self.inter_channels, P_h, P_w])
+        feat = self.local_relation(feat)
+
+        # [N, Q_h, Q_w, C, P_h, P_w] -> [N, C, Q_h * P_h, Q_w * P_w]
+        feat = feat.reshape([x_shape[0], Q_h, Q_w, x_shape[1], P_h, P_w])
+        feat = feat.transpose([0, 3, 1, 4, 2, 5]).reshape(
+            [0, self.inter_channels, P_h * Q_h, P_w * Q_w])
+        if pad_h > 0 or pad_w > 0:
+            # Crop the padding added above to restore the size of `x`.
+            feat = paddle.slice(
+                feat,
+                axes=[2, 3],
+                starts=[pad_h // 2, pad_w // 2],
+                ends=[pad_h // 2 + x_shape[2], pad_w // 2 + x_shape[3]])
+
+        feat = self.out_conv(paddle.concat([feat, x], axis=1))
+        output = self.cls(feat)
+
+        if self.enable_auxiliary_loss:
+            auxout = self.aux(C3)
+            return [output, auxout]
+        else:
+            return [output]
+
+
+class SelfAttentionBlock(layers.AttentionBlock):
+    """General self-attention block/non-local block.
+
+    The same tensor is used as query and key, and the attended context is
+    passed through an extra 1x1 conv-bn-relu projection.
+
+       Args:
+            in_channels (int): Input channels of key/query feature.
+            channels (int): Output channels of key/query transform.
+    """
+
+    def __init__(self, in_channels, channels):
+        attention_config = dict(
+            key_in_channels=in_channels,
+            query_in_channels=in_channels,
+            channels=channels,
+            out_channels=in_channels,
+            share_key_query=False,
+            query_downsample=None,
+            key_downsample=None,
+            key_query_num_convs=2,
+            key_query_norm=True,
+            value_out_num_convs=1,
+            value_out_norm=False,
+            matmul_norm=True,
+            with_out=False)
+        super().__init__(**attention_config)
+
+        # The parent is built without an output projection (with_out=False);
+        # this block applies its own channel-preserving one instead.
+        self.output_project = self.build_project(
+            in_channels, in_channels, num_convs=1, use_conv_module=True)
+
+    def forward(self, x):
+        attended = super().forward(x, x)
+        return self.output_project(attended)
--- a/paddlers/models/ppseg/models/layers/init.py
+++ b/paddlers/models/ppseg/models/layers/init.py
@ -0,0 +1,20 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .layer_libs import ConvBNReLU, ConvBN, SeparableConvBNReLU, DepthwiseConvBN, AuxLayer, SyncBatchNorm, JPU, ConvBNPReLU
+from .activation import Activation
+from .pyramid_pool import ASPPModule, PPModule
+from .attention import AttentionBlock
+from .nonlocal2d import NonLocal2D
+from .wrap_functions import *
--- a/paddlers/models/ppseg/models/layers/activation.py
+++ b/paddlers/models/ppseg/models/layers/activation.py
@ -0,0 +1,73 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.nn as nn
+
+
+class Activation(nn.Layer):
+    """
+    The wrapper of activations.
+
+    Args:
+        act (str, optional): The activation name in lowercase. It must be one of ['elu', 'gelu',
+            'hardshrink', 'tanh', 'hardtanh', 'prelu', 'relu', 'relu6', 'selu', 'leakyrelu', 'sigmoid',
+            'softmax', 'softplus', 'softshrink', 'softsign', 'tanhshrink', 'logsigmoid', 'logsoftmax',
+            'hsigmoid']. Default: None, means identical transformation.
+
+    Returns:
+        A callable object of Activation.
+
+    Raises:
+        KeyError: When parameter `act` is not in the optional range.
+
+    Examples:
+
+        from paddlers.models.ppseg.models.layers.activation import Activation
+
+        relu = Activation("relu")
+        # relu.act_func is an instance of paddle.nn.layer.activation.ReLU
+
+        sigmoid = Activation("sigmoid")
+        # sigmoid.act_func is an instance of paddle.nn.layer.activation.Sigmoid
+
+        not_exit_one = Activation("not_exit_one")
+        # KeyError: "not_exit_one does not exist in the current dict_keys(['elu', 'gelu', 'hardshrink',
+        # 'tanh', 'hardtanh', 'prelu', 'relu', 'relu6', 'selu', 'leakyrelu', 'sigmoid', 'softmax',
+        # 'softplus', 'softshrink', 'softsign', 'tanhshrink', 'logsigmoid', 'logsoftmax', 'hsigmoid'])"
+    """
+
+    def __init__(self, act=None):
+        super(Activation, self).__init__()
+
+        self._act = act
+        # Map the lowercase spelling of every name exported by
+        # paddle.nn.layer.activation to its real spelling.
+        upper_act_names = nn.layer.activation.__dict__.keys()
+        lower_act_names = [name.lower() for name in upper_act_names]
+        act_dict = dict(zip(lower_act_names, upper_act_names))
+
+        if act is not None:
+            if act in act_dict:
+                # Resolve the class by attribute lookup rather than eval() on
+                # a formatted string.
+                act_cls = getattr(nn.layer.activation, act_dict[act])
+                self.act_func = act_cls()
+            else:
+                raise KeyError("{} does not exist in the current {}".format(
+                    act, act_dict.keys()))
+
+    def forward(self, x):
+        """Apply the wrapped activation, or return `x` unchanged when `act` is None."""
+        if self._act is not None:
+            return self.act_func(x)
+        else:
+            return x
--- a/paddlers/models/ppseg/models/layers/attention.py
+++ b/paddlers/models/ppseg/models/layers/attention.py
@ -0,0 +1,146 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.models import layers
+
+
+class AttentionBlock(nn.Layer):
+    """General self-attention block/non-local block.
+
+    The original article refers to https://arxiv.org/abs/1706.03762.
+    Args:
+        key_in_channels (int): Input channels of key feature.
+        query_in_channels (int): Input channels of query feature.
+        channels (int): Output channels of key/query transform.
+        out_channels (int): Output channels.
+        share_key_query (bool): Whether share projection weight between key
+            and query projection.
+        query_downsample (nn.Layer): Query downsample module.
+        key_downsample (nn.Layer): Key downsample module.
+        key_query_num_convs (int): Number of convs for key/query projection.
+        value_out_num_convs (int): Number of convs for value projection.
+        key_query_norm (bool): Whether to use BN for key/query projection.
+        value_out_norm (bool): Whether to use BN for value projection.
+        matmul_norm (bool): Whether normalize attention map with sqrt of
+            channels
+        with_out (bool): Whether use out projection.
+    """
+
+    def __init__(self, key_in_channels, query_in_channels, channels,
+                 out_channels, share_key_query, query_downsample,
+                 key_downsample, key_query_num_convs, value_out_num_convs,
+                 key_query_norm, value_out_norm, matmul_norm, with_out):
+        super(AttentionBlock, self).__init__()
+        if share_key_query:
+            assert key_in_channels == query_in_channels
+        self.with_out = with_out
+        self.key_in_channels = key_in_channels
+        self.query_in_channels = query_in_channels
+        self.out_channels = out_channels
+        self.channels = channels
+        self.share_key_query = share_key_query
+        self.key_project = self.build_project(
+            key_in_channels,
+            channels,
+            num_convs=key_query_num_convs,
+            use_conv_module=key_query_norm)
+        if share_key_query:
+            self.query_project = self.key_project
+        else:
+            self.query_project = self.build_project(
+                query_in_channels,
+                channels,
+                num_convs=key_query_num_convs,
+                use_conv_module=key_query_norm)
+
+        # Without an output projection the value projection must already
+        # produce `out_channels`; otherwise it stays at `channels`.
+        self.value_project = self.build_project(
+            key_in_channels,
+            channels if self.with_out else out_channels,
+            num_convs=value_out_num_convs,
+            use_conv_module=value_out_norm)
+
+        if self.with_out:
+            self.out_project = self.build_project(
+                channels,
+                out_channels,
+                num_convs=value_out_num_convs,
+                use_conv_module=value_out_norm)
+        else:
+            self.out_project = None
+
+        self.query_downsample = query_downsample
+        self.key_downsample = key_downsample
+        self.matmul_norm = matmul_norm
+
+    def build_project(self, in_channels, channels, num_convs, use_conv_module):
+        """Build a stack of `num_convs` 1x1 projections.
+
+        Args:
+            in_channels (int): Input channels of the first conv.
+            channels (int): Output channels of every conv.
+            num_convs (int): Number of convs; at least one is always built.
+            use_conv_module (bool): Use ConvBNReLU (without conv bias) when
+                True, otherwise a plain nn.Conv2D.
+
+        Returns:
+            nn.Layer: The single conv, or an nn.Sequential when there are
+                several.
+        """
+        if use_conv_module:
+            convs = [
+                layers.ConvBNReLU(
+                    in_channels=in_channels,
+                    out_channels=channels,
+                    kernel_size=1,
+                    bias_attr=False)
+            ]
+            for _ in range(num_convs - 1):
+                convs.append(
+                    layers.ConvBNReLU(
+                        in_channels=channels,
+                        out_channels=channels,
+                        kernel_size=1,
+                        bias_attr=False))
+        else:
+            convs = [nn.Conv2D(in_channels, channels, 1)]
+            for _ in range(num_convs - 1):
+                convs.append(nn.Conv2D(channels, channels, 1))
+
+        if len(convs) > 1:
+            convs = nn.Sequential(*convs)
+        else:
+            convs = convs[0]
+        return convs
+
+    def forward(self, query_feats, key_feats):
+        """Attend from `query_feats` to `key_feats`.
+
+        Args:
+            query_feats (Tensor): Feature the queries are projected from.
+            key_feats (Tensor): Feature the keys and values are projected from.
+
+        Returns:
+            Tensor: The context reshaped to the spatial size of
+                `query_feats`, passed through the output projection when one
+                was built.
+        """
+        query_shape = paddle.shape(query_feats)
+        query = self.query_project(query_feats)
+        if self.query_downsample is not None:
+            query = self.query_downsample(query)
+        query = query.flatten(2).transpose([0, 2, 1])
+
+        key = self.key_project(key_feats)
+        value = self.value_project(key_feats)
+
+        if self.key_downsample is not None:
+            # Keys and values share the same downsampling module.
+            key = self.key_downsample(key)
+            value = self.key_downsample(value)
+
+        key = key.flatten(2)
+        value = value.flatten(2).transpose([0, 2, 1])
+        sim_map = paddle.matmul(query, key)
+        if self.matmul_norm:
+            sim_map = (self.channels**-0.5) * sim_map
+        sim_map = F.softmax(sim_map, axis=-1)
+
+        context = paddle.matmul(sim_map, value)
+        context = paddle.transpose(context, [0, 2, 1])
+
+        # The context carries as many channels as the value projection emits:
+        # `channels` when an output projection follows, `out_channels`
+        # otherwise. Reshaping to `out_channels` unconditionally breaks the
+        # with_out=True case whenever the two widths differ.
+        value_channels = self.channels if self.with_out else self.out_channels
+        context = paddle.reshape(
+            context, [0, value_channels, query_shape[2], query_shape[3]])
+
+        if self.out_project is not None:
+            context = self.out_project(context)
+        return context
--- a/paddlers/models/ppseg/models/layers/layer_libs.py
+++ b/paddlers/models/ppseg/models/layers/layer_libs.py
@ -0,0 +1,302 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddlers.models.ppseg.models import layers
+
+
+def SyncBatchNorm(*args, **kwargs):
+    """Build a batch-norm layer suited to the current run.
+
+    nn.BatchNorm2D is returned on CPU (where nn.SyncBatchNorm has no kernel),
+    when the PADDLESEG_EXPORT_STAGE environment variable is set to a
+    non-empty value, or when only one rank is running. Otherwise
+    nn.SyncBatchNorm is returned. All arguments are forwarded unchanged.
+    """
+    # Short-circuit order matters: the parallel environment is only queried
+    # when the first two checks do not already settle the choice.
+    use_plain_bn = (paddle.get_device() == 'cpu' or
+                    os.environ.get('PADDLESEG_EXPORT_STAGE') or
+                    paddle.distributed.ParallelEnv().nranks == 1)
+    norm_cls = nn.BatchNorm2D if use_plain_bn else nn.SyncBatchNorm
+    return norm_cls(*args, **kwargs)
+
+
+class ConvBNReLU(nn.Layer):
+    """Conv2D followed by batch normalization and ReLU.
+
+    Args:
+        in_channels (int): The number of input channels.
+        out_channels (int): The number of output channels.
+        kernel_size (int|tuple): The kernel size of the convolution.
+        padding (str|int|tuple, optional): Padding of the convolution. Default: 'same'.
+        **kwargs: Forwarded to nn.Conv2D; 'data_format' is also used for the
+            norm layer.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 padding='same',
+                 **kwargs):
+        super().__init__()
+
+        self._conv = nn.Conv2D(
+            in_channels, out_channels, kernel_size, padding=padding, **kwargs)
+
+        # The norm layer follows the layout of the convolution.
+        layout = kwargs.get('data_format', 'NCHW')
+        self._batch_norm = SyncBatchNorm(out_channels, data_format=layout)
+        self._relu = layers.Activation("relu")
+
+    def forward(self, x):
+        return self._relu(self._batch_norm(self._conv(x)))
+
+
+class ConvBN(nn.Layer):
+    """Conv2D followed by batch normalization, without activation.
+
+    Args:
+        in_channels (int): The number of input channels.
+        out_channels (int): The number of output channels.
+        kernel_size (int|tuple): The kernel size of the convolution.
+        padding (str|int|tuple, optional): Padding of the convolution. Default: 'same'.
+        **kwargs: Forwarded to nn.Conv2D; 'data_format' is also used for the
+            norm layer.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 padding='same',
+                 **kwargs):
+        super().__init__()
+        self._conv = nn.Conv2D(
+            in_channels, out_channels, kernel_size, padding=padding, **kwargs)
+        # The norm layer follows the layout of the convolution.
+        layout = kwargs.get('data_format', 'NCHW')
+        self._batch_norm = SyncBatchNorm(out_channels, data_format=layout)
+
+    def forward(self, x):
+        return self._batch_norm(self._conv(x))
+
+
+class ConvReLUPool(nn.Layer):
+    """3x3 convolution, ReLU, then 2x2 max pooling with stride 2.
+
+    Args:
+        in_channels (int): The number of input channels.
+        out_channels (int): The number of output channels.
+    """
+
+    def __init__(self, in_channels, out_channels):
+        super().__init__()
+        conv_config = dict(kernel_size=3, stride=1, padding=1, dilation=1)
+        self.conv = nn.Conv2D(in_channels, out_channels, **conv_config)
+        self._relu = layers.Activation("relu")
+        self._max_pool = nn.MaxPool2D(kernel_size=2, stride=2)
+
+    def forward(self, x):
+        activated = self._relu(self.conv(x))
+        return self._max_pool(activated)
+
+
+class SeparableConvBNReLU(nn.Layer):
+    """Depthwise ConvBN followed by a pointwise (1x1) ConvBNReLU.
+
+    Args:
+        in_channels (int): The number of input channels.
+        out_channels (int): The number of output channels.
+        kernel_size (int|tuple): The kernel size of the depthwise convolution.
+        padding (str|int|tuple, optional): Padding of the depthwise convolution. Default: 'same'.
+        pointwise_bias (optional): Passed as `bias_attr` to the pointwise convolution. Default: None.
+        **kwargs: Forwarded to the depthwise ConvBN; only 'data_format' is
+            also applied to the pointwise convolution.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 padding='same',
+                 pointwise_bias=None,
+                 **kwargs):
+        super().__init__()
+        # One group per input channel makes this convolution depthwise.
+        self.depthwise_conv = ConvBN(
+            in_channels,
+            out_channels=in_channels,
+            kernel_size=kernel_size,
+            padding=padding,
+            groups=in_channels,
+            **kwargs)
+        layout = kwargs.get('data_format', 'NCHW')
+        # NOTE(review): the attribute name is misspelled in the original and is
+        # kept as-is; renaming it would presumably change the parameter names
+        # stored in checkpoints.
+        self.piontwise_conv = ConvBNReLU(
+            in_channels,
+            out_channels,
+            kernel_size=1,
+            groups=1,
+            data_format=layout,
+            bias_attr=pointwise_bias)
+
+    def forward(self, x):
+        return self.piontwise_conv(self.depthwise_conv(x))
+
+
+class DepthwiseConvBN(nn.Layer):
+    """ConvBN whose convolution uses `groups=in_channels`.
+
+    Args:
+        in_channels (int): The number of input channels.
+        out_channels (int): The number of output channels.
+        kernel_size (int|tuple): The kernel size of the convolution.
+        padding (str|int|tuple, optional): Padding of the convolution. Default: 'same'.
+        **kwargs: Forwarded to ConvBN.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 padding='same',
+                 **kwargs):
+        super().__init__()
+        conv_config = dict(
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            padding=padding,
+            groups=in_channels)
+        self.depthwise_conv = ConvBN(in_channels, **conv_config, **kwargs)
+
+    def forward(self, x):
+        return self.depthwise_conv(x)
+
+
+class AuxLayer(nn.Layer):
+    """
+    The auxiliary layer implementation for auxiliary loss.
+
+    It applies a 3x3 ConvBNReLU, dropout and a 1x1 convolution in sequence.
+
+    Args:
+        in_channels (int): The number of input channels.
+        inter_channels (int): The intermediate channels.
+        out_channels (int): The number of output channels, and usually it is num_classes.
+        dropout_prob (float, optional): The drop rate. Default: 0.1.
+        **kwargs: Forwarded to the ConvBNReLU layer.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 inter_channels,
+                 out_channels,
+                 dropout_prob=0.1,
+                 **kwargs):
+        super().__init__()
+
+        self.conv_bn_relu = ConvBNReLU(
+            in_channels=in_channels,
+            out_channels=inter_channels,
+            kernel_size=3,
+            padding=1,
+            **kwargs)
+        self.dropout = nn.Dropout(p=dropout_prob)
+        self.conv = nn.Conv2D(
+            in_channels=inter_channels,
+            out_channels=out_channels,
+            kernel_size=1)
+
+    def forward(self, x):
+        hidden = self.dropout(self.conv_bn_relu(x))
+        return self.conv(hidden)
+
+
+class JPU(nn.Layer):
+    """
+    Joint Pyramid Upsampling of FCN.
+    The original paper refers to
+        Wu, Huikai, et al. "Fastfcn: Rethinking dilated convolution in the backbone for semantic segmentation." arXiv preprint arXiv:1903.11816 (2019).
+
+    Args:
+        in_channels (list|tuple): Channel counts of the input features; the
+            last three entries are used.
+        width (int, optional): Output channels of every internal branch. Default: 512.
+    """
+
+    def __init__(self, in_channels, width=512):
+        super().__init__()
+
+        self.conv5 = ConvBNReLU(
+            in_channels[-1], width, 3, padding=1, bias_attr=False)
+        self.conv4 = ConvBNReLU(
+            in_channels[-2], width, 3, padding=1, bias_attr=False)
+        self.conv3 = ConvBNReLU(
+            in_channels[-3], width, 3, padding=1, bias_attr=False)
+
+        def make_branch(rate):
+            # Padding equal to the dilation rate keeps a 3x3 kernel
+            # size-preserving at stride 1.
+            return SeparableConvBNReLU(
+                3 * width,
+                width,
+                3,
+                padding=rate,
+                pointwise_bias=False,
+                dilation=rate,
+                bias_attr=False,
+                stride=1)
+
+        self.dilation1 = make_branch(1)
+        self.dilation2 = make_branch(2)
+        self.dilation3 = make_branch(4)
+        self.dilation4 = make_branch(8)
+
+    def forward(self, *inputs):
+        """Fuse the last three inputs into one multi-dilation feature.
+
+        Returns:
+            tuple: `(inputs[0], inputs[1], inputs[2], feat)` where `feat` is
+                the concatenation along axis 1 of the four dilation branches.
+        """
+        top = self.conv5(inputs[-1])
+        mid = self.conv4(inputs[-2])
+        low = self.conv3(inputs[-3])
+
+        # Resize the two other branches to the spatial size of the conv3 branch.
+        size = paddle.shape(low)[2:]
+        mid = F.interpolate(mid, size, mode='bilinear', align_corners=True)
+        top = F.interpolate(top, size, mode='bilinear', align_corners=True)
+
+        fused = paddle.concat([top, mid, low], axis=1)
+        branches = (self.dilation1, self.dilation2, self.dilation3,
+                    self.dilation4)
+        feat = paddle.concat([branch(fused) for branch in branches], axis=1)
+
+        return inputs[0], inputs[1], inputs[2], feat
+
+
+class ConvBNPReLU(nn.Layer):
+    """Conv2D followed by batch normalization and PReLU.
+
+    Args:
+        in_channels (int): The number of input channels.
+        out_channels (int): The number of output channels.
+        kernel_size (int|tuple): The kernel size of the convolution.
+        padding (str|int|tuple, optional): Padding of the convolution. Default: 'same'.
+        **kwargs: Forwarded to nn.Conv2D; 'data_format' is also used for the
+            norm layer.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 padding='same',
+                 **kwargs):
+        super().__init__()
+
+        self._conv = nn.Conv2D(
+            in_channels, out_channels, kernel_size, padding=padding, **kwargs)
+
+        # The norm layer follows the layout of the convolution.
+        layout = kwargs.get('data_format', 'NCHW')
+        self._batch_norm = SyncBatchNorm(out_channels, data_format=layout)
+        self._prelu = layers.Activation("prelu")
+
+    def forward(self, x):
+        return self._prelu(self._batch_norm(self._conv(x)))
+        
--- a/paddlers/models/ppseg/models/layers/nonlocal2d.py
+++ b/paddlers/models/ppseg/models/layers/nonlocal2d.py
@ -0,0 +1,154 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.models import layers
+
+
+class NonLocal2D(nn.Layer):
+    """Basic Non-local module.
+    This model is the implementation of "Non-local Neural Networks"
+    (https://arxiv.org/abs/1711.07971)
+
+    Args:
+        in_channels (int): Channels of the input feature map.
+        reduction (int): Channel reduction ratio. Default: 2.
+        use_scale (bool): Whether to scale pairwise_weight by `1/sqrt(inter_channels)` when the mode is `embedded_gaussian`. Default: True.
+        sub_sample (bool): Whether to apply 2x2 max pooling to the `g` and `phi` branches before the pairwise function. Default: False.
+        mode (str): Options are `gaussian`, `concatenation`, `embedded_gaussian` and `dot_product`. Default: embedded_gaussian.
+
+    Raises:
+        ValueError: If `mode` is not one of the supported options.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 reduction=2,
+                 use_scale=True,
+                 sub_sample=False,
+                 mode='embedded_gaussian'):
+        super(NonLocal2D, self).__init__()
+        self.in_channels = in_channels
+        self.reduction = reduction
+        self.use_scale = use_scale
+        self.sub_sample = sub_sample
+        self.mode = mode
+        if mode not in [
+                'gaussian', 'embedded_gaussian', 'dot_product', 'concatenation'
+        ]:
+            raise ValueError(
+                "Mode should be in 'gaussian', 'concatenation','embedded_gaussian' or 'dot_product'."
+            )
+
+        # Never let the reduced channel count drop to zero.
+        self.inter_channels = max(in_channels // reduction, 1)
+
+        self.g = nn.Conv2D(
+            in_channels=self.in_channels,
+            out_channels=self.inter_channels,
+            kernel_size=1)
+        self.conv_out = layers.ConvBNReLU(
+            in_channels=self.inter_channels,
+            out_channels=self.in_channels,
+            kernel_size=1,
+            bias_attr=False)
+
+        # Plain gaussian mode compares the raw input with itself, so it has no
+        # theta/phi projections.
+        if self.mode != "gaussian":
+            self.theta = nn.Conv2D(
+                in_channels=self.in_channels,
+                out_channels=self.inter_channels,
+                kernel_size=1)
+            self.phi = nn.Conv2D(
+                in_channels=self.in_channels,
+                out_channels=self.inter_channels,
+                kernel_size=1)
+
+        if self.mode == "concatenation":
+            self.concat_project = layers.ConvBNReLU(
+                in_channels=self.inter_channels * 2,
+                out_channels=1,
+                kernel_size=1,
+                bias_attr=False)
+
+        if self.sub_sample:
+            max_pool_layer = nn.MaxPool2D(kernel_size=(2, 2))
+            self.g = nn.Sequential(self.g, max_pool_layer)
+            if self.mode != 'gaussian':
+                self.phi = nn.Sequential(self.phi, max_pool_layer)
+            else:
+                # No projection in gaussian mode: phi only pools, so its output
+                # keeps all `in_channels` channels.
+                self.phi = max_pool_layer
+
+    def gaussian(self, theta_x, phi_x):
+        """Return softmax-normalized `theta_x @ phi_x` (softmax on the last axis)."""
+        pairwise_weight = paddle.matmul(theta_x, phi_x)
+        pairwise_weight = F.softmax(pairwise_weight, axis=-1)
+        return pairwise_weight
+
+    def embedded_gaussian(self, theta_x, phi_x):
+        """Like `gaussian`, optionally divided by sqrt of theta's last dim first."""
+        pairwise_weight = paddle.matmul(theta_x, phi_x)
+        if self.use_scale:
+            pairwise_weight /= theta_x.shape[-1]**0.5
+        pairwise_weight = F.softmax(pairwise_weight, -1)
+        return pairwise_weight
+
+    def dot_product(self, theta_x, phi_x):
+        """Return `theta_x @ phi_x` divided by the size of its last axis."""
+        pairwise_weight = paddle.matmul(theta_x, phi_x)
+        pairwise_weight /= pairwise_weight.shape[-1]
+        return pairwise_weight
+
+    def concatenation(self, theta_x, phi_x):
+        """Score every (theta, phi) position pair with `concat_project`.
+
+        `theta_x` is expected as [n, c, h, 1] and `phi_x` as [n, c, 1, w], as
+        produced by `forward`; both are tiled to [n, c, h, w] and concatenated
+        on the channel axis before projection to a single channel.
+        """
+        h = theta_x.shape[2]
+        w = phi_x.shape[3]
+        theta_x = paddle.tile(theta_x, [1, 1, 1, w])
+        phi_x = paddle.tile(phi_x, [1, 1, h, 1])
+
+        concat_feature = paddle.concat([theta_x, phi_x], axis=1)
+        pairwise_weight = self.concat_project(concat_feature)
+        n, _, h, w = pairwise_weight.shape
+        pairwise_weight = paddle.reshape(pairwise_weight, [n, h, w])
+        pairwise_weight /= pairwise_weight.shape[-1]
+        return pairwise_weight
+
+    def forward(self, x):
+        """Apply the non-local block and add the result to the input.
+
+        Args:
+            x (Tensor): Input of shape [n, in_channels, h, w].
+
+        Returns:
+            Tensor: `x + conv_out(y)`, same shape as `x`.
+        """
+        n, _, h, w = x.shape
+        g_x = paddle.reshape(self.g(x), [n, self.inter_channels, -1])
+        g_x = paddle.transpose(g_x, [0, 2, 1])
+
+        if self.mode == 'gaussian':
+            # theta is the raw input and phi is the raw (optionally pooled)
+            # input, so both carry `in_channels` channels. Reshaping them with
+            # `inter_channels` would mix channel and spatial axes and break
+            # the matmul whenever reduction != 1.
+            theta_x = paddle.reshape(x, [n, self.in_channels, -1])
+            theta_x = paddle.transpose(theta_x, [0, 2, 1])
+            phi_src = self.phi(x) if self.sub_sample else x
+            phi_x = paddle.reshape(phi_src, [n, self.in_channels, -1])
+
+        elif self.mode == 'concatenation':
+            theta_x = paddle.reshape(
+                self.theta(x), [n, self.inter_channels, -1, 1])
+            phi_x = paddle.reshape(self.phi(x), [n, self.inter_channels, 1, -1])
+
+        else:
+            theta_x = paddle.reshape(
+                self.theta(x), [n, self.inter_channels, -1])
+            theta_x = paddle.transpose(theta_x, [0, 2, 1])
+            phi_x = paddle.reshape(self.phi(x), [n, self.inter_channels, -1])
+
+        # Each mode name is also the name of the method implementing it.
+        pairwise_func = getattr(self, self.mode)
+        pairwise_weight = pairwise_func(theta_x, phi_x)
+        y = paddle.matmul(pairwise_weight, g_x)
+        y = paddle.transpose(y, [0, 2, 1])
+        y = paddle.reshape(y, [n, self.inter_channels, h, w])
+
+        output = x + self.conv_out(y)
+
+        return output
--- a/paddlers/models/ppseg/models/layers/pyramid_pool.py
+++ b/paddlers/models/ppseg/models/layers/pyramid_pool.py
@ -0,0 +1,192 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn.functional as F
+from paddle import nn
+
+from paddlers.models.ppseg.models import layers
+
+
+class ASPPModule(nn.Layer):
+    """
+    Atrous Spatial Pyramid Pooling.
+
+    Args:
+        aspp_ratios (tuple): The dilation rates used in the ASPP module.
+        in_channels (int): The number of input channels.
+        out_channels (int): The number of output channels.
+        align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
+            is even, e.g. 1024x512, otherwise it is True, e.g. 769x769.
+        use_sep_conv (bool, optional): If using separable conv in ASPP module. Default: False.
+        image_pooling (bool, optional): If augmented with image-level features. Default: False
+        data_format (str, optional): The tensor layout, 'NCHW' or any other value (treated as channels-last).
+            Default: 'NCHW'.
+    """
+
+    def __init__(self,
+                 aspp_ratios,
+                 in_channels,
+                 out_channels,
+                 align_corners,
+                 use_sep_conv=False,
+                 image_pooling=False,
+                 data_format='NCHW'):
+        super().__init__()
+
+        self.align_corners = align_corners
+        self.data_format = data_format
+        self.aspp_blocks = nn.LayerList()
+
+        for ratio in aspp_ratios:
+            # A ratio of 1 is a plain 1x1 conv; any other ratio is a 3x3
+            # dilated conv padded so that the spatial size is preserved.
+            pointwise = ratio == 1
+            block_cls = (layers.SeparableConvBNReLU
+                         if use_sep_conv and ratio > 1 else layers.ConvBNReLU)
+            self.aspp_blocks.append(
+                block_cls(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    kernel_size=1 if pointwise else 3,
+                    dilation=ratio,
+                    padding=0 if pointwise else ratio,
+                    data_format=data_format))
+
+        num_branches = len(self.aspp_blocks)
+
+        if image_pooling:
+            pool = nn.AdaptiveAvgPool2D(
+                output_size=(1, 1), data_format=data_format)
+            project = layers.ConvBNReLU(
+                in_channels,
+                out_channels,
+                kernel_size=1,
+                bias_attr=False,
+                data_format=data_format)
+            self.global_avg_pool = nn.Sequential(pool, project)
+            num_branches += 1
+        self.image_pooling = image_pooling
+
+        # Fuses the concatenated branch outputs back to `out_channels`.
+        self.conv_bn_relu = layers.ConvBNReLU(
+            in_channels=out_channels * num_branches,
+            out_channels=out_channels,
+            kernel_size=1,
+            data_format=data_format)
+
+        self.dropout = nn.Dropout(p=0.1)  # drop rate
+
+    def forward(self, x):
+        """Run every ASPP branch on `x`, concatenate, fuse and apply dropout."""
+        channels_first = self.data_format == 'NCHW'
+        in_shape = paddle.shape(x)
+        interpolate_shape = in_shape[2:] if channels_first else in_shape[1:3]
+        axis = 1 if channels_first else -1
+
+        outputs = [block(x) for block in self.aspp_blocks]
+
+        if self.image_pooling:
+            # Broadcast the 1x1 image-level feature back to the input size.
+            img_avg = F.interpolate(
+                self.global_avg_pool(x),
+                interpolate_shape,
+                mode='bilinear',
+                align_corners=self.align_corners,
+                data_format=self.data_format)
+            outputs.append(img_avg)
+
+        fused = self.conv_bn_relu(paddle.concat(outputs, axis=axis))
+        return self.dropout(fused)
+
+
+class PPModule(nn.Layer):
+    """
+    Pyramid pooling module originally in PSPNet.
+
+    Args:
+        in_channels (int): The number of input channels to pyramid pooling module.
+        out_channels (int): The number of output channels after pyramid pooling module.
+        bin_sizes (tuple): The out size of pooled feature maps, e.g. (1, 2, 3, 6).
+        dim_reduction (bool): Whether to reduce each pooled branch to
+            `in_channels // len(bin_sizes)` channels.
+        align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
+            is even, e.g. 1024x512, otherwise it is True, e.g. 769x769.
+    """
+
+    def __init__(self, in_channels, out_channels, bin_sizes, dim_reduction,
+                 align_corners):
+        super().__init__()
+
+        self.bin_sizes = bin_sizes
+
+        num_bins = len(bin_sizes)
+        inter_channels = (in_channels // num_bins
+                          if dim_reduction else in_channels)
+
+        # we use dimension reduction after pooling mentioned in original implementation.
+        self.stages = nn.LayerList([
+            self._make_stage(in_channels, inter_channels, bin_size)
+            for bin_size in bin_sizes
+        ])
+
+        # Input is the original feature map plus one branch per bin size.
+        self.conv_bn_relu2 = layers.ConvBNReLU(
+            in_channels=in_channels + inter_channels * num_bins,
+            out_channels=out_channels,
+            kernel_size=3,
+            padding=1)
+
+        self.align_corners = align_corners
+
+    def _make_stage(self, in_channels, out_channels, size):
+        """
+        Create one pooling stage.
+
+        In our implementation, we adopt the same dimension reduction as the original paper that might be
+        slightly different with other implementations.
+
+        After pooling, the channels are reduced to 1/len(bin_sizes) immediately, while some other implementations
+        keep the channels to be same.
+
+        Args:
+            in_channels (int): The number of input channels to pyramid pooling module.
+            out_channels (int): The number of output channels of the 1x1 conv in this stage.
+            size (int): The out size of the pooled layer.
+
+        Returns:
+            nn.Sequential: Adaptive average pooling to (size, size) followed by a 1x1 ConvBNReLU.
+        """
+
+        return nn.Sequential(
+            nn.AdaptiveAvgPool2D(output_size=(size, size)),
+            layers.ConvBNReLU(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=1))
+
+    def forward(self, input):
+        """Pool `input` at every bin size, upsample, concatenate and fuse."""
+        upsampled = [
+            F.interpolate(
+                stage(input),
+                paddle.shape(input)[2:],
+                mode='bilinear',
+                align_corners=self.align_corners) for stage in self.stages
+        ]
+        # The input comes first, followed by the branches in reverse order.
+        cat = paddle.concat([input] + upsampled[::-1], axis=1)
+        return self.conv_bn_relu2(cat)
--- a/paddlers/models/ppseg/models/layers/wrap_functions.py
+++ b/paddlers/models/ppseg/models/layers/wrap_functions.py
@ -0,0 +1,83 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+"""
+Wrap the function API so that normal and quantization training can use the same network.
+"""
+
+
+class Add(nn.Layer):
+    """Layer wrapper around ``paddle.add``."""
+
+    def forward(self, x, y, name=None):
+        """Return ``paddle.add(x, y)``."""
+        return paddle.add(x, y, name=name)
+
+
+class Subtract(nn.Layer):
+    """Layer wrapper around ``paddle.subtract``."""
+
+    def forward(self, x, y, name=None):
+        """Return ``paddle.subtract(x, y)``."""
+        return paddle.subtract(x, y, name=name)
+
+
+class Multiply(nn.Layer):
+    """Layer wrapper around ``paddle.multiply``."""
+
+    def forward(self, x, y, name=None):
+        """Return ``paddle.multiply(x, y)``."""
+        return paddle.multiply(x, y, name=name)
+
+
+class Divide(nn.Layer):
+    """Layer wrapper around ``paddle.divide``."""
+
+    def forward(self, x, y, name=None):
+        """Return ``paddle.divide(x, y)``."""
+        return paddle.divide(x, y, name=name)
+
+
+class Reshape(nn.Layer):
+    """Layer wrapper around ``paddle.reshape``."""
+
+    def forward(self, x, shape, name=None):
+        """Return ``x`` reshaped to ``shape``."""
+        return paddle.reshape(x, shape, name=name)
+
+
+class Transpose(nn.Layer):
+    """Layer wrapper around ``paddle.transpose``."""
+
+    def forward(self, x, perm, name=None):
+        """Return ``x`` with its axes permuted according to ``perm``."""
+        return paddle.transpose(x, perm, name=name)
+
+
+class Concat(nn.Layer):
+    """Layer wrapper around ``paddle.concat``."""
+
+    def forward(self, x, axis=0, name=None):
+        """Return the tensors in ``x`` concatenated along ``axis``."""
+        return paddle.concat(x, axis=axis, name=name)
+
+
+class Flatten(nn.Layer):
+    """Layer wrapper around ``paddle.flatten``."""
+
+    def forward(self, x, start_axis=0, stop_axis=-1, name=None):
+        """Return ``x`` flattened from ``start_axis`` through ``stop_axis``."""
+        return paddle.flatten(
+            x, start_axis=start_axis, stop_axis=stop_axis, name=name)
--- a/paddlers/models/ppseg/models/losses/init.py
+++ b/paddlers/models/ppseg/models/losses/init.py
@ -0,0 +1,36 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .mixed_loss import MixedLoss
+from .cross_entropy_loss import CrossEntropyLoss
+from .cross_entropy_loss import DistillCrossEntropyLoss
+from .binary_cross_entropy_loss import BCELoss
+from .lovasz_loss import LovaszSoftmaxLoss, LovaszHingeLoss
+from .gscnn_dual_task_loss import DualTaskLoss
+from .edge_attention_loss import EdgeAttentionLoss
+from .bootstrapped_cross_entropy import BootstrappedCrossEntropyLoss
+from .dice_loss import DiceLoss
+from .ohem_cross_entropy_loss import OhemCrossEntropyLoss
+from .decoupledsegnet_relax_boundary_loss import RelaxBoundaryLoss
+from .ohem_edge_attention_loss import OhemEdgeAttentionLoss
+from .l1_loss import L1Loss
+from .mean_square_error_loss import MSELoss
+from .focal_loss import FocalLoss
+from .kl_loss import KLLoss
+from .rmi_loss import RMILoss
+from .detail_aggregate_loss import DetailAggregateLoss
+from .point_cross_entropy_loss import PointCrossEntropyLoss
+from .pixel_contrast_cross_entropy_loss import PixelContrastCrossEntropyLoss
+from .semantic_encode_cross_entropy_loss import SECrossEntropyLoss
+from .semantic_connectivity_loss import SemanticConnectivityLoss
--- a/paddlers/models/ppseg/models/losses/binary_cross_entropy_loss.py
+++ b/paddlers/models/ppseg/models/losses/binary_cross_entropy_loss.py
@ -0,0 +1,174 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+
+
+@manager.LOSSES.add_component
+class BCELoss(nn.Layer):
+    r"""
+    This operator combines the sigmoid layer and the :ref:`api_nn_loss_BCELoss` layer.
+    Also, we can see it as the combine of ``sigmoid_cross_entropy_with_logits``
+    layer and some reduce operations.
+    This measures the element-wise probability error in classification tasks
+    in which each class is independent.
+    This can be thought of as predicting labels for a data-point, where labels
+    are not mutually exclusive. For example, a news article can be about
+    politics, technology or sports at the same time or none of these.
+    First this operator calculate loss function as follows:
+    .. math::
+           Out = -Labels * \log(\sigma(Logit)) - (1 - Labels) * \log(1 - \sigma(Logit))
+    We know that :math:`\sigma(Logit) = \frac{1}{1 + e^{-Logit}}`. By substituting this we get:
+    .. math::
+           Out = Logit - Logit * Labels + \log(1 + e^{-Logit})
+    For stability and to prevent overflow of :math:`e^{-Logit}` when Logit < 0,
+    we reformulate the loss as follows:
+    .. math::
+           Out = \max(Logit, 0) - Logit * Labels + \log(1 + e^{-|Logit|})
+    Then, if ``weight`` or ``pos_weight`` is not None, this operator multiply the
+    weight tensor on the loss `Out`. The ``weight`` tensor will attach different
+    weight on every items in the batch. The ``pos_weight`` will attach different
+    weight on the positive label of each class.
+    Finally, the element-wise loss is masked by ``label != ignore_index`` and
+    reduced to ``mean(loss * mask) / (mean(mask) + EPS)``.
+    Note that the target labels ``label`` should be numbers between 0 and 1.
+    Args:
+        weight (Tensor | str, optional): A manual rescaling weight given to the loss of each
+            batch element. If given, it has to be a 1D Tensor whose size is `[N, ]`,
+            The data type is float32, float64. If type is str, it should equal to 'dynamic'.
+            It will compute weight dynamically in every step.
+            Default is ``None``.
+        pos_weight (float|str, optional): A weight of positive examples. If type is str,
+            it should equal to 'dynamic'. It will compute weight dynamically in every step.
+            Default is ``None``.
+        ignore_index (int64, optional): Specifies a target value that is ignored
+            and does not contribute to the input gradient. Default ``255``.
+        edge_label (bool, optional): Whether to use edge label. Default: False
+    Shapes:
+        logit (Tensor): The input predications tensor. 2-D tensor with shape: [N, *],
+            N is batch_size, `*` means number of additional dimensions. The ``logit``
+            is usually the output of Linear layer. Available dtype is float32, float64.
+        label (Tensor): The target labels tensor. 2-D tensor with the same shape as
+            ``logit``. The target labels which values should be numbers between 0 and 1.
+            Available dtype is float32, float64.
+    Returns:
+        A callable object of BCEWithLogitsLoss.
+    Raises:
+        ValueError: If ``weight`` or ``pos_weight`` is a str other than 'dynamic'.
+        TypeError: If ``weight`` is neither a Tensor nor a str, or ``pos_weight``
+            is neither a float nor a str.
+    Examples:
+        .. code-block:: python
+            import paddle
+            paddle.disable_static()
+            logit = paddle.to_tensor([5.0, 1.0, 3.0], dtype="float32")
+            label = paddle.to_tensor([1.0, 0.0, 1.0], dtype="float32")
+            bce_logit_loss = paddle.nn.BCEWithLogitsLoss()
+            output = bce_logit_loss(logit, label)
+            print(output.numpy())  # [0.45618808]
+    """
+
+    def __init__(self,
+                 weight=None,
+                 pos_weight=None,
+                 ignore_index=255,
+                 edge_label=False):
+        super().__init__()
+        self.weight = weight
+        self.pos_weight = pos_weight
+        self.ignore_index = ignore_index
+        self.edge_label = edge_label
+        self.EPS = 1e-10
+
+        if self.weight is not None:
+            if isinstance(self.weight, str):
+                if self.weight != 'dynamic':
+                    raise ValueError(
+                        "if type of `weight` is str, it should equal to 'dynamic', but it is {}"
+                        .format(self.weight))
+            # Reject everything that is NOT a Tensor; the previous check was
+            # inverted and raised for valid Tensor weights instead.
+            elif not isinstance(self.weight, paddle.Tensor):
+                raise TypeError(
+                    'The type of `weight` is wrong, it should be Tensor or str, but it is {}'
+                    .format(type(self.weight)))
+
+        if self.pos_weight is not None:
+            if isinstance(self.pos_weight, str):
+                if self.pos_weight != 'dynamic':
+                    raise ValueError(
+                        "if type of `pos_weight` is str, it should equal to 'dynamic', but it is {}"
+                        .format(self.pos_weight))
+            elif isinstance(self.pos_weight, float):
+                self.pos_weight = paddle.to_tensor(
+                    self.pos_weight, dtype='float32')
+            else:
+                raise TypeError(
+                    'The type of `pos_weight` is wrong, it should be float or str, but it is {}'
+                    .format(type(self.pos_weight)))
+
+    @staticmethod
+    def _count_pos_neg(label):
+        """Return the number of elements equal to 1 and equal to 0 in `label`."""
+        pos_num = paddle.sum((label == 1).astype('float32'))
+        neg_num = paddle.sum((label == 0).astype('float32'))
+        return pos_num, neg_num
+
+    def forward(self, logit, label):
+        """
+        Forward computation.
+
+        Args:
+            logit (Tensor): Logit tensor, the data type is float32, float64. Shape is
+                (N, C), where C is number of classes, and if shape is more than 2D, this
+                is (N, C, D1, D2,..., Dk), k >= 1.
+            label (Tensor): Label tensor, the data type is int64. Shape is (N, C), where each
+                value is 0 or 1, and if shape is more than 2D, this is
+                (N, C, D1, D2,..., Dk), k >= 1.
+
+        Returns:
+            Tensor: The masked loss, ``mean(loss * mask) / (mean(mask) + EPS)``.
+        """
+        if len(label.shape) != len(logit.shape):
+            label = paddle.unsqueeze(label, 1)
+        # The mask is taken before one-hot encoding so that it still refers to
+        # the raw label values.
+        mask = (label != self.ignore_index)
+        mask = paddle.cast(mask, 'float32')
+        # label.shape should equal to the logit.shape
+        if label.shape[1] != logit.shape[1]:
+            label = label.squeeze(1)
+            label = F.one_hot(label, logit.shape[1])
+            label = label.transpose((0, 3, 1, 2))
+        if isinstance(self.weight, str):
+            # 'dynamic': weight each class by the frequency of the other one.
+            pos_num, neg_num = self._count_pos_neg(label)
+            sum_num = pos_num + neg_num
+            weight_pos = 2 * neg_num / (sum_num + self.EPS)
+            weight_neg = 2 * pos_num / (sum_num + self.EPS)
+            weight = weight_pos * label + weight_neg * (1 - label)
+        else:
+            weight = self.weight
+        if isinstance(self.pos_weight, str):
+            pos_num, neg_num = self._count_pos_neg(label)
+            sum_num = pos_num + neg_num
+            pos_weight = 2 * neg_num / (sum_num + self.EPS)
+        else:
+            pos_weight = self.pos_weight
+        label = label.astype('float32')
+        loss = paddle.nn.functional.binary_cross_entropy_with_logits(
+            logit,
+            label,
+            weight=weight,
+            reduction='none',
+            pos_weight=pos_weight)
+        loss = loss * mask
+        # Normalize by the fraction of non-ignored elements.
+        loss = paddle.mean(loss) / (paddle.mean(mask) + self.EPS)
+        label.stop_gradient = True
+        mask.stop_gradient = True
+
+        return loss
--- a/paddlers/models/ppseg/models/losses/bootstrapped_cross_entropy.py
+++ b/paddlers/models/ppseg/models/losses/bootstrapped_cross_entropy.py
@ -0,0 +1,73 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddle import nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+
+
+@manager.LOSSES.add_component
+class BootstrappedCrossEntropyLoss(nn.Layer):
+    """
+    Implements the bootstrapped cross entropy loss function.
+
+    For every sample the per-pixel cross entropy losses are sorted in
+    descending order. If the loss at index ``min_K`` exceeds ``loss_th``, all
+    losses above the threshold are kept; otherwise the first ``min_K`` losses
+    are kept. The kept losses are averaged per sample and then over the batch.
+
+    Args:
+        min_K (int): the minimum number of pixels to be counted in loss computation.
+        loss_th (float): the loss threshold. Only loss that is larger than the threshold
+            would be calculated.
+        weight (tuple|list, optional): The weight for different classes. Default: None.
+        ignore_index (int, optional): Specifies a target value that is ignored
+            and does not contribute to the input gradient. Default: 255.
+    """
+
+    def __init__(self, min_K, loss_th, weight=None, ignore_index=255):
+        super().__init__()
+        self.ignore_index = ignore_index
+        self.K = min_K
+        self.threshold = loss_th
+        self.weight = (None if weight is None else paddle.to_tensor(
+            weight, dtype='float32'))
+
+    def _sample_loss(self, sample_logit, sample_label, num_classes):
+        """Return the kept per-pixel losses of one sample, sorted descending."""
+        # Move channels last and flatten so that every row is one pixel.
+        pixel_logits = paddle.reshape(
+            paddle.transpose(
+                paddle.unsqueeze(sample_logit, 0), (0, 2, 3, 1)),
+            shape=(-1, num_classes))
+        pixel_labels = paddle.reshape(
+            paddle.transpose(
+                paddle.unsqueeze(sample_label, 0), (0, 2, 3, 1)),
+            shape=(-1, ))
+        pixel_loss = F.cross_entropy(
+            pixel_logits,
+            pixel_labels,
+            weight=self.weight,
+            ignore_index=self.ignore_index,
+            reduction="none")
+        sorted_loss = paddle.sort(pixel_loss, descending=True)
+        if sorted_loss[self.K] > self.threshold:
+            hard_indices = paddle.nonzero(sorted_loss > self.threshold)
+            return paddle.gather(sorted_loss, hard_indices)
+        return sorted_loss[:self.K]
+
+    def forward(self, logit, label):
+        """Return the batch mean of the per-sample bootstrapped losses.
+
+        Args:
+            logit (Tensor): 4-D logits, unpacked as (n, c, h, w).
+            label (Tensor): Labels; a channel axis is inserted at position 1
+                when its rank differs from that of ``logit``.
+        """
+        batch_size, num_classes, _, _ = logit.shape
+        if len(label.shape) != len(logit.shape):
+            label = paddle.unsqueeze(label, 1)
+
+        total_loss = 0.0
+        for idx in range(batch_size):
+            kept = self._sample_loss(logit[idx], label[idx], num_classes)
+            total_loss += paddle.mean(kept)
+        return total_loss / float(batch_size)
--- a/paddlers/models/ppseg/models/losses/cross_entropy_loss.py
+++ b/paddlers/models/ppseg/models/losses/cross_entropy_loss.py
@ -0,0 +1,218 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddle import nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+
+
+@manager.LOSSES.add_component
+class CrossEntropyLoss(nn.Layer):
+    """
+    Implements the cross entropy loss function.
+
+    Args:
+        weight (tuple|list|ndarray|Tensor, optional): A manual rescaling weight
+            given to each class. Its length must be equal to the number of classes.
+            Default ``None``.
+        ignore_index (int64, optional): Specifies a target value that is ignored
+            and does not contribute to the input gradient. Default ``255``.
+        top_k_percent_pixels (float, optional): the value lies in [0.0, 1.0].
+            When its value < 1.0, only compute the loss for the top k percent pixels
+            (e.g., the top 20% pixels). This is useful for hard pixel mining. Default ``1.0``.
+        data_format (str, optional): The tensor format to use, 'NCHW' or 'NHWC'. Default ``'NCHW'``.
+    """
+
+    def __init__(self,
+                 weight=None,
+                 ignore_index=255,
+                 top_k_percent_pixels=1.0,
+                 data_format='NCHW'):
+        super(CrossEntropyLoss, self).__init__()
+        self.ignore_index = ignore_index
+        self.top_k_percent_pixels = top_k_percent_pixels
+        self.EPS = 1e-8
+        self.data_format = data_format
+        if weight is not None:
+            self.weight = paddle.to_tensor(weight, dtype='float32')
+        else:
+            self.weight = None
+
+    def forward(self, logit, label, semantic_weights=None):
+        """
+        Forward computation.
+
+        Args:
+            logit (Tensor): Logit tensor, the data type is float32, float64. Shape is
+                (N, C), where C is number of classes, and if shape is more than 2D, this
+                is (N, C, D1, D2,..., Dk), k >= 1.
+            label (Tensor): Label tensor, the data type is int64. Shape is (N), where each
+                value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is
+                (N, D1, D2,..., Dk), k >= 1.
+            semantic_weights (Tensor, optional): Weights about loss for each pixels,
+                shape is the same as label. Default: None.
+        Returns:
+            (Tensor): The average loss.
+        """
+        channel_axis = 1 if self.data_format == 'NCHW' else -1
+        if self.weight is not None and logit.shape[channel_axis] != len(
+                self.weight):
+            raise ValueError(
+                'The number of weights = {} must be the same as the number of classes = {}.'
+                .format(len(self.weight), logit.shape[channel_axis]))
+
+        if channel_axis == 1:
+            logit = paddle.transpose(logit, [0, 2, 3, 1])
+        label = label.astype('int64')
+
+        # In F.cross_entropy, the ignore_index is invalid, which needs to be fixed.
+        # When there is 255 in the label and paddle version <= 2.1.3, the cross_entropy OP will report an error, which is fixed in paddle develop version.
+        loss = F.cross_entropy(
+            logit,
+            label,
+            ignore_index=self.ignore_index,
+            reduction='none',
+            weight=self.weight)
+
+        return self._post_process_loss(logit, label, semantic_weights, loss)
+
+    def _post_process_loss(self, logit, label, semantic_weights, loss):
+        """
+        Consider mask and top_k to calculate the final loss.
+
+        Args:
+            logit (Tensor): Logit tensor, the data type is float32, float64. Shape is
+                (N, C), where C is number of classes, and if shape is more than 2D, this
+                is (N, C, D1, D2,..., Dk), k >= 1.
+            label (Tensor): Label tensor, the data type is int64. Shape is (N), where each
+                value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is
+                (N, D1, D2,..., Dk), k >= 1.
+            semantic_weights (Tensor, optional): Weights about loss for each pixels,
+                shape is the same as label.
+            loss (Tensor): Loss tensor which is the output of cross_entropy. If soft_label
+                is False in cross_entropy, the shape of loss should be the same as the label.
+                If soft_label is True in cross_entropy, the shape of loss should be
+                (N, D1, D2,..., Dk, 1).
+        Returns:
+            (Tensor): The average loss.
+        """
+        mask = label != self.ignore_index
+        mask = paddle.cast(mask, 'float32')
+        label.stop_gradient = True
+        mask.stop_gradient = True
+
+        if loss.ndim > mask.ndim:
+            loss = paddle.squeeze(loss, axis=-1)
+        loss = loss * mask
+        if semantic_weights is not None:
+            loss = loss * semantic_weights
+
+        if self.weight is not None:
+            _one_hot = F.one_hot(label, logit.shape[-1])
+            coef = paddle.sum(_one_hot * self.weight, axis=-1)
+        else:
+            coef = paddle.ones_like(label)
+
+        if self.top_k_percent_pixels == 1.0:
+            avg_loss = paddle.mean(loss) / (paddle.mean(mask * coef) + self.EPS)
+        else:
+            loss = loss.reshape((-1, ))
+            top_k_pixels = int(self.top_k_percent_pixels * loss.numel())
+            loss, indices = paddle.topk(loss, top_k_pixels)
+            coef = coef.reshape((-1, ))
+            coef = paddle.gather(coef, indices)
+            coef.stop_gradient = True
+            coef = coef.astype('float32')
+            avg_loss = loss.mean() / (paddle.mean(coef) + self.EPS)
+
+        return avg_loss
+
+
+@manager.LOSSES.add_component
+class DistillCrossEntropyLoss(CrossEntropyLoss):
+    """
+    The implementation of distill cross entropy loss.
+
+    Args:
+        weight (tuple|list|ndarray|Tensor, optional): A manual rescaling weight
+            given to each class. Its length must be equal to the number of classes.
+            Default ``None``.
+        ignore_index (int64, optional): Specifies a target value that is ignored
+            and does not contribute to the input gradient. Default ``255``.
+        top_k_percent_pixels (float, optional): the value lies in [0.0, 1.0].
+            When its value < 1.0, only compute the loss for the top k percent pixels
+            (e.g., the top 20% pixels). This is useful for hard pixel mining.
+            Default ``1.0``.
+        data_format (str, optional): The tensor format to use, 'NCHW' or 'NHWC'.
+            Default ``'NCHW'``.
+    """
+
+    def __init__(self,
+                 weight=None,
+                 ignore_index=255,
+                 top_k_percent_pixels=1.0,
+                 data_format='NCHW'):
+        super().__init__(weight, ignore_index, top_k_percent_pixels,
+                         data_format)
+
+    def forward(self,
+                student_logit,
+                teacher_logit,
+                label,
+                semantic_weights=None):
+        """
+        Forward computation.
+
+        Args:
+            student_logit (Tensor): Logit tensor, the data type is float32, float64. Shape is
+                (N, C), where C is number of classes, and if shape is more than 2D, this
+                is (N, C, D1, D2,..., Dk), k >= 1.
+            teacher_logit (Tensor): Logit tensor, the data type is float32, float64. The shape
+                is the same as the student_logit.
+            label (Tensor): Label tensor, the data type is int64. Shape is (N), where each
+                value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is
+                (N, D1, D2,..., Dk), k >= 1.
+            semantic_weights (Tensor, optional): Weights about loss for each pixels,
+                shape is the same as label. Default: None.
+        """
+
+        if student_logit.shape != teacher_logit.shape:
+            raise ValueError(
+                'The shape of student_logit = {} must be the same as the shape of teacher_logit = {}.'
+                .format(student_logit.shape, teacher_logit.shape))
+
+        channel_axis = 1 if self.data_format == 'NCHW' else -1
+        if self.weight is not None and student_logit.shape[channel_axis] != len(
+                self.weight):
+            raise ValueError(
+                'The number of weights = {} must be the same as the number of classes = {}.'
+                .format(len(self.weight), student_logit.shape[channel_axis]))
+
+        if channel_axis == 1:
+            student_logit = paddle.transpose(student_logit, [0, 2, 3, 1])
+            teacher_logit = paddle.transpose(teacher_logit, [0, 2, 3, 1])
+
+        teacher_logit = F.softmax(teacher_logit)
+
+        loss = F.cross_entropy(
+            student_logit,
+            teacher_logit,
+            weight=self.weight,
+            reduction='none',
+            soft_label=True)
+
+        return self._post_process_loss(student_logit, label, semantic_weights,
+                                       loss)
--- a/paddlers/models/ppseg/models/losses/decoupledsegnet_relax_boundary_loss.py
+++ b/paddlers/models/ppseg/models/losses/decoupledsegnet_relax_boundary_loss.py
@ -0,0 +1,129 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle
+from paddle import nn
+import paddle.nn.functional as F
+from scipy.ndimage.interpolation import shift
+
+from paddlers.models.ppseg.cvlibs import manager
+
+
+@manager.LOSSES.add_component
+class RelaxBoundaryLoss(nn.Layer):
+    """
+    Implements the ohem cross entropy loss function.
+
+    Args:
+        border (int, optional): The value of border to relax. Default: 1.
+        calculate_weights (bool, optional): Whether to calculate weights for every classes. Default: False.
+        upper_bound (float, optional): The upper bound of weights if calculating weights for every classes. Default: 1.0.
+        ignore_index (int64): Specifies a target value that is ignored
+            and does not contribute to the input gradient. Default: 255.
+    """
+
+    def __init__(self,
+                 border=1,
+                 calculate_weights=False,
+                 upper_bound=1.0,
+                 ignore_index=255):
+        super(RelaxBoundaryLoss, self).__init__()
+        self.border = border
+        self.calculate_weights = calculate_weights
+        self.upper_bound = upper_bound
+        self.ignore_index = ignore_index
+        self.EPS = 1e-5
+
+    def relax_onehot(self, label, num_classes):
+        # pad label, and let ignore_index as num_classes
+        if len(label.shape) == 3:
+            label = label.unsqueeze(1)
+        h, w = label.shape[-2], label.shape[-1]
+        label = F.pad(label, [self.border] * 4, value=num_classes)
+        label = label.squeeze(1)
+        ignore_mask = (label == self.ignore_index).astype('int64')
+        label = label * (1 - ignore_mask) + num_classes * ignore_mask
+
+        onehot = 0
+        for i in range(-self.border, self.border + 1):
+            for j in range(-self.border, self.border + 1):
+                h_start, h_end = 1 + i, h + 1 + i
+                w_start, w_end = 1 + j, w + 1 + j
+                label_ = label[:, h_start:h_end, w_start:w_end]
+                onehot_ = F.one_hot(label_, num_classes + 1)
+                onehot += onehot_
+        onehot = (onehot > 0).astype('int64')
+        onehot = paddle.transpose(onehot, (0, 3, 1, 2))
+
+        return onehot
+
+    def calculate_weights(self, label):
+        hist = paddle.sum(label, axis=(1, 2)) * 1.0 / label.sum()
+        hist = ((hist != 0) * self.upper_bound * (1 - hist)) + 1
+
+    def custom_nll(self,
+                   logit,
+                   label,
+                   class_weights=None,
+                   border_weights=None,
+                   ignore_mask=None):
+        soft = F.softmax(logit, axis=1)
+        # calculate the valid soft where label is 1.
+        soft_label = ((soft * label[:, :-1, :, :]).sum(
+            1, keepdim=True)) * (label[:, :-1, :, :].astype('float32'))
+        soft = soft * (1 - label[:, :-1, :, :]) + soft_label
+        logsoft = paddle.log(soft)
+        if class_weights is not None:
+            logsoft = class_weights.unsqueeze((0, 2, 3))
+        logsoft = label[:, :-1, :, :] * logsoft
+        logsoft = logsoft.sum(1)
+        # border loss is divided equally
+        logsoft = -1 / border_weights * logsoft * (1. - ignore_mask)
+        n, _, h, w = label.shape
+        logsoft = logsoft.sum() / (n * h * w - ignore_mask.sum() + 1)
+        return logsoft
+
+    def forward(self, logit, label):
+        """
+        Forward computation.
+
+        Args:
+            logit (Tensor): Logit tensor, the data type is float32, float64. Shape is
+                (N, C), where C is number of classes, and if shape is more than 2D, this
+                is (N, C, D1, D2,..., Dk), k >= 1.
+            label (Tensor): Label tensor, the data type is int64. Shape is (N), where each
+                value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is
+                (N, D1, D2,..., Dk), k >= 1.
+        """
+        n, c, h, w = logit.shape
+        label.stop_gradient = True
+        label = self.relax_onehot(label, c)
+        weights = label[:, :-1, :, :].sum(1).astype('float32')
+        ignore_mask = (weights == 0).astype('float32')
+        # border is greater than 1, other is 1
+        border_weights = weights + ignore_mask
+
+        loss = 0
+        class_weights = None
+        for i in range(n):
+            if self.calculate_weights:
+                class_weights = self.calculate_weights(label[i])
+            loss = loss + self.custom_nll(
+                logit[i].unsqueeze(0),
+                label[i].unsqueeze(0),
+                class_weights=class_weights,
+                border_weights=border_weights,
+                ignore_mask=ignore_mask[i])
+        return loss
--- a/paddlers/models/ppseg/models/losses/detail_aggregate_loss.py
+++ b/paddlers/models/ppseg/models/losses/detail_aggregate_loss.py
@ -0,0 +1,116 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+
+
+@manager.LOSSES.add_component
+class DetailAggregateLoss(nn.Layer):
+    """
+    DetailAggregateLoss's implementation based on PaddlePaddle.
+
+    The original article refers to Meituan
+    Fan, Mingyuan, et al. "Rethinking BiSeNet For Real-time Semantic Segmentation."
+    (https://arxiv.org/abs/2104.13188)
+
+    Args:
+        ignore_index (int64, optional): Specifies a target value that is ignored
+            and does not contribute to the input gradient. Default ``255``.
+
+    """
+
+    def __init__(self, ignore_index=255):
+        super(DetailAggregateLoss, self).__init__()
+        self.ignore_index = ignore_index
+        self.laplacian_kernel = paddle.to_tensor([-1, -1, -1, -1, 8, -1, -1, -1, -1], dtype='float32').reshape(
+            (1, 1, 3, 3))
+        self.fuse_kernel = paddle.create_parameter([1, 3, 1, 1], dtype='float32')
+
+    def forward(self, logits, label):
+        """
+        Args:
+            logits (Tensor): Logit tensor, the data type is float32, float64. Shape is
+                (N, C), where C is number of classes, and if shape is more than 2D, this
+                is (N, C, D1, D2,..., Dk), k >= 1.
+            label (Tensor): Label tensor, the data type is int64. Shape is (N), where each
+                value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is
+                (N, D1, D2,..., Dk), k >= 1.
+        Returns: loss
+        """
+        boundary_targets = F.conv2d(paddle.unsqueeze(label, axis=1).astype('float32'), self.laplacian_kernel,
+                                    padding=1)
+        boundary_targets = paddle.clip(boundary_targets, min=0)
+        boundary_targets = boundary_targets > 0.1
+        boundary_targets = boundary_targets.astype('float32')
+
+        boundary_targets_x2 = F.conv2d(paddle.unsqueeze(label, axis=1).astype('float32'), self.laplacian_kernel,
+                                       stride=2, padding=1)
+        boundary_targets_x2 = paddle.clip(boundary_targets_x2, min=0)
+        boundary_targets_x4 = F.conv2d(paddle.unsqueeze(label, axis=1).astype('float32'), self.laplacian_kernel,
+                                       stride=4, padding=1)
+        boundary_targets_x4 = paddle.clip(boundary_targets_x4, min=0)
+
+        boundary_targets_x8 = F.conv2d(paddle.unsqueeze(label, axis=1).astype('float32'), self.laplacian_kernel,
+                                       stride=8, padding=1)
+        boundary_targets_x8 = paddle.clip(boundary_targets_x8, min=0)
+
+        boundary_targets_x8_up = F.interpolate(boundary_targets_x8, boundary_targets.shape[2:], mode='nearest')
+        boundary_targets_x4_up = F.interpolate(boundary_targets_x4, boundary_targets.shape[2:], mode='nearest')
+        boundary_targets_x2_up = F.interpolate(boundary_targets_x2, boundary_targets.shape[2:], mode='nearest')
+
+        boundary_targets_x2_up = boundary_targets_x2_up > 0.1
+        boundary_targets_x2_up = boundary_targets_x2_up.astype('float32')
+
+        boundary_targets_x4_up = boundary_targets_x4_up > 0.1
+        boundary_targets_x4_up = boundary_targets_x4_up.astype('float32')
+
+        boundary_targets_x8_up = boundary_targets_x8_up > 0.1
+        boundary_targets_x8_up = boundary_targets_x8_up.astype('float32')
+
+        boudary_targets_pyramids = paddle.stack((boundary_targets, boundary_targets_x2_up, boundary_targets_x4_up),
+                                                axis=1)
+
+        boudary_targets_pyramids = paddle.squeeze(boudary_targets_pyramids, axis=2)
+        boudary_targets_pyramid = F.conv2d(boudary_targets_pyramids, self.fuse_kernel)
+
+        boudary_targets_pyramid = boudary_targets_pyramid > 0.1
+        boudary_targets_pyramid = boudary_targets_pyramid.astype('float32')
+
+        if logits.shape[-1] != boundary_targets.shape[-1]:
+            logits = F.interpolate(
+                logits, boundary_targets.shape[2:], mode='bilinear', align_corners=True)
+
+        bce_loss = F.binary_cross_entropy_with_logits(logits, boudary_targets_pyramid)
+        dice_loss = self.fixed_dice_loss_func(F.sigmoid(logits), boudary_targets_pyramid)
+        detail_loss = bce_loss + dice_loss
+
+        label.stop_gradient = True
+        return detail_loss
+
+    def fixed_dice_loss_func(self, input, target):
+        """
+            simplified diceloss for DetailAggregateLoss.
+        """
+        smooth = 1.
+        n = input.shape[0]
+        iflat = paddle.reshape(input, [n, -1])
+        tflat = paddle.reshape(target, [n, -1])
+        intersection = paddle.sum((iflat * tflat), axis=1)
+        loss = 1 - ((2. * intersection + smooth) /
+                    (paddle.sum(iflat, axis=1) + paddle.sum(tflat, axis=1) + smooth))
+        return paddle.mean(loss)
--- a/paddlers/models/ppseg/models/losses/dice_loss.py
+++ b/paddlers/models/ppseg/models/losses/dice_loss.py
@ -0,0 +1,56 @@
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddle import nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+
+
+@manager.LOSSES.add_component
+class DiceLoss(nn.Layer):
+    """
+    Implements the dice loss function.
+
+    Args:
+        ignore_index (int64): Specifies a target value that is ignored
+            and does not contribute to the input gradient. Default ``255``.
+        smooth (float32): laplace smoothing,
+            to smooth dice loss and accelerate convergence. following:
+            https://github.com/pytorch/pytorch/issues/1249#issuecomment-337999895
+    """
+
+    def __init__(self, ignore_index=255, smooth=0.):
+        super(DiceLoss, self).__init__()
+        self.ignore_index = ignore_index
+        self.eps = 1e-5
+        self.smooth = smooth
+
+    def forward(self, logits, labels):
+        labels = paddle.cast(labels, dtype='int32')
+        labels_one_hot = F.one_hot(labels, num_classes=logits.shape[1])
+        labels_one_hot = paddle.transpose(labels_one_hot, [0, 3, 1, 2])
+        labels_one_hot = paddle.cast(labels_one_hot, dtype='float32')
+
+        logits = F.softmax(logits, axis=1)
+
+        mask = (paddle.unsqueeze(labels, 1) != self.ignore_index)
+        logits = logits * mask
+        labels_one_hot = labels_one_hot * mask
+
+        dims = (0, ) + tuple(range(2, labels.ndimension() + 1))
+
+        intersection = paddle.sum(logits * labels_one_hot, dims)
+        cardinality = paddle.sum(logits + labels_one_hot, dims)
+        dice_loss = ((2. * intersection + self.smooth) /
+                     (cardinality + self.eps + self.smooth)).mean()
+        return 1 - dice_loss
--- a/paddlers/models/ppseg/models/losses/edge_attention_loss.py
+++ b/paddlers/models/ppseg/models/losses/edge_attention_loss.py
@ -0,0 +1,78 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddle import nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.models import losses
+
+
+@manager.LOSSES.add_component
+class EdgeAttentionLoss(nn.Layer):
+    """
+    Implements the cross entropy loss function. It only compute the edge part.
+
+    Args:
+        edge_threshold (float): The pixels greater edge_threshold as edges.
+        ignore_index (int64): Specifies a target value that is ignored
+            and does not contribute to the input gradient. Default ``255``.
+    """
+
+    def __init__(self, edge_threshold=0.8, ignore_index=255):
+        super().__init__()
+        self.edge_threshold = edge_threshold
+        self.ignore_index = ignore_index
+        self.EPS = 1e-10
+        self.mean_mask = 1
+
+    def forward(self, logits, label):
+        """
+        Forward computation.
+
+        Args:
+            logits (tuple|list): (seg_logit, edge_logit) Tensor, the data type is float32, float64. Shape is
+                (N, C), where C is number of classes, and if shape is more than 2D, this
+                is (N, C, D1, D2,..., Dk), k >= 1. C =1 of edge_logit .
+            label (Tensor): Label tensor, the data type is int64. Shape is (N, C), where each
+                value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is
+                (N, C, D1, D2,..., Dk), k >= 1.
+        """
+        seg_logit, edge_logit = logits[0], logits[1]
+        if len(label.shape) != len(seg_logit.shape):
+            label = paddle.unsqueeze(label, 1)
+        if edge_logit.shape != label.shape:
+            raise ValueError(
+                'The shape of edge_logit should equal to the label, but they are {} != {}'
+                .format(edge_logit.shape, label.shape))
+
+        filler = paddle.ones_like(label) * self.ignore_index
+        label = paddle.where(edge_logit > self.edge_threshold, label, filler)
+
+        seg_logit = paddle.transpose(seg_logit, [0, 2, 3, 1])
+        label = paddle.transpose(label, [0, 2, 3, 1])
+        loss = F.softmax_with_cross_entropy(
+            seg_logit, label, ignore_index=self.ignore_index, axis=-1)
+
+        mask = label != self.ignore_index
+        mask = paddle.cast(mask, 'float32')
+        loss = loss * mask
+        avg_loss = paddle.mean(loss) / (paddle.mean(mask) + self.EPS)
+        if paddle.mean(mask) < self.mean_mask:
+            self.mean_mask = paddle.mean(mask)
+
+        label.stop_gradient = True
+        mask.stop_gradient = True
+        return avg_loss
--- a/paddlers/models/ppseg/models/losses/focal_loss.py
+++ b/paddlers/models/ppseg/models/losses/focal_loss.py
@ -0,0 +1,60 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+
+
+@manager.LOSSES.add_component
+class FocalLoss(nn.Layer):
+    """
+    Focal Loss.
+
+    Code referenced from:
+    https://github.com/clcarwin/focal_loss_pytorch/blob/master/focalloss.py
+
+    Args:
+        gamma (float): the coefficient of Focal Loss.
+        ignore_index (int64): Specifies a target value that is ignored
+            and does not contribute to the input gradient. Default ``255``.
+    """
+
+    def __init__(self, gamma=2.0, ignore_index=255, edge_label=False):
+        super(FocalLoss, self).__init__()
+        self.gamma = gamma
+        self.ignore_index = ignore_index
+        self.edge_label = edge_label
+
+    def forward(self, logit, label):
+        logit = paddle.reshape(
+            logit, [logit.shape[0], logit.shape[1], -1])  # N,C,H,W => N,C,H*W
+        logit = paddle.transpose(logit, [0, 2, 1])  # N,C,H*W => N,H*W,C
+        logit = paddle.reshape(logit,
+                               [-1, logit.shape[2]])  # N,H*W,C => N*H*W,C
+        label = paddle.reshape(label, [-1, 1])
+        range_ = paddle.arange(0, label.shape[0])
+        range_ = paddle.unsqueeze(range_, axis=-1)
+        label = paddle.cast(label, dtype='int64')
+        label = paddle.concat([range_, label], axis=-1)
+        logpt = F.log_softmax(logit)
+        logpt = paddle.gather_nd(logpt, label)
+
+        pt = paddle.exp(logpt.detach())
+        loss = -1 * (1 - pt)**self.gamma * logpt
+        loss = paddle.mean(loss)
+        return loss
--- a/paddlers/models/ppseg/models/losses/gscnn_dual_task_loss.py
+++ b/paddlers/models/ppseg/models/losses/gscnn_dual_task_loss.py
@ -0,0 +1,141 @@
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+
+
+@manager.LOSSES.add_component
+class DualTaskLoss(nn.Layer):
+    """
+    The dual task loss implement of GSCNN
+
+    It compares the gradient magnitude of a Gumbel-softmax sample of the
+    prediction with the gradient magnitude of the one-hot label.
+
+    Args:
+        ignore_index (int64): Specifies a target value that is ignored
+            and does not contribute to the input gradient. Default ``255``.
+        tau (float): the tau of gumbel softmax sample.
+    """
+
+    def __init__(self, ignore_index=255, tau=0.5):
+        super().__init__()
+        self.ignore_index = ignore_index
+        self.tau = tau
+
+    def _gumbel_softmax_sample(self, logit, tau=1, eps=1e-10):
+        """
+        Draw a sample from the Gumbel-Softmax distribution
+
+        based on
+        https://github.com/ericjang/gumbel-softmax/blob/3c8584924603869e90ca74ac20a6a03d99a91ef9/Categorical%20VAE.ipynb
+        (MIT license)
+
+        Args:
+            logit (Tensor): Scores; the softmax is taken over axis 1.
+            tau (float): Temperature dividing the noisy logits.
+            eps (float): Small constant keeping both logarithms finite.
+        """
+        # Uniform noise turned into Gumbel noise via -log(-log(u)).
+        gumbel_noise = paddle.rand(logit.shape)
+        gumbel_noise = -paddle.log(eps - paddle.log(gumbel_noise + eps))
+        logit = logit + gumbel_noise
+        return F.softmax(logit / tau, axis=1)
+
+    def compute_grad_mag(self, x):
+        """
+        Return the gradient magnitude of ``x`` divided by its maximum.
+
+        ``x`` is first smoothed with ``conv_tri`` (r=4), then filtered per
+        channel with the kernel 0.5 * [-1, 0, 1] and with its transpose.
+
+        Raises:
+            ValueError: If the height or the width of ``x`` is not above 1.
+        """
+        eps = 1e-6
+        n, c, h, w = x.shape
+        if h <= 1 or w <= 1:
+            raise ValueError(
+                'The width and height of tensor to compute grad must be greater than 1, but the shape is {}.'
+                .format(x.shape))
+
+        x = self.conv_tri(x, r=4)
+        kernel = [[-1, 0, 1]]
+        kernel = paddle.to_tensor(kernel).astype('float32')
+        kernel = 0.5 * kernel
+
+        # One copy of the kernel per channel; groups=c filters each channel
+        # independently.
+        kernel_x = paddle.concat([kernel.unsqueeze((0, 1))] * c, axis=0)
+        grad_x = F.conv2d(x, kernel_x, padding='same', groups=c)
+        kernel_y = paddle.concat([kernel.t().unsqueeze((0, 1))] * c, axis=0)
+        grad_y = F.conv2d(x, kernel_y, padding='same', groups=c)
+        # eps keeps the square root away from an exact zero.
+        mag = paddle.sqrt(grad_x * grad_x + grad_y * grad_y + eps)
+
+        return mag / mag.max()
+
+    def conv_tri(self, input, r):
+        """
+        Convolves an image by a 2D triangle filter (the 1D triangle filter f is
+        [1:r r+1 r:-1:1]/(r+1)^2, the 2D version is simply conv2(f,f'))
+
+        Raises:
+            ValueError: If ``r`` is not greater than 1.
+        """
+        if r <= 1:
+            raise ValueError(
+                '`r` should be greater than 1, but it is {}.'.format(r))
+
+        kernel = [
+            list(range(1, r + 1)) + [r + 1] + list(reversed(range(1, r + 1)))
+        ]
+        kernel = paddle.to_tensor(kernel).astype('float32')
+        kernel = kernel / (r + 1)**2
+        # Pad by 1 (replicate) then by r (reflect); only the outer r slices of
+        # the padded tensor are kept and joined around the original input
+        # along axis 3.
+        input_ = F.pad(input, [1, 1, 0, 0], mode='replicate')
+        input_ = F.pad(input_, [r, r, 0, 0], mode='reflect')
+        input_ = [input_[:, :, :, :r], input, input_[:, :, :, -r:]]
+        input_ = paddle.concat(input_, axis=3)
+        tem = input_.clone()
+
+        # Same construction along axis 2, around the tensor padded above.
+        input_ = F.pad(input_, [0, 0, 1, 1], mode='replicate')
+        input_ = F.pad(input_, [0, 0, r, r], mode='reflect')
+        input_ = [input_[:, :, :r, :], tem, input_[:, :, -r:, :]]
+        input_ = paddle.concat(input_, axis=2)
+
+        # Separable filtering: the row kernel first, then its transpose, each
+        # applied per channel (groups=c) without further padding.
+        c = input.shape[1]
+        kernel_x = paddle.concat([kernel.unsqueeze((0, 1))] * c, axis=0)
+        output = F.conv2d(input_, kernel_x, padding=0, groups=c)
+        kernel_y = paddle.concat([kernel.t().unsqueeze((0, 1))] * c, axis=0)
+        output = F.conv2d(output, kernel_y, padding=0, groups=c)
+        return output
+
+    def forward(self, logit, labels):
+        """
+        Compute the dual task loss.
+
+        Args:
+            logit (Tensor): Scores, unpacked here as (n, c, h, w).
+            labels (Tensor): Class indices of rank 3, or rank 4 with a
+                singleton axis at position 1.
+
+        Returns:
+            (Tensor): ``0.5 * loss_g + 0.5 * loss_g_hat``, the L1 difference of
+                the two gradient magnitudes accumulated where the predicted
+                and the label gradient magnitude exceed ``th``.
+        """
+        n, c, h, w = logit.shape
+        th = 1e-8
+        eps = 1e-10
+        if len(labels.shape) == 3:
+            labels = labels.unsqueeze(1)
+        mask = (labels != self.ignore_index)
+        mask.stop_gradient = True
+        # Zero the logits at ignored pixels.
+        logit = logit * mask
+
+        # Ignored labels become 0, which keeps them inside the one-hot range.
+        labels = labels * mask
+        if len(labels.shape) == 4:
+            labels = labels.squeeze(1)
+        labels.stop_gradient = True
+        labels = F.one_hot(labels, logit.shape[1]).transpose((0, 3, 1, 2))
+        labels.stop_gradient = True
+
+        g = self._gumbel_softmax_sample(logit, tau=self.tau)
+        g = self.compute_grad_mag(g)
+        g_hat = self.compute_grad_mag(labels)
+        loss = F.l1_loss(g, g_hat, reduction='none')
+        loss = loss * mask
+
+        # Average over the positions where the predicted gradient exceeds th;
+        # the sum is left unnormalized when there is no such position.
+        g_mask = (g > th).astype('float32')
+        g_mask.stop_gradient = True
+        g_mask_sum = paddle.sum(g_mask)
+        loss_g = paddle.sum(loss * g_mask)
+        if g_mask_sum > eps:
+            loss_g = loss_g / g_mask_sum
+
+        # Same, for the positions where the label gradient exceeds th.
+        g_hat_mask = (g_hat > th).astype('float32')
+        g_hat_mask.stop_gradient = True
+        g_hat_mask_sum = paddle.sum(g_hat_mask)
+        loss_g_hat = paddle.sum(loss * g_hat_mask)
+        if g_hat_mask_sum > eps:
+            loss_g_hat = loss_g_hat / g_hat_mask_sum
+
+        total_loss = 0.5 * loss_g + 0.5 * loss_g_hat
+
+        return total_loss
--- a/paddlers/models/ppseg/models/losses/kl_loss.py
+++ b/paddlers/models/ppseg/models/losses/kl_loss.py
@ -0,0 +1,80 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+
+
+@manager.LOSSES.add_component
+class KLLoss(nn.Layer):
+    """
+    Kullback-Leibler divergence loss between two sets of logits.
+    See https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence.
+
+    Args:
+        ignore_index (int64): Label value whose positions are excluded from
+            the loss when a label is supplied. Default ``255``.
+        temperature (float): Both logits are divided by this value before
+            the softmax, and the loss is multiplied by its square.
+            Default ``1``.
+    """
+
+    def __init__(self, ignore_index=255, temperature=1):
+        super().__init__()
+        self.temperature = temperature
+        self.ignore_index = ignore_index
+
+        self.EPS = 1e-8
+        self.kl_loss = nn.KLDivLoss(reduction="none")
+
+    def forward(self, logit_1, logit_2, label=None):
+        """
+        Compute the KL loss, optionally masked by ``ignore_index`` in label.
+
+        Args:
+            logit_1 (Tensor): Logit tensor, the data type is float32 or float64.
+                The shape is (N, C), where C is number of classes, and if shape is
+                more than 2D, this is (N, C, D1, D2,..., Dk), k >= 1.
+            logit_2 (Tensor): Logit tensor with the same shape as logit_1,
+                the data type is float32 or float64.
+            label (Tensor, optional): Label tensor, the data type is int64.
+                The shape is (N), where each value is 0 <= label[i] <= C-1, and
+                if shape is more than 2D, this is (N, D1, D2,..., Dk), k >= 1.
+
+        Returns:
+            (Tensor): The average loss.
+
+        Raises:
+            ValueError: If the two logits differ in shape.
+        """
+        if logit_1.shape != logit_2.shape:
+            raise ValueError(
+                'The shape of logit_1 = {} must be the same as the shape of logit_2 = {}.'
+                .format(logit_1.shape, logit_2.shape))
+
+        temp = self.temperature
+        log_prob = F.log_softmax(logit_1 / temp, axis=1)
+        target_prob = F.softmax(logit_2 / temp, axis=1)
+        loss = self.kl_loss(log_prob, target_prob) * temp * temp
+
+        if label is None:
+            return paddle.mean(loss)
+
+        # Insert a channel axis so the mask broadcasts over the classes.
+        mask = paddle.cast(label != self.ignore_index, 'float32')
+        mask = paddle.unsqueeze(mask, axis=1)
+        label.stop_gradient = True
+        mask.stop_gradient = True
+
+        masked_loss = loss * mask
+        return paddle.mean(masked_loss) / (paddle.mean(mask) + self.EPS)
--- a/paddlers/models/ppseg/models/losses/l1_loss.py
+++ b/paddlers/models/ppseg/models/losses/l1_loss.py
@ -0,0 +1,76 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddle import nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+
+
+@manager.LOSSES.add_component
+class L1Loss(nn.L1Loss):
+    r"""
+    L1 loss registered in ``manager.LOSSES``; a thin subclass of ``paddle.nn.L1Loss``.
+
+    The layer calculates the L1 loss of ``input`` and ``label`` as follows.
+
+    If `reduction` is set to ``'none'``, the loss is:
+
+    .. math::
+        Out = \lvert input - label\rvert
+
+    If `reduction` is set to ``'mean'``, the loss is:
+
+    .. math::
+        Out = MEAN(\lvert input - label\rvert)
+
+    If `reduction` is set to ``'sum'``, the loss is:
+
+    .. math::
+        Out = SUM(\lvert input - label\rvert)
+
+    Args:
+        reduction (str, optional): Indicate the reduction to apply to the loss,
+            the candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
+            If `reduction` is ``'none'``, the unreduced loss is returned;
+            If `reduction` is ``'mean'``, the reduced mean loss is returned.
+            If `reduction` is ``'sum'``, the reduced sum loss is returned.
+            Default is ``'mean'``.
+        ignore_index (int, optional): Accepted by the constructor but neither
+            stored nor forwarded to ``paddle.nn.L1Loss``, so no label value is
+            actually excluded from the loss. Presumably kept so that the loss
+            can be built with the same arguments as the other losses -- TODO
+            confirm. Default: 255.
+    Shape:
+        input (Tensor): The input tensor. The shape is [N, *], where N is batch size and `*` means any number of additional dimensions. Its data type should be float32, float64, int32, int64.
+        label (Tensor): label. The shape is [N, *], same shape as ``input`` . Its data type should be float32, float64, int32, int64.
+        output (Tensor): The L1 Loss of ``input`` and ``label``.
+            If `reduction` is ``'none'``, the shape of output loss is [N, *], the same as ``input`` .
+            If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1].
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+            input_data = np.array([[1.5, 0.8], [0.2, 1.3]]).astype("float32")
+            label_data = np.array([[1.7, 1], [0.4, 0.5]]).astype("float32")
+            input = paddle.to_tensor(input_data)
+            label = paddle.to_tensor(label_data)
+            l1_loss = paddle.nn.L1Loss()
+            output = l1_loss(input, label)
+            print(output.numpy())
+            # [0.35]
+            l1_loss = paddle.nn.L1Loss(reduction='sum')
+            output = l1_loss(input, label)
+            print(output.numpy())
+            # [1.4]
+            l1_loss = paddle.nn.L1Loss(reduction='none')
+            output = l1_loss(input, label)
+            print(output)
+            # [[0.20000005 0.19999999]
+            # [0.2        0.79999995]]
+    """
+
+    def __init__(self, reduction='mean', ignore_index=255):
+        # Only `reduction` reaches the parent class; `ignore_index` is dropped.
+        super().__init__(reduction=reduction)
--- a/paddlers/models/ppseg/models/losses/lovasz_loss.py
+++ b/paddlers/models/ppseg/models/losses/lovasz_loss.py
@ -0,0 +1,222 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Lovasz-Softmax and Jaccard hinge loss in PaddlePaddle"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import paddle
+from paddle import nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+
+
+@manager.LOSSES.add_component
+class LovaszSoftmaxLoss(nn.Layer):
+    """
+    Lovasz-Softmax loss for multi-class segmentation.
+
+    Args:
+        ignore_index (int64): Label value whose pixels are dropped before the loss is computed. Default ``255``.
+        classes (str|list): 'all' for all, 'present' for classes present in labels, or a list of classes to average.
+    """
+
+    def __init__(self, ignore_index=255, classes='present'):
+        super().__init__()
+        self.classes = classes
+        self.ignore_index = ignore_index
+
+    def forward(self, logits, labels):
+        r"""
+        Turn the logits into probabilities, flatten them together with the
+        labels, and evaluate the flat Lovasz-Softmax loss.
+
+        Args:
+            logits (Tensor): Shape is [N, C, H, W], logits at each prediction (between -\infty and +\infty).
+            labels (Tensor): Shape is [N, 1, H, W] or [N, H, W], ground truth labels (between 0 and C - 1).
+
+        Returns:
+            Tensor: The value returned by ``lovasz_softmax_flat``.
+        """
+        flat_probas, flat_labels = flatten_probas(
+            F.softmax(
+                logits, axis=1), labels, self.ignore_index)
+        return lovasz_softmax_flat(
+            flat_probas, flat_labels, classes=self.classes)
+
+
+@manager.LOSSES.add_component
+class LovaszHingeLoss(nn.Layer):
+    """
+    Lovasz hinge loss for binary segmentation.
+
+    Args:
+        ignore_index (int64): Label value whose pixels are dropped before the loss is computed. Default ``255``.
+    """
+
+    def __init__(self, ignore_index=255):
+        super().__init__()
+        self.ignore_index = ignore_index
+
+    def forward(self, logits, labels):
+        r"""
+        Reduce two-channel logits to a single channel when needed, flatten
+        scores and labels, and evaluate the flat Lovasz hinge loss.
+
+        Args:
+            logits (Tensor): Shape is [N, 1, H, W] or [N, 2, H, W], logits at each pixel (between -\infty and +\infty).
+            labels (Tensor): Shape is [N, 1, H, W] or [N, H, W], binary ground truth masks (0 or 1).
+
+        Returns:
+            Tensor: The value returned by ``lovasz_hinge_flat``.
+        """
+        if logits.shape[1] == 2:
+            logits = binary_channel_to_unary(logits)
+        flat_scores, flat_labels = flatten_binary_scores(logits, labels,
+                                                         self.ignore_index)
+        return lovasz_hinge_flat(flat_scores, flat_labels)
+
+
+def lovasz_grad(gt_sorted):
+    """
+    Gradient of the Lovasz extension with respect to sorted errors
+    (Alg. 1 in the paper).
+
+    Args:
+        gt_sorted (Tensor): 1-D ground-truth indicators, already permuted
+            into the order of the sorted errors.
+
+    Returns:
+        Tensor: float32 tensor of the same length holding the successive
+            differences of the Jaccard loss along that order.
+    """
+    num_items = len(gt_sorted)
+    total_fg = paddle.sum(gt_sorted)
+
+    remaining_fg = total_fg - paddle.cumsum(gt_sorted, axis=0)
+    covered = total_fg + paddle.cumsum(1 - gt_sorted, axis=0)
+    jaccard = 1.0 - remaining_fg.cast('float32') / covered.cast('float32')
+
+    if num_items <= 1:
+        # A single element has no predecessor to difference against.
+        return jaccard
+    jaccard[1:num_items] = jaccard[1:num_items] - jaccard[0:-1]
+    return jaccard
+
+
+def binary_channel_to_unary(logits, eps=1e-9):
+    """
+    Converts binary channel logits to unary channel logits for lovasz hinge loss.
+
+    The foreground probability ``p`` (channel 1 of the softmax) is mapped to
+    its log-odds ``log((p + eps) / (1 - p + eps))``.
+
+    Args:
+        logits (Tensor): Two-channel logits, indexed as [N, 2, H, W].
+        eps (float, optional): Added to numerator and denominator so that the
+            logarithm stays finite when ``p`` reaches 0 or 1. Default: 1e-9.
+
+    Returns:
+        Tensor: Single-channel logits of shape [N, 1, H, W].
+    """
+    probas = F.softmax(logits, axis=1)
+    probas = probas[:, 1, :, :]
+    # The parentheses matter: without them only `eps` is divided, and the
+    # result degenerates to roughly log(p), which is never positive.
+    logits = paddle.log((probas + eps) / (1 - probas + eps))
+    logits = logits.unsqueeze(1)
+    return logits
+
+
+def lovasz_hinge_flat(logits, labels):
+    r"""
+    Binary Lovasz hinge loss.
+
+    Args:
+        logits (Tensor): Shape is [P], logits at each prediction (between -\infty and +\infty).
+        labels (Tensor): Shape is [P], binary ground truth labels (0 or 1).
+
+    Returns:
+        Tensor: The summed loss; a zero that still depends on ``logits`` when
+            there are no labels.
+    """
+    if len(labels) == 0:
+        # only void pixels, the gradients should be 0
+        return logits.sum() * 0.
+    signs = 2. * labels - 1.
+    signs.stop_gradient = True
+    errors = 1. - logits * signs
+    # Sort through the public API instead of the private
+    # `paddle.fluid.core.ops` binding. Gathering with the permutation yields
+    # the same sorted values and keeps the gradient path back to `errors`.
+    perm = paddle.argsort(errors, axis=0, descending=True)
+    perm.stop_gradient = True
+    errors_sorted = paddle.gather(errors, perm)
+    errors_sorted.stop_gradient = False
+    gt_sorted = paddle.gather(labels, perm)
+    grad = lovasz_grad(gt_sorted)
+    grad.stop_gradient = True
+    loss = paddle.sum(F.relu(errors_sorted) * grad)
+    return loss
+
+
+def flatten_binary_scores(scores, labels, ignore=None):
+    """
+    Flatten the batch predictions and labels of the binary case to 1-D and,
+    when `ignore` is given, drop every position whose label equals it.
+
+    Args:
+        scores (Tensor): Per-pixel scores of any shape.
+        labels (Tensor): Labels with the same number of elements as `scores`.
+        ignore (int, optional): Label value to filter out. Default: None.
+
+    Returns:
+        tuple(Tensor, Tensor): The flattened (and possibly filtered) scores
+            and labels.
+    """
+    flat_scores = paddle.reshape(scores, [-1])
+    flat_labels = paddle.reshape(labels, [-1])
+    flat_labels.stop_gradient = True
+    if ignore is not None:
+        keep = paddle.reshape(flat_labels != ignore, (-1, 1))
+        keep_idx = paddle.nonzero(keep)
+        keep_idx.stop_gradient = True
+        # Column 0 of the nonzero output is the position along the flat axis.
+        rows = keep_idx[:, 0]
+        return paddle.gather(flat_scores, rows), paddle.gather(flat_labels,
+                                                               rows)
+    return flat_scores, flat_labels
+
+
+def lovasz_softmax_flat(probas, labels, classes='present'):
+    """
+    Multi-class Lovasz-Softmax loss.
+
+    Args:
+        probas (Tensor): Shape is [P, C], class probabilities at each prediction (between 0 and 1).
+        labels (Tensor): Shape is [P], ground truth labels (between 0 and C - 1).
+        classes (str|list): 'all' for all, 'present' for classes present in labels, or a list of classes to average.
+
+    Returns:
+        Tensor: The single class loss when exactly one class is requested,
+            otherwise the mean over the evaluated classes. A zero that still
+            depends on `probas` is returned when no class is evaluated.
+
+    Raises:
+        ValueError: If `probas` has one channel but several classes are requested.
+    """
+    if probas.numel() == 0:
+        # only void pixels, the gradients should be 0
+        return probas * 0.
+    C = probas.shape[1]
+    losses = []
+    classes_to_sum = list(range(C)) if classes in ['all', 'present'
+                                                   ] else classes
+    for c in classes_to_sum:
+        fg = paddle.cast(labels == c, probas.dtype)  # foreground for class c
+        if classes == 'present' and fg.sum() == 0:
+            continue
+        fg.stop_gradient = True
+        if C == 1:
+            if len(classes_to_sum) > 1:
+                raise ValueError('Sigmoid output possible only with 1 class')
+            class_pred = probas[:, 0]
+        else:
+            class_pred = probas[:, c]
+        errors = paddle.abs(fg - class_pred)
+        # Sort through the public API instead of the private
+        # `paddle.fluid.core.ops` binding. Gathering with the permutation
+        # yields the same sorted values and keeps the gradient path.
+        perm = paddle.argsort(errors, axis=0, descending=True)
+        perm.stop_gradient = True
+        errors_sorted = paddle.gather(errors, perm)
+        errors_sorted.stop_gradient = False
+
+        fg_sorted = paddle.gather(fg, perm)
+        fg_sorted.stop_gradient = True
+
+        grad = lovasz_grad(fg_sorted)
+        grad.stop_gradient = True
+        loss = paddle.sum(errors_sorted * grad)
+        losses.append(loss)
+
+    if not losses:
+        # Every requested class was skipped (e.g. a single-channel input
+        # whose class is absent under 'present'). Indexing or stacking the
+        # empty list would raise, so return a zero loss instead.
+        return paddle.sum(probas) * 0.
+
+    if len(classes_to_sum) == 1:
+        return losses[0]
+
+    losses_tensor = paddle.stack(losses)
+    mean_loss = paddle.mean(losses_tensor)
+    return mean_loss
+
+
+def flatten_probas(probas, labels, ignore=None):
+    """
+    Flatten the batch predictions to [P, C] and the labels to [P]; when
+    `ignore` is given, drop every row whose label equals it.
+
+    Args:
+        probas (Tensor): Class probabilities; a 3-D input first receives a
+            channel axis at position 1, then the channel axis is moved last.
+        labels (Tensor): Labels, reshaped to 1-D.
+        ignore (int, optional): Label value to filter out. Default: None.
+
+    Returns:
+        tuple(Tensor, Tensor): The flattened (and possibly filtered)
+            probabilities and labels.
+    """
+    if len(probas.shape) == 3:
+        probas = paddle.unsqueeze(probas, axis=1)
+    num_classes = probas.shape[1]
+    channels_last = paddle.transpose(probas, [0, 2, 3, 1])
+    flat_probas = paddle.reshape(channels_last, [-1, num_classes])
+    flat_labels = paddle.reshape(labels, [-1])
+    if ignore is not None:
+        keep = paddle.reshape(flat_labels != ignore, [-1, 1])
+        keep_idx = paddle.nonzero(keep)
+        keep_idx.stop_gradient = True
+        # Column 0 of the nonzero output is the row index to retain.
+        rows = keep_idx[:, 0]
+        return paddle.gather(flat_probas, rows), paddle.gather(flat_labels,
+                                                               rows)
+    return flat_probas, flat_labels
--- a/paddlers/models/ppseg/models/losses/mean_square_error_loss.py
+++ b/paddlers/models/ppseg/models/losses/mean_square_error_loss.py
@ -0,0 +1,65 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddle import nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+
+
+@manager.LOSSES.add_component
+class MSELoss(nn.MSELoss):
+    r"""
+    **Mean Square Error Loss**
+
+    A thin subclass of ``paddle.nn.MSELoss`` registered in ``manager.LOSSES``.
+    Computes the mean square error (squared L2 norm) of given input and label.
+
+    If :attr:`reduction` is set to ``'none'``, loss is calculated as:
+
+    .. math::
+        Out = (input - label)^2
+
+    If :attr:`reduction` is set to ``'mean'``, loss is calculated as:
+
+    .. math::
+        Out = \operatorname{mean}((input - label)^2)
+
+    If :attr:`reduction` is set to ``'sum'``, loss is calculated as:
+
+    .. math::
+        Out = \operatorname{sum}((input - label)^2)
+
+    where `input` and `label` are `float32` tensors of same shape.
+
+    Args:
+        reduction (string, optional): The reduction method for the output,
+            could be 'none' | 'mean' | 'sum'.
+            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned.
+            If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned.
+            If :attr:`reduction` is ``'none'``, the unreduced loss is returned.
+            Default is ``'mean'``.
+        ignore_index (int, optional): Accepted by the constructor but neither
+            stored nor forwarded to ``paddle.nn.MSELoss``, so no label value is
+            actually excluded from the loss. Presumably kept so that the loss
+            can be built with the same arguments as the other losses -- TODO
+            confirm. Default: 255.
+    Shape:
+        input (Tensor): Input tensor, the data type is float32 or float64
+        label (Tensor): Label tensor, the data type is float32 or float64
+        output (Tensor): output tensor storing the MSE loss of input and label, the data type is same as input.
+    Examples:
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+            input_data = np.array([1.5]).astype("float32")
+            label_data = np.array([1.7]).astype("float32")
+            mse_loss = paddle.nn.loss.MSELoss()
+            input = paddle.to_tensor(input_data)
+            label = paddle.to_tensor(label_data)
+            output = mse_loss(input, label)
+            print(output)
+            # [0.04000002]
+    """
+
+    def __init__(self, reduction='mean', ignore_index=255):
+        # Only `reduction` reaches the parent class; `ignore_index` is dropped.
+        super().__init__(reduction=reduction)
--- a/paddlers/models/ppseg/models/losses/mixed_loss.py
+++ b/paddlers/models/ppseg/models/losses/mixed_loss.py
@ -0,0 +1,57 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+import paddle
+from paddle import nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+
+
+@manager.LOSSES.add_component
+class MixedLoss(nn.Layer):
+    """
+    Apply several losses to the same inputs and weight each result.
+    This allows mixed-loss training without touching the network code.
+
+    Args:
+        losses (list[nn.Layer]): The loss objects to evaluate.
+        coef (list[float|int]): One weighting coefficient per loss.
+
+    Returns:
+        A callable object of MixedLoss.
+
+    Raises:
+        TypeError: If `losses` or `coef` is not a list.
+        ValueError: If `losses` and `coef` differ in length.
+    """
+
+    def __init__(self, losses, coef):
+        super().__init__()
+        if not isinstance(losses, list):
+            raise TypeError('`losses` must be a list!')
+        if not isinstance(coef, list):
+            raise TypeError('`coef` must be a list!')
+        num_losses, num_coef = len(losses), len(coef)
+        if num_losses != num_coef:
+            raise ValueError(
+                'The length of `losses` should equal to `coef`, but they are {} and {}.'
+                .format(num_losses, num_coef))
+
+        self.losses = losses
+        self.coef = coef
+
+    def forward(self, logits, labels):
+        """
+        Evaluate every loss on (logits, labels) and scale it by its coefficient.
+
+        Returns:
+            list: The weighted loss values, one per configured loss, in order.
+        """
+        return [
+            loss_fn(logits, labels) * self.coef[idx]
+            for idx, loss_fn in enumerate(self.losses)
+        ]
--- a/paddlers/models/ppseg/models/losses/ohem_cross_entropy_loss.py
+++ b/paddlers/models/ppseg/models/losses/ohem_cross_entropy_loss.py
@ -0,0 +1,99 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddle import nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+
+
+@manager.LOSSES.add_component
+class OhemCrossEntropyLoss(nn.Layer):
+    """
+    Cross entropy loss with online hard example mining (OHEM).
+
+    Args:
+        thresh (float, optional): The threshold of ohem. Default: 0.7.
+        min_kept (int, optional): The min number to keep in loss computation. Default: 10000.
+        ignore_index (int64, optional): Specifies a target value that is ignored
+            and does not contribute to the input gradient. Default ``255``.
+    """
+
+    def __init__(self, thresh=0.7, min_kept=10000, ignore_index=255):
+        super().__init__()
+        self.ignore_index = ignore_index
+        self.min_kept = min_kept
+        self.thresh = thresh
+        self.EPS = 1e-5
+
+    def forward(self, logit, label):
+        """
+        Select the hard pixels and average the cross entropy over them.
+
+        Args:
+            logit (Tensor): Logit tensor, the data type is float32, float64. Shape is
+                (N, C), where C is number of classes, and if shape is more than 2D, this
+                is (N, C, D1, D2,..., Dk), k >= 1. The shape is unpacked as
+                four dimensions here.
+            label (Tensor): Label tensor, the data type is int64. Shape is (N), where each
+                value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is
+                (N, D1, D2,..., Dk), k >= 1.
+
+        Returns:
+            Tensor: Mean loss over the kept pixels.
+        """
+        if len(label.shape) != len(logit.shape):
+            label = paddle.unsqueeze(label, 1)
+
+        # Work on flattened labels while the hard pixels are being selected.
+        batch, num_classes, height, width = logit.shape
+        flat_label = label.reshape((-1, ))
+        keep = (flat_label != self.ignore_index).astype('int64')
+        valid_count = keep.sum()
+        flat_label = flat_label * keep
+
+        class_prob = F.softmax(logit, axis=1)
+        class_prob = class_prob.transpose((1, 0, 2, 3)).reshape(
+            (num_classes, -1))
+
+        if self.min_kept < valid_count and valid_count > 0:
+            # Push ignored positions above 1 so that they are never selected.
+            class_prob = class_prob + (1 - keep)
+
+            # Probability assigned to the labelled class of every pixel.
+            onehot = F.one_hot(flat_label, num_classes).transpose((1, 0))
+            true_prob = paddle.sum(class_prob * onehot, axis=0)
+
+            if self.min_kept > 0:
+                cutoff = self.thresh
+                order = true_prob.argsort()
+                rank = min(len(order), self.min_kept) - 1
+                pivot = int(order[rank].numpy()[0])
+                if true_prob[pivot] > self.thresh:
+                    cutoff = true_prob[pivot]
+                # NOTE(review): the comparison is strict, so the pixel at the
+                # pivot rank itself is dropped and fewer than `min_kept`
+                # pixels may survive -- confirm this is intended.
+                hard = (true_prob < cutoff).astype('int64')
+                flat_label = flat_label * hard
+                keep = keep * hard
+
+        # Mark everything that was not kept as ignored.
+        flat_label = flat_label + (1 - keep) * self.ignore_index
+
+        target = flat_label.reshape((batch, 1, height, width))
+        weight = keep.reshape((batch, 1, height, width)).astype('float32')
+        pixel_loss = F.softmax_with_cross_entropy(
+            logit, target, ignore_index=self.ignore_index, axis=1)
+        pixel_loss = pixel_loss * weight
+        avg_loss = paddle.mean(pixel_loss) / (paddle.mean(weight) + self.EPS)
+
+        target.stop_gradient = True
+        weight.stop_gradient = True
+        return avg_loss
--- a/paddlers/models/ppseg/models/losses/ohem_edge_attention_loss.py
+++ b/paddlers/models/ppseg/models/losses/ohem_edge_attention_loss.py
@ -0,0 +1,114 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddle import nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.models import losses
+
+
+@manager.LOSSES.add_component
+class OhemEdgeAttentionLoss(nn.Layer):
+    """
+    Implements an OHEM cross entropy loss that is computed on the edge part only.
+
+    Pixels whose edge logit does not exceed `edge_threshold` are relabelled as
+    `ignore_index` before the hard-example selection is applied.
+
+    Args:
+        edge_threshold (float, optional): Pixels whose edge logit is greater than edge_threshold are treated as edges. Default: 0.8.
+        thresh (float, optional): The threshold of ohem. Default: 0.7.
+        min_kept (int, optional): The min number to keep in loss computation. Default: 5000.
+        ignore_index (int64, optional): Specifies a target value that is ignored
+            and does not contribute to the input gradient. Default ``255``.
+    """
+
+    def __init__(self,
+                 edge_threshold=0.8,
+                 thresh=0.7,
+                 min_kept=5000,
+                 ignore_index=255):
+        super().__init__()
+        self.edge_threshold = edge_threshold
+        self.thresh = thresh
+        self.min_kept = min_kept
+        self.ignore_index = ignore_index
+        # Added to the mean of the valid mask to avoid division by zero.
+        self.EPS = 1e-10
+
+    def forward(self, logits, label):
+        """
+        Forward computation.
+
+        Args:
+            logits (tuple|list): (seg_logit, edge_logit) Tensor, the data type is float32, float64. Shape is
+                (N, C), where C is number of classes, and if shape is more than 2D, this
+                is (N, C, D1, D2,..., Dk), k >= 1. C = 1 for edge_logit.
+            label (Tensor): Label tensor, the data type is int64. Shape is (N, C), where each
+                value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is
+                (N, C, D1, D2,..., Dk), k >= 1.
+
+        Returns:
+            Tensor: Mean cross entropy over the kept edge pixels.
+
+        Raises:
+            ValueError: If edge_logit and label (after an optional unsqueeze at
+                axis 1) differ in shape.
+        """
+        seg_logit, edge_logit = logits[0], logits[1]
+        if len(label.shape) != len(seg_logit.shape):
+            label = paddle.unsqueeze(label, 1)
+        if edge_logit.shape != label.shape:
+            raise ValueError(
+                'The shape of edge_logit should equal to the label, but they are {} != {}'
+                .format(edge_logit.shape, label.shape))
+
+        # Filter out edge: every non-edge pixel is relabelled as ignore_index.
+        filler = paddle.ones_like(label) * self.ignore_index
+        label = paddle.where(edge_logit > self.edge_threshold, label, filler)
+
+        # ohem
+        n, c, h, w = seg_logit.shape
+        label = label.reshape((-1, ))
+        valid_mask = (label != self.ignore_index).astype('int64')
+        num_valid = valid_mask.sum()
+        # Ignored labels become 0 so that the one-hot encoding stays in range.
+        label = label * valid_mask
+
+        prob = F.softmax(seg_logit, axis=1)
+        prob = prob.transpose((1, 0, 2, 3)).reshape((c, -1))
+
+        # Mining only happens when there are more valid pixels than min_kept.
+        if self.min_kept < num_valid and num_valid > 0:
+            # let the value which ignored greater than 1
+            prob = prob + (1 - valid_mask)
+
+            # get the prob of relevant label
+            label_onehot = F.one_hot(label, c)
+            label_onehot = label_onehot.transpose((1, 0))
+            prob = prob * label_onehot
+            prob = paddle.sum(prob, axis=0)
+
+            threshold = self.thresh
+            if self.min_kept > 0:
+                # Raise the threshold to the probability found at rank
+                # min_kept (ascending) when that exceeds self.thresh.
+                index = prob.argsort()
+                threshold_index = index[min(len(index), self.min_kept) - 1]
+                threshold_index = int(threshold_index.numpy()[0])
+                if prob[threshold_index] > self.thresh:
+                    threshold = prob[threshold_index]
+                # NOTE(review): the comparison is strict, so the pixel at the
+                # threshold rank itself is dropped -- confirm this is intended.
+                kept_mask = (prob < threshold).astype('int64')
+                label = label * kept_mask
+                valid_mask = valid_mask * kept_mask
+        # make the invalid region as ignore
+        label = label + (1 - valid_mask) * self.ignore_index
+        label = label.reshape((n, 1, h, w))
+        valid_mask = valid_mask.reshape((n, 1, h, w)).astype('float32')
+
+        loss = F.softmax_with_cross_entropy(
+            seg_logit, label, ignore_index=self.ignore_index, axis=1)
+        loss = loss * valid_mask
+        # Normalise by the fraction of kept pixels instead of by all pixels.
+        avg_loss = paddle.mean(loss) / (paddle.mean(valid_mask) + self.EPS)
+
+        label.stop_gradient = True
+        valid_mask.stop_gradient = True
+        return avg_loss
--- a/paddlers/models/ppseg/models/losses/pixel_contrast_cross_entropy_loss.py
+++ b/paddlers/models/ppseg/models/losses/pixel_contrast_cross_entropy_loss.py
@ -0,0 +1,199 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+
+
+@manager.LOSSES.add_component
+class PixelContrastCrossEntropyLoss(nn.Layer):
+    """
+    The PixelContrastCrossEntropyLoss implementation based on PaddlePaddle.
+
+    The original article refers to
+    Wenguan Wang, Tianfei Zhou, et al. "Exploring Cross-Image Pixel Contrast for Semantic Segmentation"
+    (https://arxiv.org/abs/2101.11939).
+
+    Args:
+        temperature (float, optional): Controling the numerical similarity of features. Default: 0.1.
+        base_temperature (float, optional): Controling the numerical range of contrast loss. Default: 0.07.
+        ignore_index (int, optional): Specifies a target value that is ignored
+            and does not contribute to the input gradient. Default 255.
+        max_samples (int, optional): Max sampling anchors. Default: 1024.
+        max_views (int): Sampled samplers of a class. Default: 100.
+    """
+    def __init__(self,
+                 temperature=0.1,
+                 base_temperature=0.07,
+                 ignore_index=255,
+                 max_samples=1024,
+                 max_views=100):
+        super().__init__()
+        self.temperature = temperature
+        self.base_temperature = base_temperature
+        self.ignore_index = ignore_index
+        self.max_samples = max_samples
+        self.max_views = max_views
+
+    def _hard_anchor_sampling(self, X, y_hat, y):
+        """
+        Args:
+            X (Tensor): reshaped feats, shape = [N, H * W, feat_channels]
+            y_hat (Tensor): reshaped label, shape = [N, H * W]
+            y (Tensor): reshaped predict, shape = [N, H * W]
+        """
+        batch_size, feat_dim = paddle.shape(X)[0], paddle.shape(X)[-1]
+        classes = []
+        total_classes = 0
+        for i in range(batch_size):
+            current_y = y_hat[i]
+            current_classes = paddle.unique(current_y)
+            current_classes = [
+                x for x in current_classes if x != self.ignore_index
+            ]
+            current_classes = [
+                x for x in current_classes
+                if (current_y == x).nonzero().shape[0] > self.max_views
+            ]
+
+            classes.append(current_classes)
+            total_classes += len(current_classes)
+
+        n_view = self.max_samples // total_classes
+        n_view = min(n_view, self.max_views)
+
+        X_ = []
+        y_ = paddle.zeros([total_classes], dtype='float32')
+
+        X_ptr = 0
+        for i in range(batch_size):
+            this_y_hat = y_hat[i]
+            current_y = y[i]
+            current_classes = classes[i]
+
+            for cls_id in current_classes:
+                hard_indices = paddle.logical_and(
+                    (this_y_hat == cls_id), (current_y != cls_id)).nonzero()
+                easy_indices = paddle.logical_and(
+                    (this_y_hat == cls_id), (current_y == cls_id)).nonzero()
+
+                num_hard = hard_indices.shape[0]
+                num_easy = easy_indices.shape[0]
+
+                if num_hard >= n_view / 2 and num_easy >= n_view / 2:
+                    num_hard_keep = n_view // 2
+                    num_easy_keep = n_view - num_hard_keep
+                elif num_hard >= n_view / 2:
+                    num_easy_keep = num_easy
+                    num_hard_keep = n_view - num_easy_keep
+                else:
+                    num_hard_keep = num_hard
+                    num_easy_keep = n_view - num_hard_keep
+
+                indices = None
+                if num_hard > 0:
+                    perm = paddle.randperm(num_hard)
+                    hard_indices = hard_indices[perm[:num_hard_keep]].reshape(
+                        (-1, hard_indices.shape[-1]))
+                    indices = hard_indices
+                if num_easy > 0:
+                    perm = paddle.randperm(num_easy)
+                    easy_indices = easy_indices[perm[:num_easy_keep]].reshape(
+                        (-1, easy_indices.shape[-1]))
+                    if indices is None:
+                        indices = easy_indices
+                    else:
+                        indices = paddle.concat((indices, easy_indices), axis=0)
+                if indices is None:
+                    raise UserWarning('hard sampling indice error')
+
+                X_.append(paddle.index_select(X[i, :, :], indices.squeeze(1)))
+                y_[X_ptr] = float(cls_id)
+                X_ptr += 1
+        X_ = paddle.stack(X_, axis=0)
+        return X_, y_
+
+    def _contrastive(self, feats_, labels_):
+        """
+        Args:
+            feats_ (Tensor): sampled pixel, shape = [total_classes, n_view, feat_dim], total_classes = batch_size * single image classes
+            labels_ (Tensor): label, shape = [total_classes]
+        """
+        anchor_num, n_view = feats_.shape[0], feats_.shape[1]
+
+        labels_ = labels_.reshape((-1, 1))
+        mask = paddle.equal(labels_, paddle.transpose(labels_,
+                                                      [1, 0])).astype('float32')
+
+        contrast_count = n_view
+        contrast_feature = paddle.concat(paddle.unbind(feats_, axis=1), axis=0)
+
+        anchor_feature = contrast_feature
+        anchor_count = contrast_count
+
+        anchor_dot_contrast = paddle.matmul(
+            anchor_feature, paddle.transpose(contrast_feature,
+                                             [1, 0])) / self.temperature
+        logits_max = paddle.max(anchor_dot_contrast, axis=1, keepdim=True)
+        logits = anchor_dot_contrast - logits_max
+
+        mask = paddle.tile(mask, [anchor_count, contrast_count])
+        neg_mask = 1 - mask
+
+        logits_mask = 1 - paddle.eye(mask.shape[0]).astype('float32')
+        mask = mask * logits_mask
+
+        neg_logits = paddle.exp(logits) * neg_mask
+        neg_logits = neg_logits.sum(1, keepdim=True)
+
+        exp_logits = paddle.exp(logits)
+
+        log_prob = logits - paddle.log(exp_logits + neg_logits)
+
+        mean_log_prob_pos = (mask * log_prob).sum(1) / mask.sum(1)
+
+        loss = -(self.temperature / self.base_temperature) * mean_log_prob_pos
+        loss = loss.mean()
+
+        return loss
+
+    def contrast_criterion(self, feats, labels=None, predict=None):
+        labels = labels.unsqueeze(1)
+        labels = F.interpolate(labels, feats.shape[2:], mode='nearest')
+        labels = labels.squeeze(1)
+
+        batch_size = feats.shape[0]
+        labels = labels.reshape((batch_size, -1))
+        predict = predict.reshape((batch_size, -1))
+        feats = paddle.transpose(feats, [0, 2, 3, 1])
+        feats = feats.reshape((feats.shape[0], -1, feats.shape[-1]))
+
+        feats_, labels_ = self._hard_anchor_sampling(feats, labels, predict)
+
+        loss = self._contrastive(feats_, labels_)
+        return loss
+
+    def forward(self, preds, label):
+        assert "seg" in preds, "The input of PixelContrastCrossEntropyLoss should include 'seg' output, but not found."
+        assert "embed" in preds, "The input of PixelContrastCrossEntropyLoss should include 'embed' output, but not found."
+
+        seg = preds['seg']
+        embedding = preds['embed']
+
+        predict = paddle.argmax(seg, axis=1)
+        loss = self.contrast_criterion(embedding, label, predict)
+        return loss
--- a/paddlers/models/ppseg/models/losses/point_cross_entropy_loss.py
+++ b/paddlers/models/ppseg/models/losses/point_cross_entropy_loss.py
@ -0,0 +1,160 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddle import nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+
+@manager.LOSSES.add_component
+class PointCrossEntropyLoss(nn.Layer):
+    """
+    Implements the point cross entropy loss function.
+
+    The original article refers to
+    Kirillov A, Wu Y, He K, et al. "PointRend: Image Segmentation As Rendering."
+    (https://arxiv.org/abs/1912.08193).
+
+    Args:
+        weight (tuple|list|ndarray|Tensor, optional): A manual rescaling weight
+            given to each class. Its length must be equal to the number of classes.
+            Default ``None``.
+        ignore_index (int64, optional): Specifies a target value that is ignored
+            and does not contribute to the input gradient. Default ``255``.
+        top_k_percent_pixels (float, optional): the value lies in [0.0, 1.0]. When its value < 1.0, only compute the loss for
+            the top k percent pixels (e.g., the top 20% pixels). This is useful for hard pixel mining. Default ``1.0``.
+        data_format (str, optional): The tensor format to use, 'NCHW' or 'NHWC'. Default ``'NCHW'``.
+    """
+
+    def __init__(self,
+                 weight=None,
+                 ignore_index=255,
+                 top_k_percent_pixels=1.0,
+                 data_format='NCHW',
+                 align_corners = False):
+        super(PointCrossEntropyLoss, self).__init__()
+        if weight is not None:
+            weight = paddle.to_tensor(weight, dtype='float32')
+        self.weight = weight
+        self.ignore_index = ignore_index
+        self.top_k_percent_pixels = top_k_percent_pixels
+        self.EPS = 1e-8
+        self.data_format = data_format
+        self.align_corners = align_corners
+
+    def forward(self, logits, label, semantic_weights=None):
+        """
+        Forward computation.
+
+        Args:
+            logits (Tensor): Logit tensor, the data type is float32, float64. Shape is
+                (logit,points). logit'shape: [N, C, point_num]. logit'shape:[N, point_num, 2], where C is number of classes.
+            label (Tensor): Label tensor, the data type is int64. Shape is (N), where each
+                value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is
+                (N, D1, D2,..., Dk), k >= 1.
+            semantic_weights (Tensor, optional): Weights about loss for each pixels, shape is the same as label. Default: None.
+        """
+        # for loss
+        logit, points = logits # [N, C, point_num],[N, point_num, 2]
+        label = label.unsqueeze(1) # [N,1,H,W]
+        label = point_sample(
+            label.astype('float32'),
+            points,
+            mode='nearest',
+            align_corners=self.align_corners) # [N, 1, point_num]
+        label = paddle.squeeze(label,axis=1).astype('int64') # [N, xx]
+
+        channel_axis = 1 if self.data_format == 'NCHW' else -1
+        if self.weight is not None and logit.shape[channel_axis] != len(
+                self.weight):
+            raise ValueError(
+                'The number of weights = {} must be the same as the number of classes = {}.'
+                .format(len(self.weight), logit.shape[1]))
+
+        logit = paddle.transpose(logit, [0, 2, 1])
+        no_ignore_label = label
+        #no_ignore_label[label==self.ignore_index] = 0
+        loss = F.cross_entropy(
+            logit,
+            no_ignore_label,
+            ignore_index=self.ignore_index,
+            reduction='none')
+
+        mask = label != self.ignore_index
+        mask = paddle.cast(mask, 'float32')
+
+        loss = loss * mask
+        if semantic_weights is not None:
+            loss = loss * semantic_weights
+
+        if self.weight is not None:
+            _one_hot = F.one_hot(label, logit.shape[-1])
+            _one_hot_weight = _one_hot * self.weight
+            loss = loss * _one_hot_weight.argmax(-1)
+            coef = paddle.sum(_one_hot_weight, axis=-1)
+            #coef = paddle.ones_like(label)
+        else:
+            coef = paddle.ones_like(label)
+
+        label.stop_gradient = True
+        mask.stop_gradient = True
+        if self.top_k_percent_pixels == 1.0:
+            avg_loss = paddle.mean(loss) / (paddle.mean(mask * coef) + self.EPS)
+            return avg_loss
+
+        loss = loss.reshape((-1, ))
+        top_k_pixels = int(self.top_k_percent_pixels * loss.numel())
+        loss, indices = paddle.topk(loss, top_k_pixels)
+        coef = coef.reshape((-1, ))
+        coef = paddle.gather(coef, indices)
+        coef.stop_gradient = True
+
+        return loss.mean() / (paddle.mean(coef) + self.EPS)
+
+def point_sample(input, points, align_corners=False, **kwargs):
+    """A wrapper around :func:`grid_sample` to support 3D point_coords tensors
+    Unlike :func:`torch.nn.functional.grid_sample` it assumes point_coords to
+    lie inside ``[0, 1] x [0, 1]`` square.
+    Args:
+        input (Tensor): Feature map, shape (N, C, H, W).
+        points (Tensor): Image based absolute point coordinates (normalized),
+            range [0, 1] x [0, 1], shape (N, P, 2) or (N, Hgrid, Wgrid, 2).
+        align_corners (bool): Whether align_corners. Default: False
+    Returns:
+        Tensor: Features of `point` on `input`, shape (N, C, P) or
+            (N, C, Hgrid, Wgrid).
+    """
+
+    def denormalize(grid):
+        """Denormalize input grid from range [0, 1] to [-1, 1]
+        Args:
+            grid (Tensor): The grid to be denormalize, range [0, 1].
+        Returns:
+            Tensor: Denormalized grid, range [-1, 1].
+        """
+
+        return grid * 2.0 - 1.0
+
+    add_dim = False
+    if points.dim() == 3:
+        add_dim = True
+        points = paddle.unsqueeze(points,axis=2) # [2, 2048, 1, 2]
+    output = F.grid_sample(
+        input, denormalize(points), align_corners=align_corners, **kwargs)
+    if add_dim:
+        output = paddle.squeeze(output,axis=3)
+    return output
+
+
--- a/paddlers/models/ppseg/models/losses/rmi_loss.py
+++ b/paddlers/models/ppseg/models/losses/rmi_loss.py
@ -0,0 +1,256 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""rmi loss in PaddlePaddle"""
+import numpy
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+
+# Module-level numeric constants for the RMI loss. Only _CLIP_MIN, _POS_ALPHA
+# and _IS_SUM are read by RMILoss in this file; the others are not referenced here.
+_euler_num = 2.718281828  # Euler's number e
+_pi = 3.14159265
+_ln_2_pi = 1.837877  # ln(2 * pi)
+_CLIP_MIN = 1e-6  # offset added to the masked sigmoid probabilities
+_CLIP_MAX = 1.0
+_POS_ALPHA = 5e-4  # scale of the identity added to covariance matrices before inverse / Cholesky
+_IS_SUM = 1  # truthy: sum the per-class RMI values; falsy: average them
+
+
+@manager.LOSSES.add_component
+class RMILoss(nn.Layer):
+    """
+    Implements the Region Mutual Information(RMI) Loss（https://arxiv.org/abs/1910.12037） for Semantic Segmentation.
+    Unlike vanilla rmi loss which contains Cross Entropy Loss, we disband them and only
+    left the RMI-related parts.
+    The motivation is to allow for a more flexible combination of losses during training.
+    For example, by employing mixed loss to merge RMI Loss with Boostrap Cross Entropy Loss,
+    we can achieve the online mining of hard examples together with attention to region information.
+    Args:
+        weight (tuple|list|ndarray|Tensor, optional): A manual rescaling weight
+            given to each class. Its length must be equal to the number of classes.
+            Default ``None``.
+        ignore_index (int64, optional): Specifies a target value that is ignored
+            and does not contribute to the input gradient. Default ``255``.
+    """
+
+    def __init__(self,
+                 num_classes=19,
+                 rmi_radius=3,
+                 rmi_pool_way=0,
+                 rmi_pool_size=3,
+                 rmi_pool_stride=3,
+                 loss_weight_lambda=0.5,
+                 ignore_index=255):
+        super(RMILoss, self).__init__()
+
+        self.num_classes = num_classes
+        assert rmi_radius in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+        self.rmi_radius = rmi_radius
+        assert rmi_pool_way in [0, 1, 2, 3]
+        self.rmi_pool_way = rmi_pool_way
+        assert rmi_pool_size == rmi_pool_stride
+        self.rmi_pool_size = rmi_pool_size
+        self.rmi_pool_stride = rmi_pool_stride
+        self.weight_lambda = loss_weight_lambda
+        self.half_d = self.rmi_radius * self.rmi_radius
+        self.d = 2 * self.half_d
+        self.kernel_padding = self.rmi_pool_size // 2
+        self.ignore_index = ignore_index
+
+    def forward(self, logits_4D, labels_4D, do_rmi=True):
+        """
+        Forward computation.
+        Args:
+            logits (Tensor): Shape is [N, C, H, W], logits at each prediction (between -\infty and +\infty).
+            labels (Tensor): Shape is [N, H, W], ground truth labels (between 0 and C - 1).
+        """
+        logits_4D = paddle.cast(logits_4D, dtype='float32')
+        labels_4D = paddle.cast(labels_4D, dtype='float32')
+
+        loss = self.forward_sigmoid(logits_4D, labels_4D, do_rmi=do_rmi)
+        return loss
+
+    def forward_sigmoid(self, logits_4D, labels_4D, do_rmi=False):
+        """
+        Using the sigmiod operation both.
+        Args:
+                logits_4D   :   [N, C, H, W], dtype=float32
+                labels_4D   :   [N, H, W], dtype=long
+                do_rmi          :       bool
+        """
+        label_mask_3D = labels_4D != self.ignore_index
+        valid_onehot_labels_4D = paddle.cast(
+            F.one_hot(
+                paddle.cast(labels_4D, dtype='int64') * paddle.cast(
+                    label_mask_3D, dtype='int64'),
+                num_classes=self.num_classes),
+            dtype='float32')
+        # label_mask_flat = paddle.cast(
+        #     paddle.reshape(label_mask_3D, [-1]), dtype='float32')
+
+        valid_onehot_labels_4D = valid_onehot_labels_4D * paddle.unsqueeze(
+            label_mask_3D, axis=3)
+        valid_onehot_labels_4D.stop_gradient = True
+        probs_4D = F.sigmoid(logits_4D) * paddle.unsqueeze(
+            label_mask_3D, axis=1) + _CLIP_MIN
+
+        valid_onehot_labels_4D = paddle.transpose(valid_onehot_labels_4D,
+                                                  [0, 3, 1, 2])
+        valid_onehot_labels_4D.stop_gradient = True
+        rmi_loss = self.rmi_lower_bound(valid_onehot_labels_4D, probs_4D)
+
+        return rmi_loss
+
+    def inverse(self, x):
+        return paddle.inverse(x)
+
+    def rmi_lower_bound(self, labels_4D, probs_4D):
+        """
+        calculate the lower bound of the region mutual information.
+        Args:
+                labels_4D   :   [N, C, H, W], dtype=float32
+                probs_4D    :   [N, C, H, W], dtype=float32
+        """
+        assert labels_4D.shape == probs_4D.shape, print(
+            'shapes', labels_4D.shape, probs_4D.shape)
+
+        p, s = self.rmi_pool_size, self.rmi_pool_stride
+        if self.rmi_pool_stride > 1:
+            if self.rmi_pool_way == 0:
+                labels_4D = F.max_pool2d(
+                    labels_4D,
+                    kernel_size=p,
+                    stride=s,
+                    padding=self.kernel_padding)
+                probs_4D = F.max_pool2d(
+                    probs_4D,
+                    kernel_size=p,
+                    stride=s,
+                    padding=self.kernel_padding)
+            elif self.rmi_pool_way == 1:
+                labels_4D = F.avg_pool2d(
+                    labels_4D,
+                    kernel_size=p,
+                    stride=s,
+                    padding=self.kernel_padding)
+                probs_4D = F.avg_pool2d(
+                    probs_4D,
+                    kernel_size=p,
+                    stride=s,
+                    padding=self.kernel_padding)
+            elif self.rmi_pool_way == 2:
+                shape = labels_4D.shape
+                new_h, new_w = shape[2] // s, shape[3] // s
+                labels_4D = F.interpolate(
+                    labels_4D, size=(new_h, new_w), mode='nearest')
+                probs_4D = F.interpolate(
+                    probs_4D,
+                    size=(new_h, new_w),
+                    mode='bilinear',
+                    align_corners=True)
+            else:
+                raise NotImplementedError("Pool way of RMI is not defined!")
+
+        label_shape = labels_4D.shape
+        n, c = label_shape[0], label_shape[1]
+
+        la_vectors, pr_vectors = self.map_get_pairs(
+            labels_4D, probs_4D, radius=self.rmi_radius, is_combine=0)
+
+        la_vectors = paddle.reshape(la_vectors, [n, c, self.half_d, -1])
+        la_vectors = paddle.cast(la_vectors, dtype='float64')
+        la_vectors.stop_gradient = True
+
+        pr_vectors = paddle.reshape(pr_vectors, [n, c, self.half_d, -1])
+        pr_vectors = paddle.cast(pr_vectors, dtype='float64')
+
+        diag_matrix = paddle.unsqueeze(
+            paddle.unsqueeze(paddle.eye(self.half_d), axis=0), axis=0)
+        la_vectors = la_vectors - paddle.mean(la_vectors, axis=3, keepdim=True)
+
+        la_cov = paddle.matmul(la_vectors,
+                               paddle.transpose(la_vectors, [0, 1, 3, 2]))
+        pr_vectors = pr_vectors - paddle.mean(pr_vectors, axis=3, keepdim=True)
+        pr_cov = paddle.matmul(pr_vectors,
+                               paddle.transpose(pr_vectors, [0, 1, 3, 2]))
+
+        pr_cov_inv = self.inverse(
+            pr_cov + paddle.cast(diag_matrix, dtype='float64') * _POS_ALPHA)
+
+        la_pr_cov = paddle.matmul(la_vectors,
+                                  paddle.transpose(pr_vectors, [0, 1, 3, 2]))
+
+        appro_var = la_cov - paddle.matmul(
+            paddle.matmul(la_pr_cov, pr_cov_inv),
+            paddle.transpose(la_pr_cov, [0, 1, 3, 2]))
+
+        rmi_now = 0.5 * self.log_det_by_cholesky(
+            appro_var + paddle.cast(diag_matrix, dtype='float64') * _POS_ALPHA)
+
+        rmi_per_class = paddle.cast(
+            paddle.mean(
+                paddle.reshape(rmi_now, [-1, self.num_classes]), axis=0),
+            dtype='float32')
+        rmi_per_class = paddle.divide(rmi_per_class,
+                                      paddle.to_tensor(float(self.half_d)))
+
+        rmi_loss = paddle.sum(rmi_per_class) if _IS_SUM else paddle.mean(
+            rmi_per_class)
+
+        return rmi_loss
+
+    def log_det_by_cholesky(self, matrix):
+        """
+        Args:
+            matrix: matrix must be a positive define matrix.
+                    shape [N, C, D, D].
+        """
+
+        chol = paddle.cholesky(matrix)
+        diag = paddle.diagonal(chol, offset=0, axis1=-2, axis2=-1)
+        chol = paddle.log(diag + 1e-8)
+
+        return 2.0 * paddle.sum(chol, axis=-1)
+
+    def map_get_pairs(self, labels_4D, probs_4D, radius=3, is_combine=True):
+        """
+        Args:
+            labels_4D   :   labels, shape [N, C, H, W]
+            probs_4D    :   probabilities, shape [N, C, H, W]
+            radius      :   the square radius
+        Return:
+            tensor with shape [N, C, radius * radius, H - (radius - 1), W - (radius - 1)]
+        """
+
+        label_shape = labels_4D.shape
+        h, w = label_shape[2], label_shape[3]
+        new_h, new_w = h - (radius - 1), w - (radius - 1)
+        la_ns = []
+        pr_ns = []
+        for y in range(0, radius, 1):
+            for x in range(0, radius, 1):
+                la_now = labels_4D[:, :, y:y + new_h, x:x + new_w]
+                pr_now = probs_4D[:, :, y:y + new_h, x:x + new_w]
+                la_ns.append(la_now)
+                pr_ns.append(pr_now)
+
+        if is_combine:
+            pair_ns = la_ns + pr_ns
+            p_vectors = paddle.stack(pair_ns, axis=2)
+            return p_vectors
+        else:
+            la_vectors = paddle.stack(la_ns, axis=2)
+            pr_vectors = paddle.stack(pr_ns, axis=2)
+            return la_vectors, pr_vectors
--- a/paddlers/models/ppseg/models/losses/semantic_connectivity_loss.py
+++ b/paddlers/models/ppseg/models/losses/semantic_connectivity_loss.py
@ -0,0 +1,175 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import cv2
+import numpy as np
+import paddle
+from paddle import nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+
+
+@manager.LOSSES.add_component
+class SemanticConnectivityLoss(nn.Layer):
+    '''
+    SCL (Semantic Connectivity-aware Learning) framework, which introduces a SC Loss (Semantic Connectivity-aware Loss)
+    to improve the quality of segmentation results from the perspective of connectivity. Support multi-class segmentation.
+
+    The original article refers to
+        Lutao Chu, Yi Liu, Zewu Wu, Shiyu Tang, Guowei Chen, Yuying Hao, Juncai Peng, Zhiliang Yu, Zeyu Chen, Baohua Lai, Haoyi Xiong.
+        "PP-HumanSeg: Connectivity-Aware Portrait Segmentation with a Large-Scale Teleconferencing Video Dataset"
+        In WACV 2022 workshop
+        https://arxiv.org/abs/2112.07146
+
+    Running process:
+    Step 1. Connected Components Calculation
+    Step 2. Connected Components Matching and SC Loss Calculation
+    '''
+
+    def __init__(self, ignore_index=255, max_pred_num_conn=10, use_argmax=True):
+        '''
+        Args:
+            ignore_index (int): Specify a pixel value to be ignored in the annotated image and does not contribute to
+                the input gradient.When there are pixels that cannot be marked (or difficult to be marked) in the marked
+                image, they can be marked as a specific gray value. When calculating the loss value, the pixel corresponding
+                to the original image will not be used as the independent variable of the loss function. *Default:``255``*
+            max_pred_num_conn (int): Maximum number of predicted connected components. At the beginning of training,
+                there will be a large number of connected components, and the calculation is very time-consuming.
+                Therefore, it is necessary to limit the maximum number of predicted connected components,
+                and the rest will not participate in the calculation.
+            use_argmax (bool): Whether to use argmax for logits.
+        '''
+        super().__init__()
+        self.ignore_index = ignore_index
+        self.max_pred_num_conn = max_pred_num_conn
+        self.use_argmax = use_argmax
+
+    def forward(self, logits, labels):
+        '''
+        Compute the SC loss per image (averaged over the classes present in its
+        label) and return the mean over the batch.
+
+        Args:
+            logits (Tensor): [N, C, H, W]
+            labels (Tensor): [N, H, W]
+
+        Returns:
+            Tensor: the batch mean of the per-image SC loss.
+        '''
+        preds = paddle.argmax(logits, axis=1) if self.use_argmax else logits
+        # uint8 numpy copies are used for the OpenCV connected-component analysis.
+        preds_np = preds.astype('uint8').numpy()
+        labels_np = labels.astype('uint8').numpy()
+        preds = paddle.to_tensor(preds, 'float32', stop_gradient=False)
+        multi_class_sc_loss = paddle.zeros([preds.shape[0]])
+        zero = paddle.to_tensor([0.])  # for accelerating
+
+        # Traverse each image
+        for i in range(preds.shape[0]):
+            # sc_loss starts as a Python int and may become a Tensor once a
+            # tensor-valued term is added below.
+            sc_loss = 0
+            class_num = 0
+
+            pred_i = preds[i]
+            preds_np_i = preds_np[i]
+            labels_np_i = labels_np[i]
+
+            # Traverse each class that occurs in the label of this image
+            for class_ in np.unique(labels_np_i):
+                if class_ == self.ignore_index:
+                    continue
+                class_num += 1
+
+                # Connected Components Calculation
+                preds_np_class = preds_np_i == class_
+                labels_np_class = labels_np_i == class_
+                pred_num_conn, pred_conn = cv2.connectedComponents(
+                    preds_np_class.astype(np.uint8))  # pred_conn.shape = [H,W]
+                label_num_conn, label_conn = cv2.connectedComponents(
+                    labels_np_class.astype(np.uint8))
+
+                # Cap the number of predicted components, but only when the
+                # prediction has more than twice as many as the label.
+                # NOTE(review): pred_conn itself is not relabelled here, so it can
+                # still contain component ids >= the capped pred_num_conn.
+                if pred_num_conn > 2 * label_num_conn:
+                    pred_num_conn = min(pred_num_conn, self.max_pred_num_conn)
+                # The counts returned by OpenCV include the background component
+                # (id 0), hence the "- 1".
+                real_pred_num = pred_num_conn - 1
+                real_label_num = label_num_conn - 1
+
+                # Connected Components Matching and SC Loss Calculation
+                if real_label_num > 0 and real_pred_num > 0:
+                    img_connectivity = compute_class_connectiveity(
+                        pred_conn, label_conn, pred_num_conn, label_num_conn,
+                        pred_i, real_label_num, real_pred_num, zero)
+                    sc_loss += 1 - img_connectivity
+                elif real_label_num == 0 and real_pred_num == 0:
+                    # if no connected component, SC Loss = 0, so pass
+                    pass
+                else:
+                    # Components exist on only one side: the term is 1 plus the
+                    # fraction of pixels labelled as this class but not predicted as it.
+                    preds_class = pred_i == int(class_)
+                    not_preds_class = paddle.bitwise_not(preds_class)
+                    labels_class = paddle.to_tensor(labels_np_class)
+                    missed_detect = labels_class * not_preds_class
+                    missed_detect_area = paddle.sum(missed_detect).astype(
+                        'float32')
+                    sc_loss += missed_detect_area / missed_detect.numel() + 1
+
+            # Average over the classes seen in this image (0 when there were none).
+            multi_class_sc_loss[
+                i] = sc_loss / class_num if class_num != 0 else 0
+        multi_class_sc_loss = paddle.mean(multi_class_sc_loss)
+        return multi_class_sc_loss
+
+
+def compute_class_connectiveity(pred_conn, label_conn, pred_num_conn,
+                                label_num_conn, pred, real_label_num,
+                                real_pred_num, zero):
+
+    pred_conn = paddle.to_tensor(pred_conn)
+    label_conn = paddle.to_tensor(label_conn)
+    pred_conn = F.one_hot(pred_conn, pred_num_conn)
+    label_conn = F.one_hot(label_conn, label_num_conn)
+
+    ious = paddle.zeros((real_label_num, real_pred_num))
+    pair_conn_sum = paddle.to_tensor([0.], stop_gradient=False)
+
+    for i in range(1, label_num_conn):
+        label_i = label_conn[:, :, i]
+
+        pair_conn = paddle.to_tensor([0.], stop_gradient=False)
+        pair_conn_num = 0
+
+        for j in range(1, pred_num_conn):
+            pred_j_mask = pred_conn[:, :, j]
+            pred_j = pred_j_mask * pred
+
+            iou = compute_iou(pred_j, label_i, zero)
+            ious[i - 1, j - 1] = iou
+            if iou != 0:
+                pair_conn += iou
+                pair_conn_num += 1
+
+        if pair_conn_num != 0:
+            pair_conn_sum += pair_conn / pair_conn_num
+    lone_pred_num = 0
+
+    pred_sum = paddle.sum(ious, axis=0)
+    for m in range(0, real_pred_num):
+        if pred_sum[m] == 0:
+            lone_pred_num += 1
+    img_connectivity = pair_conn_sum / (real_label_num + lone_pred_num)
+    return img_connectivity
+
+
+def compute_iou(pred_i, label_i, zero):
+    intersect_area_i = paddle.sum(pred_i * label_i)
+    if paddle.equal(intersect_area_i, zero):
+        return 0
+
+    pred_area_i = paddle.sum(pred_i)
+    label_area_i = paddle.sum(label_i)
+    union_area_i = pred_area_i + label_area_i - intersect_area_i
+    if paddle.equal(union_area_i, zero):
+        return 1
+    else:
+        return intersect_area_i / union_area_i
--- a/paddlers/models/ppseg/models/losses/semantic_encode_cross_entropy_loss.py
+++ b/paddlers/models/ppseg/models/losses/semantic_encode_cross_entropy_loss.py
@ -0,0 +1,47 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+
+
+@manager.LOSSES.add_component
+class SECrossEntropyLoss(nn.Layer):
+    """
+    The Semantic Encoding Loss implementation based on PaddlePaddle.
+
+    """
+    def __init__(self, *args, **kwargs):
+        super(SECrossEntropyLoss, self).__init__()
+
+    def forward(self, logit, label):
+        if logit.ndim == 4:
+            logit = logit.squeeze(2).squeeze(3)
+        assert logit.ndim == 2, "The shape of logit should be [N, C, 1, 1] or [N, C], but the logit dim is  {}.".format(
+            logit.ndim)
+
+        batch_size, num_classes = paddle.shape(logit)
+        se_label = paddle.zeros([batch_size, num_classes])
+        for i in range(batch_size):
+            hist = paddle.histogram(label[i],
+                                    bins=num_classes,
+                                    min=0,
+                                    max=num_classes - 1)
+            hist = hist.astype('float32') / hist.sum().astype('float32')
+            se_label[i] = (hist > 0).astype('float32')
+        loss = F.binary_cross_entropy_with_logits(logit, se_label)
+        return loss
--- a/paddlers/models/ppseg/models/mla_transformer.py
+++ b/paddlers/models/ppseg/models/mla_transformer.py
@ -0,0 +1,241 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.models import layers
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.utils import utils
+
+
+class MLAHeads(nn.Layer):
+    def __init__(self, mlahead_channels=128):
+        super(MLAHeads, self).__init__()
+        self.head2 = nn.Sequential(
+            layers.ConvBNReLU(
+                mlahead_channels * 2,
+                mlahead_channels,
+                3,
+                padding=1,
+                bias_attr=False),
+            layers.ConvBNReLU(
+                mlahead_channels,
+                mlahead_channels,
+                3,
+                padding=1,
+                bias_attr=False))
+        self.head3 = nn.Sequential(
+            layers.ConvBNReLU(
+                mlahead_channels * 2,
+                mlahead_channels,
+                3,
+                padding=1,
+                bias_attr=False),
+            layers.ConvBNReLU(
+                mlahead_channels,
+                mlahead_channels,
+                3,
+                padding=1,
+                bias_attr=False))
+        self.head4 = nn.Sequential(
+            layers.ConvBNReLU(
+                mlahead_channels * 2,
+                mlahead_channels,
+                3,
+                padding=1,
+                bias_attr=False),
+            layers.ConvBNReLU(
+                mlahead_channels,
+                mlahead_channels,
+                3,
+                padding=1,
+                bias_attr=False))
+        self.head5 = nn.Sequential(
+            layers.ConvBNReLU(
+                mlahead_channels * 2,
+                mlahead_channels,
+                3,
+                padding=1,
+                bias_attr=False),
+            layers.ConvBNReLU(
+                mlahead_channels,
+                mlahead_channels,
+                3,
+                padding=1,
+                bias_attr=False))
+
+    def forward(self, mla_p2, mla_p3, mla_p4, mla_p5):
+        head2 = F.interpolate(
+            self.head2(mla_p2),
+            size=(4 * mla_p2.shape[3], 4 * mla_p2.shape[3]),
+            mode='bilinear',
+            align_corners=True)
+        head3 = F.interpolate(
+            self.head3(mla_p3),
+            size=(4 * mla_p3.shape[3], 4 * mla_p3.shape[3]),
+            mode='bilinear',
+            align_corners=True)
+        head4 = F.interpolate(
+            self.head4(mla_p4),
+            size=(4 * mla_p4.shape[3], 4 * mla_p4.shape[3]),
+            mode='bilinear',
+            align_corners=True)
+        head5 = F.interpolate(
+            self.head5(mla_p5),
+            size=(4 * mla_p5.shape[3], 4 * mla_p5.shape[3]),
+            mode='bilinear',
+            align_corners=True)
+
+        return paddle.concat([head2, head3, head4, head5], axis=1)
+
+
+@manager.MODELS.add_component
+class MLATransformer(nn.Layer):
+    """
+    Segmentation model that fuses four backbone feature levels with MLAHeads.
+
+    Args:
+        num_classes (int): Number of output channels of the classifier and of
+            the auxiliary head.
+        in_channels (list|tuple): Channel counts of the first four backbone
+            outputs; entries 0 to 3 are read.
+        backbone (nn.Layer): Backbone called as ``backbone(x)``; its result is
+            indexed from 0 to 3.
+        mlahead_channels (int, optional): Channels of each MLA head. Default: 128.
+        aux_channels (int, optional): Hidden channels of the auxiliary head.
+            Default: 256.
+        norm_layer (optional): Stored as ``self.BatchNorm``; it is not used
+            anywhere else in this class. Default: nn.BatchNorm2D.
+        pretrained (str, optional): Passed to ``utils.load_entire_model`` when
+            not None. Default: None.
+        **kwargs: Accepted and ignored.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 backbone,
+                 mlahead_channels=128,
+                 aux_channels=256,
+                 norm_layer=nn.BatchNorm2D,
+                 pretrained=None,
+                 **kwargs):
+        super(MLATransformer, self).__init__()
+
+        self.BatchNorm = norm_layer
+        self.mlahead_channels = mlahead_channels
+        self.num_classes = num_classes
+        self.in_channels = in_channels
+        self.backbone = backbone
+
+        self.mlahead = MLAHeads(mlahead_channels=self.mlahead_channels)
+        # The classifier consumes the concatenation of the four MLA heads.
+        self.cls = nn.Conv2D(
+            4 * self.mlahead_channels, self.num_classes, 3, padding=1)
+
+        # Per-level adapters. Level 0 doubles its channels, level 1 keeps
+        # them, level 2 halves them and level 3 quarters them.
+        # NOTE(review): forward() adds the adapted levels together and feeds
+        # them to MLAHeads, so the resulting channel counts presumably all
+        # have to equal 2 * mlahead_channels -- confirm against the configs.
+        self.conv0 = layers.ConvBNReLU(
+            self.in_channels[0],
+            self.in_channels[0] * 2,
+            3,
+            padding=1,
+            bias_attr=False)
+        self.conv1 = layers.ConvBNReLU(
+            self.in_channels[1],
+            self.in_channels[1],
+            3,
+            padding=1,
+            bias_attr=False)
+        self.conv21 = layers.ConvBNReLU(
+            self.in_channels[2],
+            self.in_channels[2],
+            3,
+            padding=1,
+            bias_attr=False)
+        self.conv22 = layers.ConvBNReLU(
+            self.in_channels[2],
+            self.in_channels[2] // 2,
+            3,
+            padding=1,
+            bias_attr=False)
+        self.conv31 = layers.ConvBNReLU(
+            self.in_channels[3],
+            self.in_channels[3],
+            3,
+            padding=1,
+            bias_attr=False)
+        self.conv32 = layers.ConvBNReLU(
+            self.in_channels[3],
+            self.in_channels[3] // 2,
+            3,
+            padding=1,
+            bias_attr=False)
+        self.conv33 = layers.ConvBNReLU(
+            self.in_channels[3] // 2,
+            self.in_channels[3] // 4,
+            3,
+            padding=1,
+            bias_attr=False)
+
+        # Auxiliary classifier applied to the raw level-2 backbone feature.
+        self.aux_head = nn.Sequential(
+            layers.ConvBN(
+                in_channels=self.in_channels[2],
+                out_channels=aux_channels,
+                kernel_size=3,
+                padding=1,
+                bias_attr=False),
+            nn.Conv2D(
+                in_channels=aux_channels,
+                out_channels=self.num_classes,
+                kernel_size=1,
+            ))
+
+        self.pretrained = pretrained
+        self.init_weight()
+
+    def init_weight(self):
+        """Load the whole model from `self.pretrained` when it is set."""
+        if self.pretrained is not None:
+            utils.load_entire_model(self, self.pretrained)
+
+    def forward(self, x):
+        """
+        Run the model.
+
+        Args:
+            x (Tensor): Input batch; axes 2 and onwards give the output size.
+
+        Returns:
+            list[Tensor]: The main logit, followed by the auxiliary logit in
+                training mode, each resized to the spatial size of `x`.
+        """
+        inputs = self.backbone(x)
+
+        # Bring every level to the spatial size of level 0. Levels 2 and 3 are
+        # first upsampled by 2 (once and twice respectively) with a
+        # convolution after each step.
+        inputs0 = self.conv0(inputs[0])
+        inputs1 = F.interpolate(
+            self.conv1(inputs[1]),
+            size=inputs[0].shape[2:],
+            mode='bilinear',
+            align_corners=True)
+        inputs2 = F.interpolate(
+            self.conv21(inputs[2]),
+            scale_factor=2,
+            mode='bilinear',
+            align_corners=True)
+        inputs2 = F.interpolate(
+            self.conv22(inputs2),
+            size=inputs[0].shape[2:],
+            mode='bilinear',
+            align_corners=True)
+        inputs3 = F.interpolate(
+            self.conv31(inputs[3]),
+            scale_factor=2,
+            mode='bilinear',
+            align_corners=True)
+        inputs3 = F.interpolate(
+            self.conv32(inputs3),
+            scale_factor=2,
+            mode='bilinear',
+            align_corners=True)
+        inputs3 = F.interpolate(
+            self.conv33(inputs3),
+            size=inputs[0].shape[2:],
+            mode='bilinear',
+            align_corners=True)
+        # Cumulative fusion: each level receives the sum of all later levels.
+        inputs2 = inputs2 + inputs3
+        inputs1 = inputs1 + inputs2
+        inputs0 = inputs0 + inputs1
+
+        feats = self.mlahead(inputs0, inputs1, inputs2, inputs3)
+        logit = self.cls(feats)
+        logit_list = [logit]
+
+        # The auxiliary output is only produced while training.
+        if self.training:
+            logit_list.append(self.aux_head(inputs[2]))
+
+        # Resize every output to the input resolution.
+        logit_list = [
+            F.interpolate(
+                logit, paddle.shape(x)[2:], mode='bilinear', align_corners=True)
+            for logit in logit_list
+        ]
+        return logit_list
--- a/paddlers/models/ppseg/models/ocrnet.py
+++ b/paddlers/models/ppseg/models/ocrnet.py
@ -0,0 +1,246 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg import utils
+from paddlers.models.ppseg.cvlibs import manager, param_init
+from paddlers.models.ppseg.models import layers
+
+
+@manager.MODELS.add_component
+class OCRNet(nn.Layer):
+    """
+    The OCRNet implementation based on PaddlePaddle.
+    The original article refers to
+        Yuan, Yuhui, et al. "Object-Contextual Representations for Semantic Segmentation"
+        (https://arxiv.org/pdf/1909.11065.pdf)
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone (Paddle.nn.Layer): Backbone network.
+        backbone_indices (tuple): A tuple indicates the indices of output of backbone.
+            It can be either one or two values, if two values, the first index will be taken as
+            a deep-supervision feature in auxiliary layer; the second one will be taken as
+            input of pixel representation. If one value, it is taken by both above.
+        ocr_mid_channels (int, optional): The number of middle channels in OCRHead. Default: 512.
+        ocr_key_channels (int, optional): The number of key channels in ObjectAttentionBlock. Default: 256.
+        align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
+            is even, e.g. 1024x512, otherwise it is True, e.g. 769x769.  Default: False.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 backbone,
+                 backbone_indices,
+                 ocr_mid_channels=512,
+                 ocr_key_channels=256,
+                 align_corners=False,
+                 pretrained=None):
+        super().__init__()
+
+        self.backbone = backbone
+        self.backbone_indices = backbone_indices
+        in_channels = [self.backbone.feat_channels[i] for i in backbone_indices]
+
+        self.head = OCRHead(
+            num_classes=num_classes,
+            in_channels=in_channels,
+            ocr_mid_channels=ocr_mid_channels,
+            ocr_key_channels=ocr_key_channels)
+
+        self.align_corners = align_corners
+        self.pretrained = pretrained
+        self.init_weight()
+
+    def forward(self, x):
+        feats = self.backbone(x)
+        feats = [feats[i] for i in self.backbone_indices]
+        logit_list = self.head(feats)
+        if not self.training:
+            logit_list = [logit_list[0]]
+
+        logit_list = [
+            F.interpolate(
+                logit,
+                paddle.shape(x)[2:],
+                mode='bilinear',
+                align_corners=self.align_corners) for logit in logit_list
+        ]
+        return logit_list
+
+    def init_weight(self):
+        if self.pretrained is not None:
+            utils.load_entire_model(self, self.pretrained)
+
+
+class OCRHead(nn.Layer):
+    """
+    The Object contextual representation head.
+
+    Args:
+        num_classes(int): The unique number of target classes.
+        in_channels(tuple): The number of input channels.
+        ocr_mid_channels(int, optional): The number of middle channels in OCRHead. Default: 512.
+        ocr_key_channels(int, optional): The number of key channels in ObjectAttentionBlock. Default: 256.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 ocr_mid_channels=512,
+                 ocr_key_channels=256):
+        super().__init__()
+
+        self.num_classes = num_classes
+        self.spatial_gather = SpatialGatherBlock(ocr_mid_channels, num_classes)
+        self.spatial_ocr = SpatialOCRModule(ocr_mid_channels, ocr_key_channels,
+                                            ocr_mid_channels)
+
+        self.indices = [-2, -1] if len(in_channels) > 1 else [-1, -1]
+
+        self.conv3x3_ocr = layers.ConvBNReLU(
+            in_channels[self.indices[1]], ocr_mid_channels, 3, padding=1)
+        self.cls_head = nn.Conv2D(ocr_mid_channels, self.num_classes, 1)
+        self.aux_head = nn.Sequential(
+            layers.ConvBNReLU(in_channels[self.indices[0]],
+                              in_channels[self.indices[0]], 1),
+            nn.Conv2D(in_channels[self.indices[0]], self.num_classes, 1))
+
+        self.init_weight()
+
+    def forward(self, feat_list):
+        feat_shallow, feat_deep = feat_list[self.indices[0]], feat_list[
+            self.indices[1]]
+
+        soft_regions = self.aux_head(feat_shallow)
+        pixels = self.conv3x3_ocr(feat_deep)
+
+        object_regions = self.spatial_gather(pixels, soft_regions)
+        ocr = self.spatial_ocr(pixels, object_regions)
+
+        logit = self.cls_head(ocr)
+        return [logit, soft_regions]
+
+    def init_weight(self):
+        """Initialize the parameters of model parts."""
+        for sublayer in self.sublayers():
+            if isinstance(sublayer, nn.Conv2D):
+                param_init.normal_init(sublayer.weight, std=0.001)
+            elif isinstance(sublayer, (nn.BatchNorm, nn.SyncBatchNorm)):
+                param_init.constant_init(sublayer.weight, value=1.0)
+                param_init.constant_init(sublayer.bias, value=0.0)
+
+
+class SpatialGatherBlock(nn.Layer):
+    """Aggregation layer to compute the pixel-region representation."""
+
+    def __init__(self, pixels_channels, regions_channels):
+        super().__init__()
+        self.pixels_channels = pixels_channels
+        self.regions_channels = regions_channels
+
+    def forward(self, pixels, regions):
+        # pixels: from (n, c, h, w) to (n, h*w, c)
+        pixels = paddle.reshape(pixels, (0, self.pixels_channels, -1))
+        pixels = paddle.transpose(pixels, (0, 2, 1))
+
+        # regions: from (n, k, h, w) to (n, k, h*w)
+        regions = paddle.reshape(regions, (0, self.regions_channels, -1))
+        regions = F.softmax(regions, axis=2)
+
+        # feats: from (n, k, c) to (n, c, k, 1)
+        feats = paddle.bmm(regions, pixels)
+        feats = paddle.transpose(feats, (0, 2, 1))
+        feats = paddle.unsqueeze(feats, axis=-1)
+
+        return feats
+
+
+class SpatialOCRModule(nn.Layer):
+    """Aggregate the global object representation to update the representation for each pixel."""
+
+    def __init__(self,
+                 in_channels,
+                 key_channels,
+                 out_channels,
+                 dropout_rate=0.1):
+        super().__init__()
+
+        self.attention_block = ObjectAttentionBlock(in_channels, key_channels)
+        self.conv1x1 = nn.Sequential(
+            layers.ConvBNReLU(2 * in_channels, out_channels, 1),
+            nn.Dropout2D(dropout_rate))
+
+    def forward(self, pixels, regions):
+        context = self.attention_block(pixels, regions)
+        feats = paddle.concat([context, pixels], axis=1)
+        feats = self.conv1x1(feats)
+
+        return feats
+
+
+class ObjectAttentionBlock(nn.Layer):
+    """A self-attention module."""
+
+    def __init__(self, in_channels, key_channels):
+        super().__init__()
+
+        self.in_channels = in_channels
+        self.key_channels = key_channels
+
+        self.f_pixel = nn.Sequential(
+            layers.ConvBNReLU(in_channels, key_channels, 1),
+            layers.ConvBNReLU(key_channels, key_channels, 1))
+
+        self.f_object = nn.Sequential(
+            layers.ConvBNReLU(in_channels, key_channels, 1),
+            layers.ConvBNReLU(key_channels, key_channels, 1))
+
+        self.f_down = layers.ConvBNReLU(in_channels, key_channels, 1)
+
+        self.f_up = layers.ConvBNReLU(key_channels, in_channels, 1)
+
+    def forward(self, x, proxy):
+        x_shape = paddle.shape(x)
+        # query : from (n, c1, h1, w1) to (n, h1*w1, key_channels)
+        query = self.f_pixel(x)
+        query = paddle.reshape(query, (0, self.key_channels, -1))
+        query = paddle.transpose(query, (0, 2, 1))
+
+        # key : from (n, c2, h2, w2) to (n, key_channels, h2*w2)
+        key = self.f_object(proxy)
+        key = paddle.reshape(key, (0, self.key_channels, -1))
+
+        # value : from (n, c2, h2, w2) to (n, h2*w2, key_channels)
+        value = self.f_down(proxy)
+        value = paddle.reshape(value, (0, self.key_channels, -1))
+        value = paddle.transpose(value, (0, 2, 1))
+
+        # sim_map (n, h1*w1, h2*w2)
+        sim_map = paddle.bmm(query, key)
+        sim_map = (self.key_channels**-.5) * sim_map
+        sim_map = F.softmax(sim_map, axis=-1)
+
+        # context from (n, h1*w1, key_channels) to (n , out_channels, h1, w1)
+        context = paddle.bmm(sim_map, value)
+        context = paddle.transpose(context, (0, 2, 1))
+        context = paddle.reshape(context,
+                                 (0, self.key_channels, x_shape[2], x_shape[3]))
+        context = self.f_up(context)
+
+        return context
--- a/paddlers/models/ppseg/models/pfpnnet.py
+++ b/paddlers/models/ppseg/models/pfpnnet.py
@ -0,0 +1,201 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.models import layers
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.utils import utils
+
+
+@manager.MODELS.add_component
+class PFPNNet(nn.Layer):
+    """
+    The Panoptic Feature Pyramid Networks implementation based on PaddlePaddle.
+
+    The original article refers to
+    Alexander Kirillov, Ross Girshick, Kaiming He, Piotr Dollár, et al. "Panoptic Feature Pyramid Networks"
+    (https://arxiv.org/abs/1901.02446)
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone (Paddle.nn.Layer): Backbone network, currently support Resnet50/101.
+        backbone_indices (tuple): Four values in the tuple indicate the indices of output of backbone.
+        enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: False.
+        align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
+            e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+    """
+    def __init__(self,
+                 num_classes,
+                 backbone,
+                 backbone_indices,
+                 channels,
+                 enable_auxiliary_loss=False,
+                 align_corners=False,
+                 dropout_ratio=0.1,
+                 fpn_inplanes=[256, 512, 1024, 2048],
+                 pretrained=None):
+        super(PFPNNet, self).__init__()
+        self.backbone = backbone
+        self.backbone_indices = backbone_indices
+        self.in_channels = [
+            self.backbone.feat_channels[i] for i in backbone_indices
+        ]
+        self.align_corners = align_corners
+        self.pretrained = pretrained
+        self.enable_auxiliary_loss = enable_auxiliary_loss
+
+        self.head = PFPNHead(num_class=num_classes,
+                             fpn_inplanes=fpn_inplanes,
+                             dropout_ratio=dropout_ratio,
+                             channels=channels,
+                             fpn_dim=channels,
+                             enable_auxiliary_loss=self.enable_auxiliary_loss)
+        self.init_weight()
+
+    def forward(self, x):
+        feats = self.backbone(x)
+        feats = [feats[i] for i in self.backbone_indices]
+        logit_list = self.head(feats)
+        return [
+            F.interpolate(logit,
+                          paddle.shape(x)[2:],
+                          mode='bilinear',
+                          align_corners=self.align_corners)
+            for logit in logit_list
+        ]
+
+    def init_weight(self):
+        if self.pretrained is not None:
+            utils.load_entire_model(self, self.pretrained)
+
+
+class PFPNHead(nn.Layer):
+    """
+    The PFPNHead implementation.
+
+    Args:
+        num_class (int): The unique number of target classes.
+        fpn_inplanes (list): The feature channels from backbone.
+        channels (int): Output channels of the scale heads and input channels
+            of the final classifier.
+        dropout_ratio (float, optional): Ratio of the dropout applied before
+            the classifier; a falsy value disables dropout. Default: 0.1.
+        fpn_dim (int, optional): The input channels of FPN module. Default: 256.
+        enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: False.
+        align_corners (bool, optional): Used by the upsampling layers of the
+            scale heads and when the scale head outputs are merged.
+            Default: False.
+    """
+    def __init__(self,
+                 num_class,
+                 fpn_inplanes,
+                 channels,
+                 dropout_ratio=0.1,
+                 fpn_dim=256,
+                 enable_auxiliary_loss=False,
+                 align_corners=False):
+        super(PFPNHead, self).__init__()
+        self.enable_auxiliary_loss = enable_auxiliary_loss
+        self.align_corners = align_corners
+        self.lateral_convs = nn.LayerList()
+        self.fpn_out = nn.LayerList()
+
+        # One lateral 1x1 projection and one 3x3 output convolution per level.
+        for fpn_inplane in fpn_inplanes:
+            self.lateral_convs.append(
+                nn.Sequential(nn.Conv2D(fpn_inplane, fpn_dim, 1),
+                              layers.SyncBatchNorm(fpn_dim), nn.ReLU()))
+            self.fpn_out.append(
+                nn.Sequential(
+                    layers.ConvBNReLU(fpn_dim, fpn_dim, 3, bias_attr=False)))
+
+        self.scale_heads = nn.LayerList()
+        for index in range(len(fpn_inplanes)):
+            # Number of conv(+upsample) stages: the log2 ratio between this
+            # level's channel count and the first level's, at least 1.
+            head_length = max(
+                1, int(np.log2(fpn_inplanes[index]) - np.log2(fpn_inplanes[0])))
+            scale_head = nn.LayerList()
+            for head_index in range(head_length):
+                # NOTE(review): every stage takes fpn_dim input channels, so
+                # stacking more than one stage presumably requires
+                # channels == fpn_dim -- confirm.
+                scale_head.append(
+                    layers.ConvBNReLU(
+                        fpn_dim,
+                        channels,
+                        3,
+                        padding=1,
+                    ))
+                # Levels other than the first are upsampled 2x per stage.
+                if fpn_inplanes[index] != fpn_inplanes[0]:
+                    scale_head.append(
+                        nn.Upsample(scale_factor=2,
+                                    mode='bilinear',
+                                    align_corners=align_corners))
+            self.scale_heads.append(nn.Sequential(*scale_head))
+
+        # The auxiliary head (dsn) reads the third input feature; it includes
+        # a dropout layer only when dropout is enabled.
+        if dropout_ratio:
+            self.dropout = nn.Dropout2D(dropout_ratio)
+            if self.enable_auxiliary_loss:
+                self.dsn = nn.Sequential(
+                    layers.ConvBNReLU(fpn_inplanes[2],
+                                      fpn_inplanes[2],
+                                      3,
+                                      padding=1), nn.Dropout2D(dropout_ratio),
+                    nn.Conv2D(fpn_inplanes[2], num_class, kernel_size=1))
+        else:
+            self.dropout = None
+            if self.enable_auxiliary_loss:
+                self.dsn = nn.Sequential(
+                    layers.ConvBNReLU(fpn_inplanes[2],
+                                      fpn_inplanes[2],
+                                      3,
+                                      padding=1),
+                    nn.Conv2D(fpn_inplanes[2], num_class, kernel_size=1))
+
+        # NOTE(review): conv_last is created here but not called by any
+        # method of this class.
+        self.conv_last = nn.Sequential(
+            layers.ConvBNReLU(len(fpn_inplanes) * fpn_dim,
+                              fpn_dim,
+                              3,
+                              bias_attr=False),
+            nn.Conv2D(fpn_dim, num_class, kernel_size=1))
+        self.conv_seg = nn.Conv2D(channels, num_class, kernel_size=1)
+
+    def cls_seg(self, feat):
+        """Apply the optional dropout and the 1x1 classifier to `feat`."""
+        if self.dropout is not None:
+            feat = self.dropout(feat)
+        output = self.conv_seg(feat)
+        return output
+
+    def forward(self, conv_out):
+        """
+        Compute the segmentation logits from the backbone features.
+
+        Args:
+            conv_out (list[Tensor]): Backbone features, one per entry of
+                `fpn_inplanes`.
+
+        Returns:
+            list[Tensor]: ``[logit]``, or ``[logit, aux_logit]`` when the
+                auxiliary loss is enabled.
+        """
+        # Top-down pathway, starting from the last input feature.
+        last_out = self.lateral_convs[-1](conv_out[-1])
+        f = last_out
+        fpn_feature_list = [last_out]
+        for i in reversed(range(len(conv_out) - 1)):
+            conv_x = conv_out[i]
+            conv_x = self.lateral_convs[i](conv_x)
+            prev_shape = paddle.shape(conv_x)[2:]
+            # NOTE(review): align_corners is hard-coded to True here instead
+            # of using self.align_corners -- confirm this is intended.
+            f = conv_x + F.interpolate(
+                f, prev_shape, mode='bilinear', align_corners=True)
+            fpn_feature_list.append(self.fpn_out[i](f))
+
+        # fpn_feature_list starts with the last input level and ends with the
+        # first one, whose size is the target size of the merge.
+        output_size = paddle.shape(fpn_feature_list[-1])[2:]
+
+        x = self.scale_heads[0](fpn_feature_list[-1])
+        # NOTE(review): this loop visits indices len(scale_heads) - 2 down to
+        # 1, so the last scale head and fpn_feature_list[0] are never used.
+        # scale_heads follows the order of fpn_inplanes while fpn_feature_list
+        # is in reversed order, yet both are indexed with the same value --
+        # confirm against the reference implementation before changing.
+        for index in range(len(self.scale_heads) - 2, 0, -1):
+            x = x + F.interpolate(self.scale_heads[index](
+                fpn_feature_list[index]),
+                                  size=output_size,
+                                  mode='bilinear',
+                                  align_corners=self.align_corners)
+        x = self.cls_seg(x)
+        if self.enable_auxiliary_loss:
+            dsn = self.dsn(conv_out[2])
+            return [x, dsn]
+        else:
+            return [x]
--- a/paddlers/models/ppseg/models/pointrend.py
+++ b/paddlers/models/ppseg/models/pointrend.py
@ -0,0 +1,832 @@
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager
+from paddlers.models.ppseg.models import layers
+from paddlers.models.ppseg.utils import utils
+
+
+@manager.MODELS.add_component
+class PointRend(nn.Layer):
+    """
+    The SemanticFPN-PointRend implementation based on PaddlePaddle.
+
+    The original article refers to
+    Kirillov A, Wu Y, He K, et al. "PointRend: Image Segmentation As Rendering."
+    (https://arxiv.org/abs/1912.08193).
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone (Paddle.nn.Layer): Backbone network, currently support Resnet50/101.
+        backbone_indices (tuple, optional): Four values in the tuple indicate the indices of output of backbone.
+        fpn_inplanes (list, optional): Input channels list(the feature channels from backbone) for lateral_conv constraction in FPN. Default: [256, 512, 1024, 2048].
+        fpn_outplanes (int, optional): The output channels in FPN. Default: 256.
+        point_num_fcs (int, optional): Number of fc layers in the head in PointHead. Default: 3.
+        point_in_channels (list, optional): input channels of fc block in PointHead. Default: [256].
+        point_out_channels (int, optional): Fc block's output channels in PointHead. Default: 256.
+        point_in_index (list, optional): The indexs of input features to use in PointHead. Default: [0].
+        point_num_points (int, optional): The number of point in training mode in PointHead. Default: 2048.
+        point_oversample_ratio (int, optional): The sample ratio of points when in training mode in PointHead.
+            sampled_point = num_points * oversample_ratio. Default: 3.
+        point_importance_sample_ratio (float, optional): The importance sample ratio for compute num_uncertain_points in PointHead. Default: 0.75.
+        point_scale_factor(int, optinal): The scale factor of F.interpolate in refine seg logits stage when in inference in PointHead. Default: 2.
+        point_subdivision_steps(int, optional): Then refine steps in refine seg logits stage when in inference in PointHead. Default: 2.
+        point_subdivision_num_points(int, optional): The points number for refine seg logits when in inference in PointHead. Default: 8196.
+        point_dropout_ratio(float, optional): If the dropout_ratio >0, to use Dropout before output and the p of dropout is dropout_ratio in PointHead. Default: 0.1.
+        point_coarse_pred_each_layer(bool, optional): Whether concatenate coarse feature with
+            the output of each fc layer in PointHead. Default: True.
+        point_conv_cfg(str): The config of Conv in PointHead. Default: 'Conv1D'.
+        point_input_transform(str): The features transform method of inputs in PointHead.
+            it can be found in function '_transform_inputs'. Defalut: 'multiple_select'.
+        PFN_feature_strides(list): The strides for input feature maps and all strides suppose to be power of 2 in FPNHead. The first
+            one is of largest resolution. Default: [4, 8, 16, 32].
+        PFN_in_channels(list): The input feature's channels list in FPNHead. Default: [256, 256, 256, 256].
+        PFN_channels(int,optional): The output channels of scale_head's Conv before Upsample block in FPNHead. Default: 128.
+        PFN_in_index(list): The indexs of input features to use. it's shape should keep with in_channels in FPNHead. Default: [0, 1, 2, 3].
+        PFN_dropout_ratio(float,optional): If the dropout_ratio >0, to use Dropout before output and the p of dropout is dropout_ratio in FPNHead. Default: 0.1.
+        PFN_conv_cfg(str): The config of Conv. Default: 'Conv2D'.
+        PFN_input_transform(str): The features transform method of inputs. it can be found in function '_transform_inputs' in FPNHead. Defalut: 'multiple_select'.
+        align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
+            e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+    """
+
+    def __init__(
+            self,
+            num_classes,
+            backbone,
+            backbone_indices,
+            fpn_inplanes=[256, 512, 1024, 2048],
+            fpn_outplanes=256,
+            point_in_channels=[256],
+            point_out_channels=256,
+            point_in_index=[0],
+            point_num_fcs=3,
+            point_num_points=2048,
+            point_oversample_ratio=3,
+            point_importance_sample_ratio=0.75,
+            point_scale_factor=2,
+            point_subdivision_steps=2,
+            point_subdivision_num_points=8196,
+            point_dropout_ratio=0,
+            point_coarse_pred_each_layer=True,
+            point_input_transform='multiple_select',  # resize_concat
+            point_conv_cfg='Conv1D',
+            PFN_feature_strides=[4, 8, 16, 32],
+            PFN_in_channels=[256, 256, 256, 256],
+            PFN_channels=128,
+            PFN_in_index=[0, 1, 2, 3],
+            PFN_dropout_ratio=0,
+            PFN_conv_cfg='Conv2D',
+            PFN_input_transform='multiple_select',
+            align_corners=False,
+            pretrained=None):
+        super(PointRend, self).__init__()
+        self.backbone = backbone
+        self.backbone_indices = backbone_indices
+        self.in_channels = [
+            self.backbone.feat_channels[i] for i in backbone_indices
+        ]
+
+        self.neck = FPNNeck(
+            fpn_inplanes=fpn_inplanes, fpn_outplanes=fpn_outplanes)
+        self.pointhead = PointHead(
+            in_channels=point_in_channels,
+            out_channels=point_out_channels,
+            num_classes=num_classes,
+            in_index=point_in_index,
+            num_fcs=point_num_fcs,
+            num_points=point_num_points,
+            oversample_ratio=point_oversample_ratio,
+            importance_sample_ratio=point_importance_sample_ratio,
+            scale_factor=point_scale_factor,
+            subdivision_steps=point_subdivision_steps,
+            subdivision_num_points=point_subdivision_num_points,
+            dropout_ratio=point_dropout_ratio,
+            align_corners=align_corners,
+            coarse_pred_each_layer=point_coarse_pred_each_layer,
+            input_transform=point_input_transform,  # resize_concat
+            conv_cfg=point_conv_cfg)
+        self.fpnhead = FPNHead(
+            feature_strides=PFN_feature_strides,
+            in_channels=PFN_in_channels,
+            channels=PFN_channels,
+            num_class=num_classes,
+            in_index=PFN_in_index,
+            dropout_ratio=PFN_dropout_ratio,
+            conv_cfg=PFN_conv_cfg,
+            input_transform=PFN_input_transform,
+            align_corners=align_corners)
+
+        self.align_corners = align_corners
+        self.pretrained = pretrained
+        self.init_weight()
+
+    def forward(self, x):
+        feats = self.backbone(x)
+        feats = [feats[i] for i in self.backbone_indices]
+        fpn_feats = self.neck(feats)  # [n,256,64,128]*3 & [n,256,128,256]
+        pfn_logits = self.fpnhead(
+            fpn_feats
+        )  # segmainoutput decode_head[0] 512*1024->[n, 19, 64, 128]
+        point_logits = self.pointhead(
+            fpn_feats, pfn_logits)  # segpointoutput decode_head[1]
+
+        if self.training:
+            logit_list = [
+                F.interpolate(
+                    logit,
+                    paddle.shape(x)[2:],
+                    mode='bilinear',
+                    align_corners=self.align_corners) for logit in pfn_logits
+            ]
+            logit_list.append(point_logits)
+        else:
+            logit_list = [
+                F.interpolate(
+                    logit,
+                    paddle.shape(x)[2:],
+                    mode='bilinear',
+                    align_corners=self.align_corners) for logit in point_logits
+            ]
+        return logit_list
+
+    def init_weight(self):
+        if self.pretrained is not None:
+            utils.load_entire_model(self, self.pretrained)
+
+
+class PointHead(nn.Layer):
+    """
+    The PointHead implementation based on PaddlePaddle.
+
+    PointHead use shared multi-layer perceptron (equivalent to
+    nn.Conv1D) to predict the logit of input points. The fine-grained feature
+    and coarse feature will be concatenate together for predication.
+
+    The original article refers to:
+    Kirillov A , Wu Y , He K , et al "PointRend: Image Segmentation As Rendering."
+    (https://arxiv.org/abs/1912.08193)
+
+    Args:
+        num_classes (int): Number of classes for logits. Default: 19.
+        num_fcs (int, optional): Number of fc layers in the head. Default: 3.
+        in_channels (list): input channels of fc block. Default: [256].
+        out_channels (int, optional): Fc block's output channels. Default: 256.
+        in_index (list): The indexs of input features to use. Default: [0].
+        num_points (int, optional): The number of point in training mode. Default: 2048.
+        oversample_ratio (int, optional): The sample ratio of points when in training mode.
+            sampled_point = num_points * oversample_ratio. Default: 3.
+        importance_sample_ratio(float, optional): The importance sample ratio for compute num_uncertain_points. Default: 0.75.
+        scale_factor(int, optional): The scale factor of F.interpolate in refine seg logits stage when in inference. Default: 2.
+        subdivision_steps(int, optional): Then refine steps in refine seg logits stage when in inference. Default: 2.
+        subdivision_num_points(int, optional): The points number for refine seg logits when in inference. Default: 8196.
+        dropout_ratio(float, optional): If the dropout_ratio >0, to use Dropout before output and the p of dropout is dropout_ratio. Default: 0.1.
+        coarse_pred_each_layer(bool, optional): Whether concatenate coarse feature with
+            the output of each fc layer. Default: True.
+        conv_cfg(str): The config of Conv. Default: 'Conv1D'.
+        input_transform(str): The features transform method of inputs.
+            it can be found in function '_transform_inputs'. Defalut: 'multiple_select'.
+        align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
+            e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
+    """
+
+    def __init__(
+            self,
+            num_classes=19,
+            num_fcs=3,
+            in_channels=[256],
+            out_channels=256,
+            in_index=[0],
+            num_points=2048,
+            oversample_ratio=3,
+            importance_sample_ratio=0.75,
+            scale_factor=2,
+            subdivision_steps=2,
+            subdivision_num_points=8196,
+            dropout_ratio=0.1,
+            coarse_pred_each_layer=True,
+            conv_cfg='Conv1D',
+            input_transform='multiple_select',  # resize_concat
+            align_corners=False):
+        super(PointHead, self).__init__()
+
+        self.in_channels = in_channels
+        self.channels = out_channels
+        self.in_index = in_index
+        self.num_classes = num_classes
+        self.num_fcs = num_fcs
+        self.num_points = num_points
+        self.oversample_ratio = oversample_ratio
+        self.importance_sample_ratio = importance_sample_ratio
+        self.scale_factor = scale_factor
+        self.subdivision_steps = subdivision_steps
+        self.subdivision_num_points = paddle.to_tensor(subdivision_num_points, dtype="int32")
+        self.dropout_ratio = dropout_ratio
+        self.coarse_pred_each_layer = coarse_pred_each_layer
+        self.align_corners = align_corners
+        self.input_transform = input_transform
+
+        fc_in_channels = sum(self.in_channels) + self.num_classes
+        fc_channels = self.channels
+        self.fcs = nn.LayerList()
+        for k in range(num_fcs):
+            fc = ConvModule(
+                fc_in_channels,
+                fc_channels,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+                conv_cfg=conv_cfg,
+            )
+            self.fcs.append(fc)
+            fc_in_channels = fc_channels
+            fc_in_channels += self.num_classes if self.coarse_pred_each_layer else 0
+        self.fc_seg = nn.Conv1D(
+            fc_in_channels,
+            self.num_classes,
+            kernel_size=1,
+            stride=1,
+            padding=0)
+
+        if self.dropout_ratio > 0:
+            self.dropout = nn.Dropout(self.dropout_ratio)
+        else:
+            self.dropout = None
+
+    def cls_seg(self, feat):
+        """Classify each pixel with fc."""
+        if self.dropout is not None:
+            feat = self.dropout(feat)
+        output = self.fc_seg(feat)
+        return output
+
+    def _get_fine_grained_point_feats(self, x, points):
+        """
+        Sample from fine grained features.
+
+        Args:
+            x (list[Tensor]): Feature pyramid from by neck or backbone.
+            points (Tensor): Point coordinates, shape (batch_size,
+                num_points, 2).
+        Returns:
+            fine_grained_feats (Tensor): Sampled fine grained feature,
+                shape (batch_size, sum(channels of x), num_points).
+        """
+
+        fine_grained_feats_list = [
+            point_sample(_, points, align_corners=self.align_corners) for _ in x
+        ]
+        if len(fine_grained_feats_list) > 1:
+            fine_grained_feats = paddle.concat(fine_grained_feats_list, axis=1)
+        else:
+            fine_grained_feats = fine_grained_feats_list[0]
+        return fine_grained_feats
+
+    def _get_coarse_point_feats(self, prev_output, points):
+        """
+        Sample from fine grained features.
+
+        Args:
+            prev_output (list[Tensor]): Prediction of previous decode head.
+            points (Tensor): Point coordinates, shape (batch_size,
+                num_points, 2).
+        Returns:
+            coarse_feats (Tensor): Sampled coarse feature, shape (batch_size,
+                num_classes, num_points).
+        """
+
+        coarse_feats = point_sample(
+            prev_output, points, align_corners=self.align_corners)
+        return coarse_feats
+
+    def _transform_inputs(self, inputs):
+        """
+        Transform inputs for decoder.
+
+        Args:
+            inputs (list[Tensor]): List of multi-level img features.
+        Returns:
+            Tensor: The transformed inputs
+        """
+
+        if self.input_transform == 'resize_concat':
+            inputs = [inputs[i] for i in self.in_index]
+            upsampled_inputs = [
+                F.interpolate(
+                    x,
+                    size=paddle.shape(inputs[0])[2:],
+                    mode='bilinear',
+                    align_corners=self.align_corners) for x in inputs
+            ]
+            inputs = paddle.concat(upsampled_inputs, axis=1)
+        elif self.input_transform == 'multiple_select':
+            inputs = [inputs[i] for i in self.in_index]
+        else:
+            inputs = inputs[self.in_index[0]]
+        return inputs
+
+    def get_points_train(self, seg_logits, uncertainty_func):  # finish
+        """
+        Sample points for training.
+        Sample points in [0, 1] x [0, 1] coordinate space based on their
+        uncertainty. The uncertainties are calculated for each point using
+        'uncertainty_func' function that takes point's logit prediction as
+        input.
+
+        Args:
+            seg_logits (Tensor): Semantic segmentation logits, shape (
+                batch_size, num_classes, height, width).
+            uncertainty_func (func): uncertainty calculation function.
+            cfg (dict): Training config of point head.
+        Returns:
+            point_coords (Tensor): A tensor of shape (batch_size, num_points,
+                2) that contains the coordinates of ``num_points`` sampled
+                points.
+        """
+
+        num_points = self.num_points
+        oversample_ratio = self.oversample_ratio
+        importance_sample_ratio = self.importance_sample_ratio
+        assert oversample_ratio >= 1
+        assert 0 <= importance_sample_ratio <= 1
+        batch_size = paddle.shape(seg_logits)[0]
+        num_sampled = int(num_points * oversample_ratio)
+        point_coords = paddle.rand([batch_size, num_sampled, 2])
+        point_logits = point_sample(seg_logits, point_coords)
+        # It is crucial to calculate uncertainty based on the sampled
+        # prediction value for the points. Calculating uncertainties of the
+        # coarse predictions first and sampling them for points leads to
+        # incorrect results.  To illustrate this: assume uncertainty func(
+        # logits)=-abs(logits), a sampled point between two coarse
+        # predictions with -1 and 1 logits has 0 logits, and therefore 0
+        # uncertainty value. However, if we calculate uncertainties for the
+        # coarse predictions first, both will have -1 uncertainty,
+        # and sampled point will get -1 uncertainty.
+        point_uncertainties = uncertainty_func(point_logits)
+        num_uncertain_points = int(importance_sample_ratio * num_points)
+        num_random_points = num_points - num_uncertain_points
+        idx = paddle.topk(
+            point_uncertainties[:, 0, :], k=num_uncertain_points, axis=1)[1]
+        shift = num_sampled * paddle.arange(batch_size, dtype='int64')
+        idx += shift.unsqueeze([-1])
+        idx = idx.reshape([-1])
+        point_coords = paddle.index_select(
+            point_coords.reshape([-1, 2]), idx, axis=0)
+        point_coords = point_coords.reshape(
+            [batch_size, num_uncertain_points, 2])
+        if num_random_points > 0:
+            rand_point_coords = paddle.rand([batch_size, num_random_points, 2])
+            point_coords = paddle.concat((point_coords, rand_point_coords),
+                                         axis=1)
+        return point_coords
+
+    def get_points_test(self, seg_logits, uncertainty_func):  # finish
+        """
+        Sample points for testing.
+        Find ``num_points`` most uncertain points from ``uncertainty_map``.
+
+        Args:
+            seg_logits (Tensor): A tensor of shape (batch_size, num_classes,
+                height, width) for class-specific or class-agnostic prediction.
+            uncertainty_func (func): uncertainty calculation function.
+            cfg (dict): Testing config of point head.
+        Returns:
+            point_indices (Tensor): A tensor of shape (batch_size, num_points)
+                that contains indices from [0, height x width) of the most
+                uncertain points.
+            point_coords (Tensor): A tensor of shape (batch_size, num_points,
+                2) that contains [0, 1] x [0, 1] normalized coordinates of the
+                most uncertain points from the ``height x width`` grid .
+        """
+
+        num_points = self.subdivision_num_points
+        uncertainty_map = uncertainty_func(seg_logits)
+        batch_size = paddle.shape(uncertainty_map)[0]
+        height = paddle.shape(uncertainty_map)[2]
+        width = paddle.shape(uncertainty_map)[3]
+        h_step = 1.0 / height
+        w_step = 1.0 / width
+
+        uncertainty_map = uncertainty_map.reshape([batch_size, height * width])
+        num_points = paddle.min(paddle.concat([height * width, num_points]))
+        point_indices = paddle.topk(uncertainty_map, num_points, axis=1)[1]
+        point_coords = paddle.zeros([batch_size, num_points, 2],
+                                    dtype='float32')
+        point_coords[:, :, 0] = w_step / 2.0 + (
+            point_indices % width).astype('float32') * w_step
+        point_coords[:, :, 1] = h_step / 2.0 + (
+            point_indices // width).astype('float32') * h_step
+        return point_indices, point_coords
+
+    def scatter_paddle(self, refined_seg_logits, point_indices, point_logits):
+        """
+        paddle version scatter : equal to pytorch version scatter(-1,point_indices,point_logits).
+
+        Args:
+            refined_seg_logits(Tensor): shape=[batch_size, channels, height * width]
+            point_indices(Tensor): shape=[batch_size, channels, height * width]
+            point_logits(Tensor): shape[batch_size, channels, height * width]
+        Returns:
+            scattered refined_seg_logits(Tensor).
+        """
+
+        original_shape = paddle.shape(refined_seg_logits)  # [batch_size, channels, height * width]
+        new_refined_seg_logits = refined_seg_logits.flatten(0, 1)  # [N*C,H*W]
+        offsets = (paddle.arange(paddle.shape(new_refined_seg_logits)[0]) *
+                   paddle.shape(new_refined_seg_logits)[1]).unsqueeze(-1)  # [N*C,1]
+        point_indices = point_indices.flatten(0, 1)  # [N*C,H*W]
+        new_point_indices = (point_indices + offsets).flatten()
+        point_logits = point_logits.flatten()  # [N*C*H*W]
+        refined_seg_logits = paddle.scatter(
+            refined_seg_logits.flatten(),
+            new_point_indices,
+            point_logits,
+            overwrite=True)
+        return refined_seg_logits.reshape(shape=original_shape)
+
+    def forward_train(self, x, prev_output):
+        with paddle.no_grad():
+            points = self.get_points_train(prev_output, calculate_uncertainty)
+
+        fine_grained_point_feats = self._get_fine_grained_point_feats(
+            x, points)  # [2, 256, 2048]
+        coarse_point_feats = self._get_coarse_point_feats(
+            prev_output, points)  # [2, 19, 2048]
+        # forward for train
+        fusion_point_feats = paddle.concat(
+            [fine_grained_point_feats, coarse_point_feats], axis=1)
+        for fc in self.fcs:
+            fusion_point_feats = fc(fusion_point_feats)
+            if self.coarse_pred_each_layer:
+                fusion_point_feats = paddle.concat(
+                    (fusion_point_feats, coarse_point_feats), axis=1)
+        point_logits = self.cls_seg(fusion_point_feats)
+        return [point_logits, points]  # for points loss
+
+    def forward(self, inputs, prev_output):
+        """
+        Forward function.
+
+        Args:
+            inputs (list[Tensor]): List of multi-level img features.
+            prev_output (Tensor): The output of previous decode head.
+        Returns:
+            [point_logits,points]: For points loss when in training.
+            [refined_seg_logits]: Output refined seg logits when in inference.
+        """
+
+        prev_output = prev_output[0]
+        x = self._transform_inputs(inputs)
+        if self.training:
+            return self.forward_train(x, prev_output)
+        else:
+            refined_seg_logits = prev_output.clone()
+            for _ in range(self.subdivision_steps):
+                refined_seg_logits = F.interpolate(
+                    refined_seg_logits,
+                    scale_factor=self.scale_factor,
+                    mode='bilinear',
+                    align_corners=self.align_corners)
+
+                save_shape = paddle.shape(refined_seg_logits)
+                point_indices, points = self.get_points_test(
+                    refined_seg_logits, calculate_uncertainty)
+                fine_grained_point_feats = self._get_fine_grained_point_feats(
+                    x, points)
+                coarse_point_feats = self._get_coarse_point_feats(
+                    prev_output, points)
+                # forward for inference
+                fusion_point_feats = paddle.concat(
+                    [fine_grained_point_feats, coarse_point_feats], axis=1)
+                for fc in self.fcs:
+                    fusion_point_feats = fc(fusion_point_feats)
+                    if self.coarse_pred_each_layer:
+                        fusion_point_feats = paddle.concat(
+                            (fusion_point_feats, coarse_point_feats), axis=1)
+                point_logits = self.cls_seg(fusion_point_feats)
+                point_indices = paddle.unsqueeze(point_indices, axis=1)
+                point_indices = paddle.expand(point_indices, [-1, save_shape[1], -1])
+
+                refined_seg_logits = paddle.flatten(refined_seg_logits, 2)
+                refined_seg_logits = self.scatter_paddle(
+                    refined_seg_logits, point_indices,
+                    point_logits)  # 2->height * width dim
+                refined_seg_logits = refined_seg_logits.reshape(save_shape)
+            return [refined_seg_logits]
+
+
+class FPNHead(nn.Layer):
+    """
+    This head is the implementation of Semantic FPN in paddle.
+
+    The original article refers to:
+    Kirillov, A. , et al. "Panoptic Feature Pyramid Networks."
+    (https://arxiv.org/abs/1901.02446)
+
+    Args:
+        num_classes(int): The unique number of target classes. Default: 19.
+        feature_strides(list): The strides for input feature maps and all strides suppose to be power of 2. The first
+            one is of largest resolution. Default: [4, 8, 16, 32].
+        in_channels(list): The input feature's channels list. Default: [256, 256, 256, 256].
+        channels(int, optional): The output channels of scale_head's Conv before Upsample block. Default: 128.
+        in_index(list): The indexs of input features to use. it's shape should keep with in_channels. Default: [0, 1, 2, 3].
+        dropout_ratio(float, optional): If the dropout_ratio >0, to use Dropout before output and the p of dropout is dropout_ratio. Default: 0.1.
+        conv_cfg(str): The config of Conv. Default: 'Conv2D'.
+        input_transform(str): The features transform method of inputs. it can be found in function '_transform_inputs'. Defalut: 'multiple_select'.
+        align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
+            e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
+    """
+
+    def __init__(
+            self,
+            num_class=19,
+            feature_strides=[4, 8, 16, 32],
+            in_channels=[256, 256, 256, 256],
+            channels=128,
+            in_index=[0, 1, 2, 3],
+            dropout_ratio=0.1,
+            conv_cfg='Conv2D',
+            input_transform='multiple_select',
+            align_corners=False,
+    ):
+        super(FPNHead, self).__init__()
+        assert len(feature_strides) == len(in_channels)
+        assert min(feature_strides) == feature_strides[0]
+        self.feature_strides = feature_strides
+        self.in_channels = in_channels
+        self.channels = channels
+        self.in_index = in_index
+        self.num_class = num_class
+        self.conv_cfg = conv_cfg
+        self.dropout_ratio = dropout_ratio
+        self.input_transform = input_transform
+        self.align_corners = align_corners
+        self.scale_heads = nn.LayerList()
+
+        for i in range(len(feature_strides)):
+            head_length = max(
+                1,
+                int(np.log2(feature_strides[i]) - np.log2(feature_strides[0])))
+            scale_head = []
+            for k in range(head_length):
+                scale_head.append(
+                    ConvModule(
+                        self.in_channels[i] if k == 0 else self.channels,
+                        self.channels,
+                        3,
+                        padding=1,
+                        conv_cfg=self.conv_cfg))
+                if feature_strides[i] != feature_strides[0]:
+                    scale_head.append(
+                        Upsample(
+                            scale_factor=2,
+                            mode='bilinear',
+                            align_corners=self.align_corners))
+            self.scale_heads.append(nn.Sequential(*scale_head))
+
+        self.conv_seg = nn.Conv2D(self.channels, self.num_class, kernel_size=1)
+
+        if self.dropout_ratio is not None:
+            self.dropout = nn.Dropout2D(self.dropout_ratio)
+        else:
+            self.dropout = None
+
+    def cls_seg(self, feat):
+        if self.dropout is not None:
+            feat = self.dropout(feat)
+        output = self.conv_seg(feat)
+        return output
+
+    def _transform_inputs(self, inputs):
+        """
+        Transform inputs for decoder.
+
+        Args:
+            inputs (list[Tensor]): List of multi-level img features.
+        Returns:
+            Tensor: The transformed inputs
+        """
+
+        if self.input_transform == 'resize_concat':
+            inputs = [inputs[i] for i in self.in_index]
+            upsampled_inputs = [
+                F.interpolate(
+                    x,
+                    size=paddle.shape(inputs[0])[2:],
+                    mode='bilinear',
+                    align_corners=self.align_corners) for x in inputs
+            ]
+            inputs = paddle.concat(upsampled_inputs, axis=1)
+        elif self.input_transform == 'multiple_select':
+            inputs = [inputs[i] for i in self.in_index]
+        else:
+            inputs = inputs[self.in_index[0]]
+
+        return inputs
+
+    def forward(self, inputs):
+        x = self._transform_inputs(inputs)
+        output = self.scale_heads[0](x[0])
+        for i in range(1, len(self.feature_strides)):
+            output = output + F.interpolate(
+                self.scale_heads[i](x[i]),
+                size=paddle.shape(output)[2:],
+                mode='bilinear',
+                align_corners=self.align_corners)
+        output = self.cls_seg(output)
+        return [output]
+
+
+class FPNNeck(nn.Layer):
+    """
+    Feature Pyramid Network neck implemented with Paddle.
+
+    Args:
+        fpn_inplanes (list, optional): Channel counts of the backbone feature maps, one entry
+            per pyramid level; each entry gets its own 1x1 lateral conv. Default: [256, 512, 1024, 2048].
+        fpn_outplanes (int, optional): Channel count every level is projected to. Default: 256.
+    """
+
+    def __init__(
+            self,
+            fpn_inplanes=[256, 512, 1024, 2048],
+            fpn_outplanes=256,
+    ):
+        super().__init__()
+        laterals = []
+        smoothers = []
+
+        # One lateral projection and one 3x3 output conv per level, created
+        # level by level (lateral first).
+        for in_channels in fpn_inplanes:
+            lateral = nn.Sequential(
+                nn.Conv2D(in_channels, fpn_outplanes, 1),
+                layers.SyncBatchNorm(fpn_outplanes),
+                nn.ReLU())
+            laterals.append(lateral)
+
+            smoother = nn.Sequential(
+                layers.ConvBNReLU(
+                    fpn_outplanes, fpn_outplanes, 3, bias_attr=False))
+            smoothers.append(smoother)
+
+        self.lateral_convs = nn.LayerList(laterals)
+        self.fpn_out = nn.LayerList(smoothers)
+
+    def forward(self, conv_out):
+        """Build the pyramid top-down.
+
+        Args:
+            conv_out (list): Backbone feature maps; the last entry is used as
+                the top of the pyramid.
+
+        Returns:
+            list: Pyramid features ordered from the last input level to the
+                first. The top level is the lateral output only; the other
+                levels additionally pass through their ``fpn_out`` conv.
+        """
+        top = self.lateral_convs[-1](conv_out[-1])
+        pyramid = [top]
+        merged = top
+        # Walk from the second-to-last level down to level 0.
+        for level in range(len(conv_out) - 2, -1, -1):
+            lateral = self.lateral_convs[level](conv_out[level])
+            target_size = paddle.shape(lateral)[2:]
+            merged = lateral + F.interpolate(
+                merged, target_size, mode='bilinear', align_corners=True)
+            pyramid.append(self.fpn_out[level](merged))
+        return pyramid
+
+
+class ConvModule(nn.Layer):
+    """
+    Convolution (Conv1D or Conv2D) followed by an optional batch norm and a ReLU.
+
+    Args:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        kernel_size (int|tuple): Convolution kernel size.
+        padding (int, optional): Convolution padding. Default: 0.
+        stride (int, optional): Convolution stride. Default: 1.
+        conv_cfg (str, optional): Which convolution to build, 'Conv1D' or 'Conv2D'. Default: 'Conv1D'.
+        norm_cfg (str, optional): The string 'None' disables normalization; any other
+            value adds ``layers.SyncBatchNorm``. Default: 'None'.
+        **kwargs: Extra keyword arguments forwarded to the convolution layer. If it
+            contains ``data_format`` that value is also passed to the norm layer.
+
+    Raises:
+        ValueError: If ``conv_cfg`` is neither 'Conv1D' nor 'Conv2D'.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 padding=0,
+                 stride=1,
+                 conv_cfg='Conv1D',
+                 norm_cfg='None',
+                 **kwargs):
+        super().__init__()
+        if conv_cfg == 'Conv1D':
+            conv_cls = nn.Conv1D
+        elif conv_cfg == 'Conv2D':
+            conv_cls = nn.Conv2D
+        else:
+            # Previously an unknown value left ``self._conv`` undefined and the
+            # failure only surfaced as an AttributeError inside forward().
+            raise ValueError(
+                f"conv_cfg must be 'Conv1D' or 'Conv2D', but got {conv_cfg!r}")
+        self._conv = conv_cls(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            **kwargs)
+
+        # NOTE(review): 'NCHW' is the fallback even when conv_cfg is 'Conv1D' —
+        # confirm the norm layer accepts it for 3-D inputs.
+        data_format = kwargs.get('data_format', 'NCHW')
+        if norm_cfg != 'None':
+            self._batch_norm = layers.SyncBatchNorm(
+                out_channels, data_format=data_format)
+        else:
+            self._batch_norm = None
+
+    def forward(self, x):
+        """Apply conv, then batch norm when configured, then ReLU."""
+        x = self._conv(x)
+        if self._batch_norm is not None:
+            x = self._batch_norm(x)
+        x = F.relu(x)
+        return x
+
+
+class Upsample(nn.Layer):
+    """
+    Upsample Module: a thin layer wrapper around ``F.interpolate``.
+
+    Args:
+        size (list|tuple|None, optional): Target output size. When it is falsy,
+            ``scale_factor`` is used instead. Default: None.
+        scale_factor (float|tuple|list|None, optional): Multiplier for the spatial
+            size; sequence entries are converted to float. Default: None.
+        mode (str, optional): Interpolation mode passed to ``F.interpolate``. Default: 'nearest'.
+        align_corners (bool|None, optional): Passed to ``F.interpolate``; ``None``
+            is treated as ``False`` at call time. Default: None.
+    """
+
+    def __init__(self,
+                 size=None,
+                 scale_factor=None,
+                 mode='nearest',
+                 align_corners=None):
+        super(Upsample, self).__init__()
+        self.size = size
+        if isinstance(scale_factor, (tuple, list)):
+            self.scale_factor = tuple(float(factor) for factor in scale_factor)
+        else:
+            self.scale_factor = float(scale_factor) if scale_factor else None
+        self.mode = mode
+        self.align_corners = align_corners
+
+    def forward(self, x):
+        """Resize ``x`` to ``self.size`` if set, otherwise by ``self.scale_factor``."""
+        # paddle's interpolate insists on a real bool for align_corners, so the
+        # "unset" value None (kept as the constructor default) is mapped to False.
+        align_corners = False if self.align_corners is None else self.align_corners
+        if not self.size:
+            return F.interpolate(x, None, self.scale_factor, self.mode,
+                                 align_corners)
+        return F.interpolate(x, self.size, None, self.mode, align_corners)
+
+
+def point_sample(input, points, align_corners=False, **kwargs):
+    """
+    Sample features at point coordinates given in the unit square.
+
+    Wraps ``F.grid_sample``: coordinates are expected in ``[0, 1] x [0, 1]``
+    and are rescaled to ``[-1, 1]`` before sampling. A 3-D ``points`` tensor
+    gets a temporary extra axis so that it can be used as a sampling grid.
+
+    Args:
+        input (Tensor): Feature map, shape (N, C, H, W).
+        points (Tensor): Normalized point coordinates in [0, 1] x [0, 1],
+            shape (N, P, 2) or (N, Hgrid, Wgrid, 2).
+        align_corners (bool): Forwarded to ``F.grid_sample``. Default: False
+        **kwargs: Extra keyword arguments forwarded to ``F.grid_sample``.
+    Returns:
+        Tensor: Features sampled at ``points``, shape (N, C, P) or
+            (N, C, Hgrid, Wgrid).
+    """
+    is_point_list = points.dim() == 3
+    if is_point_list:
+        # (N, P, 2) -> (N, P, 1, 2) so the points form a one-column grid.
+        points = paddle.unsqueeze(points, axis=2)
+
+    # Map [0, 1] coordinates onto the [-1, 1] range used for the grid.
+    grid = points * 2.0 - 1.0
+    sampled = F.grid_sample(
+        input, grid, align_corners=align_corners, **kwargs)
+
+    if is_point_list:
+        # Drop the axis that was added above.
+        return paddle.squeeze(sampled, axis=3)
+    return sampled
+
+
+def calculate_uncertainty(seg_logits):
+    """
+    Estimate per-location uncertainty from segmentation logits.
+
+    The score is the second-largest logit minus the largest logit along the
+    class axis, so it is never positive and is closest to zero where the two
+    best classes are hardest to tell apart.
+
+    Args:
+        seg_logits (Tensor): Semantic segmentation logits,
+            shape (batch_size, num_classes, height, width).
+    Returns:
+        scores (Tensor): Uncertainty scores, with the most uncertain
+            locations having the highest score, shape (
+            batch_size, 1, height, width)
+    """
+    top2_values, _ = paddle.topk(seg_logits, k=2, axis=1)
+    best = top2_values[:, 0]
+    runner_up = top2_values[:, 1]
+    return paddle.unsqueeze(runner_up - best, axis=1)
--- a/paddlers/models/ppseg/models/portraitnet.py
+++ b/paddlers/models/ppseg/models/portraitnet.py
@ -0,0 +1,226 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.nn as nn
+
+from paddlers.models.ppseg import utils
+from paddlers.models.ppseg.cvlibs import manager
+
+
+@manager.MODELS.add_component
+class PortraitNet(nn.Layer):
+    """
+    PortraitNet implemented with PaddlePaddle.
+
+    Reference:
+    Song-Hai Zhang, Xin Dong, Jia Li, Ruilong Li, Yong-Liang Yang
+    "PortraitNet: Real-time Portrait Segmentation Network for Mobile Device"
+    (https://www.yongliangyang.net/docs/mobilePotrait_c&g19.pdf).
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone (Paddle.nn.Layer): Backbone network, currently support MobileNetV2.
+        min_channel (int, optional): Forwarded to PortraitNetHead as the lower bound
+            used when scaling channel counts. Default: 16
+        channel_ratio (float, optional): Forwarded to PortraitNetHead as the channel
+            scaling factor. Default: 1.0
+        add_edge (bool, optional): Whether the head also outputs an edge map. Default: False
+        pretrained (str, optional): The path or url of pretrained model. Default: None
+    """
+
+    def __init__(self,
+                 num_classes,
+                 backbone,
+                 min_channel=16,
+                 channel_ratio=1.0,
+                 add_edge=False,
+                 pretrained=None):
+        super().__init__()
+        self.backbone = backbone
+        self.head = PortraitNetHead(
+            num_classes, min_channel, channel_ratio, add_edge)
+        self.pretrained = pretrained
+        self.init_weight()
+
+    def forward(self, x):
+        """Run backbone and head on both images packed along the channel axis.
+
+        Args:
+            x (Tensor): Input whose first three channels are one image and whose
+                remaining channels are a second image.
+
+        Returns:
+            list: ``[out_a[0], out_b[0], out_a[1], out_b[1]]`` where ``out_a`` and
+                ``out_b`` are the head outputs for the first and second image.
+        """
+        # NOTE(review): the head returns a (pred, edge) tuple only when
+        # add_edge=True; with add_edge=False it returns a single tensor and the
+        # [0]/[1] lookups below index into that tensor — confirm the model is
+        # only meant to be used with add_edge=True.
+        first_img = x[:, :3, :, :]
+        second_img = x[:, 3:, :, :]
+
+        first_out = self.head(self.backbone(first_img))
+        second_out = self.head(self.backbone(second_img))
+
+        return [first_out[0], second_out[0], first_out[1], second_out[1]]
+
+    def init_weight(self):
+        """Load the whole model from ``self.pretrained`` when one is given."""
+        if self.pretrained is None:
+            return
+        utils.load_entire_model(self, self.pretrained)
+
+
+class PortraitNetHead(nn.Layer):
+    """
+    Decoder head of PortraitNet.
+
+    Five stages of ResidualBlock ("transit") followed by a transposed conv
+    (kernel 4, stride 2, padding 1) turn the deepest backbone feature into the
+    prediction, adding in the shallower backbone features along the way.
+
+    Args:
+        num_classes (int): Output channels of the prediction (and edge) conv.
+        min_channel (int, optional): Lower bound used by ``depth``. Default: 16
+        channel_ratio (float, optional): Channel scaling factor used by ``depth``. Default: 1.0
+        add_edge (bool, optional): Whether to build and return an edge branch. Default: False
+    """
+
+    def __init__(self,
+                 num_classes,
+                 min_channel=16,
+                 channel_ratio=1.0,
+                 add_edge=False):
+        super().__init__()
+        self.min_channel = min_channel
+        self.channel_ratio = channel_ratio
+        self.add_edge = add_edge
+
+        # Scaled widths of every stage, computed once.
+        c320, c96, c32, c24, c16, c8 = [
+            self.depth(base) for base in (320, 96, 32, 24, 16, 8)
+        ]
+
+        def make_deconv(width):
+            # Transposed conv that keeps the channel count unchanged.
+            return nn.Conv2DTranspose(
+                width,
+                width,
+                groups=1,
+                kernel_size=4,
+                stride=2,
+                padding=1,
+                bias_attr=False)
+
+        self.deconv1 = make_deconv(c96)
+        self.deconv2 = make_deconv(c32)
+        self.deconv3 = make_deconv(c24)
+        self.deconv4 = make_deconv(c16)
+        self.deconv5 = make_deconv(c8)
+
+        self.transit1 = ResidualBlock(c320, c96)
+        self.transit2 = ResidualBlock(c96, c32)
+        self.transit3 = ResidualBlock(c32, c24)
+        self.transit4 = ResidualBlock(c24, c16)
+        self.transit5 = ResidualBlock(c16, c8)
+
+        self.pred = nn.Conv2D(c8, num_classes, 3, 1, 1, bias_attr=False)
+        if self.add_edge:
+            self.edge = nn.Conv2D(c8, num_classes, 3, 1, 1, bias_attr=False)
+
+    def depth(self, channels):
+        """Scale ``channels`` by ``channel_ratio``, bounded below by
+        ``min(channels, min_channel)``."""
+        lower_bound = min(channels, self.min_channel)
+        scaled = int(channels * self.channel_ratio)
+        return max(lower_bound, scaled)
+
+    def forward(self, feat_list):
+        """Decode four backbone features into a prediction.
+
+        Args:
+            feat_list (list): Exactly four feature maps, ordered from the
+                shallowest (named 1/4 in the code) to the deepest (1/32).
+
+        Returns:
+            Tensor | tuple: ``pred`` alone, or ``(pred, edge)`` when
+                ``add_edge`` is set.
+        """
+        feat_4, feat_8, feat_16, feat_32 = feat_list
+
+        x = self.deconv1(self.transit1(feat_32))
+        x = self.deconv2(self.transit2(feat_16 + x))
+        x = self.deconv3(self.transit3(feat_8 + x))
+        x = self.deconv4(self.transit4(feat_4 + x))
+        x = self.deconv5(self.transit5(x))
+
+        pred = self.pred(x)
+        if not self.add_edge:
+            return pred
+        return pred, self.edge(x)
+
+
+class ConvDw(nn.Layer):
+    """
+    Depthwise conv + BN + ReLU followed by a 1x1 pointwise conv + BN + ReLU.
+
+    Args:
+        inp (int): Input channels (also the depthwise group count).
+        oup (int): Output channels of the pointwise conv.
+        kernel (int): Depthwise kernel size; padding is ``(kernel - 1) // 2``.
+        stride (int): Depthwise stride.
+    """
+
+    def __init__(self, inp, oup, kernel, stride):
+        super().__init__()
+        depthwise = nn.Conv2D(
+            inp,
+            inp,
+            kernel,
+            stride,
+            (kernel - 1) // 2,
+            groups=inp,
+            bias_attr=False)
+        depthwise_bn = nn.BatchNorm2D(
+            num_features=inp, epsilon=1e-05, momentum=0.1)
+        pointwise = nn.Conv2D(inp, oup, 1, 1, 0, bias_attr=False)
+        pointwise_bn = nn.BatchNorm2D(
+            num_features=oup, epsilon=1e-05, momentum=0.1)
+
+        self.conv = nn.Sequential(
+            depthwise,
+            depthwise_bn,
+            nn.ReLU(),
+            pointwise,
+            pointwise_bn,
+            nn.ReLU(), )
+
+    def forward(self, x):
+        """Apply the depthwise-separable stack to ``x``."""
+        out = self.conv(x)
+        return out
+
+
+class ResidualBlock(nn.Layer):
+    """
+    Residual block built from a ConvDw stage, a depthwise 3x3 conv and a 1x1 conv.
+
+    When ``inp`` differs from ``oup`` the shortcut is a 1x1 conv + BN
+    projection; otherwise the input itself is used as the shortcut.
+
+    Args:
+        inp (int): Input channels.
+        oup (int): Output channels.
+        stride (int, optional): Stride of the leading ConvDw stage. Default: 1
+    """
+
+    def __init__(self, inp, oup, stride=1):
+        super().__init__()
+
+        def batch_norm():
+            return nn.BatchNorm2D(
+                num_features=oup, epsilon=1e-05, momentum=0.1)
+
+        self.block = nn.Sequential(
+            ConvDw(inp, oup, 3, stride=stride),
+            # depthwise 3x3
+            nn.Conv2D(
+                in_channels=oup,
+                out_channels=oup,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                groups=oup,
+                bias_attr=False),
+            batch_norm(),
+            nn.ReLU(),
+            # pointwise 1x1
+            nn.Conv2D(
+                in_channels=oup,
+                out_channels=oup,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+                bias_attr=False),
+            batch_norm(), )
+
+        if inp != oup:
+            # Project the input so it can be added to the block output.
+            self.residual = nn.Sequential(
+                nn.Conv2D(
+                    in_channels=inp,
+                    out_channels=oup,
+                    kernel_size=1,
+                    stride=1,
+                    padding=0,
+                    bias_attr=False),
+                batch_norm(), )
+        else:
+            self.residual = None
+        self.relu = nn.ReLU()
+
+    def forward(self, x):
+        """Return ``relu(block(x) + shortcut(x))``."""
+        out = self.block(x)
+
+        shortcut = x
+        if self.residual is not None:
+            shortcut = self.residual(x)
+
+        out += shortcut
+        return self.relu(out)
--- a/paddlers/models/ppseg/models/pphumanseg_lite.py
+++ b/paddlers/models/ppseg/models/pphumanseg_lite.py
@ -0,0 +1,226 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlers.models.ppseg.cvlibs import manager, param_init
+from paddlers.models.ppseg.models import layers
+from paddlers.models.ppseg.utils import utils
+
+__all__ = ['PPHumanSegLite']
+
+
+@manager.MODELS.add_component
+class PPHumanSegLite(nn.Layer):
+    """
+    A self-developed ultra lightweight model from paddlers.models.ppseg, suitable
+    for real-time scene segmentation on web or mobile terminals.
+
+    Args:
+        num_classes (int): Number of output channels of the final logit map.
+        pretrained (str, optional): Pretrained weights handed to
+            ``utils.load_pretrained_model``. Default: None
+        align_corners (bool, optional): ``align_corners`` used for both bilinear
+            resizes in ``forward``. Default: False
+    """
+
+    def __init__(self, num_classes, pretrained=None, align_corners=False):
+        super().__init__()
+        self.pretrained = pretrained
+        self.num_classes = num_classes
+        self.align_corners = align_corners
+
+        self.conv_bn0 = _ConvBNReLU(3, 36, 3, 2, 1)
+        self.conv_bn1 = _ConvBNReLU(36, 18, 1, 1, 0)
+
+        # One stride-2 unit followed by three stride-1 units.
+        self.block1 = nn.Sequential(
+            InvertedResidual(36, stride=2, out_channels=72),
+            *[InvertedResidual(72, stride=1) for _ in range(3)])
+
+        # One stride-2 unit followed by seven stride-1 units.
+        self.block2 = nn.Sequential(
+            InvertedResidual(72, stride=2),
+            *[InvertedResidual(144, stride=1) for _ in range(7)])
+
+        self.depthwise_separable0 = _SeparableConvBNReLU(144, 64, 3, stride=1)
+        # 82 = 64 decoder channels + 18 shortcut channels after the concat.
+        self.depthwise_separable1 = _SeparableConvBNReLU(82, 64, 3, stride=1)
+        self.depthwise_separable2 = _SeparableConvBNReLU(
+            64, self.num_classes, 3, stride=1)
+
+        self.init_weight()
+
+    def forward(self, x):
+        """Compute the logit map at the input's spatial size.
+
+        Returns:
+            list: One-element list containing the logit tensor.
+        """
+        full_size = paddle.shape(x)[2:]
+
+        # Encoder
+        stem = self.conv_bn0(x)  # 1/2
+        shortcut = self.conv_bn1(stem)  # shortcut
+        feat = F.max_pool2d(stem, kernel_size=3, stride=2, padding=1)  # 1/4
+        feat = self.block1(feat)  # 1/8
+        feat = self.block2(feat)  # 1/16
+
+        # Decoder
+        feat = self.depthwise_separable0(feat)
+        shortcut_size = paddle.shape(shortcut)[2:]
+        feat = F.interpolate(
+            feat,
+            shortcut_size,
+            mode='bilinear',
+            align_corners=self.align_corners)
+        feat = paddle.concat(x=[shortcut, feat], axis=1)
+        feat = self.depthwise_separable1(feat)
+
+        logit = self.depthwise_separable2(feat)
+        logit = F.interpolate(
+            logit,
+            full_size,
+            mode='bilinear',
+            align_corners=self.align_corners)
+
+        return [logit]
+
+    def init_weight(self):
+        """Re-initialize conv and batch-norm parameters, then load
+        pretrained weights if a path was given."""
+        for sublayer in self.sublayers():
+            if isinstance(sublayer, nn.Conv2D):
+                param_init.normal_init(sublayer.weight, std=0.001)
+                continue
+            if isinstance(sublayer, (nn.BatchNorm, nn.SyncBatchNorm)):
+                param_init.constant_init(sublayer.weight, value=1.0)
+                param_init.constant_init(sublayer.bias, value=0.0)
+
+        if self.pretrained is not None:
+            utils.load_pretrained_model(self, self.pretrained)
+
+
+class _ConvBNReLU(nn.Layer):
+    """
+    Bias-free Conv2D (Kaiming-uniform weight initializer) -> SyncBatchNorm -> ReLU.
+
+    Extra keyword arguments are forwarded to ``nn.Conv2D``.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride,
+                 padding,
+                 groups=1,
+                 **kwargs):
+        super().__init__()
+        kaiming_attr = paddle.ParamAttr(
+            learning_rate=1, initializer=nn.initializer.KaimingUniform())
+        conv_options = dict(
+            padding=padding,
+            stride=stride,
+            groups=groups,
+            weight_attr=kaiming_attr,
+            bias_attr=False)
+        self._conv = nn.Conv2D(in_channels, out_channels, kernel_size,
+                               **conv_options, **kwargs)
+
+        self._batch_norm = layers.SyncBatchNorm(out_channels)
+
+    def forward(self, x):
+        """Return ``relu(batch_norm(conv(x)))``."""
+        normed = self._batch_norm(self._conv(x))
+        return F.relu(normed)
+
+
+class _ConvBN(nn.Layer):
+    """
+    Bias-free Conv2D (Kaiming-uniform weight initializer) -> SyncBatchNorm, no activation.
+
+    Extra keyword arguments are forwarded to ``nn.Conv2D``.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride,
+                 padding,
+                 groups=1,
+                 **kwargs):
+        super().__init__()
+        kaiming_attr = paddle.ParamAttr(
+            learning_rate=1, initializer=nn.initializer.KaimingUniform())
+        conv_options = dict(
+            padding=padding,
+            stride=stride,
+            groups=groups,
+            weight_attr=kaiming_attr,
+            bias_attr=False)
+        self._conv = nn.Conv2D(in_channels, out_channels, kernel_size,
+                               **conv_options, **kwargs)
+
+        self._batch_norm = layers.SyncBatchNorm(out_channels)
+
+    def forward(self, x):
+        """Return ``batch_norm(conv(x))``."""
+        return self._batch_norm(self._conv(x))
+
+
+class _SeparableConvBNReLU(nn.Layer):
+    """
+    Depthwise conv + BN followed by a 1x1 conv + BN + ReLU.
+
+    ``**kwargs`` goes to the depthwise ``_ConvBN``; because that layer has a
+    required ``stride`` parameter, callers must supply ``stride`` here.
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size, **kwargs):
+        super().__init__()
+        same_padding = int(kernel_size / 2)
+        self.depthwise_conv = _ConvBN(
+            in_channels,
+            out_channels=in_channels,
+            kernel_size=kernel_size,
+            padding=same_padding,
+            groups=in_channels,
+            **kwargs)
+        # The attribute name (including its spelling) is part of the
+        # parameter names, so it is kept as is.
+        self.piontwise_conv = _ConvBNReLU(
+            in_channels,
+            out_channels,
+            kernel_size=1,
+            groups=1,
+            stride=1,
+            padding=0)
+
+    def forward(self, x):
+        """Apply the depthwise stage, then the pointwise stage."""
+        return self.piontwise_conv(self.depthwise_conv(x))
+
+
+class InvertedResidual(nn.Layer):
+    """
+    Two-branch unit with a channel shuffle at the end.
+
+    With ``stride == 1`` the input is split in half along the channel axis:
+    one half is the shortcut, the other goes through the conv branch. With
+    any other stride the whole input feeds both the conv branch and a
+    depthwise-separable shortcut.
+
+    Args:
+        input_channels (int): Channels of the input tensor.
+        stride (int): Stride of the depthwise-separable convs.
+        out_channels (int, optional): Total output channels; each branch
+            produces half of it. When None, each branch produces
+            ``input_channels / 2`` channels for stride 1 and
+            ``input_channels`` channels otherwise. Default: None
+    """
+
+    def __init__(self, input_channels, stride, out_channels=None):
+        super().__init__()
+        branch_channel = (int(input_channels / 2)
+                          if stride == 1 else input_channels)
+        per_branch = (branch_channel
+                      if out_channels is None else out_channels / 2)
+        # Channels produced by each of the two branches.
+        self.in_channels = int(per_branch)
+
+        # Built for every stride, but only used in forward when stride != 1.
+        self._depthwise_separable_0 = _SeparableConvBNReLU(
+            input_channels, self.in_channels, 3, stride=stride)
+        self._conv = _ConvBNReLU(
+            branch_channel, self.in_channels, 1, stride=1, padding=0)
+        self._depthwise_separable_1 = _SeparableConvBNReLU(
+            self.in_channels, self.in_channels, 3, stride=stride)
+
+        self.stride = stride
+
+    def forward(self, input):
+        """Run both branches, concatenate them and shuffle the channels."""
+        if self.stride == 1:
+            shortcut, branch = paddle.split(x=input, num_or_sections=2, axis=1)
+        else:
+            branch = input
+            shortcut = self._depthwise_separable_0(input)
+
+        branch = self._depthwise_separable_1(self._conv(branch))
+        merged = paddle.concat(x=[shortcut, branch], axis=1)
+
+        # channel shuffle: view as (2, in_channels), swap those two axes, flatten back
+        merged_shape = paddle.shape(merged)
+        height, width = merged_shape[2], merged_shape[3]
+        grouped = paddle.reshape(
+            x=merged, shape=[0, 2, self.in_channels, height, width])
+        swapped = paddle.transpose(x=grouped, perm=[0, 2, 1, 3, 4])
+        return paddle.reshape(
+            x=swapped, shape=[0, 2 * self.in_channels, height, width])
+
--- a/Show More
+++ b/Show More
				`@ -0,0 +1 @@`
				`PaddleSeg commit fec42fd869b6f796c74cd510671595e3512bc8e9`