Update ppseg

3 years ago · 5834df2fad
parent 19cf02c1c2
commit 5834df2fad
131 changed files with 7340 additions and 1592 deletions
--- a/paddlers/models/ppseg/init.py
+++ b/paddlers/models/ppseg/init.py
@ -1,4 +1,4 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
--- a/paddlers/models/ppseg/core/infer.py
+++ b/paddlers/models/ppseg/core/infer.py
@ -21,88 +21,16 @@ import paddle
 import paddle.nn.functional as F


-def get_reverse_list(ori_shape, transforms):
-    """
-    get reverse list of transform.
-
-    Args:
-        ori_shape (list): Origin shape of image.
-        transforms (list): List of transform.
-
-    Returns:
-        list: List of tuple, there are two format:
-            ('resize', (h, w)) The image shape before resize,
-            ('padding', (h, w)) The image shape before padding.
-    """
-    reverse_list = []
-    h, w = ori_shape[0], ori_shape[1]
-    for op in transforms:
-        if op.__class__.__name__ in ['Resize']:
-            reverse_list.append(('resize', (h, w)))
-            h, w = op.target_size[0], op.target_size[1]
-        if op.__class__.__name__ in ['ResizeByLong']:
-            reverse_list.append(('resize', (h, w)))
-            long_edge = max(h, w)
-            short_edge = min(h, w)
-            short_edge = int(round(short_edge * op.long_size / long_edge))
-            long_edge = op.long_size
-            if h > w:
-                h = long_edge
-                w = short_edge
-            else:
-                w = long_edge
-                h = short_edge
-        if op.__class__.__name__ in ['ResizeByShort']:
-            reverse_list.append(('resize', (h, w)))
-            long_edge = max(h, w)
-            short_edge = min(h, w)
-            long_edge = int(round(long_edge * op.short_size / short_edge))
-            short_edge = op.short_size
-            if h > w:
-                h = long_edge
-                w = short_edge
-            else:
-                w = long_edge
-                h = short_edge
-        if op.__class__.__name__ in ['Pad']:
-            reverse_list.append(('padding', (h, w)))
-            w, h = op.target_size[0], op.target_size[1]
-        if op.__class__.__name__ in ['PadByAspectRatio']:
-            reverse_list.append(('padding', (h, w)))
-            ratio = w / h
-            if ratio == op.aspect_ratio:
-                pass
-            elif ratio > op.aspect_ratio:
-                h = int(w / op.aspect_ratio)
-            else:
-                w = int(h * op.aspect_ratio)
-        if op.__class__.__name__ in ['LimitLong']:
-            long_edge = max(h, w)
-            short_edge = min(h, w)
-            if ((op.max_long is not None) and (long_edge > op.max_long)):
-                reverse_list.append(('resize', (h, w)))
-                long_edge = op.max_long
-                short_edge = int(round(short_edge * op.max_long / long_edge))
-            elif ((op.min_long is not None) and (long_edge < op.min_long)):
-                reverse_list.append(('resize', (h, w)))
-                long_edge = op.min_long
-                short_edge = int(round(short_edge * op.min_long / long_edge))
-            if h > w:
-                h = long_edge
-                w = short_edge
-            else:
-                w = long_edge
-                h = short_edge
-    return reverse_list
-
-
-def reverse_transform(pred, ori_shape, transforms, mode='nearest'):
+def reverse_transform(pred, trans_info, mode='nearest'):
    """recover pred to origin shape"""
-    reverse_list = get_reverse_list(ori_shape, transforms)
    intTypeList = [paddle.int8, paddle.int16, paddle.int32, paddle.int64]
    dtype = pred.dtype
-    for item in reverse_list[::-1]:
-        if item[0] == 'resize':
+    for item in trans_info[::-1]:
+        if isinstance(item[0], list):
+            trans_mode = item[0][0]
+        else:
+            trans_mode = item[0]
+        if trans_mode == 'resize':
            h, w = item[1][0], item[1][1]
            if paddle.get_device() == 'cpu' and dtype in intTypeList:
                pred = paddle.cast(pred, 'float32')
@ -110,7 +38,7 @@ def reverse_transform(pred, ori_shape, transforms, mode='nearest'):
                pred = paddle.cast(pred, dtype)
            else:
                pred = F.interpolate(pred, (h, w), mode=mode)
-        elif item[0] == 'padding':
+        elif trans_mode == 'padding':
            h, w = item[1][0], item[1][1]
            pred = pred[:, :, 0:h, 0:w]
        else:
@ -205,8 +133,7 @@ def slide_inference(model, im, crop_size, stride):

 def inference(model,
              im,
-              ori_shape=None,
-              transforms=None,
+              trans_info=None,
              is_slide=False,
              stride=None,
              crop_size=None):
@ -216,8 +143,7 @@ def inference(model,
    Args:
        model (paddle.nn.Layer): model to get logits of image.
        im (Tensor): the input image.
-        ori_shape (list): Origin shape of image.
-        transforms (list): Transforms for image.
+        trans_info (list): Image shape informating changed process. Default: None.
        is_slide (bool): Whether to infer by sliding window. Default: False.
        crop_size (tuple|list). The size of sliding window, (w, h). It should be probided if is_slide is True.
        stride (tuple|list). The size of stride, (w, h). It should be probided if is_slide is True.
@ -239,8 +165,8 @@ def inference(model,
        logit = slide_inference(model, im, crop_size=crop_size, stride=stride)
    if hasattr(model, 'data_format') and model.data_format == 'NHWC':
        logit = logit.transpose((0, 3, 1, 2))
-    if ori_shape is not None:
-        logit = reverse_transform(logit, ori_shape, transforms, mode='bilinear')
+    if trans_info is not None:
+        logit = reverse_transform(logit, trans_info, mode='bilinear')
        pred = paddle.argmax(logit, axis=1, keepdim=True, dtype='int32')
        return pred, logit
    else:
@ -249,8 +175,7 @@ def inference(model,

 def aug_inference(model,
                  im,
-                  ori_shape,
-                  transforms,
+                  trans_info,
                  scales=1.0,
                  flip_horizontal=False,
                  flip_vertical=False,
@ -263,8 +188,7 @@ def aug_inference(model,
    Args:
        model (paddle.nn.Layer): model to get logits of image.
        im (Tensor): the input image.
-        ori_shape (list): Origin shape of image.
-        transforms (list): Transforms for image.
+        trans_info (list): Transforms for image.
        scales (float|tuple|list):  Scales for resize. Default: 1.
        flip_horizontal (bool): Whether to flip horizontally. Default: False.
        flip_vertical (bool): Whether to flip vertically. Default: False.
@ -302,8 +226,7 @@ def aug_inference(model,
            logit = F.softmax(logit, axis=1)
            final_logit = final_logit + logit

-    final_logit = reverse_transform(
-        final_logit, ori_shape, transforms, mode='bilinear')
+    final_logit = reverse_transform(final_logit, trans_info, mode='bilinear')
    pred = paddle.argmax(final_logit, axis=1, keepdim=True, dtype='int32')

    return pred, final_logit
--- a/paddlers/models/ppseg/core/predict.py
+++ b/paddlers/models/ppseg/core/predict.py
@ -19,9 +19,9 @@ import cv2
 import numpy as np
 import paddle

-from paddlers.models.ppseg import utils
-from paddlers.models.ppseg.core import infer
-from paddlers.models.ppseg.utils import logger, progbar, visualize
+from paddleseg import utils
+from paddleseg.core import infer
+from paddleseg.utils import logger, progbar, visualize


 def mkdir(path):
@ -36,6 +36,15 @@ def partition_list(arr, m):
    return [arr[i:i + n] for i in range(0, len(arr), n)]


+def preprocess(im_path, transforms):
+    data = {}
+    data['img'] = im_path
+    data = transforms(data)
+    data['img'] = data['img'][np.newaxis, ...]
+    data['img'] = paddle.to_tensor(data['img'])
+    return data
+
+
 def predict(model,
            model_path,
            transforms,
@ -89,18 +98,13 @@ def predict(model,
    color_map = visualize.get_color_map_list(256, custom_color=custom_color)
    with paddle.no_grad():
        for i, im_path in enumerate(img_lists[local_rank]):
-            im = cv2.imread(im_path)
-            ori_shape = im.shape[:2]
-            im, _ = transforms(im)
-            im = im[np.newaxis, ...]
-            im = paddle.to_tensor(im)
+            data = preprocess(im_path, transforms)

            if aug_pred:
                pred, _ = infer.aug_inference(
                    model,
-                    im,
-                    ori_shape=ori_shape,
-                    transforms=transforms.transforms,
+                    data['img'],
+                    trans_info=data['trans_info'],
                    scales=scales,
                    flip_horizontal=flip_horizontal,
                    flip_vertical=flip_vertical,
@ -110,9 +114,8 @@ def predict(model,
            else:
                pred, _ = infer.inference(
                    model,
-                    im,
-                    ori_shape=ori_shape,
-                    transforms=transforms.transforms,
+                    data['img'],
+                    trans_info=data['trans_info'],
                    is_slide=is_slide,
                    stride=stride,
                    crop_size=crop_size)
@ -141,9 +144,4 @@ def predict(model,
            mkdir(pred_saved_path)
            pred_mask.save(pred_saved_path)

-            # pred_im = utils.visualize(im_path, pred, weight=0.0)
-            # pred_saved_path = os.path.join(pred_saved_dir, im_file)
-            # mkdir(pred_saved_path)
-            # cv2.imwrite(pred_saved_path, pred_im)
-
            progbar_pred.update(i + 1)
--- a/paddlers/models/ppseg/core/train.py
+++ b/paddlers/models/ppseg/core/train.py
@ -20,10 +20,9 @@ import shutil
 import paddle
 import paddle.nn.functional as F

-from paddlers.models.ppseg.utils import (TimeAverager, calculate_eta, resume,
-                                         logger, worker_init_fn, train_profiler,
-                                         op_flops_funs)
-from paddlers.models.ppseg.core.val import evaluate
+from paddleseg.utils import (TimeAverager, calculate_eta, resume, logger,
+                             worker_init_fn, train_profiler, op_flops_funs)
+from paddleseg.core.val import evaluate


 def check_logits_losses(logits_list, losses):
@ -35,17 +34,15 @@ def check_logits_losses(logits_list, losses):
            .format(len_logits, len_losses))


-def loss_computation(logits_list, labels, losses, edges=None):
+def loss_computation(logits_list, labels, edges, losses):
    check_logits_losses(logits_list, losses)
    loss_list = []
    for i in range(len(logits_list)):
        logits = logits_list[i]
        loss_i = losses['types'][i]
        coef_i = losses['coef'][i]
-
-        if loss_i.__class__.__name__ in ('BCELoss', 'FocalLoss'
-                                         ) and loss_i.edge_label:
-            # If use edges as labels According to loss type.
+        if loss_i.__class__.__name__ in ('BCELoss', ) and loss_i.edge_label:
+            # Use edges as labels According to loss type.
            loss_list.append(coef_i * loss_i(logits, edges))
        elif loss_i.__class__.__name__ == 'MixedLoss':
            mixed_loss_list = loss_i(logits, labels)
@ -75,13 +72,14 @@ def train(model,
          keep_checkpoint_max=5,
          test_config=None,
          precision='fp32',
+          amp_level='O1',
          profiler_options=None,
          to_static_training=False):
    """
    Launch training.

    Args:
-        model（nn.Layer): A sementic segmentation model.
+        model（nn.Layer): A semantic segmentation model.
        train_dataset (paddle.io.Dataset): Used to read and process training datasets.
        val_dataset (paddle.io.Dataset, optional): Used to read and process validation datasets.
        optimizer (paddle.optimizer.Optimizer): The optimizer.
@ -98,6 +96,9 @@ def train(model,
        keep_checkpoint_max (int, optional): Maximum number of checkpoints to save. Default: 5.
        test_config(dict, optional): Evaluation config.
        precision (str, optional): Use AMP if precision='fp16'. If precision='fp32', the training is normal.
+        amp_level (str, optional): Auto mixed precision level. Accepted values are “O1” and “O2”: O1 represent mixed precision, 
+            the input data type of each operator will be casted by white_list and black_list; O2 represent Pure fp16, all operators 
+            parameters and input data will be casted to fp16, except operators in black_list, don’t support fp16 kernel and batchnorm. Default is O1(amp)
        profiler_options (str, optional): The option of train profiler.
        to_static_training (bool, optional): Whether to use @to_static for training.
    """
@ -112,7 +113,18 @@ def train(model,
    if not os.path.isdir(save_dir):
        if os.path.exists(save_dir):
            os.remove(save_dir)
-        os.makedirs(save_dir)
+        os.makedirs(save_dir, exist_ok=True)
+
+    # use amp
+    if precision == 'fp16':
+        logger.info('use AMP to train. AMP level = {}'.format(amp_level))
+        scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
+        if amp_level == 'O2':
+            model, optimizer = paddle.amp.decorate(
+                models=model,
+                optimizers=optimizer,
+                level='O2',
+                save_dtype='float32')

    if nranks > 1:
        paddle.distributed.fleet.init(is_collective=True)
@ -130,18 +142,13 @@ def train(model,
        return_list=True,
        worker_init_fn=worker_init_fn, )

-    # use amp
-    if precision == 'fp16':
-        logger.info('use amp to train')
-        scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
-
    if use_vdl:
        from visualdl import LogWriter
        log_writer = LogWriter(save_dir)

    if to_static_training:
        model = paddle.jit.to_static(model)
-        logger.info("Successfully to apply @to_static")
+        logger.info("Successfully applied @to_static")

    avg_loss = 0.0
    avg_loss_list = []
@ -164,30 +171,29 @@ def train(model,
                else:
                    break
            reader_cost_averager.record(time.time() - batch_start)
-            images = data[0]
-            labels = data[1].astype('int64')
+            images = data['img']
+            labels = data['label'].astype('int64')
            edges = None
-            if len(data) == 3:
-                edges = data[2].astype('int64')
+            if 'edge' in data.keys():
+                edges = data['edge'].astype('int64')
            if hasattr(model, 'data_format') and model.data_format == 'NHWC':
                images = images.transpose((0, 2, 3, 1))

            if precision == 'fp16':
                with paddle.amp.auto_cast(
+                        level=amp_level,
                        enable=True,
                        custom_white_list={
                            "elementwise_add", "batch_norm", "sync_batch_norm"
                        },
                        custom_black_list={'bilinear_interp_v2'}):
-                    if nranks > 1:
-                        logits_list = ddp_model(images)
-                    else:
-                        logits_list = model(images)
+                    logits_list = ddp_model(images) if nranks > 1 else model(
+                        images)
                    loss_list = loss_computation(
                        logits_list=logits_list,
                        labels=labels,
-                        losses=losses,
-                        edges=edges)
+                        edges=edges,
+                        losses=losses)
                    loss = sum(loss_list)

                scaled = scaler.scale(loss)  # scale the loss
@ -197,15 +203,12 @@ def train(model,
                else:
                    scaler.minimize(optimizer, scaled)  # update parameters
            else:
-                if nranks > 1:
-                    logits_list = ddp_model(images)
-                else:
-                    logits_list = model(images)
+                logits_list = ddp_model(images) if nranks > 1 else model(images)
                loss_list = loss_computation(
                    logits_list=logits_list,
                    labels=labels,
-                    losses=losses,
-                    edges=edges)
+                    edges=edges,
+                    losses=losses)
                loss = sum(loss_list)
                loss.backward()
                # if the optimizer is ReduceOnPlateau, the loss is the one which has been pass into step.
@ -278,7 +281,12 @@ def train(model,
                    test_config = {}

                mean_iou, acc, _, _, _ = evaluate(
-                    model, val_dataset, num_workers=num_workers, **test_config)
+                    model,
+                    val_dataset,
+                    num_workers=num_workers,
+                    precision=precision,
+                    amp_level=amp_level,
+                    **test_config)

                model.train()

@ -314,7 +322,7 @@ def train(model,
            batch_start = time.time()

    # Calculate flops.
-    if local_rank == 0:
+    if local_rank == 0 and not (precision == 'fp16' and amp_level == 'O2'):
        _, c, h, w = images.shape
        _ = paddle.flops(
            model, [1, c, h, w],
--- a/paddlers/models/ppseg/core/val.py
+++ b/paddlers/models/ppseg/core/val.py
@ -19,8 +19,8 @@ import time
 import paddle
 import paddle.nn.functional as F

-from paddlers.models.ppseg.utils import metrics, TimeAverager, calculate_eta, logger, progbar
-from paddlers.models.ppseg.core import infer
+from paddleseg.utils import metrics, TimeAverager, calculate_eta, logger, progbar
+from paddleseg.core import infer

 np.set_printoptions(suppress=True)

@ -34,6 +34,8 @@ def evaluate(model,
             is_slide=False,
             stride=None,
             crop_size=None,
+             precision='fp32',
+             amp_level='O1',
             num_workers=0,
             print_detail=True,
             auc_roc=False):
@ -41,7 +43,7 @@ def evaluate(model,
    Launch evalution.

    Args:
-        model（nn.Layer): A sementic segmentation model.
+        model（nn.Layer): A semantic segmentation model.
        eval_dataset (paddle.io.Dataset): Used to read and process validation datasets.
        aug_eval (bool, optional): Whether to use mulit-scales and flip augment for evaluation. Default: False.
        scales (list|float, optional): Scales for augment. It is valid when `aug_eval` is True. Default: 1.0.
@ -52,6 +54,8 @@ def evaluate(model,
            It should be provided when `is_slide` is True.
        crop_size (tuple|list, optional):  The crop size of sliding window, the first is width and the second is height.
            It should be provided when `is_slide` is True.
+        precision (str, optional): Use AMP if precision='fp16'. If precision='fp32', the evaluation is normal.
+        amp_level (str, optional): Auto mixed precision level. Accepted values are “O1” and “O2”: O1 represent mixed precision, the input data type of each operator will be casted by white_list and black_list; O2 represent Pure fp16, all operators parameters and input data will be casted to fp16, except operators in black_list, don’t support fp16 kernel and batchnorm. Default is O1(amp)
        num_workers (int, optional): Num workers for data loader. Default: 0.
        print_detail (bool, optional): Whether to print detailed information about the evaluation process. Default: True.
        auc_roc(bool, optional): whether add auc_roc metric
@ -93,32 +97,66 @@ def evaluate(model,
    batch_cost_averager = TimeAverager()
    batch_start = time.time()
    with paddle.no_grad():
-        for iter, (im, label) in enumerate(loader):
+        for iter, data in enumerate(loader):
            reader_cost_averager.record(time.time() - batch_start)
-            label = label.astype('int64')
+            label = data['label'].astype('int64')

-            ori_shape = label.shape[-2:]
            if aug_eval:
-                pred, logits = infer.aug_inference(
-                    model,
-                    im,
-                    ori_shape=ori_shape,
-                    transforms=eval_dataset.transforms.transforms,
-                    scales=scales,
-                    flip_horizontal=flip_horizontal,
-                    flip_vertical=flip_vertical,
-                    is_slide=is_slide,
-                    stride=stride,
-                    crop_size=crop_size)
+                if precision == 'fp16':
+                    with paddle.amp.auto_cast(
+                            level=amp_level,
+                            enable=True,
+                            custom_white_list={
+                                "elementwise_add", "batch_norm",
+                                "sync_batch_norm"
+                            },
+                            custom_black_list={'bilinear_interp_v2'}):
+                        pred, logits = infer.aug_inference(
+                            model,
+                            data['img'],
+                            trans_info=data['trans_info'],
+                            scales=scales,
+                            flip_horizontal=flip_horizontal,
+                            flip_vertical=flip_vertical,
+                            is_slide=is_slide,
+                            stride=stride,
+                            crop_size=crop_size)
+                else:
+                    pred, logits = infer.aug_inference(
+                        model,
+                        data['img'],
+                        trans_info=data['trans_info'],
+                        scales=scales,
+                        flip_horizontal=flip_horizontal,
+                        flip_vertical=flip_vertical,
+                        is_slide=is_slide,
+                        stride=stride,
+                        crop_size=crop_size)
            else:
-                pred, logits = infer.inference(
-                    model,
-                    im,
-                    ori_shape=ori_shape,
-                    transforms=eval_dataset.transforms.transforms,
-                    is_slide=is_slide,
-                    stride=stride,
-                    crop_size=crop_size)
+                if precision == 'fp16':
+                    with paddle.amp.auto_cast(
+                            level=amp_level,
+                            enable=True,
+                            custom_white_list={
+                                "elementwise_add", "batch_norm",
+                                "sync_batch_norm"
+                            },
+                            custom_black_list={'bilinear_interp_v2'}):
+                        pred, logits = infer.inference(
+                            model,
+                            data['img'],
+                            trans_info=data['trans_info'],
+                            is_slide=is_slide,
+                            stride=stride,
+                            crop_size=crop_size)
+                else:
+                    pred, logits = infer.inference(
+                        model,
+                        data['img'],
+                        trans_info=data['trans_info'],
+                        is_slide=is_slide,
+                        stride=stride,
+                        crop_size=crop_size)

            intersect_area, pred_area, label_area = metrics.calculate_area(
                pred,
@ -175,12 +213,12 @@ def evaluate(model,
            batch_cost_averager.reset()
            batch_start = time.time()

-    class_iou, miou = metrics.mean_iou(intersect_area_all, pred_area_all,
-                                       label_area_all)
-    class_acc, acc = metrics.accuracy(intersect_area_all, pred_area_all)
-    kappa = metrics.kappa(intersect_area_all, pred_area_all, label_area_all)
-    class_dice, mdice = metrics.dice(intersect_area_all, pred_area_all,
-                                     label_area_all)
+    metrics_input = (intersect_area_all, pred_area_all, label_area_all)
+    class_iou, miou = metrics.mean_iou(*metrics_input)
+    acc, class_precision, class_recall = metrics.class_measurement(
+        *metrics_input)
+    kappa = metrics.kappa(*metrics_input)
+    class_dice, mdice = metrics.dice(*metrics_input)

    if auc_roc:
        auc_roc = metrics.auc_roc(
@ -193,5 +231,7 @@ def evaluate(model,
        infor = infor + auc_infor if auc_roc else infor
        logger.info(infor)
        logger.info("[EVAL] Class IoU: \n" + str(np.round(class_iou, 4)))
-        logger.info("[EVAL] Class Acc: \n" + str(np.round(class_acc, 4)))
-    return miou, acc, class_iou, class_acc, kappa
+        logger.info("[EVAL] Class Precision: \n" + str(
+            np.round(class_precision, 4)))
+        logger.info("[EVAL] Class Recall: \n" + str(np.round(class_recall, 4)))
+    return miou, acc, class_iou, class_precision, kappa
--- a/paddlers/models/ppseg/cvlibs/callbacks.py
+++ b/paddlers/models/ppseg/cvlibs/callbacks.py
@ -19,8 +19,8 @@ import numpy as np
 import paddle
 from paddle.distributed.parallel import ParallelEnv
 from visualdl import LogWriter
-from paddlers.models.ppseg.utils.progbar import Progbar
-import paddlers.models.ppseg.utils.logger as logger
+from paddleseg.utils.progbar import Progbar
+import paddleseg.utils.logger as logger


 class CallbackList(object):
--- a/paddlers/models/ppseg/cvlibs/config.py
+++ b/paddlers/models/ppseg/cvlibs/config.py
@ -15,12 +15,15 @@
 import codecs
 import os
 from typing import Any, Dict, Generic
+import warnings
+from ast import literal_eval

 import paddle
 import yaml
+import six

-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.utils import logger
+from paddleseg.cvlibs import manager
+from paddleseg.utils import logger


 class Config(object):
@ -51,7 +54,7 @@ class Config(object):

    Examples:

-        from paddlers.models.ppseg.cvlibs.config import Config
+        from paddleseg.cvlibs.config import Config

        # Create a cfg object with yaml file path.
        cfg = Config(yaml_cfg_path)
@ -69,7 +72,8 @@ class Config(object):
                 path: str,
                 learning_rate: float=None,
                 batch_size: int=None,
-                 iters: int=None):
+                 iters: int=None,
+                 opts: list=None):
        if not path:
            raise ValueError('Please specify the configuration file path.')

@ -84,7 +88,18 @@ class Config(object):
            raise RuntimeError('Config file should in yaml format!')

        self.update(
-            learning_rate=learning_rate, batch_size=batch_size, iters=iters)
+            learning_rate=learning_rate,
+            batch_size=batch_size,
+            iters=iters,
+            opts=opts)
+
+        model_cfg = self.dic.get('model', None)
+        if model_cfg is None:
+            raise RuntimeError('No model specified in the configuration file.')
+        if (not self.train_dataset_config) and (not self.val_dataset_config):
+            raise ValueError(
+                'One of `train_dataset` or `val_dataset should be given, but there are none.'
+            )

    def _update_dic(self, dic, base_dic):
        """
@ -121,7 +136,8 @@ class Config(object):
    def update(self,
               learning_rate: float=None,
               batch_size: int=None,
-               iters: int=None):
+               iters: int=None,
+               opts: list=None):
        '''Update config'''
        if learning_rate:
            if 'lr_scheduler' in self.dic:
@ -135,6 +151,27 @@ class Config(object):
        if iters:
            self.dic['iters'] = iters

+        # fix parameters by --opts of command
+        if opts is not None:
+            if len(opts) % 2 != 0 or len(opts) == 0:
+                raise ValueError(
+                    "Command line options config `--opts` format error! It should be even length like: k1 v1 k2 v2 ... Please check it: {}".
+                    format(opts))
+            for key, value in zip(opts[0::2], opts[1::2]):
+                if isinstance(value, six.string_types):
+                    try:
+                        value = literal_eval(value)
+                    except ValueError:
+                        pass
+                    except SyntaxError:
+                        pass
+                key_list = key.split('.')
+                dic = self.dic
+                for subkey in key_list[:-1]:
+                    dic.setdefault(subkey, dict())
+                    dic = dic[subkey]
+                dic[key_list[-1]] = value
+
    @property
    def batch_size(self) -> int:
        return self.dic.get('batch_size', 1)
@ -153,13 +190,32 @@ class Config(object):
                'No `lr_scheduler` specified in the configuration file.')
        params = self.dic.get('lr_scheduler')

+        use_warmup = False
+        if 'warmup_iters' in params:
+            use_warmup = True
+            warmup_iters = params.pop('warmup_iters')
+            assert 'warmup_start_lr' in params, \
+                "When use warmup, please set warmup_start_lr and warmup_iters in lr_scheduler"
+            warmup_start_lr = params.pop('warmup_start_lr')
+            end_lr = params['learning_rate']
+
        lr_type = params.pop('type')
        if lr_type == 'PolynomialDecay':
-            params.setdefault('decay_steps', self.iters)
+            iters = self.iters - warmup_iters if use_warmup else self.iters
+            iters = max(iters, 1)
+            params.setdefault('decay_steps', iters)
            params.setdefault('end_lr', 0)
            params.setdefault('power', 0.9)
+        lr_sche = getattr(paddle.optimizer.lr, lr_type)(**params)
+
+        if use_warmup:
+            lr_sche = paddle.optimizer.lr.LinearWarmup(
+                learning_rate=lr_sche,
+                warmup_steps=warmup_iters,
+                start_lr=warmup_start_lr,
+                end_lr=end_lr)

-        return getattr(paddle.optimizer.lr, lr_type)(**params)
+        return lr_sche

    @property
    def learning_rate(self) -> paddle.optimizer.lr.LRScheduler:
@ -202,15 +258,33 @@ class Config(object):
        args = self.optimizer_args
        optimizer_type = args.pop('type')

+        params = self.model.parameters()
+        if 'backbone_lr_mult' in args:
+            if not hasattr(self.model, 'backbone'):
+                logger.warning('The backbone_lr_mult is not effective because'
+                               ' the model does not have backbone')
+            else:
+                backbone_lr_mult = args.pop('backbone_lr_mult')
+                backbone_params = self.model.backbone.parameters()
+                backbone_params_id = [id(x) for x in backbone_params]
+                other_params = [
+                    x for x in params if id(x) not in backbone_params_id
+                ]
+                params = [{
+                    'params': backbone_params,
+                    'learning_rate': backbone_lr_mult
+                }, {
+                    'params': other_params
+                }]
+
        if optimizer_type == 'sgd':
-            return paddle.optimizer.Momentum(
-                lr, parameters=self.model.parameters(), **args)
+            return paddle.optimizer.Momentum(lr, parameters=params, **args)
        elif optimizer_type == 'adam':
-            return paddle.optimizer.Adam(
-                lr, parameters=self.model.parameters(), **args)
+            return paddle.optimizer.Adam(lr, parameters=params, **args)
        elif optimizer_type in paddle.optimizer.__all__:
-            return getattr(paddle.optimizer, optimizer_type)(
-                lr, parameters=self.model.parameters(), **args)
+            return getattr(paddle.optimizer, optimizer_type)(lr,
+                                                             parameters=params,
+                                                             **args)

        raise RuntimeError('Unknown optimizer type {}.'.format(optimizer_type))

@ -295,24 +369,6 @@ class Config(object):
    @property
    def model(self) -> paddle.nn.Layer:
        model_cfg = self.dic.get('model').copy()
-        if not model_cfg:
-            raise RuntimeError('No model specified in the configuration file.')
-        if not 'num_classes' in model_cfg:
-            num_classes = None
-            if self.train_dataset_config:
-                if hasattr(self.train_dataset_class, 'NUM_CLASSES'):
-                    num_classes = self.train_dataset_class.NUM_CLASSES
-                elif hasattr(self.train_dataset, 'num_classes'):
-                    num_classes = self.train_dataset.num_classes
-            elif self.val_dataset_config:
-                if hasattr(self.val_dataset_class, 'NUM_CLASSES'):
-                    num_classes = self.val_dataset_class.NUM_CLASSES
-                elif hasattr(self.val_dataset, 'num_classes'):
-                    num_classes = self.val_dataset.num_classes
-
-            if num_classes is not None:
-                model_cfg['num_classes'] = num_classes
-
        if not self._model:
            self._model = self._load_object(model_cfg)
        return self._model
@ -401,3 +457,94 @@ class Config(object):

    def __str__(self) -> str:
        return yaml.dump(self.dic)
+
+    @property
+    def val_transforms(self) -> list:
+        """Get val_transform from val_dataset"""
+        _val_dataset = self.val_dataset_config
+        if not _val_dataset:
+            return []
+        _transforms = _val_dataset.get('transforms', [])
+        transforms = []
+        for i in _transforms:
+            transforms.append(self._load_object(i))
+        return transforms
+
+    def check_sync_info(self) -> None:
+        """
+        Check and sync the info, such as num_classes and img_channels, 
+        between the config of model, train_dataset and val_dataset.
+        """
+        self._check_sync_num_classes()
+        self._check_sync_img_channels()
+
+    def _check_sync_num_classes(self):
+        num_classes_set = set()
+
+        if self.dic['model'].get('num_classes', None) is not None:
+            num_classes_set.add(self.dic['model'].get('num_classes'))
+        if self.train_dataset_config:
+            if hasattr(self.train_dataset_class, 'NUM_CLASSES'):
+                num_classes_set.add(self.train_dataset_class.NUM_CLASSES)
+            elif 'num_classes' in self.train_dataset_config:
+                num_classes_set.add(self.train_dataset_config['num_classes'])
+        if self.val_dataset_config:
+            if hasattr(self.val_dataset_class, 'NUM_CLASSES'):
+                num_classes_set.add(self.val_dataset_class.NUM_CLASSES)
+            elif 'num_classes' in self.val_dataset_config:
+                num_classes_set.add(self.val_dataset_config['num_classes'])
+
+        if len(num_classes_set) == 0:
+            raise ValueError(
+                '`num_classes` is not found. Please set it in model, train_dataset or val_dataset'
+            )
+        elif len(num_classes_set) > 1:
+            raise ValueError(
+                '`num_classes` is not consistent: {}. Please set it consistently in model or train_dataset or val_dataset'
+                .format(num_classes_set))
+
+        num_classes = num_classes_set.pop()
+        self.dic['model']['num_classes'] = num_classes
+        if self.train_dataset_config and \
+            (not hasattr(self.train_dataset_class, 'NUM_CLASSES')):
+            self.dic['train_dataset']['num_classes'] = num_classes
+        if self.val_dataset_config and \
+            (not hasattr(self.val_dataset_class, 'NUM_CLASSES')):
+            self.dic['val_dataset']['num_classes'] = num_classes
+
+    def _check_sync_img_channels(self):
+        img_channels_set = set()
+        model_cfg = self.dic['model']
+
+        # If the model has backbone, in_channels is the input params of backbone.
+        # Otherwise, in_channels is the input params of the model.
+        if 'backbone' in model_cfg:
+            x = model_cfg['backbone'].get('in_channels', None)
+            if x is not None:
+                img_channels_set.add(x)
+        elif model_cfg.get('in_channels', None) is not None:
+            img_channels_set.add(model_cfg.get('in_channels'))
+        if self.train_dataset_config and \
+            ('img_channels' in self.train_dataset_config):
+            img_channels_set.add(self.train_dataset_config['img_channels'])
+        if self.val_dataset_config and \
+            ('img_channels' in self.val_dataset_config):
+            img_channels_set.add(self.val_dataset_config['img_channels'])
+
+        if len(img_channels_set) > 1:
+            raise ValueError(
+                '`img_channels` is not consistent: {}. Please set it consistently in model or train_dataset or val_dataset'
+                .format(img_channels_set))
+
+        img_channels = 3 if len(img_channels_set) == 0 \
+            else img_channels_set.pop()
+        if 'backbone' in model_cfg:
+            self.dic['model']['backbone']['in_channels'] = img_channels
+        else:
+            self.dic['model']['in_channels'] = img_channels
+        if self.train_dataset_config and \
+            self.train_dataset_config['type'] == "Dataset":
+            self.dic['train_dataset']['img_channels'] = img_channels
+        if self.val_dataset_config and \
+            self.val_dataset_config['type'] == "Dataset":
+            self.dic['val_dataset']['img_channels'] = img_channels
--- a/paddlers/models/ppseg/cvlibs/manager.py
+++ b/paddlers/models/ppseg/cvlibs/manager.py
@ -31,7 +31,7 @@ class ComponentManager:

    Examples 1:

-        from paddlers.models.ppseg.cvlibs.manager import ComponentManager
+        from paddleseg.cvlibs.manager import ComponentManager

        model_manager = ComponentManager()

@ -49,7 +49,7 @@ class ComponentManager:
    Examples 2:

        # Or an easier way, using it as a Python decorator, while just add it above the class declaration.
-        from paddlers.models.ppseg.cvlibs.manager import ComponentManager
+        from paddleseg.cvlibs.manager import ComponentManager

        model_manager = ComponentManager()

--- a/paddlers/models/ppseg/cvlibs/param_init.py
+++ b/paddlers/models/ppseg/cvlibs/param_init.py
@ -24,7 +24,7 @@ def constant_init(param, **kwargs):

    Examples:

-        from paddlers.models.ppseg.cvlibs import param_init
+        from paddleseg.cvlibs import param_init
        import paddle.nn as nn

        linear = nn.Linear(2, 4)
@ -46,7 +46,7 @@ def normal_init(param, **kwargs):

    Examples:

-        from paddlers.models.ppseg.cvlibs import param_init
+        from paddleseg.cvlibs import param_init
        import paddle.nn as nn

        linear = nn.Linear(2, 4)
@ -79,7 +79,7 @@ def kaiming_normal_init(param, **kwargs):

    Examples:

-        from paddlers.models.ppseg.cvlibs import param_init
+        from paddleseg.cvlibs import param_init
        import paddle.nn as nn

        linear = nn.Linear(2, 4)
@ -109,7 +109,7 @@ def kaiming_uniform(param, **kwargs):

    Examples:

-        from paddlers.models.ppseg.cvlibs import param_init
+        from paddleseg.cvlibs import param_init
        import paddle.nn as nn

        linear = nn.Linear(2, 4)
@ -118,3 +118,29 @@ def kaiming_uniform(param, **kwargs):

    initializer = nn.initializer.KaimingUniform(**kwargs)
    initializer(param, param.block)
+
+
+def xavier_uniform(param, **kwargs):
+    r"""
+    This implements the Xavier weight initializer from the paper
+    `Understanding the difficulty of training deep feedforward neural
+    networks <http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf>`_
+    by Xavier Glorot and Yoshua Bengio.
+    This initializer is designed to keep the scale of the gradients
+    approximately same in all the layers. In case of Uniform distribution,
+    the range is [-x, x], where
+    .. math::
+        x = \sqrt{\frac{6.0}{fan\_in + fan\_out}}
+    Args:
+        param (Tensor): Tensor that needs to be initialized.
+
+    Examples:
+
+        from paddleseg.cvlibs import param_init
+        import paddle.nn as nn
+
+        linear = nn.Linear(2, 4)
+        param_init.xavier_uniform(linear.weight)
+    """
+    initializer = nn.initializer.XavierUniform(**kwargs)
+    initializer(param, param.block)
--- a/paddlers/models/ppseg/datasets/init.py
+++ b/paddlers/models/ppseg/datasets/init.py
@ -27,3 +27,4 @@ from .drive import DRIVE
 from .hrf import HRF
 from .chase_db1 import CHASEDB1
 from .pp_humanseg14k import PPHumanSeg14K
+from .pssl import PSSLDataset
--- a/paddlers/models/ppseg/datasets/ade.py
+++ b/paddlers/models/ppseg/datasets/ade.py
@ -17,12 +17,12 @@ import os
 import numpy as np
 from PIL import Image

-from paddlers.models.ppseg.datasets import Dataset
-from paddlers.models.ppseg.utils.download import download_file_and_uncompress
-from paddlers.models.ppseg.utils import seg_env
-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.transforms import Compose
-import paddlers.models.ppseg.transforms.functional as F
+from paddleseg.datasets import Dataset
+from paddleseg.utils.download import download_file_and_uncompress
+from paddleseg.utils import seg_env
+from paddleseg.cvlibs import manager
+from paddleseg.transforms import Compose
+import paddleseg.transforms.functional as F

 URL = "http://data.csail.mit.edu/places/ADEchallenge/ADEChallengeData2016.zip"

@ -89,23 +89,31 @@ class ADE20K(Dataset):
            self.file_list.append([img_path, label_path])

    def __getitem__(self, idx):
+        data = {}
+        data['trans_info'] = []
        image_path, label_path = self.file_list[idx]
+        data['img'] = image_path
+        data['gt_fields'] = [
+        ]  # If key in gt_fields, the data[key] have transforms synchronous.
+
        if self.mode == 'val':
-            im, _ = self.transforms(im=image_path)
+            data = self.transforms(data)
            label = np.asarray(Image.open(label_path))
            # The class 0 is ignored. And it will equal to 255 after
            # subtracted 1, because the dtype of label is uint8.
            label = label - 1
            label = label[np.newaxis, :, :]
-            return im, label
+            data['label'] = label
+            return data
        else:
-            im, label = self.transforms(im=image_path, label=label_path)
-            label = label - 1
+            data['label'] = label_path
+            data['gt_fields'].append('label')
+            data = self.transforms(data)
+            data['label'] = data['label'] - 1
            # Recover the ignore pixels adding by transform
-            label[label == 254] = 255
+            data['label'][data['label'] == 254] = 255
            if self.edge:
                edge_mask = F.mask_to_binary_edge(
                    label, radius=2, num_classes=self.num_classes)
-                return im, label, edge_mask
-            else:
-                return im, label
+                data['edge'] = edge_mask
+            return data
--- a/paddlers/models/ppseg/datasets/chase_db1.py
+++ b/paddlers/models/ppseg/datasets/chase_db1.py
@ -14,11 +14,11 @@

 import os

-from paddlers.models.ppseg.utils.download import download_file_and_uncompress
-from paddlers.models.ppseg.utils import seg_env
-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.transforms import Compose
-from paddlers.models.ppseg.datasets import Dataset
+from paddleseg.utils.download import download_file_and_uncompress
+from paddleseg.utils import seg_env
+from paddleseg.cvlibs import manager
+from paddleseg.transforms import Compose
+from paddleseg.datasets import Dataset

 URL = 'https://bj.bcebos.com/paddleseg/dataset/chase_db1/chase_db1.zip'

--- a/paddlers/models/ppseg/datasets/cityscapes.py
+++ b/paddlers/models/ppseg/datasets/cityscapes.py
@ -15,9 +15,9 @@
 import os
 import glob

-from paddlers.models.ppseg.datasets import Dataset
-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.transforms import Compose
+from paddleseg.datasets import Dataset
+from paddleseg.cvlibs import manager
+from paddleseg.transforms import Compose


@manager.DATASETS.add_component
--- a/paddlers/models/ppseg/datasets/cocostuff.py
+++ b/paddlers/models/ppseg/datasets/cocostuff.py
@ -15,9 +15,9 @@
 import os
 import glob

-from paddlers.models.ppseg.datasets import Dataset
-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.transforms import Compose
+from paddleseg.datasets import Dataset
+from paddleseg.cvlibs import manager
+from paddleseg.transforms import Compose


@manager.DATASETS.add_component
--- a/paddlers/models/ppseg/datasets/dataset.py
+++ b/paddlers/models/ppseg/datasets/dataset.py
@ -18,9 +18,9 @@ import paddle
 import numpy as np
 from PIL import Image

-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.transforms import Compose
-import paddlers.models.ppseg.transforms.functional as F
+from paddleseg.cvlibs import manager
+from paddleseg.transforms import Compose
+import paddleseg.transforms.functional as F


@manager.DATASETS.add_component
@ -46,10 +46,10 @@ class Dataset(paddle.io.Dataset):

        Examples:

-            import paddlers.models.ppseg.transforms as T
-            from paddlers.models.ppseg.datasets import Dataset
+            import paddleseg.transforms as T
+            from paddleseg.datasets import Dataset

-            transforms = [T.RandomPadCrop(crop_size=(512,512)), T.Normalize()]
+            transforms = [T.RandomPaddingCrop(crop_size=(512,512)), T.Normalize()]
            dataset_root = 'dataset_root_path'
            train_path = 'train_path'
            num_classes = 2
@ -62,10 +62,11 @@ class Dataset(paddle.io.Dataset):
    """

    def __init__(self,
-                 transforms,
+                 mode,
                 dataset_root,
+                 transforms,
                 num_classes,
-                 mode='train',
+                 img_channels=3,
                 train_path=None,
                 val_path=None,
                 test_path=None,
@ -73,10 +74,11 @@ class Dataset(paddle.io.Dataset):
                 ignore_index=255,
                 edge=False):
        self.dataset_root = dataset_root
-        self.transforms = Compose(transforms)
+        self.transforms = Compose(transforms, img_channels=img_channels)
        self.file_list = list()
        self.mode = mode.lower()
        self.num_classes = num_classes
+        self.img_channels = img_channels
        self.ignore_index = ignore_index
        self.edge = edge

@ -84,13 +86,18 @@ class Dataset(paddle.io.Dataset):
            raise ValueError(
                "mode should be 'train', 'val' or 'test', but got {}.".format(
                    self.mode))
-
-        if self.transforms is None:
-            raise ValueError("`transforms` is necessary, but it is None.")
-
        if not os.path.exists(self.dataset_root):
            raise FileNotFoundError('there is not `dataset_root`: {}.'.format(
                self.dataset_root))
+        if self.transforms is None:
+            raise ValueError("`transforms` is necessary, but it is None.")
+        if num_classes < 1:
+            raise ValueError(
+                "`num_classes` should be greater than 1, but got {}".format(
+                    num_classes))
+        if img_channels not in [1, 3]:
+            raise ValueError("`img_channels` should in [1, 3], but got {}".
+                             format(img_channels))

        if self.mode == 'train':
            if train_path is None:
@ -139,24 +146,25 @@ class Dataset(paddle.io.Dataset):
                self.file_list.append([image_path, label_path])

    def __getitem__(self, idx):
+        data = {}
+        data['trans_info'] = []
        image_path, label_path = self.file_list[idx]
-        if self.mode == 'test':
-            im, _ = self.transforms(im=image_path)
-            im = im[np.newaxis, ...]
-            return im, image_path
-        elif self.mode == 'val':
-            im, _ = self.transforms(im=image_path)
-            label = np.asarray(Image.open(label_path))
-            label = label[np.newaxis, :, :]
-            return im, label
+        data['img'] = image_path
+        data['label'] = label_path
+        # If key in gt_fields, the data[key] have transforms synchronous.
+        data['gt_fields'] = []
+        if self.mode == 'val':
+            data = self.transforms(data)
+            data['label'] = data['label'][np.newaxis, :, :]
+
        else:
-            im, label = self.transforms(im=image_path, label=label_path)
+            data['gt_fields'].append('label')
+            data = self.transforms(data)
            if self.edge:
                edge_mask = F.mask_to_binary_edge(
-                    label, radius=2, num_classes=self.num_classes)
-                return im, label, edge_mask
-            else:
-                return im, label
+                    data['label'], radius=2, num_classes=self.num_classes)
+                data['edge'] = edge_mask
+        return data

    def __len__(self):
        return len(self.file_list)
--- a/paddlers/models/ppseg/datasets/drive.py
+++ b/paddlers/models/ppseg/datasets/drive.py
@ -14,11 +14,11 @@

 import os

-from paddlers.models.ppseg.utils.download import download_file_and_uncompress
-from paddlers.models.ppseg.utils import seg_env
-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.transforms import Compose
-from paddlers.models.ppseg.datasets import Dataset
+from paddleseg.utils.download import download_file_and_uncompress
+from paddleseg.utils import seg_env
+from paddleseg.cvlibs import manager
+from paddleseg.transforms import Compose
+from paddleseg.datasets import Dataset

 URL = 'https://bj.bcebos.com/paddleseg/dataset/drive/drive.zip'

--- a/paddlers/models/ppseg/datasets/eg1800.py
+++ b/paddlers/models/ppseg/datasets/eg1800.py
@ -18,12 +18,12 @@ import copy
 import cv2
 import numpy as np

-from paddlers.models.ppseg.datasets import Dataset
-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.transforms import Compose
-from paddlers.models.ppseg.utils.download import download_file_and_uncompress
-from paddlers.models.ppseg.utils import seg_env
-import paddlers.models.ppseg.transforms.functional as F
+from paddleseg.datasets import Dataset
+from paddleseg.cvlibs import manager
+from paddleseg.transforms import Compose
+from paddleseg.utils.download import download_file_and_uncompress
+from paddleseg.utils import seg_env
+import paddleseg.transforms.functional as F

 URL = "https://paddleseg.bj.bcebos.com/dataset/EG1800.zip"

--- a/paddlers/models/ppseg/datasets/hrf.py
+++ b/paddlers/models/ppseg/datasets/hrf.py
@ -14,11 +14,11 @@

 import os

-from paddlers.models.ppseg.utils.download import download_file_and_uncompress
-from paddlers.models.ppseg.utils import seg_env
-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.transforms import Compose
-from paddlers.models.ppseg.datasets import Dataset
+from paddleseg.utils.download import download_file_and_uncompress
+from paddleseg.utils import seg_env
+from paddleseg.cvlibs import manager
+from paddleseg.transforms import Compose
+from paddleseg.datasets import Dataset

 URL = 'https://bj.bcebos.com/paddleseg/dataset/hrf/hrf.zip'

--- a/paddlers/models/ppseg/datasets/mini_deep_globe_road_extraction.py
+++ b/paddlers/models/ppseg/datasets/mini_deep_globe_road_extraction.py
@ -15,10 +15,10 @@
 import os

 from .dataset import Dataset
-from paddlers.models.ppseg.utils.download import download_file_and_uncompress
-from paddlers.models.ppseg.utils import seg_env
-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.transforms import Compose
+from paddleseg.utils.download import download_file_and_uncompress
+from paddleseg.utils import seg_env
+from paddleseg.cvlibs import manager
+from paddleseg.transforms import Compose

 URL = "https://paddleseg.bj.bcebos.com/dataset/MiniDeepGlobeRoadExtraction.zip"

--- a/paddlers/models/ppseg/datasets/optic_disc_seg.py
+++ b/paddlers/models/ppseg/datasets/optic_disc_seg.py
@ -15,10 +15,10 @@
 import os

 from .dataset import Dataset
-from paddlers.models.ppseg.utils.download import download_file_and_uncompress
-from paddlers.models.ppseg.utils import seg_env
-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.transforms import Compose
+from paddleseg.utils.download import download_file_and_uncompress
+from paddleseg.utils import seg_env
+from paddleseg.cvlibs import manager
+from paddleseg.transforms import Compose

 URL = "https://paddleseg.bj.bcebos.com/dataset/optic_disc_seg.zip"

--- a/paddlers/models/ppseg/datasets/pascal_context.py
+++ b/paddlers/models/ppseg/datasets/pascal_context.py
@ -15,9 +15,9 @@
 import os

 from PIL import Image
-from paddlers.models.ppseg.datasets import Dataset
-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.transforms import Compose
+from paddleseg.datasets import Dataset
+from paddleseg.cvlibs import manager
+from paddleseg.transforms import Compose


@manager.DATASETS.add_component
--- a/paddlers/models/ppseg/datasets/pp_humanseg14k.py
+++ b/paddlers/models/ppseg/datasets/pp_humanseg14k.py
@ -15,8 +15,8 @@
 import os

 from .dataset import Dataset
-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.transforms import Compose
+from paddleseg.cvlibs import manager
+from paddleseg.transforms import Compose


@manager.DATASETS.add_component
--- a/paddlers/models/ppseg/datasets/pssl.py
+++ b/paddlers/models/ppseg/datasets/pssl.py
@ -0,0 +1,135 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import numpy as np
+
+from paddleseg.datasets import Dataset
+from paddleseg.cvlibs import manager
+from paddleseg.transforms import Compose
+
+
+@manager.DATASETS.add_component
+class PSSLDataset(Dataset):
+    """
+    The PSSL dataset for segmentation. PSSL is short for Pseudo Semantic Segmentation Labels, where the pseudo label
+    is computed by the Consensus explanation algorithm.
+
+    The PSSL refers to "Distilling Ensemble of Explanations for Weakly-Supervised Pre-Training of Image Segmentation 
+    Models" (https://arxiv.org/abs/2207.03335). 
+    
+    The Consensus explanation refers to "Cross-Model Consensus of Explanations and Beyond for Image Classification 
+    Models: An Empirical Study" (https://arxiv.org/abs/2109.00707).
+
+    To use this dataset, we need to additionally prepare the orignal ImageNet dataset, which has the folder structure
+    as follows:
+
+        imagenet_root
+        |
+        |--train
+        |  |--n01440764
+        |  |  |--n01440764_10026.JPEG
+        |  |  |--...
+        |  |--nxxxxxxxx
+        |  |--...
+
+    where only the "train" set is needed.
+
+    The PSSL dataset has the folder structure as follows:
+
+        pssl_root
+        |
+        |--train
+        |  |--n01440764
+        |  |  |--n01440764_10026.JPEG_eiseg.npz
+        |  |  |--...
+        |  |--nxxxxxxxx
+        |  |--...
+        |
+        |--imagenet_lsvrc_2015_synsets.txt
+        |--train.txt
+
+    where "train.txt" and "imagenet_lsvrc_2015_synsets.txt" are included in the PSSL dataset.
+
+    Args:
+        transforms (list): Transforms for image.
+        imagenet_root (str): The path to the original ImageNet dataset.
+        pssl_root (str): The path to the PSSL dataset.
+        mode (str, optional): Which part of dataset to use. it is one of ('train', 'val', 'test'). Default: 'train'.
+        edge (bool, optional): Whether to compute edge while training. Default: False.
+    """
+    ignore_index = 1001  # 0~999 is target class, 1000 is bg
+    NUM_CLASSES = 1001  # consider target class and bg
+
+    def __init__(self,
+                 transforms,
+                 imagenet_root,
+                 pssl_root,
+                 mode='train',
+                 edge=False):
+        mode = mode.lower()
+        if mode not in ['train']:
+            raise ValueError("mode should be 'train', but got {}.".format(mode))
+        if transforms is None:
+            raise ValueError("`transforms` is necessary, but it is None.")
+
+        self.transforms = Compose(transforms)
+        self.mode = mode
+        self.edge = edge
+
+        self.num_classes = self.NUM_CLASSES
+        self.ignore_index = self.num_classes  # 1001
+        self.file_list = []
+        self.class_id_dict = {}
+
+        if imagenet_root is None or not os.path.isdir(pssl_root):
+            raise ValueError(
+                "The dataset is not Found or the folder structure is nonconfoumance."
+            )
+
+        train_list_file = os.path.join(pssl_root, "train.txt")
+        if not os.path.exists(train_list_file):
+            raise ValueError("Train list file isn't exists.")
+        for idx, line in enumerate(open(train_list_file)):
+            # line: train/n04118776/n04118776_45912.JPEG_eiseg.npz
+            label_path = line.strip()
+            img_path = label_path.split('.JPEG')[0] + '.JPEG'
+            label_path = os.path.join(pssl_root, label_path)
+            img_path = os.path.join(imagenet_root, img_path)
+            self.file_list.append([img_path, label_path])
+
+        # mapping class name to class id.
+        class_id_file = os.path.join(pssl_root,
+                                     "imagenet_lsvrc_2015_synsets.txt")
+        if not os.path.exists(class_id_file):
+            raise ValueError("Class id file isn't exists.")
+        for idx, line in enumerate(open(class_id_file)):
+            class_name = line.strip()
+            self.class_id_dict[class_name] = idx
+
+    def __getitem__(self, idx):
+        image_path, label_path = self.file_list[idx]
+
+        # transform label
+        class_name = (image_path.split('/')[-1]).split('_')[0]
+        class_id = self.class_id_dict[class_name]
+
+        pssl_seg = np.load(label_path)['arr_0']
+        gt_semantic_seg = np.zeros_like(pssl_seg, dtype=np.int64) + 1000
+        # [0, 999] for imagenet classes, 1000 for background, others(-1) will be ignored during training.
+        gt_semantic_seg[pssl_seg == 1] = class_id
+
+        im, label = self.transforms(im=image_path, label=gt_semantic_seg)
+
+        return im, label
--- a/paddlers/models/ppseg/datasets/stare.py
+++ b/paddlers/models/ppseg/datasets/stare.py
@ -14,11 +14,11 @@

 import os

-from paddlers.models.ppseg.utils.download import download_file_and_uncompress
-from paddlers.models.ppseg.utils import seg_env
-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.transforms import Compose
-from paddlers.models.ppseg.datasets import Dataset
+from paddleseg.utils.download import download_file_and_uncompress
+from paddleseg.utils import seg_env
+from paddleseg.cvlibs import manager
+from paddleseg.transforms import Compose
+from paddleseg.datasets import Dataset

 URL = 'https://bj.bcebos.com/paddleseg/dataset/stare/stare.zip'

--- a/paddlers/models/ppseg/datasets/supervisely.py
+++ b/paddlers/models/ppseg/datasets/supervisely.py
@ -18,12 +18,12 @@ import copy
 import cv2
 import numpy as np

-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.transforms import Compose
-from paddlers.models.ppseg.datasets import Dataset
-from paddlers.models.ppseg.utils.download import download_file_and_uncompress
-from paddlers.models.ppseg.utils import seg_env
-import paddlers.models.ppseg.transforms.functional as F
+from paddleseg.cvlibs import manager
+from paddleseg.transforms import Compose
+from paddleseg.datasets import Dataset
+from paddleseg.utils.download import download_file_and_uncompress
+from paddleseg.utils import seg_env
+import paddleseg.transforms.functional as F

 URL = "https://paddleseg.bj.bcebos.com/dataset/Supervisely_face.zip"

--- a/paddlers/models/ppseg/datasets/voc.py
+++ b/paddlers/models/ppseg/datasets/voc.py
@ -14,11 +14,11 @@

 import os

-from paddlers.models.ppseg.datasets import Dataset
-from paddlers.models.ppseg.utils.download import download_file_and_uncompress
-from paddlers.models.ppseg.utils import seg_env
-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.transforms import Compose
+from paddleseg.datasets import Dataset
+from paddleseg.utils.download import download_file_and_uncompress
+from paddleseg.utils import seg_env
+from paddleseg.cvlibs import manager
+from paddleseg.transforms import Compose

 URL = "http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar"

--- a/paddlers/models/ppseg/models/init.py
+++ b/paddlers/models/ppseg/models/init.py
@ -49,9 +49,18 @@ from .segnet import SegNet
 from .encnet import ENCNet
 from .hrnet_contrast import HRNetW48Contrast
 from .espnet import ESPNetV2
+from .pp_liteseg import PPLiteSeg
 from .dmnet import DMNet
 from .espnetv1 import ESPNetV1
 from .enet import ENet
 from .bisenetv1 import BiseNetV1
 from .fastfcn import FastFCN
 from .pfpnnet import PFPNNet
+from .glore import GloRe
+from .ddrnet import DDRNet_23
+from .ccnet import CCNet
+from .mobileseg import MobileSeg
+from .upernet import UPerNet
+from .sinet import SINet
+from .lraspp import LRASPP
+from .topformer import TopFormer
--- a/paddlers/models/ppseg/models/ann.py
+++ b/paddlers/models/ppseg/models/ann.py
@ -16,9 +16,9 @@ import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.models import layers
-from paddlers.models.ppseg.utils import utils
+from paddleseg.cvlibs import manager
+from paddleseg.models import layers
+from paddleseg.utils import utils


@manager.MODELS.add_component
--- a/paddlers/models/ppseg/models/attention_unet.py
+++ b/paddlers/models/ppseg/models/attention_unet.py
@ -14,9 +14,9 @@

 import paddle
 import paddle.nn as nn
-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.models import layers
-from paddlers.models.ppseg import utils
+from paddleseg.cvlibs import manager
+from paddleseg.models import layers
+from paddleseg import utils
 import numpy as np


@ -35,13 +35,13 @@ class AttentionUNet(nn.Layer):

    Args:
        num_classes (int): The unique number of target classes.
+        in_channels (int, optional): The channels of input image. Default: 3.
        pretrained (str, optional): The path or url of pretrained model. Default: None.
    """

-    def __init__(self, num_classes, pretrained=None):
+    def __init__(self, num_classes, in_channels=3, pretrained=None):
        super().__init__()
-        n_channels = 3
-        self.encoder = Encoder(n_channels, [64, 128, 256, 512])
+        self.encoder = Encoder(in_channels, [64, 128, 256, 512])
        filters = np.array([64, 128, 256, 512, 1024])
        self.up5 = UpConv(ch_in=filters[4], ch_out=filters[3])
        self.att5 = AttentionBlock(
--- a/paddlers/models/ppseg/models/backbones/init.py
+++ b/paddlers/models/ppseg/models/backbones/init.py
@ -21,3 +21,7 @@ from .swin_transformer import *
 from .mobilenetv2 import *
 from .mix_transformer import *
 from .stdcnet import *
+from .lite_hrnet import *
+from .shufflenetv2 import *
+from .ghostnet import *
+from .top_transformer import *
--- a/paddlers/models/ppseg/models/backbones/ghostnet.py
+++ b/paddlers/models/ppseg/models/backbones/ghostnet.py
@ -0,0 +1,318 @@
+# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Code was based on https://github.com/huawei-noah/CV-Backbones/tree/master/ghostnet_pytorch
+
+import math
+import paddle
+from paddle import ParamAttr
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.nn import Conv2D, BatchNorm, AdaptiveAvgPool2D, Linear
+from paddle.regularizer import L2Decay
+from paddle.nn.initializer import Uniform, KaimingNormal
+
+from paddleseg.cvlibs import manager
+from paddleseg.utils import utils, logger
+
+__all__ = ["GhostNet_x0_5", "GhostNet_x1_0", "GhostNet_x1_3"]
+
+
+class ConvBNLayer(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 groups=1,
+                 act="relu",
+                 name=None):
+        super(ConvBNLayer, self).__init__()
+        self._conv = Conv2D(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=(kernel_size - 1) // 2,
+            groups=groups,
+            weight_attr=ParamAttr(
+                initializer=KaimingNormal(), name=name + "_weights"),
+            bias_attr=False)
+        bn_name = name + "_bn"
+
+        self._batch_norm = BatchNorm(
+            num_channels=out_channels,
+            act=act,
+            param_attr=ParamAttr(
+                name=bn_name + "_scale", regularizer=L2Decay(0.0)),
+            bias_attr=ParamAttr(
+                name=bn_name + "_offset", regularizer=L2Decay(0.0)),
+            moving_mean_name=bn_name + "_mean",
+            moving_variance_name=bn_name + "_variance")
+
+    def forward(self, inputs):
+        y = self._conv(inputs)
+        y = self._batch_norm(y)
+        return y
+
+
+class SEBlock(nn.Layer):
+    def __init__(self, num_channels, reduction_ratio=4, name=None):
+        super(SEBlock, self).__init__()
+        self.pool2d_gap = AdaptiveAvgPool2D(1)
+        self._num_channels = num_channels
+        stdv = 1.0 / math.sqrt(num_channels * 1.0)
+        med_ch = num_channels // reduction_ratio
+        self.squeeze = Linear(
+            num_channels,
+            med_ch,
+            weight_attr=ParamAttr(
+                initializer=Uniform(-stdv, stdv), name=name + "_1_weights"),
+            bias_attr=ParamAttr(name=name + "_1_offset"))
+        stdv = 1.0 / math.sqrt(med_ch * 1.0)
+        self.excitation = Linear(
+            med_ch,
+            num_channels,
+            weight_attr=ParamAttr(
+                initializer=Uniform(-stdv, stdv), name=name + "_2_weights"),
+            bias_attr=ParamAttr(name=name + "_2_offset"))
+
+    def forward(self, inputs):
+        pool = self.pool2d_gap(inputs)
+        pool = paddle.squeeze(pool, axis=[2, 3])
+        squeeze = self.squeeze(pool)
+        squeeze = F.relu(squeeze)
+        excitation = self.excitation(squeeze)
+        excitation = paddle.clip(x=excitation, min=0, max=1)
+        excitation = paddle.unsqueeze(excitation, axis=[2, 3])
+        out = paddle.multiply(inputs, excitation)
+        return out
+
+
+class GhostModule(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 output_channels,
+                 kernel_size=1,
+                 ratio=2,
+                 dw_size=3,
+                 stride=1,
+                 relu=True,
+                 name=None):
+        super(GhostModule, self).__init__()
+        init_channels = int(math.ceil(output_channels / ratio))
+        new_channels = int(init_channels * (ratio - 1))
+        self.primary_conv = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=init_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            groups=1,
+            act="relu" if relu else None,
+            name=name + "_primary_conv")
+        self.cheap_operation = ConvBNLayer(
+            in_channels=init_channels,
+            out_channels=new_channels,
+            kernel_size=dw_size,
+            stride=1,
+            groups=init_channels,
+            act="relu" if relu else None,
+            name=name + "_cheap_operation")
+
+    def forward(self, inputs):
+        x = self.primary_conv(inputs)
+        y = self.cheap_operation(x)
+        out = paddle.concat([x, y], axis=1)
+        return out
+
+
+class GhostBottleneck(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 hidden_dim,
+                 output_channels,
+                 kernel_size,
+                 stride,
+                 use_se,
+                 name=None):
+        super(GhostBottleneck, self).__init__()
+        self._stride = stride
+        self._use_se = use_se
+        self._num_channels = in_channels
+        self._output_channels = output_channels
+        self.ghost_module_1 = GhostModule(
+            in_channels=in_channels,
+            output_channels=hidden_dim,
+            kernel_size=1,
+            stride=1,
+            relu=True,
+            name=name + "_ghost_module_1")
+        if stride == 2:
+            self.depthwise_conv = ConvBNLayer(
+                in_channels=hidden_dim,
+                out_channels=hidden_dim,
+                kernel_size=kernel_size,
+                stride=stride,
+                groups=hidden_dim,
+                act=None,
+                name=name +
+                "_depthwise_depthwise"  # looks strange due to an old typo, will be fixed later.
+            )
+        if use_se:
+            self.se_block = SEBlock(num_channels=hidden_dim, name=name + "_se")
+        self.ghost_module_2 = GhostModule(
+            in_channels=hidden_dim,
+            output_channels=output_channels,
+            kernel_size=1,
+            relu=False,
+            name=name + "_ghost_module_2")
+        if stride != 1 or in_channels != output_channels:
+            self.shortcut_depthwise = ConvBNLayer(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                kernel_size=kernel_size,
+                stride=stride,
+                groups=in_channels,
+                act=None,
+                name=name +
+                "_shortcut_depthwise_depthwise"  # looks strange due to an old typo, will be fixed later.
+            )
+            self.shortcut_conv = ConvBNLayer(
+                in_channels=in_channels,
+                out_channels=output_channels,
+                kernel_size=1,
+                stride=1,
+                groups=1,
+                act=None,
+                name=name + "_shortcut_conv")
+
+    def forward(self, inputs):
+        x = self.ghost_module_1(inputs)
+        if self._stride == 2:
+            x = self.depthwise_conv(x)
+        if self._use_se:
+            x = self.se_block(x)
+        x = self.ghost_module_2(x)
+        if self._stride == 1 and self._num_channels == self._output_channels:
+            shortcut = inputs
+        else:
+            shortcut = self.shortcut_depthwise(inputs)
+            shortcut = self.shortcut_conv(shortcut)
+        return paddle.add(x=x, y=shortcut)
+
+
+class GhostNet(nn.Layer):
+    def __init__(self, scale, in_channels=3, pretrained=None):
+        super(GhostNet, self).__init__()
+        self.cfgs = [
+            # k, t, c, SE, s
+            [3, 16, 16, 0, 1],
+            [3, 48, 24, 0, 2],
+            [3, 72, 24, 0, 1],  # x4
+            [5, 72, 40, 1, 2],
+            [5, 120, 40, 1, 1],  # x8
+            [3, 240, 80, 0, 2],
+            [3, 200, 80, 0, 1],
+            [3, 184, 80, 0, 1],
+            [3, 184, 80, 0, 1],
+            [3, 480, 112, 1, 1],
+            [3, 672, 112, 1, 1],  # x16
+            [5, 672, 160, 1, 2],
+            [5, 960, 160, 0, 1],
+            [5, 960, 160, 1, 1],
+            [5, 960, 160, 0, 1],
+            [5, 960, 160, 1, 1]  # x32
+        ]
+        self.scale = scale
+        self.pretrained = pretrained
+
+        output_channels = int(self._make_divisible(16 * self.scale, 4))
+        self.conv1 = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=output_channels,
+            kernel_size=3,
+            stride=2,
+            groups=1,
+            act="relu",
+            name="conv1")
+
+        # build inverted residual blocks
+        self.out_index = [2, 4, 10, 15]
+        self.feat_channels = []
+        self.ghost_bottleneck_list = []
+        for idx, (k, exp_size, c, use_se, s) in enumerate(self.cfgs):
+            in_channels = output_channels
+            output_channels = int(self._make_divisible(c * self.scale, 4))
+            hidden_dim = int(self._make_divisible(exp_size * self.scale, 4))
+            ghost_bottleneck = self.add_sublayer(
+                name="_ghostbottleneck_" + str(idx),
+                sublayer=GhostBottleneck(
+                    in_channels=in_channels,
+                    hidden_dim=hidden_dim,
+                    output_channels=output_channels,
+                    kernel_size=k,
+                    stride=s,
+                    use_se=use_se,
+                    name="_ghostbottleneck_" + str(idx)))
+            self.ghost_bottleneck_list.append(ghost_bottleneck)
+            if idx in self.out_index:
+                self.feat_channels.append(output_channels)
+
+        self.init_weight()
+
+    def init_weight(self):
+        if self.pretrained is not None:
+            utils.load_entire_model(self, self.pretrained)
+
+    def forward(self, inputs):
+        feat_list = []
+        x = self.conv1(inputs)
+        for idx, ghost_bottleneck in enumerate(self.ghost_bottleneck_list):
+            x = ghost_bottleneck(x)
+            if idx in self.out_index:
+                feat_list.append(x)
+        return feat_list
+
+    def _make_divisible(self, v, divisor, min_value=None):
+        """
+        This function is taken from the original tf repo.
+        It ensures that all layers have a channel number that is divisible by 8
+        It can be seen here:
+        https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
+        """
+        if min_value is None:
+            min_value = divisor
+        new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
+        # Make sure that round down does not go down by more than 10%.
+        if new_v < 0.9 * v:
+            new_v += divisor
+        return new_v
+
+
+@manager.BACKBONES.add_component
+def GhostNet_x0_5(**kwargs):
+    model = GhostNet(scale=0.5, **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def GhostNet_x1_0(**kwargs):
+    model = GhostNet(scale=1.0, **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def GhostNet_x1_3(**kwargs):
+    model = GhostNet(scale=1.3, **kwargs)
+    return model
--- a/paddlers/models/ppseg/models/backbones/hrnet.py
+++ b/paddlers/models/ppseg/models/backbones/hrnet.py
@ -18,9 +18,9 @@ import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.cvlibs import manager, param_init
-from paddlers.models.ppseg.models import layers
-from paddlers.models.ppseg.utils import utils
+from paddleseg.cvlibs import manager, param_init
+from paddleseg.models import layers
+from paddleseg.utils import utils

 __all__ = [
    "HRNet_W18_Small_V1", "HRNet_W18_Small_V2", "HRNet_W18", "HRNet_W30",
@ -37,6 +37,7 @@ class HRNet(nn.Layer):
    (https://arxiv.org/pdf/1908.07919.pdf).

    Args:
+        in_channels (int, optional): The channels of input image. Default: 3.
        pretrained (str, optional): The path of pretrained model.
        stage1_num_modules (int, optional): Number of modules for stage1. Default 1.
        stage1_num_blocks (list, optional): Number of blocks per module for stage1. Default (4).
@ -56,6 +57,7 @@ class HRNet(nn.Layer):
    """

    def __init__(self,
+                 in_channels=3,
                 pretrained=None,
                 stage1_num_modules=1,
                 stage1_num_blocks=(4, ),
@ -91,7 +93,7 @@ class HRNet(nn.Layer):
        self.feat_channels = [sum(stage4_num_channels)]

        self.conv_layer1_1 = layers.ConvBNReLU(
-            in_channels=3,
+            in_channels=in_channels,
            out_channels=64,
            kernel_size=3,
            stride=2,
--- a/paddlers/models/ppseg/models/backbones/lite_hrnet.py
+++ b/paddlers/models/ppseg/models/backbones/lite_hrnet.py
@ -0,0 +1,974 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is based on
+https://github.com/HRNet/Lite-HRNet/blob/hrnet/models/backbones/litehrnet.py
+"""
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from numbers import Integral
+from paddle import ParamAttr
+from paddle.regularizer import L2Decay
+from paddle.nn.initializer import Normal, Constant
+
+from paddleseg.cvlibs import manager
+from paddleseg import utils
+
+__all__ = [
+    "Lite_HRNet_18", "Lite_HRNet_30", "Lite_HRNet_naive",
+    "Lite_HRNet_wider_naive", "LiteHRNet"
+]
+
+
+def Conv2d(in_channels,
+           out_channels,
+           kernel_size,
+           stride=1,
+           padding=0,
+           dilation=1,
+           groups=1,
+           bias=True,
+           weight_init=Normal(std=0.001),
+           bias_init=Constant(0.)):
+    weight_attr = paddle.framework.ParamAttr(initializer=weight_init)
+    if bias:
+        bias_attr = paddle.framework.ParamAttr(initializer=bias_init)
+    else:
+        bias_attr = False
+    conv = nn.Conv2D(
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding,
+        dilation,
+        groups,
+        weight_attr=weight_attr,
+        bias_attr=bias_attr)
+    return conv
+
+
+def channel_shuffle(x, groups):
+    x_shape = paddle.shape(x)
+    batch_size, height, width = x_shape[0], x_shape[2], x_shape[3]
+    num_channels = x.shape[1]
+    channels_per_group = num_channels // groups
+
+    x = paddle.reshape(
+        x=x, shape=[batch_size, groups, channels_per_group, height, width])
+    x = paddle.transpose(x=x, perm=[0, 2, 1, 3, 4])
+    x = paddle.reshape(x=x, shape=[batch_size, num_channels, height, width])
+
+    return x
+
+
+class ConvNormLayer(nn.Layer):
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 filter_size,
+                 stride=1,
+                 groups=1,
+                 norm_type=None,
+                 norm_groups=32,
+                 norm_decay=0.,
+                 freeze_norm=False,
+                 act=None):
+        super(ConvNormLayer, self).__init__()
+        self.act = act
+        norm_lr = 0. if freeze_norm else 1.
+        if norm_type is not None:
+            assert norm_type in ['bn', 'sync_bn', 'gn'], \
+                "norm_type should be one of ['bn', 'sync_bn', 'gn'], but got {}".format(norm_type)
+            param_attr = ParamAttr(
+                initializer=Constant(1.0),
+                learning_rate=norm_lr,
+                regularizer=L2Decay(norm_decay), )
+            bias_attr = ParamAttr(
+                learning_rate=norm_lr, regularizer=L2Decay(norm_decay))
+            global_stats = True if freeze_norm else None
+            if norm_type in ['bn', 'sync_bn']:
+                self.norm = nn.BatchNorm2D(
+                    ch_out,
+                    weight_attr=param_attr,
+                    bias_attr=bias_attr,
+                    use_global_stats=global_stats, )
+            elif norm_type == 'gn':
+                self.norm = nn.GroupNorm(
+                    num_groups=norm_groups,
+                    num_channels=ch_out,
+                    weight_attr=param_attr,
+                    bias_attr=bias_attr)
+            norm_params = self.norm.parameters()
+            if freeze_norm:
+                for param in norm_params:
+                    param.stop_gradient = True
+            conv_bias_attr = False
+        else:
+            conv_bias_attr = True
+            self.norm = None
+
+        self.conv = nn.Conv2D(
+            in_channels=ch_in,
+            out_channels=ch_out,
+            kernel_size=filter_size,
+            stride=stride,
+            padding=(filter_size - 1) // 2,
+            groups=groups,
+            weight_attr=ParamAttr(initializer=Normal(
+                mean=0., std=0.001)),
+            bias_attr=conv_bias_attr)
+
+    def forward(self, inputs):
+        out = self.conv(inputs)
+        if self.norm is not None:
+            out = self.norm(out)
+
+        if self.act == 'relu':
+            out = F.relu(out)
+        elif self.act == 'sigmoid':
+            out = F.sigmoid(out)
+        return out
+
+
+class DepthWiseSeparableConvNormLayer(nn.Layer):
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 filter_size,
+                 stride=1,
+                 dw_norm_type=None,
+                 pw_norm_type=None,
+                 norm_decay=0.,
+                 freeze_norm=False,
+                 dw_act=None,
+                 pw_act=None):
+        super(DepthWiseSeparableConvNormLayer, self).__init__()
+        self.depthwise_conv = ConvNormLayer(
+            ch_in=ch_in,
+            ch_out=ch_in,
+            filter_size=filter_size,
+            stride=stride,
+            groups=ch_in,
+            norm_type=dw_norm_type,
+            act=dw_act,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm, )
+        self.pointwise_conv = ConvNormLayer(
+            ch_in=ch_in,
+            ch_out=ch_out,
+            filter_size=1,
+            stride=1,
+            norm_type=pw_norm_type,
+            act=pw_act,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm, )
+
+    def forward(self, x):
+        x = self.depthwise_conv(x)
+        x = self.pointwise_conv(x)
+        return x
+
+
+class CrossResolutionWeightingModule(nn.Layer):
+    def __init__(self,
+                 channels,
+                 ratio=16,
+                 norm_type='bn',
+                 freeze_norm=False,
+                 norm_decay=0.):
+        super(CrossResolutionWeightingModule, self).__init__()
+        self.channels = channels
+        total_channel = sum(channels)
+        self.conv1 = ConvNormLayer(
+            ch_in=total_channel,
+            ch_out=total_channel // ratio,
+            filter_size=1,
+            stride=1,
+            norm_type=norm_type,
+            act='relu',
+            freeze_norm=freeze_norm,
+            norm_decay=norm_decay)
+        self.conv2 = ConvNormLayer(
+            ch_in=total_channel // ratio,
+            ch_out=total_channel,
+            filter_size=1,
+            stride=1,
+            norm_type=norm_type,
+            act='sigmoid',
+            freeze_norm=freeze_norm,
+            norm_decay=norm_decay)
+
+    def forward(self, x):
+        out = []
+        for idx, xi in enumerate(x[:-1]):
+            kernel_size = stride = pow(2, len(x) - idx - 1)
+            xi = F.avg_pool2d(xi, kernel_size=kernel_size, stride=stride)
+            out.append(xi)
+        out.append(x[-1])
+
+        out = paddle.concat(out, 1)
+        out = self.conv1(out)
+        out = self.conv2(out)
+        out = paddle.split(out, self.channels, 1)
+        out = [
+            s * F.interpolate(
+                a, paddle.shape(s)[-2:], mode='nearest') for s, a in zip(x, out)
+        ]
+        return out
+
+
+class SpatialWeightingModule(nn.Layer):
+    def __init__(self, in_channel, ratio=16, freeze_norm=False, norm_decay=0.):
+        super(SpatialWeightingModule, self).__init__()
+        self.global_avgpooling = nn.AdaptiveAvgPool2D(1)
+        self.conv1 = ConvNormLayer(
+            ch_in=in_channel,
+            ch_out=in_channel // ratio,
+            filter_size=1,
+            stride=1,
+            act='relu',
+            freeze_norm=freeze_norm,
+            norm_decay=norm_decay)
+        self.conv2 = ConvNormLayer(
+            ch_in=in_channel // ratio,
+            ch_out=in_channel,
+            filter_size=1,
+            stride=1,
+            act='sigmoid',
+            freeze_norm=freeze_norm,
+            norm_decay=norm_decay)
+
+    def forward(self, x):
+        out = self.global_avgpooling(x)
+        out = self.conv1(out)
+        out = self.conv2(out)
+        return x * out
+
+
+class ConditionalChannelWeightingBlock(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 stride,
+                 reduce_ratio,
+                 norm_type='bn',
+                 freeze_norm=False,
+                 norm_decay=0.):
+        super(ConditionalChannelWeightingBlock, self).__init__()
+        assert stride in [1, 2]
+        branch_channels = [channel // 2 for channel in in_channels]
+
+        self.cross_resolution_weighting = CrossResolutionWeightingModule(
+            branch_channels,
+            ratio=reduce_ratio,
+            norm_type=norm_type,
+            freeze_norm=freeze_norm,
+            norm_decay=norm_decay)
+        self.depthwise_convs = nn.LayerList([
+            ConvNormLayer(
+                channel,
+                channel,
+                filter_size=3,
+                stride=stride,
+                groups=channel,
+                norm_type=norm_type,
+                freeze_norm=freeze_norm,
+                norm_decay=norm_decay) for channel in branch_channels
+        ])
+
+        self.spatial_weighting = nn.LayerList([
+            SpatialWeightingModule(
+                channel,
+                ratio=4,
+                freeze_norm=freeze_norm,
+                norm_decay=norm_decay) for channel in branch_channels
+        ])
+
+    def forward(self, x):
+        x = [s.chunk(2, axis=1) for s in x]
+        x1 = [s[0] for s in x]
+        x2 = [s[1] for s in x]
+
+        x2 = self.cross_resolution_weighting(x2)
+        x2 = [dw(s) for s, dw in zip(x2, self.depthwise_convs)]
+        x2 = [sw(s) for s, sw in zip(x2, self.spatial_weighting)]
+
+        out = [paddle.concat([s1, s2], axis=1) for s1, s2 in zip(x1, x2)]
+        out = [channel_shuffle(s, groups=2) for s in out]
+        return out
+
+
+class ShuffleUnit(nn.Layer):
+    def __init__(self,
+                 in_channel,
+                 out_channel,
+                 stride,
+                 norm_type='bn',
+                 freeze_norm=False,
+                 norm_decay=0.):
+        super(ShuffleUnit, self).__init__()
+        branch_channel = out_channel // 2
+        self.stride = stride
+        if self.stride == 1:
+            assert in_channel == branch_channel * 2, \
+                "when stride=1, in_channel {} should equal to branch_channel*2 {}".format(in_channel, branch_channel * 2)
+        if stride > 1:
+            self.branch1 = nn.Sequential(
+                ConvNormLayer(
+                    ch_in=in_channel,
+                    ch_out=in_channel,
+                    filter_size=3,
+                    stride=self.stride,
+                    groups=in_channel,
+                    norm_type=norm_type,
+                    freeze_norm=freeze_norm,
+                    norm_decay=norm_decay),
+                ConvNormLayer(
+                    ch_in=in_channel,
+                    ch_out=branch_channel,
+                    filter_size=1,
+                    stride=1,
+                    norm_type=norm_type,
+                    act='relu',
+                    freeze_norm=freeze_norm,
+                    norm_decay=norm_decay), )
+        self.branch2 = nn.Sequential(
+            ConvNormLayer(
+                ch_in=branch_channel if stride == 1 else in_channel,
+                ch_out=branch_channel,
+                filter_size=1,
+                stride=1,
+                norm_type=norm_type,
+                act='relu',
+                freeze_norm=freeze_norm,
+                norm_decay=norm_decay),
+            ConvNormLayer(
+                ch_in=branch_channel,
+                ch_out=branch_channel,
+                filter_size=3,
+                stride=self.stride,
+                groups=branch_channel,
+                norm_type=norm_type,
+                freeze_norm=freeze_norm,
+                norm_decay=norm_decay),
+            ConvNormLayer(
+                ch_in=branch_channel,
+                ch_out=branch_channel,
+                filter_size=1,
+                stride=1,
+                norm_type=norm_type,
+                act='relu',
+                freeze_norm=freeze_norm,
+                norm_decay=norm_decay), )
+
+    def forward(self, x):
+        if self.stride > 1:
+            x1 = self.branch1(x)
+            x2 = self.branch2(x)
+        else:
+            x1, x2 = x.chunk(2, axis=1)
+            x2 = self.branch2(x2)
+        out = paddle.concat([x1, x2], axis=1)
+        out = channel_shuffle(out, groups=2)
+        return out
+
+
+class IterativeHead(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 norm_type='bn',
+                 freeze_norm=False,
+                 norm_decay=0.):
+        super(IterativeHead, self).__init__()
+        num_branches = len(in_channels)
+        self.in_channels = in_channels[::-1]
+
+        projects = []
+        for i in range(num_branches):
+            if i != num_branches - 1:
+                projects.append(
+                    DepthWiseSeparableConvNormLayer(
+                        ch_in=self.in_channels[i],
+                        ch_out=self.in_channels[i + 1],
+                        filter_size=3,
+                        stride=1,
+                        dw_act=None,
+                        pw_act='relu',
+                        dw_norm_type=norm_type,
+                        pw_norm_type=norm_type,
+                        freeze_norm=freeze_norm,
+                        norm_decay=norm_decay))
+            else:
+                projects.append(
+                    DepthWiseSeparableConvNormLayer(
+                        ch_in=self.in_channels[i],
+                        ch_out=self.in_channels[i],
+                        filter_size=3,
+                        stride=1,
+                        dw_act=None,
+                        pw_act='relu',
+                        dw_norm_type=norm_type,
+                        pw_norm_type=norm_type,
+                        freeze_norm=freeze_norm,
+                        norm_decay=norm_decay))
+        self.projects = nn.LayerList(projects)
+
+    def forward(self, x):
+        x = x[::-1]
+        y = []
+        last_x = None
+        for i, s in enumerate(x):
+            if last_x is not None:
+                last_x = F.interpolate(
+                    last_x,
+                    size=paddle.shape(s)[-2:],
+                    mode='bilinear',
+                    align_corners=True)
+                s = s + last_x
+            s = self.projects[i](s)
+            y.append(s)
+            last_x = s
+
+        return y[::-1]
+
+
+class Stem(nn.Layer):
+    def __init__(self,
+                 in_channel,
+                 stem_channel,
+                 out_channel,
+                 expand_ratio,
+                 norm_type='bn',
+                 freeze_norm=False,
+                 norm_decay=0.):
+        super(Stem, self).__init__()
+        self.conv1 = ConvNormLayer(
+            in_channel,
+            stem_channel,
+            filter_size=3,
+            stride=2,
+            norm_type=norm_type,
+            act='relu',
+            freeze_norm=freeze_norm,
+            norm_decay=norm_decay)
+        mid_channel = int(round(stem_channel * expand_ratio))
+        branch_channel = stem_channel // 2
+        if stem_channel == out_channel:
+            inc_channel = out_channel - branch_channel
+        else:
+            inc_channel = out_channel - stem_channel
+        self.branch1 = nn.Sequential(
+            ConvNormLayer(
+                ch_in=branch_channel,
+                ch_out=branch_channel,
+                filter_size=3,
+                stride=2,
+                groups=branch_channel,
+                norm_type=norm_type,
+                freeze_norm=freeze_norm,
+                norm_decay=norm_decay),
+            ConvNormLayer(
+                ch_in=branch_channel,
+                ch_out=inc_channel,
+                filter_size=1,
+                stride=1,
+                norm_type=norm_type,
+                act='relu',
+                freeze_norm=freeze_norm,
+                norm_decay=norm_decay), )
+        self.expand_conv = ConvNormLayer(
+            ch_in=branch_channel,
+            ch_out=mid_channel,
+            filter_size=1,
+            stride=1,
+            norm_type=norm_type,
+            act='relu',
+            freeze_norm=freeze_norm,
+            norm_decay=norm_decay)
+        self.depthwise_conv = ConvNormLayer(
+            ch_in=mid_channel,
+            ch_out=mid_channel,
+            filter_size=3,
+            stride=2,
+            groups=mid_channel,
+            norm_type=norm_type,
+            freeze_norm=freeze_norm,
+            norm_decay=norm_decay)
+        self.linear_conv = ConvNormLayer(
+            ch_in=mid_channel,
+            ch_out=branch_channel
+            if stem_channel == out_channel else stem_channel,
+            filter_size=1,
+            stride=1,
+            norm_type=norm_type,
+            act='relu',
+            freeze_norm=freeze_norm,
+            norm_decay=norm_decay)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x1, x2 = x.chunk(2, axis=1)
+        x1 = self.branch1(x1)
+        x2 = self.expand_conv(x2)
+        x2 = self.depthwise_conv(x2)
+        x2 = self.linear_conv(x2)
+        out = paddle.concat([x1, x2], axis=1)
+        out = channel_shuffle(out, groups=2)
+
+        return out
+
+
+class LiteHRNetModule(nn.Layer):
+    def __init__(self,
+                 num_branches,
+                 num_blocks,
+                 in_channels,
+                 reduce_ratio,
+                 module_type,
+                 multiscale_output=False,
+                 with_fuse=True,
+                 norm_type='bn',
+                 freeze_norm=False,
+                 norm_decay=0.):
+        super(LiteHRNetModule, self).__init__()
+        assert num_branches == len(in_channels),\
+            "num_branches {} should equal to num_in_channels {}".format(num_branches, len(in_channels))
+        assert module_type in [
+            'LITE', 'NAIVE'
+        ], "module_type should be one of ['LITE', 'NAIVE']"
+        self.num_branches = num_branches
+        self.in_channels = in_channels
+        self.multiscale_output = multiscale_output
+        self.with_fuse = with_fuse
+        self.norm_type = 'bn'
+        self.module_type = module_type
+
+        if self.module_type == 'LITE':
+            self.layers = self._make_weighting_blocks(
+                num_blocks,
+                reduce_ratio,
+                freeze_norm=freeze_norm,
+                norm_decay=norm_decay)
+        elif self.module_type == 'NAIVE':
+            self.layers = self._make_naive_branches(
+                num_branches,
+                num_blocks,
+                freeze_norm=freeze_norm,
+                norm_decay=norm_decay)
+
+        if self.with_fuse:
+            self.fuse_layers = self._make_fuse_layers(
+                freeze_norm=freeze_norm, norm_decay=norm_decay)
+            self.relu = nn.ReLU()
+
+    def _make_weighting_blocks(self,
+                               num_blocks,
+                               reduce_ratio,
+                               stride=1,
+                               freeze_norm=False,
+                               norm_decay=0.):
+        layers = []
+        for i in range(num_blocks):
+            layers.append(
+                ConditionalChannelWeightingBlock(
+                    self.in_channels,
+                    stride=stride,
+                    reduce_ratio=reduce_ratio,
+                    norm_type=self.norm_type,
+                    freeze_norm=freeze_norm,
+                    norm_decay=norm_decay))
+        return nn.Sequential(*layers)
+
+    def _make_naive_branches(self,
+                             num_branches,
+                             num_blocks,
+                             freeze_norm=False,
+                             norm_decay=0.):
+        branches = []
+        for branch_idx in range(num_branches):
+            layers = []
+            for i in range(num_blocks):
+                layers.append(
+                    ShuffleUnit(
+                        self.in_channels[branch_idx],
+                        self.in_channels[branch_idx],
+                        stride=1,
+                        norm_type=self.norm_type,
+                        freeze_norm=freeze_norm,
+                        norm_decay=norm_decay))
+            branches.append(nn.Sequential(*layers))
+        return nn.LayerList(branches)
+
+    def _make_fuse_layers(self, freeze_norm=False, norm_decay=0.):
+        if self.num_branches == 1:
+            return None
+        fuse_layers = []
+        num_out_branches = self.num_branches if self.multiscale_output else 1
+        for i in range(num_out_branches):
+            fuse_layer = []
+            for j in range(self.num_branches):
+                if j > i:
+                    fuse_layer.append(
+                        nn.Sequential(
+                            Conv2d(
+                                self.in_channels[j],
+                                self.in_channels[i],
+                                kernel_size=1,
+                                stride=1,
+                                padding=0,
+                                bias=False, ),
+                            nn.BatchNorm2D(self.in_channels[i]),
+                            nn.Upsample(
+                                scale_factor=2**(j - i), mode='nearest')))
+                elif j == i:
+                    fuse_layer.append(None)
+                else:
+                    conv_downsamples = []
+                    for k in range(i - j):
+                        if k == i - j - 1:
+                            conv_downsamples.append(
+                                nn.Sequential(
+                                    Conv2d(
+                                        self.in_channels[j],
+                                        self.in_channels[j],
+                                        kernel_size=3,
+                                        stride=2,
+                                        padding=1,
+                                        groups=self.in_channels[j],
+                                        bias=False, ),
+                                    nn.BatchNorm2D(self.in_channels[j]),
+                                    Conv2d(
+                                        self.in_channels[j],
+                                        self.in_channels[i],
+                                        kernel_size=1,
+                                        stride=1,
+                                        padding=0,
+                                        bias=False, ),
+                                    nn.BatchNorm2D(self.in_channels[i])))
+                        else:
+                            conv_downsamples.append(
+                                nn.Sequential(
+                                    Conv2d(
+                                        self.in_channels[j],
+                                        self.in_channels[j],
+                                        kernel_size=3,
+                                        stride=2,
+                                        padding=1,
+                                        groups=self.in_channels[j],
+                                        bias=False, ),
+                                    nn.BatchNorm2D(self.in_channels[j]),
+                                    Conv2d(
+                                        self.in_channels[j],
+                                        self.in_channels[j],
+                                        kernel_size=1,
+                                        stride=1,
+                                        padding=0,
+                                        bias=False, ),
+                                    nn.BatchNorm2D(self.in_channels[j]),
+                                    nn.ReLU()))
+
+                    fuse_layer.append(nn.Sequential(*conv_downsamples))
+            fuse_layers.append(nn.LayerList(fuse_layer))
+
+        return nn.LayerList(fuse_layers)
+
+    def forward(self, x):
+        if self.num_branches == 1:
+            return [self.layers[0](x[0])]
+        if self.module_type == 'LITE':
+            out = self.layers(x)
+        elif self.module_type == 'NAIVE':
+            for i in range(self.num_branches):
+                x[i] = self.layers[i](x[i])
+            out = x
+        if self.with_fuse:
+            out_fuse = []
+            for i in range(len(self.fuse_layers)):
+                y = out[0] if i == 0 else self.fuse_layers[i][0](out[0])
+                for j in range(self.num_branches):
+                    if j == 0:
+                        y += y
+                    elif i == j:
+                        y += out[j]
+                    else:
+                        y += self.fuse_layers[i][j](out[j])
+                    if i == 0:
+                        out[i] = y
+                out_fuse.append(self.relu(y))
+            out = out_fuse
+        elif not self.multiscale_output:
+            out = [out[0]]
+        return out
+
+
+class LiteHRNet(nn.Layer):
+    """
+    @inproceedings{Yulitehrnet21,
+    title={Lite-HRNet: A Lightweight High-Resolution Network},
+        author={Yu, Changqian and Xiao, Bin and Gao, Changxin and Yuan, Lu and Zhang, Lei and Sang, Nong and Wang, Jingdong},
+        booktitle={CVPR},year={2021}
+    }
+
+    Args:
+        network_type (str): the network_type should be one of ["lite_18", "lite_30", "naive", "wider_naive"],
+            "naive": Simply combining the shuffle block in ShuffleNet and the highresolution design pattern in HRNet.
+            "wider_naive": Naive network with wider channels in each block.
+            "lite_18": Lite-HRNet-18, which replaces the pointwise convolution in a shuffle block by conditional channel weighting.
+            "lite_30": Lite-HRNet-30, with more blocks compared with Lite-HRNet-18.
+        in_channels (int, optional): The channels of input image. Default: 3.
+        freeze_at (int): the stage to freeze
+        freeze_norm (bool): whether to freeze norm in HRNet
+        norm_decay (float): weight decay for normalization layer weights
+        return_idx (List): the stage to return
+    """
+
+    def __init__(self,
+                 network_type,
+                 in_channels=3,
+                 freeze_at=0,
+                 freeze_norm=True,
+                 norm_decay=0.,
+                 return_idx=[0, 1, 2, 3],
+                 use_head=False,
+                 pretrained=None):
+        super(LiteHRNet, self).__init__()
+        if isinstance(return_idx, Integral):
+            return_idx = [return_idx]
+        assert network_type in ["lite_18", "lite_30", "naive", "wider_naive"], \
+            "the network_type should be one of [lite_18, lite_30, naive, wider_naive]"
+        assert len(return_idx) > 0, "need one or more return index"
+        self.freeze_at = freeze_at
+        self.freeze_norm = freeze_norm
+        self.norm_decay = norm_decay
+        self.return_idx = return_idx
+        self.norm_type = 'bn'
+        self.use_head = use_head
+        self.pretrained = pretrained
+
+        self.module_configs = {
+            "lite_18": {
+                "num_modules": [2, 4, 2],
+                "num_branches": [2, 3, 4],
+                "num_blocks": [2, 2, 2],
+                "module_type": ["LITE", "LITE", "LITE"],
+                "reduce_ratios": [8, 8, 8],
+                "num_channels": [[40, 80], [40, 80, 160], [40, 80, 160, 320]],
+            },
+            "lite_30": {
+                "num_modules": [3, 8, 3],
+                "num_branches": [2, 3, 4],
+                "num_blocks": [2, 2, 2],
+                "module_type": ["LITE", "LITE", "LITE"],
+                "reduce_ratios": [8, 8, 8],
+                "num_channels": [[40, 80], [40, 80, 160], [40, 80, 160, 320]],
+            },
+            "naive": {
+                "num_modules": [2, 4, 2],
+                "num_branches": [2, 3, 4],
+                "num_blocks": [2, 2, 2],
+                "module_type": ["NAIVE", "NAIVE", "NAIVE"],
+                "reduce_ratios": [1, 1, 1],
+                "num_channels": [[30, 60], [30, 60, 120], [30, 60, 120, 240]],
+            },
+            "wider_naive": {
+                "num_modules": [2, 4, 2],
+                "num_branches": [2, 3, 4],
+                "num_blocks": [2, 2, 2],
+                "module_type": ["NAIVE", "NAIVE", "NAIVE"],
+                "reduce_ratios": [1, 1, 1],
+                "num_channels": [[40, 80], [40, 80, 160], [40, 80, 160, 320]],
+            },
+        }
+
+        self.stages_config = self.module_configs[network_type]
+
+        self.stem = Stem(in_channels, 32, 32, 1)
+        num_channels_pre_layer = [32]
+        for stage_idx in range(3):
+            num_channels = self.stages_config["num_channels"][stage_idx]
+            setattr(self, 'transition{}'.format(stage_idx),
+                    self._make_transition_layer(num_channels_pre_layer,
+                                                num_channels, self.freeze_norm,
+                                                self.norm_decay))
+            stage, num_channels_pre_layer = self._make_stage(
+                self.stages_config, stage_idx, num_channels, True,
+                self.freeze_norm, self.norm_decay)
+            setattr(self, 'stage{}'.format(stage_idx), stage)
+
+        num_channels = self.stages_config["num_channels"][-1]
+        self.feat_channels = num_channels
+
+        if self.use_head:
+            self.head_layer = IterativeHead(num_channels_pre_layer, 'bn',
+                                            self.freeze_norm, self.norm_decay)
+
+            self.feat_channels = [num_channels[0]]
+            for i in range(1, len(num_channels)):
+                self.feat_channels.append(num_channels[i] // 2)
+
+        self.init_weight()
+
+    def init_weight(self):
+        if self.pretrained is not None:
+            utils.load_entire_model(self, self.pretrained)
+
+    def _make_transition_layer(self,
+                               num_channels_pre_layer,
+                               num_channels_cur_layer,
+                               freeze_norm=False,
+                               norm_decay=0.):
+        num_branches_pre = len(num_channels_pre_layer)
+        num_branches_cur = len(num_channels_cur_layer)
+        transition_layers = []
+        for i in range(num_branches_cur):
+            if i < num_branches_pre:
+                if num_channels_cur_layer[i] != num_channels_pre_layer[i]:
+                    transition_layers.append(
+                        nn.Sequential(
+                            Conv2d(
+                                num_channels_pre_layer[i],
+                                num_channels_pre_layer[i],
+                                kernel_size=3,
+                                stride=1,
+                                padding=1,
+                                groups=num_channels_pre_layer[i],
+                                bias=False),
+                            nn.BatchNorm2D(num_channels_pre_layer[i]),
+                            Conv2d(
+                                num_channels_pre_layer[i],
+                                num_channels_cur_layer[i],
+                                kernel_size=1,
+                                stride=1,
+                                padding=0,
+                                bias=False, ),
+                            nn.BatchNorm2D(num_channels_cur_layer[i]),
+                            nn.ReLU()))
+                else:
+                    transition_layers.append(None)
+            else:
+                conv_downsamples = []
+                for j in range(i + 1 - num_branches_pre):
+                    conv_downsamples.append(
+                        nn.Sequential(
+                            Conv2d(
+                                num_channels_pre_layer[-1],
+                                num_channels_pre_layer[-1],
+                                groups=num_channels_pre_layer[-1],
+                                kernel_size=3,
+                                stride=2,
+                                padding=1,
+                                bias=False, ),
+                            nn.BatchNorm2D(num_channels_pre_layer[-1]),
+                            Conv2d(
+                                num_channels_pre_layer[-1],
+                                num_channels_cur_layer[i]
+                                if j == i - num_branches_pre else
+                                num_channels_pre_layer[-1],
+                                kernel_size=1,
+                                stride=1,
+                                padding=0,
+                                bias=False, ),
+                            nn.BatchNorm2D(num_channels_cur_layer[i]
+                                           if j == i - num_branches_pre else
+                                           num_channels_pre_layer[-1]),
+                            nn.ReLU()))
+                transition_layers.append(nn.Sequential(*conv_downsamples))
+        return nn.LayerList(transition_layers)
+
+    def _make_stage(self,
+                    stages_config,
+                    stage_idx,
+                    in_channels,
+                    multiscale_output,
+                    freeze_norm=False,
+                    norm_decay=0.):
+        num_modules = stages_config["num_modules"][stage_idx]
+        num_branches = stages_config["num_branches"][stage_idx]
+        num_blocks = stages_config["num_blocks"][stage_idx]
+        reduce_ratio = stages_config['reduce_ratios'][stage_idx]
+        module_type = stages_config['module_type'][stage_idx]
+
+        modules = []
+        for i in range(num_modules):
+            if not multiscale_output and i == num_modules - 1:
+                reset_multiscale_output = False
+            else:
+                reset_multiscale_output = True
+            modules.append(
+                LiteHRNetModule(
+                    num_branches,
+                    num_blocks,
+                    in_channels,
+                    reduce_ratio,
+                    module_type,
+                    multiscale_output=reset_multiscale_output,
+                    with_fuse=True,
+                    freeze_norm=freeze_norm,
+                    norm_decay=norm_decay))
+            in_channels = modules[-1].in_channels
+        return nn.Sequential(*modules), in_channels
+
+    def forward(self, x):
+        x = self.stem(x)
+
+        y_list = [x]
+        for stage_idx in range(3):
+            x_list = []
+            transition = getattr(self, 'transition{}'.format(stage_idx))
+            for j in range(self.stages_config["num_branches"][stage_idx]):
+                if transition[j] is not None:
+                    if j >= len(y_list):
+                        x_list.append(transition[j](y_list[-1]))
+                    else:
+                        x_list.append(transition[j](y_list[j]))
+                else:
+                    x_list.append(y_list[j])
+            y_list = getattr(self, 'stage{}'.format(stage_idx))(x_list)
+
+        if self.use_head:
+            y_list = self.head_layer(y_list)
+
+        res = []
+        for i, layer in enumerate(y_list):
+            if i == self.freeze_at:
+                layer.stop_gradient = True
+            if i in self.return_idx:
+                res.append(layer)
+        return res
+
+
+@manager.BACKBONES.add_component
+def Lite_HRNet_18(**kwargs):
+    model = LiteHRNet(network_type="lite_18", **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def Lite_HRNet_30(**kwargs):
+    model = LiteHRNet(network_type="lite_30", **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def Lite_HRNet_naive(**kwargs):
+    model = LiteHRNet(network_type="naive", **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def Lite_HRNet_wider_naive(**kwargs):
+    model = LiteHRNet(network_type="wider_naive", **kwargs)
+    return model
--- a/paddlers/models/ppseg/models/backbones/mix_transformer.py
+++ b/paddlers/models/ppseg/models/backbones/mix_transformer.py
@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -20,9 +20,9 @@ import paddle.nn as nn
 import paddle.nn.functional as F
 import paddle.nn.initializer as paddle_init

-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.utils import utils
-from paddlers.models.ppseg.models.backbones.transformer_utils import *
+from paddleseg.cvlibs import manager
+from paddleseg.utils import utils
+from paddleseg.models.backbones.transformer_utils import *


 class Mlp(nn.Layer):
@ -260,7 +260,7 @@ class MixVisionTransformer(nn.Layer):
    def __init__(self,
                 img_size=224,
                 patch_size=16,
-                 in_chans=3,
+                 in_channels=3,
                 num_classes=1000,
                 embed_dims=[64, 128, 256, 512],
                 num_heads=[1, 2, 4, 8],
@ -284,7 +284,7 @@ class MixVisionTransformer(nn.Layer):
            img_size=img_size,
            patch_size=7,
            stride=4,
-            in_chans=in_chans,
+            in_chans=in_channels,
            embed_dim=embed_dims[0])
        self.patch_embed2 = OverlapPatchEmbed(
            img_size=img_size // 4,
--- a/paddlers/models/ppseg/models/backbones/mobilenetv2.py
+++ b/paddlers/models/ppseg/models/backbones/mobilenetv2.py
@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -12,13 +12,26 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import paddle
+from paddle import ParamAttr
 import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.nn import Conv2D, BatchNorm, Linear, Dropout
+from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D

-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg import utils
+from paddleseg.cvlibs import manager
+from paddleseg import utils
+
+__all__ = [
+    "MobileNetV2_x0_25",
+    "MobileNetV2_x0_5",
+    "MobileNetV2_x0_75",
+    "MobileNetV2_x1_0",
+    "MobileNetV2_x1_5",
+    "MobileNetV2_x2_0",
+]


-@manager.BACKBONES.add_component
 class MobileNetV2(nn.Layer):
    """
        The MobileNetV2 implementation based on PaddlePaddle.
@ -29,69 +42,70 @@ class MobileNetV2(nn.Layer):
        (https://arxiv.org/abs/1801.04381).

        Args:
-            channel_ratio (float, optional): The ratio of channel. Default: 1.0
-            min_channel (int, optional): The minimum of channel. Default: 16
+            scale (float, optional): The scale of channel. Default: 1.0
+            in_channels (int, optional): The channels of input image. Default: 3.
            pretrained (str, optional): The path or url of pretrained model. Default: None
        """

-    def __init__(self, channel_ratio=1.0, min_channel=16, pretrained=None):
-        super(MobileNetV2, self).__init__()
-        self.channel_ratio = channel_ratio
-        self.min_channel = min_channel
+    def __init__(self, scale=1.0, in_channels=3, pretrained=None):
+        super().__init__()
+        self.scale = scale
        self.pretrained = pretrained
+        prefix_name = ""

-        self.stage0 = conv_bn(3, self.depth(32), 3, 2)
-
-        self.stage1 = InvertedResidual(self.depth(32), self.depth(16), 1, 1)
-
-        self.stage2 = nn.Sequential(
-            InvertedResidual(self.depth(16), self.depth(24), 2, 6),
-            InvertedResidual(self.depth(24), self.depth(24), 1, 6), )
-
-        self.stage3 = nn.Sequential(
-            InvertedResidual(self.depth(24), self.depth(32), 2, 6),
-            InvertedResidual(self.depth(32), self.depth(32), 1, 6),
-            InvertedResidual(self.depth(32), self.depth(32), 1, 6), )
+        bottleneck_params_list = [
+            (1, 16, 1, 1),
+            (6, 24, 2, 2),  # x4
+            (6, 32, 3, 2),  # x8
+            (6, 64, 4, 2),
+            (6, 96, 3, 1),  # x16
+            (6, 160, 3, 2),
+            (6, 320, 1, 1),  # x32
+        ]
+        self.out_index = [1, 2, 4, 6]

-        self.stage4 = nn.Sequential(
-            InvertedResidual(self.depth(32), self.depth(64), 2, 6),
-            InvertedResidual(self.depth(64), self.depth(64), 1, 6),
-            InvertedResidual(self.depth(64), self.depth(64), 1, 6),
-            InvertedResidual(self.depth(64), self.depth(64), 1, 6), )
+        self.conv1 = ConvBNLayer(
+            num_channels=in_channels,
+            num_filters=int(32 * scale),
+            filter_size=3,
+            stride=2,
+            padding=1,
+            name=prefix_name + "conv1_1")

-        self.stage5 = nn.Sequential(
-            InvertedResidual(self.depth(64), self.depth(96), 1, 6),
-            InvertedResidual(self.depth(96), self.depth(96), 1, 6),
-            InvertedResidual(self.depth(96), self.depth(96), 1, 6), )
+        self.block_list = []
+        i = 1
+        in_c = int(32 * scale)
+        for layer_setting in bottleneck_params_list:
+            t, c, n, s = layer_setting
+            i += 1
+            block = self.add_sublayer(
+                prefix_name + "conv" + str(i),
+                sublayer=InvresiBlocks(
+                    in_c=in_c,
+                    t=t,
+                    c=int(c * scale),
+                    n=n,
+                    s=s,
+                    name=prefix_name + "conv" + str(i)))
+            self.block_list.append(block)
+            in_c = int(c * scale)

-        self.stage6 = nn.Sequential(
-            InvertedResidual(self.depth(96), self.depth(160), 2, 6),
-            InvertedResidual(self.depth(160), self.depth(160), 1, 6),
-            InvertedResidual(self.depth(160), self.depth(160), 1, 6), )
-
-        self.stage7 = InvertedResidual(self.depth(160), self.depth(320), 1, 6)
+        out_channels = [
+            bottleneck_params_list[idx][1] for idx in self.out_index
+        ]
+        self.feat_channels = [int(c * scale) for c in out_channels]

        self.init_weight()

-    def depth(self, channels):
-        min_channel = min(channels, self.min_channel)
-        return max(min_channel, int(channels * self.channel_ratio))
-
-    def forward(self, x):
+    def forward(self, inputs):
        feat_list = []

-        feature_1_2 = self.stage0(x)
-        feature_1_2 = self.stage1(feature_1_2)
-        feature_1_4 = self.stage2(feature_1_2)
-        feature_1_8 = self.stage3(feature_1_4)
-        feature_1_16 = self.stage4(feature_1_8)
-        feature_1_16 = self.stage5(feature_1_16)
-        feature_1_32 = self.stage6(feature_1_16)
-        feature_1_32 = self.stage7(feature_1_32)
-        feat_list.append(feature_1_4)
-        feat_list.append(feature_1_8)
-        feat_list.append(feature_1_16)
-        feat_list.append(feature_1_32)
+        y = self.conv1(inputs, if_act=True)
+        for idx, block in enumerate(self.block_list):
+            y = block(y)
+            if idx in self.out_index:
+                feat_list.append(y)
+
        return feat_list

    def init_weight(self):
@ -99,66 +113,153 @@ class MobileNetV2(nn.Layer):
            utils.load_entire_model(self, self.pretrained)


-def conv_bn(inp, oup, kernel, stride):
-    return nn.Sequential(
-        nn.Conv2D(
-            in_channels=inp,
-            out_channels=oup,
-            kernel_size=kernel,
+class ConvBNLayer(nn.Layer):
+    def __init__(self,
+                 num_channels,
+                 filter_size,
+                 num_filters,
+                 stride,
+                 padding,
+                 channels=None,
+                 num_groups=1,
+                 name=None,
+                 use_cudnn=True):
+        super(ConvBNLayer, self).__init__()
+
+        self._conv = Conv2D(
+            in_channels=num_channels,
+            out_channels=num_filters,
+            kernel_size=filter_size,
+            stride=stride,
+            padding=padding,
+            groups=num_groups,
+            weight_attr=ParamAttr(name=name + "_weights"),
+            bias_attr=False)
+
+        self._batch_norm = BatchNorm(
+            num_filters,
+            param_attr=ParamAttr(name=name + "_bn_scale"),
+            bias_attr=ParamAttr(name=name + "_bn_offset"),
+            moving_mean_name=name + "_bn_mean",
+            moving_variance_name=name + "_bn_variance")
+
+    def forward(self, inputs, if_act=True):
+        y = self._conv(inputs)
+        y = self._batch_norm(y)
+        if if_act:
+            y = F.relu6(y)
+        return y
+
+
+class InvertedResidualUnit(nn.Layer):
+    def __init__(self, num_channels, num_in_filter, num_filters, stride,
+                 filter_size, padding, expansion_factor, name):
+        super(InvertedResidualUnit, self).__init__()
+        num_expfilter = int(round(num_in_filter * expansion_factor))
+        self._expand_conv = ConvBNLayer(
+            num_channels=num_channels,
+            num_filters=num_expfilter,
+            filter_size=1,
+            stride=1,
+            padding=0,
+            num_groups=1,
+            name=name + "_expand")
+
+        self._bottleneck_conv = ConvBNLayer(
+            num_channels=num_expfilter,
+            num_filters=num_expfilter,
+            filter_size=filter_size,
            stride=stride,
-            padding=(kernel - 1) // 2,
-            bias_attr=False),
-        nn.BatchNorm2D(
-            num_features=oup, epsilon=1e-05, momentum=0.1),
-        nn.ReLU())
-
-
-class InvertedResidual(nn.Layer):
-    def __init__(self, inp, oup, stride, expand_ratio, dilation=1):
-        super(InvertedResidual, self).__init__()
-        self.stride = stride
-        assert stride in [1, 2]
-        self.use_res_connect = self.stride == 1 and inp == oup
-
-        self.conv = nn.Sequential(
-            nn.Conv2D(
-                inp,
-                inp * expand_ratio,
-                kernel_size=1,
-                stride=1,
-                padding=0,
-                dilation=1,
-                groups=1,
-                bias_attr=False),
-            nn.BatchNorm2D(
-                num_features=inp * expand_ratio, epsilon=1e-05, momentum=0.1),
-            nn.ReLU(),
-            nn.Conv2D(
-                inp * expand_ratio,
-                inp * expand_ratio,
-                kernel_size=3,
-                stride=stride,
-                padding=dilation,
-                dilation=dilation,
-                groups=inp * expand_ratio,
-                bias_attr=False),
-            nn.BatchNorm2D(
-                num_features=inp * expand_ratio, epsilon=1e-05, momentum=0.1),
-            nn.ReLU(),
-            nn.Conv2D(
-                inp * expand_ratio,
-                oup,
-                kernel_size=1,
-                stride=1,
-                padding=0,
-                dilation=1,
-                groups=1,
-                bias_attr=False),
-            nn.BatchNorm2D(
-                num_features=oup, epsilon=1e-05, momentum=0.1), )
-
-    def forward(self, x):
-        if self.use_res_connect:
-            return x + self.conv(x)
-        else:
-            return self.conv(x)
+            padding=padding,
+            num_groups=num_expfilter,
+            use_cudnn=False,
+            name=name + "_dwise")
+
+        self._linear_conv = ConvBNLayer(
+            num_channels=num_expfilter,
+            num_filters=num_filters,
+            filter_size=1,
+            stride=1,
+            padding=0,
+            num_groups=1,
+            name=name + "_linear")
+
+    def forward(self, inputs, ifshortcut):
+        y = self._expand_conv(inputs, if_act=True)
+        y = self._bottleneck_conv(y, if_act=True)
+        y = self._linear_conv(y, if_act=False)
+        if ifshortcut:
+            y = paddle.add(inputs, y)
+        return y
+
+
+class InvresiBlocks(nn.Layer):
+    def __init__(self, in_c, t, c, n, s, name):
+        super(InvresiBlocks, self).__init__()
+
+        self._first_block = InvertedResidualUnit(
+            num_channels=in_c,
+            num_in_filter=in_c,
+            num_filters=c,
+            stride=s,
+            filter_size=3,
+            padding=1,
+            expansion_factor=t,
+            name=name + "_1")
+
+        self._block_list = []
+        for i in range(1, n):
+            block = self.add_sublayer(
+                name + "_" + str(i + 1),
+                sublayer=InvertedResidualUnit(
+                    num_channels=c,
+                    num_in_filter=c,
+                    num_filters=c,
+                    stride=1,
+                    filter_size=3,
+                    padding=1,
+                    expansion_factor=t,
+                    name=name + "_" + str(i + 1)))
+            self._block_list.append(block)
+
+    def forward(self, inputs):
+        y = self._first_block(inputs, ifshortcut=False)
+        for block in self._block_list:
+            y = block(y, ifshortcut=True)
+        return y
+
+
+@manager.BACKBONES.add_component
+def MobileNetV2_x0_25(**kwargs):
+    model = MobileNetV2(scale=0.25, **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def MobileNetV2_x0_5(**kwargs):
+    model = MobileNetV2(scale=0.5, **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def MobileNetV2_x0_75(**kwargs):
+    model = MobileNetV2(scale=0.75, **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def MobileNetV2_x1_0(**kwargs):
+    model = MobileNetV2(scale=1.0, **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def MobileNetV2_x1_5(**kwargs):
+    model = MobileNetV2(scale=1.5, **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def MobileNetV2_x2_0(**kwargs):
+    model = MobileNetV2(scale=2.0, **kwargs)
+    return model
--- a/paddlers/models/ppseg/models/backbones/mobilenetv3.py
+++ b/paddlers/models/ppseg/models/backbones/mobilenetv3.py
@ -1,4 +1,4 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -14,11 +14,13 @@

 import paddle
 import paddle.nn as nn
-import paddle.nn.functional as F
+from paddle import ParamAttr
+from paddle.regularizer import L2Decay
+from paddle.nn import AdaptiveAvgPool2D, BatchNorm, Conv2D, Dropout, Linear

-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.utils import utils
-from paddlers.models.ppseg.models import layers
+from paddleseg.cvlibs import manager
+from paddleseg.utils import utils, logger
+from paddleseg.models import layers

 __all__ = [
    "MobileNetV3_small_x0_35", "MobileNetV3_small_x0_5",
@ -28,8 +30,92 @@ __all__ = [
    "MobileNetV3_large_x1_0", "MobileNetV3_large_x1_25"
 ]

-
-def make_divisible(v, divisor=8, min_value=None):
+MODEL_STAGES_PATTERN = {
+    "MobileNetV3_small": ["blocks[0]", "blocks[2]", "blocks[7]", "blocks[10]"],
+    "MobileNetV3_large":
+    ["blocks[0]", "blocks[2]", "blocks[5]", "blocks[11]", "blocks[14]"]
+}
+
+# "large", "small" is just for MobinetV3_large, MobileNetV3_small respectively.
+# The type of "large" or "small" config is a list. Each element(list) represents a depthwise block, which is composed of k, exp, se, act, s.
+# k: kernel_size
+# exp: middle channel number in depthwise block
+# c: output channel number in depthwise block
+# se: whether to use SE block
+# act: which activation to use
+# s: stride in depthwise block
+# d: dilation rate in depthwise block
+NET_CONFIG = {
+    "large": [
+        # k, exp, c, se, act, s
+        [3, 16, 16, False, "relu", 1],
+        [3, 64, 24, False, "relu", 2],
+        [3, 72, 24, False, "relu", 1],  # x4
+        [5, 72, 40, True, "relu", 2],
+        [5, 120, 40, True, "relu", 1],
+        [5, 120, 40, True, "relu", 1],  # x8
+        [3, 240, 80, False, "hardswish", 2],
+        [3, 200, 80, False, "hardswish", 1],
+        [3, 184, 80, False, "hardswish", 1],
+        [3, 184, 80, False, "hardswish", 1],
+        [3, 480, 112, True, "hardswish", 1],
+        [3, 672, 112, True, "hardswish", 1],  # x16
+        [5, 672, 160, True, "hardswish", 2],
+        [5, 960, 160, True, "hardswish", 1],
+        [5, 960, 160, True, "hardswish", 1],  # x32
+    ],
+    "small": [
+        # k, exp, c, se, act, s
+        [3, 16, 16, True, "relu", 2],
+        [3, 72, 24, False, "relu", 2],
+        [3, 88, 24, False, "relu", 1],
+        [5, 96, 40, True, "hardswish", 2],
+        [5, 240, 40, True, "hardswish", 1],
+        [5, 240, 40, True, "hardswish", 1],
+        [5, 120, 48, True, "hardswish", 1],
+        [5, 144, 48, True, "hardswish", 1],
+        [5, 288, 96, True, "hardswish", 2],
+        [5, 576, 96, True, "hardswish", 1],
+        [5, 576, 96, True, "hardswish", 1],
+    ],
+    "large_os8": [
+        # k, exp, c, se, act, s, {d}
+        [3, 16, 16, False, "relu", 1],
+        [3, 64, 24, False, "relu", 2],
+        [3, 72, 24, False, "relu", 1],  # x4
+        [5, 72, 40, True, "relu", 2],
+        [5, 120, 40, True, "relu", 1],
+        [5, 120, 40, True, "relu", 1],  # x8
+        [3, 240, 80, False, "hardswish", 1],
+        [3, 200, 80, False, "hardswish", 1, 2],
+        [3, 184, 80, False, "hardswish", 1, 2],
+        [3, 184, 80, False, "hardswish", 1, 2],
+        [3, 480, 112, True, "hardswish", 1, 2],
+        [3, 672, 112, True, "hardswish", 1, 2],
+        [5, 672, 160, True, "hardswish", 1, 2],
+        [5, 960, 160, True, "hardswish", 1, 4],
+        [5, 960, 160, True, "hardswish", 1, 4],
+    ],
+    "small_os8": [
+        # k, exp, c, se, act, s, {d}
+        [3, 16, 16, True, "relu", 2],
+        [3, 72, 24, False, "relu", 2],
+        [3, 88, 24, False, "relu", 1],
+        [5, 96, 40, True, "hardswish", 1],
+        [5, 240, 40, True, "hardswish", 1, 2],
+        [5, 240, 40, True, "hardswish", 1, 2],
+        [5, 120, 48, True, "hardswish", 1, 2],
+        [5, 144, 48, True, "hardswish", 1, 2],
+        [5, 288, 96, True, "hardswish", 1, 2],
+        [5, 576, 96, True, "hardswish", 1, 4],
+        [5, 576, 96, True, "hardswish", 1, 4],
+    ]
+}
+
+OUT_INDEX = {"large": [2, 5, 11, 14], "small": [0, 2, 7, 10]}
+
+
+def _make_divisible(v, divisor=8, min_value=None):
    if min_value is None:
        min_value = divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
@ -38,156 +124,113 @@ def make_divisible(v, divisor=8, min_value=None):
    return new_v


-class MobileNetV3(nn.Layer):
-    """
-    The MobileNetV3 implementation based on PaddlePaddle.
+def _create_act(act):
+    if act == "hardswish":
+        return nn.Hardswish()
+    elif act == "relu":
+        return nn.ReLU()
+    elif act is None:
+        return None
+    else:
+        raise RuntimeError(
+            "The activation function is not supported: {}".format(act))

-    The original article refers to Jingdong
-    Andrew Howard, et, al. "Searching for MobileNetV3"
-    (https://arxiv.org/pdf/1905.02244.pdf).

+class MobileNetV3(nn.Layer):
+    """
+    MobileNetV3
    Args:
-        pretrained (str, optional): The path of pretrained model.
-        scale (float, optional): The scale of channels . Default: 1.0.
-        model_name (str, optional): Model name. It determines the type of MobileNetV3. The value is 'small' or 'large'. Defualt: 'small'.
-        output_stride (int, optional): The stride of output features compared to input images. The value should be one of (2, 4, 8, 16, 32). Default: None.
-
+        config: list. MobileNetV3 depthwise blocks config.
+        in_channels (int, optional): The channels of input image. Default: 3.
+        scale: float=1.0. The coefficient that controls the size of network parameters. 
+    Returns:
+        model: nn.Layer. Specific MobileNetV3 model depends on args.
    """

    def __init__(self,
-                 pretrained=None,
+                 config,
+                 stages_pattern,
+                 out_index,
+                 in_channels=3,
                 scale=1.0,
-                 model_name="small",
-                 output_stride=None):
-        super(MobileNetV3, self).__init__()
+                 pretrained=None):
+        super().__init__()

+        self.cfg = config
+        self.out_index = out_index
+        self.scale = scale
+        self.pretrained = pretrained
        inplanes = 16
-        if model_name == "large":
-            self.cfg = [
-                # k, exp, c,  se,     nl,  s,
-                [3, 16, 16, False, "relu", 1],
-                [3, 64, 24, False, "relu", 2],
-                [3, 72, 24, False, "relu", 1],  # output 1 -> out_index=2
-                [5, 72, 40, True, "relu", 2],
-                [5, 120, 40, True, "relu", 1],
-                [5, 120, 40, True, "relu", 1],  # output 2 -> out_index=5
-                [3, 240, 80, False, "hard_swish", 2],
-                [3, 200, 80, False, "hard_swish", 1],
-                [3, 184, 80, False, "hard_swish", 1],
-                [3, 184, 80, False, "hard_swish", 1],
-                [3, 480, 112, True, "hard_swish", 1],
-                [3, 672, 112, True, "hard_swish",
-                 1],  # output 3 -> out_index=11
-                [5, 672, 160, True, "hard_swish", 2],
-                [5, 960, 160, True, "hard_swish", 1],
-                [5, 960, 160, True, "hard_swish",
-                 1],  # output 3 -> out_index=14
-            ]
-            self.out_indices = [2, 5, 11, 14]
-            self.feat_channels = [
-                make_divisible(i * scale) for i in [24, 40, 112, 160]
-            ]
-
-            self.cls_ch_squeeze = 960
-            self.cls_ch_expand = 1280
-        elif model_name == "small":
-            self.cfg = [
-                # k, exp, c,  se,     nl,  s,
-                [3, 16, 16, True, "relu", 2],  # output 1 -> out_index=0
-                [3, 72, 24, False, "relu", 2],
-                [3, 88, 24, False, "relu", 1],  # output 2 -> out_index=3
-                [5, 96, 40, True, "hard_swish", 2],
-                [5, 240, 40, True, "hard_swish", 1],
-                [5, 240, 40, True, "hard_swish", 1],
-                [5, 120, 48, True, "hard_swish", 1],
-                [5, 144, 48, True, "hard_swish", 1],  # output 3 -> out_index=7
-                [5, 288, 96, True, "hard_swish", 2],
-                [5, 576, 96, True, "hard_swish", 1],
-                [5, 576, 96, True, "hard_swish", 1],  # output 4 -> out_index=10
-            ]
-            self.out_indices = [0, 3, 7, 10]
-            self.feat_channels = [
-                make_divisible(i * scale) for i in [16, 24, 48, 96]
-            ]
-
-            self.cls_ch_squeeze = 576
-            self.cls_ch_expand = 1280
-        else:
-            raise NotImplementedError(
-                "mode[{}_model] is not implemented!".format(model_name))
-
-        ###################################################
-        # modify stride and dilation based on output_stride
-        self.dilation_cfg = [1] * len(self.cfg)
-        self.modify_bottle_params(output_stride=output_stride)
-        ###################################################
-
-        self.conv1 = ConvBNLayer(
-            in_c=3,
-            out_c=make_divisible(inplanes * scale),
+
+        self.conv = ConvBNLayer(
+            in_c=in_channels,
+            out_c=_make_divisible(inplanes * self.scale),
            filter_size=3,
            stride=2,
            padding=1,
            num_groups=1,
            if_act=True,
-            act="hard_swish")
-
-        self.block_list = []
-
-        inplanes = make_divisible(inplanes * scale)
-        for i, (k, exp, c, se, nl, s) in enumerate(self.cfg):
-            ######################################
-            # add dilation rate
-            dilation_rate = self.dilation_cfg[i]
-            ######################################
-            self.block_list.append(
-                ResidualUnit(
-                    in_c=inplanes,
-                    mid_c=make_divisible(scale * exp),
-                    out_c=make_divisible(scale * c),
-                    filter_size=k,
-                    stride=s,
-                    dilation=dilation_rate,
-                    use_se=se,
-                    act=nl,
-                    name="conv" + str(i + 2)))
-            self.add_sublayer(
-                sublayer=self.block_list[-1], name="conv" + str(i + 2))
-            inplanes = make_divisible(scale * c)
-
-        self.pretrained = pretrained
+            act="hardswish")
+        self.blocks = nn.Sequential(*[
+            ResidualUnit(
+                in_c=_make_divisible(inplanes * self.scale if i == 0 else
+                                     self.cfg[i - 1][2] * self.scale),
+                mid_c=_make_divisible(self.scale * exp),
+                out_c=_make_divisible(self.scale * c),
+                filter_size=k,
+                stride=s,
+                use_se=se,
+                act=act,
+                dilation=td[0] if td else 1)
+            for i, (k, exp, c, se, act, s, *td) in enumerate(self.cfg)
+        ])
+
+        out_channels = [config[idx][2] for idx in self.out_index]
+        self.feat_channels = [
+            _make_divisible(self.scale * c) for c in out_channels
+        ]
+
+        self.init_res(stages_pattern)
        self.init_weight()

-    def modify_bottle_params(self, output_stride=None):
-
-        if output_stride is not None and output_stride % 2 != 0:
-            raise ValueError("output stride must to be even number")
-        if output_stride is not None:
-            stride = 2
-            rate = 1
-            for i, _cfg in enumerate(self.cfg):
-                stride = stride * _cfg[-1]
-                if stride > output_stride:
-                    rate = rate * _cfg[-1]
-                    self.cfg[i][-1] = 1
+    def init_weight(self):
+        if self.pretrained is not None:
+            utils.load_entire_model(self, self.pretrained)
+
+    def init_res(self, stages_pattern, return_patterns=None,
+                 return_stages=None):
+        if return_patterns and return_stages:
+            msg = f"The 'return_patterns' would be ignored when 'return_stages' is set."
+            logger.warning(msg)
+            return_stages = None
+
+        if return_stages is True:
+            return_patterns = stages_pattern
+        # return_stages is int or bool
+        if type(return_stages) is int:
+            return_stages = [return_stages]
+        if isinstance(return_stages, list):
+            if max(return_stages) > len(stages_pattern) or min(
+                    return_stages) < 0:
+                msg = f"The 'return_stages' set error. Illegal value(s) have been ignored. The stages' pattern list is {stages_pattern}."
+                logger.warning(msg)
+                return_stages = [
+                    val for val in return_stages
+                    if val >= 0 and val < len(stages_pattern)
+                ]
+            return_patterns = [stages_pattern[i] for i in return_stages]

-                self.dilation_cfg[i] = rate
+    def forward(self, x):
+        x = self.conv(x)

-    def forward(self, inputs, label=None):
-        x = self.conv1(inputs)
-        # A feature list saves each downsampling feature.
        feat_list = []
-        for i, block in enumerate(self.block_list):
+        for idx, block in enumerate(self.blocks):
            x = block(x)
-            if i in self.out_indices:
+            if idx in self.out_index:
                feat_list.append(x)

        return feat_list

-    def init_weight(self):
-        if self.pretrained is not None:
-            utils.load_pretrained_model(self, self.pretrained)
-

 class ConvBNLayer(nn.Layer):
    def __init__(self,
@ -196,36 +239,34 @@ class ConvBNLayer(nn.Layer):
                 filter_size,
                 stride,
                 padding,
-                 dilation=1,
                 num_groups=1,
                 if_act=True,
-                 act=None):
-        super(ConvBNLayer, self).__init__()
-        self.if_act = if_act
-        self.act = act
+                 act=None,
+                 dilation=1):
+        super().__init__()

-        self.conv = nn.Conv2D(
+        self.conv = Conv2D(
            in_channels=in_c,
            out_channels=out_c,
            kernel_size=filter_size,
            stride=stride,
            padding=padding,
-            dilation=dilation,
            groups=num_groups,
-            bias_attr=False)
-        self.bn = layers.SyncBatchNorm(
-            num_features=out_c,
-            weight_attr=paddle.ParamAttr(
-                regularizer=paddle.regularizer.L2Decay(0.0)),
-            bias_attr=paddle.ParamAttr(
-                regularizer=paddle.regularizer.L2Decay(0.0)))
-        self._act_op = layers.Activation(act='hardswish')
+            bias_attr=False,
+            dilation=dilation)
+        self.bn = BatchNorm(
+            num_channels=out_c,
+            act=None,
+            param_attr=ParamAttr(regularizer=L2Decay(0.0)),
+            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
+        self.if_act = if_act
+        self.act = _create_act(act)

    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        if self.if_act:
-            x = self._act_op(x)
+            x = self.act(x)
        return x


@ -237,10 +278,9 @@ class ResidualUnit(nn.Layer):
                 filter_size,
                 stride,
                 use_se,
-                 dilation=1,
                 act=None,
-                 name=''):
-        super(ResidualUnit, self).__init__()
+                 dilation=1):
+        super().__init__()
        self.if_shortcut = stride == 1 and in_c == out_c
        self.if_se = use_se

@ -252,19 +292,18 @@ class ResidualUnit(nn.Layer):
            padding=0,
            if_act=True,
            act=act)
-
        self.bottleneck_conv = ConvBNLayer(
            in_c=mid_c,
            out_c=mid_c,
            filter_size=filter_size,
            stride=stride,
-            padding='same',
-            dilation=dilation,
+            padding=int((filter_size - 1) // 2) * dilation,
            num_groups=mid_c,
            if_act=True,
-            act=act)
+            act=act,
+            dilation=dilation)
        if self.if_se:
-            self.mid_se = SEModule(mid_c, name=name + "_se")
+            self.mid_se = SEModule(mid_c)
        self.linear_conv = ConvBNLayer(
            in_c=mid_c,
            out_c=out_c,
@ -273,92 +312,187 @@ class ResidualUnit(nn.Layer):
            padding=0,
            if_act=False,
            act=None)
-        self.dilation = dilation

-    def forward(self, inputs):
-        x = self.expand_conv(inputs)
+    def forward(self, x):
+        identity = x
+        x = self.expand_conv(x)
        x = self.bottleneck_conv(x)
        if self.if_se:
            x = self.mid_se(x)
        x = self.linear_conv(x)
        if self.if_shortcut:
-            x = inputs + x
+            x = paddle.add(identity, x)
        return x


+# nn.Hardsigmoid can't transfer "slope" and "offset" in nn.functional.hardsigmoid
+class Hardsigmoid(nn.Layer):
+    def __init__(self, slope=0.2, offset=0.5):
+        super().__init__()
+        self.slope = slope
+        self.offset = offset
+
+    def forward(self, x):
+        return nn.functional.hardsigmoid(
+            x, slope=self.slope, offset=self.offset)
+
+
 class SEModule(nn.Layer):
-    def __init__(self, channel, reduction=4, name=""):
-        super(SEModule, self).__init__()
-        self.avg_pool = nn.AdaptiveAvgPool2D(1)
-        self.conv1 = nn.Conv2D(
+    def __init__(self, channel, reduction=4):
+        super().__init__()
+        self.avg_pool = AdaptiveAvgPool2D(1)
+        self.conv1 = Conv2D(
            in_channels=channel,
            out_channels=channel // reduction,
            kernel_size=1,
            stride=1,
            padding=0)
-        self.conv2 = nn.Conv2D(
+        self.relu = nn.ReLU()
+        self.conv2 = Conv2D(
            in_channels=channel // reduction,
            out_channels=channel,
            kernel_size=1,
            stride=1,
            padding=0)
+        self.hardsigmoid = Hardsigmoid(slope=0.2, offset=0.5)

-    def forward(self, inputs):
-        outputs = self.avg_pool(inputs)
-        outputs = self.conv1(outputs)
-        outputs = F.relu(outputs)
-        outputs = self.conv2(outputs)
-        outputs = F.hardsigmoid(outputs)
-        return paddle.multiply(x=inputs, y=outputs)
+    def forward(self, x):
+        identity = x
+        x = self.avg_pool(x)
+        x = self.conv1(x)
+        x = self.relu(x)
+        x = self.conv2(x)
+        x = self.hardsigmoid(x)
+        return paddle.multiply(x=identity, y=x)


+@manager.BACKBONES.add_component
 def MobileNetV3_small_x0_35(**kwargs):
-    model = MobileNetV3(model_name="small", scale=0.35, **kwargs)
+    model = MobileNetV3(
+        config=NET_CONFIG["small"],
+        scale=0.35,
+        stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_small"],
+        out_index=OUT_INDEX["small"],
+        **kwargs)
    return model


+@manager.BACKBONES.add_component
 def MobileNetV3_small_x0_5(**kwargs):
-    model = MobileNetV3(model_name="small", scale=0.5, **kwargs)
+    model = MobileNetV3(
+        config=NET_CONFIG["small"],
+        scale=0.5,
+        stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_small"],
+        out_index=OUT_INDEX["small"],
+        **kwargs)
    return model


+@manager.BACKBONES.add_component
 def MobileNetV3_small_x0_75(**kwargs):
-    model = MobileNetV3(model_name="small", scale=0.75, **kwargs)
+    model = MobileNetV3(
+        config=NET_CONFIG["small"],
+        scale=0.75,
+        stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_small"],
+        out_index=OUT_INDEX["small"],
+        **kwargs)
    return model


@manager.BACKBONES.add_component
 def MobileNetV3_small_x1_0(**kwargs):
-    model = MobileNetV3(model_name="small", scale=1.0, **kwargs)
+    model = MobileNetV3(
+        config=NET_CONFIG["small"],
+        scale=1.0,
+        stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_small"],
+        out_index=OUT_INDEX["small"],
+        **kwargs)
    return model


+@manager.BACKBONES.add_component
 def MobileNetV3_small_x1_25(**kwargs):
-    model = MobileNetV3(model_name="small", scale=1.25, **kwargs)
+    model = MobileNetV3(
+        config=NET_CONFIG["small"],
+        scale=1.25,
+        stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_small"],
+        out_index=OUT_INDEX["small"],
+        **kwargs)
    return model


+@manager.BACKBONES.add_component
 def MobileNetV3_large_x0_35(**kwargs):
-    model = MobileNetV3(model_name="large", scale=0.35, **kwargs)
+    model = MobileNetV3(
+        config=NET_CONFIG["large"],
+        scale=0.35,
+        stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_small"],
+        out_index=OUT_INDEX["large"],
+        **kwargs)
    return model


+@manager.BACKBONES.add_component
 def MobileNetV3_large_x0_5(**kwargs):
-    model = MobileNetV3(model_name="large", scale=0.5, **kwargs)
+    model = MobileNetV3(
+        config=NET_CONFIG["large"],
+        scale=0.5,
+        stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_large"],
+        out_index=OUT_INDEX["large"],
+        **kwargs)
    return model


+@manager.BACKBONES.add_component
 def MobileNetV3_large_x0_75(**kwargs):
-    model = MobileNetV3(model_name="large", scale=0.75, **kwargs)
+    model = MobileNetV3(
+        config=NET_CONFIG["large"],
+        scale=0.75,
+        stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_large"],
+        out_index=OUT_INDEX["large"],
+        **kwargs)
    return model


@manager.BACKBONES.add_component
 def MobileNetV3_large_x1_0(**kwargs):
-    model = MobileNetV3(model_name="large", scale=1.0, **kwargs)
+    model = MobileNetV3(
+        config=NET_CONFIG["large"],
+        scale=1.0,
+        stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_large"],
+        out_index=OUT_INDEX["large"],
+        **kwargs)
    return model


+@manager.BACKBONES.add_component
 def MobileNetV3_large_x1_25(**kwargs):
-    model = MobileNetV3(model_name="large", scale=1.25, **kwargs)
+    model = MobileNetV3(
+        config=NET_CONFIG["large"],
+        scale=1.25,
+        stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_large"],
+        out_index=OUT_INDEX["large"],
+        **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def MobileNetV3_large_x1_0_os8(**kwargs):
+    model = MobileNetV3(
+        config=NET_CONFIG["large_os8"],
+        scale=1.0,
+        stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_large"],
+        out_index=OUT_INDEX["large"],
+        **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def MobileNetV3_small_x1_0_os8(**kwargs):
+    model = MobileNetV3(
+        config=NET_CONFIG["small_os8"],
+        scale=1.0,
+        stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_small"],
+        out_index=OUT_INDEX["small"],
+        **kwargs)
    return model
--- a/paddlers/models/ppseg/models/backbones/resnet_vd.py
+++ b/paddlers/models/ppseg/models/backbones/resnet_vd.py
@ -1,4 +1,4 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -16,9 +16,9 @@ import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.models import layers
-from paddlers.models.ppseg.utils import utils
+from paddleseg.cvlibs import manager
+from paddleseg.models import layers
+from paddleseg.utils import utils

 __all__ = [
    "ResNet18_vd", "ResNet34_vd", "ResNet50_vd", "ResNet101_vd", "ResNet152_vd"
@ -206,15 +206,16 @@ class ResNet_vd(nn.Layer):
        layers (int, optional): The layers of ResNet_vd. The supported layers are (18, 34, 50, 101, 152, 200). Default: 50.
        output_stride (int, optional): The stride of output features compared to input images. It is 8 or 16. Default: 8.
        multi_grid (tuple|list, optional): The grid of stage4. Defult: (1, 1, 1).
+        in_channels (int, optional): The channels of input image. Default: 3.
        pretrained (str, optional): The path of pretrained model.

    """

    def __init__(self,
-                 input_channel=3,
                 layers=50,
                 output_stride=8,
                 multi_grid=(1, 1, 1),
+                 in_channels=3,
                 pretrained=None,
                 data_format='NCHW'):
        super(ResNet_vd, self).__init__()
@ -252,7 +253,7 @@ class ResNet_vd(nn.Layer):
            dilation_dict = {3: 2}

        self.conv1_1 = ConvBNLayer(
-            in_channels=input_channel,
+            in_channels=in_channels,
            out_channels=32,
            kernel_size=3,
            stride=2,
--- a/paddlers/models/ppseg/models/backbones/shufflenetv2.py
+++ b/paddlers/models/ppseg/models/backbones/shufflenetv2.py
@ -0,0 +1,315 @@
+# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddle import ParamAttr, reshape, transpose, concat, split
+from paddle.nn import Layer, Conv2D, MaxPool2D, AdaptiveAvgPool2D, BatchNorm, Linear
+from paddle.nn.initializer import KaimingNormal
+from paddle.nn.functional import swish
+
+from paddleseg.cvlibs import manager
+from paddleseg.utils import utils, logger
+
+__all__ = [
+    'ShuffleNetV2_x0_25', 'ShuffleNetV2_x0_33', 'ShuffleNetV2_x0_5',
+    'ShuffleNetV2_x1_0', 'ShuffleNetV2_x1_5', 'ShuffleNetV2_x2_0',
+    'ShuffleNetV2_swish'
+]
+
+
+def channel_shuffle(x, groups):
+    x_shape = paddle.shape(x)
+    batch_size, height, width = x_shape[0], x_shape[2], x_shape[3]
+    num_channels = x.shape[1]
+    channels_per_group = num_channels // groups
+
+    # reshape
+    x = reshape(
+        x=x, shape=[batch_size, groups, channels_per_group, height, width])
+
+    # transpose
+    x = transpose(x=x, perm=[0, 2, 1, 3, 4])
+
+    # flatten
+    x = reshape(x=x, shape=[batch_size, num_channels, height, width])
+
+    return x
+
+
+class ConvBNLayer(Layer):
+    def __init__(
+            self,
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            groups=1,
+            act=None,
+            name=None, ):
+        super(ConvBNLayer, self).__init__()
+        self._conv = Conv2D(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            groups=groups,
+            weight_attr=ParamAttr(
+                initializer=KaimingNormal(), name=name + "_weights"),
+            bias_attr=False)
+
+        self._batch_norm = BatchNorm(
+            out_channels,
+            param_attr=ParamAttr(name=name + "_bn_scale"),
+            bias_attr=ParamAttr(name=name + "_bn_offset"),
+            act=act,
+            moving_mean_name=name + "_bn_mean",
+            moving_variance_name=name + "_bn_variance")
+
+    def forward(self, inputs):
+        y = self._conv(inputs)
+        y = self._batch_norm(y)
+        return y
+
+
+class InvertedResidual(Layer):
+    def __init__(self, in_channels, out_channels, stride, act="relu",
+                 name=None):
+        super(InvertedResidual, self).__init__()
+        self._conv_pw = ConvBNLayer(
+            in_channels=in_channels // 2,
+            out_channels=out_channels // 2,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            act=act,
+            name='stage_' + name + '_conv1')
+        self._conv_dw = ConvBNLayer(
+            in_channels=out_channels // 2,
+            out_channels=out_channels // 2,
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            groups=out_channels // 2,
+            act=None,
+            name='stage_' + name + '_conv2')
+        self._conv_linear = ConvBNLayer(
+            in_channels=out_channels // 2,
+            out_channels=out_channels // 2,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            act=act,
+            name='stage_' + name + '_conv3')
+
+    def forward(self, inputs):
+        x1, x2 = split(
+            inputs,
+            num_or_sections=[inputs.shape[1] // 2, inputs.shape[1] // 2],
+            axis=1)
+        x2 = self._conv_pw(x2)
+        x2 = self._conv_dw(x2)
+        x2 = self._conv_linear(x2)
+        out = concat([x1, x2], axis=1)
+        return channel_shuffle(out, 2)
+
+
+class InvertedResidualDS(Layer):
+    def __init__(self, in_channels, out_channels, stride, act="relu",
+                 name=None):
+        super(InvertedResidualDS, self).__init__()
+
+        # branch1
+        self._conv_dw_1 = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=in_channels,
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            groups=in_channels,
+            act=None,
+            name='stage_' + name + '_conv4')
+        self._conv_linear_1 = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=out_channels // 2,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            act=act,
+            name='stage_' + name + '_conv5')
+        # branch2
+        self._conv_pw_2 = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=out_channels // 2,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            act=act,
+            name='stage_' + name + '_conv1')
+        self._conv_dw_2 = ConvBNLayer(
+            in_channels=out_channels // 2,
+            out_channels=out_channels // 2,
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            groups=out_channels // 2,
+            act=None,
+            name='stage_' + name + '_conv2')
+        self._conv_linear_2 = ConvBNLayer(
+            in_channels=out_channels // 2,
+            out_channels=out_channels // 2,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            act=act,
+            name='stage_' + name + '_conv3')
+
+    def forward(self, inputs):
+        x1 = self._conv_dw_1(inputs)
+        x1 = self._conv_linear_1(x1)
+        x2 = self._conv_pw_2(inputs)
+        x2 = self._conv_dw_2(x2)
+        x2 = self._conv_linear_2(x2)
+        out = concat([x1, x2], axis=1)
+
+        return channel_shuffle(out, 2)
+
+
+class ShuffleNet(Layer):
+    def __init__(self, scale=1.0, act="relu", in_channels=3, pretrained=None):
+        super(ShuffleNet, self).__init__()
+        self.scale = scale
+        self.pretrained = pretrained
+        stage_repeats = [4, 8, 4]
+
+        if scale == 0.25:
+            stage_out_channels = [-1, 24, 24, 48, 96, 512]
+        elif scale == 0.33:
+            stage_out_channels = [-1, 24, 32, 64, 128, 512]
+        elif scale == 0.5:
+            stage_out_channels = [-1, 24, 48, 96, 192, 1024]
+        elif scale == 1.0:
+            stage_out_channels = [-1, 24, 116, 232, 464, 1024]
+        elif scale == 1.5:
+            stage_out_channels = [-1, 24, 176, 352, 704, 1024]
+        elif scale == 2.0:
+            stage_out_channels = [-1, 24, 224, 488, 976, 2048]
+        else:
+            raise NotImplementedError("This scale size:[" + str(scale) +
+                                      "] is not implemented!")
+
+        self.out_index = [3, 11, 15]
+        self.feat_channels = stage_out_channels[1:5]
+
+        # 1. conv1
+        self._conv1 = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=stage_out_channels[1],
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            act=act,
+            name='stage1_conv')
+        self._max_pool = MaxPool2D(kernel_size=3, stride=2, padding=1)
+
+        # 2. bottleneck sequences
+        self._block_list = []
+        for stage_id, num_repeat in enumerate(stage_repeats):
+            for i in range(num_repeat):
+                if i == 0:
+                    block = self.add_sublayer(
+                        name=str(stage_id + 2) + '_' + str(i + 1),
+                        sublayer=InvertedResidualDS(
+                            in_channels=stage_out_channels[stage_id + 1],
+                            out_channels=stage_out_channels[stage_id + 2],
+                            stride=2,
+                            act=act,
+                            name=str(stage_id + 2) + '_' + str(i + 1)))
+                else:
+                    block = self.add_sublayer(
+                        name=str(stage_id + 2) + '_' + str(i + 1),
+                        sublayer=InvertedResidual(
+                            in_channels=stage_out_channels[stage_id + 2],
+                            out_channels=stage_out_channels[stage_id + 2],
+                            stride=1,
+                            act=act,
+                            name=str(stage_id + 2) + '_' + str(i + 1)))
+                self._block_list.append(block)
+
+        self.init_weight()
+
+    def init_weight(self):
+        if self.pretrained is not None:
+            utils.load_entire_model(self, self.pretrained)
+
+    def forward(self, inputs):
+        feat_list = []
+
+        y = self._conv1(inputs)
+        y = self._max_pool(y)
+        feat_list.append(y)
+
+        for idx, inv in enumerate(self._block_list):
+            y = inv(y)
+            if idx in self.out_index:
+                feat_list.append(y)
+        return feat_list
+
+
+@manager.BACKBONES.add_component
+def ShuffleNetV2_x0_25(**kwargs):
+    model = ShuffleNet(scale=0.25, **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def ShuffleNetV2_x0_33(**kwargs):
+    model = ShuffleNet(scale=0.33, **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def ShuffleNetV2_x0_5(**kwargs):
+    model = ShuffleNet(scale=0.5, **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def ShuffleNetV2_x1_0(**kwargs):
+    model = ShuffleNet(scale=1.0, **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def ShuffleNetV2_x1_5(**kwargs):
+    model = ShuffleNet(scale=1.5, **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def ShuffleNetV2_x2_0(**kwargs):
+    model = ShuffleNet(scale=2.0, **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def ShuffleNetV2_swish(**kwargs):
+    model = ShuffleNet(scale=1.0, act="swish", **kwargs)
+    return model
--- a/paddlers/models/ppseg/models/backbones/stdcnet.py
+++ b/paddlers/models/ppseg/models/backbones/stdcnet.py
@ -17,9 +17,9 @@ import math
 import paddle
 import paddle.nn as nn

-from paddlers.models.ppseg.utils import utils
-from paddlers.models.ppseg.cvlibs import manager, param_init
-from paddlers.models.ppseg.models.layers.layer_libs import SyncBatchNorm
+from paddleseg.utils import utils
+from paddleseg.cvlibs import manager, param_init
+from paddleseg.models.layers.layer_libs import SyncBatchNorm

 __all__ = ["STDC1", "STDC2"]

@ -37,9 +37,9 @@ class STDCNet(nn.Layer):
        layers(list, optional): layers numbers list. It determines STDC block numbers of STDCNet's stage3\4\5. Defualt: [4, 5, 3].
        block_num(int,optional): block_num of features block. Default: 4.
        type(str,optional): feature fusion method "cat"/"add". Default: "cat".
-        num_classes(int, optional): class number for image classification. Default: 1000.
-        dropout(float,optional): dropout ratio. if >0,use dropout ratio.  Default: 0.20.
-        use_conv_last(bool,optional): whether to use the last ConvBNReLU layer . Default: False.
+        relative_lr(float,optional): parameters here receive a different learning rate when updating. The effective 
+            learning rate is the prodcut of relative_lr and the global learning rate. Default: 1.0. 
+        in_channels (int, optional): The channels of input image. Default: 3.
        pretrained(str, optional): the path of pretrained model.
    """

@ -48,34 +48,18 @@ class STDCNet(nn.Layer):
                 layers=[4, 5, 3],
                 block_num=4,
                 type="cat",
-                 num_classes=1000,
-                 dropout=0.20,
-                 use_conv_last=False,
+                 relative_lr=1.0,
+                 in_channels=3,
                 pretrained=None):
        super(STDCNet, self).__init__()
        if type == "cat":
            block = CatBottleneck
        elif type == "add":
            block = AddBottleneck
-        self.use_conv_last = use_conv_last
-        self.features = self._make_layers(base, layers, block_num, block)
-        self.conv_last = ConvBNRelu(base * 16, max(1024, base * 16), 1, 1)
-
-        if (layers == [4, 5, 3]):  #stdc1446
-            self.x2 = nn.Sequential(self.features[:1])
-            self.x4 = nn.Sequential(self.features[1:2])
-            self.x8 = nn.Sequential(self.features[2:6])
-            self.x16 = nn.Sequential(self.features[6:11])
-            self.x32 = nn.Sequential(self.features[11:])
-        elif (layers == [2, 2, 2]):  #stdc813
-            self.x2 = nn.Sequential(self.features[:1])
-            self.x4 = nn.Sequential(self.features[1:2])
-            self.x8 = nn.Sequential(self.features[2:4])
-            self.x16 = nn.Sequential(self.features[4:6])
-            self.x32 = nn.Sequential(self.features[6:])
-        else:
-            raise NotImplementedError(
-                "model with layers:{} is not implemented!".format(layers))
+        self.layers = layers
+        self.feat_channels = [base // 2, base, base * 4, base * 8, base * 16]
+        self.features = self._make_layers(in_channels, base, layers, block_num,
+                                          block, relative_lr)

        self.pretrained = pretrained
        self.init_weight()
@ -84,32 +68,42 @@ class STDCNet(nn.Layer):
        """
        forward function for feature extract.
        """
-        feat2 = self.x2(x)
-        feat4 = self.x4(feat2)
-        feat8 = self.x8(feat4)
-        feat16 = self.x16(feat8)
-        feat32 = self.x32(feat16)
-        if self.use_conv_last:
-            feat32 = self.conv_last(feat32)
-        return feat2, feat4, feat8, feat16, feat32
-
-    def _make_layers(self, base, layers, block_num, block):
+        out_feats = []
+
+        x = self.features[0](x)
+        out_feats.append(x)
+        x = self.features[1](x)
+        out_feats.append(x)
+
+        idx = [[2, 2 + self.layers[0]],
+               [2 + self.layers[0], 2 + sum(self.layers[0:2])],
+               [2 + sum(self.layers[0:2]), 2 + sum(self.layers)]]
+        for start_idx, end_idx in idx:
+            for i in range(start_idx, end_idx):
+                x = self.features[i](x)
+            out_feats.append(x)
+
+        return out_feats
+
+    def _make_layers(self, in_channels, base, layers, block_num, block,
+                     relative_lr):
        features = []
-        features += [ConvBNRelu(3, base // 2, 3, 2)]
-        features += [ConvBNRelu(base // 2, base, 3, 2)]
+        features += [ConvBNRelu(in_channels, base // 2, 3, 2, relative_lr)]
+        features += [ConvBNRelu(base // 2, base, 3, 2, relative_lr)]

        for i, layer in enumerate(layers):
            for j in range(layer):
                if i == 0 and j == 0:
-                    features.append(block(base, base * 4, block_num, 2))
+                    features.append(
+                        block(base, base * 4, block_num, 2, relative_lr))
                elif j == 0:
                    features.append(
                        block(base * int(math.pow(2, i + 1)), base * int(
-                            math.pow(2, i + 2)), block_num, 2))
+                            math.pow(2, i + 2)), block_num, 2, relative_lr))
                else:
                    features.append(
                        block(base * int(math.pow(2, i + 2)), base * int(
-                            math.pow(2, i + 2)), block_num, 1))
+                            math.pow(2, i + 2)), block_num, 1, relative_lr))

        return nn.Sequential(*features)

@ -125,16 +119,24 @@ class STDCNet(nn.Layer):


 class ConvBNRelu(nn.Layer):
-    def __init__(self, in_planes, out_planes, kernel=3, stride=1):
+    def __init__(self,
+                 in_planes,
+                 out_planes,
+                 kernel=3,
+                 stride=1,
+                 relative_lr=1.0):
        super(ConvBNRelu, self).__init__()
+        param_attr = paddle.ParamAttr(learning_rate=relative_lr)
        self.conv = nn.Conv2D(
            in_planes,
            out_planes,
            kernel_size=kernel,
            stride=stride,
            padding=kernel // 2,
+            weight_attr=param_attr,
            bias_attr=False)
-        self.bn = SyncBatchNorm(out_planes, data_format='NCHW')
+        self.bn = nn.BatchNorm2D(
+            out_planes, weight_attr=param_attr, bias_attr=param_attr)
        self.relu = nn.ReLU()

    def forward(self, x):
@ -143,11 +145,17 @@ class ConvBNRelu(nn.Layer):


 class AddBottleneck(nn.Layer):
-    def __init__(self, in_planes, out_planes, block_num=3, stride=1):
+    def __init__(self,
+                 in_planes,
+                 out_planes,
+                 block_num=3,
+                 stride=1,
+                 relative_lr=1.0):
        super(AddBottleneck, self).__init__()
        assert block_num > 1, "block number should be larger than 1."
        self.conv_list = nn.LayerList()
        self.stride = stride
+        param_attr = paddle.ParamAttr(learning_rate=relative_lr)
        if stride == 2:
            self.avd_layer = nn.Sequential(
                nn.Conv2D(
@ -157,8 +165,12 @@ class AddBottleneck(nn.Layer):
                    stride=2,
                    padding=1,
                    groups=out_planes // 2,
+                    weight_attr=param_attr,
                    bias_attr=False),
-                nn.BatchNorm2D(out_planes // 2), )
+                nn.BatchNorm2D(
+                    out_planes // 2,
+                    weight_attr=param_attr,
+                    bias_attr=param_attr), )
            self.skip = nn.Sequential(
                nn.Conv2D(
                    in_planes,
@ -167,34 +179,53 @@ class AddBottleneck(nn.Layer):
                    stride=2,
                    padding=1,
                    groups=in_planes,
+                    weight_attr=param_attr,
                    bias_attr=False),
-                nn.BatchNorm2D(in_planes),
+                nn.BatchNorm2D(
+                    in_planes, weight_attr=param_attr, bias_attr=param_attr),
                nn.Conv2D(
-                    in_planes, out_planes, kernel_size=1, bias_attr=False),
-                nn.BatchNorm2D(out_planes), )
+                    in_planes,
+                    out_planes,
+                    kernel_size=1,
+                    bias_attr=False,
+                    weight_attr=param_attr),
+                nn.BatchNorm2D(
+                    out_planes, weight_attr=param_attr, bias_attr=param_attr), )
            stride = 1

        for idx in range(block_num):
            if idx == 0:
                self.conv_list.append(
                    ConvBNRelu(
-                        in_planes, out_planes // 2, kernel=1))
+                        in_planes,
+                        out_planes // 2,
+                        kernel=1,
+                        relative_lr=relative_lr))
            elif idx == 1 and block_num == 2:
                self.conv_list.append(
                    ConvBNRelu(
-                        out_planes // 2, out_planes // 2, stride=stride))
+                        out_planes // 2,
+                        out_planes // 2,
+                        stride=stride,
+                        relative_lr=relative_lr))
            elif idx == 1 and block_num > 2:
                self.conv_list.append(
                    ConvBNRelu(
-                        out_planes // 2, out_planes // 4, stride=stride))
+                        out_planes // 2,
+                        out_planes // 4,
+                        stride=stride,
+                        relative_lr=relative_lr))
            elif idx < block_num - 1:
                self.conv_list.append(
-                    ConvBNRelu(out_planes // int(math.pow(2, idx)), out_planes
-                               // int(math.pow(2, idx + 1))))
+                    ConvBNRelu(
+                        out_planes // int(math.pow(2, idx)),
+                        out_planes // int(math.pow(2, idx + 1)),
+                        relative_lr=relative_lr))
            else:
                self.conv_list.append(
-                    ConvBNRelu(out_planes // int(math.pow(2, idx)), out_planes
-                               // int(math.pow(2, idx))))
+                    ConvBNRelu(out_planes // int(math.pow(2, idx)),
+                               out_planes // int(math.pow(2, idx))),
+                    relative_lr=relative_lr)

    def forward(self, x):
        out_list = []
@ -211,11 +242,17 @@ class AddBottleneck(nn.Layer):


 class CatBottleneck(nn.Layer):
-    def __init__(self, in_planes, out_planes, block_num=3, stride=1):
+    def __init__(self,
+                 in_planes,
+                 out_planes,
+                 block_num=3,
+                 stride=1,
+                 relative_lr=1.0):
        super(CatBottleneck, self).__init__()
        assert block_num > 1, "block number should be larger than 1."
        self.conv_list = nn.LayerList()
        self.stride = stride
+        param_attr = paddle.ParamAttr(learning_rate=relative_lr)
        if stride == 2:
            self.avd_layer = nn.Sequential(
                nn.Conv2D(
@ -225,8 +262,12 @@ class CatBottleneck(nn.Layer):
                    stride=2,
                    padding=1,
                    groups=out_planes // 2,
+                    weight_attr=param_attr,
                    bias_attr=False),
-                nn.BatchNorm2D(out_planes // 2), )
+                nn.BatchNorm2D(
+                    out_planes // 2,
+                    weight_attr=param_attr,
+                    bias_attr=param_attr), )
            self.skip = nn.AvgPool2D(kernel_size=3, stride=2, padding=1)
            stride = 1

@ -234,23 +275,36 @@ class CatBottleneck(nn.Layer):
            if idx == 0:
                self.conv_list.append(
                    ConvBNRelu(
-                        in_planes, out_planes // 2, kernel=1))
+                        in_planes,
+                        out_planes // 2,
+                        kernel=1,
+                        relative_lr=relative_lr))
            elif idx == 1 and block_num == 2:
                self.conv_list.append(
                    ConvBNRelu(
-                        out_planes // 2, out_planes // 2, stride=stride))
+                        out_planes // 2,
+                        out_planes // 2,
+                        stride=stride,
+                        relative_lr=relative_lr))
            elif idx == 1 and block_num > 2:
                self.conv_list.append(
                    ConvBNRelu(
-                        out_planes // 2, out_planes // 4, stride=stride))
+                        out_planes // 2,
+                        out_planes // 4,
+                        stride=stride,
+                        relative_lr=relative_lr))
            elif idx < block_num - 1:
                self.conv_list.append(
-                    ConvBNRelu(out_planes // int(math.pow(2, idx)), out_planes
-                               // int(math.pow(2, idx + 1))))
+                    ConvBNRelu(
+                        out_planes // int(math.pow(2, idx)),
+                        out_planes // int(math.pow(2, idx + 1)),
+                        relative_lr=relative_lr))
            else:
                self.conv_list.append(
-                    ConvBNRelu(out_planes // int(math.pow(2, idx)), out_planes
-                               // int(math.pow(2, idx))))
+                    ConvBNRelu(
+                        out_planes // int(math.pow(2, idx)),
+                        out_planes // int(math.pow(2, idx)),
+                        relative_lr=relative_lr))

    def forward(self, x):
        out_list = []
--- a/paddlers/models/ppseg/models/backbones/swin_transformer.py
+++ b/paddlers/models/ppseg/models/backbones/swin_transformer.py
@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -17,9 +17,9 @@ import paddle.nn as nn
 import paddle.nn.functional as F
 import numpy as np

-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.utils import utils
-from paddlers.models.ppseg.models.backbones.transformer_utils import *
+from paddleseg.cvlibs import manager
+from paddleseg.utils import utils
+from paddleseg.models.backbones.transformer_utils import *


 class Mlp(nn.Layer):
@ -531,7 +531,7 @@ class SwinTransformer(nn.Layer):
    Args:
        pretrain_img_size (int): Input image size for training the pretrained model, used in absolute postion embedding. Default: 224.
        patch_size (int | tuple(int)): Patch size. Default: 4.
-        in_chans (int): Number of input image channels. Default: 3.
+        in_channels (int): Number of input image channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        depths (tuple[int]): Depths of each Swin Transformer stage.
        num_heads (tuple[int]): Number of attention head of each stage.
@ -553,7 +553,7 @@ class SwinTransformer(nn.Layer):
    def __init__(self,
                 pretrain_img_size=224,
                 patch_size=4,
-                 in_chans=3,
+                 in_channels=3,
                 embed_dim=96,
                 depths=[2, 2, 6, 2],
                 num_heads=[3, 6, 12, 24],
@ -583,7 +583,7 @@ class SwinTransformer(nn.Layer):
        # split image into non-overlapping patches
        self.patch_embed = PatchEmbed(
            patch_size=patch_size,
-            in_chans=in_chans,
+            in_chans=in_channels,
            embed_dim=embed_dim,
            norm_layer=norm_layer if self.patch_norm else None)

--- a/paddlers/models/ppseg/models/backbones/top_transformer.py
+++ b/paddlers/models/ppseg/models/backbones/top_transformer.py
@ -0,0 +1,716 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This file refers to https://github.com/hustvl/TopFormer and https://github.com/BR-IDL/PaddleViT
+"""
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddleseg.cvlibs import manager
+from paddleseg import utils
+from paddleseg.models.backbones.transformer_utils import Identity, DropPath
+
+__all__ = ["TopTransformer_Base", "TopTransformer_Small", "TopTransformer_Tiny"]
+
+
+def make_divisible(val, divisor, min_value=None):
+    """
+    This function is taken from the original tf repo.
+    It ensures that all layers have a channel number that is divisible by 8
+    It can be seen here:
+    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
+    """
+    if min_value is None:
+        min_value = divisor
+    new_v = max(min_value, int(val + divisor / 2) // divisor * divisor)
+    # Make sure that round down does not go down by more than 10%.
+    if new_v < 0.9 * val:
+        new_v += divisor
+    return new_v
+
+
+class HSigmoid(nn.Layer):
+    def __init__(self, inplace=True):
+        super().__init__()
+        self.relu = nn.ReLU6()
+
+    def forward(self, x):
+        return self.relu(x + 3) / 6
+
+
+class Conv2DBN(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 ks=1,
+                 stride=1,
+                 pad=0,
+                 dilation=1,
+                 groups=1,
+                 bn_weight_init=1,
+                 lr_mult=1.0):
+        super().__init__()
+        conv_weight_attr = paddle.ParamAttr(learning_rate=lr_mult)
+        self.c = nn.Conv2D(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=ks,
+            stride=stride,
+            padding=pad,
+            dilation=dilation,
+            groups=groups,
+            weight_attr=conv_weight_attr,
+            bias_attr=False)
+        bn_weight_attr = paddle.ParamAttr(
+            initializer=nn.initializer.Constant(bn_weight_init),
+            learning_rate=lr_mult)
+        bn_bias_attr = paddle.ParamAttr(
+            initializer=nn.initializer.Constant(0), learning_rate=lr_mult)
+        self.bn = nn.BatchNorm2D(
+            out_channels, weight_attr=bn_weight_attr, bias_attr=bn_bias_attr)
+
+    def forward(self, inputs):
+        out = self.c(inputs)
+        out = self.bn(out)
+        return out
+
+
+class ConvBNAct(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size=1,
+                 stride=1,
+                 padding=0,
+                 groups=1,
+                 norm=nn.BatchNorm2D,
+                 act=None,
+                 bias_attr=False,
+                 lr_mult=1.0):
+        super(ConvBNAct, self).__init__()
+        param_attr = paddle.ParamAttr(learning_rate=lr_mult)
+        self.conv = nn.Conv2D(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            groups=groups,
+            weight_attr=param_attr,
+            bias_attr=param_attr if bias_attr else False)
+        self.act = act() if act is not None else Identity()
+        self.bn = norm(out_channels, weight_attr=param_attr, bias_attr=param_attr) \
+            if norm is not None else Identity()
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.bn(x)
+        x = self.act(x)
+        return x
+
+
+class MLP(nn.Layer):
+    def __init__(self,
+                 in_features,
+                 hidden_features=None,
+                 out_features=None,
+                 act_layer=nn.ReLU,
+                 drop=0.,
+                 lr_mult=1.0):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = Conv2DBN(in_features, hidden_features, lr_mult=lr_mult)
+        param_attr = paddle.ParamAttr(learning_rate=lr_mult)
+        self.dwconv = nn.Conv2D(
+            hidden_features,
+            hidden_features,
+            3,
+            1,
+            1,
+            groups=hidden_features,
+            weight_attr=param_attr,
+            bias_attr=param_attr)
+        self.act = act_layer()
+        self.fc2 = Conv2DBN(hidden_features, out_features, lr_mult=lr_mult)
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.dwconv(x)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+
+
+class InvertedResidual(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride,
+                 expand_ratio,
+                 activations=None,
+                 lr_mult=1.0):
+        super(InvertedResidual, self).__init__()
+        assert stride in [1, 2], "The stride should be 1 or 2."
+
+        if activations is None:
+            activations = nn.ReLU
+
+        hidden_dim = int(round(in_channels * expand_ratio))
+        self.use_res_connect = stride == 1 and in_channels == out_channels
+
+        layers = []
+        if expand_ratio != 1:
+            layers.append(
+                Conv2DBN(
+                    in_channels, hidden_dim, ks=1, lr_mult=lr_mult))
+            layers.append(activations())
+        layers.extend([
+            Conv2DBN(
+                hidden_dim,
+                hidden_dim,
+                ks=kernel_size,
+                stride=stride,
+                pad=kernel_size // 2,
+                groups=hidden_dim,
+                lr_mult=lr_mult), activations(), Conv2DBN(
+                    hidden_dim, out_channels, ks=1, lr_mult=lr_mult)
+        ])
+        self.conv = nn.Sequential(*layers)
+        self.out_channels = out_channels
+
+    def forward(self, x):
+        if self.use_res_connect:
+            return x + self.conv(x)
+        else:
+            return self.conv(x)
+
+
+class TokenPyramidModule(nn.Layer):
+    def __init__(self,
+                 cfgs,
+                 out_indices,
+                 in_channels=3,
+                 inp_channel=16,
+                 activation=nn.ReLU,
+                 width_mult=1.,
+                 lr_mult=1.):
+        super().__init__()
+        self.out_indices = out_indices
+
+        self.stem = nn.Sequential(
+            Conv2DBN(
+                in_channels, inp_channel, 3, 2, 1, lr_mult=lr_mult),
+            activation())
+
+        self.layers = []
+        for i, (k, t, c, s) in enumerate(cfgs):
+            output_channel = make_divisible(c * width_mult, 8)
+            exp_size = t * inp_channel
+            exp_size = make_divisible(exp_size * width_mult, 8)
+            layer_name = 'layer{}'.format(i + 1)
+            layer = InvertedResidual(
+                inp_channel,
+                output_channel,
+                kernel_size=k,
+                stride=s,
+                expand_ratio=t,
+                activations=activation,
+                lr_mult=lr_mult)
+            self.add_sublayer(layer_name, layer)
+            self.layers.append(layer_name)
+            inp_channel = output_channel
+
+    def forward(self, x):
+        outs = []
+        x = self.stem(x)
+        for i, layer_name in enumerate(self.layers):
+            layer = getattr(self, layer_name)
+            x = layer(x)
+            if i in self.out_indices:
+                outs.append(x)
+        return outs
+
+
+class Attention(nn.Layer):
+    def __init__(self,
+                 dim,
+                 key_dim,
+                 num_heads,
+                 attn_ratio=4,
+                 activation=None,
+                 lr_mult=1.0):
+        super().__init__()
+        self.num_heads = num_heads
+        self.scale = key_dim**-0.5
+        self.key_dim = key_dim
+        self.nh_kd = nh_kd = key_dim * num_heads
+        self.d = int(attn_ratio * key_dim)
+        self.dh = int(attn_ratio * key_dim) * num_heads
+        self.attn_ratio = attn_ratio
+
+        self.to_q = Conv2DBN(dim, nh_kd, 1, lr_mult=lr_mult)
+        self.to_k = Conv2DBN(dim, nh_kd, 1, lr_mult=lr_mult)
+        self.to_v = Conv2DBN(dim, self.dh, 1, lr_mult=lr_mult)
+
+        self.proj = nn.Sequential(
+            activation(),
+            Conv2DBN(
+                self.dh, dim, bn_weight_init=0, lr_mult=lr_mult))
+
+    def forward(self, x):
+        x_shape = paddle.shape(x)
+        H, W = x_shape[2], x_shape[3]
+
+        qq = self.to_q(x).reshape(
+            [0, self.num_heads, self.key_dim, -1]).transpose([0, 1, 3, 2])
+        kk = self.to_k(x).reshape([0, self.num_heads, self.key_dim, -1])
+        vv = self.to_v(x).reshape([0, self.num_heads, self.d, -1]).transpose(
+            [0, 1, 3, 2])
+
+        attn = paddle.matmul(qq, kk)
+        attn = F.softmax(attn, axis=-1)
+
+        xx = paddle.matmul(attn, vv)
+
+        xx = xx.transpose([0, 1, 3, 2]).reshape([0, self.dh, H, W])
+        xx = self.proj(xx)
+        return xx
+
+
+class Block(nn.Layer):
+    def __init__(self,
+                 dim,
+                 key_dim,
+                 num_heads,
+                 mlp_ratios=4.,
+                 attn_ratio=2.,
+                 drop=0.,
+                 drop_path=0.,
+                 act_layer=nn.ReLU,
+                 lr_mult=1.0):
+        super().__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+        self.mlp_ratios = mlp_ratios
+
+        self.attn = Attention(
+            dim,
+            key_dim=key_dim,
+            num_heads=num_heads,
+            attn_ratio=attn_ratio,
+            activation=act_layer,
+            lr_mult=lr_mult)
+
+        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
+        mlp_hidden_dim = int(dim * mlp_ratios)
+        self.mlp = MLP(in_features=dim,
+                       hidden_features=mlp_hidden_dim,
+                       act_layer=act_layer,
+                       drop=drop,
+                       lr_mult=lr_mult)
+
+    def forward(self, x):
+        h = x
+        x = self.attn(x)
+        x = self.drop_path(x)
+        x = h + x
+
+        h = x
+        x = self.mlp(x)
+        x = self.drop_path(x)
+        x = x + h
+        return x
+
+
+class BasicLayer(nn.Layer):
+    def __init__(self,
+                 block_num,
+                 embedding_dim,
+                 key_dim,
+                 num_heads,
+                 mlp_ratios=4.,
+                 attn_ratio=2.,
+                 drop=0.,
+                 attn_drop=0.,
+                 drop_path=0.,
+                 act_layer=None,
+                 lr_mult=1.0):
+        super().__init__()
+        self.block_num = block_num
+
+        self.transformer_blocks = nn.LayerList()
+        for i in range(self.block_num):
+            self.transformer_blocks.append(
+                Block(
+                    embedding_dim,
+                    key_dim=key_dim,
+                    num_heads=num_heads,
+                    mlp_ratios=mlp_ratios,
+                    attn_ratio=attn_ratio,
+                    drop=drop,
+                    drop_path=drop_path[i]
+                    if isinstance(drop_path, list) else drop_path,
+                    act_layer=act_layer,
+                    lr_mult=lr_mult))
+
+    def forward(self, x):
+        # token * N 
+        for i in range(self.block_num):
+            x = self.transformer_blocks[i](x)
+        return x
+
+
+class PyramidPoolAgg(nn.Layer):
+    def __init__(self, stride):
+        super().__init__()
+        self.stride = stride
+        self.tmp = Identity()  # avoid the error of paddle.flops
+
+    def forward(self, inputs):
+        '''
+        # The F.adaptive_avg_pool2d does not support the (H, W) be Tensor,
+        # so exporting the inference model will raise error.
+        _, _, H, W = inputs[-1].shape
+        H = (H - 1) // self.stride + 1
+        W = (W - 1) // self.stride + 1
+        return paddle.concat(
+            [F.adaptive_avg_pool2d(inp, (H, W)) for inp in inputs], axis=1)
+        '''
+        out = []
+        ks = 2**len(inputs)
+        stride = self.stride**len(inputs)
+        for x in inputs:
+            x = F.avg_pool2d(x, int(ks), int(stride))
+            ks /= 2
+            stride /= 2
+            out.append(x)
+        out = paddle.concat(out, axis=1)
+        return out
+
+
+class InjectionMultiSum(nn.Layer):
+    def __init__(self, in_channels, out_channels, activations=None,
+                 lr_mult=1.0):
+        super(InjectionMultiSum, self).__init__()
+
+        self.local_embedding = ConvBNAct(
+            in_channels, out_channels, kernel_size=1, lr_mult=lr_mult)
+        self.global_embedding = ConvBNAct(
+            in_channels, out_channels, kernel_size=1, lr_mult=lr_mult)
+        self.global_act = ConvBNAct(
+            in_channels, out_channels, kernel_size=1, lr_mult=lr_mult)
+        self.act = HSigmoid()
+
+    def forward(self, x_low, x_global):
+        xl_hw = paddle.shape(x_low)[2:]
+        local_feat = self.local_embedding(x_low)
+
+        global_act = self.global_act(x_global)
+        sig_act = F.interpolate(
+            self.act(global_act), xl_hw, mode='bilinear', align_corners=False)
+
+        global_feat = self.global_embedding(x_global)
+        global_feat = F.interpolate(
+            global_feat, xl_hw, mode='bilinear', align_corners=False)
+
+        out = local_feat * sig_act + global_feat
+        return out
+
+
+class InjectionMultiSumCBR(nn.Layer):
+    def __init__(self, in_channels, out_channels, activations=None):
+        '''
+        local_embedding: conv-bn-relu
+        global_embedding: conv-bn-relu
+        global_act: conv
+        '''
+        super(InjectionMultiSumCBR, self).__init__()
+
+        self.local_embedding = ConvBNAct(
+            in_channels, out_channels, kernel_size=1)
+        self.global_embedding = ConvBNAct(
+            in_channels, out_channels, kernel_size=1)
+        self.global_act = ConvBNAct(
+            in_channels, out_channels, kernel_size=1, norm=None, act=None)
+        self.act = HSigmoid()
+
+    def forward(self, x_low, x_global):
+        xl_hw = paddle.shape(x)[2:]
+        local_feat = self.local_embedding(x_low)
+        # kernel
+        global_act = self.global_act(x_global)
+        global_act = F.interpolate(
+            self.act(global_act), xl_hw, mode='bilinear', align_corners=False)
+        # feat_h
+        global_feat = self.global_embedding(x_global)
+        global_feat = F.interpolate(
+            global_feat, xl_hw, mode='bilinear', align_corners=False)
+        out = local_feat * global_act + global_feat
+        return out
+
+
+class FuseBlockSum(nn.Layer):
+    def __init__(self, in_channels, out_channels, activations=None):
+        super(FuseBlockSum, self).__init__()
+
+        self.fuse1 = ConvBNAct(
+            in_channels, out_channels, kernel_size=1, act=None)
+        self.fuse2 = ConvBNAct(
+            in_channels, out_channels, kernel_size=1, act=None)
+
+    def forward(self, x_low, x_high):
+        xl_hw = paddle.shape(x)[2:]
+        inp = self.fuse1(x_low)
+        kernel = self.fuse2(x_high)
+        feat_h = F.interpolate(
+            kernel, xl_hw, mode='bilinear', align_corners=False)
+        out = inp + feat_h
+        return out
+
+
+class FuseBlockMulti(nn.Layer):
+    def __init__(
+            self,
+            in_channels,
+            out_channels,
+            stride=1,
+            activations=None, ):
+        super(FuseBlockMulti, self).__init__()
+        assert stride in [1, 2], "The stride should be 1 or 2."
+
+        self.fuse1 = ConvBNAct(
+            in_channels, out_channels, kernel_size=1, act=None)
+        self.fuse2 = ConvBNAct(
+            in_channels, out_channels, kernel_size=1, act=None)
+        self.act = HSigmoid()
+
+    def forward(self, x_low, x_high):
+        xl_hw = paddle.shape(x)[2:]
+        inp = self.fuse1(x_low)
+        sig_act = self.fuse2(x_high)
+        sig_act = F.interpolate(
+            self.act(sig_act), xl_hw, mode='bilinear', align_corners=False)
+        out = inp * sig_act
+        return out
+
+
+SIM_BLOCK = {
+    "fuse_sum": FuseBlockSum,
+    "fuse_multi": FuseBlockMulti,
+    "multi_sum": InjectionMultiSum,
+    "multi_sum_cbr": InjectionMultiSumCBR,
+}
+
+
+class TopTransformer(nn.Layer):
+    def __init__(self,
+                 cfgs,
+                 injection_out_channels,
+                 encoder_out_indices,
+                 trans_out_indices=[1, 2, 3],
+                 depths=4,
+                 key_dim=16,
+                 num_heads=8,
+                 attn_ratios=2,
+                 mlp_ratios=2,
+                 c2t_stride=2,
+                 drop_path_rate=0.,
+                 act_layer=nn.ReLU6,
+                 injection_type="muli_sum",
+                 injection=True,
+                 lr_mult=1.0,
+                 in_channels=3,
+                 pretrained=None):
+        super().__init__()
+        self.feat_channels = [
+            c[2] for i, c in enumerate(cfgs) if i in encoder_out_indices
+        ]
+        self.injection_out_channels = injection_out_channels
+        self.injection = injection
+        self.embed_dim = sum(self.feat_channels)
+        self.trans_out_indices = trans_out_indices
+
+        self.tpm = TokenPyramidModule(
+            cfgs=cfgs,
+            out_indices=encoder_out_indices,
+            in_channels=in_channels,
+            lr_mult=lr_mult)
+        self.ppa = PyramidPoolAgg(stride=c2t_stride)
+
+        dpr = [x.item() for x in \
+               paddle.linspace(0, drop_path_rate, depths)]
+        self.trans = BasicLayer(
+            block_num=depths,
+            embedding_dim=self.embed_dim,
+            key_dim=key_dim,
+            num_heads=num_heads,
+            mlp_ratios=mlp_ratios,
+            attn_ratio=attn_ratios,
+            drop=0,
+            attn_drop=0,
+            drop_path=dpr,
+            act_layer=act_layer,
+            lr_mult=lr_mult)
+
+        self.SIM = nn.LayerList()
+        inj_module = SIM_BLOCK[injection_type]
+        if self.injection:
+            for i in range(len(self.feat_channels)):
+                if i in trans_out_indices:
+                    self.SIM.append(
+                        inj_module(
+                            self.feat_channels[i],
+                            injection_out_channels[i],
+                            activations=act_layer,
+                            lr_mult=lr_mult))
+                else:
+                    self.SIM.append(Identity())
+
+        self.pretrained = pretrained
+        self.init_weight()
+
+    def init_weight(self):
+        if self.pretrained is not None:
+            utils.load_entire_model(self, self.pretrained)
+
+    def forward(self, x):
+        ouputs = self.tpm(x)
+        out = self.ppa(ouputs)
+        out = self.trans(out)
+
+        if self.injection:
+            xx = out.split(self.feat_channels, axis=1)
+            results = []
+            for i in range(len(self.feat_channels)):
+                if i in self.trans_out_indices:
+                    local_tokens = ouputs[i]
+                    global_semantics = xx[i]
+                    out_ = self.SIM[i](local_tokens, global_semantics)
+                    results.append(out_)
+            return results
+        else:
+            ouputs.append(out)
+            return ouputs
+
+
+@manager.BACKBONES.add_component
+def TopTransformer_Base(**kwargs):
+    cfgs = [
+        # k,  t,  c, s
+        [3, 1, 16, 1],  # 1/2        
+        [3, 4, 32, 2],  # 1/4 1      
+        [3, 3, 32, 1],  #            
+        [5, 3, 64, 2],  # 1/8 3      
+        [5, 3, 64, 1],  #            
+        [3, 3, 128, 2],  # 1/16 5     
+        [3, 3, 128, 1],  #            
+        [5, 6, 160, 2],  # 1/32 7     
+        [5, 6, 160, 1],  #            
+        [3, 6, 160, 1],  #            
+    ]
+
+    model = TopTransformer(
+        cfgs=cfgs,
+        injection_out_channels=[None, 256, 256, 256],
+        encoder_out_indices=[2, 4, 6, 9],
+        trans_out_indices=[1, 2, 3],
+        depths=4,
+        key_dim=16,
+        num_heads=8,
+        attn_ratios=2,
+        mlp_ratios=2,
+        c2t_stride=2,
+        drop_path_rate=0.,
+        act_layer=nn.ReLU6,
+        injection_type="multi_sum",
+        injection=True,
+        **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def TopTransformer_Small(**kwargs):
+    cfgs = [
+        # k,  t,  c, s
+        [3, 1, 16, 1],  # 1/2        
+        [3, 4, 24, 2],  # 1/4 1      
+        [3, 3, 24, 1],  #            
+        [5, 3, 48, 2],  # 1/8 3      
+        [5, 3, 48, 1],  #            
+        [3, 3, 96, 2],  # 1/16 5     
+        [3, 3, 96, 1],  #            
+        [5, 6, 128, 2],  # 1/32 7     
+        [5, 6, 128, 1],  #            
+        [3, 6, 128, 1],  #           
+    ]
+
+    model = TopTransformer(
+        cfgs=cfgs,
+        injection_out_channels=[None, 192, 192, 192],
+        encoder_out_indices=[2, 4, 6, 9],
+        trans_out_indices=[1, 2, 3],
+        depths=4,
+        key_dim=16,
+        num_heads=6,
+        attn_ratios=2,
+        mlp_ratios=2,
+        c2t_stride=2,
+        drop_path_rate=0.,
+        act_layer=nn.ReLU6,
+        injection_type="multi_sum",
+        injection=True,
+        **kwargs)
+    return model
+
+
+@manager.BACKBONES.add_component
+def TopTransformer_Tiny(**kwargs):
+    cfgs = [
+        # k,  t,  c, s
+        [3, 1, 16, 1],  # 1/2       
+        [3, 4, 16, 2],  # 1/4 1      
+        [3, 3, 16, 1],  #            
+        [5, 3, 32, 2],  # 1/8 3      
+        [5, 3, 32, 1],  #            
+        [3, 3, 64, 2],  # 1/16 5     
+        [3, 3, 64, 1],  #            
+        [5, 6, 96, 2],  # 1/32 7     
+        [5, 6, 96, 1],  #               
+    ]
+
+    model = TopTransformer(
+        cfgs=cfgs,
+        injection_out_channels=[None, 128, 128, 128],
+        encoder_out_indices=[2, 4, 6, 8],
+        trans_out_indices=[1, 2, 3],
+        depths=4,
+        key_dim=16,
+        num_heads=4,
+        attn_ratios=2,
+        mlp_ratios=2,
+        c2t_stride=2,
+        drop_path_rate=0.,
+        act_layer=nn.ReLU6,
+        injection_type="multi_sum",
+        injection=True,
+        **kwargs)
+    return model
--- a/paddlers/models/ppseg/models/backbones/transformer_utils.py
+++ b/paddlers/models/ppseg/models/backbones/transformer_utils.py
@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -35,7 +35,7 @@ def drop_path(x, drop_prob=0., training=False):
        return x
    keep_prob = paddle.to_tensor(1 - drop_prob)
    shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1)
-    random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype)
+    random_tensor = keep_prob + paddle.rand(shape).astype(x.dtype)
    random_tensor = paddle.floor(random_tensor)  # binarize
    output = x.divide(keep_prob) * random_tensor
    return output
--- a/paddlers/models/ppseg/models/backbones/vision_transformer.py
+++ b/paddlers/models/ppseg/models/backbones/vision_transformer.py
@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -20,9 +20,9 @@ import paddle.nn as nn
 import paddle.nn.functional as F
 import numpy as np

-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.utils import utils, logger
-from paddlers.models.ppseg.models.backbones.transformer_utils import to_2tuple, DropPath, Identity
+from paddleseg.cvlibs import manager
+from paddleseg.utils import utils, logger
+from paddleseg.models.backbones.transformer_utils import to_2tuple, DropPath, Identity


 class Mlp(nn.Layer):
@ -154,7 +154,7 @@ class VisionTransformer(nn.Layer):
    def __init__(self,
                 img_size=224,
                 patch_size=16,
-                 in_chans=3,
+                 in_channels=3,
                 embed_dim=768,
                 depth=12,
                 num_heads=12,
@ -176,7 +176,7 @@ class VisionTransformer(nn.Layer):
        self.patch_embed = PatchEmbed(
            img_size=img_size,
            patch_size=patch_size,
-            in_chans=in_chans,
+            in_chans=in_channels,
            embed_dim=embed_dim)
        self.pos_w = self.patch_embed.num_patches_in_w
        self.pos_h = self.patch_embed.num_patches_in_h
--- a/paddlers/models/ppseg/models/backbones/xception_deeplab.py
+++ b/paddlers/models/ppseg/models/backbones/xception_deeplab.py
@ -1,4 +1,4 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -15,9 +15,9 @@
 import paddle.nn as nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.utils import utils
-from paddlers.models.ppseg.models import layers
+from paddleseg.cvlibs import manager
+from paddleseg.utils import utils
+from paddleseg.models import layers

 __all__ = ["Xception41_deeplab", "Xception65_deeplab", "Xception71_deeplab"]

@ -255,12 +255,17 @@ class XceptionDeeplab(nn.Layer):

     Args:
         backbone (str): Which type of Xception_DeepLab to select. It should be one of ('xception_41', 'xception_65', 'xception_71').
+         in_channels (int, optional): The channels of input image. Default: 3.
         pretrained (str, optional): The path of pretrained model.
         output_stride (int, optional): The stride of output features compared to input images. It is 8 or 16. Default: 16.

    """

-    def __init__(self, backbone, pretrained=None, output_stride=16):
+    def __init__(self,
+                 backbone,
+                 in_channels=3,
+                 pretrained=None,
+                 output_stride=16):

        super(XceptionDeeplab, self).__init__()

@ -269,7 +274,7 @@ class XceptionDeeplab(nn.Layer):
        self.feat_channels = [128, 2048]

        self._conv1 = ConvBNLayer(
-            3,
+            in_channels,
            32,
            3,
            stride=2,
--- a/paddlers/models/ppseg/models/bisenet.py
+++ b/paddlers/models/ppseg/models/bisenet.py
@ -18,9 +18,9 @@ import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg import utils
-from paddlers.models.ppseg.cvlibs import manager, param_init
-from paddlers.models.ppseg.models import layers
+from paddleseg import utils
+from paddleseg.cvlibs import manager, param_init
+from paddleseg.models import layers


@manager.MODELS.add_component
@ -35,6 +35,7 @@ class BiSeNetV2(nn.Layer):
    Args:
        num_classes (int): The unique number of target classes.
        lambd (float, optional): A factor for controlling the size of semantic branch channels. Default: 0.25.
+        in_channels (int, optional): The channels of input image. Default: 3.
        pretrained (str, optional): The path or url of pretrained model. Default: None.
    """

@ -42,6 +43,7 @@ class BiSeNetV2(nn.Layer):
                 num_classes,
                 lambd=0.25,
                 align_corners=False,
+                 in_channels=3,
                 pretrained=None):
        super().__init__()

@ -51,8 +53,8 @@ class BiSeNetV2(nn.Layer):
        sb_channels = (C1, C3, C4, C5)
        mid_channels = 128

-        self.db = DetailBranch(db_channels)
-        self.sb = SemanticBranch(sb_channels)
+        self.db = DetailBranch(in_channels, db_channels)
+        self.sb = SemanticBranch(in_channels, sb_channels)

        self.bga = BGA(mid_channels, align_corners)
        self.aux_head1 = SegHead(C1, C1, num_classes)
@ -189,15 +191,15 @@ class GatherAndExpansionLayer2(nn.Layer):
 class DetailBranch(nn.Layer):
    """The detail branch of BiSeNet, which has wide channels but shallow layers."""

-    def __init__(self, in_channels):
+    def __init__(self, in_channels, feature_channels):
        super().__init__()

-        C1, C2, C3 = in_channels
+        C1, C2, C3 = feature_channels

        self.convs = nn.Sequential(
            # stage 1
            layers.ConvBNReLU(
-                3, C1, 3, stride=2),
+                in_channels, C1, 3, stride=2),
            layers.ConvBNReLU(C1, C1, 3),
            # stage 2
            layers.ConvBNReLU(
@ -217,11 +219,11 @@ class DetailBranch(nn.Layer):
 class SemanticBranch(nn.Layer):
    """The semantic branch of BiSeNet, which has narrow channels but deep layers."""

-    def __init__(self, in_channels):
+    def __init__(self, in_channels, feature_channels):
        super().__init__()
-        C1, C3, C4, C5 = in_channels
+        C1, C3, C4, C5 = feature_channels

-        self.stem = StemBlock(3, C1)
+        self.stem = StemBlock(in_channels, C1)

        self.stage3 = nn.Sequential(
            GatherAndExpansionLayer2(C1, C3, 6),
--- a/paddlers/models/ppseg/models/bisenetv1.py
+++ b/paddlers/models/ppseg/models/bisenetv1.py
@ -16,9 +16,9 @@ import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.models import layers
-from paddlers.models.ppseg.utils import utils
+from paddleseg.cvlibs import manager
+from paddleseg.models import layers
+from paddleseg.utils import utils


@manager.MODELS.add_component
--- a/paddlers/models/ppseg/models/ccnet.py
+++ b/paddlers/models/ppseg/models/ccnet.py
@ -0,0 +1,174 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddleseg.cvlibs import manager
+from paddleseg.models import layers
+from paddleseg.utils import utils
+
+
+@manager.MODELS.add_component
+class CCNet(nn.Layer):
+    """
+    The CCNet implementation based on PaddlePaddle.
+
+    The original article refers to
+    Zilong Huang, et al. "CCNet: Criss-Cross Attention for Semantic Segmentation"
+    (https://arxiv.org/abs/1811.11721)
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone (paddle.nn.Layer): Backbone network, currently support Resnet18_vd/Resnet34_vd/Resnet50_vd/Resnet101_vd.
+        backbone_indices (tuple, list, optional): Two values in the tuple indicate the indices of output of backbone. Default: (2, 3).
+        enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True.
+        dropout_prob (float, optional): The probability of dropout. Default: 0.0.
+        recurrence (int, optional): The number of recurrent operations. Defautl: 1.
+        align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
+            e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 backbone,
+                 backbone_indices=(2, 3),
+                 enable_auxiliary_loss=True,
+                 dropout_prob=0.0,
+                 recurrence=1,
+                 align_corners=False,
+                 pretrained=None):
+        super().__init__()
+        self.enable_auxiliary_loss = enable_auxiliary_loss
+        self.recurrence = recurrence
+        self.align_corners = align_corners
+
+        self.backbone = backbone
+        self.backbone_indices = backbone_indices
+        backbone_channels = [
+            backbone.feat_channels[i] for i in backbone_indices
+        ]
+
+        if enable_auxiliary_loss:
+            self.aux_head = layers.AuxLayer(
+                backbone_channels[0],
+                512,
+                num_classes,
+                dropout_prob=dropout_prob)
+        self.head = RCCAModule(
+            backbone_channels[1],
+            512,
+            num_classes,
+            dropout_prob=dropout_prob,
+            recurrence=recurrence)
+        self.pretrained = pretrained
+
+    def init_weight(self):
+        if self.pretrained is not None:
+            utils.load_entire_model(self, self.pretrained)
+
+    def forward(self, x):
+        feat_list = self.backbone(x)
+        logit_list = []
+        output = self.head(feat_list[self.backbone_indices[-1]])
+        logit_list.append(output)
+        if self.training and self.enable_auxiliary_loss:
+            aux_out = self.aux_head(feat_list[self.backbone_indices[-2]])
+            logit_list.append(aux_out)
+        return [
+            F.interpolate(
+                logit,
+                paddle.shape(x)[2:],
+                mode='bilinear',
+                align_corners=self.align_corners) for logit in logit_list
+        ]
+
+
+class RCCAModule(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 num_classes,
+                 dropout_prob=0.1,
+                 recurrence=1):
+        super().__init__()
+        inter_channels = in_channels // 4
+        self.recurrence = recurrence
+        self.conva = layers.ConvBNLeakyReLU(
+            in_channels, inter_channels, 3, padding=1, bias_attr=False)
+        self.cca = CrissCrossAttention(inter_channels)
+        self.convb = layers.ConvBNLeakyReLU(
+            inter_channels, inter_channels, 3, padding=1, bias_attr=False)
+        self.out = layers.AuxLayer(
+            in_channels + inter_channels,
+            out_channels,
+            num_classes,
+            dropout_prob=dropout_prob)
+
+    def forward(self, x):
+        feat = self.conva(x)
+        for i in range(self.recurrence):
+            feat = self.cca(feat)
+        feat = self.convb(feat)
+        output = self.out(paddle.concat([x, feat], axis=1))
+        return output
+
+
+class CrissCrossAttention(nn.Layer):
+    def __init__(self, in_channels):
+        super().__init__()
+        self.q_conv = nn.Conv2D(in_channels, in_channels // 8, kernel_size=1)
+        self.k_conv = nn.Conv2D(in_channels, in_channels // 8, kernel_size=1)
+        self.v_conv = nn.Conv2D(in_channels, in_channels, kernel_size=1)
+        self.softmax = nn.Softmax(axis=3)
+        self.gamma = self.create_parameter(
+            shape=(1, ), default_initializer=nn.initializer.Constant(0))
+        self.inf_tensor = paddle.full(shape=(1, ), fill_value=float('inf'))
+
+    def forward(self, x):
+        b, c, h, w = paddle.shape(x)
+        proj_q = self.q_conv(x)
+        proj_q_h = proj_q.transpose([0, 3, 1, 2]).reshape(
+            [b * w, -1, h]).transpose([0, 2, 1])
+        proj_q_w = proj_q.transpose([0, 2, 1, 3]).reshape(
+            [b * h, -1, w]).transpose([0, 2, 1])
+
+        proj_k = self.k_conv(x)
+        proj_k_h = proj_k.transpose([0, 3, 1, 2]).reshape([b * w, -1, h])
+        proj_k_w = proj_k.transpose([0, 2, 1, 3]).reshape([b * h, -1, w])
+
+        proj_v = self.v_conv(x)
+        proj_v_h = proj_v.transpose([0, 3, 1, 2]).reshape([b * w, -1, h])
+        proj_v_w = proj_v.transpose([0, 2, 1, 3]).reshape([b * h, -1, w])
+
+        energy_h = (paddle.bmm(proj_q_h, proj_k_h) + self.Inf(b, h, w)).reshape(
+            [b, w, h, h]).transpose([0, 2, 1, 3])
+        energy_w = paddle.bmm(proj_q_w, proj_k_w).reshape([b, h, w, w])
+        concate = self.softmax(paddle.concat([energy_h, energy_w], axis=3))
+
+        attn_h = concate[:, :, :, 0:h].transpose([0, 2, 1, 3]).reshape(
+            [b * w, h, h])
+        attn_w = concate[:, :, :, h:h + w].reshape([b * h, w, w])
+        out_h = paddle.bmm(proj_v_h, attn_h.transpose([0, 2, 1])).reshape(
+            [b, w, -1, h]).transpose([0, 2, 3, 1])
+        out_w = paddle.bmm(proj_v_w, attn_w.transpose([0, 2, 1])).reshape(
+            [b, h, -1, w]).transpose([0, 2, 1, 3])
+        return self.gamma * (out_h + out_w) + x
+
+    def Inf(self, B, H, W):
+        return -paddle.tile(
+            paddle.diag(paddle.tile(self.inf_tensor, [H]), 0).unsqueeze(0),
+            [B * W, 1, 1])
--- a/paddlers/models/ppseg/models/danet.py
+++ b/paddlers/models/ppseg/models/danet.py
@ -16,9 +16,9 @@ import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.models import layers
-from paddlers.models.ppseg.utils import utils
+from paddleseg.cvlibs import manager
+from paddleseg.models import layers
+from paddleseg.utils import utils


@manager.MODELS.add_component
--- a/paddlers/models/ppseg/models/ddrnet.py
+++ b/paddlers/models/ppseg/models/ddrnet.py
@ -0,0 +1,403 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddleseg.cvlibs import manager, param_init
+from paddleseg.models import layers
+from paddleseg.utils import utils
+
+
+class DualResNet(nn.Layer):
+    """
+    The DDRNet implementation based on PaddlePaddle.
+
+    The original article refers to
+    Yuanduo Hong, Huihui Pan, Weichao Sun, et al. "Deep Dual-resolution Networks for Real-time and Accurate Semantic Segmentation of Road Scenes"
+    (https://arxiv.org/abs/2101.06085)
+
+    Args:
+        num_classes (int): The unique number of target classes.
+        in_channels (int, optional): Number of input channels. Default: 3.
+        block_layers (list, tuple): The numbers of layers in different blocks. Default: [2, 2, 2, 2].
+        planes (int): Base channels in network. Default: 64.
+        spp_planes (int): Branch channels for DAPPM. Default: 128.
+        head_planes (int): Mid channels of segmentation head. Default: 128.
+        enable_auxiliary_loss (bool): Whether use auxiliary head for stage3. Default: False.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 in_channels=3,
+                 block_layers=[2, 2, 2, 2],
+                 planes=64,
+                 spp_planes=128,
+                 head_planes=128,
+                 enable_auxiliary_loss=False,
+                 pretrained=None):
+        super().__init__()
+        highres_planes = planes * 2
+        self.enable_auxiliary_loss = enable_auxiliary_loss
+        self.conv1 = nn.Sequential(
+            layers.ConvBNReLU(
+                in_channels, planes, kernel_size=3, stride=2, padding=1),
+            layers.ConvBNReLU(
+                planes, planes, kernel_size=3, stride=2, padding=1), )
+        self.relu = nn.ReLU()
+        self.layer1 = self._make_layers(BasicBlock, planes, planes,
+                                        block_layers[0])
+        self.layer2 = self._make_layers(
+            BasicBlock, planes, planes * 2, block_layers[1], stride=2)
+        self.layer3 = self._make_layers(
+            BasicBlock, planes * 2, planes * 4, block_layers[2], stride=2)
+        self.layer4 = self._make_layers(
+            BasicBlock, planes * 4, planes * 8, block_layers[3], stride=2)
+
+        self.compression3 = layers.ConvBN(
+            planes * 4, highres_planes, kernel_size=1, bias_attr=False)
+
+        self.compression4 = layers.ConvBN(
+            planes * 8, highres_planes, kernel_size=1, bias_attr=False)
+
+        self.down3 = layers.ConvBN(
+            highres_planes,
+            planes * 4,
+            kernel_size=3,
+            stride=2,
+            bias_attr=False)
+
+        self.down4 = nn.Sequential(
+            layers.ConvBNReLU(
+                highres_planes,
+                planes * 4,
+                kernel_size=3,
+                stride=2,
+                padding=1,
+                bias_attr=False),
+            layers.ConvBN(
+                planes * 4,
+                planes * 8,
+                kernel_size=3,
+                stride=2,
+                padding=1,
+                bias_attr=False))
+
+        self.layer3_ = self._make_layers(BasicBlock, planes * 2, highres_planes,
+                                         2)
+        self.layer4_ = self._make_layers(BasicBlock, highres_planes,
+                                         highres_planes, 2)
+        self.layer5_ = self._make_layers(Bottleneck, highres_planes,
+                                         highres_planes, 1)
+        self.layer5 = self._make_layers(
+            Bottleneck, planes * 8, planes * 8, 1, stride=2)
+
+        self.spp = DAPPM(planes * 16, spp_planes, planes * 4)
+        if self.enable_auxiliary_loss:
+            self.aux_head = DDRNetHead(highres_planes, head_planes, num_classes)
+        self.head = DDRNetHead(planes * 4, head_planes, num_classes)
+
+        self.pretrained = pretrained
+        self.init_weight()
+
+    def init_weight(self):
+        if self.pretrained is not None:
+            utils.load_entire_model(self, self.pretrained)
+        else:
+            for m in self.sublayers():
+                if isinstance(m, nn.Conv2D):
+                    param_init.kaiming_normal_init(m.weight)
+                elif isinstance(m, nn.BatchNorm2D):
+                    param_init.constant_init(m.weight, value=1)
+                    param_init.constant_init(m.bias, value=0)
+
+    def _make_layers(self, block, inplanes, planes, blocks, stride=1):
+        downsample = None
+        if stride != 1 or inplanes != planes * block.expansion:
+            downsample = nn.Sequential(
+                nn.Conv2D(
+                    inplanes,
+                    planes * block.expansion,
+                    kernel_size=1,
+                    stride=stride,
+                    bias_attr=False),
+                nn.BatchNorm2D(planes * block.expansion), )
+        layers = []
+        layers.append(block(inplanes, planes, stride, downsample))
+        inplanes = planes * block.expansion
+        for i in range(1, blocks):
+            if i == (blocks - 1):
+                layers.append(block(inplanes, planes, stride=1, no_relu=True))
+            else:
+                layers.append(block(inplanes, planes, stride=1, no_relu=False))
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        n, c, h, w = paddle.shape(x)
+        width_output = w // 8
+        height_output = h // 8
+
+        x = self.conv1(x)
+        stage1_out = self.layer1(x)
+        stage2_out = self.layer2(self.relu(stage1_out))
+        stage3_out = self.layer3(self.relu(stage2_out))
+        stage3_out_dual = self.layer3_(self.relu(stage2_out))
+        x = stage3_out + self.down3(self.relu(stage3_out_dual))
+        stage3_merge = stage3_out_dual + F.interpolate(
+            self.compression3(self.relu(stage3_out)),
+            size=[height_output, width_output],
+            mode='bilinear')
+
+        stage4_out = self.layer4(self.relu(x))
+        stage4_out_dual = self.layer4_(self.relu(stage3_merge))
+
+        x = stage4_out + self.down4(self.relu(stage4_out_dual))
+        stage4_merge = stage4_out_dual + F.interpolate(
+            self.compression4(self.relu(stage4_out)),
+            size=[height_output, width_output],
+            mode='bilinear')
+
+        stage5_out_dual = self.layer5_(self.relu(stage4_merge))
+        x = F.interpolate(
+            self.spp(self.layer5(self.relu(x))),
+            size=[height_output, width_output],
+            mode='bilinear')
+
+        output = self.head(x + stage5_out_dual)
+        logit_list = []
+        logit_list.append(output)
+
+        if self.enable_auxiliary_loss:
+            aux_out = self.aux_head(stage3_merge)
+            logit_list.append(aux_out)
+        return [
+            F.interpolate(
+                logit, [h, w], mode='bilinear') for logit in logit_list
+        ]
+
+
+class BasicBlock(nn.Layer):
+    expansion = 1
+
+    def __init__(self,
+                 inplanes,
+                 planes,
+                 stride=1,
+                 downsample=None,
+                 no_relu=False):
+        super().__init__()
+        self.conv_bn_relu = layers.ConvBNReLU(
+            inplanes,
+            planes,
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            bias_attr=False)
+        self.relu = nn.ReLU()
+        self.conv_bn = layers.ConvBN(
+            planes, planes, kernel_size=3, stride=1, padding=1, bias_attr=False)
+        self.downsample = downsample
+        self.stride = stride
+        self.no_relu = no_relu
+
+    def forward(self, x):
+        residual = x
+        out = self.conv_bn_relu(x)
+        out = self.conv_bn(out)
+        if self.downsample is not None:
+            residual = self.downsample(x)
+        out += residual
+        if self.no_relu:
+            return out
+        else:
+            return self.relu(out)
+
+
+class Bottleneck(nn.Layer):
+    expansion = 2
+
+    def __init__(self,
+                 inplanes,
+                 planes,
+                 stride=1,
+                 downsample=None,
+                 no_relu=True):
+        super().__init__()
+        self.conv_bn_relu1 = layers.ConvBNReLU(
+            inplanes, planes, kernel_size=1, bias_attr=False)
+        self.conv_bn_relu2 = layers.ConvBNReLU(
+            planes,
+            planes,
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            bias_attr=False)
+        self.conv_bn = layers.ConvBN(
+            planes, planes * self.expansion, kernel_size=1, bias_attr=False)
+        self.relu = nn.ReLU()
+        self.downsample = downsample
+        self.stride = stride
+        self.no_relu = no_relu
+
+    def forward(self, x):
+        residual = x
+        out = self.conv_bn_relu1(x)
+        out = self.conv_bn_relu2(out)
+        out = self.conv_bn(out)
+        if self.downsample is not None:
+            residual = self.downsample(x)
+        out += residual
+        if self.no_relu:
+            return out
+        else:
+            return self.relu(out)
+
+
+class DAPPM(nn.Layer):
+    def __init__(self, inplanes, branch_planes, outplanes):
+        super().__init__()
+        self.scale1 = nn.Sequential(
+            nn.AvgPool2D(
+                kernel_size=5, stride=2, padding=2),
+            layers.SyncBatchNorm(inplanes),
+            nn.ReLU(),
+            nn.Conv2D(
+                inplanes, branch_planes, kernel_size=1, bias_attr=False), )
+        self.scale2 = nn.Sequential(
+            nn.AvgPool2D(
+                kernel_size=9, stride=4, padding=4),
+            layers.SyncBatchNorm(inplanes),
+            nn.ReLU(),
+            nn.Conv2D(
+                inplanes, branch_planes, kernel_size=1, bias_attr=False), )
+        self.scale3 = nn.Sequential(
+            nn.AvgPool2D(
+                kernel_size=17, stride=8, padding=8),
+            layers.SyncBatchNorm(inplanes),
+            nn.ReLU(),
+            nn.Conv2D(
+                inplanes, branch_planes, kernel_size=1, bias_attr=False), )
+        self.scale4 = nn.Sequential(
+            nn.AdaptiveAvgPool2D((1, 1)),
+            layers.SyncBatchNorm(inplanes),
+            nn.ReLU(),
+            nn.Conv2D(
+                inplanes, branch_planes, kernel_size=1, bias_attr=False), )
+        self.scale0 = nn.Sequential(
+            layers.SyncBatchNorm(inplanes),
+            nn.ReLU(),
+            nn.Conv2D(
+                inplanes, branch_planes, kernel_size=1, bias_attr=False), )
+        self.process1 = nn.Sequential(
+            layers.SyncBatchNorm(branch_planes),
+            nn.ReLU(),
+            nn.Conv2D(
+                branch_planes,
+                branch_planes,
+                kernel_size=3,
+                padding=1,
+                bias_attr=False), )
+        self.process2 = nn.Sequential(
+            layers.SyncBatchNorm(branch_planes),
+            nn.ReLU(),
+            nn.Conv2D(
+                branch_planes,
+                branch_planes,
+                kernel_size=3,
+                padding=1,
+                bias_attr=False), )
+        self.process3 = nn.Sequential(
+            layers.SyncBatchNorm(branch_planes),
+            nn.ReLU(),
+            nn.Conv2D(
+                branch_planes,
+                branch_planes,
+                kernel_size=3,
+                padding=1,
+                bias_attr=False), )
+        self.process4 = nn.Sequential(
+            layers.SyncBatchNorm(branch_planes),
+            nn.ReLU(),
+            nn.Conv2D(
+                branch_planes,
+                branch_planes,
+                kernel_size=3,
+                padding=1,
+                bias_attr=False), )
+        self.compression = nn.Sequential(
+            layers.SyncBatchNorm(branch_planes * 5),
+            nn.ReLU(),
+            nn.Conv2D(
+                branch_planes * 5, outplanes, kernel_size=1, bias_attr=False))
+        self.shortcut = nn.Sequential(
+            layers.SyncBatchNorm(inplanes),
+            nn.ReLU(),
+            nn.Conv2D(
+                inplanes, outplanes, kernel_size=1, bias_attr=False))
+
+    def forward(self, x):
+        n, c, h, w = paddle.shape(x)
+        x0 = self.scale0(x)
+        x1 = self.process1(
+            F.interpolate(
+                self.scale1(x), size=[h, w], mode='bilinear') + x0)
+        x2 = self.process2(
+            F.interpolate(
+                self.scale2(x), size=[h, w], mode='bilinear') + x1)
+        x3 = self.process3(
+            F.interpolate(
+                self.scale3(x), size=[h, w], mode='bilinear') + x2)
+        x4 = self.process4(
+            F.interpolate(
+                self.scale4(x), size=[h, w], mode='bilinear') + x3)
+
+        out = self.compression(paddle.concat([x0, x1, x2, x3, x4],
+                                             1)) + self.shortcut(x)
+        return out
+
+
+class DDRNetHead(nn.Layer):
+    def __init__(self, inplanes, interplanes, outplanes, scale_factor=None):
+        super().__init__()
+        self.bn1 = nn.BatchNorm2D(inplanes)
+        self.relu = nn.ReLU()
+        self.conv_bn_relu = layers.ConvBNReLU(
+            inplanes, interplanes, kernel_size=3, padding=1, bias_attr=False)
+        self.conv = nn.Conv2D(
+            interplanes, outplanes, kernel_size=1, padding=0, bias_attr=True)
+
+        self.scale_factor = scale_factor
+
+    def forward(self, x):
+        x = self.bn1(x)
+        x = self.relu(x)
+        x = self.conv_bn_relu(x)
+        out = self.conv(x)
+
+        if self.scale_factor is not None:
+            out = F.interpolate(
+                out, scale_factor=self.scale_factor, mode='bilinear')
+        return out
+
+
+@manager.MODELS.add_component
+def DDRNet_23(**kwargs):
+    return DualResNet(
+        block_layers=[2, 2, 2, 2],
+        planes=64,
+        spp_planes=128,
+        head_planes=128,
+        **kwargs)
--- a/paddlers/models/ppseg/models/decoupled_segnet.py
+++ b/paddlers/models/ppseg/models/decoupled_segnet.py
@ -18,11 +18,11 @@ import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.models import layers
-from paddlers.models.ppseg.models.backbones import resnet_vd
-from paddlers.models.ppseg.models import deeplab
-from paddlers.models.ppseg.utils import utils
+from paddleseg.cvlibs import manager
+from paddleseg.models import layers
+from paddleseg.models.backbones import resnet_vd
+from paddleseg.models import deeplab
+from paddleseg.utils import utils


@manager.MODELS.add_component
--- a/paddlers/models/ppseg/models/deeplab.py
+++ b/paddlers/models/ppseg/models/deeplab.py
@ -16,9 +16,9 @@ import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.models import layers
-from paddlers.models.ppseg.utils import utils
+from paddleseg.cvlibs import manager
+from paddleseg.models import layers
+from paddleseg.utils import utils

 __all__ = ['DeepLabV3P', 'DeepLabV3']

--- a/paddlers/models/ppseg/models/dmnet.py
+++ b/paddlers/models/ppseg/models/dmnet.py
@ -16,9 +16,9 @@ import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.models import layers
-from paddlers.models.ppseg.utils import utils
+from paddleseg.cvlibs import manager
+from paddleseg.models import layers
+from paddleseg.utils import utils


@manager.MODELS.add_component
--- a/paddlers/models/ppseg/models/dnlnet.py
+++ b/paddlers/models/ppseg/models/dnlnet.py
@ -16,9 +16,9 @@ import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.models import layers
-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.utils import utils
+from paddleseg.models import layers
+from paddleseg.cvlibs import manager
+from paddleseg.utils import utils


@manager.MODELS.add_component
--- a/paddlers/models/ppseg/models/emanet.py
+++ b/paddlers/models/ppseg/models/emanet.py
@ -16,9 +16,9 @@ import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.models import layers
-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.utils import utils
+from paddleseg.models import layers
+from paddleseg.cvlibs import manager
+from paddleseg.utils import utils


@manager.MODELS.add_component
@ -209,7 +209,9 @@ class EMAU(nn.Layer):
            mu = F.normalize(mu, axis=1, p=2)
            mu = self.mu * (1 - self.momentum) + mu * self.momentum
            if paddle.distributed.get_world_size() > 1:
-                mu = paddle.distributed.all_reduce(mu)
+                out = paddle.distributed.all_reduce(mu)
+                if out is not None:
+                    mu = out
                mu /= paddle.distributed.get_world_size()
            self.mu = mu

--- a/paddlers/models/ppseg/models/encnet.py
+++ b/paddlers/models/ppseg/models/encnet.py
@ -16,9 +16,9 @@ import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.models import layers
-from paddlers.models.ppseg.utils import utils
+from paddleseg.cvlibs import manager
+from paddleseg.models import layers
+from paddleseg.utils import utils


@manager.MODELS.add_component
--- a/paddlers/models/ppseg/models/enet.py
+++ b/paddlers/models/ppseg/models/enet.py
@ -16,9 +16,9 @@ import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg import utils
-from paddlers.models.ppseg.models import layers
-from paddlers.models.ppseg.cvlibs import manager, param_init
+from paddleseg import utils
+from paddleseg.models import layers
+from paddleseg.cvlibs import manager, param_init

 __all__ = ['ENet']

@ -34,6 +34,7 @@ class ENet(nn.Layer):

    Args:
        num_classes (int): The unique number of target classes.
+        in_channels (int, optional): The channels of input image. Default: 3.
        pretrained (str, optional): The path or url of pretrained model. Default: None.
        encoder_relu (bool, optional): When ``True`` ReLU is used as the activation
            function; otherwise, PReLU is used. Default: False.
@ -43,13 +44,14 @@ class ENet(nn.Layer):

    def __init__(self,
                 num_classes,
+                 in_channels=3,
                 pretrained=None,
                 encoder_relu=False,
                 decoder_relu=True):
        super(ENet, self).__init__()

        self.numclasses = num_classes
-        self.initial_block = InitialBlock(3, 16, relu=encoder_relu)
+        self.initial_block = InitialBlock(in_channels, 16, relu=encoder_relu)

        self.downsample1_0 = DownsamplingBottleneck(
            16, 64, return_indices=True, dropout_prob=0.01, relu=encoder_relu)
--- a/paddlers/models/ppseg/models/espnet.py
+++ b/paddlers/models/ppseg/models/espnet.py
@ -18,9 +18,9 @@ import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg import utils
-from paddlers.models.ppseg.cvlibs import manager, param_init
-from paddlers.models.ppseg.models import layers
+from paddleseg import utils
+from paddleseg.cvlibs import manager, param_init
+from paddleseg.models import layers


@manager.MODELS.add_component
--- a/paddlers/models/ppseg/models/espnetv1.py
+++ b/paddlers/models/ppseg/models/espnetv1.py
@ -16,9 +16,9 @@ import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.models import layers
-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.utils import utils
+from paddleseg.models import layers
+from paddleseg.cvlibs import manager
+from paddleseg.utils import utils


@manager.MODELS.add_component
--- a/paddlers/models/ppseg/models/fast_scnn.py
+++ b/paddlers/models/ppseg/models/fast_scnn.py
@ -16,9 +16,9 @@ import paddle.nn as nn
 import paddle.nn.functional as F

 import paddle
-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.models import layers
-from paddlers.models.ppseg.utils import utils
+from paddleseg.cvlibs import manager
+from paddleseg.models import layers
+from paddleseg.utils import utils

 __all__ = ['FastSCNN']

@ -34,6 +34,7 @@ class FastSCNN(nn.Layer):
    (https://arxiv.org/pdf/1902.04502.pdf).
    Args:
        num_classes (int): The unique number of target classes.
+        in_channels (int, optional): The channels of input image. Default: 3.
        enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss.
            If true, auxiliary loss will be added after LearningToDownsample module. Default: False.
        align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
@ -43,13 +44,15 @@ class FastSCNN(nn.Layer):

    def __init__(self,
                 num_classes,
+                 in_channels=3,
                 enable_auxiliary_loss=True,
                 align_corners=False,
                 pretrained=None):

        super().__init__()

-        self.learning_to_downsample = LearningToDownsample(32, 48, 64)
+        self.learning_to_downsample = LearningToDownsample(in_channels, 32, 48,
+                                                           64)
        self.global_feature_extractor = GlobalFeatureExtractor(
            in_channels=64,
            block_channels=[64, 96, 128],
@ -108,11 +111,18 @@ class LearningToDownsample(nn.Layer):
        out_channels (int, optional): The output channels of LearningToDownsample module. Default: 64.
    """

-    def __init__(self, dw_channels1=32, dw_channels2=48, out_channels=64):
+    def __init__(self,
+                 in_channels=3,
+                 dw_channels1=32,
+                 dw_channels2=48,
+                 out_channels=64):
        super(LearningToDownsample, self).__init__()

        self.conv_bn_relu = layers.ConvBNReLU(
-            in_channels=3, out_channels=dw_channels1, kernel_size=3, stride=2)
+            in_channels=in_channels,
+            out_channels=dw_channels1,
+            kernel_size=3,
+            stride=2)
        self.dsconv_bn_relu1 = layers.SeparableConvBNReLU(
            in_channels=dw_channels1,
            out_channels=dw_channels2,
--- a/paddlers/models/ppseg/models/fastfcn.py
+++ b/paddlers/models/ppseg/models/fastfcn.py
@ -16,9 +16,9 @@ import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.models import layers
-from paddlers.models.ppseg.utils import utils
+from paddleseg.cvlibs import manager
+from paddleseg.models import layers
+from paddleseg.utils import utils


@manager.MODELS.add_component
--- a/paddlers/models/ppseg/models/fcn.py
+++ b/paddlers/models/ppseg/models/fcn.py
@ -16,9 +16,9 @@ import paddle.nn as nn
 import paddle.nn.functional as F

 import paddle
-from paddlers.models.ppseg import utils
-from paddlers.models.ppseg.cvlibs import manager, param_init
-from paddlers.models.ppseg.models import layers
+from paddleseg import utils
+from paddleseg.cvlibs import manager, param_init
+from paddleseg.models import layers


@manager.MODELS.add_component
--- a/paddlers/models/ppseg/models/gcnet.py
+++ b/paddlers/models/ppseg/models/gcnet.py
@ -16,9 +16,9 @@ import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.models import layers
-from paddlers.models.ppseg.utils import utils
+from paddleseg.cvlibs import manager
+from paddleseg.models import layers
+from paddleseg.utils import utils


@manager.MODELS.add_component
--- a/paddlers/models/ppseg/models/ginet.py
+++ b/paddlers/models/ppseg/models/ginet.py
@ -16,9 +16,9 @@ import paddle
 import paddle.nn as nn
 from paddle.nn import functional as F

-from paddlers.models.ppseg.utils import utils
-from paddlers.models.ppseg.models import layers
-from paddlers.models.ppseg.cvlibs import manager
+from paddleseg.utils import utils
+from paddleseg.models import layers
+from paddleseg.cvlibs import manager


@manager.MODELS.add_component
@ -92,7 +92,7 @@ class GINet(nn.Layer):

        return [
            F.interpolate(
-                logit, (h, w),
+                logit, [h, w],
                mode='bilinear',
                align_corners=self.align_corners) for logit in logit_list
        ]
--- a/paddlers/models/ppseg/models/glore.py
+++ b/paddlers/models/ppseg/models/glore.py
@ -0,0 +1,198 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddleseg.cvlibs import manager
+from paddleseg.models import layers
+from paddleseg.utils import utils
+
+
+@manager.MODELS.add_component
+class GloRe(nn.Layer):
+    """
+    The GloRe implementation based on PaddlePaddle.
+
+    The original article refers to:
+       Chen, Yunpeng, et al. "Graph-Based Global Reasoning Networks"
+       (https://arxiv.org/pdf/1811.12814.pdf)
+    
+    Args:
+        num_classes (int): The unique number of target classes.
+        backbone (Paddle.nn.Layer): Backbone network, currently support Resnet50/101.
+        backbone_indices (tuple, optional): Two values in the tuple indicate the indices of output of backbone.
+        gru_channels (int, optional): The number of input channels in GloRe Unit. Default: 512.
+        gru_num_state (int, optional): The number of states in GloRe Unit. Default: 128.
+        gru_num_node (tuple, optional): The number of nodes in GloRe Unit. Default: Default: 128.
+        enable_auxiliary_loss (bool, optional): A bool value indicates whether adding auxiliary loss. Default: True.
+        align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
+            e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 backbone,
+                 backbone_indices=(2, 3),
+                 gru_channels=512,
+                 gru_num_state=128,
+                 gru_num_node=64,
+                 enable_auxiliary_loss=True,
+                 align_corners=False,
+                 pretrained=None):
+        super().__init__()
+
+        self.backbone = backbone
+        backbone_channels = [
+            backbone.feat_channels[i] for i in backbone_indices
+        ]
+
+        self.head = GloReHead(num_classes, backbone_indices, backbone_channels,
+                              gru_channels, gru_num_state, gru_num_node,
+                              enable_auxiliary_loss)
+        self.align_corners = align_corners
+        self.pretrained = pretrained
+        self.init_weight()
+
+    def forward(self, x):
+        feat_list = self.backbone(x)
+        logit_list = self.head(feat_list)
+        return [
+            F.interpolate(
+                logit,
+                paddle.shape(x)[2:],
+                mode='bilinear',
+                align_corners=self.align_corners) for logit in logit_list
+        ]
+
+    def init_weight(self):
+        if self.pretrained is not None:
+            utils.load_entire_model(self, self.pretrained)
+
+
+class GloReHead(nn.Layer):
+    def __init__(self,
+                 num_classes,
+                 backbone_indices,
+                 backbone_channels,
+                 gru_channels=512,
+                 gru_num_state=128,
+                 gru_num_node=64,
+                 enable_auxiliary_loss=True):
+        super().__init__()
+
+        in_channels = backbone_channels[1]
+        self.conv_bn_relu = layers.ConvBNReLU(
+            in_channels, gru_channels, 1, bias_attr=False)
+        self.gru_module = GruModule(
+            num_input=gru_channels,
+            num_state=gru_num_state,
+            num_node=gru_num_node)
+
+        self.dropout = nn.Dropout(0.1)
+        self.classifier = nn.Conv2D(512, num_classes, kernel_size=1)
+        self.auxlayer = layers.AuxLayer(
+            in_channels=backbone_channels[0],
+            inter_channels=backbone_channels[0] // 4,
+            out_channels=num_classes)
+
+        self.backbone_indices = backbone_indices
+        self.enable_auxiliary_loss = enable_auxiliary_loss
+
+    def forward(self, feat_list):
+
+        logit_list = []
+        x = feat_list[self.backbone_indices[1]]
+
+        feature = self.conv_bn_relu(x)
+        gru_output = self.gru_module(feature)
+        output = self.dropout(gru_output)
+        logit = self.classifier(output)
+        logit_list.append(logit)
+
+        if self.enable_auxiliary_loss:
+            low_level_feat = feat_list[self.backbone_indices[0]]
+            auxiliary_logit = self.auxlayer(low_level_feat)
+            logit_list.append(auxiliary_logit)
+
+        return logit_list
+
+
+class GCN(nn.Layer):
+    def __init__(self, num_state, num_node, bias=False):
+        super(GCN, self).__init__()
+        self.conv1 = nn.Conv1D(num_node, num_node, kernel_size=1)
+        self.relu = nn.ReLU()
+        self.conv2 = nn.Conv1D(
+            num_state, num_state, kernel_size=1, bias_attr=bias)
+
+    def forward(self, x):
+        h = self.conv1(paddle.transpose(x, perm=(0, 2, 1)))
+        h = paddle.transpose(h, perm=(0, 2, 1))
+        h = h + x
+        h = self.relu(self.conv2(h))
+        return h
+
+
+class GruModule(nn.Layer):
+    def __init__(self,
+                 num_input=512,
+                 num_state=128,
+                 num_node=64,
+                 normalize=False):
+        super(GruModule, self).__init__()
+        self.normalize = normalize
+        self.num_state = num_state
+        self.num_node = num_node
+        self.reduction_dim = nn.Conv2D(num_input, num_state, kernel_size=1)
+        self.projection_mat = nn.Conv2D(num_input, num_node, kernel_size=1)
+        self.gcn = GCN(num_state=self.num_state, num_node=self.num_node)
+        self.extend_dim = nn.Conv2D(
+            self.num_state, num_input, kernel_size=1, bias_attr=False)
+        self.extend_bn = layers.SyncBatchNorm(num_input, epsilon=1e-4)
+
+    def forward(self, input):
+        n, c, h, w = input.shape
+        # B, C, H, W
+        reduction_dim = self.reduction_dim(input)
+        # B, N, H, W
+        mat_B = self.projection_mat(input)
+        # B, C, H*W
+        reshaped_reduction = paddle.reshape(
+            reduction_dim, shape=[n, self.num_state, h * w])
+        # B, N, H*W
+        reshaped_B = paddle.reshape(mat_B, shape=[n, self.num_node, h * w])
+        # B, N, H*W
+        reproject = reshaped_B
+        # B, C, N
+        node_state_V = paddle.matmul(
+            reshaped_reduction, paddle.transpose(
+                reshaped_B, perm=[0, 2, 1]))
+
+        if self.normalize:
+            node_state_V = node_state_V * (1. / reshaped_reduction.shape[2])
+
+        # B, C, N
+        gcn_out = self.gcn(node_state_V)
+        # B, C, H*W
+        Y = paddle.matmul(gcn_out, reproject)
+        # B, C, H, W
+        Y = paddle.reshape(Y, shape=[n, self.num_state, h, w])
+        Y_extend = self.extend_dim(Y)
+        Y_extend = self.extend_bn(Y_extend)
+
+        out = input + Y_extend
+        return out
--- a/paddlers/models/ppseg/models/gscnn.py
+++ b/paddlers/models/ppseg/models/gscnn.py
@ -18,11 +18,11 @@ import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.models import layers
-from paddlers.models.ppseg.models.backbones import resnet_vd
-from paddlers.models.ppseg.models import deeplab
-from paddlers.models.ppseg.utils import utils
+from paddleseg.cvlibs import manager
+from paddleseg.models import layers
+from paddleseg.models.backbones import resnet_vd
+from paddleseg.models import deeplab
+from paddleseg.utils import utils


@manager.MODELS.add_component
--- a/paddlers/models/ppseg/models/hardnet.py
+++ b/paddlers/models/ppseg/models/hardnet.py
@ -16,9 +16,9 @@ import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.models import layers
-from paddlers.models.ppseg.utils import utils
+from paddleseg.cvlibs import manager
+from paddleseg.models import layers
+from paddleseg.utils import utils


@manager.MODELS.add_component
@ -31,6 +31,7 @@ class HarDNet(nn.Layer):

    Args:
        num_classes (int): The unique number of target classes.
+        in_channels (int, optional): The channels of input image. Default: 3.
        stem_channels (tuple|list, optional): The number of channels before the encoder. Default: (16, 24, 32, 48).
        ch_list (tuple|list, optional): The number of channels at each block in the encoder. Default: (64, 96, 160, 224, 320).
        grmul (float, optional): The channel multiplying factor in HarDBlock, which is m in the paper. Default: 1.7.
@ -43,6 +44,7 @@ class HarDNet(nn.Layer):

    def __init__(self,
                 num_classes,
+                 in_channels=3,
                 stem_channels=(16, 24, 32, 48),
                 ch_list=(64, 96, 160, 224, 320),
                 grmul=1.7,
@ -60,7 +62,7 @@ class HarDNet(nn.Layer):

        self.stem = nn.Sequential(
            layers.ConvBNReLU(
-                3, stem_channels[0], kernel_size=3, bias_attr=False),
+                in_channels, stem_channels[0], kernel_size=3, bias_attr=False),
            layers.ConvBNReLU(
                stem_channels[0],
                stem_channels[1],
--- a/paddlers/models/ppseg/models/hrnet_contrast.py
+++ b/paddlers/models/ppseg/models/hrnet_contrast.py
@ -16,9 +16,9 @@ import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.models import layers
-from paddlers.models.ppseg.utils import utils
+from paddleseg.cvlibs import manager
+from paddleseg.models import layers
+from paddleseg.utils import utils


@manager.MODELS.add_component
--- a/paddlers/models/ppseg/models/isanet.py
+++ b/paddlers/models/ppseg/models/isanet.py
@ -16,9 +16,9 @@ import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.models import layers
-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.utils import utils
+from paddleseg.models import layers
+from paddleseg.cvlibs import manager
+from paddleseg.utils import utils


@manager.MODELS.add_component
--- a/paddlers/models/ppseg/models/layers/init.py
+++ b/paddlers/models/ppseg/models/layers/init.py
@ -12,9 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from .layer_libs import ConvBNReLU, ConvBN, SeparableConvBNReLU, DepthwiseConvBN, AuxLayer, SyncBatchNorm, JPU, ConvBNPReLU
+from .layer_libs import ConvBNReLU, ConvBN, SeparableConvBNReLU, DepthwiseConvBN, AuxLayer, SyncBatchNorm, JPU, ConvBNPReLU, ConvBNAct, ConvBNLeakyReLU
 from .activation import Activation
 from .pyramid_pool import ASPPModule, PPModule
 from .attention import AttentionBlock
 from .nonlocal2d import NonLocal2D
 from .wrap_functions import *
+from .tensor_fusion import UAFM_SpAtten, UAFM_SpAtten_S, UAFM_ChAtten, UAFM_ChAtten_S, UAFM, UAFMMobile, UAFMMobile_SpAtten
--- a/paddlers/models/ppseg/models/layers/activation.py
+++ b/paddlers/models/ppseg/models/layers/activation.py
@ -33,7 +33,7 @@ class Activation(nn.Layer):

    Examples:

-        from paddlers.models.ppseg.models.common.activation import Activation
+        from paddleseg.models.common.activation import Activation

        relu = Activation("relu")
        print(relu)
--- a/paddlers/models/ppseg/models/layers/attention.py
+++ b/paddlers/models/ppseg/models/layers/attention.py
@ -16,7 +16,7 @@ import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.models import layers
+from paddleseg.models import layers


 class AttentionBlock(nn.Layer):
@ -144,3 +144,129 @@ class AttentionBlock(nn.Layer):
        if self.out_project is not None:
            context = self.out_project(context)
        return context
+
+
+class DualAttentionModule(nn.Layer):
+    """
+    Dual attention module.
+
+    Args:
+        in_channels (int): The number of input channels.
+        out_channels (int): The number of output channels.
+    """
+
+    def __init__(self, in_channels, out_channels):
+        super().__init__()
+        inter_channels = in_channels // 4
+
+        self.channel_conv = layers.ConvBNReLU(in_channels, inter_channels, 1)
+        self.position_conv = layers.ConvBNReLU(in_channels, inter_channels, 1)
+        self.pam = PAM(inter_channels)
+        self.cam = CAM(inter_channels)
+        self.conv1 = layers.ConvBNReLU(inter_channels, inter_channels, 3)
+        self.conv2 = layers.ConvBNReLU(inter_channels, inter_channels, 3)
+        self.conv3 = layers.ConvBNReLU(inter_channels, out_channels, 3)
+
+    def forward(self, feats):
+        channel_feats = self.channel_conv(feats)
+        channel_feats = self.cam(channel_feats)
+        channel_feats = self.conv1(channel_feats)
+
+        position_feats = self.position_conv(feats)
+        position_feats = self.pam(position_feats)
+        position_feats = self.conv2(position_feats)
+
+        feats_sum = position_feats + channel_feats
+        out = self.conv3(feats_sum)
+        return out
+
+
+class PAM(nn.Layer):
+    """
+    Position attention module.
+    Args:
+        in_channels (int): The number of input channels.
+    """
+
+    def __init__(self, in_channels):
+        super().__init__()
+        mid_channels = in_channels // 8
+        self.mid_channels = mid_channels
+        self.in_channels = in_channels
+
+        self.query_conv = nn.Conv2D(in_channels, mid_channels, 1, 1)
+        self.key_conv = nn.Conv2D(in_channels, mid_channels, 1, 1)
+        self.value_conv = nn.Conv2D(in_channels, in_channels, 1, 1)
+
+        self.gamma = self.create_parameter(
+            shape=[1],
+            dtype='float32',
+            default_initializer=nn.initializer.Constant(0))
+
+    def forward(self, x):
+        x_shape = paddle.shape(x)
+
+        # query: n, h * w, c1
+        query = self.query_conv(x)
+        query = paddle.reshape(query, (0, self.mid_channels, -1))
+        query = paddle.transpose(query, (0, 2, 1))
+
+        # key: n, c1, h * w
+        key = self.key_conv(x)
+        key = paddle.reshape(key, (0, self.mid_channels, -1))
+
+        # sim: n, h * w, h * w
+        sim = paddle.bmm(query, key)
+        sim = F.softmax(sim, axis=-1)
+
+        value = self.value_conv(x)
+        value = paddle.reshape(value, (0, self.in_channels, -1))
+        sim = paddle.transpose(sim, (0, 2, 1))
+
+        # feat: from (n, c2, h * w) -> (n, c2, h, w)
+        feat = paddle.bmm(value, sim)
+        feat = paddle.reshape(feat,
+                              (0, self.in_channels, x_shape[2], x_shape[3]))
+
+        out = self.gamma * feat + x
+        return out
+
+
+class CAM(nn.Layer):
+    """
+    Channel attention module.
+    Args:
+        in_channels (int): The number of input channels.
+    """
+
+    def __init__(self, channels):
+        super().__init__()
+
+        self.channels = channels
+        self.gamma = self.create_parameter(
+            shape=[1],
+            dtype='float32',
+            default_initializer=nn.initializer.Constant(0))
+
+    def forward(self, x):
+        x_shape = paddle.shape(x)
+        # query: n, c, h * w
+        query = paddle.reshape(x, (0, self.channels, -1))
+        # key: n, h * w, c
+        key = paddle.reshape(x, (0, self.channels, -1))
+        key = paddle.transpose(key, (0, 2, 1))
+
+        # sim: n, c, c
+        sim = paddle.bmm(query, key)
+        # The danet author claims that this can avoid gradient divergence
+        sim = paddle.max(sim, axis=-1, keepdim=True).tile(
+            [1, 1, self.channels]) - sim
+        sim = F.softmax(sim, axis=-1)
+
+        # feat: from (n, c, h * w) to (n, c, h, w)
+        value = paddle.reshape(x, (0, self.channels, -1))
+        feat = paddle.bmm(sim, value)
+        feat = paddle.reshape(feat, (0, self.channels, x_shape[2], x_shape[3]))
+
+        out = self.gamma * feat + x
+        return out
--- a/paddlers/models/ppseg/models/layers/layer_libs.py
+++ b/paddlers/models/ppseg/models/layers/layer_libs.py
@ -17,7 +17,7 @@ import os
 import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F
-from paddlers.models.ppseg.models import layers
+from paddleseg.models import layers


 def SyncBatchNorm(*args, **kwargs):
@ -56,6 +56,37 @@ class ConvBNReLU(nn.Layer):
        return x


+class ConvBNAct(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 padding='same',
+                 act_type=None,
+                 **kwargs):
+        super().__init__()
+
+        self._conv = nn.Conv2D(
+            in_channels, out_channels, kernel_size, padding=padding, **kwargs)
+
+        if 'data_format' in kwargs:
+            data_format = kwargs['data_format']
+        else:
+            data_format = 'NCHW'
+        self._batch_norm = SyncBatchNorm(out_channels, data_format=data_format)
+
+        self._act_type = act_type
+        if act_type is not None:
+            self._act = layers.Activation(act_type)
+
+    def forward(self, x):
+        x = self._conv(x)
+        x = self._batch_norm(x)
+        if self._act_type is not None:
+            x = self._act(x)
+        return x
+
+
 class ConvBN(nn.Layer):
    def __init__(self,
                 in_channels,
@ -293,3 +324,29 @@ class ConvBNPReLU(nn.Layer):
        x = self._batch_norm(x)
        x = self._prelu(x)
        return x
+
+
+class ConvBNLeakyReLU(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 padding='same',
+                 **kwargs):
+        super().__init__()
+
+        self._conv = nn.Conv2D(
+            in_channels, out_channels, kernel_size, padding=padding, **kwargs)
+
+        if 'data_format' in kwargs:
+            data_format = kwargs['data_format']
+        else:
+            data_format = 'NCHW'
+        self._batch_norm = SyncBatchNorm(out_channels, data_format=data_format)
+        self._relu = layers.Activation("leakyrelu")
+
+    def forward(self, x):
+        x = self._conv(x)
+        x = self._batch_norm(x)
+        x = self._relu(x)
+        return x
--- a/paddlers/models/ppseg/models/layers/nonlocal2d.py
+++ b/paddlers/models/ppseg/models/layers/nonlocal2d.py
@ -16,7 +16,7 @@ import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.models import layers
+from paddleseg.models import layers


 class NonLocal2D(nn.Layer):
--- a/paddlers/models/ppseg/models/layers/pyramid_pool.py
+++ b/paddlers/models/ppseg/models/layers/pyramid_pool.py
@ -16,7 +16,7 @@ import paddle
 import paddle.nn.functional as F
 from paddle import nn

-from paddlers.models.ppseg.models import layers
+from paddleseg.models import layers


 class ASPPModule(nn.Layer):
--- a/paddlers/models/ppseg/models/layers/tensor_fusion.py
+++ b/paddlers/models/ppseg/models/layers/tensor_fusion.py
@ -0,0 +1,285 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle import ParamAttr
+from paddle.nn.initializer import Constant
+from paddleseg.models import layers
+from paddleseg.models.layers import tensor_fusion_helper as helper
+
+
+class UAFM(nn.Layer):
+    """
+    The base of Unified Attention Fusion Module.
+    Args:
+        x_ch (int): The channel of x tensor, which is the low level feature.
+        y_ch (int): The channel of y tensor, which is the high level feature.
+        out_ch (int): The channel of output tensor.
+        ksize (int, optional): The kernel size of the conv for x tensor. Default: 3.
+        resize_mode (str, optional): The resize model in unsampling y tensor. Default: bilinear.
+    """
+
+    def __init__(self, x_ch, y_ch, out_ch, ksize=3, resize_mode='bilinear'):
+        super().__init__()
+
+        self.conv_x = layers.ConvBNReLU(
+            x_ch, y_ch, kernel_size=ksize, padding=ksize // 2, bias_attr=False)
+        self.conv_out = layers.ConvBNReLU(
+            y_ch, out_ch, kernel_size=3, padding=1, bias_attr=False)
+        self.resize_mode = resize_mode
+
+    def check(self, x, y):
+        assert x.ndim == 4 and y.ndim == 4
+        x_h, x_w = x.shape[2:]
+        y_h, y_w = y.shape[2:]
+        assert x_h >= y_h and x_w >= y_w
+
+    def prepare(self, x, y):
+        x = self.prepare_x(x, y)
+        y = self.prepare_y(x, y)
+        return x, y
+
+    def prepare_x(self, x, y):
+        x = self.conv_x(x)
+        return x
+
+    def prepare_y(self, x, y):
+        y_up = F.interpolate(y, paddle.shape(x)[2:], mode=self.resize_mode)
+        return y_up
+
+    def fuse(self, x, y):
+        out = x + y
+        out = self.conv_out(out)
+        return out
+
+    def forward(self, x, y):
+        """
+        Args:
+            x (Tensor): The low level feature.
+            y (Tensor): The high level feature.
+        """
+        self.check(x, y)
+        x, y = self.prepare(x, y)
+        out = self.fuse(x, y)
+        return out
+
+
+class UAFM_ChAtten(UAFM):
+    """
+    The UAFM with channel attention, which uses mean and max values.
+    Args:
+        x_ch (int): The channel of x tensor, which is the low level feature.
+        y_ch (int): The channel of y tensor, which is the high level feature.
+        out_ch (int): The channel of output tensor.
+        ksize (int, optional): The kernel size of the conv for x tensor. Default: 3.
+        resize_mode (str, optional): The resize model in unsampling y tensor. Default: bilinear.
+    """
+
+    def __init__(self, x_ch, y_ch, out_ch, ksize=3, resize_mode='bilinear'):
+        super().__init__(x_ch, y_ch, out_ch, ksize, resize_mode)
+
+        self.conv_xy_atten = nn.Sequential(
+            layers.ConvBNAct(
+                4 * y_ch,
+                y_ch // 2,
+                kernel_size=1,
+                bias_attr=False,
+                act_type="leakyrelu"),
+            layers.ConvBN(
+                y_ch // 2, y_ch, kernel_size=1, bias_attr=False))
+
+    def fuse(self, x, y):
+        """
+        Args:
+            x (Tensor): The low level feature.
+            y (Tensor): The high level feature.
+        """
+        atten = helper.avg_max_reduce_hw([x, y], self.training)
+        atten = F.sigmoid(self.conv_xy_atten(atten))
+
+        out = x * atten + y * (1 - atten)
+        out = self.conv_out(out)
+        return out
+
+
+class UAFM_ChAtten_S(UAFM):
+    """
+    The UAFM with channel attention, which uses mean values.
+    Args:
+        x_ch (int): The channel of x tensor, which is the low level feature.
+        y_ch (int): The channel of y tensor, which is the high level feature.
+        out_ch (int): The channel of output tensor.
+        ksize (int, optional): The kernel size of the conv for x tensor. Default: 3.
+        resize_mode (str, optional): The resize model in unsampling y tensor. Default: bilinear.
+    """
+
+    def __init__(self, x_ch, y_ch, out_ch, ksize=3, resize_mode='bilinear'):
+        super().__init__(x_ch, y_ch, out_ch, ksize, resize_mode)
+
+        self.conv_xy_atten = nn.Sequential(
+            layers.ConvBNAct(
+                2 * y_ch,
+                y_ch // 2,
+                kernel_size=1,
+                bias_attr=False,
+                act_type="leakyrelu"),
+            layers.ConvBN(
+                y_ch // 2, y_ch, kernel_size=1, bias_attr=False))
+
+    def fuse(self, x, y):
+        """
+        Args:
+            x (Tensor): The low level feature.
+            y (Tensor): The high level feature.
+        """
+        atten = helper.avg_reduce_hw([x, y])
+        atten = F.sigmoid(self.conv_xy_atten(atten))
+
+        out = x * atten + y * (1 - atten)
+        out = self.conv_out(out)
+        return out
+
+
+class UAFM_SpAtten(UAFM):
+    """
+    The UAFM with spatial attention, which uses mean and max values.
+    Args:
+        x_ch (int): The channel of x tensor, which is the low level feature.
+        y_ch (int): The channel of y tensor, which is the high level feature.
+        out_ch (int): The channel of output tensor.
+        ksize (int, optional): The kernel size of the conv for x tensor. Default: 3.
+        resize_mode (str, optional): The resize model in unsampling y tensor. Default: bilinear.
+    """
+
+    def __init__(self, x_ch, y_ch, out_ch, ksize=3, resize_mode='bilinear'):
+        super().__init__(x_ch, y_ch, out_ch, ksize, resize_mode)
+
+        self.conv_xy_atten = nn.Sequential(
+            layers.ConvBNReLU(
+                4, 2, kernel_size=3, padding=1, bias_attr=False),
+            layers.ConvBN(
+                2, 1, kernel_size=3, padding=1, bias_attr=False))
+        self._scale = self.create_parameter(
+            shape=[1],
+            attr=ParamAttr(initializer=Constant(value=1.)),
+            dtype="float32")
+        self._scale.stop_gradient = True
+
+    def fuse(self, x, y):
+        """
+        Args:
+            x (Tensor): The low level feature.
+            y (Tensor): The high level feature.
+        """
+        atten = helper.avg_max_reduce_channel([x, y])
+        atten = F.sigmoid(self.conv_xy_atten(atten))
+
+        out = x * atten + y * (self._scale - atten)
+        out = self.conv_out(out)
+        return out
+
+
+class UAFM_SpAtten_S(UAFM):
+    """
+    The UAFM with spatial attention, which uses mean values.
+    Args:
+        x_ch (int): The channel of x tensor, which is the low level feature.
+        y_ch (int): The channel of y tensor, which is the high level feature.
+        out_ch (int): The channel of output tensor.
+        ksize (int, optional): The kernel size of the conv for x tensor. Default: 3.
+        resize_mode (str, optional): The resize model in unsampling y tensor. Default: bilinear.
+    """
+
+    def __init__(self, x_ch, y_ch, out_ch, ksize=3, resize_mode='bilinear'):
+        super().__init__(x_ch, y_ch, out_ch, ksize, resize_mode)
+
+        self.conv_xy_atten = nn.Sequential(
+            layers.ConvBNReLU(
+                2, 2, kernel_size=3, padding=1, bias_attr=False),
+            layers.ConvBN(
+                2, 1, kernel_size=3, padding=1, bias_attr=False))
+
+    def fuse(self, x, y):
+        """
+        Args:
+            x (Tensor): The low level feature.
+            y (Tensor): The high level feature.
+        """
+        atten = helper.avg_reduce_channel([x, y])
+        atten = F.sigmoid(self.conv_xy_atten(atten))
+
+        out = x * atten + y * (1 - atten)
+        out = self.conv_out(out)
+        return out
+
+
+class UAFMMobile(UAFM):
+    """
+    Unified Attention Fusion Module for mobile.
+    Args:
+        x_ch (int): The channel of x tensor, which is the low level feature.
+        y_ch (int): The channel of y tensor, which is the high level feature.
+        out_ch (int): The channel of output tensor.
+        ksize (int, optional): The kernel size of the conv for x tensor. Default: 3.
+        resize_mode (str, optional): The resize model in unsampling y tensor. Default: bilinear.
+    """
+
+    def __init__(self, x_ch, y_ch, out_ch, ksize=3, resize_mode='bilinear'):
+        super().__init__(x_ch, y_ch, out_ch, ksize, resize_mode)
+
+        self.conv_x = layers.SeparableConvBNReLU(
+            x_ch, y_ch, kernel_size=ksize, padding=ksize // 2, bias_attr=False)
+        self.conv_out = layers.SeparableConvBNReLU(
+            y_ch, out_ch, kernel_size=3, padding=1, bias_attr=False)
+
+
+class UAFMMobile_SpAtten(UAFM):
+    """
+    Unified Attention Fusion Module with spatial attention for mobile.
+    Args:
+        x_ch (int): The channel of x tensor, which is the low level feature.
+        y_ch (int): The channel of y tensor, which is the high level feature.
+        out_ch (int): The channel of output tensor.
+        ksize (int, optional): The kernel size of the conv for x tensor. Default: 3.
+        resize_mode (str, optional): The resize model in unsampling y tensor. Default: bilinear.
+    """
+
+    def __init__(self, x_ch, y_ch, out_ch, ksize=3, resize_mode='bilinear'):
+        super().__init__(x_ch, y_ch, out_ch, ksize, resize_mode)
+
+        self.conv_x = layers.SeparableConvBNReLU(
+            x_ch, y_ch, kernel_size=ksize, padding=ksize // 2, bias_attr=False)
+        self.conv_out = layers.SeparableConvBNReLU(
+            y_ch, out_ch, kernel_size=3, padding=1, bias_attr=False)
+
+        self.conv_xy_atten = nn.Sequential(
+            layers.ConvBNReLU(
+                4, 2, kernel_size=3, padding=1, bias_attr=False),
+            layers.ConvBN(
+                2, 1, kernel_size=3, padding=1, bias_attr=False))
+
+    def fuse(self, x, y):
+        """
+        Args:
+            x (Tensor): The low level feature.
+            y (Tensor): The high level feature.
+        """
+        atten = helper.avg_max_reduce_channel([x, y])
+        atten = F.sigmoid(self.conv_xy_atten(atten))
+
+        out = x * atten + y * (1 - atten)
+        out = self.conv_out(out)
+        return out
--- a/paddlers/models/ppseg/models/layers/tensor_fusion_helper.py
+++ b/paddlers/models/ppseg/models/layers/tensor_fusion_helper.py
@ -0,0 +1,133 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+
+def avg_reduce_hw(x):
+    # Reduce hw by avg
+    # Return cat([avg_pool_0, avg_pool_1, ...])
+    if not isinstance(x, (list, tuple)):
+        return F.adaptive_avg_pool2d(x, 1)
+    elif len(x) == 1:
+        return F.adaptive_avg_pool2d(x[0], 1)
+    else:
+        res = []
+        for xi in x:
+            res.append(F.adaptive_avg_pool2d(xi, 1))
+        return paddle.concat(res, axis=1)
+
+
+def avg_max_reduce_hw_helper(x, is_training, use_concat=True):
+    assert not isinstance(x, (list, tuple))
+    avg_pool = F.adaptive_avg_pool2d(x, 1)
+    # TODO(pjc): when axis=[2, 3], the paddle.max api has bug for training.
+    if is_training:
+        max_pool = F.adaptive_max_pool2d(x, 1)
+    else:
+        max_pool = paddle.max(x, axis=[2, 3], keepdim=True)
+
+    if use_concat:
+        res = paddle.concat([avg_pool, max_pool], axis=1)
+    else:
+        res = [avg_pool, max_pool]
+    return res
+
+
+def avg_max_reduce_hw(x, is_training):
+    # Reduce hw by avg and max
+    # Return cat([avg_pool_0, avg_pool_1, ..., max_pool_0, max_pool_1, ...])
+    if not isinstance(x, (list, tuple)):
+        return avg_max_reduce_hw_helper(x, is_training)
+    elif len(x) == 1:
+        return avg_max_reduce_hw_helper(x[0], is_training)
+    else:
+        res_avg = []
+        res_max = []
+        for xi in x:
+            avg, max = avg_max_reduce_hw_helper(xi, is_training, False)
+            res_avg.append(avg)
+            res_max.append(max)
+        res = res_avg + res_max
+        return paddle.concat(res, axis=1)
+
+
+def avg_reduce_channel(x):
+    # Reduce channel by avg
+    # Return cat([avg_ch_0, avg_ch_1, ...])
+    if not isinstance(x, (list, tuple)):
+        return paddle.mean(x, axis=1, keepdim=True)
+    elif len(x) == 1:
+        return paddle.mean(x[0], axis=1, keepdim=True)
+    else:
+        res = []
+        for xi in x:
+            res.append(paddle.mean(xi, axis=1, keepdim=True))
+        return paddle.concat(res, axis=1)
+
+
+def max_reduce_channel(x):
+    # Reduce channel by max
+    # Return cat([max_ch_0, max_ch_1, ...])
+    if not isinstance(x, (list, tuple)):
+        return paddle.max(x, axis=1, keepdim=True)
+    elif len(x) == 1:
+        return paddle.max(x[0], axis=1, keepdim=True)
+    else:
+        res = []
+        for xi in x:
+            res.append(paddle.max(xi, axis=1, keepdim=True))
+        return paddle.concat(res, axis=1)
+
+
+def avg_max_reduce_channel_helper(x, use_concat=True):
+    # Reduce hw by avg and max, only support single input
+    assert not isinstance(x, (list, tuple))
+    mean_value = paddle.mean(x, axis=1, keepdim=True)
+    max_value = paddle.max(x, axis=1, keepdim=True)
+
+    if use_concat:
+        res = paddle.concat([mean_value, max_value], axis=1)
+    else:
+        res = [mean_value, max_value]
+    return res
+
+
+def avg_max_reduce_channel(x):
+    # Reduce hw by avg and max
+    # Return cat([avg_ch_0, max_ch_0, avg_ch_1, max_ch_1, ...])
+    if not isinstance(x, (list, tuple)):
+        return avg_max_reduce_channel_helper(x)
+    elif len(x) == 1:
+        return avg_max_reduce_channel_helper(x[0])
+    else:
+        res = []
+        for xi in x:
+            res.extend(avg_max_reduce_channel_helper(xi, False))
+        return paddle.concat(res, axis=1)
+
+
+def cat_avg_max_reduce_channel(x):
+    # Reduce hw by cat+avg+max
+    assert isinstance(x, (list, tuple)) and len(x) > 1
+
+    x = paddle.concat(x, axis=1)
+
+    mean_value = paddle.mean(x, axis=1, keepdim=True)
+    max_value = paddle.max(x, axis=1, keepdim=True)
+    res = paddle.concat([mean_value, max_value], axis=1)
+
+    return res
--- a/paddlers/models/ppseg/models/losses/binary_cross_entropy_loss.py
+++ b/paddlers/models/ppseg/models/losses/binary_cross_entropy_loss.py
@ -16,7 +16,7 @@ import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.cvlibs import manager
+from paddleseg.cvlibs import manager


@manager.LOSSES.add_component
@ -99,7 +99,7 @@ class BCELoss(nn.Layer):
                    raise ValueError(
                        "if type of `weight` is str, it should equal to 'dynamic', but it is {}"
                        .format(self.weight))
-            elif isinstance(self.weight, paddle.VarBase):
+            elif not isinstance(self.weight, paddle.Tensor):
                raise TypeError(
                    'The type of `weight` is wrong, it should be Tensor or str, but it is {}'
                    .format(type(self.weight)))
--- a/paddlers/models/ppseg/models/losses/bootstrapped_cross_entropy.py
+++ b/paddlers/models/ppseg/models/losses/bootstrapped_cross_entropy.py
@ -16,7 +16,7 @@ import paddle
 from paddle import nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.cvlibs import manager
+from paddleseg.cvlibs import manager


@manager.LOSSES.add_component
--- a/paddlers/models/ppseg/models/losses/cross_entropy_loss.py
+++ b/paddlers/models/ppseg/models/losses/cross_entropy_loss.py
@ -16,7 +16,7 @@ import paddle
 from paddle import nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.cvlibs import manager
+from paddleseg.cvlibs import manager


@manager.LOSSES.add_component
@ -78,8 +78,6 @@ class CrossEntropyLoss(nn.Layer):
            logit = paddle.transpose(logit, [0, 2, 3, 1])
        label = label.astype('int64')

-        # In F.cross_entropy, the ignore_index is invalid, which needs to be fixed.
-        # When there is 255 in the label and paddle version <= 2.1.3, the cross_entropy OP will report an error, which is fixed in paddle develop version.
        loss = F.cross_entropy(
            logit,
            label,
@ -121,7 +119,7 @@ class CrossEntropyLoss(nn.Layer):
            loss = loss * semantic_weights

        if self.weight is not None:
-            _one_hot = F.one_hot(label, logit.shape[-1])
+            _one_hot = F.one_hot(label * mask, logit.shape[-1])
            coef = paddle.sum(_one_hot * self.weight, axis=-1)
        else:
            coef = paddle.ones_like(label)
--- a/paddlers/models/ppseg/models/losses/decoupledsegnet_relax_boundary_loss.py
+++ b/paddlers/models/ppseg/models/losses/decoupledsegnet_relax_boundary_loss.py
@ -16,9 +16,9 @@ import numpy as np
 import paddle
 from paddle import nn
 import paddle.nn.functional as F
-from scipy.ndimage.interpolation import shift
+from scipy.ndimage import shift

-from paddlers.models.ppseg.cvlibs import manager
+from paddleseg.cvlibs import manager


@manager.LOSSES.add_component
--- a/paddlers/models/ppseg/models/losses/detail_aggregate_loss.py
+++ b/paddlers/models/ppseg/models/losses/detail_aggregate_loss.py
@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -16,7 +16,7 @@ import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.cvlibs import manager
+from paddleseg.cvlibs import manager


@manager.LOSSES.add_component
--- a/paddlers/models/ppseg/models/losses/dice_loss.py
+++ b/paddlers/models/ppseg/models/losses/dice_loss.py
@ -13,44 +13,65 @@ import paddle
 from paddle import nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.cvlibs import manager
+from paddleseg.cvlibs import manager


@manager.LOSSES.add_component
 class DiceLoss(nn.Layer):
    """
-    Implements the dice loss function.
+    The implements of the dice loss.

    Args:
-        ignore_index (int64): Specifies a target value that is ignored
-            and does not contribute to the input gradient. Default ``255``.
-        smooth (float32): laplace smoothing,
-            to smooth dice loss and accelerate convergence. following:
-            https://github.com/pytorch/pytorch/issues/1249#issuecomment-337999895
+        weight (list[float], optional): The weight for each class. Default: None.
+        ignore_index (int64): ignore_index (int64, optional): Specifies a target value that
+            is ignored and does not contribute to the input gradient. Default ``255``.
+        smooth (float32): Laplace smoothing to smooth dice loss and accelerate convergence.
+            Default: 1.0
    """

-    def __init__(self, ignore_index=255, smooth=0.):
-        super(DiceLoss, self).__init__()
+    def __init__(self, weight=None, ignore_index=255, smooth=1.0):
+        super().__init__()
+        self.weight = weight
        self.ignore_index = ignore_index
-        self.eps = 1e-5
        self.smooth = smooth
+        self.eps = 1e-8

    def forward(self, logits, labels):
-        labels = paddle.cast(labels, dtype='int32')
-        labels_one_hot = F.one_hot(labels, num_classes=logits.shape[1])
-        labels_one_hot = paddle.transpose(labels_one_hot, [0, 3, 1, 2])
-        labels_one_hot = paddle.cast(labels_one_hot, dtype='float32')
+        num_class = logits.shape[1]
+        if self.weight is not None:
+            assert num_class == len(self.weight), \
+                "The lenght of weight should be euqal to the num class"
+
+        mask = labels != self.ignore_index
+        mask = paddle.cast(paddle.unsqueeze(mask, 1), 'float32')

+        labels[labels == self.ignore_index] = 0
+        labels_one_hot = F.one_hot(labels, num_class)
+        labels_one_hot = paddle.transpose(labels_one_hot, [0, 3, 1, 2])
        logits = F.softmax(logits, axis=1)

-        mask = (paddle.unsqueeze(labels, 1) != self.ignore_index)
-        logits = logits * mask
-        labels_one_hot = labels_one_hot * mask
+        dice_loss = 0.0
+        for i in range(num_class):
+            dice_loss_i = dice_loss_helper(logits[:, i], labels_one_hot[:, i],
+                                           mask, self.smooth, self.eps)
+            if self.weight is not None:
+                dice_loss_i *= self.weight[i]
+            dice_loss += dice_loss_i
+        dice_loss = dice_loss / num_class
+
+        return dice_loss

-        dims = (0, ) + tuple(range(2, labels.ndimension() + 1))

-        intersection = paddle.sum(logits * labels_one_hot, dims)
-        cardinality = paddle.sum(logits + labels_one_hot, dims)
-        dice_loss = ((2. * intersection + self.smooth) /
-                     (cardinality + self.eps + self.smooth)).mean()
-        return 1 - dice_loss
+def dice_loss_helper(logit, label, mask, smooth, eps):
+    assert logit.shape == label.shape, \
+        "The shape of logit and label should be the same"
+    logit = paddle.reshape(logit, [0, -1])
+    label = paddle.reshape(label, [0, -1])
+    mask = paddle.reshape(mask, [0, -1])
+    logit *= mask
+    label *= mask
+    intersection = paddle.sum(logit * label, axis=1)
+    cardinality = paddle.sum(logit + label, axis=1)
+    dice_loss = 1 - (2 * intersection + smooth) / (cardinality + smooth + eps)
+    dice_loss = dice_loss.mean()
+    return dice_loss
--- a/paddlers/models/ppseg/models/losses/edge_attention_loss.py
+++ b/paddlers/models/ppseg/models/losses/edge_attention_loss.py
@ -16,8 +16,8 @@ import paddle
 from paddle import nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.models import losses
+from paddleseg.cvlibs import manager
+from paddleseg.models import losses


@manager.LOSSES.add_component
--- a/paddlers/models/ppseg/models/losses/focal_loss.py
+++ b/paddlers/models/ppseg/models/losses/focal_loss.py
@ -17,44 +17,116 @@ import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.cvlibs import manager
+from paddleseg.cvlibs import manager


@manager.LOSSES.add_component
 class FocalLoss(nn.Layer):
    """
-    Focal Loss.
+    The implement of focal loss.

-    Code referenced from:
-    https://github.com/clcarwin/focal_loss_pytorch/blob/master/focalloss.py
+    The focal loss requires the label is 0 or 1 for now.

    Args:
-        gamma (float): the coefficient of Focal Loss.
-        ignore_index (int64): Specifies a target value that is ignored
+        alpha (float, list, optional): The alpha of focal loss. alpha is the weight
+            of class 1, 1-alpha is the weight of class 0. Default: 0.25
+        gamma (float, optional): The gamma of Focal Loss. Default: 2.0
+        ignore_index (int64, optional): Specifies a target value that is ignored
            and does not contribute to the input gradient. Default ``255``.
    """

-    def __init__(self, gamma=2.0, ignore_index=255, edge_label=False):
-        super(FocalLoss, self).__init__()
+    def __init__(self, alpha=0.25, gamma=2.0, ignore_index=255):
+        super().__init__()
+        self.alpha = alpha
        self.gamma = gamma
        self.ignore_index = ignore_index
-        self.edge_label = edge_label
+        self.EPS = 1e-10

    def forward(self, logit, label):
-        logit = paddle.reshape(
-            logit, [logit.shape[0], logit.shape[1], -1])  # N,C,H,W => N,C,H*W
-        logit = paddle.transpose(logit, [0, 2, 1])  # N,C,H*W => N,H*W,C
-        logit = paddle.reshape(logit,
-                               [-1, logit.shape[2]])  # N,H*W,C => N*H*W,C
-        label = paddle.reshape(label, [-1, 1])
-        range_ = paddle.arange(0, label.shape[0])
-        range_ = paddle.unsqueeze(range_, axis=-1)
-        label = paddle.cast(label, dtype='int64')
-        label = paddle.concat([range_, label], axis=-1)
-        logpt = F.log_softmax(logit)
-        logpt = paddle.gather_nd(logpt, label)
-
-        pt = paddle.exp(logpt.detach())
-        loss = -1 * (1 - pt)**self.gamma * logpt
-        loss = paddle.mean(loss)
-        return loss
+        """
+        Forward computation.
+
+        Args:
+            logit (Tensor): Logit tensor, the data type is float32, float64. Shape is
+                (N, C, H, W), where C is number of classes.
+            label (Tensor): Label tensor, the data type is int64. Shape is (N, W, W),
+                where each value is 0 <= label[i] <= C-1.
+        Returns:
+            (Tensor): The average loss.
+        """
+        assert logit.ndim == 4, "The ndim of logit should be 4."
+        assert logit.shape[1] == 2, "The channel of logit should be 2."
+        assert label.ndim == 3, "The ndim of label should be 3."
+
+        class_num = logit.shape[1]  # class num is 2
+        logit = paddle.transpose(logit, [0, 2, 3, 1])  # N,C,H,W => N,H,W,C
+
+        mask = label != self.ignore_index  # N,H,W
+        mask = paddle.unsqueeze(mask, 3)
+        mask = paddle.cast(mask, 'float32')
+        mask.stop_gradient = True
+
+        label = F.one_hot(label, class_num)  # N,H,W,C
+        label = paddle.cast(label, logit.dtype)
+        label.stop_gradient = True
+
+        loss = F.sigmoid_focal_loss(
+            logit=logit,
+            label=label,
+            alpha=self.alpha,
+            gamma=self.gamma,
+            reduction='none')
+        loss = loss * mask
+        avg_loss = paddle.sum(loss) / (
+            paddle.sum(paddle.cast(mask != 0., 'int32')) * class_num + self.EPS)
+        return avg_loss
+
+
+@manager.LOSSES.add_component
+class MultiClassFocalLoss(nn.Layer):
+    """
+    The implement of focal loss for multi class.
+
+    Args:
+        alpha (float, list, optional): The alpha of focal loss. alpha is the weight
+            of class 1, 1-alpha is the weight of class 0. Default: 0.25
+        gamma (float, optional): The gamma of Focal Loss. Default: 2.0
+        ignore_index (int64, optional): Specifies a target value that is ignored
+            and does not contribute to the input gradient. Default ``255``.
+    """
+
+    def __init__(self, num_class, alpha=1.0, gamma=2.0, ignore_index=255):
+        super().__init__()
+        self.num_class = num_class
+        self.alpha = alpha
+        self.gamma = gamma
+        self.ignore_index = ignore_index
+        self.EPS = 1e-10
+
+    def forward(self, logit, label):
+        """
+        Forward computation.
+
+        Args:
+            logit (Tensor): Logit tensor, the data type is float32, float64. Shape is
+                (N, C, H, W), where C is number of classes.
+            label (Tensor): Label tensor, the data type is int64. Shape is (N, W, W),
+                where each value is 0 <= label[i] <= C-1.
+        Returns:
+            (Tensor): The average loss.
+        """
+        assert logit.ndim == 4, "The ndim of logit should be 4."
+        assert label.ndim == 3, "The ndim of label should be 3."
+
+        logit = paddle.transpose(logit, [0, 2, 3, 1])
+        label = label.astype('int64')
+        ce_loss = F.cross_entropy(
+            logit, label, ignore_index=self.ignore_index, reduction='none')
+
+        pt = paddle.exp(-ce_loss)
+        focal_loss = self.alpha * ((1 - pt)**self.gamma) * ce_loss
+
+        mask = paddle.cast(label != self.ignore_index, 'float32')
+        focal_loss *= mask
+        avg_loss = paddle.mean(focal_loss) / (paddle.mean(mask) + self.EPS)
+        return avg_loss
--- a/paddlers/models/ppseg/models/losses/gscnn_dual_task_loss.py
+++ b/paddlers/models/ppseg/models/losses/gscnn_dual_task_loss.py
@ -13,7 +13,7 @@ import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.cvlibs import manager
+from paddleseg.cvlibs import manager


@manager.LOSSES.add_component
--- a/paddlers/models/ppseg/models/losses/kl_loss.py
+++ b/paddlers/models/ppseg/models/losses/kl_loss.py
@ -16,7 +16,7 @@ import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.cvlibs import manager
+from paddleseg.cvlibs import manager


@manager.LOSSES.add_component
--- a/paddlers/models/ppseg/models/losses/l1_loss.py
+++ b/paddlers/models/ppseg/models/losses/l1_loss.py
@ -16,7 +16,7 @@ import paddle
 from paddle import nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.cvlibs import manager
+from paddleseg.cvlibs import manager


@manager.LOSSES.add_component
@ -74,3 +74,25 @@ class L1Loss(nn.L1Loss):

    def __init__(self, reduction='mean', ignore_index=255):
        super().__init__(reduction=reduction)
+        self.ignore_index = ignore_index
+        self.EPS = 1e-10
+
+    def forward(self, input, label):
+        mask = label != self.ignore_index
+        mask = paddle.cast(mask, "float32")
+        label.stop_gradient = True
+        mask.stop_gradient = True
+
+        output = paddle.nn.functional.l1_loss(
+            input, label, "none", name=self.name) * mask
+
+        if self.reduction == "mean":
+            return paddle.mean(output) / (paddle.mean(mask) + self.EPS)
+        elif self.reduction == "none":
+            return output
+        elif self.reduction == "sum":
+            return paddle.sum(output)
+        else:
+            raise ValueError(
+                "The value of 'reduction' in L1Loss should be 'sum', 'mean' or 'none', but "
+                "received %s, which is not allowed." % self.reduction)
--- a/paddlers/models/ppseg/models/losses/lovasz_loss.py
+++ b/paddlers/models/ppseg/models/losses/lovasz_loss.py
@ -22,7 +22,7 @@ import paddle
 from paddle import nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.cvlibs import manager
+from paddleseg.cvlibs import manager


@manager.LOSSES.add_component
@ -124,8 +124,12 @@ def lovasz_hinge_flat(logits, labels):
    signs = 2. * labels - 1.
    signs.stop_gradient = True
    errors = 1. - logits * signs
-    errors_sorted, perm = paddle.fluid.core.ops.argsort(errors, 'axis', 0,
-                                                        'descending', True)
+    if hasattr(paddle, "_legacy_C_ops"):
+        errors_sorted, perm = paddle._legacy_C_ops.argsort(errors, 'axis', 0,
+                                                           'descending', True)
+    else:
+        errors_sorted, perm = paddle._C_ops.argsort(errors, 'axis', 0,
+                                                    'descending', True)
    errors_sorted.stop_gradient = False
    gt_sorted = paddle.gather(labels, perm)
    grad = lovasz_grad(gt_sorted)
@ -181,8 +185,12 @@ def lovasz_softmax_flat(probas, labels, classes='present'):
        else:
            class_pred = probas[:, c]
        errors = paddle.abs(fg - class_pred)
-        errors_sorted, perm = paddle.fluid.core.ops.argsort(errors, 'axis', 0,
-                                                            'descending', True)
+        if hasattr(paddle, "_legacy_C_ops"):
+            errors_sorted, perm = paddle._legacy_C_ops.argsort(
+                errors, 'axis', 0, 'descending', True)
+        else:
+            errors_sorted, perm = paddle._C_ops.argsort(errors, 'axis', 0,
+                                                        'descending', True)
        errors_sorted.stop_gradient = False

        fg_sorted = paddle.gather(fg, perm)
--- a/paddlers/models/ppseg/models/losses/mean_square_error_loss.py
+++ b/paddlers/models/ppseg/models/losses/mean_square_error_loss.py
@ -16,7 +16,7 @@ import paddle
 from paddle import nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.cvlibs import manager
+from paddleseg.cvlibs import manager


@manager.LOSSES.add_component
--- a/paddlers/models/ppseg/models/losses/mixed_loss.py
+++ b/paddlers/models/ppseg/models/losses/mixed_loss.py
@ -16,7 +16,7 @@ import paddle
 from paddle import nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.cvlibs import manager
+from paddleseg.cvlibs import manager


@manager.LOSSES.add_component
--- a/paddlers/models/ppseg/models/losses/ohem_cross_entropy_loss.py
+++ b/paddlers/models/ppseg/models/losses/ohem_cross_entropy_loss.py
@ -16,7 +16,7 @@ import paddle
 from paddle import nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.cvlibs import manager
+from paddleseg.cvlibs import manager


@manager.LOSSES.add_component
@ -55,7 +55,7 @@ class OhemCrossEntropyLoss(nn.Layer):

        # get the label after ohem
        n, c, h, w = logit.shape
-        label = label.reshape((-1, ))
+        label = label.reshape((-1, )).astype('int64')
        valid_mask = (label != self.ignore_index).astype('int64')
        num_valid = valid_mask.sum()
        label = label * valid_mask
--- a/paddlers/models/ppseg/models/losses/ohem_edge_attention_loss.py
+++ b/paddlers/models/ppseg/models/losses/ohem_edge_attention_loss.py
@ -16,8 +16,8 @@ import paddle
 from paddle import nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.models import losses
+from paddleseg.cvlibs import manager
+from paddleseg.models import losses


@manager.LOSSES.add_component
--- a/paddlers/models/ppseg/models/losses/pixel_contrast_cross_entropy_loss.py
+++ b/paddlers/models/ppseg/models/losses/pixel_contrast_cross_entropy_loss.py
@ -16,7 +16,7 @@ import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.cvlibs import manager
+from paddleseg.cvlibs import manager


@manager.LOSSES.add_component
@ -101,9 +101,12 @@ class PixelContrastCrossEntropyLoss(nn.Layer):
                elif num_hard >= n_view / 2:
                    num_easy_keep = num_easy
                    num_hard_keep = n_view - num_easy_keep
-                else:
+                elif num_easy >= n_view / 2:
                    num_hard_keep = num_hard
                    num_easy_keep = n_view - num_hard_keep
+                else:
+                    num_hard_keep = num_hard
+                    num_easy_keep = num_easy

                indices = None
                if num_hard > 0:
--- a/paddlers/models/ppseg/models/losses/point_cross_entropy_loss.py
+++ b/paddlers/models/ppseg/models/losses/point_cross_entropy_loss.py
@ -16,7 +16,7 @@ import paddle
 from paddle import nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.cvlibs import manager
+from paddleseg.cvlibs import manager


@manager.LOSSES.add_component
--- a/paddlers/models/ppseg/models/losses/rmi_loss.py
+++ b/paddlers/models/ppseg/models/losses/rmi_loss.py
@ -17,7 +17,7 @@ import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.cvlibs import manager
+from paddleseg.cvlibs import manager

 _euler_num = 2.718281828
 _pi = 3.14159265
--- a/paddlers/models/ppseg/models/losses/semantic_connectivity_loss.py
+++ b/paddlers/models/ppseg/models/losses/semantic_connectivity_loss.py
@ -18,7 +18,7 @@ import paddle
 from paddle import nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.cvlibs import manager
+from paddleseg.cvlibs import manager


@manager.LOSSES.add_component
@ -92,6 +92,7 @@ class SemanticConnectivityLoss(nn.Layer):
                label_num_conn, label_conn = cv2.connectedComponents(
                    labels_np_class.astype(np.uint8))

+                origin_pred_num_conn = pred_num_conn
                if pred_num_conn > 2 * label_num_conn:
                    pred_num_conn = min(pred_num_conn, self.max_pred_num_conn)
                real_pred_num = pred_num_conn - 1
@ -100,8 +101,9 @@ class SemanticConnectivityLoss(nn.Layer):
                # Connected Components Matching and SC Loss Calculation
                if real_label_num > 0 and real_pred_num > 0:
                    img_connectivity = compute_class_connectiveity(
-                        pred_conn, label_conn, pred_num_conn, label_num_conn,
-                        pred_i, real_label_num, real_pred_num, zero)
+                        pred_conn, label_conn, pred_num_conn,
+                        origin_pred_num_conn, label_num_conn, pred_i,
+                        real_label_num, real_pred_num, zero)
                    sc_loss += 1 - img_connectivity
                elif real_label_num == 0 and real_pred_num == 0:
                    # if no connected component, SC Loss = 0, so pass
@ -122,12 +124,12 @@ class SemanticConnectivityLoss(nn.Layer):


 def compute_class_connectiveity(pred_conn, label_conn, pred_num_conn,
-                                label_num_conn, pred, real_label_num,
-                                real_pred_num, zero):
+                                origin_pred_num_conn, label_num_conn, pred,
+                                real_label_num, real_pred_num, zero):

    pred_conn = paddle.to_tensor(pred_conn)
    label_conn = paddle.to_tensor(label_conn)
-    pred_conn = F.one_hot(pred_conn, pred_num_conn)
+    pred_conn = F.one_hot(pred_conn, origin_pred_num_conn)
    label_conn = F.one_hot(label_conn, label_num_conn)

    ious = paddle.zeros((real_label_num, real_pred_num))
--- a/paddlers/models/ppseg/models/losses/semantic_encode_cross_entropy_loss.py
+++ b/paddlers/models/ppseg/models/losses/semantic_encode_cross_entropy_loss.py
@ -16,7 +16,7 @@ import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.cvlibs import manager
+from paddleseg.cvlibs import manager


@manager.LOSSES.add_component
--- a/paddlers/models/ppseg/models/lraspp.py
+++ b/paddlers/models/ppseg/models/lraspp.py
@ -0,0 +1,162 @@
+# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import partial
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddleseg import utils
+from paddleseg.models import layers
+from paddleseg.cvlibs import manager
+
+
+@manager.MODELS.add_component
+class LRASPP(nn.Layer):
+    """
+    Semantic segmentation model with a light R-ASPP head.
+    
+    The original article refers to
+        Howard, Andrew, et al. "Searching for mobilenetv3."
+        (https://arxiv.org/pdf/1909.11065.pdf)
+
+    Args:
+        num_classes (int): The number of target classes.
+        backbone(nn.Layer): Backbone network, such as stdc1net and resnet18. The backbone must
+            has feat_channels, of which the length is 5.
+        backbone_indices (List(int), optional): The values indicate the indices of backbone output 
+            used as the input of the LR-ASPP head.
+            Default: [0, 1, 3].
+        lraspp_head_inter_chs (List(int), optional): The intermediate channels of LR-ASPP head.
+            Default: [32, 64].
+        lraspp_head_out_ch (int, optional): The output channels of each ASPP branch in the LR-ASPP head.
+            Default: 128
+        resize_mode (str, optional): The resize mode for the upsampling operation in the LR-ASPP head.
+            Default: bilinear.
+        use_gap (bool, optional): If true, use global average pooling in the LR-ASPP head; otherwise, use
+            a 49x49 kernel for average pooling.
+            Default: True.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 backbone,
+                 backbone_indices=[0, 1, 3],
+                 lraspp_head_inter_chs=[32, 64],
+                 lraspp_head_out_ch=128,
+                 resize_mode='bilinear',
+                 use_gap=True,
+                 pretrained=None):
+        super().__init__()
+
+        # backbone
+        assert hasattr(backbone, 'feat_channels'), \
+            "The backbone should has feat_channels."
+        assert len(backbone.feat_channels) >= len(backbone_indices), \
+            f"The length of input backbone_indices ({len(backbone_indices)}) should not be" \
+            f"greater than the length of feat_channels ({len(backbone.feat_channels)})."
+        assert len(backbone.feat_channels) > max(backbone_indices), \
+            f"The max value ({max(backbone_indices)}) of backbone_indices should be " \
+            f"less than the length of feat_channels ({len(backbone.feat_channels)})."
+        self.backbone = backbone
+
+        assert len(backbone_indices) >= 1, "The lenght of backbone_indices " \
+            "should not be lesser than 1"
+
+        # head
+        assert len(backbone_indices) == len(
+            lraspp_head_inter_chs
+        ) + 1, "The length of backbone_indices should be 1 greater than lraspp_head_inter_chs."
+        self.backbone_indices = backbone_indices
+
+        self.lraspp_head = LRASPPHead(backbone_indices, backbone.feat_channels,
+                                      lraspp_head_inter_chs, lraspp_head_out_ch,
+                                      num_classes, resize_mode, use_gap)
+
+        # pretrained
+        self.pretrained = pretrained
+        self.init_weight()
+
+    def forward(self, x):
+        x_hw = paddle.shape(x)[2:]
+
+        feats_backbone = self.backbone(x)
+        assert len(feats_backbone) >= len(self.backbone_indices), \
+            f"The nums of backbone feats ({len(feats_backbone)}) should be greater or " \
+            f"equal than the nums of backbone_indices ({len(self.backbone_indices)})"
+
+        y = self.lraspp_head(feats_backbone)
+        y = F.interpolate(y, x_hw, mode='bilinear', align_corners=False)
+        logit_list = [y]
+
+        return logit_list
+
+    def init_weight(self):
+        if self.pretrained is not None:
+            utils.load_entire_model(self, self.pretrained)
+
+
+class LRASPPHead(nn.Layer):
+    def __init__(self,
+                 indices,
+                 in_chs,
+                 mid_chs,
+                 out_ch,
+                 n_classes,
+                 resize_mode,
+                 use_gap,
+                 align_corners=False):
+        super().__init__()
+
+        self.indices = indices[-2::-1]
+        self.in_chs = [in_chs[i] for i in indices[::-1]]
+        self.mid_chs = mid_chs[::-1]
+        self.convs = nn.LayerList()
+        self.conv_ups = nn.LayerList()
+        for in_ch, mid_ch in zip(self.in_chs[1:], self.mid_chs):
+            self.convs.append(
+                nn.Conv2D(
+                    in_ch, mid_ch, kernel_size=1, bias_attr=False))
+            self.conv_ups.append(layers.ConvBNReLU(out_ch + mid_ch, out_ch, 1))
+        self.conv_w = nn.Sequential(
+            nn.AvgPool2D(
+                kernel_size=(49, 49), stride=(16, 20))
+            if not use_gap else nn.AdaptiveAvgPool2D(1),
+            nn.Conv2D(
+                self.in_chs[0], out_ch, 1, bias_attr=False),
+            nn.Sigmoid())
+        self.conv_v = layers.ConvBNReLU(self.in_chs[0], out_ch, 1)
+        self.conv_t = nn.Conv2D(out_ch, out_ch, kernel_size=1, bias_attr=False)
+        self.conv_out = nn.Conv2D(
+            out_ch, n_classes, kernel_size=1, bias_attr=False)
+
+        self.interp = partial(
+            F.interpolate, mode=resize_mode, align_corners=align_corners)
+
+    def forward(self, in_feat_list):
+        x = in_feat_list[-1]
+
+        x = self.conv_v(x) * self.interp(self.conv_w(x), paddle.shape(x)[2:])
+        y = self.conv_t(x)
+
+        for idx, conv, conv_up in zip(self.indices, self.convs, self.conv_ups):
+            feat = in_feat_list[idx]
+            y = self.interp(y, paddle.shape(feat)[2:])
+            y = paddle.concat([y, conv(feat)], axis=1)
+            y = conv_up(y)
+
+        y = self.conv_out(y)
+        return y
--- a/paddlers/models/ppseg/models/mla_transformer.py
+++ b/paddlers/models/ppseg/models/mla_transformer.py
@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -16,9 +16,9 @@ import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F

-from paddlers.models.ppseg.models import layers
-from paddlers.models.ppseg.cvlibs import manager
-from paddlers.models.ppseg.utils import utils
+from paddleseg.models import layers
+from paddleseg.cvlibs import manager
+from paddleseg.utils import utils


 class MLAHeads(nn.Layer):
--- a/paddlers/models/ppseg/models/mobileseg.py
+++ b/paddlers/models/ppseg/models/mobileseg.py
@ -0,0 +1,289 @@
+# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddleseg import utils
+from paddleseg.models import layers
+from paddleseg.cvlibs import manager
+
+
+@manager.MODELS.add_component
+class MobileSeg(nn.Layer):
+    """
+    The semantic segmentation models for mobile devices.
+
+    Args:
+        num_classes (int): The number of target classes.
+        backbone(nn.Layer): Backbone network, such as stdc1net and resnet18. The backbone must
+            has feat_channels, of which the length is 5.
+        backbone_indices (List(int), optional): The values indicate the indices of output of backbone.
+            Default: [2, 3, 4].
+        cm_bin_sizes (List(int), optional): The bin size of context module. Default: [1,2,4].
+        cm_out_ch (int, optional): The output channel of the last context module. Default: 128.
+        arm_type (str, optional): The type of attention refinement module. Default: ARM_Add_SpAttenAdd3.
+        arm_out_chs (List(int), optional): The out channels of each arm module. Default: [64, 96, 128].
+        seg_head_inter_chs (List(int), optional): The intermediate channels of segmentation head.
+            Default: [64, 64, 64].
+        resize_mode (str, optional): The resize mode for the upsampling operation in decoder.
+            Default: bilinear.
+        use_last_fuse (bool, optional): Whether use fusion in the last. Default: False.
+        pretrained (str, optional): The path or url of pretrained model. Default: None.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 backbone,
+                 backbone_indices=[1, 2, 3],
+                 cm_bin_sizes=[1, 2],
+                 cm_out_ch=64,
+                 arm_type='UAFMMobile',
+                 arm_out_chs=[32, 48, 64],
+                 seg_head_inter_chs=[32, 32, 32],
+                 resize_mode='bilinear',
+                 use_last_fuse=False,
+                 pretrained=None):
+        super().__init__()
+
+        # backbone
+        assert hasattr(backbone, 'feat_channels'), \
+            "The backbone should has feat_channels."
+        assert len(backbone.feat_channels) >= len(backbone_indices), \
+            f"The length of input backbone_indices ({len(backbone_indices)}) should not be" \
+            f"greater than the length of feat_channels ({len(backbone.feat_channels)})."
+        assert len(backbone.feat_channels) > max(backbone_indices), \
+            f"The max value ({max(backbone_indices)}) of backbone_indices should be " \
+            f"less than the length of feat_channels ({len(backbone.feat_channels)})."
+        self.backbone = backbone
+
+        assert len(backbone_indices) >= 1, "The lenght of backbone_indices " \
+            "should not be lesser than 1"
+        self.backbone_indices = backbone_indices  # [..., x16_id, x32_id]
+        backbone_out_chs = [backbone.feat_channels[i] for i in backbone_indices]
+
+        # head
+        if len(arm_out_chs) == 1:
+            arm_out_chs = arm_out_chs * len(backbone_indices)
+        assert len(arm_out_chs) == len(backbone_indices), "The length of " \
+            "arm_out_chs and backbone_indices should be equal"
+
+        self.ppseg_head = MobileSegHead(backbone_out_chs, arm_out_chs,
+                                        cm_bin_sizes, cm_out_ch, arm_type,
+                                        resize_mode, use_last_fuse)
+
+        if len(seg_head_inter_chs) == 1:
+            seg_head_inter_chs = seg_head_inter_chs * len(backbone_indices)
+        assert len(seg_head_inter_chs) == len(backbone_indices), "The length of " \
+            "seg_head_inter_chs and backbone_indices should be equal"
+        self.seg_heads = nn.LayerList()  # [..., head_16, head32]
+        for in_ch, mid_ch in zip(arm_out_chs, seg_head_inter_chs):
+            self.seg_heads.append(SegHead(in_ch, mid_ch, num_classes))
+
+        # pretrained
+        self.pretrained = pretrained
+        self.init_weight()
+
+    def forward(self, x):
+        x_hw = paddle.shape(x)[2:]
+
+        feats_backbone = self.backbone(x)  # [x4, x8, x16, x32]
+        assert len(feats_backbone) >= len(self.backbone_indices), \
+            f"The nums of backbone feats ({len(feats_backbone)}) should be greater or " \
+            f"equal than the nums of backbone_indices ({len(self.backbone_indices)})"
+
+        feats_selected = [feats_backbone[i] for i in self.backbone_indices]
+        feats_head = self.ppseg_head(feats_selected)  # [..., x8, x16, x32]
+
+        if self.training:
+            logit_list = []
+            for x, seg_head in zip(feats_head, self.seg_heads):
+                x = seg_head(x)
+                logit_list.append(x)
+            logit_list = [
+                F.interpolate(
+                    x, x_hw, mode='bilinear', align_corners=False)
+                for x in logit_list
+            ]
+        else:
+            x = self.seg_heads[0](feats_head[0])
+            x = F.interpolate(x, x_hw, mode='bilinear', align_corners=False)
+            logit_list = [x]
+
+        return logit_list
+
+    def init_weight(self):
+        if self.pretrained is not None:
+            utils.load_entire_model(self, self.pretrained)
+
+
+class MobileSegHead(nn.Layer):
+    """
+    The head of MobileSeg.
+
+    Args:
+        backbone_out_chs (List(Tensor)): The channels of output tensors in the backbone.
+        arm_out_chs (List(int)): The out channels of each arm module.
+        cm_bin_sizes (List(int)): The bin size of context module.
+        cm_out_ch (int): The output channel of the last context module.
+        arm_type (str): The type of attention refinement module.
+        resize_mode (str): The resize mode for the upsampling operation in decoder.
+    """
+
+    def __init__(self, backbone_out_chs, arm_out_chs, cm_bin_sizes, cm_out_ch,
+                 arm_type, resize_mode, use_last_fuse):
+        super().__init__()
+
+        self.cm = MobileContextModule(backbone_out_chs[-1], cm_out_ch,
+                                      cm_out_ch, cm_bin_sizes)
+
+        assert hasattr(layers,arm_type), \
+            "Not support arm_type ({})".format(arm_type)
+        arm_class = eval("layers." + arm_type)
+
+        self.arm_list = nn.LayerList()  # [..., arm8, arm16, arm32]
+        for i in range(len(backbone_out_chs)):
+            low_chs = backbone_out_chs[i]
+            high_ch = cm_out_ch if i == len(
+                backbone_out_chs) - 1 else arm_out_chs[i + 1]
+            out_ch = arm_out_chs[i]
+            arm = arm_class(
+                low_chs, high_ch, out_ch, ksize=3, resize_mode=resize_mode)
+            self.arm_list.append(arm)
+
+        self.use_last_fuse = use_last_fuse
+        if self.use_last_fuse:
+            self.fuse_convs = nn.LayerList()
+            for i in range(1, len(arm_out_chs)):
+                conv = layers.SeparableConvBNReLU(
+                    arm_out_chs[i],
+                    arm_out_chs[0],
+                    kernel_size=3,
+                    bias_attr=False)
+                self.fuse_convs.append(conv)
+            self.last_conv = layers.SeparableConvBNReLU(
+                len(arm_out_chs) * arm_out_chs[0],
+                arm_out_chs[0],
+                kernel_size=3,
+                bias_attr=False)
+
+    def forward(self, in_feat_list):
+        """
+        Args:
+            in_feat_list (List(Tensor)): Such as [x2, x4, x8, x16, x32].
+                x2, x4 and x8 are optional.
+        Returns:
+            out_feat_list (List(Tensor)): Such as [x2, x4, x8, x16, x32].
+                x2, x4 and x8 are optional.
+                The length of in_feat_list and out_feat_list are the same.
+        """
+
+        high_feat = self.cm(in_feat_list[-1])
+        out_feat_list = []
+
+        for i in reversed(range(len(in_feat_list))):
+            low_feat = in_feat_list[i]
+            arm = self.arm_list[i]
+            high_feat = arm(low_feat, high_feat)
+            out_feat_list.insert(0, high_feat)
+
+        if self.use_last_fuse:
+            x_list = [out_feat_list[0]]
+            size = paddle.shape(out_feat_list[0])[2:]
+            for i, (x, conv
+                    ) in enumerate(zip(out_feat_list[1:], self.fuse_convs)):
+                x = conv(x)
+                x = F.interpolate(
+                    x, size=size, mode='bilinear', align_corners=False)
+                x_list.append(x)
+            x = paddle.concat(x_list, axis=1)
+            x = self.last_conv(x)
+            out_feat_list[0] = x
+
+        return out_feat_list
+
+
+class MobileContextModule(nn.Layer):
+    """
+    Context Module for Mobile Model.
+
+    Args:
+        in_channels (int): The number of input channels to pyramid pooling module.
+        inter_channels (int): The number of inter channels to pyramid pooling module.
+        out_channels (int): The number of output channels after pyramid pooling module.
+        bin_sizes (tuple, optional): The out size of pooled feature maps. Default: (1, 3).
+        align_corners (bool): An argument of F.interpolate. It should be set to False
+            when the output size of feature is even, e.g. 1024x512, otherwise it is True, e.g. 769x769.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 inter_channels,
+                 out_channels,
+                 bin_sizes,
+                 align_corners=False):
+        super().__init__()
+
+        self.stages = nn.LayerList([
+            self._make_stage(in_channels, inter_channels, size)
+            for size in bin_sizes
+        ])
+
+        self.conv_out = layers.SeparableConvBNReLU(
+            in_channels=inter_channels,
+            out_channels=out_channels,
+            kernel_size=3,
+            bias_attr=False)
+
+        self.align_corners = align_corners
+
+    def _make_stage(self, in_channels, out_channels, size):
+        prior = nn.AdaptiveAvgPool2D(output_size=size)
+        conv = layers.ConvBNReLU(
+            in_channels=in_channels, out_channels=out_channels, kernel_size=1)
+        return nn.Sequential(prior, conv)
+
+    def forward(self, input):
+        out = None
+        input_shape = paddle.shape(input)[2:]
+
+        for stage in self.stages:
+            x = stage(input)
+            x = F.interpolate(
+                x,
+                input_shape,
+                mode='bilinear',
+                align_corners=self.align_corners)
+            if out is None:
+                out = x
+            else:
+                out += x
+
+        out = self.conv_out(out)
+        return out
+
+
+class SegHead(nn.Layer):
+    def __init__(self, in_chan, mid_chan, n_classes):
+        super().__init__()
+        self.conv = layers.SeparableConvBNReLU(
+            in_chan, mid_chan, kernel_size=3, bias_attr=False)
+        self.conv_out = nn.Conv2D(
+            mid_chan, n_classes, kernel_size=1, bias_attr=False)
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.conv_out(x)
+        return x
--- a/Show More
+++ b/Show More