add blendmask

master
Hao Chen 5 years ago
parent 0f2d3c1bfe
commit 4b4d2675b2
Changed files:
1. .gitignore (7)
2. README.md (28)
3. adet/config/defaults.py (34)
4. adet/data/__init__.py (4)
5. adet/data/builtin.py (6)
6. adet/data/dataset_mapper.py (141)
7. adet/layers/conv_with_kaiming_uniform.py (2)
8. adet/modeling/__init__.py (1)
9. adet/modeling/blendmask/__init__.py (2)
10. adet/modeling/blendmask/basis_module.py (104)
11. adet/modeling/blendmask/blender.py (110)
12. adet/modeling/blendmask/blendmask.py (154)
13. adet/modeling/fcos/fcos.py (59)
14. adet/modeling/fcos/fcos_outputs.py (106)
15. configs/BlendMask/550_R_50_1x.yaml (6)
16. configs/BlendMask/550_R_50_3x.yaml (9)
17. configs/BlendMask/550_R_50_dcni3_5x.yaml (18)
18. configs/BlendMask/Base-550.yaml (17)
19. configs/BlendMask/Base-BlendMask.yaml (29)
20. configs/BlendMask/Panoptic/Base-Panoptic.yaml (16)
21. configs/BlendMask/Panoptic/R_101_3x.yaml (9)
22. configs/BlendMask/Panoptic/R_101_dcni3_5x.yaml (18)
23. configs/BlendMask/Panoptic/R_50_1x.yaml (6)
24. configs/BlendMask/Panoptic/R_50_3x.yaml (9)
25. configs/BlendMask/Panoptic/R_50_dcni3_5x.yaml (18)
26. configs/BlendMask/Person/Base-Person.yaml (9)
27. configs/BlendMask/Person/R_50_1x.yaml (6)
28. configs/BlendMask/R_101_3x.yaml (9)
29. configs/BlendMask/R_101_dcni3_5x.yaml (20)
30. configs/BlendMask/R_50_1x.yaml (6)
31. configs/BlendMask/R_50_3x.yaml (9)
32. configs/RCNN/550_R_50_FPN_3x.yaml (15)
33. configs/RCNN/Base-RCNN.yaml (42)
34. configs/RCNN/LVIS/R_50_1x.yaml (6)
35. configs/RCNN/R_101_3x.yaml (9)
36. datasets/README.md (33)
37. datasets/gen_coco_person.py (101)
38. datasets/prepare_thing_sem_from_instance.py (125)
39. datasets/prepare_thing_sem_from_lvis.py (98)
40. tools/train_net.py (4)

.gitignore (vendored)

@@ -40,7 +40,14 @@ dist/
# project dirs
/datasets/coco
/datasets/lvis
/datasets/pic
/datasets/ytvos
/models
/demo_outputs
/example_inputs
/debug
/weights
/export
eval.sh
train.sh
benchmark.sh

README.md
@@ -24,6 +24,34 @@ Name | box AP | download
--- |:---:|:---:
[FCOS_R_50_1x](configs/FCOS-Detection/R_50_1x.yaml) | 38.7 | [model](https://cloudstor.aarnet.edu.au/plus/s/glqFc13cCoEyHYy/download)
### COCO Instance Segmentation Baselines with [BlendMask](https://arxiv.org/abs/2001.00309)
Model | Name | inference time (ms/im) | box AP | mask AP | download
--- |:---:|:---:|:---:|:---:|:---:
Mask R-CNN | [550_R_50_3x](configs/RCNN/550_R_50_FPN_3x.yaml) | 63 | 39.1 | 35.3 |
BlendMask | [550_R_50_3x](configs/BlendMask/550_R_50_3x.yaml) | 40 | 38.7 | 34.5 | [model](https://cloudstor.aarnet.edu.au/plus/s/o0bpkmhMiuYgIcQ/download)
Mask R-CNN | [R_50_1x](https://github.com/facebookresearch/detectron2/blob/master/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml) | 90 | 38.6 | 35.2 |
BlendMask | [R_50_1x](configs/BlendMask/R_50_1x.yaml) | 83 | 39.9 | 35.8 | [model](https://cloudstor.aarnet.edu.au/plus/s/crpmeVCnQ3StvSz/download)
Mask R-CNN | [R_50_3x](https://github.com/facebookresearch/detectron2/blob/master/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml) | | 41.0 | 37.2 |
BlendMask | [R_50_3x](configs/BlendMask/R_50_3x.yaml) | | 42.7 | 37.8 | [model](https://cloudstor.aarnet.edu.au/plus/s/9u1cG2zXvEva5SM/download)
Mask R-CNN | [R_101_3x](https://github.com/facebookresearch/detectron2/blob/master/configs/COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml) | | 42.9 | 38.6 |
BlendMask | [R_101_3x](configs/BlendMask/R_101_3x.yaml) | | 44.8 | 39.5 | [model](https://cloudstor.aarnet.edu.au/plus/s/mYm5VCXICoeLNHq/download)
BlendMask | [R_101_dcni3_5x](configs/BlendMask/R_101_dcni3_5x.yaml) | | 46.8 | 41.1 | [model](https://cloudstor.aarnet.edu.au/plus/s/TAZPxSDvPuhegKp/download)
### COCO Panoptic Segmentation Baselines with BlendMask
Model | Name | PQ | PQ<sup>Th</sup> | PQ<sup>St</sup> | download
--- |:---:|:---:|:---:|:---:|:---:
Panoptic FPN | [R_50_3x](https://github.com/facebookresearch/detectron2/blob/master/configs/COCO-PanopticSegmentation/panoptic_fpn_R_50_3x.yaml) | 41.5 | 48.3 | 31.2 |
BlendMask | [R_50_3x](configs/BlendMask/Panoptic/R_50_3x.yaml) | 42.5 | 49.5 | 32.0 | [model](https://cloudstor.aarnet.edu.au/plus/s/bG0IhYeMAvlTGTq/download)
Panoptic FPN | [R_101_3x](https://github.com/facebookresearch/detectron2/blob/master/configs/COCO-PanopticSegmentation/panoptic_fpn_R_101_3x.yaml) | 43.0 | 49.7 | 32.9 |
BlendMask | [R_101_3x](configs/BlendMask/Panoptic/R_101_3x.yaml) | 44.3 | 51.6 | 33.2 | [model](https://cloudstor.aarnet.edu.au/plus/s/AEwbhyQ9F3lqvsz/download)
BlendMask | [R_101_dcni3_5x](configs/BlendMask/Panoptic/R_101_dcni3_5x.yaml) | 46.0 | 52.9 | 35.5 | [model](https://cloudstor.aarnet.edu.au/plus/s/GyWDhsukAYYokZg/download)
### Person in Context with BlendMask
Model | Name | box AP | mask AP | download
--- |:---:|:---:|:---:|:---:
BlendMask | [R_50_1x](configs/BlendMask/Person/R_50_1x.yaml) | 70.6 | 66.7 | [model](https://cloudstor.aarnet.edu.au/plus/s/d4f16WshXYbOuIo)
## Installation
First install Detectron2 following the official guide: [INSTALL.md](https://github.com/facebookresearch/detectron2/blob/master/INSTALL.md). Then build AdelaiDet with:

adet/config/defaults.py
@@ -46,12 +46,11 @@ _C.MODEL.FCOS.NUM_SHARE_CONVS = 0
_C.MODEL.FCOS.CENTER_SAMPLE = True
_C.MODEL.FCOS.POS_RADIUS = 1.5
_C.MODEL.FCOS.LOC_LOSS_TYPE = 'giou'
_C.MODEL.FCOS.YIELD_PROPOSAL = False
# ---------------------------------------------------------------------------- #
# VoVNet backbone
# ---------------------------------------------------------------------------- #
_C.MODEL.VOVNET = CN()
_C.MODEL.VOVNET.CONV_BODY = "V-39-eSE"
_C.MODEL.VOVNET.OUT_FEATURES = ["stage2", "stage3", "stage4", "stage5"]
@ -59,4 +58,33 @@ _C.MODEL.VOVNET.OUT_FEATURES = ["stage2", "stage3", "stage4", "stage5"]
# Options: FrozenBN, GN, "SyncBN", "BN"
_C.MODEL.VOVNET.NORM = "FrozenBN"
_C.MODEL.VOVNET.OUT_CHANNELS = 256
_C.MODEL.VOVNET.BACKBONE_OUT_CHANNELS = 256
_C.MODEL.VOVNET.BACKBONE_OUT_CHANNELS = 256
# ---------------------------------------------------------------------------- #
# BlendMask Options
# ---------------------------------------------------------------------------- #
_C.MODEL.BLENDMASK = CN()
_C.MODEL.BLENDMASK.ATTN_SIZE = 14
_C.MODEL.BLENDMASK.TOP_INTERP = "bilinear"
_C.MODEL.BLENDMASK.BOTTOM_RESOLUTION = 56
_C.MODEL.BLENDMASK.POOLER_TYPE = "ROIAlignV2"
_C.MODEL.BLENDMASK.POOLER_SAMPLING_RATIO = 1
_C.MODEL.BLENDMASK.POOLER_SCALES = (0.25,)
_C.MODEL.BLENDMASK.INSTANCE_LOSS_WEIGHT = 1.0
_C.MODEL.BLENDMASK.VISUALIZE = False
# ---------------------------------------------------------------------------- #
# Basis Module Options
# ---------------------------------------------------------------------------- #
_C.MODEL.BASIS_MODULE = CN()
_C.MODEL.BASIS_MODULE.NAME = "ProtoNet"
_C.MODEL.BASIS_MODULE.NUM_BASES = 4
_C.MODEL.BASIS_MODULE.LOSS_ON = False
_C.MODEL.BASIS_MODULE.ANN_SET = "coco"
_C.MODEL.BASIS_MODULE.CONVS_DIM = 128
_C.MODEL.BASIS_MODULE.IN_FEATURES = ["p3", "p4", "p5"]
_C.MODEL.BASIS_MODULE.NORM = "SyncBN"
_C.MODEL.BASIS_MODULE.NUM_CONVS = 3
_C.MODEL.BASIS_MODULE.COMMON_STRIDE = 8
_C.MODEL.BASIS_MODULE.NUM_CLASSES = 80
_C.MODEL.BASIS_MODULE.LOSS_WEIGHT = 0.3
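A minimal sketch, assuming AdelaiDet is importable, of reading the new options through `adet.config.get_cfg` (the same helper `tools/train_net.py` imports); it also derives the attention length that the top layer and the blender compute from `NUM_BASES` and `ATTN_SIZE`:
```python
# Sketch only: inspect the BlendMask defaults added above.
from adet.config import get_cfg

cfg = get_cfg()
print(cfg.MODEL.BLENDMASK.ATTN_SIZE)       # 14
print(cfg.MODEL.BASIS_MODULE.NUM_BASES)    # 4
# Length of the per-location attention vector predicted by the top layer:
attn_len = cfg.MODEL.BASIS_MODULE.NUM_BASES * cfg.MODEL.BLENDMASK.ATTN_SIZE ** 2
print(attn_len)                            # 784
```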

adet/data/__init__.py
@@ -1,5 +1,5 @@
from . import builtin # ensure the builtin datasets are registered
# from .dataset_mapper import DatasetMapperWithBasis
from .dataset_mapper import DatasetMapperWithBasis
# __all__ = ["DatasetMapperWithBasis"]
__all__ = ["DatasetMapperWithBasis"]

adet/data/builtin.py
@@ -1,8 +1,9 @@
import os
from detectron2.data.datasets.register_coco import register_coco_instances
from detectron2.data.datasets.builtin_meta import _get_builtin_metadata
# register person in context dataset
# register plane reconstruction
_PREDEFINED_SPLITS_PIC = {
"pic_person_train": ("pic/image/train", "pic/annotations/train_person.json"),
@ -24,4 +25,5 @@ def register_all_coco(root="datasets"):
os.path.join(root, image_root),
)
register_all_coco()
register_all_coco()

adet/data/dataset_mapper.py
@@ -0,0 +1,141 @@
import copy
import numpy as np
import torch
from fvcore.common.file_io import PathManager
from PIL import Image
from detectron2.data.dataset_mapper import DatasetMapper
from detectron2.data.detection_utils import SizeMismatchError
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
"""
This file contains the default mapping that's applied to "dataset dicts".
"""
__all__ = ["DatasetMapperWithBasis"]
class DatasetMapperWithBasis(DatasetMapper):
"""
This mapper extends the default Detectron2 DatasetMapper so that it also reads the basis semantic label maps used by the auxiliary loss.
"""
def __init__(self, cfg, is_train=True):
super().__init__(cfg, is_train)
# fmt: off
self.basis_loss_on = cfg.MODEL.BASIS_MODULE.LOSS_ON
self.ann_set = cfg.MODEL.BASIS_MODULE.ANN_SET
# fmt: on
def __call__(self, dataset_dict):
"""
Args:
dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
Returns:
dict: a format that builtin models in detectron2 accept
"""
dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
# USER: Write your own image loading if it's not from a file
try:
image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
except Exception as e:
print(dataset_dict["file_name"])
print(e)
raise e
try:
utils.check_image_size(dataset_dict, image)
except SizeMismatchError as e:
expected_wh = (dataset_dict["width"], dataset_dict["height"])
image_wh = (image.shape[1], image.shape[0])
if (image_wh[1], image_wh[0]) == expected_wh:
print("transposing image {}".format(dataset_dict["file_name"]))
image = image.transpose(1, 0, 2)
else:
raise e
if "annotations" not in dataset_dict or len(dataset_dict["annotations"]) == 0:
image, transforms = T.apply_transform_gens(
([self.crop_gen] if self.crop_gen else []) + self.tfm_gens, image
)
else:
# Crop around an instance if there are instances in the image.
# USER: Remove if you don't use cropping
if self.crop_gen:
crop_tfm = utils.gen_crop_transform_with_instance(
self.crop_gen.get_crop_size(image.shape[:2]),
image.shape[:2],
np.random.choice(dataset_dict["annotations"]),
)
image = crop_tfm.apply_image(image)
image, transforms = T.apply_transform_gens(self.tfm_gens, image)
if self.crop_gen:
transforms = crop_tfm + transforms
image_shape = image.shape[:2] # h, w
# Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
# but not efficient on large generic data structures due to the use of pickle & mp.Queue.
# Therefore it's important to use torch.Tensor.
dataset_dict["image"] = torch.as_tensor(image.transpose(2, 0, 1).astype("float32"))
# Can use uint8 if it turns out to be slow some day
# USER: Remove if you don't use pre-computed proposals.
if self.load_proposals:
utils.transform_proposals(
dataset_dict, image_shape, transforms, self.min_box_side_len, self.proposal_topk
)
if not self.is_train:
dataset_dict.pop("annotations", None)
dataset_dict.pop("sem_seg_file_name", None)
dataset_dict.pop("pano_seg_file_name", None)
return dataset_dict
if "annotations" in dataset_dict:
# USER: Modify this if you want to keep them for some reason.
for anno in dataset_dict["annotations"]:
if not self.mask_on:
anno.pop("segmentation", None)
if not self.keypoint_on:
anno.pop("keypoints", None)
# USER: Implement additional transformations if you have other types of data
annos = [
utils.transform_instance_annotations(
obj, transforms, image_shape, keypoint_hflip_indices=self.keypoint_hflip_indices
)
for obj in dataset_dict.pop("annotations")
if obj.get("iscrowd", 0) == 0
]
instances = utils.annotations_to_instances(
annos, image_shape, mask_format=self.mask_format
)
# Create a tight bounding box from masks, useful when image is cropped
if self.crop_gen and instances.has("gt_masks"):
instances.gt_boxes = instances.gt_masks.get_bounding_boxes()
dataset_dict["instances"] = utils.filter_empty_instances(instances)
# USER: Remove if you don't do semantic/panoptic segmentation.
if "sem_seg_file_name" in dataset_dict:
with PathManager.open(dataset_dict.pop("sem_seg_file_name"), "rb") as f:
sem_seg_gt = Image.open(f)
sem_seg_gt = np.asarray(sem_seg_gt, dtype="uint8")
sem_seg_gt = transforms.apply_segmentation(sem_seg_gt)
sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long"))
dataset_dict["sem_seg"] = sem_seg_gt
if self.basis_loss_on and self.is_train:
# load basis supervisions
if self.ann_set == "coco":
basis_sem_path = dataset_dict["file_name"].replace('train2017', 'thing_train2017').replace('image/train', 'thing_train')
else:
basis_sem_path = dataset_dict["file_name"].replace('coco', 'lvis').replace('train2017', 'thing_train').replace('jpg', 'npz')
basis_sem_path = basis_sem_path.replace('jpg', 'npz')
basis_sem_gt = np.load(basis_sem_path)["mask"]
basis_sem_gt = transforms.apply_segmentation(basis_sem_gt)
basis_sem_gt = torch.as_tensor(basis_sem_gt.astype("long"))
dataset_dict["basis_sem"] = basis_sem_gt
return dataset_dict
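For the COCO branch above, the path mapping works out as in this small sketch (the image file name is hypothetical); the `.npz` files are the label maps written by `datasets/prepare_thing_sem_from_instance.py`:
```python
# Hypothetical COCO image path, transformed as in the mapper above.
file_name = "datasets/coco/train2017/000000000139.jpg"
basis_sem_path = file_name.replace("train2017", "thing_train2017").replace("image/train", "thing_train")
basis_sem_path = basis_sem_path.replace("jpg", "npz")
print(basis_sem_path)  # datasets/coco/thing_train2017/000000000139.npz
```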

adet/layers/conv_with_kaiming_uniform.py
@@ -37,7 +37,7 @@ def conv_with_kaiming_uniform(
if norm is None:
nn.init.constant_(conv.bias, 0)
module = [conv,]
if norm is not None:
if norm is not None and len(norm) > 0:
if norm == "GN":
norm_module = nn.GroupNorm(32, out_channels)
else:

adet/modeling/__init__.py
@@ -1,4 +1,5 @@
from .fcos import FCOS
from .blendmask import BlendMask
from .backbone import build_fcos_resnet_fpn_backbone
from .one_stage_detector import OneStageDetector

adet/modeling/blendmask/__init__.py
@@ -0,0 +1,2 @@
from .basis_module import build_basis_module
from .blendmask import BlendMask

adet/modeling/blendmask/basis_module.py
@@ -0,0 +1,104 @@
from typing import Dict
from torch import nn
from torch.nn import functional as F
from detectron2.utils.registry import Registry
from detectron2.layers import ShapeSpec
from adet.layers import conv_with_kaiming_uniform
BASIS_MODULE_REGISTRY = Registry("BASIS_MODULE")
BASIS_MODULE_REGISTRY.__doc__ = """
Registry for basis module, which produces global bases from feature maps.
The registered object will be called with `obj(cfg, input_shape)`.
The call should return a `nn.Module` object.
"""
def build_basis_module(cfg, input_shape):
name = cfg.MODEL.BASIS_MODULE.NAME
return BASIS_MODULE_REGISTRY.get(name)(cfg, input_shape)
@BASIS_MODULE_REGISTRY.register()
class ProtoNet(nn.Module):
def __init__(self, cfg, input_shape: Dict[str, ShapeSpec]):
"""
TODO: support deconv and variable channel width
"""
# official protonet has a relu after each conv
super().__init__()
# fmt: off
mask_dim = cfg.MODEL.BASIS_MODULE.NUM_BASES
planes = cfg.MODEL.BASIS_MODULE.CONVS_DIM
self.in_features = cfg.MODEL.BASIS_MODULE.IN_FEATURES
self.loss_on = cfg.MODEL.BASIS_MODULE.LOSS_ON
norm = cfg.MODEL.BASIS_MODULE.NORM
num_convs = cfg.MODEL.BASIS_MODULE.NUM_CONVS
self.visualize = cfg.MODEL.BLENDMASK.VISUALIZE
# fmt: on
feature_channels = {k: v.channels for k, v in input_shape.items()}
conv_block = conv_with_kaiming_uniform(norm, True) # conv relu bn
self.refine = nn.ModuleList()
for in_feature in self.in_features:
self.refine.append(conv_block(
feature_channels[in_feature], planes, 3, 1))
tower = []
for i in range(num_convs):
tower.append(
conv_block(planes, planes, 3, 1))
tower.append(
nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False))
tower.append(
conv_block(planes, planes, 3, 1))
tower.append(
nn.Conv2d(planes, mask_dim, 1))
self.add_module('tower', nn.Sequential(*tower))
if self.loss_on:
# fmt: off
self.common_stride = cfg.MODEL.BASIS_MODULE.COMMON_STRIDE
num_classes = cfg.MODEL.BASIS_MODULE.NUM_CLASSES + 1
self.sem_loss_weight = cfg.MODEL.BASIS_MODULE.LOSS_WEIGHT
# fmt: on
inplanes = feature_channels[self.in_features[0]]
self.seg_head = nn.Sequential(nn.Conv2d(inplanes, planes, kernel_size=3,
stride=1, padding=1, bias=False),
nn.BatchNorm2d(planes),
nn.ReLU(),
nn.Conv2d(planes, planes, kernel_size=3,
stride=1, padding=1, bias=False),
nn.BatchNorm2d(planes),
nn.ReLU(),
nn.Conv2d(planes, num_classes, kernel_size=1,
stride=1))
def forward(self, features, targets=None):
for i, f in enumerate(self.in_features):
if i == 0:
x = self.refine[i](features[f])
else:
x_p = self.refine[i](features[f])
x_p = F.interpolate(x_p, x.size()[2:], mode="bilinear", align_corners=False)
# x_p = aligned_bilinear(x_p, x.size(3) // x_p.size(3))
x = x + x_p
outputs = {"bases": [self.tower(x)]}
losses = {}
# auxiliary thing semantic loss
if self.training and self.loss_on:
sem_out = self.seg_head(features[self.in_features[0]])
# resize target to reduce memory
gt_sem = targets.unsqueeze(1).float()
gt_sem = F.interpolate(
gt_sem, scale_factor=1 / self.common_stride)
seg_loss = F.cross_entropy(
sem_out, gt_sem.squeeze().long())
losses['loss_basis_sem'] = seg_loss * self.sem_loss_weight
elif self.visualize and hasattr(self, "seg_head"):
outputs["seg_thing_out"] = self.seg_head(features[self.in_features[0]])
return outputs, losses
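A quick stride check under the defaults above: the tower fuses everything at the `p3` resolution (stride 8) and contains a single 2x upsample, so the bases come out at stride 4, which matches `MODEL.BLENDMASK.POOLER_SCALES = (0.25,)`:
```python
# Sketch only: why the default pooler scale is 0.25.
p3_stride = 8                  # first entry of MODEL.BASIS_MODULE.IN_FEATURES
basis_stride = p3_stride // 2  # single nn.Upsample(scale_factor=2) in the tower
assert 1.0 / basis_stride == 0.25
```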

adet/modeling/blendmask/blender.py
@@ -0,0 +1,110 @@
import torch
from torch.nn import functional as F
from detectron2.layers import cat
from detectron2.modeling.poolers import ROIPooler
def build_blender(cfg):
return Blender(cfg)
class Blender(object):
def __init__(self, cfg):
# fmt: off
self.pooler_resolution = cfg.MODEL.BLENDMASK.BOTTOM_RESOLUTION
sampling_ratio = cfg.MODEL.BLENDMASK.POOLER_SAMPLING_RATIO
pooler_type = cfg.MODEL.BLENDMASK.POOLER_TYPE
pooler_scales = cfg.MODEL.BLENDMASK.POOLER_SCALES
self.attn_size = cfg.MODEL.BLENDMASK.ATTN_SIZE
self.top_interp = cfg.MODEL.BLENDMASK.TOP_INTERP
num_bases = cfg.MODEL.BASIS_MODULE.NUM_BASES
# fmt: on
self.attn_len = num_bases * self.attn_size * self.attn_size
self.pooler = ROIPooler(
output_size=self.pooler_resolution,
scales=pooler_scales,
sampling_ratio=sampling_ratio,
pooler_type=pooler_type,
canonical_level=2)
def __call__(self, bases, proposals, gt_instances):
if gt_instances is not None:
# training
# reshape attns
extras = proposals["extras"]
attns = proposals["top_feats"]
pos_inds = extras["pos_inds"]
if pos_inds.numel() == 0:
return None, {"loss_mask": sum([x.sum() * 0 for x in attns]) + bases[0].sum() * 0}
gt_inds = extras["gt_inds"]
attns = cat(
[
# Reshape: (N, C, Hi, Wi) -> (N, Hi, Wi, C) -> (N*Hi*Wi, C)
x.permute(0, 2, 3, 1).reshape(-1, self.attn_len)
for x in attns
], dim=0,)
attns = attns[pos_inds]
rois = self.pooler(bases, [x.gt_boxes for x in gt_instances])
rois = rois[gt_inds]
pred_mask_logits = self.merge_bases(rois, attns)
# gen targets
gt_masks = []
for instances_per_image in gt_instances:
if len(instances_per_image.gt_boxes.tensor) == 0:
continue
gt_mask_per_image = instances_per_image.gt_masks.crop_and_resize(
instances_per_image.gt_boxes.tensor, self.pooler_resolution
).to(device=pred_mask_logits.device)
gt_masks.append(gt_mask_per_image)
gt_masks = cat(gt_masks, dim=0)
gt_masks = gt_masks[gt_inds]
N = gt_masks.size(0)
gt_masks = gt_masks.view(N, -1)
gt_ctr = extras["gt_ctr"]
loss_denorm = extras["loss_denorm"]
mask_losses = F.binary_cross_entropy_with_logits(
pred_mask_logits, gt_masks.to(dtype=torch.float32), reduction="none")
mask_loss = ((mask_losses.mean(dim=-1) * gt_ctr).sum()
/ loss_denorm)
return None, {"loss_mask": mask_loss}
else:
# no proposals
total_instances = sum([len(x) for x in proposals])
if total_instances == 0:
# add empty pred_masks results
for box in proposals:
box.pred_masks = box.pred_classes.view(
-1, 1, self.pooler_resolution, self.pooler_resolution)
return proposals, {}
rois = self.pooler(bases, [x.pred_boxes for x in proposals])
attns = cat([x.top_feat for x in proposals], dim=0)
pred_mask_logits = self.merge_bases(rois, attns).sigmoid()
pred_mask_logits = pred_mask_logits.view(
-1, 1, self.pooler_resolution, self.pooler_resolution)
start_ind = 0
for box in proposals:
end_ind = start_ind + len(box)
box.pred_masks = pred_mask_logits[start_ind:end_ind]
start_ind = end_ind
return proposals, {}
def merge_bases(self, rois, coeffs, location_to_inds=None):
# merge predictions
N = coeffs.size(0)
if location_to_inds is not None:
rois = rois[location_to_inds]
N, B, H, W = rois.size()
coeffs = coeffs.view(N, -1, self.attn_size, self.attn_size)
coeffs = F.interpolate(coeffs, (H, W),
mode=self.top_interp).softmax(dim=1)
masks_preds = (rois * coeffs).sum(dim=1)
return masks_preds.view(N, -1)
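A shape sketch for `merge_bases` under the defaults (`NUM_BASES=4`, `ATTN_SIZE=14`, `BOTTOM_RESOLUTION=56`); the tensors are random stand-ins rather than real network outputs:
```python
import torch
import torch.nn.functional as F

N, B, A, R = 8, 4, 14, 56              # instances, bases, attention size, bottom resolution
rois = torch.randn(N, B, R, R)         # per-instance crops of the bases (ROIPooler output)
coeffs = torch.randn(N, B * A * A)     # per-instance attention vectors (top layer output)
coeffs = coeffs.view(N, B, A, A)
coeffs = F.interpolate(coeffs, (R, R), mode="bilinear").softmax(dim=1)
masks = (rois * coeffs).sum(dim=1)     # (8, 56, 56) mask logits
print(masks.view(N, -1).shape)         # torch.Size([8, 3136]), as returned by merge_bases
```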

adet/modeling/blendmask/blendmask.py
@@ -0,0 +1,154 @@
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import torch
from torch import nn
from detectron2.structures import ImageList
from detectron2.modeling.postprocessing import detector_postprocess, sem_seg_postprocess
from detectron2.modeling.proposal_generator import build_proposal_generator
from detectron2.modeling.backbone import build_backbone
from detectron2.modeling.meta_arch.panoptic_fpn import combine_semantic_and_instance_outputs
from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY
from detectron2.modeling.meta_arch.semantic_seg import build_sem_seg_head
from .blender import build_blender
from .basis_module import build_basis_module
__all__ = ["BlendMask"]
@META_ARCH_REGISTRY.register()
class BlendMask(nn.Module):
"""
Main class for BlendMask architectures (see https://arxiv.org/abs/2001.00309).
"""
def __init__(self, cfg):
super().__init__()
self.device = torch.device(cfg.MODEL.DEVICE)
self.instance_loss_weight = cfg.MODEL.BLENDMASK.INSTANCE_LOSS_WEIGHT
self.backbone = build_backbone(cfg)
self.proposal_generator = build_proposal_generator(cfg, self.backbone.output_shape())
self.blender = build_blender(cfg)
self.basis_module = build_basis_module(cfg, self.backbone.output_shape())
# options when combining instance & semantic outputs
self.combine_on = cfg.MODEL.PANOPTIC_FPN.COMBINE.ENABLED
if self.combine_on:
self.panoptic_module = build_sem_seg_head(cfg, self.backbone.output_shape())
self.combine_overlap_threshold = cfg.MODEL.PANOPTIC_FPN.COMBINE.OVERLAP_THRESH
self.combine_stuff_area_limit = cfg.MODEL.PANOPTIC_FPN.COMBINE.STUFF_AREA_LIMIT
self.combine_instances_confidence_threshold = (
cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH)
# build top module
in_channels = cfg.MODEL.FPN.OUT_CHANNELS
num_bases = cfg.MODEL.BASIS_MODULE.NUM_BASES
attn_size = cfg.MODEL.BLENDMASK.ATTN_SIZE
attn_len = num_bases * attn_size * attn_size
self.top_layer = nn.Conv2d(
in_channels, attn_len,
kernel_size=3, stride=1, padding=1)
torch.nn.init.normal_(self.top_layer.weight, std=0.01)
torch.nn.init.constant_(self.top_layer.bias, 0)
pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1)
pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1)
self.normalizer = lambda x: (x - pixel_mean) / pixel_std
self.to(self.device)
def forward(self, batched_inputs):
"""
Args:
batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
Each item in the list contains the inputs for one image.
For now, each item in the list is a dict that contains:
image: Tensor, image in (C, H, W) format.
instances: Instances
sem_seg: semantic segmentation ground truth.
Other information that's included in the original dicts, such as:
"height", "width" (int): the output resolution of the model, used in inference.
See :meth:`postprocess` for details.
Returns:
list[dict]: each dict is the results for one image. The dict
contains the following keys:
"instances": see :meth:`GeneralizedRCNN.forward` for its format.
"sem_seg": see :meth:`SemanticSegmentor.forward` for its format.
"panoptic_seg": available when `PANOPTIC_FPN.COMBINE.ENABLED`.
See the return value of
:func:`combine_semantic_and_instance_outputs` for its format.
"""
images = [x["image"].to(self.device) for x in batched_inputs]
images = [self.normalizer(x) for x in images]
images = ImageList.from_tensors(images, self.backbone.size_divisibility)
features = self.backbone(images.tensor)
if self.combine_on:
if "sem_seg" in batched_inputs[0]:
gt_sem = [x["sem_seg"].to(self.device) for x in batched_inputs]
gt_sem = ImageList.from_tensors(
gt_sem, self.backbone.size_divisibility, self.panoptic_module.ignore_value
).tensor
else:
gt_sem = None
sem_seg_results, sem_seg_losses = self.panoptic_module(features, gt_sem)
if "basis_sem" in batched_inputs[0]:
basis_sem = [x["basis_sem"].to(self.device) for x in batched_inputs]
basis_sem = ImageList.from_tensors(
basis_sem, self.backbone.size_divisibility, 0).tensor
else:
basis_sem = None
basis_out, basis_losses = self.basis_module(features, basis_sem)
if "instances" in batched_inputs[0]:
gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
else:
gt_instances = None
proposals, proposal_losses = self.proposal_generator(
images, features, gt_instances, self.top_layer)
detector_results, detector_losses = self.blender(
basis_out["bases"], proposals, gt_instances)
if self.training:
losses = {}
losses.update(basis_losses)
losses.update({k: v * self.instance_loss_weight for k, v in detector_losses.items()})
losses.update(proposal_losses)
if self.combine_on:
losses.update(sem_seg_losses)
return losses
processed_results = []
for i, (detector_result, input_per_image, image_size) in enumerate(zip(
detector_results, batched_inputs, images.image_sizes)):
height = input_per_image.get("height", image_size[0])
width = input_per_image.get("width", image_size[1])
detector_r = detector_postprocess(detector_result, height, width)
processed_result = {"instances": detector_r}
if self.combine_on:
sem_seg_r = sem_seg_postprocess(
sem_seg_results[i], image_size, height, width)
processed_result["sem_seg"] = sem_seg_r
if "seg_thing_out" in basis_out:
seg_thing_r = sem_seg_postprocess(
basis_out["seg_thing_out"], image_size, height, width)
processed_result["sem_thing_seg"] = seg_thing_r
if self.basis_module.visualize:
processed_result["bases"] = basis_out["bases"]
processed_results.append(processed_result)
if self.combine_on:
panoptic_r = combine_semantic_and_instance_outputs(
detector_r,
sem_seg_r.argmax(dim=0),
self.combine_overlap_threshold,
self.combine_stuff_area_limit,
self.combine_instances_confidence_threshold)
processed_results[-1]["panoptic_seg"] = panoptic_r
return processed_results
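A minimal build sketch, assuming Detectron2 and AdelaiDet are installed; `build_model` dispatches to the `BlendMask` class registered above through `META_ARCH_REGISTRY`:
```python
from detectron2.modeling import build_model
from adet.config import get_cfg

cfg = get_cfg()
cfg.merge_from_file("configs/BlendMask/R_50_1x.yaml")  # one of the configs added in this commit
cfg.MODEL.DEVICE = "cpu"  # optional: build on CPU for a quick check
model = build_model(cfg)  # instantiates BlendMask(cfg) on cfg.MODEL.DEVICE
model.eval()
```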

adet/modeling/fcos/fcos.py
@@ -45,6 +45,7 @@ class FCOS(nn.Module):
self.pre_nms_topk_train = cfg.MODEL.FCOS.PRE_NMS_TOPK_TRAIN
self.pre_nms_topk_test = cfg.MODEL.FCOS.PRE_NMS_TOPK_TEST
self.nms_thresh = cfg.MODEL.FCOS.NMS_TH
self.yield_proposal = cfg.MODEL.FCOS.YIELD_PROPOSAL
self.post_nms_topk_train = cfg.MODEL.FCOS.POST_NMS_TOPK_TRAIN
self.post_nms_topk_test = cfg.MODEL.FCOS.POST_NMS_TOPK_TEST
self.thresh_with_ctr = cfg.MODEL.FCOS.THRESH_WITH_CTR
@ -60,7 +61,13 @@ class FCOS(nn.Module):
self.sizes_of_interest = soi
self.fcos_head = FCOSHead(cfg, [input_shape[f] for f in self.in_features])
def forward(self, images, features, gt_instances):
def forward_head(self, features, top_module=None):
features = [features[f] for f in self.in_features]
pred_class_logits, pred_deltas, pred_centerness, top_feats, bbox_towers = self.fcos_head(
features, top_module, self.yield_proposal)
return pred_class_logits, pred_deltas, pred_centerness, top_feats, bbox_towers
def forward(self, images, features, gt_instances=None, top_module=None):
"""
Arguments:
images (list[Tensor] or ImageList): images to be processed
@ -75,7 +82,8 @@ class FCOS(nn.Module):
"""
features = [features[f] for f in self.in_features]
locations = self.compute_locations(features)
logits_pred, reg_pred, ctrness_pred, bbox_towers = self.fcos_head(features)
logits_pred, reg_pred, ctrness_pred, top_feats, bbox_towers = self.fcos_head(
features, top_module, self.yield_proposal)
if self.training:
pre_nms_thresh = self.pre_nms_thresh_train
@ -108,12 +116,29 @@ class FCOS(nn.Module):
gt_instances
)
results = {}
if self.yield_proposal:
results["features"] = {
f: b for f, b in zip(self.in_features, bbox_towers)}
if self.training:
losses, _ = outputs.losses()
return None, losses
losses, extras = outputs.losses()
if top_module is not None:
results["extras"] = extras
results["top_feats"] = top_feats
if self.yield_proposal:
with torch.no_grad():
results["proposals"] = outputs.predict_proposals(top_feats)
else:
proposals = outputs.predict_proposals()
return proposals, {}
losses = {}
with torch.no_grad():
proposals = outputs.predict_proposals(top_feats)
if self.yield_proposal:
results["proposals"] = proposals
else:
results = proposals
return results, losses
def compute_locations(self, features):
locations = []
@ -173,9 +198,9 @@ class FCOSHead(nn.Module):
conv_func = nn.Conv2d
for i in range(num_convs):
tower.append(conv_func(
in_channels, in_channels,
kernel_size=3, stride=1,
padding=1, bias=True
in_channels, in_channels,
kernel_size=3, stride=1,
padding=1, bias=True
))
if norm == "GN":
tower.append(nn.GroupNorm(32, in_channels))
@ -192,7 +217,7 @@ class FCOSHead(nn.Module):
in_channels, 4, kernel_size=3,
stride=1, padding=1
)
self.ctrness = nn.Conv2d(
self.centerness = nn.Conv2d(
in_channels, 1, kernel_size=3,
stride=1, padding=1
)
@ -205,7 +230,7 @@ class FCOSHead(nn.Module):
for modules in [
self.cls_tower, self.bbox_tower,
self.share_tower, self.cls_logits,
self.bbox_pred, self.ctrness
self.bbox_pred, self.centerness
]:
for l in modules.modules():
if isinstance(l, nn.Conv2d):
@ -217,22 +242,26 @@ class FCOSHead(nn.Module):
bias_value = -math.log((1 - prior_prob) / prior_prob)
torch.nn.init.constant_(self.cls_logits.bias, bias_value)
def forward(self, x):
def forward(self, x, top_module=None, yield_bbox_towers=False):
logits = []
bbox_reg = []
ctrness = []
top_feats = []
bbox_towers = []
for l, feature in enumerate(x):
feature = self.share_tower(feature)
cls_tower = self.cls_tower(feature)
bbox_tower = self.bbox_tower(feature)
if yield_bbox_towers:
bbox_towers.append(bbox_tower)
logits.append(self.cls_logits(cls_tower))
ctrness.append(self.ctrness(bbox_tower))
ctrness.append(self.centerness(bbox_tower))
reg = self.bbox_pred(bbox_tower)
if self.scales is not None:
reg = self.scales[l](reg)
# Note that we use relu, as in the improved FCOS, instead of exp.
bbox_reg.append(F.relu(reg))
return logits, bbox_reg, ctrness, bbox_towers
if top_module is not None:
top_feats.append(top_module(bbox_tower))
return logits, bbox_reg, ctrness, top_feats, bbox_towers

adet/modeling/fcos/fcos_outputs.py
@@ -34,7 +34,7 @@ Naming convention:
reg_pred: the predicted (left, top, right, bottom), corresponding to reg_targets
ctrness_pred: predicted centerness scores
"""
@ -57,6 +57,7 @@ def fcos_losses(
focal_loss_alpha,
focal_loss_gamma,
iou_loss,
gt_inds,
):
num_classes = logits_pred.size(1)
labels = labels.flatten()
@ -82,29 +83,40 @@ def fcos_losses(
reg_pred = reg_pred[pos_inds]
reg_targets = reg_targets[pos_inds]
ctrness_pred = ctrness_pred[pos_inds]
gt_inds = gt_inds[pos_inds]
ctrness_targets = compute_ctrness_targets(reg_targets)
ctrness_targets_sum = ctrness_targets.sum()
ctrness_norm = max(reduce_sum(ctrness_targets_sum).item() / num_gpus, 1e-6)
reg_loss = iou_loss(
reg_pred,
reg_targets,
ctrness_targets
) / ctrness_norm
loss_denorm = max(reduce_sum(ctrness_targets_sum).item() / num_gpus, 1e-6)
ctrness_loss = F.binary_cross_entropy_with_logits(
ctrness_pred,
ctrness_targets,
reduction="sum"
) / num_pos_avg
if pos_inds.numel() > 0:
reg_loss = iou_loss(
reg_pred,
reg_targets,
ctrness_targets
) / loss_denorm
ctrness_loss = F.binary_cross_entropy_with_logits(
ctrness_pred,
ctrness_targets,
reduction="sum"
) / num_pos_avg
else:
reg_loss = reg_pred.sum() * 0
ctrness_loss = ctrness_pred.sum() * 0
losses = {
"loss_fcos_cls": class_loss,
"loss_fcos_loc": reg_loss,
"loss_fcos_ctr": ctrness_loss
}
return losses, {}
extras = {
"pos_inds": pos_inds,
"gt_inds": gt_inds,
"gt_ctr": ctrness_targets,
"loss_denorm": loss_denorm
}
return losses, extras
class FCOSOutputs(object):
@ -236,8 +248,10 @@ class FCOSOutputs(object):
def compute_targets_for_locations(self, locations, targets, size_ranges):
labels = []
reg_targets = []
target_inds = []
xs, ys = locations[:, 0], locations[:, 1]
num_targets = 0
for im_i in range(len(targets)):
targets_per_im = targets[im_i]
bboxes = targets_per_im.gt_boxes.tensor
@ -247,6 +261,7 @@ class FCOSOutputs(object):
if bboxes.numel() == 0:
labels.append(labels_per_im.new_zeros(locations.size(0)) + self.num_classes)
reg_targets.append(locations.new_zeros((locations.size(0), 4)))
target_inds.append(labels_per_im.new_zeros(locations.size(0)) - 1)
continue
area = targets_per_im.gt_boxes.area()
@ -280,14 +295,19 @@ class FCOSOutputs(object):
locations_to_min_area, locations_to_gt_inds = locations_to_gt_area.min(dim=1)
reg_targets_per_im = reg_targets_per_im[range(len(locations)), locations_to_gt_inds]
target_inds_per_im = locations_to_gt_inds + num_targets
labels_per_im = labels_per_im[locations_to_gt_inds]
labels_per_im[locations_to_min_area == INF] = self.num_classes
labels.append(labels_per_im)
reg_targets.append(reg_targets_per_im)
target_inds.append(target_inds_per_im)
return {"labels": labels, "reg_targets": reg_targets}
return {
"labels": labels,
"reg_targets": reg_targets,
"target_inds": target_inds}
def losses(self):
"""
@ -298,7 +318,10 @@ class FCOSOutputs(object):
"""
training_targets = self._get_ground_truth()
labels, reg_targets = training_targets["labels"], training_targets["reg_targets"]
labels, reg_targets, gt_inds = (
training_targets["labels"],
training_targets["reg_targets"],
training_targets["target_inds"])
# Collect all logits and regression predictions over feature maps
# and images to arrive at the same shape as the labels and targets
@ -327,6 +350,12 @@ class FCOSOutputs(object):
x.reshape(-1) for x in labels
], dim=0,)
gt_inds = cat(
[
# Reshape: (N, 1, Hi, Wi) -> (N*Hi*Wi,)
x.reshape(-1) for x in gt_inds
], dim=0,)
reg_targets = cat(
[
# Reshape: (N, Hi, Wi, 4) -> (N*Hi*Wi, 4)
@ -341,25 +370,35 @@ class FCOSOutputs(object):
ctrness_pred,
self.focal_loss_alpha,
self.focal_loss_gamma,
self.iou_loss
self.iou_loss,
gt_inds
)
def predict_proposals(self):
def predict_proposals(self, top_feats):
sampled_boxes = []
bundle = (
self.locations, self.logits_pred,
self.reg_pred, self.ctrness_pred,
self.strides
)
bundle = {
"l": self.locations, "o": self.logits_pred,
"r": self.reg_pred, "c": self.ctrness_pred,
"s": self.strides,
}
if len(top_feats) > 0:
bundle["t"] = top_feats
for i, (l, o, r, c, s) in enumerate(zip(*bundle)):
for i, instance in enumerate(zip(*bundle.values())):
instance_dict = dict(zip(bundle.keys(), instance))
# recall that during training, we normalize regression targets with FPN's stride.
# we denormalize them here.
r = r * s
l = instance_dict["l"]
o = instance_dict["o"]
r = instance_dict["r"] * instance_dict["s"]
c = instance_dict["c"]
t = instance_dict["t"] if "t" in bundle else None
sampled_boxes.append(
self.forward_for_single_feature_map(
l, o, r, c, self.image_sizes
l, o, r, c, self.image_sizes, t
)
)
@ -370,8 +409,8 @@ class FCOSOutputs(object):
def forward_for_single_feature_map(
self, locations, box_cls,
reg_pred, ctrness, image_sizes
):
reg_pred, ctrness,
image_sizes, top_feat=None):
N, C, H, W = box_cls.shape
# put in the same format as locations
@ -381,6 +420,9 @@ class FCOSOutputs(object):
box_regression = box_regression.reshape(N, -1, 4)
ctrness = ctrness.view(N, 1, H, W).permute(0, 2, 3, 1)
ctrness = ctrness.reshape(N, -1).sigmoid()
if top_feat is not None:
top_feat = top_feat.view(N, -1, H, W).permute(0, 2, 3, 1)
top_feat = top_feat.reshape(N, H * W, -1)
# if self.thresh_with_ctr is True, we multiply the classification
# scores with centerness scores before applying the threshold.
@ -406,6 +448,9 @@ class FCOSOutputs(object):
per_box_regression = box_regression[i]
per_box_regression = per_box_regression[per_box_loc]
per_locations = locations[per_box_loc]
if top_feat is not None:
per_top_feat = top_feat[i]
per_top_feat = per_top_feat[per_box_loc]
per_pre_nms_top_n = pre_nms_top_n[i]
@ -415,6 +460,8 @@ class FCOSOutputs(object):
per_class = per_class[top_k_indices]
per_box_regression = per_box_regression[top_k_indices]
per_locations = per_locations[top_k_indices]
if top_feat is not None:
per_top_feat = per_top_feat[top_k_indices]
detections = torch.stack([
per_locations[:, 0] - per_box_regression[:, 0],
@ -428,7 +475,8 @@ class FCOSOutputs(object):
boxlist.scores = torch.sqrt(per_box_cls)
boxlist.pred_classes = per_class
boxlist.locations = per_locations
if top_feat is not None:
boxlist.top_feat = per_top_feat
results.append(boxlist)
return results

configs/BlendMask/550_R_50_1x.yaml
@@ -0,0 +1,6 @@
_BASE_: "Base-550.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
OUTPUT_DIR: "output/blendmask/550_R_50_1x"

configs/BlendMask/550_R_50_3x.yaml
@@ -0,0 +1,9 @@
_BASE_: "Base-550.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
SOLVER:
STEPS: (210000, 250000)
MAX_ITER: 270000
OUTPUT_DIR: "output/blendmask/550_R_50_3x"

configs/BlendMask/550_R_50_dcni3_5x.yaml
@@ -0,0 +1,18 @@
_BASE_: "Base-550.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
DEFORM_ON_PER_STAGE: [False, True, True, True]
DEFORM_MODULATED: True
DEFORM_INTERVAL: 3
INPUT:
MIN_SIZE_TRAIN: (440, 594)
MIN_SIZE_TRAIN_SAMPLING: "range"
MAX_SIZE_TRAIN: 990
CROP:
ENABLED: True
SOLVER:
STEPS: (210000, 250000)
MAX_ITER: 270000
OUTPUT_DIR: "output/blendmask/550_R_50_dcni3_5x"

configs/BlendMask/Base-550.yaml
@@ -0,0 +1,17 @@
_BASE_: "Base-BlendMask.yaml"
MODEL:
FCOS:
TOP_LEVELS: 1
IN_FEATURES: ["p3", "p4", "p5", "p6"]
FPN_STRIDES: [8, 16, 32, 64]
SIZES_OF_INTEREST: [64, 128, 256]
NUM_SHARE_CONVS: 3
NUM_CLS_CONVS: 0
NUM_BOX_CONVS: 0
BASIS_MODULE:
NUM_CONVS: 2
INPUT:
MIN_SIZE_TRAIN: (440, 462, 484, 506, 528, 550)
MAX_SIZE_TRAIN: 916
MIN_SIZE_TEST: 550
MAX_SIZE_TEST: 916

configs/BlendMask/Base-BlendMask.yaml
@@ -0,0 +1,29 @@
MODEL:
META_ARCHITECTURE: "BlendMask"
MASK_ON: True
BACKBONE:
NAME: "build_fcos_resnet_fpn_backbone"
RESNETS:
OUT_FEATURES: ["res3", "res4", "res5"]
FPN:
IN_FEATURES: ["res3", "res4", "res5"]
PROPOSAL_GENERATOR:
NAME: "FCOS"
BASIS_MODULE:
LOSS_ON: True
PANOPTIC_FPN:
COMBINE:
ENABLED: False
FCOS:
THRESH_WITH_CTR: True
USE_SCALE: False
DATASETS:
TRAIN: ("coco_2017_train",)
TEST: ("coco_2017_val",)
SOLVER:
IMS_PER_BATCH: 16
BASE_LR: 0.01 # Note that RetinaNet uses a different default learning rate
STEPS: (60000, 80000)
MAX_ITER: 90000
INPUT:
MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)

configs/BlendMask/Panoptic/Base-Panoptic.yaml
@@ -0,0 +1,16 @@
_BASE_: "../Base-BlendMask.yaml"
MODEL:
RESNETS:
OUT_FEATURES: ["res2", "res3", "res4", "res5"]
FPN:
IN_FEATURES: ["res2", "res3", "res4", "res5"]
SEM_SEG_HEAD:
LOSS_WEIGHT: 0.5
PANOPTIC_FPN:
COMBINE:
ENABLED: True
INSTANCES_CONFIDENCE_THRESH: 0.2
OVERLAP_THRESH: 0.4
DATASETS:
TRAIN: ("coco_2017_train_panoptic_separated",)
TEST: ("coco_2017_val_panoptic_separated",)

configs/BlendMask/Panoptic/R_101_3x.yaml
@@ -0,0 +1,9 @@
_BASE_: "Base-Panoptic.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
RESNETS:
DEPTH: 101
SOLVER:
STEPS: (210000, 250000)
MAX_ITER: 270000
OUTPUT_DIR: "output/panoptic/blendmask/R_101_3x"

configs/BlendMask/Panoptic/R_101_dcni3_5x.yaml
@@ -0,0 +1,18 @@
_BASE_: "Base-Panoptic.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
RESNETS:
DEPTH: 101
DEFORM_ON_PER_STAGE: [False, True, True, True]
DEFORM_MODULATED: True
DEFORM_INTERVAL: 3
SOLVER:
STEPS: (280000, 360000)
MAX_ITER: 400000
INPUT:
MIN_SIZE_TRAIN: (640, 864)
MIN_SIZE_TRAIN_SAMPLING: "range"
MAX_SIZE_TRAIN: 1333
CROP:
ENABLED: True
OUTPUT_DIR: "output/panoptic/blendmask/R_101_dcni3_5x"

configs/BlendMask/Panoptic/R_50_1x.yaml
@@ -0,0 +1,6 @@
_BASE_: "Base-Panoptic.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
OUTPUT_DIR: "output/panoptic/blendmask/R_50_1x"

configs/BlendMask/Panoptic/R_50_3x.yaml
@@ -0,0 +1,9 @@
_BASE_: "Base-Panoptic.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
SOLVER:
STEPS: (210000, 250000)
MAX_ITER: 270000
OUTPUT_DIR: "output/panoptic/blendmask/R_50_3x"

configs/BlendMask/Panoptic/R_50_dcni3_5x.yaml
@@ -0,0 +1,18 @@
_BASE_: "Base-Panoptic.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
DEFORM_ON_PER_STAGE: [False, True, True, True]
DEFORM_MODULATED: True
DEFORM_INTERVAL: 3
SOLVER:
STEPS: (280000, 360000)
MAX_ITER: 400000
INPUT:
MIN_SIZE_TRAIN: (640, 864)
MIN_SIZE_TRAIN_SAMPLING: "range"
MAX_SIZE_TRAIN: 1440
CROP:
ENABLED: True
OUTPUT_DIR: "output/panoptic/blendmask/R_50_dcni3_5x"

configs/BlendMask/Person/Base-Person.yaml
@@ -0,0 +1,9 @@
_BASE_: "../Base-BlendMask.yaml"
MODEL:
BASIS_MODULE:
NUM_CLASSES: 1
FCOS:
NUM_CLASSES: 1
DATASETS:
TRAIN: ("pic_person_train",)
TEST: ("pic_person_val",)

configs/BlendMask/Person/R_50_1x.yaml
@@ -0,0 +1,6 @@
_BASE_: "Base-Person.yaml"
MODEL:
WEIGHTS: "https://cloudstor.aarnet.edu.au/plus/s/9u1cG2zXvEva5SM/download#R_50_3x.pth"
RESNETS:
DEPTH: 50
OUTPUT_DIR: "output/person/blendmask/R_50_1x"

configs/BlendMask/R_101_3x.yaml
@@ -0,0 +1,9 @@
_BASE_: "Base-BlendMask.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
RESNETS:
DEPTH: 101
SOLVER:
STEPS: (210000, 250000)
MAX_ITER: 270000
OUTPUT_DIR: "output/blendmask/R_101_3x"

configs/BlendMask/R_101_dcni3_5x.yaml
@@ -0,0 +1,20 @@
_BASE_: "Base-BlendMask.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
RESNETS:
DEPTH: 101
DEFORM_ON_PER_STAGE: [False, True, True, True]
DEFORM_MODULATED: True
DEFORM_INTERVAL: 3
SOLVER:
STEPS: (280000, 360000)
MAX_ITER: 400000
INPUT:
MIN_SIZE_TRAIN: (640, 864)
MIN_SIZE_TRAIN_SAMPLING: "range"
MAX_SIZE_TRAIN: 1440
CROP:
ENABLED: True
TEST:
EVAL_PERIOD: 20000
OUTPUT_DIR: "output/blendmask/R_101_dcni3_5x"

configs/BlendMask/R_50_1x.yaml
@@ -0,0 +1,6 @@
_BASE_: "Base-BlendMask.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
OUTPUT_DIR: "output/blendmask/R_50_1x"

configs/BlendMask/R_50_3x.yaml
@@ -0,0 +1,9 @@
_BASE_: "Base-BlendMask.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
SOLVER:
STEPS: (210000, 250000)
MAX_ITER: 270000
OUTPUT_DIR: "output/blendmask/R_50_3x"

configs/RCNN/550_R_50_FPN_3x.yaml
@@ -0,0 +1,15 @@
_BASE_: "Base-RCNN.yaml"
MODEL:
WEIGHTS: "output/mask_rcnn/550_R_50_3x/model_final.pth"
MASK_ON: True
RESNETS:
DEPTH: 50
SOLVER:
STEPS: (210000, 250000)
MAX_ITER: 270000
INPUT:
MIN_SIZE_TRAIN: (440, 462, 484, 506, 528, 550)
MAX_SIZE_TRAIN: 916
MIN_SIZE_TEST: 550
MAX_SIZE_TEST: 916
OUTPUT_DIR: "output/mask_rcnn/550_R_50_3x"

configs/RCNN/Base-RCNN.yaml
@@ -0,0 +1,42 @@
MODEL:
META_ARCHITECTURE: "GeneralizedRCNN"
BACKBONE:
NAME: "build_resnet_fpn_backbone"
RESNETS:
OUT_FEATURES: ["res2", "res3", "res4", "res5"]
FPN:
IN_FEATURES: ["res2", "res3", "res4", "res5"]
ANCHOR_GENERATOR:
SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map
ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps)
RPN:
IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level
PRE_NMS_TOPK_TEST: 1000 # Per FPN level
# Detectron1 uses 2000 proposals per-batch,
# (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
# which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
POST_NMS_TOPK_TRAIN: 1000
POST_NMS_TOPK_TEST: 1000
ROI_HEADS:
NAME: "StandardROIHeads"
IN_FEATURES: ["p2", "p3", "p4", "p5"]
ROI_BOX_HEAD:
NAME: "FastRCNNConvFCHead"
NUM_FC: 2
POOLER_RESOLUTION: 7
ROI_MASK_HEAD:
NAME: "MaskRCNNConvUpsampleHead"
NUM_CONV: 4
POOLER_RESOLUTION: 14
DATASETS:
TRAIN: ("coco_2017_train",)
TEST: ("coco_2017_val",)
SOLVER:
IMS_PER_BATCH: 16
BASE_LR: 0.02
STEPS: (60000, 80000)
MAX_ITER: 90000
INPUT:
MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
VERSION: 2

configs/RCNN/LVIS/R_50_1x.yaml
@@ -0,0 +1,6 @@
_BASE_: "Base-LVIS.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
OUTPUT_DIR: "output/lvis/mask_rcnn/R_50_1x"

configs/RCNN/R_101_3x.yaml
@@ -0,0 +1,9 @@
_BASE_: "Base-RCNN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
MASK_ON: True
RESNETS:
DEPTH: 101
SOLVER:
STEPS: (210000, 250000)
MAX_ITER: 270000

datasets/README.md
@@ -0,0 +1,33 @@
## Expected dataset structure for AdelaiDet instance detection:
```
coco/
  thing_train2017/  # thing class label maps for auxiliary semantic loss
lvis/
  thing_train/      # semantic labels for LVIS
```
Run `python prepare_thing_sem_from_instance.py` to extract semantic labels from instance annotations.
Run `python prepare_thing_sem_from_lvis.py` to extract semantic labels from LVIS annotations.
## Expected dataset structure for Person In Context instance detection:
```
pic/
  thing_train/  # thing class label maps for auxiliary semantic loss
  annotations/
    train_person.json
    val_person.json
  image/
    train/
      ...
```
First link the PIC_2.0 dataset to this folder with `ln -s /path/to/PIC_2.0 pic`. Then run `python gen_coco_person.py` to generate the train and validation annotation JSONs.
Run `python prepare_thing_sem_from_instance.py --dataset-name pic` to extract semantic labels from instance annotations.

datasets/gen_coco_person.py
@@ -0,0 +1,101 @@
import numpy as np
import cv2
import os
import json
error_list = ['23382.png', '23441.png', '20714.png', '20727.png', '23300.png', '21200.png']
def mask2box(mask):
index = np.argwhere(mask == 1)
rows = index[:, 0]
cols = index[:, 1]
y1 = int(np.min(rows)) # y
x1 = int(np.min(cols)) # x
y2 = int(np.max(rows))
x2 = int(np.max(cols))
return (x1, y1, x2, y2)
def gen_coco(phase):
result = {
"info": {"description": "PIC2.0 dataset."},
"categories": [
{"supercategory": "none", "id": 1, "name": "person"}
]
}
out_json = phase +'_person.json'
store_segmentation = True
images_info = []
labels_info = []
img_id = 0
files = tuple(open("pic/list5/"+phase+'_id', 'r'))
files = (_.strip() for _ in files)
for index, image_name in enumerate(files):
image_name = image_name+".png"
print(index, image_name)
if image_name in error_list:
continue
instance = cv2.imread(os.path.join('instance', phase, image_name), flags=cv2.IMREAD_GRAYSCALE)
semantic = cv2.imread(os.path.join('semantic', phase, image_name), flags=cv2.IMREAD_GRAYSCALE)
# print(instance.shape, semantic.shape)
h = instance.shape[0]
w = instance.shape[1]
images_info.append(
{
"file_name": image_name[:-4]+'.jpg',
"height": h,
"width": w,
"id": index
}
)
instance_max_num = instance.max()
instance_ids = np.unique(instance)
for instance_id in instance_ids:
if instance_id == 0:
continue
instance_part = instance == instance_id
object_pos = instance_part.nonzero()
# category_id_ = int(semantic[object_pos[0][0], object_pos[1][0]])
category_id = int(np.max(semantic[object_pos[0], object_pos[1]]))
# assert category_id_ == category_id, (category_id_, category_id)
if category_id != 1:
continue
area = int(instance_part.sum())
x1, y1, x2, y2 = mask2box(instance_part)
w = x2 - x1 + 1
h = y2 - y1 + 1
segmentation = []
if store_segmentation:
contours, hierarchy = cv2.findContours((instance_part * 255).astype(np.uint8), cv2.RETR_TREE,
cv2.CHAIN_APPROX_SIMPLE)
for contour in contours:
contour = contour.flatten().tolist()
if len(contour) > 4:
segmentation.append(contour)
if len(segmentation) == 0:
print('error')
continue
labels_info.append(
{
"segmentation": segmentation, # poly
"area": area, # segmentation area
"iscrowd": 0,
"image_id": index,
"bbox": [x1, y1, w, h],
"category_id": category_id,
"id": img_id
},
)
img_id += 1
# break
result["images"] = images_info
result["annotations"] = labels_info
with open('pic/annotations/' + out_json, 'w') as f:
json.dump(result, f, indent=4)
if __name__ == "__main__":
if not os.path.exists('pic/annotations/'):
os.makedirs('pic/annotations/')
gen_coco("train")
gen_coco("val")
#gen_coco("test")

datasets/prepare_thing_sem_from_instance.py
@@ -0,0 +1,125 @@
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import time
import functools
import multiprocessing as mp
import numpy as np
import os
import argparse
from pycocotools.coco import COCO
from pycocotools import mask as maskUtils
from detectron2.data.datasets.builtin_meta import _get_coco_instances_meta
def annToRLE(ann, img_size):
h, w = img_size
segm = ann['segmentation']
if type(segm) == list:
# polygon -- a single object might consist of multiple parts
# we merge all parts into one mask rle code
rles = maskUtils.frPyObjects(segm, h, w)
rle = maskUtils.merge(rles)
elif type(segm['counts']) == list:
# uncompressed RLE
rle = maskUtils.frPyObjects(segm, h, w)
else:
# rle
rle = ann['segmentation']
return rle
def annToMask(ann, img_size):
rle = annToRLE(ann, img_size)
m = maskUtils.decode(rle)
return m
def _process_instance_to_semantic(anns, output_semantic, img, categories):
img_size = (img["height"], img["width"])
output = np.zeros(img_size, dtype=np.uint8)
for ann in anns:
mask = annToMask(ann, img_size)
output[mask == 1] = categories[ann["category_id"]] + 1
# save as compressed npz
np.savez_compressed(output_semantic, mask=output)
# Image.fromarray(output).save(output_semantic)
def create_coco_semantic_from_instance(instance_json, sem_seg_root, categories):
"""
Create semantic segmentation annotations from panoptic segmentation
annotations, to be used by PanopticFPN.
It maps all thing categories to contiguous ids starting from 1, and maps all unlabeled pixels to class 0
Args:
instance_json (str): path to the instance json file, in COCO's format.
sem_seg_root (str): a directory to output semantic annotation files
categories (dict): category metadata. Each dict needs to have:
"id": corresponds to the "category_id" in the json annotations
"isthing": 0 or 1
"""
os.makedirs(sem_seg_root, exist_ok=True)
coco_detection = COCO(instance_json)
def iter_annotations():
for img_id in coco_detection.getImgIds():
anns_ids = coco_detection.getAnnIds(img_id)
anns = coco_detection.loadAnns(anns_ids)
img = coco_detection.loadImgs(int(img_id))[0]
output = os.path.join(sem_seg_root, img["file_name"].replace('jpg', 'npz'))
yield anns, output, img
# single process
# print("Start writing to {} ...".format(sem_seg_root))
# start = time.time()
# for anno, oup, img in iter_annotations():
# _process_instance_to_semantic(
# anno, oup, img, categories)
# print("Finished. time: {:.2f}s".format(time.time() - start))
# return
pool = mp.Pool(processes=max(mp.cpu_count() // 2, 4))
print("Start writing to {} ...".format(sem_seg_root))
start = time.time()
pool.starmap(
functools.partial(
_process_instance_to_semantic,
categories=categories),
iter_annotations(),
chunksize=100,
)
print("Finished. time: {:.2f}s".format(time.time() - start))
def get_parser():
parser = argparse.ArgumentParser(description="Keep only model in ckpt")
parser.add_argument(
"--dataset-name",
default="coco",
help="dataset to generate",
)
return parser
if __name__ == "__main__":
args = get_parser().parse_args()
dataset_dir = os.path.join(os.path.dirname(__file__), args.dataset_name)
if args.dataset_name == "coco":
thing_id_to_contiguous_id = _get_coco_instances_meta()["thing_dataset_id_to_contiguous_id"]
split_name = 'train2017'
annotation_name = "annotations/instances_{}.json"
else:
thing_id_to_contiguous_id = {1: 0}
split_name = 'train'
annotation_name = "annotations/{}_person.json"
for s in ["train2017"]:
create_coco_semantic_from_instance(
os.path.join(dataset_dir, "annotations/instances_{}.json".format(s)),
os.path.join(dataset_dir, "thing_{}".format(s)),
thing_id_to_contiguous_id
)
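A quick check (with a hypothetical file name) that a generated label map loads back as saved above: a `uint8` array where 0 marks unlabeled pixels and thing classes use contiguous ids starting from 1:
```python
import numpy as np

mask = np.load("datasets/coco/thing_train2017/000000000139.npz")["mask"]
print(mask.shape, mask.dtype)   # (H, W) uint8
print(np.unique(mask)[:10])     # 0 = unlabeled, 1..80 = contiguous COCO thing ids
```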

datasets/prepare_thing_sem_from_lvis.py
@@ -0,0 +1,98 @@
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import time
import functools
import multiprocessing as mp
import numpy as np
import os
from lvis import LVIS
from pycocotools import mask as maskUtils
def annToRLE(ann, img_size):
h, w = img_size
segm = ann['segmentation']
if type(segm) == list:
# polygon -- a single object might consist of multiple parts
# we merge all parts into one mask rle code
rles = maskUtils.frPyObjects(segm, h, w)
rle = maskUtils.merge(rles)
elif type(segm['counts']) == list:
# uncompressed RLE
rle = maskUtils.frPyObjects(segm, h, w)
else:
# rle
rle = ann['segmentation']
return rle
def annToMask(ann, img_size):
rle = annToRLE(ann, img_size)
m = maskUtils.decode(rle)
return m
def _process_instance_to_semantic(anns, output_semantic, img):
img_size = (img["height"], img["width"])
output = np.zeros(img_size, dtype=np.uint8)
for ann in anns:
mask = annToMask(ann, img_size)
output[mask == 1] = ann["category_id"] // 5
# save as compressed npz
np.savez_compressed(output_semantic, mask=output)
# Image.fromarray(output).save(output_semantic)
def create_lvis_semantic_from_instance(instance_json, sem_seg_root):
"""
Create semantic segmentation annotations from panoptic segmentation
annotations, to be used by PanopticFPN.
It maps all thing categories to contiguous ids starting from 1, and maps all unlabeled pixels to class 0
Args:
instance_json (str): path to the instance json file, in COCO's format.
sem_seg_root (str): a directory to output semantic annotation files
"""
os.makedirs(sem_seg_root, exist_ok=True)
lvis_detection = LVIS(instance_json)
def iter_annotations():
for img_id in lvis_detection.get_img_ids():
anns_ids = lvis_detection.get_ann_ids([img_id])
anns = lvis_detection.load_anns(anns_ids)
img = lvis_detection.load_imgs([img_id])[0]
output = os.path.join(sem_seg_root, img["file_name"].replace('jpg', 'npz'))
yield anns, output, img
# # single process
# print("Start writing to {} ...".format(sem_seg_root))
# start = time.time()
# for anno, oup, img in iter_annotations():
# _process_instance_to_semantic(
# anno, oup, img)
# print("Finished. time: {:.2f}s".format(time.time() - start))
# return
pool = mp.Pool(processes=max(mp.cpu_count() // 2, 4))
print("Start writing to {} ...".format(sem_seg_root))
start = time.time()
pool.starmap(
functools.partial(
_process_instance_to_semantic),
iter_annotations(),
chunksize=100,
)
print("Finished. time: {:.2f}s".format(time.time() - start))
if __name__ == "__main__":
dataset_dir = os.path.join(os.path.dirname(__file__), "lvis")
for s in ["train"]:
create_lvis_semantic_from_instance(
os.path.join(dataset_dir, "lvis_v0.5_{}.json".format(s)),
os.path.join(dataset_dir, "thing_{}".format(s)),
)

tools/train_net.py
@@ -37,7 +37,7 @@ from detectron2.evaluation import (
)
from detectron2.modeling import GeneralizedRCNNWithTTA
from detectron2.data.dataset_mapper import DatasetMapper
from adet.data.dataset_mapper import DatasetMapperWithBasis
from adet.config import get_cfg
from adet.checkpoint import AdetCheckpointer
@ -123,7 +123,7 @@ class Trainer(DefaultTrainer):
It calls :func:`detectron2.data.build_detection_train_loader` with a customized
DatasetMapper, which adds categorical labels as a semantic mask.
"""
mapper = DatasetMapper(cfg, True)
mapper = DatasetMapperWithBasis(cfg, True)
return build_detection_train_loader(cfg, mapper)
@classmethod
