# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn import paddle.nn.functional as F import paddlers.models.ppseg as ppseg import paddlers.utils.logging as logging from paddlers.models.ppseg.cvlibs import param_init from paddlers.rs_models.seg.layers import layers_lib as layers from paddlers.models.ppseg.utils import utils class C2FNet(nn.Layer): """ A Coarse-to-Fine Segmentation Network for Small Objects in Remote Sensing Images. Args: num_classes (int): The unique number of target classes. backbone (str): The backbone network. backbone_indices (tuple, optional): The values in the tuple indicate the indices of output of backbone. Default: (-1, ). kernel_sizes(tuple, optional): The sliding windows' size. Default: (128,128). training_stride(int, optional): The stride of sliding windows. Default: 32. samples_per_gpu(int, optional): The fined process's batch size. Default: 32. channels (int, optional): The channels between conv layer and the last layer of FCNHead. If None, it will be the number of channels of input features. Default: None. align_corners (bool, optional): An argument of `F.interpolate`. It should be set to False when the output size of feature is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. """ def __init__(self, num_classes, backbone, backbone_indices=(-1, ), kernel_sizes=(128, 128), training_stride=32, samples_per_gpu=32, channels=None, align_corners=False): super(C2FNet, self).__init__() self.backbone = backbone backbone_channels = [ backbone.feat_channels[i] for i in backbone_indices ] self.head_fgbg = FCNHead(2, backbone_indices, backbone_channels, channels) self.num_cls = num_classes self.kernel_sizes = [kernel_sizes[0], kernel_sizes[1]] self.training_stride = training_stride self.samples = samples_per_gpu self.align_corners = align_corners def forward(self, x, heatmaps, label=None): ori_heatmap = heatmaps heatmap = paddle.argmax(heatmaps, axis=1, keepdim=True, dtype='int32') if paddle.max(heatmap) > 15: logging.warning( "Please note that currently C2FNet can only be trained and evaluated on the iSAID dataset." ) heatmap = paddle.where( (heatmap == 10) | (heatmap == 11) | (heatmap == 8) | (heatmap == 15) | (heatmap == 9) | (heatmap == 1) | (heatmap == 14), paddle.ones_like(heatmap), paddle.zeros_like(heatmap)).astype('float32') if self.training: label = paddle.unsqueeze(label, axis=1).astype('float32') label = paddle.where((label == 10) | (label == 11) | (label == 8) | (label == 15) | (label == 9) | (label == 1) | (label == 14), paddle.ones_like(label), paddle.zeros_like(label)) mask_regions = F.unfold( heatmap, kernel_sizes=self.kernel_sizes, strides=self.training_stride, paddings=0, dilations=1, name=None) mask_regions = paddle.transpose(mask_regions, perm=[0, 2, 1]) mask_regions = paddle.reshape( mask_regions, shape=[-1, self.kernel_sizes[0] * self.kernel_sizes[1]]) img_regions = F.unfold( x, kernel_sizes=self.kernel_sizes, strides=self.training_stride, paddings=0, dilations=1, name=None) img_regions = paddle.transpose(img_regions, perm=[0, 2, 1]) img_regions = paddle.reshape( img_regions, shape=[-1, 3 * self.kernel_sizes[0] * self.kernel_sizes[1]]) label_regions = F.unfold( label, kernel_sizes=self.kernel_sizes, strides=self.training_stride, paddings=0, dilations=1, name=None) label_regions = paddle.transpose(label_regions, perm=[0, 2, 1]) label_regions = paddle.reshape( label_regions, shape=[-1, self.kernel_sizes[0] * self.kernel_sizes[1]]) mask_regions_sum = paddle.sum(mask_regions, axis=1) mask_regions_selected = paddle.where( mask_regions_sum > 0, paddle.ones_like(mask_regions_sum), paddle.zeros_like(mask_regions_sum)) final_mask_regions_selected = paddle.zeros_like( mask_regions_selected).astype('bool') final_mask_regions_selected.stop_gradient = True theld = self.samples * paddle.shape(x)[0] if paddle.sum(mask_regions_selected) >= theld: _, top_k_idx = paddle.topk(mask_regions_sum, k=theld) final_mask_regions_selected[top_k_idx] = True selected_img_regions = img_regions[final_mask_regions_selected] selected_img_regions = paddle.reshape( selected_img_regions, shape=[ theld, 3, self.kernel_sizes[0], self.kernel_sizes[1] ]) selected_label_regions = label_regions[ final_mask_regions_selected] selected_label_regions = paddle.reshape( selected_label_regions, shape=[theld, self.kernel_sizes[0], self.kernel_sizes[1]]).astype('int32') feat_list = self.backbone(selected_img_regions) bgfg = self.head_fgbg(feat_list) binary_fea = F.interpolate( bgfg[0], self.kernel_sizes, mode='bilinear', align_corners=self.align_corners) return [binary_fea, selected_label_regions] else: theld = theld // 8 _, top_k_idx = paddle.topk(mask_regions_sum, k=theld) final_mask_regions_selected[top_k_idx] = True selected_img_regions = img_regions[final_mask_regions_selected] selected_img_regions = paddle.reshape( selected_img_regions, shape=[ theld, 3, self.kernel_sizes[0], self.kernel_sizes[1] ]) selected_label_regions = label_regions[ final_mask_regions_selected] selected_label_regions = paddle.reshape( selected_label_regions, shape=[theld, self.kernel_sizes[0], self.kernel_sizes[1]]).astype('int32') feat_list = self.backbone(selected_img_regions) bgfg = self.head_fgbg(feat_list) binary_fea = F.interpolate( bgfg[0], self.kernel_sizes, mode='bilinear', align_corners=self.align_corners) return [binary_fea, selected_label_regions] else: mask_regions = F.unfold( heatmap, kernel_sizes=self.kernel_sizes, strides=self.kernel_sizes[0], paddings=0, dilations=1, name=None) mask_regions = paddle.transpose(mask_regions, perm=[0, 2, 1]) mask_regions = paddle.reshape( mask_regions, shape=[-1, self.kernel_sizes[0] * self.kernel_sizes[1]]) img_regions = F.unfold( x, kernel_sizes=self.kernel_sizes, strides=self.kernel_sizes[0], paddings=0, dilations=1, name=None) img_regions = paddle.transpose(img_regions, perm=[0, 2, 1]) img_regions = paddle.reshape( img_regions, shape=[-1, 3 * self.kernel_sizes[0] * self.kernel_sizes[1]]) mask_regions_sum = paddle.sum(mask_regions, axis=1) mask_regions_selected = paddle.where( mask_regions_sum > 0, paddle.ones_like(mask_regions_sum), paddle.zeros_like(mask_regions_sum)).astype('bool') if paddle.sum(mask_regions_selected.astype('int')) == 0: return [ori_heatmap] else: ori_fea_regions = F.unfold( ori_heatmap, kernel_sizes=self.kernel_sizes, strides=self.kernel_sizes[0], paddings=0, dilations=1, name=None) ori_fea_regions = paddle.transpose( ori_fea_regions, perm=[0, 2, 1]) ori_fea_regions = paddle.reshape( ori_fea_regions, shape=[ -1, self.num_cls * self.kernel_sizes[0] * self.kernel_sizes[1] ]) selected_img_regions = img_regions[mask_regions_selected] selected_img_regions = paddle.reshape( selected_img_regions, shape=[ paddle.shape(selected_img_regions)[0], 3, self.kernel_sizes[0], self.kernel_sizes[1] ]) selected_fea_regions = ori_fea_regions[mask_regions_selected] selected_fea_regions = paddle.reshape( selected_fea_regions, shape=[ paddle.shape(selected_fea_regions)[0], self.num_cls, self.kernel_sizes[0], self.kernel_sizes[1] ]) feat_list = self.backbone(selected_img_regions) bgfg = self.head_fgbg(feat_list) binary_fea = F.interpolate( bgfg[0], self.kernel_sizes, mode='bilinear', align_corners=self.align_corners) binary_fea = F.softmax(binary_fea, axis=1) bg_binary, fg_binary = paddle.chunk( binary_fea, chunks=2, axis=1) front, ship, mid, lv, sv, hl, swp, mid2, pl, hb = paddle.split( selected_fea_regions, num_or_sections=[1, 1, 6, 1, 1, 1, 1, 2, 1, 1], axis=1) ship = paddle.add(ship, fg_binary) lv = paddle.add(lv, fg_binary) sv = paddle.add(sv, fg_binary) hl = paddle.add(hl, fg_binary) swp = paddle.add(swp, fg_binary) pl = paddle.add(pl, fg_binary) hb = paddle.add(hb, fg_binary) selected_fea_regions = paddle.concat( x=[front, ship, mid, lv, sv, hl, swp, mid2, pl, hb], axis=1) selected_fea_regions = paddle.reshape( selected_fea_regions, shape=[paddle.shape(selected_fea_regions)[0], -1]) ori_fea_regions[mask_regions_selected] = selected_fea_regions ori_fea_regions = paddle.reshape( ori_fea_regions, shape=[ paddle.shape(x)[0], -1, self.num_cls * self.kernel_sizes[0] * self.kernel_sizes[1] ]) ori_fea_regions = paddle.transpose( ori_fea_regions, perm=[0, 2, 1]) fea_out = F.fold( ori_fea_regions, [paddle.shape(x)[2], paddle.shape(x)[3]], self.kernel_sizes, strides=self.kernel_sizes[0], paddings=0, dilations=1, name=None) return [fea_out] class FCNHead(nn.Layer): def __init__(self, num_classes, backbone_indices=(-1, ), backbone_channels=(270, ), channels=None): super(FCNHead, self).__init__() self.num_classes = num_classes self.backbone_indices = backbone_indices if channels is None: channels = backbone_channels[0] self.conv_1 = layers.ConvBNReLU( in_channels=backbone_channels[0], out_channels=channels, kernel_size=1, stride=1, bias_attr=True) self.cls = nn.Conv2D( in_channels=channels, out_channels=self.num_classes, kernel_size=1, stride=1, bias_attr=True) self.init_weight() def forward(self, feat_list): logit_list = [] x = feat_list[self.backbone_indices[0]] x = self.conv_1(x) logit = self.cls(x) logit_list.append(logit) return logit_list def init_weight(self): for layer in self.sublayers(): if isinstance(layer, nn.Conv2D): param_init.normal_init(layer.weight, std=0.001) elif isinstance(layer, (nn.BatchNorm, nn.SyncBatchNorm)): param_init.constant_init(layer.weight, value=1.0) param_init.constant_init(layer.bias, value=0.0)